Spaces:

CohereLabs
/

command-a-reasoning

Running on CPU Upgrade

File size: 7,940 Bytes

import os
from collections.abc import Iterator

import gradio as gr
from gradio import ChatMessage
from cohere import ClientV2
from cohere.core import RequestOptions

model_id = "command-a-reasoning-08-2025"

# Initialize Cohere client
api_key = os.getenv("COHERE_API_KEY")
if not api_key:
    raise ValueError("COHERE_API_KEY environment variable is required")

client = ClientV2(api_key=api_key, client_name="hf-command-a-reasoning-08-2025")

def format_chat_history(messages: list) -> list:
    """
    Formats the chat history into a structure Cohere can understand
    """
    formatted_history = []
    for message in messages:
        # Handle both ChatMessage objects and regular dictionaries
        if hasattr(message, "metadata") and message.metadata:
            # Skip thinking messages (messages with metadata)
            continue

        # Extract role and content safely
        if hasattr(message, "role"):
            role = message.role
            content = message.content
        elif isinstance(message, dict):
            role = message.get("role")
            content = message.get("content")
        else:
            continue

        if role and content:
            # Ensure content is a string to prevent validation issues
            if content is None:
                content = ""
            elif not isinstance(content, str):
                content = str(content)

            formatted_history.append({
                "role": role,
                "content": content
            })
    return formatted_history

def generate(message: str, history: list, thinking_budget: int) -> Iterator[list]:
    # Create a clean working copy of the history (excluding thinking messages)
    working_history = []
    for msg in history:
        # Skip thinking messages (messages with metadata)
        if hasattr(msg, "metadata") and msg.metadata:
            continue
        working_history.append(msg)

    # Format chat history for Cohere API (exclude thinking messages)
    messages = format_chat_history(working_history)

    # Add current message
    if message:
        messages.append({"role": "user", "content": message})

    try:
        # Set thinking type based on thinking_budget
        if thinking_budget == 0:
            thinking_param = {"type": "disabled"}
        else:
            thinking_param = {"type": "enabled", "token_budget": thinking_budget}
        # Call Cohere API using the correct event type and delta access
        response = client.chat_stream(
            model=model_id,
            messages=messages,
            temperature=0.3,
            request_options=RequestOptions(additional_body_parameters={"thinking": thinking_param})
        )

        # Initialize buffers
        thought_buffer = ""
        response_buffer = ""
        thinking_complete = False

        # Start with just the new assistant messages for this interaction
        current_interaction = [
            ChatMessage(
                role="assistant",
                content="",
                metadata={"title": "🧠 Thinking..."}
            )
        ]

        for event in response:
            if getattr(event, "type", None) == "content-delta":
                delta = event.delta

                if hasattr(delta, 'message'):
                    message = delta.message

                    if hasattr(message, 'content'):
                        content = message.content

                        # Check for thinking tokens first
                        thinking_text = getattr(content, 'thinking', None)
                        if thinking_text:
                            thought_buffer += thinking_text
                            # Update thinking message with metadata
                            current_interaction[0] = ChatMessage(
                                role="assistant",
                                content=thought_buffer,
                                metadata={"title": "🧠 Thinking..."}
                            )
                            # Yield only the current interaction, but ensure proper formatting
                            yield [
                                {
                                    "role": msg.role,
                                    "content": msg.content,
                                    "metadata": getattr(msg, "metadata", None)
                                } for msg in current_interaction
                            ]
                            continue

                        # Check for regular text tokens
                        text = getattr(content, 'text', None)
                        if text:
                            # Ensure text is a string
                            if text is None:
                                text = ""
                            elif not isinstance(text, str):
                                text = str(text)

                            # If we haven't completed thinking yet, this might be the start of the response
                            if not thinking_complete and thought_buffer:
                                thinking_complete = True
                                # Add response message below thinking
                                current_interaction.append(
                                    ChatMessage(
                                        role="assistant",
                                        content=""
                                    )
                                )

                            if thinking_complete:
                                # if thinking is complete, we collapse the thinking message
                                current_interaction[0] = ChatMessage(
                                    role="assistant",
                                    content=thought_buffer,
                                    metadata={"title": "🧠 Thoughts", "status": "done"}
                                )

                            response_buffer += text
                            # Update response message
                            current_interaction[-1] = ChatMessage(
                                role="assistant",
                                content=response_buffer
                            )
                            # Yield only the current interaction, but ensure proper formatting
                            yield [
                                {
                                    "role": msg.role,
                                    "content": msg.content,
                                    "metadata": getattr(msg, "metadata", None)
                                } for msg in current_interaction
                            ]

        # Final cleanup: ensure the final response is clean
        if thought_buffer and response_buffer:
            # Keep both thinking and response messages in the final history
            # The thinking message will be preserved with its metadata
            pass

    except Exception as e:
        gr.Warning(f"Error calling Cohere API: {str(e)}")
        yield []


examples = [
    [
        "Write a COBOL function to reverse a string"
    ],
    [
        "Como sair de um helicóptero que caiu na água?"
    ],
    [
        "What is the best way to learn machine learning?"
    ],
    [
        "Explain quantum computing in simple terms"
    ],
    [
        "How do I implement a binary search tree?"
    ],
    [
        "Explique la théorie de la relativité en français"
    ],
]

demo = gr.ChatInterface(
    fn=generate,
    type="messages",
    autofocus=True,
    title="Command A Reasoning",
    examples=examples,
    run_examples_on_click=True,
    css_paths="style.css",
    delete_cache=(1800, 1800),
    cache_examples=False,
    additional_inputs=[
        gr.Slider(label="Thinking Budget", minimum=0, maximum=2000, step=10, value=500),
    ],
)

if __name__ == "__main__":
    demo.launch()