renamed to agent

2026-01-10 07:20:10 -06:00 · 2025-07-25 19:01:20 -04:00
parent 52005c592f
commit 6f177d7f6a
29 changed files with 1320 additions and 25 deletions
--- a/libs/python/agent2/README.md
+++ b/libs/python/agent2/README.md
@@ -147,12 +147,12 @@ agent = ComputerAgent(

 ## Callbacks System

-Agent2 provides a comprehensive callback system for extending functionality:
+The `agent` package provides a comprehensive callback system for extending functionality:

 ### Built-in Callbacks

 ```python
-from agent2.callbacks import (
+from agent.callbacks import (
    ImageRetentionCallback,
    TrajectorySaverCallback, 
    BudgetManagerCallback,
@@ -174,7 +174,7 @@ agent = ComputerAgent(
 ### Custom Callbacks

 ```python
-from agent2.callbacks.base import AsyncCallbackHandler
+from agent.callbacks.base import AsyncCallbackHandler

 class CustomCallback(AsyncCallbackHandler):
    async def on_llm_start(self, messages):
--- a/libs/python/agent2/agent2/init.py
+++ b/libs/python/agent2/agent2/init.py
@@ -1,5 +1,5 @@
 """
-Agent2 - Decorator-based Computer Use Agent with liteLLM integration
+agent - Decorator-based Computer Use Agent with liteLLM integration
 """

 from .decorators import agent_loop
--- a/libs/python/agent2/agent/main.py
+++ b/libs/python/agent2/agent/main.py
@@ -0,0 +1,21 @@
+"""
+Entry point for running agent CLI module.
+
+Usage:
+    python -m agent.cli <model_string>
+"""
+
+import sys
+import asyncio
+from .cli import main
+
+if __name__ == "__main__":
+    # Only the "cli" sub-command is handled here; anything else prints usage.
+    if len(sys.argv) > 1 and sys.argv[1] == "cli":
+        # Drop the 'cli' token so the argument parser inside main() sees the
+        # model string as its first positional argument.
+        sys.argv.pop(1)
+        try:
+            asyncio.run(main())
+        except (KeyboardInterrupt, EOFError):
+            # Ctrl-C or end-of-input at a prompt ends the session quietly
+            # instead of dumping a traceback (SystemExit still propagates).
+            print("\n\n👋 Goodbye!")
+    else:
+        print("Usage: python -m agent.cli <model_string>")
+        print("Example: python -m agent.cli openai/computer-use-preview")
+        sys.exit(1)
--- a/libs/python/agent2/agent2/adapters/init.py
+++ b/libs/python/agent2/agent2/adapters/init.py
@@ -1,5 +1,5 @@
 """
-Adapters package for agent2 - Custom LLM adapters for LiteLLM
+Adapters package for agent - Custom LLM adapters for LiteLLM
 """

 from .huggingfacelocal_adapter import HuggingFaceLocalAdapter
--- a/libs/python/agent2/agent2/adapters/huggingfacelocal_adapter.py
+++ b/libs/python/agent2/agent2/adapters/huggingfacelocal_adapter.py
--- a/libs/python/agent2/agent2/agent.py
+++ b/libs/python/agent2/agent2/agent.py
@@ -192,7 +192,7 @@ class ComputerAgent:
        ]

        # == Initialize computer agent ==
-        
+
        # Find the appropriate agent loop
        if custom_loop:
            self.agent_loop = custom_loop
@@ -204,16 +204,26 @@ class ComputerAgent:
            self.agent_loop = loop_info.func
            self.agent_loop_info = loop_info
        
-        # Process tools and create tool schemas
-        self.tool_schemas = self._process_tools()
+        self.tool_schemas = []
+        self.computer_handler = None
        
-        # Find computer tool and create interface adapter
-        computer_handler = None
-        for schema in self.tool_schemas:
-            if schema["type"] == "computer":
-                computer_handler = OpenAIComputerHandler(schema["computer"].interface)
-                break
-        self.computer_handler = computer_handler
+    async def _initialize_computers(self):
+        """Lazily start tools and build the tool schemas / computer handler.
+
+        Does nothing when ``self.tool_schemas`` is already non-empty. Otherwise
+        it awaits ``run()`` on every tool that exposes a falsy ``_initialized``
+        attribute, rebuilds ``self.tool_schemas`` via ``_process_tools()``, and
+        sets ``self.computer_handler`` to an ``OpenAIComputerHandler`` for the
+        first schema of type ``"computer"`` (or ``None`` if there is none).
+        """
+        if not self.tool_schemas:
+            for tool in self.tools:
+                # Tools without an `_initialized` attribute are left untouched
+                if hasattr(tool, '_initialized') and not tool._initialized:
+                    await tool.run()
+                
+            # Process tools and create tool schemas
+            self.tool_schemas = self._process_tools()
+            
+            # Find computer tool and create interface adapter
+            computer_handler = None
+            for schema in self.tool_schemas:
+                if schema["type"] == "computer":
+                    # Only the first computer schema is used
+                    computer_handler = OpenAIComputerHandler(schema["computer"].interface)
+                    break
+            self.computer_handler = computer_handler
    
    def _process_input(self, input: Messages) -> List[Dict[str, Any]]:
        """Process input messages and create schemas for the agent loop"""
@@ -484,6 +494,9 @@ class ComputerAgent:
        Returns:
            AsyncGenerator that yields response chunks
        """
+
+        await self._initialize_computers()
+        
        # Merge kwargs
        merged_kwargs = {**self.kwargs, **kwargs}
        
--- a/libs/python/agent2/agent2/callbacks/init.py
+++ b/libs/python/agent2/agent2/callbacks/init.py
--- a/libs/python/agent2/agent2/callbacks/base.py
+++ b/libs/python/agent2/agent2/callbacks/base.py
--- a/libs/python/agent2/agent2/callbacks/budget_manager.py
+++ b/libs/python/agent2/agent2/callbacks/budget_manager.py
--- a/libs/python/agent2/agent2/callbacks/image_retention.py
+++ b/libs/python/agent2/agent2/callbacks/image_retention.py
--- a/libs/python/agent2/agent2/callbacks/logging.py
+++ b/libs/python/agent2/agent2/callbacks/logging.py
@@ -54,10 +54,10 @@ class LoggingCallback(AsyncCallbackHandler):
        Initialize the logging callback.
        
        Args:
-            logger: Logger instance to use. If None, creates a logger named 'agent2.ComputerAgent'
+            logger: Logger instance to use. If None, creates a logger named 'agent.ComputerAgent'
            level: Logging level (logging.DEBUG, logging.INFO, etc.)
        """
-        self.logger = logger or logging.getLogger('agent2.ComputerAgent')
+        self.logger = logger or logging.getLogger('agent.ComputerAgent')
        self.level = level
        
        # Set up logger if it doesn't have handlers
--- a/libs/python/agent2/agent2/callbacks/pii_anonymization.py
+++ b/libs/python/agent2/agent2/callbacks/pii_anonymization.py
--- a/libs/python/agent2/agent2/callbacks/trajectory_saver.py
+++ b/libs/python/agent2/agent2/callbacks/trajectory_saver.py
--- a/libs/python/agent2/agent/cli.py
+++ b/libs/python/agent2/agent/cli.py
@@ -0,0 +1,290 @@
+"""
+CLI chat interface for agent - Computer Use Agent
+
+Usage:
+    python -m agent.cli <model_string>
+    
+Examples:
+    python -m agent.cli openai/computer-use-preview
+    python -m agent.cli anthropic/claude-3-5-sonnet-20241022
+    python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
+"""
+
+import asyncio
+import argparse
+import os
+import sys
+import json
+from typing import List, Dict, Any
+import dotenv
+from yaspin import yaspin
+
+# Load environment variables
+dotenv.load_dotenv()
+
+# Color codes for terminal output
+class Colors:
+    """Namespace of ANSI escape sequences used to style terminal output."""
+
+    # Attribute codes: RESET clears all styling applied before it
+    RESET = '\033[0m'
+    BOLD = '\033[1m'
+    DIM = '\033[2m'
+    
+    # Text colors
+    RED = '\033[31m'
+    GREEN = '\033[32m'
+    YELLOW = '\033[33m'
+    BLUE = '\033[34m'
+    MAGENTA = '\033[35m'
+    CYAN = '\033[36m'
+    WHITE = '\033[37m'
+    GRAY = '\033[90m'
+    
+    # Background colors
+    BG_RED = '\033[41m'
+    BG_GREEN = '\033[42m'
+    BG_YELLOW = '\033[43m'
+    BG_BLUE = '\033[44m'
+
+
+def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = False, end: str = "\n"):
+    """Print *text* wrapped in ANSI styling codes.
+
+    Args:
+        text: The text to print.
+        color: An escape sequence (e.g. ``Colors.RED``); empty for none.
+        bold: Prepend the bold attribute.
+        dim: Prepend the dim attribute.
+        end: Forwarded to ``print`` as the line terminator.
+
+    The reset sequence is always appended, even when no styling is applied.
+    """
+    style = (Colors.BOLD if bold else "") + (Colors.DIM if dim else "")
+    if color:
+        style += color
+
+    print(f"{style}{text}{Colors.RESET}", end=end)
+
+
+def print_action(action_type: str, details: Dict[str, Any]):
+    """Print a one-line, dimmed summary of a computer action.
+
+    Click and scroll actions show their ``(x, y)`` pair, type actions show the
+    quoted text (cut to 47 characters plus ``...`` when longer than 50), and
+    key actions show the quoted key. Any other action, or one missing the
+    expected fields, is printed by name only.
+    """
+    summary = ""
+    if action_type in ("click", "scroll"):
+        if "x" in details and "y" in details:
+            summary = f"({details['x']}, {details['y']})"
+    elif action_type == "type":
+        if "text" in details:
+            typed = details["text"]
+            shown = typed if len(typed) <= 50 else typed[:47] + "..."
+            summary = f'"{shown}"'
+    elif action_type == "key":
+        if "key" in details:
+            summary = f"'{details['key']}'"
+
+    print_colored(f"🛠️  {action_type}{summary}", dim=True)
+
+
+def print_welcome(model: str, agent_loop: str, container_name: str):
+    """Print the session banner.
+
+    Shows which container the session is connected to, along with the model
+    string and agent loop name, followed by a dimmed hint on how to quit.
+    """
+    print_colored(f"Connected to {container_name} ({model}, {agent_loop})")
+    print_colored("Type 'exit' to quit.", dim=True)
+
+async def ainput(prompt: str = ""):
+    """Await one line of user input without blocking the event loop.
+
+    The blocking built-in ``input`` is run in a worker thread via
+    ``asyncio.to_thread``; its return value is passed through unchanged.
+    """
+    return await asyncio.to_thread(input, prompt)
+
+async def chat_loop(agent, model: str, container_name: str):
+    """Run the interactive prompt/response loop until the user quits.
+
+    Each non-empty line typed by the user is appended to a running history,
+    the whole history is passed to ``agent.run``, and every item the agent
+    yields is both appended to the history and rendered to the terminal.
+
+    Args:
+        agent: Object exposing ``agent_loop`` and an async-iterable ``run(history)``.
+        model: Model string, shown in the welcome banner only.
+        container_name: Container name, shown in the welcome banner only.
+    """
+    print_welcome(model, agent.agent_loop.__name__, container_name)
+    
+    # Conversation so far: user messages plus everything the agent returned
+    history = []
+    
+    while True:
+        # Get user input with prompt
+        print_colored("> ", end="")
+        user_input = await ainput()
+        
+        # 'exit', 'quit' and 'q' (case-insensitive) all end the session
+        if user_input.lower() in ['exit', 'quit', 'q']:
+            print_colored("\n👋 Goodbye!")
+            break
+            
+        # Empty line: just show the prompt again
+        if not user_input:
+            continue
+            
+        # Add user message to history
+        history.append({"role": "user", "content": user_input})
+        
+        # Stream responses from the agent with spinner
+        with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
+            # Start hidden; the spinner is only shown while an action or
+            # function call is in flight (see the show() calls below)
+            spinner.hide()
+            
+            async for result in agent.run(history):
+                # Add agent responses to history
+                history.extend(result.get("output", []))
+                
+                # Process and display the output
+                for item in result.get("output", []):
+                    if item.get("type") == "message":
+                        # Display agent text response
+                        content = item.get("content", [])
+                        for content_part in content:
+                            if content_part.get("text"):
+                                text = content_part.get("text", "").strip()
+                                if text:
+                                    # Hide the spinner first so it does not
+                                    # interleave with the printed text
+                                    spinner.hide()
+                                    print_colored(text)
+                    
+                    elif item.get("type") == "computer_call":
+                        # Display computer action
+                        action = item.get("action", {})
+                        action_type = action.get("type", "")
+                        if action_type:
+                            spinner.hide()
+                            print_action(action_type, action)
+                            spinner.text = f"Performing {action_type}..."
+                            spinner.show()
+                    
+                    elif item.get("type") == "function_call":
+                        # Display function call
+                        function_name = item.get("name", "")
+                        spinner.hide()
+                        print_colored(f"🔧 Calling function: {function_name}", dim=True)
+                        spinner.text = f"Calling {function_name}..."
+                        spinner.show()
+                    
+                    elif item.get("type") == "function_call_output":
+                        # Display function output (dimmed)
+                        # NOTE(review): .strip() assumes output is a string - confirm
+                        output = item.get("output", "")
+                        if output and len(output.strip()) > 0:
+                            spinner.hide()
+                            print_colored(f"📤 {output}", dim=True)
+            
+            # Leave the spinner hidden before the next prompt is drawn
+            spinner.hide()
+        
+
+async def main():
+    """Parse CLI arguments, collect credentials, and start the chat session.
+
+    ``CUA_CONTAINER_NAME`` and ``CUA_API_KEY`` are read from the environment;
+    any that are missing are requested on stdin. For ``openai/`` and
+    ``anthropic/`` models, including ones wrapped as
+    ``omniparser+<provider>/...``, the matching provider API key is required
+    too and, when typed at the prompt, is exported through ``os.environ`` for
+    the rest of the process.
+
+    Exits the process with status 1 when a required value is left empty or
+    when the ``agent`` / ``computer`` packages cannot be imported.
+    """
+    parser = argparse.ArgumentParser(
+        description="CUA Agent CLI - Interactive computer use assistant",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python -m agent.cli openai/computer-use-preview
+  python -m agent.cli anthropic/claude-3-5-sonnet-20241022
+  python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
+  python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
+        """
+    )
+
+    parser.add_argument(
+        "model",
+        help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')"
+    )
+
+    parser.add_argument(
+        "--images",
+        type=int,
+        default=3,
+        help="Number of recent images to keep in context (default: 3)"
+    )
+
+    parser.add_argument(
+        "--trajectory",
+        action="store_true",
+        help="Save trajectory for debugging"
+    )
+
+    parser.add_argument(
+        "--budget",
+        type=float,
+        help="Maximum budget for the session (in dollars)"
+    )
+
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Enable verbose logging"
+    )
+
+    args = parser.parse_args()
+
+    # Check for required environment variables
+    container_name = os.getenv("CUA_CONTAINER_NAME")
+    cua_api_key = os.getenv("CUA_API_KEY")
+
+    # Prompt for missing environment variables
+    if not container_name:
+        print_colored("CUA_CONTAINER_NAME not set.", dim=True)
+        print_colored("You can get a CUA container at https://www.trycua.com/", dim=True)
+        container_name = input("Enter your CUA container name: ").strip()
+        if not container_name:
+            print_colored("❌ Container name is required.")
+            sys.exit(1)
+
+    if not cua_api_key:
+        print_colored("CUA_API_KEY not set.", dim=True)
+        cua_api_key = input("Enter your CUA API key: ").strip()
+        if not cua_api_key:
+            print_colored("❌ API key is required.")
+            sys.exit(1)
+
+    # Check for provider-specific API keys based on model
+    provider_api_keys = {
+        "openai/": "OPENAI_API_KEY",
+        "anthropic/": "ANTHROPIC_API_KEY",
+    }
+
+    # A composed model such as "omniparser+openai/gpt-4o" names its LLM
+    # provider after the "+", so match the prefix table against that part.
+    # (Listing "omniparser+" twice in the dict cannot work: the later entry
+    # silently replaces the earlier one.)
+    provider_model = args.model.split("+", 1)[-1]
+
+    # Find matching provider and check for API key
+    for prefix, env_var in provider_api_keys.items():
+        if provider_model.startswith(prefix):
+            if not os.getenv(env_var):
+                print_colored(f"{env_var} not set.", dim=True)
+                api_key = input(f"Enter your {env_var.replace('_', ' ').title()}: ").strip()
+                if not api_key:
+                    print_colored(f"❌ {env_var.replace('_', ' ').title()} is required.")
+                    sys.exit(1)
+                # Set the environment variable for the session
+                os.environ[env_var] = api_key
+            break
+
+    # Import here to avoid import errors if dependencies are missing
+    try:
+        from agent import ComputerAgent
+        from computer import Computer
+    except ImportError as e:
+        print_colored(f"❌ Import error: {e}", Colors.RED, bold=True)
+        print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
+        sys.exit(1)
+
+    # Create computer instance
+    async with Computer(
+        os_type="linux",
+        provider_type="cloud",
+        name=container_name,
+        api_key=cua_api_key
+    ) as computer:
+
+        # Create agent
+        agent_kwargs = {
+            "model": args.model,
+            "tools": [computer],
+            "only_n_most_recent_images": args.images,
+            "verbosity": 20 if args.verbose else 30,  # logging.INFO vs logging.WARNING
+        }
+
+        if args.trajectory:
+            agent_kwargs["trajectory_dir"] = "trajectories"
+
+        # A budget of 0 (or none given) leaves the session without a budget cap
+        if args.budget:
+            agent_kwargs["max_trajectory_budget"] = {
+                "max_budget": args.budget,
+                "raise_error": True,
+                "reset_after_each_run": False
+            }
+
+        agent = ComputerAgent(**agent_kwargs)
+
+        # Start chat loop
+        await chat_loop(agent, args.model, container_name)
+
+
+
+if __name__ == "__main__":
+    # Ctrl-C or end-of-input at a prompt ends the session with a farewell
+    # rather than a traceback.
+    try:
+        asyncio.run(main())
+    except (KeyboardInterrupt, EOFError):
+        print_colored("\n\n👋 Goodbye!")
--- a/libs/python/agent2/agent2/computer_handler.py
+++ b/libs/python/agent2/agent2/computer_handler.py
--- a/libs/python/agent2/agent2/decorators.py
+++ b/libs/python/agent2/agent2/decorators.py
@@ -1,5 +1,5 @@
 """
-Decorators for agent2 - agent_loop decorator
+Decorators for agent - agent_loop decorator
 """

 import asyncio
--- a/libs/python/agent2/agent2/loops/init.py
+++ b/libs/python/agent2/agent2/loops/init.py
@@ -1,5 +1,5 @@
 """
-Agent loops for agent2
+Agent loops for agent
 """

 # Import the loops to register them
--- a/libs/python/agent2/agent2/loops/anthropic.py
+++ b/libs/python/agent2/agent2/loops/anthropic.py
--- a/libs/python/agent2/agent2/loops/omniparser.py
+++ b/libs/python/agent2/agent2/loops/omniparser.py
--- a/libs/python/agent2/agent2/loops/openai.py
+++ b/libs/python/agent2/agent2/loops/openai.py
--- a/libs/python/agent2/agent2/loops/uitars.py
+++ b/libs/python/agent2/agent2/loops/uitars.py
--- a/libs/python/agent2/agent2/responses.py
+++ b/libs/python/agent2/agent2/responses.py
--- a/libs/python/agent2/agent2/types.py
+++ b/libs/python/agent2/agent2/types.py
@@ -1,5 +1,5 @@
 """
-Type definitions for agent2
+Type definitions for agent
 """

 from typing import Dict, List, Any, Optional, Callable, Protocol, Literal
--- a/libs/python/agent2/agent/ui/init.py
+++ b/libs/python/agent2/agent/ui/init.py
@@ -0,0 +1,7 @@
+"""
+UI components for agent
+"""
+
+from .gradio import test_cua, create_gradio_ui
+
+__all__ = ["test_cua", "create_gradio_ui"]
--- a/libs/python/agent2/agent/ui/gradio/init.py
+++ b/libs/python/agent2/agent/ui/gradio/init.py
@@ -0,0 +1,8 @@
+"""
+Gradio UI for agent
+"""
+
+from .app import test_cua
+from .ui_components import create_gradio_ui
+
+__all__ = ["test_cua", "create_gradio_ui"]
--- a/libs/python/agent2/agent/ui/gradio/app.py
+++ b/libs/python/agent2/agent/ui/gradio/app.py
@@ -0,0 +1,248 @@
+"""
+Advanced Gradio UI for Computer-Use Agent (cua-agent)
+
+This is a Gradio interface for the Computer-Use Agent v0.4.x (cua-agent)
+with an advanced UI for model selection and configuration.
+
+Supported Agent Models:
+- OpenAI: openai/computer-use-preview
+- Anthropic: anthropic/claude-3-5-sonnet-20241022, anthropic/claude-3-7-sonnet-20250219
+- UI-TARS: huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
+- Omniparser: omniparser+anthropic/claude-3-5-sonnet-20241022, omniparser+ollama_chat/gemma3
+
+Requirements:
+    - Mac with Apple Silicon (M1/M2/M3/M4), Linux, or Windows
+    - macOS 14 (Sonoma) or newer / Ubuntu 20.04+
+    - Python 3.11+
+    - Lume CLI installed (https://github.com/trycua/cua)
+    - OpenAI or Anthropic API key
+"""
+
+import os
+import asyncio
+import logging
+import json
+import platform
+from pathlib import Path
+from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple, Union
+import gradio as gr
+from gradio.components.chatbot import MetadataDict
+from typing import cast
+
+# Import from agent package
+from agent import ComputerAgent
+from agent.types import Messages, AgentResponse
+from computer import Computer
+
+# Global variables
+global_agent = None
+global_computer = None
+SETTINGS_FILE = Path(".gradio_settings.json")
+
+
+import dotenv
+if dotenv.load_dotenv():
+    print(f"DEBUG - Loaded environment variables from {dotenv.find_dotenv()}")
+else:
+    print("DEBUG - No .env file found")
+
+# --- Settings Load/Save Functions ---
+def load_settings() -> Dict[str, Any]:
+    """Return the settings stored in ``SETTINGS_FILE``.
+
+    An empty dict is returned when the file does not exist, cannot be read or
+    parsed (a warning is printed), or does not hold a JSON object.
+    """
+    if not SETTINGS_FILE.exists():
+        return {}
+
+    try:
+        with open(SETTINGS_FILE, "r") as handle:
+            loaded = json.load(handle)
+            # Anything other than a JSON object is ignored
+            if isinstance(loaded, dict):
+                print(f"DEBUG - Loaded settings from {SETTINGS_FILE}")
+                return loaded
+    except (json.JSONDecodeError, IOError) as e:
+        print(f"Warning: Could not load settings from {SETTINGS_FILE}: {e}")
+
+    return {}
+
+
+def save_settings(settings: Dict[str, Any]):
+    """Write *settings* to ``SETTINGS_FILE`` as indented JSON.
+
+    The ``"provider_api_key"`` entry is never written to disk. The caller's
+    dict is left unmodified: the key is filtered out of a copy rather than
+    popped from the argument.
+
+    I/O failures are reported with a printed warning instead of being raised.
+    """
+    # Build a filtered copy so the secret is dropped without mutating the input
+    persistable = {
+        key: value for key, value in settings.items() if key != "provider_api_key"
+    }
+    try:
+        with open(SETTINGS_FILE, "w") as f:
+            json.dump(persistable, f, indent=4)
+        print(f"DEBUG - Saved settings to {SETTINGS_FILE}")
+    except IOError as e:
+        print(f"Warning: Could not save settings to {SETTINGS_FILE}: {e}")
+
+
+# Custom Screenshot Handler for Gradio chat
+class GradioChatScreenshotHandler:
+    """Handler that appends each screenshot to a Gradio chat history."""
+
+    def __init__(self, chatbot_history: List[gr.ChatMessage]):
+        """Keep a reference to the history list that screenshots are appended to."""
+        self.chatbot_history = chatbot_history
+        print("GradioChatScreenshotHandler initialized")
+
+    async def on_screenshot(self, screenshot_base64: str, action_type: str = "") -> None:
+        """Append the screenshot as an assistant message holding an inline image.
+
+        Nothing is appended when the history reference is ``None``.
+        """
+        history = self.chatbot_history
+        if history is None:
+            return
+
+        # The image is embedded as a markdown data URI in the message content
+        message = gr.ChatMessage(
+            role="assistant",
+            content=f"![Screenshot after {action_type}](data:image/png;base64,{screenshot_base64})",
+            metadata={"title": f"🖥️ Screenshot - {action_type}", "status": "done"},
+        )
+        history.append(message)
+
+
+# Detect platform capabilities
+is_mac = platform.system().lower() == "darwin"
+is_lume_available = is_mac or (os.environ.get("PYLUME_HOST", "localhost") != "localhost")
+
+print("PYLUME_HOST: ", os.environ.get("PYLUME_HOST", "localhost"))
+print("is_mac: ", is_mac)
+print("Lume available: ", is_lume_available)
+
+# Map model names to agent model strings
+MODEL_MAPPINGS = {
+    "openai": {
+        "default": "openai/computer-use-preview",
+        "OpenAI: Computer-Use Preview": "openai/computer-use-preview",
+    },
+    "anthropic": {
+        "default": "anthropic/claude-3-7-sonnet-20250219",
+        "Anthropic: Claude 4 Opus (20250514)": "anthropic/claude-opus-4-20250514",
+        "Anthropic: Claude 4 Sonnet (20250514)": "anthropic/claude-sonnet-4-20250514",
+        "Anthropic: Claude 3.7 Sonnet (20250219)": "anthropic/claude-3-7-sonnet-20250219",
+        "Anthropic: Claude 3.5 Sonnet (20240620)": "anthropic/claude-3-5-sonnet-20240620",
+    },
+    "omni": {
+        "default": "omniparser+openai/gpt-4o",
+        "OMNI: OpenAI GPT-4o": "omniparser+openai/gpt-4o",
+        "OMNI: OpenAI GPT-4o mini": "omniparser+openai/gpt-4o-mini",
+        "OMNI: Claude 3.7 Sonnet (20250219)": "omniparser+anthropic/claude-3-7-sonnet-20250219",
+        "OMNI: Claude 3.5 Sonnet (20240620)": "omniparser+anthropic/claude-3-5-sonnet-20240620",
+    },
+    "uitars": {
+        "default": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B" if is_mac else "ui-tars",
+        "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
+    },
+}
+
+
+def get_model_string(model_name: str, loop_provider: str) -> str:
+    """Translate a UI model label and loop provider into an agent model string.
+
+    The two "Custom model" labels map to the placeholder strings
+    ``custom_oaicompat`` / ``custom_ollama``. Labels starting with
+    ``"OMNI: Ollama "`` map to ``omniparser+ollama_chat/<model>``, and the
+    ``OMNI-OLLAMA`` provider without such a label falls back to llama3.
+    Everything else is looked up in ``MODEL_MAPPINGS``, using the OpenAI table
+    for unknown providers and each table's ``"default"`` for unknown labels.
+    """
+    placeholders = {
+        "Custom model (OpenAI compatible API)": "custom_oaicompat",
+        "Custom model (ollama)": "custom_ollama",
+    }
+    if model_name in placeholders:
+        return placeholders[model_name]
+
+    ollama_label = "OMNI: Ollama "
+    if model_name.startswith(ollama_label):
+        return "omniparser+ollama_chat/" + model_name[len(ollama_label):]
+    if loop_provider == "OMNI-OLLAMA":
+        return "omniparser+ollama_chat/llama3"
+
+    # Map based on loop provider
+    fallback_table = MODEL_MAPPINGS["openai"]
+    table = MODEL_MAPPINGS.get(loop_provider.lower(), fallback_table)
+    default_model = table["default"]
+    return table.get(model_name, default_model)
+
+
+def get_ollama_models() -> List[str]:
+    """List locally available Ollama models as ``"OMNI: Ollama <name>"`` labels.
+
+    Runs ``ollama list`` and takes the first whitespace-separated field of
+    every line after the header row. Returns an empty list when the command
+    exits non-zero, prints no model rows, or raises (the error is logged).
+    """
+    try:
+        import subprocess
+        completed = subprocess.run(["ollama", "list"], capture_output=True, text=True)
+        if completed.returncode != 0:
+            return []
+        # Skip the header row; blank rows contribute nothing
+        rows = completed.stdout.strip().split("\n")[1:]
+        fields_per_row = (row.split() for row in rows)
+        return [f"OMNI: Ollama {fields[0]}" for fields in fields_per_row if fields]
+    except Exception as e:
+        logging.error(f"Error getting Ollama models: {e}")
+        return []
+
+
+def create_computer_instance(
+    verbosity: int = logging.INFO,
+    os_type: str = "macos",
+    provider_type: str = "lume",
+    name: Optional[str] = None,
+    api_key: Optional[str] = None
+) -> Computer:
+    """Return the module-wide ``Computer``, creating it on first use.
+
+    The arguments are only used when no instance exists yet; once
+    ``global_computer`` is set, it is returned as-is and the arguments of
+    later calls are ignored. A missing ``name`` is passed on as ``""``.
+    """
+    global global_computer
+    if global_computer is not None:
+        return global_computer
+
+    global_computer = Computer(
+        verbosity=verbosity,
+        os_type=os_type,
+        provider_type=provider_type,
+        name=name or "",
+        api_key=api_key
+    )
+    return global_computer
+
+
+def create_agent(
+    model_string: str,
+    save_trajectory: bool = True,
+    only_n_most_recent_images: int = 3,
+    verbosity: int = logging.INFO,
+    custom_model_name: Optional[str] = None,
+    computer_os: str = "macos",
+    computer_provider: str = "lume",
+    computer_name: Optional[str] = None,
+    computer_api_key: Optional[str] = None,
+    max_trajectory_budget: Optional[float] = None,
+) -> ComputerAgent:
+    """Build a ``ComputerAgent``, store it in ``global_agent`` and return it.
+
+    The agent's single tool is the computer obtained from
+    ``create_computer_instance``. The placeholder model strings
+    ``custom_oaicompat`` / ``custom_ollama`` are replaced using
+    ``custom_model_name`` when one is given. Trajectories go to the
+    ``trajectories`` directory when ``save_trajectory`` is true, and a truthy
+    ``max_trajectory_budget`` becomes an error-raising budget.
+    """
+    global global_agent
+
+    # Obtain the computer before anything else
+    computer = create_computer_instance(
+        verbosity=verbosity,
+        os_type=computer_os,
+        provider_type=computer_provider,
+        name=computer_name,
+        api_key=computer_api_key
+    )
+
+    # Resolve custom-model placeholders; left untouched without a custom name
+    if custom_model_name:
+        if model_string == "custom_oaicompat":
+            model_string = custom_model_name
+        elif model_string == "custom_ollama":
+            model_string = f"omniparser+ollama_chat/{custom_model_name}"
+
+    agent_kwargs = dict(
+        model=model_string,
+        tools=[computer],
+        only_n_most_recent_images=only_n_most_recent_images,
+        verbosity=verbosity,
+    )
+    if save_trajectory:
+        agent_kwargs["trajectory_dir"] = "trajectories"
+    if max_trajectory_budget:
+        agent_kwargs["max_trajectory_budget"] = {"max_budget": max_trajectory_budget, "raise_error": True}
+
+    global_agent = ComputerAgent(**agent_kwargs)
+    return global_agent
+
+
+def test_cua():
+    """Build the Gradio UI and launch it locally in the browser (no public share link)."""
+    # Imported here rather than at module top: ui_components imports from this module
+    from agent.ui.gradio.ui_components import create_gradio_ui
+    print("Starting Gradio app for CUA Agent...")
+    create_gradio_ui().launch(share=False, inbrowser=True)
+
+
+if __name__ == "__main__":
+    test_cua()
--- a/libs/python/agent2/agent/ui/gradio/ui_components.py
+++ b/libs/python/agent2/agent/ui/gradio/ui_components.py
@@ -0,0 +1,703 @@
+"""
+UI Components for the Gradio interface
+"""
+
+import os
+import asyncio
+import logging
+import json
+import platform
+from pathlib import Path
+from typing import Dict, List, Optional, Any, cast
+import gradio as gr
+from gradio.components.chatbot import MetadataDict
+
+from .app import (
+    load_settings, save_settings, create_agent, get_model_string, 
+    get_ollama_models, GradioChatScreenshotHandler, global_agent, global_computer
+)
+
+
+def create_gradio_ui() -> gr.Blocks:
+    """Create a Gradio UI for the Computer-Use Agent.
+
+    Builds a two-column ``gr.Blocks`` layout: the left column holds a preview
+    of equivalent Python code plus the computer and agent configuration, the
+    right column holds the chat interface. Initial values come from
+    ``load_settings()`` and from the ``OPENAI_API_KEY``, ``ANTHROPIC_API_KEY``,
+    ``CUA_API_KEY`` and ``CUA_CONTAINER_NAME`` environment variables.
+
+    Returns:
+        The assembled ``gr.Blocks`` app; the caller is responsible for
+        launching it.
+    """
+
+    # Load settings
+    saved_settings = load_settings()
+
+    # Check for API keys
+    openai_api_key = os.environ.get("OPENAI_API_KEY", "")
+    anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
+    cua_api_key = os.environ.get("CUA_API_KEY", "")
+
+    # Model choices
+    openai_models = ["OpenAI: Computer-Use Preview"]
+    anthropic_models = [
+        "Anthropic: Claude 4 Opus (20250514)",
+        "Anthropic: Claude 4 Sonnet (20250514)",
+        "Anthropic: Claude 3.7 Sonnet (20250219)",
+        "Anthropic: Claude 3.5 Sonnet (20240620)",
+    ]
+    omni_models = [
+        "OMNI: OpenAI GPT-4o",
+        "OMNI: OpenAI GPT-4o mini",
+        "OMNI: Claude 3.7 Sonnet (20250219)",
+        "OMNI: Claude 3.5 Sonnet (20240620)"
+    ]
+
+    # Check if API keys are available
+    has_openai_key = bool(openai_api_key)
+    has_anthropic_key = bool(anthropic_api_key)
+    has_cua_key = bool(cua_api_key)
+
+    # Get Ollama models for OMNI
+    ollama_models = get_ollama_models()
+    if ollama_models:
+        omni_models += ollama_models
+
+    # Detect platform
+    is_mac = platform.system().lower() == "darwin"
+    is_windows = platform.system().lower() == "windows"
+
+    # Format model choices
+    provider_to_models = {
+        "OPENAI": openai_models,
+        "ANTHROPIC": anthropic_models,
+        "OMNI": omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
+        "UITARS": ([
+            "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
+        ] if is_mac else []) + ["Custom model (OpenAI compatible API)"],
+    }
+
+    # Apply saved settings
+    initial_loop = saved_settings.get("agent_loop", "OMNI")
+    available_models_for_loop = provider_to_models.get(initial_loop, [])
+    saved_model_choice = saved_settings.get("model_choice")
+    if saved_model_choice and saved_model_choice in available_models_for_loop:
+        initial_model = saved_model_choice
+    elif available_models_for_loop:
+        # Fall back to the first model of the *selected* loop, so a saved
+        # UITARS loop no longer starts with an OMNI model.
+        initial_model = available_models_for_loop[0]
+    else:
+        initial_model = "No models available"
+
+    initial_custom_model = saved_settings.get("custom_model", "Qwen2.5-VL-7B-Instruct")
+    initial_provider_base_url = saved_settings.get("provider_base_url", "http://localhost:1234/v1")
+    initial_save_trajectory = saved_settings.get("save_trajectory", True)
+    initial_recent_images = saved_settings.get("recent_images", 3)
+
+    def initial_choice_for(loop_name, models, fallback):
+        """Return a dropdown's starting value.
+
+        The restored ``initial_model`` is used for the dropdown of the active
+        loop so that it agrees with the custom-model fields (whose visibility
+        is derived from ``initial_model``); every other dropdown starts on its
+        first entry, or ``fallback`` when it has no entries.
+        """
+        if loop_name == initial_loop and initial_model in models:
+            return initial_model
+        return models[0] if models else fallback
+
+    # Example prompts
+    example_messages = [
+        "Create a Python virtual environment, install pandas and matplotlib, then plot stock data",
+        "Open a PDF in Preview, add annotations, and save it as a compressed version",
+        "Open Safari, search for 'macOS automation tools', and save the first three results as bookmarks",
+        "Configure SSH keys and set up a connection to a remote server",
+    ]
+
+    def generate_python_code(agent_loop_choice, model_name, tasks, recent_images=3, save_trajectory=True, computer_os="linux", computer_provider="cloud", container_name="", cua_cloud_api_key="", max_budget=None):
+        """Generate Python code for the current configuration and tasks.
+
+        Args:
+            agent_loop_choice: Selected agent loop, passed to ``get_model_string``.
+            model_name: Selected model label, passed to ``get_model_string``.
+            tasks: Iterable of task prompts; blank and non-string entries are skipped.
+            recent_images: Value emitted for ``only_n_most_recent_images``.
+            save_trajectory: When true, a ``trajectory_dir`` argument is emitted.
+            computer_os: Emitted as ``os_type`` unless it is ``"macos"``.
+            computer_provider: Emitted as ``provider_type`` unless it is ``"lume"``.
+            container_name: Emitted as ``name`` when non-empty.
+            cua_cloud_api_key: Emitted as ``api_key`` when non-empty.
+            max_budget: When truthy, a ``max_trajectory_budget`` argument is emitted.
+
+        Returns:
+            The source code of a standalone script as a string.
+        """
+
+        def as_literal(text):
+            # json.dumps yields a double-quoted literal with quotes, backslashes
+            # and control characters escaped, which is also a valid Python
+            # string literal; ensure_ascii=False keeps non-ASCII text as typed.
+            return json.dumps(text, ensure_ascii=False)
+
+        # Previously tasks were pasted between raw double quotes, so a prompt
+        # containing a quote or newline produced invalid code.
+        tasks_str = "\n".join(
+            f"            {as_literal(task)},"
+            for task in tasks
+            if isinstance(task, str) and task.strip()
+        )
+
+        model_string = get_model_string(model_name, agent_loop_choice)
+
+        # Only arguments that differ from the "macos" / "lume" values are emitted.
+        computer_args = []
+        if computer_os != "macos":
+            computer_args.append(f"os_type={as_literal(computer_os)}")
+        if computer_provider != "lume":
+            computer_args.append(f"provider_type={as_literal(computer_provider)}")
+        if container_name:
+            computer_args.append(f"name={as_literal(container_name)}")
+        if cua_cloud_api_key:
+            computer_args.append(f"api_key={as_literal(cua_cloud_api_key)}")
+
+        computer_args_str = f"({', '.join(computer_args)})"
+
+        code = f'''import asyncio
+from computer import Computer
+from agent import ComputerAgent
+
+async def main():
+    async with Computer{computer_args_str} as computer:
+        agent = ComputerAgent(
+            model="{model_string}",
+            tools=[computer],
+            only_n_most_recent_images={recent_images},'''
+
+        if save_trajectory:
+            code += '''
+            trajectory_dir="trajectories",'''
+
+        if max_budget:
+            code += f'''
+            max_trajectory_budget={{"max_budget": {max_budget}, "raise_error": True}},'''
+
+        code += '''
+        )
+        '''
+
+        if tasks_str:
+            code += f'''
+        # Prompts for the computer-use agent
+        tasks = [
+{tasks_str}
+        ]
+
+        for task in tasks:
+            print(f"Executing task: {{task}}")
+            messages = [{{"role": "user", "content": task}}]
+            async for result in agent.run(messages):
+                for item in result["output"]:
+                    if item["type"] == "message":
+                        print(item["content"][0]["text"])'''
+        else:
+            code += f'''
+        # Execute a single task
+        task = "Search for information about CUA on GitHub"
+        print(f"Executing task: {{task}}")
+        messages = [{{"role": "user", "content": task}}]
+        async for result in agent.run(messages):
+            for item in result["output"]:
+                if item["type"] == "message":
+                    print(item["content"][0]["text"])'''
+
+        code += '''
+
+if __name__ == "__main__":
+    asyncio.run(main())'''
+
+        return code
+
+    # Create the Gradio interface
+    with gr.Blocks(title="Computer-Use Agent") as demo:
+        with gr.Row():
+            # Left column for settings
+            with gr.Column(scale=1):
+                # Logo
+                gr.HTML(
+                    """
+                    <div style="display: flex; justify-content: center; margin-bottom: 0.5em">
+                        <img alt="CUA Logo" style="width: 80px;"
+                             src="https://github.com/trycua/cua/blob/main/img/logo_black.png?raw=true" />
+                    </div>
+                    """
+                )
+
+                # Python code accordion
+                with gr.Accordion("Python Code", open=False):
+                    code_display = gr.Code(
+                        language="python",
+                        value=generate_python_code(initial_loop, "gpt-4o", []),
+                        interactive=False,
+                    )
+
+                with gr.Accordion("Computer Configuration", open=True):
+                    computer_os = gr.Radio(
+                        choices=["macos", "linux", "windows"],
+                        label="Operating System",
+                        value="macos",
+                        info="Select the operating system for the computer",
+                    )
+
+                    # "cloud" is always offered; local providers depend on the host OS.
+                    providers = ["cloud"]
+                    if is_mac:
+                        providers += ["lume"]
+                    if is_windows:
+                        providers += ["winsandbox"]
+
+                    computer_provider = gr.Radio(
+                        choices=providers,
+                        label="Provider",
+                        value="lume" if is_mac else "cloud",
+                        info="Select the computer provider",
+                    )
+
+                    container_name = gr.Textbox(
+                        label="Container Name",
+                        placeholder="Enter container name (optional)",
+                        value=os.environ.get("CUA_CONTAINER_NAME", ""),
+                        info="Optional name for the container",
+                    )
+
+                    # Hidden when the key is already provided through the environment.
+                    cua_cloud_api_key = gr.Textbox(
+                        label="CUA Cloud API Key",
+                        placeholder="Enter your CUA Cloud API key",
+                        value=os.environ.get("CUA_API_KEY", ""),
+                        type="password",
+                        info="Required for cloud provider",
+                        visible=(not has_cua_key)
+                    )
+
+                with gr.Accordion("Agent Configuration", open=True):
+                    agent_loop = gr.Dropdown(
+                        choices=["OPENAI", "ANTHROPIC", "OMNI", "UITARS"],
+                        label="Agent Loop",
+                        value=initial_loop,
+                        info="Select the agent loop provider",
+                    )
+
+                    # Model selection dropdowns: one per loop, only the active one is visible.
+                    with gr.Group() as model_selection_group:
+                        openai_model_choice = gr.Dropdown(
+                            choices=openai_models,
+                            label="OpenAI Model",
+                            value=initial_choice_for("OPENAI", openai_models, "No models available"),
+                            info="Select OpenAI model",
+                            interactive=True,
+                            visible=(initial_loop == "OPENAI")
+                        )
+
+                        anthropic_model_choice = gr.Dropdown(
+                            choices=anthropic_models,
+                            label="Anthropic Model",
+                            value=initial_choice_for("ANTHROPIC", anthropic_models, "No models available"),
+                            info="Select Anthropic model",
+                            interactive=True,
+                            visible=(initial_loop == "ANTHROPIC")
+                        )
+
+                        omni_model_choice = gr.Dropdown(
+                            choices=provider_to_models["OMNI"],
+                            label="OMNI Model",
+                            value=initial_choice_for("OMNI", provider_to_models["OMNI"], "Custom model (OpenAI compatible API)"),
+                            info="Select OMNI model or choose a custom model option",
+                            interactive=True,
+                            visible=(initial_loop == "OMNI")
+                        )
+
+                        uitars_model_choice = gr.Dropdown(
+                            choices=provider_to_models.get("UITARS", ["No models available"]),
+                            label="UITARS Model",
+                            value=initial_choice_for("UITARS", provider_to_models.get("UITARS", []), "No models available"),
+                            info="Select UITARS model",
+                            interactive=True,
+                            visible=(initial_loop == "UITARS")
+                        )
+
+                        # Hidden mirror of the active dropdown's value; it feeds the code preview.
+                        model_choice = gr.Textbox(visible=False)
+
+                    # API key inputs
+                    with gr.Group(visible=not has_openai_key and (initial_loop == "OPENAI" or initial_loop == "OMNI")) as openai_key_group:
+                        openai_api_key_input = gr.Textbox(
+                            label="OpenAI API Key",
+                            placeholder="Enter your OpenAI API key",
+                            value=os.environ.get("OPENAI_API_KEY", ""),
+                            interactive=True,
+                            type="password",
+                            info="Required for OpenAI models"
+                        )
+
+                    with gr.Group(visible=not has_anthropic_key and (initial_loop == "ANTHROPIC" or initial_loop == "OMNI")) as anthropic_key_group:
+                        anthropic_api_key_input = gr.Textbox(
+                            label="Anthropic API Key",
+                            placeholder="Enter your Anthropic API key",
+                            value=os.environ.get("ANTHROPIC_API_KEY", ""),
+                            interactive=True,
+                            type="password",
+                            info="Required for Anthropic models"
+                        )
+
+                    # API key handlers
+                    def set_openai_api_key(key):
+                        """Export a non-blank key as ``OPENAI_API_KEY`` and echo the input back."""
+                        if key and key.strip():
+                            os.environ["OPENAI_API_KEY"] = key.strip()
+                            print("DEBUG - Set OpenAI API key environment variable")
+                        return key
+
+                    def set_anthropic_api_key(key):
+                        """Export a non-blank key as ``ANTHROPIC_API_KEY`` and echo the input back."""
+                        if key and key.strip():
+                            os.environ["ANTHROPIC_API_KEY"] = key.strip()
+                            print("DEBUG - Set Anthropic API key environment variable")
+                        return key
+
+                    openai_api_key_input.change(
+                        fn=set_openai_api_key,
+                        inputs=[openai_api_key_input],
+                        outputs=[openai_api_key_input],
+                        queue=False
+                    )
+
+                    anthropic_api_key_input.change(
+                        fn=set_anthropic_api_key,
+                        inputs=[anthropic_api_key_input],
+                        outputs=[anthropic_api_key_input],
+                        queue=False
+                    )
+
+                    # UI update function
+                    def update_ui(loop=None, openai_model=None, anthropic_model=None, omni_model=None, uitars_model=None):
+                        """Recompute component visibility after a loop or model change.
+
+                        Returns a list of ``gr.update`` objects whose order must match
+                        the ``outputs`` list wired up below: the four model dropdowns,
+                        the two API key groups, the three custom-model fields and
+                        finally the hidden ``model_choice`` value.
+                        """
+                        loop = loop or agent_loop.value
+
+                        # Only the dropdown belonging to the active loop counts.
+                        model_value = None
+                        if loop == "OPENAI" and openai_model:
+                            model_value = openai_model
+                        elif loop == "ANTHROPIC" and anthropic_model:
+                            model_value = anthropic_model
+                        elif loop == "OMNI" and omni_model:
+                            model_value = omni_model
+                        elif loop == "UITARS" and uitars_model:
+                            model_value = uitars_model
+
+                        openai_visible = (loop == "OPENAI")
+                        anthropic_visible = (loop == "ANTHROPIC")
+                        omni_visible = (loop == "OMNI")
+                        uitars_visible = (loop == "UITARS")
+
+                        # A key field is shown only when the key is missing from the
+                        # environment and the selected model needs that provider.
+                        show_openai_key = not has_openai_key and (loop == "OPENAI" or (loop == "OMNI" and model_value and "OpenAI" in model_value and "Custom" not in model_value))
+                        show_anthropic_key = not has_anthropic_key and (loop == "ANTHROPIC" or (loop == "OMNI" and model_value and "Claude" in model_value and "Custom" not in model_value))
+
+                        is_custom_openai_api = model_value == "Custom model (OpenAI compatible API)"
+                        is_custom_ollama = model_value == "Custom model (ollama)"
+                        is_any_custom = is_custom_openai_api or is_custom_ollama
+
+                        model_choice_value = model_value if model_value else ""
+
+                        return [
+                            gr.update(visible=openai_visible),
+                            gr.update(visible=anthropic_visible),
+                            gr.update(visible=omni_visible),
+                            gr.update(visible=uitars_visible),
+                            gr.update(visible=show_openai_key),
+                            gr.update(visible=show_anthropic_key),
+                            gr.update(visible=is_any_custom),
+                            gr.update(visible=is_custom_openai_api),
+                            gr.update(visible=is_custom_openai_api),
+                            gr.update(value=model_choice_value)
+                        ]
+
+                    # Custom model inputs
+                    custom_model = gr.Textbox(
+                        label="Custom Model Name",
+                        placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct or llama3)",
+                        value=initial_custom_model,
+                        visible=(initial_model == "Custom model (OpenAI compatible API)" or initial_model == "Custom model (ollama)"),
+                        interactive=True,
+                    )
+
+                    provider_base_url = gr.Textbox(
+                        label="Provider Base URL",
+                        placeholder="Enter provider base URL (e.g., http://localhost:1234/v1)",
+                        value=initial_provider_base_url,
+                        visible=(initial_model == "Custom model (OpenAI compatible API)"),
+                        interactive=True,
+                    )
+
+                    provider_api_key = gr.Textbox(
+                        label="Provider API Key",
+                        placeholder="Enter provider API key (if required)",
+                        value="",
+                        visible=(initial_model == "Custom model (OpenAI compatible API)"),
+                        interactive=True,
+                        type="password",
+                    )
+
+                    # Connect UI update events
+                    for dropdown in [agent_loop, omni_model_choice, uitars_model_choice, openai_model_choice, anthropic_model_choice]:
+                        dropdown.change(
+                            fn=update_ui,
+                            inputs=[agent_loop, openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice],
+                            outputs=[
+                                openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice,
+                                openai_key_group, anthropic_key_group,
+                                custom_model, provider_base_url, provider_api_key,
+                                model_choice
+                            ],
+                            queue=False
+                        )
+
+                    save_trajectory = gr.Checkbox(
+                        label="Save Trajectory",
+                        value=initial_save_trajectory,
+                        info="Save the agent's trajectory for debugging",
+                        interactive=True,
+                    )
+
+                    recent_images = gr.Slider(
+                        label="Recent Images",
+                        minimum=1,
+                        maximum=10,
+                        value=initial_recent_images,
+                        step=1,
+                        info="Number of recent images to keep in context",
+                        interactive=True,
+                    )
+
+                    max_budget = gr.Number(
+                        label="Max Budget ($)",
+                        value=lambda: None,
+                        minimum=-1,
+                        maximum=100.0,
+                        step=0.1,
+                        info="Optional budget limit for trajectory (0 = no limit)",
+                        interactive=True,
+                    )
+
+            # Right column for chat interface
+            with gr.Column(scale=2):
+                gr.Markdown(
+                    "Ask me to perform tasks in a virtual environment.<br>Built with <a href='https://github.com/trycua/cua' target='_blank'>github.com/trycua/cua</a>."
+                )
+
+                chatbot_history = gr.Chatbot(type="messages")
+                msg = gr.Textbox(
+                    placeholder="Ask me to perform tasks in a virtual environment"
+                )
+                clear = gr.Button("Clear")
+                cancel_button = gr.Button("Cancel", variant="stop")
+
+                # Add examples
+                example_group = gr.Examples(examples=example_messages, inputs=msg)
+
+                # Chat submission function
+                def chat_submit(message, history):
+                    """Append the user's message to the history and clear the input box."""
+                    history.append(gr.ChatMessage(role="user", content=message))
+                    return "", history
+
+                # Cancel function
+                async def cancel_agent_task(history):
+                    """Append a status message saying whether an agent existed to cancel.
+
+                    Stopping the streaming run itself is done by the ``cancels``
+                    argument on the Cancel button's click event below.
+                    """
+                    global global_agent
+                    if global_agent:
+                        print("DEBUG - Cancelling agent task")
+                        history.append(gr.ChatMessage(role="assistant", content="Task cancelled by user", metadata={"title": "❌ Cancelled"}))
+                    else:
+                        history.append(gr.ChatMessage(role="assistant", content="No active agent task to cancel", metadata={"title": "ℹ️ Info"}))
+                    return history
+
+                # Process response function
+                async def process_response(
+                    history,
+                    openai_model_value,
+                    anthropic_model_value,
+                    omni_model_value,
+                    uitars_model_value,
+                    custom_model_value,
+                    agent_loop_choice,
+                    save_traj,
+                    recent_imgs,
+                    custom_url_value=None,
+                    custom_api_key=None,
+                    openai_key_input=None,
+                    anthropic_key_input=None,
+                    computer_os="linux",
+                    computer_provider="cloud",
+                    container_name="",
+                    cua_cloud_api_key="",
+                    max_budget_value=None,
+                ):
+                    """Run the agent on the latest user message and stream chat updates.
+
+                    Saves the current settings, creates a fresh agent through
+                    ``create_agent`` and yields the growing ``history`` after each
+                    result produced by ``agent.run``. Any exception is printed and
+                    reported in the chat as an ``Error: ...`` message.
+                    """
+                    # Without this declaration the assignment below would bind a
+                    # local name and cancel_agent_task would never see the agent.
+                    global global_agent
+                    from pprint import pprint
+
+                    if not history:
+                        yield history
+                        return
+
+                    # Get the last user message
+                    last_user_message = history[-1]["content"]
+
+                    # Get the appropriate model value based on the agent loop
+                    if agent_loop_choice == "OPENAI":
+                        model_choice_value = openai_model_value
+                    elif agent_loop_choice == "ANTHROPIC":
+                        model_choice_value = anthropic_model_value
+                    elif agent_loop_choice == "OMNI":
+                        model_choice_value = omni_model_value
+                    elif agent_loop_choice == "UITARS":
+                        model_choice_value = uitars_model_value
+                    else:
+                        model_choice_value = "No models available"
+
+                    # Determine if this is a custom model selection
+                    is_custom_model_selected = model_choice_value in ["Custom model (OpenAI compatible API)", "Custom model (ollama)"]
+
+                    # Determine the model name string to analyze
+                    if is_custom_model_selected:
+                        model_string_to_analyze = custom_model_value
+                    else:
+                        model_string_to_analyze = model_choice_value
+
+                    try:
+                        # Get the model string
+                        model_string = get_model_string(model_string_to_analyze, agent_loop_choice)
+
+                        # Set API keys if provided
+                        if openai_key_input:
+                            os.environ["OPENAI_API_KEY"] = openai_key_input
+                        if anthropic_key_input:
+                            os.environ["ANTHROPIC_API_KEY"] = anthropic_key_input
+                        if cua_cloud_api_key:
+                            os.environ["CUA_API_KEY"] = cua_cloud_api_key
+
+                        # Save settings (API keys are deliberately not part of this dict)
+                        current_settings = {
+                            "agent_loop": agent_loop_choice,
+                            "model_choice": model_choice_value,
+                            "custom_model": custom_model_value,
+                            "provider_base_url": custom_url_value,
+                            "save_trajectory": save_traj,
+                            "recent_images": recent_imgs,
+                            "computer_os": computer_os,
+                            "computer_provider": computer_provider,
+                            "container_name": container_name,
+                        }
+                        save_settings(current_settings)
+
+                        # Create agent; a missing, zero or negative budget means "no limit".
+                        global_agent = create_agent(
+                            model_string=model_string,
+                            save_trajectory=save_traj,
+                            only_n_most_recent_images=recent_imgs,
+                            custom_model_name=custom_model_value if is_custom_model_selected else None,
+                            computer_os=computer_os,
+                            computer_provider=computer_provider,
+                            computer_name=container_name,
+                            computer_api_key=cua_cloud_api_key,
+                            verbosity=logging.DEBUG,
+                            max_trajectory_budget=max_budget_value if max_budget_value and max_budget_value > 0 else None,
+                        )
+
+                        if global_agent is None:
+                            history.append(
+                                gr.ChatMessage(
+                                    role="assistant",
+                                    content="Failed to create agent. Check API keys and configuration.",
+                                )
+                            )
+                            yield history
+                            return
+
+                        # Create message list for agent
+                        messages = [{"role": "user", "content": last_user_message}]
+
+                        # Stream responses from the agent
+                        async for result in global_agent.run(messages):
+                            print("DEBUG - Agent response ------- START")
+                            pprint(result)
+                            print("DEBUG - Agent response ------- END")
+
+                            # Process the result output
+                            for item in result.get("output", []):
+                                if item.get("type") == "message":
+                                    content = item.get("content", [])
+                                    for content_part in content:
+                                        if content_part.get("text"):
+                                            history.append(gr.ChatMessage(
+                                                role=item.get("role", "assistant"),
+                                                content=content_part.get("text", ""),
+                                                metadata=content_part.get("metadata", {})
+                                            ))
+                                elif item.get("type") == "computer_call":
+                                    action = item.get("action", {})
+                                    action_type = action.get("type", "")
+                                    if action_type:
+                                        action_title = f"🛠️ Performing {action_type}"
+                                        # Compare against None: 0 is a valid coordinate and
+                                        # must not be dropped by a truthiness test.
+                                        if action.get("x") is not None and action.get("y") is not None:
+                                            action_title += f" at ({action['x']}, {action['y']})"
+                                        history.append(gr.ChatMessage(
+                                            role="assistant",
+                                            content=f"```json\n{json.dumps(action)}\n```",
+                                            metadata={"title": action_title}
+                                        ))
+                                elif item.get("type") == "function_call":
+                                    function_name = item.get("name", "")
+                                    arguments = item.get("arguments", "{}")
+                                    history.append(gr.ChatMessage(
+                                        role="assistant",
+                                        content=f"🔧 Calling function: {function_name}\n```json\n{arguments}\n```",
+                                        metadata={"title": f"Function Call: {function_name}"}
+                                    ))
+                                elif item.get("type") == "function_call_output":
+                                    output = item.get("output", "")
+                                    history.append(gr.ChatMessage(
+                                        role="assistant",
+                                        content=f"📤 Function output:\n```\n{output}\n```",
+                                        metadata={"title": "Function Output"}
+                                    ))
+
+                            yield history
+
+                    except Exception as e:
+                        # UI boundary: report the failure in the chat instead of crashing the app.
+                        import traceback
+                        traceback.print_exc()
+                        history.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}"))
+                        yield history
+
+                # Connect the submit button; the input order must match process_response's parameters.
+                submit_event = msg.submit(
+                    fn=chat_submit,
+                    inputs=[msg, chatbot_history],
+                    outputs=[msg, chatbot_history],
+                    queue=False,
+                ).then(
+                    fn=process_response,
+                    inputs=[
+                        chatbot_history,
+                        openai_model_choice,
+                        anthropic_model_choice,
+                        omni_model_choice,
+                        uitars_model_choice,
+                        custom_model,
+                        agent_loop,
+                        save_trajectory,
+                        recent_images,
+                        provider_base_url,
+                        provider_api_key,
+                        openai_api_key_input,
+                        anthropic_api_key_input,
+                        computer_os,
+                        computer_provider,
+                        container_name,
+                        cua_cloud_api_key,
+                        max_budget,
+                    ],
+                    outputs=[chatbot_history],
+                    queue=True,
+                )
+
+                # Clear button functionality
+                clear.click(lambda: None, None, chatbot_history, queue=False)
+
+                # Connect cancel button; `cancels` stops the streaming
+                # process_response run, which the handler alone never did.
+                cancel_button.click(
+                    cancel_agent_task,
+                    [chatbot_history],
+                    [chatbot_history],
+                    cancels=[submit_event],
+                    queue=False
+                )
+
+                # Code display update function
+                def update_code_display(agent_loop, model_choice_val, custom_model_val, chat_history, recent_images_val, save_trajectory_val, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget_val):
+                    """Regenerate the code preview from the current configuration and user prompts."""
+                    # Only user turns stored as dicts become tasks in the generated script.
+                    messages = []
+                    if chat_history:
+                        for entry in chat_history:
+                            if isinstance(entry, dict) and entry.get("role") == "user":
+                                messages.append(entry.get("content", ""))
+
+                    return generate_python_code(
+                        agent_loop,
+                        model_choice_val or custom_model_val or "gpt-4o",
+                        messages,
+                        recent_images_val,
+                        save_trajectory_val,
+                        computer_os,
+                        computer_provider,
+                        container_name,
+                        cua_cloud_api_key,
+                        max_budget_val
+                    )
+
+                # Update code display when configuration changes
+                for component in [agent_loop, model_choice, custom_model, chatbot_history, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget]:
+                    component.change(
+                        update_code_display,
+                        inputs=[agent_loop, model_choice, custom_model, chatbot_history, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget],
+                        outputs=[code_display]
+                    )
+
+    return demo
--- a/libs/python/agent2/example.py
+++ b/libs/python/agent2/example.py
@@ -1,12 +1,12 @@
 """
-Example usage of the agent2 library with docstring-based tool definitions.
+Example usage of the agent library with docstring-based tool definitions.
 """

 import asyncio
 import logging

-from agent2 import agent_loop, ComputerAgent
-from agent2.types import Messages
+from agent import agent_loop, ComputerAgent
+from agent.types import Messages
 from computer import Computer
 from computer.helpers import sandboxed

--- a/libs/python/agent2/pyproject.toml
+++ b/libs/python/agent2/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "pdm.backend"

 [project]
 name = "cua-agent"
-version = "0.4.0"
+version = "0.4.0b1"
 description = "CUA (Computer Use) Agent for AI-driven computer interaction"
 readme = "README.md"
 authors = [
@@ -44,6 +44,9 @@ ui = [
    "gradio>=5.23.3",
    "python-dotenv>=1.0.1",
 ]
+cli = [
+    "yaspin>=3.1.0",
+]
 all = [
    # omni requirements
    "ultralytics>=8.0.0",
@@ -54,6 +57,8 @@ all = [
    # ui requirements
    "gradio>=5.23.3",
    "python-dotenv>=1.0.1",
+    # cli requirements
+    "yaspin>=3.1.0",
 ]

 [tool.uv]
@@ -63,4 +68,4 @@ constraint-dependencies = ["fastrtc>0.43.0", "mlx-audio>0.2.3"]
 distribution = true

 [tool.pdm.build]
-includes = ["agent2/"]
+includes = ["agent/"]