""" CLI chat interface for agent - Computer Use Agent Usage: python -m agent.cli Examples: python -m agent.cli openai/computer-use-preview python -m agent.cli anthropic/claude-3-5-sonnet-20241022 python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022 """ try: import asyncio import argparse import os import sys import json from typing import List, Dict, Any import dotenv from yaspin import yaspin except ImportError: if __name__ == "__main__": raise ImportError( "CLI dependencies not found. " "Please install with: pip install \"cua-agent[cli]\"" ) # Load environment variables dotenv.load_dotenv() # Color codes for terminal output class Colors: RESET = '\033[0m' BOLD = '\033[1m' DIM = '\033[2m' # Text colors RED = '\033[31m' GREEN = '\033[32m' YELLOW = '\033[33m' BLUE = '\033[34m' MAGENTA = '\033[35m' CYAN = '\033[36m' WHITE = '\033[37m' GRAY = '\033[90m' # Background colors BG_RED = '\033[41m' BG_GREEN = '\033[42m' BG_YELLOW = '\033[43m' BG_BLUE = '\033[44m' def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = False, end: str = "\n"): """Print colored text to terminal.""" prefix = "" if bold: prefix += Colors.BOLD if dim: prefix += Colors.DIM if color: prefix += color print(f"{prefix}{text}{Colors.RESET}", end=end) def print_action(action_type: str, details: Dict[str, Any]): """Print computer action with nice formatting.""" # Format action details args_str = "" if action_type == "click" and "x" in details and "y" in details: args_str = f"({details['x']}, {details['y']})" elif action_type == "type" and "text" in details: text = details["text"] if len(text) > 50: text = text[:47] + "..." args_str = f'"{text}"' elif action_type == "key" and "key" in details: args_str = f"'{details['key']}'" elif action_type == "scroll" and "x" in details and "y" in details: args_str = f"({details['x']}, {details['y']})" print_colored(f"šŸ› ļø {action_type}{args_str}", dim=True) def print_welcome(model: str, agent_loop: str, container_name: str): """Print welcome message.""" print_colored(f"Connected to {container_name} ({model}, {agent_loop})") print_colored("Type 'exit' to quit.", dim=True) async def ainput(prompt: str = ""): return await asyncio.to_thread(input, prompt) async def chat_loop(agent, model: str, container_name: str): """Main chat loop with the agent.""" print_welcome(model, agent.agent_loop.__name__, container_name) history = [] while True: # Get user input with prompt print_colored("> ", end="") user_input = await ainput() if user_input.lower() in ['exit', 'quit', 'q']: print_colored("\nšŸ‘‹ Goodbye!") break if not user_input: continue # Add user message to history history.append({"role": "user", "content": user_input}) # Stream responses from the agent with spinner with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner: spinner.hide() async for result in agent.run(history): # Add agent responses to history history.extend(result.get("output", [])) # Process and display the output for item in result.get("output", []): if item.get("type") == "message": # Display agent text response content = item.get("content", []) for content_part in content: if content_part.get("text"): text = content_part.get("text", "").strip() if text: spinner.hide() print_colored(text) elif item.get("type") == "computer_call": # Display computer action action = item.get("action", {}) action_type = action.get("type", "") if action_type: spinner.hide() print_action(action_type, action) spinner.text = f"Performing {action_type}..." spinner.show() elif item.get("type") == "function_call": # Display function call function_name = item.get("name", "") spinner.hide() print_colored(f"šŸ”§ Calling function: {function_name}", dim=True) spinner.text = f"Calling {function_name}..." spinner.show() elif item.get("type") == "function_call_output": # Display function output (dimmed) output = item.get("output", "") if output and len(output.strip()) > 0: spinner.hide() print_colored(f"šŸ“¤ {output}", dim=True) spinner.hide() async def main(): """Main CLI function.""" parser = argparse.ArgumentParser( description="CUA Agent CLI - Interactive computer use assistant", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: python -m agent.cli openai/computer-use-preview python -m agent.cli anthropic/claude-3-5-sonnet-20241022 python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022 python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B """ ) parser.add_argument( "model", help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')" ) parser.add_argument( "--images", type=int, default=3, help="Number of recent images to keep in context (default: 3)" ) parser.add_argument( "--trajectory", action="store_true", help="Save trajectory for debugging" ) parser.add_argument( "--budget", type=float, help="Maximum budget for the session (in dollars)" ) parser.add_argument( "--verbose", action="store_true", help="Enable verbose logging" ) args = parser.parse_args() # Check for required environment variables container_name = os.getenv("CUA_CONTAINER_NAME") cua_api_key = os.getenv("CUA_API_KEY") # Prompt for missing environment variables if not container_name: print_colored("CUA_CONTAINER_NAME not set.", dim=True) print_colored("You can get a CUA container at https://www.trycua.com/", dim=True) container_name = input("Enter your CUA container name: ").strip() if not container_name: print_colored("āŒ Container name is required.") sys.exit(1) if not cua_api_key: print_colored("CUA_API_KEY not set.", dim=True) cua_api_key = input("Enter your CUA API key: ").strip() if not cua_api_key: print_colored("āŒ API key is required.") sys.exit(1) # Check for provider-specific API keys based on model provider_api_keys = { "openai/": "OPENAI_API_KEY", "anthropic/": "ANTHROPIC_API_KEY", "omniparser+": "OPENAI_API_KEY", "omniparser+": "ANTHROPIC_API_KEY", } # Find matching provider and check for API key for prefix, env_var in provider_api_keys.items(): if args.model.startswith(prefix): if not os.getenv(env_var): print_colored(f"{env_var} not set.", dim=True) api_key = input(f"Enter your {env_var.replace('_', ' ').title()}: ").strip() if not api_key: print_colored(f"āŒ {env_var.replace('_', ' ').title()} is required.") sys.exit(1) # Set the environment variable for the session os.environ[env_var] = api_key break # Import here to avoid import errors if dependencies are missing try: from agent import ComputerAgent from computer import Computer except ImportError as e: print_colored(f"āŒ Import error: {e}", Colors.RED, bold=True) print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW) sys.exit(1) # Create computer instance async with Computer( os_type="linux", provider_type="cloud", name=container_name, api_key=cua_api_key ) as computer: # Create agent agent_kwargs = { "model": args.model, "tools": [computer], "only_n_most_recent_images": args.images, "verbosity": 20 if args.verbose else 30, # DEBUG vs WARNING } if args.trajectory: agent_kwargs["trajectory_dir"] = "trajectories" if args.budget: agent_kwargs["max_trajectory_budget"] = { "max_budget": args.budget, "raise_error": True, "reset_after_each_run": False } agent = ComputerAgent(**agent_kwargs) # Start chat loop await chat_loop(agent, args.model, container_name) if __name__ == "__main__": try: asyncio.run(main()) except (KeyboardInterrupt, EOFError) as _: print_colored("\n\nšŸ‘‹ Goodbye!")