From 6db6825516cd41974d33bed476299a8b72d94f65 Mon Sep 17 00:00:00 2001
From: Dillon DuPont
Date: Tue, 29 Jul 2025 08:27:10 -0400
Subject: [PATCH] Updated example, cua-core dep, and added --prompt

---
 examples/agent_examples.py       | 113 ++++++++++++++++---------------
 libs/python/agent/agent/cli.py   |  42 ++++++++----
 libs/python/agent/pyproject.toml |   2 +-
 3 files changed, 85 insertions(+), 72 deletions(-)

diff --git a/examples/agent_examples.py b/examples/agent_examples.py
index 62573077..d6a565bf 100644
--- a/examples/agent_examples.py
+++ b/examples/agent_examples.py
@@ -8,7 +8,7 @@ import signal
 from computer import Computer, VMProviderType
 
 # Import the unified agent class and types
-from agent import ComputerAgent, LLMProvider, LLM, AgentLoop
+from agent import ComputerAgent
 
 # Import utility functions
 from utils import load_dotenv_files, handle_sigint
@@ -19,8 +19,8 @@ logger = logging.getLogger(__name__)
 
 async def run_agent_example():
-    """Run example of using the ComputerAgent with OpenAI and Omni provider."""
-    print("\n=== Example: ComputerAgent with OpenAI and Omni provider ===")
+    """Run example of using the ComputerAgent with different models."""
+    print("\n=== Example: ComputerAgent with different models ===")
 
     try:
         # Create a local macOS computer
@@ -37,28 +37,37 @@ async def run_agent_example():
         #     provider_type=VMProviderType.CLOUD,
         # )
 
-        # Create Computer instance with async context manager
+        # Create ComputerAgent with new API
         agent = ComputerAgent(
-            computer=computer,
-            loop=AgentLoop.OPENAI,
-            # loop=AgentLoop.ANTHROPIC,
-            # loop=AgentLoop.UITARS,
-            # loop=AgentLoop.OMNI,
-            model=LLM(provider=LLMProvider.OPENAI), # No model name for Operator CUA
-            # model=LLM(provider=LLMProvider.OPENAI, name="gpt-4o"),
-            # model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"),
-            # model=LLM(provider=LLMProvider.OLLAMA, name="gemma3:4b-it-q4_K_M"),
-            # model=LLM(provider=LLMProvider.MLXVLM, name="mlx-community/UI-TARS-1.5-7B-4bit"),
-            # model=LLM(
-            #     provider=LLMProvider.OAICOMPAT,
-            #     name="gemma-3-12b-it",
-            #     provider_base_url="http://localhost:1234/v1", # LM Studio local endpoint
-            # ),
-            save_trajectory=True,
+            # Supported models:
+
+            # == OpenAI CUA (computer-use-preview) ==
+            model="openai/computer-use-preview",
+
+            # == Anthropic CUA (Claude 3.5 and newer) ==
+            # model="anthropic/claude-opus-4-20250514",
+            # model="anthropic/claude-sonnet-4-20250514",
+            # model="anthropic/claude-3-7-sonnet-20250219",
+            # model="anthropic/claude-3-5-sonnet-20240620",
+
+            # == UI-TARS ==
+            # model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
+            # model="mlx/mlx-community/UI-TARS-1.5-7B-6bit",
+            # model="ollama_chat/0000/ui-tars-1.5-7b",
+
+            # == Omniparser + Any LLM ==
+            # model="omniparser+anthropic/claude-opus-4-20250514",
+            # model="omniparser+ollama_chat/gemma3:12b-it-q4_K_M",
+
+            tools=[computer],
             only_n_most_recent_images=3,
             verbosity=logging.DEBUG,
+            trajectory_dir="trajectories",
+            use_prompt_caching=True,
+            max_trajectory_budget=1.0,
         )
 
+        # Example tasks to demonstrate the agent
         tasks = [
             "Look for a repository named trycua/cua on GitHub.",
             "Check the open issues, open the most recent one and read it.",
@@ -68,43 +77,35 @@ async def run_agent_example():
             "Focus on the Composer text area, then write and submit a task to help resolve the GitHub issue.",
         ]
 
+        # Use message-based conversation history
+        history = []
+
         for i, task in enumerate(tasks):
-            print(f"\nExecuting task {i}/{len(tasks)}: {task}")
-            async for result in agent.run(task):
-                print("Response ID: ", result.get("id"))
-
-                # Print detailed usage information
-                usage = result.get("usage")
-                if usage:
-                    print("\nUsage Details:")
-                    print(f"  Input Tokens: {usage.get('input_tokens')}")
-                    if "input_tokens_details" in usage:
-                        print(f"  Input Tokens Details: {usage.get('input_tokens_details')}")
-                    print(f"  Output Tokens: {usage.get('output_tokens')}")
-                    if "output_tokens_details" in usage:
-                        print(f"  Output Tokens Details: {usage.get('output_tokens_details')}")
-                    print(f"  Total Tokens: {usage.get('total_tokens')}")
-
-                print("Response Text: ", result.get("text"))
-
-                # Print tools information
-                tools = result.get("tools")
-                if tools:
-                    print("\nTools:")
-                    print(tools)
-
-                # Print reasoning and tool call outputs
-                outputs = result.get("output", [])
-                for output in outputs:
-                    output_type = output.get("type")
-                    if output_type == "reasoning":
-                        print("\nReasoning Output:")
-                        print(output)
-                    elif output_type == "computer_call":
-                        print("\nTool Call Output:")
-                        print(output)
-
-            print(f"\nāœ… Task {i+1}/{len(tasks)} completed: {task}")
+            print(f"\nExecuting task {i+1}/{len(tasks)}: {task}")
+
+            # Add user message to history
+            history.append({"role": "user", "content": task})
+
+            # Run agent with conversation history
+            async for result in agent.run(history, stream=False):
+                # Add agent outputs to history
+                history += result.get("output", [])
+
+                # Print output for debugging
+                for item in result.get("output", []):
+                    if item.get("type") == "message":
+                        content = item.get("content", [])
+                        for content_part in content:
+                            if content_part.get("text"):
+                                print(f"Agent: {content_part.get('text')}")
+                    elif item.get("type") == "computer_call":
+                        action = item.get("action", {})
+                        action_type = action.get("type", "")
+                        print(f"Computer Action: {action_type}({action})")
+                    elif item.get("type") == "computer_call_output":
+                        print("Computer Output: [Screenshot/Result]")
+
+            print(f"āœ… Task {i+1}/{len(tasks)} completed: {task}")
 
     except Exception as e:
         logger.error(f"Error in run_agent_example: {e}")
diff --git a/libs/python/agent/agent/cli.py b/libs/python/agent/agent/cli.py
index 8656e86f..9b9aded8 100644
--- a/libs/python/agent/agent/cli.py
+++ b/libs/python/agent/agent/cli.py
@@ -92,26 +92,30 @@ def print_welcome(model: str, agent_loop: str, container_name: str):
 async def ainput(prompt: str = ""):
     return await asyncio.to_thread(input, prompt)
 
-async def chat_loop(agent, model: str, container_name: str):
+async def chat_loop(agent, model: str, container_name: str, initial_prompt: str = ""):
     """Main chat loop with the agent."""
     print_welcome(model, agent.agent_loop.__name__, container_name)
 
     history = []
+    if initial_prompt:
+        history.append({"role": "user", "content": initial_prompt})
+
     while True:
-        # Get user input with prompt
-        print_colored("> ", end="")
-        user_input = await ainput()
-
-        if user_input.lower() in ['exit', 'quit', 'q']:
-            print_colored("\nšŸ‘‹ Goodbye!")
-            break
+        if not history or history[-1].get("role") != "user":
+            # Get user input with prompt
+            print_colored("> ", end="")
+            user_input = await ainput()
 
-        if not user_input:
-            continue
-
-        # Add user message to history
-        history.append({"role": "user", "content": user_input})
+            if user_input.lower() in ['exit', 'quit', 'q']:
+                print_colored("\nšŸ‘‹ Goodbye!")
+                break
+
+            if not user_input:
+                continue
+
+            # Add user message to history
+            history.append({"role": "user", "content": user_input})
 
         # Stream responses from the agent with spinner
         with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
@@ -204,6 +208,12 @@ Examples:
         action="store_true",
help="Enable verbose logging" ) + + parser.add_argument( + "-p", "--prompt", + type=str, + help="Initial prompt to send to the agent. Leave blank for interactive mode." + ) args = parser.parse_args() @@ -269,9 +279,11 @@ Examples: agent_kwargs = { "model": args.model, "tools": [computer], - "only_n_most_recent_images": args.images, "verbosity": 20 if args.verbose else 30, # DEBUG vs WARNING } + + if args.images > 0: + agent_kwargs["only_n_most_recent_images"] = args.images if args.trajectory: agent_kwargs["trajectory_dir"] = "trajectories" @@ -286,7 +298,7 @@ Examples: agent = ComputerAgent(**agent_kwargs) # Start chat loop - await chat_loop(agent, args.model, container_name) + await chat_loop(agent, args.model, container_name, args.prompt) diff --git a/libs/python/agent/pyproject.toml b/libs/python/agent/pyproject.toml index 683d036c..be10f729 100644 --- a/libs/python/agent/pyproject.toml +++ b/libs/python/agent/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ "rich>=13.7.1", "python-dotenv>=1.0.1", "cua-computer>=0.3.0,<0.5.0", - "cua-core>=0.1.0,<0.2.0", + "cua-core>=0.1.8,<0.2.0", "certifi>=2024.2.2", "litellm>=1.74.8" ]