From 6db6825516cd41974d33bed476299a8b72d94f65 Mon Sep 17 00:00:00 2001
From: Dillon DuPont
Date: Tue, 29 Jul 2025 08:27:10 -0400
Subject: [PATCH] Updated example, cua-core dep, and added --prompt

---
 examples/agent_examples.py       | 113 ++++++++++++++++---------------
 libs/python/agent/agent/cli.py   |  42 ++++++++----
 libs/python/agent/pyproject.toml |   2 +-
 3 files changed, 85 insertions(+), 72 deletions(-)

diff --git a/examples/agent_examples.py b/examples/agent_examples.py
index 62573077..d6a565bf 100644
--- a/examples/agent_examples.py
+++ b/examples/agent_examples.py
@@ -8,7 +8,7 @@ import signal
 from computer import Computer, VMProviderType
 
 # Import the unified agent class and types
-from agent import ComputerAgent, LLMProvider, LLM, AgentLoop
+from agent import ComputerAgent
 
 # Import utility functions
 from utils import load_dotenv_files, handle_sigint
@@ -19,8 +19,8 @@ logger = logging.getLogger(__name__)
 
 async def run_agent_example():
-    """Run example of using the ComputerAgent with OpenAI and Omni provider."""
-    print("\n=== Example: ComputerAgent with OpenAI and Omni provider ===")
+    """Run example of using the ComputerAgent with different models."""
+    print("\n=== Example: ComputerAgent with different models ===")
 
     try:
         # Create a local macOS computer
@@ -37,28 +37,37 @@ async def run_agent_example():
         #     provider_type=VMProviderType.CLOUD,
         # )
 
-        # Create Computer instance with async context manager
+        # Create ComputerAgent with new API
         agent = ComputerAgent(
-            computer=computer,
-            loop=AgentLoop.OPENAI,
-            # loop=AgentLoop.ANTHROPIC,
-            # loop=AgentLoop.UITARS,
-            # loop=AgentLoop.OMNI,
-            model=LLM(provider=LLMProvider.OPENAI), # No model name for Operator CUA
-            # model=LLM(provider=LLMProvider.OPENAI, name="gpt-4o"),
-            # model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"),
-            # model=LLM(provider=LLMProvider.OLLAMA, name="gemma3:4b-it-q4_K_M"),
-            # model=LLM(provider=LLMProvider.MLXVLM, name="mlx-community/UI-TARS-1.5-7B-4bit"),
-            # model=LLM(
-            #     provider=LLMProvider.OAICOMPAT,
-            #     name="gemma-3-12b-it",
-            #     provider_base_url="http://localhost:1234/v1", # LM Studio local endpoint
-            # ),
-            save_trajectory=True,
+            # Supported models:
+
+            # == OpenAI CUA (computer-use-preview) ==
+            model="openai/computer-use-preview",
+
+            # == Anthropic CUA (Claude 3.5 and newer) ==
+            # model="anthropic/claude-opus-4-20250514",
+            # model="anthropic/claude-sonnet-4-20250514",
+            # model="anthropic/claude-3-7-sonnet-20250219",
+            # model="anthropic/claude-3-5-sonnet-20240620",
+
+            # == UI-TARS ==
+            # model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
+            # model="mlx/mlx-community/UI-TARS-1.5-7B-6bit",
+            # model="ollama_chat/0000/ui-tars-1.5-7b",
+
+            # == Omniparser + Any LLM ==
+            # model="omniparser+anthropic/claude-opus-4-20250514",
+            # model="omniparser+ollama_chat/gemma3:12b-it-q4_K_M",
+
+            tools=[computer],
             only_n_most_recent_images=3,
             verbosity=logging.DEBUG,
+            trajectory_dir="trajectories",
+            use_prompt_caching=True,
+            max_trajectory_budget=1.0,
         )
 
+        # Example tasks to demonstrate the agent
         tasks = [
             "Look for a repository named trycua/cua on GitHub.",
             "Check the open issues, open the most recent one and read it.",
@@ -68,43 +77,35 @@ async def run_agent_example():
             "Focus on the Composer text area, then write and submit a task to help resolve the GitHub issue.",
         ]
 
+        # Use message-based conversation history
+        history = []
+
         for i, task in enumerate(tasks):
-            print(f"\nExecuting task {i}/{len(tasks)}: {task}")
-            async for result in agent.run(task):
-                print("Response ID: ", result.get("id"))
-
-                # Print detailed usage information
-                usage = result.get("usage")
-                if usage:
-                    print("\nUsage Details:")
-                    print(f"  Input Tokens: {usage.get('input_tokens')}")
-                    if "input_tokens_details" in usage:
-                        print(f"  Input Tokens Details: {usage.get('input_tokens_details')}")
-                    print(f"  Output Tokens: {usage.get('output_tokens')}")
-                    if "output_tokens_details" in usage:
-                        print(f"  Output Tokens Details: {usage.get('output_tokens_details')}")
-                    print(f"  Total Tokens: {usage.get('total_tokens')}")
-
-                print("Response Text: ", result.get("text"))
-
-                # Print tools information
-                tools = result.get("tools")
-                if tools:
-                    print("\nTools:")
-                    print(tools)
-
-                # Print reasoning and tool call outputs
-                outputs = result.get("output", [])
-                for output in outputs:
-                    output_type = output.get("type")
-                    if output_type == "reasoning":
-                        print("\nReasoning Output:")
-                        print(output)
-                    elif output_type == "computer_call":
-                        print("\nTool Call Output:")
-                        print(output)
-
-            print(f"\nāœ… Task {i+1}/{len(tasks)} completed: {task}")
+            print(f"\nExecuting task {i+1}/{len(tasks)}: {task}")
+
+            # Add user message to history
+            history.append({"role": "user", "content": task})
+
+            # Run agent with conversation history
+            async for result in agent.run(history, stream=False):
+                # Add agent outputs to history
+                history += result.get("output", [])
+
+                # Print output for debugging
+                for item in result.get("output", []):
+                    if item.get("type") == "message":
+                        content = item.get("content", [])
+                        for content_part in content:
+                            if content_part.get("text"):
+                                print(f"Agent: {content_part.get('text')}")
+                    elif item.get("type") == "computer_call":
+                        action = item.get("action", {})
+                        action_type = action.get("type", "")
+                        print(f"Computer Action: {action_type}({action})")
+                    elif item.get("type") == "computer_call_output":
+                        print("Computer Output: [Screenshot/Result]")
+
+            print(f"āœ… Task {i+1}/{len(tasks)} completed: {task}")
 
     except Exception as e:
         logger.error(f"Error in run_agent_example: {e}")
diff --git a/libs/python/agent/agent/cli.py b/libs/python/agent/agent/cli.py
index 8656e86f..9b9aded8 100644
--- a/libs/python/agent/agent/cli.py
+++ b/libs/python/agent/agent/cli.py
@@ -92,26 +92,30 @@ def print_welcome(model: str, agent_loop: str, container_name: str):
 async def ainput(prompt: str = ""):
     return await asyncio.to_thread(input, prompt)
 
-async def chat_loop(agent, model: str, container_name: str):
+async def chat_loop(agent, model: str, container_name: str, initial_prompt: str = ""):
     """Main chat loop with the agent."""
     print_welcome(model, agent.agent_loop.__name__, container_name)
 
     history = []
+    if initial_prompt:
+        history.append({"role": "user", "content": initial_prompt})
+
     while True:
-        # Get user input with prompt
-        print_colored("> ", end="")
-        user_input = await ainput()
-
-        if user_input.lower() in ['exit', 'quit', 'q']:
-            print_colored("\nšŸ‘‹ Goodbye!")
-            break
+        if not history or history[-1].get("role") != "user":
+            # Get user input with prompt
+            print_colored("> ", end="")
+            user_input = await ainput()
 
-        if not user_input:
-            continue
-
-        # Add user message to history
-        history.append({"role": "user", "content": user_input})
+            if user_input.lower() in ['exit', 'quit', 'q']:
+                print_colored("\nšŸ‘‹ Goodbye!")
+                break
+
+            if not user_input:
+                continue
+
+            # Add user message to history
+            history.append({"role": "user", "content": user_input})
 
         # Stream responses from the agent with spinner
         with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
@@ -204,6 +208,12 @@ Examples:
         action="store_true",
help="Enable verbose logging" ) + + parser.add_argument( + "-p", "--prompt", + type=str, + help="Initial prompt to send to the agent. Leave blank for interactive mode." + ) args = parser.parse_args() @@ -269,9 +279,11 @@ Examples: agent_kwargs = { "model": args.model, "tools": [computer], - "only_n_most_recent_images": args.images, "verbosity": 20 if args.verbose else 30, # DEBUG vs WARNING } + + if args.images > 0: + agent_kwargs["only_n_most_recent_images"] = args.images if args.trajectory: agent_kwargs["trajectory_dir"] = "trajectories" @@ -286,7 +298,7 @@ Examples: agent = ComputerAgent(**agent_kwargs) # Start chat loop - await chat_loop(agent, args.model, container_name) + await chat_loop(agent, args.model, container_name, args.prompt) diff --git a/libs/python/agent/pyproject.toml b/libs/python/agent/pyproject.toml index 683d036c..be10f729 100644 --- a/libs/python/agent/pyproject.toml +++ b/libs/python/agent/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ "rich>=13.7.1", "python-dotenv>=1.0.1", "cua-computer>=0.3.0,<0.5.0", - "cua-core>=0.1.0,<0.2.0", + "cua-core>=0.1.8,<0.2.0", "certifi>=2024.2.2", "litellm>=1.74.8" ]