Files
computer/tests/agent_loop_testing/agent_test.py
Adam b5e71efcc9 Feature/agent loop test (#528)
* draft init

* add mock computer

* Correct format

* correct format

* Create test-cua-models.yml

* Update test-cua-models.yml

* format change

* Simplified test

* remove image

* isort fix

* format cleanup
2025-10-28 17:34:41 -07:00

193 lines
5.9 KiB
Python

#!/usr/bin/env python3
"""
Simple CUA Agent Test
Tests the actual CUA ComputerAgent SDK with a mock computer.
Only provides screenshot functionality - no complex computer actions.
"""
import asyncio
import base64
import sys
from io import BytesIO
from pathlib import Path
from PIL import Image, ImageDraw
# Put the directory three levels above this file at the front of sys.path
# (index 0, so it is searched before the existing entries).
# NOTE(review): presumably that directory is the repository root — confirm
# against the actual checkout layout.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
class MockComputer:
    """Stand-in computer whose only real capability is serving a screenshot.

    The screenshot is rendered once at construction time and reused for every
    request. All input-style methods (click, type, drag, ...) accept their
    arguments, pause briefly, and do nothing else.

    Attributes:
        action_count: Number of times ``screenshot()`` has been awaited.
    """

    def __init__(self):
        self.action_count = 0
        self._image = self._create_image()

    def _create_image(self) -> str:
        """Render the fake desktop and return it as a base64-encoded PNG string."""
        canvas = Image.new("RGB", (1920, 1080), color="lightblue")
        pen = ImageDraw.Draw(canvas)

        # One entry per fake icon: (left edge in px, fill colour, caption).
        icons = (
            (100, "blue", "Safari"),
            (200, "green", "Terminal"),
        )
        for left, fill, caption in icons:
            pen.rectangle([left, 950, left + 50, 1000], fill=fill, outline="black", width=2)
            pen.text((left + 10, 960), caption, fill="white")

        # Serialise to PNG in memory, then base64-encode to text.
        buffer = BytesIO()
        canvas.save(buffer, format="PNG")
        png_bytes = buffer.getvalue()
        return base64.b64encode(png_bytes).decode("utf-8")

    async def _pause(self) -> None:
        """Yield to the event loop for the fixed 0.1 s used by every no-op action."""
        await asyncio.sleep(0.1)

    async def screenshot(self) -> str:
        """Return the pre-rendered screenshot and count the request."""
        self.action_count += 1
        return self._image

    async def get_dimensions(self) -> tuple[int, int]:
        """Return the screen size as ``(width, height)``; matches the rendered image."""
        return (1920, 1080)

    # ------------------------------------------------------------------
    # No-op actions. The original author notes these are required by the
    # CUA interface; each one only pauses and returns None.
    # ------------------------------------------------------------------
    async def click(self, x: int, y: int, button: str = "left") -> None:
        await self._pause()

    async def double_click(self, x: int, y: int) -> None:
        await self._pause()

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        await self._pause()

    async def type(self, text: str) -> None:
        await self._pause()

    async def wait(self, ms: int = 1000) -> None:
        """Sleep for ``ms`` milliseconds."""
        seconds = ms / 1000.0
        await asyncio.sleep(seconds)

    async def move(self, x: int, y: int) -> None:
        await self._pause()

    async def keypress(self, keys) -> None:
        await self._pause()

    async def drag(self, path) -> None:
        await self._pause()

    async def get_current_url(self) -> str:
        """Return the fixed placeholder URL of the mock desktop."""
        return "desktop://mock"

    async def get_environment(self) -> str:
        """Return the fixed environment identifier reported by the mock."""
        return "mac"

    # ------------------------------------------------------------------
    # Low-level mouse/keyboard primitives (labelled "required abstract
    # methods" by the original author); likewise pause-only no-ops.
    # ------------------------------------------------------------------
    async def left_mouse_down(self, x: int = 0, y: int = 0) -> None:
        await self._pause()

    async def left_mouse_up(self, x: int = 0, y: int = 0) -> None:
        await self._pause()

    async def right_mouse_down(self, x: int = 0, y: int = 0) -> None:
        await self._pause()

    async def right_mouse_up(self, x: int = 0, y: int = 0) -> None:
        await self._pause()

    async def mouse_move(self, x: int, y: int) -> None:
        await self._pause()

    async def key_down(self, key: str) -> None:
        await self._pause()

    async def key_up(self, key: str) -> None:
        await self._pause()

    async def type_text(self, text: str) -> None:
        await self._pause()
def _format_output_item(item) -> str:
    """Return the one-line console description of a single agent output item.

    Never raises on an unexpected item shape: anything that cannot be
    interpreted is reported through the "Unknown output type" line or by
    echoing the raw content, so a logging detail cannot abort the test run.
    """
    kind = item.get("type") if isinstance(item, dict) else None

    if kind == "message":
        content = item.get("content")
        # Expected shape: a non-empty list whose first part is a dict carrying
        # "text". Fall back to the raw value for any other shape.
        if isinstance(content, list) and content and isinstance(content[0], dict):
            text = content[0].get("text", content[0])
        else:
            text = content
        return f" Agent: {text}"

    if kind == "tool_call":
        return f" Tool: {item.get('tool_name')} {item.get('arguments')}"

    return f" Unknown output type: {item}"


async def test_cua_agent(model_name: str):
    """Run the real ComputerAgent against a MockComputer and report the outcome.

    The agent is asked to "Open Safari browser"; each yielded result is
    printed, and the loop is cut off after a fixed number of iterations so a
    model that never finishes cannot hang the script.

    Args:
        model_name: Model identifier passed to ``ComputerAgent(model=...)``.

    Returns:
        True if the agent loop ran to completion (or hit the iteration limit)
        without raising; False if the ``agent`` package could not be imported
        or any exception occurred while creating or running the agent.
    """
    # Imported locally so the traceback module is only needed by this function.
    import traceback

    # Safety limit on agent iterations (prevents an endless loop).
    max_iterations = 5

    print(f"🤖 Testing CUA Agent: {model_name}")
    print("=" * 50)

    # Keep the ImportError handler scoped to the import itself so the
    # "Install CUA" hint is only shown when the package is really missing.
    try:
        from agent import ComputerAgent
    except ImportError as e:
        print(f"❌ Import error: {e}")
        print("💡 Install CUA: pip install -e libs/python/agent -e libs/python/computer")
        return False

    try:
        # Create mock computer
        mock_computer = MockComputer()

        # Create the real CUA ComputerAgent
        agent = ComputerAgent(model=model_name, tools=[mock_computer], max_trajectory_budget=5.0)
        print("✅ CUA Agent created")
        print("✅ Mock computer ready")
        print("🚀 Running agent...")
        print()

        # Run the agent with a specific task
        message = "Open Safari browser"
        iteration = 0
        async for result in agent.run([{"role": "user", "content": message}]):
            iteration += 1
            print(f"Iteration {iteration}:")

            output_items = result.get("output", [])
            if not output_items:
                print(" (No output from agent)")
                # Dump the whole result so empty iterations can be diagnosed.
                print(f" Debug - Full result: {result}")
            else:
                for item in output_items:
                    print(_format_output_item(item))

            # Let the agent decide when to stop; only intervene at the limit.
            if iteration >= max_iterations:
                print(f"🏁 Stopping after {max_iterations} iterations (safety limit)")
                break

        print()
        print("=" * 50)
        print("🎉 TEST COMPLETE!")
        print("=" * 50)
        print(f"✅ Model: {model_name}")
        print(f"✅ Iterations: {iteration}")
        print(f"✅ Screenshots: {mock_computer.action_count}")
        print("✅ Agent executed successfully")
        return True
    except Exception as e:
        # Top-level boundary of the script: report the failure instead of
        # crashing, but keep the traceback so the cause can be located.
        print(f"❌ Test failed: {e}")
        traceback.print_exc()
        return False
if __name__ == "__main__":
    import argparse

    # Script entry point: pick the model from the command line, run the async
    # test once, and turn its result into the process exit status
    # (0 when the test reports success, 1 otherwise).
    cli = argparse.ArgumentParser(description="Test CUA Agent with mock computer")
    cli.add_argument(
        "--model",
        default="anthropic/claude-sonnet-4-20250514",
        help="CUA model to test",
    )
    options = cli.parse_args()

    passed = asyncio.run(test_cua_agent(options.model))
    exit_status = 0 if passed else 1
    sys.exit(exit_status)