mirror of
https://github.com/trycua/computer.git
synced 2026-01-03 20:10:04 -06:00
Feature/agent loop test (#528)
* draft init * add mock computer * Correct format * correct format * Create test-cua-models.yml * Update test-cua-models.yml * format change * Simplified test * remove image * isort fix * format cleanup
This commit is contained in:
118
.github/workflows/test-cua-models.yml
vendored
Normal file
118
.github/workflows/test-cua-models.yml
vendored
Normal file
@@ -0,0 +1,118 @@
|
||||
name: Test CUA Supporting Models

# Runs the agent-loop test against every model enabled in the job matrix.
# The model providers' API keys must be configured as repository secrets.
# Triggers:
#   - pull requests targeting main or master
#   - manual runs (workflow_dispatch) with test_models left enabled

on:
  pull_request:
    branches: [main, master]
  workflow_dispatch:
    inputs:
      test_models:
        description: "Test all supported models (requires API keys)"
        required: false
        # A boolean input takes a real boolean default, not the string "true".
        default: true
        type: boolean
|
||||
|
||||
jobs:
  # Test all CUA models - runs on PRs or when manually triggered
  test-all-models:
    # The `inputs` context keeps boolean inputs as booleans, so compare with
    # the boolean literal: `== 'true'` (a string) would never match.
    if: ${{ github.event_name == 'pull_request' || inputs.test_models == true }}
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        model:
          # Anthropic Claude Models
          # - anthropic/claude-3-5-sonnet-20241022
          # - anthropic/claude-3-7-sonnet-20250219
          # - anthropic/claude-opus-4-20250514
          # - anthropic/claude-sonnet-4-20250514
          # - anthropic/claude-opus-4-1-20250805
          - anthropic/claude-sonnet-4-5-20250929
          # - anthropic/claude-haiku-4-5-20251001

          # OpenAI Models
          # - openai/computer-use-preview

          # Gemini Models
          # - gemini-2.5-computer-use-preview-10-2025

          # GLM-4.5V Models
          # - openrouter/z-ai/glm-4.5v

          # UI-TARS Models
          # - huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B

          # OpenCUA Models
          # - huggingface-local/xlangai/OpenCUA-7B
          # - huggingface-local/xlangai/OpenCUA-32B

          # GTA1 Family Models
          # - huggingface-local/HelloKKMe/GTA1-7B
          # - huggingface-local/HelloKKMe/GTA1-32B
          # - huggingface-local/HelloKKMe/GTA1-72B

          # Holo 1.5 Family Models
          # - huggingface-local/Hcompany/Holo1.5-3B
          # - huggingface-local/Hcompany/Holo1.5-7B
          # - huggingface-local/Hcompany/Holo1.5-72B

          # InternVL 3.5 Family Models
          # - huggingface-local/OpenGVLab/InternVL3_5-1B
          # - huggingface-local/OpenGVLab/InternVL3_5-2B
          # - huggingface-local/OpenGVLab/InternVL3_5-4B
          # - huggingface-local/OpenGVLab/InternVL3_5-8B

          # GLM-4.5V Local
          # - huggingface-local/zai-org/GLM-4.5V

          # Composed Models (Grounding + Planning)
          # - omniparser+anthropic/claude-3-5-sonnet-20241022
          # - omniparser+openai/gpt-4o-mini
          # - moondream3+anthropic/claude-3-5-sonnet-20241022
          # - moondream3+openai/gpt-4o-mini

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y libgl1-mesa-dri libglib2.0-0

      - name: Install CUA dependencies
        run: |
          pip install --upgrade pip
          pip install -e libs/python/agent -e libs/python/computer
          pip install -e libs/python/core
          pip install "cua-agent[uitars-hf]"
          pip install pytest

      - name: Test model with agent loop
        # Pass secrets and the model name through the step environment rather
        # than interpolating them into the script text: values containing
        # quotes or `$` cannot break the shell, and the keys are visible to
        # this step only instead of being persisted to GITHUB_ENV.
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
          MODEL: ${{ matrix.model }}
        run: |
          cd tests/agent_loop_testing
          python agent_test.py --model "$MODEL"

      - name: Derive artifact name
        # Model ids contain "/" (and sometimes "+"); "/" is not allowed in an
        # artifact name, so map every character outside [A-Za-z0-9._-] to "-".
        id: artifact
        if: always()
        env:
          MODEL: ${{ matrix.model }}
        run: |
          echo "name=test-results-${MODEL//[^A-Za-z0-9._-]/-}" >> "$GITHUB_OUTPUT"

      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: ${{ steps.artifact.outputs.name }}
          path: |
            tests/agent_loop_testing/test_images/
            *.log
          retention-days: 7
|
||||
28
package-lock.json
generated
Normal file
28
package-lock.json
generated
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"name": "cua",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"devDependencies": {
|
||||
"prettier": "^3.6.2"
|
||||
}
|
||||
},
|
||||
"node_modules/prettier": {
|
||||
"version": "3.6.2",
|
||||
"resolved": "https://registry.npmjs.org/prettier/-/prettier-3.6.2.tgz",
|
||||
"integrity": "sha512-I7AIg5boAr5R0FFtJ6rCfD+LFsWHp81dolrFD8S79U9tb8Az2nGrJncnMSnys+bpQJfRUzqs9hnA81OAA3hCuQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"bin": {
|
||||
"prettier": "bin/prettier.cjs"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=14"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/prettier/prettier?sponsor=1"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
70
tests/agent_loop_testing/README.md
Normal file
70
tests/agent_loop_testing/README.md
Normal file
@@ -0,0 +1,70 @@
|
||||
# CUA Agent Test
|
||||
|
||||
A simple smoke test that runs the CUA ComputerAgent SDK against a mock computer.
|
||||
|
||||
## Run Test
|
||||
|
||||
```bash
|
||||
python tests/agent_loop_testing/agent_test.py --model anthropic/claude-sonnet-4-20250514
|
||||
```
|
||||
|
||||
## What It Does
|
||||
|
||||
- Tests real CUA ComputerAgent SDK
|
||||
- Uses a mock computer (it always returns the same static screenshot; every action is a no-op)
|
||||
- Agent tries to "Open Safari browser"
|
||||
- Runs up to 5 iterations
|
||||
- Shows agent responses and tool calls
|
||||
|
||||
## What Passes ✅
|
||||
|
||||
- Agent initializes
|
||||
- Takes screenshots
|
||||
- Analyzes images
|
||||
- Makes tool calls
|
||||
- Runs multiple iterations
|
||||
|
||||
## What Fails ❌
|
||||
|
||||
- Missing dependencies
|
||||
- Invalid API keys
|
||||
- Agent crashes
|
||||
- Import errors
|
||||
|
||||
## Install
|
||||
|
||||
```bash
|
||||
pip install -e libs/python/agent -e libs/python/computer
|
||||
export ANTHROPIC_API_KEY="your-key"
|
||||
```
|
||||
|
||||
## Example Output
|
||||
|
||||
```
|
||||
🤖 Testing CUA Agent: anthropic/claude-sonnet-4-20250514
|
||||
==================================================
|
||||
✅ CUA Agent created
|
||||
✅ Mock computer ready
|
||||
🚀 Running agent...
|
||||
|
||||
Iteration 1:
|
||||
Agent: I'll click on Safari to open it.
|
||||
Tool: click {'x': 125, 'y': 975}
|
||||
|
||||
Iteration 2:
|
||||
Agent: Safari didn't open, let me try again.
|
||||
Tool: click {'x': 125, 'y': 975}
|
||||
|
||||
Iteration 3:
|
||||
Agent: This appears to be a static test environment.
|
||||
|
||||
🏁 Stopping after 5 iterations (safety limit)
|
||||
|
||||
==================================================
|
||||
🎉 TEST COMPLETE!
|
||||
==================================================
|
||||
✅ Model: anthropic/claude-sonnet-4-20250514
|
||||
✅ Iterations: 3
|
||||
✅ Screenshots: 3
|
||||
✅ Agent executed successfully
|
||||
```
|
||||
192
tests/agent_loop_testing/agent_test.py
Normal file
192
tests/agent_loop_testing/agent_test.py
Normal file
@@ -0,0 +1,192 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Simple CUA Agent Test
|
||||
|
||||
Tests the actual CUA ComputerAgent SDK with a mock computer.
|
||||
Only provides screenshot functionality - no complex computer actions.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import sys
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
from PIL import Image, ImageDraw
|
||||
|
||||
# Put the repository root (three directories above this file) first on sys.path
|
||||
project_root = Path(__file__).parent.parent.parent
|
||||
sys.path.insert(0, str(project_root))
|
||||
|
||||
|
||||
class MockComputer:
    """Fake computer for agent tests that only ever serves one static screenshot.

    Mouse, keyboard, scroll and drag calls are accepted and ignored after a
    short pause, so the screen content never changes.
    """

    def __init__(self):
        # Incremented once per screenshot() call.
        self.action_count = 0
        # Base64 PNG rendered once up front and reused for every screenshot.
        self._image = self._create_image()

    def _create_image(self) -> str:
        """Render the fake desktop and return it as a base64-encoded PNG string."""
        canvas = Image.new("RGB", (1920, 1080), color="lightblue")
        pen = ImageDraw.Draw(canvas)

        # Two 50x50 icons near the bottom edge: (left edge, fill colour, label).
        for left, fill, label in ((100, "blue", "Safari"), (200, "green", "Terminal")):
            pen.rectangle([left, 950, left + 50, 1000], fill=fill, outline="black", width=2)
            pen.text((left + 10, 960), label, fill="white")

        buffer = BytesIO()
        canvas.save(buffer, format="PNG")
        png_bytes = buffer.getvalue()
        return base64.b64encode(png_bytes).decode("utf-8")

    async def _pause(self) -> None:
        """Short delay shared by every ignored action."""
        await asyncio.sleep(0.1)

    async def screenshot(self) -> str:
        """Return the pre-rendered image and count the request."""
        self.action_count += 1
        return self._image

    async def get_dimensions(self) -> tuple[int, int]:
        """Return the fixed screen size as (width, height)."""
        return (1920, 1080)

    # Everything below ignores its arguments (kept because the CUA interface
    # is expected to call these methods).
    async def click(self, x: int, y: int, button: str = "left") -> None:
        await self._pause()

    async def double_click(self, x: int, y: int) -> None:
        await self._pause()

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        await self._pause()

    async def type(self, text: str) -> None:
        await self._pause()

    async def wait(self, ms: int = 1000) -> None:
        """Sleep for `ms` milliseconds."""
        await asyncio.sleep(ms / 1000.0)

    async def move(self, x: int, y: int) -> None:
        await self._pause()

    async def keypress(self, keys) -> None:
        await self._pause()

    async def drag(self, path) -> None:
        await self._pause()

    async def get_current_url(self) -> str:
        return "desktop://mock"

    async def get_environment(self) -> str:
        return "mac"

    # Lower-level mouse and keyboard primitives, also ignored.
    async def left_mouse_down(self, x: int = 0, y: int = 0) -> None:
        await self._pause()

    async def left_mouse_up(self, x: int = 0, y: int = 0) -> None:
        await self._pause()

    async def right_mouse_down(self, x: int = 0, y: int = 0) -> None:
        await self._pause()

    async def right_mouse_up(self, x: int = 0, y: int = 0) -> None:
        await self._pause()

    async def mouse_move(self, x: int, y: int) -> None:
        await self._pause()

    async def key_down(self, key: str) -> None:
        await self._pause()

    async def key_up(self, key: str) -> None:
        await self._pause()

    async def type_text(self, text: str) -> None:
        await self._pause()
|
||||
|
||||
|
||||
async def test_cua_agent(model_name: str):
|
||||
"""Test CUA agent with mock computer."""
|
||||
print(f"🤖 Testing CUA Agent: {model_name}")
|
||||
print("=" * 50)
|
||||
|
||||
try:
|
||||
# Import the real CUA agent
|
||||
from agent import ComputerAgent
|
||||
|
||||
# Create mock computer
|
||||
mock_computer = MockComputer()
|
||||
|
||||
# Create the real CUA ComputerAgent
|
||||
agent = ComputerAgent(model=model_name, tools=[mock_computer], max_trajectory_budget=5.0)
|
||||
|
||||
print("✅ CUA Agent created")
|
||||
print("✅ Mock computer ready")
|
||||
print("🚀 Running agent...")
|
||||
print()
|
||||
|
||||
# Run the agent with a specific task
|
||||
message = "Open Safari browser"
|
||||
|
||||
iteration = 0
|
||||
async for result in agent.run([{"role": "user", "content": message}]):
|
||||
iteration += 1
|
||||
print(f"Iteration {iteration}:")
|
||||
|
||||
# Print agent output
|
||||
output_items = result.get("output", [])
|
||||
if not output_items:
|
||||
print(" (No output from agent)")
|
||||
else:
|
||||
for item in output_items:
|
||||
if item["type"] == "message":
|
||||
print(f" Agent: {item['content'][0]['text']}")
|
||||
elif item["type"] == "tool_call":
|
||||
print(f" Tool: {item.get('tool_name')} {item.get('arguments')}")
|
||||
else:
|
||||
print(f" Unknown output type: {item}")
|
||||
|
||||
# Debug: print full result for empty iterations
|
||||
if not output_items:
|
||||
print(f" Debug - Full result: {result}")
|
||||
|
||||
# Let the agent decide when to stop (it should try to complete the task)
|
||||
# Only stop after 5 iterations to prevent infinite loops
|
||||
if iteration >= 5:
|
||||
print("🏁 Stopping after 5 iterations (safety limit)")
|
||||
break
|
||||
|
||||
print()
|
||||
print("=" * 50)
|
||||
print("🎉 TEST COMPLETE!")
|
||||
print("=" * 50)
|
||||
print(f"✅ Model: {model_name}")
|
||||
print(f"✅ Iterations: {iteration}")
|
||||
print(f"✅ Screenshots: {mock_computer.action_count}")
|
||||
print("✅ Agent executed successfully")
|
||||
|
||||
return True
|
||||
|
||||
except ImportError as e:
|
||||
print(f"❌ Import error: {e}")
|
||||
print("💡 Install CUA: pip install -e libs/python/agent -e libs/python/computer")
|
||||
return False
|
||||
except Exception as e:
|
||||
print(f"❌ Test failed: {e}")
|
||||
return False
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: test one model and mirror the outcome in the
    # process exit status (0 = success, 1 = failure).
    import argparse

    cli = argparse.ArgumentParser(description="Test CUA Agent with mock computer")
    cli.add_argument(
        "--model",
        default="anthropic/claude-sonnet-4-20250514",
        help="CUA model to test",
    )
    chosen_model = cli.parse_args().model

    passed = asyncio.run(test_cua_agent(chosen_model))
    if passed:
        sys.exit(0)
    sys.exit(1)
|
||||
Reference in New Issue
Block a user