From b5e71efcc9b0a7d940d924fb98fd6c754f69561c Mon Sep 17 00:00:00 2001
From: Adam <62897873+YeIIcw@users.noreply.github.com>
Date: Tue, 28 Oct 2025 17:34:41 -0700
Subject: [PATCH] Feature/agent loop test (#528)

* draft init

* add mock computer

* Correct format

* correct format

* Create test-cua-models.yml

* Update test-cua-models.yml

* format change

* Simplified test

* remove image

* isort fix

* format cleanup
---
 .github/workflows/test-cua-models.yml  | 129 ++++++++++++++
 package-lock.json                      |  28 +++
 tests/agent_loop_testing/README.md     |  69 ++++++++
 tests/agent_loop_testing/agent_test.py | 237 +++++++++++++++++++++++++
 4 files changed, 463 insertions(+)
 create mode 100644 .github/workflows/test-cua-models.yml
 create mode 100644 package-lock.json
 create mode 100644 tests/agent_loop_testing/README.md
 create mode 100644 tests/agent_loop_testing/agent_test.py

diff --git a/.github/workflows/test-cua-models.yml b/.github/workflows/test-cua-models.yml
new file mode 100644
index 00000000..646bf7ec
--- /dev/null
+++ b/.github/workflows/test-cua-models.yml
@@ -0,0 +1,129 @@
+name: Test CUA Supporting Models
+
+# This workflow tests all supported CUA models with API keys
+# Runs on pull requests, or manually via workflow_dispatch with test_models=true
+
+on:
+  pull_request:
+    branches: [ main, master ]
+  workflow_dispatch:
+    inputs:
+      test_models:
+        description: "Test all supported models (requires API keys)"
+        required: false
+        default: true
+        type: boolean
+
+jobs:
+  # Test all CUA models - runs on PRs or when manually triggered
+  test-all-models:
+    # `inputs.test_models` is a boolean (type: boolean), so it is compared with
+    # `true` rather than with the string 'true'.
+    if: ${{ github.event_name == 'pull_request' || inputs.test_models == true }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        model:
+          # Anthropic Claude Models
+          # - anthropic/claude-3-5-sonnet-20241022
+          # - anthropic/claude-3-7-sonnet-20250219
+          # - anthropic/claude-opus-4-20250514
+          # - anthropic/claude-sonnet-4-20250514
+          # - anthropic/claude-opus-4-1-20250805
+          - anthropic/claude-sonnet-4-5-20250929
+          # - anthropic/claude-haiku-4-5-20251001
+
+          # OpenAI Models
+          # - openai/computer-use-preview
+
+          # Gemini Models
+          # - gemini-2.5-computer-use-preview-10-2025
+
+          # GLM-4.5V Models
+          # - openrouter/z-ai/glm-4.5v
+
+          # UI-TARS Models
+          # - huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
+
+          # OpenCUA Models
+          # - huggingface-local/xlangai/OpenCUA-7B
+          # - huggingface-local/xlangai/OpenCUA-32B
+
+          # GTA1 Family Models
+          # - huggingface-local/HelloKKMe/GTA1-7B
+          # - huggingface-local/HelloKKMe/GTA1-32B
+          # - huggingface-local/HelloKKMe/GTA1-72B
+
+          # Holo 1.5 Family Models
+          # - huggingface-local/Hcompany/Holo1.5-3B
+          # - huggingface-local/Hcompany/Holo1.5-7B
+          # - huggingface-local/Hcompany/Holo1.5-72B
+
+          # InternVL 3.5 Family Models
+          # - huggingface-local/OpenGVLab/InternVL3_5-1B
+          # - huggingface-local/OpenGVLab/InternVL3_5-2B
+          # - huggingface-local/OpenGVLab/InternVL3_5-4B
+          # - huggingface-local/OpenGVLab/InternVL3_5-8B
+
+          # GLM-4.5V Local
+          # - huggingface-local/zai-org/GLM-4.5V
+
+          # Composed Models (Grounding + Planning)
+          # - omniparser+anthropic/claude-3-5-sonnet-20241022
+          # - omniparser+openai/gpt-4o-mini
+          # - moondream3+anthropic/claude-3-5-sonnet-20241022
+          # - moondream3+openai/gpt-4o-mini
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y libgl1-mesa-dri libglib2.0-0
+
+      - name: Install CUA dependencies
+        run: |
+          pip install --upgrade pip
+          pip install -e libs/python/agent -e libs/python/computer
+          pip install -e libs/python/core
+          pip install "cua-agent[uitars-hf]"
+          pip install pytest
+
+      - name: Test model with agent loop
+        # Secrets and the model name are passed through `env` instead of being
+        # interpolated into the shell script or written to $GITHUB_ENV.
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
+          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
+          MODEL: ${{ matrix.model }}
+        run: |
+          cd tests/agent_loop_testing
+          python agent_test.py --model "$MODEL"
+
+      - name: Compute artifact-safe model name
+        if: always()
+        env:
+          MODEL: ${{ matrix.model }}
+        # Artifact names may not contain "/", which the model ids do.
+        run: |
+          echo "MODEL_SLUG=${MODEL//[^A-Za-z0-9._-]/_}" >> "$GITHUB_ENV"
+
+      - name: Upload test results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-results-${{ env.MODEL_SLUG }}
+          path: |
+            tests/agent_loop_testing/test_images/
+            *.log
+          retention-days: 7
diff --git a/package-lock.json b/package-lock.json
new file mode 100644
index 00000000..2df654cb
--- /dev/null
+++ b/package-lock.json
@@ -0,0 +1,28 @@
+{
+  "name": "cua",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {
+    "": {
+      "devDependencies": {
+        "prettier": "^3.6.2"
+      }
+    },
+    "node_modules/prettier": {
+      "version": "3.6.2",
+      "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.6.2.tgz",
+      "integrity": "sha512-I7AIg5boAr5R0FFtJ6rCfD+LFsWHp81dolrFD8S79U9tb8Az2nGrJncnMSnys+bpQJfRUzqs9hnA81OAA3hCuQ==",
+      "dev": true,
+      "license": "MIT",
+      "bin": {
+        "prettier": "bin/prettier.cjs"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "funding": {
+        "url": "https://github.com/prettier/prettier?sponsor=1"
+      }
+    }
+  }
+}
diff --git a/tests/agent_loop_testing/README.md b/tests/agent_loop_testing/README.md
new file mode 100644
index 00000000..48712d90
--- /dev/null
+++ b/tests/agent_loop_testing/README.md
@@ -0,0 +1,69 @@
+# CUA Agent Test
+
+Simple test for CUA ComputerAgent SDK with mock computer.
+
+## Run Test
+
+```bash
+python tests/agent_loop_testing/agent_test.py --model anthropic/claude-sonnet-4-20250514
+```
+
+## What It Does
+
+- Tests real CUA ComputerAgent SDK
+- Uses mock computer (only screenshots, no real actions)
+- Agent tries to "Open Safari browser"
+- Runs up to 5 iterations (change with `--max-iterations`)
+- Shows agent responses and tool calls
+
+## What Passes ✅
+
+- Agent initializes
+- Takes screenshots
+- Analyzes images
+- Makes tool calls
+- Runs multiple iterations
+
+## What Fails ❌
+
+- Missing dependencies
+- Invalid API keys
+- Agent crashes
+- Import errors
+- Agent produces no results
+
+## Install
+
+```bash
+pip install -e libs/python/agent -e libs/python/computer
+export ANTHROPIC_API_KEY="your-key"
+```
+
+## Example Output
+
+```
+🤖 Testing CUA Agent: anthropic/claude-sonnet-4-20250514
+==================================================
+✅ CUA Agent created
+✅ Mock computer ready
+🚀 Running agent...
+
+Iteration 1:
+  Agent: I'll click on Safari to open it.
+  Tool: click {'x': 125, 'y': 975}
+
+Iteration 2:
+  Agent: Safari didn't open, let me try again.
+  Tool: click {'x': 125, 'y': 975}
+
+Iteration 3:
+  Agent: This appears to be a static test environment.
+
+==================================================
+🎉 TEST COMPLETE!
+==================================================
+✅ Model: anthropic/claude-sonnet-4-20250514
+✅ Iterations: 3
+✅ Screenshots: 3
+✅ Agent executed successfully
+```
\ No newline at end of file
diff --git a/tests/agent_loop_testing/agent_test.py b/tests/agent_loop_testing/agent_test.py
new file mode 100644
index 00000000..b31c8249
--- /dev/null
+++ b/tests/agent_loop_testing/agent_test.py
@@ -0,0 +1,237 @@
+#!/usr/bin/env python3
+"""
+Simple CUA Agent Test
+
+Tests the actual CUA ComputerAgent SDK with a mock computer.
+Only provides screenshot functionality - no complex computer actions.
+"""
+
+import asyncio
+import base64
+import sys
+import traceback
+from io import BytesIO
+from pathlib import Path
+
+from PIL import Image, ImageDraw
+
+# Add project root to path
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+# Default cap on agent iterations, so a model that never finishes cannot loop forever.
+DEFAULT_MAX_ITERATIONS = 5
+
+
+class MockComputer:
+    """Mock computer that only provides screenshots.
+
+    Every screenshot returns the same pre-rendered desktop image, and every input
+    action is a no-op that just sleeps briefly, so the screen never changes.
+    """
+
+    def __init__(self):
+        # Number of screenshots served so far (reported in the test summary).
+        self.action_count = 0
+        self._image = self._create_image()
+
+    def _create_image(self) -> str:
+        """Create a simple desktop image and return it as a base64-encoded PNG."""
+        img = Image.new("RGB", (1920, 1080), color="lightblue")
+        draw = ImageDraw.Draw(img)
+
+        # Draw Safari icon
+        draw.rectangle([100, 950, 150, 1000], fill="blue", outline="black", width=2)
+        draw.text((110, 960), "Safari", fill="white")
+
+        # Draw Terminal icon
+        draw.rectangle([200, 950, 250, 1000], fill="green", outline="black", width=2)
+        draw.text((210, 960), "Terminal", fill="white")
+
+        # Convert to base64
+        img_bytes = BytesIO()
+        img.save(img_bytes, format="PNG")
+        return base64.b64encode(img_bytes.getvalue()).decode("utf-8")
+
+    async def screenshot(self) -> str:
+        """Return the cached desktop image and count the request."""
+        self.action_count += 1
+        return self._image
+
+    async def get_dimensions(self) -> tuple[int, int]:
+        return (1920, 1080)
+
+    # All other methods are no-ops (required by CUA interface)
+    async def click(self, x: int, y: int, button: str = "left") -> None:
+        await asyncio.sleep(0.1)
+
+    async def double_click(self, x: int, y: int) -> None:
+        await asyncio.sleep(0.1)
+
+    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
+        await asyncio.sleep(0.1)
+
+    async def type(self, text: str) -> None:
+        await asyncio.sleep(0.1)
+
+    async def wait(self, ms: int = 1000) -> None:
+        await asyncio.sleep(ms / 1000.0)
+
+    async def move(self, x: int, y: int) -> None:
+        await asyncio.sleep(0.1)
+
+    async def keypress(self, keys) -> None:
+        await asyncio.sleep(0.1)
+
+    async def drag(self, path) -> None:
+        await asyncio.sleep(0.1)
+
+    async def get_current_url(self) -> str:
+        return "desktop://mock"
+
+    async def get_environment(self) -> str:
+        return "mac"
+
+    # Required abstract methods
+    async def left_mouse_down(self, x: int = 0, y: int = 0) -> None:
+        await asyncio.sleep(0.1)
+
+    async def left_mouse_up(self, x: int = 0, y: int = 0) -> None:
+        await asyncio.sleep(0.1)
+
+    async def right_mouse_down(self, x: int = 0, y: int = 0) -> None:
+        await asyncio.sleep(0.1)
+
+    async def right_mouse_up(self, x: int = 0, y: int = 0) -> None:
+        await asyncio.sleep(0.1)
+
+    async def mouse_move(self, x: int, y: int) -> None:
+        await asyncio.sleep(0.1)
+
+    async def key_down(self, key: str) -> None:
+        await asyncio.sleep(0.1)
+
+    async def key_up(self, key: str) -> None:
+        await asyncio.sleep(0.1)
+
+    async def type_text(self, text: str) -> None:
+        await asyncio.sleep(0.1)
+
+
+def _message_text(item: dict) -> str:
+    """Return the first text part of a message item, or an empty string if none."""
+    content = item.get("content") or []
+    if isinstance(content, str):
+        return content
+    for part in content:
+        if isinstance(part, dict) and part.get("text"):
+            return part["text"]
+    return ""
+
+
+async def test_cua_agent(model_name: str, max_iterations: int = DEFAULT_MAX_ITERATIONS) -> bool:
+    """Test CUA agent with mock computer.
+
+    Args:
+        model_name: Model identifier passed to ``ComputerAgent``.
+        max_iterations: Safety limit on how many agent results are consumed.
+
+    Returns:
+        True if the agent ran and yielded at least one result; False on an import
+        error, on any other exception, or if the agent yielded nothing.
+    """
+    print(f"🤖 Testing CUA Agent: {model_name}")
+    print("=" * 50)
+
+    try:
+        # Import the real CUA agent
+        from agent import ComputerAgent
+
+        # Create mock computer
+        mock_computer = MockComputer()
+
+        # Create the real CUA ComputerAgent
+        agent = ComputerAgent(model=model_name, tools=[mock_computer], max_trajectory_budget=5.0)
+
+        print("✅ CUA Agent created")
+        print("✅ Mock computer ready")
+        print("🚀 Running agent...")
+        print()
+
+        # Run the agent with a specific task
+        message = "Open Safari browser"
+
+        iteration = 0
+        async for result in agent.run([{"role": "user", "content": message}]):
+            iteration += 1
+            print(f"Iteration {iteration}:")
+
+            # Print agent output
+            output_items = result.get("output", [])
+            if not output_items:
+                print(" (No output from agent)")
+                # Debug: print full result for empty iterations
+                print(f" Debug - Full result: {result}")
+
+            # NOTE(review): "message" / "tool_call" are the item types this script
+            # expects; confirm them against the agent SDK. Anything else is printed
+            # verbatim so it is still visible in the log.
+            for item in output_items:
+                item_type = item.get("type")
+                if item_type == "message":
+                    print(f" Agent: {_message_text(item)}")
+                elif item_type == "tool_call":
+                    print(f" Tool: {item.get('tool_name')} {item.get('arguments')}")
+                else:
+                    print(f" Unknown output type: {item}")
+
+            # Let the agent decide when to stop (it should try to complete the task)
+            # Only stop after max_iterations to prevent infinite loops
+            if iteration >= max_iterations:
+                print(f"🏁 Stopping after {max_iterations} iterations (safety limit)")
+                break
+
+        # A run that yields nothing has not exercised the agent loop at all.
+        if iteration == 0:
+            print("❌ Agent produced no results")
+            return False
+
+        print()
+        print("=" * 50)
+        print("🎉 TEST COMPLETE!")
+        print("=" * 50)
+        print(f"✅ Model: {model_name}")
+        print(f"✅ Iterations: {iteration}")
+        print(f"✅ Screenshots: {mock_computer.action_count}")
+        print("✅ Agent executed successfully")
+
+        return True
+
+    except ImportError as e:
+        print(f"❌ Import error: {e}")
+        print("💡 Install CUA: pip install -e libs/python/agent -e libs/python/computer")
+        return False
+    except Exception as e:
+        print(f"❌ Test failed: {e}")
+        # Keep the traceback in the CI log; the message alone rarely locates the fault.
+        traceback.print_exc()
+        return False
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Test CUA Agent with mock computer")
+    parser.add_argument(
+        "--model", default="anthropic/claude-sonnet-4-20250514", help="CUA model to test"
+    )
+    parser.add_argument(
+        "--max-iterations",
+        type=int,
+        default=DEFAULT_MAX_ITERATIONS,
+        help="Stop after this many agent iterations (safety limit)",
+    )
+    args = parser.parse_args()
+
+    success = asyncio.run(test_cua_agent(args.model, args.max_iterations))
+    sys.exit(0 if success else 1)