diff --git a/.all-contributorsrc b/.all-contributorsrc deleted file mode 100644 index 5969251d..00000000 --- a/.all-contributorsrc +++ /dev/null @@ -1,183 +0,0 @@ -{ - "projectName": "cua", - "projectOwner": "trycua", - "files": [ - "README.md" - ], - "commitType": "docs", - "commitConvention": "angular", - "contributorsPerLine": 7, - "contributors": [ - { - "login": "f-trycua", - "name": "f-trycua", - "avatar_url": "https://avatars.githubusercontent.com/u/195596869?v=4", - "profile": "https://github.com/f-trycua", - "contributions": [ - "code" - ] - }, - { - "login": "pepicrft", - "name": "Pedro Piñera Buendía", - "avatar_url": "https://avatars.githubusercontent.com/u/663605?v=4", - "profile": "http://pepicrft.me", - "contributions": [ - "code" - ] - }, - { - "login": "aktech", - "name": "Amit Kumar", - "avatar_url": "https://avatars.githubusercontent.com/u/5647941?v=4", - "profile": "https://iamit.in", - "contributions": [ - "code" - ] - }, - { - "login": "jellydn", - "name": "Dung Duc Huynh (Kaka)", - "avatar_url": "https://avatars.githubusercontent.com/u/870029?v=4", - "profile": "https://productsway.com/", - "contributions": [ - "code" - ] - }, - { - "login": "ShrootBuck", - "name": "Zayd Krunz", - "avatar_url": "https://avatars.githubusercontent.com/u/70227235?v=4", - "profile": "http://zaydkrunz.com", - "contributions": [ - "code" - ] - }, - { - "login": "PrashantRaj18198", - "name": "Prashant Raj", - "avatar_url": "https://avatars.githubusercontent.com/u/23168997?v=4", - "profile": "https://github.com/PrashantRaj18198", - "contributions": [ - "code" - ] - }, - { - "login": "Leland-Takamine", - "name": "Leland Takamine", - "avatar_url": "https://avatars.githubusercontent.com/u/847683?v=4", - "profile": "https://www.mobile.dev", - "contributions": [ - "code" - ] - }, - { - "login": "ddupont808", - "name": "ddupont", - "avatar_url": "https://avatars.githubusercontent.com/u/3820588?v=4", - "profile": "https://github.com/ddupont808", - "contributions": [ - "code" - ] - }, - { - "login": "Lizzard1123", - "name": "Ethan Gutierrez", - "avatar_url": "https://avatars.githubusercontent.com/u/46036335?v=4", - "profile": "https://github.com/Lizzard1123", - "contributions": [ - "code" - ] - }, - { - "login": "RicterZ", - "name": "Ricter Zheng", - "avatar_url": "https://avatars.githubusercontent.com/u/5282759?v=4", - "profile": "https://ricterz.me", - "contributions": [ - "code" - ] - }, - { - "login": "rahulkarajgikar", - "name": "Rahul Karajgikar", - "avatar_url": "https://avatars.githubusercontent.com/u/50844303?v=4", - "profile": "https://www.trytruffle.ai/", - "contributions": [ - "code" - ] - }, - { - "login": "trospix", - "name": "trospix", - "avatar_url": "https://avatars.githubusercontent.com/u/81363696?v=4", - "profile": "https://github.com/trospix", - "contributions": [ - "code" - ] - }, - { - "login": "eltociear", - "name": "Ikko Eltociear Ashimine", - "avatar_url": "https://avatars.githubusercontent.com/u/22633385?v=4", - "profile": "https://wavee.world/invitation/b96d00e6-b802-4a1b-8a66-2e3854a01ffd", - "contributions": [ - "code" - ] - }, - { - "login": "dp221125", - "name": "한석호(MilKyo)", - "avatar_url": "https://avatars.githubusercontent.com/u/10572119?v=4", - "profile": "https://github.com/dp221125", - "contributions": [ - "code" - ] - }, - { - "login": "rahimnathwani", - "name": "Rahim Nathwani", - "avatar_url": "https://avatars.githubusercontent.com/u/891558?v=4", - "profile": "https://www.encona.com/", - "contributions": [ - "code" - ] - }, - { - "login": "mjspeck", - "name": 
"Matt Speck", - "avatar_url": "https://avatars.githubusercontent.com/u/20689127?v=4", - "profile": "https://mjspeck.github.io/", - "contributions": [ - "code" - ] - }, - { - "login": "FinnBorge", - "name": "FinnBorge", - "avatar_url": "https://avatars.githubusercontent.com/u/9272726?v=4", - "profile": "https://github.com/FinnBorge", - "contributions": [ - "code" - ] - }, - { - "login": "jklapacz", - "name": "Jakub Klapacz", - "avatar_url": "https://avatars.githubusercontent.com/u/5343758?v=4", - "profile": "https://github.com/jklapacz", - "contributions": [ - "code" - ] - }, - { - "login": "evnsnclr", - "name": "Evan smith", - "avatar_url": "https://avatars.githubusercontent.com/u/139897548?v=4", - "profile": "https://github.com/evnsnclr", - "contributions": [ - "code" - ] - } - ] -} diff --git a/.github/workflows/pypi-publish-pylume.yml b/.github/workflows/pypi-publish-pylume.yml deleted file mode 100644 index 91278c00..00000000 --- a/.github/workflows/pypi-publish-pylume.yml +++ /dev/null @@ -1,82 +0,0 @@ -name: Publish Pylume Package - -on: - push: - tags: - - "pylume-v*" - workflow_dispatch: - inputs: - version: - description: "Version to publish (without v prefix)" - required: true - default: "0.1.0" - workflow_call: - inputs: - version: - description: "Version to publish" - required: true - type: string - outputs: - version: - description: "The version that was published" - value: ${{ jobs.determine-version.outputs.version }} - -# Adding permissions at workflow level -permissions: - contents: write - -jobs: - determine-version: - runs-on: macos-latest - outputs: - version: ${{ steps.get-version.outputs.version }} - steps: - - uses: actions/checkout@v4 - - - name: Determine version - id: get-version - run: | - if [ "${{ github.event_name }}" == "push" ]; then - # Extract version from tag (for package-specific tags) - if [[ "${{ github.ref }}" =~ ^refs/tags/pylume-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then - VERSION=${BASH_REMATCH[1]} - else - echo "Invalid tag format for pylume" - exit 1 - fi - elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then - # Use version from workflow dispatch - VERSION=${{ github.event.inputs.version }} - else - # Use version from workflow_call - VERSION=${{ inputs.version }} - fi - echo "VERSION=$VERSION" - echo "version=$VERSION" >> $GITHUB_OUTPUT - - validate-version: - runs-on: macos-latest - needs: determine-version - steps: - - uses: actions/checkout@v4 - - name: Validate version - id: validate-version - run: | - CODE_VERSION=$(grep '__version__' libs/python/pylume/pylume/__init__.py | cut -d'"' -f2) - if [ "${{ needs.determine-version.outputs.version }}" != "$CODE_VERSION" ]; then - echo "Version mismatch: expected $CODE_VERSION, got ${{ needs.determine-version.outputs.version }}" - exit 1 - fi - echo "Version validated: $CODE_VERSION" - - publish: - needs: determine-version - uses: ./.github/workflows/pypi-reusable-publish.yml - with: - package_name: "pylume" - package_dir: "libs/python/pylume" - version: ${{ needs.determine-version.outputs.version }} - is_lume_package: true - base_package_name: "pylume" - secrets: - PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} diff --git a/.github/workflows/pypi-reusable-publish.yml b/.github/workflows/pypi-reusable-publish.yml index 4a220610..614d8a7d 100644 --- a/.github/workflows/pypi-reusable-publish.yml +++ b/.github/workflows/pypi-reusable-publish.yml @@ -4,11 +4,11 @@ on: workflow_call: inputs: package_name: - description: "Name of the package (e.g. 
pylume, computer, agent)" + description: "Name of the package (e.g. computer, agent)" required: true type: string package_dir: - description: "Directory containing the package relative to workspace root (e.g. libs/python/pylume)" + description: "Directory containing the package relative to workspace root (e.g. libs/python/computer)" required: true type: string version: @@ -21,7 +21,7 @@ on: type: boolean default: false base_package_name: - description: "PyPI package name (e.g. pylume, cua-agent)" + description: "PyPI package name (e.g. cua-agent)" required: true type: string make_latest: diff --git a/.github/workflows/python-tests.yml b/.github/workflows/python-tests.yml new file mode 100644 index 00000000..6bfbed5c --- /dev/null +++ b/.github/workflows/python-tests.yml @@ -0,0 +1,93 @@ +name: Python Unit Tests + +on: + pull_request: + paths: + - "libs/python/**" + - ".github/workflows/python-tests.yml" + push: + branches: + - main + paths: + - "libs/python/**" + - ".github/workflows/python-tests.yml" + workflow_dispatch: # Allow manual trigger + +jobs: + test: + name: Test ${{ matrix.package }} + runs-on: ubuntu-latest + + strategy: + fail-fast: false # Test all packages even if one fails + matrix: + package: + - core + - agent + - computer + - computer-server + - mcp-server + - pylume + - som + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install uv + run: | + pip install uv + + - name: Install package and dependencies + run: | + cd libs/python/${{ matrix.package }} + # Install the package in editable mode with dev dependencies + if [ -f pyproject.toml ]; then + uv pip install --system -e . + # Install test dependencies + uv pip install --system pytest pytest-asyncio pytest-mock pytest-cov + fi + shell: bash + + - name: Run tests + run: | + cd libs/python/${{ matrix.package }} + if [ -d tests ]; then + python -m pytest tests/ -v --tb=short --cov --cov-report=term --cov-report=xml + else + echo "No tests directory found, skipping tests" + fi + shell: bash + env: + CUA_TELEMETRY_DISABLED: "1" # Disable telemetry during tests + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v4 + if: always() + with: + file: ./libs/python/${{ matrix.package }}/coverage.xml + flags: ${{ matrix.package }} + name: codecov-${{ matrix.package }} + fail_ci_if_error: false + continue-on-error: true + + summary: + name: Test Summary + runs-on: ubuntu-latest + needs: test + if: always() + + steps: + - name: Check test results + run: | + if [ "${{ needs.test.result }}" == "failure" ]; then + echo "❌ Some tests failed. Please check the logs above." + exit 1 + else + echo "✅ All tests passed!" 
+ fi diff --git a/.github/workflows/test-cua-models.yml b/.github/workflows/test-cua-models.yml new file mode 100644 index 00000000..646bf7ec --- /dev/null +++ b/.github/workflows/test-cua-models.yml @@ -0,0 +1,118 @@ +name: Test CUA Supporting Models + +# This workflow tests all supported CUA models with API keys +# Run manually using workflow_dispatch with test_models=true + +on: + pull_request: + branches: [ main, master ] + workflow_dispatch: + inputs: + test_models: + description: "Test all supported models (requires API keys)" + required: false + default: "true" + type: boolean + +jobs: + # Test all CUA models - runs on PRs or when manually triggered + test-all-models: + if: ${{ github.event_name == 'pull_request' || inputs.test_models == 'true' }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + model: + # Anthropic Claude Models + # - anthropic/claude-3-5-sonnet-20241022 + # - anthropic/claude-3-7-sonnet-20250219 + # - anthropic/claude-opus-4-20250514 + # - anthropic/claude-sonnet-4-20250514 + # - anthropic/claude-opus-4-1-20250805 + - anthropic/claude-sonnet-4-5-20250929 + # - anthropic/claude-haiku-4-5-20251001 + + # OpenAI Models + # - openai/computer-use-preview + + # Gemini Models + # - gemini-2.5-computer-use-preview-10-2025 + + # GLM-4.5V Models + # - openrouter/z-ai/glm-4.5v + + # UI-TARS Models + # - huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B + + # OpenCUA Models + # - huggingface-local/xlangai/OpenCUA-7B + # - huggingface-local/xlangai/OpenCUA-32B + + # GTA1 Family Models + # - huggingface-local/HelloKKMe/GTA1-7B + # - huggingface-local/HelloKKMe/GTA1-32B + # - huggingface-local/HelloKKMe/GTA1-72B + + # Holo 1.5 Family Models + # - huggingface-local/Hcompany/Holo1.5-3B + # - huggingface-local/Hcompany/Holo1.5-7B + # - huggingface-local/Hcompany/Holo1.5-72B + + # InternVL 3.5 Family Models + # - huggingface-local/OpenGVLab/InternVL3_5-1B + # - huggingface-local/OpenGVLab/InternVL3_5-2B + # - huggingface-local/OpenGVLab/InternVL3_5-4B + # - huggingface-local/OpenGVLab/InternVL3_5-8B + + # GLM-4.5V Local + # - huggingface-local/zai-org/GLM-4.5V + + # Composed Models (Grounding + Planning) + # - omniparser+anthropic/claude-3-5-sonnet-20241022 + # - omniparser+openai/gpt-4o-mini + # - moondream3+anthropic/claude-3-5-sonnet-20241022 + # - moondream3+openai/gpt-4o-mini + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y libgl1-mesa-dri libglib2.0-0 + + - name: Install CUA dependencies + run: | + pip install --upgrade pip + pip install -e libs/python/agent -e libs/python/computer + pip install -e libs/python/core + pip install "cua-agent[uitars-hf]" + pip install pytest + + - name: Set up environment variables + run: | + echo "ANTHROPIC_API_KEY=${{ secrets.ANTHROPIC_API_KEY }}" >> $GITHUB_ENV + echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> $GITHUB_ENV + echo "GOOGLE_API_KEY=${{ secrets.GOOGLE_API_KEY }}" >> $GITHUB_ENV + echo "OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY }}" >> $GITHUB_ENV + + - name: Test model with agent loop + run: | + cd tests/agent_loop_testing + python agent_test.py --model "${{ matrix.model }}" + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-results-${{ matrix.model }} + path: | + tests/agent_loop_testing/test_images/ + *.log + retention-days: 7 diff --git 
a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e1523f92..d9475d42 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ repos: entry: prettier --write language: node additional_dependencies: ["prettier@3.6.2"] - files: \.(ts|tsx|js|jsx|json|md|yaml|yml)$ + files: \.(ts|tsx|js|jsx|json|md|mdx|yaml|yml)$ - repo: local hooks: diff --git a/.vscode/launch.json b/.vscode/launch.json index acfd84b2..58701566 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -10,7 +10,7 @@ "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { - "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" + "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som" } }, { @@ -23,7 +23,7 @@ "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { - "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" + "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som" } }, { @@ -36,7 +36,7 @@ "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { - "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" + "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som" } }, { @@ -49,20 +49,7 @@ "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { - "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" - } - }, - { - "name": "Run PyLume Examples", - "type": "debugpy", - "request": "launch", - "program": "examples/pylume_examples.py", - "console": "integratedTerminal", - "justMyCode": true, - "python": "${workspaceFolder:cua-root}/.venv/bin/python", - "cwd": "${workspaceFolder:cua-root}", - "env": { - "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" + "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som" } }, { @@ -84,7 +71,7 @@ "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { - "PYTHONPATH": 
"${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" + "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som" } }, { @@ -106,7 +93,7 @@ "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { - "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" + "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som" } }, { @@ -119,7 +106,7 @@ "python": "${workspaceFolder:cua-root}/.venv/bin/python", "cwd": "${workspaceFolder:cua-root}", "env": { - "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" + "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som" } }, { diff --git a/.vscode/py.code-workspace b/.vscode/py.code-workspace index 25324251..adb04695 100644 --- a/.vscode/py.code-workspace +++ b/.vscode/py.code-workspace @@ -20,10 +20,6 @@ "name": "computer-server", "path": "../libs/python/computer-server" }, - { - "name": "pylume", - "path": "../libs/python/pylume" - }, { "name": "core", "path": "../libs/python/core" @@ -51,7 +47,6 @@ "${workspaceFolder:cua-root}/libs/python/computer", "${workspaceFolder:cua-root}/libs/python/agent", "${workspaceFolder:cua-root}/libs/python/som", - "${workspaceFolder:cua-root}/libs/python/pylume", "${workspaceFolder:cua-root}/.vscode/typings" ], "python.envFile": "${workspaceFolder:cua-root}/.env", @@ -89,10 +84,6 @@ "name": "som", "depth": 2 }, - { - "name": "pylume", - "depth": 2 - }, { "name": "core", "depth": 2 @@ -103,7 +94,6 @@ "${workspaceFolder:cua-root}/libs/python/computer", "${workspaceFolder:cua-root}/libs/python/agent", "${workspaceFolder:cua-root}/libs/python/som", - "${workspaceFolder:cua-root}/libs/python/pylume" ], "python.languageServer": "None", "[python]": { diff --git a/Dockerfile b/Dockerfile index 9b9f3c47..579842a4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ ENV PYTHONUNBUFFERED=1 \ PYTHONDONTWRITEBYTECODE=1 \ PIP_NO_CACHE_DIR=1 \ PIP_DISABLE_PIP_VERSION_CHECK=1 \ - PYTHONPATH="/app/libs/python/core:/app/libs/python/computer:/app/libs/python/agent:/app/libs/python/som:/app/libs/python/pylume:/app/libs/python/computer-server:/app/libs/python/mcp-server" + PYTHONPATH="/app/libs/python/core:/app/libs/python/computer:/app/libs/python/agent:/app/libs/python/som:/app/libs/python/computer-server:/app/libs/python/mcp-server" # Install system dependencies for ARM architecture RUN apt-get update && apt-get install -y --no-install-recommends \ diff --git a/README.md b/README.md index 2a43f3b7..236f04d4 100644 --- a/README.md +++ b/README.md @@ -207,17 +207,17 @@ The following table shows which capabilities are 
supported by each model: | Model | Computer-Use | Grounding | Tools | VLM | | -------------------------------------------------------------------------------------------------------------------------------- | :----------: | :-------: | :---: | :-: | -| [Claude Sonnet/Haiku](https://docs.claude.com/en/docs/agents-and-tools/tool-use/computer-use-tool#how-to-implement-computer-use) | ✓ | ✓ | ✓ | ✓ | -| [OpenAI CU Preview](https://platform.openai.com/docs/models/computer-use-preview) | ✓ | ✓ | | ✓ | -| [GLM-V](https://huggingface.co/THUDM/glm-4v-9b) | ✓ | ✓ | ✓ | ✓ | -| [Gemini CU Preview](https://ai.google.dev/gemini-api/docs/computer-use) | ✓ | ✓ | | ✓ | -| [InternVL](https://huggingface.co/OpenGVLab/InternVL3_5-1B) | ✓ | ✓ | ✓ | ✓ | -| [UI-TARS](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B) | ✓ | ✓ | ✓ | ✓ | -| [OpenCUA](https://huggingface.co/xlangai/OpenCUA-7B) | | ✓ | | | -| [GTA](https://huggingface.co/HelloKKMe/GTA1-7B) | | ✓ | | | -| [Holo](https://huggingface.co/Hcompany/Holo1.5-3B) | | ✓ | | | -| [Moondream](https://huggingface.co/moondream/moondream3-preview) | | ✓ | | | -| [OmniParser](https://github.com/microsoft/OmniParser) | | ✓ | | | +| [Claude Sonnet/Haiku](https://docs.claude.com/en/docs/agents-and-tools/tool-use/computer-use-tool#how-to-implement-computer-use) | 🖥️ | 🎯 | 🛠️ | 👁️ | +| [OpenAI CU Preview](https://platform.openai.com/docs/models/computer-use-preview) | 🖥️ | 🎯 | | 👁️ | +| [GLM-V](https://huggingface.co/THUDM/glm-4v-9b) | 🖥️ | 🎯 | 🛠️ | 👁️ | +| [Gemini CU Preview](https://ai.google.dev/gemini-api/docs/computer-use) | 🖥️ | 🎯 | | 👁️ | +| [InternVL](https://huggingface.co/OpenGVLab/InternVL3_5-1B) | 🖥️ | 🎯 | 🛠️ | 👁️ | +| [UI-TARS](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B) | 🖥️ | 🎯 | 🛠️ | 👁️ | +| [OpenCUA](https://huggingface.co/xlangai/OpenCUA-7B) | | 🎯 | | | +| [GTA](https://huggingface.co/HelloKKMe/GTA1-7B) | | 🎯 | | | +| [Holo](https://huggingface.co/Hcompany/Holo1.5-3B) | | 🎯 | | | +| [Moondream](https://huggingface.co/moondream/moondream3-preview) | | 🎯 | | | +| [OmniParser](https://github.com/microsoft/OmniParser) | | 🎯 | | | ### Model IDs diff --git a/TESTING.md b/TESTING.md new file mode 100644 index 00000000..75deb04c --- /dev/null +++ b/TESTING.md @@ -0,0 +1,106 @@ +# Testing Guide for CUA + +Quick guide to running tests and understanding the test architecture. + +## 🚀 Quick Start + +```bash +# Install dependencies +pip install pytest pytest-asyncio pytest-mock pytest-cov + +# Install package +cd libs/python/core +pip install -e . + +# Run tests +export CUA_TELEMETRY_DISABLED=1 # or $env:CUA_TELEMETRY_DISABLED="1" on Windows +pytest tests/ -v +``` + +## 🧪 Running Tests + +```bash +# All packages +pytest libs/python/*/tests/ -v + +# Specific package +cd libs/python/core && pytest tests/ -v + +# With coverage +pytest tests/ --cov --cov-report=html + +# Specific test +pytest tests/test_telemetry.py::TestTelemetryEnabled::test_telemetry_enabled_by_default -v +``` + +## 🏗️ Test Architecture + +**Principles**: SRP (Single Responsibility) + Vertical Slices + Testability + +``` +libs/python/ +├── core/tests/ # Tests ONLY core +├── agent/tests/ # Tests ONLY agent +└── computer/tests/ # Tests ONLY computer +``` + +Each test file = ONE feature. Each test class = ONE concern. + +## ➕ Adding New Tests + +1. Create `test_*.py` in the appropriate package's `tests/` directory +2. 
Follow the pattern: + +```python +"""Unit tests for my_feature.""" +import pytest +from unittest.mock import patch + +class TestMyFeature: + """Test MyFeature class.""" + + def test_initialization(self): + """Test that feature initializes.""" + from my_package import MyFeature + feature = MyFeature() + assert feature is not None +``` + +3. Mock external dependencies: + +```python +@pytest.fixture +def mock_api(): + with patch("my_package.api_client") as mock: + yield mock +``` + +## 🔄 CI/CD + +Tests run automatically on every PR via GitHub Actions (`.github/workflows/python-tests.yml`): + +- Matrix strategy: each package tested separately +- Python 3.12 +- ~2 minute runtime + +## 🐛 Troubleshooting + +**ModuleNotFoundError**: Run `pip install -e .` in package directory + +**Tests fail in CI but pass locally**: Set `CUA_TELEMETRY_DISABLED=1` + +**Async tests error**: Install `pytest-asyncio` and use `@pytest.mark.asyncio` + +**Mock not working**: Patch at usage location, not definition: + +```python +# ✅ Right +@patch("my_package.module.external_function") + +# ❌ Wrong +@patch("external_library.function") +``` + +--- + +**Questions?** Check existing tests for examples or open an issue. diff --git a/docs/content/docs/agent-sdk/agent-loops.mdx b/docs/content/docs/agent-sdk/agent-loops.mdx index 08dcf07b..625509b7 100644 --- a/docs/content/docs/agent-sdk/agent-loops.mdx +++ b/docs/content/docs/agent-sdk/agent-loops.mdx @@ -3,7 +3,13 @@ title: Agent Loops description: Supported computer-using agent loops and models --- -A corresponding Jupyter Notebook is available for this documentation. + + A corresponding{' '} + + Jupyter Notebook + {' '} + is available for this documentation. + An agent can be thought of as a loop - it generates actions, executes them, and repeats until done: @@ -102,7 +108,7 @@ messages = [ "content": "Take a screenshot and describe what you see" }, { - "role": "assistant", + "role": "assistant", "content": "I'll take a screenshot for you." } ] diff --git a/docs/content/docs/agent-sdk/benchmarks/index.mdx b/docs/content/docs/agent-sdk/benchmarks/index.mdx index 6397b2ec..685a8f92 100644 --- a/docs/content/docs/agent-sdk/benchmarks/index.mdx +++ b/docs/content/docs/agent-sdk/benchmarks/index.mdx @@ -4,13 +4,14 @@ description: Computer Agent SDK benchmarks for agentic GUI tasks --- The benchmark system evaluates models on GUI grounding tasks, specifically agent loop success rate and click prediction accuracy. 
It supports both: + - **Computer Agent SDK providers** (using model strings like `"huggingface-local/HelloKKMe/GTA1-7B"`) - **Reference agent implementations** (custom model classes implementing the `ModelProtocol`) ## Available Benchmarks - **[ScreenSpot-v2](./benchmarks/screenspot-v2)** - Standard resolution GUI grounding -- **[ScreenSpot-Pro](./benchmarks/screenspot-pro)** - High-resolution GUI grounding +- **[ScreenSpot-Pro](./benchmarks/screenspot-pro)** - High-resolution GUI grounding - **[Interactive Testing](./benchmarks/interactive)** - Real-time testing and visualization ## Quick Start diff --git a/docs/content/docs/agent-sdk/benchmarks/introduction.mdx b/docs/content/docs/agent-sdk/benchmarks/introduction.mdx index 7f15b6a8..67a90769 100644 --- a/docs/content/docs/agent-sdk/benchmarks/introduction.mdx +++ b/docs/content/docs/agent-sdk/benchmarks/introduction.mdx @@ -8,6 +8,7 @@ The Cua agent framework uses benchmarks to test the performance of supported mod ## Benchmark Types Computer-Agent benchmarks evaluate two key capabilities: + - **Plan Generation**: Breaking down complex tasks into a sequence of actions - **Coordinate Generation**: Predicting precise click locations on GUI elements @@ -31,7 +32,7 @@ agent.run("Open Firefox and go to github.com") ### Coordinate Generation Only -**[GUI Agent Grounding Leaderboard](https://gui-agent.github.io/grounding-leaderboard/)** - Benchmark for click prediction accuracy +**[GUI Agent Grounding Leaderboard](https://gui-agent.github.io/grounding-leaderboard/)** - Benchmark for click prediction accuracy This leaderboard tests models that specialize in finding exactly where to click on screen elements, but needs to be told what specific action to take. @@ -41,7 +42,7 @@ This leaderboard tests models that specialize in finding exactly where to click agent = ComputerAgent("huggingface-local/HelloKKMe/GTA1-7B", tools=[computer]) agent.predict_click("find the button to open the settings") # (27, 450) # This will raise an error: -# agent.run("Open Firefox and go to github.com") +# agent.run("Open Firefox and go to github.com") ``` ### Composed Agent diff --git a/docs/content/docs/agent-sdk/benchmarks/osworld-verified.mdx b/docs/content/docs/agent-sdk/benchmarks/osworld-verified.mdx index 1bfcfeea..26e4b7e4 100644 --- a/docs/content/docs/agent-sdk/benchmarks/osworld-verified.mdx +++ b/docs/content/docs/agent-sdk/benchmarks/osworld-verified.mdx @@ -5,4 +5,4 @@ description: Benchmark ComputerAgent on OSWorld tasks using HUD OSWorld-Verified is a curated subset of OSWorld tasks that can be run using the HUD framework. -Use [ComputerAgent with HUD](../integrations/hud) to benchmark on these tasks. \ No newline at end of file +Use [ComputerAgent with HUD](../integrations/hud) to benchmark on these tasks. diff --git a/docs/content/docs/agent-sdk/benchmarks/screenspot-pro.mdx b/docs/content/docs/agent-sdk/benchmarks/screenspot-pro.mdx index 402b919e..15739d9d 100644 --- a/docs/content/docs/agent-sdk/benchmarks/screenspot-pro.mdx +++ b/docs/content/docs/agent-sdk/benchmarks/screenspot-pro.mdx @@ -18,8 +18,8 @@ python ss-pro.py --samples 50 ## Results -| Model | Accuracy | Failure Rate | Samples | -|-------|----------|--------------|---------| -| Coming Soon | - | - | - | +| Model | Accuracy | Failure Rate | Samples | +| ----------- | -------- | ------------ | ------- | +| Coming Soon | - | - | - | Results will be populated after running benchmarks with various models. 
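The benchmarks introduction in this patch contrasts grounding-only models (which support `predict_click()` but raise on `run()`) with full computer-use agents, and the composed-agents page later in the same patch shows the `+` model-string syntax that bridges the two. A minimal sketch of that contrast follows; it assumes (not confirmed by this diff) that `ComputerAgent` is importable from the `agent` package and that `predict_click()` is awaitable like the other agent methods, and it expects an already-configured `computer` tool.

```python
# Minimal sketch: grounding-only model vs. composed "grounding+planning" model.
# Assumptions (not confirmed by this diff): ComputerAgent is importable from the
# `agent` package, predict_click() is awaitable, and `computer` is an
# already-configured Computer tool.
from agent import ComputerAgent


async def demo(computer):
    # Grounding-only model: click prediction works, but run() would raise.
    grounder = ComputerAgent("huggingface-local/HelloKKMe/GTA1-7B", tools=[computer])
    x, y = await grounder.predict_click("find the button to open the settings")
    print(f"Would click at ({x}, {y})")

    # Composed model string (grounding + planning) restores the full agent loop.
    composed = ComputerAgent(
        "huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-3-5-sonnet-20241022",
        tools=[computer],
    )
    async for _ in composed.run("Open Firefox and go to github.com"):
        pass
```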
diff --git a/docs/content/docs/agent-sdk/benchmarks/screenspot-v2.mdx b/docs/content/docs/agent-sdk/benchmarks/screenspot-v2.mdx index 6cfcf1c1..ba78d5f9 100644 --- a/docs/content/docs/agent-sdk/benchmarks/screenspot-v2.mdx +++ b/docs/content/docs/agent-sdk/benchmarks/screenspot-v2.mdx @@ -18,8 +18,8 @@ python ss-v2.py --samples 100 ## Results -| Model | Accuracy | Failure Rate | Samples | -|-------|----------|--------------|---------| -| Coming Soon | - | - | - | +| Model | Accuracy | Failure Rate | Samples | +| ----------- | -------- | ------------ | ------- | +| Coming Soon | - | - | - | Results will be populated after running benchmarks with various models. diff --git a/docs/content/docs/agent-sdk/callbacks/agent-lifecycle.mdx b/docs/content/docs/agent-sdk/callbacks/agent-lifecycle.mdx index 494c4a8f..1fb4afe7 100644 --- a/docs/content/docs/agent-sdk/callbacks/agent-lifecycle.mdx +++ b/docs/content/docs/agent-sdk/callbacks/agent-lifecycle.mdx @@ -10,30 +10,39 @@ Callbacks provide hooks into the agent lifecycle for extensibility. They're call ## Callback Lifecycle ### 1. `on_run_start(kwargs, old_items)` + Called once when agent run begins. Initialize tracking, logging, or state. ### 2. `on_run_continue(kwargs, old_items, new_items)` → bool + Called before each iteration. Return `False` to stop execution (e.g., budget limits). ### 3. `on_llm_start(messages)` → messages + Preprocess messages before LLM call. Use for PII anonymization, image retention. ### 4. `on_api_start(kwargs)` + Called before each LLM API call. ### 5. `on_api_end(kwargs, result)` + Called after each LLM API call completes. ### 6. `on_usage(usage)` + Called when usage information is received from LLM. ### 7. `on_llm_end(messages)` → messages + Postprocess messages after LLM call. Use for PII deanonymization. ### 8. `on_responses(kwargs, responses)` + Called when responses are received from agent loop. ### 9. Response-specific hooks: + - `on_text(item)` - Text messages - `on_computer_call_start(item)` - Before computer actions - `on_computer_call_end(item, result)` - After computer actions @@ -42,4 +51,5 @@ Called when responses are received from agent loop. - `on_screenshot(screenshot, name)` - When screenshots are taken ### 10. `on_run_end(kwargs, old_items, new_items)` -Called when agent run completes. Finalize tracking, save trajectories. \ No newline at end of file + +Called when agent run completes. Finalize tracking, save trajectories. 
diff --git a/docs/content/docs/agent-sdk/callbacks/cost-saving.mdx b/docs/content/docs/agent-sdk/callbacks/cost-saving.mdx index 0787b1f6..4a76dc95 100644 --- a/docs/content/docs/agent-sdk/callbacks/cost-saving.mdx +++ b/docs/content/docs/agent-sdk/callbacks/cost-saving.mdx @@ -36,6 +36,7 @@ agent = ComputerAgent( ``` **Or with options:** + ```python # Advanced budget configuration agent = ComputerAgent( diff --git a/docs/content/docs/agent-sdk/callbacks/index.mdx b/docs/content/docs/agent-sdk/callbacks/index.mdx index 590c236a..71b63a2e 100644 --- a/docs/content/docs/agent-sdk/callbacks/index.mdx +++ b/docs/content/docs/agent-sdk/callbacks/index.mdx @@ -15,7 +15,7 @@ Built-in callbacks can be used as follows: ```python from agent.callbacks import ( ImageRetentionCallback, - TrajectorySaverCallback, + TrajectorySaverCallback, BudgetManagerCallback, LoggingCallback ) @@ -52,12 +52,12 @@ class CustomCallback(AsyncCallbackHandler): """Preprocess messages before LLM call""" # Add custom preprocessing logic return messages - + async def on_llm_end(self, messages): """Postprocess messages after LLM call""" # Add custom postprocessing logic return messages - + async def on_usage(self, usage): """Track usage information""" print(f"Tokens used: {usage.total_tokens}") diff --git a/docs/content/docs/agent-sdk/callbacks/logging.mdx b/docs/content/docs/agent-sdk/callbacks/logging.mdx index 8ab9b2e6..2ed3dda8 100644 --- a/docs/content/docs/agent-sdk/callbacks/logging.mdx +++ b/docs/content/docs/agent-sdk/callbacks/logging.mdx @@ -18,7 +18,7 @@ agent = ComputerAgent( tools=[computer], callbacks=[ LoggingCallback( - logger=logging.getLogger("cua"), + logger=logging.getLogger("cua"), level=logging.INFO ) ] @@ -47,7 +47,7 @@ class CustomLogger(AsyncCallbackHandler): def __init__(self, logger_name="agent"): self.logger = logging.getLogger(logger_name) self.logger.setLevel(logging.INFO) - + # Add console handler handler = logging.StreamHandler() formatter = logging.Formatter( @@ -55,18 +55,18 @@ class CustomLogger(AsyncCallbackHandler): ) handler.setFormatter(formatter) self.logger.addHandler(handler) - + async def on_run_start(self, kwargs, old_items): self.logger.info(f"Agent run started with model: {kwargs.get('model')}") - + async def on_computer_call_start(self, item): action = item.get('action', {}) self.logger.info(f"Computer action: {action.get('type')}") - + async def on_usage(self, usage): cost = usage.get('response_cost', 0) self.logger.info(f"API call cost: ${cost:.4f}") - + async def on_run_end(self, kwargs, old_items, new_items): self.logger.info("Agent run completed") @@ -81,6 +81,7 @@ agent = ComputerAgent( ## Available Hooks Log any agent event using these callback methods: + - `on_run_start/end` - Run lifecycle - `on_computer_call_start/end` - Computer actions - `on_api_start/end` - LLM API calls diff --git a/docs/content/docs/agent-sdk/callbacks/trajectories.mdx b/docs/content/docs/agent-sdk/callbacks/trajectories.mdx index 8118f217..b139d9a2 100644 --- a/docs/content/docs/agent-sdk/callbacks/trajectories.mdx +++ b/docs/content/docs/agent-sdk/callbacks/trajectories.mdx @@ -40,6 +40,7 @@ View trajectories in the browser at: **[trycua.com/trajectory-viewer](http://trycua.com/trajectory-viewer)** The viewer provides: + - Interactive conversation replay - Screenshot galleries - No data collection @@ -47,11 +48,13 @@ The viewer provides: ## Trajectory Structure Trajectories are saved with: + - Complete conversation history - Usage statistics and costs - Timestamps and metadata - Screenshots and 
computer actions Each trajectory contains: + - **metadata.json**: Run info, timestamps, usage stats (`total_tokens`, `response_cost`) - **turn_000/**: Turn-by-turn conversation history (api calls, responses, computer calls, screenshots) diff --git a/docs/content/docs/agent-sdk/custom-computer-handlers.mdx b/docs/content/docs/agent-sdk/custom-computer-handlers.mdx index e087fc21..c76a5d66 100644 --- a/docs/content/docs/agent-sdk/custom-computer-handlers.mdx +++ b/docs/content/docs/agent-sdk/custom-computer-handlers.mdx @@ -53,67 +53,67 @@ from typing import Literal, List, Dict, Union, Optional class MyCustomComputer(AsyncComputerHandler): """Custom computer handler implementation.""" - + def __init__(self): # Initialize your custom computer interface here pass - - # ==== Computer-Use-Preview Action Space ==== + + # ==== Computer-Use-Preview Action Space ==== async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]: """Get the current environment type.""" ... - + async def get_dimensions(self) -> tuple[int, int]: """Get screen dimensions as (width, height).""" ... - + async def screenshot(self) -> str: """Take a screenshot and return as base64 string.""" ... - + async def click(self, x: int, y: int, button: str = "left") -> None: """Click at coordinates with specified button.""" ... - + async def double_click(self, x: int, y: int) -> None: """Double click at coordinates.""" ... - + async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: """Scroll at coordinates with specified scroll amounts.""" ... - + async def type(self, text: str) -> None: """Type text.""" ... - + async def wait(self, ms: int = 1000) -> None: """Wait for specified milliseconds.""" ... - + async def move(self, x: int, y: int) -> None: """Move cursor to coordinates.""" ... - + async def keypress(self, keys: Union[List[str], str]) -> None: """Press key combination.""" ... - + async def drag(self, path: List[Dict[str, int]]) -> None: """Drag along specified path.""" ... - + async def get_current_url(self) -> str: """Get current URL (for browser environments).""" ... - - # ==== Anthropic Action Space ==== + + # ==== Anthropic Action Space ==== async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None: """Left mouse down at coordinates.""" ... - + async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None: """Left mouse up at coordinates.""" ... @@ -127,4 +127,4 @@ agent = ComputerAgent( ) await agent.run("Take a screenshot and click at coordinates 100, 200") -``` \ No newline at end of file +``` diff --git a/docs/content/docs/agent-sdk/customizing-computeragent.mdx b/docs/content/docs/agent-sdk/customizing-computeragent.mdx index dac0d35f..e7d3c030 100644 --- a/docs/content/docs/agent-sdk/customizing-computeragent.mdx +++ b/docs/content/docs/agent-sdk/customizing-computeragent.mdx @@ -2,7 +2,16 @@ title: Customizing Your ComputerAgent --- -A corresponding Jupyter Notebook is available for this documentation. + + A corresponding{' '} + + Jupyter Notebook + {' '} + is available for this documentation. + The `ComputerAgent` interface provides an easy proxy to any computer-using model configuration, and it is a powerful framework for extending and building your own agentic systems. 
@@ -118,4 +127,4 @@ await run_single_task( # tools=[your_custom_function], # callbacks=[YourCustomCallback()], ) -``` \ No newline at end of file +``` diff --git a/docs/content/docs/agent-sdk/integrations/hud.mdx b/docs/content/docs/agent-sdk/integrations/hud.mdx index f102e0a1..7bfcbdea 100644 --- a/docs/content/docs/agent-sdk/integrations/hud.mdx +++ b/docs/content/docs/agent-sdk/integrations/hud.mdx @@ -3,7 +3,13 @@ title: HUD Evals description: Use ComputerAgent with HUD for benchmarking and evaluation --- -A corresponding Jupyter Notebook is available for this documentation. + + A corresponding{' '} + + Jupyter Notebook + {' '} + is available for this documentation. + The HUD integration allows an agent to be benchmarked using the [HUD framework](https://www.hud.so/). Through the HUD integration, the agent controls a computer inside HUD, where tests are run to evaluate the success of each task. @@ -120,8 +126,8 @@ Both single-task and full-dataset runs share a common set of configuration optio HUD provides multiple benchmark datasets for realistic evaluation. 1. **[OSWorld-Verified](/agent-sdk/benchmarks/osworld-verified)** – Benchmark on 369+ real-world desktop tasks across Chrome, LibreOffice, GIMP, VS Code, etc. - *Best for*: evaluating full computer-use agents in realistic environments. - *Verified variant*: fixes 300+ issues from earlier versions for reliability. + _Best for_: evaluating full computer-use agents in realistic environments. + _Verified variant_: fixes 300+ issues from earlier versions for reliability. **Coming soon:** SheetBench (spreadsheet automation) and other specialized HUD datasets. @@ -129,7 +135,7 @@ See the [HUD docs](https://docs.hud.so/environment-creation) for more eval envir ## Tips -* **Debugging:** set `verbosity=2` to see every model call and tool action. -* **Performance:** lower `screenshot_delay` for faster runs; raise it if you see race conditions. -* **Safety:** always set `max_steps` (defaults to 50) to prevent runaway loops. -* **Custom tools:** pass extra `tools=[...]` into the agent config if you need beyond `openai_computer`. \ No newline at end of file +- **Debugging:** set `verbosity=2` to see every model call and tool action. +- **Performance:** lower `screenshot_delay` for faster runs; raise it if you see race conditions. +- **Safety:** always set `max_steps` (defaults to 50) to prevent runaway loops. +- **Custom tools:** pass extra `tools=[...]` into the agent config if you need beyond `openai_computer`. diff --git a/docs/content/docs/agent-sdk/migration-guide.mdx b/docs/content/docs/agent-sdk/migration-guide.mdx index 89ee706e..ec75ab7a 100644 --- a/docs/content/docs/agent-sdk/migration-guide.mdx +++ b/docs/content/docs/agent-sdk/migration-guide.mdx @@ -20,7 +20,9 @@ This guide lists **breaking changes** when migrating from the original `Computer ## Usage Examples: Old vs New ### 1. Anthropic Loop + **Old:** + ```python async with Computer() as computer: agent = ComputerAgent( @@ -31,7 +33,9 @@ async with Computer() as computer: async for result in agent.run("Take a screenshot"): print(result) ``` + **New:** + ```python async with Computer() as computer: agent = ComputerAgent( @@ -46,7 +50,9 @@ async with Computer() as computer: ``` ### 2. 
OpenAI Loop + **Old:** + ```python async with Computer() as computer: agent = ComputerAgent( @@ -57,7 +63,9 @@ async with Computer() as computer: async for result in agent.run("Take a screenshot"): print(result) ``` + **New:** + ```python async with Computer() as computer: agent = ComputerAgent( @@ -72,7 +80,9 @@ async with Computer() as computer: ``` ### 3. UI-TARS Loop + **Old:** + ```python async with Computer() as computer: agent = ComputerAgent( @@ -83,7 +93,9 @@ async with Computer() as computer: async for result in agent.run("Take a screenshot"): print(result) ``` + **New:** + ```python async with Computer() as computer: agent = ComputerAgent( @@ -98,7 +110,9 @@ async with Computer() as computer: ``` ### 4. Omni Loop + **Old:** + ```python async with Computer() as computer: agent = ComputerAgent( @@ -109,7 +123,9 @@ async with Computer() as computer: async for result in agent.run("Take a screenshot"): print(result) ``` + **New:** + ```python async with Computer() as computer: agent = ComputerAgent( diff --git a/docs/content/docs/agent-sdk/prompt-caching.mdx b/docs/content/docs/agent-sdk/prompt-caching.mdx index 721895c5..cdcf7db5 100644 --- a/docs/content/docs/agent-sdk/prompt-caching.mdx +++ b/docs/content/docs/agent-sdk/prompt-caching.mdx @@ -26,7 +26,7 @@ agent = ComputerAgent( When using Anthropic-based CUAs (Claude models), setting `use_prompt_caching=True` will automatically add `{ "cache_control": "ephemeral" }` to your messages. This enables prompt caching for the session and can speed up repeated runs with the same prompt. -This argument is only required for Anthropic CUAs. For other providers, it is ignored. + This argument is only required for Anthropic CUAs. For other providers, it is ignored. ## OpenAI Provider @@ -44,13 +44,16 @@ agent = ComputerAgent( ``` ## Implementation Details + - For Anthropic: Adds `{ "cache_control": "ephemeral" }` to messages when enabled. - For OpenAI: Caching is automatic for long prompts; the argument is ignored. ## When to Use + - Enable for Anthropic CUAs if you want to avoid reprocessing the same prompt in repeated or iterative tasks. - Not needed for OpenAI models unless you want explicit ephemeral cache control (not required for most users). 
## See Also + - [Agent Loops](./agent-loops) - [Migration Guide](./migration-guide) diff --git a/docs/content/docs/agent-sdk/supported-agents/composed-agents.mdx b/docs/content/docs/agent-sdk/supported-agents/composed-agents.mdx index 593ca84b..4e389365 100644 --- a/docs/content/docs/agent-sdk/supported-agents/composed-agents.mdx +++ b/docs/content/docs/agent-sdk/supported-agents/composed-agents.mdx @@ -59,7 +59,7 @@ Combine state-of-the-art grounding with powerful reasoning: ```python agent = ComputerAgent( - "huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-3-5-sonnet-20241022", + "huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-3-5-sonnet-20241022", tools=[computer] ) diff --git a/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx b/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx index a3384b21..9621e520 100644 --- a/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx +++ b/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx @@ -65,6 +65,7 @@ async for _ in agent.run("Click on the search bar and type 'hello world'"): ## InternVL 3.5 InternVL 3.5 family: + - `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` ```python @@ -76,6 +77,7 @@ async for _ in agent.run("Open Firefox and navigate to github.com"): ## Qwen3 VL Qwen3 VL family: + - `openrouter/qwen/qwen3-vl-235b-a22b-instruct` ```python diff --git a/docs/content/docs/agent-sdk/supported-agents/grounding-models.mdx b/docs/content/docs/agent-sdk/supported-agents/grounding-models.mdx index 20e95ddb..1f12de9a 100644 --- a/docs/content/docs/agent-sdk/supported-agents/grounding-models.mdx +++ b/docs/content/docs/agent-sdk/supported-agents/grounding-models.mdx @@ -17,9 +17,11 @@ All models that support `ComputerAgent.run()` also support `ComputerAgent.predic - Claude 3.5: `claude-3-5-sonnet-20241022` ### OpenAI CUA Preview + - Computer-use-preview: `computer-use-preview` ### UI-TARS 1.5 (Unified VLM with grounding support) + - `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` - `huggingface/ByteDance-Seed/UI-TARS-1.5-7B` (requires TGI endpoint) @@ -28,15 +30,19 @@ All models that support `ComputerAgent.run()` also support `ComputerAgent.predic These models are optimized specifically for click prediction and UI element grounding: ### OpenCUA + - `huggingface-local/xlangai/OpenCUA-{7B,32B}` ### GTA1 Family + - `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` ### Holo 1.5 Family + - `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` ### InternVL 3.5 Family + - `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` ### OmniParser (OCR) diff --git a/docs/content/docs/agent-sdk/supported-model-providers/index.mdx b/docs/content/docs/agent-sdk/supported-model-providers/index.mdx index 68e372b1..9177e712 100644 --- a/docs/content/docs/agent-sdk/supported-model-providers/index.mdx +++ b/docs/content/docs/agent-sdk/supported-model-providers/index.mdx @@ -5,6 +5,7 @@ title: Supported Model Providers ## Supported Models ### Anthropic Claude (Computer Use API) + ```python model="anthropic/claude-3-5-sonnet-20241022" model="anthropic/claude-3-7-sonnet-20250219" @@ -13,20 +14,23 @@ model="anthropic/claude-sonnet-4-20250514" ``` ### OpenAI Computer Use Preview + ```python model="openai/computer-use-preview" ``` ### UI-TARS (Local or Huggingface Inference) + ```python model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B" model="ollama_chat/0000/ui-tars-1.5-7b" ``` ### Omniparser + Any LLM + ```python model="omniparser+ollama_chat/mistral-small3.2" 
model="omniparser+vertex_ai/gemini-pro" model="omniparser+anthropic/claude-3-5-sonnet-20241022" model="omniparser+openai/gpt-4o" -``` \ No newline at end of file +``` diff --git a/docs/content/docs/agent-sdk/usage-tracking.mdx b/docs/content/docs/agent-sdk/usage-tracking.mdx index 2709d738..425c694e 100644 --- a/docs/content/docs/agent-sdk/usage-tracking.mdx +++ b/docs/content/docs/agent-sdk/usage-tracking.mdx @@ -51,7 +51,7 @@ class UsageTrackerCallback(AsyncCallbackHandler): print("Usage update:", usage) agent = ComputerAgent( - ..., + ..., callbacks=[UsageTrackerCallback()] ) ``` @@ -59,5 +59,6 @@ agent = ComputerAgent( See also: [Budget Manager Callbacks](./callbacks/cost-saving) ## See Also + - [Prompt Caching](./prompt-caching) - [Callbacks](./callbacks) diff --git a/docs/content/docs/computer-sdk/cloud-vm-management.mdx b/docs/content/docs/computer-sdk/cloud-vm-management.mdx index 2c8f09db..25291a39 100644 --- a/docs/content/docs/computer-sdk/cloud-vm-management.mdx +++ b/docs/content/docs/computer-sdk/cloud-vm-management.mdx @@ -5,7 +5,6 @@ description: Manage your Cua Cloud sandboxes (VMs) via Python SDK or HTTP API import { Tab, Tabs } from 'fumadocs-ui/components/tabs'; - Using the Cua Cloud API, you can manage your Cua Cloud sandboxes (VMs) with Python or HTTP (curl). All examples require a CUA API key. You can obtain one from the [Dashboard](https://www.cua.ai/dashboard/keys). @@ -17,107 +16,111 @@ All examples require a CUA API key. You can obtain one from the [Dashboard](http - ```python - import os - import asyncio - from computer.providers.cloud.provider import CloudProvider +```python +import os +import asyncio +from computer.providers.cloud.provider import CloudProvider - async def main(): - api_key = os.getenv("CUA_API_KEY") or "your-api-key" - # Optional: point to a different API base - # os.environ["CUA_API_BASE"] = "https://api.cua.ai" +async def main(): + api_key = os.getenv("CUA_API_KEY") or "your-api-key" + # Optional: point to a different API base + # os.environ["CUA_API_BASE"] = "https://api.cua.ai" - provider = CloudProvider(api_key=api_key, verbose=False) - async with provider: - vms = await provider.list_vms() - for vm in vms: - print({ - "name": vm["name"], - "status": vm["status"], - "api_url": vm.get("api_url"), - "vnc_url": vm.get("vnc_url"), - }) + provider = CloudProvider(api_key=api_key, verbose=False) + async with provider: + vms = await provider.list_vms() + for vm in vms: + print({ + "name": vm["name"], + "status": vm["status"], + "api_url": vm.get("api_url"), + "vnc_url": vm.get("vnc_url"), + }) - if __name__ == "__main__": - asyncio.run(main()) - ``` +if __name__ == "__main__": + asyncio.run(main()) +``` - ```bash - curl -H "Authorization: Bearer $CUA_API_KEY" \ - "https://api.cua.ai/v1/vms" - ``` +```bash +curl -H "Authorization: Bearer $CUA_API_KEY" \ + "https://api.cua.ai/v1/vms" +``` - Responses: - - 200: Array of minimal VM objects with fields `{ name, password, status }` - - 401: Unauthorized (missing/invalid API key) +Responses: - ```json - [ - { - "name": "s-windows-x4snp46ebf", - "password": "49b8daa3", - "status": "running" - } - ] - ``` +- 200: Array of minimal VM objects with fields `{ name, password, status }` +- 401: Unauthorized (missing/invalid API key) - Status values: +```json +[ + { + "name": "s-windows-x4snp46ebf", + "password": "49b8daa3", + "status": "running" + } +] +``` - - `pending`: VM deployment in progress - - `running`: VM is active and accessible - - `stopped`: VM is stopped but not terminated - - `terminated`: VM 
has been permanently destroyed - - `failed`: VM deployment or operation failed +Status values: - - +- `pending`: VM deployment in progress +- `running`: VM is active and accessible +- `stopped`: VM is stopped but not terminated +- `terminated`: VM has been permanently destroyed +- `failed`: VM deployment or operation failed + + + + --- ## Start a VM + Provide the VM name you want to start. - ```python - import os - import asyncio - from computer.providers.cloud.provider import CloudProvider +```python +import os +import asyncio +from computer.providers.cloud.provider import CloudProvider - async def main(): - api_key = os.getenv("CUA_API_KEY") or "your-api-key" - name = "my-vm-name" # e.g., "m-linux-96lcxd2c2k" +async def main(): + api_key = os.getenv("CUA_API_KEY") or "your-api-key" + name = "my-vm-name" # e.g., "m-linux-96lcxd2c2k" - provider = CloudProvider(api_key=api_key) - async with provider: - resp = await provider.run_vm(name) - print(resp) # { "name": name, "status": "starting" } + provider = CloudProvider(api_key=api_key) + async with provider: + resp = await provider.run_vm(name) + print(resp) # { "name": name, "status": "starting" } - if __name__ == "__main__": - asyncio.run(main()) - ``` +if __name__ == "__main__": + asyncio.run(main()) +``` - ```bash - curl -X POST \ - -H "Authorization: Bearer $CUA_API_KEY" \ - "https://api.cua.ai/v1/vms/my-vm-name/start" -i - ``` +```bash +curl -X POST \ + -H "Authorization: Bearer $CUA_API_KEY" \ + "https://api.cua.ai/v1/vms/my-vm-name/start" -i +``` - Responses: - - 204: No Content (start accepted) - - 401: Unauthorized (missing/invalid API key) - - 404: VM not found or not owned by the user +Responses: - ```text - HTTP/1.1 204 No Content - ``` +- 204: No Content (start accepted) +- 401: Unauthorized (missing/invalid API key) +- 404: VM not found or not owned by the user + +```text +HTTP/1.1 204 No Content +``` @@ -125,46 +128,48 @@ Provide the VM name you want to start. --- ## Stop a VM + Stops the VM asynchronously. - ```python - import os - import asyncio - from computer.providers.cloud.provider import CloudProvider +```python +import os +import asyncio +from computer.providers.cloud.provider import CloudProvider - async def main(): - api_key = os.getenv("CUA_API_KEY") or "your-api-key" - name = "my-vm-name" +async def main(): + api_key = os.getenv("CUA_API_KEY") or "your-api-key" + name = "my-vm-name" - provider = CloudProvider(api_key=api_key) - async with provider: - resp = await provider.stop_vm(name) - print(resp) # { "name": name, "status": "stopping" } + provider = CloudProvider(api_key=api_key) + async with provider: + resp = await provider.stop_vm(name) + print(resp) # { "name": name, "status": "stopping" } - if __name__ == "__main__": - asyncio.run(main()) - ``` +if __name__ == "__main__": + asyncio.run(main()) +``` - ```bash - curl -X POST \ - -H "Authorization: Bearer $CUA_API_KEY" \ - "https://api.cua.ai/v1/vms/my-vm-name/stop" - ``` +```bash +curl -X POST \ + -H "Authorization: Bearer $CUA_API_KEY" \ + "https://api.cua.ai/v1/vms/my-vm-name/stop" +``` - Responses: - - 202: Accepted with `{ "status": "stopping" }` - - 401: Unauthorized (missing/invalid API key) - - 404: VM not found or not owned by the user +Responses: - ```json - { "status": "stopping" } - ``` +- 202: Accepted with `{ "status": "stopping" }` +- 401: Unauthorized (missing/invalid API key) +- 404: VM not found or not owned by the user + +```json +{ "status": "stopping" } +``` @@ -172,46 +177,48 @@ Stops the VM asynchronously. 
--- ## Restart a VM + Restarts the VM asynchronously. - ```python - import os - import asyncio - from computer.providers.cloud.provider import CloudProvider +```python +import os +import asyncio +from computer.providers.cloud.provider import CloudProvider - async def main(): - api_key = os.getenv("CUA_API_KEY") or "your-api-key" - name = "my-vm-name" +async def main(): + api_key = os.getenv("CUA_API_KEY") or "your-api-key" + name = "my-vm-name" - provider = CloudProvider(api_key=api_key) - async with provider: - resp = await provider.restart_vm(name) - print(resp) # { "name": name, "status": "restarting" } + provider = CloudProvider(api_key=api_key) + async with provider: + resp = await provider.restart_vm(name) + print(resp) # { "name": name, "status": "restarting" } - if __name__ == "__main__": - asyncio.run(main()) - ``` +if __name__ == "__main__": + asyncio.run(main()) +``` - ```bash - curl -X POST \ - -H "Authorization: Bearer $CUA_API_KEY" \ - "https://api.cua.ai/v1/vms/my-vm-name/restart" - ``` +```bash +curl -X POST \ + -H "Authorization: Bearer $CUA_API_KEY" \ + "https://api.cua.ai/v1/vms/my-vm-name/restart" +``` - Responses: - - 202: Accepted with `{ "status": "restarting" }` - - 401: Unauthorized (missing/invalid API key) - - 404: VM not found or not owned by the user +Responses: - ```json - { "status": "restarting" } - ``` +- 202: Accepted with `{ "status": "restarting" }` +- 401: Unauthorized (missing/invalid API key) +- 404: VM not found or not owned by the user + +```json +{ "status": "restarting" } +``` @@ -219,42 +226,44 @@ Restarts the VM asynchronously. --- ## Query a VM by name + Query the computer-server running on the VM. Useful for checking details like status or OS type. - ```python - import os - import asyncio - from computer.providers.cloud.provider import CloudProvider +```python +import os +import asyncio +from computer.providers.cloud.provider import CloudProvider - async def main(): - api_key = os.getenv("CUA_API_KEY") or "your-api-key" - name = "my-vm-name" +async def main(): + api_key = os.getenv("CUA_API_KEY") or "your-api-key" + name = "my-vm-name" - provider = CloudProvider(api_key=api_key) - async with provider: - info = await provider.get_vm(name) - print(info) + provider = CloudProvider(api_key=api_key) + async with provider: + info = await provider.get_vm(name) + print(info) - if __name__ == "__main__": - asyncio.run(main()) - ``` +if __name__ == "__main__": + asyncio.run(main()) +``` - ```bash - curl "https://my-vm-name.containers.cloud.cua.ai:8443/status" - ``` +```bash +curl "https://my-vm-name.containers.cloud.cua.ai:8443/status" +``` - Responses: - - 200: Server available +Responses: - ```json - { "status": "ok", "os_type": "linux", "features": ["agent"] } - ``` +- 200: Server available + +```json +{ "status": "ok", "os_type": "linux", "features": ["agent"] } +``` diff --git a/docs/content/docs/computer-sdk/commands.mdx b/docs/content/docs/computer-sdk/commands.mdx index 96330f39..c7b5a39b 100644 --- a/docs/content/docs/computer-sdk/commands.mdx +++ b/docs/content/docs/computer-sdk/commands.mdx @@ -13,12 +13,20 @@ Execute shell commands and get detailed results: - ```python # Run shell command result = await computer.interface.run_command(cmd) # - result.stdout, result.stderr, result.returncode ``` + + ```python + # Run shell command + result = await computer.interface.run_command(cmd) # result.stdout, result.stderr, result.returncode + ``` + - ```typescript // Run shell command const result = await computer.interface.runCommand(cmd); // - 
result.stdout, result.stderr, result.returncode ``` + + ```typescript + // Run shell command + const result = await computer.interface.runCommand(cmd); // result.stdout, result.stderr, result.returncode + ``` + @@ -28,6 +36,7 @@ Control application launching and windows: + ```python # Launch applications await computer.interface.launch("xfce4-terminal") @@ -52,6 +61,7 @@ Control application launching and windows: + ```typescript // Launch applications await computer.interface.launch("xfce4-terminal"); @@ -83,6 +93,7 @@ Precise mouse control and interaction: + ```python # Basic clicks await computer.interface.left_click(x, y) # Left click at coordinates @@ -101,6 +112,7 @@ Precise mouse control and interaction: + ```typescript // Basic clicks await computer.interface.leftClick(x, y); // Left click at coordinates @@ -126,6 +138,7 @@ Text input and key combinations: + ```python # Text input await computer.interface.type_text("Hello") # Type text @@ -139,6 +152,7 @@ Text input and key combinations: + ```typescript // Text input await computer.interface.typeText("Hello"); // Type text @@ -159,14 +173,24 @@ Mouse wheel and scrolling control: - ```python # Scrolling await computer.interface.scroll(x, y) # Scroll the mouse wheel await - computer.interface.scroll_down(clicks) # Scroll down await computer.interface.scroll_up(clicks) - # Scroll up ``` + + ```python + # Scrolling + await computer.interface.scroll(x, y) # Scroll the mouse wheel + await computer.interface.scroll_down(clicks) # Scroll down + await computer.interface.scroll_up(clicks) # Scroll up + ``` + - ```typescript // Scrolling await computer.interface.scroll(x, y); // Scroll the mouse wheel - await computer.interface.scrollDown(clicks); // Scroll down await - computer.interface.scrollUp(clicks); // Scroll up ``` + + ```typescript + // Scrolling + await computer.interface.scroll(x, y); // Scroll the mouse wheel + await computer.interface.scrollDown(clicks); // Scroll down + await computer.interface.scrollUp(clicks); // Scroll up + ``` + @@ -176,21 +200,22 @@ Screen capture and display information: - ```python - # Screen operations - await computer.interface.screenshot() # Take a screenshot - await computer.interface.get_screen_size() # Get screen dimensions + ```python + # Screen operations + await computer.interface.screenshot() # Take a screenshot + await computer.interface.get_screen_size() # Get screen dimensions ``` - ```typescript - // Screen operations - await computer.interface.screenshot(); // Take a screenshot - await computer.interface.getScreenSize(); // Get screen dimensions - + + ```typescript + // Screen operations + await computer.interface.screenshot(); // Take a screenshot + await computer.interface.getScreenSize(); // Get screen dimensions ``` + @@ -229,20 +254,20 @@ System clipboard management: - ```python - # Clipboard operations await - computer.interface.set_clipboard(text) # Set clipboard content await - computer.interface.copy_to_clipboard() # Get clipboard content + ```python + # Clipboard operations + await computer.interface.set_clipboard(text) # Set clipboard content + await computer.interface.copy_to_clipboard() # Get clipboard content ``` - ```typescript - // Clipboard operations + + ```typescript + // Clipboard operations await computer.interface.setClipboard(text); // Set clipboard content await computer.interface.copyToClipboard(); // Get clipboard content - ``` @@ -275,18 +300,19 @@ Direct file and directory manipulation: + ```typescript - # File existence checks + // File existence checks await 
computer.interface.fileExists(path); // Check if file exists await computer.interface.directoryExists(path); // Check if directory exists - # File content operations + // File content operations await computer.interface.readText(path, "utf-8"); // Read file content await computer.interface.writeText(path, content, "utf-8"); // Write file content await computer.interface.readBytes(path); // Read file content as bytes await computer.interface.writeBytes(path, content); // Write file content as bytes - # File and directory management + // File and directory management await computer.interface.deleteFile(path); // Delete file await computer.interface.createDir(path); // Create directory await computer.interface.deleteDir(path); // Delete directory @@ -302,20 +328,21 @@ Access system accessibility information: - ```python - # Get accessibility tree - await computer.interface.get_accessibility_tree() + ```python + # Get accessibility tree + await computer.interface.get_accessibility_tree() ``` - ```typescript - // Get accessibility tree - await computer.interface.getAccessibilityTree(); -```` - + ```typescript + // Get accessibility tree + await computer.interface.getAccessibilityTree(); + ``` + + ## Delay Configuration @@ -324,6 +351,7 @@ Control timing between actions: + ```python # Set default delay between all actions (in seconds) computer.interface.delay = 0.5 # 500ms delay between actions @@ -343,6 +371,7 @@ Manage Python environments: + ```python # Virtual environment management await computer.venv_install("demo_venv", ["requests", "macos-pyxa"]) # Install packages in a virtual environment @@ -352,4 +381,3 @@ Manage Python environments: -```` diff --git a/docs/content/docs/computer-sdk/computer-ui.mdx b/docs/content/docs/computer-sdk/computer-ui.mdx index 22b131c0..c731e4c4 100644 --- a/docs/content/docs/computer-sdk/computer-ui.mdx +++ b/docs/content/docs/computer-sdk/computer-ui.mdx @@ -10,7 +10,8 @@ pip install "cua-computer[ui]" ``` -For precise control of the computer, we recommend using VNC or Screen Sharing instead of the Computer Gradio UI. + For precise control of the computer, we recommend using VNC or Screen Sharing instead of the + Computer Gradio UI. ### Building and Sharing Demonstrations with Huggingface @@ -43,8 +44,12 @@ For examples, see [Computer UI Examples](https://github.com/trycua/cua/tree/main #### 3. Record Your Tasks
-View demonstration video - + View demonstration video +
Record yourself performing various computer tasks using the UI. @@ -52,8 +57,12 @@ Record yourself performing various computer tasks using the UI. #### 4. Save Your Demonstrations
-View demonstration video - + View demonstration video +
Save each task by picking a descriptive name and adding relevant tags (e.g., "office", "web-browsing", "coding"). @@ -65,11 +74,16 @@ Repeat steps 3 and 4 until you have a good amount of demonstrations covering dif #### 6. Upload to Huggingface
-View demonstration video - + View demonstration video +
Upload your dataset to Huggingface by: + - Naming it as `{your_username}/{dataset_name}` - Choosing public or private visibility - Optionally selecting specific tags to upload only tasks with certain tags @@ -77,4 +91,4 @@ Upload your dataset to Huggingface by: #### Examples and Resources - Example Dataset: [ddupont/test-dataset](https://huggingface.co/datasets/ddupont/test-dataset) -- Find Community Datasets: 🔍 [Browse CUA Datasets on Huggingface](https://huggingface.co/datasets?other=cua) \ No newline at end of file +- Find Community Datasets: 🔍 [Browse CUA Datasets on Huggingface](https://huggingface.co/datasets?other=cua) diff --git a/docs/content/docs/computer-sdk/computers.mdx b/docs/content/docs/computer-sdk/computers.mdx index d666bd99..e7437959 100644 --- a/docs/content/docs/computer-sdk/computers.mdx +++ b/docs/content/docs/computer-sdk/computers.mdx @@ -3,7 +3,17 @@ title: Cua Computers description: Understanding Cua computer types and connection methods --- -A corresponding Jupyter Notebook and NodeJS project are available for this documentation. + + A corresponding{' '} + + Jupyter Notebook + {' '} + and{' '} + + NodeJS project + {' '} + are available for this documentation. + Before we can automate apps using AI, we need to first connect to a Computer Server to give the AI a safe environment to execute workflows in. diff --git a/docs/content/docs/computer-sdk/sandboxed-python.mdx b/docs/content/docs/computer-sdk/sandboxed-python.mdx index 6d70e9a6..82d2809b 100644 --- a/docs/content/docs/computer-sdk/sandboxed-python.mdx +++ b/docs/content/docs/computer-sdk/sandboxed-python.mdx @@ -3,7 +3,16 @@ title: Sandboxed Python slug: sandboxed-python --- -A corresponding Python example is available for this documentation. + + A corresponding{' '} + + Python example + {' '} + is available for this documentation. + You can run Python functions securely inside a sandboxed virtual environment on a remote Cua Computer. This is useful for executing untrusted user code, isolating dependencies, or providing a safe environment for automation tasks. diff --git a/docs/content/docs/example-usecases/form-filling.mdx b/docs/content/docs/example-usecases/form-filling.mdx index e819502e..d9a61581 100644 --- a/docs/content/docs/example-usecases/form-filling.mdx +++ b/docs/content/docs/example-usecases/form-filling.mdx @@ -15,6 +15,7 @@ This preset usecase uses [Cua Computer](/computer-sdk/computers) to interact wit ## Quickstart Create a `requirements.txt` file with the following dependencies: + ```text cua-agent cua-computer @@ -34,7 +35,7 @@ ANTHROPIC_API_KEY=your-api-key CUA_API_KEY=sk_cua-api01... ``` -Select the environment you want to run the code in (*click on the underlined values in the code to edit them directly!*): +Select the environment you want to run the code in (_click on the underlined values in the code to edit them directly!_): @@ -58,23 +59,21 @@ from computer import Computer, VMProviderType from dotenv import load_dotenv logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - +logger = logging.getLogger(**name**) def handle_sigint(sig, frame): - print("\\n\\nExecution interrupted by user. Exiting gracefully...") - exit(0) - +print("\\n\\nExecution interrupted by user. 
Exiting gracefully...") + exit(0) async def fill_application(): - try: - async with Computer( - os_type="linux", - provider_type=VMProviderType.CLOUD, - name="`}{`", - api_key="`}{`", - verbosity=logging.INFO, - ) as computer: + try: + async with Computer( + os_type="linux", + provider_type=VMProviderType.CLOUD, + name="`}{`", + api_key="`}{`", + verbosity=logging.INFO, + ) as computer: agent = ComputerAgent( model="anthropic/claude-3-5-sonnet-20241022", @@ -124,10 +123,9 @@ async def fill_application(): traceback.print_exc() raise - def main(): - try: - load_dotenv() + try: + load_dotenv() if "ANTHROPIC_API_KEY" not in os.environ: raise RuntimeError( @@ -149,9 +147,9 @@ def main(): logger.error(f"Error running automation: {e}") traceback.print_exc() +if __name__ == "__main__": + main()`} -if __name__ == "__main__": - main()`} @@ -175,22 +173,20 @@ from computer import Computer, VMProviderType from dotenv import load_dotenv logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - +logger = logging.getLogger(__name__) def handle_sigint(sig, frame): - print("\\n\\nExecution interrupted by user. Exiting gracefully...") - exit(0) - + print("\\n\\nExecution interrupted by user. Exiting gracefully...") + exit(0) async def fill_application(): - try: - async with Computer( - os_type="macos", - provider_type=VMProviderType.LUME, - name="`}{`", - verbosity=logging.INFO, - ) as computer: + try: + async with Computer( + os_type="macos", + provider_type=VMProviderType.LUME, + name="`}{`", + verbosity=logging.INFO, + ) as computer: agent = ComputerAgent( model="anthropic/claude-3-5-sonnet-20241022", @@ -240,10 +236,9 @@ async def fill_application(): traceback.print_exc() raise - def main(): - try: - load_dotenv() + try: + load_dotenv() if "ANTHROPIC_API_KEY" not in os.environ: raise RuntimeError( @@ -259,9 +254,9 @@ def main(): logger.error(f"Error running automation: {e}") traceback.print_exc() +if __name__ == "__main__": + main()`} -if __name__ == "__main__": - main()`}
@@ -283,21 +278,19 @@ from computer import Computer, VMProviderType from dotenv import load_dotenv logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - +logger = logging.getLogger(__name__) def handle_sigint(sig, frame): - print("\\n\\nExecution interrupted by user. Exiting gracefully...") - exit(0) - + print("\\n\\nExecution interrupted by user. Exiting gracefully...") + exit(0) async def fill_application(): - try: - async with Computer( - os_type="windows", - provider_type=VMProviderType.WINDOWS_SANDBOX, - verbosity=logging.INFO, - ) as computer: + try: + async with Computer( + os_type="windows", + provider_type=VMProviderType.WINDOWS_SANDBOX, + verbosity=logging.INFO, + ) as computer: agent = ComputerAgent( model="anthropic/claude-3-5-sonnet-20241022", @@ -347,10 +340,9 @@ async def fill_application(): traceback.print_exc() raise - def main(): - try: - load_dotenv() + try: + load_dotenv() if "ANTHROPIC_API_KEY" not in os.environ: raise RuntimeError( @@ -366,9 +358,9 @@ def main(): logger.error(f"Error running automation: {e}") traceback.print_exc() +if __name__ == "__main__": + main()`} -if __name__ == "__main__": - main()`}
@@ -392,22 +384,20 @@ from computer import Computer, VMProviderType from dotenv import load_dotenv logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - +logger = logging.getLogger(__name__) def handle_sigint(sig, frame): - print("\\n\\nExecution interrupted by user. Exiting gracefully...") - exit(0) - + print("\\n\\nExecution interrupted by user. Exiting gracefully...") + exit(0) async def fill_application(): - try: - async with Computer( - os_type="linux", - provider_type=VMProviderType.DOCKER, - name="`}{`", - verbosity=logging.INFO, - ) as computer: + try: + async with Computer( + os_type="linux", + provider_type=VMProviderType.DOCKER, + name="`}{`", + verbosity=logging.INFO, + ) as computer: agent = ComputerAgent( model="anthropic/claude-3-5-sonnet-20241022", @@ -457,10 +447,9 @@ async def fill_application(): traceback.print_exc() raise - def main(): - try: - load_dotenv() + try: + load_dotenv() if "ANTHROPIC_API_KEY" not in os.environ: raise RuntimeError( @@ -476,9 +465,9 @@ def main(): logger.error(f"Error running automation: {e}") traceback.print_exc() +if __name__ == "__main__": + main()`} -if __name__ == "__main__": - main()`}
@@ -488,4 +477,4 @@ if __name__ == "__main__": - Learn more about [Cua computers](/computer-sdk/computers) and [computer commands](/computer-sdk/commands) - Read about [Agent loops](/agent-sdk/agent-loops), [tools](/agent-sdk/custom-tools), and [supported model providers](/agent-sdk/supported-model-providers/) -- Experiment with different [Models and Providers](/agent-sdk/supported-model-providers/) +- Experiment with different [Models and Providers](/agent-sdk/supported-model-providers/) diff --git a/docs/content/docs/libraries/computer-server/Commands.mdx b/docs/content/docs/libraries/computer-server/Commands.mdx index 269162a1..9c220826 100644 --- a/docs/content/docs/libraries/computer-server/Commands.mdx +++ b/docs/content/docs/libraries/computer-server/Commands.mdx @@ -7,42 +7,42 @@ description: List of all commands supported by the Computer Server API (WebSocke This page lists all supported commands for the Computer Server, available via both WebSocket and REST API endpoints. -| Command | Description | -|---------------------|--------------------------------------------| -| version | Get protocol and package version info | -| run_command | Run a shell command | -| screenshot | Capture a screenshot | -| get_screen_size | Get the screen size | -| get_cursor_position | Get the current mouse cursor position | -| mouse_down | Mouse button down | -| mouse_up | Mouse button up | -| left_click | Left mouse click | -| right_click | Right mouse click | -| double_click | Double mouse click | -| move_cursor | Move mouse cursor to coordinates | -| drag_to | Drag mouse to coordinates | -| drag | Drag mouse by offset | -| key_down | Keyboard key down | -| key_up | Keyboard key up | -| type_text | Type text | -| press_key | Press a single key | -| hotkey | Press a hotkey combination | -| scroll | Scroll the screen | -| scroll_down | Scroll down | -| scroll_up | Scroll up | -| copy_to_clipboard | Copy text to clipboard | -| set_clipboard | Set clipboard content | -| file_exists | Check if a file exists | -| directory_exists | Check if a directory exists | -| list_dir | List files/directories in a directory | -| read_text | Read text from a file | -| write_text | Write text to a file | -| read_bytes | Read bytes from a file | -| write_bytes | Write bytes to a file | -| get_file_size | Get file size | -| delete_file | Delete a file | -| create_dir | Create a directory | -| delete_dir | Delete a directory | -| get_accessibility_tree | Get accessibility tree (if supported) | -| find_element | Find element in accessibility tree | -| diorama_cmd | Run a diorama command (if supported) | +| Command | Description | +| ---------------------- | ------------------------------------- | +| version | Get protocol and package version info | +| run_command | Run a shell command | +| screenshot | Capture a screenshot | +| get_screen_size | Get the screen size | +| get_cursor_position | Get the current mouse cursor position | +| mouse_down | Mouse button down | +| mouse_up | Mouse button up | +| left_click | Left mouse click | +| right_click | Right mouse click | +| double_click | Double mouse click | +| move_cursor | Move mouse cursor to coordinates | +| drag_to | Drag mouse to coordinates | +| drag | Drag mouse by offset | +| key_down | Keyboard key down | +| key_up | Keyboard key up | +| type_text | Type text | +| press_key | Press a single key | +| hotkey | Press a hotkey combination | +| scroll | Scroll the screen | +| scroll_down | Scroll down | +| scroll_up | Scroll up | +| copy_to_clipboard | Copy text to 
clipboard | +| set_clipboard | Set clipboard content | +| file_exists | Check if a file exists | +| directory_exists | Check if a directory exists | +| list_dir | List files/directories in a directory | +| read_text | Read text from a file | +| write_text | Write text to a file | +| read_bytes | Read bytes from a file | +| write_bytes | Write bytes to a file | +| get_file_size | Get file size | +| delete_file | Delete a file | +| create_dir | Create a directory | +| delete_dir | Delete a directory | +| get_accessibility_tree | Get accessibility tree (if supported) | +| find_element | Find element in accessibility tree | +| diorama_cmd | Run a diorama command (if supported) | diff --git a/docs/content/docs/libraries/computer-server/REST-API.mdx b/docs/content/docs/libraries/computer-server/REST-API.mdx index 369565de..18f5980b 100644 --- a/docs/content/docs/libraries/computer-server/REST-API.mdx +++ b/docs/content/docs/libraries/computer-server/REST-API.mdx @@ -16,6 +16,7 @@ The Computer Server exposes a single REST endpoint for command execution: - Returns results as a streaming response (text/event-stream) ### Request Format + ```json { "command": "", @@ -24,10 +25,12 @@ The Computer Server exposes a single REST endpoint for command execution: ``` ### Required Headers (for cloud containers) + - `X-Container-Name`: Name of the container (cloud only) - `X-API-Key`: API key for authentication (cloud only) ### Example Request (Python) + ```python import requests @@ -38,6 +41,7 @@ print(resp.text) ``` ### Example Request (Cloud) + ```python import requests @@ -52,7 +56,9 @@ print(resp.text) ``` ### Response Format + Streaming text/event-stream with JSON objects, e.g.: + ``` data: {"success": true, "content": "..."} @@ -60,4 +66,5 @@ data: {"success": false, "error": "..."} ``` ### Supported Commands + See [Commands Reference](./Commands) for the full list of commands and parameters. diff --git a/docs/content/docs/libraries/computer-server/WebSocket-API.mdx b/docs/content/docs/libraries/computer-server/WebSocket-API.mdx index 98d6d7ad..00d20d21 100644 --- a/docs/content/docs/libraries/computer-server/WebSocket-API.mdx +++ b/docs/content/docs/libraries/computer-server/WebSocket-API.mdx @@ -11,7 +11,9 @@ The Computer Server exposes a WebSocket endpoint for real-time command execution - `wss://your-container.containers.cloud.trycua.com:8443/ws` (cloud) ### Authentication (Cloud Only) + For cloud containers, you must authenticate immediately after connecting: + ```json { "command": "authenticate", @@ -21,10 +23,13 @@ For cloud containers, you must authenticate immediately after connecting: } } ``` + If authentication fails, the connection is closed. ### Command Format + Send JSON messages: + ```json { "command": "", @@ -33,6 +38,7 @@ Send JSON messages: ``` ### Example (Python) + ```python import websockets import asyncio @@ -49,6 +55,7 @@ asyncio.run(main()) ``` ### Example (Cloud) + ```python import websockets import asyncio @@ -74,7 +81,9 @@ asyncio.run(main()) ``` ### Response Format + Each response is a JSON object: + ```json { "success": true, @@ -83,4 +92,5 @@ Each response is a JSON object: ``` ### Supported Commands + See [Commands Reference](./Commands) for the full list of commands and parameters. 
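As a rough illustration of how the commands in the table map onto the WebSocket protocol, the sketch below sends two of them using the same `{"command", "params"}` envelope shown above. The parameter names for `left_click` (`x`, `y`) are an assumption here, mirroring the Computer SDK's `left_click(x, y)`; consult the server source for the authoritative schema.

```python
import asyncio
import json

import websockets


async def main():
    uri = "ws://localhost:8000/ws"  # local endpoint from the WebSocket API page
    async with websockets.connect(uri) as websocket:
        # Each command from the table uses the same envelope as the "version" example.
        # The "x"/"y" parameter names below are illustrative assumptions.
        for message in (
            {"command": "screenshot", "params": {}},
            {"command": "left_click", "params": {"x": 100, "y": 200}},
        ):
            await websocket.send(json.dumps(message))
            response = json.loads(await websocket.recv())
            print(message["command"], "->", response.get("success"))


asyncio.run(main())
```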
diff --git a/docs/content/docs/libraries/computer-server/index.mdx b/docs/content/docs/libraries/computer-server/index.mdx index fcf265da..d5affd25 100644 --- a/docs/content/docs/libraries/computer-server/index.mdx +++ b/docs/content/docs/libraries/computer-server/index.mdx @@ -6,7 +6,16 @@ github: - https://github.com/trycua/cua/tree/main/libs/python/computer-server --- -A corresponding Jupyter Notebook is available for this documentation. + + A corresponding{' '} + + Jupyter Notebook + {' '} + is available for this documentation. + The Computer Server API reference documentation is currently under development. diff --git a/docs/content/docs/libraries/computer/index.mdx b/docs/content/docs/libraries/computer/index.mdx index 6638f878..69478b20 100644 --- a/docs/content/docs/libraries/computer/index.mdx +++ b/docs/content/docs/libraries/computer/index.mdx @@ -20,4 +20,4 @@ See the [Commands](../computer-sdk/commands) documentation for all supported com ## Sandboxed Python Functions -See the [Sandboxed Python](../computer-sdk/sandboxed-python) documentation for running Python functions securely in isolated environments on a remote Cua Computer. \ No newline at end of file +See the [Sandboxed Python](../computer-sdk/sandboxed-python) documentation for running Python functions securely in isolated environments on a remote Cua Computer. diff --git a/docs/content/docs/libraries/lume/cli-reference.mdx b/docs/content/docs/libraries/lume/cli-reference.mdx index 5afcc7fe..20120616 100644 --- a/docs/content/docs/libraries/lume/cli-reference.mdx +++ b/docs/content/docs/libraries/lume/cli-reference.mdx @@ -18,7 +18,8 @@ lume run ubuntu-noble-vanilla:latest ``` -We provide [prebuilt VM images](../lume/prebuilt-images) in our [ghcr registry](https://github.com/orgs/trycua/packages). + We provide [prebuilt VM images](../lume/prebuilt-images) in our [ghcr + registry](https://github.com/orgs/trycua/packages). ### Create a Custom VM @@ -37,10 +38,11 @@ The actual disk space used by sparse images will be much lower than the logical ## VM Management - lume create <name> +lume create <name> Create a new macOS or Linux virtual machine. **Options:** + - `--os ` - Operating system to install (macOS or linux, default: macOS) - `--cpu ` - Number of CPU cores (default: 4) - `--memory ` - Memory size, e.g., 8GB (default: 4GB) @@ -50,6 +52,7 @@ Create a new macOS or Linux virtual machine. - `--storage ` - VM storage location to use **Examples:** + ```bash # Create macOS VM with custom specs lume create my-mac --cpu 6 --memory 16GB --disk-size 100GB @@ -61,10 +64,11 @@ lume create my-ubuntu --os linux --cpu 2 --memory 8GB lume create my-sequoia --ipsw latest ``` - lume run <name> +lume run <name> Start and run a virtual machine. **Options:** + - `--no-display` - Do not start the VNC client app - `--shared-dir ` - Share directory with VM (format: path[:ro|rw]) - `--mount ` - For Linux VMs only, attach a read-only disk image @@ -75,6 +79,7 @@ Start and run a virtual machine. - `--storage ` - VM storage location to use **Examples:** + ```bash # Run VM with shared directory lume run my-vm --shared-dir /path/to/share:rw @@ -86,42 +91,52 @@ lume run my-vm --no-display lume run my-mac --recovery-mode true ``` - lume stop <name> +lume stop <name> Stop a running virtual machine. **Options:** + - `--storage ` - VM storage location to use ### lume delete <name> + Delete a virtual machine and its associated files. 
**Options:** + - `--force` - Force deletion without confirmation - `--storage ` - VM storage location to use ### lume clone <name> <new-name> + Create a copy of an existing virtual machine. **Options:** + - `--source-storage ` - Source VM storage location - `--dest-storage ` - Destination VM storage location ## VM Information and Configuration ### lume ls + List all virtual machines and their status. ### lume get <name> + Get detailed information about a specific virtual machine. **Options:** + - `-f, --format ` - Output format (json|text) - `--storage ` - VM storage location to use ### lume set <name> + Modify virtual machine configuration. **Options:** + - `--cpu ` - New number of CPU cores (e.g., 4) - `--memory ` - New memory size (e.g., 8192MB or 8GB) - `--disk-size ` - New disk size (e.g., 40960MB or 40GB) @@ -129,6 +144,7 @@ Modify virtual machine configuration. - `--storage ` - VM storage location to use **Examples:** + ```bash # Increase VM memory lume set my-vm --memory 16GB @@ -143,20 +159,25 @@ lume set my-vm --cpu 8 ## Image Management ### lume images + List available macOS images in local cache. ### lume pull <image> + Download a VM image from a container registry. **Options:** + - `--registry ` - Container registry URL (default: ghcr.io) - `--organization ` - Organization to pull from (default: trycua) - `--storage ` - VM storage location to use ### lume push <name> <image:tag> + Upload a VM image to a container registry. **Options:** + - `--additional-tags ` - Additional tags to push the same image to - `--registry ` - Container registry URL (default: ghcr.io) - `--organization ` - Organization/user to push to (default: trycua) @@ -167,38 +188,46 @@ Upload a VM image to a container registry. - `--reassemble` - Verify integrity by reassembling chunks (requires --dry-run) ### lume ipsw + Get the latest macOS restore image URL. ### lume prune + Remove cached images to free up disk space. ## Configuration ### lume config + Manage Lume configuration settings. **Subcommands:** ##### Storage Management + - `lume config storage add ` - Add a new VM storage location - `lume config storage remove ` - Remove a VM storage location - `lume config storage list` - List all VM storage locations - `lume config storage default ` - Set the default VM storage location ##### Cache Management + - `lume config cache get` - Get current cache directory - `lume config cache set ` - Set cache directory ##### Image Caching + - `lume config caching get` - Show current caching status - `lume config caching set ` - Enable or disable image caching ## API Server ### lume serve + Start the Lume API server for programmatic access. **Options:** + - `--port ` - Port to listen on (default: 7777) ## Global Options @@ -206,4 +235,4 @@ Start the Lume API server for programmatic access. These options are available for all commands: - `--help` - Show help information -- `--version` - Show version number \ No newline at end of file +- `--version` - Show version number diff --git a/docs/content/docs/libraries/lume/http-api.mdx b/docs/content/docs/libraries/lume/http-api.mdx index 04792f26..f908f85e 100644 --- a/docs/content/docs/libraries/lume/http-api.mdx +++ b/docs/content/docs/libraries/lume/http-api.mdx @@ -13,9 +13,8 @@ http://localhost:7777 ``` - The HTTP API service runs on port `7777` by default. If you'd like to use a - different port, pass the `--port` option during installation or when running - `lume serve`. + The HTTP API service runs on port `7777` by default. 
If you'd like to use a different port, pass + the `--port` option during installation or when running `lume serve`. ## Endpoints @@ -726,15 +725,15 @@ Push a VM to a registry as an image (asynchronous operation). #### Parameters -| Name | Type | Required | Description | -| ------------ | ------------ | -------- | ----------------------------------------------- | -| name | string | Yes | Local VM name to push | -| imageName | string | Yes | Image name in registry | -| tags | array | Yes | Image tags (e.g. `["latest", "v1"]`) | -| organization | string | Yes | Organization name | -| registry | string | No | Registry host (e.g. `ghcr.io`) | -| chunkSizeMb | integer | No | Chunk size in MB for upload | -| storage | string/null | No | Storage type (`ssd`, etc.) | +| Name | Type | Required | Description | +| ------------ | ----------- | -------- | ------------------------------------ | +| name | string | Yes | Local VM name to push | +| imageName | string | Yes | Image name in registry | +| tags | array | Yes | Image tags (e.g. `["latest", "v1"]`) | +| organization | string | Yes | Organization name | +| registry | string | No | Registry host (e.g. `ghcr.io`) | +| chunkSizeMb | integer | No | Chunk size in MB for upload | +| storage | string/null | No | Storage type (`ssd`, etc.) | #### Example Request @@ -747,13 +746,13 @@ curl --connect-timeout 6000 \ -X POST \ -H "Content-Type: application/json" \ -d '{ - "name": "my-local-vm", + "name": "my-local-vm", "imageName": "my-image", "tags": ["latest", "v1"], - "organization": "my-org", + "organization": "my-org", "registry": "ghcr.io", "chunkSizeMb": 512, - "storage": null + "storage": null }' \ http://localhost:7777/lume/vms/push ``` @@ -808,10 +807,7 @@ console.log(await res.json()); "message": "Push initiated in background", "name": "my-local-vm", "imageName": "my-image", - "tags": [ - "latest", - "v1" - ] + "tags": ["latest", "v1"] } ``` @@ -857,10 +853,7 @@ console.log(await res.json()); ```json { - "local": [ - "macos-sequoia-xcode:latest", - "macos-sequoia-vanilla:latest" - ] + "local": ["macos-sequoia-xcode:latest", "macos-sequoia-vanilla:latest"] } ``` @@ -1005,11 +998,11 @@ Update Lume configuration settings. #### Parameters -| Name | Type | Required | Description | -| --------------- | ------- | -------- | -------------------------------- | -| homeDirectory | string | No | Lume home directory path | -| cacheDirectory | string | No | Cache directory path | -| cachingEnabled | boolean | No | Enable or disable caching | +| Name | Type | Required | Description | +| -------------- | ------- | -------- | ------------------------- | +| homeDirectory | string | No | Lume home directory path | +| cacheDirectory | string | No | Cache directory path | +| cachingEnabled | boolean | No | Enable or disable caching | #### Example Request diff --git a/docs/content/docs/libraries/lume/index.mdx b/docs/content/docs/libraries/lume/index.mdx index d62c80e0..152d08c7 100644 --- a/docs/content/docs/libraries/lume/index.mdx +++ b/docs/content/docs/libraries/lume/index.mdx @@ -5,4 +5,4 @@ github: - https://github.com/trycua/cua/tree/main/libs/lume --- -Lume is a lightweight Command Line Interface and local API server for creating, running and managing **macOS and Linux virtual machines** with near-native performance on Apple Silicon, using Apple's [Virtualization.Framework](https://developer.apple.com/documentation/virtualization). 
\ No newline at end of file +Lume is a lightweight Command Line Interface and local API server for creating, running and managing **macOS and Linux virtual machines** with near-native performance on Apple Silicon, using Apple's [Virtualization.Framework](https://developer.apple.com/documentation/virtualization). diff --git a/docs/content/docs/libraries/lume/installation.mdx b/docs/content/docs/libraries/lume/installation.mdx index 161e48e0..7b990665 100644 --- a/docs/content/docs/libraries/lume/installation.mdx +++ b/docs/content/docs/libraries/lume/installation.mdx @@ -15,10 +15,12 @@ lume run macos-sequoia-vanilla:latest ``` -All prebuilt images use the default password `lume`. Change this immediately after your first login using the `passwd` command. + All prebuilt images use the default password `lume`. Change this immediately after your first + login using the `passwd` command. **System Requirements**: + - Apple Silicon Mac (M1, M2, M3, etc.) - macOS 13.0 or later - At least 8GB of RAM (16GB recommended) @@ -33,6 +35,7 @@ Install with a single command: ``` ### Manual Start (No Background Service) + By default, Lume is installed as a background service that starts automatically on login. If you prefer to start the Lume API service manually when needed, you can use the `--no-background-service` option: ```bash @@ -40,8 +43,11 @@ By default, Lume is installed as a background service that starts automatically ``` -With this option, you'll need to manually start the Lume API service by running `lume serve` in your terminal whenever you need to use tools or libraries that rely on the Lume API (such as the Computer-Use Agent). + With this option, you'll need to manually start the Lume API service by running `lume serve` in + your terminal whenever you need to use tools or libraries that rely on the Lume API (such as the + Computer-Use Agent). ## Manual Download and Installation -You can also download the `lume.pkg.tar.gz` archive from the [latest release](https://github.com/trycua/cua/releases?q=lume&expanded=true), extract it, and install the package manually. \ No newline at end of file + +You can also download the `lume.pkg.tar.gz` archive from the [latest release](https://github.com/trycua/cua/releases?q=lume&expanded=true), extract it, and install the package manually. diff --git a/docs/content/docs/libraries/lume/prebuilt-images.mdx b/docs/content/docs/libraries/lume/prebuilt-images.mdx index 49628c59..4e4e3e67 100644 --- a/docs/content/docs/libraries/lume/prebuilt-images.mdx +++ b/docs/content/docs/libraries/lume/prebuilt-images.mdx @@ -5,24 +5,29 @@ title: Prebuilt Images Pre-built images are available in the registry [ghcr.io/trycua](https://github.com/orgs/trycua/packages). These images come with an SSH server pre-configured and auto-login enabled. -The default password on pre-built images is `lume`. For the security of your VM, change this password after your first login. + The default password on pre-built images is `lume`. For the security of your VM, change this + password after your first login. 
## Available Images The following pre-built images are available to download via `lume pull`: -| Image | Tag | Description | Logical Size | -|-------|------------|-------------|------| -| `macos-sequoia-vanilla` | `latest`, `15.2` | macOS Sequoia 15.2 image | 20GB | -| `macos-sequoia-xcode` | `latest`, `15.2` | macOS Sequoia 15.2 image with Xcode command line tools | 22GB | -| `macos-sequoia-cua` | `latest`, `15.3` | macOS Sequoia 15.3 image compatible with the Computer interface | 24GB | -| `ubuntu-noble-vanilla` | `latest`, `24.04.1` | [Ubuntu Server for ARM 24.04.1 LTS](https://ubuntu.com/download/server/arm) with Ubuntu Desktop | 20GB | +| Image | Tag | Description | Logical Size | +| ----------------------- | ------------------- | ----------------------------------------------------------------------------------------------- | ------------ | +| `macos-sequoia-vanilla` | `latest`, `15.2` | macOS Sequoia 15.2 image | 20GB | +| `macos-sequoia-xcode` | `latest`, `15.2` | macOS Sequoia 15.2 image with Xcode command line tools | 22GB | +| `macos-sequoia-cua` | `latest`, `15.3` | macOS Sequoia 15.3 image compatible with the Computer interface | 24GB | +| `ubuntu-noble-vanilla` | `latest`, `24.04.1` | [Ubuntu Server for ARM 24.04.1 LTS](https://ubuntu.com/download/server/arm) with Ubuntu Desktop | 20GB | ## Disk Space For additional disk space, resize the VM disk after pulling the image using the `lume set --disk-size ` command. Note that the actual disk space used by sparse images will be much lower than the logical size listed. -**Important Note (v0.2.0+):** Images are being re-uploaded with sparse file system optimizations enabled, resulting in significantly lower actual disk usage. Older images (without the `-sparse` suffix) are now **deprecated**. The last version of `lume` fully supporting the non-sparse images was `v0.1.x`. Starting from `v0.2.0`, lume will automatically pull images optimized with sparse file system support. - \ No newline at end of file + **Important Note (v0.2.0+):** Images are being re-uploaded with sparse file system optimizations + enabled, resulting in significantly lower actual disk usage. Older images (without the `-sparse` + suffix) are now **deprecated**. The last version of `lume` fully supporting the non-sparse images + was `v0.1.x`. Starting from `v0.2.0`, lume will automatically pull images optimized with sparse + file system support. + diff --git a/docs/content/docs/libraries/lumier/building-lumier.mdx b/docs/content/docs/libraries/lumier/building-lumier.mdx index df8ad4f8..bd9b9951 100644 --- a/docs/content/docs/libraries/lumier/building-lumier.mdx +++ b/docs/content/docs/libraries/lumier/building-lumier.mdx @@ -39,4 +39,4 @@ docker build -t yourusername/lumier:custom . 
# Push to Docker Hub (after docker login) docker push yourusername/lumier:custom -``` \ No newline at end of file +``` diff --git a/docs/content/docs/libraries/lumier/docker-compose.mdx b/docs/content/docs/libraries/lumier/docker-compose.mdx index fece3473..312598e7 100644 --- a/docs/content/docs/libraries/lumier/docker-compose.mdx +++ b/docs/content/docs/libraries/lumier/docker-compose.mdx @@ -13,10 +13,10 @@ services: container_name: lumier-vm restart: unless-stopped ports: - - "8006:8006" # Port for VNC access + - '8006:8006' # Port for VNC access volumes: - - ./storage:/storage # VM persistent storage - - ./shared:/shared # Shared folder accessible in the VM + - ./storage:/storage # VM persistent storage + - ./shared:/shared # Shared folder accessible in the VM environment: - VM_NAME=lumier-vm - VERSION=ghcr.io/trycua/macos-sequoia-cua:latest diff --git a/docs/content/docs/libraries/lumier/docker.mdx b/docs/content/docs/libraries/lumier/docker.mdx index a14d0599..b7c72050 100644 --- a/docs/content/docs/libraries/lumier/docker.mdx +++ b/docs/content/docs/libraries/lumier/docker.mdx @@ -5,6 +5,7 @@ title: Docker You can use Lumier through Docker: ### Run a macOS VM (ephemeral) + ```bash # Run the container with temporary storage (using pre-built image from Docker Hub) docker run -it --rm \ @@ -16,12 +17,15 @@ docker run -it --rm \ -e RAM_SIZE=8192 \ trycua/lumier:latest ``` + Access the VM in your browser at **http://localhost:8006**. After running the command above, you can access your macOS VM through a web browser (e.g., http://localhost:8006). -With the basic setup above, your VM will be reset when you stop the container (ephemeral mode). This means any changes you make inside the macOS VM will be lost. See the section below for how to save your VM state. + With the basic setup above, your VM will be reset when you stop the container (ephemeral mode). + This means any changes you make inside the macOS VM will be lost. See the section below for how to + save your VM state. ## Saving Your VM State @@ -121,4 +125,4 @@ When running Lumier, you'll need to configure a few things: - `HOST_STORAGE_PATH`: Path to save VM state (when using persistent storage) - `HOST_SHARED_PATH`: Path to the shared folder (optional) -- **Background service**: The `lume serve` service should be running on your host (starts automatically when you install Lume using the `install.sh` script above). \ No newline at end of file +- **Background service**: The `lume serve` service should be running on your host (starts automatically when you install Lume using the `install.sh` script above). diff --git a/docs/content/docs/libraries/lumier/index.mdx b/docs/content/docs/libraries/lumier/index.mdx index 814055ba..d21768f4 100644 --- a/docs/content/docs/libraries/lumier/index.mdx +++ b/docs/content/docs/libraries/lumier/index.mdx @@ -15,7 +15,9 @@ github: ## How It Works -We're using Docker primarily as a convenient delivery mechanism, not as an isolation layer. Unlike traditional Docker containers, Lumier leverages the Apple Virtualization Framework (Apple Vz) through the `lume` CLI to create true virtual machines. + We're using Docker primarily as a convenient delivery mechanism, not as an isolation layer. Unlike + traditional Docker containers, Lumier leverages the Apple Virtualization Framework (Apple Vz) + through the `lume` CLI to create true virtual machines. Here's what's happening behind the scenes: @@ -23,4 +25,4 @@ Here's what's happening behind the scenes: 1. 
The Docker container provides a consistent environment to run the Lumier interface 2. Lumier connects to the Lume service running on your host Mac 3. Lume uses Apple's Virtualization Framework to create a true macOS virtual machine -4. The VM runs with hardware acceleration using your Mac's native virtualization capabilities \ No newline at end of file +4. The VM runs with hardware acceleration using your Mac's native virtualization capabilities diff --git a/docs/content/docs/libraries/lumier/installation.mdx b/docs/content/docs/libraries/lumier/installation.mdx index e0c20267..d2e62399 100644 --- a/docs/content/docs/libraries/lumier/installation.mdx +++ b/docs/content/docs/libraries/lumier/installation.mdx @@ -7,8 +7,9 @@ Before using Lumier, make sure you have: 1. **Docker for Apple Silicon** - download it [here](https://desktop.docker.com/mac/main/arm64/Docker.dmg) and follow the installation instructions. 2. **Lume** - This is the virtualization CLI that powers Lumier. Install it with this command: + ```bash /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)" ``` -After installation, Lume runs as a background service and listens on port 7777. This service allows Lumier to create and manage virtual machines. If port 7777 is already in use on your system, you can specify a different port with the `--port` option when running the `install.sh` script. \ No newline at end of file +After installation, Lume runs as a background service and listens on port 7777. This service allows Lumier to create and manage virtual machines. If port 7777 is already in use on your system, you can specify a different port with the `--port` option when running the `install.sh` script. diff --git a/docs/content/docs/libraries/mcp-server/client-integrations.mdx b/docs/content/docs/libraries/mcp-server/client-integrations.mdx index 8699cda0..4ad0c6a6 100644 --- a/docs/content/docs/libraries/mcp-server/client-integrations.mdx +++ b/docs/content/docs/libraries/mcp-server/client-integrations.mdx @@ -17,4 +17,4 @@ To use with Cursor, add an MCP configuration file in one of these locations: After configuration, you can simply tell Cursor's Agent to perform computer tasks by explicitly mentioning the CUA agent, such as "Use the computer control tools to open Safari." -For more information on MCP with Cursor, see the [official Cursor MCP documentation](https://docs.cursor.com/context/model-context-protocol). \ No newline at end of file +For more information on MCP with Cursor, see the [official Cursor MCP documentation](https://docs.cursor.com/context/model-context-protocol). 
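Following up on the Lumier installation note above (the Lume API runs as a background service on port 7777), here is a minimal, self-contained sketch for checking that the service is reachable before starting containers or clients that depend on it. The host and port are the documented defaults; adjust them if you passed `--port` to the install script or to `lume serve`.

```python
import socket


def lume_api_reachable(host: str = "localhost", port: int = 7777) -> bool:
    """Return True if something (e.g. `lume serve`) accepts TCP connections on the Lume API port."""
    try:
        with socket.create_connection((host, port), timeout=2):
            return True
    except OSError:
        return False


if __name__ == "__main__":
    if lume_api_reachable():
        print("Lume API is reachable on port 7777")
    else:
        print("Nothing is listening on port 7777 - start it with `lume serve` or reinstall the background service")
```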
diff --git a/docs/content/docs/libraries/mcp-server/configuration.mdx b/docs/content/docs/libraries/mcp-server/configuration.mdx index e5df8293..998ccc29 100644 --- a/docs/content/docs/libraries/mcp-server/configuration.mdx +++ b/docs/content/docs/libraries/mcp-server/configuration.mdx @@ -4,7 +4,7 @@ title: Configuration The server is configured using environment variables (can be set in the Claude Desktop config): -| Variable | Description | Default | -|----------|-------------|---------| +| Variable | Description | Default | +| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------ | | `CUA_MODEL_NAME` | Model string (e.g., "anthropic/claude-3-5-sonnet-20241022", "openai/computer-use-preview", "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", "omniparser+litellm/gpt-4o", "omniparser+ollama_chat/gemma3") | anthropic/claude-3-5-sonnet-20241022 | -| `CUA_MAX_IMAGES` | Maximum number of images to keep in context | 3 | +| `CUA_MAX_IMAGES` | Maximum number of images to keep in context | 3 | diff --git a/docs/content/docs/libraries/mcp-server/index.mdx b/docs/content/docs/libraries/mcp-server/index.mdx index 87c9a342..e79d6b1e 100644 --- a/docs/content/docs/libraries/mcp-server/index.mdx +++ b/docs/content/docs/libraries/mcp-server/index.mdx @@ -6,4 +6,4 @@ github: - https://github.com/trycua/cua/tree/main/libs/python/mcp-server --- -**cua-mcp-server** is a MCP server for the Computer-Use Agent (CUA), allowing you to run CUA through Claude Desktop or other MCP clients. \ No newline at end of file +**cua-mcp-server** is a MCP server for the Computer-Use Agent (CUA), allowing you to run CUA through Claude Desktop or other MCP clients. diff --git a/docs/content/docs/libraries/mcp-server/installation.mdx b/docs/content/docs/libraries/mcp-server/installation.mdx index c04a4917..9c0d281f 100644 --- a/docs/content/docs/libraries/mcp-server/installation.mdx +++ b/docs/content/docs/libraries/mcp-server/installation.mdx @@ -9,8 +9,9 @@ pip install cua-mcp-server ``` This will install: + - The MCP server -- CUA agent and computer dependencies +- CUA agent and computer dependencies - An executable `cua-mcp-server` script in your PATH ## Easy Setup Script @@ -22,6 +23,7 @@ curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/python/mcp-ser ``` This script will: + - Create the ~/.cua directory if it doesn't exist - Generate a startup script at ~/.cua/start_mcp_server.sh - Make the script executable @@ -30,7 +32,7 @@ This script will: You can then use the script in your MCP configuration like this: ```json -{ +{ "mcpServers": { "cua-agent": { "command": "/bin/bash", @@ -48,6 +50,7 @@ You can then use the script in your MCP configuration like this: If you get a `/bin/bash: ~/cua/libs/python/mcp-server/scripts/start_mcp_server.sh: No such file or directory` error, try changing the path to the script to be absolute instead of relative. 
To see the logs: + ``` tail -n 20 -f ~/Library/Logs/Claude/mcp*.log -``` \ No newline at end of file +``` diff --git a/docs/content/docs/libraries/mcp-server/llm-integrations.mdx b/docs/content/docs/libraries/mcp-server/llm-integrations.mdx index a7515ae2..6dedd52d 100644 --- a/docs/content/docs/libraries/mcp-server/llm-integrations.mdx +++ b/docs/content/docs/libraries/mcp-server/llm-integrations.mdx @@ -1,6 +1,7 @@ --- title: LLM Integrations --- + ## LiteLLM Integration This MCP server features comprehensive liteLLM integration, allowing you to use any supported LLM provider with a simple model string configuration. @@ -10,7 +11,8 @@ This MCP server features comprehensive liteLLM integration, allowing you to use - **Extensive Provider Support**: Works with Anthropic, OpenAI, local models, and any liteLLM-compatible provider ### Model String Examples: + - **Anthropic**: `"anthropic/claude-3-5-sonnet-20241022"` - **OpenAI**: `"openai/computer-use-preview"` - **UI-TARS**: `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"` -- **Omni + Any LiteLLM**: `"omniparser+litellm/gpt-4o"`, `"omniparser+litellm/claude-3-haiku"`, `"omniparser+ollama_chat/gemma3"` \ No newline at end of file +- **Omni + Any LiteLLM**: `"omniparser+litellm/gpt-4o"`, `"omniparser+litellm/claude-3-haiku"`, `"omniparser+ollama_chat/gemma3"` diff --git a/docs/content/docs/libraries/mcp-server/tools.mdx b/docs/content/docs/libraries/mcp-server/tools.mdx index edf29c0b..fd09b366 100644 --- a/docs/content/docs/libraries/mcp-server/tools.mdx +++ b/docs/content/docs/libraries/mcp-server/tools.mdx @@ -7,4 +7,4 @@ title: Tools The MCP server exposes the following tools to Claude: 1. `run_cua_task` - Run a single Computer-Use Agent task with the given instruction -2. `run_multi_cua_tasks` - Run multiple tasks in sequence \ No newline at end of file +2. `run_multi_cua_tasks` - Run multiple tasks in sequence diff --git a/docs/content/docs/libraries/mcp-server/usage.mdx b/docs/content/docs/libraries/mcp-server/usage.mdx index 19eef934..2cefa2be 100644 --- a/docs/content/docs/libraries/mcp-server/usage.mdx +++ b/docs/content/docs/libraries/mcp-server/usage.mdx @@ -16,5 +16,6 @@ Claude will automatically use your CUA agent to perform these tasks. ### First-time Usage Notes **API Keys**: Ensure you have valid API keys: - - Add your Anthropic API key, or other model provider API key in the Claude Desktop config (as shown above) - - Or set it as an environment variable in your shell profile + +- Add your Anthropic API key, or other model provider API key in the Claude Desktop config (as shown above) +- Or set it as an environment variable in your shell profile diff --git a/docs/content/docs/libraries/som/configuration.mdx b/docs/content/docs/libraries/som/configuration.mdx index e57cdf1c..b421fdae 100644 --- a/docs/content/docs/libraries/som/configuration.mdx +++ b/docs/content/docs/libraries/som/configuration.mdx @@ -5,18 +5,28 @@ title: Configuration ### Detection Parameters #### Box Threshold (0.3) + Controls the confidence threshold for accepting detections: -Illustration of confidence thresholds in object detection, with a high-confidence detection accepted and a low-confidence detection rejected. -- Higher values (0.3) yield more precise but fewer detections -- Lower values (0.01) catch more potential icons but increase false positives -- Default is 0.3 for optimal precision/recall balance + +Illustration of confidence thresholds in object detection, with a high-confidence detection accepted and a low-confidence detection rejected. 
+- Higher values (0.3) yield more precise but fewer detections - Lower values (0.01) catch more +potential icons but increase false positives - Default is 0.3 for optimal precision/recall balance #### IOU Threshold (0.1) + Controls how overlapping detections are merged: -Diagram showing Intersection over Union (IOU) with low overlap between two boxes kept separate and high overlap leading to merging. -- Lower values (0.1) more aggressively remove overlapping boxes -- Higher values (0.5) allow more overlapping detections -- Default is 0.1 to handle densely packed UI elements + +Diagram showing Intersection over Union (IOU) with low overlap between two boxes kept separate and high overlap leading to merging. +- Lower values (0.1) more aggressively remove overlapping boxes - Higher values (0.5) allow more +overlapping detections - Default is 0.1 to handle densely packed UI elements ### OCR Configuration @@ -37,6 +47,7 @@ Controls how overlapping detections are merged: ### Hardware Acceleration #### MPS (Metal Performance Shaders) + - Multi-scale detection (640px, 1280px, 1920px) - Test-time augmentation enabled - Half-precision (FP16) @@ -44,6 +55,7 @@ Controls how overlapping detections are merged: - Best for production use when available #### CPU + - Single-scale detection (1280px) - Full-precision (FP32) - Average detection time: ~1.3s @@ -63,4 +75,4 @@ examples/output/ │ └── screenshot_analyzed.png ├── screen_details.txt └── summary.json -``` \ No newline at end of file +``` diff --git a/docs/content/docs/libraries/som/index.mdx b/docs/content/docs/libraries/som/index.mdx index ceba6e62..3eef53f1 100644 --- a/docs/content/docs/libraries/som/index.mdx +++ b/docs/content/docs/libraries/som/index.mdx @@ -6,7 +6,13 @@ github: - https://github.com/trycua/cua/tree/main/libs/python/som --- -A corresponding Python example is available for this documentation. + + A corresponding{' '} + + Python example + {' '} + is available for this documentation. + ## Overview diff --git a/docs/content/docs/quickstart-devs.mdx b/docs/content/docs/quickstart-devs.mdx index 4bd5b9ab..9cda4a2f 100644 --- a/docs/content/docs/quickstart-devs.mdx +++ b/docs/content/docs/quickstart-devs.mdx @@ -35,7 +35,7 @@ You can run your Cua computer in the cloud (recommended for easiest setup), loca Lume containers are macOS virtual machines that run on a macOS host machine. - + 1. Install the Lume CLI: ```bash @@ -51,8 +51,8 @@ You can run your Cua computer in the cloud (recommended for easiest setup), loca - Windows Sandbox provides Windows virtual environments that run on a Windows host machine. - +Windows Sandbox provides Windows virtual environments that run on a Windows host machine. + 1. Enable [Windows Sandbox](https://learn.microsoft.com/en-us/windows/security/application-security/application-isolation/windows-sandbox/windows-sandbox-install) (requires Windows 10 Pro/Enterprise or Windows 11) 2. Install the `pywinsandbox` dependency: @@ -65,8 +65,8 @@ You can run your Cua computer in the cloud (recommended for easiest setup), loca - Docker provides a way to run Ubuntu containers on any host machine. - +Docker provides a way to run Ubuntu containers on any host machine. + 1. Install Docker Desktop or Docker Engine: 2. 
Pull the CUA Ubuntu sandbox: @@ -173,6 +173,7 @@ Connect to your Cua computer and perform basic interactions, such as taking scre finally: await computer.close() ``` + Install the Cua computer TypeScript SDK: @@ -260,6 +261,7 @@ Connect to your Cua computer and perform basic interactions, such as taking scre await computer.close(); } ``` + @@ -274,11 +276,13 @@ Learn more about computers in the [Cua computers documentation](/computer-sdk/co Utilize an Agent to automate complex tasks by providing it with a goal and allowing it to interact with the computer environment. Install the Cua agent Python SDK: + ```bash pip install "cua-agent[all]" ``` Then, use the `ComputerAgent` object: + ```python from agent import ComputerAgent diff --git a/docs/content/docs/telemetry.mdx b/docs/content/docs/telemetry.mdx index a62b4f5f..fb5437c1 100644 --- a/docs/content/docs/telemetry.mdx +++ b/docs/content/docs/telemetry.mdx @@ -24,6 +24,7 @@ Basic performance metrics and system information that help us understand usage p ### Opt-In Telemetry (Disabled by Default) **Conversation Trajectory Logging**: Full conversation history including: + - User messages and agent responses - Computer actions and their outputs - Reasoning traces from the agent @@ -123,21 +124,21 @@ Note that telemetry settings must be configured during initialization and cannot ### Computer SDK Events -| Event Name | Data Collected | Trigger Notes | -|------------|----------------|---------------| -| **computer_initialized** | • `os`: Operating system (e.g., 'windows', 'darwin', 'linux')
• `os_version`: OS version
• `python_version`: Python version | Triggered when a Computer instance is created | -| **module_init** | • `module`: "computer"
• `version`: Package version
• `python_version`: Full Python version string | Triggered once when the computer package is imported for the first time | +| Event Name | Data Collected | Trigger Notes | +| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------- | +| **computer_initialized** | • `os`: Operating system (e.g., 'windows', 'darwin', 'linux')
• `os_version`: OS version
• `python_version`: Python version | Triggered when a Computer instance is created | +| **module_init** | • `module`: "computer"
• `version`: Package version
• `python_version`: Full Python version string | Triggered once when the computer package is imported for the first time | ### Agent SDK Events -| Event Name | Data Collected | Trigger Notes | -|------------|----------------|---------------| -| **module_init** | • `module`: "agent"
• `version`: Package version
• `python_version`: Full Python version string | Triggered once when the agent package is imported for the first time | -| **agent_session_start** | • `session_id`: Unique UUID for this agent instance
• `agent_type`: Class name (e.g., "ComputerAgent")
• `model`: Model name (e.g., "claude-3-5-sonnet")
• `os`: Operating system
• `os_version`: OS version
• `python_version`: Python version | Triggered when TelemetryCallback is initialized (agent instantiation) | -| **agent_run_start** | • `session_id`: Agent session UUID
• `run_id`: Unique UUID for this run
• `start_time`: Unix timestamp
• `input_context_size`: Character count of input messages
• `num_existing_messages`: Count of existing messages
• `uploaded_trajectory`: Full conversation items (opt-in) | Triggered at the start of each agent.run() call | -| **agent_run_end** | • `session_id`: Agent session UUID
• `run_id`: Run UUID
• `end_time`: Unix timestamp
• `duration_seconds`: Total run duration
• `num_steps`: Total steps taken in this run
• `total_usage`: Accumulated token usage and costs
• `uploaded_trajectory`: Full conversation items (opt-in) | Triggered at the end of each agent.run() call | -| **agent_step** | • `session_id`: Agent session UUID
• `run_id`: Run UUID
• `step`: Step number (incremental)
• `timestamp`: Unix timestamp
• `duration_seconds`: Duration of previous step | Triggered on each agent response/step during a run | -| **agent_usage** | • `session_id`: Agent session UUID
• `run_id`: Run UUID
• `step`: Current step number
• `prompt_tokens`: Tokens in prompt
• `completion_tokens`: Tokens in response
• `total_tokens`: Total tokens used
• `response_cost`: Cost of this API call | Triggered whenever usage information is received from LLM API | +| Event Name | Data Collected | Trigger Notes | +| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------- | +| **module_init** | • `module`: "agent"
• `version`: Package version
• `python_version`: Full Python version string | Triggered once when the agent package is imported for the first time | +| **agent_session_start** | • `session_id`: Unique UUID for this agent instance
• `agent_type`: Class name (e.g., "ComputerAgent")
• `model`: Model name (e.g., "claude-3-5-sonnet")
• `os`: Operating system
• `os_version`: OS version
• `python_version`: Python version | Triggered when TelemetryCallback is initialized (agent instantiation) | +| **agent_run_start** | • `session_id`: Agent session UUID
• `run_id`: Unique UUID for this run
• `start_time`: Unix timestamp
• `input_context_size`: Character count of input messages
• `num_existing_messages`: Count of existing messages
• `uploaded_trajectory`: Full conversation items (opt-in) | Triggered at the start of each agent.run() call | +| **agent_run_end** | • `session_id`: Agent session UUID
• `run_id`: Run UUID
• `end_time`: Unix timestamp
• `duration_seconds`: Total run duration
• `num_steps`: Total steps taken in this run
• `total_usage`: Accumulated token usage and costs
• `uploaded_trajectory`: Full conversation items (opt-in) | Triggered at the end of each agent.run() call | +| **agent_step** | • `session_id`: Agent session UUID
• `run_id`: Run UUID
• `step`: Step number (incremental)
• `timestamp`: Unix timestamp
• `duration_seconds`: Duration of previous step | Triggered on each agent response/step during a run | +| **agent_usage** | • `session_id`: Agent session UUID
• `run_id`: Run UUID
• `step`: Current step number
• `prompt_tokens`: Tokens in prompt
• `completion_tokens`: Tokens in response
• `total_tokens`: Total tokens used
• `response_cost`: Cost of this API call | Triggered whenever usage information is received from LLM API | ## Transparency diff --git a/libs/python/agent/.bumpversion.cfg b/libs/python/agent/.bumpversion.cfg index b6bb6583..9536d01f 100644 --- a/libs/python/agent/.bumpversion.cfg +++ b/libs/python/agent/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.35 +current_version = 0.4.36 commit = True tag = True tag_name = agent-v{new_version} diff --git a/libs/python/agent/agent/computers/base.py b/libs/python/agent/agent/computers/base.py index 51da978b..631b79df 100644 --- a/libs/python/agent/agent/computers/base.py +++ b/libs/python/agent/agent/computers/base.py @@ -28,8 +28,12 @@ class AsyncComputerHandler(Protocol): """Get screen dimensions as (width, height).""" ... - async def screenshot(self) -> str: - """Take a screenshot and return as base64 string.""" + async def screenshot(self, text: Optional[str] = None) -> str: + """Take a screenshot and return as base64 string. + + Args: + text: Optional descriptive text (for compatibility with GPT-4o models, ignored) + """ ... async def click(self, x: int, y: int, button: str = "left") -> None: diff --git a/libs/python/agent/agent/computers/cua.py b/libs/python/agent/agent/computers/cua.py index 99723bda..40374528 100644 --- a/libs/python/agent/agent/computers/cua.py +++ b/libs/python/agent/agent/computers/cua.py @@ -36,8 +36,12 @@ class cuaComputerHandler(AsyncComputerHandler): screen_size = await self.interface.get_screen_size() return screen_size["width"], screen_size["height"] - async def screenshot(self) -> str: - """Take a screenshot and return as base64 string.""" + async def screenshot(self, text: Optional[str] = None) -> str: + """Take a screenshot and return as base64 string. + + Args: + text: Optional descriptive text (for compatibility with GPT-4o models, ignored) + """ assert self.interface is not None screenshot_bytes = await self.interface.screenshot() return base64.b64encode(screenshot_bytes).decode("utf-8") diff --git a/libs/python/agent/agent/computers/custom.py b/libs/python/agent/agent/computers/custom.py index e87b1519..720e3b55 100644 --- a/libs/python/agent/agent/computers/custom.py +++ b/libs/python/agent/agent/computers/custom.py @@ -122,8 +122,12 @@ class CustomComputerHandler(AsyncComputerHandler): return self._last_screenshot_size - async def screenshot(self) -> str: - """Take a screenshot and return as base64 string.""" + async def screenshot(self, text: Optional[str] = None) -> str: + """Take a screenshot and return as base64 string. 
+ + Args: + text: Optional descriptive text (for compatibility with GPT-4o models, ignored) + """ result = await self._call_function(self.functions["screenshot"]) b64_str = self._to_b64_str(result) # type: ignore diff --git a/libs/python/agent/agent/loops/omniparser.py b/libs/python/agent/agent/loops/omniparser.py index 14ef3a92..b3cd3d69 100644 --- a/libs/python/agent/agent/loops/omniparser.py +++ b/libs/python/agent/agent/loops/omniparser.py @@ -243,18 +243,20 @@ async def replace_computer_call_with_function( "id": item.get("id"), "call_id": item.get("call_id"), "status": "completed", - # Fall back to string representation - "content": f"Used tool: {action_data.get("type")}({json.dumps(fn_args)})", } ] elif item_type == "computer_call_output": - # Simple conversion: computer_call_output -> function_call_output + output = item.get("output") + + if isinstance(output, dict): + output = [output] + return [ { "type": "function_call_output", "call_id": item.get("call_id"), - "content": [item.get("output")], + "output": output, "id": item.get("id"), "status": "completed", } diff --git a/libs/python/agent/pyproject.toml b/libs/python/agent/pyproject.toml index d97b9895..d501d204 100644 --- a/libs/python/agent/pyproject.toml +++ b/libs/python/agent/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "pdm.backend" [project] name = "cua-agent" -version = "0.4.35" +version = "0.4.36" description = "CUA (Computer Use) Agent for AI-driven computer interaction" readme = "README.md" authors = [ diff --git a/libs/python/agent/tests/conftest.py b/libs/python/agent/tests/conftest.py new file mode 100644 index 00000000..8270c8e0 --- /dev/null +++ b/libs/python/agent/tests/conftest.py @@ -0,0 +1,84 @@ +"""Pytest configuration and shared fixtures for agent package tests. + +This file contains shared fixtures and configuration for all agent tests. +Following SRP: This file ONLY handles test setup/teardown. +""" + +from unittest.mock import AsyncMock, MagicMock, Mock, patch + +import pytest + + +@pytest.fixture +def mock_litellm(): + """Mock liteLLM completion calls. + + Use this fixture to avoid making real LLM API calls during tests. + Returns a mock that simulates LLM responses. + """ + with patch("litellm.acompletion") as mock_completion: + + async def mock_response(*args, **kwargs): + """Simulate a typical LLM response.""" + return { + "id": "chatcmpl-test123", + "object": "chat.completion", + "created": 1234567890, + "model": kwargs.get("model", "anthropic/claude-3-5-sonnet-20241022"), + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "This is a mocked response for testing.", + }, + "finish_reason": "stop", + } + ], + "usage": { + "prompt_tokens": 10, + "completion_tokens": 20, + "total_tokens": 30, + }, + } + + mock_completion.side_effect = mock_response + yield mock_completion + + +@pytest.fixture +def mock_computer(): + """Mock Computer interface for agent tests. + + Use this fixture to test agent logic without requiring a real Computer instance. + """ + computer = AsyncMock() + computer.interface = AsyncMock() + computer.interface.screenshot = AsyncMock(return_value=b"fake_screenshot_data") + computer.interface.left_click = AsyncMock() + computer.interface.type = AsyncMock() + computer.interface.key = AsyncMock() + + # Mock context manager + computer.__aenter__ = AsyncMock(return_value=computer) + computer.__aexit__ = AsyncMock() + + return computer + + +@pytest.fixture +def disable_telemetry(monkeypatch): + """Disable telemetry for tests. 
+ + Use this fixture to ensure no telemetry is sent during tests. + """ + monkeypatch.setenv("CUA_TELEMETRY_DISABLED", "1") + + +@pytest.fixture +def sample_messages(): + """Provide sample messages for testing. + + Returns a list of messages in the expected format. + """ + return [{"role": "user", "content": "Take a screenshot and tell me what you see"}] diff --git a/libs/python/agent/tests/test_computer_agent.py b/libs/python/agent/tests/test_computer_agent.py new file mode 100644 index 00000000..936c984c --- /dev/null +++ b/libs/python/agent/tests/test_computer_agent.py @@ -0,0 +1,139 @@ +"""Unit tests for ComputerAgent class. + +This file tests ONLY the ComputerAgent initialization and basic functionality. +Following SRP: This file tests ONE class (ComputerAgent). +All external dependencies (liteLLM, Computer) are mocked. +""" + +from unittest.mock import AsyncMock, MagicMock, Mock, patch + +import pytest + + +class TestComputerAgentInitialization: + """Test ComputerAgent initialization (SRP: Only tests initialization).""" + + @patch("agent.agent.litellm") + def test_agent_initialization_with_model(self, mock_litellm, disable_telemetry): + """Test that agent can be initialized with a model string.""" + from agent import ComputerAgent + + agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022") + + assert agent is not None + assert hasattr(agent, "model") + assert agent.model == "anthropic/claude-3-5-sonnet-20241022" + + @patch("agent.agent.litellm") + def test_agent_initialization_with_tools(self, mock_litellm, disable_telemetry, mock_computer): + """Test that agent can be initialized with tools.""" + from agent import ComputerAgent + + agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022", tools=[mock_computer]) + + assert agent is not None + assert hasattr(agent, "tools") + + @patch("agent.agent.litellm") + def test_agent_initialization_with_max_budget(self, mock_litellm, disable_telemetry): + """Test that agent can be initialized with max trajectory budget.""" + from agent import ComputerAgent + + budget = 5.0 + agent = ComputerAgent( + model="anthropic/claude-3-5-sonnet-20241022", max_trajectory_budget=budget + ) + + assert agent is not None + + @patch("agent.agent.litellm") + def test_agent_requires_model(self, mock_litellm, disable_telemetry): + """Test that agent requires a model parameter.""" + from agent import ComputerAgent + + with pytest.raises(TypeError): + # Should fail without model parameter - intentionally missing required argument + ComputerAgent() # type: ignore[call-arg] + + +class TestComputerAgentRun: + """Test ComputerAgent.run() method (SRP: Only tests run logic).""" + + @pytest.mark.asyncio + @patch("agent.agent.litellm") + async def test_agent_run_with_messages(self, mock_litellm, disable_telemetry, sample_messages): + """Test that agent.run() works with valid messages.""" + from agent import ComputerAgent + + # Mock liteLLM response + mock_response = { + "id": "chatcmpl-test", + "choices": [ + { + "message": {"role": "assistant", "content": "Test response"}, + "finish_reason": "stop", + } + ], + "usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30}, + } + + mock_litellm.acompletion = AsyncMock(return_value=mock_response) + + agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022") + + # Run should return an async generator + result_generator = agent.run(sample_messages) + + assert result_generator is not None + # Check it's an async generator + assert hasattr(result_generator, "__anext__") + + def 
test_agent_has_run_method(self, disable_telemetry): + """Test that agent has run method available.""" + from agent import ComputerAgent + + agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022") + + # Verify run method exists + assert hasattr(agent, "run") + assert callable(agent.run) + + def test_agent_has_agent_loop(self, disable_telemetry): + """Test that agent has agent_loop initialized.""" + from agent import ComputerAgent + + agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022") + + # Verify agent_loop is initialized + assert hasattr(agent, "agent_loop") + assert agent.agent_loop is not None + + +class TestComputerAgentTypes: + """Test AgentResponse and Messages types (SRP: Only tests type definitions).""" + + def test_messages_type_exists(self): + """Test that Messages type is exported.""" + from agent import Messages + + assert Messages is not None + + def test_agent_response_type_exists(self): + """Test that AgentResponse type is exported.""" + from agent import AgentResponse + + assert AgentResponse is not None + + +class TestComputerAgentIntegration: + """Test ComputerAgent integration with Computer tool (SRP: Integration within package).""" + + def test_agent_accepts_computer_tool(self, disable_telemetry, mock_computer): + """Test that agent can be initialized with Computer tool.""" + from agent import ComputerAgent + + agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022", tools=[mock_computer]) + + # Verify agent accepted the tool + assert agent is not None + assert hasattr(agent, "tools") diff --git a/libs/python/computer-server/tests/conftest.py b/libs/python/computer-server/tests/conftest.py new file mode 100644 index 00000000..4700526e --- /dev/null +++ b/libs/python/computer-server/tests/conftest.py @@ -0,0 +1,47 @@ +"""Pytest configuration and shared fixtures for computer-server package tests. + +This file contains shared fixtures and configuration for all computer-server tests. +Following SRP: This file ONLY handles test setup/teardown. +""" + +from unittest.mock import AsyncMock, Mock, patch + +import pytest + + +@pytest.fixture +def mock_websocket(): + """Mock WebSocket connection for testing. + + Use this fixture to test WebSocket logic without real connections. + """ + websocket = AsyncMock() + websocket.send = AsyncMock() + websocket.recv = AsyncMock() + websocket.close = AsyncMock() + + return websocket + + +@pytest.fixture +def mock_computer_interface(): + """Mock computer interface for server tests. + + Use this fixture to test server logic without real computer operations. + """ + interface = AsyncMock() + interface.screenshot = AsyncMock(return_value=b"fake_screenshot") + interface.left_click = AsyncMock() + interface.type = AsyncMock() + interface.key = AsyncMock() + + return interface + + +@pytest.fixture +def disable_telemetry(monkeypatch): + """Disable telemetry for tests. + + Use this fixture to ensure no telemetry is sent during tests. + """ + monkeypatch.setenv("CUA_TELEMETRY_DISABLED", "1") diff --git a/libs/python/computer-server/tests/test_server.py b/libs/python/computer-server/tests/test_server.py new file mode 100644 index 00000000..385957e8 --- /dev/null +++ b/libs/python/computer-server/tests/test_server.py @@ -0,0 +1,40 @@ +"""Unit tests for computer-server package. + +This file tests ONLY basic server functionality. +Following SRP: This file tests server initialization and basic operations. +All external dependencies are mocked. 
+""" + +from unittest.mock import AsyncMock, Mock, patch + +import pytest + + +class TestServerImports: + """Test server module imports (SRP: Only tests imports).""" + + def test_server_module_exists(self): + """Test that server module can be imported.""" + try: + import computer_server + + assert computer_server is not None + except ImportError: + pytest.skip("computer_server module not installed") + + +class TestServerInitialization: + """Test server initialization (SRP: Only tests initialization).""" + + @pytest.mark.asyncio + async def test_server_can_be_imported(self): + """Basic smoke test: verify server components can be imported.""" + try: + from computer_server import server + + assert server is not None + except ImportError: + pytest.skip("Server module not available") + except Exception as e: + # Some initialization errors are acceptable in unit tests + pytest.skip(f"Server initialization requires specific setup: {e}") diff --git a/libs/python/computer/tests/conftest.py b/libs/python/computer/tests/conftest.py new file mode 100644 index 00000000..23198674 --- /dev/null +++ b/libs/python/computer/tests/conftest.py @@ -0,0 +1,69 @@ +"""Pytest configuration and shared fixtures for computer package tests. + +This file contains shared fixtures and configuration for all computer tests. +Following SRP: This file ONLY handles test setup/teardown. +""" + +from unittest.mock import AsyncMock, MagicMock, Mock, patch + +import pytest + + +@pytest.fixture +def mock_interface(): + """Mock computer interface for testing. + + Use this fixture to test Computer logic without real OS calls. + """ + interface = AsyncMock() + interface.screenshot = AsyncMock(return_value=b"fake_screenshot") + interface.left_click = AsyncMock() + interface.right_click = AsyncMock() + interface.middle_click = AsyncMock() + interface.double_click = AsyncMock() + interface.type = AsyncMock() + interface.key = AsyncMock() + interface.move_mouse = AsyncMock() + interface.scroll = AsyncMock() + interface.get_screen_size = AsyncMock(return_value=(1920, 1080)) + + return interface + + +@pytest.fixture +def mock_cloud_provider(): + """Mock cloud provider for testing. + + Use this fixture to test cloud provider logic without real API calls. + """ + provider = AsyncMock() + provider.start = AsyncMock() + provider.stop = AsyncMock() + provider.get_status = AsyncMock(return_value="running") + provider.execute_command = AsyncMock(return_value="command output") + + return provider + + +@pytest.fixture +def mock_local_provider(): + """Mock local provider for testing. + + Use this fixture to test local provider logic without real VM operations. + """ + provider = AsyncMock() + provider.start = AsyncMock() + provider.stop = AsyncMock() + provider.get_status = AsyncMock(return_value="running") + provider.execute_command = AsyncMock(return_value="command output") + + return provider + + +@pytest.fixture +def disable_telemetry(monkeypatch): + """Disable telemetry for tests. + + Use this fixture to ensure no telemetry is sent during tests. + """ + monkeypatch.setenv("CUA_TELEMETRY_DISABLED", "1") diff --git a/libs/python/computer/tests/test_computer.py b/libs/python/computer/tests/test_computer.py new file mode 100644 index 00000000..bdbefac8 --- /dev/null +++ b/libs/python/computer/tests/test_computer.py @@ -0,0 +1,67 @@ +"""Unit tests for Computer class. + +This file tests ONLY the Computer class initialization and context manager. +Following SRP: This file tests ONE class (Computer). 
+All external dependencies (providers, interfaces) are mocked. +""" + +from unittest.mock import AsyncMock, MagicMock, Mock, patch + +import pytest + + +class TestComputerImport: + """Test Computer module imports (SRP: Only tests imports).""" + + def test_computer_class_exists(self): + """Test that Computer class can be imported.""" + from computer import Computer + + assert Computer is not None + + def test_vm_provider_type_exists(self): + """Test that VMProviderType enum can be imported.""" + from computer import VMProviderType + + assert VMProviderType is not None + + +class TestComputerInitialization: + """Test Computer initialization (SRP: Only tests initialization).""" + + def test_computer_class_can_be_imported(self, disable_telemetry): + """Test that Computer class can be imported without errors.""" + from computer import Computer + + assert Computer is not None + + def test_computer_has_required_methods(self, disable_telemetry): + """Test that Computer class has required methods.""" + from computer import Computer + + assert hasattr(Computer, "__aenter__") + assert hasattr(Computer, "__aexit__") + + +class TestComputerContextManager: + """Test Computer context manager protocol (SRP: Only tests context manager).""" + + def test_computer_is_async_context_manager(self, disable_telemetry): + """Test that Computer has async context manager methods.""" + from computer import Computer + + assert hasattr(Computer, "__aenter__") + assert hasattr(Computer, "__aexit__") + assert callable(Computer.__aenter__) + assert callable(Computer.__aexit__) + + +class TestComputerInterface: + """Test Computer.interface property (SRP: Only tests interface access).""" + + def test_computer_class_structure(self, disable_telemetry): + """Test that Computer class has expected structure.""" + from computer import Computer + + # Verify Computer is a class + assert isinstance(Computer, type) diff --git a/libs/python/core/tests/conftest.py b/libs/python/core/tests/conftest.py new file mode 100644 index 00000000..1ac1673e --- /dev/null +++ b/libs/python/core/tests/conftest.py @@ -0,0 +1,43 @@ +"""Pytest configuration and shared fixtures for core package tests. + +This file contains shared fixtures and configuration for all core tests. +Following SRP: This file ONLY handles test setup/teardown. +""" + +from unittest.mock import AsyncMock, Mock, patch + +import pytest + + +@pytest.fixture +def mock_httpx_client(): + """Mock httpx.AsyncClient for API calls. + + Use this fixture to avoid making real HTTP requests during tests. + """ + with patch("httpx.AsyncClient") as mock_client: + mock_instance = AsyncMock() + mock_client.return_value.__aenter__.return_value = mock_instance + yield mock_instance + + +@pytest.fixture +def mock_posthog(): + """Mock PostHog client for telemetry tests. + + Use this fixture to avoid sending real telemetry during tests. + """ + with patch("posthog.Posthog") as mock_ph: + mock_instance = Mock() + mock_ph.return_value = mock_instance + yield mock_instance + + +@pytest.fixture +def disable_telemetry(monkeypatch): + """Disable telemetry for tests that don't need it. + + Use this fixture to ensure telemetry is disabled during tests. + """ + monkeypatch.setenv("CUA_TELEMETRY_DISABLED", "1") + yield diff --git a/libs/python/core/tests/test_telemetry.py b/libs/python/core/tests/test_telemetry.py new file mode 100644 index 00000000..5446a884 --- /dev/null +++ b/libs/python/core/tests/test_telemetry.py @@ -0,0 +1,255 @@ +"""Unit tests for core telemetry functionality. 
+ +This file tests ONLY telemetry logic, following SRP. +All external dependencies (PostHog, file system) are mocked. +""" + +import os +from pathlib import Path +from unittest.mock import MagicMock, Mock, mock_open, patch + +import pytest + + +class TestTelemetryEnabled: + """Test telemetry enable/disable logic (SRP: Only tests enable/disable).""" + + def test_telemetry_enabled_by_default(self, monkeypatch): + """Test that telemetry is enabled by default.""" + # Remove any environment variables that might affect the test + monkeypatch.delenv("CUA_TELEMETRY", raising=False) + monkeypatch.delenv("CUA_TELEMETRY_ENABLED", raising=False) + + from core.telemetry import is_telemetry_enabled + + assert is_telemetry_enabled() is True + + def test_telemetry_disabled_with_legacy_flag(self, monkeypatch): + """Test that telemetry can be disabled with legacy CUA_TELEMETRY=off.""" + monkeypatch.setenv("CUA_TELEMETRY", "off") + + from core.telemetry import is_telemetry_enabled + + assert is_telemetry_enabled() is False + + def test_telemetry_disabled_with_new_flag(self, monkeypatch): + """Test that telemetry can be disabled with CUA_TELEMETRY_ENABLED=false.""" + monkeypatch.setenv("CUA_TELEMETRY_ENABLED", "false") + + from core.telemetry import is_telemetry_enabled + + assert is_telemetry_enabled() is False + + @pytest.mark.parametrize("value", ["0", "false", "no", "off"]) + def test_telemetry_disabled_with_various_values(self, monkeypatch, value): + """Test that telemetry respects various disable values.""" + monkeypatch.setenv("CUA_TELEMETRY_ENABLED", value) + + from core.telemetry import is_telemetry_enabled + + assert is_telemetry_enabled() is False + + @pytest.mark.parametrize("value", ["1", "true", "yes", "on"]) + def test_telemetry_enabled_with_various_values(self, monkeypatch, value): + """Test that telemetry respects various enable values.""" + monkeypatch.setenv("CUA_TELEMETRY_ENABLED", value) + + from core.telemetry import is_telemetry_enabled + + assert is_telemetry_enabled() is True + + +class TestPostHogTelemetryClient: + """Test PostHogTelemetryClient class (SRP: Only tests client logic).""" + + @patch("core.telemetry.posthog.posthog") + @patch("core.telemetry.posthog.Path") + def test_client_initialization(self, mock_path, mock_posthog, disable_telemetry): + """Test that client initializes correctly.""" + from core.telemetry.posthog import PostHogTelemetryClient + + # Mock the storage directory + mock_storage_dir = MagicMock() + mock_storage_dir.exists.return_value = False + mock_path.return_value.parent.parent = MagicMock() + mock_path.return_value.parent.parent.__truediv__.return_value = mock_storage_dir + + # Reset singleton + PostHogTelemetryClient.destroy_client() + + client = PostHogTelemetryClient() + + assert client is not None + assert hasattr(client, "installation_id") + assert hasattr(client, "initialized") + assert hasattr(client, "queued_events") + + @patch("core.telemetry.posthog.posthog") + @patch("core.telemetry.posthog.Path") + def test_installation_id_generation(self, mock_path, mock_posthog, disable_telemetry): + """Test that installation ID is generated if not exists.""" + from core.telemetry.posthog import PostHogTelemetryClient + + # Mock file system + mock_id_file = MagicMock() + mock_id_file.exists.return_value = False + mock_storage_dir = MagicMock() + mock_storage_dir.__truediv__.return_value = mock_id_file + + mock_core_dir = MagicMock() + mock_core_dir.__truediv__.return_value = mock_storage_dir + mock_path.return_value.parent.parent = mock_core_dir + + # Reset 
singleton + PostHogTelemetryClient.destroy_client() + + client = PostHogTelemetryClient() + + # Should have generated a new UUID + assert client.installation_id is not None + assert len(client.installation_id) == 36 # UUID format + + @patch("core.telemetry.posthog.posthog") + @patch("core.telemetry.posthog.Path") + def test_installation_id_persistence(self, mock_path, mock_posthog, disable_telemetry): + """Test that installation ID is read from file if exists.""" + from core.telemetry.posthog import PostHogTelemetryClient + + existing_id = "test-installation-id-123" + + # Mock file system + mock_id_file = MagicMock() + mock_id_file.exists.return_value = True + mock_id_file.read_text.return_value = existing_id + + mock_storage_dir = MagicMock() + mock_storage_dir.__truediv__.return_value = mock_id_file + + mock_core_dir = MagicMock() + mock_core_dir.__truediv__.return_value = mock_storage_dir + mock_path.return_value.parent.parent = mock_core_dir + + # Reset singleton + PostHogTelemetryClient.destroy_client() + + client = PostHogTelemetryClient() + + assert client.installation_id == existing_id + + @patch("core.telemetry.posthog.posthog") + @patch("core.telemetry.posthog.Path") + def test_record_event_when_disabled(self, mock_path, mock_posthog, monkeypatch): + """Test that events are not recorded when telemetry is disabled.""" + from core.telemetry.posthog import PostHogTelemetryClient + + # Disable telemetry explicitly using the correct environment variable + monkeypatch.setenv("CUA_TELEMETRY_ENABLED", "false") + + # Mock file system + mock_storage_dir = MagicMock() + mock_storage_dir.exists.return_value = False + mock_path.return_value.parent.parent = MagicMock() + mock_path.return_value.parent.parent.__truediv__.return_value = mock_storage_dir + + # Reset singleton + PostHogTelemetryClient.destroy_client() + + client = PostHogTelemetryClient() + client.record_event("test_event", {"key": "value"}) + + # PostHog capture should not be called at all when telemetry is disabled + mock_posthog.capture.assert_not_called() + + @patch("core.telemetry.posthog.posthog") + @patch("core.telemetry.posthog.Path") + def test_record_event_when_enabled(self, mock_path, mock_posthog, monkeypatch): + """Test that events are recorded when telemetry is enabled.""" + from core.telemetry.posthog import PostHogTelemetryClient + + # Enable telemetry + monkeypatch.setenv("CUA_TELEMETRY_ENABLED", "true") + + # Mock file system + mock_storage_dir = MagicMock() + mock_storage_dir.exists.return_value = False + mock_path.return_value.parent.parent = MagicMock() + mock_path.return_value.parent.parent.__truediv__.return_value = mock_storage_dir + + # Reset singleton + PostHogTelemetryClient.destroy_client() + + client = PostHogTelemetryClient() + client.initialized = True # Pretend it's initialized + + event_name = "test_event" + event_props = {"key": "value"} + client.record_event(event_name, event_props) + + # PostHog capture should be called + assert mock_posthog.capture.call_count >= 1 + + @patch("core.telemetry.posthog.posthog") + @patch("core.telemetry.posthog.Path") + def test_singleton_pattern(self, mock_path, mock_posthog, disable_telemetry): + """Test that get_client returns the same instance.""" + from core.telemetry.posthog import PostHogTelemetryClient + + # Mock file system + mock_storage_dir = MagicMock() + mock_storage_dir.exists.return_value = False + mock_path.return_value.parent.parent = MagicMock() + mock_path.return_value.parent.parent.__truediv__.return_value = mock_storage_dir + + # Reset singleton 
+ PostHogTelemetryClient.destroy_client() + + client1 = PostHogTelemetryClient.get_client() + client2 = PostHogTelemetryClient.get_client() + + assert client1 is client2 + + +class TestRecordEvent: + """Test the public record_event function (SRP: Only tests public API).""" + + @patch("core.telemetry.posthog.PostHogTelemetryClient") + def test_record_event_calls_client(self, mock_client_class, disable_telemetry): + """Test that record_event delegates to the client.""" + from core.telemetry import record_event + + mock_client_instance = Mock() + mock_client_class.get_client.return_value = mock_client_instance + + event_name = "test_event" + event_props = {"key": "value"} + + record_event(event_name, event_props) + + mock_client_instance.record_event.assert_called_once_with(event_name, event_props) + + @patch("core.telemetry.posthog.PostHogTelemetryClient") + def test_record_event_without_properties(self, mock_client_class, disable_telemetry): + """Test that record_event works without properties.""" + from core.telemetry import record_event + + mock_client_instance = Mock() + mock_client_class.get_client.return_value = mock_client_instance + + event_name = "test_event" + + record_event(event_name) + + mock_client_instance.record_event.assert_called_once_with(event_name, {}) + + +class TestDestroyTelemetryClient: + """Test client destruction (SRP: Only tests cleanup).""" + + @patch("core.telemetry.posthog.PostHogTelemetryClient") + def test_destroy_client_calls_class_method(self, mock_client_class): + """Test that destroy_telemetry_client delegates correctly.""" + from core.telemetry import destroy_telemetry_client + + destroy_telemetry_client() + + mock_client_class.destroy_client.assert_called_once() diff --git a/libs/python/mcp-server/tests/conftest.py b/libs/python/mcp-server/tests/conftest.py new file mode 100644 index 00000000..9dff11b3 --- /dev/null +++ b/libs/python/mcp-server/tests/conftest.py @@ -0,0 +1,51 @@ +"""Pytest configuration and shared fixtures for mcp-server package tests. + +This file contains shared fixtures and configuration for all mcp-server tests. +Following SRP: This file ONLY handles test setup/teardown. +""" + +from unittest.mock import AsyncMock, Mock, patch + +import pytest + + +@pytest.fixture +def mock_mcp_context(): + """Mock MCP context for testing. + + Use this fixture to test MCP server logic without real MCP connections. + """ + context = AsyncMock() + context.request_context = AsyncMock() + context.session = Mock() + context.session.send_resource_updated = AsyncMock() + + return context + + +@pytest.fixture +def mock_computer(): + """Mock Computer instance for MCP server tests. + + Use this fixture to test MCP logic without real Computer operations. + """ + computer = AsyncMock() + computer.interface = AsyncMock() + computer.interface.screenshot = AsyncMock(return_value=b"fake_screenshot") + computer.interface.left_click = AsyncMock() + computer.interface.type = AsyncMock() + + # Mock context manager + computer.__aenter__ = AsyncMock(return_value=computer) + computer.__aexit__ = AsyncMock() + + return computer + + +@pytest.fixture +def disable_telemetry(monkeypatch): + """Disable telemetry for tests. + + Use this fixture to ensure no telemetry is sent during tests. 
+ """ + monkeypatch.setenv("CUA_TELEMETRY_DISABLED", "1") diff --git a/libs/python/mcp-server/tests/test_mcp_server.py b/libs/python/mcp-server/tests/test_mcp_server.py new file mode 100644 index 00000000..a9487ef2 --- /dev/null +++ b/libs/python/mcp-server/tests/test_mcp_server.py @@ -0,0 +1,44 @@ +"""Unit tests for mcp-server package. + +This file tests ONLY basic MCP server functionality. +Following SRP: This file tests MCP server initialization. +All external dependencies are mocked. +""" + +from unittest.mock import AsyncMock, Mock, patch + +import pytest + + +class TestMCPServerImports: + """Test MCP server module imports (SRP: Only tests imports).""" + + def test_mcp_server_module_exists(self): + """Test that mcp_server module can be imported.""" + try: + import mcp_server + + assert mcp_server is not None + except ImportError: + pytest.skip("mcp_server module not installed") + except SystemExit: + pytest.skip("MCP dependencies (mcp.server.fastmcp) not available") + + +class TestMCPServerInitialization: + """Test MCP server initialization (SRP: Only tests initialization).""" + + @pytest.mark.asyncio + async def test_mcp_server_can_be_imported(self): + """Basic smoke test: verify MCP server components can be imported.""" + try: + from mcp_server import server + + assert server is not None + except ImportError: + pytest.skip("MCP server module not available") + except SystemExit: + pytest.skip("MCP dependencies (mcp.server.fastmcp) not available") + except Exception as e: + # Some initialization errors are acceptable in unit tests + pytest.skip(f"MCP server initialization requires specific setup: {e}") diff --git a/libs/python/pylume/.bumpversion.cfg b/libs/python/pylume/.bumpversion.cfg deleted file mode 100644 index 4a316b37..00000000 --- a/libs/python/pylume/.bumpversion.cfg +++ /dev/null @@ -1,10 +0,0 @@ -[bumpversion] -current_version = 0.2.1 -commit = True -tag = True -tag_name = pylume-v{new_version} -message = Bump pylume to v{new_version} - -[bumpversion:file:pylume/__init__.py] -search = __version__ = "{current_version}" -replace = __version__ = "{new_version}" diff --git a/libs/python/pylume/README.md b/libs/python/pylume/README.md deleted file mode 100644 index 459d1ce5..00000000 --- a/libs/python/pylume/README.md +++ /dev/null @@ -1,46 +0,0 @@ -
-[centered logo image, alt text "Shows my svg" (surrounding HTML <picture>/<img> markup lost in extraction)]
- -[![Python](https://img.shields.io/badge/Python-333333?logo=python&logoColor=white&labelColor=333333)](#) -[![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#) -[![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85) -[![PyPI](https://img.shields.io/pypi/v/pylume?color=333333)](https://pypi.org/project/pylume/) - -

-
- -**pylume** is a lightweight Python library based on [lume](https://github.com/trycua/lume) to create, run and manage macOS and Linux virtual machines (VMs) natively on Apple Silicon. - -```bash -pip install pylume -``` - -## Usage - -Please refer to this [Notebook](./samples/nb.ipynb) for a quickstart. More details about the underlying API used by pylume are available [here](https://github.com/trycua/lume/docs/API-Reference.md). - -## Prebuilt Images - -Pre-built images are available on [ghcr.io/trycua](https://github.com/orgs/trycua/packages). -These images come pre-configured with an SSH server and auto-login enabled. - -## Contributing - -We welcome and greatly appreciate contributions to lume! Whether you're improving documentation, adding new features, fixing bugs, or adding new VM images, your efforts help make pylume better for everyone. - -Join our [Discord community](https://discord.com/invite/mVnXXpdE85) to discuss ideas or get assistance. - -## License - -lume is open-sourced under the MIT License - see the [LICENSE](LICENSE) file for details. - -## Stargazers over time - -[![Stargazers over time](https://starchart.cc/trycua/pylume.svg?variant=adaptive)](https://starchart.cc/trycua/pylume) diff --git a/libs/python/pylume/__init__.py b/libs/python/pylume/__init__.py deleted file mode 100644 index 128ce121..00000000 --- a/libs/python/pylume/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -""" -PyLume Python SDK - A client library for managing macOS VMs with PyLume. -""" - -from pylume.exceptions import * -from pylume.models import * -from pylume.pylume import * - -__version__ = "0.1.0" diff --git a/libs/python/pylume/pylume/__init__.py b/libs/python/pylume/pylume/__init__.py deleted file mode 100644 index adfb15d9..00000000 --- a/libs/python/pylume/pylume/__init__.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -PyLume Python SDK - A client library for managing macOS VMs with PyLume. 
- -Example: - >>> from pylume import PyLume, VMConfig - >>> client = PyLume() - >>> config = VMConfig(name="my-vm", cpu=4, memory="8GB", disk_size="64GB") - >>> client.create_vm(config) - >>> client.run_vm("my-vm") -""" - -# Import exceptions then all models -from .exceptions import ( - LumeConfigError, - LumeConnectionError, - LumeError, - LumeImageError, - LumeNotFoundError, - LumeServerError, - LumeTimeoutError, - LumeVMError, -) -from .models import ( - CloneSpec, - ImageInfo, - ImageList, - ImageRef, - SharedDirectory, - VMConfig, - VMRunOpts, - VMStatus, - VMUpdateOpts, -) - -# Import main class last to avoid circular imports -from .pylume import PyLume - -__version__ = "0.2.1" - -__all__ = [ - "PyLume", - "VMConfig", - "VMStatus", - "VMRunOpts", - "VMUpdateOpts", - "ImageRef", - "CloneSpec", - "SharedDirectory", - "ImageList", - "ImageInfo", - "LumeError", - "LumeServerError", - "LumeConnectionError", - "LumeTimeoutError", - "LumeNotFoundError", - "LumeConfigError", - "LumeVMError", - "LumeImageError", -] diff --git a/libs/python/pylume/pylume/client.py b/libs/python/pylume/pylume/client.py deleted file mode 100644 index 101d5ee8..00000000 --- a/libs/python/pylume/pylume/client.py +++ /dev/null @@ -1,119 +0,0 @@ -import asyncio -import json -import shlex -import subprocess -from typing import Any, Dict, Optional - -from .exceptions import ( - LumeConfigError, - LumeConnectionError, - LumeError, - LumeNotFoundError, - LumeServerError, - LumeTimeoutError, -) - - -class LumeClient: - def __init__(self, base_url: str, timeout: float = 60.0, debug: bool = False): - self.base_url = base_url - self.timeout = timeout - self.debug = debug - - def _log_debug(self, message: str, **kwargs) -> None: - """Log debug information if debug mode is enabled.""" - if self.debug: - print(f"DEBUG: {message}") - if kwargs: - print(json.dumps(kwargs, indent=2)) - - async def _run_curl( - self, - method: str, - path: str, - data: Optional[Dict[str, Any]] = None, - params: Optional[Dict[str, Any]] = None, - ) -> Any: - """Execute a curl command and return the response.""" - url = f"{self.base_url}{path}" - if params: - param_str = "&".join(f"{k}={v}" for k, v in params.items()) - url = f"{url}?{param_str}" - - cmd = ["curl", "-X", method, "-s", "-w", "%{http_code}", "-m", str(self.timeout)] - - if data is not None: - cmd.extend(["-H", "Content-Type: application/json", "-d", json.dumps(data)]) - - cmd.append(url) - - self._log_debug(f"Running curl command: {' '.join(map(shlex.quote, cmd))}") - - try: - process = await asyncio.create_subprocess_exec( - *cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - stdout, stderr = await process.communicate() - - if process.returncode != 0: - raise LumeConnectionError(f"Curl command failed: {stderr.decode()}") - - # The last 3 characters are the status code - response = stdout.decode() - status_code = int(response[-3:]) - response_body = response[:-3] # Remove status code from response - - if status_code >= 400: - if status_code == 404: - raise LumeNotFoundError(f"Resource not found: {path}") - elif status_code == 400: - raise LumeConfigError(f"Invalid request: {response_body}") - elif status_code >= 500: - raise LumeServerError(f"Server error: {response_body}") - else: - raise LumeError(f"Request failed with status {status_code}: {response_body}") - - return json.loads(response_body) if response_body.strip() else None - - except asyncio.TimeoutError: - raise LumeTimeoutError(f"Request timed out after {self.timeout} seconds") - - async def get(self, path: str, 
params: Optional[Dict[str, Any]] = None) -> Any: - """Make a GET request.""" - return await self._run_curl("GET", path, params=params) - - async def post( - self, path: str, data: Optional[Dict[str, Any]] = None, timeout: Optional[float] = None - ) -> Any: - """Make a POST request.""" - old_timeout = self.timeout - if timeout is not None: - self.timeout = timeout - try: - return await self._run_curl("POST", path, data=data) - finally: - self.timeout = old_timeout - - async def patch(self, path: str, data: Dict[str, Any]) -> None: - """Make a PATCH request.""" - await self._run_curl("PATCH", path, data=data) - - async def delete(self, path: str) -> None: - """Make a DELETE request.""" - await self._run_curl("DELETE", path) - - def print_curl(self, method: str, path: str, data: Optional[Dict[str, Any]] = None) -> None: - """Print equivalent curl command for debugging.""" - curl_cmd = f"""curl -X {method} \\ - '{self.base_url}{path}'""" - - if data: - curl_cmd += f" \\\n -H 'Content-Type: application/json' \\\n -d '{json.dumps(data)}'" - - print("\nEquivalent curl command:") - print(curl_cmd) - print() - - async def close(self) -> None: - """Close the client resources.""" - pass # No shared resources to clean up diff --git a/libs/python/pylume/pylume/exceptions.py b/libs/python/pylume/pylume/exceptions.py deleted file mode 100644 index 191718b0..00000000 --- a/libs/python/pylume/pylume/exceptions.py +++ /dev/null @@ -1,54 +0,0 @@ -from typing import Optional - - -class LumeError(Exception): - """Base exception for all PyLume errors.""" - - pass - - -class LumeServerError(LumeError): - """Raised when there's an error with the PyLume server.""" - - def __init__( - self, message: str, status_code: Optional[int] = None, response_text: Optional[str] = None - ): - self.status_code = status_code - self.response_text = response_text - super().__init__(message) - - -class LumeConnectionError(LumeError): - """Raised when there's an error connecting to the PyLume server.""" - - pass - - -class LumeTimeoutError(LumeError): - """Raised when a request to the PyLume server times out.""" - - pass - - -class LumeNotFoundError(LumeError): - """Raised when a requested resource is not found.""" - - pass - - -class LumeConfigError(LumeError): - """Raised when there's an error with the configuration.""" - - pass - - -class LumeVMError(LumeError): - """Raised when there's an error with a VM operation.""" - - pass - - -class LumeImageError(LumeError): - """Raised when there's an error with an image operation.""" - - pass diff --git a/libs/python/pylume/pylume/lume b/libs/python/pylume/pylume/lume deleted file mode 100755 index 5ea1be47..00000000 Binary files a/libs/python/pylume/pylume/lume and /dev/null differ diff --git a/libs/python/pylume/pylume/models.py b/libs/python/pylume/pylume/models.py deleted file mode 100644 index 021ea8aa..00000000 --- a/libs/python/pylume/pylume/models.py +++ /dev/null @@ -1,265 +0,0 @@ -import re -from typing import Any, Dict, List, Literal, Optional - -from pydantic import BaseModel, ConfigDict, Field, RootModel, computed_field, validator - - -class DiskInfo(BaseModel): - """Information about disk storage allocation. - - Attributes: - total: Total disk space in bytes - allocated: Currently allocated disk space in bytes - """ - - total: int - allocated: int - - -class VMConfig(BaseModel): - """Configuration for creating a new VM. 
- - Note: Memory and disk sizes should be specified with units (e.g., "4GB", "64GB") - - Attributes: - name: Name of the virtual machine - os: Operating system type, either "macOS" or "linux" - cpu: Number of CPU cores to allocate - memory: Amount of memory to allocate with units - disk_size: Size of the disk to create with units - display: Display resolution in format "widthxheight" - ipsw: IPSW path or 'latest' for macOS VMs, None for other OS types - """ - - name: str - os: Literal["macOS", "linux"] = "macOS" - cpu: int = Field(default=2, ge=1) - memory: str = "4GB" - disk_size: str = Field(default="64GB", alias="diskSize") - display: str = "1024x768" - ipsw: Optional[str] = Field(default=None, description="IPSW path or 'latest', for macOS VMs") - - class Config: - populate_by_alias = True - - -class SharedDirectory(BaseModel): - """Configuration for a shared directory. - - Attributes: - host_path: Path to the directory on the host system - read_only: Whether the directory should be mounted as read-only - """ - - host_path: str = Field(..., alias="hostPath") # Allow host_path but serialize as hostPath - read_only: bool = False - - class Config: - populate_by_name = True # Allow both alias and original name - alias_generator = lambda s: "".join( - word.capitalize() if i else word for i, word in enumerate(s.split("_")) - ) - - -class VMRunOpts(BaseModel): - """Configuration for running a VM. - - Args: - no_display: Whether to not display the VNC client - shared_directories: List of directories to share with the VM - """ - - no_display: bool = Field(default=False, alias="noDisplay") - shared_directories: Optional[list[SharedDirectory]] = Field( - default=None, alias="sharedDirectories" - ) - - model_config = ConfigDict( - populate_by_name=True, - alias_generator=lambda s: "".join( - word.capitalize() if i else word for i, word in enumerate(s.split("_")) - ), - ) - - def model_dump(self, **kwargs): - """Export model data with proper field name conversion. - - Converts shared directory fields to match API expectations when using aliases. - - Args: - **kwargs: Keyword arguments passed to parent model_dump method - - Returns: - dict: Model data with properly formatted field names - """ - data = super().model_dump(**kwargs) - # Convert shared directory fields to match API expectations - if self.shared_directories and "by_alias" in kwargs and kwargs["by_alias"]: - data["sharedDirectories"] = [ - {"hostPath": d.host_path, "readOnly": d.read_only} for d in self.shared_directories - ] - # Remove the snake_case version if it exists - data.pop("shared_directories", None) - return data - - -class VMStatus(BaseModel): - """Status information for a virtual machine. - - Attributes: - name: Name of the virtual machine - status: Current status of the VM - os: Operating system type - cpu_count: Number of CPU cores allocated - memory_size: Amount of memory allocated in bytes - disk_size: Disk storage information - vnc_url: URL for VNC connection if available - ip_address: IP address of the VM if available - """ - - name: str - status: str - os: Literal["macOS", "linux"] - cpu_count: int = Field(alias="cpuCount") - memory_size: int = Field(alias="memorySize") # API returns memory size in bytes - disk_size: DiskInfo = Field(alias="diskSize") - vnc_url: Optional[str] = Field(default=None, alias="vncUrl") - ip_address: Optional[str] = Field(default=None, alias="ipAddress") - - class Config: - populate_by_alias = True - - @computed_field - @property - def state(self) -> str: - """Get the current state of the VM. 
- - Returns: - str: Current VM status - """ - return self.status - - @computed_field - @property - def cpu(self) -> int: - """Get the number of CPU cores. - - Returns: - int: Number of CPU cores allocated to the VM - """ - return self.cpu_count - - @computed_field - @property - def memory(self) -> str: - """Get memory allocation in human-readable format. - - Returns: - str: Memory size formatted as "{size}GB" - """ - # Convert bytes to GB - gb = self.memory_size / (1024 * 1024 * 1024) - return f"{int(gb)}GB" - - -class VMUpdateOpts(BaseModel): - """Options for updating VM configuration. - - Attributes: - cpu: Number of CPU cores to update to - memory: Amount of memory to update to with units - disk_size: Size of disk to update to with units - """ - - cpu: Optional[int] = None - memory: Optional[str] = None - disk_size: Optional[str] = None - - -class ImageRef(BaseModel): - """Reference to a VM image. - - Attributes: - image: Name of the image - tag: Tag version of the image - registry: Registry hostname where image is stored - organization: Organization or namespace in the registry - """ - - image: str - tag: str = "latest" - registry: Optional[str] = "ghcr.io" - organization: Optional[str] = "trycua" - - def model_dump(self, **kwargs): - """Override model_dump to return just the image:tag format. - - Args: - **kwargs: Keyword arguments (ignored) - - Returns: - str: Image reference in "image:tag" format - """ - return f"{self.image}:{self.tag}" - - -class CloneSpec(BaseModel): - """Specification for cloning a VM. - - Attributes: - name: Name of the source VM to clone - new_name: Name for the new cloned VM - """ - - name: str - new_name: str = Field(alias="newName") - - class Config: - populate_by_alias = True - - -class ImageInfo(BaseModel): - """Model for individual image information. - - Attributes: - imageId: Unique identifier for the image - """ - - imageId: str - - -class ImageList(RootModel): - """Response model for the images endpoint. - - A list-like container for ImageInfo objects that provides - iteration and indexing capabilities. - """ - - root: List[ImageInfo] - - def __iter__(self): - """Iterate over the image list. - - Returns: - Iterator over ImageInfo objects - """ - return iter(self.root) - - def __getitem__(self, item): - """Get an item from the image list by index. - - Args: - item: Index or slice to retrieve - - Returns: - ImageInfo or list of ImageInfo objects - """ - return self.root[item] - - def __len__(self): - """Get the number of images in the list. 
- - Returns: - int: Number of images in the list - """ - return len(self.root) diff --git a/libs/python/pylume/pylume/pylume.py b/libs/python/pylume/pylume/pylume.py deleted file mode 100644 index 1bbe34b2..00000000 --- a/libs/python/pylume/pylume/pylume.py +++ /dev/null @@ -1,315 +0,0 @@ -import asyncio -import json -import os -import re -import signal -import subprocess -import sys -import time -from functools import wraps -from typing import Any, Callable, List, Optional, TypeVar, Union - -from .client import LumeClient -from .exceptions import ( - LumeConfigError, - LumeConnectionError, - LumeError, - LumeImageError, - LumeNotFoundError, - LumeServerError, - LumeTimeoutError, - LumeVMError, -) -from .models import ( - CloneSpec, - ImageList, - ImageRef, - SharedDirectory, - VMConfig, - VMRunOpts, - VMStatus, - VMUpdateOpts, -) -from .server import LumeServer - -# Type variable for the decorator -T = TypeVar("T") - - -def ensure_server(func: Callable[..., T]) -> Callable[..., T]: - """Decorator to ensure server is running before executing the method.""" - - @wraps(func) - async def wrapper(self: "PyLume", *args: Any, **kwargs: Any) -> T: - # ensure_running is an async method, so we need to await it - await self.server.ensure_running() - # Initialize client if needed - await self._init_client() - return await func(self, *args, **kwargs) # type: ignore - - return wrapper # type: ignore - - -class PyLume: - def __init__( - self, - debug: bool = False, - server_start_timeout: int = 60, - port: Optional[int] = None, - use_existing_server: bool = False, - host: str = "localhost", - ): - """Initialize the async PyLume client. - - Args: - debug: Enable debug logging - auto_start_server: Whether to automatically start the lume server if not running - server_start_timeout: Timeout in seconds to wait for server to start - port: Port number for the lume server. Required when use_existing_server is True. - use_existing_server: If True, will try to connect to an existing server on the specified port - instead of starting a new one. 
- host: Host to use for connections (e.g., "localhost", "127.0.0.1", "host.docker.internal") - """ - if use_existing_server and port is None: - raise LumeConfigError("Port must be specified when using an existing server") - - self.server = LumeServer( - debug=debug, - server_start_timeout=server_start_timeout, - port=port, - use_existing_server=use_existing_server, - host=host, - ) - self.client = None - - async def __aenter__(self) -> "PyLume": - """Async context manager entry.""" - if self.server.use_existing_server: - # Just ensure base_url is set for existing server - if self.server.requested_port is None: - raise LumeConfigError("Port must be specified when using an existing server") - - if not self.server.base_url: - self.server.port = self.server.requested_port - self.server.base_url = f"http://{self.server.host}:{self.server.port}/lume" - - # Ensure the server is running (will connect to existing or start new as needed) - await self.server.ensure_running() - - # Initialize the client - await self._init_client() - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: - """Async context manager exit.""" - if self.client is not None: - await self.client.close() - await self.server.stop() - - async def _init_client(self) -> None: - """Initialize the client if not already initialized.""" - if self.client is None: - if self.server.base_url is None: - raise RuntimeError("Server base URL not set") - self.client = LumeClient(self.server.base_url, debug=self.server.debug) - - def _log_debug(self, message: str, **kwargs) -> None: - """Log debug information if debug mode is enabled.""" - if self.server.debug: - print(f"DEBUG: {message}") - if kwargs: - print(json.dumps(kwargs, indent=2)) - - async def _handle_api_error(self, e: Exception, operation: str) -> None: - """Handle API errors and raise appropriate custom exceptions.""" - if isinstance(e, subprocess.SubprocessError): - raise LumeConnectionError(f"Failed to connect to PyLume server: {str(e)}") - elif isinstance(e, asyncio.TimeoutError): - raise LumeTimeoutError(f"Request timed out: {str(e)}") - - if not hasattr(e, "status") and not isinstance(e, subprocess.CalledProcessError): - raise LumeServerError(f"Unknown error during {operation}: {str(e)}") - - status_code = getattr(e, "status", 500) - response_text = str(e) - - self._log_debug( - f"{operation} request failed", status_code=status_code, response_text=response_text - ) - - if status_code == 404: - raise LumeNotFoundError(f"Resource not found during {operation}") - elif status_code == 400: - raise LumeConfigError(f"Invalid configuration for {operation}: {response_text}") - elif status_code >= 500: - raise LumeServerError( - f"Server error during {operation}", - status_code=status_code, - response_text=response_text, - ) - else: - raise LumeServerError( - f"Error during {operation}", status_code=status_code, response_text=response_text - ) - - async def _read_output(self) -> None: - """Read and log server output.""" - try: - while True: - if not self.server.server_process or self.server.server_process.poll() is not None: - self._log_debug("Server process ended") - break - - # Read stdout without blocking - if self.server.server_process.stdout: - while True: - line = self.server.server_process.stdout.readline() - if not line: - break - line = line.strip() - self._log_debug(f"Server stdout: {line}") - if "Server started" in line.decode("utf-8"): - self._log_debug("Detected server started message") - return - - # Read stderr without blocking - if 
self.server.server_process.stderr: - while True: - line = self.server.server_process.stderr.readline() - if not line: - break - line = line.strip() - self._log_debug(f"Server stderr: {line}") - if "error" in line.decode("utf-8").lower(): - raise RuntimeError(f"Server error: {line}") - - await asyncio.sleep(0.1) # Small delay to prevent CPU spinning - except Exception as e: - self._log_debug(f"Error in output reader: {str(e)}") - raise - - @ensure_server - async def create_vm(self, spec: Union[VMConfig, dict]) -> None: - """Create a VM with the given configuration.""" - # Ensure client is initialized - await self._init_client() - - if isinstance(spec, VMConfig): - spec = spec.model_dump(by_alias=True, exclude_none=True) - - # Suppress optional attribute access errors - self.client.print_curl("POST", "/vms", spec) # type: ignore[attr-defined] - await self.client.post("/vms", spec) # type: ignore[attr-defined] - - @ensure_server - async def run_vm(self, name: str, opts: Optional[Union[VMRunOpts, dict]] = None) -> None: - """Run a VM.""" - if opts is None: - opts = VMRunOpts(no_display=False) # type: ignore[attr-defined] - elif isinstance(opts, dict): - opts = VMRunOpts(**opts) - - payload = opts.model_dump(by_alias=True, exclude_none=True) - self.client.print_curl("POST", f"/vms/{name}/run", payload) # type: ignore[attr-defined] - await self.client.post(f"/vms/{name}/run", payload) # type: ignore[attr-defined] - - @ensure_server - async def list_vms(self) -> List[VMStatus]: - """List all VMs.""" - data = await self.client.get("/vms") # type: ignore[attr-defined] - return [VMStatus.model_validate(vm) for vm in data] - - @ensure_server - async def get_vm(self, name: str) -> VMStatus: - """Get VM details.""" - data = await self.client.get(f"/vms/{name}") # type: ignore[attr-defined] - return VMStatus.model_validate(data) - - @ensure_server - async def update_vm(self, name: str, params: Union[VMUpdateOpts, dict]) -> None: - """Update VM settings.""" - if isinstance(params, dict): - params = VMUpdateOpts(**params) - - payload = params.model_dump(by_alias=True, exclude_none=True) - self.client.print_curl("PATCH", f"/vms/{name}", payload) # type: ignore[attr-defined] - await self.client.patch(f"/vms/{name}", payload) # type: ignore[attr-defined] - - @ensure_server - async def stop_vm(self, name: str) -> None: - """Stop a VM.""" - await self.client.post(f"/vms/{name}/stop") # type: ignore[attr-defined] - - @ensure_server - async def delete_vm(self, name: str) -> None: - """Delete a VM.""" - await self.client.delete(f"/vms/{name}") # type: ignore[attr-defined] - - @ensure_server - async def pull_image( - self, spec: Union[ImageRef, dict, str], name: Optional[str] = None - ) -> None: - """Pull a VM image.""" - await self._init_client() - if isinstance(spec, str): - if ":" in spec: - image_str = spec - else: - image_str = f"{spec}:latest" - registry = "ghcr.io" - organization = "trycua" - elif isinstance(spec, dict): - image = spec.get("image", "") - tag = spec.get("tag", "latest") - image_str = f"{image}:{tag}" - registry = spec.get("registry", "ghcr.io") - organization = spec.get("organization", "trycua") - else: - image_str = f"{spec.image}:{spec.tag}" - registry = spec.registry - organization = spec.organization - - payload = { - "image": image_str, - "name": name, - "registry": registry, - "organization": organization, - } - - self.client.print_curl("POST", "/pull", payload) # type: ignore[attr-defined] - await self.client.post("/pull", payload, timeout=300.0) # type: ignore[attr-defined] - - 
@ensure_server - async def clone_vm(self, name: str, new_name: str) -> None: - """Clone a VM with the given name to a new VM with new_name.""" - config = CloneSpec(name=name, newName=new_name) - self.client.print_curl("POST", "/vms/clone", config.model_dump()) # type: ignore[attr-defined] - await self.client.post("/vms/clone", config.model_dump()) # type: ignore[attr-defined] - - @ensure_server - async def get_latest_ipsw_url(self) -> str: - """Get the latest IPSW URL.""" - await self._init_client() - data = await self.client.get("/ipsw") # type: ignore[attr-defined] - return data["url"] - - @ensure_server - async def get_images(self, organization: Optional[str] = None) -> ImageList: - """Get list of available images.""" - await self._init_client() - params = {"organization": organization} if organization else None - data = await self.client.get("/images", params) # type: ignore[attr-defined] - return ImageList(root=data) - - async def close(self) -> None: - """Close the client and stop the server.""" - if self.client is not None: - await self.client.close() - self.client = None - await asyncio.sleep(1) - await self.server.stop() - - async def _ensure_client(self) -> None: - """Ensure client is initialized.""" - if self.client is None: - await self._init_client() diff --git a/libs/python/pylume/pylume/server.py b/libs/python/pylume/pylume/server.py deleted file mode 100644 index cab5f627..00000000 --- a/libs/python/pylume/pylume/server.py +++ /dev/null @@ -1,481 +0,0 @@ -import asyncio -import json -import logging -import os -import random -import shlex -import signal -import socket -import subprocess -import sys -import tempfile -import time -from logging import getLogger -from typing import Optional - -from .exceptions import LumeConnectionError - - -class LumeServer: - def __init__( - self, - debug: bool = False, - server_start_timeout: int = 60, - port: Optional[int] = None, - use_existing_server: bool = False, - host: str = "localhost", - ): - """Initialize the LumeServer. 
- - Args: - debug: Enable debug logging - server_start_timeout: Timeout in seconds to wait for server to start - port: Specific port to use for the server - use_existing_server: If True, will try to connect to an existing server - instead of starting a new one - host: Host to use for connections (e.g., "localhost", "127.0.0.1", "host.docker.internal") - """ - self.debug = debug - self.server_start_timeout = server_start_timeout - self.server_process = None - self.output_file = None - self.requested_port = port - self.port = None - self.base_url = None - self.use_existing_server = use_existing_server - self.host = host - - # Configure logging - self.logger = getLogger("pylume.server") - if not self.logger.handlers: - handler = logging.StreamHandler() - formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") - handler.setFormatter(formatter) - self.logger.addHandler(handler) - self.logger.setLevel(logging.DEBUG if debug else logging.INFO) - - self.logger.debug(f"Server initialized with host: {self.host}") - - def _check_port_available(self, port: int) -> bool: - """Check if a port is available.""" - try: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.settimeout(0.5) - result = s.connect_ex(("127.0.0.1", port)) - if result == 0: # Port is in use on localhost - return False - except: - pass - - # Check the specified host (e.g., "host.docker.internal") if it's not a localhost alias - if self.host not in ["localhost", "127.0.0.1"]: - try: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.settimeout(0.5) - result = s.connect_ex((self.host, port)) - if result == 0: # Port is in use on host - return False - except: - pass - - return True - - def _get_server_port(self) -> int: - """Get an available port for the server.""" - # Use requested port if specified - if self.requested_port is not None: - if not self._check_port_available(self.requested_port): - raise RuntimeError(f"Requested port {self.requested_port} is not available") - return self.requested_port - - # Find a free port - for _ in range(10): # Try up to 10 times - port = random.randint(49152, 65535) - if self._check_port_available(port): - return port - - raise RuntimeError("Could not find an available port") - - async def _ensure_server_running(self) -> None: - """Ensure the lume server is running, start it if it's not.""" - try: - self.logger.debug("Checking if lume server is running...") - # Try to connect to the server with a short timeout - cmd = ["curl", "-s", "-w", "%{http_code}", "-m", "5", f"{self.base_url}/vms"] - process = await asyncio.create_subprocess_exec( - *cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - stdout, stderr = await process.communicate() - - if process.returncode == 0: - response = stdout.decode() - status_code = int(response[-3:]) - if status_code == 200: - self.logger.debug("PyLume server is running") - return - - self.logger.debug("PyLume server not running, attempting to start it") - # Server not running, try to start it - lume_path = os.path.join(os.path.dirname(__file__), "lume") - if not os.path.exists(lume_path): - raise RuntimeError(f"Could not find lume binary at {lume_path}") - - # Make sure the file is executable - os.chmod(lume_path, 0o755) - - # Create a temporary file for server output - self.output_file = tempfile.NamedTemporaryFile(mode="w+", delete=False) - self.logger.debug(f"Using temporary file for server output: {self.output_file.name}") - - # Start the server - self.logger.debug(f"Starting lume server with: 
{lume_path} serve --port {self.port}") - - # Start server in background using subprocess.Popen - try: - self.server_process = subprocess.Popen( - [lume_path, "serve", "--port", str(self.port)], - stdout=self.output_file, - stderr=self.output_file, - cwd=os.path.dirname(lume_path), - start_new_session=True, # Run in new session to avoid blocking - ) - except Exception as e: - self.output_file.close() - os.unlink(self.output_file.name) - raise RuntimeError(f"Failed to start lume server process: {str(e)}") - - # Wait for server to start - self.logger.debug( - f"Waiting up to {self.server_start_timeout} seconds for server to start..." - ) - start_time = time.time() - server_ready = False - last_size = 0 - - while time.time() - start_time < self.server_start_timeout: - if self.server_process.poll() is not None: - # Process has terminated - self.output_file.seek(0) - output = self.output_file.read() - self.output_file.close() - os.unlink(self.output_file.name) - error_msg = ( - f"Server process terminated unexpectedly.\n" - f"Exit code: {self.server_process.returncode}\n" - f"Output: {output}" - ) - raise RuntimeError(error_msg) - - # Check output file for server ready message - self.output_file.seek(0, os.SEEK_END) - size = self.output_file.tell() - if size > last_size: # Only read if there's new content - self.output_file.seek(last_size) - new_output = self.output_file.read() - if new_output.strip(): # Only log non-empty output - self.logger.debug(f"Server output: {new_output.strip()}") - last_size = size - - if "Server started" in new_output: - server_ready = True - self.logger.debug("Server startup detected") - break - - # Try to connect to the server periodically - try: - cmd = ["curl", "-s", "-w", "%{http_code}", "-m", "5", f"{self.base_url}/vms"] - process = await asyncio.create_subprocess_exec( - *cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - stdout, stderr = await process.communicate() - - if process.returncode == 0: - response = stdout.decode() - status_code = int(response[-3:]) - if status_code == 200: - server_ready = True - self.logger.debug("Server is responding to requests") - break - except: - pass # Server not ready yet - - await asyncio.sleep(1.0) - - if not server_ready: - # Cleanup if server didn't start - if self.server_process: - self.server_process.terminate() - try: - self.server_process.wait(timeout=5) - except subprocess.TimeoutExpired: - self.server_process.kill() - self.output_file.close() - os.unlink(self.output_file.name) - raise RuntimeError( - f"Failed to start lume server after {self.server_start_timeout} seconds. " - "Check the debug output for more details." 
- ) - - # Give the server a moment to fully initialize - await asyncio.sleep(2.0) - - # Verify server is responding - try: - cmd = ["curl", "-s", "-w", "%{http_code}", "-m", "10", f"{self.base_url}/vms"] - process = await asyncio.create_subprocess_exec( - *cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - stdout, stderr = await process.communicate() - - if process.returncode != 0: - raise RuntimeError(f"Curl command failed: {stderr.decode()}") - - response = stdout.decode() - status_code = int(response[-3:]) - - if status_code != 200: - raise RuntimeError(f"Server returned status code {status_code}") - - self.logger.debug("PyLume server started successfully") - except Exception as e: - self.logger.debug(f"Server verification failed: {str(e)}") - if self.server_process: - self.server_process.terminate() - try: - self.server_process.wait(timeout=5) - except subprocess.TimeoutExpired: - self.server_process.kill() - self.output_file.close() - os.unlink(self.output_file.name) - raise RuntimeError(f"Server started but is not responding: {str(e)}") - - self.logger.debug("Server startup completed successfully") - - except Exception as e: - raise RuntimeError(f"Failed to start lume server: {str(e)}") - - async def _start_server(self) -> None: - """Start the lume server using the lume executable.""" - self.logger.debug("Starting PyLume server") - - # Get absolute path to lume executable in the same directory as this file - lume_path = os.path.join(os.path.dirname(__file__), "lume") - if not os.path.exists(lume_path): - raise RuntimeError(f"Could not find lume binary at {lume_path}") - - try: - # Make executable - os.chmod(lume_path, 0o755) - - # Get and validate port - self.port = self._get_server_port() - self.base_url = f"http://{self.host}:{self.port}/lume" - - # Set up output handling - self.output_file = tempfile.NamedTemporaryFile(mode="w+", delete=False) - - # Start the server process with the lume executable - env = os.environ.copy() - env["RUST_BACKTRACE"] = "1" # Enable backtrace for better error reporting - - # Specify the host to bind to (0.0.0.0 to allow external connections) - self.server_process = subprocess.Popen( - [lume_path, "serve", "--port", str(self.port)], - stdout=self.output_file, - stderr=subprocess.STDOUT, - cwd=os.path.dirname(lume_path), # Run from same directory as executable - env=env, - ) - - # Wait for server to initialize - await asyncio.sleep(2) - await self._wait_for_server() - - except Exception as e: - await self._cleanup() - raise RuntimeError(f"Failed to start lume server process: {str(e)}") - - async def _tail_log(self) -> None: - """Read and display server log output in debug mode.""" - while True: - try: - self.output_file.seek(0, os.SEEK_END) # type: ignore[attr-defined] - line = self.output_file.readline() # type: ignore[attr-defined] - if line: - line = line.strip() - if line: - print(f"SERVER: {line}") - if self.server_process.poll() is not None: # type: ignore[attr-defined] - print("Server process ended") - break - await asyncio.sleep(0.1) - except Exception as e: - print(f"Error reading log: {e}") - await asyncio.sleep(0.1) - - async def _wait_for_server(self) -> None: - """Wait for server to start and become responsive with increased timeout.""" - start_time = time.time() - while time.time() - start_time < self.server_start_timeout: - if self.server_process.poll() is not None: # type: ignore[attr-defined] - error_msg = await self._get_error_output() - await self._cleanup() - raise RuntimeError(error_msg) - - try: - await self._verify_server() 
- self.logger.debug("Server is now responsive") - return - except Exception as e: - self.logger.debug(f"Server not ready yet: {str(e)}") - await asyncio.sleep(1.0) - - await self._cleanup() - raise RuntimeError(f"Server failed to start after {self.server_start_timeout} seconds") - - async def _verify_server(self) -> None: - """Verify server is responding to requests.""" - try: - cmd = [ - "curl", - "-s", - "-w", - "%{http_code}", - "-m", - "10", - f"http://{self.host}:{self.port}/lume/vms", - ] - process = await asyncio.create_subprocess_exec( - *cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - stdout, stderr = await process.communicate() - - if process.returncode != 0: - raise RuntimeError(f"Curl command failed: {stderr.decode()}") - - response = stdout.decode() - status_code = int(response[-3:]) - - if status_code != 200: - raise RuntimeError(f"Server returned status code {status_code}") - - self.logger.debug("PyLume server started successfully") - except Exception as e: - raise RuntimeError(f"Server not responding: {str(e)}") - - async def _get_error_output(self) -> str: - """Get error output from the server process.""" - if not self.output_file: - return "No output available" - self.output_file.seek(0) - output = self.output_file.read() - return ( - f"Server process terminated unexpectedly.\n" - f"Exit code: {self.server_process.returncode}\n" # type: ignore[attr-defined] - f"Output: {output}" - ) - - async def _cleanup(self) -> None: - """Clean up all server resources.""" - if self.server_process: - try: - self.server_process.terminate() - try: - self.server_process.wait(timeout=5) - except subprocess.TimeoutExpired: - self.server_process.kill() - except: - pass - self.server_process = None - - # Clean up output file - if self.output_file: - try: - self.output_file.close() - os.unlink(self.output_file.name) - except Exception as e: - self.logger.debug(f"Error cleaning up output file: {e}") - self.output_file = None - - async def ensure_running(self) -> None: - """Ensure the server is running. - - If use_existing_server is True, will only try to connect to an existing server. - Otherwise will: - 1. Try to connect to an existing server on the specified port - 2. If that fails and not in Docker, start a new server - 3. 
If in Docker and no existing server is found, raise an error - """ - # First check if we're in Docker - in_docker = os.path.exists("/.dockerenv") or ( - os.path.exists("/proc/1/cgroup") and "docker" in open("/proc/1/cgroup", "r").read() - ) - - # If using a non-localhost host like host.docker.internal, set up the connection details - if self.host not in ["localhost", "127.0.0.1"]: - if self.requested_port is None: - raise RuntimeError("Port must be specified when using a remote host") - - self.port = self.requested_port - self.base_url = f"http://{self.host}:{self.port}/lume" - self.logger.debug(f"Using remote host server at {self.base_url}") - - # Try to verify the server is accessible - try: - await self._verify_server() - self.logger.debug("Successfully connected to remote server") - return - except Exception as e: - if self.use_existing_server or in_docker: - # If explicitly requesting an existing server or in Docker, we can't start a new one - raise RuntimeError( - f"Failed to connect to remote server at {self.base_url}: {str(e)}" - ) - else: - self.logger.debug(f"Remote server not available at {self.base_url}: {str(e)}") - # Fall back to localhost for starting a new server - self.host = "localhost" - - # If explicitly using an existing server, verify it's running - if self.use_existing_server: - if self.requested_port is None: - raise RuntimeError("Port must be specified when using an existing server") - - self.port = self.requested_port - self.base_url = f"http://{self.host}:{self.port}/lume" - - try: - await self._verify_server() - self.logger.debug("Successfully connected to existing server") - except Exception as e: - raise RuntimeError( - f"Failed to connect to existing server at {self.base_url}: {str(e)}" - ) - else: - # Try to connect to an existing server first - if self.requested_port is not None: - self.port = self.requested_port - self.base_url = f"http://{self.host}:{self.port}/lume" - - try: - await self._verify_server() - self.logger.debug("Successfully connected to existing server") - return - except Exception: - self.logger.debug(f"No existing server found at {self.base_url}") - - # If in Docker and can't connect to existing server, raise an error - if in_docker: - raise RuntimeError( - f"Failed to connect to server at {self.base_url} and cannot start a new server in Docker" - ) - - # Start a new server - self.logger.debug("Starting a new server instance") - await self._start_server() - - async def stop(self) -> None: - """Stop the server if we're managing it.""" - if not self.use_existing_server: - self.logger.debug("Stopping lume server...") - await self._cleanup() diff --git a/libs/python/pylume/pyproject.toml b/libs/python/pylume/pyproject.toml deleted file mode 100644 index 976fe6ff..00000000 --- a/libs/python/pylume/pyproject.toml +++ /dev/null @@ -1,51 +0,0 @@ -[build-system] -build-backend = "pdm.backend" -requires = ["pdm-backend"] - -[project] -authors = [{ name = "TryCua", email = "gh@trycua.com" }] -classifiers = [ - "Intended Audience :: Developers", - "License :: OSI Approved :: MIT License", - "Operating System :: MacOS :: MacOS X", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", -] -dependencies = ["pydantic>=2.11.1"] -description = "Python SDK for lume - run macOS and Linux VMs on Apple Silicon" -dynamic = ["version"] -keywords = ["apple-silicon", "macos", "virtualization", "vm"] -license = { text = "MIT" } -name = "pylume" 
-readme = "README.md" -requires-python = ">=3.12" - -[tool.pdm.version] -path = "pylume/__init__.py" -source = "file" - -[project.urls] -homepage = "https://github.com/trycua/pylume" -repository = "https://github.com/trycua/pylume" - -[tool.pdm] -distribution = true - -[tool.pdm.dev-dependencies] -dev = [ - "black>=23.0.0", - "isort>=5.12.0", - "pytest-asyncio>=0.23.0", - "pytest>=7.0.0", -] - -[tool.pytest.ini_options] -asyncio_mode = "auto" -python_files = "test_*.py" -testpaths = ["tests"] - -[tool.pdm.build] -includes = ["pylume/"] -source-includes = ["LICENSE", "README.md", "tests/"] \ No newline at end of file diff --git a/libs/python/pylume/tests/conftest.py b/libs/python/pylume/tests/conftest.py new file mode 100644 index 00000000..66ff2dee --- /dev/null +++ b/libs/python/pylume/tests/conftest.py @@ -0,0 +1,23 @@ +"""Pytest configuration for pylume tests. + +This module provides test fixtures for the pylume package. +Note: This package has macOS-specific dependencies and will skip tests +if the required modules are not available. +""" + +from unittest.mock import Mock, patch + +import pytest + + +@pytest.fixture +def mock_subprocess(): + with patch("subprocess.run") as mock_run: + mock_run.return_value = Mock(returncode=0, stdout="", stderr="") + yield mock_run + + +@pytest.fixture +def mock_requests(): + with patch("requests.get") as mock_get, patch("requests.post") as mock_post: + yield {"get": mock_get, "post": mock_post} diff --git a/libs/python/pylume/tests/test_pylume.py b/libs/python/pylume/tests/test_pylume.py new file mode 100644 index 00000000..fa27eb0f --- /dev/null +++ b/libs/python/pylume/tests/test_pylume.py @@ -0,0 +1,38 @@ +"""Unit tests for pylume package. + +This file tests ONLY basic pylume functionality. +Following SRP: This file tests pylume module imports and basic operations. +All external dependencies are mocked. +""" + +import pytest + + +class TestPylumeImports: + """Test pylume module imports (SRP: Only tests imports).""" + + def test_pylume_module_exists(self): + """Test that pylume module can be imported.""" + try: + import pylume + + assert pylume is not None + except ImportError: + pytest.skip("pylume module not installed") + + +class TestPylumeInitialization: + """Test pylume initialization (SRP: Only tests initialization).""" + + def test_pylume_can_be_imported(self): + """Basic smoke test: verify pylume components can be imported.""" + try: + import pylume + + # Check for basic attributes + assert pylume is not None + except ImportError: + pytest.skip("pylume module not available") + except Exception as e: + # Some initialization errors are acceptable in unit tests + pytest.skip(f"pylume initialization requires specific setup: {e}") diff --git a/libs/python/som/tests/conftest.py b/libs/python/som/tests/conftest.py new file mode 100644 index 00000000..d47e9430 --- /dev/null +++ b/libs/python/som/tests/conftest.py @@ -0,0 +1,24 @@ +"""Pytest configuration for som tests. + +This module provides test fixtures for the som (Set-of-Mark) package. +The som package depends on heavy ML models and will skip tests if not available. 
+""" + +from unittest.mock import Mock, patch + +import pytest + + +@pytest.fixture +def mock_torch(): + with patch("torch.load") as mock_load: + mock_load.return_value = Mock() + yield mock_load + + +@pytest.fixture +def mock_icon_detector(): + with patch("omniparser.IconDetector") as mock_detector: + instance = Mock() + mock_detector.return_value = instance + yield instance diff --git a/libs/python/som/tests/test_omniparser.py b/libs/python/som/tests/test_omniparser.py index 2edbdcd0..50598e7b 100644 --- a/libs/python/som/tests/test_omniparser.py +++ b/libs/python/som/tests/test_omniparser.py @@ -1,13 +1,73 @@ -# """Basic tests for the omniparser package.""" +"""Unit tests for som package (Set-of-Mark). -# import pytest -# from omniparser import IconDetector +This file tests ONLY basic som functionality. +Following SRP: This file tests som module imports and basic operations. +All external dependencies (ML models, OCR) are mocked. +""" -# def test_icon_detector_import(): -# """Test that we can import the IconDetector class.""" -# assert IconDetector is not None +import pytest -# def test_icon_detector_init(): -# """Test that we can create an IconDetector instance.""" -# detector = IconDetector(force_cpu=True) -# assert detector is not None + +class TestSomImports: + """Test som module imports (SRP: Only tests imports).""" + + def test_som_module_exists(self): + """Test that som module can be imported.""" + try: + import som + + assert som is not None + except ImportError: + pytest.skip("som module not installed") + + def test_omniparser_import(self): + """Test that OmniParser can be imported.""" + try: + from som import OmniParser + + assert OmniParser is not None + except ImportError: + pytest.skip("som module not available") + except Exception as e: + pytest.skip(f"som initialization requires ML models: {e}") + + def test_models_import(self): + """Test that model classes can be imported.""" + try: + from som import BoundingBox, ParseResult, UIElement + + assert BoundingBox is not None + assert UIElement is not None + assert ParseResult is not None + except ImportError: + pytest.skip("som models not available") + except Exception as e: + pytest.skip(f"som models require dependencies: {e}") + + +class TestSomModels: + """Test som data models (SRP: Only tests model structure).""" + + def test_bounding_box_structure(self): + """Test BoundingBox class structure.""" + try: + from som import BoundingBox + + # Check the class exists and has expected structure + assert hasattr(BoundingBox, "__init__") + except ImportError: + pytest.skip("som models not available") + except Exception as e: + pytest.skip(f"som models require dependencies: {e}") + + def test_ui_element_structure(self): + """Test UIElement class structure.""" + try: + from som import UIElement + + # Check the class exists and has expected structure + assert hasattr(UIElement, "__init__") + except ImportError: + pytest.skip("som models not available") + except Exception as e: + pytest.skip(f"som models require dependencies: {e}") diff --git a/notebooks/README.md b/notebooks/README.md index e0a18cd3..fb70230d 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -8,7 +8,6 @@ This folder contains Jupyter notebooks that demonstrate the core functionality o - **`computer_nb.ipynb`** - Demonstrates the Computer API for programmatically operating sandbox VMs using either Cua Cloud Sandbox or local Lume VMs on Apple Silicon macOS systems - **`agent_nb.ipynb`** - Shows how to use CUA's Agent to run automated workflows in virtual 
sandboxes with various AI models (OpenAI, Anthropic, local models) -- **`pylume_nb.ipynb`** - Quickstart guide for the pylume Python library, which handles VM creation, management, and image operations - **`computer_server_nb.ipynb`** - Demonstrates how to host and configure the Computer server that powers the Computer API ### Evaluation & Benchmarking diff --git a/notebooks/pylume_nb.ipynb b/notebooks/pylume_nb.ipynb deleted file mode 100644 index 1b504417..00000000 --- a/notebooks/pylume_nb.ipynb +++ /dev/null @@ -1,357 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Intro\n", - "\n", - "This notebook provides a quickstart guide to the pylume python library." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip uninstall pylume -y" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install pylume" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install pydantic" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# If locally installed, use this instead:\n", - "# !poetry install\n", - "# !poetry build\n", - "!pip uninstall pylume -y && pip install ./dist/pylume-0.1.0-py3-none-any.whl --force-reinstall" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import asyncio\n", - "from pylume import (\n", - " PyLume, \n", - " ImageRef, \n", - " VMRunOpts, \n", - " SharedDirectory, \n", - " VMConfig,\n", - " VMUpdateOpts\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get latest IPSW URL from Apple Server\n", - "\n", - "This is used to create a new macOS VM by providing the downloaded IPSW file path to the `ipsw` argument in the `create_vm` method." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "async def get_ipsw():\n", - " async with PyLume(port=7777) as pylume:\n", - " url = await pylume.get_latest_ipsw_url()\n", - " print(f\"Latest IPSW URL: {url}\")\n", - "\n", - "await get_ipsw()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a new VM" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### macOS\n", - "\n", - "An IPSW file path is required to create a new macOS VM. To fetch automatically the latest IPSW during the VM creation, use `ipsw=\"latest\"`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "async def create_macos_vm():\n", - " async with PyLume() as pylume:\n", - " vm_config = VMConfig(\n", - " name=\"macos-vm\",\n", - " os=\"macOS\",\n", - " cpu=4,\n", - " memory=\"4GB\",\n", - " disk_size=\"40GB\",\n", - " display=\"1024x768\",\n", - " ipsw=\"latest\"\n", - " )\n", - " await pylume.create_vm(vm_config)\n", - "\n", - "await create_macos_vm()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Linux\n", - "\n", - "To create a new Linux VM, use the `os=\"linux\"` argument in the `VMConfig` class. Note that this doesn't set up any Linux distribution, it just creates a VM with a Linux kernel." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "async def create_linux_vm():\n", - " async with PyLume() as pylume:\n", - " vm_config = VMConfig(\n", - " name=\"linux-vm\",\n", - " os=\"linux\",\n", - " cpu=2,\n", - " memory=\"4GB\",\n", - " disk_size=\"25GB\",\n", - " display=\"1024x768\"\n", - " )\n", - " await pylume.create_vm(vm_config)\n", - "\n", - "await create_linux_vm()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Pull an image from ghcr.io/trycua" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Cua provides pre-built images for macOS and Linux." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "async def pull_macos_image():\n", - " async with PyLume() as pylume:\n", - " image_ref = ImageRef(\n", - " image=\"macos-sequoia-vanilla\",\n", - " tag=\"15.2\",\n", - " registry=\"ghcr.io\",\n", - " organization=\"trycua\"\n", - " )\n", - " await pylume.pull_image(image_ref, name=\"macos-sequoia-vanilla\")\n", - "\n", - "await pull_macos_image()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run\n", - "\n", - "Run a VM by providing the `VMRunConfig` object." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "async def run_vm():\n", - " async with PyLume() as pylume:\n", - " vm_name = \"macos-sequoia-vanilla\"\n", - " run_opts = VMRunOpts(\n", - " no_display=False,\n", - " shared_directories=[\n", - " SharedDirectory(\n", - " host_path=\"/Users//Shared\",\n", - " read_only=False\n", - " )\n", - " ]\n", - " )\n", - " await pylume.run_vm(vm_name, run_opts)\n", - "\n", - "await run_vm()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### List existing VMs\n", - "\n", - "VMs are stored in the `~/.lume` directory." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "async with PyLume() as pylume:\n", - " vms = await pylume.list_vms()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get VM status" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "async with PyLume() as pylume:\n", - " status = await pylume.get_vm(\"macos-sequoia-vanilla\")\n", - " print(status)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Update VM Settings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "update_config = VMUpdateOpts(\n", - " cpu=8,\n", - " memory=\"8GB\"\n", - ")\n", - "async with PyLume() as pylume:\n", - " await pylume.update_vm(\"macos-sequoia-vanilla\", update_config)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Stop a VM" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "async with PyLume() as pylume:\n", - " await pylume.stop_vm(\"macos-sequoia-vanilla\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Delete a VM" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "async with PyLume() as pylume:\n", - " await pylume.delete_vm(\"linux-vm\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Images\n", - "\n", - "List the images available locally" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "async with PyLume() as pylume:\n", - " await pylume.get_images()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "cua", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 00000000..2df654cb --- /dev/null +++ b/package-lock.json @@ -0,0 +1,28 @@ +{ + "name": "cua", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "devDependencies": { + "prettier": "^3.6.2" + } + }, + "node_modules/prettier": { + "version": "3.6.2", + "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.6.2.tgz", + "integrity": "sha512-I7AIg5boAr5R0FFtJ6rCfD+LFsWHp81dolrFD8S79U9tb8Az2nGrJncnMSnys+bpQJfRUzqs9hnA81OAA3hCuQ==", + "dev": true, + "license": "MIT", + "bin": { + "prettier": "bin/prettier.cjs" + }, + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/prettier/prettier?sponsor=1" + } + } + } +} diff --git a/package.json b/package.json index 2d0118af..80a66c81 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "scripts": { - "prettier:check": "prettier --check '**/*.{ts,tsx,js,jsx,json,md,yaml,yml}'", - "prettier:format": "prettier --write '**/*.{ts,tsx,js,jsx,json,md,yaml,yml}'" + "prettier:check": "prettier --check '**/*.{ts,tsx,js,jsx,json,md,mdx,yaml,yml}'", + "prettier:format": "prettier --write '**/*.{ts,tsx,js,jsx,json,md,mdx,yaml,yml}'" }, "devDependencies": { "prettier": "^3.6.2" diff --git a/scripts/run-docker-dev.sh 
b/scripts/run-docker-dev.sh
index e4aab8ea..ef066798 100755
--- a/scripts/run-docker-dev.sh
+++ b/scripts/run-docker-dev.sh
@@ -43,7 +43,7 @@ else
 fi
 
 # Environment variables
-PYTHONPATH="/app/libs/python/core:/app/libs/python/computer:/app/libs/python/agent:/app/libs/python/som:/app/libs/python/pylume:/app/libs/python/computer-server:/app/libs/python/mcp-server"
+PYTHONPATH="/app/libs/python/core:/app/libs/python/computer:/app/libs/python/agent:/app/libs/python/som:/app/libs/python/computer-server:/app/libs/python/mcp-server"
 
 # Check if Docker is installed
 if ! command -v docker &> /dev/null; then
diff --git a/tests/agent_loop_testing/README.md b/tests/agent_loop_testing/README.md
new file mode 100644
index 00000000..48712d90
--- /dev/null
+++ b/tests/agent_loop_testing/README.md
@@ -0,0 +1,70 @@
+# CUA Agent Test
+
+A simple test of the CUA ComputerAgent SDK against a mock computer.
+
+## Run Test
+
+```bash
+python tests/agent_loop_testing/agent_test.py --model anthropic/claude-sonnet-4-20250514
+```
+
+## What It Does
+
+- Tests the real CUA ComputerAgent SDK
+- Uses a mock computer (screenshots only, no real actions)
+- The agent tries to "Open Safari browser"
+- Runs up to 5 iterations
+- Shows agent responses and tool calls
+
+## What Passes ✅
+
+- Agent initializes
+- Takes screenshots
+- Analyzes images
+- Makes tool calls
+- Runs multiple iterations
+
+## What Fails ❌
+
+- Missing dependencies
+- Invalid API keys
+- Agent crashes
+- Import errors
+
+## Install
+
+```bash
+pip install -e libs/python/agent -e libs/python/computer
+export ANTHROPIC_API_KEY="your-key"
+```
+
+## Example Output
+
+```
+🤖 Testing CUA Agent: anthropic/claude-sonnet-4-20250514
+==================================================
+✅ CUA Agent created
+✅ Mock computer ready
+🚀 Running agent...
+
+Iteration 1:
+  Agent: I'll click on Safari to open it.
+  Tool: click {'x': 125, 'y': 975}
+
+Iteration 2:
+  Agent: Safari didn't open, let me try again.
+  Tool: click {'x': 125, 'y': 975}
+
+Iteration 3:
+  Agent: This appears to be a static test environment.
+
+🏁 Stopping after 5 iterations (safety limit)
+
+==================================================
+🎉 TEST COMPLETE!
+==================================================
+✅ Model: anthropic/claude-sonnet-4-20250514
+✅ Iterations: 3
+✅ Screenshots: 3
+✅ Agent executed successfully
+```
\ No newline at end of file
diff --git a/tests/agent_loop_testing/agent_test.py b/tests/agent_loop_testing/agent_test.py
new file mode 100644
index 00000000..b31c8249
--- /dev/null
+++ b/tests/agent_loop_testing/agent_test.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+"""
+Simple CUA Agent Test
+
+Tests the actual CUA ComputerAgent SDK with a mock computer.
+The mock computer only provides screenshots - no real computer actions are performed.
+""" + +import asyncio +import base64 +import sys +from io import BytesIO +from pathlib import Path + +from PIL import Image, ImageDraw + +# Add project root to path +project_root = Path(__file__).parent.parent.parent +sys.path.insert(0, str(project_root)) + + +class MockComputer: + """Mock computer that only provides screenshots.""" + + def __init__(self): + self.action_count = 0 + self._image = self._create_image() + + def _create_image(self) -> str: + """Create a simple desktop image.""" + img = Image.new("RGB", (1920, 1080), color="lightblue") + draw = ImageDraw.Draw(img) + + # Draw Safari icon + draw.rectangle([100, 950, 150, 1000], fill="blue", outline="black", width=2) + draw.text((110, 960), "Safari", fill="white") + + # Draw Terminal icon + draw.rectangle([200, 950, 250, 1000], fill="green", outline="black", width=2) + draw.text((210, 960), "Terminal", fill="white") + + # Convert to base64 + img_bytes = BytesIO() + img.save(img_bytes, format="PNG") + return base64.b64encode(img_bytes.getvalue()).decode("utf-8") + + async def screenshot(self) -> str: + self.action_count += 1 + return self._image + + async def get_dimensions(self) -> tuple[int, int]: + return (1920, 1080) + + # All other methods are no-ops (required by CUA interface) + async def click(self, x: int, y: int, button: str = "left") -> None: + await asyncio.sleep(0.1) + + async def double_click(self, x: int, y: int) -> None: + await asyncio.sleep(0.1) + + async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: + await asyncio.sleep(0.1) + + async def type(self, text: str) -> None: + await asyncio.sleep(0.1) + + async def wait(self, ms: int = 1000) -> None: + await asyncio.sleep(ms / 1000.0) + + async def move(self, x: int, y: int) -> None: + await asyncio.sleep(0.1) + + async def keypress(self, keys) -> None: + await asyncio.sleep(0.1) + + async def drag(self, path) -> None: + await asyncio.sleep(0.1) + + async def get_current_url(self) -> str: + return "desktop://mock" + + async def get_environment(self) -> str: + return "mac" + + # Required abstract methods + async def left_mouse_down(self, x: int = 0, y: int = 0) -> None: + await asyncio.sleep(0.1) + + async def left_mouse_up(self, x: int = 0, y: int = 0) -> None: + await asyncio.sleep(0.1) + + async def right_mouse_down(self, x: int = 0, y: int = 0) -> None: + await asyncio.sleep(0.1) + + async def right_mouse_up(self, x: int = 0, y: int = 0) -> None: + await asyncio.sleep(0.1) + + async def mouse_move(self, x: int, y: int) -> None: + await asyncio.sleep(0.1) + + async def key_down(self, key: str) -> None: + await asyncio.sleep(0.1) + + async def key_up(self, key: str) -> None: + await asyncio.sleep(0.1) + + async def type_text(self, text: str) -> None: + await asyncio.sleep(0.1) + + +async def test_cua_agent(model_name: str): + """Test CUA agent with mock computer.""" + print(f"🤖 Testing CUA Agent: {model_name}") + print("=" * 50) + + try: + # Import the real CUA agent + from agent import ComputerAgent + + # Create mock computer + mock_computer = MockComputer() + + # Create the real CUA ComputerAgent + agent = ComputerAgent(model=model_name, tools=[mock_computer], max_trajectory_budget=5.0) + + print("✅ CUA Agent created") + print("✅ Mock computer ready") + print("🚀 Running agent...") + print() + + # Run the agent with a specific task + message = "Open Safari browser" + + iteration = 0 + async for result in agent.run([{"role": "user", "content": message}]): + iteration += 1 + print(f"Iteration {iteration}:") + + # Print agent output + 
output_items = result.get("output", []) + if not output_items: + print(" (No output from agent)") + else: + for item in output_items: + if item["type"] == "message": + print(f" Agent: {item['content'][0]['text']}") + elif item["type"] == "tool_call": + print(f" Tool: {item.get('tool_name')} {item.get('arguments')}") + else: + print(f" Unknown output type: {item}") + + # Debug: print full result for empty iterations + if not output_items: + print(f" Debug - Full result: {result}") + + # Let the agent decide when to stop (it should try to complete the task) + # Only stop after 5 iterations to prevent infinite loops + if iteration >= 5: + print("🏁 Stopping after 5 iterations (safety limit)") + break + + print() + print("=" * 50) + print("🎉 TEST COMPLETE!") + print("=" * 50) + print(f"✅ Model: {model_name}") + print(f"✅ Iterations: {iteration}") + print(f"✅ Screenshots: {mock_computer.action_count}") + print("✅ Agent executed successfully") + + return True + + except ImportError as e: + print(f"❌ Import error: {e}") + print("💡 Install CUA: pip install -e libs/python/agent -e libs/python/computer") + return False + except Exception as e: + print(f"❌ Test failed: {e}") + return False + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Test CUA Agent with mock computer") + parser.add_argument( + "--model", default="anthropic/claude-sonnet-4-20250514", help="CUA model to test" + ) + args = parser.parse_args() + + success = asyncio.run(test_cua_agent(args.model)) + sys.exit(0 if success else 1) diff --git a/uv.lock b/uv.lock index ff198041..9c93aea2 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.12, <3.14" resolution-markers = [ "python_version < '0'", @@ -4321,11 +4321,11 @@ wheels = [ [[package]] name = "pip" -version = "25.2" +version = "25.3" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/20/16/650289cd3f43d5a2fadfd98c68bd1e1e7f2550a1a5326768cddfbcedb2c5/pip-25.2.tar.gz", hash = "sha256:578283f006390f85bb6282dffb876454593d637f5d1be494b5202ce4877e71f2", size = 1840021, upload-time = "2025-07-30T21:50:15.401Z" } +sdist = { url = "https://files.pythonhosted.org/packages/fe/6e/74a3f0179a4a73a53d66ce57fdb4de0080a8baa1de0063de206d6167acc2/pip-25.3.tar.gz", hash = "sha256:8d0538dbbd7babbd207f261ed969c65de439f6bc9e5dbd3b3b9a77f25d95f343", size = 1803014, upload-time = "2025-10-25T00:55:41.394Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b7/3f/945ef7ab14dc4f9d7f40288d2df998d1837ee0888ec3659c813487572faa/pip-25.2-py3-none-any.whl", hash = "sha256:6d67a2b4e7f14d8b31b8b52648866fa717f45a1eb70e83002f4331d07e953717", size = 1752557, upload-time = "2025-07-30T21:50:13.323Z" }, + { url = "https://files.pythonhosted.org/packages/44/3c/d717024885424591d5376220b5e836c2d5293ce2011523c9de23ff7bf068/pip-25.3-py3-none-any.whl", hash = "sha256:9655943313a94722b7774661c21049070f6bbb0a1516bf02f7c8d5d9201514cd", size = 1778622, upload-time = "2025-10-25T00:55:39.247Z" }, ] [[package]]