Merge branch 'main' into feat/api_key_overrides

This commit is contained in:
Dillon DuPont
2025-10-30 12:34:40 -04:00
142 changed files with 7173 additions and 4438 deletions
-183
View File
@@ -1,183 +0,0 @@
{
"projectName": "cua",
"projectOwner": "trycua",
"files": [
"README.md"
],
"commitType": "docs",
"commitConvention": "angular",
"contributorsPerLine": 7,
"contributors": [
{
"login": "f-trycua",
"name": "f-trycua",
"avatar_url": "https://avatars.githubusercontent.com/u/195596869?v=4",
"profile": "https://github.com/f-trycua",
"contributions": [
"code"
]
},
{
"login": "pepicrft",
"name": "Pedro Piñera Buendía",
"avatar_url": "https://avatars.githubusercontent.com/u/663605?v=4",
"profile": "http://pepicrft.me",
"contributions": [
"code"
]
},
{
"login": "aktech",
"name": "Amit Kumar",
"avatar_url": "https://avatars.githubusercontent.com/u/5647941?v=4",
"profile": "https://iamit.in",
"contributions": [
"code"
]
},
{
"login": "jellydn",
"name": "Dung Duc Huynh (Kaka)",
"avatar_url": "https://avatars.githubusercontent.com/u/870029?v=4",
"profile": "https://productsway.com/",
"contributions": [
"code"
]
},
{
"login": "ShrootBuck",
"name": "Zayd Krunz",
"avatar_url": "https://avatars.githubusercontent.com/u/70227235?v=4",
"profile": "http://zaydkrunz.com",
"contributions": [
"code"
]
},
{
"login": "PrashantRaj18198",
"name": "Prashant Raj",
"avatar_url": "https://avatars.githubusercontent.com/u/23168997?v=4",
"profile": "https://github.com/PrashantRaj18198",
"contributions": [
"code"
]
},
{
"login": "Leland-Takamine",
"name": "Leland Takamine",
"avatar_url": "https://avatars.githubusercontent.com/u/847683?v=4",
"profile": "https://www.mobile.dev",
"contributions": [
"code"
]
},
{
"login": "ddupont808",
"name": "ddupont",
"avatar_url": "https://avatars.githubusercontent.com/u/3820588?v=4",
"profile": "https://github.com/ddupont808",
"contributions": [
"code"
]
},
{
"login": "Lizzard1123",
"name": "Ethan Gutierrez",
"avatar_url": "https://avatars.githubusercontent.com/u/46036335?v=4",
"profile": "https://github.com/Lizzard1123",
"contributions": [
"code"
]
},
{
"login": "RicterZ",
"name": "Ricter Zheng",
"avatar_url": "https://avatars.githubusercontent.com/u/5282759?v=4",
"profile": "https://ricterz.me",
"contributions": [
"code"
]
},
{
"login": "rahulkarajgikar",
"name": "Rahul Karajgikar",
"avatar_url": "https://avatars.githubusercontent.com/u/50844303?v=4",
"profile": "https://www.trytruffle.ai/",
"contributions": [
"code"
]
},
{
"login": "trospix",
"name": "trospix",
"avatar_url": "https://avatars.githubusercontent.com/u/81363696?v=4",
"profile": "https://github.com/trospix",
"contributions": [
"code"
]
},
{
"login": "eltociear",
"name": "Ikko Eltociear Ashimine",
"avatar_url": "https://avatars.githubusercontent.com/u/22633385?v=4",
"profile": "https://wavee.world/invitation/b96d00e6-b802-4a1b-8a66-2e3854a01ffd",
"contributions": [
"code"
]
},
{
"login": "dp221125",
"name": "한석호(MilKyo)",
"avatar_url": "https://avatars.githubusercontent.com/u/10572119?v=4",
"profile": "https://github.com/dp221125",
"contributions": [
"code"
]
},
{
"login": "rahimnathwani",
"name": "Rahim Nathwani",
"avatar_url": "https://avatars.githubusercontent.com/u/891558?v=4",
"profile": "https://www.encona.com/",
"contributions": [
"code"
]
},
{
"login": "mjspeck",
"name": "Matt Speck",
"avatar_url": "https://avatars.githubusercontent.com/u/20689127?v=4",
"profile": "https://mjspeck.github.io/",
"contributions": [
"code"
]
},
{
"login": "FinnBorge",
"name": "FinnBorge",
"avatar_url": "https://avatars.githubusercontent.com/u/9272726?v=4",
"profile": "https://github.com/FinnBorge",
"contributions": [
"code"
]
},
{
"login": "jklapacz",
"name": "Jakub Klapacz",
"avatar_url": "https://avatars.githubusercontent.com/u/5343758?v=4",
"profile": "https://github.com/jklapacz",
"contributions": [
"code"
]
},
{
"login": "evnsnclr",
"name": "Evan smith",
"avatar_url": "https://avatars.githubusercontent.com/u/139897548?v=4",
"profile": "https://github.com/evnsnclr",
"contributions": [
"code"
]
}
]
}
+91
View File
@@ -0,0 +1,91 @@
name: Bump Version
on:
workflow_dispatch:
inputs:
service:
description: "Service/Package to bump"
required: true
type: choice
options:
- cua-agent
- cua-computer
- cua-computer-server
- cua-core
- cua-mcp-server
- cua-som
- pylume
bump_type:
description: "Version bump type"
required: true
type: choice
options:
- patch
- minor
- major
permissions:
contents: write
jobs:
bump-version:
runs-on: ubuntu-latest
steps:
- name: Set package directory
id: package
run: |
case "${{ inputs.service }}" in
"cua-agent")
echo "directory=libs/python/agent" >> $GITHUB_OUTPUT
;;
"cua-computer")
echo "directory=libs/python/computer" >> $GITHUB_OUTPUT
;;
"cua-computer-server")
echo "directory=libs/python/computer-server" >> $GITHUB_OUTPUT
;;
"cua-core")
echo "directory=libs/python/core" >> $GITHUB_OUTPUT
;;
"cua-mcp-server")
echo "directory=libs/python/mcp-server" >> $GITHUB_OUTPUT
;;
"cua-som")
echo "directory=libs/python/som" >> $GITHUB_OUTPUT
;;
"pylume")
echo "directory=libs/python/pylume" >> $GITHUB_OUTPUT
;;
*)
echo "Unknown service: ${{ inputs.service }}"
exit 1
;;
esac
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install bump2version
run: pip install bump2version
- name: Configure Git
run: |
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
- name: Run bump2version
run: |
cd ${{ steps.package.outputs.directory }}
bump2version ${{ inputs.bump_type }}
- name: Push changes
run: |
git push origin main --follow-tags
-82
View File
@@ -1,82 +0,0 @@
name: Publish Pylume Package
on:
push:
tags:
- "pylume-v*"
workflow_dispatch:
inputs:
version:
description: "Version to publish (without v prefix)"
required: true
default: "0.1.0"
workflow_call:
inputs:
version:
description: "Version to publish"
required: true
type: string
outputs:
version:
description: "The version that was published"
value: ${{ jobs.determine-version.outputs.version }}
# Adding permissions at workflow level
permissions:
contents: write
jobs:
determine-version:
runs-on: macos-latest
outputs:
version: ${{ steps.get-version.outputs.version }}
steps:
- uses: actions/checkout@v4
- name: Determine version
id: get-version
run: |
if [ "${{ github.event_name }}" == "push" ]; then
# Extract version from tag (for package-specific tags)
if [[ "${{ github.ref }}" =~ ^refs/tags/pylume-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then
VERSION=${BASH_REMATCH[1]}
else
echo "Invalid tag format for pylume"
exit 1
fi
elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
# Use version from workflow dispatch
VERSION=${{ github.event.inputs.version }}
else
# Use version from workflow_call
VERSION=${{ inputs.version }}
fi
echo "VERSION=$VERSION"
echo "version=$VERSION" >> $GITHUB_OUTPUT
validate-version:
runs-on: macos-latest
needs: determine-version
steps:
- uses: actions/checkout@v4
- name: Validate version
id: validate-version
run: |
CODE_VERSION=$(grep '__version__' libs/python/pylume/pylume/__init__.py | cut -d'"' -f2)
if [ "${{ needs.determine-version.outputs.version }}" != "$CODE_VERSION" ]; then
echo "Version mismatch: expected $CODE_VERSION, got ${{ needs.determine-version.outputs.version }}"
exit 1
fi
echo "Version validated: $CODE_VERSION"
publish:
needs: determine-version
uses: ./.github/workflows/pypi-reusable-publish.yml
with:
package_name: "pylume"
package_dir: "libs/python/pylume"
version: ${{ needs.determine-version.outputs.version }}
is_lume_package: true
base_package_name: "pylume"
secrets:
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
+3 -3
View File
@@ -4,11 +4,11 @@ on:
workflow_call:
inputs:
package_name:
description: "Name of the package (e.g. pylume, computer, agent)"
description: "Name of the package (e.g. computer, agent)"
required: true
type: string
package_dir:
description: "Directory containing the package relative to workspace root (e.g. libs/python/pylume)"
description: "Directory containing the package relative to workspace root (e.g. libs/python/computer)"
required: true
type: string
version:
@@ -21,7 +21,7 @@ on:
type: boolean
default: false
base_package_name:
description: "PyPI package name (e.g. pylume, cua-agent)"
description: "PyPI package name (e.g. cua-agent)"
required: true
type: string
make_latest:
+93
View File
@@ -0,0 +1,93 @@
name: Python Unit Tests
on:
pull_request:
paths:
- "libs/python/**"
- ".github/workflows/python-tests.yml"
push:
branches:
- main
paths:
- "libs/python/**"
- ".github/workflows/python-tests.yml"
workflow_dispatch: # Allow manual trigger
jobs:
test:
name: Test ${{ matrix.package }}
runs-on: ubuntu-latest
strategy:
fail-fast: false # Test all packages even if one fails
matrix:
package:
- core
- agent
- computer
- computer-server
- mcp-server
- pylume
- som
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.12"
- name: Install uv
run: |
pip install uv
- name: Install package and dependencies
run: |
cd libs/python/${{ matrix.package }}
# Install the package in editable mode with dev dependencies
if [ -f pyproject.toml ]; then
uv pip install --system -e .
# Install test dependencies
uv pip install --system pytest pytest-asyncio pytest-mock pytest-cov
fi
shell: bash
- name: Run tests
run: |
cd libs/python/${{ matrix.package }}
if [ -d tests ]; then
python -m pytest tests/ -v --tb=short --cov --cov-report=term --cov-report=xml
else
echo "No tests directory found, skipping tests"
fi
shell: bash
env:
CUA_TELEMETRY_DISABLED: "1" # Disable telemetry during tests
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
if: always()
with:
file: ./libs/python/${{ matrix.package }}/coverage.xml
flags: ${{ matrix.package }}
name: codecov-${{ matrix.package }}
fail_ci_if_error: false
continue-on-error: true
summary:
name: Test Summary
runs-on: ubuntu-latest
needs: test
if: always()
steps:
- name: Check test results
run: |
if [ "${{ needs.test.result }}" == "failure" ]; then
echo "❌ Some tests failed. Please check the logs above."
exit 1
else
echo "✅ All tests passed!"
fi
+116
View File
@@ -0,0 +1,116 @@
name: Test CUA Supporting Models
# This workflow tests all supported CUA models with API keys
# Run manually using workflow_dispatch with test_models=true
on:
pull_request_target:
branches: [main, master]
workflow_dispatch:
inputs:
test_models:
description: "Test all supported models (requires API keys)"
required: false
default: true
type: boolean
jobs:
# Test all CUA models - runs on PRs or when manually triggered
test-all-models:
if: ${{ github.event_name == 'pull_request_target' || fromJSON(inputs.test_models || 'false') }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
model:
# Anthropic Claude Models
# - anthropic/claude-3-5-sonnet-20241022
# - anthropic/claude-3-7-sonnet-20250219
# - anthropic/claude-opus-4-20250514
# - anthropic/claude-sonnet-4-20250514
# - anthropic/claude-opus-4-1-20250805
- anthropic/claude-sonnet-4-5-20250929
# - anthropic/claude-haiku-4-5-20251001
# OpenAI Models
# - openai/computer-use-preview
# Gemini Models
# - gemini-2.5-computer-use-preview-10-2025
# GLM-4.5V Models
# - openrouter/z-ai/glm-4.5v
# UI-TARS Models
# - huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
# OpenCUA Models
# - huggingface-local/xlangai/OpenCUA-7B
# - huggingface-local/xlangai/OpenCUA-32B
# GTA1 Family Models
# - huggingface-local/HelloKKMe/GTA1-7B
# - huggingface-local/HelloKKMe/GTA1-32B
# - huggingface-local/HelloKKMe/GTA1-72B
# Holo 1.5 Family Models
# - huggingface-local/Hcompany/Holo1.5-3B
# - huggingface-local/Hcompany/Holo1.5-7B
# - huggingface-local/Hcompany/Holo1.5-72B
# InternVL 3.5 Family Models
# - huggingface-local/OpenGVLab/InternVL3_5-1B
# - huggingface-local/OpenGVLab/InternVL3_5-2B
# - huggingface-local/OpenGVLab/InternVL3_5-4B
# - huggingface-local/OpenGVLab/InternVL3_5-8B
# GLM-4.5V Local
# - huggingface-local/zai-org/GLM-4.5V
# Composed Models (Grounding + Planning)
# - omniparser+anthropic/claude-3-5-sonnet-20241022
# - omniparser+openai/gpt-4o-mini
# - moondream3+anthropic/claude-3-5-sonnet-20241022
# - moondream3+openai/gpt-4o-mini
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up uv and Python
uses: astral-sh/setup-uv@v4
with:
python-version: "3.12"
- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y libgl1-mesa-dri libglib2.0-0
- name: Install CUA dependencies (uv)
run: |
uv venv
uv pip install -e libs/python/agent -e libs/python/computer
uv pip install -e libs/python/core
uv pip install "cua-agent[uitars-hf]"
uv pip install pytest
- name: Test model with agent loop
run: |
cd tests/agent_loop_testing
uv run python agent_test.py --model "${{ matrix.model }}"
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
# OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
# GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
# OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
- name: Upload test results
if: always()
uses: actions/upload-artifact@v4
with:
name: test-results-${{ matrix.model }}
path: |
tests/agent_loop_testing/test_images/
*.log
retention-days: 7
+3
View File
@@ -111,6 +111,9 @@ ENV/
env.bak/
venv.bak/
# Git worktrees
.worktrees/
# Spyder project settings
.spyderproject
.spyproject
+1 -1
View File
@@ -7,7 +7,7 @@ repos:
entry: prettier --write
language: node
additional_dependencies: ["prettier@3.6.2"]
files: \.(ts|tsx|js|jsx|json|md|yaml|yml)$
files: \.(ts|tsx|js|jsx|json|md|mdx|yaml|yml)$
- repo: local
hooks:
+7 -20
View File
@@ -10,7 +10,7 @@
"python": "${workspaceFolder:cua-root}/.venv/bin/python",
"cwd": "${workspaceFolder:cua-root}",
"env": {
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume"
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
}
},
{
@@ -23,7 +23,7 @@
"python": "${workspaceFolder:cua-root}/.venv/bin/python",
"cwd": "${workspaceFolder:cua-root}",
"env": {
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume"
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
}
},
{
@@ -36,7 +36,7 @@
"python": "${workspaceFolder:cua-root}/.venv/bin/python",
"cwd": "${workspaceFolder:cua-root}",
"env": {
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume"
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
}
},
{
@@ -49,20 +49,7 @@
"python": "${workspaceFolder:cua-root}/.venv/bin/python",
"cwd": "${workspaceFolder:cua-root}",
"env": {
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume"
}
},
{
"name": "Run PyLume Examples",
"type": "debugpy",
"request": "launch",
"program": "examples/pylume_examples.py",
"console": "integratedTerminal",
"justMyCode": true,
"python": "${workspaceFolder:cua-root}/.venv/bin/python",
"cwd": "${workspaceFolder:cua-root}",
"env": {
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume"
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
}
},
{
@@ -84,7 +71,7 @@
"python": "${workspaceFolder:cua-root}/.venv/bin/python",
"cwd": "${workspaceFolder:cua-root}",
"env": {
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume"
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
}
},
{
@@ -106,7 +93,7 @@
"python": "${workspaceFolder:cua-root}/.venv/bin/python",
"cwd": "${workspaceFolder:cua-root}",
"env": {
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume"
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
}
},
{
@@ -119,7 +106,7 @@
"python": "${workspaceFolder:cua-root}/.venv/bin/python",
"cwd": "${workspaceFolder:cua-root}",
"env": {
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume"
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
}
},
{
-10
View File
@@ -20,10 +20,6 @@
"name": "computer-server",
"path": "../libs/python/computer-server"
},
{
"name": "pylume",
"path": "../libs/python/pylume"
},
{
"name": "core",
"path": "../libs/python/core"
@@ -51,7 +47,6 @@
"${workspaceFolder:cua-root}/libs/python/computer",
"${workspaceFolder:cua-root}/libs/python/agent",
"${workspaceFolder:cua-root}/libs/python/som",
"${workspaceFolder:cua-root}/libs/python/pylume",
"${workspaceFolder:cua-root}/.vscode/typings"
],
"python.envFile": "${workspaceFolder:cua-root}/.env",
@@ -89,10 +84,6 @@
"name": "som",
"depth": 2
},
{
"name": "pylume",
"depth": 2
},
{
"name": "core",
"depth": 2
@@ -103,7 +94,6 @@
"${workspaceFolder:cua-root}/libs/python/computer",
"${workspaceFolder:cua-root}/libs/python/agent",
"${workspaceFolder:cua-root}/libs/python/som",
"${workspaceFolder:cua-root}/libs/python/pylume"
],
"python.languageServer": "None",
"[python]": {
+1 -1
View File
@@ -1,6 +1,6 @@
{
"python-envs.pythonProjects": [],
"python.defaultInterpreterPath": "${workspaceFolder}/.venv/bin/python",
"python.defaultInterpreterPath": "${workspaceFolder}/.venv",
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit",
+98
View File
@@ -346,3 +346,101 @@ For Swift code in the `libs/lume` directory:
- Follow the [Swift API Design Guidelines](https://www.swift.org/documentation/api-design-guidelines/)
- Use SwiftFormat for consistent formatting
- Code will be automatically formatted on save when using the lume workspace
## Releasing Packages
Cua uses an automated GitHub Actions workflow to bump package versions.
> **Note:** The main branch is currently not protected. If branch protection is enabled in the future, the github-actions bot must be added to the bypass list for these workflows to commit directly.
### Version Bump Workflow
All packages are managed through a single consolidated workflow: [Bump Version](https://github.com/trycua/cua/actions/workflows/bump-version.yml)
**Supported packages:**
- cua-agent
- cua-computer
- cua-computer-server
- cua-core
- cua-mcp-server
- cua-som
- pylume
**How to use:**
1. Navigate to the [Bump Version workflow](https://github.com/trycua/cua/actions/workflows/bump-version.yml)
2. Click the "Run workflow" button in the GitHub UI
3. Select the **service/package** you want to bump from the first dropdown
4. Select the **bump type** (patch/minor/major) from the second dropdown
5. Click "Run workflow" to start the version bump
6. The workflow will automatically commit changes and push to main
### Rolling Back a Version Bump
If you need to revert a version bump, follow these steps:
**Step 1: Find the version bump commit**
```bash
# List recent commits
git log --oneline | grep "Bump"
# Example output:
# a1b2c3d Bump cua-core to v0.1.9
```
**Step 2: Revert the commit**
```bash
# Revert the specific commit
git revert <commit-hash>
# Example:
# git revert a1b2c3d
```
**Step 3: Delete the git tag**
```bash
# List tags to find the version tag
git tag -l
# Delete the tag locally (use the correct package-specific format)
git tag -d core-v0.1.9
# Delete the tag remotely
git push origin :refs/tags/core-v0.1.9
```
**Step 4: Push the revert**
```bash
git push origin main
```
**Per-package tag patterns:**
Each package uses its own tag format defined in `.bumpversion.cfg`:
- **cua-core**: `core-v{version}` (e.g., `core-v0.1.9`)
- **cua-computer**: `computer-v{version}` (e.g., `computer-v0.4.7`)
- **cua-agent**: `agent-v{version}` (e.g., `agent-v0.4.35`)
- **cua-som**: `som-v{version}` (e.g., `som-v0.1.3`)
- **pylume**: `pylume-v{version}` (e.g., `pylume-v0.2.1`)
- **cua-computer-server**: `computer-server-v{version}` (e.g., `computer-server-v0.1.27`)
- **cua-mcp-server**: `mcp-server-v{version}` (e.g., `mcp-server-v0.1.14`)
### Local Testing (Advanced)
The Makefile targets are kept for local testing only:
```bash
# Test version bump locally (dry run)
make dry-run-patch-core
# View current versions
make show-versions
```
**Note:** For production releases, always use the GitHub Actions workflows above instead of running Makefile commands directly.
+1 -1
View File
@@ -5,7 +5,7 @@ ENV PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 \
PIP_NO_CACHE_DIR=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1 \
PYTHONPATH="/app/libs/python/core:/app/libs/python/computer:/app/libs/python/agent:/app/libs/python/som:/app/libs/python/pylume:/app/libs/python/computer-server:/app/libs/python/mcp-server"
PYTHONPATH="/app/libs/python/core:/app/libs/python/computer:/app/libs/python/agent:/app/libs/python/som:/app/libs/python/computer-server:/app/libs/python/mcp-server"
# Install system dependencies for ARM architecture
RUN apt-get update && apt-get install -y --no-install-recommends \
+5 -131
View File
@@ -1,145 +1,19 @@
# Python Package Release Makefile
# This Makefile provides convenient targets for bumping versions of all Python packages
# using bump2version. After running a target, remember to push: git push origin main
# Version bumps are managed via GitHub Actions workflows (see Development.md)
# This Makefile provides utility targets for checking versions and dry-run testing
.PHONY: help
help: ## Show this help message
@echo "Python Package Release Automation"
@echo "Python Package Release Utilities"
@echo ""
@echo "Usage: make <target>"
@echo ""
@echo "Available targets:"
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " %-25s %s\n", $$1, $$2}'
@echo ""
@echo "After bumping, push changes with: git push origin main"
# Core package targets
bump-patch-core: ## Bump patch version of cua-core (0.1.8 → 0.1.9)
@echo "Bumping cua-core patch version..."
cd libs/python/core && bump2version patch
@echo "✓ Done! Now run: git push origin main"
bump-minor-core: ## Bump minor version of cua-core (0.1.8 → 0.2.0)
@echo "Bumping cua-core minor version..."
cd libs/python/core && bump2version minor
@echo "✓ Done! Now run: git push origin main"
bump-major-core: ## Bump major version of cua-core (0.1.8 → 1.0.0)
@echo "Bumping cua-core major version..."
cd libs/python/core && bump2version major
@echo "✓ Done! Now run: git push origin main"
# Pylume package targets
bump-patch-pylume: ## Bump patch version of pylume (0.2.2 → 0.2.3)
@echo "Bumping pylume patch version..."
cd libs/python/pylume && bump2version patch
@echo "✓ Done! Now run: git push origin main"
bump-minor-pylume: ## Bump minor version of pylume (0.2.2 → 0.3.0)
@echo "Bumping pylume minor version..."
cd libs/python/pylume && bump2version minor
@echo "✓ Done! Now run: git push origin main"
bump-major-pylume: ## Bump major version of pylume (0.2.2 → 1.0.0)
@echo "Bumping pylume major version..."
cd libs/python/pylume && bump2version major
@echo "✓ Done! Now run: git push origin main"
# Computer package targets
bump-patch-computer: ## Bump patch version of cua-computer (0.4.0 → 0.4.1)
@echo "Bumping cua-computer patch version..."
cd libs/python/computer && bump2version patch
@echo "✓ Done! Now run: git push origin main"
bump-minor-computer: ## Bump minor version of cua-computer (0.4.0 → 0.5.0)
@echo "Bumping cua-computer minor version..."
cd libs/python/computer && bump2version minor
@echo "✓ Done! Now run: git push origin main"
bump-major-computer: ## Bump major version of cua-computer (0.4.0 → 1.0.0)
@echo "Bumping cua-computer major version..."
cd libs/python/computer && bump2version major
@echo "✓ Done! Now run: git push origin main"
# SOM package targets
bump-patch-som: ## Bump patch version of cua-som (0.1.0 → 0.1.1)
@echo "Bumping cua-som patch version..."
cd libs/python/som && bump2version patch
@echo "✓ Done! Now run: git push origin main"
bump-minor-som: ## Bump minor version of cua-som (0.1.0 → 0.2.0)
@echo "Bumping cua-som minor version..."
cd libs/python/som && bump2version minor
@echo "✓ Done! Now run: git push origin main"
bump-major-som: ## Bump major version of cua-som (0.1.0 → 1.0.0)
@echo "Bumping cua-som major version..."
cd libs/python/som && bump2version major
@echo "✓ Done! Now run: git push origin main"
# Agent package targets
bump-patch-agent: ## Bump patch version of cua-agent (0.4.0 → 0.4.1)
@echo "Bumping cua-agent patch version..."
cd libs/python/agent && bump2version patch
@echo "✓ Done! Now run: git push origin main"
bump-minor-agent: ## Bump minor version of cua-agent (0.4.0 → 0.5.0)
@echo "Bumping cua-agent minor version..."
cd libs/python/agent && bump2version minor
@echo "✓ Done! Now run: git push origin main"
bump-major-agent: ## Bump major version of cua-agent (0.4.0 → 1.0.0)
@echo "Bumping cua-agent major version..."
cd libs/python/agent && bump2version major
@echo "✓ Done! Now run: git push origin main"
# Computer Server package targets
bump-patch-computer-server: ## Bump patch version of cua-computer-server (0.1.0 → 0.1.1)
@echo "Bumping cua-computer-server patch version..."
cd libs/python/computer-server && bump2version patch
@echo "✓ Done! Now run: git push origin main"
bump-minor-computer-server: ## Bump minor version of cua-computer-server (0.1.0 → 0.2.0)
@echo "Bumping cua-computer-server minor version..."
cd libs/python/computer-server && bump2version minor
@echo "✓ Done! Now run: git push origin main"
bump-major-computer-server: ## Bump major version of cua-computer-server (0.1.0 → 1.0.0)
@echo "Bumping cua-computer-server major version..."
cd libs/python/computer-server && bump2version major
@echo "✓ Done! Now run: git push origin main"
# MCP Server package targets
bump-patch-mcp-server: ## Bump patch version of cua-mcp-server (0.1.0 → 0.1.1)
@echo "Bumping cua-mcp-server patch version..."
cd libs/python/mcp-server && bump2version patch
@echo "✓ Done! Now run: git push origin main"
bump-minor-mcp-server: ## Bump minor version of cua-mcp-server (0.1.0 → 0.2.0)
@echo "Bumping cua-mcp-server minor version..."
cd libs/python/mcp-server && bump2version minor
@echo "✓ Done! Now run: git push origin main"
bump-major-mcp-server: ## Bump major version of cua-mcp-server (0.1.0 → 1.0.0)
@echo "Bumping cua-mcp-server major version..."
cd libs/python/mcp-server && bump2version major
@echo "✓ Done! Now run: git push origin main"
# Convenience targets for common workflows
bump-all-patch: ## Bump patch version for ALL packages (use with caution!)
@echo "⚠️ Bumping patch version for ALL packages..."
@read -p "Are you sure? [y/N] " -n 1 -r; \
echo; \
if [[ $$REPLY =~ ^[Yy]$$ ]]; then \
$(MAKE) bump-patch-core && \
$(MAKE) bump-patch-pylume && \
$(MAKE) bump-patch-computer && \
$(MAKE) bump-patch-som && \
$(MAKE) bump-patch-agent && \
$(MAKE) bump-patch-computer-server && \
$(MAKE) bump-patch-mcp-server; \
fi
@echo "⚠️ For production version bumps, use GitHub Actions:"
@echo " https://github.com/trycua/cua/actions/workflows/bump-version.yml"
# Dry run targets (test without making changes)
dry-run-patch-%: ## Dry run for patch version bump (e.g., make dry-run-patch-core)
+245 -174
View File
@@ -14,25 +14,18 @@
</div>
> We're hosting **The Computer-Use Agents SOTA Challenge concluded** at [Hack the North](https://hackthenorth.com) and online!
>
> > **Track A (On-site @ UWaterloo)**: 🏆 ~~Prize: **YC interview guaranteed**.~~ **Concluded**
> > **Track B (Remote)**: 🏆 ~~Prize: **Cash award**.~~ **Concluded - Winners will be announced soon**
> >
> > > ~~👉 Sign up here: [trycua.com/hackathon](https://www.trycua.com/hackathon)~~
**Cua** ("koo-ah") is Docker for [Computer-Use Agents](https://www.oneusefulthing.org/p/when-you-give-a-claude-a-mouse) - it enables AI agents to control full operating systems in virtual containers and deploy them locally or to the cloud.
<div align="center">
<video src="https://github.com/user-attachments/assets/c619b4ea-bb8e-4382-860e-f3757e36af20" width="600" controls></video>
</div>
With the Computer SDK, you can:
With the [Computer SDK](#computer-sdk), you can:
- automate Windows, Linux, and macOS VMs with a consistent, [pyautogui-like API](https://docs.trycua.com/docs/libraries/computer#interface-actions)
- create & manage VMs [locally](https://docs.trycua.com/docs/computer-sdk/computers#cua-local-containers) or using [Cua cloud](https://www.trycua.com/)
With the Agent SDK, you can:
With the [Agent SDK](#agent-sdk), you can:
- run computer-use models with a [consistent schema](https://docs.trycua.com/docs/agent-sdk/message-format)
- benchmark on OSWorld-Verified, SheetBench-V2, and more [with a single line of code using HUD](https://docs.trycua.com/docs/agent-sdk/integrations/hud) ([Notebook](https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb))
@@ -40,44 +33,87 @@ With the Agent SDK, you can:
- use new UI agent models and UI grounding models from the Model Zoo below with just a model string (e.g., `ComputerAgent(model="openai/computer-use-preview")`)
- use API or local inference by changing a prefix (e.g., `openai/`, `openrouter/`, `ollama/`, `huggingface-local/`, `mlx/`, [etc.](https://docs.litellm.ai/docs/providers))
### CUA Model Zoo 🐨
# Modules
| [All-in-one CUAs](https://docs.trycua.com/docs/agent-sdk/supported-agents/computer-use-agents) | [UI Grounding Models](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) | [UI Planning Models](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) |
| ---------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------- |
| `anthropic/claude-sonnet-4-5-20250929`, `anthropic/claude-haiku-4-5-20251001` | `huggingface-local/xlangai/OpenCUA-{7B,32B}` | any all-in-one CUA |
| `openai/computer-use-preview` | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` | any VLM (using liteLLM, requires `tools` parameter) |
| `openrouter/z-ai/glm-4.5v` | `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` | any LLM (using liteLLM, requires `moondream3+` prefix) |
| `gemini-2.5-computer-use-preview-10-2025` | any all-in-one CUA | |
| `openrouter/qwen/qwen3-vl-235b-a22b-instruct` | | |
| `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` | | |
| `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` | |
| `moondream3+{ui planning}` (supports text-only models) | |
| `omniparser+{ui planning}` | | |
| `{ui grounding}+{ui planning}` | | |
<table>
<tr>
<td width="25%" align="center" valign="top">
- `human/human` → [Human-in-the-Loop](https://docs.trycua.com/docs/agent-sdk/supported-agents/human-in-the-loop)
[**Agent**](#agent-sdk)<br />
AI agent framework for automating tasks
Missing a model? [Raise a feature request](https://github.com/trycua/cua/issues/new?assignees=&labels=enhancement&projects=&title=%5BAgent%5D%3A+Add+model+support+for+) or [contribute](https://github.com/trycua/cua/blob/main/CONTRIBUTING.md)!
</td>
<td width="25%" align="center" valign="top">
<br/>
**[Computer](#computer-sdk)**<br />
TypeScript/Python SDK for controlling Cua environments
</td>
<td width="25%" align="center" valign="top">
**[MCP Server](#mcp-server)**<br />
MCP server for using Cua agents and computers
</td>
<td width="25%" align="center" valign="top">
**[Computer Server](#computer-server)**<br />
Server component that runs on Cua environments
</td>
</tr>
</table>
<table>
<tr>
<td width="25%" align="center" valign="top">
**[Lume](#lume)**<br />
VM management for macOS
</td>
<td width="25%" align="center" valign="top">
**[Lumier](#lumier)**<br />
Docker interface for macOS/Linux VMs
</td>
<td width="25%" align="center" valign="top">
**[SOM](#som)**<br />
Set-of-Mark library for Agent
</td>
<td width="25%" align="center" valign="top">
**[Core](#core)**<br />
Core utilities for Cua
</td>
</tr>
</table>
# Quick Start
- [Clone a starter template and run the code in <1 min](https://github.com/trycua/agent-template) (⭐️ Recommended!)
- [Get started with the Computer-Use Agent CLI](https://docs.trycua.com/docs/quickstart-cli)
- [Get started with the Python SDKs](https://docs.trycua.com/docs/quickstart-devs)
- [Clone a starter template and run the code in <1 min](https://github.com/trycua/agent-template)
- [Get started with the Cua SDKs](https://docs.trycua.com/docs/quickstart-devs)
- [Get started with the Cua CLI](https://docs.trycua.com/docs/quickstart-cli)
<br/>
# Agent SDK
# Usage ([Docs](https://docs.trycua.com/docs))
Install the agent SDK:
```bash
pip install "cua-agent[all]"
```
Initialize a computer agent using a [model configuration string](#model-configuration) and a [computer instance](#computer-usage):
```python
from agent import ComputerAgent
# ComputerAgent works with any computer initialized with the Computer SDK
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
tools=[computer],
@@ -92,115 +128,221 @@ async for result in agent.run(messages):
print(item["content"][0]["text"])
```
### Output format (OpenAI Agent Responses Format):
## Output format
Cua uses the OpenAI Agent response format.
<details>
<summary>Example</summary>
```json
{
"output": [
# user input
{
"role": "user",
"content": "go to trycua on gh"
},
# first agent turn adds the model output to the history
{
"summary": [
{
"text": "Searching Firefox for Trycua GitHub",
"type": "summary_text"
}
],
"type": "reasoning"
"role": "user",
"content": "go to trycua on gh"
},
{
"action": {
"text": "Trycua GitHub",
"type": "type"
},
"call_id": "call_QI6OsYkXxl6Ww1KvyJc4LKKq",
"status": "completed",
"type": "computer_call"
},
# second agent turn adds the computer output to the history
{
"type": "computer_call_output",
"call_id": "call_QI6OsYkXxl6Ww1KvyJc4LKKq",
"output": {
"type": "input_image",
"image_url": "data:image/png;base64,..."
"summary": [
{
"text": "Searching Firefox for Trycua GitHub",
"type": "summary_text"
}
],
"type": "reasoning"
},
# final agent turn adds the agent output text to the history
{
"type": "message",
"role": "assistant",
"content": [
{
"text": "Success! The Trycua GitHub page has been opened.",
"type": "output_text"
}
]
"action": {
"text": "Trycua GitHub",
"type": "type"
},
"call_id": "call_QI6OsYkXxl6Ww1KvyJc4LKKq",
"status": "completed",
"type": "computer_call"
},
{
"type": "computer_call_output",
"call_id": "call_QI6OsYkXxl6Ww1KvyJc4LKKq",
"output": {
"type": "input_image",
"image_url": "data:image/png;base64,..."
}
},
{
"type": "message",
"role": "assistant",
"content": [
{
"text": "Success! The Trycua GitHub page has been opened.",
"type": "output_text"
}
]
}
],
"usage": {
"prompt_tokens": 150,
"completion_tokens": 75,
"total_tokens": 225,
"response_cost": 0.01,
"prompt_tokens": 150,
"completion_tokens": 75,
"total_tokens": 225,
"response_cost": 0.01
}
}
```
# Computer ([Docs](https://docs.trycua.com/docs/computer-sdk/computers))
</details>
## Model Configuration
These are the valid model configurations for `ComputerAgent(model="...")`:
| Configuration | Description |
| ---------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- |
| `{computer-use-model}` | A single model to perform all computer-use tasks |
| `{grounding-model}+{any-vlm-with-tools}` | [Composed](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) with VLM for captioning and grounding LLM for element detection |
| `moondream3+{any-llm-with-tools}` | [Composed](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) with Moondream3 for captioning and UI element detection |
| `human/human` | A [human-in-the-loop](https://docs.trycua.com/docs/agent-sdk/supported-agents/human-in-the-loop) in place of a model |
### Model Capabilities
The following table shows which capabilities are supported by each model:
| Model | Computer-Use | Grounding | Tools | VLM |
| -------------------------------------------------------------------------------------------------------------------------------- | :----------: | :-------: | :---: | :-: |
| [Claude Sonnet/Haiku](https://docs.claude.com/en/docs/agents-and-tools/tool-use/computer-use-tool#how-to-implement-computer-use) | 🖥️ | 🎯 | 🛠️ | 👁️ |
| [OpenAI CU Preview](https://platform.openai.com/docs/models/computer-use-preview) | 🖥️ | 🎯 | | 👁️ |
| [GLM-V](https://huggingface.co/THUDM/glm-4v-9b) | 🖥️ | 🎯 | 🛠️ | 👁️ |
| [Gemini CU Preview](https://ai.google.dev/gemini-api/docs/computer-use) | 🖥️ | 🎯 | | 👁️ |
| [InternVL](https://huggingface.co/OpenGVLab/InternVL3_5-1B) | 🖥️ | 🎯 | 🛠️ | 👁️ |
| [UI-TARS](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B) | 🖥️ | 🎯 | 🛠️ | 👁️ |
| [OpenCUA](https://huggingface.co/xlangai/OpenCUA-7B) | | 🎯 | | |
| [GTA](https://huggingface.co/HelloKKMe/GTA1-7B) | | 🎯 | | |
| [Holo](https://huggingface.co/Hcompany/Holo1.5-3B) | | 🎯 | | |
| [Moondream](https://huggingface.co/moondream/moondream3-preview) | | 🎯 | | |
| [OmniParser](https://github.com/microsoft/OmniParser) | | 🎯 | | |
### Model IDs
<details>
<summary>Examples of valid model IDs</summary>
| Model | Model IDs |
| -------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------- |
| [Claude Sonnet/Haiku](https://docs.claude.com/en/docs/agents-and-tools/tool-use/computer-use-tool#how-to-implement-computer-use) | `anthropic/claude-sonnet-4-5`, `anthropic/claude-haiku-4-5` |
| [OpenAI CU Preview](https://platform.openai.com/docs/models/computer-use-preview) | `openai/computer-use-preview` |
| [GLM-V](https://huggingface.co/THUDM/glm-4v-9b) | `openrouter/z-ai/glm-4.5v`, `huggingface-local/zai-org/GLM-4.5V` |
| [Gemini CU Preview](https://ai.google.dev/gemini-api/docs/computer-use) | `gemini-2.5-computer-use-preview` |
| [InternVL](https://huggingface.co/OpenGVLab/InternVL3_5-1B) | `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` |
| [UI-TARS](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B) | `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` |
| [OpenCUA](https://huggingface.co/xlangai/OpenCUA-7B) | `huggingface-local/xlangai/OpenCUA-{7B,32B}` |
| [GTA](https://huggingface.co/HelloKKMe/GTA1-7B) | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` |
| [Holo](https://huggingface.co/Hcompany/Holo1.5-3B) | `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` |
| [Moondream](https://huggingface.co/moondream/moondream3-preview) | `moondream3` |
| [OmniParser](https://github.com/microsoft/OmniParser) | `omniparser` |
</details>
Missing a model? Create a [feature request](https://github.com/trycua/cua/issues/new?assignees=&labels=enhancement&projects=&title=%5BAgent%5D%3A+Add+model+support+for+) or [contribute](https://github.com/trycua/cua/blob/main/CONTRIBUTING.md)!
Learn more in the [Agent SDK documentation](./libs/python/agent/README.md).
# Computer SDK
Install the computer SDK:
```bash
pip install cua-computer[all]
pip install cua-computer
```
Initialize a computer:
```python
from computer import Computer
async with Computer(
os_type="linux",
provider_type="cloud",
computer = Computer(
os_type="linux", # or "macos", "windows"
provider_type="cloud", # or "lume", "docker", "windows_sandbox"
name="your-sandbox-name",
api_key="your-api-key"
) as computer:
# Take screenshot
api_key="your-api-key" # only for cloud
# or use_host_computer_server=True for host desktop
)
try:
await computer.run()
# Take a screenshot
screenshot = await computer.interface.screenshot()
# Click and type
await computer.interface.left_click(100, 100)
await computer.interface.type("Hello!")
finally:
await computer.close()
```
Learn more in the [Computer SDK documentation](./libs/python/computer/README.md).
# MCP Server
Install the MCP server:
```bash
pip install cua-mcp-server
```
Learn more in the [MCP Server documentation](./libs/python/mcp-server/README.md).
# Computer Server
Install the Computer Server:
```bash
pip install cua-computer-server
python -m computer_server
```
Learn more in the [Computer Server documentation](./libs/python/computer-server/README.md).
# Lume
Install Lume:
```bash
curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh | bash
```
Learn more in the [Lume documentation](./libs/lume/README.md).
# Lumier
Install Lumier:
```bash
docker pull trycua/lumier:latest
```
Learn more in the [Lumier documentation](./libs/lumier/README.md).
# SOM
Install SOM:
```bash
pip install cua-som
```
Learn more in the [SOM documentation](./libs/python/som/README.md).
# Resources
- [How to use the MCP Server with Claude Desktop or other MCP clients](./libs/python/mcp-server/README.md) - One of the easiest ways to get started with Cua
- [How to use OpenAI Computer-Use, Anthropic, OmniParser, or UI-TARS for your Computer-Use Agent](./libs/python/agent/README.md)
- [How to use Lume CLI for managing desktops](./libs/lume/README.md)
- [Training Computer-Use Models: Collecting Human Trajectories with Cua (Part 1)](https://www.trycua.com/blog/training-computer-use-models-trajectories-1)
- [Cua Blog](https://www.trycua.com/blog)
- [Cua Docs](https://docs.trycua.com)
## Modules
# Community and Contributions
| Module | Description | Installation |
| ----------------------------------------------------------------- | -------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------- |
| [**Lume**](./libs/lume/README.md) | VM management for macOS/Linux using Apple's Virtualization.Framework | `curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh \| bash` |
| [**Lumier**](./libs/lumier/README.md) | Docker interface for macOS and Linux VMs | `docker pull trycua/lumier:latest` |
| [**Computer (Python)**](./libs/python/computer/README.md) | Python Interface for controlling virtual machines | `pip install "cua-computer[all]"` |
| [**Computer (Typescript)**](./libs/typescript/computer/README.md) | Typescript Interface for controlling virtual machines | `npm install @trycua/computer` |
| [**Agent**](./libs/python/agent/README.md) | AI agent framework for automating tasks | `pip install "cua-agent[all]"` |
| [**MCP Server**](./libs/python/mcp-server/README.md) | MCP server for using CUA with Claude Desktop | `pip install cua-mcp-server` |
| [**SOM**](./libs/python/som/README.md) | Set-of-Mark library for Agent | `pip install cua-som` |
| [**Computer Server**](./libs/python/computer-server/README.md) | Server component for Computer | `pip install cua-computer-server` |
| [**Core (Python)**](./libs/python/core/README.md) | Python Core utilities | `pip install cua-core` |
| [**Core (Typescript)**](./libs/typescript/core/README.md) | Typescript Core utilities | `npm install @trycua/core` |
## Community
We welcome contributions to Cua! Please refer to our [Contributing Guidelines](CONTRIBUTING.md) for details.
Join our [Discord community](https://discord.com/invite/mVnXXpdE85) to discuss ideas, get assistance, or share your demos!
## License
# License
Cua is open-sourced under the MIT License - see the [LICENSE](LICENSE.md) file for details.
@@ -208,7 +350,7 @@ Portions of this project, specifically components adapted from Kasm Technologies
Microsoft's OmniParser, which is used in this project, is licensed under the Creative Commons Attribution 4.0 International License (CC-BY-4.0). See the [OmniParser LICENSE](https://github.com/microsoft/OmniParser/blob/master/LICENSE) for details.
### Third-Party Licenses and Optional Components
## Third-Party Licenses and Optional Components
Some optional extras for this project depend on third-party packages that are licensed under terms different from the MIT License.
@@ -216,77 +358,6 @@ Some optional extras for this project depend on third-party packages that are li
When you choose to install and use such optional extras, your use, modification, and distribution of those third-party components are governed by their respective licenses (e.g., AGPL-3.0 for `ultralytics`).
## Releasing Packages
Cua uses `bump2version` to manage package versions across all Python modules. A Makefile is provided to simplify the release process.
### Prerequisites
#### install `bump2version`
using brew
```
brew install bumpversion
```
### View Current Versions
```bash
make show-versions
```
### Bump Package Versions
To bump a specific package version:
```bash
# Patch version bump (e.g., 0.1.8 → 0.1.9)
make bump-patch-core # cua-core
make bump-patch-pylume # pylume
make bump-patch-computer # cua-computer
make bump-patch-som # cua-som
make bump-patch-agent # cua-agent
make bump-patch-computer-server # cua-computer-server
make bump-patch-mcp-server # cua-mcp-server
# Minor version bump (e.g., 0.1.8 → 0.2.0)
make bump-minor-core # Replace 'core' with any package name
# Major version bump (e.g., 0.1.8 → 1.0.0)
make bump-major-core # Replace 'core' with any package name
```
### Dry Run (Test Before Bumping)
To preview changes without modifying files:
```bash
make dry-run-patch-core # Test patch bump for cua-core
make dry-run-minor-pylume # Test minor bump for pylume
make dry-run-major-agent # Test major bump for cua-agent
```
### Bump All Packages (Use with Caution)
```bash
make bump-all-patch # Bumps patch version for ALL packages
```
### After Bumping
After running any bump command, push your changes:
```bash
git push origin main && git push origin --tags
```
For more details, run `make help` or see the [Makefile](./Makefile).
## Contributing
We welcome contributions to Cua! Please refer to our [Contributing Guidelines](CONTRIBUTING.md) for details.
## Trademarks
Apple, macOS, and Apple Silicon are trademarks of Apple Inc.
@@ -295,13 +366,13 @@ Microsoft is a registered trademark of Microsoft Corporation.
This project is not affiliated with, endorsed by, or sponsored by Apple Inc., Canonical Ltd., Microsoft Corporation, or Kasm Technologies.
## Stargazers
# Stargazers
Thank you to all our supporters!
[![Stargazers over time](https://starchart.cc/trycua/cua.svg?variant=adaptive)](https://starchart.cc/trycua/cua)
## Sponsors
# Sponsors
Thank you to all our [GitHub Sponsors](https://github.com/sponsors/trycua)!
+106
View File
@@ -0,0 +1,106 @@
# Testing Guide for CUA
Quick guide to running tests and understanding the test architecture.
## 🚀 Quick Start
```bash
# Install dependencies
pip install pytest pytest-asyncio pytest-mock pytest-cov
# Install package
cd libs/python/core
pip install -e .
# Run tests
export CUA_TELEMETRY_DISABLED=1 # or $env:CUA_TELEMETRY_DISABLED="1" on Windows
pytest tests/ -v
```
## 🧪 Running Tests
```bash
# All packages
pytest libs/python/*/tests/ -v
# Specific package
cd libs/python/core && pytest tests/ -v
# With coverage
pytest tests/ --cov --cov-report=html
# Specific test
pytest tests/test_telemetry.py::TestTelemetryEnabled::test_telemetry_enabled_by_default -v
```
## 🏗️ Test Architecture
**Principles**: SRP (Single Responsibility) + Vertical Slices + Testability
```
libs/python/
├── core/tests/ # Tests ONLY core
├── agent/tests/ # Tests ONLY agent
└── computer/tests/ # Tests ONLY computer
```
Each test file = ONE feature. Each test class = ONE concern.
## Adding New Tests
1. Create `test_*.py` in the appropriate package's `tests/` directory
2. Follow the pattern:
```python
"""Unit tests for my_feature."""
import pytest
from unittest.mock import patch
class TestMyFeature:
"""Test MyFeature class."""
def test_initialization(self):
"""Test that feature initializes."""
from my_package import MyFeature
feature = MyFeature()
assert feature is not None
```
3. Mock external dependencies:
```python
@pytest.fixture
def mock_api():
with patch("my_package.api_client") as mock:
yield mock
```
## 🔄 CI/CD
Tests run automatically on every PR via GitHub Actions (`.github/workflows/python-tests.yml`):
- Matrix strategy: each package tested separately
- Python 3.12
- ~2 minute runtime
## 🐛 Troubleshooting
**ModuleNotFoundError**: Run `pip install -e .` in package directory
**Tests fail in CI but pass locally**: Set `CUA_TELEMETRY_DISABLED=1`
**Async tests error**: Install `pytest-asyncio` and use `@pytest.mark.asyncio`
**Mock not working**: Patch at usage location, not definition:
```python
# ✅ Right
@patch("my_package.module.external_function")
# ❌ Wrong
@patch("external_library.function")
```
---
**Questions?** Check existing tests for examples or open an issue.
@@ -4,13 +4,14 @@ description: Computer Agent SDK benchmarks for agentic GUI tasks
---
The benchmark system evaluates models on GUI grounding tasks, specifically agent loop success rate and click prediction accuracy. It supports both:
- **Computer Agent SDK providers** (using model strings like `"huggingface-local/HelloKKMe/GTA1-7B"`)
- **Reference agent implementations** (custom model classes implementing the `ModelProtocol`)
## Available Benchmarks
- **[ScreenSpot-v2](./benchmarks/screenspot-v2)** - Standard resolution GUI grounding
- **[ScreenSpot-Pro](./benchmarks/screenspot-pro)** - High-resolution GUI grounding
- **[ScreenSpot-Pro](./benchmarks/screenspot-pro)** - High-resolution GUI grounding
- **[Interactive Testing](./benchmarks/interactive)** - Real-time testing and visualization
## Quick Start
@@ -8,6 +8,7 @@ The Cua agent framework uses benchmarks to test the performance of supported mod
## Benchmark Types
Computer-Agent benchmarks evaluate two key capabilities:
- **Plan Generation**: Breaking down complex tasks into a sequence of actions
- **Coordinate Generation**: Predicting precise click locations on GUI elements
@@ -31,7 +32,7 @@ agent.run("Open Firefox and go to github.com")
### Coordinate Generation Only
**[GUI Agent Grounding Leaderboard](https://gui-agent.github.io/grounding-leaderboard/)** - Benchmark for click prediction accuracy
**[GUI Agent Grounding Leaderboard](https://gui-agent.github.io/grounding-leaderboard/)** - Benchmark for click prediction accuracy
This leaderboard tests models that specialize in finding exactly where to click on screen elements, but they need to be told what specific action to take.
@@ -41,7 +42,7 @@ This leaderboard tests models that specialize in finding exactly where to click
agent = ComputerAgent("huggingface-local/HelloKKMe/GTA1-7B", tools=[computer])
agent.predict_click("find the button to open the settings") # (27, 450)
# This will raise an error:
# agent.run("Open Firefox and go to github.com")
# agent.run("Open Firefox and go to github.com")
```
### Composed Agent
@@ -5,4 +5,4 @@ description: Benchmark ComputerAgent on OSWorld tasks using HUD
OSWorld-Verified is a curated subset of OSWorld tasks that can be run using the HUD framework.
Use [ComputerAgent with HUD](../integrations/hud) to benchmark on these tasks.
Use [ComputerAgent with HUD](../integrations/hud) to benchmark on these tasks.
@@ -18,8 +18,8 @@ python ss-pro.py --samples 50
## Results
| Model | Accuracy | Failure Rate | Samples |
|-------|----------|--------------|---------|
| Coming Soon | - | - | - |
| Model | Accuracy | Failure Rate | Samples |
| ----------- | -------- | ------------ | ------- |
| Coming Soon | - | - | - |
Results will be populated after running benchmarks with various models.
@@ -18,8 +18,8 @@ python ss-v2.py --samples 100
## Results
| Model | Accuracy | Failure Rate | Samples |
|-------|----------|--------------|---------|
| Coming Soon | - | - | - |
| Model | Accuracy | Failure Rate | Samples |
| ----------- | -------- | ------------ | ------- |
| Coming Soon | - | - | - |
Results will be populated after running benchmarks with various models.
@@ -10,30 +10,39 @@ Callbacks provide hooks into the agent lifecycle for extensibility. They're call
## Callback Lifecycle
### 1. `on_run_start(kwargs, old_items)`
Called once when agent run begins. Initialize tracking, logging, or state.
### 2. `on_run_continue(kwargs, old_items, new_items)` → bool
Called before each iteration. Return `False` to stop execution (e.g., budget limits).
### 3. `on_llm_start(messages)` → messages
Preprocess messages before LLM call. Use for PII anonymization, image retention.
### 4. `on_api_start(kwargs)`
Called before each LLM API call.
### 5. `on_api_end(kwargs, result)`
Called after each LLM API call completes.
### 6. `on_usage(usage)`
Called when usage information is received from LLM.
### 7. `on_llm_end(messages)` → messages
Postprocess messages after LLM call. Use for PII deanonymization.
### 8. `on_responses(kwargs, responses)`
Called when responses are received from agent loop.
### 9. Response-specific hooks:
- `on_text(item)` - Text messages
- `on_computer_call_start(item)` - Before computer actions
- `on_computer_call_end(item, result)` - After computer actions
@@ -42,4 +51,5 @@ Called when responses are received from agent loop.
- `on_screenshot(screenshot, name)` - When screenshots are taken
### 10. `on_run_end(kwargs, old_items, new_items)`
Called when agent run completes. Finalize tracking, save trajectories.
Called when agent run completes. Finalize tracking, save trajectories.
@@ -36,6 +36,7 @@ agent = ComputerAgent(
```
**Or with options:**
```python
# Advanced budget configuration
agent = ComputerAgent(
@@ -15,7 +15,7 @@ Built-in callbacks can be used as follows:
```python
from agent.callbacks import (
ImageRetentionCallback,
TrajectorySaverCallback,
TrajectorySaverCallback,
BudgetManagerCallback,
LoggingCallback
)
@@ -52,12 +52,12 @@ class CustomCallback(AsyncCallbackHandler):
"""Preprocess messages before LLM call"""
# Add custom preprocessing logic
return messages
async def on_llm_end(self, messages):
"""Postprocess messages after LLM call"""
# Add custom postprocessing logic
return messages
async def on_usage(self, usage):
"""Track usage information"""
print(f"Tokens used: {usage.total_tokens}")
@@ -18,7 +18,7 @@ agent = ComputerAgent(
tools=[computer],
callbacks=[
LoggingCallback(
logger=logging.getLogger("cua"),
logger=logging.getLogger("cua"),
level=logging.INFO
)
]
@@ -47,7 +47,7 @@ class CustomLogger(AsyncCallbackHandler):
def __init__(self, logger_name="agent"):
self.logger = logging.getLogger(logger_name)
self.logger.setLevel(logging.INFO)
# Add console handler
handler = logging.StreamHandler()
formatter = logging.Formatter(
@@ -55,18 +55,18 @@ class CustomLogger(AsyncCallbackHandler):
)
handler.setFormatter(formatter)
self.logger.addHandler(handler)
async def on_run_start(self, kwargs, old_items):
self.logger.info(f"Agent run started with model: {kwargs.get('model')}")
async def on_computer_call_start(self, item):
action = item.get('action', {})
self.logger.info(f"Computer action: {action.get('type')}")
async def on_usage(self, usage):
cost = usage.get('response_cost', 0)
self.logger.info(f"API call cost: ${cost:.4f}")
async def on_run_end(self, kwargs, old_items, new_items):
self.logger.info("Agent run completed")
@@ -81,6 +81,7 @@ agent = ComputerAgent(
## Available Hooks
Log any agent event using these callback methods:
- `on_run_start/end` - Run lifecycle
- `on_computer_call_start/end` - Computer actions
- `on_api_start/end` - LLM API calls
@@ -40,6 +40,7 @@ View trajectories in the browser at:
**[trycua.com/trajectory-viewer](http://trycua.com/trajectory-viewer)**
The viewer provides:
- Interactive conversation replay
- Screenshot galleries
- No data collection
@@ -47,11 +48,13 @@ The viewer provides:
## Trajectory Structure
Trajectories are saved with:
- Complete conversation history
- Usage statistics and costs
- Timestamps and metadata
- Screenshots and computer actions
Each trajectory contains:
- **metadata.json**: Run info, timestamps, usage stats (`total_tokens`, `response_cost`)
- **turn_000/**: Turn-by-turn conversation history (api calls, responses, computer calls, screenshots)
@@ -53,67 +53,67 @@ from typing import Literal, List, Dict, Union, Optional
class MyCustomComputer(AsyncComputerHandler):
"""Custom computer handler implementation."""
def __init__(self):
# Initialize your custom computer interface here
pass
# ==== Computer-Use-Preview Action Space ====
# ==== Computer-Use-Preview Action Space ====
async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
"""Get the current environment type."""
...
async def get_dimensions(self) -> tuple[int, int]:
"""Get screen dimensions as (width, height)."""
...
async def screenshot(self) -> str:
"""Take a screenshot and return as base64 string."""
...
async def click(self, x: int, y: int, button: str = "left") -> None:
"""Click at coordinates with specified button."""
...
async def double_click(self, x: int, y: int) -> None:
"""Double click at coordinates."""
...
async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
"""Scroll at coordinates with specified scroll amounts."""
...
async def type(self, text: str) -> None:
"""Type text."""
...
async def wait(self, ms: int = 1000) -> None:
"""Wait for specified milliseconds."""
...
async def move(self, x: int, y: int) -> None:
"""Move cursor to coordinates."""
...
async def keypress(self, keys: Union[List[str], str]) -> None:
"""Press key combination."""
...
async def drag(self, path: List[Dict[str, int]]) -> None:
"""Drag along specified path."""
...
async def get_current_url(self) -> str:
"""Get current URL (for browser environments)."""
...
# ==== Anthropic Action Space ====
# ==== Anthropic Action Space ====
async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Left mouse down at coordinates."""
...
async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Left mouse up at coordinates."""
...
@@ -127,4 +127,4 @@ agent = ComputerAgent(
)
await agent.run("Take a screenshot and click at coordinates 100, 200")
```
```
@@ -2,7 +2,16 @@
title: Customizing Your ComputerAgent
---
<Callout>A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/customizing_computeragent.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.</Callout>
<Callout>
A corresponding{' '}
<a
href="https://github.com/trycua/cua/blob/main/notebooks/customizing_computeragent.ipynb"
target="_blank"
>
Jupyter Notebook
</a>{' '}
is available for this documentation.
</Callout>
The `ComputerAgent` interface provides an easy proxy to any computer-using model configuration, and it is a powerful framework for extending and building your own agentic systems.
@@ -118,4 +127,4 @@ await run_single_task(
# tools=[your_custom_function],
# callbacks=[YourCustomCallback()],
)
```
```
@@ -3,7 +3,13 @@ title: HUD Evals
description: Use ComputerAgent with HUD for benchmarking and evaluation
---
<Callout>A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.</Callout>
<Callout>
A corresponding{' '}
<a href="https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb" target="_blank">
Jupyter Notebook
</a>{' '}
is available for this documentation.
</Callout>
The HUD integration allows an agent to be benchmarked using the [HUD framework](https://www.hud.so/). Through the HUD integration, the agent controls a computer inside HUD, where tests are run to evaluate the success of each task.
@@ -120,8 +126,8 @@ Both single-task and full-dataset runs share a common set of configuration optio
HUD provides multiple benchmark datasets for realistic evaluation.
1. **[OSWorld-Verified](/agent-sdk/benchmarks/osworld-verified)** Benchmark on 369+ real-world desktop tasks across Chrome, LibreOffice, GIMP, VS Code, etc.
*Best for*: evaluating full computer-use agents in realistic environments.
*Verified variant*: fixes 300+ issues from earlier versions for reliability.
_Best for_: evaluating full computer-use agents in realistic environments.
_Verified variant_: fixes 300+ issues from earlier versions for reliability.
**Coming soon:** SheetBench (spreadsheet automation) and other specialized HUD datasets.
@@ -129,7 +135,7 @@ See the [HUD docs](https://docs.hud.so/environment-creation) for more eval envir
## Tips
* **Debugging:** set `verbosity=2` to see every model call and tool action.
* **Performance:** lower `screenshot_delay` for faster runs; raise it if you see race conditions.
* **Safety:** always set `max_steps` (defaults to 50) to prevent runaway loops.
* **Custom tools:** pass extra `tools=[...]` into the agent config if you need beyond `openai_computer`.
- **Debugging:** set `verbosity=2` to see every model call and tool action.
- **Performance:** lower `screenshot_delay` for faster runs; raise it if you see race conditions.
- **Safety:** always set `max_steps` (defaults to 50) to prevent runaway loops.
- **Custom tools:** pass extra `tools=[...]` into the agent config if you need tools beyond `openai_computer`.
@@ -20,7 +20,9 @@ This guide lists **breaking changes** when migrating from the original `Computer
## Usage Examples: Old vs New
### 1. Anthropic Loop
**Old:**
```python
async with Computer() as computer:
agent = ComputerAgent(
@@ -31,7 +33,9 @@ async with Computer() as computer:
async for result in agent.run("Take a screenshot"):
print(result)
```
**New:**
```python
async with Computer() as computer:
agent = ComputerAgent(
@@ -46,7 +50,9 @@ async with Computer() as computer:
```
### 2. OpenAI Loop
**Old:**
```python
async with Computer() as computer:
agent = ComputerAgent(
@@ -57,7 +63,9 @@ async with Computer() as computer:
async for result in agent.run("Take a screenshot"):
print(result)
```
**New:**
```python
async with Computer() as computer:
agent = ComputerAgent(
@@ -72,7 +80,9 @@ async with Computer() as computer:
```
### 3. UI-TARS Loop
**Old:**
```python
async with Computer() as computer:
agent = ComputerAgent(
@@ -83,7 +93,9 @@ async with Computer() as computer:
async for result in agent.run("Take a screenshot"):
print(result)
```
**New:**
```python
async with Computer() as computer:
agent = ComputerAgent(
@@ -98,7 +110,9 @@ async with Computer() as computer:
```
### 4. Omni Loop
**Old:**
```python
async with Computer() as computer:
agent = ComputerAgent(
@@ -109,7 +123,9 @@ async with Computer() as computer:
async for result in agent.run("Take a screenshot"):
print(result)
```
**New:**
```python
async with Computer() as computer:
agent = ComputerAgent(
@@ -26,7 +26,7 @@ agent = ComputerAgent(
When using Anthropic-based CUAs (Claude models), setting `use_prompt_caching=True` will automatically add `{ "cache_control": "ephemeral" }` to your messages. This enables prompt caching for the session and can speed up repeated runs with the same prompt.
<Callout title="Note">
This argument is only required for Anthropic CUAs. For other providers, it is ignored.
This argument is only required for Anthropic CUAs. For other providers, it is ignored.
</Callout>
## OpenAI Provider
@@ -44,13 +44,16 @@ agent = ComputerAgent(
```
## Implementation Details
- For Anthropic: Adds `{ "cache_control": "ephemeral" }` to messages when enabled.
- For OpenAI: Caching is automatic for long prompts; the argument is ignored.
## When to Use
- Enable for Anthropic CUAs if you want to avoid reprocessing the same prompt in repeated or iterative tasks.
- Not needed for OpenAI models unless you want explicit ephemeral cache control (not required for most users).
## See Also
- [Agent Loops](./agent-loops)
- [Migration Guide](./migration-guide)
@@ -59,7 +59,7 @@ Combine state-of-the-art grounding with powerful reasoning:
```python
agent = ComputerAgent(
"huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-3-5-sonnet-20241022",
"huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-3-5-sonnet-20241022",
tools=[computer]
)
@@ -65,6 +65,7 @@ async for _ in agent.run("Click on the search bar and type 'hello world'"):
## InternVL 3.5
InternVL 3.5 family:
- `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}`
```python
@@ -76,6 +77,7 @@ async for _ in agent.run("Open Firefox and navigate to github.com"):
## Qwen3 VL
Qwen3 VL family:
- `openrouter/qwen/qwen3-vl-235b-a22b-instruct`
```python
@@ -17,9 +17,11 @@ All models that support `ComputerAgent.run()` also support `ComputerAgent.predic
- Claude 3.5: `claude-3-5-sonnet-20241022`
### OpenAI CUA Preview
- Computer-use-preview: `computer-use-preview`
### UI-TARS 1.5 (Unified VLM with grounding support)
- `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B`
- `huggingface/ByteDance-Seed/UI-TARS-1.5-7B` (requires TGI endpoint)
@@ -28,15 +30,19 @@ All models that support `ComputerAgent.run()` also support `ComputerAgent.predic
These models are optimized specifically for click prediction and UI element grounding:
### OpenCUA
- `huggingface-local/xlangai/OpenCUA-{7B,32B}`
### GTA1 Family
- `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}`
### Holo 1.5 Family
- `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}`
### InternVL 3.5 Family
- `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}`
### OmniParser (OCR)
@@ -5,6 +5,7 @@ title: Supported Model Providers
## Supported Models
### Anthropic Claude (Computer Use API)
```python
model="anthropic/claude-3-5-sonnet-20241022"
model="anthropic/claude-3-7-sonnet-20250219"
@@ -13,20 +14,23 @@ model="anthropic/claude-sonnet-4-20250514"
```
### OpenAI Computer Use Preview
```python
model="openai/computer-use-preview"
```
### UI-TARS (Local or Huggingface Inference)
```python
model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"
model="ollama_chat/0000/ui-tars-1.5-7b"
```
### Omniparser + Any LLM
```python
model="omniparser+ollama_chat/mistral-small3.2"
model="omniparser+vertex_ai/gemini-pro"
model="omniparser+anthropic/claude-3-5-sonnet-20241022"
model="omniparser+openai/gpt-4o"
```
```
@@ -51,7 +51,7 @@ class UsageTrackerCallback(AsyncCallbackHandler):
print("Usage update:", usage)
agent = ComputerAgent(
...,
...,
callbacks=[UsageTrackerCallback()]
)
```
@@ -59,5 +59,6 @@ agent = ComputerAgent(
See also: [Budget Manager Callbacks](./callbacks/cost-saving)
## See Also
- [Prompt Caching](./prompt-caching)
- [Callbacks](./callbacks)
@@ -5,7 +5,6 @@ description: Manage your Cua Cloud sandboxes (VMs) via Python SDK or HTTP API
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
Using the Cua Cloud API, you can manage your Cua Cloud sandboxes (VMs) with Python or HTTP (curl).
All examples require a CUA API key. You can obtain one from the [Dashboard](https://www.cua.ai/dashboard/keys).
@@ -14,110 +13,116 @@ All examples require a CUA API key. You can obtain one from the [Dashboard](http
## List VMs
<Tabs items={["Python", "curl"]}>
<Tabs items={['Python', 'curl']}>
<Tab value="Python">
```python
import os
import asyncio
from computer.providers.cloud.provider import CloudProvider
```python
import os
import asyncio
from computer.providers.cloud.provider import CloudProvider
async def main():
api_key = os.getenv("CUA_API_KEY") or "your-api-key"
# Optional: point to a different API base
# os.environ["CUA_API_BASE"] = "https://api.cua.ai"
async def main():
api_key = os.getenv("CUA_API_KEY") or "your-api-key"
# Optional: point to a different API base
# os.environ["CUA_API_BASE"] = "https://api.cua.ai"
provider = CloudProvider(api_key=api_key, verbose=False)
async with provider:
vms = await provider.list_vms()
for vm in vms:
print({
"name": vm["name"],
"status": vm["status"],
"api_url": vm.get("api_url"),
"vnc_url": vm.get("vnc_url"),
})
provider = CloudProvider(api_key=api_key, verbose=False)
async with provider:
vms = await provider.list_vms()
for vm in vms:
print({
"name": vm["name"],
"status": vm["status"],
"api_url": vm.get("api_url"),
"vnc_url": vm.get("vnc_url"),
})
if __name__ == "__main__":
asyncio.run(main())
```
if __name__ == "__main__":
asyncio.run(main())
```
</Tab>
<Tab value="curl">
```bash
curl -H "Authorization: Bearer $CUA_API_KEY" \
"https://api.cua.ai/v1/vms"
```
```bash
curl -H "Authorization: Bearer $CUA_API_KEY" \
"https://api.cua.ai/v1/vms"
```
Responses:
- 200: Array of minimal VM objects with fields `{ name, password, status }`
- 401: Unauthorized (missing/invalid API key)
Responses:
```json
[
{
"name": "s-windows-x4snp46ebf",
"password": "49b8daa3",
"status": "running"
}
]
```
- 200: Array of minimal VM objects with fields `{ name, password, status }`
- 401: Unauthorized (missing/invalid API key)
Status values:
```json
[
{
"name": "s-windows-x4snp46ebf",
"password": "49b8daa3",
"status": "running"
}
]
```
- `pending`: VM deployment in progress
- `running`: VM is active and accessible
- `stopped`: VM is stopped but not terminated
- `terminated`: VM has been permanently destroyed
- `failed`: VM deployment or operation failed
Status values:
</Tab>
</Tabs>
- `pending`: VM deployment in progress
- `running`: VM is active and accessible
- `stopped`: VM is stopped but not terminated
- `terminated`: VM has been permanently destroyed
- `failed`: VM deployment or operation failed
---
</Tab>
</Tabs>
---
## Start a VM
Provide the VM name you want to start.
<Tabs items={["Python", "curl"]}>
<Tab value="Python">
```python
import os
import asyncio
from computer.providers.cloud.provider import CloudProvider
```python
import os
import asyncio
from computer.providers.cloud.provider import CloudProvider
async def main():
api_key = os.getenv("CUA_API_KEY") or "your-api-key"
name = "my-vm-name" # e.g., "m-linux-96lcxd2c2k"
async def main():
api_key = os.getenv("CUA_API_KEY") or "your-api-key"
name = "my-vm-name" # e.g., "m-linux-96lcxd2c2k"
provider = CloudProvider(api_key=api_key)
async with provider:
resp = await provider.run_vm(name)
print(resp) # { "name": name, "status": "starting" }
provider = CloudProvider(api_key=api_key)
async with provider:
resp = await provider.run_vm(name)
print(resp) # { "name": name, "status": "starting" }
if __name__ == "__main__":
asyncio.run(main())
```
if __name__ == "__main__":
asyncio.run(main())
```
</Tab>
<Tab value="curl">
```bash
curl -X POST \
-H "Authorization: Bearer $CUA_API_KEY" \
"https://api.cua.ai/v1/vms/my-vm-name/start" -i
```
```bash
curl -X POST \
-H "Authorization: Bearer $CUA_API_KEY" \
"https://api.cua.ai/v1/vms/my-vm-name/start" -i
```
Responses:
- 204: No Content (start accepted)
- 401: Unauthorized (missing/invalid API key)
- 404: VM not found or not owned by the user
Responses:
```text
HTTP/1.1 204 No Content
```
- 204: No Content (start accepted)
- 401: Unauthorized (missing/invalid API key)
- 404: VM not found or not owned by the user
```text
HTTP/1.1 204 No Content
```
</Tab>
</Tabs>
@@ -125,46 +130,48 @@ Provide the VM name you want to start.
---
## Stop a VM
Stops the VM asynchronously.
<Tabs items={["Python", "curl"]}>
<Tab value="Python">
```python
import os
import asyncio
from computer.providers.cloud.provider import CloudProvider
```python
import os
import asyncio
from computer.providers.cloud.provider import CloudProvider
async def main():
api_key = os.getenv("CUA_API_KEY") or "your-api-key"
name = "my-vm-name"
async def main():
api_key = os.getenv("CUA_API_KEY") or "your-api-key"
name = "my-vm-name"
provider = CloudProvider(api_key=api_key)
async with provider:
resp = await provider.stop_vm(name)
print(resp) # { "name": name, "status": "stopping" }
provider = CloudProvider(api_key=api_key)
async with provider:
resp = await provider.stop_vm(name)
print(resp) # { "name": name, "status": "stopping" }
if __name__ == "__main__":
asyncio.run(main())
```
if __name__ == "__main__":
asyncio.run(main())
```
</Tab>
<Tab value="curl">
```bash
curl -X POST \
-H "Authorization: Bearer $CUA_API_KEY" \
"https://api.cua.ai/v1/vms/my-vm-name/stop"
```
```bash
curl -X POST \
-H "Authorization: Bearer $CUA_API_KEY" \
"https://api.cua.ai/v1/vms/my-vm-name/stop"
```
Responses:
- 202: Accepted with `{ "status": "stopping" }`
- 401: Unauthorized (missing/invalid API key)
- 404: VM not found or not owned by the user
Responses:
```json
{ "status": "stopping" }
```
- 202: Accepted with `{ "status": "stopping" }`
- 401: Unauthorized (missing/invalid API key)
- 404: VM not found or not owned by the user
```json
{ "status": "stopping" }
```
</Tab>
</Tabs>
@@ -172,46 +179,48 @@ Stops the VM asynchronously.
---
## Restart a VM
Restarts the VM asynchronously.
<Tabs items={["Python", "curl"]}>
<Tab value="Python">
```python
import os
import asyncio
from computer.providers.cloud.provider import CloudProvider
```python
import os
import asyncio
from computer.providers.cloud.provider import CloudProvider
async def main():
api_key = os.getenv("CUA_API_KEY") or "your-api-key"
name = "my-vm-name"
async def main():
api_key = os.getenv("CUA_API_KEY") or "your-api-key"
name = "my-vm-name"
provider = CloudProvider(api_key=api_key)
async with provider:
resp = await provider.restart_vm(name)
print(resp) # { "name": name, "status": "restarting" }
provider = CloudProvider(api_key=api_key)
async with provider:
resp = await provider.restart_vm(name)
print(resp) # { "name": name, "status": "restarting" }
if __name__ == "__main__":
asyncio.run(main())
```
if __name__ == "__main__":
asyncio.run(main())
```
</Tab>
<Tab value="curl">
```bash
curl -X POST \
-H "Authorization: Bearer $CUA_API_KEY" \
"https://api.cua.ai/v1/vms/my-vm-name/restart"
```
```bash
curl -X POST \
-H "Authorization: Bearer $CUA_API_KEY" \
"https://api.cua.ai/v1/vms/my-vm-name/restart"
```
Responses:
- 202: Accepted with `{ "status": "restarting" }`
- 401: Unauthorized (missing/invalid API key)
- 404: VM not found or not owned by the user
Responses:
```json
{ "status": "restarting" }
```
- 202: Accepted with `{ "status": "restarting" }`
- 401: Unauthorized (missing/invalid API key)
- 404: VM not found or not owned by the user
```json
{ "status": "restarting" }
```
</Tab>
</Tabs>
@@ -219,42 +228,44 @@ Restarts the VM asynchronously.
---
## Query a VM by name
Query the computer-server running on the VM. Useful for checking details like status or OS type.
<Tabs items={["Python", "curl"]}>
<Tab value="Python">
```python
import os
import asyncio
from computer.providers.cloud.provider import CloudProvider
```python
import os
import asyncio
from computer.providers.cloud.provider import CloudProvider
async def main():
api_key = os.getenv("CUA_API_KEY") or "your-api-key"
name = "my-vm-name"
async def main():
api_key = os.getenv("CUA_API_KEY") or "your-api-key"
name = "my-vm-name"
provider = CloudProvider(api_key=api_key)
async with provider:
info = await provider.get_vm(name)
print(info)
provider = CloudProvider(api_key=api_key)
async with provider:
info = await provider.get_vm(name)
print(info)
if __name__ == "__main__":
asyncio.run(main())
```
if __name__ == "__main__":
asyncio.run(main())
```
</Tab>
<Tab value="curl">
```bash
curl "https://my-vm-name.containers.cloud.cua.ai:8443/status"
```
```bash
curl "https://my-vm-name.containers.cloud.cua.ai:8443/status"
```
Responses:
- 200: Server available
Responses:
```json
{ "status": "ok", "os_type": "linux", "features": ["agent"] }
```
- 200: Server available
```json
{ "status": "ok", "os_type": "linux", "features": ["agent"] }
```
</Tab>
</Tabs>
+140 -37
View File
@@ -13,16 +13,77 @@ Execute shell commands and get detailed results:
<Tabs items={['Python', 'TypeScript']}>
<Tab value="Python">
```python
# Run shell command result = await
computer.interface.run_command(cmd) # result.stdout, result.stderr, result.returncode
# Run shell command
result = await computer.interface.run_command(cmd) # result.stdout, result.stderr, result.returncode
```
</Tab>
<Tab value="TypeScript">
```typescript
// Run shell command const result = await
computer.interface.runCommand(cmd); // result.stdout, result.stderr, result.returncode
// Run shell command
const result = await computer.interface.runCommand(cmd); // result.stdout, result.stderr, result.returncode
```
</Tab>
</Tabs>
## Window Management
Control application launching and windows:
<Tabs items={['Python', 'TypeScript']}>
<Tab value="Python">
```python
# Launch applications
await computer.interface.launch("xfce4-terminal")
await computer.interface.launch("libreoffice --writer")
await computer.interface.open("https://www.google.com")
# Window management
windows = await computer.interface.get_application_windows("xfce4-terminal")
window_id = windows[0]
await computer.interface.activate_window(window_id)
window_id = await computer.interface.get_current_window_id() # get the current active window id
await computer.interface.window_size(window_id)
await computer.interface.get_window_title(window_id)
await computer.interface.get_window_position(window_id)
await computer.interface.set_window_size(window_id, 1200, 800)
await computer.interface.set_window_position(window_id, 100, 100)
await computer.interface.maximize_window(window_id)
await computer.interface.minimize_window(window_id)
await computer.interface.close_window(window_id)
```
</Tab>
<Tab value="TypeScript">
```typescript
// Launch applications
await computer.interface.launch("xfce4-terminal");
await computer.interface.launch("libreoffice --writer");
await computer.interface.open("https://www.google.com");
// Window management
const windows = await computer.interface.getApplicationWindows("xfce4-terminal");
let windowId = windows[0];
await computer.interface.activateWindow(windowId);
windowId = await computer.interface.getCurrentWindowId(); // current active window id
await computer.interface.getWindowSize(windowId);
await computer.interface.getWindowName(windowId);
await computer.interface.getWindowPosition(windowId);
await computer.interface.setWindowSize(windowId, 1200, 800);
await computer.interface.setWindowPosition(windowId, 100, 100);
await computer.interface.maximizeWindow(windowId);
await computer.interface.minimizeWindow(windowId);
await computer.interface.closeWindow(windowId);
```
</Tab>
</Tabs>
@@ -32,6 +93,7 @@ Precise mouse control and interaction:
<Tabs items={['Python', 'TypeScript']}>
<Tab value="Python">
```python
# Basic clicks
await computer.interface.left_click(x, y) # Left click at coordinates
@@ -50,6 +112,7 @@ Precise mouse control and interaction:
</Tab>
<Tab value="TypeScript">
```typescript
// Basic clicks
await computer.interface.leftClick(x, y); // Left click at coordinates
@@ -75,6 +138,7 @@ Text input and key combinations:
<Tabs items={['Python', 'TypeScript']}>
<Tab value="Python">
```python
# Text input
await computer.interface.type_text("Hello") # Type text
@@ -88,6 +152,7 @@ Text input and key combinations:
</Tab>
<Tab value="TypeScript">
```typescript
// Text input
await computer.interface.typeText("Hello"); // Type text
@@ -108,20 +173,24 @@ Mouse wheel and scrolling control:
<Tabs items={['Python', 'TypeScript']}>
<Tab value="Python">
```python
# Scrolling
await computer.interface.scroll(x, y) # Scroll the mouse wheel
await computer.interface.scroll_down(clicks) # Scroll down await
computer.interface.scroll_up(clicks) # Scroll up
await computer.interface.scroll_down(clicks) # Scroll down
await computer.interface.scroll_up(clicks) # Scroll up
```
</Tab>
<Tab value="TypeScript">
```typescript
// Scrolling
await computer.interface.scroll(x, y); // Scroll the mouse wheel
```typescript
// Scrolling
await computer.interface.scroll(x, y); // Scroll the mouse wheel
await computer.interface.scrollDown(clicks); // Scroll down
await computer.interface.scrollUp(clicks); // Scroll up
await computer.interface.scrollUp(clicks); // Scroll up
```
</Tab>
</Tabs>
@@ -131,21 +200,51 @@ Screen capture and display information:
<Tabs items={['Python', 'TypeScript']}>
<Tab value="Python">
```python
# Screen operations
await computer.interface.screenshot() # Take a screenshot
await computer.interface.get_screen_size() # Get screen dimensions
```python
# Screen operations
await computer.interface.screenshot() # Take a screenshot
await computer.interface.get_screen_size() # Get screen dimensions
```
</Tab>
<Tab value="TypeScript">
```typescript
// Screen operations
await computer.interface.screenshot(); // Take a screenshot
await computer.interface.getScreenSize(); // Get screen dimensions
```
</Tab>
</Tabs>
## Desktop Actions
Control desktop environment features like wallpaper:
<Tabs items={['Python', 'TypeScript']}>
<Tab value="Python">
```python
# Get current desktop environment (e.g., 'xfce4', 'gnome', 'kde', 'mac', 'windows')
env = await computer.interface.get_desktop_environment()
print(env) # "xfce4"
# Set desktop wallpaper to an image file accessible on the VM
await computer.interface.set_wallpaper("/home/cua/shared/wallpaper.png")
```
</Tab>
<Tab value="TypeScript">
```typescript
// Screen operations
await computer.interface.screenshot(); // Take a screenshot
await computer.interface.getScreenSize(); // Get screen dimensions
// Get current desktop environment
const env = await computer.interface.getDesktopEnvironment();
console.log(env); // "xfce4"
// Set desktop wallpaper to an image file accessible on the VM
await computer.interface.setWallpaper('/home/cua/shared/wallpaper.png');
```
</Tab>
</Tabs>
@@ -155,20 +254,20 @@ System clipboard management:
<Tabs items={['Python', 'TypeScript']}>
<Tab value="Python">
```python
# Clipboard operations await
computer.interface.set_clipboard(text) # Set clipboard content await
computer.interface.copy_to_clipboard() # Get clipboard content
```python
# Clipboard operations
await computer.interface.set_clipboard(text) # Set clipboard content
await computer.interface.copy_to_clipboard() # Get clipboard content
```
</Tab>
<Tab value="TypeScript">
```typescript
// Clipboard operations
```typescript
// Clipboard operations
await computer.interface.setClipboard(text); // Set clipboard content
await computer.interface.copyToClipboard(); // Get clipboard content
```
</Tab>
@@ -201,18 +300,19 @@ Direct file and directory manipulation:
</Tab>
<Tab value="TypeScript">
```typescript
# File existence checks
// File existence checks
await computer.interface.fileExists(path); // Check if file exists
await computer.interface.directoryExists(path); // Check if directory exists
# File content operations
// File content operations
await computer.interface.readText(path, "utf-8"); // Read file content
await computer.interface.writeText(path, content, "utf-8"); // Write file content
await computer.interface.readBytes(path); // Read file content as bytes
await computer.interface.writeBytes(path, content); // Write file content as bytes
# File and directory management
// File and directory management
await computer.interface.deleteFile(path); // Delete file
await computer.interface.createDir(path); // Create directory
await computer.interface.deleteDir(path); // Delete directory
@@ -228,20 +328,21 @@ Access system accessibility information:
<Tabs items={['Python', 'TypeScript']}>
<Tab value="Python">
```python
# Get accessibility tree
await computer.interface.get_accessibility_tree()
```python
# Get accessibility tree
await computer.interface.get_accessibility_tree()
```
</Tab>
<Tab value="TypeScript">
```typescript
// Get accessibility tree
await computer.interface.getAccessibilityTree();
```
</Tab>
```typescript
// Get accessibility tree
await computer.interface.getAccessibilityTree();
```
</Tab>
</Tabs>
## Delay Configuration
@@ -250,6 +351,7 @@ Control timing between actions:
<Tabs items={['Python']}>
<Tab value="Python">
```python
# Set default delay between all actions (in seconds)
computer.interface.delay = 0.5 # 500ms delay between actions
@@ -269,6 +371,7 @@ Manage Python environments:
<Tabs items={['Python']}>
<Tab value="Python">
```python
# Virtual environment management
await computer.venv_install("demo_venv", ["requests", "macos-pyxa"]) # Install packages in a virtual environment
@@ -277,4 +380,4 @@ Manage Python environments:
```
</Tab>
</Tabs>
</Tabs>
+22 -8
View File
@@ -10,7 +10,8 @@ pip install "cua-computer[ui]"
```
<Callout title="Note">
For precise control of the computer, we recommend using VNC or Screen Sharing instead of the Computer Gradio UI.
For precise control of the computer, we recommend using VNC or Screen Sharing instead of the
Computer Gradio UI.
</Callout>
### Building and Sharing Demonstrations with Huggingface
@@ -43,8 +44,12 @@ For examples, see [Computer UI Examples](https://github.com/trycua/cua/tree/main
#### 3. Record Your Tasks
<details open>
<summary>View demonstration video</summary>
<video src="https://github.com/user-attachments/assets/de3c3477-62fe-413c-998d-4063e48de176" controls width="600"></video>
<summary>View demonstration video</summary>
<video
src="https://github.com/user-attachments/assets/de3c3477-62fe-413c-998d-4063e48de176"
controls
width="600"
></video>
</details>
Record yourself performing various computer tasks using the UI.
@@ -52,8 +57,12 @@ Record yourself performing various computer tasks using the UI.
#### 4. Save Your Demonstrations
<details open>
<summary>View demonstration video</summary>
<video src="https://github.com/user-attachments/assets/5ad1df37-026a-457f-8b49-922ae805faef" controls width="600"></video>
<summary>View demonstration video</summary>
<video
src="https://github.com/user-attachments/assets/5ad1df37-026a-457f-8b49-922ae805faef"
controls
width="600"
></video>
</details>
Save each task by picking a descriptive name and adding relevant tags (e.g., "office", "web-browsing", "coding").
@@ -65,11 +74,16 @@ Repeat steps 3 and 4 until you have a good amount of demonstrations covering dif
#### 6. Upload to Huggingface
<details open>
<summary>View demonstration video</summary>
<video src="https://github.com/user-attachments/assets/c586d460-3877-4b5f-a736-3248886d2134" controls width="600"></video>
<summary>View demonstration video</summary>
<video
src="https://github.com/user-attachments/assets/c586d460-3877-4b5f-a736-3248886d2134"
controls
width="600"
></video>
</details>
Upload your dataset to Huggingface by:
- Naming it as `{your_username}/{dataset_name}`
- Choosing public or private visibility
- Optionally selecting specific tags to upload only tasks with certain tags
@@ -77,4 +91,4 @@ Upload your dataset to Huggingface by:
#### Examples and Resources
- Example Dataset: [ddupont/test-dataset](https://huggingface.co/datasets/ddupont/test-dataset)
- Find Community Datasets: 🔍 [Browse CUA Datasets on Huggingface](https://huggingface.co/datasets?other=cua)
- Find Community Datasets: 🔍 [Browse CUA Datasets on Huggingface](https://huggingface.co/datasets?other=cua)
+11 -1
View File
@@ -3,7 +3,17 @@ title: Cua Computers
description: Understanding Cua computer types and connection methods
---
<Callout>A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/computer_nb.ipynb" target="_blank">Jupyter Notebook</a> and <a href="https://github.com/trycua/cua/tree/main/examples/computer-example-ts" target="_blank">NodeJS project</a> are available for this documentation.</Callout>
<Callout>
A corresponding{' '}
<a href="https://github.com/trycua/cua/blob/main/notebooks/computer_nb.ipynb" target="_blank">
Jupyter Notebook
</a>{' '}
and{' '}
<a href="https://github.com/trycua/cua/tree/main/examples/computer-example-ts" target="_blank">
NodeJS project
</a>{' '}
are available for this documentation.
</Callout>
Before we can automate apps using AI, we need to first connect to a Computer Server to give the AI a safe environment to execute workflows in.
@@ -3,7 +3,16 @@ title: Sandboxed Python
slug: sandboxed-python
---
<Callout>A corresponding <a href="https://github.com/trycua/cua/blob/main/examples/sandboxed_functions_examples.py" target="_blank">Python example</a> is available for this documentation.</Callout>
<Callout>
A corresponding{' '}
<a
href="https://github.com/trycua/cua/blob/main/examples/sandboxed_functions_examples.py"
target="_blank"
>
Python example
</a>{' '}
is available for this documentation.
</Callout>
You can run Python functions securely inside a sandboxed virtual environment on a remote Cua Computer. This is useful for executing untrusted user code, isolating dependencies, or providing a safe environment for automation tasks.
@@ -15,6 +15,7 @@ This preset usecase uses [Cua Computer](/computer-sdk/computers) to interact wit
## Quickstart
Create a `requirements.txt` file with the following dependencies:
```text
cua-agent
cua-computer
@@ -34,7 +35,7 @@ ANTHROPIC_API_KEY=your-api-key
CUA_API_KEY=sk_cua-api01...
```
Select the environment you want to run the code in (*click on the underlined values in the code to edit them directly!*):
Select the environment you want to run the code in (_click on the underlined values in the code to edit them directly!_):
<Tabs items={['☁️ Cloud', '🐳 Docker', '🍎 Lume', '🪟 Windows Sandbox']}>
<Tab value="☁️ Cloud">
@@ -58,23 +59,21 @@ from computer import Computer, VMProviderType
from dotenv import load_dotenv
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger = logging.getLogger(__name__)
def handle_sigint(sig, frame):
print("\\n\\nExecution interrupted by user. Exiting gracefully...")
exit(0)
print("\\n\\nExecution interrupted by user. Exiting gracefully...")
exit(0)
async def fill_application():
try:
async with Computer(
os_type="linux",
provider_type=VMProviderType.CLOUD,
name="`}<EditableValue placeholder="container-name" />{`",
api_key="`}<EditableValue placeholder="api_key" />{`",
verbosity=logging.INFO,
) as computer:
try:
async with Computer(
os_type="linux",
provider_type=VMProviderType.CLOUD,
name="`}<EditableValue placeholder="container-name" />{`",
api_key="`}<EditableValue placeholder="api_key" />{`",
verbosity=logging.INFO,
) as computer:
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
@@ -124,10 +123,9 @@ async def fill_application():
traceback.print_exc()
raise
def main():
try:
load_dotenv()
try:
load_dotenv()
if "ANTHROPIC_API_KEY" not in os.environ:
raise RuntimeError(
@@ -149,9 +147,9 @@ def main():
logger.error(f"Error running automation: {e}")
traceback.print_exc()
if **name** == "**main**":
main()`}
if __name__ == "__main__":
main()`}
</EditableCodeBlock>
</Tab>
@@ -175,22 +173,20 @@ from computer import Computer, VMProviderType
from dotenv import load_dotenv
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger = logging.getLogger(__name__)
def handle_sigint(sig, frame):
print("\\n\\nExecution interrupted by user. Exiting gracefully...")
exit(0)
print("\\n\\nExecution interrupted by user. Exiting gracefully...")
exit(0)
async def fill_application():
try:
async with Computer(
os_type="macos",
provider_type=VMProviderType.LUME,
name="`}<EditableValue placeholder="container-name" />{`",
verbosity=logging.INFO,
) as computer:
try:
async with Computer(
os_type="macos",
provider_type=VMProviderType.LUME,
name="`}<EditableValue placeholder="container-name" />{`",
verbosity=logging.INFO,
) as computer:
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
@@ -240,10 +236,9 @@ async def fill_application():
traceback.print_exc()
raise
def main():
try:
load_dotenv()
try:
load_dotenv()
if "ANTHROPIC_API_KEY" not in os.environ:
raise RuntimeError(
@@ -259,9 +254,9 @@ def main():
logger.error(f"Error running automation: {e}")
traceback.print_exc()
if **name** == "**main**":
main()`}
if __name__ == "__main__":
main()`}
</EditableCodeBlock>
</Tab>
@@ -283,21 +278,19 @@ from computer import Computer, VMProviderType
from dotenv import load_dotenv
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger = logging.getLogger(__name__)
def handle_sigint(sig, frame):
print("\\n\\nExecution interrupted by user. Exiting gracefully...")
exit(0)
print("\\n\\nExecution interrupted by user. Exiting gracefully...")
exit(0)
async def fill_application():
try:
async with Computer(
os_type="windows",
provider_type=VMProviderType.WINDOWS_SANDBOX,
verbosity=logging.INFO,
) as computer:
try:
async with Computer(
os_type="windows",
provider_type=VMProviderType.WINDOWS_SANDBOX,
verbosity=logging.INFO,
) as computer:
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
@@ -347,10 +340,9 @@ async def fill_application():
traceback.print_exc()
raise
def main():
try:
load_dotenv()
try:
load_dotenv()
if "ANTHROPIC_API_KEY" not in os.environ:
raise RuntimeError(
@@ -366,9 +358,9 @@ def main():
logger.error(f"Error running automation: {e}")
traceback.print_exc()
if **name** == "**main**":
main()`}
if __name__ == "__main__":
main()`}
</EditableCodeBlock>
</Tab>
@@ -392,22 +384,20 @@ from computer import Computer, VMProviderType
from dotenv import load_dotenv
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger = logging.getLogger(__name__)
def handle_sigint(sig, frame):
print("\\n\\nExecution interrupted by user. Exiting gracefully...")
exit(0)
print("\\n\\nExecution interrupted by user. Exiting gracefully...")
exit(0)
async def fill_application():
try:
async with Computer(
os_type="linux",
provider_type=VMProviderType.DOCKER,
name="`}<EditableValue placeholder="container-name" />{`",
verbosity=logging.INFO,
) as computer:
try:
async with Computer(
os_type="linux",
provider_type=VMProviderType.DOCKER,
name="`}<EditableValue placeholder="container-name" />{`",
verbosity=logging.INFO,
) as computer:
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
@@ -457,10 +447,9 @@ async def fill_application():
traceback.print_exc()
raise
def main():
try:
load_dotenv()
try:
load_dotenv()
if "ANTHROPIC_API_KEY" not in os.environ:
raise RuntimeError(
@@ -476,9 +465,9 @@ def main():
logger.error(f"Error running automation: {e}")
traceback.print_exc()
if **name** == "**main**":
main()`}
if __name__ == "__main__":
main()`}
</EditableCodeBlock>
</Tab>
@@ -488,4 +477,4 @@ if __name__ == "__main__":
- Learn more about [Cua computers](/computer-sdk/computers) and [computer commands](/computer-sdk/commands)
- Read about [Agent loops](/agent-sdk/agent-loops), [tools](/agent-sdk/custom-tools), and [supported model providers](/agent-sdk/supported-model-providers/)
- Experiment with different [Models and Providers](/agent-sdk/supported-model-providers/)
- Experiment with different [Models and Providers](/agent-sdk/supported-model-providers/)
@@ -7,42 +7,42 @@ description: List of all commands supported by the Computer Server API (WebSocke
This page lists all supported commands for the Computer Server, available via both WebSocket and REST API endpoints.
| Command | Description |
|---------------------|--------------------------------------------|
| version | Get protocol and package version info |
| run_command | Run a shell command |
| screenshot | Capture a screenshot |
| get_screen_size | Get the screen size |
| get_cursor_position | Get the current mouse cursor position |
| mouse_down | Mouse button down |
| mouse_up | Mouse button up |
| left_click | Left mouse click |
| right_click | Right mouse click |
| double_click | Double mouse click |
| move_cursor | Move mouse cursor to coordinates |
| drag_to | Drag mouse to coordinates |
| drag | Drag mouse by offset |
| key_down | Keyboard key down |
| key_up | Keyboard key up |
| type_text | Type text |
| press_key | Press a single key |
| hotkey | Press a hotkey combination |
| scroll | Scroll the screen |
| scroll_down | Scroll down |
| scroll_up | Scroll up |
| copy_to_clipboard | Copy text to clipboard |
| set_clipboard | Set clipboard content |
| file_exists | Check if a file exists |
| directory_exists | Check if a directory exists |
| list_dir | List files/directories in a directory |
| read_text | Read text from a file |
| write_text | Write text to a file |
| read_bytes | Read bytes from a file |
| write_bytes | Write bytes to a file |
| get_file_size | Get file size |
| delete_file | Delete a file |
| create_dir | Create a directory |
| delete_dir | Delete a directory |
| get_accessibility_tree | Get accessibility tree (if supported) |
| find_element | Find element in accessibility tree |
| diorama_cmd | Run a diorama command (if supported) |
| Command | Description |
| ---------------------- | ------------------------------------- |
| version | Get protocol and package version info |
| run_command | Run a shell command |
| screenshot | Capture a screenshot |
| get_screen_size | Get the screen size |
| get_cursor_position | Get the current mouse cursor position |
| mouse_down | Mouse button down |
| mouse_up | Mouse button up |
| left_click | Left mouse click |
| right_click | Right mouse click |
| double_click | Double mouse click |
| move_cursor | Move mouse cursor to coordinates |
| drag_to | Drag mouse to coordinates |
| drag | Drag mouse by offset |
| key_down | Keyboard key down |
| key_up | Keyboard key up |
| type_text | Type text |
| press_key | Press a single key |
| hotkey | Press a hotkey combination |
| scroll | Scroll the screen |
| scroll_down | Scroll down |
| scroll_up | Scroll up |
| copy_to_clipboard | Copy text to clipboard |
| set_clipboard | Set clipboard content |
| file_exists | Check if a file exists |
| directory_exists | Check if a directory exists |
| list_dir | List files/directories in a directory |
| read_text | Read text from a file |
| write_text | Write text to a file |
| read_bytes | Read bytes from a file |
| write_bytes | Write bytes to a file |
| get_file_size | Get file size |
| delete_file | Delete a file |
| create_dir | Create a directory |
| delete_dir | Delete a directory |
| get_accessibility_tree | Get accessibility tree (if supported) |
| find_element | Find element in accessibility tree |
| diorama_cmd | Run a diorama command (if supported) |
@@ -16,6 +16,7 @@ The Computer Server exposes a single REST endpoint for command execution:
- Returns results as a streaming response (text/event-stream)
### Request Format
```json
{
"command": "<command_name>",
@@ -24,10 +25,12 @@ The Computer Server exposes a single REST endpoint for command execution:
```
### Required Headers (for cloud containers)
- `X-Container-Name`: Name of the container (cloud only)
- `X-API-Key`: API key for authentication (cloud only)
### Example Request (Python)
```python
import requests
@@ -38,6 +41,7 @@ print(resp.text)
```
### Example Request (Cloud)
```python
import requests
@@ -52,7 +56,9 @@ print(resp.text)
```
### Response Format
Streaming text/event-stream with JSON objects, e.g.:
```
data: {"success": true, "content": "..."}
@@ -60,4 +66,5 @@ data: {"success": false, "error": "..."}
```
### Supported Commands
See [Commands Reference](./Commands) for the full list of commands and parameters.
@@ -11,7 +11,9 @@ The Computer Server exposes a WebSocket endpoint for real-time command execution
- `wss://your-container.containers.cloud.trycua.com:8443/ws` (cloud)
### Authentication (Cloud Only)
For cloud containers, you must authenticate immediately after connecting:
```json
{
"command": "authenticate",
@@ -21,10 +23,13 @@ For cloud containers, you must authenticate immediately after connecting:
}
}
```
If authentication fails, the connection is closed.
### Command Format
Send JSON messages:
```json
{
"command": "<command_name>",
@@ -33,6 +38,7 @@ Send JSON messages:
```
### Example (Python)
```python
import websockets
import asyncio
@@ -49,6 +55,7 @@ asyncio.run(main())
```
### Example (Cloud)
```python
import websockets
import asyncio
@@ -74,7 +81,9 @@ asyncio.run(main())
```
### Response Format
Each response is a JSON object:
```json
{
"success": true,
@@ -83,4 +92,5 @@ Each response is a JSON object:
```
### Supported Commands
See [Commands Reference](./Commands) for the full list of commands and parameters.
@@ -6,7 +6,16 @@ github:
- https://github.com/trycua/cua/tree/main/libs/python/computer-server
---
<Callout>A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/computer_server_nb.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.</Callout>
<Callout>
A corresponding{' '}
<a
href="https://github.com/trycua/cua/blob/main/notebooks/computer_server_nb.ipynb"
target="_blank"
>
Jupyter Notebook
</a>{' '}
is available for this documentation.
</Callout>
The Computer Server API reference documentation is currently under development.
@@ -20,4 +20,4 @@ See the [Commands](../computer-sdk/commands) documentation for all supported com
## Sandboxed Python Functions
See the [Sandboxed Python](../computer-sdk/sandboxed-python) documentation for running Python functions securely in isolated environments on a remote Cua Computer.
See the [Sandboxed Python](../computer-sdk/sandboxed-python) documentation for running Python functions securely in isolated environments on a remote Cua Computer.
@@ -18,7 +18,8 @@ lume run ubuntu-noble-vanilla:latest
```
<Callout>
We provide [prebuilt VM images](../lume/prebuilt-images) in our [ghcr registry](https://github.com/orgs/trycua/packages).
We provide [prebuilt VM images](../lume/prebuilt-images) in our [ghcr
registry](https://github.com/orgs/trycua/packages).
</Callout>
### Create a Custom VM
@@ -37,10 +38,11 @@ The actual disk space used by sparse images will be much lower than the logical
## VM Management
lume create &lt;name&gt;
lume create &lt;name&gt;
Create a new macOS or Linux virtual machine.
**Options:**
- `--os <os>` - Operating system to install (macOS or linux, default: macOS)
- `--cpu <cores>` - Number of CPU cores (default: 4)
- `--memory <size>` - Memory size, e.g., 8GB (default: 4GB)
@@ -50,6 +52,7 @@ Create a new macOS or Linux virtual machine.
- `--storage <name>` - VM storage location to use
**Examples:**
```bash
# Create macOS VM with custom specs
lume create my-mac --cpu 6 --memory 16GB --disk-size 100GB
@@ -61,10 +64,11 @@ lume create my-ubuntu --os linux --cpu 2 --memory 8GB
lume create my-sequoia --ipsw latest
```
lume run &lt;name&gt;
lume run &lt;name&gt;
Start and run a virtual machine.
**Options:**
- `--no-display` - Do not start the VNC client app
- `--shared-dir <dir>` - Share directory with VM (format: path[:ro|rw])
- `--mount <path>` - For Linux VMs only, attach a read-only disk image
@@ -75,6 +79,7 @@ Start and run a virtual machine.
- `--storage <name>` - VM storage location to use
**Examples:**
```bash
# Run VM with shared directory
lume run my-vm --shared-dir /path/to/share:rw
@@ -86,42 +91,52 @@ lume run my-vm --no-display
lume run my-mac --recovery-mode true
```
lume stop &lt;name&gt;
lume stop &lt;name&gt;
Stop a running virtual machine.
**Options:**
- `--storage <name>` - VM storage location to use
### lume delete &lt;name&gt;
Delete a virtual machine and its associated files.
**Options:**
- `--force` - Force deletion without confirmation
- `--storage <name>` - VM storage location to use
### lume clone &lt;name&gt; &lt;new-name&gt;
Create a copy of an existing virtual machine.
**Options:**
- `--source-storage <name>` - Source VM storage location
- `--dest-storage <name>` - Destination VM storage location
## VM Information and Configuration
### lume ls
List all virtual machines and their status.
### lume get &lt;name&gt;
Get detailed information about a specific virtual machine.
**Options:**
- `-f, --format <format>` - Output format (json|text)
- `--storage <name>` - VM storage location to use
### lume set &lt;name&gt;
Modify virtual machine configuration.
**Options:**
- `--cpu <cores>` - New number of CPU cores (e.g., 4)
- `--memory <size>` - New memory size (e.g., 8192MB or 8GB)
- `--disk-size <size>` - New disk size (e.g., 40960MB or 40GB)
@@ -129,6 +144,7 @@ Modify virtual machine configuration.
- `--storage <name>` - VM storage location to use
**Examples:**
```bash
# Increase VM memory
lume set my-vm --memory 16GB
@@ -143,20 +159,25 @@ lume set my-vm --cpu 8
## Image Management
### lume images
List available macOS images in local cache.
### lume pull &lt;image&gt;
Download a VM image from a container registry.
**Options:**
- `--registry <url>` - Container registry URL (default: ghcr.io)
- `--organization <org>` - Organization to pull from (default: trycua)
- `--storage <name>` - VM storage location to use
### lume push &lt;name&gt; &lt;image:tag&gt;
Upload a VM image to a container registry.
**Options:**
- `--additional-tags <tags...>` - Additional tags to push the same image to
- `--registry <url>` - Container registry URL (default: ghcr.io)
- `--organization <org>` - Organization/user to push to (default: trycua)
@@ -167,38 +188,46 @@ Upload a VM image to a container registry.
- `--reassemble` - Verify integrity by reassembling chunks (requires --dry-run)
### lume ipsw
Get the latest macOS restore image URL.
### lume prune
Remove cached images to free up disk space.
## Configuration
### lume config
Manage Lume configuration settings.
**Subcommands:**
##### Storage Management
- `lume config storage add <name> <path>` - Add a new VM storage location
- `lume config storage remove <name>` - Remove a VM storage location
- `lume config storage list` - List all VM storage locations
- `lume config storage default <name>` - Set the default VM storage location
##### Cache Management
- `lume config cache get` - Get current cache directory
- `lume config cache set <path>` - Set cache directory
##### Image Caching
- `lume config caching get` - Show current caching status
- `lume config caching set <boolean>` - Enable or disable image caching
## API Server
### lume serve
Start the Lume API server for programmatic access.
**Options:**
- `--port <port>` - Port to listen on (default: 7777)
## Global Options
@@ -206,4 +235,4 @@ Start the Lume API server for programmatic access.
These options are available for all commands:
- `--help` - Show help information
- `--version` - Show version number
- `--version` - Show version number
+21 -28
View File
@@ -13,9 +13,8 @@ http://localhost:7777
```
<Callout type="info">
The HTTP API service runs on port `7777` by default. If you'd like to use a
different port, pass the `--port` option during installation or when running
`lume serve`.
The HTTP API service runs on port `7777` by default. If you'd like to use a different port, pass
the `--port` option during installation or when running `lume serve`.
</Callout>
## Endpoints
@@ -726,15 +725,15 @@ Push a VM to a registry as an image (asynchronous operation).
#### Parameters
| Name | Type | Required | Description |
| ------------ | ------------ | -------- | ----------------------------------------------- |
| name | string | Yes | Local VM name to push |
| imageName | string | Yes | Image name in registry |
| tags | array | Yes | Image tags (e.g. `["latest", "v1"]`) |
| organization | string | Yes | Organization name |
| registry | string | No | Registry host (e.g. `ghcr.io`) |
| chunkSizeMb | integer | No | Chunk size in MB for upload |
| storage | string/null | No | Storage type (`ssd`, etc.) |
| Name | Type | Required | Description |
| ------------ | ----------- | -------- | ------------------------------------ |
| name | string | Yes | Local VM name to push |
| imageName | string | Yes | Image name in registry |
| tags | array | Yes | Image tags (e.g. `["latest", "v1"]`) |
| organization | string | Yes | Organization name |
| registry | string | No | Registry host (e.g. `ghcr.io`) |
| chunkSizeMb | integer | No | Chunk size in MB for upload |
| storage | string/null | No | Storage type (`ssd`, etc.) |
#### Example Request
@@ -747,13 +746,13 @@ curl --connect-timeout 6000 \
-X POST \
-H "Content-Type: application/json" \
-d '{
"name": "my-local-vm",
"name": "my-local-vm",
"imageName": "my-image",
"tags": ["latest", "v1"],
"organization": "my-org",
"organization": "my-org",
"registry": "ghcr.io",
"chunkSizeMb": 512,
"storage": null
"storage": null
}' \
http://localhost:7777/lume/vms/push
```
@@ -808,10 +807,7 @@ console.log(await res.json());
"message": "Push initiated in background",
"name": "my-local-vm",
"imageName": "my-image",
"tags": [
"latest",
"v1"
]
"tags": ["latest", "v1"]
}
```
@@ -857,10 +853,7 @@ console.log(await res.json());
```json
{
"local": [
"macos-sequoia-xcode:latest",
"macos-sequoia-vanilla:latest"
]
"local": ["macos-sequoia-xcode:latest", "macos-sequoia-vanilla:latest"]
}
```
@@ -1005,11 +998,11 @@ Update Lume configuration settings.
#### Parameters
| Name | Type | Required | Description |
| --------------- | ------- | -------- | -------------------------------- |
| homeDirectory | string | No | Lume home directory path |
| cacheDirectory | string | No | Cache directory path |
| cachingEnabled | boolean | No | Enable or disable caching |
| Name | Type | Required | Description |
| -------------- | ------- | -------- | ------------------------- |
| homeDirectory | string | No | Lume home directory path |
| cacheDirectory | string | No | Cache directory path |
| cachingEnabled | boolean | No | Enable or disable caching |
#### Example Request
+1 -1
View File
@@ -5,4 +5,4 @@ github:
- https://github.com/trycua/cua/tree/main/libs/lume
---
Lume is a lightweight Command Line Interface and local API server for creating, running and managing **macOS and Linux virtual machines** with near-native performance on Apple Silicon, using Apple's [Virtualization.Framework](https://developer.apple.com/documentation/virtualization).
Lume is a lightweight Command Line Interface and local API server for creating, running and managing **macOS and Linux virtual machines** with near-native performance on Apple Silicon, using Apple's [Virtualization.Framework](https://developer.apple.com/documentation/virtualization).
@@ -15,10 +15,12 @@ lume run macos-sequoia-vanilla:latest
```
<Callout title="Security Note">
All prebuilt images use the default password `lume`. Change this immediately after your first login using the `passwd` command.
All prebuilt images use the default password `lume`. Change this immediately after your first
login using the `passwd` command.
</Callout>
**System Requirements**:
- Apple Silicon Mac (M1, M2, M3, etc.)
- macOS 13.0 or later
- At least 8GB of RAM (16GB recommended)
@@ -33,6 +35,7 @@ Install with a single command:
```
### Manual Start (No Background Service)
By default, Lume is installed as a background service that starts automatically on login. If you prefer to start the Lume API service manually when needed, you can use the `--no-background-service` option:
```bash
@@ -40,8 +43,11 @@ By default, Lume is installed as a background service that starts automatically
```
<Callout title="Note">
With this option, you'll need to manually start the Lume API service by running `lume serve` in your terminal whenever you need to use tools or libraries that rely on the Lume API (such as the Computer-Use Agent).
With this option, you'll need to manually start the Lume API service by running `lume serve` in
your terminal whenever you need to use tools or libraries that rely on the Lume API (such as the
Computer-Use Agent).
</Callout>
## Manual Download and Installation
You can also download the `lume.pkg.tar.gz` archive from the [latest release](https://github.com/trycua/cua/releases?q=lume&expanded=true), extract it, and install the package manually.
You can also download the `lume.pkg.tar.gz` archive from the [latest release](https://github.com/trycua/cua/releases?q=lume&expanded=true), extract it, and install the package manually.
@@ -5,24 +5,29 @@ title: Prebuilt Images
Pre-built images are available in the registry [ghcr.io/trycua](https://github.com/orgs/trycua/packages). These images come with an SSH server pre-configured and auto-login enabled.
<Callout>
The default password on pre-built images is `lume`. For the security of your VM, change this password after your first login.
The default password on pre-built images is `lume`. For the security of your VM, change this
password after your first login.
</Callout>
## Available Images
The following pre-built images are available to download via `lume pull`:
| Image | Tag | Description | Logical Size |
|-------|------------|-------------|------|
| `macos-sequoia-vanilla` | `latest`, `15.2` | macOS Sequoia 15.2 image | 20GB |
| `macos-sequoia-xcode` | `latest`, `15.2` | macOS Sequoia 15.2 image with Xcode command line tools | 22GB |
| `macos-sequoia-cua` | `latest`, `15.3` | macOS Sequoia 15.3 image compatible with the Computer interface | 24GB |
| `ubuntu-noble-vanilla` | `latest`, `24.04.1` | [Ubuntu Server for ARM 24.04.1 LTS](https://ubuntu.com/download/server/arm) with Ubuntu Desktop | 20GB |
| Image | Tag | Description | Logical Size |
| ----------------------- | ------------------- | ----------------------------------------------------------------------------------------------- | ------------ |
| `macos-sequoia-vanilla` | `latest`, `15.2` | macOS Sequoia 15.2 image | 20GB |
| `macos-sequoia-xcode` | `latest`, `15.2` | macOS Sequoia 15.2 image with Xcode command line tools | 22GB |
| `macos-sequoia-cua` | `latest`, `15.3` | macOS Sequoia 15.3 image compatible with the Computer interface | 24GB |
| `ubuntu-noble-vanilla` | `latest`, `24.04.1` | [Ubuntu Server for ARM 24.04.1 LTS](https://ubuntu.com/download/server/arm) with Ubuntu Desktop | 20GB |
## Disk Space
For additional disk space, resize the VM disk after pulling the image using the `lume set <name> --disk-size <size>` command. Note that the actual disk space used by sparse images will be much lower than the logical size listed.
<Callout>
**Important Note (v0.2.0+):** Images are being re-uploaded with sparse file system optimizations enabled, resulting in significantly lower actual disk usage. Older images (without the `-sparse` suffix) are now **deprecated**. The last version of `lume` fully supporting the non-sparse images was `v0.1.x`. Starting from `v0.2.0`, lume will automatically pull images optimized with sparse file system support.
</Callout>
**Important Note (v0.2.0+):** Images are being re-uploaded with sparse file system optimizations
enabled, resulting in significantly lower actual disk usage. Older images (without the `-sparse`
suffix) are now **deprecated**. The last version of `lume` fully supporting the non-sparse images
was `v0.1.x`. Starting from `v0.2.0`, lume will automatically pull images optimized with sparse
file system support.
</Callout>
@@ -39,4 +39,4 @@ docker build -t yourusername/lumier:custom .
# Push to Docker Hub (after docker login)
docker push yourusername/lumier:custom
```
```
@@ -13,10 +13,10 @@ services:
container_name: lumier-vm
restart: unless-stopped
ports:
- "8006:8006" # Port for VNC access
- '8006:8006' # Port for VNC access
volumes:
- ./storage:/storage # VM persistent storage
- ./shared:/shared # Shared folder accessible in the VM
- ./storage:/storage # VM persistent storage
- ./shared:/shared # Shared folder accessible in the VM
environment:
- VM_NAME=lumier-vm
- VERSION=ghcr.io/trycua/macos-sequoia-cua:latest
@@ -5,6 +5,7 @@ title: Docker
You can use Lumier through Docker:
### Run a macOS VM (ephemeral)
```bash
# Run the container with temporary storage (using pre-built image from Docker Hub)
docker run -it --rm \
@@ -16,12 +17,15 @@ docker run -it --rm \
-e RAM_SIZE=8192 \
trycua/lumier:latest
```
Access the VM in your browser at **http://localhost:8006**.
After running the command above, you can access your macOS VM through a web browser (e.g., http://localhost:8006).
<Callout title="Note">
With the basic setup above, your VM will be reset when you stop the container (ephemeral mode). This means any changes you make inside the macOS VM will be lost. See the section below for how to save your VM state.
With the basic setup above, your VM will be reset when you stop the container (ephemeral mode).
This means any changes you make inside the macOS VM will be lost. See the section below for how to
save your VM state.
</Callout>
## Saving Your VM State
@@ -121,4 +125,4 @@ When running Lumier, you'll need to configure a few things:
- `HOST_STORAGE_PATH`: Path to save VM state (when using persistent storage)
- `HOST_SHARED_PATH`: Path to the shared folder (optional)
- **Background service**: The `lume serve` service should be running on your host (starts automatically when you install Lume using the `install.sh` script above).
- **Background service**: The `lume serve` service should be running on your host (starts automatically when you install Lume using the `install.sh` script above).
+4 -2
View File
@@ -15,7 +15,9 @@ github:
## How It Works
<Callout title="Note">
We're using Docker primarily as a convenient delivery mechanism, not as an isolation layer. Unlike traditional Docker containers, Lumier leverages the Apple Virtualization Framework (Apple Vz) through the `lume` CLI to create true virtual machines.
We're using Docker primarily as a convenient delivery mechanism, not as an isolation layer. Unlike
traditional Docker containers, Lumier leverages the Apple Virtualization Framework (Apple Vz)
through the `lume` CLI to create true virtual machines.
</Callout>
Here's what's happening behind the scenes:
@@ -23,4 +25,4 @@ Here's what's happening behind the scenes:
1. The Docker container provides a consistent environment to run the Lumier interface
2. Lumier connects to the Lume service running on your host Mac
3. Lume uses Apple's Virtualization Framework to create a true macOS virtual machine
4. The VM runs with hardware acceleration using your Mac's native virtualization capabilities
4. The VM runs with hardware acceleration using your Mac's native virtualization capabilities
@@ -7,8 +7,9 @@ Before using Lumier, make sure you have:
1. **Docker for Apple Silicon** - download it [here](https://desktop.docker.com/mac/main/arm64/Docker.dmg) and follow the installation instructions.
2. **Lume** - This is the virtualization CLI that powers Lumier. Install it with this command:
```bash
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)"
```
After installation, Lume runs as a background service and listens on port 7777. This service allows Lumier to create and manage virtual machines. If port 7777 is already in use on your system, you can specify a different port with the `--port` option when running the `install.sh` script.
After installation, Lume runs as a background service and listens on port 7777. This service allows Lumier to create and manage virtual machines. If port 7777 is already in use on your system, you can specify a different port with the `--port` option when running the `install.sh` script.
@@ -17,4 +17,4 @@ To use with Cursor, add an MCP configuration file in one of these locations:
After configuration, you can simply tell Cursor's Agent to perform computer tasks by explicitly mentioning the CUA agent, such as "Use the computer control tools to open Safari."
For more information on MCP with Cursor, see the [official Cursor MCP documentation](https://docs.cursor.com/context/model-context-protocol).
For more information on MCP with Cursor, see the [official Cursor MCP documentation](https://docs.cursor.com/context/model-context-protocol).
@@ -4,7 +4,7 @@ title: Configuration
The server is configured using environment variables (can be set in the Claude Desktop config):
| Variable | Description | Default |
|----------|-------------|---------|
| Variable | Description | Default |
| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------ |
| `CUA_MODEL_NAME` | Model string (e.g., "anthropic/claude-3-5-sonnet-20241022", "openai/computer-use-preview", "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", "omniparser+litellm/gpt-4o", "omniparser+ollama_chat/gemma3") | anthropic/claude-3-5-sonnet-20241022 |
| `CUA_MAX_IMAGES` | Maximum number of images to keep in context | 3 |
| `CUA_MAX_IMAGES` | Maximum number of images to keep in context | 3 |
@@ -6,4 +6,4 @@ github:
- https://github.com/trycua/cua/tree/main/libs/python/mcp-server
---
**cua-mcp-server** is a MCP server for the Computer-Use Agent (CUA), allowing you to run CUA through Claude Desktop or other MCP clients.
**cua-mcp-server** is a MCP server for the Computer-Use Agent (CUA), allowing you to run CUA through Claude Desktop or other MCP clients.
@@ -9,8 +9,9 @@ pip install cua-mcp-server
```
This will install:
- The MCP server
- CUA agent and computer dependencies
- CUA agent and computer dependencies
- An executable `cua-mcp-server` script in your PATH
## Easy Setup Script
@@ -22,6 +23,7 @@ curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/python/mcp-ser
```
This script will:
- Create the ~/.cua directory if it doesn't exist
- Generate a startup script at ~/.cua/start_mcp_server.sh
- Make the script executable
@@ -30,7 +32,7 @@ This script will:
You can then use the script in your MCP configuration like this:
```json
{
{
"mcpServers": {
"cua-agent": {
"command": "/bin/bash",
@@ -48,6 +50,7 @@ You can then use the script in your MCP configuration like this:
If you get a `/bin/bash: ~/cua/libs/python/mcp-server/scripts/start_mcp_server.sh: No such file or directory` error, try changing the path to the script to be absolute instead of relative.
To see the logs:
```
tail -n 20 -f ~/Library/Logs/Claude/mcp*.log
```
```
@@ -1,6 +1,7 @@
---
title: LLM Integrations
---
## LiteLLM Integration
This MCP server features comprehensive liteLLM integration, allowing you to use any supported LLM provider with a simple model string configuration.
@@ -10,7 +11,8 @@ This MCP server features comprehensive liteLLM integration, allowing you to use
- **Extensive Provider Support**: Works with Anthropic, OpenAI, local models, and any liteLLM-compatible provider
### Model String Examples:
- **Anthropic**: `"anthropic/claude-3-5-sonnet-20241022"`
- **OpenAI**: `"openai/computer-use-preview"`
- **UI-TARS**: `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"`
- **Omni + Any LiteLLM**: `"omniparser+litellm/gpt-4o"`, `"omniparser+litellm/claude-3-haiku"`, `"omniparser+ollama_chat/gemma3"`
- **Omni + Any LiteLLM**: `"omniparser+litellm/gpt-4o"`, `"omniparser+litellm/claude-3-haiku"`, `"omniparser+ollama_chat/gemma3"`
@@ -7,4 +7,4 @@ title: Tools
The MCP server exposes the following tools to Claude:
1. `run_cua_task` - Run a single Computer-Use Agent task with the given instruction
2. `run_multi_cua_tasks` - Run multiple tasks in sequence
2. `run_multi_cua_tasks` - Run multiple tasks in sequence
@@ -16,5 +16,6 @@ Claude will automatically use your CUA agent to perform these tasks.
### First-time Usage Notes
**API Keys**: Ensure you have valid API keys:
- Add your Anthropic API key, or other model provider API key in the Claude Desktop config (as shown above)
- Or set it as an environment variable in your shell profile
- Add your Anthropic API key, or other model provider API key in the Claude Desktop config (as shown above)
- Or set it as an environment variable in your shell profile
@@ -5,18 +5,28 @@ title: Configuration
### Detection Parameters
#### Box Threshold (0.3)
Controls the confidence threshold for accepting detections:
<img src="/docs/img/som_box_threshold.png" alt="Illustration of confidence thresholds in object detection, with a high-confidence detection accepted and a low-confidence detection rejected." width="500px" />
- Higher values (0.3) yield more precise but fewer detections
- Lower values (0.01) catch more potential icons but increase false positives
- Default is 0.3 for optimal precision/recall balance
<img
src="/docs/img/som_box_threshold.png"
alt="Illustration of confidence thresholds in object detection, with a high-confidence detection accepted and a low-confidence detection rejected."
width="500px"
/>
- Higher values (0.3) yield more precise but fewer detections - Lower values (0.01) catch more
potential icons but increase false positives - Default is 0.3 for optimal precision/recall balance
#### IOU Threshold (0.1)
Controls how overlapping detections are merged:
<img src="/docs/img/som_iou_threshold.png" alt="Diagram showing Intersection over Union (IOU) with low overlap between two boxes kept separate and high overlap leading to merging." width="500px" />
- Lower values (0.1) more aggressively remove overlapping boxes
- Higher values (0.5) allow more overlapping detections
- Default is 0.1 to handle densely packed UI elements
<img
src="/docs/img/som_iou_threshold.png"
alt="Diagram showing Intersection over Union (IOU) with low overlap between two boxes kept separate and high overlap leading to merging."
width="500px"
/>
- Lower values (0.1) more aggressively remove overlapping boxes - Higher values (0.5) allow more
overlapping detections - Default is 0.1 to handle densely packed UI elements
### OCR Configuration
@@ -37,6 +47,7 @@ Controls how overlapping detections are merged:
### Hardware Acceleration
#### MPS (Metal Performance Shaders)
- Multi-scale detection (640px, 1280px, 1920px)
- Test-time augmentation enabled
- Half-precision (FP16)
@@ -44,6 +55,7 @@ Controls how overlapping detections are merged:
- Best for production use when available
#### CPU
- Single-scale detection (1280px)
- Full-precision (FP32)
- Average detection time: ~1.3s
@@ -63,4 +75,4 @@ examples/output/
│ └── screenshot_analyzed.png
├── screen_details.txt
└── summary.json
```
```
+7 -1
View File
@@ -6,7 +6,13 @@ github:
- https://github.com/trycua/cua/tree/main/libs/python/som
---
<Callout>A corresponding <a href="https://github.com/trycua/cua/blob/main/examples/som_examples.py" target="_blank">Python example</a> is available for this documentation.</Callout>
<Callout>
A corresponding{' '}
<a href="https://github.com/trycua/cua/blob/main/examples/som_examples.py" target="_blank">
Python example
</a>{' '}
is available for this documentation.
</Callout>
## Overview
+9 -5
View File
@@ -35,7 +35,7 @@ You can run your Cua computer in the cloud (recommended for easiest setup), loca
<Tab value="🍎 Lume">
Lume containers are macOS virtual machines that run on a macOS host machine.
1. Install the Lume CLI:
```bash
@@ -51,8 +51,8 @@ You can run your Cua computer in the cloud (recommended for easiest setup), loca
</Tab>
<Tab value="🪟 Windows Sandbox">
Windows Sandbox provides Windows virtual environments that run on a Windows host machine.
Windows Sandbox provides Windows virtual environments that run on a Windows host machine.
1. Enable [Windows Sandbox](https://learn.microsoft.com/en-us/windows/security/application-security/application-isolation/windows-sandbox/windows-sandbox-install) (requires Windows 10 Pro/Enterprise or Windows 11)
2. Install the `pywinsandbox` dependency:
@@ -65,8 +65,8 @@ You can run your Cua computer in the cloud (recommended for easiest setup), loca
</Tab>
<Tab value="🐳 Docker">
Docker provides a way to run Ubuntu containers on any host machine.
Docker provides a way to run Ubuntu containers on any host machine.
1. Install Docker Desktop or Docker Engine:
2. Pull the CUA Ubuntu sandbox:
@@ -173,6 +173,7 @@ Connect to your Cua computer and perform basic interactions, such as taking scre
finally:
await computer.close()
```
</Tab>
<Tab value="TypeScript">
Install the Cua computer TypeScript SDK:
@@ -260,6 +261,7 @@ Connect to your Cua computer and perform basic interactions, such as taking scre
await computer.close();
}
```
</Tab>
</Tabs>
@@ -274,11 +276,13 @@ Learn more about computers in the [Cua computers documentation](/computer-sdk/co
Utilize an Agent to automate complex tasks by providing it with a goal and allowing it to interact with the computer environment.
Install the Cua agent Python SDK:
```bash
pip install "cua-agent[all]"
```
Then, use the `ComputerAgent` object:
```python
from agent import ComputerAgent
+13 -12
View File
@@ -24,6 +24,7 @@ Basic performance metrics and system information that help us understand usage p
### Opt-In Telemetry (Disabled by Default)
**Conversation Trajectory Logging**: Full conversation history including:
- User messages and agent responses
- Computer actions and their outputs
- Reasoning traces from the agent
@@ -123,21 +124,21 @@ Note that telemetry settings must be configured during initialization and cannot
### Computer SDK Events
| Event Name | Data Collected | Trigger Notes |
|------------|----------------|---------------|
| **computer_initialized** | • `os`: Operating system (e.g., 'windows', 'darwin', 'linux')<br />• `os_version`: OS version<br />• `python_version`: Python version | Triggered when a Computer instance is created |
| **module_init** | • `module`: "computer"<br />• `version`: Package version<br />• `python_version`: Full Python version string | Triggered once when the computer package is imported for the first time |
| Event Name | Data Collected | Trigger Notes |
| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------- |
| **computer_initialized** | • `os`: Operating system (e.g., 'windows', 'darwin', 'linux')<br />• `os_version`: OS version<br />• `python_version`: Python version | Triggered when a Computer instance is created |
| **module_init** | • `module`: "computer"<br />• `version`: Package version<br />• `python_version`: Full Python version string | Triggered once when the computer package is imported for the first time |
### Agent SDK Events
| Event Name | Data Collected | Trigger Notes |
|------------|----------------|---------------|
| **module_init** | • `module`: "agent"<br />• `version`: Package version<br />• `python_version`: Full Python version string | Triggered once when the agent package is imported for the first time |
| **agent_session_start** | • `session_id`: Unique UUID for this agent instance<br />• `agent_type`: Class name (e.g., "ComputerAgent")<br />• `model`: Model name (e.g., "claude-3-5-sonnet")<br />• `os`: Operating system<br />• `os_version`: OS version<br />• `python_version`: Python version | Triggered when TelemetryCallback is initialized (agent instantiation) |
| **agent_run_start** | • `session_id`: Agent session UUID<br />• `run_id`: Unique UUID for this run<br />• `start_time`: Unix timestamp<br />• `input_context_size`: Character count of input messages<br />• `num_existing_messages`: Count of existing messages<br />• `uploaded_trajectory`: Full conversation items (opt-in) | Triggered at the start of each agent.run() call |
| **agent_run_end** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `end_time`: Unix timestamp<br />• `duration_seconds`: Total run duration<br />• `num_steps`: Total steps taken in this run<br />• `total_usage`: Accumulated token usage and costs<br />• `uploaded_trajectory`: Full conversation items (opt-in) | Triggered at the end of each agent.run() call |
| **agent_step** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `step`: Step number (incremental)<br />• `timestamp`: Unix timestamp<br />• `duration_seconds`: Duration of previous step | Triggered on each agent response/step during a run |
| **agent_usage** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `step`: Current step number<br />• `prompt_tokens`: Tokens in prompt<br />• `completion_tokens`: Tokens in response<br />• `total_tokens`: Total tokens used<br />• `response_cost`: Cost of this API call | Triggered whenever usage information is received from LLM API |
| Event Name | Data Collected | Trigger Notes |
| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------- |
| **module_init** | • `module`: "agent"<br />• `version`: Package version<br />• `python_version`: Full Python version string | Triggered once when the agent package is imported for the first time |
| **agent_session_start** | • `session_id`: Unique UUID for this agent instance<br />• `agent_type`: Class name (e.g., "ComputerAgent")<br />• `model`: Model name (e.g., "claude-3-5-sonnet")<br />• `os`: Operating system<br />• `os_version`: OS version<br />• `python_version`: Python version | Triggered when TelemetryCallback is initialized (agent instantiation) |
| **agent_run_start** | • `session_id`: Agent session UUID<br />• `run_id`: Unique UUID for this run<br />• `start_time`: Unix timestamp<br />• `input_context_size`: Character count of input messages<br />• `num_existing_messages`: Count of existing messages<br />• `uploaded_trajectory`: Full conversation items (opt-in) | Triggered at the start of each agent.run() call |
| **agent_run_end** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `end_time`: Unix timestamp<br />• `duration_seconds`: Total run duration<br />• `num_steps`: Total steps taken in this run<br />• `total_usage`: Accumulated token usage and costs<br />• `uploaded_trajectory`: Full conversation items (opt-in) | Triggered at the end of each agent.run() call |
| **agent_step** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `step`: Step number (incremental)<br />• `timestamp`: Unix timestamp<br />• `duration_seconds`: Duration of previous step | Triggered on each agent response/step during a run |
| **agent_usage** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `step`: Current step number<br />• `prompt_tokens`: Tokens in prompt<br />• `completion_tokens`: Tokens in response<br />• `total_tokens`: Total tokens used<br />• `response_cost`: Cost of this API call | Triggered whenever usage information is received from LLM API |
## Transparency
+85 -1
View File
@@ -273,15 +273,99 @@ export async function generateMetadata(props: {
if (page.url.includes('api')) title = `${page.data.title} | Cua API Docs`;
if (page.url.includes('guide')) title = ` Guide: ${page.data.title} | Cua Docs`;
// Canonical URL points to cua.ai to consolidate all SEO authority on main domain
const canonicalUrl = `https://cua.ai${page.url}`;
// Extract keywords from the page for SEO
const keywords = [
'computer use agent',
'computer use',
'AI automation',
'visual automation',
page.data.title,
];
// Structured data for better Google indexing (TechArticle schema)
const structuredData = {
'@context': 'https://schema.org',
'@type': 'TechArticle',
headline: page.data.title,
description: page.data.description,
url: canonicalUrl,
publisher: {
'@type': 'Organization',
name: 'Cua',
url: 'https://cua.ai',
logo: {
'@type': 'ImageObject',
url: 'https://cua.ai/cua_logo_black.svg',
},
},
mainEntityOfPage: {
'@type': 'WebPage',
'@id': canonicalUrl,
},
};
// Breadcrumb schema for better site structure understanding
const breadcrumbSchema = {
'@context': 'https://schema.org',
'@type': 'BreadcrumbList',
itemListElement: [
{
'@type': 'ListItem',
position: 1,
name: 'Cua',
item: 'https://cua.ai',
},
{
'@type': 'ListItem',
position: 2,
name: 'Documentation',
item: 'https://cua.ai/docs',
},
{
'@type': 'ListItem',
position: 3,
name: page.data.title,
item: canonicalUrl,
},
],
};
return {
title,
description: page.data.description,
keywords,
authors: [{ name: 'Cua', url: 'https://cua.ai' }],
robots: {
index: true,
follow: true,
googleBot: {
index: true,
follow: true,
'max-image-preview': 'large',
'max-snippet': -1,
},
},
alternates: {
canonical: canonicalUrl,
},
openGraph: {
title,
description: page.data.description,
type: 'article',
siteName: 'Cua Docs',
url: 'https://trycua.com/docs',
url: canonicalUrl,
},
twitter: {
card: 'summary',
title,
description: page.data.description,
creator: '@trycua',
},
other: {
'script:ld+json': JSON.stringify([structuredData, breadcrumbSchema]),
},
};
}
+5 -4
View File
@@ -41,15 +41,15 @@ export const baseOptions: BaseLayoutProps = {
githubUrl: 'https://github.com/trycua/cua',
links: [
{
url: 'https://trycua.com',
text: 'Cua home',
url: 'https://cua.ai',
text: 'Cua Home',
type: 'icon',
icon: <HomeIcon />,
external: false,
external: true,
},
{
url: 'https://discord.com/invite/mVnXXpdE85',
text: 'Cua discord',
text: 'Discord',
type: 'icon',
icon: (
<>
@@ -69,6 +69,7 @@ export const baseOptions: BaseLayoutProps = {
/>
</>
),
external: true,
},
],
};
+32
View File
@@ -0,0 +1,32 @@
import { MetadataRoute } from 'next';
import { source } from '@/lib/source';
export default function sitemap(): MetadataRoute.Sitemap {
  const baseUrl = 'https://cua.ai';

  // All documentation pages known to the fumadocs source.
  const pages = source.getPages();

  // Map every page to a sitemap entry, normalizing URLs under /docs.
  const docPages = pages.map((page) => {
    // Ensure URL starts with /docs
    const url = page.url.startsWith('/docs') ? page.url : `/docs${page.url}`;
    return {
      url: `${baseUrl}${url}`,
      lastModified: new Date(),
      changeFrequency: 'weekly' as const,
      priority: url === '/docs' ? 1.0 : 0.8,
    };
  });

  // Ensure the docs landing page appears exactly once. The previous code
  // appended it unconditionally ("if not included" comment notwithstanding),
  // which could emit a duplicate /docs entry when the source already lists it.
  const docsRootUrl = `${baseUrl}/docs`;
  const entries = docPages.filter((entry) => entry.url !== docsRootUrl);
  entries.unshift({
    url: docsRootUrl,
    lastModified: new Date(),
    changeFrequency: 'weekly' as const,
    priority: 1.0,
  });

  return entries;
}
+154 -10
View File
@@ -1,15 +1,159 @@
export function Footer() {
return (
<footer className="mt-auto border-t border-fd-border py-4">
<div className="container mx-auto px-4 flex justify-end">
<a
href="https://www.cua.ai/cookie-policy"
target="_blank"
rel="noopener noreferrer"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Cookie Policy
</a>
<footer className="mt-auto border-t border-fd-border py-8">
<div className="container mx-auto px-4">
<div className="grid grid-cols-1 md:grid-cols-4 gap-8 mb-6">
{/* Product Links */}
<div>
<h3 className="font-semibold text-sm mb-3 text-fd-foreground">Product</h3>
<ul className="space-y-2">
<li>
<a
href="https://cua.ai"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Home
</a>
</li>
<li>
<a
href="https://cua.ai/pricing"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Pricing
</a>
</li>
<li>
<a
href="https://cua.ai/#features"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Features
</a>
</li>
</ul>
</div>
{/* Documentation Links */}
<div>
<h3 className="font-semibold text-sm mb-3 text-fd-foreground">Documentation</h3>
<ul className="space-y-2">
<li>
<a
href="/docs"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Getting Started
</a>
</li>
<li>
<a
href="/docs/agent-sdk/agent-loops"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Agent Loops
</a>
</li>
<li>
<a
href="/docs/quickstart-devs"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Quick Start
</a>
</li>
</ul>
</div>
{/* Resources Links */}
<div>
<h3 className="font-semibold text-sm mb-3 text-fd-foreground">Resources</h3>
<ul className="space-y-2">
<li>
<a
href="https://cua.ai/blog"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Blog
</a>
</li>
<li>
<a
href="https://github.com/trycua/cua"
target="_blank"
rel="noopener noreferrer"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
GitHub
</a>
</li>
<li>
<a
href="https://discord.com/invite/mVnXXpdE85"
target="_blank"
rel="noopener noreferrer"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Discord Community
</a>
</li>
</ul>
</div>
{/* Company Links */}
<div>
<h3 className="font-semibold text-sm mb-3 text-fd-foreground">Company</h3>
<ul className="space-y-2">
<li>
<a
href="https://cua.ai/about"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
About
</a>
</li>
<li>
<a
href="mailto:hello@trycua.com"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Contact
</a>
</li>
<li>
<a
href="https://cua.ai/cookie-policy"
target="_blank"
rel="noopener noreferrer"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Cookie Policy
</a>
</li>
</ul>
</div>
</div>
{/* Bottom Bar */}
<div className="pt-6 border-t border-fd-border flex flex-col md:flex-row justify-between items-center gap-4">
<p className="text-sm text-fd-muted-foreground">
© {new Date().getFullYear()} Cua. All rights reserved.
</p>
<div className="flex gap-4">
<a
href="https://cua.ai/privacy"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Privacy Policy
</a>
<a
href="https://cua.ai/terms"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Terms of Service
</a>
</div>
</div>
</div>
</footer>
);
+1 -1
View File
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.4.35
current_version = 0.4.37
commit = True
tag = True
tag_name = agent-v{new_version}
+2 -2
View File
@@ -73,8 +73,8 @@ if __name__ == "__main__":
## Docs
- [Agent Loops](https://trycua.com/docs/agent-sdk/agent-loops)
- [Supported Agents](https://trycua.com/docs/agent-sdk/supported-agents)
- [Supported Models](https://trycua.com/docs/agent-sdk/supported-models)
- [Supported Agents](https://trycua.com/docs/agent-sdk/supported-agents/computer-use-agents)
- [Supported Models](https://trycua.com/docs/agent-sdk/supported-model-providers)
- [Chat History](https://trycua.com/docs/agent-sdk/chat-history)
- [Callbacks](https://trycua.com/docs/agent-sdk/callbacks)
- [Custom Tools](https://trycua.com/docs/agent-sdk/custom-tools)
+6 -2
View File
@@ -28,8 +28,12 @@ class AsyncComputerHandler(Protocol):
"""Get screen dimensions as (width, height)."""
...
async def screenshot(self) -> str:
"""Take a screenshot and return as base64 string."""
async def screenshot(self, text: Optional[str] = None) -> str:
"""Take a screenshot and return as base64 string.
Args:
text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
"""
...
async def click(self, x: int, y: int, button: str = "left") -> None:
+6 -2
View File
@@ -36,8 +36,12 @@ class cuaComputerHandler(AsyncComputerHandler):
screen_size = await self.interface.get_screen_size()
return screen_size["width"], screen_size["height"]
async def screenshot(self) -> str:
"""Take a screenshot and return as base64 string."""
async def screenshot(self, text: Optional[str] = None) -> str:
"""Take a screenshot and return as base64 string.
Args:
text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
"""
assert self.interface is not None
screenshot_bytes = await self.interface.screenshot()
return base64.b64encode(screenshot_bytes).decode("utf-8")
+6 -2
View File
@@ -122,8 +122,12 @@ class CustomComputerHandler(AsyncComputerHandler):
return self._last_screenshot_size
async def screenshot(self) -> str:
"""Take a screenshot and return as base64 string."""
async def screenshot(self, text: Optional[str] = None) -> str:
"""Take a screenshot and return as base64 string.
Args:
text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
"""
result = await self._call_function(self.functions["screenshot"])
b64_str = self._to_b64_str(result) # type: ignore
+142 -78
View File
@@ -14,67 +14,73 @@ import litellm
from ..decorators import register_agent
from ..loops.base import AsyncAgentConfig
from ..responses import (
convert_completion_messages_to_responses_items,
convert_responses_items_to_completion_messages,
)
from ..types import AgentCapability, AgentResponse, Messages, Tools
SOM_TOOL_SCHEMA = {
"type": "function",
"name": "computer",
"description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
"parameters": {
"type": "object",
"properties": {
"action": {
"type": "string",
"enum": [
"screenshot",
"click",
"double_click",
"drag",
"type",
"keypress",
"scroll",
"move",
"wait",
"get_current_url",
"get_dimensions",
"get_environment",
],
"description": "The action to perform",
},
"element_id": {
"type": "integer",
"description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)",
},
"start_element_id": {
"type": "integer",
"description": "The ID of the element to start dragging from (required for drag action)",
},
"end_element_id": {
"type": "integer",
"description": "The ID of the element to drag to (required for drag action)",
},
"text": {
"type": "string",
"description": "The text to type (required for type action)",
},
"keys": {
"type": "string",
"description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')",
},
"button": {
"type": "string",
"description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
},
"scroll_x": {
"type": "integer",
"description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
},
"scroll_y": {
"type": "integer",
"description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
"function": {
"name": "computer",
"description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
"parameters": {
"type": "object",
"properties": {
"action": {
"type": "string",
"enum": [
"screenshot",
"click",
"double_click",
"drag",
"type",
"keypress",
"scroll",
"move",
"wait",
"get_current_url",
"get_dimensions",
"get_environment",
],
"description": "The action to perform",
},
"element_id": {
"type": "integer",
"description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)",
},
"start_element_id": {
"type": "integer",
"description": "The ID of the element to start dragging from (required for drag action)",
},
"end_element_id": {
"type": "integer",
"description": "The ID of the element to drag to (required for drag action)",
},
"text": {
"type": "string",
"description": "The text to type (required for type action)",
},
"keys": {
"type": "string",
"description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')",
},
"button": {
"type": "string",
"description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
},
"scroll_x": {
"type": "integer",
"description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
},
"scroll_y": {
"type": "integer",
"description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
},
},
"required": ["action", "element_id"],
},
"required": ["action"],
},
}
@@ -243,18 +249,20 @@ async def replace_computer_call_with_function(
"id": item.get("id"),
"call_id": item.get("call_id"),
"status": "completed",
# Fall back to string representation
"content": f"Used tool: {action_data.get("type")}({json.dumps(fn_args)})",
}
]
elif item_type == "computer_call_output":
# Simple conversion: computer_call_output -> function_call_output
output = item.get("output")
if isinstance(output, dict):
output = [output]
return [
{
"type": "function_call_output",
"call_id": item.get("call_id"),
"content": [item.get("output")],
"output": item.get("output"),
"id": item.get("id"),
"status": "completed",
}
@@ -296,6 +304,13 @@ class OmniparserConfig(AsyncAgentConfig):
llm_model = model.split("+")[-1]
# Get screen dimensions from computer handler
try:
width, height = await computer_handler.get_dimensions()
except Exception:
# Fallback to default dimensions if method fails
width, height = 1024, 768
# Prepare tools for OpenAI API
openai_tools, id2xy = _prepare_tools_for_omniparser(tools)
@@ -309,27 +324,43 @@ class OmniparserConfig(AsyncAgentConfig):
result = parser.parse(image_data)
if _on_screenshot:
await _on_screenshot(result.annotated_image_base64, "annotated_image")
for element in result.elements:
id2xy[element.id] = (
(element.bbox.x1 + element.bbox.x2) / 2,
(element.bbox.y1 + element.bbox.y2) / 2,
)
# handle computer calls -> function calls
new_messages = []
for message in messages:
# Convert OmniParser normalized coordinates (0-1) to absolute pixels, convert to pixels
for element in result.elements:
norm_x = (element.bbox.x1 + element.bbox.x2) / 2
norm_y = (element.bbox.y1 + element.bbox.y2) / 2
pixel_x = int(norm_x * width)
pixel_y = int(norm_y * height)
id2xy[element.id] = (pixel_x, pixel_y)
# Replace the original screenshot with the annotated image
annotated_image_url = f"data:image/png;base64,{result.annotated_image_base64}"
last_computer_call_output["output"]["image_url"] = annotated_image_url
xy2id = {v: k for k, v in id2xy.items()}
messages_with_element_ids = []
for i, message in enumerate(messages):
if not isinstance(message, dict):
message = message.__dict__
new_messages += await replace_computer_call_with_function(message, id2xy) # type: ignore
messages = new_messages
msg_type = message.get("type")
if msg_type == "computer_call" and "action" in message:
action = message.get("action", {})
converted = await replace_computer_call_with_function(message, xy2id) # type: ignore
messages_with_element_ids += converted
completion_messages = convert_responses_items_to_completion_messages(
messages_with_element_ids, allow_images_in_tool_results=False
)
# Prepare API call kwargs
api_kwargs = {
"model": llm_model,
"input": messages,
"messages": completion_messages,
"tools": openai_tools if openai_tools else None,
"stream": stream,
"truncation": "auto",
"num_retries": max_retries,
**kwargs,
}
@@ -340,8 +371,8 @@ class OmniparserConfig(AsyncAgentConfig):
print(str(api_kwargs)[:1000])
# Use liteLLM responses
response = await litellm.aresponses(**api_kwargs)
# Use liteLLM completion
response = await litellm.acompletion(**api_kwargs)
# Call API end hook
if _on_api_end:
@@ -355,12 +386,45 @@ class OmniparserConfig(AsyncAgentConfig):
if _on_usage:
await _on_usage(usage)
# handle som function calls -> xy computer calls
new_output = []
for i in range(len(response.output)): # type: ignore
new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy) # type: ignore
response_dict = response.model_dump() # type: ignore
choice_messages = [choice["message"] for choice in response_dict["choices"]]
responses_items = []
for choice_message in choice_messages:
responses_items.extend(convert_completion_messages_to_responses_items([choice_message]))
return {"output": new_output, "usage": usage}
# Convert element_id → x,y (similar to moondream's convert_computer_calls_desc2xy)
final_output = []
for item in responses_items:
if item.get("type") == "computer_call" and "action" in item:
action = item["action"].copy()
# Handle single element_id
if "element_id" in action:
element_id = action["element_id"]
if element_id in id2xy:
x, y = id2xy[element_id]
action["x"] = x
action["y"] = y
del action["element_id"]
# Handle start_element_id and end_element_id for drag operations
elif "start_element_id" in action and "end_element_id" in action:
start_id = action["start_element_id"]
end_id = action["end_element_id"]
if start_id in id2xy and end_id in id2xy:
start_x, start_y = id2xy[start_id]
end_x, end_y = id2xy[end_id]
action["path"] = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
del action["start_element_id"]
del action["end_element_id"]
converted_item = item.copy()
converted_item["action"] = action
final_output.append(converted_item)
else:
final_output.append(item)
return {"output": final_output, "usage": usage}
async def predict_click(
self, model: str, image_b64: str, instruction: str, **kwargs
+1 -1
View File
@@ -4,7 +4,7 @@ build-backend = "pdm.backend"
[project]
name = "cua-agent"
version = "0.4.35"
version = "0.4.37"
description = "CUA (Computer Use) Agent for AI-driven computer interaction"
readme = "README.md"
authors = [
+84
View File
@@ -0,0 +1,84 @@
"""Pytest configuration and shared fixtures for agent package tests.
This file contains shared fixtures and configuration for all agent tests.
Following SRP: This file ONLY handles test setup/teardown.
"""
from unittest.mock import AsyncMock, MagicMock, Mock, patch
import pytest
@pytest.fixture
def mock_litellm():
    """Patch ``litellm.acompletion`` with a canned async response.

    Keeps tests free of real LLM API calls: the patched coroutine echoes back
    a minimal but well-formed chat-completion payload. Yields the patch mock
    so tests can inspect call arguments.
    """
    with patch("litellm.acompletion") as patched:

        async def _fake_completion(*args, **kwargs):
            """Build a minimal chat-completion dict, echoing the requested model."""
            model_name = kwargs.get("model", "anthropic/claude-3-5-sonnet-20241022")
            return {
                "id": "chatcmpl-test123",
                "object": "chat.completion",
                "created": 1234567890,
                "model": model_name,
                "choices": [
                    {
                        "index": 0,
                        "message": {
                            "role": "assistant",
                            "content": "This is a mocked response for testing.",
                        },
                        "finish_reason": "stop",
                    }
                ],
                "usage": {
                    "prompt_tokens": 10,
                    "completion_tokens": 20,
                    "total_tokens": 30,
                },
            }

        patched.side_effect = _fake_completion
        yield patched
@pytest.fixture
def mock_computer():
    """Provide a fully-mocked Computer so agent logic runs without a real VM.

    The mock exposes an ``interface`` attribute with async screenshot/click/
    type/key methods and supports use as an async context manager.
    """
    fake = AsyncMock()
    fake.interface = AsyncMock()
    fake.interface.screenshot = AsyncMock(return_value=b"fake_screenshot_data")
    fake.interface.left_click = AsyncMock()
    fake.interface.type = AsyncMock()
    fake.interface.key = AsyncMock()
    # Support `async with computer:` — enter returns the mock itself.
    fake.__aenter__ = AsyncMock(return_value=fake)
    fake.__aexit__ = AsyncMock()
    return fake
@pytest.fixture
def disable_telemetry(monkeypatch):
    """Disable telemetry for tests.
    Use this fixture to ensure no telemetry is sent during tests.
    """
    # monkeypatch restores the original environment automatically at teardown,
    # so this opt-out never leaks into other tests or the developer's shell.
    monkeypatch.setenv("CUA_TELEMETRY_DISABLED", "1")
@pytest.fixture
def sample_messages():
    """Return a one-turn user conversation in the agent message format."""
    user_prompt = "Take a screenshot and tell me what you see"
    return [{"role": "user", "content": user_prompt}]
@@ -0,0 +1,139 @@
"""Unit tests for ComputerAgent class.
This file tests ONLY the ComputerAgent initialization and basic functionality.
Following SRP: This file tests ONE class (ComputerAgent).
All external dependencies (liteLLM, Computer) are mocked.
"""
from unittest.mock import AsyncMock, MagicMock, Mock, patch
import pytest
class TestComputerAgentInitialization:
    """Test ComputerAgent initialization (SRP: Only tests initialization)."""

    def test_agent_initialization_with_model(self, disable_telemetry):
        """Test that agent can be initialized with a model string."""
        # Patch litellm via context manager so no provider calls are made.
        with patch("agent.agent.litellm"):
            from agent import ComputerAgent

            model_name = "anthropic/claude-3-5-sonnet-20241022"
            agent = ComputerAgent(model=model_name)
            assert agent is not None
            assert hasattr(agent, "model")
            assert agent.model == model_name

    def test_agent_initialization_with_tools(self, disable_telemetry, mock_computer):
        """Test that agent can be initialized with tools."""
        with patch("agent.agent.litellm"):
            from agent import ComputerAgent

            agent = ComputerAgent(
                model="anthropic/claude-3-5-sonnet-20241022", tools=[mock_computer]
            )
            assert agent is not None
            assert hasattr(agent, "tools")

    def test_agent_initialization_with_max_budget(self, disable_telemetry):
        """Test that agent can be initialized with max trajectory budget."""
        with patch("agent.agent.litellm"):
            from agent import ComputerAgent

            agent = ComputerAgent(
                model="anthropic/claude-3-5-sonnet-20241022", max_trajectory_budget=5.0
            )
            assert agent is not None

    def test_agent_requires_model(self, disable_telemetry):
        """Test that agent requires a model parameter."""
        with patch("agent.agent.litellm"):
            from agent import ComputerAgent

            with pytest.raises(TypeError):
                # Intentionally missing the required `model` argument.
                ComputerAgent()  # type: ignore[call-arg]
class TestComputerAgentRun:
    """Test ComputerAgent.run() method (SRP: Only tests run logic)."""

    @pytest.mark.asyncio
    async def test_agent_run_with_messages(self, disable_telemetry, sample_messages):
        """Test that agent.run() works with valid messages."""
        with patch("agent.agent.litellm") as litellm_mock:
            # Canned liteLLM chat-completion payload.
            canned_response = {
                "id": "chatcmpl-test",
                "choices": [
                    {
                        "message": {"role": "assistant", "content": "Test response"},
                        "finish_reason": "stop",
                    }
                ],
                "usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30},
            }
            litellm_mock.acompletion = AsyncMock(return_value=canned_response)
            from agent import ComputerAgent

            agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
            stream = agent.run(sample_messages)
            assert stream is not None
            # run() should hand back an async generator (exposes __anext__).
            assert hasattr(stream, "__anext__")

    def test_agent_has_run_method(self, disable_telemetry):
        """Test that agent has run method available."""
        from agent import ComputerAgent

        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
        assert hasattr(agent, "run")
        assert callable(agent.run)

    def test_agent_has_agent_loop(self, disable_telemetry):
        """Test that agent has agent_loop initialized."""
        from agent import ComputerAgent

        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
        assert hasattr(agent, "agent_loop")
        assert agent.agent_loop is not None
class TestComputerAgentTypes:
    """Test AgentResponse and Messages types (SRP: Only tests type definitions)."""

    def test_messages_type_exists(self):
        """Test that Messages type is exported."""
        import agent

        assert agent.Messages is not None

    def test_agent_response_type_exists(self):
        """Test that AgentResponse type is exported."""
        import agent

        assert agent.AgentResponse is not None
class TestComputerAgentIntegration:
    """Test ComputerAgent integration with Computer tool (SRP: Integration within package)."""

    def test_agent_accepts_computer_tool(self, disable_telemetry, mock_computer):
        """Test that agent can be initialized with Computer tool."""
        from agent import ComputerAgent

        sut = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022", tools=[mock_computer])

        # Constructor must accept the tool and expose a `tools` attribute.
        assert sut is not None
        assert hasattr(sut, "tools")
+1 -1
View File
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.1.25
current_version = 0.1.28
commit = True
tag = True
tag_name = computer-server-v{new_version}
@@ -85,6 +85,102 @@ class BaseFileHandler(ABC):
pass
class BaseDesktopHandler(ABC):
    """Abstract base class for OS-specific desktop handlers.

    Categories:
    - Wallpaper Actions: Methods for wallpaper operations
    - Desktop shortcut actions: Methods for managing desktop shortcuts

    Concrete implementations (e.g. GenericDesktopHandler) return command
    result dicts carrying a ``success`` flag plus either a payload or an
    ``error`` string.
    """

    # Wallpaper Actions
    @abstractmethod
    async def get_desktop_environment(self) -> Dict[str, Any]:
        """Get the current desktop environment name.

        Returns:
            Result dict; implementations put the name under ``environment``.
        """
        pass

    @abstractmethod
    async def set_wallpaper(self, path: str) -> Dict[str, Any]:
        """Set the desktop wallpaper to the file at path.

        Args:
            path: Path of the image file to use as wallpaper.

        Returns:
            Result dict with ``success`` and, on failure, ``error``.
        """
        pass
class BaseWindowHandler(ABC):
    """Abstract class for OS-specific window management handlers.

    Categories:
    - Window Management: Methods for application/window control

    All methods are async and return command result dicts (``success`` flag
    plus payload or ``error`` string), mirroring the other handler bases in
    this module.  Window IDs are opaque ``str | int`` handles produced by
    the concrete implementation.
    """

    # Window Management
    @abstractmethod
    async def open(self, target: str) -> Dict[str, Any]:
        """Open a file or URL with the default application."""
        pass

    @abstractmethod
    async def launch(self, app: str, args: Optional[List[str]] = None) -> Dict[str, Any]:
        """Launch an application with optional arguments."""
        pass

    @abstractmethod
    async def get_current_window_id(self) -> Dict[str, Any]:
        """Get the currently active window ID."""
        pass

    @abstractmethod
    async def get_application_windows(self, app: str) -> Dict[str, Any]:
        """Get windows belonging to an application (by name or bundle)."""
        pass

    @abstractmethod
    async def get_window_name(self, window_id: str) -> Dict[str, Any]:
        """Get the title/name of a window by ID."""
        pass

    @abstractmethod
    async def get_window_size(self, window_id: str | int) -> Dict[str, Any]:
        """Get the size of a window by ID as {width, height}."""
        pass

    @abstractmethod
    async def activate_window(self, window_id: str | int) -> Dict[str, Any]:
        """Bring a window to the foreground by ID."""
        pass

    @abstractmethod
    async def close_window(self, window_id: str | int) -> Dict[str, Any]:
        """Close a window by ID."""
        pass

    @abstractmethod
    async def get_window_position(self, window_id: str | int) -> Dict[str, Any]:
        """Get the top-left position of a window as {x, y}."""
        pass

    @abstractmethod
    async def set_window_size(
        self, window_id: str | int, width: int, height: int
    ) -> Dict[str, Any]:
        """Set the size of a window by ID."""
        pass

    @abstractmethod
    async def set_window_position(self, window_id: str | int, x: int, y: int) -> Dict[str, Any]:
        """Set the position of a window by ID."""
        pass

    @abstractmethod
    async def maximize_window(self, window_id: str | int) -> Dict[str, Any]:
        """Maximize a window by ID."""
        pass

    @abstractmethod
    async def minimize_window(self, window_id: str | int) -> Dict[str, Any]:
        """Minimize a window by ID."""
        pass
class BaseAutomationHandler(ABC):
"""Abstract base class for OS-specific automation handlers.
@@ -4,7 +4,13 @@ from typing import Tuple, Type
from computer_server.diorama.base import BaseDioramaHandler
from .base import BaseAccessibilityHandler, BaseAutomationHandler, BaseFileHandler
from .base import (
BaseAccessibilityHandler,
BaseAutomationHandler,
BaseDesktopHandler,
BaseFileHandler,
BaseWindowHandler,
)
# Conditionally import platform-specific handlers
system = platform.system().lower()
@@ -17,7 +23,7 @@ elif system == "linux":
elif system == "windows":
from .windows import WindowsAccessibilityHandler, WindowsAutomationHandler
from .generic import GenericFileHandler
from .generic import GenericDesktopHandler, GenericFileHandler, GenericWindowHandler
class HandlerFactory:
@@ -49,9 +55,14 @@ class HandlerFactory:
raise RuntimeError(f"Failed to determine current OS: {str(e)}")
@staticmethod
def create_handlers() -> (
Tuple[BaseAccessibilityHandler, BaseAutomationHandler, BaseDioramaHandler, BaseFileHandler]
):
def create_handlers() -> Tuple[
BaseAccessibilityHandler,
BaseAutomationHandler,
BaseDioramaHandler,
BaseFileHandler,
BaseDesktopHandler,
BaseWindowHandler,
]:
"""Create and return appropriate handlers for the current OS.
Returns:
@@ -70,6 +81,8 @@ class HandlerFactory:
MacOSAutomationHandler(),
MacOSDioramaHandler(),
GenericFileHandler(),
GenericDesktopHandler(),
GenericWindowHandler(),
)
elif os_type == "linux":
return (
@@ -77,6 +90,8 @@ class HandlerFactory:
LinuxAutomationHandler(),
BaseDioramaHandler(),
GenericFileHandler(),
GenericDesktopHandler(),
GenericWindowHandler(),
)
elif os_type == "windows":
return (
@@ -84,6 +99,8 @@ class HandlerFactory:
WindowsAutomationHandler(),
BaseDioramaHandler(),
GenericFileHandler(),
GenericDesktopHandler(),
GenericWindowHandler(),
)
else:
raise NotImplementedError(f"OS '{os_type}' is not supported")
@@ -2,15 +2,26 @@
Generic handlers for all OSes.
Includes:
- DesktopHandler
- FileHandler
"""
import base64
import os
import platform
import subprocess
import webbrowser
from pathlib import Path
from typing import Any, Dict, Optional
from .base import BaseFileHandler
from ..utils import wallpaper
from .base import BaseDesktopHandler, BaseFileHandler, BaseWindowHandler
try:
import pywinctl as pwc
except Exception: # pragma: no cover
pwc = None # type: ignore
def resolve_path(path: str) -> Path:
@@ -25,6 +36,233 @@ def resolve_path(path: str) -> Path:
return Path(path).expanduser().resolve()
# ===== Cross-platform Desktop command handlers =====
class GenericDesktopHandler(BaseDesktopHandler):
    """
    Generic desktop handler providing desktop-related operations.

    Implements:
    - get_desktop_environment: detect current desktop environment
    - set_wallpaper: set desktop wallpaper path
    """

    async def get_desktop_environment(self) -> Dict[str, Any]:
        """
        Get the current desktop environment.

        Returns:
            Dict containing 'success' boolean and either 'environment' string or 'error' string
        """
        try:
            return {"success": True, "environment": wallpaper.get_desktop_environment()}
        except Exception as exc:
            return {"success": False, "error": str(exc)}

    async def set_wallpaper(self, path: str) -> Dict[str, Any]:
        """
        Set the desktop wallpaper to the specified path.

        Args:
            path: The file path to set as wallpaper

        Returns:
            Dict containing 'success' boolean and optionally 'error' string
        """
        try:
            # Expand ~ / relative segments before handing off to the utility.
            resolved = resolve_path(path)
            applied = wallpaper.set_wallpaper(str(resolved))
            return {"success": bool(applied)}
        except Exception as exc:
            return {"success": False, "error": str(exc)}
# ===== Cross-platform window control command handlers =====
class GenericWindowHandler(BaseWindowHandler):
    """
    Cross-platform window management using pywinctl where possible.

    Every method returns a dict with a ``success`` bool; failures carry an
    ``error`` string instead of raising.  When the optional ``pywinctl``
    import failed at module load (``pwc is None``), all window-query methods
    report that as an error rather than crashing.

    NOTE(review): several mutators wrap the pywinctl return value in
    ``bool(...)`` — this assumes those calls return a truthy value on
    success on every platform; confirm against pywinctl per-OS behavior.
    """

    async def open(self, target: str) -> Dict[str, Any]:
        """Open a URL in the browser, or a file/folder with the OS default app."""
        try:
            if target.startswith("http://") or target.startswith("https://"):
                ok = webbrowser.open(target)
                return {"success": bool(ok)}
            path = str(resolve_path(target))
            # `sys` here shadows the `sys` module name locally (it is a string).
            sys = platform.system().lower()
            if sys == "darwin":
                subprocess.Popen(["open", path])
            elif sys == "linux":
                subprocess.Popen(["xdg-open", path])
            elif sys == "windows":
                os.startfile(path)  # type: ignore[attr-defined]
            else:
                return {"success": False, "error": f"Unsupported OS: {sys}"}
            return {"success": True}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def launch(self, app: str, args: Optional[list[str]] = None) -> Dict[str, Any]:
        """Launch an app; with no args the string is run through the shell.

        Returns the spawned process PID on success.
        """
        try:
            if args:
                proc = subprocess.Popen([app, *args])
            else:
                # allow shell command like "libreoffice --writer"
                proc = subprocess.Popen(app, shell=True)
            return {"success": True, "pid": proc.pid}
        except Exception as e:
            return {"success": False, "error": str(e)}

    def _get_window_by_id(self, window_id: int | str) -> Optional[Any]:
        """Linear scan of all windows for a matching native handle; None if absent."""
        if pwc is None:
            raise RuntimeError("pywinctl not available")
        # Find by native handle among Window objects; getAllWindowsDict keys are titles
        try:
            for w in pwc.getAllWindows():
                # Handles may be ints or strings depending on platform; compare as str.
                if str(w.getHandle()) == str(window_id):
                    return w
            return None
        except Exception:
            return None

    async def get_current_window_id(self) -> Dict[str, Any]:
        """Return the native handle of the currently focused window."""
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            win = pwc.getActiveWindow()
            if not win:
                return {"success": False, "error": "No active window"}
            return {"success": True, "window_id": win.getHandle()}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def get_application_windows(self, app: str) -> Dict[str, Any]:
        """List window handles whose title contains *app* (case-insensitive)."""
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            wins = pwc.getWindowsWithTitle(app, condition=pwc.Re.CONTAINS, flags=pwc.Re.IGNORECASE)
            ids = [w.getHandle() for w in wins]
            return {"success": True, "windows": ids}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def get_window_name(self, window_id: int | str) -> Dict[str, Any]:
        """Return the title of the window with the given handle."""
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            w = self._get_window_by_id(window_id)
            if not w:
                return {"success": False, "error": "Window not found"}
            return {"success": True, "name": w.title}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def get_window_size(self, window_id: int | str) -> Dict[str, Any]:
        """Return the window size as {'width', 'height'} in pixels."""
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            w = self._get_window_by_id(window_id)
            if not w:
                return {"success": False, "error": "Window not found"}
            width, height = w.size
            return {"success": True, "width": int(width), "height": int(height)}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def get_window_position(self, window_id: int | str) -> Dict[str, Any]:
        """Return the window top-left corner as {'x', 'y'} in screen coordinates."""
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            w = self._get_window_by_id(window_id)
            if not w:
                return {"success": False, "error": "Window not found"}
            x, y = w.position
            return {"success": True, "x": int(x), "y": int(y)}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def set_window_size(
        self, window_id: int | str, width: int, height: int
    ) -> Dict[str, Any]:
        """Resize the window to width x height pixels."""
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            w = self._get_window_by_id(window_id)
            if not w:
                return {"success": False, "error": "Window not found"}
            ok = w.resizeTo(int(width), int(height))
            return {"success": bool(ok)}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def set_window_position(self, window_id: int | str, x: int, y: int) -> Dict[str, Any]:
        """Move the window so its top-left corner is at (x, y)."""
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            w = self._get_window_by_id(window_id)
            if not w:
                return {"success": False, "error": "Window not found"}
            ok = w.moveTo(int(x), int(y))
            return {"success": bool(ok)}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def maximize_window(self, window_id: int | str) -> Dict[str, Any]:
        """Maximize the window with the given handle."""
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            w = self._get_window_by_id(window_id)
            if not w:
                return {"success": False, "error": "Window not found"}
            ok = w.maximize()
            return {"success": bool(ok)}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def minimize_window(self, window_id: int | str) -> Dict[str, Any]:
        """Minimize the window with the given handle."""
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            w = self._get_window_by_id(window_id)
            if not w:
                return {"success": False, "error": "Window not found"}
            ok = w.minimize()
            return {"success": bool(ok)}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def activate_window(self, window_id: int | str) -> Dict[str, Any]:
        """Bring the window to the foreground and give it focus."""
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            w = self._get_window_by_id(window_id)
            if not w:
                return {"success": False, "error": "Window not found"}
            ok = w.activate()
            return {"success": bool(ok)}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def close_window(self, window_id: int | str) -> Dict[str, Any]:
        """Close the window with the given handle."""
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            w = self._get_window_by_id(window_id)
            if not w:
                return {"success": False, "error": "Window not found"}
            ok = w.close()
            return {"success": bool(ok)}
        except Exception as e:
            return {"success": False, "error": str(e)}
# ===== Cross-platform file system command handlers =====
class GenericFileHandler(BaseFileHandler):
"""
Generic file handler that provides file system operations for all operating systems.
@@ -75,9 +75,14 @@ except Exception:
except Exception:
package_version = "unknown"
accessibility_handler, automation_handler, diorama_handler, file_handler = (
HandlerFactory.create_handlers()
)
(
accessibility_handler,
automation_handler,
diorama_handler,
file_handler,
desktop_handler,
window_handler,
) = HandlerFactory.create_handlers()
handlers = {
"version": lambda: {"protocol": protocol_version, "package": package_version},
# App-Use commands
@@ -99,6 +104,23 @@ handlers = {
"delete_file": file_handler.delete_file,
"create_dir": file_handler.create_dir,
"delete_dir": file_handler.delete_dir,
# Desktop commands
"get_desktop_environment": desktop_handler.get_desktop_environment,
"set_wallpaper": desktop_handler.set_wallpaper,
# Window management
"open": window_handler.open,
"launch": window_handler.launch,
"get_current_window_id": window_handler.get_current_window_id,
"get_application_windows": window_handler.get_application_windows,
"get_window_name": window_handler.get_window_name,
"get_window_size": window_handler.get_window_size,
"get_window_position": window_handler.get_window_position,
"set_window_size": window_handler.set_window_size,
"set_window_position": window_handler.set_window_position,
"maximize_window": window_handler.maximize_window,
"minimize_window": window_handler.minimize_window,
"activate_window": window_handler.activate_window,
"close_window": window_handler.close_window,
# Mouse commands
"mouse_down": automation_handler.mouse_down,
"mouse_up": automation_handler.mouse_up,
@@ -0,0 +1,3 @@
from . import wallpaper
__all__ = ["wallpaper"]
@@ -0,0 +1,321 @@
"""Set the desktop wallpaper."""
import os
import subprocess
import sys
from pathlib import Path
def get_desktop_environment() -> str:
    """
    Returns the name of the current desktop environment.
    """
    # Detection strategy adapted from https://stackoverflow.com/a/21213358/2624876,
    # which in turn collects tricks from:
    # http://stackoverflow.com/questions/2035657/what-is-my-current-desktop-environment
    # and http://ubuntuforums.org/showthread.php?t=652320
    # and http://ubuntuforums.org/showthread.php?t=1139057
    if sys.platform in ["win32", "cygwin"]:
        return "windows"
    if sys.platform == "darwin":
        return "mac"

    # Most likely a POSIX system (or something less common): inspect env vars.
    session = os.environ.get("DESKTOP_SESSION")
    if session is not None:
        # Lower-case once so every comparison below is case-insensitive.
        session = session.lower()
        recognized = (
            "gnome", "unity", "cinnamon", "mate", "xfce4", "lxde", "fluxbox",
            "blackbox", "openbox", "icewm", "jwm", "afterstep", "trinity", "kde",
        )
        if session in recognized:
            return session
        # Special cases: Canonical sets $DESKTOP_SESSION to distro-flavored
        # names (Lubuntu rather than LXDE, etc.), so map those back.
        if "xfce" in session or session.startswith("xubuntu"):
            return "xfce4"
        if session.startswith("ubuntustudio"):
            return "kde"
        if session.startswith("ubuntu"):
            return "gnome"
        if session.startswith("lubuntu"):
            return "lxde"
        if session.startswith("kubuntu"):
            return "kde"
        if session.startswith("razor"):  # e.g. razorkwin
            return "razor-qt"
        if session.startswith("wmaker"):  # e.g. wmaker-common
            return "windowmaker"

    gnome_session_id = os.environ.get("GNOME_DESKTOP_SESSION_ID")
    if os.environ.get("KDE_FULL_SESSION") == "true":
        return "kde"
    elif gnome_session_id:
        # A GNOME session id containing "deprecated" deliberately falls
        # through to "unknown" (and skips the process probes below).
        if "deprecated" not in gnome_session_id:
            return "gnome2"
    # From http://ubuntuforums.org/showthread.php?t=652320
    elif is_running("xfce-mcs-manage"):
        return "xfce4"
    elif is_running("ksmserver"):
        return "kde"
    return "unknown"
def is_running(process: str) -> bool:
    """Returns whether a process with the given name is (likely) currently running.

    Uses a basic substring search over the output of ``ps axw`` (POSIX) or,
    if that command is unavailable, ``tasklist /v`` (Windows), so false
    positives are possible.

    Args:
        process: Substring to look for in the process listing.

    Returns:
        True if any line of the listing contains *process*; False otherwise,
        including when neither listing command can be executed.
    """
    # Original approach from
    # http://www.bloggerpolis.com/2011/05/how-to-check-if-a-process-is-running-using-python/
    # and http://richarddingwall.name/2009/06/18/windows-equivalents-of-ps-and-kill-commands/
    # Fixes over the original: no bare `except:` (which also swallowed
    # KeyboardInterrupt), and subprocess.run() waits for and closes the
    # child pipe instead of leaking a zombie Popen.
    for cmd in (["ps", "axw"], ["tasklist", "/v"]):
        try:
            listing = subprocess.run(cmd, stdout=subprocess.PIPE, check=False)
        except OSError:
            # Command not present on this platform; try the next one.
            continue
        # Match the original's `process in str(line)` semantics on raw bytes.
        return any(process in str(line) for line in listing.stdout.splitlines())
    return False
def set_wallpaper(file_loc: str, first_run: bool = True):
    """Sets the wallpaper to the given file location.

    Dispatches on the detected desktop environment and drives whatever
    command-line tool or API that environment provides.  Most helpers are
    fire-and-forget ``subprocess.Popen`` calls, so a ``True`` return does
    not guarantee the wallpaper actually changed.

    Args:
        file_loc: Path of the image file to use as wallpaper.
        first_run: When False, skips the razor-qt config rewrite and the
            "unsupported environment" warning (avoids repeating them on
            subsequent calls).

    Returns:
        False if the desktop environment is unsupported, True otherwise.
    """
    # From https://stackoverflow.com/a/21213504/2624876
    # I have not personally tested most of this. -- @1j01
    # -----------------------------------------
    # Note: There are two common Linux desktop environments where
    # I have not been able to set the desktop background from
    # command line: KDE, Enlightenment
    desktop_env = get_desktop_environment()
    if desktop_env in ["gnome", "unity", "cinnamon"]:
        # Tested on Ubuntu 22 -- @1j01
        uri = Path(file_loc).as_uri()
        SCHEMA = "org.gnome.desktop.background"
        KEY = "picture-uri"
        # Needed for Ubuntu 22 in dark mode
        # Might be better to set only one or the other, depending on the current theme
        # In the settings it will say "This background selection only applies to the dark style"
        # even if it's set for both, arguably referring to the selection that you can make on that page.
        # -- @1j01
        KEY_DARK = "picture-uri-dark"
        try:
            # Prefer the GObject introspection API when available.
            from gi.repository import Gio  # type: ignore

            gsettings = Gio.Settings.new(SCHEMA)  # type: ignore
            gsettings.set_string(KEY, uri)
            gsettings.set_string(KEY_DARK, uri)
        except Exception:
            # Fallback tested on Ubuntu 22 -- @1j01
            args = ["gsettings", "set", SCHEMA, KEY, uri]
            subprocess.Popen(args)
            args = ["gsettings", "set", SCHEMA, KEY_DARK, uri]
            subprocess.Popen(args)
    elif desktop_env == "mate":
        try:  # MATE >= 1.6
            # info from http://wiki.mate-desktop.org/docs:gsettings
            args = ["gsettings", "set", "org.mate.background", "picture-filename", file_loc]
            subprocess.Popen(args)
        except Exception:  # MATE < 1.6
            # From https://bugs.launchpad.net/variety/+bug/1033918
            args = [
                "mateconftool-2",
                "-t",
                "string",
                "--set",
                "/desktop/mate/background/picture_filename",
                file_loc,
            ]
            subprocess.Popen(args)
    elif desktop_env == "gnome2":  # Not tested
        # From https://bugs.launchpad.net/variety/+bug/1033918
        args = [
            "gconftool-2",
            "-t",
            "string",
            "--set",
            "/desktop/gnome/background/picture_filename",
            file_loc,
        ]
        subprocess.Popen(args)
    ## KDE4 is difficult
    ## see http://blog.zx2c4.com/699 for a solution that might work
    elif desktop_env in ["kde3", "trinity"]:
        # From http://ubuntuforums.org/archive/index.php/t-803417.html
        args = ["dcop", "kdesktop", "KBackgroundIface", "setWallpaper", "0", file_loc, "6"]
        subprocess.Popen(args)
    elif desktop_env == "xfce4":
        # Iterate over all wallpaper-related keys and set to file_loc
        try:
            list_proc = subprocess.run(
                ["xfconf-query", "-c", "xfce4-desktop", "-l"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=False,
            )
            keys = []
            if list_proc.stdout:
                for line in list_proc.stdout.splitlines():
                    line = line.strip()
                    if not line:
                        continue
                    # Common keys: .../last-image and .../image-path
                    if "/last-image" in line or "/image-path" in line:
                        keys.append(line)
            # Fallback: known defaults if none were listed
            if not keys:
                keys = [
                    "/backdrop/screen0/monitorVNC-0/workspace0/last-image",
                    "/backdrop/screen0/monitor0/image-path",
                ]
            for key in keys:
                subprocess.run(
                    [
                        "xfconf-query",
                        "-c",
                        "xfce4-desktop",
                        "-p",
                        key,
                        "-s",
                        file_loc,
                    ],
                    check=False,
                )
        except Exception:
            # Best-effort: ignore xfconf failures and still try a reload below.
            pass
        # Reload xfdesktop to apply changes
        subprocess.Popen(["xfdesktop", "--reload"])
    elif desktop_env == "razor-qt":  # TODO: implement reload of desktop when possible
        if first_run:
            import configparser

            desktop_conf = configparser.ConfigParser()
            # Development version
            desktop_conf_file = os.path.join(get_config_dir("razor"), "desktop.conf")
            if os.path.isfile(desktop_conf_file):
                config_option = R"screens\1\desktops\1\wallpaper"
            else:
                desktop_conf_file = os.path.join(get_home_dir(), ".razor/desktop.conf")
                config_option = R"desktops\1\wallpaper"
            desktop_conf.read(os.path.join(desktop_conf_file))
            try:
                if desktop_conf.has_option("razor", config_option):  # only replacing a value
                    desktop_conf.set("razor", config_option, file_loc)
                    with open(desktop_conf_file, "w", encoding="utf-8", errors="replace") as f:
                        desktop_conf.write(f)
            except Exception:
                pass
        else:
            # TODO: reload desktop when possible
            pass
    elif desktop_env in ["fluxbox", "jwm", "openbox", "afterstep"]:
        # http://fluxbox-wiki.org/index.php/Howto_set_the_background
        # used fbsetbg on jwm too since I am too lazy to edit the XML configuration
        # now where fbsetbg does the job excellent anyway.
        # and I have not figured out how else it can be set on Openbox and AfterSTep
        # but fbsetbg works excellent here too.
        try:
            args = ["fbsetbg", file_loc]
            subprocess.Popen(args)
        except Exception:
            sys.stderr.write("ERROR: Failed to set wallpaper with fbsetbg!\n")
            sys.stderr.write("Please make sre that You have fbsetbg installed.\n")
    elif desktop_env == "icewm":
        # command found at http://urukrama.wordpress.com/2007/12/05/desktop-backgrounds-in-window-managers/
        args = ["icewmbg", file_loc]
        subprocess.Popen(args)
    elif desktop_env == "blackbox":
        # command found at http://blackboxwm.sourceforge.net/BlackboxDocumentation/BlackboxBackground
        args = ["bsetbg", "-full", file_loc]
        subprocess.Popen(args)
    elif desktop_env == "lxde":
        args = ["pcmanfm", "--set-wallpaper", file_loc, "--wallpaper-mode=scaled"]
        subprocess.Popen(args)
    elif desktop_env == "windowmaker":
        # From http://www.commandlinefu.com/commands/view/3857/set-wallpaper-on-windowmaker-in-one-line
        args = ["wmsetbg", "-s", "-u", file_loc]
        subprocess.Popen(args)
    # elif desktop_env == "enlightenment": # I have not been able to make it work on e17. On e16 it would have been something in this direction
    # args = ["enlightenment_remote", "-desktop-bg-add", "0", "0", "0", "0", file_loc]
    # subprocess.Popen(args)
    elif desktop_env == "windows":
        # From https://stackoverflow.com/questions/1977694/change-desktop-background
        # Tested on Windows 10. -- @1j01
        import ctypes

        SPI_SETDESKWALLPAPER = 20
        ctypes.windll.user32.SystemParametersInfoW(SPI_SETDESKWALLPAPER, 0, file_loc, 0)  # type: ignore
    elif desktop_env == "mac":
        # From https://stackoverflow.com/questions/431205/how-can-i-programatically-change-the-background-in-mac-os-x
        try:
            # Tested on macOS 10.14.6 (Mojave) -- @1j01
            assert (
                sys.platform == "darwin"
            )  # ignore `Import "appscript" could not be resolved` for other platforms
            from appscript import app, mactypes

            app("Finder").desktop_picture.set(mactypes.File(file_loc))
        except ImportError:
            # Tested on macOS 10.14.6 (Mojave) -- @1j01
            # import subprocess
            # SCRIPT = f"""/usr/bin/osascript<<END
            # tell application "Finder" to set desktop picture to POSIX file "{file_loc}"
            # END"""
            # subprocess.Popen(SCRIPT, shell=True)
            # Safer version, avoiding string interpolation,
            # to protect against command injection (both in the shell and in AppleScript):
            OSASCRIPT = """
            on run (clp)
                if clp's length is not 1 then error "Incorrect Parameters"
                local file_loc
                set file_loc to clp's item 1
                tell application "Finder" to set desktop picture to POSIX file file_loc
            end run
            """
            subprocess.Popen(["osascript", "-e", OSASCRIPT, "--", file_loc])
    else:
        if first_run:  # don't spam the user with the same message over and over again
            sys.stderr.write(
                "Warning: Failed to set wallpaper. Your desktop environment is not supported."
            )
            sys.stderr.write(f"You can try manually to set your wallpaper to {file_loc}")
        return False
    return True
def get_config_dir(app_name: str) -> str:
    """Returns the configuration directory for the given application name."""
    # Preference order: $XDG_CONFIG_HOME (Linux), %APPDATA% (Windows),
    # the python-xdg library, then the ~/.config fallback.
    config_home = os.environ.get("XDG_CONFIG_HOME")
    if config_home is None:
        config_home = os.environ.get("APPDATA")  # On Windows
    if config_home is None:
        try:
            from xdg import BaseDirectory

            config_home = BaseDirectory.xdg_config_home
        except ImportError:  # Most likely a Linux/Unix system anyway
            config_home = os.path.join(get_home_dir(), ".config")
    return os.path.join(config_home, app_name)
def get_home_dir() -> str:
    """Returns the home directory of the current user."""
    home = os.path.expanduser("~")
    return home
+10 -2
View File
@@ -4,7 +4,7 @@ build-backend = "pdm.backend"
[project]
name = "cua-computer-server"
version = "0.1.25"
version = "0.1.28"
description = "Server component for the Computer-Use Interface (CUI) framework powering Cua"
authors = [
@@ -22,7 +22,15 @@ dependencies = [
"pillow>=10.2.0",
"aiohttp>=3.9.1",
"pyperclip>=1.9.0",
"websockets>=12.0"
"websockets>=12.0",
"pywinctl>=0.4.1",
# OS-specific runtime deps
"pyobjc-framework-Cocoa>=10.1; sys_platform == 'darwin'",
"pyobjc-framework-Quartz>=10.1; sys_platform == 'darwin'",
"pyobjc-framework-ApplicationServices>=10.1; sys_platform == 'darwin'",
"python-xlib>=0.33; sys_platform == 'linux'",
"pywin32>=310; sys_platform == 'win32'",
"pip-system-certs; sys_platform == 'win32'",
]
[project.optional-dependencies]
@@ -0,0 +1,47 @@
"""Pytest configuration and shared fixtures for computer-server package tests.
This file contains shared fixtures and configuration for all computer-server tests.
Following SRP: This file ONLY handles test setup/teardown.
"""
from unittest.mock import AsyncMock, Mock, patch
import pytest
@pytest.fixture
def mock_websocket():
    """Mock WebSocket connection for testing.

    Use this fixture to test WebSocket logic without real connections.
    """
    ws = AsyncMock()
    # Explicit AsyncMocks for the coroutine methods the server exercises.
    for method in ("send", "recv", "close"):
        setattr(ws, method, AsyncMock())
    return ws
@pytest.fixture
def mock_computer_interface():
    """Mock computer interface for server tests.

    Use this fixture to test server logic without real computer operations.
    """
    iface = AsyncMock()
    iface.screenshot = AsyncMock(return_value=b"fake_screenshot")
    # Input-action coroutines just need to be awaitable no-ops.
    for action in ("left_click", "type", "key"):
        setattr(iface, action, AsyncMock())
    return iface
@pytest.fixture
def disable_telemetry(monkeypatch):
    """Disable telemetry for tests.

    Use this fixture to ensure no telemetry is sent during tests.
    monkeypatch restores the environment automatically after each test.
    """
    # presumably read by the cua telemetry module at import/run time -- confirm consumer
    monkeypatch.setenv("CUA_TELEMETRY_DISABLED", "1")
@@ -0,0 +1,40 @@
"""Unit tests for computer-server package.
This file tests ONLY basic server functionality.
Following SRP: This file tests server initialization and basic operations.
All external dependencies are mocked.
"""
from unittest.mock import AsyncMock, Mock, patch
import pytest
class TestServerImports:
    """Test server module imports (SRP: Only tests imports)."""

    def test_server_module_exists(self):
        """Test that server module can be imported."""
        # importorskip skips (rather than fails) when the package is absent,
        # matching the original try/except ImportError behavior.
        computer_server = pytest.importorskip("computer_server")
        assert computer_server is not None
class TestServerInitialization:
    """Test server initialization (SRP: Only tests initialization)."""

    @pytest.mark.asyncio
    async def test_server_can_be_imported(self):
        """Basic smoke test: verify server components can be imported."""
        import importlib

        try:
            server = importlib.import_module("computer_server.server")
            assert server is not None
        except ImportError:
            pytest.skip("Server module not available")
        except Exception as e:
            # Some initialization errors are acceptable in unit tests
            pytest.skip(f"Server initialization requires specific setup: {e}")
+1 -1
View File
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.4.7
current_version = 0.4.11
commit = True
tag = True
tag_name = computer-v{new_version}
@@ -436,6 +436,189 @@ class BaseComputerInterface(ABC):
"""
pass
# Desktop actions
@abstractmethod
async def get_desktop_environment(self) -> str:
"""Get the current desktop environment.
Returns:
The name of the current desktop environment.
"""
pass
@abstractmethod
async def set_wallpaper(self, path: str) -> None:
"""Set the desktop wallpaper to the specified path.
Args:
path: The file path to set as wallpaper
"""
pass
# Window management
@abstractmethod
async def open(self, target: str) -> None:
"""Open a target using the system's default handler.
Typically opens files, folders, or URLs with the associated application.
Args:
target: The file path, folder path, or URL to open.
"""
pass
@abstractmethod
async def launch(self, app: str, args: List[str] | None = None) -> Optional[int]:
"""Launch an application with optional arguments.
Args:
app: The application executable or bundle identifier.
args: Optional list of arguments to pass to the application.
Returns:
Optional process ID (PID) of the launched application if available, otherwise None.
"""
pass
@abstractmethod
async def get_current_window_id(self) -> int | str:
"""Get the identifier of the currently active/focused window.
Returns:
A window identifier that can be used with other window management methods.
"""
pass
@abstractmethod
async def get_application_windows(self, app: str) -> List[int | str]:
"""Get all window identifiers for a specific application.
Args:
app: The application name, executable, or identifier to query.
Returns:
A list of window identifiers belonging to the specified application.
"""
pass
@abstractmethod
async def get_window_name(self, window_id: int | str) -> str:
"""Get the title/name of a window.
Args:
window_id: The window identifier.
Returns:
The window's title or name string.
"""
pass
@abstractmethod
async def get_window_size(self, window_id: int | str) -> tuple[int, int]:
"""Get the size of a window in pixels.
Args:
window_id: The window identifier.
Returns:
A tuple of (width, height) representing the window size in pixels.
"""
pass
@abstractmethod
async def get_window_position(self, window_id: int | str) -> tuple[int, int]:
"""Get the screen position of a window.
Args:
window_id: The window identifier.
Returns:
A tuple of (x, y) representing the window's top-left corner in screen coordinates.
"""
pass
@abstractmethod
async def set_window_size(self, window_id: int | str, width: int, height: int) -> None:
"""Set the size of a window in pixels.
Args:
window_id: The window identifier.
width: Desired width in pixels.
height: Desired height in pixels.
"""
pass
@abstractmethod
async def set_window_position(self, window_id: int | str, x: int, y: int) -> None:
"""Move a window to a specific position on the screen.
Args:
window_id: The window identifier.
x: X coordinate for the window's top-left corner.
y: Y coordinate for the window's top-left corner.
"""
pass
@abstractmethod
async def maximize_window(self, window_id: int | str) -> None:
"""Maximize a window.
Args:
window_id: The window identifier.
"""
pass
@abstractmethod
async def minimize_window(self, window_id: int | str) -> None:
    """Minimize a window.

    Args:
        window_id: The window identifier.
    """
    pass
@abstractmethod
async def activate_window(self, window_id: int | str) -> None:
    """Bring a window to the foreground and focus it.

    Args:
        window_id: The window identifier.
    """
    pass
@abstractmethod
async def close_window(self, window_id: int | str) -> None:
    """Close a window.

    Args:
        window_id: The window identifier.
    """
    pass
# Convenience aliases
async def get_window_title(self, window_id: int | str) -> str:
"""Convenience alias for get_window_name().
Args:
window_id: The window identifier.
Returns:
The window's title or name string.
"""
return await self.get_window_name(window_id)
async def window_size(self, window_id: int | str) -> tuple[int, int]:
"""Convenience alias for get_window_size().
Args:
window_id: The window identifier.
Returns:
A tuple of (width, height) representing the window size in pixels.
"""
return await self.get_window_size(window_id)
# Shell actions
@abstractmethod
async def run_command(self, command: str) -> CommandResult:
"""Run shell command and return structured result.
@@ -487,6 +487,104 @@ class GenericComputerInterface(BaseComputerInterface):
raise RuntimeError(result.get("error", "Failed to list directory"))
return result.get("files", [])
# Desktop actions
async def get_desktop_environment(self) -> str:
    """Query the remote agent for the name of the desktop environment."""
    response = await self._send_command("get_desktop_environment")
    if response.get("success", False):
        return response.get("environment", "unknown")
    raise RuntimeError(response.get("error", "Failed to get desktop environment"))
async def set_wallpaper(self, path: str) -> None:
    """Ask the remote agent to set the desktop wallpaper to *path*."""
    response = await self._send_command("set_wallpaper", {"path": path})
    if response.get("success", False):
        return
    raise RuntimeError(response.get("error", "Failed to set wallpaper"))
# Window management
async def open(self, target: str) -> None:
    """Open *target* (file, URL, or app) with the platform's default handler."""
    response = await self._send_command("open", {"target": target})
    if response.get("success", False):
        return
    raise RuntimeError(response.get("error", "Failed to open target"))
async def launch(self, app: str, args: list[str] | None = None) -> int | None:
payload: dict[str, object] = {"app": app}
if args is not None:
payload["args"] = args
result = await self._send_command("launch", payload)
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to launch application"))
return result.get("pid") # type: ignore[return-value]
async def get_current_window_id(self) -> int | str:
result = await self._send_command("get_current_window_id")
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to get current window id"))
return result["window_id"] # type: ignore[return-value]
async def get_application_windows(self, app: str) -> list[int | str]:
result = await self._send_command("get_application_windows", {"app": app})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to get application windows"))
return list(result.get("windows", [])) # type: ignore[return-value]
async def get_window_name(self, window_id: int | str) -> str:
result = await self._send_command("get_window_name", {"window_id": window_id})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to get window name"))
return result.get("name", "") # type: ignore[return-value]
async def get_window_size(self, window_id: int | str) -> tuple[int, int]:
result = await self._send_command("get_window_size", {"window_id": window_id})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to get window size"))
return int(result.get("width", 0)), int(result.get("height", 0))
async def get_window_position(self, window_id: int | str) -> tuple[int, int]:
result = await self._send_command("get_window_position", {"window_id": window_id})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to get window position"))
return int(result.get("x", 0)), int(result.get("y", 0))
async def set_window_size(self, window_id: int | str, width: int, height: int) -> None:
result = await self._send_command(
"set_window_size", {"window_id": window_id, "width": width, "height": height}
)
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to set window size"))
async def set_window_position(self, window_id: int | str, x: int, y: int) -> None:
result = await self._send_command(
"set_window_position", {"window_id": window_id, "x": x, "y": y}
)
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to set window position"))
async def maximize_window(self, window_id: int | str) -> None:
result = await self._send_command("maximize_window", {"window_id": window_id})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to maximize window"))
async def minimize_window(self, window_id: int | str) -> None:
result = await self._send_command("minimize_window", {"window_id": window_id})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to minimize window"))
async def activate_window(self, window_id: int | str) -> None:
result = await self._send_command("activate_window", {"window_id": window_id})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to activate window"))
async def close_window(self, window_id: int | str) -> None:
result = await self._send_command("close_window", {"window_id": window_id})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to close window"))
# Convenience aliases
async def get_window_title(self, window_id: int | str) -> str:
return await self.get_window_name(window_id)
async def window_size(self, window_id: int | str) -> tuple[int, int]:
return await self.get_window_size(window_id)
# Command execution
async def run_command(self, command: str) -> CommandResult:
result = await self._send_command("run_command", {"command": command})
@@ -258,14 +258,20 @@ class DockerProvider(BaseVMProvider):
logger.info(f"Container {name} is already running")
return existing_vm
elif existing_vm["status"] in ["stopped", "paused"]:
# Start existing container
logger.info(f"Starting existing container {name}")
start_cmd = ["docker", "start", name]
result = subprocess.run(start_cmd, capture_output=True, text=True, check=True)
if self.ephemeral:
# Delete existing container
logger.info(f"Deleting existing container {name}")
delete_cmd = ["docker", "rm", name]
result = subprocess.run(delete_cmd, capture_output=True, text=True, check=True)
else:
# Start existing container
logger.info(f"Starting existing container {name}")
start_cmd = ["docker", "start", name]
result = subprocess.run(start_cmd, capture_output=True, text=True, check=True)
# Wait for container to be ready
await self._wait_for_container_ready(name)
return await self.get_vm(name, storage)
# Wait for container to be ready
await self._wait_for_container_ready(name)
return await self.get_vm(name, storage)
# Use provided image or default
docker_image = image if image != "default" else self.image
@@ -307,6 +313,20 @@ class DockerProvider(BaseVMProvider):
cmd.extend(["-e", "VNC_PW=password"]) # Set VNC password
cmd.extend(["-e", "VNCOPTIONS=-disableBasicAuth"]) # Disable VNC basic auth
# Apply display resolution if provided (e.g., "1024x768")
display_resolution = run_opts.get("display")
if (
isinstance(display_resolution, dict)
and "width" in display_resolution
and "height" in display_resolution
):
cmd.extend(
[
"-e",
f"VNC_RESOLUTION={display_resolution['width']}x{display_resolution['height']}",
]
)
# Add the image
cmd.append(docker_image)
@@ -388,6 +408,11 @@ class DockerProvider(BaseVMProvider):
logger.info(f"Container {name} stopped successfully")
# Delete container if ephemeral=True
if self.ephemeral:
cmd = ["docker", "rm", name]
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
return {
"name": name,
"status": "stopped",
@@ -10,6 +10,8 @@ import subprocess
import urllib.parse
from typing import Any, Dict, List, Optional
from computer.utils import safe_join
# Setup logging
logger = logging.getLogger(__name__)
@@ -59,7 +61,7 @@ def lume_api_get(
# --max-time: Maximum time for the whole operation (20 seconds)
# -f: Fail silently (no output at all) on server errors
# Add single quotes around URL to ensure special characters are handled correctly
cmd = ["curl", "--connect-timeout", "15", "--max-time", "20", "-s", "-f", f"'{api_url}'"]
cmd = ["curl", "--connect-timeout", "15", "--max-time", "20", "-s", "-f", api_url]
# For logging and display, show the properly escaped URL
display_cmd = ["curl", "--connect-timeout", "15", "--max-time", "20", "-s", "-f", api_url]
@@ -71,7 +73,7 @@ def lume_api_get(
# Execute the command - for execution we need to use shell=True to handle URLs with special characters
try:
# Use a single string with shell=True for proper URL handling
shell_cmd = " ".join(cmd)
shell_cmd = safe_join(cmd)
result = subprocess.run(shell_cmd, shell=True, capture_output=True, text=True)
# Handle curl exit codes
@@ -514,7 +516,7 @@ def lume_api_delete(
"-s",
"-X",
"DELETE",
f"'{api_url}'",
api_url,
]
# For logging and display, show the properly escaped URL
@@ -537,7 +539,7 @@ def lume_api_delete(
# Execute the command - for execution we need to use shell=True to handle URLs with special characters
try:
# Use a single string with shell=True for proper URL handling
shell_cmd = " ".join(cmd)
shell_cmd = safe_join(cmd)
result = subprocess.run(shell_cmd, shell=True, capture_output=True, text=True)
# Handle curl exit codes
+25
View File
@@ -1,7 +1,10 @@
import base64
import io
import os
import shlex
from typing import Any, Dict, Optional, Tuple
import mslex
from PIL import Image, ImageDraw
@@ -104,3 +107,25 @@ def parse_vm_info(vm_info: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Parse VM info from pylume response."""
if not vm_info:
return None
def safe_join(argv: list[str]) -> str:
    """Return a platform-correct string that safely quotes `argv` for shell execution.

    - On POSIX: uses `shlex.join`.
    - On Windows: uses `mslex.join`.

    Args:
        argv: iterable of argument strings (each element is coerced to str,
            as some callers pass ints such as port numbers).

    Returns:
        A safely quoted command-line string appropriate for the current
        platform that protects against shell injection vulnerabilities.
    """
    # Coerce defensively so non-str elements cannot crash the joiners,
    # honoring the documented "will be coerced to str" contract.
    parts = [str(a) for a in argv]
    if os.name == "nt":
        # On Windows, cmd.exe quoting rules differ; mslex handles them.
        return mslex.join(parts)
    # On POSIX systems, shlex implements Bourne-shell quoting.
    return shlex.join(parts)
+4 -3
View File
@@ -4,7 +4,7 @@ build-backend = "pdm.backend"
[project]
name = "cua-computer"
version = "0.4.10"
version = "0.4.11"
description = "Computer-Use Interface (CUI) framework powering Cua"
readme = "README.md"
authors = [
@@ -16,7 +16,8 @@ dependencies = [
"websockets>=12.0",
"aiohttp>=3.9.0",
"cua-core>=0.1.0,<0.2.0",
"pydantic>=2.11.1"
"pydantic>=2.11.1",
"mslex>=1.3.0",
]
requires-python = ">=3.12"
@@ -47,4 +48,4 @@ source-includes = ["tests/", "README.md", "LICENSE"]
[tool.pytest.ini_options]
asyncio_mode = "auto"
testpaths = ["tests"]
python_files = "test_*.py"
python_files = "test_*.py"
+69
View File
@@ -0,0 +1,69 @@
"""Pytest configuration and shared fixtures for computer package tests.
This file contains shared fixtures and configuration for all computer tests.
Following SRP: This file ONLY handles test setup/teardown.
"""
from unittest.mock import AsyncMock, MagicMock, Mock, patch
import pytest
@pytest.fixture
def mock_interface():
    """Mock computer interface for testing.

    Use this fixture to test Computer logic without real OS calls.
    """
    iface = AsyncMock()
    # Input actions share the same no-op async stub shape.
    for action in (
        "left_click",
        "right_click",
        "middle_click",
        "double_click",
        "type",
        "key",
        "move_mouse",
        "scroll",
    ):
        setattr(iface, action, AsyncMock())
    # Query actions return canned values.
    iface.screenshot = AsyncMock(return_value=b"fake_screenshot")
    iface.get_screen_size = AsyncMock(return_value=(1920, 1080))
    return iface
@pytest.fixture
def mock_cloud_provider():
    """Mock cloud provider for testing.

    Use this fixture to test cloud provider logic without real API calls.
    """
    stub = AsyncMock()
    stub.start = AsyncMock()
    stub.stop = AsyncMock()
    # Canned responses for the query/exec surface.
    stub.get_status = AsyncMock(return_value="running")
    stub.execute_command = AsyncMock(return_value="command output")
    return stub
@pytest.fixture
def mock_local_provider():
    """Mock local provider for testing.

    Use this fixture to test local provider logic without real VM operations.
    """
    stub = AsyncMock()
    stub.start = AsyncMock()
    stub.stop = AsyncMock()
    # Canned responses for the query/exec surface.
    stub.get_status = AsyncMock(return_value="running")
    stub.execute_command = AsyncMock(return_value="command output")
    return stub
@pytest.fixture
def disable_telemetry(monkeypatch):
    """Disable telemetry for tests.

    Use this fixture to ensure no telemetry is sent during tests.
    Sets the CUA_TELEMETRY_DISABLED environment variable for the duration
    of the test; monkeypatch restores the original value on teardown.
    """
    monkeypatch.setenv("CUA_TELEMETRY_DISABLED", "1")
@@ -0,0 +1,67 @@
"""Unit tests for Computer class.
This file tests ONLY the Computer class initialization and context manager.
Following SRP: This file tests ONE class (Computer).
All external dependencies (providers, interfaces) are mocked.
"""
from unittest.mock import AsyncMock, MagicMock, Mock, patch
import pytest
class TestComputerImport:
    """Test Computer module imports (SRP: Only tests imports)."""

    def test_computer_class_exists(self):
        """The package root must export the Computer class."""
        from computer import Computer as computer_cls

        assert computer_cls is not None

    def test_vm_provider_type_exists(self):
        """The package root must export the VMProviderType enum."""
        from computer import VMProviderType as provider_enum

        assert provider_enum is not None
class TestComputerInitialization:
    """Test Computer initialization (SRP: Only tests initialization)."""

    def test_computer_class_can_be_imported(self, disable_telemetry):
        """Importing Computer with telemetry disabled must not raise."""
        from computer import Computer as computer_cls

        assert computer_cls is not None

    def test_computer_has_required_methods(self, disable_telemetry):
        """Computer must expose the async context-manager protocol."""
        from computer import Computer as computer_cls

        for method in ("__aenter__", "__aexit__"):
            assert hasattr(computer_cls, method)
class TestComputerContextManager:
    """Test Computer context manager protocol (SRP: Only tests context manager)."""

    def test_computer_is_async_context_manager(self, disable_telemetry):
        """Both async context-manager hooks must exist and be callable."""
        from computer import Computer as computer_cls

        for method in ("__aenter__", "__aexit__"):
            assert hasattr(computer_cls, method)
            assert callable(getattr(computer_cls, method))
class TestComputerInterface:
    """Test Computer.interface property (SRP: Only tests interface access)."""

    def test_computer_class_structure(self, disable_telemetry):
        """Computer must be a proper class (not a function or module attribute)."""
        from computer import Computer as computer_cls

        assert isinstance(computer_cls, type)

Some files were not shown because too many files have changed in this diff Show More