From 174ae253badb9c06870883067fb96070dd4caab1 Mon Sep 17 00:00:00 2001 From: Francesco Bonacci Date: Sun, 8 Feb 2026 23:54:11 -0800 Subject: [PATCH] feat: auto-generated SDK docs, Python CLI, and docs improvements (#1040) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: auto-generated SDK docs, Python CLI, and docs improvements - Add auto-generated SDK reference pages (computer-sdk, agent-sdk) with version selector - Add Python CLI package (cua-cli) with auth, sandbox, image, MCP commands - Deprecate TypeScript CLI in favor of Python CLI - Add versioned docs (agent-sdk v0.3-v0.7, computer-sdk v0.3-v0.5) - Rename cloud-cli to cli in docs - Add mobile header fix with sidebar toggle - Restructure guide pages (quickstart, self-hosted-sandboxes) - Add redirects for old /api URLs - Update workflows, lume docs, cuabench docs, desktop sandbox docs * refactor: auto-generate CLI index page like computer/agent SDKs Change CLI docs to use the same auto-generated index.mdx pattern as computer-sdk and agent-sdk. Removes hand-written index page that could become stale, and deletes the separate api.mdx. 
* fix: rename "Cua Bench API Reference" to "API Reference" in menu * fix: update lume examples to macos-tahoe-vanilla and shorten page titles - Replace macos-sequoia-vanilla:latest with macos-tahoe-vanilla:latest in lume docs and generator - Rename "Lume CLI Reference" to "CLI Reference" - Rename "Lume HTTP API Reference" to "API Reference" * feat: rename CuaBot to Cua-Bot and add to dropdown selector - Rename CuaBot to Cua-Bot in docs meta.json and content pages - Add Cua-Bot entry to the header dropdown selector * refactor: restructure Cua-Bot docs to match Cua/Cua-Bench pattern Reorganize cuabot docs from flat structure into guide/getting-started/ hierarchy matching other collections: - cuabot.mdx → guide/getting-started/introduction.mdx - install.mdx → guide/getting-started/installation.mdx - Add meta.json files with proper icons and structure - Update dropdown selector href to new path * feat(docs): add auto-generated API reference, changelog, and versioning for Cua-Bot Add TypeScript SDK doc generator (regex-based, no compiler dependency) and configure cuabot for changelog generation and versioned docs snapshots. * feat(ci): add cuabot to docs drift check and improve failure message Wire cuabot into CI path triggers, runner config, and changed-file detection. Add --check mode to typescript-sdk.ts for drift comparison. Update failure banner with per-library and versioning commands. * fix: resolve Python lint issues (black, ruff) Run black formatting on 12 files, fix ruff F841 (unused variables) in tests, and add TYPE_CHECKING import for FastMCP forward references. 
* fix: resolve TS typecheck and Lume Swift 6 CI failures - typescript-typecheck.js: build @trycua/core before running typecheck so its dist/ type declarations are available for @trycua/computer - SSHClient.swift: avoid crossing Sendable boundary with NIOSSHHandler by keeping handler access + createChannel within flatMap on the event loop, fixing Swift 6 strict concurrency errors * fix: TS typecheck pnpm version strict mode and Lume mock conformance - Set COREPACK_ENABLE_STRICT=0 in typecheck script to allow pnpm 9.x to run commands in workspace packages declaring pnpm 10.x - Update MockVNCService.sendText signature to match protocol (add delayMs parameter) * fix: run prettier formatting and ignore auto-generated docs files Format all files to pass prettier 3.8.1 check. Add docs/.source/ and docs/next-env.d.ts to .prettierignore (auto-generated, not editable). * fix: restore MDX comment syntax broken by prettier Prettier 3.8.1 converts {/* */} to {/_ _/} in MDX files, which breaks the acorn parser. Restore all comments and add *.mdx to .prettierignore. * fix: regenerate docs to pass drift check after prettier revert * fix: CI docs check fetch-depth, regenerate Lume docs, fix header layout shift - Use fetch-depth: 0 in CI checkout so git tags are available for version discovery (was using fetch-depth: 2, causing version fallback) - Regenerate Lume docs from local Swift build (0.2.75 → 0.2.76) - Fix header product selector layout shift with consistent icon/text sizing * fix: format custom-header.tsx with prettier * fix: use arch-agnostic JAVA_HOME for arm64 Docker build The openjdk package writes the arch-specific path (e.g. java-17-openjdk-amd64) to /etc/environment, which sdkmanager sources, overriding the Dockerfile ENV. Create an arch-agnostic symlink and re-export JAVA_HOME in the sdkmanager RUN step to ensure it works on both amd64 and arm64. 
* fix: skip emulator package on arm64 (not available for that arch) The Android emulator SDK package is only published for amd64. Conditionally install it based on dpkg --print-architecture. * ci: retrigger cuabot docker build --- .github/workflows/cd-py-agent.yml | 21 +- .github/workflows/cd-py-bench-ui.yml | 9 +- .github/workflows/cd-py-bench.yml | 9 +- .github/workflows/cd-py-cli.yml | 74 + .github/workflows/cd-py-computer-server.yml | 18 +- .github/workflows/cd-py-computer.yml | 9 +- .github/workflows/cd-py-core.yml | 9 +- .github/workflows/cd-py-mcp-server.yml | 38 +- .github/workflows/cd-py-som.yml | 11 +- .github/workflows/cd-swift-lume.yml | 4 +- .github/workflows/ci-check-docs.yml | 70 +- .github/workflows/ci-swift-lume.yml | 4 +- .github/workflows/release-github-reusable.yml | 12 +- .github/workflows/ts-reusable-build.yml | 2 +- .github/workflows/ts-reusable-publish.yml | 2 +- .prettierignore | 8 +- README.md | 2 +- blog/clawcon-multiplayer.md | 3 +- blog/clawdbot-computer-use-history.md | 6 +- .../docs/cua/examples/claude-code/meta.json | 1 + .../docs/cua/guide/advanced/vnc-recorder.mdx | 2 +- .../docs/cua/guide/get-started/meta.json | 6 +- .../docs/cua/guide/get-started/quickstart.mdx | 184 + .../get-started/self-hosted-sandboxes.mdx | 308 + .../cua/guide/get-started/what-is-cua.mdx | 62 +- .../cua/reference/agent-sdk/changelog.mdx | 623 ++ .../docs/cua/reference/agent-sdk/index.mdx | 1618 ++++-- .../docs/cua/reference/agent-sdk/meta.json | 2 +- .../docs/cua/reference/agent-sdk/v0.3/api.mdx | 70 + .../cua/reference/agent-sdk/v0.3/meta.json | 5 + .../docs/cua/reference/agent-sdk/v0.4/api.mdx | 83 + .../cua/reference/agent-sdk/v0.4/meta.json | 5 + .../docs/cua/reference/agent-sdk/v0.5/api.mdx | 83 + .../cua/reference/agent-sdk/v0.5/meta.json | 5 + .../docs/cua/reference/agent-sdk/v0.6/api.mdx | 99 + .../cua/reference/agent-sdk/v0.6/meta.json | 5 + .../docs/cua/reference/agent-sdk/v0.7/api.mdx | 99 + .../cua/reference/agent-sdk/v0.7/meta.json | 5 + 
.../docs/cua/reference/cli/changelog.mdx | 28 + .../docs/cua/reference/cli/commands.mdx | 338 ++ docs/content/docs/cua/reference/cli/index.mdx | 249 + docs/content/docs/cua/reference/cli/meta.json | 6 + .../cua/reference/computer-sdk/changelog.mdx | 328 ++ .../docs/cua/reference/computer-sdk/index.mdx | 3079 ++++++---- .../docs/cua/reference/computer-sdk/meta.json | 2 +- .../cua/reference/computer-sdk/v0.3/api.mdx | 209 + .../cua/reference/computer-sdk/v0.3/meta.json | 5 + .../cua/reference/computer-sdk/v0.4/api.mdx | 328 ++ .../cua/reference/computer-sdk/v0.4/meta.json | 5 + .../cua/reference/computer-sdk/v0.5/api.mdx | 328 ++ .../cua/reference/computer-sdk/v0.5/meta.json | 5 + .../reference/desktop-sandbox/changelog.mdx | 341 ++ .../cua/reference/desktop-sandbox/index.mdx | 46 +- .../desktop-sandbox/linux-container/kasm.mdx | 9 +- .../desktop-sandbox/linux-container/xfce.mdx | 27 +- .../cua/reference/desktop-sandbox/macos.mdx | 20 +- .../cua/reference/desktop-sandbox/meta.json | 2 +- .../qemu-container/android.mdx | 9 +- .../desktop-sandbox/qemu-container/linux.mdx | 11 +- .../qemu-container/windows.mdx | 11 +- docs/content/docs/cua/reference/meta.json | 2 +- .../docs/cuabench/examples/rl-training.mdx | 20 +- .../cuabench/guide/fundamentals/meta.json | 10 +- .../guide/getting-started/introduction.mdx | 3 +- docs/content/docs/cuabench/reference/api.mdx | 5157 +++++++++++++++++ .../docs/cuabench/reference/cli-reference.mdx | 4 + .../content/docs/cuabench/reference/meta.json | 2 +- .../getting-started/installation.mdx} | 4 +- .../getting-started/introduction.mdx} | 12 +- .../cuabot/guide/getting-started/meta.json | 7 + docs/content/docs/cuabot/guide/meta.json | 6 + docs/content/docs/cuabot/meta.json | 4 +- .../docs/cuabot/reference/changelog.mdx | 132 + docs/content/docs/cuabot/reference/index.mdx | 295 + docs/content/docs/cuabot/reference/meta.json | 6 + .../examples/claude-code/homebrew-testing.mdx | 10 +- .../lume/examples/claude-code/sandbox.mdx | 13 +- 
.../claude-cowork/numbers-stock-analysis.mdx | 10 +- .../lume/examples/claude-cowork/sandbox.mdx | 32 +- docs/content/docs/lume/examples/index.mdx | 14 +- docs/content/docs/lume/examples/meta.json | 2 +- .../docs/lume/guide/advanced/http-server.mdx | 4 +- .../guide/advanced/lumier/building-lumier.mdx | 10 +- .../guide/advanced/lumier/docker-compose.mdx | 8 +- .../lume/guide/advanced/lumier/docker.mdx | 19 +- .../docs/lume/guide/advanced/lumier/index.mdx | 16 +- .../guide/advanced/lumier/installation.mdx | 3 +- .../docs/lume/guide/advanced/mcp-server.mdx | 78 +- .../guide/fundamentals/unattended-setup.mdx | 124 +- .../lume/guide/fundamentals/vm-management.mdx | 24 +- .../lume/guide/getting-started/comparison.mdx | 21 +- .../docs/lume/guide/getting-started/faq.mdx | 10 +- .../guide/getting-started/installation.mdx | 47 +- .../guide/getting-started/introduction.mdx | 12 +- .../lume/guide/getting-started/quickstart.mdx | 66 +- .../content/docs/lume/reference/changelog.mdx | 441 ++ .../docs/lume/reference/cli-reference.mdx | 32 +- docs/content/docs/lume/reference/http-api.mdx | 18 +- docs/content/docs/lume/reference/meta.json | 2 +- .../lume/reference/v0.2/cli-reference.mdx | 295 + .../docs/lume/reference/v0.2/http-api.mdx | 1114 ++++ .../docs/lume/reference/v0.2/meta.json | 5 + docs/next.config.mjs | 11 + docs/package.json | 5 +- docs/scripts/README.md | 30 +- docs/scripts/crawl_docs.py | 261 + docs/scripts/generate_db.py | 261 + docs/scripts/generate_sqlite.py | 278 + docs/scripts/modal_app.py | 388 +- docs/src/components/custom-header.tsx | 105 +- docs/src/components/version-selector.tsx | 82 + .../cua-bench/cua_bench/cli/commands/image.py | 60 +- .../cua_bench/cli/commands/platform.py | 39 +- .../datasets/cua-bench-workflows/README.md | 14 +- libs/cuabot/Dockerfile | 23 +- libs/cuabot/bin/cuabot.js | 8 +- libs/cuabot/package.json | 2 +- libs/cuabot/src/client.ts | 131 +- libs/cuabot/src/cuabot.tsx | 407 +- libs/cuabot/src/cuabotd.ts | 429 +- 
libs/cuabot/src/mcp/computer-use-mcp.py | 1 + libs/cuabot/src/onboarding.tsx | 339 +- libs/cuabot/src/prompts/SYSTEM.md | 3 + libs/cuabot/src/settings.ts | 44 +- libs/cuabot/src/telemetry.ts | 44 +- libs/cuabot/src/types/node-pty.d.ts | 12 +- libs/cuabot/src/utils.ts | 185 +- libs/lume/src/SSH/SSHClient.swift | 52 +- libs/lume/tests/Mocks/MockVNCService.swift | 2 +- libs/python/computer-server/README.md | 10 + libs/python/cua-cli/.bumpversion.cfg | 14 + libs/python/cua-cli/README.md | 100 + libs/python/cua-cli/cua_cli/__init__.py | 3 + libs/python/cua-cli/cua_cli/api/__init__.py | 1 + libs/python/cua-cli/cua_cli/api/client.py | 142 + libs/python/cua-cli/cua_cli/auth/__init__.py | 5 + libs/python/cua-cli/cua_cli/auth/browser.py | 148 + libs/python/cua-cli/cua_cli/auth/store.py | 168 + .../cua-cli/cua_cli/commands/__init__.py | 5 + libs/python/cua-cli/cua_cli/commands/auth.py | 181 + libs/python/cua-cli/cua_cli/commands/image.py | 673 +++ .../cua-cli/cua_cli/commands/local_image.py | 993 ++++ libs/python/cua-cli/cua_cli/commands/mcp.py | 844 +++ .../cua-cli/cua_cli/commands/platform.py | 309 + .../cua-cli/cua_cli/commands/sandbox.py | 541 ++ .../python/cua-cli/cua_cli/commands/skills.py | 929 +++ libs/python/cua-cli/cua_cli/main.py | 88 + libs/python/cua-cli/cua_cli/utils/__init__.py | 21 + .../cua-cli/cua_cli/utils/async_utils.py | 20 + libs/python/cua-cli/cua_cli/utils/docker.py | 103 + libs/python/cua-cli/cua_cli/utils/output.py | 66 + libs/python/cua-cli/cua_cli/utils/paths.py | 30 + libs/python/cua-cli/cua_cli/utils/registry.py | 148 + libs/python/cua-cli/pyproject.toml | 111 + libs/python/cua-cli/tests/__init__.py | 1 + libs/python/cua-cli/tests/api/__init__.py | 1 + libs/python/cua-cli/tests/auth/__init__.py | 1 + libs/python/cua-cli/tests/auth/test_store.py | 230 + .../python/cua-cli/tests/commands/__init__.py | 1 + .../cua-cli/tests/commands/test_auth.py | 200 + .../cua-cli/tests/commands/test_image.py | 297 + .../python/cua-cli/tests/commands/test_mcp.py | 
232 + .../cua-cli/tests/commands/test_sandbox.py | 442 ++ .../cua-cli/tests/commands/test_skills.py | 235 + libs/python/cua-cli/tests/conftest.py | 258 + libs/python/cua-cli/tests/test_main.py | 166 + libs/python/cua-cli/tests/utils/__init__.py | 1 + .../cua-cli/tests/utils/test_async_utils.py | 95 + .../python/cua-cli/tests/utils/test_output.py | 163 + libs/typescript/cua-cli/README.md | 29 +- libs/typescript/cua-cli/package.json | 3 +- libs/typescript/cua-cli/src/commands/image.ts | 106 +- .../playground/src/adapters/cloud.ts | 18 +- .../playground/src/adapters/local.ts | 8 +- libs/typescript/playground/src/types/chat.ts | 5 +- libs/typescript/playground/src/types/index.ts | 8 +- .../playground/src/utils/localStorage.ts | 38 +- scripts/docs-generators/config.json | 63 +- .../docs-generators/extract_python_docs.py | 309 + scripts/docs-generators/generate-changelog.ts | 546 ++ .../generate-versioned-docs.ts | 681 +++ scripts/docs-generators/lume.ts | 159 +- scripts/docs-generators/python-sdk.ts | 920 +++ scripts/docs-generators/requirements.txt | 2 + scripts/docs-generators/typescript-sdk.ts | 772 +++ scripts/typescript-typecheck.js | 10 +- uv.lock | 18 +- 187 files changed, 28921 insertions(+), 3022 deletions(-) create mode 100644 .github/workflows/cd-py-cli.yml create mode 100644 docs/content/docs/cua/guide/get-started/quickstart.mdx create mode 100644 docs/content/docs/cua/guide/get-started/self-hosted-sandboxes.mdx create mode 100644 docs/content/docs/cua/reference/agent-sdk/changelog.mdx create mode 100644 docs/content/docs/cua/reference/agent-sdk/v0.3/api.mdx create mode 100644 docs/content/docs/cua/reference/agent-sdk/v0.3/meta.json create mode 100644 docs/content/docs/cua/reference/agent-sdk/v0.4/api.mdx create mode 100644 docs/content/docs/cua/reference/agent-sdk/v0.4/meta.json create mode 100644 docs/content/docs/cua/reference/agent-sdk/v0.5/api.mdx create mode 100644 docs/content/docs/cua/reference/agent-sdk/v0.5/meta.json create mode 100644 
docs/content/docs/cua/reference/agent-sdk/v0.6/api.mdx create mode 100644 docs/content/docs/cua/reference/agent-sdk/v0.6/meta.json create mode 100644 docs/content/docs/cua/reference/agent-sdk/v0.7/api.mdx create mode 100644 docs/content/docs/cua/reference/agent-sdk/v0.7/meta.json create mode 100644 docs/content/docs/cua/reference/cli/changelog.mdx create mode 100644 docs/content/docs/cua/reference/cli/commands.mdx create mode 100644 docs/content/docs/cua/reference/cli/index.mdx create mode 100644 docs/content/docs/cua/reference/cli/meta.json create mode 100644 docs/content/docs/cua/reference/computer-sdk/changelog.mdx create mode 100644 docs/content/docs/cua/reference/computer-sdk/v0.3/api.mdx create mode 100644 docs/content/docs/cua/reference/computer-sdk/v0.3/meta.json create mode 100644 docs/content/docs/cua/reference/computer-sdk/v0.4/api.mdx create mode 100644 docs/content/docs/cua/reference/computer-sdk/v0.4/meta.json create mode 100644 docs/content/docs/cua/reference/computer-sdk/v0.5/api.mdx create mode 100644 docs/content/docs/cua/reference/computer-sdk/v0.5/meta.json create mode 100644 docs/content/docs/cua/reference/desktop-sandbox/changelog.mdx create mode 100644 docs/content/docs/cuabench/reference/api.mdx rename docs/content/docs/cuabot/{install.mdx => guide/getting-started/installation.mdx} (95%) rename docs/content/docs/cuabot/{cuabot.mdx => guide/getting-started/introduction.mdx} (91%) create mode 100644 docs/content/docs/cuabot/guide/getting-started/meta.json create mode 100644 docs/content/docs/cuabot/guide/meta.json create mode 100644 docs/content/docs/cuabot/reference/changelog.mdx create mode 100644 docs/content/docs/cuabot/reference/index.mdx create mode 100644 docs/content/docs/cuabot/reference/meta.json create mode 100644 docs/content/docs/lume/reference/changelog.mdx create mode 100644 docs/content/docs/lume/reference/v0.2/cli-reference.mdx create mode 100644 docs/content/docs/lume/reference/v0.2/http-api.mdx create mode 100644 
docs/content/docs/lume/reference/v0.2/meta.json create mode 100644 docs/scripts/crawl_docs.py create mode 100644 docs/scripts/generate_db.py create mode 100644 docs/scripts/generate_sqlite.py create mode 100644 docs/src/components/version-selector.tsx create mode 100644 libs/python/cua-cli/.bumpversion.cfg create mode 100644 libs/python/cua-cli/README.md create mode 100644 libs/python/cua-cli/cua_cli/__init__.py create mode 100644 libs/python/cua-cli/cua_cli/api/__init__.py create mode 100644 libs/python/cua-cli/cua_cli/api/client.py create mode 100644 libs/python/cua-cli/cua_cli/auth/__init__.py create mode 100644 libs/python/cua-cli/cua_cli/auth/browser.py create mode 100644 libs/python/cua-cli/cua_cli/auth/store.py create mode 100644 libs/python/cua-cli/cua_cli/commands/__init__.py create mode 100644 libs/python/cua-cli/cua_cli/commands/auth.py create mode 100644 libs/python/cua-cli/cua_cli/commands/image.py create mode 100644 libs/python/cua-cli/cua_cli/commands/local_image.py create mode 100644 libs/python/cua-cli/cua_cli/commands/mcp.py create mode 100644 libs/python/cua-cli/cua_cli/commands/platform.py create mode 100644 libs/python/cua-cli/cua_cli/commands/sandbox.py create mode 100644 libs/python/cua-cli/cua_cli/commands/skills.py create mode 100644 libs/python/cua-cli/cua_cli/main.py create mode 100644 libs/python/cua-cli/cua_cli/utils/__init__.py create mode 100644 libs/python/cua-cli/cua_cli/utils/async_utils.py create mode 100644 libs/python/cua-cli/cua_cli/utils/docker.py create mode 100644 libs/python/cua-cli/cua_cli/utils/output.py create mode 100644 libs/python/cua-cli/cua_cli/utils/paths.py create mode 100644 libs/python/cua-cli/cua_cli/utils/registry.py create mode 100644 libs/python/cua-cli/pyproject.toml create mode 100644 libs/python/cua-cli/tests/__init__.py create mode 100644 libs/python/cua-cli/tests/api/__init__.py create mode 100644 libs/python/cua-cli/tests/auth/__init__.py create mode 100644 libs/python/cua-cli/tests/auth/test_store.py 
create mode 100644 libs/python/cua-cli/tests/commands/__init__.py create mode 100644 libs/python/cua-cli/tests/commands/test_auth.py create mode 100644 libs/python/cua-cli/tests/commands/test_image.py create mode 100644 libs/python/cua-cli/tests/commands/test_mcp.py create mode 100644 libs/python/cua-cli/tests/commands/test_sandbox.py create mode 100644 libs/python/cua-cli/tests/commands/test_skills.py create mode 100644 libs/python/cua-cli/tests/conftest.py create mode 100644 libs/python/cua-cli/tests/test_main.py create mode 100644 libs/python/cua-cli/tests/utils/__init__.py create mode 100644 libs/python/cua-cli/tests/utils/test_async_utils.py create mode 100644 libs/python/cua-cli/tests/utils/test_output.py create mode 100644 scripts/docs-generators/extract_python_docs.py create mode 100644 scripts/docs-generators/generate-changelog.ts create mode 100644 scripts/docs-generators/generate-versioned-docs.ts create mode 100644 scripts/docs-generators/python-sdk.ts create mode 100644 scripts/docs-generators/requirements.txt create mode 100644 scripts/docs-generators/typescript-sdk.ts diff --git a/.github/workflows/cd-py-agent.yml b/.github/workflows/cd-py-agent.yml index 9eee5dc9..40410705 100644 --- a/.github/workflows/cd-py-agent.yml +++ b/.github/workflows/cd-py-agent.yml @@ -181,23 +181,4 @@ jobs: release_name: "cua-agent v${{ needs.prepare.outputs.version }}" module_path: "libs/python/agent" body: | - ## Dependencies - * cua-computer: ${{ needs.prepare.outputs.computer_version }} - * cua-som: ${{ needs.prepare.outputs.som_version }} - - ## Installation Options - - ### Basic installation with Anthropic - ```bash - pip install cua-agent[anthropic]==${{ needs.prepare.outputs.version }} - ``` - - ### With SOM (recommended) - ```bash - pip install cua-agent[som]==${{ needs.prepare.outputs.version }} - ``` - - ### All features - ```bash - pip install cua-agent[all]==${{ needs.prepare.outputs.version }} - ``` + **Dependencies:** cua-computer ${{ 
needs.prepare.outputs.computer_version }}, cua-som ${{ needs.prepare.outputs.som_version }} diff --git a/.github/workflows/cd-py-bench-ui.yml b/.github/workflows/cd-py-bench-ui.yml index 0752799f..c0112292 100644 --- a/.github/workflows/cd-py-bench-ui.yml +++ b/.github/workflows/cd-py-bench-ui.yml @@ -70,11 +70,4 @@ jobs: with: tag_name: "bench-ui-v${{ needs.prepare.outputs.version }}" release_name: "cua-bench-ui v${{ needs.prepare.outputs.version }}" - body: | - ## Lightweight webUI window controller for Cua bench - - ## Installation - - ```bash - pip install cua-bench-ui==${{ needs.prepare.outputs.version }} - ``` + body: "" diff --git a/.github/workflows/cd-py-bench.yml b/.github/workflows/cd-py-bench.yml index aafde578..defa065e 100644 --- a/.github/workflows/cd-py-bench.yml +++ b/.github/workflows/cd-py-bench.yml @@ -70,11 +70,4 @@ jobs: with: tag_name: "bench-v${{ needs.prepare.outputs.version }}" release_name: "cua-bench v${{ needs.prepare.outputs.version }}" - body: | - ## Toolkit for computer-use RL environments and benchmarks - - ## Installation - - ```bash - pip install cua-bench==${{ needs.prepare.outputs.version }} - ``` + body: "" diff --git a/.github/workflows/cd-py-cli.yml b/.github/workflows/cd-py-cli.yml new file mode 100644 index 00000000..5a69d904 --- /dev/null +++ b/.github/workflows/cd-py-cli.yml @@ -0,0 +1,74 @@ +name: "CD: cua-cli (PyPI)" + +on: + push: + tags: + - "cli-v*" + workflow_dispatch: + inputs: + version: + description: "Version to publish (without v prefix)" + required: true + default: "0.1.0" + workflow_call: + inputs: + version: + description: "Version to publish" + required: true + type: string + +# Adding permissions at workflow level +permissions: + contents: write + +jobs: + prepare: + runs-on: macos-latest + outputs: + version: ${{ steps.get-version.outputs.version }} + steps: + - uses: actions/checkout@v4 + + - name: Determine version + id: get-version + run: | + # Check inputs.version first (set by workflow_call) + if [ 
-n "${{ inputs.version }}" ]; then + VERSION=${{ inputs.version }} + elif [ "${{ github.event_name }}" == "push" ]; then + # Extract version from tag (for package-specific tags) + if [[ "${{ github.ref }}" =~ ^refs/tags/cli-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then + VERSION=${BASH_REMATCH[1]} + else + echo "Invalid tag format for cli" + exit 1 + fi + elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then + # Use version from workflow dispatch + VERSION=${{ github.event.inputs.version }} + else + echo "No version provided" + exit 1 + fi + echo "VERSION=$VERSION" + echo "version=$VERSION" >> $GITHUB_OUTPUT + + publish: + needs: prepare + uses: ./.github/workflows/py-reusable-publish.yml + with: + package_name: "cli" + package_dir: "libs/python/cua-cli" + version: ${{ needs.prepare.outputs.version }} + base_package_name: "cua-cli" + secrets: + PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} + + create-release: + needs: [prepare, publish] + uses: ./.github/workflows/release-github-reusable.yml + with: + tag_name: "cli-v${{ needs.prepare.outputs.version }}" + release_name: "cua-cli v${{ needs.prepare.outputs.version }}" + module_path: "libs/python/cua-cli" + body: "" diff --git a/.github/workflows/cd-py-computer-server.yml b/.github/workflows/cd-py-computer-server.yml index a9d7b09c..f6a5bbfb 100644 --- a/.github/workflows/cd-py-computer-server.yml +++ b/.github/workflows/cd-py-computer-server.yml @@ -88,20 +88,4 @@ jobs: tag_name: "computer-server-v${{ needs.prepare.outputs.version }}" release_name: "cua-computer-server v${{ needs.prepare.outputs.version }}" module_path: "libs/python/computer-server" - body: | - ## Computer Server for the Computer Universal Automation (Cua) project - - A FastAPI-based server implementation for computer control. 
- - ## Usage - - ```bash - # Run the server - cua-computer-server - ``` - - ## Installation - - ```bash - pip install cua-computer-server==${{ needs.prepare.outputs.version }} - ``` + body: "" diff --git a/.github/workflows/cd-py-computer.yml b/.github/workflows/cd-py-computer.yml index 38ecd368..7fadc877 100644 --- a/.github/workflows/cd-py-computer.yml +++ b/.github/workflows/cd-py-computer.yml @@ -163,11 +163,4 @@ jobs: tag_name: "computer-v${{ needs.prepare.outputs.version }}" release_name: "cua-computer v${{ needs.prepare.outputs.version }}" module_path: "libs/python/computer" - body: | - ## Computer control library for the Computer Universal Automation (Cua) project - - ## Installation - - ```bash - pip install cua-computer==${{ needs.prepare.outputs.version }} - ``` + body: "" diff --git a/.github/workflows/cd-py-core.yml b/.github/workflows/cd-py-core.yml index f979914d..1795f0e0 100644 --- a/.github/workflows/cd-py-core.yml +++ b/.github/workflows/cd-py-core.yml @@ -71,11 +71,4 @@ jobs: tag_name: "core-v${{ needs.prepare.outputs.version }}" release_name: "cua-core v${{ needs.prepare.outputs.version }}" module_path: "libs/python/core" - body: | - ## Base package for Cua project with telemetry and core utilities - - ## Installation - - ```bash - pip install cua-core==${{ needs.prepare.outputs.version }} - ``` + body: "" diff --git a/.github/workflows/cd-py-mcp-server.yml b/.github/workflows/cd-py-mcp-server.yml index 15f3945e..b7c8c42b 100644 --- a/.github/workflows/cd-py-mcp-server.yml +++ b/.github/workflows/cd-py-mcp-server.yml @@ -165,40 +165,4 @@ jobs: tag_name: "mcp-server-v${{ needs.prepare.outputs.version }}" release_name: "cua-mcp-server v${{ needs.prepare.outputs.version }}" module_path: "libs/python/mcp-server" - body: | - ## MCP Server for the Computer-Use Agent (Cua) - - This package provides MCP (Model Context Protocol) integration for Cua agents, allowing them to be used with Claude Desktop, Cursor, and other MCP clients. 
- - ## Usage - - ```bash - # Run the MCP server directly - cua-mcp-server - ``` - - ## Claude Desktop Integration - - Add to your Claude Desktop configuration (~/.config/claude-desktop/claude_desktop_config.json or OS-specific location): - - ```json - "mcpServers": { - "cua-agent": { - "command": "cua-mcp-server", - "args": [], - "env": { - "CUA_AGENT_LOOP": "OMNI", - "CUA_MODEL_PROVIDER": "ANTHROPIC", - "CUA_MODEL_NAME": "claude-3-opus-20240229", - "ANTHROPIC_API_KEY": "your-api-key", - "PYTHONIOENCODING": "utf-8" - } - } - } - ``` - - ## Installation - - ```bash - pip install cua-mcp-server==${{ needs.prepare.outputs.version }} - ``` + body: "" diff --git a/.github/workflows/cd-py-som.yml b/.github/workflows/cd-py-som.yml index dd72b20e..c85f0247 100644 --- a/.github/workflows/cd-py-som.yml +++ b/.github/workflows/cd-py-som.yml @@ -75,13 +75,4 @@ jobs: tag_name: "som-v${{ needs.determine-version.outputs.version }}" release_name: "cua-som v${{ needs.determine-version.outputs.version }}" module_path: "libs/python/som" - body: | - ## Computer Vision and OCR library for detecting and analyzing UI elements - - This package provides enhanced UI understanding capabilities through computer vision and OCR. 
- - ## Installation - - ```bash - pip install cua-som==${{ needs.determine-version.outputs.version }} - ``` + body: "" diff --git a/.github/workflows/cd-swift-lume.yml b/.github/workflows/cd-swift-lume.yml index eae3a973..de57a860 100644 --- a/.github/workflows/cd-swift-lume.yml +++ b/.github/workflows/cd-swift-lume.yml @@ -53,9 +53,9 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Select Xcode 16.2 + - name: Select Xcode 16 run: | - sudo xcode-select -s /Applications/Xcode_16.2.app + sudo xcode-select -s /Applications/Xcode_16.app xcodebuild -version - name: Install dependencies diff --git a/.github/workflows/ci-check-docs.yml b/.github/workflows/ci-check-docs.yml index 7f37b250..9bb726ac 100644 --- a/.github/workflows/ci-check-docs.yml +++ b/.github/workflows/ci-check-docs.yml @@ -10,13 +10,16 @@ on: # MCP Server (Python) - "libs/python/mcp-server/src/**" # Computer SDK - - "libs/python/computer/src/**" + - "libs/python/computer/computer/**" - "libs/typescript/computer/src/**" # Agent SDK - - "libs/python/agent/src/**" + - "libs/python/agent/agent/**" - "libs/typescript/agent/src/**" + # Cua-Bot + - "libs/cuabot/src/**" # Documentation files themselves - "docs/content/docs/cua/reference/**" + - "docs/content/docs/cuabot/reference/**" # Generator scripts - "scripts/docs-generators/**" @@ -28,7 +31,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 with: - fetch-depth: 2 # Need history for changed file detection + fetch-depth: 0 # Need full history for git tags (version discovery) and changed file detection - name: Setup Node.js uses: actions/setup-node@v4 @@ -42,6 +45,14 @@ jobs: run: pnpm install working-directory: docs + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install Python doc dependencies + run: pip install -r scripts/docs-generators/requirements.txt + - name: Determine changed generators id: changed run: | @@ -68,14 +79,14 @@ jobs: GENERATORS="$GENERATORS mcp-server" fi - # 
Computer SDK changes - if echo "$CHANGED_FILES" | grep -q "^libs/python/computer/src/\|^libs/typescript/computer/src/"; then - GENERATORS="$GENERATORS computer-sdk" + # Python SDK changes (Computer and Agent) + if echo "$CHANGED_FILES" | grep -q "^libs/python/computer/computer/\|^libs/python/agent/agent/"; then + GENERATORS="$GENERATORS python-sdk" fi - # Agent SDK changes - if echo "$CHANGED_FILES" | grep -q "^libs/python/agent/src/\|^libs/typescript/agent/src/"; then - GENERATORS="$GENERATORS agent-sdk" + # Cua-Bot changes + if echo "$CHANGED_FILES" | grep -q "^libs/cuabot/src/"; then + GENERATORS="$GENERATORS cuabot" fi # Generator changes should run all @@ -113,20 +124,27 @@ jobs: if: failure() run: | echo "" - echo "╔══════════════════════════════════════════════════════════════════╗" - echo "║ Documentation Out of Sync! ║" - echo "╠══════════════════════════════════════════════════════════════════╣" - echo "║ ║" - echo "║ The documentation has drifted from the source code. ║" - echo "║ ║" - echo "║ To fix this, run from the repository root: ║" - echo "║ ║" - echo "║ npx tsx scripts/docs-generators/runner.ts ║" - echo "║ ║" - echo "║ Or for a specific library: ║" - echo "║ ║" - echo "║ npx tsx scripts/docs-generators/runner.ts --library lume ║" - echo "║ ║" - echo "║ Then commit the updated documentation files. ║" - echo "║ ║" - echo "╚══════════════════════════════════════════════════════════════════╝" + echo "╔═══════════════════════════════════════════════════════════════════════╗" + echo "║ Documentation Out of Sync! ║" + echo "╠═══════════════════════════════════════════════════════════════════════╣" + echo "║ ║" + echo "║ The documentation has drifted from the source code. 
║" + echo "║ ║" + echo "║ To regenerate all docs, run from the repository root: ║" + echo "║ ║" + echo "║ npx tsx scripts/docs-generators/runner.ts ║" + echo "║ ║" + echo "║ Or for a specific library: ║" + echo "║ ║" + echo "║ npx tsx scripts/docs-generators/runner.ts --library lume ║" + echo "║ npx tsx scripts/docs-generators/runner.ts --library python-sdk ║" + echo "║ npx tsx scripts/docs-generators/runner.ts --library cuabot ║" + echo "║ ║" + echo "║ For versioned docs and changelogs (after tagging a new release): ║" + echo "║ ║" + echo "║ npx tsx scripts/docs-generators/generate-versioned-docs.ts ║" + echo "║ npx tsx scripts/docs-generators/generate-changelog.ts ║" + echo "║ ║" + echo "║ Then commit the updated documentation files. ║" + echo "║ ║" + echo "╚═══════════════════════════════════════════════════════════════════════╝" diff --git a/.github/workflows/ci-swift-lume.yml b/.github/workflows/ci-swift-lume.yml index 4cfa569b..0c121215 100644 --- a/.github/workflows/ci-swift-lume.yml +++ b/.github/workflows/ci-swift-lume.yml @@ -18,7 +18,7 @@ jobs: steps: - uses: actions/checkout@v4 - run: uname -a - - run: sudo xcode-select -s /Applications/Xcode_16.2.app # Swift 6.2 + - run: sudo xcode-select -s /Applications/Xcode_16.app # Swift 6.0 - run: swift test working-directory: ./libs/lume build: @@ -27,6 +27,6 @@ jobs: steps: - uses: actions/checkout@v4 - run: uname -a - - run: sudo xcode-select -s /Applications/Xcode_16.2.app # Swift 6.2 + - run: sudo xcode-select -s /Applications/Xcode_16.app # Swift 6.0 - run: swift build --configuration release working-directory: ./libs/lume diff --git a/.github/workflows/release-github-reusable.yml b/.github/workflows/release-github-reusable.yml index 408b77fd..6a8e1e01 100644 --- a/.github/workflows/release-github-reusable.yml +++ b/.github/workflows/release-github-reusable.yml @@ -83,6 +83,11 @@ jobs: # Get commits and process each one to fetch GitHub username while IFS='|' read -r sha subject; do + # Skip automated bump 
commits (e.g., "Bump cua-agent to v0.7.22") + if [[ "$subject" =~ ^Bump\ cua- ]]; then + continue + fi + # Try to get GitHub username via API USERNAME=$(gh api repos/${{ github.repository }}/commits/${sha} --jq '.author.login' 2>/dev/null || echo "") @@ -98,12 +103,15 @@ jobs: fi fi + # Link PR numbers: (#123) -> ([#123](https://github.com/REPO/pull/123)) + LINKED_SUBJECT=$(echo "$subject" | sed 's|(#\([0-9]*\))|([#\1](https://github.com/${{ github.repository }}/pull/\1))|g') + SHORT_SHA=$(echo ${sha} | cut -c1-7) - NOTES="${NOTES}* ${subject} (${SHORT_SHA}) by @${USERNAME}"$'\n' + NOTES="${NOTES}* ${LINKED_SUBJECT} (${SHORT_SHA}) by @${USERNAME}"$'\n' done < <(git log ${COMMIT_RANGE} --pretty=format:"%H|%s" -- "${{ inputs.module_path }}" | head -50) if [ -z "$NOTES" ]; then - NOTES="* Initial release or no path-specific changes found" + NOTES="Maintenance release — dependency updates only." fi # Store notes in output (handle multiline) diff --git a/.github/workflows/ts-reusable-build.yml b/.github/workflows/ts-reusable-build.yml index 58d6d72c..8d519ad0 100644 --- a/.github/workflows/ts-reusable-build.yml +++ b/.github/workflows/ts-reusable-build.yml @@ -56,7 +56,7 @@ jobs: bun run build fi else - pnpm run --if-present build + pnpm run build --if-present fi - name: Verify build diff --git a/.github/workflows/ts-reusable-publish.yml b/.github/workflows/ts-reusable-publish.yml index 74b65c2d..3b631d6f 100644 --- a/.github/workflows/ts-reusable-publish.yml +++ b/.github/workflows/ts-reusable-publish.yml @@ -98,7 +98,7 @@ jobs: bun run build fi else - pnpm run --if-present build + pnpm run build --if-present fi - name: Publish to npm diff --git a/.prettierignore b/.prettierignore index c9aaff70..cd64d011 100644 --- a/.prettierignore +++ b/.prettierignore @@ -31,8 +31,12 @@ venv/ pnpm-lock.yaml uv.lock -# Docs with complex JSX formatting -docs/content/docs/get-started/quickstart.mdx +# Auto-generated docs source +docs/.source/ +docs/next-env.d.ts + +# MDX files
(prettier mangles {/* */} comment syntax into {/_ _/}) +*.mdx # Git worktrees (separate branches) .worktrees/ \ No newline at end of file diff --git a/README.md b/README.md index 43e5fbd9..cc98ffe7 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Built-in support for `agent-browser` and `agent-device` (iOS, Android) out of th
-**[Get Started](https://cua.ai/docs/cuabot/cuabot)** | **[Installation](https://cua.ai/docs/cuabot/install)** | *First spotted at [ClawCon](https://www.claw-con.com/)* +**[Get Started](https://cua.ai/docs/cuabot/cuabot)** | **[Installation](https://cua.ai/docs/cuabot/install)** | _First spotted at [ClawCon](https://www.claw-con.com/)_
cuabot screenshot diff --git a/blog/clawcon-multiplayer.md b/blog/clawcon-multiplayer.md index 68ea0d9b..fe98674e 100644 --- a/blog/clawcon-multiplayer.md +++ b/blog/clawcon-multiplayer.md @@ -1,4 +1,5 @@ # Announcing the First Multi-Player Computer-Use — Live from ClawCon + _Published on February 6, 2026 by Francesco Bonacci and Dillon DuPont_ ClawCon brought over 700 attendees to Frontier Tower, with a waitlist that had people lining up down Market Street, and another 20k tuned into the livestream. It was the first community event for OpenClaw, and we had the 2nd demo session. @@ -54,8 +55,6 @@ ClawCon livestream demo
Multi-Player Computer Use Agent building an RL gym and coding and playing a desktop game simultaneously. Before today, you could only have 1 Computer-Use Agent deployed per system to control - - ## OpenClaw + CuaBot We demoed this at ClawCon for a reason — CuaBot is designed to work with OpenClaw out of the box. diff --git a/blog/clawdbot-computer-use-history.md b/blog/clawdbot-computer-use-history.md index fd9b2128..0ede49e2 100644 --- a/blog/clawdbot-computer-use-history.md +++ b/blog/clawdbot-computer-use-history.md @@ -2,7 +2,7 @@ _Published on Jan 28, 2026 by Francesco Bonacci. Originally posted on [X](https://x.com/francedot/status/2016627257310384554)._ -***TLDR**: Since Clawdbot went viral, I've gotten a lot of questions: Where did this all come from? What's next? Here's my take.* +_**TLDR**: Since Clawdbot went viral, I've gotten a lot of questions: Where did this all come from? What's next? Here's my take._ Clawdbot just hit the mainstream. The open-source AI assistant — now rebranded to Moltbot after trademark issues — has captured the imagination of developers and mainstream users alike. An AI that runs on your own machine, controlled through WhatsApp, extensible through plugins. It feels like the future arriving all at once. @@ -104,7 +104,7 @@ Nine days later, the open-source community responded. Browser-use ([@gregpr07](h ![story_11](https://github.com/user-attachments/assets/e079984a-ee73-4dc1-b43d-fdea78f08158) -Then, on January 23, 2025, OpenAI entered with [Operator](https://openai.com/index/introducing-operator/), powered by their Computer-Using Agent (CUA) model. +Then, on January 23, 2025, OpenAI entered with [Operator](https://openai.com/index/introducing-operator/), powered by their Computer-Using Agent (CUA) model. 
![story_12](https://github.com/user-attachments/assets/9a146ad2-174a-46a3-bbaf-9d066708fba1) @@ -188,6 +188,7 @@ Created by Austrian developer Peter Steinberger, [Clawdbot](https://github.com/c ![story_18](https://github.com/user-attachments/assets/a7eaffff-d62b-452d-98c5-402fd7dadf69) What makes it different: + - **Self-hosted**: Runs entirely on your machine. Your data stays local. - **Multi-model**: Works with Claude, GPT, or local models via Ollama. - **Extensible**: Skills and plugins from ClawdHub — a public registry where anyone can contribute capabilities. @@ -202,6 +203,7 @@ But the paradigm is clear: the future of computer-use agents is modular, and use ## The Evolution in One Frame Looking back, the progression follows a clear arc: + - 2023: "Can AI see a screen?" (GPT-4V) - 2024: "Can AI click buttons?" (Claude Computer Use, Operator) - 2025: "Can AI write code instead?" (CoAct-1) diff --git a/docs/content/docs/cua/examples/claude-code/meta.json b/docs/content/docs/cua/examples/claude-code/meta.json index c0e2d2d9..f3100520 100644 --- a/docs/content/docs/cua/examples/claude-code/meta.json +++ b/docs/content/docs/cua/examples/claude-code/meta.json @@ -1,5 +1,6 @@ { "title": "Claude Code", "description": "Claude Code integration examples", + "icon": "Terminal", "pages": ["human-demonstrations"] } diff --git a/docs/content/docs/cua/guide/advanced/vnc-recorder.mdx b/docs/content/docs/cua/guide/advanced/vnc-recorder.mdx index f1d39b31..109cbd88 100644 --- a/docs/content/docs/cua/guide/advanced/vnc-recorder.mdx +++ b/docs/content/docs/cua/guide/advanced/vnc-recorder.mdx @@ -61,7 +61,7 @@ cua skills read my-skill ### 1. Start a sandbox and open the VNC UI -Start a sandbox using [Docker](/docs/cua/guide/get-started/set-up-sandbox) or [Cua Cloud](/docs/cua/guide/get-started/set-up-sandbox). The VNC UI will be available at: +Start a sandbox using [Docker](/docs/cua/guide/get-started/self-hosted-sandboxes) or [Cua Cloud](/docs/cua/guide/get-started/quickstart). 
The VNC UI will be available at: - **Docker**: `http://localhost:8006` - **Cloud**: `https://{sandbox-name}.sandbox.cua.ai/vnc.html` diff --git a/docs/content/docs/cua/guide/get-started/meta.json b/docs/content/docs/cua/guide/get-started/meta.json index a9b3c92e..3eef4f68 100644 --- a/docs/content/docs/cua/guide/get-started/meta.json +++ b/docs/content/docs/cua/guide/get-started/meta.json @@ -5,11 +5,9 @@ "icon": "Rocket", "pages": [ "what-is-cua", - "what-is-computer-use-agent", - "what-is-desktop-sandbox", - "set-up-sandbox", + "quickstart", "using-computer-sdk", "using-agent-sdk", - "using-cloud-cli" + "self-hosted-sandboxes" ] } diff --git a/docs/content/docs/cua/guide/get-started/quickstart.mdx b/docs/content/docs/cua/guide/get-started/quickstart.mdx new file mode 100644 index 00000000..3aaa9a93 --- /dev/null +++ b/docs/content/docs/cua/guide/get-started/quickstart.mdx @@ -0,0 +1,184 @@ +--- +title: Quickstart +description: Get a computer-use agent running in 5 minutes +--- + +import { Callout } from 'fumadocs-ui/components/callout'; + +This guide gets you from zero to a working computer-use agent using Cua Cloud sandboxes. + + + **Prerequisites:** Python 3.12 or 3.13 and a free Cua account at [cua.ai](https://cua.ai/signin). + + +## 1. Install the Cua CLI and SDKs + +```bash +pip install cua-cli cua-computer "cua-agent[all]" +``` + +## 2. Authenticate + +Login with your Cua account: + +```bash +cua auth login +``` + +This opens your browser for authentication. Once complete, your API key is stored locally. + +To export your API key to a `.env` file for use in scripts: + +```bash +cua auth env +``` + +## 3. Create a Cloud Sandbox + +Create a Linux sandbox: + +```bash +cua sandbox create --os linux --size small --region north-america +``` + +Note the sandbox name from the output (e.g., `curious-fox-123`). + +You can view all your sandboxes: + +```bash +cua sandbox list +``` + +## 4. 
Take a Screenshot (Hello World) + +Create a file `hello.py`: + +```python +import os +import asyncio +from computer import Computer + +# Load API key from environment (or set directly) +os.environ["CUA_API_KEY"] = "sk_cua-api01_..." # or use: cua auth env + +computer = Computer( + os_type="linux", + provider_type="cloud", + name="curious-fox-123" # your sandbox name +) + +async def main(): + await computer.run() # Connect to the sandbox + + try: + # Take a screenshot + screenshot = await computer.interface.screenshot() + screenshot.save("screenshot.png") + print("Screenshot saved to screenshot.png") + + # Click at coordinates (100, 100) + await computer.interface.left_click(100, 100) + + # Type some text + await computer.interface.type_text("Hello from Cua!") + finally: + await computer.disconnect() + +asyncio.run(main()) +``` + +Run it: + +```bash +python hello.py +``` + +## 5. Run an Agent + +Now let's add AI. Create `agent.py`: + +```python +import os +import asyncio +from computer import Computer +from agent import ComputerAgent + +os.environ["CUA_API_KEY"] = "sk_cua-api01_..." # or use: cua auth env + +computer = Computer( + os_type="linux", + provider_type="cloud", + name="curious-fox-123" # your sandbox name +) + +async def main(): + await computer.run() + + try: + agent = ComputerAgent( + model="cua/anthropic/claude-sonnet-4.5", + tools=[computer], + ) + + messages = [{"role": "user", "content": "Open Firefox and go to google.com"}] + + async for result in agent.run(messages): + for item in result["output"]: + if item["type"] == "message": + print(item["content"][0]["text"]) + finally: + await computer.disconnect() + +asyncio.run(main()) +``` + +Run it: + +```bash +python agent.py +``` + +The agent will observe the screen, decide what actions to take, and execute them to complete your task. + +## 6. 
Open the Desktop in Your Browser + +Watch your agent work in real-time: + +```bash +cua sandbox vnc curious-fox-123 +``` + +This opens a browser window showing the sandbox desktop. + +## 7. Clean Up + +When you're done, stop your sandbox to avoid charges: + +```bash +cua sandbox stop curious-fox-123 +``` + +Or delete it entirely: + +```bash +cua sandbox delete curious-fox-123 +``` + +## CLI Reference + +| Command | Description | +| -------------------------------------------------------------- | ---------------------------------- | +| `cua auth login` | Authenticate with your Cua account | +| `cua auth env` | Export API key to `.env` file | +| `cua sandbox list` | List all sandboxes | +| `cua sandbox create --os <OS> --size <SIZE> --region <REGION>` | Create a sandbox | +| `cua sandbox vnc <name>` | Open sandbox in browser | +| `cua sandbox start <name>` | Start a stopped sandbox | +| `cua sandbox stop <name>` | Stop a sandbox | +| `cua sandbox delete <name>` | Delete a sandbox | + +## Next Steps + +- [Using the Computer SDK](/cua/guide/get-started/using-computer-sdk) - Learn low-level computer control +- [Using the Agent SDK](/cua/guide/get-started/using-agent-sdk) - Advanced AI agent configuration +- [Self-Hosted Sandboxes](/cua/guide/get-started/self-hosted-sandboxes) - Run sandboxes locally with Docker diff --git a/docs/content/docs/cua/guide/get-started/self-hosted-sandboxes.mdx b/docs/content/docs/cua/guide/get-started/self-hosted-sandboxes.mdx new file mode 100644 index 00000000..5c4a8794 --- /dev/null +++ b/docs/content/docs/cua/guide/get-started/self-hosted-sandboxes.mdx @@ -0,0 +1,308 @@ +--- +title: Self-Hosted Sandboxes +description: Run sandboxes locally with Docker, QEMU, or native virtualization +--- + +import { Tab, Tabs } from 'fumadocs-ui/components/tabs'; +import { Callout } from 'fumadocs-ui/components/callout'; + +For development, testing, or air-gapped environments, you can run sandboxes locally instead of using Cua Cloud. 
+ +## Options Overview + +| Option | OS Support | Requirements | Best For | +| ------------------- | ----------------------- | ----------------------------- | -------------------------------- | +| **Docker** | Linux | Docker Desktop/Engine | Local development, fastest setup | +| **QEMU Docker** | Linux, Windows, Android | Docker + golden image | Testing specific OS versions | +| **Lume** | macOS | macOS host + Lume CLI | macOS automation | +| **Windows Sandbox** | Windows | Windows 10 Pro/Enterprise, 11 | Windows automation | + +## Linux on Docker (Recommended) + +The fastest way to get a local sandbox running. + +**1. Install Docker Desktop or Docker Engine** + +**2. Pull a Cua Docker image:** + +```bash +# XFCE (Lightweight) - recommended for most use cases +docker pull --platform=linux/amd64 trycua/cua-xfce:latest + +# OR KASM (Full-Featured) - full Ubuntu desktop +docker pull --platform=linux/amd64 trycua/cua-ubuntu:latest +``` + +**3. Connect with Python:** + +```python +from computer import Computer +import asyncio + +computer = Computer( + os_type="linux", + provider_type="docker", + image="trycua/cua-xfce:latest" +) + +async def main(): + await computer.run() # Launch & connect + + try: + screenshot = await computer.interface.screenshot() + await computer.interface.left_click(100, 100) + await computer.interface.type_text("Hello!") + finally: + await computer.disconnect() + +asyncio.run(main()) +``` + +## QEMU Docker + +Run full virtual machines (Linux, Windows, Android) inside Docker containers using QEMU virtualization. + + + Linux and Windows images require a **golden image preparation step** on first use. Android images + start directly. + + + + + +**1. Pull the QEMU Linux image:** + +```bash +docker pull trycua/cua-qemu-linux:latest +``` + +**2. Download Ubuntu 22.04 LTS Server ISO:** + +Download the [Ubuntu 22.04 Server ISO](https://releases.ubuntu.com/22.04/ubuntu-22.04.5-live-server-amd64.iso) (~2GB) + +**3. 
Create golden image:** + +```bash +docker run -it --rm \ + --device=/dev/kvm \ + --cap-add NET_ADMIN \ + --mount type=bind,source=/path/to/ubuntu-22.04.5-live-server-amd64.iso,target=/custom.iso \ + -v ~/cua-storage/linux:/storage \ + -p 8006:8006 \ + -p 5000:5000 \ + -e RAM_SIZE=8G \ + -e CPU_CORES=4 \ + -e DISK_SIZE=64G \ + trycua/cua-qemu-linux:latest +``` + +Monitor progress at [http://localhost:8006](http://localhost:8006). The container will install Ubuntu Desktop and shut down when complete. + +**4. Connect with Python:** + +```python +from computer import Computer +import asyncio + +computer = Computer( + os_type="linux", + provider_type="docker", + image="trycua/cua-qemu-linux:latest", + storage="~/cua-storage/linux", + run_opts={"devices": ["/dev/kvm"]}, +) + +async def main(): + await computer.run() + + try: + screenshot = await computer.interface.screenshot() + await computer.interface.left_click(100, 100) + finally: + await computer.disconnect() + +asyncio.run(main()) +``` + + + + +**1. Pull the QEMU Windows image:** + +```bash +docker pull trycua/cua-qemu-windows:latest +``` + +**2. Download Windows 11 Enterprise Evaluation ISO:** + +- Visit [Microsoft Evaluation Center](https://info.microsoft.com/ww-landing-windows-11-enterprise.html) +- Download **Windows 11 Enterprise Evaluation (90-day trial)** ISO (~6GB) + +**3. Create golden image:** + +```bash +docker run -it --rm \ + --device=/dev/kvm \ + --cap-add NET_ADMIN \ + --mount type=bind,source=/path/to/windows-11-enterprise-eval.iso,target=/custom.iso \ + -v ~/cua-storage/windows:/storage \ + -p 8006:8006 \ + -p 5000:5000 \ + -e RAM_SIZE=8G \ + -e CPU_CORES=4 \ + -e DISK_SIZE=64G \ + trycua/cua-qemu-windows:latest +``` + +Monitor progress at [http://localhost:8006](http://localhost:8006). The container will install Windows 11 and shut down when complete. + +**4. 
Connect with Python:** + +```python +from computer import Computer +import asyncio + +computer = Computer( + os_type="windows", + provider_type="docker", + image="trycua/cua-qemu-windows:latest", + storage="~/cua-storage/windows", + run_opts={"devices": ["/dev/kvm"]}, +) + +async def main(): + await computer.run() + + try: + screenshot = await computer.interface.screenshot() + await computer.interface.left_click(100, 100) + finally: + await computer.disconnect() + +asyncio.run(main()) +``` + + + + +No golden image preparation needed. + +**1. Pull the QEMU Android image:** + +```bash +docker pull trycua/cua-qemu-android:latest +``` + +**2. Connect with Python:** + +```python +from computer import Computer +import asyncio + +computer = Computer( + os_type="android", + provider_type="docker", + image="trycua/cua-qemu-android:latest", + timeout=150, # Emulator needs more time to boot + run_opts={ + "devices": ["/dev/kvm"], + "env": {"EMULATOR_DEVICE": "Samsung Galaxy S10"}, + }, +) + +async def main(): + await computer.run() + + try: + screenshot = await computer.interface.screenshot() + await computer.interface.left_click(100, 100) + finally: + await computer.disconnect() + +asyncio.run(main()) +``` + + + + +## macOS Sandbox (Lume) + +Run native macOS VMs using Apple's Virtualization framework. **macOS host required.** + +**1. Install the Lume CLI:** + +```bash +/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)" +``` + +**2. Start a macOS sandbox:** + +```bash +lume run macos-sequoia-cua:latest +``` + +**3. 
Connect with Python:** + +```python +from computer import Computer +import asyncio + +computer = Computer( + os_type="macos", + provider_type="lume", + name="macos-sequoia-cua:latest" +) + +async def main(): + await computer.run() + + try: + screenshot = await computer.interface.screenshot() + await computer.interface.left_click(100, 100) + finally: + await computer.disconnect() + +asyncio.run(main()) +``` + +## Windows Sandbox + +Use the native Windows Sandbox feature. **Windows 10 Pro/Enterprise or Windows 11 required.** + +**1. Enable Windows Sandbox:** + +Follow the [Microsoft guide](https://learn.microsoft.com/en-us/windows/security/application-security/application-isolation/windows-sandbox/windows-sandbox-install) to enable Windows Sandbox. + +**2. Install the dependency:** + +```bash +pip install -U git+https://github.com/karkason/pywinsandbox.git +``` + +**3. Connect with Python:** + +```python +from computer import Computer +import asyncio + +computer = Computer( + os_type="windows", + provider_type="windows_sandbox" +) + +async def main(): + await computer.run() + + try: + screenshot = await computer.interface.screenshot() + await computer.interface.left_click(100, 100) + finally: + await computer.disconnect() + +asyncio.run(main()) +``` + +## Next Steps + +- [Using the Computer SDK](/cua/guide/get-started/using-computer-sdk) - Full SDK reference with all sandbox types +- [Using the Agent SDK](/cua/guide/get-started/using-agent-sdk) - Add AI automation to your sandboxes diff --git a/docs/content/docs/cua/guide/get-started/what-is-cua.mdx b/docs/content/docs/cua/guide/get-started/what-is-cua.mdx index b25568f0..74b0c016 100644 --- a/docs/content/docs/cua/guide/get-started/what-is-cua.mdx +++ b/docs/content/docs/cua/guide/get-started/what-is-cua.mdx @@ -14,12 +14,16 @@ Cua is an open-source platform for building, benchmarking, and deploying agents Cua consists of three main components:
- Cua Architecture + Cua Architecture
### 1. Desktop Sandboxes -Isolated virtual environments where your agents can safely execute tasks. Cua supports: +Isolated virtual environments where your agents can safely execute tasks: - **Cloud Sandboxes** - Managed Linux, Windows, and macOS environments hosted by Cua - **Local Sandboxes** - Docker containers, QEMU VMs, macOS VMs via Lume, or Windows Sandbox on your own machine @@ -36,16 +40,60 @@ A unified SDK for controlling desktop environments programmatically: ### 3. Agent Framework -Build agents that see screens, click buttons, and complete tasks autonomously. Run isolated code execution environments for AI coding assistants like Claude Code, Codex CLI, or OpenCode. +Build agents that see screens, click buttons, and complete tasks autonomously: - **100+ vision-language model options** through Cua VLM Router or direct provider access - **Pre-built agent loops** optimized for computer-use tasks - **Composable architecture** for combining grounding and planning models - **Built-in telemetry** for monitoring agent performance -## Why Cua? +## How Computer-Use Agents Work -Cua provides isolated, reproducible environments for AI agents to operate safely: +Computer-use agents operate through a continuous loop: + +``` +┌─────────────────────────────────────────┐ +│ 1. OBSERVE │ +│ Take a screenshot of the screen │ +└──────────────────┬──────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────┐ +│ 2. UNDERSTAND │ +│ Vision-language model analyzes │ +│ the screenshot and current goal │ +└──────────────────┬──────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────┐ +│ 3. DECIDE │ +│ Determine the next action: │ +│ click, type, scroll, etc. │ +└──────────────────┬──────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────┐ +│ 4. 
ACT │ +│ Execute the action on the computer │ +└──────────────────┬──────────────────────┘ + │ + ▼ + Loop back to 1 +``` + +This cycle repeats until the agent completes its goal or determines it cannot proceed. + +## Sandbox Options + +| Sandbox Type | OS Support | Best For | API Key Required | +| ------------------- | ----------------------- | ---------------------------- | ---------------- | +| **Cloud** | Linux, Windows, macOS | Production, teams, CI/CD | Yes | +| **Docker** | Linux | Local development | No | +| **QEMU Docker** | Linux, Windows, Android | Testing specific OS versions | No | +| **Lume (macOS)** | macOS | macOS automation | No | +| **Windows Sandbox** | Windows | Windows automation | No | + +## Why Cua? - **Secure execution** - Run AI coding assistants and computer-use agents in sandboxed environments - **Self-hostable** - Deploy locally with Docker, QEMU, or Apple Virtualization @@ -54,8 +102,6 @@ Cua provides isolated, reproducible environments for AI agents to operate safely ## Use Cases -Cua is ideal for: - - **AI coding assistants** - Isolated code execution environments for Claude Code, Codex CLI, OpenCode, and other AI coding tools - **Computer-use agents** - Build agents that interact with any desktop application autonomously - **Workflow automation** - Automate repetitive tasks across any application @@ -65,4 +111,4 @@ Cua is ideal for: ## Getting Started -Ready to build your first agent? Continue to [Set Up a Sandbox](/cua/guide/get-started/set-up-sandbox) to set up your environment and run your first automation. +Ready to build your first agent? Continue to the [Quickstart](/cua/guide/get-started/quickstart) to get a computer-use agent running. 
diff --git a/docs/content/docs/cua/reference/agent-sdk/changelog.mdx b/docs/content/docs/cua/reference/agent-sdk/changelog.mdx new file mode 100644 index 00000000..7eea1428 --- /dev/null +++ b/docs/content/docs/cua/reference/agent-sdk/changelog.mdx @@ -0,0 +1,623 @@ +--- +title: Changelog +description: Release history for Agent SDK +--- + +{/* +AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY +Generated by: npx tsx scripts/docs-generators/generate-changelog.ts +Last updated: 2026-02-04 +*/} + +# Agent SDK Changelog + +All notable changes to the Agent SDK are documented here. + +## 0.7.x + +### v0.7.24 (2026-02-04) + +**Dependencies:** cua-computer: 0.5.12, cua-som: 0.1.3 + +- Initial release or no path-specific changes found + +### v0.7.22 (2026-01-28) + +**Dependencies:** cua-computer: 0.5.12, cua-som: 0.1.3 + +- Bump cua-agent to v0.7.22 by @github-actions[bot] + +### v0.7.21 (2026-01-26) + +**Dependencies:** cua-computer: 0.5.11, cua-som: 0.1.3 + +- Bump cua-agent to v0.7.21 by @github-actions[bot] + +### v0.7.18 (2026-01-22) + +**Dependencies:** cua-computer: 0.5.10, cua-som: 0.1.3 + +- Bump cua-agent to v0.7.18 by @github-actions[bot] +- fix(agent): accept 'computer_use' as valid function name for Fara model ([#865](https://github.com/trycua/cua/pull/865)) by @sarinali + +### v0.7.17 (2026-01-17) + +**Dependencies:** cua-computer: 0.5.7, cua-som: 0.1.3 + +- Bump cua-agent to v0.7.17 by @github-actions[bot] + +### v0.7.16 (2026-01-17) + +**Dependencies:** cua-computer: 0.5.7, cua-som: 0.1.3 + +- Bump cua-agent to v0.7.16 by @github-actions[bot] + +### v0.7.15 (2026-01-17) + +**Dependencies:** cua-computer: 0.5.7, cua-som: 0.1.3 + +- Bump cua-agent to v0.7.15 by @github-actions[bot] + +### v0.7.14 (2026-01-17) + +**Dependencies:** cua-computer: 0.5.6, cua-som: 0.1.3 + +- Bump cua-agent to v0.7.14 by @github-actions[bot] + +### v0.7.13 (2026-01-17) + +**Dependencies:** cua-computer: 0.5.6, cua-som: 0.1.3 + +- Bump cua-agent to v0.7.13 by @github-actions[bot] +- 
Fix/gradio 6.3 compatibility ([#810](https://github.com/trycua/cua/pull/810)) by @Weichen Zhang +- fix(agent): create screenshot_dir when trajectory_dir option is specified ([#813](https://github.com/trycua/cua/pull/813)) by @Harsh Verma +- fix(agent): only yield partial response if not empty ([#802](https://github.com/trycua/cua/pull/802)) by @Harsh Verma +- fix tool mismatch in uitars agent loop ([#640](https://github.com/trycua/cua/pull/640)) by @zju-lx + +### v0.7.12 (2026-01-13) + +**Dependencies:** cua-computer: 0.5.6, cua-som: 0.1.3 + +- Bump cua-agent to v0.7.12 by @github-actions[bot] +- pass browser tool ([#774](https://github.com/trycua/cua/pull/774)) by @Sarina Li +- feat(agent): add OpenTelemetry instrumentation callback ([#662](https://github.com/trycua/cua/pull/662)) by @r33drichards +- Fix Omniparser historical message conversion using per-screenshot mappings ([#706](https://github.com/trycua/cua/pull/706)) by @Fizza Mukhtar + +### v0.7.11 (2026-01-12) + +**Dependencies:** cua-computer: 0.5.6, cua-som: 0.1.3 + +- Bump cua-agent to v0.7.11 by @github-actions[bot] + +### v0.7.10 (2026-01-12) + +**Dependencies:** cua-computer: 0.5.6, cua-som: 0.1.3 + +- Bump cua-agent to v0.7.10 by @github-actions[bot] + +### v0.7.9 (2026-01-12) + +**Dependencies:** cua-computer: 0.5.5, cua-som: 0.1.3 + +- Bump cua-agent to v0.7.9 by @github-actions[bot] + +### v0.7.8 (2026-01-12) + +**Dependencies:** cua-computer: 0.5.4, cua-som: 0.1.3 + +- Bump cua-agent to v0.7.8 by @github-actions[bot] +- Bump cua-agent to v0.7.7 ([#781](https://github.com/trycua/cua/pull/781)) by @Francesco Bonacci +- Bump cua-agent to v0.7.7 ([#780](https://github.com/trycua/cua/pull/780)) by @Francesco Bonacci + +### v0.7.6 (2026-01-12) + +**Dependencies:** cua-computer: 0.5.1, cua-som: 0.1.3 + +- Bump cua-agent to v0.7.6 by @github-actions[bot] + +### v0.7.5 (2026-01-12) + +**Dependencies:** cua-computer: 0.5.1, cua-som: 0.1.3 + +- Bump cua-agent to v0.7.5 by @github-actions[bot] + +### v0.7.4 
(2026-01-12) + +**Dependencies:** cua-computer: 0.5.1, cua-som: 0.1.3 + +- Bump cua-agent to v0.7.4 by @github-actions[bot] +- refactor(docs): reorganize lume/cua docs and standardize READMEs ([#752](https://github.com/trycua/cua/pull/752)) by @Francesco Bonacci +- feat(lume,ci): add unattended VM setup and reorganize CI/CD workflows ([#729](https://github.com/trycua/cua/pull/729)) by @Francesco Bonacci +- Fix linting in cua-bench module ([#727](https://github.com/trycua/cua/pull/727)) by @ddupont +- Raise clear error when using Ollama models with ComputerAgent image inputs ([#711](https://github.com/trycua/cua/pull/711)) by @Fizza Mukhtar +- update dead links due to docs update by @Sarina Li +- add vm name to start/end by @Sarina Li + +## 0.4.x + +### v0.4.53 (2025-11-19) + +cua-agent is a general Computer-Use framework with liteLLM integration for running agentic workflows on macOS, Windows, and Linux sandboxes. It provides a unified interface for computer-use agents across multiple LLM providers with advanced callback system for extensibility. 
+ +- Jagjeevan's Fix + Merged Main for 4o Model Fix by @sarinali in https://github.com/trycua/cua/pull/522 +- fix: Added GPT-4o compatibility for screenshot actions with text parameter by @JagjeevanAK in https://github.com/trycua/cua/pull/422 +- Add test infrastructure with CI/CD #478 by @AceAtDev in https://github.com/trycua/cua/pull/491 +- Fix/omniparser predict refactor by @sarinali in https://github.com/trycua/cua/pull/529 +- Make VS Code Python interpreter path cross-platform by @skools-here in https://github.com/trycua/cua/pull/520 +- Fix: sanitize shell in subprocess calls by @AdityaBavadekar in https://github.com/trycua/cua/pull/519 +- [Agent] Add support for overriding api_base and api_url kwargs by @ddupont808 in https://github.com/trycua/cua/pull/504 +- Update model list and schedule daily test for agent testing by @YeIIcw in https://github.com/trycua/cua/pull/540 +- Add Claude Desktop Extension by @YeIIcw in https://github.com/trycua/cua/pull/521 +- [AGENT] - New Model Gelato-30B-A3B added by @tamoghnokandar in https://github.com/trycua/cua/pull/558 +- [AGENT] - New model UI-Ins added by @tamoghnokandar in https://github.com/trycua/cua/pull/549 +- Add "cua/" LLM provider by @ddupont808 in https://github.com/trycua/cua/pull/506 +- Add Local Desktop Mode for MCP Server with updated docs by @YeIIcw in https://github.com/trycua/cua/pull/493 +- Fix: Enable custom function tools with Anthropic models by @LucaStngn in https://github.com/trycua/cua/pull/548 +- Fix cua adapter, add UI-TARS-2 model by @ddupont808 in https://github.com/trycua/cua/pull/592 +- Fix cua adapter for anthropic models by @ddupont808 in https://github.com/trycua/cua/pull/601 +- Remove hud-python from cua-agent[all] extra by @f-trycua in https://github.com/trycua/cua/pull/603 +- Add automatic CUA_API_KEY environment variable support by @f-trycua in https://github.com/trycua/cua/pull/604 + +New Contributors + +- @AceAtDev made their first contribution in 
https://github.com/trycua/cua/pull/491 +- @skools-here made their first contribution in https://github.com/trycua/cua/pull/520 +- @tamoghnokandar made their first contribution in https://github.com/trycua/cua/pull/545 +- @dinmukhamedm made their first contribution in https://github.com/trycua/cua/pull/404 +- @LucaStngn made their first contribution in https://github.com/trycua/cua/pull/548 + +### v0.4.35 (2025-10-22) + +**Dependencies:** cua-computer: latest, cua-som: latest + +- Standardize Python version 3.12 across all packages by @AdityaBavadekar in https://github.com/trycua/cua/pull/500 +- Add support for all QwenVL models by @ddupont808 in https://github.com/trycua/cua/pull/501 + +### v0.4.34 (2025-10-17) + +**Dependencies:** cua-computer: latest, cua-som: latest + +- Fix torch dependency in moondream3 loop by @ddupont808 in https://github.com/trycua/cua/pull/482 + +New Contributors + +- @sarinali made their first contribution in https://github.com/trycua/cua/pull/477 + +### v0.4.33 (2025-10-15) + +**Dependencies:** cua-computer: latest, cua-som: latest + +- Stream MCP responses instead of buffering by @YeIIcw in https://github.com/trycua/cua/pull/427 +- Add support for Gemini CUA model by @ddupont808 in https://github.com/trycua/cua/pull/472 +- Add support for Claude Haiku 4.5 by @ddupont808 in https://github.com/trycua/cua/pull/474 +- Add bump2version configuration for all Python packages by @r33drichards in https://github.com/trycua/cua/pull/463 + +### v0.4.32 (2025-09-25) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.4.31 (2025-09-13) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.4.30 (2025-09-12) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.4.29 (2025-09-12) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.4.28 (2025-09-12) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.4.27 (2025-09-10) + +**Dependencies:** cua-computer: latest, cua-som: 
latest + +### v0.4.26 (2025-09-10) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.4.25 (2025-09-05) + +**Dependencies:** cua-computer: latest, cua-som: latest + +- Change HUD dataset name from `OSWorld-Verified-XLang` to `OSWorld-Verified` by @jamesmurdza in https://github.com/trycua/cua/pull/392 +- Fixed error when running agent with multimodal user inputs in the anthropic loop by @ddupont808 in https://github.com/trycua/cua/pull/394 +- Reference documentation batch by @onel in https://github.com/trycua/cua/pull/390 + +### v0.4.24 (2025-08-28) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.4.23 (2025-08-28) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.4.22 (2025-08-28) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.4.21 (2025-08-28) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.4.20 (2025-08-28) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.4.19 (2025-08-28) + +**Dependencies:** cua-computer: latest, cua-som: latest + +- Added screenshot_dir parameter +- Bugfix: Lazy loading of MLX by @ddupont808 in https://github.com/trycua/cua/pull/373 + +### v0.4.18 (2025-08-28) + +**Dependencies:** cua-computer: latest, cua-som: latest + +- Fix example code and notebooks by @jamesmurdza in https://github.com/trycua/cua/pull/364 +- Added a readme file to the notebooks folder by @onel in https://github.com/trycua/cua/pull/361 +- Restrict root project to Python versions below 3.14 by @jamesmurdza in https://github.com/trycua/cua/pull/363 +- Restored `mlx/` adapter by @ddupont808 in https://github.com/trycua/cua/pull/366 +- Upgrade HUD SDK to 0.4.12 by @ddupont808 in https://github.com/trycua/cua/pull/371 +- Added callback to normalize common tool call hallucinations seen during evals + +New Contributors + +- @onel made their first contribution in https://github.com/trycua/cua/pull/361 + +### v0.4.17 (2025-08-19) + +**Dependencies:** cua-computer: 
latest, cua-som: latest + +### v0.4.16 (2025-08-19) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.4.15 (2025-08-19) + +**Dependencies:** cua-computer: latest, cua-som: latest + +- Pin OpenAI version to work around BerriAI/litellm#13711 by @jamesmurdza in https://github.com/trycua/cua/pull/356 +- Upgrade Claude 3.5 snapshot in web app, examples and docs by @jamesmurdza in https://github.com/trycua/cua/pull/359 +- Update agent/computer SDKs to match changes in telemetry SDK by @jamesmurdza in https://github.com/trycua/cua/pull/355 + +### v0.4.14 (2025-08-18) + +**Dependencies:** cua-computer: latest, cua-som: latest + +- Upgrade Agent SDK to require Python 3.12 by @jamesmurdza in https://github.com/trycua/cua/pull/342 +- Fix UI-TARS predict_click always returning None by @ddupont808 in https://github.com/trycua/cua/pull/350 + +### v0.4.13 (2025-08-14) + +**Dependencies:** cua-computer: latest, cua-som: latest + +- [Agent] Add HUD evals, OSWorld-verified docs, and support for custom computers by @ddupont808 in https://github.com/trycua/cua/pull/334 + +### v0.4.12 (2025-08-12) + +**Dependencies:** cua-computer: latest, cua-som: latest + +- [Agent] Implement left_mouse_down, left_mouse_up, and tool errors by @ddupont808 in https://github.com/trycua/cua/pull/333 +- [Agent] Add GLM-4.5V support by @ddupont808 in https://github.com/trycua/cua/pull/337 + +### v0.4.11 (2025-08-07) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.4.10 (2025-08-06) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.4.9 (2025-08-06) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.4.8 (2025-08-05) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.4.7 (2025-08-05) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.4.6 (2025-08-04) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.4.5 (2025-07-29) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### 
v0.4.4 (2025-07-29) + +Bugfixes caused by `mouse_move` having the wrong name + +### v0.4.3 (2025-07-29) + +Bugfixes caused by `mouse_move` having the wrong name + +### v0.4.2 (2025-07-29) + +- [Agent] Fix Anthropic unexpected tool_use_id by @ddupont808 in https://github.com/trycua/cua/pull/325 + +### v0.4.1 (2025-07-28) + +Updated to use latest `cua-core` + +### v0.4.0 (2025-07-28) + +This update refactored the Agent SDK to make it easier to implement new features and support the release of new agent models/loops. + +Changelog: + +- Reworked agent loop, now all agent providers share a loop (Generate, Execute, Repeat), with the only difference between loops being the implementation of the Generate function +- Replaced LLM clients with LiteLLM, now all agent providers support any provider supported by LiteLLM +- Added 2 custom LiteLLM providers for local model inference on CUDA and MLX devices: `huggingface-local/`, `mlx/` +- Reworked callback system to have hooks at every step of the lifecycle +- Converted logging, trajectory saving, image retention into callbacks +- Added new callbacks - PII Anonymization (still a W.I.P) & budget management +- Anthropic providers - Added support for explicit prompt caching +- OpenAI providers - Added support for zero data retention +- Added Agent CLI for quick testing: `python -m agent.cli ` + +[Breaking Changes](https://docs-woad-phi.vercel.app/home/agent-sdk/migration-guide#breaking-changes) + +- **Initialization:** + - `ComputerAgent` (v0.4.x) uses `model` as a string (e.g. "anthropic/claude-3-5-sonnet-20241022") instead of `LLM` and `AgentLoop` objects. + - `tools` is a list (can include multiple computers and decorated functions). + - `callbacks` are now first-class for extensibility (image retention, budget, trajectory, logging, etc). +- **No explicit `loop` parameter:** + - Loop is inferred from the `model` string (e.g. `anthropic/`, `openai/`, `omniparser+`, `ui-tars`). 
+- **No explicit `computer` parameter:** + - Computers are added to `tools` list. + +Install + +```bash +Before merge: +pip install --pre "cua-agent[all]==0.4.0b4" + +After merge: +pip install "cua-agent[all]" + +or install specific providers +pip install "cua-agent[openai]" # OpenAI computer-use-preview support +pip install "cua-agent[anthropic]" # Anthropic Claude support +pip install "cua-agent[omni]" # Omniparser + any LLM support +pip install "cua-agent[uitars]" # UI-TARS +pip install "cua-agent[uitars-mlx]" # UI-TARS + MLX support +pip install "cua-agent[uitars-hf]" # UI-TARS + Huggingface support +pip install "cua-agent[ui]" # Gradio UI support +``` + +Supported Models + +Anthropic Claude (Computer Use API) + +```python +model="anthropic/claude-3-5-sonnet-20241022" +model="anthropic/claude-3-5-sonnet-20240620" +model="anthropic/claude-opus-4-20250514" +model="anthropic/claude-sonnet-4-20250514" +``` + +OpenAI Computer Use Preview + +```python +model="openai/computer-use-preview" +``` + +UI-TARS (Local or Huggingface Inference) + +```python +model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B" +model="ollama_chat/0000/ui-tars-1.5-7b" +``` + +Omniparser + Any LLM + +```python +model="omniparser+ollama_chat/mistral-small3.2" +model="omniparser+vertex_ai/gemini-pro" +model="omniparser+anthropic/claude-3-5-sonnet-20241022" +model="omniparser+openai/gpt-4o" +``` + +## 0.3.x + +### v0.3.2 (2025-07-15) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.3.1 (2025-07-01) + +**Dependencies:** cua-computer: latest, cua-som: latest + +## 0.2.x + +### v0.2.15 (2025-06-25) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.2.14 (2025-06-24) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.2.13 (2025-06-24) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.2.12 (2025-06-20) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.2.11 (2025-06-18) + +**Dependencies:** cua-computer: 
latest, cua-som: latest + +### v0.2.10 (2025-06-10) + +**Dependencies:** cua-computer: latest, cua-som: latest + +- Fixed image retention bug by @ddupont808 in https://github.com/trycua/cua/pull/282 +- Removed `torch` requirement from `cua-agent/core` and `cua-agent/anthropic` by @ddupont808 in https://github.com/trycua/cua/pull/285 + +### v0.2.9 (2025-06-05) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.2.7 (2025-06-05) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.2.6 (2025-05-28) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.2.5 (2025-05-28) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.2.4 (2025-05-28) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.2.3 (2025-05-28) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.2.2 (2025-05-28) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.2.1 (2025-05-16) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.2.0 (2025-05-14) + +**Dependencies:** cua-computer: latest, cua-som: latest + +## 0.1.x + +### v0.1.44 (2025-05-13) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.43 (2025-05-12) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.42 (2025-05-11) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.41 (2025-05-11) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.40 (2025-05-11) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.39 (2025-05-11) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.38 (2025-05-11) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.37 (2025-05-10) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.35 (2025-05-10) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.34 (2025-05-10) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### 
v0.1.33 (2025-05-10) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.32 (2025-05-02) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.31 (2025-04-29) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.30 (2025-04-24) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.29 (2025-04-22) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.28 (2025-04-15) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.27 (2025-04-15) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.26 (2025-04-14) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.25 (2025-04-06) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.24 (2025-04-06) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.23 (2025-04-06) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.22 (2025-04-04) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.21 (2025-04-02) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.20 (2025-03-30) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.19 (2025-03-30) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.18 (2025-03-30) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.17 (2025-03-24) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.6 (2025-03-19) + +**Dependencies:** cua-computer: latest, cua-som: latest + +### v0.1.5 (2025-03-17) + +**Dependencies:** cua-computer: latest, cua-som: latest diff --git a/docs/content/docs/cua/reference/agent-sdk/index.mdx b/docs/content/docs/cua/reference/agent-sdk/index.mdx index 60fa6a92..ae458d59 100644 --- a/docs/content/docs/cua/reference/agent-sdk/index.mdx +++ b/docs/content/docs/cua/reference/agent-sdk/index.mdx @@ -1,584 +1,1348 @@ --- -title: Agent SDK -description: Python API reference for the 
Agent SDK +title: Agent SDK API Reference +description: Python API reference for building computer-use agents --- -import { Tabs, Tab } from 'fumadocs-ui/components/tabs'; +{/* + AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY + Generated by: npx tsx scripts/docs-generators/python-sdk.ts + Source: libs/python/agent/agent + Version: 0.7.24 +*/} + import { Callout } from 'fumadocs-ui/components/callout'; +import { Tabs, Tab } from 'fumadocs-ui/components/tabs'; +import { VersionHeader } from '@/components/version-selector'; -The **Agent SDK** (`cua-agent`) provides the Python interface for building computer-use agents. This reference covers the ComputerAgent class, callbacks, tools, and response types. + -## Installation +agent - Decorator-based Computer Use Agent with liteLLM integration -```bash -pip install cua-agent -``` +## Classes + +| Class | Description | +|-------|-------------| +| [`ComputerAgent`](#computeragent) | Main agent class that automatically selects the appropriate agent loop | + +## Functions + +| Function | Description | +|----------|-------------| +| [`register_agent`](#register_agent) | Decorator to register an AsyncAgentConfig class. | + +--- ## ComputerAgent -The main class for creating agents that can autonomously operate computers. +Main agent class that automatically selects the appropriate agent loop +based on the model and executes tool calls. 
+ +### Constructor ```python -from agent import ComputerAgent - -agent = ComputerAgent( - model="anthropic/claude-sonnet-4-5-20250929", - tools=[computer] -) - -async for result in agent.run("Open Firefox and search for Cua"): - print(result.text) +ComputerAgent(self, model: str, tools: Optional[List[Any]] = None, custom_loop: Optional[Callable] = None, only_n_most_recent_images: Optional[int] = None, callbacks: Optional[List[Any]] = None, instructions: Optional[str] = None, verbosity: Optional[int] = None, trajectory_dir: Optional[str | Path | dict] = None, max_retries: Optional[int] = 3, screenshot_delay: Optional[float | int] = 0.5, use_prompt_caching: Optional[bool] = False, max_trajectory_budget: Optional[float | dict] = None, telemetry_enabled: Optional[bool] = True, trust_remote_code: Optional[bool] = False, api_key: Optional[str] = None, api_base: Optional[str] = None, additional_generation_kwargs = {}) ``` -### Constructor Parameters +### Attributes -| Parameter | Type | Default | Description | -| --------------------------- | --------------- | ---------------- | ----------------------------------------------------------- | -| `model` | `str` | Required | Model identifier (see [VLMs](/cua/guide/fundamentals/vlms)) | -| `tools` | `list` | Required | Tools the agent can use | -| `api_key` | `str` | Provider env var | API key for the model provider | -| `callbacks` | `list` | `[]` | List of callback handlers | -| `instructions` | `str` | `None` | Custom instructions prepended to prompts | -| `verbosity` | `int` | `None` | Logging level (e.g., `logging.INFO`) | -| `max_trajectory_budget` | `float` | `None` | Maximum cost in dollars | -| `only_n_most_recent_images` | `int` | `None` | Limit retained screenshots | -| `trajectory_dir` | `str` \| `dict` | `None` | Directory for saving trajectories | +| Name | Type | Description | +|------|------|-------------| +| `model` | `Any` | | +| `tools` | `Any` | | +| `custom_loop` | `Any` | | +| `only_n_most_recent_images` 
| `Any` | | +| `callbacks` | `Any` | | +| `instructions` | `Any` | | +| `verbosity` | `Any` | | +| `trajectory_dir` | `Any` | | +| `max_retries` | `Any` | | +| `screenshot_delay` | `Any` | | +| `use_prompt_caching` | `Any` | | +| `telemetry_enabled` | `Any` | | +| `kwargs` | `Any` | | +| `trust_remote_code` | `Any` | | +| `api_key` | `Any` | | +| `api_base` | `Any` | | +| `agent_loop` | `Any` | | +| `agent_config_info` | `Any` | | +| `tool_schemas` | `Any` | | +| `computer_handler` | `Any` | | ### Methods -#### `run(task, chat_history)` - -Execute a task autonomously. Returns an async generator of results. +#### ComputerAgent.run ```python -async for result in agent.run("Click the submit button"): - if result.text: - print(result.text) - if result.action: - print(f"Action: {result.action}") +async def run(self, messages: Messages, stream: bool = False, api_key: Optional[str] = None, api_base: Optional[str] = None, additional_generation_kwargs = {}) -> AsyncGenerator[Dict[str, Any], None] ``` -| Parameter | Type | Default | Description | -| -------------- | ------ | -------- | --------------------------------- | -| `task` | `str` | Required | Natural language task description | -| `chat_history` | `list` | `[]` | Previous conversation messages | +Run the agent with the given messages using Computer protocol handler pattern. -**Yields:** `AgentResult` objects containing model responses and actions. +**Parameters:** -#### `run_to_completion(task, chat_history)` +| Name | Type | Description | +|------|------|-------------| +| `messages` | `Any` | List of message dictionaries | +| `stream` | `Any` | Whether to stream the response | +| `api_key` | `Any` | Optional API key override for the model provider | +| `api_base` | `Any` | Optional API base URL override for the model provider. `**additional_generation_kwargs`: Additional arguments passed to the model provider | -Execute a task and return only the final result. 
+**Returns:** AsyncGenerator that yields response chunks + +#### ComputerAgent.predict_click ```python -result = await agent.run_to_completion("Open the calculator app") -print(result.text) +async def predict_click(self, instruction: str, image_b64: Optional[str] = None) -> Optional[Tuple[int, int]] ``` -**Returns:** `AgentResult` - The final result after task completion. +Predict click coordinates based on image and instruction. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `instruction` | `Any` | Instruction for where to click | +| `image_b64` | `Any` | Base64 encoded image (optional, will take screenshot if not provided) | + +**Returns:** None or tuple with (x, y) coordinates + +#### ComputerAgent.get_capabilities + +```python +def get_capabilities(self) -> List[AgentCapability] +``` + +Get list of capabilities supported by the current agent config. + +**Returns:** List of capability strings (e.g., ["step", "click"]) + +#### ComputerAgent.open + +```python +def open(self, port: Optional[int] = None) +``` + +Start the playground server and open it in the browser. + +This method starts a local HTTP server that exposes the /responses endpoint +and automatically opens the Cua playground interface in the default browser. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `port` | `Any` | Port to run the server on. If None, finds an available port automatically. | + +**Example:** + +```python +>>> agent = ComputerAgent(model="claude-sonnet-4") +>>> agent.open() # Starts server and opens browser +``` + +## register_agent + +```python +def register_agent(models: str, priority: int = 0, tool_type: Optional[str] = None) +``` + +Decorator to register an AsyncAgentConfig class. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `models` | `Any` | Regex pattern to match supported models | +| `priority` | `Any` | Priority for agent selection (higher = more priority) | +| `tool_type` | `Any` | Required tool type for this model ("browser" \| "mobile" \| None). Specialized models (like FARA) declare their required tool type, and ComputerAgent will auto-wrap tools accordingly. General models (like Claude) leave this as None for full flexibility. | --- -## AgentResult +## types -Returned by `agent.run()` for each iteration of the agent loop. +Type definitions for agent + +--- + +## ToolError + +*Inherits from: RuntimeError* + +Base exception for tool-related errors + +--- + +## IllegalArgumentError + +*Inherits from: ToolError* + +Exception raised when function arguments are invalid + +--- + +## AgentConfigInfo + +*Inherits from: BaseModel* + +Information about a registered agent config + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `agent_class` | `type` | | +| `models_regex` | `str` | | +| `priority` | `int` | | +| `tool_type` | `Optional[str]` | | + +### Methods + +#### AgentConfigInfo.matches_model ```python -async for result in agent.run("Search for documentation"): - # Check what happened this iteration - if result.text: - print(f"Agent said: {result.text}") - - if result.action: - print(f"Action type: {result.action.type}") - - if result.usage: - print(f"Tokens used: {result.usage.total_tokens}") +def matches_model(self, model: str) -> bool ``` -### Properties +Check if this agent config matches the given model -| Property | Type | Description | -| ------------ | ------------------- | ------------------------------------- | -| `text` | `str \| None` | Text response from the model | -| `action` | `Action \| None` | Computer action taken | -| `screenshot` | `PIL.Image \| None` | Screenshot after action | -| `usage` | `Usage \| None` | Token and cost information | -| 
`error` | `str \| None` | Error message if action failed | -| `done` | `bool` | True if agent considers task complete | +--- -### Action Types +## tools -The `action` property contains details about computer actions: +Agent tools module. +Provides base classes and registered tools for agent interactions. + +--- + +## BaseComputerTool + +*Inherits from: BaseTool* + +Base class for computer tools that can provide screenshots. + +Computer tools must implement: +- All BaseTool requirements (name, description, parameters, call) +- screenshot() method that returns screenshot as base64 string + +### Methods + +#### BaseComputerTool.screenshot ```python -result.action.type # "click", "type", "key", "scroll", etc. -result.action.coordinate # [x, y] for click actions -result.action.text # Text for type actions -result.action.key # Key for key press actions +async def screenshot(self) -> str +``` + +Take a screenshot of the computer/browser. + +**Returns:** Screenshot image data as base64-encoded string + +--- + +## BaseTool + +*Inherits from: ABC* + +Base class for all agent tools. + +Tools must implement: +- name: str - The tool name (set by @register_tool decorator) +- description: property that returns str - Tool description +- parameters: property that returns dict - JSON schema for tool parameters +- call: method - Execute the tool with given parameters + +### Constructor + +```python +BaseTool(self, cfg: Optional[dict] = None) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `name` | `str` | | +| `cfg` | `Any` | | +| `description` | `str` | Return the tool description. | +| `parameters` | `dict` | Return the JSON schema for tool parameters. | +| `function` | `dict` | Return the function information for this tool. | + +### Methods + +#### BaseTool.call + +```python +def call(self, params: Union[str, dict], kwargs = {}) -> Union[str, list, dict] +``` + +Execute the tool with the given parameters. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `params` | `Any` | The parameters for the tool call (JSON string or dict). `**kwargs`: Additional keyword arguments | + +**Returns:** The result of the tool execution + +--- + +## BrowserTool + +*Inherits from: BaseComputerTool* + +Browser tool that uses the computer SDK's interface to control a browser. +Implements a comprehensive computer_use action interface for browser control. + +### Constructor + +```python +BrowserTool(self, interface: GenericComputerInterface, cfg: Optional[dict] = None) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `interface` | `Any` | | +| `viewport_width` | `Any` | | +| `viewport_height` | `Any` | | +| `resized_width` | `Any` | | +| `resized_height` | `Any` | | +| `automation` | `Any` | Get the automation interface for keyboard/mouse actions. | +| `description` | `str` | | +| `parameters` | `dict` | | + +### Methods + +#### BrowserTool.call + +```python +def call(self, params: Union[str, dict], kwargs = {}) -> Union[str, dict] +``` + +Execute a browser action. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `params` | `Any` | Action parameters (JSON string or dict). `**kwargs`: Additional keyword arguments | + +**Returns:** Result of the action execution + +#### BrowserTool.visit_url + +```python +async def visit_url(self, url: str) -> dict +``` + +Navigate to a URL. + +#### BrowserTool.click + +```python +async def click(self, x: int = None, y: int = None, button: str = 'left', kwargs = {}) -> dict +``` + +Click at coordinates. Supports both positional (x, y) and kwargs (button, x, y). + +This is compatible with the normalized format from OperatorNormalizerCallback +which transforms actions like \{"type": "left_click", "coordinate": [x, y]\} +into \{"type": "click", "button": "left", "x": x, "y": y\}. 
+ +#### BrowserTool.type + +```python +async def type(self, text: str) -> dict +``` + +Type text into the focused element. + +#### BrowserTool.scroll + +```python +async def scroll(self, delta_x: int = None, delta_y: int = None, scroll_x: int = None, scroll_y: int = None, x: int = None, y: int = None, pixels: int = None, coordinate = None, kwargs = {}) -> dict +``` + +Scroll the page. Supports multiple formats: +- Legacy: scroll(delta_x, delta_y) +- Normalized: scroll(scroll_x=0, scroll_y=100, x=500, y=300) +- FARA: scroll(pixels=100, coordinate=[500, 300]) + +#### BrowserTool.web_search + +```python +async def web_search(self, query: str) -> dict +``` + +Navigate to a Google search for the query. + +#### BrowserTool.screenshot + +```python +async def screenshot(self) -> str +``` + +Take a screenshot of the current browser page. + +#### BrowserTool.get_current_url + +```python +async def get_current_url(self) -> str +``` + +Get the current URL of the browser page. + +#### BrowserTool.left_click + +```python +async def left_click(self, coordinate = None, x: int = None, y: int = None, kwargs = {}) -> dict +``` + +Left click at coordinates. Supports coordinate array or x/y kwargs. + +#### BrowserTool.right_click + +```python +async def right_click(self, coordinate = None, x: int = None, y: int = None, kwargs = {}) -> dict +``` + +Right click at coordinates. Supports coordinate array or x/y kwargs. + +#### BrowserTool.middle_click + +```python +async def middle_click(self, coordinate = None, x: int = None, y: int = None, kwargs = {}) -> dict +``` + +Middle click at coordinates. Supports coordinate array or x/y kwargs. + +#### BrowserTool.double_click + +```python +async def double_click(self, coordinate = None, x: int = None, y: int = None, kwargs = {}) -> dict +``` + +Double click at coordinates. Supports coordinate array or x/y kwargs. 
+ +#### BrowserTool.triple_click + +```python +async def triple_click(self, coordinate = None, x: int = None, y: int = None, button: str = None, kwargs = {}) -> dict +``` + +Triple click at coordinates. Supports coordinate array or x/y kwargs. + +#### BrowserTool.mouse_move + +```python +async def mouse_move(self, coordinate = None, x: int = None, y: int = None, kwargs = {}) -> dict +``` + +Move mouse to coordinates. Supports coordinate array or x/y kwargs. + +#### BrowserTool.move + +```python +async def move(self, x: int = None, y: int = None, kwargs = {}) -> dict +``` + +Move mouse to coordinates. Alias for mouse_move with x/y kwargs. + +#### BrowserTool.left_click_drag + +```python +async def left_click_drag(self, coordinate = None, start_coordinate = None, end_coordinate = None, kwargs = {}) -> dict +``` + +Drag from start to end coordinates. FARA-compatible. + +#### BrowserTool.key + +```python +async def key(self, keys = None, kwargs = {}) -> dict +``` + +Press keys. FARA-compatible. + +#### BrowserTool.keypress + +```python +async def keypress(self, keys = None, kwargs = {}) -> dict +``` + +Press keys. Alias for key() - used by OperatorNormalizerCallback. + +#### BrowserTool.hscroll + +```python +async def hscroll(self, pixels = None, coordinate = None, kwargs = {}) -> dict +``` + +Horizontal scroll. FARA-compatible. + +#### BrowserTool.wait + +```python +async def wait(self, time = None, kwargs = {}) -> dict +``` + +Wait for specified seconds. FARA-compatible. + +#### BrowserTool.history_back + +```python +async def history_back(self, kwargs = {}) -> dict +``` + +Go back in browser history. FARA-compatible. + +#### BrowserTool.terminate + +```python +async def terminate(self, status = None, kwargs = {}) -> dict +``` + +Terminate and report status. FARA-compatible. + +### get_registered_tools + +```python +def get_registered_tools() -> Dict[str, type] +``` + +Get all registered tools. 
+ +**Returns:** Dictionary mapping tool names to tool classes + +### get_tool + +```python +def get_tool(name: str) -> Optional[type] +``` + +Get a registered tool by name. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `name` | `Any` | The tool name | + +**Returns:** The tool class, or None if not found + +### register_tool + +```python +def register_tool(name: str, allow_overwrite: bool = False) +``` + +Decorator to register a tool class with a given name. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `name` | `Any` | The name to register the tool under | +| `allow_overwrite` | `Any` | Whether to allow overwriting an existing tool with the same name | + +**Returns:** Decorator function that registers the class + +**Example:** + +```python +@register_tool("my_tool") +class MyTool(BaseTool): + ... ``` --- -## Usage Tracking +## callbacks -The `Usage` object tracks token consumption and costs. - -```python -async for result in agent.run("Complete this form"): - if result.usage: - print(f"Input tokens: {result.usage.input_tokens}") - print(f"Output tokens: {result.usage.output_tokens}") - print(f"Total tokens: {result.usage.total_tokens}") - print(f"Cost: ${result.usage.response_cost:.4f}") -``` - -### Properties - -| Property | Type | Description | -| ----------------------------- | ------- | ----------------------- | -| `input_tokens` | `int` | Tokens in the prompt | -| `output_tokens` | `int` | Tokens in the response | -| `total_tokens` | `int` | Total tokens used | -| `response_cost` | `float` | Cost in dollars | -| `cache_creation_input_tokens` | `int` | Tokens written to cache | -| `cache_read_input_tokens` | `int` | Tokens read from cache | +Callback system for ComputerAgent preprocessing and postprocessing hooks. --- -## Callbacks +## AsyncCallbackHandler -Callbacks hook into the agent lifecycle for logging, cost tracking, and custom behavior. 
+*Inherits from: ABC* -### Built-in Callbacks +Base class for async callback handlers that can preprocess messages before +the agent loop and postprocess output after the agent loop. -#### LoggingCallback +### Methods -Log agent events with configurable verbosity. +#### AsyncCallbackHandler.on_run_start ```python -from agent.callbacks import LoggingCallback -import logging - -agent = ComputerAgent( - model="...", - tools=[computer], - callbacks=[LoggingCallback(level=logging.DEBUG)] -) +async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None ``` -| Parameter | Type | Default | Description | -| --------- | ----- | -------------- | ------------- | -| `level` | `int` | `logging.INFO` | Logging level | +Called at the start of an agent run loop. -#### BudgetManagerCallback - -Track costs and stop when budget is exceeded. +#### AsyncCallbackHandler.on_run_end ```python -from agent.callbacks import BudgetManagerCallback - -agent = ComputerAgent( - model="...", - tools=[computer], - callbacks=[BudgetManagerCallback( - max_budget=10.0, - reset_after_each_run=True, - raise_error=False - )] -) +async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None ``` -| Parameter | Type | Default | Description | -| ---------------------- | ------- | -------- | -------------------------------- | -| `max_budget` | `float` | Required | Maximum cost in dollars | -| `reset_after_each_run` | `bool` | `True` | Reset budget per run | -| `raise_error` | `bool` | `False` | Raise exception vs graceful stop | +Called at the end of an agent run loop. -#### ImageRetentionCallback - -Limit screenshot history to prevent context overflow. 
+#### AsyncCallbackHandler.on_run_continue ```python -from agent.callbacks import ImageRetentionCallback - -agent = ComputerAgent( - model="...", - tools=[computer], - callbacks=[ImageRetentionCallback(only_n_most_recent_images=3)] -) +async def on_run_continue(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> bool ``` -| Parameter | Type | Default | Description | -| --------------------------- | ----- | -------- | ------------------------- | -| `only_n_most_recent_images` | `int` | Required | Max screenshots to retain | +Called during agent run loop to determine if execution should continue. -#### TrajectorySaverCallback +**Parameters:** -Save complete agent conversations for debugging. +| Name | Type | Description | +|------|------|-------------| +| `kwargs` | `Any` | Run arguments | +| `old_items` | `Any` | Original messages | +| `new_items` | `Any` | New messages generated during run | + +**Returns:** True to continue execution, False to stop + +#### AsyncCallbackHandler.on_llm_start ```python -from agent.callbacks import TrajectorySaverCallback - -agent = ComputerAgent( - model="...", - tools=[computer], - callbacks=[TrajectorySaverCallback( - trajectory_dir="trajectories", - reset_on_run=True, - screenshot_dir="screenshots" - )] -) +async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]] ``` -| Parameter | Type | Default | Description | -| ---------------- | ------ | -------- | ---------------------------------- | -| `trajectory_dir` | `str` | Required | Base directory for trajectories | -| `reset_on_run` | `bool` | `True` | Create new trajectory per run | -| `screenshot_dir` | `str` | `None` | Separate directory for screenshots | +Called before messages are sent to the agent loop. -#### PromptInstructionsCallback +**Parameters:** -Prepend custom instructions to every LLM call. 
+| Name | Type | Description | +|------|------|-------------| +| `messages` | `Any` | List of message dictionaries to preprocess | + +**Returns:** List of preprocessed message dictionaries + +#### AsyncCallbackHandler.on_llm_end ```python -from agent.callbacks import PromptInstructionsCallback - -agent = ComputerAgent( - model="...", - tools=[computer], - callbacks=[PromptInstructionsCallback("Always confirm before clicking")] -) +async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]] ``` -### Creating Custom Callbacks +Called after the agent loop returns output. -Extend `AsyncCallbackHandler` to create custom callbacks: +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `output` | `Any` | List of output message dictionaries to postprocess | + +**Returns:** List of postprocessed output dictionaries + +#### AsyncCallbackHandler.on_computer_call_start ```python -from agent.callbacks.base import AsyncCallbackHandler - -class MyCallback(AsyncCallbackHandler): - async def on_run_start(self, kwargs, old_items): - """Called when agent.run() begins""" - print("Starting run...") - - async def on_run_continue(self, kwargs, old_items, new_items) -> bool: - """Called before each iteration. 
Return False to stop.""" - return True - - async def on_llm_start(self, messages): - """Preprocess messages before LLM call.""" - return messages - - async def on_llm_end(self, messages): - """Postprocess messages after LLM call.""" - return messages - - async def on_usage(self, usage): - """Called with usage stats after each LLM call.""" - print(f"Cost: ${usage.response_cost:.4f}") - - async def on_computer_call_start(self, item): - """Called before a computer action.""" - pass - - async def on_computer_call_end(self, item, result): - """Called after a computer action.""" - pass - - async def on_screenshot(self, screenshot, name): - """Called when a screenshot is taken.""" - pass - - async def on_run_end(self, kwargs, old_items, new_items): - """Called when agent.run() completes.""" - print("Run complete!") +async def on_computer_call_start(self, item: Dict[str, Any]) -> None ``` -### Callback Lifecycle Order +Called when a computer call is about to start. -1. `on_run_start` - Once at the beginning -2. For each iteration: - - `on_run_continue` - Check if should continue - - `on_llm_start` - Before LLM call - - `on_api_start` - Before API request - - `on_api_end` - After API response - - `on_usage` - With usage stats - - `on_llm_end` - After LLM processing - - `on_responses` - With model responses - - `on_text` / `on_computer_call_start` / `on_computer_call_end` - Per response item - - `on_screenshot` - When screenshots are taken -3. `on_run_end` - Once at the end +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `item` | `Any` | The computer call item dictionary | + +#### AsyncCallbackHandler.on_computer_call_end + +```python +async def on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None +``` + +Called when a computer call has completed. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `item` | `Any` | The computer call item dictionary | +| `result` | `Any` | The result of the computer call | + +#### AsyncCallbackHandler.on_function_call_start + +```python +async def on_function_call_start(self, item: Dict[str, Any]) -> None +``` + +Called when a function call is about to start. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `item` | `Any` | The function call item dictionary | + +#### AsyncCallbackHandler.on_function_call_end + +```python +async def on_function_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None +``` + +Called when a function call has completed. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `item` | `Any` | The function call item dictionary | +| `result` | `Any` | The result of the function call | + +#### AsyncCallbackHandler.on_text + +```python +async def on_text(self, item: Dict[str, Any]) -> None +``` + +Called when a text message is encountered. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `item` | `Any` | The message item dictionary | + +#### AsyncCallbackHandler.on_api_start + +```python +async def on_api_start(self, kwargs: Dict[str, Any]) -> None +``` + +Called when an API call is about to start. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `kwargs` | `Any` | The kwargs being passed to the API call | + +#### AsyncCallbackHandler.on_api_end + +```python +async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None +``` + +Called when an API call has completed. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `kwargs` | `Any` | The kwargs that were passed to the API call | +| `result` | `Any` | The result of the API call | + +#### AsyncCallbackHandler.on_usage + +```python +async def on_usage(self, usage: Dict[str, Any]) -> None +``` + +Called when usage information is received. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `usage` | `Any` | The usage information | + +#### AsyncCallbackHandler.on_screenshot + +```python +async def on_screenshot(self, screenshot: Union[str, bytes], name: str = 'screenshot') -> None +``` + +Called when a screenshot is taken. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `screenshot` | `Any` | The screenshot image | +| `name` | `Any` | The name of the screenshot | + +#### AsyncCallbackHandler.on_responses + +```python +async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None +``` + +Called when responses are received. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `kwargs` | `Any` | The kwargs being passed to the agent loop | +| `responses` | `Any` | The responses received | --- -## Tools +## BudgetManagerCallback -### Built-in Tools +*Inherits from: AsyncCallbackHandler* -#### Computer +Budget manager callback that tracks usage costs and can stop execution when budget is exceeded. -The primary tool for full computer control. +### Constructor ```python -from computer import Computer - -computer = Computer( - os_type="linux", - provider_type="docker", - image="trycua/cua-xfce:latest" -) -await computer.run() - -agent = ComputerAgent( - model="anthropic/claude-sonnet-4-5-20250929", - tools=[computer] -) +BudgetManagerCallback(self, max_budget: float, reset_after_each_run: bool = True, raise_error: bool = False) ``` -#### BrowserTool +### Attributes -Specialized tool for web automation with browser-only models. 
+| Name | Type | Description | +|------|------|-------------| +| `max_budget` | `Any` | | +| `reset_after_each_run` | `Any` | | +| `raise_error` | `Any` | | +| `total_cost` | `Any` | | + +### Methods + +#### BudgetManagerCallback.on_run_start ```python -from agent.tools import BrowserTool - -browser = BrowserTool(interface=computer) - -agent = ComputerAgent( - model="google/gemini-2.5-flash", - tools=[browser] -) +async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None ``` -See [Browser Tool](/cua/guide/fundamentals/browser-tool) for available actions. +Reset budget if configured to do so. -### Custom Function Tools - -Add Python functions as tools: +#### BudgetManagerCallback.on_usage ```python -def calculate(a: int, b: int) -> int: - """Calculate the sum of two integers""" - return a + b - -async def fetch_data(url: str) -> str: - """Fetch data from a URL""" - async with httpx.AsyncClient() as client: - response = await client.get(url) - return response.text - -agent = ComputerAgent( - model="anthropic/claude-sonnet-4-5-20250929", - tools=[computer, calculate, fetch_data] -) +async def on_usage(self, usage: Dict[str, Any]) -> None ``` -### Sandboxed Tools +Track usage costs. 
-Run tools inside the sandbox with the `@sandboxed` decorator: +#### BudgetManagerCallback.on_run_continue ```python -from computer.helpers import sandboxed - -@sandboxed() -def read_sandbox_file(path: str) -> str: - """Read a file from inside the sandbox""" - with open(path, 'r') as f: - return f.read() - -agent = ComputerAgent( - model="anthropic/claude-sonnet-4-5-20250929", - tools=[computer, read_sandbox_file] -) +async def on_run_continue(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> bool ``` -### BaseTool Class - -For full control over tool schema: - -```python -from agent.tools import BaseTool, register_tool - -@register_tool("database_query") -class DatabaseQueryTool(BaseTool): - def __init__(self, connection_string: str): - self.connection = connection_string - - @property - def description(self) -> str: - return "Execute a read-only SQL query" - - @property - def parameters(self) -> dict: - return { - "type": "object", - "properties": { - "query": { - "type": "string", - "description": "SQL SELECT query to execute" - } - }, - "required": ["query"] - } - - def call(self, params, **kwargs): - query = params["query"] if isinstance(params, dict) else params - # Execute and return results - return {"rows": [...]} -``` - -### ToolError - -Raise `ToolError` for recoverable errors: - -```python -from agent.tools import ToolError - -def divide(a: float, b: float) -> float: - """Divide a by b""" - if b == 0: - raise ToolError("Cannot divide by zero") - return a / b -``` - -The model sees the error message and can adjust its approach. +Check if budget allows continuation. --- -## Model Providers +## ImageRetentionCallback -### Model Format +*Inherits from: AsyncCallbackHandler* -Models are specified as `provider/model-name`: +Callback handler that applies image retention policy to limit the number +of recent images in message history to prevent context window overflow. 
+ +### Constructor ```python -# Anthropic -agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929", ...) - -# OpenAI -agent = ComputerAgent(model="openai/computer-use-preview", ...) - -# Google -agent = ComputerAgent(model="google/gemini-2.5-flash", ...) - -# Local models via Ollama -agent = ComputerAgent(model="ollama/ui-tars:latest", ...) +ImageRetentionCallback(self, only_n_most_recent_images: Optional[int] = None) ``` -### Composed Models +### Attributes -Use `+` to combine models for different capabilities: +| Name | Type | Description | +|------|------|-------------| +| `only_n_most_recent_images` | `Any` | | + +### Methods + +#### ImageRetentionCallback.on_llm_start ```python -# UI-TARS for grounding, Claude for planning -agent = ComputerAgent( - model="ollama/ui-tars:latest+anthropic/claude-sonnet-4-5-20250929", - tools=[computer] -) - -# Qwen for grounding, GPT-4 for planning -agent = ComputerAgent( - model="ollama/qwen2.5-vl:latest+openai/gpt-4o", - tools=[computer] -) +async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]] ``` -### Environment Variables +Apply image retention policy to messages before sending to agent loop. -| Variable | Description | -| ------------------- | -------------------------------------------- | -| `ANTHROPIC_API_KEY` | API key for Anthropic models | -| `OPENAI_API_KEY` | API key for OpenAI models | -| `GOOGLE_API_KEY` | API key for Google models | -| `OLLAMA_HOST` | Host for Ollama (default: `localhost:11434`) | +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `messages` | `Any` | List of message dictionaries | + +**Returns:** List of messages with image retention policy applied --- -## Chat History +## LoggingCallback -Pass previous messages to maintain context: +*Inherits from: AsyncCallbackHandler* + +Callback handler that logs agent lifecycle events with configurable verbosity. 
+ +Logging levels: +- DEBUG: All events including API calls, message preprocessing, and detailed outputs +- INFO: Major lifecycle events (start/end, messages, outputs) +- WARNING: Only warnings and errors +- ERROR: Only errors + +### Constructor ```python -from agent.types import Message - -history = [ - Message(role="user", content="Open the browser"), - Message(role="assistant", content="I've opened Firefox."), -] - -async for result in agent.run( - "Now search for Python tutorials", - chat_history=history -): - print(result.text) +LoggingCallback(self, logger: Optional[logging.Logger] = None, level: int = logging.INFO) ``` -### Message Format +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `logger` | `Any` | | +| `level` | `Any` | | + +### Methods + +#### LoggingCallback.on_run_start ```python -from agent.types import Message +async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None +``` -# User message -user_msg = Message(role="user", content="Click the button") +Called before the run starts. -# Assistant message -assistant_msg = Message(role="assistant", content="I clicked the submit button.") +#### LoggingCallback.on_usage -# System message (for instructions) -system_msg = Message(role="system", content="Be concise in responses.") +```python +async def on_usage(self, usage: Dict[str, Any]) -> None +``` + +Called when usage information is received. + +#### LoggingCallback.on_run_end + +```python +async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None +``` + +Called after the run ends. + +#### LoggingCallback.on_llm_start + +```python +async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]] +``` + +Called before LLM processing starts. 
+ +#### LoggingCallback.on_llm_end + +```python +async def on_llm_end(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]] +``` + +Called after LLM processing ends. + +#### LoggingCallback.on_computer_call_start + +```python +async def on_computer_call_start(self, item: Dict[str, Any]) -> None +``` + +Called when a computer call starts. + +#### LoggingCallback.on_computer_call_end + +```python +async def on_computer_call_end(self, item: Dict[str, Any], result: Any) -> None +``` + +Called when a computer call ends. + +#### LoggingCallback.on_function_call_start + +```python +async def on_function_call_start(self, item: Dict[str, Any]) -> None +``` + +Called when a function call starts. + +#### LoggingCallback.on_function_call_end + +```python +async def on_function_call_end(self, item: Dict[str, Any], result: Any) -> None +``` + +Called when a function call ends. + +#### LoggingCallback.on_text + +```python +async def on_text(self, item: Dict[str, Any]) -> None +``` + +Called when a text message is encountered. + +#### LoggingCallback.on_api_start + +```python +async def on_api_start(self, kwargs: Dict[str, Any]) -> None +``` + +Called when an API call is about to start. + +#### LoggingCallback.on_api_end + +```python +async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None +``` + +Called when an API call has completed. + +#### LoggingCallback.on_screenshot + +```python +async def on_screenshot(self, item: Union[str, bytes], name: str = 'screenshot') -> None +``` + +Called when a screenshot is taken. + +--- + +## OperatorNormalizerCallback + +*Inherits from: AsyncCallbackHandler* + +Normalizes common computer call hallucinations / errors in computer call syntax. 
+ +### Methods + +#### OperatorNormalizerCallback.on_llm_end + +```python +async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]] ``` --- -## Error Handling +## OtelCallback -Handle errors during agent execution: +*Inherits from: AsyncCallbackHandler* + +OpenTelemetry callback handler for instrumentation. + +Tracks: +- Agent session lifecycle (start/end) +- Agent run lifecycle (start/end with duration) +- Individual steps (with duration) +- Computer actions (with duration) +- Token usage +- Errors + +### Constructor ```python -from agent import ComputerAgent -from agent.errors import AgentError, BudgetExceededError - -try: - async for result in agent.run("Complete the form"): - if result.error: - print(f"Action failed: {result.error}") - print(result.text) -except BudgetExceededError: - print("Budget limit reached") -except AgentError as e: - print(f"Agent error: {e}") +OtelCallback(self, agent: Any) ``` -### Error Types +### Attributes -| Error | Description | -| --------------------- | --------------------------- | -| `AgentError` | Base class for agent errors | -| `BudgetExceededError` | Cost limit exceeded | -| `ModelError` | Model API error | -| `ToolError` | Tool execution error | +| Name | Type | Description | +|------|------|-------------| +| `agent` | `Any` | | +| `model` | `Any` | | +| `run_start_time` | `Optional[float]` | | +| `step_start_time` | `Optional[float]` | | +| `step_count` | `Any` | | + +### Methods + +#### OtelCallback.on_run_start + +```python +async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None +``` + +Called at the start of an agent run loop. + +#### OtelCallback.on_run_end + +```python +async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None +``` + +Called at the end of an agent run loop. 
+ +#### OtelCallback.on_responses + +```python +async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None +``` + +Called when responses are received (each step). + +#### OtelCallback.on_usage + +```python +async def on_usage(self, usage: Dict[str, Any]) -> None +``` + +Called when usage information is received. + +#### OtelCallback.on_computer_call_start + +```python +async def on_computer_call_start(self, item: Dict[str, Any]) -> None +``` + +Called when a computer call is about to start. + +#### OtelCallback.on_computer_call_end + +```python +async def on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None +``` + +Called when a computer call has completed. + +#### OtelCallback.on_api_start + +```python +async def on_api_start(self, kwargs: Dict[str, Any]) -> None +``` + +Called when an LLM API call is about to start. + +#### OtelCallback.on_api_end + +```python +async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None +``` + +Called when an LLM API call has completed. --- -## Trajectory Configuration +## OtelErrorCallback -Configure trajectory saving with a dict: +*Inherits from: AsyncCallbackHandler* + +Callback that captures errors and sends them to Sentry/OTEL. + +Should be added early in the callback chain to catch all errors. 
+ +### Constructor ```python -agent = ComputerAgent( - model="anthropic/claude-sonnet-4-5-20250929", - tools=[computer], - trajectory_dir={ - "trajectory_dir": "trajectories", - "reset_on_run": False, # Continue same trajectory across runs - "screenshot_dir": "screenshots" # Save screenshots separately - } -) +OtelErrorCallback(self, agent: Any) ``` -| Option | Default | Description | -| ---------------- | -------- | ----------------------------- | -| `trajectory_dir` | Required | Base directory | -| `reset_on_run` | `True` | Create new ID per run | -| `screenshot_dir` | `None` | Separate screenshot directory | +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `agent` | `Any` | | +| `model` | `Any` | | + +### Methods + +#### OtelErrorCallback.on_error + +```python +async def on_error(self, error: Exception, context: Dict[str, Any]) -> None +``` + +Called when an error occurs during agent execution. + +--- + +## PromptInstructionsCallback + +*Inherits from: AsyncCallbackHandler* + +Prepend a user instructions message to the message list. + +This is a minimal, non-invasive way to guide the agent's behavior without +modifying agent loops or tools. It works with any provider/loop since it +only alters the messages array before sending to the model. + +### Constructor + +```python +PromptInstructionsCallback(self, instructions: Optional[str]) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `instructions` | `Any` | | + +### Methods + +#### PromptInstructionsCallback.on_llm_start + +```python +async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]] +``` + +--- + +## TelemetryCallback + +*Inherits from: AsyncCallbackHandler* + +Telemetry callback handler for Computer-Use Agent (cua-agent) + +Tracks agent usage, performance metrics, and optionally trajectory data. 
+ +### Constructor + +```python +TelemetryCallback(self, agent, log_trajectory: bool = False) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `agent` | `Any` | | +| `log_trajectory` | `Any` | | +| `session_id` | `Any` | | +| `run_id` | `Any` | | +| `run_start_time` | `Any` | | +| `step_count` | `Any` | | +| `step_start_time` | `Any` | | +| `total_usage` | `Any` | | + +### Methods + +#### TelemetryCallback.on_run_start + +```python +async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None +``` + +Called at the start of an agent run loop. + +#### TelemetryCallback.on_run_end + +```python +async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None +``` + +Called at the end of an agent run loop. + +#### TelemetryCallback.on_usage + +```python +async def on_usage(self, usage: Dict[str, Any]) -> None +``` + +Called when usage information is received. + +#### TelemetryCallback.on_responses + +```python +async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None +``` + +Called when responses are received. + +--- + +## TrajectorySaverCallback + +*Inherits from: AsyncCallbackHandler* + +Callback handler that saves agent trajectories to disk. + +Saves each run as a separate trajectory with unique ID, and each turn +within the trajectory gets its own folder with screenshots and responses. 
+ +### Constructor + +```python +TrajectorySaverCallback(self, trajectory_dir: str, reset_on_run: bool = True, screenshot_dir: Optional[str] = None) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `trajectory_dir` | `Any` | | +| `trajectory_id` | `Optional[str]` | | +| `current_turn` | `int` | | +| `current_artifact` | `int` | | +| `model` | `Optional[str]` | | +| `total_usage` | `Dict[str, Any]` | | +| `reset_on_run` | `Any` | | +| `screenshot_dir` | `Optional[Path]` | | + +### Methods + +#### TrajectorySaverCallback.on_run_start + +```python +async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None +``` + +Initialize trajectory tracking for a new run. + +#### TrajectorySaverCallback.on_run_end + +```python +async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None +``` + +Finalize run tracking by updating metadata with completion status, usage, and new items. + +#### TrajectorySaverCallback.on_api_start + +```python +async def on_api_start(self, kwargs: Dict[str, Any]) -> None +``` + +#### TrajectorySaverCallback.on_api_end + +```python +async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None +``` + +Save API call result. + +#### TrajectorySaverCallback.on_screenshot + +```python +async def on_screenshot(self, screenshot: Union[str, bytes], name: str = 'screenshot') -> None +``` + +Save a screenshot. + +#### TrajectorySaverCallback.on_usage + +```python +async def on_usage(self, usage: Dict[str, Any]) -> None +``` + +Called when usage information is received. + +#### TrajectorySaverCallback.on_responses + +```python +async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None +``` + +Save responses to the current turn directory and update usage statistics. 
+ +#### TrajectorySaverCallback.on_computer_call_end + +```python +async def on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None +``` + +Called when a computer call has completed. +Saves screenshots and computer call output. diff --git a/docs/content/docs/cua/reference/agent-sdk/meta.json b/docs/content/docs/cua/reference/agent-sdk/meta.json index 096a077d..0d3799c9 100644 --- a/docs/content/docs/cua/reference/agent-sdk/meta.json +++ b/docs/content/docs/cua/reference/agent-sdk/meta.json @@ -2,5 +2,5 @@ "title": "Agent SDK", "description": "Python API for building computer-use agents", "icon": "Bot", - "pages": ["index"] + "pages": ["changelog"] } diff --git a/docs/content/docs/cua/reference/agent-sdk/v0.3/api.mdx b/docs/content/docs/cua/reference/agent-sdk/v0.3/api.mdx new file mode 100644 index 00000000..9dcc6f85 --- /dev/null +++ b/docs/content/docs/cua/reference/agent-sdk/v0.3/api.mdx @@ -0,0 +1,70 @@ +--- +title: Agent SDK v0.3 API Reference +description: API reference for Agent SDK version 0.3 +--- + +{/* +AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY +Generated by: npx tsx scripts/docs-generators/generate-versioned-docs.ts +Source tag: agent-v0.3.2 +Version: 0.3.2 +*/} + +import { Callout } from 'fumadocs-ui/components/callout'; + + + This is documentation for **v0.3**. [View latest version](/cua/reference/agent-sdk). + + +
+ + v0.3.2 + + pip install cua-agent==0.3.2 +
+ +CUA (Computer Use) Agent for AI-driven computer interaction. + +## Classes + +| Class | Description | +| --------------- | -------------------------------------------------------------------------------------- | +| `LLMProvider` | Supported LLM providers. | +| `LLM` | Configuration for LLM model and provider. | +| `ComputerAgent` | A computer agent that can perform automated tasks using natural language instructions. | + +## LLMProvider + +Supported LLM providers. + +## LLM + +Configuration for LLM model and provider. + +## ComputerAgent + +A computer agent that can perform automated tasks using natural language instructions. + +### Methods + +#### ComputerAgent.initialize + +```python +async def initialize(self) -> None +``` + +Initialize the agent and its components. + +#### ComputerAgent.run + +```python +async def run(self, task: str) -> AsyncGenerator[AgentResponse, None] +``` + +Run a task using the computer agent. + +Args: +task: Task description + +Yields: +Agent response format diff --git a/docs/content/docs/cua/reference/agent-sdk/v0.3/meta.json b/docs/content/docs/cua/reference/agent-sdk/v0.3/meta.json new file mode 100644 index 00000000..4f4b3dc2 --- /dev/null +++ b/docs/content/docs/cua/reference/agent-sdk/v0.3/meta.json @@ -0,0 +1,5 @@ +{ + "title": "v0.3", + "description": "Agent SDK v0.3 API Reference", + "pages": ["api"] +} diff --git a/docs/content/docs/cua/reference/agent-sdk/v0.4/api.mdx b/docs/content/docs/cua/reference/agent-sdk/v0.4/api.mdx new file mode 100644 index 00000000..b1fbe0fe --- /dev/null +++ b/docs/content/docs/cua/reference/agent-sdk/v0.4/api.mdx @@ -0,0 +1,83 @@ +--- +title: Agent SDK v0.4 API Reference +description: API reference for Agent SDK version 0.4 +--- + +{/* +AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY +Generated by: npx tsx scripts/docs-generators/generate-versioned-docs.ts +Source tag: agent-v0.4.53 +Version: 0.4.53 +*/} + +import { Callout } from 'fumadocs-ui/components/callout'; + + + This is documentation for 
**v0.4**. [View latest version](/cua/reference/agent-sdk). + + +
+ + v0.4.53 + + pip install cua-agent==0.4.53 +
+ +agent - Decorator-based Computer Use Agent with liteLLM integration + +## Classes + +| Class | Description | +| --------------- | ---------------------------------------------------------------------- | +| `ComputerAgent` | Main agent class that automatically selects the appropriate agent loop | + +## ComputerAgent + +Main agent class that automatically selects the appropriate agent loop +based on the model and executes tool calls. + +### Methods + +#### ComputerAgent.run + +```python +async def run(self, messages: Messages, stream: bool = False, api_key: Optional[str] = None, api_base: Optional[str] = None, additional_generation_kwargs = {}) -> AsyncGenerator[Dict[str, Any], None] +``` + +Run the agent with the given messages using Computer protocol handler pattern. + +Args: +messages: List of message dictionaries +stream: Whether to stream the response +api_key: Optional API key override for the model provider +api_base: Optional API base URL override for the model provider +\*\*additional_generation_kwargs: Additional arguments passed to the model provider + +Returns: +AsyncGenerator that yields response chunks + +#### ComputerAgent.predict_click + +```python +async def predict_click(self, instruction: str, image_b64: Optional[str] = None) -> Optional[Tuple[int, int]] +``` + +Predict click coordinates based on image and instruction. + +Args: +instruction: Instruction for where to click +image_b64: Base64 encoded image (optional, will take screenshot if not provided) + +Returns: +None or tuple with (x, y) coordinates + +#### ComputerAgent.get_capabilities + +```python +def get_capabilities(self) -> List[AgentCapability] +``` + +Get list of capabilities supported by the current agent config. 
+ +Returns: +List of capability strings (e.g., ["step", "click"]) diff --git a/docs/content/docs/cua/reference/agent-sdk/v0.4/meta.json b/docs/content/docs/cua/reference/agent-sdk/v0.4/meta.json new file mode 100644 index 00000000..c1fcce1a --- /dev/null +++ b/docs/content/docs/cua/reference/agent-sdk/v0.4/meta.json @@ -0,0 +1,5 @@ +{ + "title": "v0.4", + "description": "Agent SDK v0.4 API Reference", + "pages": ["api"] +} diff --git a/docs/content/docs/cua/reference/agent-sdk/v0.5/api.mdx b/docs/content/docs/cua/reference/agent-sdk/v0.5/api.mdx new file mode 100644 index 00000000..80eed1cf --- /dev/null +++ b/docs/content/docs/cua/reference/agent-sdk/v0.5/api.mdx @@ -0,0 +1,83 @@ +--- +title: Agent SDK v0.5 API Reference +description: API reference for Agent SDK version 0.5 +--- + +{/* +AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY +Generated by: npx tsx scripts/docs-generators/generate-versioned-docs.ts +Source tag: agent-v0.5.2 +Version: 0.5.2 +*/} + +import { Callout } from 'fumadocs-ui/components/callout'; + + + This is documentation for **v0.5**. [View latest version](/cua/reference/agent-sdk). + + +
+ + v0.5.2 + + pip install cua-agent==0.5.2 +
+ +agent - Decorator-based Computer Use Agent with liteLLM integration + +## Classes + +| Class | Description | +| --------------- | ---------------------------------------------------------------------- | +| `ComputerAgent` | Main agent class that automatically selects the appropriate agent loop | + +## ComputerAgent + +Main agent class that automatically selects the appropriate agent loop +based on the model and executes tool calls. + +### Methods + +#### ComputerAgent.run + +```python +async def run(self, messages: Messages, stream: bool = False, api_key: Optional[str] = None, api_base: Optional[str] = None, additional_generation_kwargs = {}) -> AsyncGenerator[Dict[str, Any], None] +``` + +Run the agent with the given messages using Computer protocol handler pattern. + +Args: +messages: List of message dictionaries +stream: Whether to stream the response +api_key: Optional API key override for the model provider +api_base: Optional API base URL override for the model provider +\*\*additional_generation_kwargs: Additional arguments passed to the model provider + +Returns: +AsyncGenerator that yields response chunks + +#### ComputerAgent.predict_click + +```python +async def predict_click(self, instruction: str, image_b64: Optional[str] = None) -> Optional[Tuple[int, int]] +``` + +Predict click coordinates based on image and instruction. + +Args: +instruction: Instruction for where to click +image_b64: Base64 encoded image (optional, will take screenshot if not provided) + +Returns: +None or tuple with (x, y) coordinates + +#### ComputerAgent.get_capabilities + +```python +def get_capabilities(self) -> List[AgentCapability] +``` + +Get list of capabilities supported by the current agent config. 
+ +Returns: +List of capability strings (e.g., ["step", "click"]) diff --git a/docs/content/docs/cua/reference/agent-sdk/v0.5/meta.json b/docs/content/docs/cua/reference/agent-sdk/v0.5/meta.json new file mode 100644 index 00000000..0332e1eb --- /dev/null +++ b/docs/content/docs/cua/reference/agent-sdk/v0.5/meta.json @@ -0,0 +1,5 @@ +{ + "title": "v0.5", + "description": "Agent SDK v0.5 API Reference", + "pages": ["api"] +} diff --git a/docs/content/docs/cua/reference/agent-sdk/v0.6/api.mdx b/docs/content/docs/cua/reference/agent-sdk/v0.6/api.mdx new file mode 100644 index 00000000..d28c71bf --- /dev/null +++ b/docs/content/docs/cua/reference/agent-sdk/v0.6/api.mdx @@ -0,0 +1,99 @@ +--- +title: Agent SDK v0.6 API Reference +description: API reference for Agent SDK version 0.6 +--- + +{/* +AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY +Generated by: npx tsx scripts/docs-generators/generate-versioned-docs.ts +Source tag: agent-v0.6.2 +Version: 0.6.2 +*/} + +import { Callout } from 'fumadocs-ui/components/callout'; + + + This is documentation for **v0.6**. [View latest version](/cua/reference/agent-sdk). + + +
+ + v0.6.2 + + pip install cua-agent==0.6.2 +
+ +agent - Decorator-based Computer Use Agent with liteLLM integration + +## Classes + +| Class | Description | +| --------------- | ---------------------------------------------------------------------- | +| `ComputerAgent` | Main agent class that automatically selects the appropriate agent loop | + +## ComputerAgent + +Main agent class that automatically selects the appropriate agent loop +based on the model and executes tool calls. + +### Methods + +#### ComputerAgent.run + +```python +async def run(self, messages: Messages, stream: bool = False, api_key: Optional[str] = None, api_base: Optional[str] = None, additional_generation_kwargs = {}) -> AsyncGenerator[Dict[str, Any], None] +``` + +Run the agent with the given messages using Computer protocol handler pattern. + +Args: +messages: List of message dictionaries +stream: Whether to stream the response +api_key: Optional API key override for the model provider +api_base: Optional API base URL override for the model provider +\*\*additional_generation_kwargs: Additional arguments passed to the model provider + +Returns: +AsyncGenerator that yields response chunks + +#### ComputerAgent.predict_click + +```python +async def predict_click(self, instruction: str, image_b64: Optional[str] = None) -> Optional[Tuple[int, int]] +``` + +Predict click coordinates based on image and instruction. + +Args: +instruction: Instruction for where to click +image_b64: Base64 encoded image (optional, will take screenshot if not provided) + +Returns: +None or tuple with (x, y) coordinates + +#### ComputerAgent.get_capabilities + +```python +def get_capabilities(self) -> List[AgentCapability] +``` + +Get list of capabilities supported by the current agent config. + +Returns: +List of capability strings (e.g., ["step", "click"]) + +#### ComputerAgent.open + +```python +def open(self, port: Optional[int] = None) +``` + +Start the playground server and open it in the browser. 
+ +This method starts a local HTTP server that exposes the /responses endpoint +and automatically opens the CUA playground interface in the default browser. + +Args: +port: Port to run the server on. If None, finds an available port automatically. + +Example: >>> agent = ComputerAgent(model="claude-sonnet-4") >>> agent.open() # Starts server and opens browser diff --git a/docs/content/docs/cua/reference/agent-sdk/v0.6/meta.json b/docs/content/docs/cua/reference/agent-sdk/v0.6/meta.json new file mode 100644 index 00000000..d141fa6c --- /dev/null +++ b/docs/content/docs/cua/reference/agent-sdk/v0.6/meta.json @@ -0,0 +1,5 @@ +{ + "title": "v0.6", + "description": "Agent SDK v0.6 API Reference", + "pages": ["api"] +} diff --git a/docs/content/docs/cua/reference/agent-sdk/v0.7/api.mdx b/docs/content/docs/cua/reference/agent-sdk/v0.7/api.mdx new file mode 100644 index 00000000..d5ac95e6 --- /dev/null +++ b/docs/content/docs/cua/reference/agent-sdk/v0.7/api.mdx @@ -0,0 +1,99 @@ +--- +title: Agent SDK v0.7 API Reference +description: API reference for Agent SDK version 0.7 +--- + +{/* +AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY +Generated by: npx tsx scripts/docs-generators/generate-versioned-docs.ts +Source tag: agent-v0.7.23 +Version: 0.7.23 +*/} + +import { Callout } from 'fumadocs-ui/components/callout'; + + + This is documentation for **v0.7**. [View latest version](/cua/reference/agent-sdk). + + +
+ + v0.7.23 + + pip install cua-agent==0.7.23 +
+ +agent - Decorator-based Computer Use Agent with liteLLM integration + +## Classes + +| Class | Description | +| --------------- | ---------------------------------------------------------------------- | +| `ComputerAgent` | Main agent class that automatically selects the appropriate agent loop | + +## ComputerAgent + +Main agent class that automatically selects the appropriate agent loop +based on the model and executes tool calls. + +### Methods + +#### ComputerAgent.run + +```python +async def run(self, messages: Messages, stream: bool = False, api_key: Optional[str] = None, api_base: Optional[str] = None, additional_generation_kwargs = {}) -> AsyncGenerator[Dict[str, Any], None] +``` + +Run the agent with the given messages using Computer protocol handler pattern. + +Args: +messages: List of message dictionaries +stream: Whether to stream the response +api_key: Optional API key override for the model provider +api_base: Optional API base URL override for the model provider +\*\*additional_generation_kwargs: Additional arguments passed to the model provider + +Returns: +AsyncGenerator that yields response chunks + +#### ComputerAgent.predict_click + +```python +async def predict_click(self, instruction: str, image_b64: Optional[str] = None) -> Optional[Tuple[int, int]] +``` + +Predict click coordinates based on image and instruction. + +Args: +instruction: Instruction for where to click +image_b64: Base64 encoded image (optional, will take screenshot if not provided) + +Returns: +None or tuple with (x, y) coordinates + +#### ComputerAgent.get_capabilities + +```python +def get_capabilities(self) -> List[AgentCapability] +``` + +Get list of capabilities supported by the current agent config. + +Returns: +List of capability strings (e.g., ["step", "click"]) + +#### ComputerAgent.open + +```python +def open(self, port: Optional[int] = None) +``` + +Start the playground server and open it in the browser. 
+ +This method starts a local HTTP server that exposes the /responses endpoint +and automatically opens the Cua playground interface in the default browser. + +Args: +port: Port to run the server on. If None, finds an available port automatically. + +Example: >>> agent = ComputerAgent(model="claude-sonnet-4") >>> agent.open() # Starts server and opens browser diff --git a/docs/content/docs/cua/reference/agent-sdk/v0.7/meta.json b/docs/content/docs/cua/reference/agent-sdk/v0.7/meta.json new file mode 100644 index 00000000..832d2e52 --- /dev/null +++ b/docs/content/docs/cua/reference/agent-sdk/v0.7/meta.json @@ -0,0 +1,5 @@ +{ + "title": "v0.7", + "description": "Agent SDK v0.7 API Reference", + "pages": ["api"] +} diff --git a/docs/content/docs/cua/reference/cli/changelog.mdx b/docs/content/docs/cua/reference/cli/changelog.mdx new file mode 100644 index 00000000..9a54d7db --- /dev/null +++ b/docs/content/docs/cua/reference/cli/changelog.mdx @@ -0,0 +1,28 @@ +--- +title: Changelog +description: Release history for Cua CLI +--- + +# CLI Changelog + +All notable changes to the Cua CLI are documented here. + +## 0.1.x + +### v0.1.0 (Unreleased) + +Initial release of the unified Cua CLI. 
+ +**Features:** + +- Authentication management (`cua auth login/logout/env`) +- Sandbox lifecycle management (`cua sandbox create/list/start/stop/delete`) +- Cloud image management (`cua image list/push/pull/delete`) +- Skills recording and playback (`cua skills record/list/read/replay`) +- MCP server for AI assistants (`cua serve-mcp`) + +**Installation:** + +```bash +pip install cua-cli +``` diff --git a/docs/content/docs/cua/reference/cli/commands.mdx b/docs/content/docs/cua/reference/cli/commands.mdx new file mode 100644 index 00000000..40eb9cd2 --- /dev/null +++ b/docs/content/docs/cua/reference/cli/commands.mdx @@ -0,0 +1,338 @@ +--- +title: Command Reference +description: Complete reference for all Cua CLI commands +--- + +import { Callout } from 'fumadocs-ui/components/callout'; +import { VersionBadge } from '@/components/version-selector'; + + + +## Overview + +The Cua CLI provides commands for authentication, sandbox management, image management, skills recording and playback, and running an MCP server. + +### Command Groups + +| Group | Alias | Description | +| --------------- | --------- | ---------------------------------- | +| `cua auth` | | Authentication management | +| `cua sandbox` | `cua sb` | Sandbox lifecycle management | +| `cua image` | `cua img` | Cloud image management | +| `cua skills` | | Skills recording and playback | +| `cua serve-mcp` | | Start MCP server for AI assistants | + +## Authentication Commands + +### `cua auth login` + +Authenticate with your Cua account. + +```bash +# Browser-based login +cua auth login + +# Direct API key login +cua auth login --api-key sk_cua-api01_... +``` + +### `cua auth env` + +Export your API key to a `.env` file in the current directory. + +```bash +cua auth env +``` + +### `cua auth logout` + +Remove the stored API key. + +```bash +cua auth logout +``` + +## Sandbox Commands + +All sandbox commands support the `cua sb` alias (e.g., `cua sb list`). + +### `cua sandbox list` + +List all sandboxes. 
+ +```bash +cua sandbox list + +# With JSON output +cua sandbox list --json + +# Show passwords +cua sandbox list --show-passwords +``` + +**Aliases:** `cua sb ls`, `cua sb ps` + +### `cua sandbox create` + +Create a new sandbox. + +```bash +cua sandbox create --os <os> --size <size> --region <region> +``` + +**Required options:** + +| Option | Values | +| ---------- | ---------------------------------------------------------- | +| `--os` | `linux`, `windows`, `macos` | +| `--size` | `small`, `medium`, `large` | +| `--region` | `north-america`, `europe`, `asia-pacific`, `south-america` | + +**Example:** + +```bash +cua sandbox create --os linux --size small --region north-america +``` + +### `cua sandbox get` + +Get details for a specific sandbox. + +```bash +cua sandbox get <name> + +# With options +cua sandbox get <name> --json +cua sandbox get <name> --show-passwords +cua sandbox get <name> --show-vnc-url +``` + +### `cua sandbox start` + +Start a stopped sandbox. + +```bash +cua sandbox start <name> +``` + +### `cua sandbox stop` + +Stop a running sandbox. + +```bash +cua sandbox stop <name> +``` + +### `cua sandbox restart` + +Restart a sandbox. + +```bash +cua sandbox restart <name> +``` + +### `cua sandbox suspend` + +Suspend a sandbox (preserves memory state). + +```bash +cua sandbox suspend <name> +``` + +### `cua sandbox delete` + +Delete a sandbox permanently. + +```bash +cua sandbox delete <name> +``` + +<Callout type="warn"> + This action is irreversible. All data on the sandbox will be permanently lost. +</Callout> + +### `cua sandbox vnc` + +Open the sandbox desktop in your browser. + +```bash +cua sandbox vnc <name> +``` + +**Alias:** `cua sb open <name>` + +## Image Commands + +All image commands support the `cua img` alias. + +### `cua image list` + +List cloud images. + +```bash +cua image list + +# JSON output +cua image list --json + +# List local images instead +cua image list --local +``` + +**Alias:** `cua img ls` + +### `cua image push` + +Upload a local image to the cloud. 
+ +```bash +cua image push <name> --file <path> --tag <tag> +``` + +### `cua image pull` + +Download an image from the cloud. + +```bash +cua image pull <name> --tag <tag> --output <path> +``` + +### `cua image delete` + +Delete a cloud image. + +```bash +cua image delete <name> --tag <tag> + +# Skip confirmation +cua image delete <name> --tag <tag> --force +``` + +## Skills Commands + +Skills are recorded demonstrations that guide agent behavior. + +### `cua skills list` + +List all saved skills. + +```bash +cua skills list + +# JSON output +cua skills list --json +``` + +### `cua skills record` + +Record a new skill from a sandbox. + +```bash +cua skills record --sandbox <name> + +# With options +cua skills record --sandbox <name> --name "my-skill" --description "Does something" +cua skills record --sandbox <name> --provider anthropic --model claude-sonnet-4-5-20250929 +``` + +**Options:** + +| Option | Description | +| --------------- | --------------------------------------------------- | +| `--sandbox` | Sandbox to record from | +| `--name` | Skill name (skips prompt) | +| `--description` | Skill description (skips prompt) | +| `--provider` | LLM provider for captioning (`anthropic`, `openai`) | +| `--model` | Model to use for captioning | + +### `cua skills read` + +Display a skill's instructions. + +```bash +cua skills read <name> + +# JSON format with trajectory data +cua skills read <name> --format json +``` + +### `cua skills replay` + +Open the recorded video for a skill. + +```bash +cua skills replay <name> +``` + +### `cua skills delete` + +Delete a skill. + +```bash +cua skills delete <name> +``` + +### `cua skills clean` + +Delete all skills (with confirmation). + +```bash +cua skills clean +``` + +## MCP Server + +Run the CLI as an MCP (Model Context Protocol) server for AI assistants like Claude. + +### `cua serve-mcp` + +Start the MCP server. 
+ +```bash +cua serve-mcp + +# With specific sandbox +cua serve-mcp --sandbox <name> + +# With permission restrictions +cua serve-mcp --permissions sandbox:readonly +``` + +**Options:** + +| Option | Description | +| --------------- | ------------------------------------- | +| `--sandbox` | Default sandbox for computer commands | +| `--permissions` | Permission groups (see below) | + +**Permission groups:** + +| Group | Description | +| ------------------- | ---------------------------- | +| `all` | All permissions (default) | +| `sandbox:all` | Full sandbox management | +| `sandbox:readonly` | List and view sandboxes only | +| `computer:all` | Full computer control | +| `computer:readonly` | Screenshots only | +| `skills:all` | Full skills management | +| `skills:readonly` | List and read skills only | + +**Example with multiple permissions:** + +```bash +cua serve-mcp --permissions "sandbox:all,computer:readonly,skills:readonly" +``` + +## Environment Variables + +| Variable | Description | +| --------------------- | -------------------------- | +| `CUA_API_KEY` | API key for authentication | +| `CUA_SANDBOX` | Default sandbox name | +| `CUA_MCP_PERMISSIONS` | Default MCP permissions | + +## Next Steps + +- [Quickstart Guide](/cua/guide/get-started/quickstart) +- [Computer SDK Reference](/cua/reference/computer-sdk) +- [Agent SDK Reference](/cua/reference/agent-sdk) diff --git a/docs/content/docs/cua/reference/cli/index.mdx b/docs/content/docs/cua/reference/cli/index.mdx new file mode 100644 index 00000000..13450520 --- /dev/null +++ b/docs/content/docs/cua/reference/cli/index.mdx @@ -0,0 +1,249 @@ +--- +title: Cua CLI API Reference +description: Python API reference for the Cua command-line interface +--- + +{/* + AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY + Generated by: npx tsx scripts/docs-generators/python-sdk.ts + Source: libs/python/cua-cli/cua_cli + Version: 0.1.0 +*/} + +import { Callout } from 'fumadocs-ui/components/callout'; +import { Tabs, Tab } from 
'fumadocs-ui/components/tabs'; +import { VersionHeader } from '@/components/version-selector'; + + + +CUA CLI - Unified command-line interface for Computer-Use Agents. + +--- + +## main + +Main entry point for CUA CLI. + +### print_error + +```python +def print_error(message: str) -> None +``` + +Print an error message to stderr. + +### create_parser + +```python +def create_parser() -> argparse.ArgumentParser +``` + +Create the main argument parser with all subcommands. + +### main + +```python +def main() -> int +``` + +Main entry point for the CLI. + +--- + +## auth + +Authentication module for CUA CLI. + +--- + +## CredentialStore + +SQLite-based credential store with WAL mode for concurrent access. + +### Constructor + +```python +CredentialStore(self, db_path: Path | None = None) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `db_path` | `Any` | | + +### Methods + +#### CredentialStore.get + +```python +def get(self, key: str) -> Optional[str] +``` + +Get a value from the store. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `key` | `Any` | The key to look up | + +**Returns:** The value, or None if not found + +#### CredentialStore.set + +```python +def set(self, key: str, value: str) -> None +``` + +Set a value in the store. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `key` | `Any` | The key to set | +| `value` | `Any` | The value to store | + +#### CredentialStore.delete + +```python +def delete(self, key: str) -> bool +``` + +Delete a value from the store. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `key` | `Any` | The key to delete | + +**Returns:** True if the key was deleted, False if it didn't exist + +#### CredentialStore.clear + +```python +def clear(self) -> None +``` + +Clear all stored credentials. 
+ +### clear_credentials + +```python +def clear_credentials() -> None +``` + +Clear all stored credentials. + +### get_api_key + +```python +def get_api_key() -> Optional[str] +``` + +Get the stored API key. + +First checks CUA_API_KEY environment variable, then falls back to stored credentials. + +**Returns:** The API key, or None if not found + +### save_api_key + +```python +def save_api_key(api_key: str) -> None +``` + +Save an API key to the credential store. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `api_key` | `Any` | The API key to save | + +--- + +## utils + +Utility modules for CUA CLI. + +### run_async + +```python +def run_async(coro: Coroutine[Any, Any, T]) -> T +``` + +Run an async coroutine synchronously. + +This is the standard pattern for CLI commands that need to call async code. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `coro` | `Any` | The coroutine to run | + +**Returns:** The result of the coroutine + +### print_error + +```python +def print_error(message: str) -> None +``` + +Print an error message to stderr. + +### print_info + +```python +def print_info(message: str) -> None +``` + +Print an info message. + +### print_json + +```python +def print_json(data: Any) -> None +``` + +Print data as formatted JSON. + +### print_success + +```python +def print_success(message: str) -> None +``` + +Print a success message. + +### print_table + +```python +def print_table(data: list[dict[str, Any]], columns: list[tuple[str, str]] | None = None, title: str | None = None) -> None +``` + +Print data as a formatted table. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `data` | `Any` | List of dictionaries to display | +| `columns` | `Any` | List of (key, header) tuples. If None, uses all keys from first item. 
| +| `title` | `Any` | Optional table title | + +### print_warning + +```python +def print_warning(message: str) -> None +``` + +Print a warning message. diff --git a/docs/content/docs/cua/reference/cli/meta.json b/docs/content/docs/cua/reference/cli/meta.json new file mode 100644 index 00000000..739205a5 --- /dev/null +++ b/docs/content/docs/cua/reference/cli/meta.json @@ -0,0 +1,6 @@ +{ + "title": "CLI", + "description": "Command-line interface for Cua", + "icon": "Terminal", + "pages": ["commands", "changelog"] +} diff --git a/docs/content/docs/cua/reference/computer-sdk/changelog.mdx b/docs/content/docs/cua/reference/computer-sdk/changelog.mdx new file mode 100644 index 00000000..7b1ce79d --- /dev/null +++ b/docs/content/docs/cua/reference/computer-sdk/changelog.mdx @@ -0,0 +1,328 @@ +--- +title: Changelog +description: Release history for Computer SDK +--- + +{/* +AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY +Generated by: npx tsx scripts/docs-generators/generate-changelog.ts +Last updated: 2026-02-04 +*/} + +# Computer SDK Changelog + +All notable changes to the Computer SDK are documented here. + +## 0.5.x + +### v0.5.12 (2026-01-26) + +- Initial release or no path-specific changes found + +### v0.5.11 (2026-01-24) + +- Bump cua-computer to v0.5.11 by @github-actions[bot] +- Setup UV as venv manager for computer ([#789](https://github.com/trycua/cua/pull/789)) by @synacktraa + +### v0.5.10 (2026-01-17) + +Maintenance release. 
+ +### v0.5.7 (2026-01-17) + +- Bump cua-computer to v0.5.7 by @github-actions[bot] +- fix(computer): prevent data loss when tracing.stop path conflicts with trace directory ([#814](https://github.com/trycua/cua/pull/814)) by @Harsh Verma +- Improve Computer API Server connection warnings and timeout errors ([#803](https://github.com/trycua/cua/pull/803)) by @Fizza Mukhtar +- feat(computer): add OpenTelemetry instrumentation ([#663](https://github.com/trycua/cua/pull/663)) by @r33drichards + +### v0.5.6 (2026-01-12) + +- Initial release or no path-specific changes found + +### v0.5.5 (2026-01-12) + +- Initial release or no path-specific changes found + +### v0.5.4 (2026-01-12) + +- Bump cua-computer to v0.5.3 by @github-actions[bot] +- Bump cua-computer to v0.5.2 by @github-actions[bot] +- refactor(docs): reorganize lume/cua docs and standardize READMEs ([#752](https://github.com/trycua/cua/pull/752)) by @Francesco Bonacci +- feat(lume,ci): add unattended VM setup and reorganize CI/CD workflows ([#729](https://github.com/trycua/cua/pull/729)) by @Francesco Bonacci +- update dead links due to docs update by @Sarina Li +- pyautogui cleanup by @Adam +- Revert "refractor docs into 6 sections" by @Adam +- refractor docs into 6 sections by @Adam + +## 0.4.x + +### v0.4.11 (2025-10-29) + +**Dependencies:** pylume: latest + +- Docs/improve readme by @jamesmurdza in https://github.com/trycua/cua/pull/512 +- Feature/version bump workflows by @r33drichards in https://github.com/trycua/cua/pull/511 +- Support `display` and `ephemeral` in Docker provider by @ddupont808 in https://github.com/trycua/cua/pull/514 +- Fix markdown formatting in docs by @jamesmurdza in https://github.com/trycua/cua/pull/523 +- Jagjeevan's Fix + Merged Main for 4o Model Fix by @sarinali in https://github.com/trycua/cua/pull/522 +- fix: Added GPT-4o compatibility for screenshot actions with text parameter by @JagjeevanAK in https://github.com/trycua/cua/pull/422 +- Remove PyLume code and examples by 
@jamesmurdza in https://github.com/trycua/cua/pull/435 +- Remove PyLume library by @jamesmurdza in https://github.com/trycua/cua/pull/527 +- Add test infrastructure with CI/CD #478 by @AceAtDev in https://github.com/trycua/cua/pull/491 +- Feature/agent loop test by @YeIIcw in https://github.com/trycua/cua/pull/528 +- Update Agent Loop Test by @YeIIcw in https://github.com/trycua/cua/pull/530 +- Fix/omniparser predict refactor by @sarinali in https://github.com/trycua/cua/pull/529 +- Fix agent test condition trigger by @YeIIcw in https://github.com/trycua/cua/pull/531 +- Fix/agent loop test by @YeIIcw in https://github.com/trycua/cua/pull/532 +- Add desktop and window management commands to computer interface by @ddupont808 in https://github.com/trycua/cua/pull/516 + +### v0.4.10 (2025-10-14) + +- Fix XFCE Password Prompts and Lock Screen Issues by @f-trycua in https://github.com/trycua/cua/pull/466 +- Add Cloud VM Management API by @ddupont808 in https://github.com/trycua/cua/pull/452 + +### v0.4.8 (2025-10-12) + +**Dependencies:** pylume: latest + +### v0.4.7 (2025-10-03) + +**Dependencies:** pylume: latest + +### v0.4.6 (2025-09-24) + +**Dependencies:** pylume: latest + +- 🔧 Improve HUD eval script: auto-discover .env and simplify usage by @YeIIcw in https://github.com/trycua/cua/pull/419 +- Fix Windows Sandbox errors by @ddupont808 in https://github.com/trycua/cua/pull/424 + +New Contributors + +- @YeIIcw made their first contribution in https://github.com/trycua/cua/pull/419 +- @JagjeevanAK made their first contribution in https://github.com/trycua/cua/pull/420 + +### v0.4.5 (2025-09-05) + +**Dependencies:** pylume: latest + +- Fix x/y scrolling by @ddupont808 in https://github.com/trycua/cua/pull/393 +- Fix invalid image error with some providers (i.e. 
WinSandbox) +- Reference documentation batch by @onel in https://github.com/trycua/cua/pull/390 + +### v0.4.4 (2025-08-19) + +**Dependencies:** pylume: latest + +- Update agent/computer SDKs to match changes in telemetry SDK by @jamesmurdza in https://github.com/trycua/cua/pull/355 +- Bugfixes - Removed slashes from container name by @ddupont808 in https://github.com/trycua/cua/pull/358 + +### v0.4.3 (2025-08-15) + +**Dependencies:** pylume: latest + +- Fixed bug where the `image` kwarg would be ignored + +### v0.4.2 (2025-08-14) + +**Dependencies:** pylume: latest + +- [Computer] Add Docker as a local VM provider by @ddupont808 in https://github.com/trycua/cua/pull/339 + +### v0.4.1 (2025-08-05) + +**Dependencies:** pylume: latest + +### v0.4.0 (2025-07-28) + +Bumped `cua-computer` to use latest `cua-core` + +## 0.3.x + +### v0.3.7 (2025-07-16) + +**Dependencies:** pylume: latest + +### v0.3.6 (2025-07-16) + +**Dependencies:** pylume: latest + +### v0.3.5 (2025-07-10) + +**Dependencies:** pylume: latest + +### v0.3.4 (2025-07-03) + +**Dependencies:** pylume: latest + +### v0.3.3 (2025-07-03) + +**Dependencies:** pylume: latest + +### v0.3.2 (2025-07-03) + +**Dependencies:** pylume: latest + +### v0.3.1 (2025-07-01) + +**Dependencies:** pylume: latest + +### v0.3.0 (2025-06-25) + +**Dependencies:** pylume: latest + +## 0.2.x + +### v0.2.13 (2025-06-24) + +**Dependencies:** pylume: latest + +### v0.2.12 (2025-06-20) + +**Dependencies:** pylume: latest + +### v0.2.11 (2025-06-18) + +**Dependencies:** pylume: latest + +### v0.2.10 (2025-06-10) + +**Dependencies:** pylume: latest + +- Removed warning when calling `start_vm` on the CloudProvider +- Add disconnect() method for Computer +- Fixed `file_exists` and `directory_exists` not being implemented in the computer-server +- Added keyboard/mouse primitives ( `mouse_up`, `mouse_down`, `key_up`, `key_down` ) +- Added file system commands to computer interface +- Added file system interface pytests +- Improved computer 
interface accessibility tree output on macOS, now includes menubar and dock items, along with preserving z-ordering +- Increased timeout for WebSocket commands due to a TimeoutError if the accessibility tree takes longer than 30 seconds to crawl + +**Changes reference:** + +```py +# ... old actions haven't changed + +# Disconnect from computer interface / cleanup websocket resources +await computer.disconnect() + +# New mouse actions +await computer.interface.mouse_down(x, y, button="left") # Press and hold a mouse button +await computer.interface.mouse_up(x, y, button="left") # Release a mouse button + +# New keyboard actions +await computer.interface.key_down("command") # Press and hold a key +await computer.interface.key_up("command") # Release a key + +# New scrolling actions +await computer.interface.scroll(x, y) # Scroll the mouse wheel + +# New file-system commands +await computer.interface.file_exists(path) # Fixed: check if file exists +await computer.interface.directory_exists(path) # Fixed: check if directory exists +await computer.interface.read_text(path) # Read file content +await computer.interface.write_text(path, content) # Write file content +await computer.interface.read_bytes(path) # Read file content as bytes +await computer.interface.write_bytes(path, content) # Write file content as bytes +await computer.interface.delete_file(path) # Delete file +await computer.interface.create_dir(path) # Create directory +await computer.interface.delete_dir(path) # Delete directory +await computer.interface.list_dir(path) # List directory contents +``` + +### v0.2.9 (2025-06-05) + +**Dependencies:** pylume: latest + +### v0.2.8 (2025-05-31) + +**Dependencies:** pylume: latest + +### v0.2.7 (2025-05-28) + +**Dependencies:** pylume: latest + +### v0.2.6 (2025-05-28) + +**Dependencies:** pylume: latest + +### v0.2.5 (2025-05-28) + +**Dependencies:** pylume: latest + +### v0.2.4 (2025-05-28) + +**Dependencies:** pylume: latest + +### v0.2.3 (2025-05-28) + 
+**Dependencies:** pylume: latest + +### v0.2.2 (2025-05-16) + +**Dependencies:** pylume: latest + +### v0.2.1 (2025-05-14) + +**Dependencies:** pylume: latest + +### v0.2.0 (2025-05-14) + +**Dependencies:** pylume: latest + +## 0.1.x + +### v0.1.29 (2025-05-12) + +**Dependencies:** pylume: latest + +### v0.1.28 (2025-05-10) + +**Dependencies:** pylume: latest + +### v0.1.27 (2025-05-10) + +**Dependencies:** pylume: latest + +### v0.1.26 (2025-05-10) + +**Dependencies:** pylume: latest + +### v0.1.25 (2025-05-07) + +**Dependencies:** pylume: latest + +### v0.1.24 (2025-04-29) + +**Dependencies:** pylume: latest + +### v0.1.23 (2025-04-22) + +**Dependencies:** pylume: latest + +### v0.1.22 (2025-04-15) + +**Dependencies:** pylume: latest + +### v0.1.21 (2025-03-30) + +**Dependencies:** pylume: latest + +### v0.1.20 (2025-03-30) + +**Dependencies:** pylume: latest + +### v0.1.19 (2025-03-30) + +**Dependencies:** pylume: latest + +### v0.1.18 (2025-03-24) + +**Dependencies:** pylume: latest + +### v0.1.7 (2025-03-19) + +**Dependencies:** pylume: latest + +### v0.1.6 (2025-03-17) + +**Dependencies:** pylume: latest diff --git a/docs/content/docs/cua/reference/computer-sdk/index.mdx b/docs/content/docs/cua/reference/computer-sdk/index.mdx index 36018199..cb1025a3 100644 --- a/docs/content/docs/cua/reference/computer-sdk/index.mdx +++ b/docs/content/docs/cua/reference/computer-sdk/index.mdx @@ -1,1208 +1,2037 @@ --- -title: Computer SDK -description: Python API reference for the Computer SDK +title: Computer SDK API Reference +description: Python API reference for controlling virtual machines and computer interfaces --- +{/* + AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY + Generated by: npx tsx scripts/docs-generators/python-sdk.ts + Source: libs/python/computer/computer + Version: 0.5.12 +*/} + +import { Callout } from 'fumadocs-ui/components/callout'; import { Tabs, Tab } from 'fumadocs-ui/components/tabs'; +import { VersionHeader } from '@/components/version-selector'; 
-The **Computer SDK** (`cua-computer`) provides the Python interface for creating and controlling sandboxed desktop environments. This reference covers the core classes, methods, and types you'll use when working with computers programmatically. + -## Installation +Cua Computer Interface for cross-platform computer control. -```bash -pip install cua-computer -``` +## Classes -## Core Classes - -### Computer - -The main class for creating and managing sandboxed desktop environments. - -```python -from computer import Computer - -computer = Computer( - os_type="linux", - provider_type="docker", - image="trycua/cua-xfce:latest" -) -await computer.run() -``` - -#### Constructor Parameters - -| Parameter | Type | Default | Description | -| -------------------- | ------------- | ------------- | ---------------------------------------------------------------------------------------------------- | -| `os_type` | `str` | Required | Operating system type: `"linux"`, `"macos"`, or `"windows"` | -| `provider_type` | `str` | Required | Provider type: `"docker"`, `"lume"`, `"cloud"`, `"qemu"`, `"windows-sandbox"`, or `"host"` | -| `image` | `str` | `None` | Container/VM image to use (provider-specific) | -| `name` | `str` | `""` | Optional name for the computer instance | -| `display` | `str \| dict` | `"1024x768"` | Display resolution (can be string like `"1920x1080"` or dict like `{"width": 1920, "height": 1080}`) | -| `memory` | `str` | `"8GB"` | Memory allocation | -| `cpu` | `str` | `"4"` | Number of CPU cores | -| `shared_directories` | `list[str]` | `None` | List of host directories to share with the computer | -| `storage` | `str` | `None` | Path to persistent storage | -| `ephemeral` | `bool` | `False` | Use ephemeral storage (data lost on stop) | -| `api_key` | `str` | `None` | API key for cloud provider (defaults to `CUA_API_KEY` env var) | -| `host` | `str` | `"localhost"` | Host address for provider connection | -| `timeout` | `int` | `100` | Connection timeout in 
seconds | -| `telemetry_enabled` | `bool` | `True` | Enable telemetry | - -#### Methods - -##### `run()` - -Start the computer and establish connection. - -```python -await computer.run() -``` - -Returns once the computer is ready to accept commands. - -##### `stop()` - -Stop the computer and release resources. - -```python -await computer.stop() -``` - -##### `restart()` - -Restart the computer. - -```python -await computer.restart() -``` - -##### `disconnect()` - -Disconnect from the computer without stopping it. - -```python -await computer.disconnect() -``` - -##### `get_ip()` - -Get the IP address of the computer. - -```python -ip = await computer.get_ip() -``` - -**Returns:** `str` - IP address - -##### `update(cpu, memory)` - -Update computer resources (cloud provider only). - -```python -await computer.update(cpu="8", memory="16GB") -``` - -##### Python Execution Methods - -##### `python_exec(func, *args, **kwargs)` - -Execute a Python function in the computer's Python environment. - -```python -def calculate(x, y): - return x + y - -result = await computer.python_exec(calculate, 5, 10) -# result = 15 -``` - -##### `python_exec_background(func, *args, requirements=None, **kwargs)` - -Execute a Python function in the background. - -```python -def long_running_task(): - import time - time.sleep(60) - return "done" - -task_id = await computer.python_exec_background(long_running_task) -``` - -**Returns:** `int` - Task ID for tracking - -##### `pip_install(requirements)` - -Install Python packages in the computer. - -```python -await computer.pip_install(["requests", "pandas==2.0.0"]) -``` - -##### Virtual Environment Methods - -##### `venv_install(venv_name, requirements)` - -Install packages in a virtual environment. - -```python -await computer.venv_install("my_env", ["requests", "pandas"]) -``` - -##### `venv_cmd(venv_name, command)` - -Run a shell command in a virtual environment. 
- -```python -result = await computer.venv_cmd("my_env", "pip list") -print(result.stdout) -``` - -**Returns:** `CommandResult` with `stdout`, `stderr`, `returncode` - -##### `venv_exec(venv_name, func, *args, **kwargs)` - -Execute a Python function in a virtual environment. - -```python -def process_data(x): - import pandas as pd - return pd.DataFrame(x).to_dict() - -result = await computer.venv_exec("my_env", process_data, [1, 2, 3]) -``` - -##### `venv_exec_background(venv_name, func, *args, requirements=None, **kwargs)` - -Execute a Python function in a virtual environment in the background. - -```python -task_id = await computer.venv_exec_background("my_env", long_task) -``` - -**Returns:** `int` - Task ID - -See [Sandboxed Python](/cua/guide/advanced/sandboxed-python) for detailed usage. - -##### Browser Automation - -##### `playwright_exec(command, params=None)` - -Execute Playwright browser automation commands. - -```python -result = await computer.playwright_exec("goto", {"url": "https://example.com"}) -``` +| Class | Description | +|-------|-------------| +| [`Computer`](#computer) | Computer is the main class for interacting with the computer. | +| [`VMProviderType`](#vmprovidertype) | Enum of supported VM provider types. | --- -### ComputerInterface +## Computer -The interface for interacting with the computer's display, keyboard, and mouse. Accessed via `computer.interface`. +Computer is the main class for interacting with the computer. 
+ +### Constructor ```python -interface = computer.interface +Computer(self, display: Union[Display, Dict[str, int], str] = '1024x768', memory: str = '8GB', cpu: str = '4', os_type: OSType = 'macos', name: str = '', image: Optional[str] = None, shared_directories: Optional[List[str]] = None, use_host_computer_server: bool = False, verbosity: Union[int, LogLevel] = logging.INFO, telemetry_enabled: bool = True, provider_type: Union[str, VMProviderType] = VMProviderType.LUME, provider_port: Optional[int] = 7777, noVNC_port: Optional[int] = 8006, api_port: Optional[int] = None, host: str = 'localhost', api_host: Optional[str] = None, storage: Optional[str] = None, ephemeral: bool = False, api_key: Optional[str] = None, experiments: Optional[List[str]] = None, timeout: int = 100, run_opts: Optional[Dict[str, Any]] = None) ``` -All interface methods accept an optional `delay` parameter to add a pause after the action: - -```python -await computer.interface.left_click(500, 300, delay=0.5) -``` - -#### Mouse Actions - -##### `left_click(x=None, y=None, delay=None)` - -Perform a left mouse click. If coordinates are omitted, clicks at current cursor position. - -```python -await computer.interface.left_click(500, 300) -await computer.interface.left_click() # Click at current position -``` - -| Parameter | Type | Description | -| --------- | --------------- | ----------------------------- | -| `x` | `int \| None` | X coordinate (optional) | -| `y` | `int \| None` | Y coordinate (optional) | -| `delay` | `float \| None` | Delay in seconds after action | - -##### `right_click(x=None, y=None, delay=None)` - -Perform a right mouse click. - -```python -await computer.interface.right_click(500, 300) -``` - -##### `double_click(x=None, y=None, delay=None)` - -Perform a double-click. - -```python -await computer.interface.double_click(500, 300) -``` - -##### `mouse_down(x=None, y=None, button="left", delay=None)` - -Press and hold a mouse button. 
- -```python -await computer.interface.mouse_down(100, 100, button="left") -``` - -| Parameter | Type | Description | -| --------- | ----- | ------------------------------------------------ | -| `button` | `str` | Mouse button: `"left"`, `"right"`, or `"middle"` | - -##### `mouse_up(x=None, y=None, button="left", delay=None)` - -Release a mouse button. - -```python -await computer.interface.mouse_up(500, 500, button="left") -``` - -##### `move_cursor(x, y, delay=None)` - -Move the mouse cursor to the specified coordinates. - -```python -await computer.interface.move_cursor(500, 300) -``` - -##### `drag_to(x, y, button="left", duration=0.5, delay=None)` - -Drag from the current cursor position to the specified coordinates. - -```python -# Move cursor to start position first -await computer.interface.move_cursor(100, 100) -# Then drag to end position -await computer.interface.drag_to(500, 500, duration=1.0) -``` - -| Parameter | Type | Description | -| ---------- | ------- | ------------------------------------------------------- | -| `x` | `int` | Ending X coordinate | -| `y` | `int` | Ending Y coordinate | -| `button` | `str` | Mouse button to use: `"left"`, `"right"`, or `"middle"` | -| `duration` | `float` | Duration of drag in seconds | - -##### `drag(path, button="left", duration=0.5, delay=None)` - -Drag along a path of coordinates. - -```python -path = [(100, 100), (200, 150), (300, 200), (400, 250)] -await computer.interface.drag(path, duration=2.0) -``` - -| Parameter | Type | Description | -| --------- | ----------------------- | -------------------------------- | -| `path` | `list[tuple[int, int]]` | List of (x, y) coordinate tuples | - -##### `scroll(x, y, delay=None)` - -Scroll by the specified amounts. Positive y scrolls up, negative scrolls down. 
- -```python -# Scroll down (negative y) -await computer.interface.scroll(0, -3) - -# Scroll up (positive y) -await computer.interface.scroll(0, 3) - -# Scroll right (positive x) -await computer.interface.scroll(3, 0) -``` - -| Parameter | Type | Description | -| --------- | ----- | ------------------------------------------------------------ | -| `x` | `int` | Horizontal scroll amount (positive = right, negative = left) | -| `y` | `int` | Vertical scroll amount (positive = up, negative = down) | - -##### `scroll_down(clicks=1, delay=None)` / `scroll_up(clicks=1, delay=None)` - -Convenience methods for vertical scrolling. - -```python -await computer.interface.scroll_down(3) # Scroll down 3 clicks -await computer.interface.scroll_up(2) # Scroll up 2 clicks -``` - -#### Keyboard Actions - -##### `type_text(text, delay=None)` - -Type text using the keyboard. - -```python -await computer.interface.type_text("Hello, World!") -``` - -| Parameter | Type | Description | -| --------- | ----- | ------------ | -| `text` | `str` | Text to type | - -##### `press(key, delay=None)` - -Press a single key. - -```python -from computer.interface.models import Key - -# Using Key enum (recommended) -await computer.interface.press(Key.ENTER) -await computer.interface.press(Key.PAGE_DOWN) - -# Using string (also supported) -await computer.interface.press("enter") -``` - -| Parameter | Type | Description | -| --------- | ------------ | --------------------------------------- | -| `key` | `Key \| str` | Key to press (use `Key` enum or string) | - -##### `hotkey(*keys, delay=None)` - -Press a key combination. 
- -```python -from computer.interface.models import Key - -# Copy (Ctrl+C) -await computer.interface.hotkey(Key.CTRL, Key.C) - -# Paste (Ctrl+V) -await computer.interface.hotkey(Key.CTRL, Key.V) - -# Save (Ctrl+S) -await computer.interface.hotkey(Key.CTRL, Key.S) - -# Quit (Cmd+Q on macOS) -await computer.interface.hotkey(Key.COMMAND, Key.Q) -``` - -##### `key_down(key, delay=None)` / `key_up(key, delay=None)` - -Press and hold or release a key. - -```python -# Hold shift while clicking -await computer.interface.key_down(Key.SHIFT) -await computer.interface.left_click(500, 300) -await computer.interface.key_up(Key.SHIFT) -``` - -**Supported Keys (Key enum):** - -```python -from computer.interface.models import Key - -# Navigation -Key.PAGE_DOWN, Key.PAGE_UP, Key.HOME, Key.END -Key.LEFT, Key.RIGHT, Key.UP, Key.DOWN - -# Special -Key.RETURN, Key.ENTER # Same key -Key.ESCAPE, Key.ESC # Same key -Key.TAB, Key.SPACE, Key.BACKSPACE, Key.DELETE - -# Modifiers -Key.ALT, Key.CTRL, Key.SHIFT -Key.WIN # Windows key -Key.COMMAND # macOS Cmd key -Key.OPTION # macOS Option key - -# Function keys -Key.F1, Key.F2, Key.F3, Key.F4, Key.F5, Key.F6 -Key.F7, Key.F8, Key.F9, Key.F10, Key.F11, Key.F12 - -# Letters and numbers can be strings or Key.A, Key.B, etc. -``` - -#### Screen Methods - -##### `screenshot(boxes=None, box_color="#FF0000", box_thickness=2, scale_factor=1.0)` - -Capture the current screen with optional box overlays. 
- -```python -# Basic screenshot -screenshot_bytes = await computer.interface.screenshot() - -# Screenshot with bounding boxes -boxes = [ - {"x": 100, "y": 100, "width": 200, "height": 150}, - {"x": 400, "y": 300, "width": 100, "height": 100} -] -screenshot_bytes = await computer.interface.screenshot( - boxes=boxes, - box_color="#00FF00", - box_thickness=3 -) - -# Screenshot scaled down to 50% -screenshot_bytes = await computer.interface.screenshot(scale_factor=0.5) -``` - -**Returns:** `bytes` - Raw image data (PNG format) - -##### `get_screen_size()` - -Get the screen dimensions. - -```python -size = await computer.interface.get_screen_size() -width = size["width"] -height = size["height"] -``` - -**Returns:** `dict[str, int]` - Dictionary with `"width"` and `"height"` keys - -##### `get_cursor_position()` - -Get the current cursor position. - -```python -pos = await computer.interface.get_cursor_position() -x = pos["x"] -y = pos["y"] -``` - -**Returns:** `dict[str, int]` - Dictionary with `"x"` and `"y"` keys - -##### Coordinate Conversion - -##### `to_screen_coordinates(x, y)` - -Convert screenshot coordinates to screen coordinates. - -```python -screen_x, screen_y = await computer.interface.to_screen_coordinates(100, 100) -``` - -**Returns:** `tuple[float, float]` - Screen coordinates - -##### `to_screenshot_coordinates(x, y)` - -Convert screen coordinates to screenshot coordinates. - -```python -ss_x, ss_y = await computer.interface.to_screenshot_coordinates(1920, 1080) -``` - -**Returns:** `tuple[float, float]` - Screenshot coordinates - -#### Clipboard Methods - -##### `copy_to_clipboard()` - -Get the current clipboard contents. - -```python -text = await computer.interface.copy_to_clipboard() -``` - -**Returns:** `str` - Clipboard text content - -##### `set_clipboard(text)` - -Set the clipboard contents. 
- -```python -await computer.interface.set_clipboard("Text to copy") -``` - -| Parameter | Type | Description | -| --------- | ----- | ------------------------ | -| `text` | `str` | Text to set in clipboard | - -#### Shell Methods - -##### `run_command(command)` - -Execute a shell command in the computer. - -```python -result = await computer.interface.run_command("ls -la") -print(result.stdout) -print(result.stderr) -print(result.returncode) -``` - -| Parameter | Type | Description | -| --------- | ----- | ------------------------ | -| `command` | `str` | Shell command to execute | - -**Returns:** `CommandResult` with properties: - -- `stdout: str` - Standard output -- `stderr: str` - Standard error -- `returncode: int` - Exit code (0 = success) - -#### File Methods - -##### `read_text(path, encoding="utf-8")` - -Read a text file from the computer. - -```python -content = await computer.interface.read_text("/home/user/file.txt") -``` - -**Returns:** `str` - File contents - -##### `write_text(path, content, encoding="utf-8")` - -Write text content to a file. - -```python -await computer.interface.write_text("/home/user/file.txt", "Hello!") -``` - -##### `read_bytes(path, offset=0, length=None)` - -Read a file as bytes with optional seeking. - -```python -# Read entire file -data = await computer.interface.read_bytes("/home/user/image.png") - -# Read 1024 bytes starting at offset 512 -data = await computer.interface.read_bytes("/home/user/file.bin", offset=512, length=1024) -``` - -**Returns:** `bytes` - File contents - -##### `write_bytes(path, content, append=False)` - -Write binary content to a file. - -```python -await computer.interface.write_bytes("/home/user/image.png", image_bytes) - -# Append to file -await computer.interface.write_bytes("/home/user/log.bin", data, append=True) -``` - -##### `file_exists(path)` / `directory_exists(path)` - -Check if a file or directory exists. 
- -```python -if await computer.interface.file_exists("/home/user/file.txt"): - print("File exists") - -if await computer.interface.directory_exists("/home/user/documents"): - print("Directory exists") -``` - -**Returns:** `bool` - True if exists - -##### `get_file_size(path)` - -Get the size of a file in bytes. - -```python -size = await computer.interface.get_file_size("/home/user/file.txt") -``` - -**Returns:** `int` - File size in bytes - -##### `list_dir(path)` - -List contents of a directory. - -```python -files = await computer.interface.list_dir("/home/user") -for file in files: - print(file) -``` - -**Returns:** `list[str]` - List of file/directory names - -##### `create_dir(path)` / `delete_dir(path)` / `delete_file(path)` - -Create or delete files and directories. - -```python -await computer.interface.create_dir("/home/user/new_folder") -await computer.interface.delete_file("/home/user/old_file.txt") -await computer.interface.delete_dir("/home/user/old_folder") -``` - -#### Window Management - -##### `launch(application, args=None)` - -Launch an application. - -```python -await computer.interface.launch("xfce4-terminal") -await computer.interface.launch("firefox", ["--private-window"]) -``` - -**Returns:** `int | None` - Window ID if available - -##### `open(uri)` - -Open a URL or file with the default application. - -```python -await computer.interface.open("https://www.google.com") -await computer.interface.open("/home/user/document.pdf") -``` - -##### `get_current_window_id()` - -Get the active window ID. - -```python -window_id = await computer.interface.get_current_window_id() -``` - -**Returns:** `int | str` - Window ID - -##### `get_application_windows(app_name)` - -Get window IDs for an application. 
- -```python -windows = await computer.interface.get_application_windows("firefox") -for window_id in windows: - print(window_id) -``` - -**Returns:** `list[int | str]` - List of window IDs - -##### `get_window_name(window_id)` / `get_window_title(window_id)` - -Get the title of a window. - -```python -title = await computer.interface.get_window_name(window_id) -``` - -**Returns:** `str` - Window title - -##### `get_window_size(window_id)` / `window_size(window_id)` - -Get window dimensions. - -```python -width, height = await computer.interface.get_window_size(window_id) -``` - -**Returns:** `tuple[int, int]` - Width and height in pixels - -##### `set_window_size(window_id, width, height)` - -Set window dimensions. - -```python -await computer.interface.set_window_size(window_id, 1200, 800) -``` - -##### `get_window_position(window_id)` - -Get window position on screen. - -```python -x, y = await computer.interface.get_window_position(window_id) -``` - -**Returns:** `tuple[int, int]` - X and Y coordinates - -##### `set_window_position(window_id, x, y)` - -Set window position on screen. - -```python -await computer.interface.set_window_position(window_id, 100, 100) -``` - -##### `maximize_window(window_id)` / `minimize_window(window_id)` - -Change window state. - -```python -await computer.interface.maximize_window(window_id) -await computer.interface.minimize_window(window_id) -``` - -##### `activate_window(window_id)` - -Bring a window to focus. - -```python -await computer.interface.activate_window(window_id) -``` - -##### `close_window(window_id)` - -Close a window. - -```python -await computer.interface.close_window(window_id) -``` - -#### Accessibility - -##### `get_accessibility_tree()` - -Get the accessibility tree for the current screen. 
- -```python -tree = await computer.interface.get_accessibility_tree() -``` - -**Returns:** `dict` - Accessibility tree structure with UI element information - -##### `get_active_window_bounds()` - -Get the bounds of the active window. - -```python -bounds = await computer.interface.get_active_window_bounds() -x = bounds["x"] -y = bounds["y"] -width = bounds["width"] -height = bounds["height"] -``` - -**Returns:** `dict[str, int]` - Dictionary with `"x"`, `"y"`, `"width"`, `"height"` - -#### Advanced Methods - -##### `get_desktop_environment()` - -Get the desktop environment name. - -```python -de = await computer.interface.get_desktop_environment() -# Returns "XFCE", "GNOME", "KDE", etc. -``` - -**Returns:** `str` - Desktop environment name - -##### `set_wallpaper(path)` - -Set the desktop wallpaper. - -```python -await computer.interface.set_wallpaper("/home/user/wallpaper.jpg") -``` - -##### `playwright_exec(command, params=None)` - -Execute Playwright browser automation commands. - -```python -result = await computer.interface.playwright_exec("goto", {"url": "https://example.com"}) -``` - -**Returns:** `dict` - Command result - ---- - -### Tracing - -The tracing subsystem records computer interactions. Accessed via `computer.tracing`. - -```python -tracing = computer.tracing -``` - -#### Methods - -##### `start(options)` - -Start recording interactions. 
- -```python -await computer.tracing.start({ - "name": "my-workflow", - "screenshots": True, - "api_calls": True, - "accessibility_tree": False, - "metadata": True -}) -``` - -| Option | Type | Default | Description | -| -------------------- | ------ | -------------- | -------------------------- | -| `name` | `str` | Auto-generated | Custom trace name | -| `screenshots` | `bool` | `True` | Capture screenshots | -| `api_calls` | `bool` | `True` | Log interface calls | -| `accessibility_tree` | `bool` | `False` | Record accessibility trees | -| `metadata` | `bool` | `True` | Enable custom metadata | - -##### `stop(options)` - -Stop recording and save the trace. - -```python -trace_path = await computer.tracing.stop({ - "format": "zip", # or "dir" - "path": "/custom/path.zip" # optional -}) -``` - -**Returns:** `str` - Path to saved trace - -##### `add_metadata(key, value)` - -Add custom metadata to the trace. - -```python -await computer.tracing.add_metadata("workflow", "login-flow") -await computer.tracing.add_metadata("step", "entering-credentials") -``` - ---- - -## Provider Types - -Different providers offer different capabilities and trade-offs. - -### Docker Provider - -```python -computer = Computer( - os_type="linux", - provider_type="docker", - image="trycua/cua-xfce:latest" -) -``` - -Best for Linux sandboxes with fast startup. Requires Docker to be installed. - -### Lume Provider - -```python -computer = Computer( - os_type="macos", - provider_type="lume", - name="my-macos-vm" -) -``` - -For macOS virtual machines on Apple Silicon. Requires [Lume](/lume) to be installed. - -### Cloud Provider - -```python -from computer import Computer - -computer = Computer( - os_type="linux", - provider_type="cloud", - api_key="your-api-key" # or set CUA_API_KEY env var -) -``` - -For managed cloud sandboxes. See [CloudProvider](#cloudprovider) for management API. 
- -### Windows Sandbox Provider - -```python -computer = Computer( - os_type="windows", - provider_type="windows-sandbox" -) -``` - -For Windows sandboxes on Windows hosts. Requires Windows Sandbox feature enabled. - -### QEMU Provider - -```python -computer = Computer( - os_type="linux", - provider_type="qemu", - image="/path/to/disk.qcow2" -) -``` - -For full VM emulation with QEMU. Supports any guest OS. - -### Host Provider - -```python -computer = Computer( - os_type="macos", # or current host OS - provider_type="host" -) -``` - -Directly controls the host machine. Use with caution. - ---- - -## CloudProvider - -The `CloudProvider` class enables programmatic management of cloud sandboxes. - -```python -from computer.providers.cloud.provider import CloudProvider - -# Automatically reads CUA_API_KEY from environment -provider = CloudProvider(verbose=False) - -async with provider: - vms = await provider.list_vms() -``` - -### Constructor Parameters - -| Parameter | Type | Default | Description | -| --------- | ------ | --------------------- | -------------------------- | -| `api_key` | `str` | `CUA_API_KEY` env var | API key for authentication | -| `verbose` | `bool` | `False` | Enable verbose logging | +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `logger` | `Any` | | +| `image` | `Any` | | +| `host` | `Any` | | +| `provider_port` | `Any` | | +| `noVNC_port` | `Any` | | +| `api_port` | `Any` | | +| `api_host` | `Any` | | +| `os_type` | `Any` | | +| `provider_type` | `Any` | | +| `ephemeral` | `Any` | | +| `api_key` | `Any` | | +| `timeout` | `Any` | | +| `experiments` | `Any` | | +| `custom_run_opts` | `Any` | | +| `storage` | `Any` | | +| `shared_path` | `Any` | | +| `verbosity` | `Any` | | +| `vm_logger` | `Any` | | +| `interface_logger` | `Any` | | +| `config` | `Any` | | +| `shared_directories` | `Any` | | +| `use_host_computer_server` | `Any` | | +| `interface` | `Any` | Get the computer interface for interacting with 
the VM. | +| `tracing` | `ComputerTracing` | Get the computer tracing instance for recording sessions. | +| `telemetry_enabled` | `bool` | Check if telemetry is enabled for this computer instance. | ### Methods -##### `list_vms()` - -List all sandboxes. +#### Computer.create_desktop_from_apps ```python -async with provider: - vms = await provider.list_vms() - for vm in vms: - print(f"{vm['name']}: {vm['status']}") +def create_desktop_from_apps(self, apps) ``` -##### `get_vm(name)` +Create a virtual desktop from a list of app names, returning a DioramaComputer +that proxies Diorama.Interface but uses diorama_cmds via the computer interface. -Get details for a specific sandbox. +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `apps` | `list[str]` | List of application names to include in the desktop. | + +**Returns:** DioramaComputer: A proxy object with the Diorama interface, but using diorama_cmds. + +#### Computer.run ```python -info = await provider.get_vm("my-vm-name") +async def run(self) -> Optional[str] ``` -##### `run_vm(name)` +Initialize the VM and computer interface. -Start a sandbox. +#### Computer.disconnect ```python -resp = await provider.run_vm("my-vm-name") -# {"name": "my-vm-name", "status": "starting"} +async def disconnect(self) -> None ``` -##### `stop_vm(name)` +Disconnect from the computer's WebSocket interface. -Stop a sandbox. +#### Computer.stop ```python -resp = await provider.stop_vm("my-vm-name") -# {"name": "my-vm-name", "status": "stopping"} +async def stop(self) -> None ``` -##### `restart_vm(name)` +Disconnect from the computer's WebSocket interface and stop the computer. -Restart a sandbox. +#### Computer.start ```python -resp = await provider.restart_vm("my-vm-name") -# {"name": "my-vm-name", "status": "restarting"} +async def start(self) -> None ``` -### Sandbox Status Values +Start the computer. 
-| Status | Description | -| ------------ | ------------------------------ | -| `pending` | Deployment in progress | -| `running` | Active and accessible | -| `stopped` | Stopped but not terminated | -| `terminated` | Permanently destroyed | -| `failed` | Deployment or operation failed | +#### Computer.restart -### HTTP API - -You can also manage sandboxes via HTTP: - -```bash -# List sandboxes -curl -H "Authorization: Bearer $CUA_API_KEY" \ - "https://api.cua.ai/v1/vms" - -# Start sandbox -curl -X POST -H "Authorization: Bearer $CUA_API_KEY" \ - "https://api.cua.ai/v1/vms/my-vm-name/start" - -# Stop sandbox -curl -X POST -H "Authorization: Bearer $CUA_API_KEY" \ - "https://api.cua.ai/v1/vms/my-vm-name/stop" - -# Restart sandbox -curl -X POST -H "Authorization: Bearer $CUA_API_KEY" \ - "https://api.cua.ai/v1/vms/my-vm-name/restart" +```python +async def restart(self) -> None ``` +Restart the computer. + +If using a VM provider that supports restart, this will issue a restart +without tearing down the provider context, then reconnect the interface. +Falls back to stop()+run() when a provider restart is not available. + +#### Computer.get_ip + +```python +async def get_ip(self, max_retries: int = 15, retry_delay: int = 3) -> str +``` + +Get the IP address of the VM or localhost if using host computer server. + +This method delegates to the provider's get_ip method, which waits indefinitely +until the VM has a valid IP address. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `max_retries` | `Any` | Unused parameter, kept for backward compatibility | +| `retry_delay` | `Any` | Delay between retries in seconds (default: 3) | + +**Returns:** IP address of the VM or localhost if using host computer server + +#### Computer.wait_vm_ready + +```python +async def wait_vm_ready(self) -> Optional[Dict[str, Any]] +``` + +Wait for VM to be ready with an IP address. + +**Returns:** VM status information or None if using host computer server.
+ +#### Computer.update + +```python +async def update(self, cpu: Optional[int] = None, memory: Optional[str] = None) +``` + +Update VM settings. + +#### Computer.get_screenshot_size + +```python +def get_screenshot_size(self, screenshot: bytes) -> Dict[str, int] +``` + +Get the dimensions of a screenshot. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `screenshot` | `Any` | The screenshot bytes | + +**Returns:** Dict[str, int]: Dictionary containing 'width' and 'height' of the image + +#### Computer.to_screen_coordinates + +```python +async def to_screen_coordinates(self, x: float, y: float) -> tuple[float, float] +``` + +Convert normalized coordinates to screen coordinates. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `x` | `Any` | X coordinate between 0 and 1 | +| `y` | `Any` | Y coordinate between 0 and 1 | + +**Returns:** tuple[float, float]: Screen coordinates (x, y) + +#### Computer.to_screenshot_coordinates + +```python +async def to_screenshot_coordinates(self, x: float, y: float) -> tuple[float, float] +``` + +Convert screen coordinates to screenshot coordinates. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `x` | `Any` | X coordinate in screen space | +| `y` | `Any` | Y coordinate in screen space | + +**Returns:** tuple[float, float]: (x, y) coordinates in screenshot space + +#### Computer.playwright_exec + +```python +async def playwright_exec(self, command: str, params: Optional[Dict] = None) -> Dict[str, Any] +``` + +Execute a Playwright browser command. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `command` | `Any` | The browser command to execute (visit_url, click, type, scroll, web_search) | +| `params` | `Any` | Command parameters | + +**Returns:** Dict containing the command result + +**Example:** + +```python +# Navigate to a URL +await computer.playwright_exec("visit_url", {"url": "https://example.com"}) + +# Click at coordinates +await computer.playwright_exec("click", {"x": 100, "y": 200}) + +# Type text +await computer.playwright_exec("type", {"text": "Hello, world!"}) + +# Scroll +await computer.playwright_exec("scroll", {"delta_x": 0, "delta_y": -100}) + +# Web search +await computer.playwright_exec("web_search", {"query": "computer use agent"}) +``` + +#### Computer.venv_install + +```python +async def venv_install(self, venv_name: str, requirements: list[str]) +``` + +Install packages in a UV project. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `venv_name` | `Any` | Name of the UV project | +| `requirements` | `Any` | List of package requirements to install | + +**Returns:** Tuple of (stdout, stderr) from the installation command + +#### Computer.pip_install + +```python +async def pip_install(self, requirements: list[str]) +``` + +Install packages using the system Python with UV (no venv). + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `requirements` | `Any` | List of package requirements to install globally/user site. | + +**Returns:** Tuple of (stdout, stderr) from the installation command + +#### Computer.venv_cmd + +```python +async def venv_cmd(self, venv_name: str, command: str) +``` + +Execute a shell command in a UV project. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `venv_name` | `Any` | Name of the UV project | +| `command` | `Any` | Shell command to execute in the UV project | + +**Returns:** Tuple of (stdout, stderr) from the command execution + +#### Computer.venv_exec + +```python +async def venv_exec(self, venv_name: str, python_func, args = (), kwargs = {}) +``` + +Execute Python function in a virtual environment using source code extraction. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `venv_name` | `Any` | Name of the virtual environment | +| `python_func` | `Any` | A callable function to execute. `*args`: positional arguments to pass to the function. `**kwargs`: keyword arguments to pass to the function. | + +**Returns:** The result of the function execution, or raises any exception that occurred + +#### Computer.venv_exec_background + +```python +async def venv_exec_background(self, venv_name: str, python_func, args = (), requirements: Optional[List[str]] = None, kwargs = {}) -> int +``` + +Run the Python function in the venv in the background and return the PID. + +Uses a short launcher Python that spawns a detached child and exits immediately. + +#### Computer.python_exec + +```python +async def python_exec(self, python_func, args = (), kwargs = {}) +``` + +Execute a Python function using the system Python (no venv). + +Uses source extraction and base64 transport, mirroring venv_exec but +without virtual environment activation. + +Returns the function result or raises a reconstructed exception with +remote traceback context appended. + +#### Computer.python_exec_background + +```python +async def python_exec_background(self, python_func, args = (), requirements: Optional[List[str]] = None, kwargs = {}) -> int +``` + +Run a Python function with the system interpreter in the background and return PID. + +Uses a short launcher Python that spawns a detached child and exits immediately.
+ +#### Computer.python_command + +```python +def python_command(self, requirements: Optional[List[str]] = None, venv_name: str = 'default', use_system_python: bool = False, background: bool = False) -> Callable[[Callable[P, R]], Callable[P, Awaitable[R]]] +``` + +Decorator to execute a Python function remotely in this Computer's venv. + +This mirrors `computer.helpers.sandboxed()` but binds to this instance and +optionally ensures required packages are installed before execution. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `requirements` | `Any` | Packages to install in the virtual environment. | +| `venv_name` | `Any` | Name of the virtual environment to use. | +| `use_system_python` | `Any` | If True, use the system Python/pip instead of a venv. | +| `background` | `Any` | If True, run the function detached and return the child PID immediately. | + +**Returns:** A decorator that turns a local function into an async callable which runs remotely and returns the function's result. + --- -## Types +## VMProviderType -### OSType +*Inherits from: StrEnum* -```python -from computer import OSType +Enum of supported VM provider types. -OSType.LINUX # "linux" -OSType.MACOS # "macos" -OSType.WINDOWS # "windows" -``` +### Attributes -### ProviderType - -```python -from computer import ProviderType - -ProviderType.DOCKER # "docker" -ProviderType.LUME # "lume" -ProviderType.CLOUD # "cloud" -ProviderType.QEMU # "qemu" -ProviderType.WINDOWS_SANDBOX # "windows-sandbox" -ProviderType.HOST # "host" -``` - -### Key - -Enum for keyboard keys with cross-platform support. 
- -```python -from computer.interface.models import Key - -# Use in keyboard methods -await computer.interface.press(Key.ENTER) -await computer.interface.hotkey(Key.CTRL, Key.C) -``` - -**Available Keys:** - -| Category | Keys | -| ---------- | ----------------------------------------------------------------------- | -| Navigation | `PAGE_DOWN`, `PAGE_UP`, `HOME`, `END`, `LEFT`, `RIGHT`, `UP`, `DOWN` | -| Special | `RETURN`/`ENTER`, `ESCAPE`/`ESC`, `TAB`, `SPACE`, `BACKSPACE`, `DELETE` | -| Modifiers | `ALT`, `CTRL`, `SHIFT`, `WIN`, `COMMAND`, `OPTION` | -| Function | `F1` through `F12` | -| Letters | `A` through `Z` | -| Numbers | `N0` through `N9` | - -### CommandResult - -Result from `run_command()` calls. - -```python -result = await computer.interface.run_command("echo hello") -result.stdout # "hello\n" -result.stderr # "" -result.returncode # 0 -``` - -| Property | Type | Description | -| ------------ | ----- | ----------------------- | -| `stdout` | `str` | Standard output | -| `stderr` | `str` | Standard error | -| `returncode` | `int` | Exit code (0 = success) | +| Name | Type | Description | +|------|------|-------------| +| `LUME` | `Any` | | +| `LUMIER` | `Any` | | +| `CLOUD` | `Any` | | +| `CLOUDV2` | `Any` | | +| `WINSANDBOX` | `Any` | | +| `DOCKER` | `Any` | | +| `UNKNOWN` | `Any` | | --- -## Environment Variables +## tracing -| Variable | Description | -| ------------- | ------------------------------------------------ | -| `CUA_API_KEY` | API key for cloud provider | -| `CUA_REGION` | Default region for cloud provider | -| `DOCKER_HOST` | Custom Docker host for Docker provider | -| `LUME_HOST` | Custom Lume API host (default: `localhost:7777`) | +Computer tracing functionality for recording sessions. + +This module provides a Computer.tracing API inspired by Playwright's tracing functionality, +allowing users to record computer interactions for debugging, training, and analysis. 
--- -## Context Manager Usage +## ComputerTracing -The Computer class supports async context managers for automatic cleanup: +Computer tracing class that records computer interactions and saves them to disk. + +This class provides a flexible API for recording computer sessions with configurable +options for what to record (screenshots, API calls, video, etc.). + +### Constructor ```python -from computer import Computer - -async with Computer( - os_type="linux", - provider_type="docker", - image="trycua/cua-xfce:latest" -) as computer: - await computer.interface.type_text("Hello!") - # Computer automatically stopped on exit +ComputerTracing(self, computer_instance) ``` +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `is_tracing` | `bool` | Check if tracing is currently active. | + +### Methods + +#### ComputerTracing.start + +```python +async def start(self, config: Optional[Dict[str, Any]] = None) -> None +``` + +Start tracing with the specified configuration. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `config` | `Any` | Tracing configuration dict with options: - video: bool - Record video frames (default: False) - screenshots: bool - Record screenshots (default: True) - api_calls: bool - Record API calls and results (default: True) - accessibility_tree: bool - Record accessibility tree snapshots (default: False) - metadata: bool - Record custom metadata (default: True) - name: str - Custom trace name (default: auto-generated) - path: str - Custom trace directory path (default: auto-generated) | + +#### ComputerTracing.stop + +```python +async def stop(self, options: Optional[Dict[str, Any]] = None) -> str +``` + +Stop tracing and save the trace data. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `options` | `Any` | Stop options dict with: - path: str - Custom output path for the trace archive - format: str - Output format ('zip' or 'dir', default: 'zip') | + +**Returns:** str: Path to the saved trace file or directory + +#### ComputerTracing.record_api_call + +```python +async def record_api_call(self, method: str, args: Dict[str, Any], result: Any = None, error: Optional[Exception] = None) -> None +``` + +Record an API call event. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `method` | `Any` | The method name that was called | +| `args` | `Any` | Arguments passed to the method | +| `result` | `Any` | Result returned by the method | +| `error` | `Any` | Exception raised by the method, if any | + +#### ComputerTracing.record_accessibility_tree + +```python +async def record_accessibility_tree(self) -> None +``` + +Record the current accessibility tree if enabled. + +#### ComputerTracing.add_metadata + +```python +async def add_metadata(self, key: str, value: Any) -> None +``` + +Add custom metadata to the trace. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `key` | `Any` | Metadata key | +| `value` | `Any` | Metadata value | + --- -## Common Patterns +## models -### Click and Type +Models for computer configuration. + +--- + +## BaseVMProvider + +*Inherits from: AsyncContextManager* + +Base interface for VM providers. + +All VM provider implementations must implement this interface. + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `provider_type` | `VMProviderType` | Get the provider type. 
| + +### Methods + +#### BaseVMProvider.get_vm ```python -# Click a text field and type -await computer.interface.left_click(500, 300) -await computer.interface.type_text("Hello, World!") -await computer.interface.press(Key.ENTER) +async def get_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any] ``` -### Drag and Drop +Get VM information by name. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `name` | `Any` | Name of the VM to get information for | +| `storage` | `Any` | Optional storage path override. If provided, this will be used instead of the provider's default storage path. | + +**Returns:** Dictionary with VM information including status, IP address, etc. + +#### BaseVMProvider.list_vms ```python -# Method 1: Using drag_to -await computer.interface.move_cursor(100, 100) -await computer.interface.drag_to(500, 500) - -# Method 2: Using mouse_down/up -await computer.interface.mouse_down(100, 100) -await computer.interface.move_cursor(500, 500) -await computer.interface.mouse_up() +async def list_vms(self) -> ListVMsResponse ``` -### Keyboard Shortcuts +List all available VMs. + +**Returns:** ListVMsResponse: A list of minimal VM objects as defined in `computer.providers.types.MinimalVM`. + +#### BaseVMProvider.run_vm ```python -from computer.interface.models import Key - -# Copy -await computer.interface.hotkey(Key.CTRL, Key.C) - -# Paste -await computer.interface.hotkey(Key.CTRL, Key.V) - -# Select All -await computer.interface.hotkey(Key.CTRL, Key.A) - -# Undo -await computer.interface.hotkey(Key.CTRL, Key.Z) - -# macOS uses Command key -await computer.interface.hotkey(Key.COMMAND, Key.Q) # Quit +async def run_vm(self, image: str, name: str, run_opts: Dict[str, Any], storage: Optional[str] = None) -> Dict[str, Any] ``` -### File Operations +Run a VM by name with the given options. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `image` | `Any` | Name/tag of the image to use | +| `name` | `Any` | Name of the VM to run | +| `run_opts` | `Any` | Dictionary of run options (memory, cpu, etc.) | +| `storage` | `Any` | Optional storage path override. If provided, this will be used instead of the provider's default storage path. | + +**Returns:** Dictionary with VM run status and information + +#### BaseVMProvider.stop_vm ```python -# Read a text file -content = await computer.interface.read_text("/home/user/config.json") -data = json.loads(content) - -# Write a text file -await computer.interface.write_text("/home/user/output.txt", "Results") - -# Read binary file -image_data = await computer.interface.read_bytes("/home/user/photo.jpg") - -# Write binary file -await computer.interface.write_bytes("/home/user/output.png", image_bytes) - -# Check if file exists -if await computer.interface.file_exists("/home/user/data.csv"): - content = await computer.interface.read_text("/home/user/data.csv") +async def stop_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any] ``` + +Stop a VM by name. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `name` | `Any` | Name of the VM to stop | +| `storage` | `Any` | Optional storage path override. If provided, this will be used instead of the provider's default storage path. | + +**Returns:** Dictionary with VM stop status and information + +#### BaseVMProvider.restart_vm + +```python +async def restart_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any] +``` + +Restart a VM by name. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `name` | `Any` | Name of the VM to restart | +| `storage` | `Any` | Optional storage path override. If provided, this will be used instead of the provider's default storage path. 
| + +**Returns:** Dictionary with VM restart status and information + +#### BaseVMProvider.update_vm + +```python +async def update_vm(self, name: str, update_opts: Dict[str, Any], storage: Optional[str] = None) -> Dict[str, Any] +``` + +Update VM configuration. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `name` | `Any` | Name of the VM to update | +| `update_opts` | `Any` | Dictionary of update options (memory, cpu, etc.) | +| `storage` | `Any` | Optional storage path override. If provided, this will be used instead of the provider's default storage path. | + +**Returns:** Dictionary with VM update status and information + +#### BaseVMProvider.get_ip + +```python +async def get_ip(self, name: str, storage: Optional[str] = None, retry_delay: int = 2) -> str +``` + +Get the IP address of a VM, waiting indefinitely until it's available. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `name` | `Any` | Name of the VM to get the IP for | +| `storage` | `Any` | Optional storage path override. If provided, this will be used instead of the provider's default storage path. | +| `retry_delay` | `Any` | Delay between retries in seconds (default: 2) | + +**Returns:** IP address of the VM when it becomes available + +--- + +## Display + +Display configuration. + +### Constructor + +```python +Display(self, width: int, height: int) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `width` | `int` | | +| `height` | `int` | | + +--- + +## Image + +VM image configuration. + +### Constructor + +```python +Image(self, image: str, tag: str, name: str) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `image` | `str` | | +| `tag` | `str` | | +| `name` | `str` | | + +--- + +## Computer + +Computer configuration. 
+ +### Constructor + +```python +Computer(self, image: str, tag: str, name: str, display: Display, memory: str, cpu: str, vm_provider: Optional[BaseVMProvider] = None) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `image` | `str` | | +| `tag` | `str` | | +| `name` | `str` | | +| `display` | `Display` | | +| `memory` | `str` | | +| `cpu` | `str` | | +| `vm_provider` | `Optional[BaseVMProvider]` | | + +### Methods + +#### Computer.get_ip + +```python +async def get_ip(self) -> Optional[str] +``` + +Get the IP address of the VM. + +--- + +## diorama_computer + +--- + +## Key + +*Inherits from: Enum* + +Keyboard keys that can be used with press_key. + +These key names follow a consistent cross-platform keyboard key naming convention. + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `PAGE_DOWN` | `Any` | | +| `PAGE_UP` | `Any` | | +| `HOME` | `Any` | | +| `END` | `Any` | | +| `LEFT` | `Any` | | +| `RIGHT` | `Any` | | +| `UP` | `Any` | | +| `DOWN` | `Any` | | +| `RETURN` | `Any` | | +| `ENTER` | `Any` | | +| `ESCAPE` | `Any` | | +| `ESC` | `Any` | | +| `TAB` | `Any` | | +| `SPACE` | `Any` | | +| `BACKSPACE` | `Any` | | +| `DELETE` | `Any` | | +| `ALT` | `Any` | | +| `CTRL` | `Any` | | +| `SHIFT` | `Any` | | +| `WIN` | `Any` | | +| `COMMAND` | `Any` | | +| `OPTION` | `Any` | | +| `F1` | `Any` | | +| `F2` | `Any` | | +| `F3` | `Any` | | +| `F4` | `Any` | | +| `F5` | `Any` | | +| `F6` | `Any` | | +| `F7` | `Any` | | +| `F8` | `Any` | | +| `F9` | `Any` | | +| `F10` | `Any` | | +| `F11` | `Any` | | +| `F12` | `Any` | | + +### Methods + +#### Key.from_string + +```python +def from_string(cls, key: str) -> Key | str +``` + +Convert a string key name to a Key enum value. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `key` | `Any` | String key name to convert | + +**Returns:** Key enum value if the string matches a known key, otherwise returns the original string for single character keys + +--- + +## DioramaComputer + +A Computer-compatible proxy for Diorama that sends commands over the ComputerInterface. + +### Constructor + +```python +DioramaComputer(self, computer, apps) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `computer` | `Any` | | +| `apps` | `Any` | | +| `interface` | `Any` | | + +### Methods + +#### DioramaComputer.run + +```python +async def run(self) +``` + +Initialize and run the DioramaComputer if not already initialized. + +**Returns:** self: The DioramaComputer instance + +--- + +## DioramaComputerInterface + +Diorama Interface proxy that sends diorama_cmds via the Computer's interface. + +### Constructor + +```python +DioramaComputerInterface(self, computer, apps) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `computer` | `Any` | | +| `apps` | `Any` | | + +### Methods + +#### DioramaComputerInterface.screenshot + +```python +async def screenshot(self, as_bytes = True) +``` + +Take a screenshot of the diorama scene. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `as_bytes` | `bool` | If True, return image as bytes; if False, return PIL Image object | + +**Returns:** bytes or PIL.Image: Screenshot data in the requested format + +#### DioramaComputerInterface.get_screen_size + +```python +async def get_screen_size(self) +``` + +Get the dimensions of the diorama scene. + +**Returns:** dict: Dictionary containing 'width' and 'height' keys with pixel dimensions + +#### DioramaComputerInterface.move_cursor + +```python +async def move_cursor(self, x, y) +``` + +Move the cursor to the specified coordinates. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | X coordinate to move cursor to | +| `y` | `int` | Y coordinate to move cursor to | + +#### DioramaComputerInterface.left_click + +```python +async def left_click(self, x = None, y = None) +``` + +Perform a left mouse click at the specified coordinates or current cursor position. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int, optional` | X coordinate to click at. If None, clicks at current cursor position | +| `y` | `int, optional` | Y coordinate to click at. If None, clicks at current cursor position | + +#### DioramaComputerInterface.right_click + +```python +async def right_click(self, x = None, y = None) +``` + +Perform a right mouse click at the specified coordinates or current cursor position. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int, optional` | X coordinate to click at. If None, clicks at current cursor position | +| `y` | `int, optional` | Y coordinate to click at. If None, clicks at current cursor position | + +#### DioramaComputerInterface.double_click + +```python +async def double_click(self, x = None, y = None) +``` + +Perform a double mouse click at the specified coordinates or current cursor position. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int, optional` | X coordinate to double-click at. If None, clicks at current cursor position | +| `y` | `int, optional` | Y coordinate to double-click at. If None, clicks at current cursor position | + +#### DioramaComputerInterface.scroll_up + +```python +async def scroll_up(self, clicks = 1) +``` + +Scroll up by the specified number of clicks. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `clicks` | `int` | Number of scroll clicks to perform upward. 
Defaults to 1 | + +#### DioramaComputerInterface.scroll_down + +```python +async def scroll_down(self, clicks = 1) +``` + +Scroll down by the specified number of clicks. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `clicks` | `int` | Number of scroll clicks to perform downward. Defaults to 1 | + +#### DioramaComputerInterface.drag_to + +```python +async def drag_to(self, x, y, duration = 0.5) +``` + +Drag from the current cursor position to the specified coordinates. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | X coordinate to drag to | +| `y` | `int` | Y coordinate to drag to | +| `duration` | `float` | Duration of the drag operation in seconds. Defaults to 0.5 | + +#### DioramaComputerInterface.get_cursor_position + +```python +async def get_cursor_position(self) +``` + +Get the current cursor position. + +**Returns:** dict: Dictionary containing the current cursor coordinates + +#### DioramaComputerInterface.type_text + +```python +async def type_text(self, text) +``` + +Type the specified text at the current cursor position. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `text` | `str` | The text to type | + +#### DioramaComputerInterface.press_key + +```python +async def press_key(self, key) +``` + +Press a single key. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `key` | `Any` | The key to press | + +#### DioramaComputerInterface.hotkey + +```python +async def hotkey(self, keys = ()) +``` + +Press multiple keys simultaneously as a hotkey combination. + +**Raises:** + +- `ValueError` - If any key is not a Key enum or string type + +#### DioramaComputerInterface.to_screen_coordinates + +```python +async def to_screen_coordinates(self, x, y) +``` + +Convert coordinates to screen coordinates. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | X coordinate to convert | +| `y` | `int` | Y coordinate to convert | + +**Returns:** dict: Dictionary containing the converted screen coordinates + +--- + +## helpers + +Helper functions and decorators for the Computer module. + +--- + +## DependencyInfo + +*Inherits from: TypedDict* + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `import_statements` | `List[str]` | | +| `definitions` | `List[tuple[str, Any]]` | | + +### set_default_computer + +```python +def set_default_computer(computer: Any) -> None +``` + +Set the default computer instance to be used by the remote decorator. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `computer` | `Any` | The computer instance to use as default | + +### sandboxed + +```python +def sandboxed(venv_name: str = 'default', computer: str = 'default', max_retries: int = 3) -> Callable[[Callable[P, R]], Callable[P, Awaitable[R]]] +``` + +Decorator that wraps a function to be executed remotely via computer.venv_exec + +The function is automatically analyzed for dependencies (imports, helper functions, +constants, etc.) and reconstructed with all necessary code in the remote sandbox. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `venv_name` | `Any` | Name of the virtual environment to execute in | +| `computer` | `Any` | The computer instance to use, or "default" to use the globally set default | +| `max_retries` | `Any` | Maximum number of retries for the remote execution | + +### generate_source_code + +```python +def generate_source_code(func: FunctionType) -> str +``` + +Generate complete source code for a function with all dependencies. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `func` | `Any` | The function to generate source code for | + +**Returns:** Complete Python source code as a string + +--- + +## interface + +Interface package for Computer SDK. + +--- + +## BaseComputerInterface + +*Inherits from: ABC* + +Base class for computer control interfaces. + +### Constructor + +```python +BaseComputerInterface(self, ip_address: str, username: str = 'lume', password: str = 'lume', api_key: Optional[str] = None, vm_name: Optional[str] = None) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `ip_address` | `Any` | | +| `username` | `Any` | | +| `password` | `Any` | | +| `api_key` | `Any` | | +| `vm_name` | `Any` | | +| `logger` | `Any` | | +| `delay` | `float` | | + +### Methods + +#### BaseComputerInterface.wait_for_ready + +```python +async def wait_for_ready(self, timeout: int = 60) -> None +``` + +Wait for interface to be ready. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `timeout` | `Any` | Maximum time to wait in seconds | + +**Raises:** + +- `TimeoutError` - If interface is not ready within timeout + +#### BaseComputerInterface.close + +```python +def close(self) -> None +``` + +Close the interface connection. + +#### BaseComputerInterface.force_close + +```python +def force_close(self) -> None +``` + +Force close the interface connection. + +By default, this just calls close(), but subclasses can override +to provide more forceful cleanup. + +#### BaseComputerInterface.mouse_down + +```python +async def mouse_down(self, x: Optional[int] = None, y: Optional[int] = None, button: MouseButton = 'left', delay: Optional[float] = None) -> None +``` + +Press and hold a mouse button. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `x` | `Any` | X coordinate to press at. If None, uses current cursor position. 
| +| `y` | `Any` | Y coordinate to press at. If None, uses current cursor position. | +| `button` | `Any` | Mouse button to press ('left', 'middle', 'right'). | +| `delay` | `Any` | Optional delay in seconds after the action | + +#### BaseComputerInterface.mouse_up + +```python +async def mouse_up(self, x: Optional[int] = None, y: Optional[int] = None, button: MouseButton = 'left', delay: Optional[float] = None) -> None +``` + +Release a mouse button. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `x` | `Any` | X coordinate to release at. If None, uses current cursor position. | +| `y` | `Any` | Y coordinate to release at. If None, uses current cursor position. | +| `button` | `Any` | Mouse button to release ('left', 'middle', 'right'). | +| `delay` | `Any` | Optional delay in seconds after the action | + +#### BaseComputerInterface.left_click + +```python +async def left_click(self, x: Optional[int] = None, y: Optional[int] = None, delay: Optional[float] = None) -> None +``` + +Perform a left mouse button click. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `x` | `Any` | X coordinate to click at. If None, uses current cursor position. | +| `y` | `Any` | Y coordinate to click at. If None, uses current cursor position. | +| `delay` | `Any` | Optional delay in seconds after the action | + +#### BaseComputerInterface.right_click + +```python +async def right_click(self, x: Optional[int] = None, y: Optional[int] = None, delay: Optional[float] = None) -> None +``` + +Perform a right mouse button click. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `x` | `Any` | X coordinate to click at. If None, uses current cursor position. | +| `y` | `Any` | Y coordinate to click at. If None, uses current cursor position. 
| +| `delay` | `Any` | Optional delay in seconds after the action | + +#### BaseComputerInterface.double_click + +```python +async def double_click(self, x: Optional[int] = None, y: Optional[int] = None, delay: Optional[float] = None) -> None +``` + +Perform a double left mouse button click. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `x` | `Any` | X coordinate to double-click at. If None, uses current cursor position. | +| `y` | `Any` | Y coordinate to double-click at. If None, uses current cursor position. | +| `delay` | `Any` | Optional delay in seconds after the action | + +#### BaseComputerInterface.move_cursor + +```python +async def move_cursor(self, x: int, y: int, delay: Optional[float] = None) -> None +``` + +Move the cursor to the specified screen coordinates. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `x` | `Any` | X coordinate to move cursor to. | +| `y` | `Any` | Y coordinate to move cursor to. | +| `delay` | `Any` | Optional delay in seconds after the action | + +#### BaseComputerInterface.drag_to + +```python +async def drag_to(self, x: int, y: int, button: str = 'left', duration: float = 0.5, delay: Optional[float] = None) -> None +``` + +Drag from current position to specified coordinates. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `x` | `Any` | The x coordinate to drag to | +| `y` | `Any` | The y coordinate to drag to | +| `button` | `Any` | The mouse button to use ('left', 'middle', 'right') | +| `duration` | `Any` | How long the drag should take in seconds | +| `delay` | `Any` | Optional delay in seconds after the action | + +#### BaseComputerInterface.drag + +```python +async def drag(self, path: List[Tuple[int, int]], button: str = 'left', duration: float = 0.5, delay: Optional[float] = None) -> None +``` + +Drag the cursor along a path of coordinates. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `path` | `Any` | List of (x, y) coordinate tuples defining the drag path | +| `button` | `Any` | The mouse button to use ('left', 'middle', 'right') | +| `duration` | `Any` | Total time in seconds that the drag operation should take | +| `delay` | `Any` | Optional delay in seconds after the action | + +#### BaseComputerInterface.key_down + +```python +async def key_down(self, key: str, delay: Optional[float] = None) -> None +``` + +Press and hold a key. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `key` | `Any` | The key to press and hold (e.g., 'a', 'shift', 'ctrl'). | +| `delay` | `Any` | Optional delay in seconds after the action. | + +#### BaseComputerInterface.key_up + +```python +async def key_up(self, key: str, delay: Optional[float] = None) -> None +``` + +Release a previously pressed key. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `key` | `Any` | The key to release (e.g., 'a', 'shift', 'ctrl'). | +| `delay` | `Any` | Optional delay in seconds after the action. | + +#### BaseComputerInterface.type_text + +```python +async def type_text(self, text: str, delay: Optional[float] = None) -> None +``` + +Type the specified text string. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `text` | `Any` | The text string to type. | +| `delay` | `Any` | Optional delay in seconds after the action. | + +#### BaseComputerInterface.press_key + +```python +async def press_key(self, key: str, delay: Optional[float] = None) -> None +``` + +Press and release a single key. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `key` | `Any` | The key to press (e.g., 'a', 'enter', 'escape'). | +| `delay` | `Any` | Optional delay in seconds after the action. 
| + +#### BaseComputerInterface.hotkey + +```python +async def hotkey(self, *keys: str, delay: Optional[float] = None) -> None +``` + +Press multiple keys simultaneously (keyboard shortcut). + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `delay` | `Any` | Optional delay in seconds after the action. | + +#### BaseComputerInterface.scroll + +```python +async def scroll(self, x: int, y: int, delay: Optional[float] = None) -> None +``` + +Scroll the mouse wheel by specified amounts. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `x` | `Any` | Horizontal scroll amount (positive = right, negative = left). | +| `y` | `Any` | Vertical scroll amount (positive = up, negative = down). | +| `delay` | `Any` | Optional delay in seconds after the action. | + +#### BaseComputerInterface.scroll_down + +```python +async def scroll_down(self, clicks: int = 1, delay: Optional[float] = None) -> None +``` + +Scroll down by the specified number of clicks. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `clicks` | `Any` | Number of scroll clicks to perform downward. | +| `delay` | `Any` | Optional delay in seconds after the action. | + +#### BaseComputerInterface.scroll_up + +```python +async def scroll_up(self, clicks: int = 1, delay: Optional[float] = None) -> None +``` + +Scroll up by the specified number of clicks. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `clicks` | `Any` | Number of scroll clicks to perform upward. | +| `delay` | `Any` | Optional delay in seconds after the action. | + +#### BaseComputerInterface.screenshot + +```python +async def screenshot(self) -> bytes +``` + +Take a screenshot. + +**Returns:** Raw bytes of the screenshot image + +#### BaseComputerInterface.get_screen_size + +```python +async def get_screen_size(self) -> Dict[str, int] +``` + +Get the screen dimensions.
+ +**Returns:** Dict with 'width' and 'height' keys + +#### BaseComputerInterface.get_cursor_position + +```python +async def get_cursor_position(self) -> Dict[str, int] +``` + +Get the current cursor position on screen. + +**Returns:** Dict with 'x' and 'y' keys containing cursor coordinates. + +#### BaseComputerInterface.copy_to_clipboard + +```python +async def copy_to_clipboard(self) -> str +``` + +Get the current clipboard content. + +**Returns:** The text content currently stored in the clipboard. + +#### BaseComputerInterface.set_clipboard + +```python +async def set_clipboard(self, text: str) -> None +``` + +Set the clipboard content to the specified text. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `text` | `Any` | The text to store in the clipboard. | + +#### BaseComputerInterface.file_exists + +```python +async def file_exists(self, path: str) -> bool +``` + +Check if a file exists at the specified path. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `path` | `Any` | The file path to check. | + +**Returns:** True if the file exists, False otherwise. + +#### BaseComputerInterface.directory_exists + +```python +async def directory_exists(self, path: str) -> bool +``` + +Check if a directory exists at the specified path. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `path` | `Any` | The directory path to check. | + +**Returns:** True if the directory exists, False otherwise. + +#### BaseComputerInterface.list_dir + +```python +async def list_dir(self, path: str) -> List[str] +``` + +List the contents of a directory. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `path` | `Any` | The directory path to list. | + +**Returns:** List of file and directory names in the specified directory. 
+ +#### BaseComputerInterface.read_text + +```python +async def read_text(self, path: str) -> str +``` + +Read the text contents of a file. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `path` | `Any` | The file path to read from. | + +**Returns:** The text content of the file. + +#### BaseComputerInterface.write_text + +```python +async def write_text(self, path: str, content: str) -> None +``` + +Write text content to a file. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `path` | `Any` | The file path to write to. | +| `content` | `Any` | The text content to write. | + +#### BaseComputerInterface.read_bytes + +```python +async def read_bytes(self, path: str, offset: int = 0, length: Optional[int] = None) -> bytes +``` + +Read file binary contents with optional seeking support. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `path` | `Any` | Path to the file | +| `offset` | `Any` | Byte offset to start reading from (default: 0) | +| `length` | `Any` | Number of bytes to read (default: None for entire file) | + +#### BaseComputerInterface.write_bytes + +```python +async def write_bytes(self, path: str, content: bytes) -> None +``` + +Write binary content to a file. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `path` | `Any` | The file path to write to. | +| `content` | `Any` | The binary content to write. | + +#### BaseComputerInterface.delete_file + +```python +async def delete_file(self, path: str) -> None +``` + +Delete a file at the specified path. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `path` | `Any` | The file path to delete. | + +#### BaseComputerInterface.create_dir + +```python +async def create_dir(self, path: str) -> None +``` + +Create a directory at the specified path. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `path` | `Any` | The directory path to create. | + +#### BaseComputerInterface.delete_dir + +```python +async def delete_dir(self, path: str) -> None +``` + +Delete a directory at the specified path. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `path` | `Any` | The directory path to delete. | + +#### BaseComputerInterface.get_file_size + +```python +async def get_file_size(self, path: str) -> int +``` + +Get the size of a file in bytes. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `path` | `Any` | The file path to get the size of. | + +**Returns:** The size of the file in bytes. + +#### BaseComputerInterface.get_desktop_environment + +```python +async def get_desktop_environment(self) -> str +``` + +Get the current desktop environment. + +**Returns:** The name of the current desktop environment. + +#### BaseComputerInterface.set_wallpaper + +```python +async def set_wallpaper(self, path: str) -> None +``` + +Set the desktop wallpaper to the specified path. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `path` | `Any` | The file path to set as wallpaper | + +#### BaseComputerInterface.open + +```python +async def open(self, target: str) -> None +``` + +Open a target using the system's default handler. + +Typically opens files, folders, or URLs with the associated application. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `target` | `Any` | The file path, folder path, or URL to open. | + +#### BaseComputerInterface.launch + +```python +async def launch(self, app: str, args: List[str] | None = None) -> Optional[int] +``` + +Launch an application with optional arguments. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `app` | `Any` | The application executable or bundle identifier. 
| +| `args` | `Any` | Optional list of arguments to pass to the application. | + +**Returns:** Optional process ID (PID) of the launched application if available, otherwise None. + +#### BaseComputerInterface.get_current_window_id + +```python +async def get_current_window_id(self) -> int | str +``` + +Get the identifier of the currently active/focused window. + +**Returns:** A window identifier that can be used with other window management methods. + +#### BaseComputerInterface.get_application_windows + +```python +async def get_application_windows(self, app: str) -> List[int | str] +``` + +Get all window identifiers for a specific application. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `app` | `Any` | The application name, executable, or identifier to query. | + +**Returns:** A list of window identifiers belonging to the specified application. + +#### BaseComputerInterface.get_window_name + +```python +async def get_window_name(self, window_id: int | str) -> str +``` + +Get the title/name of a window. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `window_id` | `Any` | The window identifier. | + +**Returns:** The window's title or name string. + +#### BaseComputerInterface.get_window_size + +```python +async def get_window_size(self, window_id: int | str) -> tuple[int, int] +``` + +Get the size of a window in pixels. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `window_id` | `Any` | The window identifier. | + +**Returns:** A tuple of (width, height) representing the window size in pixels. + +#### BaseComputerInterface.get_window_position + +```python +async def get_window_position(self, window_id: int | str) -> tuple[int, int] +``` + +Get the screen position of a window. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `window_id` | `Any` | The window identifier. 
| + +**Returns:** A tuple of (x, y) representing the window's top-left corner in screen coordinates. + +#### BaseComputerInterface.set_window_size + +```python +async def set_window_size(self, window_id: int | str, width: int, height: int) -> None +``` + +Set the size of a window in pixels. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `window_id` | `Any` | The window identifier. | +| `width` | `Any` | Desired width in pixels. | +| `height` | `Any` | Desired height in pixels. | + +#### BaseComputerInterface.set_window_position + +```python +async def set_window_position(self, window_id: int | str, x: int, y: int) -> None +``` + +Move a window to a specific position on the screen. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `window_id` | `Any` | The window identifier. | +| `x` | `Any` | X coordinate for the window's top-left corner. | +| `y` | `Any` | Y coordinate for the window's top-left corner. | + +#### BaseComputerInterface.maximize_window + +```python +async def maximize_window(self, window_id: int | str) -> None +``` + +Maximize a window. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `window_id` | `Any` | The window identifier. | + +#### BaseComputerInterface.minimize_window + +```python +async def minimize_window(self, window_id: int | str) -> None +``` + +Minimize a window. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `window_id` | `Any` | The window identifier. | + +#### BaseComputerInterface.activate_window + +```python +async def activate_window(self, window_id: int | str) -> None +``` + +Bring a window to the foreground and focus it. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `window_id` | `Any` | The window identifier. 
| + +#### BaseComputerInterface.close_window + +```python +async def close_window(self, window_id: int | str) -> None +``` + +Close a window. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `window_id` | `Any` | The window identifier. | + +#### BaseComputerInterface.get_window_title + +```python +async def get_window_title(self, window_id: int | str) -> str +``` + +Convenience alias for get_window_name(). + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `window_id` | `Any` | The window identifier. | + +**Returns:** The window's title or name string. + +#### BaseComputerInterface.window_size + +```python +async def window_size(self, window_id: int | str) -> tuple[int, int] +``` + +Convenience alias for get_window_size(). + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `window_id` | `Any` | The window identifier. | + +**Returns:** A tuple of (width, height) representing the window size in pixels. + +#### BaseComputerInterface.run_command + +```python +async def run_command(self, command: str) -> CommandResult +``` + +Run shell command and return structured result. + +Executes a shell command using subprocess.run with shell=True and check=False. +The command is run in the target environment and captures both stdout and stderr. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `command` | `str` | The shell command to execute | + +**Returns:** CommandResult: A structured result containing: - stdout (str): Standard output from the command - stderr (str): Standard error from the command - returncode (int): Exit code from the command (0 indicates success) + +**Raises:** + +- `RuntimeError` - If the command execution fails at the system level + +**Example:** + +```python +result = await interface.run_command("ls -la") +if result.returncode == 0: + print(f"Output: {result.stdout}") +else: + print(f"Error: {result.stderr}, Exit code: {result.returncode}") +``` + +#### BaseComputerInterface.get_accessibility_tree + +```python +async def get_accessibility_tree(self) -> Dict +``` + +Get the accessibility tree of the current screen. + +**Returns:** Dict containing the hierarchical accessibility information of screen elements. + +#### BaseComputerInterface.to_screen_coordinates + +```python +async def to_screen_coordinates(self, x: float, y: float) -> tuple[float, float] +``` + +Convert screenshot coordinates to screen coordinates. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `x` | `Any` | X coordinate in screenshot space | +| `y` | `Any` | Y coordinate in screenshot space | + +**Returns:** tuple[float, float]: (x, y) coordinates in screen space + +#### BaseComputerInterface.to_screenshot_coordinates + +```python +async def to_screenshot_coordinates(self, x: float, y: float) -> tuple[float, float] +``` + +Convert screen coordinates to screenshot coordinates. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `x` | `Any` | X coordinate in screen space | +| `y` | `Any` | Y coordinate in screen space | + +**Returns:** tuple[float, float]: (x, y) coordinates in screenshot space + +--- + +## InterfaceFactory + +Factory for creating OS-specific computer interfaces. 
+ +### Methods + +#### InterfaceFactory.create_interface_for_os + +```python +def create_interface_for_os(os: OSType, ip_address: str, api_port: Optional[int] = None, api_key: Optional[str] = None, vm_name: Optional[str] = None) -> BaseComputerInterface +``` + +Create an interface for the specified OS. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `os` | `Any` | Operating system type ('macos', 'linux', or 'windows') | +| `ip_address` | `Any` | IP address of the computer to control | +| `api_port` | `Any` | Optional API port of the computer to control | +| `api_key` | `Any` | Optional API key for cloud authentication | +| `vm_name` | `Any` | Optional VM name for cloud authentication | + +**Returns:** BaseComputerInterface: The appropriate interface for the OS + +**Raises:** + +- `ValueError` - If the OS type is not supported + +--- + +## MacOSComputerInterface + +*Inherits from: GenericComputerInterface* + +Interface for macOS. + +### Constructor + +```python +MacOSComputerInterface(self, ip_address: str, username: str = 'lume', password: str = 'lume', api_key: Optional[str] = None, vm_name: Optional[str] = None, api_port: Optional[int] = None) +``` + +### Methods + +#### MacOSComputerInterface.diorama_cmd + +```python +async def diorama_cmd(self, action: str, arguments: Optional[dict] = None) -> dict +``` + +Send a diorama command to the server (macOS only). 
diff --git a/docs/content/docs/cua/reference/computer-sdk/meta.json b/docs/content/docs/cua/reference/computer-sdk/meta.json index bf192152..5ae7deab 100644 --- a/docs/content/docs/cua/reference/computer-sdk/meta.json +++ b/docs/content/docs/cua/reference/computer-sdk/meta.json @@ -2,5 +2,5 @@ "title": "Computer SDK", "description": "Python API for sandboxed desktop environments", "icon": "Monitor", - "pages": ["index"] + "pages": ["changelog"] } diff --git a/docs/content/docs/cua/reference/computer-sdk/v0.3/api.mdx b/docs/content/docs/cua/reference/computer-sdk/v0.3/api.mdx new file mode 100644 index 00000000..b4f1973c --- /dev/null +++ b/docs/content/docs/cua/reference/computer-sdk/v0.3/api.mdx @@ -0,0 +1,209 @@ +--- +title: Computer SDK v0.3 API Reference +description: API reference for Computer SDK version 0.3 +--- + +{/* +AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY +Generated by: npx tsx scripts/docs-generators/generate-versioned-docs.ts +Source tag: computer-v0.3.7 +Version: 0.3.7 +*/} + +import { Callout } from 'fumadocs-ui/components/callout'; + + + This is documentation for **v0.3**. [View latest version](/cua/reference/computer-sdk). + + +
+ + v0.3.7 + + pip install cua-computer==0.3.7 +
+ +CUA Computer Interface for cross-platform computer control. + +## Classes + +| Class | Description | +| ---------------- | ------------------------------------------------------------- | +| `Computer` | Computer is the main class for interacting with the computer. | +| `VMProviderType` | Enum of supported VM provider types. | + +## Computer + +Computer is the main class for interacting with the computer. + +### Methods + +#### Computer.create_desktop_from_apps + +```python +def create_desktop_from_apps(self, apps) +``` + +Create a virtual desktop from a list of app names, returning a DioramaComputer +that proxies Diorama.Interface but uses diorama_cmds via the computer interface. + +Args: +apps (list[str]): List of application names to include in the desktop. +Returns: +DioramaComputer: A proxy object with the Diorama interface, but using diorama_cmds. + +#### Computer.run + +```python +async def run(self) -> Optional[str] +``` + +Initialize the VM and computer interface. + +#### Computer.disconnect + +```python +async def disconnect(self) -> None +``` + +Disconnect from the computer's WebSocket interface. + +#### Computer.stop + +```python +async def stop(self) -> None +``` + +Disconnect from the computer's WebSocket interface and stop the computer. + +#### Computer.get_ip + +```python +async def get_ip(self, max_retries: int = 15, retry_delay: int = 3) -> str +``` + +Get the IP address of the VM or localhost if using host computer server. + +This method delegates to the provider's get_ip method, which waits indefinitely +until the VM has a valid IP address. + +Args: +max_retries: Unused parameter, kept for backward compatibility +retry_delay: Delay between retries in seconds (default: 3) + +Returns: +IP address of the VM or localhost if using host computer server + +#### Computer.wait_vm_ready + +```python +async def wait_vm_ready(self) -> Optional[Dict[str, Any]] +``` + +Wait for VM to be ready with an IP address.
+ +Returns: +VM status information or None if using host computer server. + +#### Computer.update + +```python +async def update(self, cpu: Optional[int] = None, memory: Optional[str] = None) +``` + +Update VM settings. + +#### Computer.get_screenshot_size + +```python +def get_screenshot_size(self, screenshot: bytes) -> Dict[str, int] +``` + +Get the dimensions of a screenshot. + +Args: +screenshot: The screenshot bytes + +Returns: +Dict[str, int]: Dictionary containing 'width' and 'height' of the image + +#### Computer.to_screen_coordinates + +```python +async def to_screen_coordinates(self, x: float, y: float) -> tuple[float, float] +``` + +Convert normalized coordinates to screen coordinates. + +Args: +x: X coordinate between 0 and 1 +y: Y coordinate between 0 and 1 + +Returns: +tuple[float, float]: Screen coordinates (x, y) + +#### Computer.to_screenshot_coordinates + +```python +async def to_screenshot_coordinates(self, x: float, y: float) -> tuple[float, float] +``` + +Convert screen coordinates to screenshot coordinates. + +Args: +x: X coordinate in screen space +y: Y coordinate in screen space + +Returns: +tuple[float, float]: (x, y) coordinates in screenshot space + +#### Computer.venv_install + +```python +async def venv_install(self, venv_name: str, requirements: list[str]) +``` + +Install packages in a virtual environment. + +Args: +venv_name: Name of the virtual environment +requirements: List of package requirements to install + +Returns: +Tuple of (stdout, stderr) from the installation command + +#### Computer.venv_cmd + +```python +async def venv_cmd(self, venv_name: str, command: str) +``` + +Execute a shell command in a virtual environment. 
+ +Args: +venv_name: Name of the virtual environment +command: Shell command to execute in the virtual environment + +Returns: +Tuple of (stdout, stderr) from the command execution + +#### Computer.venv_exec + +```python +async def venv_exec(self, venv_name: str, python_func, args = (), kwargs = {}) +``` + +Execute Python function in a virtual environment using source code extraction. + +Args: +venv_name: Name of the virtual environment +python_func: A callable function to execute +\*args: Positional arguments to pass to the function +\*\*kwargs: Keyword arguments to pass to the function + +Returns: +The result of the function execution, or raises any exception that occurred + +## VMProviderType + +Enum of supported VM provider types. diff --git a/docs/content/docs/cua/reference/computer-sdk/v0.3/meta.json b/docs/content/docs/cua/reference/computer-sdk/v0.3/meta.json new file mode 100644 index 00000000..1d882707 --- /dev/null +++ b/docs/content/docs/cua/reference/computer-sdk/v0.3/meta.json @@ -0,0 +1,5 @@ +{ + "title": "v0.3", + "description": "Computer SDK v0.3 API Reference", + "pages": ["api"] +} diff --git a/docs/content/docs/cua/reference/computer-sdk/v0.4/api.mdx b/docs/content/docs/cua/reference/computer-sdk/v0.4/api.mdx new file mode 100644 index 00000000..9a865a16 --- /dev/null +++ b/docs/content/docs/cua/reference/computer-sdk/v0.4/api.mdx @@ -0,0 +1,328 @@ +--- +title: Computer SDK v0.4 API Reference +description: API reference for Computer SDK version 0.4 +--- + +{/* +AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY +Generated by: npx tsx scripts/docs-generators/generate-versioned-docs.ts +Source tag: computer-v0.4.19 +Version: 0.4.19 +*/} + +import { Callout } from 'fumadocs-ui/components/callout'; + + + This is documentation for **v0.4**. [View latest version](/cua/reference/computer-sdk). + + +
+ + v0.4.19 + + pip install cua-computer==0.4.19 +
+ +CUA Computer Interface for cross-platform computer control. + +## Classes + +| Class | Description | +| ---------------- | ------------------------------------------------------------- | +| `Computer` | Computer is the main class for interacting with the computer. | +| `VMProviderType` | Enum of supported VM provider types. | + +## Computer + +Computer is the main class for interacting with the computer. + +### Methods + +#### Computer.create_desktop_from_apps + +```python +def create_desktop_from_apps(self, apps) +``` + +Create a virtual desktop from a list of app names, returning a DioramaComputer +that proxies Diorama.Interface but uses diorama_cmds via the computer interface. + +Args: +apps (list[str]): List of application names to include in the desktop. +Returns: +DioramaComputer: A proxy object with the Diorama interface, but using diorama_cmds. + +#### Computer.run + +```python +async def run(self) -> Optional[str] +``` + +Initialize the VM and computer interface. + +#### Computer.disconnect + +```python +async def disconnect(self) -> None +``` + +Disconnect from the computer's WebSocket interface. + +#### Computer.stop + +```python +async def stop(self) -> None +``` + +Disconnect from the computer's WebSocket interface and stop the computer. + +#### Computer.start + +```python +async def start(self) -> None +``` + +Start the computer. + +#### Computer.restart + +```python +async def restart(self) -> None +``` + +Restart the computer. + +If using a VM provider that supports restart, this will issue a restart +without tearing down the provider context, then reconnect the interface. +Falls back to stop()+run() when a provider restart is not available. + +#### Computer.get_ip + +```python +async def get_ip(self, max_retries: int = 15, retry_delay: int = 3) -> str +``` + +Get the IP address of the VM or localhost if using host computer server. 
+ +This method delegates to the provider's get_ip method, which waits indefinitely +until the VM has a valid IP address. + +Args: +max_retries: Unused parameter, kept for backward compatibility +retry_delay: Delay between retries in seconds (default: 3) + +Returns: +IP address of the VM or localhost if using host computer server + +#### Computer.wait_vm_ready + +```python +async def wait_vm_ready(self) -> Optional[Dict[str, Any]] +``` + +Wait for VM to be ready with an IP address. + +Returns: +VM status information or None if using host computer server. + +#### Computer.update + +```python +async def update(self, cpu: Optional[int] = None, memory: Optional[str] = None) +``` + +Update VM settings. + +#### Computer.get_screenshot_size + +```python +def get_screenshot_size(self, screenshot: bytes) -> Dict[str, int] +``` + +Get the dimensions of a screenshot. + +Args: +screenshot: The screenshot bytes + +Returns: +Dict[str, int]: Dictionary containing 'width' and 'height' of the image + +#### Computer.to_screen_coordinates + +```python +async def to_screen_coordinates(self, x: float, y: float) -> tuple[float, float] +``` + +Convert normalized coordinates to screen coordinates. + +Args: +x: X coordinate between 0 and 1 +y: Y coordinate between 0 and 1 + +Returns: +tuple[float, float]: Screen coordinates (x, y) + +#### Computer.to_screenshot_coordinates + +```python +async def to_screenshot_coordinates(self, x: float, y: float) -> tuple[float, float] +``` + +Convert screen coordinates to screenshot coordinates. + +Args: +x: X coordinate in screen space +y: Y coordinate in screen space + +Returns: +tuple[float, float]: (x, y) coordinates in screenshot space + +#### Computer.playwright_exec + +```python +async def playwright_exec(self, command: str, params: Optional[Dict] = None) -> Dict[str, Any] +``` + +Execute a Playwright browser command.
+ +Args: +command: The browser command to execute (visit_url, click, type, scroll, web_search) +params: Command parameters + +Returns: +Dict containing the command result + +Examples: # Navigate to a URL +await computer.playwright_exec("visit_url", \{"url": "https://example.com"\}) + + # Click at coordinates + await computer.playwright_exec("click", \{"x": 100, "y": 200\}) + + # Type text + await computer.playwright_exec("type", \{"text": "Hello, world!"\}) + + # Scroll + await computer.playwright_exec("scroll", \{"delta_x": 0, "delta_y": -100\}) + + # Web search + await computer.playwright_exec("web_search", \{"query": "computer use agent"\}) + +#### Computer.venv_install + +```python +async def venv_install(self, venv_name: str, requirements: list[str]) +``` + +Install packages in a virtual environment. + +Args: +venv_name: Name of the virtual environment +requirements: List of package requirements to install + +Returns: +Tuple of (stdout, stderr) from the installation command + +#### Computer.pip_install + +```python +async def pip_install(self, requirements: list[str]) +``` + +Install packages using the system Python/pip (no venv). + +Args: +requirements: List of package requirements to install globally/user site. + +Returns: +Tuple of (stdout, stderr) from the installation command + +#### Computer.venv_cmd + +```python +async def venv_cmd(self, venv_name: str, command: str) +``` + +Execute a shell command in a virtual environment. + +Args: +venv_name: Name of the virtual environment +command: Shell command to execute in the virtual environment + +Returns: +Tuple of (stdout, stderr) from the command execution + +#### Computer.venv_exec + +```python +async def venv_exec(self, venv_name: str, python_func, args = (), kwargs = {}) +``` + +Execute Python function in a virtual environment using source code extraction. 
+ +Args: +venv_name: Name of the virtual environment +python_func: A callable function to execute +\*args: Positional arguments to pass to the function +\*\*kwargs: Keyword arguments to pass to the function + +Returns: +The result of the function execution, or raises any exception that occurred + +#### Computer.venv_exec_background + +```python +async def venv_exec_background(self, venv_name: str, python_func, *args, requirements: Optional[List[str]] = None, **kwargs) -> int +``` + +Run the Python function in the venv in the background and return the PID. + +Uses a short launcher Python that spawns a detached child and exits immediately. + +#### Computer.python_exec + +```python +async def python_exec(self, python_func, *args, **kwargs) +``` + +Execute a Python function using the system Python (no venv). + +Uses source extraction and base64 transport, mirroring venv_exec but +without virtual environment activation. + +Returns the function result or raises a reconstructed exception with +remote traceback context appended. + +#### Computer.python_exec_background + +```python +async def python_exec_background(self, python_func, *args, requirements: Optional[List[str]] = None, **kwargs) -> int +``` + +Run a Python function with the system interpreter in the background and return PID. + +Uses a short launcher Python that spawns a detached child and exits immediately. + +#### Computer.python_command + +```python +def python_command(self, requirements: Optional[List[str]] = None, venv_name: str = 'default', use_system_python: bool = False, background: bool = False) -> Callable[[Callable[P, R]], Callable[P, Awaitable[R]]] +``` + +Decorator to execute a Python function remotely in this Computer's venv. + +This mirrors `computer.helpers.sandboxed()` but binds to this instance and +optionally ensures required packages are installed before execution. + +Args: +requirements: Packages to install in the virtual environment.
+venv_name: Name of the virtual environment to use. +use_system_python: If True, use the system Python/pip instead of a venv. +background: If True, run the function detached and return the child PID immediately. + +Returns: +A decorator that turns a local function into an async callable which +runs remotely and returns the function's result. + +## VMProviderType + +Enum of supported VM provider types. diff --git a/docs/content/docs/cua/reference/computer-sdk/v0.4/meta.json b/docs/content/docs/cua/reference/computer-sdk/v0.4/meta.json new file mode 100644 index 00000000..c526fd3c --- /dev/null +++ b/docs/content/docs/cua/reference/computer-sdk/v0.4/meta.json @@ -0,0 +1,5 @@ +{ + "title": "v0.4", + "description": "Computer SDK v0.4 API Reference", + "pages": ["api"] +} diff --git a/docs/content/docs/cua/reference/computer-sdk/v0.5/api.mdx b/docs/content/docs/cua/reference/computer-sdk/v0.5/api.mdx new file mode 100644 index 00000000..8bb1dc34 --- /dev/null +++ b/docs/content/docs/cua/reference/computer-sdk/v0.5/api.mdx @@ -0,0 +1,328 @@ +--- +title: Computer SDK v0.5 API Reference +description: API reference for Computer SDK version 0.5 +--- + +{/* +AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY +Generated by: npx tsx scripts/docs-generators/generate-versioned-docs.ts +Source tag: computer-v0.5.12 +Version: 0.5.12 +*/} + +import { Callout } from 'fumadocs-ui/components/callout'; + + + This is documentation for **v0.5**. [View latest version](/cua/reference/computer-sdk). + + +
+ + v0.5.12 + + pip install cua-computer==0.5.12 +
+ +Cua Computer Interface for cross-platform computer control. + +## Classes + +| Class | Description | +| ---------------- | ------------------------------------------------------------- | +| `Computer` | Computer is the main class for interacting with the computer. | +| `VMProviderType` | Enum of supported VM provider types. | + +## Computer + +Computer is the main class for interacting with the computer. + +### Methods + +#### Computer.create_desktop_from_apps + +```python +def create_desktop_from_apps(self, apps) +``` + +Create a virtual desktop from a list of app names, returning a DioramaComputer +that proxies Diorama.Interface but uses diorama_cmds via the computer interface. + +Args: +apps (list[str]): List of application names to include in the desktop. +Returns: +DioramaComputer: A proxy object with the Diorama interface, but using diorama_cmds. + +#### Computer.run + +```python +async def run(self) -> Optional[str] +``` + +Initialize the VM and computer interface. + +#### Computer.disconnect + +```python +async def disconnect(self) -> None +``` + +Disconnect from the computer's WebSocket interface. + +#### Computer.stop + +```python +async def stop(self) -> None +``` + +Disconnect from the computer's WebSocket interface and stop the computer. + +#### Computer.start + +```python +async def start(self) -> None +``` + +Start the computer. + +#### Computer.restart + +```python +async def restart(self) -> None +``` + +Restart the computer. + +If using a VM provider that supports restart, this will issue a restart +without tearing down the provider context, then reconnect the interface. +Falls back to stop()+run() when a provider restart is not available. + +#### Computer.get_ip + +```python +async def get_ip(self, max_retries: int = 15, retry_delay: int = 3) -> str +``` + +Get the IP address of the VM or localhost if using host computer server. 
+ +This method delegates to the provider's get_ip method, which waits indefinitely +until the VM has a valid IP address. + +Args: +max_retries: Unused parameter, kept for backward compatibility +retry_delay: Delay between retries in seconds (default: 3) + +Returns: +IP address of the VM or localhost if using host computer server + +#### Computer.wait_vm_ready + +```python +async def wait_vm_ready(self) -> Optional[Dict[str, Any]] +``` + +Wait for VM to be ready with an IP address. + +Returns: +VM status information or None if using host computer server. + +#### Computer.update + +```python +async def update(self, cpu: Optional[int] = None, memory: Optional[str] = None) +``` + +Update VM settings. + +#### Computer.get_screenshot_size + +```python +def get_screenshot_size(self, screenshot: bytes) -> Dict[str, int] +``` + +Get the dimensions of a screenshot. + +Args: +screenshot: The screenshot bytes + +Returns: +Dict[str, int]: Dictionary containing 'width' and 'height' of the image + +#### Computer.to_screen_coordinates + +```python +async def to_screen_coordinates(self, x: float, y: float) -> tuple[float, float] +``` + +Convert normalized coordinates to screen coordinates. + +Args: +x: X coordinate between 0 and 1 +y: Y coordinate between 0 and 1 + +Returns: +tuple[float, float]: Screen coordinates (x, y) + +#### Computer.to_screenshot_coordinates + +```python +async def to_screenshot_coordinates(self, x: float, y: float) -> tuple[float, float] +``` + +Convert screen coordinates to screenshot coordinates. + +Args: +x: X coordinate in screen space +y: Y coordinate in screen space + +Returns: +tuple[float, float]: (x, y) coordinates in screenshot space + +#### Computer.playwright_exec + +```python +async def playwright_exec(self, command: str, params: Optional[Dict] = None) -> Dict[str, Any] +``` + +Execute a Playwright browser command. 
+ +Args: +command: The browser command to execute (visit_url, click, type, scroll, web_search) +params: Command parameters + +Returns: +Dict containing the command result + +Examples: # Navigate to a URL +await computer.playwright_exec("visit_url", \{"url": "https://example.com"\}) + + # Click at coordinates + await computer.playwright_exec("click", \{"x": 100, "y": 200\}) + + # Type text + await computer.playwright_exec("type", \{"text": "Hello, world!"\}) + + # Scroll + await computer.playwright_exec("scroll", \{"delta_x": 0, "delta_y": -100\}) + + # Web search + await computer.playwright_exec("web_search", \{"query": "computer use agent"\}) + +#### Computer.venv_install + +```python +async def venv_install(self, venv_name: str, requirements: list[str]) +``` + +Install packages in a UV project. + +Args: +venv_name: Name of the UV project +requirements: List of package requirements to install + +Returns: +Tuple of (stdout, stderr) from the installation command + +#### Computer.pip_install + +```python +async def pip_install(self, requirements: list[str]) +``` + +Install packages using the system Python with UV (no venv). + +Args: +requirements: List of package requirements to install globally/user site. + +Returns: +Tuple of (stdout, stderr) from the installation command + +#### Computer.venv_cmd + +```python +async def venv_cmd(self, venv_name: str, command: str) +``` + +Execute a shell command in a UV project. + +Args: +venv_name: Name of the UV project +command: Shell command to execute in the UV project + +Returns: +Tuple of (stdout, stderr) from the command execution + +#### Computer.venv_exec + +```python +async def venv_exec(self, venv_name: str, python_func, args = (), kwargs = {}) +``` + +Execute Python function in a virtual environment using source code extraction. 
+ +Args: +venv_name: Name of the virtual environment +python_func: A callable function to execute +\*args: Positional arguments to pass to the function +\*\*kwargs: Keyword arguments to pass to the function + +Returns: +The result of the function execution, or raises any exception that occurred + +#### Computer.venv_exec_background + +```python +async def venv_exec_background(self, venv_name: str, python_func, args = (), requirements: Optional[List[str]] = None, kwargs = {}) -> int +``` + +Run the Python function in the venv in the background and return the PID. + +Uses a short launcher Python that spawns a detached child and exits immediately. + +#### Computer.python_exec + +```python +async def python_exec(self, python_func, args = (), kwargs = {}) +``` + +Execute a Python function using the system Python (no venv). + +Uses source extraction and base64 transport, mirroring venv_exec but +without virtual environment activation. + +Returns the function result or raises a reconstructed exception with +remote traceback context appended. + +#### Computer.python_exec_background + +```python +async def python_exec_background(self, python_func, args = (), requirements: Optional[List[str]] = None, kwargs = {}) -> int +``` + +Run a Python function with the system interpreter in the background and return PID. + +Uses a short launcher Python that spawns a detached child and exits immediately. + +#### Computer.python_command + +```python +def python_command(self, requirements: Optional[List[str]] = None, venv_name: str = 'default', use_system_python: bool = False, background: bool = False) -> Callable[[Callable[P, R]], Callable[P, Awaitable[R]]] +``` + +Decorator to execute a Python function remotely in this Computer's venv. + +This mirrors `computer.helpers.sandboxed()` but binds to this instance and +optionally ensures required packages are installed before execution. + +Args: +requirements: Packages to install in the virtual environment. 
+venv_name: Name of the virtual environment to use. +use_system_python: If True, use the system Python/pip instead of a venv. +background: If True, run the function detached and return the child PID immediately. + +Returns: +A decorator that turns a local function into an async callable which +runs remotely and returns the function's result. + +## VMProviderType + +Enum of supported VM provider types. diff --git a/docs/content/docs/cua/reference/computer-sdk/v0.5/meta.json b/docs/content/docs/cua/reference/computer-sdk/v0.5/meta.json new file mode 100644 index 00000000..3dedd0e8 --- /dev/null +++ b/docs/content/docs/cua/reference/computer-sdk/v0.5/meta.json @@ -0,0 +1,5 @@ +{ + "title": "v0.5", + "description": "Computer SDK v0.5 API Reference", + "pages": ["api"] +} diff --git a/docs/content/docs/cua/reference/desktop-sandbox/changelog.mdx b/docs/content/docs/cua/reference/desktop-sandbox/changelog.mdx new file mode 100644 index 00000000..f0defa05 --- /dev/null +++ b/docs/content/docs/cua/reference/desktop-sandbox/changelog.mdx @@ -0,0 +1,341 @@ +--- +title: Changelog +description: Release history for Desktop Sandbox +--- + +{/* +AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY +Generated by: npx tsx scripts/docs-generators/generate-changelog.ts +Last updated: 2026-02-04 +*/} + +# Desktop Sandbox Changelog + +All notable changes to the Desktop Sandbox are documented here. 
+ +## 0.3.x + +### v0.3.16 (2026-01-30) + +- Initial release or no path-specific changes found + +Run the server +cua-computer-server + +``` + +### v0.3.15 (2026-01-29) + +* Bump cua-computer-server to v0.3.15 by @github-actions[bot] +* fix(computer-server): initialize coordinate scaling after auto-resize in ([#936](https://github.com/trycua/cua/pull/936)) by @synacktraa +* fix(mcp-server): disable redirect for /mcp endpoint ([#935](https://github.com/trycua/cua/pull/935)) by @Fizza-Mukhtar +* feat(agent): Centralized tool resolution in ComputerAgent ([#920](https://github.com/trycua/cua/pull/920)) by @sarinali + +Run the server +cua-computer-server +``` + +### v0.3.14 (2026-01-24) + +Maintenance release. + +### v0.3.13 (2026-01-23) + +- Bump cua-computer-server to v0.3.13 by @github-actions[bot] +- fix(computer-server): standardize Android handler response format ([#893](https://github.com/trycua/cua/pull/893)) by @synacktraa + +Run the server +cua-computer-server + +``` + +### v0.3.12 (2026-01-21) + +Maintenance release. 
+ +### v0.3.11 (2026-01-21) + +* Initial release or no path-specific changes found + +Run the server +cua-computer-server +``` + +### v0.3.9 (2026-01-21) + +- Bump cua-computer-server to v0.3.9 by @github-actions[bot] +- fix(computer-server): make fastmcp a required dependency ([#875](https://github.com/trycua/cua/pull/875)) by @Francesco Bonacci + +Run the server +cua-computer-server + +``` + +### v0.3.8 (2026-01-20) + +* Bump cua-computer-server to v0.3.8 by @github-actions[bot] +* fix(computer-server): correct MCP endpoint path and lifespan ([#869](https://github.com/trycua/cua/pull/869)) by @Francesco Bonacci + +Run the server +cua-computer-server +``` + +### v0.3.7 (2026-01-20) + +- Bump cua-computer-server to v0.3.7 by @github-actions[bot] +- feat(computer-server): expose HTTP and MCP interfaces simultaneously ([#861](https://github.com/trycua/cua/pull/861)) by @Francesco Bonacci + +Run the server +cua-computer-server + +``` + +### v0.3.6 (2026-01-20) + +* Bump cua-computer-server to v0.3.6 by @github-actions[bot] +* feat(computer-server): auto-detect MCP mode and expose HTTP+MCP simultaneously ([#860](https://github.com/trycua/cua/pull/860)) by @Francesco Bonacci + +Run the server +cua-computer-server +``` + +### v0.3.5 (2026-01-20) + +- Bump cua-computer-server to v0.3.5 by @github-actions[bot] +- feat(computer-server): add MCP interface for Claude Code integration ([#859](https://github.com/trycua/cua/pull/859)) by @Francesco Bonacci + +Run the server +cua-computer-server + +``` + +### v0.3.2 (2026-01-11) + +* Initial release or no path-specific changes found + +### v0.3.1 (2026-01-11) + +Maintenance release. 
+ +## 0.1.x + +### v0.1.28 (2025-10-29) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server +``` + +- Add test infrastructure with CI/CD #478 by @AceAtDev in https://github.com/trycua/cua/pull/491 +- Add desktop and window management commands to computer interface by @ddupont808 in https://github.com/trycua/cua/pull/516 + +New Contributors + +- @AceAtDev made their first contribution in https://github.com/trycua/cua/pull/491 + +### v0.1.27 (2025-10-24) + +**Dependencies:** cua-computer: latest + +- Patch Python SSL on Windows VMs to use system cert store (Python bug: https://bugs.python.org/issue36011) by @ddupont808 in https://github.com/trycua/cua/pull/510 + +### v0.1.26 (2025-10-24) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server + +``` + + +* Moved platform-specific deps from computer-server out of optional dependencies (fixes accessibility API not working on macOS, Windows) by @ddupont808 in https://github.com/trycua/cua/pull/508 + +### v0.1.25 (2025-10-22) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server +``` + +- Fix authentication / connection errors caused by computer-server using old API url by @ddupont808 in https://github.com/trycua/cua/pull/499 + +New Contributors + +- @masterbatcoderman10 made their first contribution in https://github.com/trycua/cua/pull/480 + +### v0.1.23 (2025-10-03) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server + +``` + +### v0.1.22 (2025-09-03) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server +``` + +### v0.1.21 (2025-08-06) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server + +``` + +### v0.1.20 (2025-07-29) + +Replaced `pyautogui.type_text` with `pynput.KeyboardController`, fixing the bug where multi-line text would just type the letter `"v"` on Linux + +### v0.1.19 (2025-07-11) + +**Dependencies:** cua-computer: latest + +Run the server 
+cua-computer-server +``` + +### v0.1.18 (2025-07-10) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server + +``` + +### v0.1.17 (2025-07-09) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server +``` + +### v0.1.16 (2025-07-09) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server + +``` + +### v0.1.15 (2025-07-01) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server +``` + +### v0.1.14 (2025-07-01) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server + +``` + +### v0.1.13 (2025-06-18) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server +``` + +### v0.1.12 (2025-06-10) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server + +``` + +### v0.1.11 (2025-06-05) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server +``` + +### v0.1.10 (2025-05-31) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server + +``` + +### v0.1.9 (2025-05-28) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server +``` + +### v0.1.8 (2025-05-28) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server + +``` + +### v0.1.7 (2025-05-28) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server +``` + +### v0.1.6 (2025-05-25) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server + +``` + +### v0.1.5 (2025-05-16) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server +``` + +### v0.1.4 (2025-04-25) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server + +``` + +### v0.1.3 (2025-04-15) + +**Dependencies:** cua-computer: latest + +Run the server +cua-computer-server +``` diff --git a/docs/content/docs/cua/reference/desktop-sandbox/index.mdx b/docs/content/docs/cua/reference/desktop-sandbox/index.mdx index 
0c6ab590..dbfb1300 100644 --- a/docs/content/docs/cua/reference/desktop-sandbox/index.mdx +++ b/docs/content/docs/cua/reference/desktop-sandbox/index.mdx @@ -7,43 +7,43 @@ Desktop sandbox environments provide isolated desktops for running Computer-Usin ## Sandbox Options -### [macOS Sandbox](/cua/reference/desktop-sandbox/macos) +### macOS Sandbox -Native macOS virtual machines on Apple Silicon using Apple's Virtualization Framework. Near-native performance for macOS automation. +Native macOS virtual machines on Apple Silicon using Apple's Virtualization Framework. Near-native performance for macOS automation. [Learn more →](/cua/reference/desktop-sandbox/macos) - **Lume** - Native CLI for VM management - **Lumier** - Docker wrapper for containerized deployments -### [Linux Container](/cua/reference/desktop-sandbox/linux-container) +### Linux Container -Docker containers running Linux desktops. Fast startup, low resource usage. +Docker containers running Linux desktops. Fast startup, low resource usage. [Learn more →](/cua/reference/desktop-sandbox/linux-container) -| Container | Description | -|-----------|-------------| -| [Kasm](/cua/reference/desktop-sandbox/linux-container/kasm) | KasmWeb-based Ubuntu with XFCE | +| Container | Description | +| ----------------------------------------------------------- | ---------------------------------- | +| [Kasm](/cua/reference/desktop-sandbox/linux-container/kasm) | KasmWeb-based Ubuntu with XFCE | | [XFCE](/cua/reference/desktop-sandbox/linux-container/xfce) | Vanilla XFCE, minimal dependencies | -### [QEMU Container](/cua/reference/desktop-sandbox/qemu-container) +### QEMU Container -Full virtual machines running in Docker via QEMU/KVM. Complete OS isolation, supports Windows. +Full virtual machines running in Docker via QEMU/KVM. Complete OS isolation, supports Windows. 
[Learn more →](/cua/reference/desktop-sandbox/qemu-container) -| Container | OS | Description | -|-----------|-----|-------------| -| [Windows](/cua/reference/desktop-sandbox/qemu-container/windows) | Windows 11 | Windows desktop with KVM | -| [Linux](/cua/reference/desktop-sandbox/qemu-container/linux) | Ubuntu 22.04 | Full Ubuntu VM | -| [Android](/cua/reference/desktop-sandbox/qemu-container/android) | Android 11 | Android emulator | +| Container | OS | Description | +| ---------------------------------------------------------------- | ------------ | ------------------------ | +| [Windows](/cua/reference/desktop-sandbox/qemu-container/windows) | Windows 11 | Windows desktop with KVM | +| [Linux](/cua/reference/desktop-sandbox/qemu-container/linux) | Ubuntu 22.04 | Full Ubuntu VM | +| [Android](/cua/reference/desktop-sandbox/qemu-container/android) | Android 11 | Android emulator | ## Quick Comparison -| Feature | macOS (Lume) | Linux Container | QEMU VM | -|---------|--------------|-----------------|---------| -| Performance | Near-native | Native | Virtualized | -| Startup Time | Minutes | Seconds | 30s-2min | -| Host Requirements | Apple Silicon | Docker | KVM support | -| Windows Support | No | No | Yes | -| Resource Usage | Medium | Low | High | -| Memory Snapshots | Yes | No | Yes | -| Filesystem Snapshots | Yes | Yes | Yes | +| Feature | macOS (Lume) | Linux Container | QEMU VM | +| -------------------- | ------------- | --------------- | ----------- | +| Performance | Near-native | Native | Virtualized | +| Startup Time | Minutes | Seconds | 30s-2min | +| Host Requirements | Apple Silicon | Docker | KVM support | +| Windows Support | No | No | Yes | +| Resource Usage | Medium | Low | High | +| Memory Snapshots | Yes | No | Yes | +| Filesystem Snapshots | Yes | Yes | Yes | ### Snapshot Capabilities diff --git a/docs/content/docs/cua/reference/desktop-sandbox/linux-container/kasm.mdx b/docs/content/docs/cua/reference/desktop-sandbox/linux-container/kasm.mdx 
index 47c8f888..b4c07bf9 100644 --- a/docs/content/docs/cua/reference/desktop-sandbox/linux-container/kasm.mdx +++ b/docs/content/docs/cua/reference/desktop-sandbox/linux-container/kasm.mdx @@ -28,6 +28,7 @@ docker run --rm -it \ ``` **Access Points:** + - **VNC Web Interface**: `http://localhost:6901` - **Computer Server API**: `http://localhost:8000` @@ -60,10 +61,10 @@ async with computer: ### Environment Variables -| Variable | Default | Description | -|----------|---------|-------------| -| `VNC_PW` | `password` | VNC password | -| `DISPLAY` | `:0` | X11 display | +| Variable | Default | Description | +| --------- | ---------- | ------------ | +| `VNC_PW` | `password` | VNC password | +| `DISPLAY` | `:0` | X11 display | ### Volumes diff --git a/docs/content/docs/cua/reference/desktop-sandbox/linux-container/xfce.mdx b/docs/content/docs/cua/reference/desktop-sandbox/linux-container/xfce.mdx index e34892ff..189ca99b 100644 --- a/docs/content/docs/cua/reference/desktop-sandbox/linux-container/xfce.mdx +++ b/docs/content/docs/cua/reference/desktop-sandbox/linux-container/xfce.mdx @@ -28,6 +28,7 @@ docker run --rm -it \ ``` **Access Points:** + - **noVNC Web Interface**: `http://localhost:6901` (no password required) - **VNC Client**: `localhost:5901` (no password required) - **Computer Server API**: `http://localhost:8000` @@ -78,13 +79,13 @@ async with computer: ### Environment Variables -| Variable | Default | Description | -|----------|---------|-------------| -| `VNC_RESOLUTION` | `1024x768` | Screen resolution | -| `VNC_COL_DEPTH` | `24` | Color depth | -| `VNC_PORT` | `5901` | VNC server port | -| `NOVNC_PORT` | `6901` | noVNC web interface port | -| `API_PORT` | `8000` | Computer-server API port | +| Variable | Default | Description | +| ---------------- | ---------- | ------------------------ | +| `VNC_RESOLUTION` | `1024x768` | Screen resolution | +| `VNC_COL_DEPTH` | `24` | Color depth | +| `VNC_PORT` | `5901` | VNC server port | +| `NOVNC_PORT` | `6901` 
| noVNC web interface port | +| `API_PORT` | `8000` | Computer-server API port | ### Ports @@ -111,12 +112,12 @@ docker commit cua-xfce-snapshot:latest ## Comparison with Kasm Container -| Feature | Kasm Container | XFCE Container | -|---------|----------------|----------------| -| Base Image | KasmWeb Ubuntu | Vanilla Ubuntu | -| VNC Server | KasmVNC | TigerVNC | -| Dependencies | Higher | Lower | -| Size | Larger | Smaller | +| Feature | Kasm Container | XFCE Container | +| ------------ | -------------- | -------------- | +| Base Image | KasmWeb Ubuntu | Vanilla Ubuntu | +| VNC Server | KasmVNC | TigerVNC | +| Dependencies | Higher | Lower | +| Size | Larger | Smaller | ## Building from Source diff --git a/docs/content/docs/cua/reference/desktop-sandbox/macos.mdx b/docs/content/docs/cua/reference/desktop-sandbox/macos.mdx index 40cc8806..6b5df571 100644 --- a/docs/content/docs/cua/reference/desktop-sandbox/macos.mdx +++ b/docs/content/docs/cua/reference/desktop-sandbox/macos.mdx @@ -7,10 +7,10 @@ macOS sandbox environments run native macOS virtual machines on Apple Silicon us ## Options -| Option | Description | Best For | -|--------|-------------|----------| -| [Lume](/lume) | Native CLI for creating and managing macOS VMs | Direct VM management, development | -| [Lumier](/lume/guide/advanced/lumier) | Docker wrapper around Lume | Containerized deployments | +| Option | Description | Best For | +| ------------------------------------- | ---------------------------------------------- | --------------------------------- | +| [Lume](/lume) | Native CLI for creating and managing macOS VMs | Direct VM management, development | +| [Lumier](/lume/guide/advanced/lumier) | Docker wrapper around Lume | Containerized deployments | ## Requirements @@ -67,12 +67,12 @@ async with computer: ## Comparison with Other Platforms -| Feature | macOS (Lume) | Linux Container | QEMU VM | -|---------|--------------|-----------------|---------| -| Performance | Near-native | Native | 
Virtualized | -| Host OS | macOS only | Any with Docker | Any with KVM | -| Setup Time | Minutes | Seconds | 15-30 min (first run) | -| Windows Support | No | No | Yes | +| Feature | macOS (Lume) | Linux Container | QEMU VM | +| --------------- | ------------ | --------------- | --------------------- | +| Performance | Near-native | Native | Virtualized | +| Host OS | macOS only | Any with Docker | Any with KVM | +| Setup Time | Minutes | Seconds | 15-30 min (first run) | +| Windows Support | No | No | Yes | ## Related Documentation diff --git a/docs/content/docs/cua/reference/desktop-sandbox/meta.json b/docs/content/docs/cua/reference/desktop-sandbox/meta.json index c51d5986..12cf0816 100644 --- a/docs/content/docs/cua/reference/desktop-sandbox/meta.json +++ b/docs/content/docs/cua/reference/desktop-sandbox/meta.json @@ -1,5 +1,5 @@ { "title": "Desktop Sandbox", "icon": "Container", - "pages": ["macos", "linux-container", "qemu-container"] + "pages": ["macos", "linux-container", "qemu-container", "changelog"] } diff --git a/docs/content/docs/cua/reference/desktop-sandbox/qemu-container/android.mdx b/docs/content/docs/cua/reference/desktop-sandbox/qemu-container/android.mdx index c5700c25..882240c0 100644 --- a/docs/content/docs/cua/reference/desktop-sandbox/qemu-container/android.mdx +++ b/docs/content/docs/cua/reference/desktop-sandbox/qemu-container/android.mdx @@ -33,6 +33,7 @@ docker run -d \ ``` **Access Points:** + - **VNC Web UI**: `http://localhost:6080` - **Computer Server API**: `http://localhost:8000` - **API Documentation**: `http://localhost:8000/docs` @@ -60,10 +61,10 @@ curl -X POST http://localhost:8000/cmd \ ### Environment Variables -| Variable | Default | Description | -|----------|---------|-------------| -| `EMULATOR_DEVICE` | - | Device profile (e.g., "Samsung Galaxy S10") | -| `WEB_VNC` | `true` | Enable VNC web interface | +| Variable | Default | Description | +| ----------------- | ------- | ------------------------------------------- | +| 
`EMULATOR_DEVICE` | - | Device profile (e.g., "Samsung Galaxy S10") | +| `WEB_VNC` | `true` | Enable VNC web interface | ### Ports diff --git a/docs/content/docs/cua/reference/desktop-sandbox/qemu-container/linux.mdx b/docs/content/docs/cua/reference/desktop-sandbox/qemu-container/linux.mdx index 26ec7ff5..066c00e1 100644 --- a/docs/content/docs/cua/reference/desktop-sandbox/qemu-container/linux.mdx +++ b/docs/content/docs/cua/reference/desktop-sandbox/qemu-container/linux.mdx @@ -37,6 +37,7 @@ docker run -it --rm \ ``` **Access Points:** + - **Computer Server API**: `http://localhost:5000` - **noVNC Browser**: `http://localhost:8006` @@ -44,11 +45,11 @@ docker run -it --rm \ ### Environment Variables -| Variable | Default | Description | -|----------|---------|-------------| -| `RAM_SIZE` | `8G` | RAM allocated to Ubuntu VM | -| `CPU_CORES` | `8` | CPU cores allocated to VM | -| `DISK_SIZE` | `64G` | VM disk size (minimum: 32G) | +| Variable | Default | Description | +| ----------- | ------- | --------------------------- | +| `RAM_SIZE` | `8G` | RAM allocated to Ubuntu VM | +| `CPU_CORES` | `8` | CPU cores allocated to VM | +| `DISK_SIZE` | `64G` | VM disk size (minimum: 32G) | ### Ports diff --git a/docs/content/docs/cua/reference/desktop-sandbox/qemu-container/windows.mdx b/docs/content/docs/cua/reference/desktop-sandbox/qemu-container/windows.mdx index 10f4fc90..74962d13 100644 --- a/docs/content/docs/cua/reference/desktop-sandbox/qemu-container/windows.mdx +++ b/docs/content/docs/cua/reference/desktop-sandbox/qemu-container/windows.mdx @@ -37,6 +37,7 @@ docker run -it --rm \ ``` **Access Points:** + - **Computer Server API**: `http://localhost:5000` - **noVNC Browser**: `http://localhost:8006` @@ -44,11 +45,11 @@ docker run -it --rm \ ### Environment Variables -| Variable | Default | Description | -|----------|---------|-------------| -| `RAM_SIZE` | `8G` | RAM allocated to Windows VM | -| `CPU_CORES` | `8` | CPU cores allocated to VM | -| `DISK_SIZE` | `30G` 
| VM disk size (minimum: 20G) | +| Variable | Default | Description | +| ----------- | ------- | --------------------------- | +| `RAM_SIZE` | `8G` | RAM allocated to Windows VM | +| `CPU_CORES` | `8` | CPU cores allocated to VM | +| `DISK_SIZE` | `30G` | VM disk size (minimum: 20G) | ### Ports diff --git a/docs/content/docs/cua/reference/meta.json b/docs/content/docs/cua/reference/meta.json index 8658412b..f2222bec 100644 --- a/docs/content/docs/cua/reference/meta.json +++ b/docs/content/docs/cua/reference/meta.json @@ -2,5 +2,5 @@ "title": "Reference", "description": "SDK and CLI API reference", "icon": "FileText", - "pages": ["desktop-sandbox", "computer-sdk", "agent-sdk", "mcp-server", "cloud-cli"] + "pages": ["desktop-sandbox", "computer-sdk", "agent-sdk", "cli"] } diff --git a/docs/content/docs/cuabench/examples/rl-training.mdx b/docs/content/docs/cuabench/examples/rl-training.mdx index 392ab20e..a2469c07 100644 --- a/docs/content/docs/cuabench/examples/rl-training.mdx +++ b/docs/content/docs/cuabench/examples/rl-training.mdx @@ -713,16 +713,16 @@ modal run modal_grpo_training.py \ ## CLI Options -| Option | Default | Description | -|--------|---------|-------------| -| `--model-id` | `Qwen/Qwen3-VL-2B-Instruct` | Model to train | -| `--num-workers` | `2` | Number of parallel environment workers | -| `--max-steps` | `10` | Max steps per episode | -| `--num-generations` | `4` | Rollouts per prompt | -| `--dataset-size` | `1000` | Number of training samples | -| `--learning-rate` | `5e-6` | Learning rate | -| `--save-steps` | `100` | Checkpoint save interval | -| `--debug` | `False` | Enable verbose output | +| Option | Default | Description | +| ------------------- | --------------------------- | -------------------------------------- | +| `--model-id` | `Qwen/Qwen3-VL-2B-Instruct` | Model to train | +| `--num-workers` | `2` | Number of parallel environment workers | +| `--max-steps` | `10` | Max steps per episode | +| `--num-generations` | `4` | Rollouts per 
prompt | +| `--dataset-size` | `1000` | Number of training samples | +| `--learning-rate` | `5e-6` | Learning rate | +| `--save-steps` | `100` | Checkpoint save interval | +| `--debug` | `False` | Enable verbose output | ## Creating Custom Tasks diff --git a/docs/content/docs/cuabench/guide/fundamentals/meta.json b/docs/content/docs/cuabench/guide/fundamentals/meta.json index ab2dbc63..c675290a 100644 --- a/docs/content/docs/cuabench/guide/fundamentals/meta.json +++ b/docs/content/docs/cuabench/guide/fundamentals/meta.json @@ -2,5 +2,13 @@ "title": "Fundamentals", "description": "Core concepts of tasks and environments", "icon": "Lightbulb", - "pages": ["tasks", "app-helpers", "universal-gui", "simulated-desktop", "agent-traces", "adapters", "registry"] + "pages": [ + "tasks", + "app-helpers", + "universal-gui", + "simulated-desktop", + "agent-traces", + "adapters", + "registry" + ] } diff --git a/docs/content/docs/cuabench/guide/getting-started/introduction.mdx b/docs/content/docs/cuabench/guide/getting-started/introduction.mdx index 82b3f046..30596928 100644 --- a/docs/content/docs/cuabench/guide/getting-started/introduction.mdx +++ b/docs/content/docs/cuabench/guide/getting-started/introduction.mdx @@ -4,7 +4,8 @@ description: A benchmark to measure the capabilities of computer-use agents in d --- - Building or researching computer-use agents? Fill out the [Interest Form](https://cuabench.ai/) to chat with us. + Building or researching computer-use agents? Fill out the [Interest Form](https://cuabench.ai/) to + chat with us. **Cua-Bench** is a framework and set of tasks for evaluating how well AI agents can accomplish complex tasks on a desktop computer using primarily the keyboard and mouse, or on a mobile device using primarily the touchscreen. 
diff --git a/docs/content/docs/cuabench/reference/api.mdx b/docs/content/docs/cuabench/reference/api.mdx new file mode 100644 index 00000000..849c8401 --- /dev/null +++ b/docs/content/docs/cuabench/reference/api.mdx @@ -0,0 +1,5157 @@ +--- +title: API Reference +description: Python API reference for the desktop automation benchmarking framework +--- + +{/* + AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY + Generated by: npx tsx scripts/docs-generators/python-sdk.ts + Source: libs/cua-bench/cua_bench + Version: 0.2.3 +*/} + +import { Callout } from 'fumadocs-ui/components/callout'; +import { Tabs, Tab } from 'fumadocs-ui/components/tabs'; +import { VersionHeader } from '@/components/version-selector'; + + + +cua-bench SDK - A framework for desktop automation tasks with batch processing. + +## Classes + +| Class | Description | +|-------|-------------| +| [`Task`](#task) | Represents a single task to be executed. | +| [`Desktop`](#desktop) | Desktop environment manager. | +| [`Environment`](#environment) | A minimal environment wrapper that delegates everything to a provider. | +| [`BenchmarkResult`](#benchmarkresult) | Result of a benchmark run. | +| [`TaskResult`](#taskresult) | Result of a single task execution. 
| +| [`ClickAction`](#clickaction) | No description | +| [`DoneAction`](#doneaction) | No description | +| [`DoubleClickAction`](#doubleclickaction) | No description | +| [`DragAction`](#dragaction) | No description | +| [`HotkeyAction`](#hotkeyaction) | No description | +| [`KeyAction`](#keyaction) | No description | +| [`MiddleClickAction`](#middleclickaction) | No description | +| [`MoveToAction`](#movetoaction) | No description | +| [`RightClickAction`](#rightclickaction) | No description | +| [`ScrollAction`](#scrollaction) | No description | +| [`TypeAction`](#typeaction) | No description | +| [`WaitAction`](#waitaction) | No description | + +## Functions + +| Function | Description | +|----------|-------------| +| [`repr_to_action`](#repr_to_action) | Parse an action from repr format string. | +| [`interact`](#interact) | Run an environment interactively with simplified output. | +| [`make`](#make) | Create an Environment by loading the env's main.py as a module. | +| [`evaluate_task`](#evaluate_task) | Decorator for the function that evaluates a task. | +| [`setup_task`](#setup_task) | Decorator for the function that sets up a task. | +| [`solve_task`](#solve_task) | Decorator for the function that solves a task. | +| [`tasks_config`](#tasks_config) | Decorator for the function that loads tasks. | +| [`run_benchmark`](#run_benchmark) | Run a benchmark on a dataset using the gym interface. | +| [`run_interactive`](#run_interactive) | Run an environment interactively using the gym interface. | +| [`run_single_task`](#run_single_task) | Run a single task using the gym interface. | + +--- + +## Task + +Represents a single task to be executed. 
+ +### Constructor + +```python +Task(self, description: str, task_id: Optional[str] = None, metadata: Optional[dict] = None, computer: Optional[dict] = None) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `description` | `str` | | +| `task_id` | `Optional[str]` | | +| `metadata` | `Optional[dict]` | | +| `computer` | `Optional[dict]` | | + +--- + +## Desktop + +Desktop environment manager. + +### Constructor + +```python +Desktop(self, env) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `env` | `Any` | | +| `state` | `Any` | | +| `template` | `Any` | | + +### Methods + +#### Desktop.configure + +```python +def configure(self, os_type: Optional[str] = None, width: Optional[int] = None, height: Optional[int] = None, background: Optional[str] = None, dock_state: Optional[Dict[str, List[Union[str, Dict[str, str]]]]] = None, randomize_dock: bool = True, taskbar_state: Optional[Dict[str, List[Union[str, Dict[str, str]]]]] = None, randomize_taskbar: bool = True) +``` + +Configure desktop appearance. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `os_type` | `Any` | OS appearance (win11, win10, win7, macos, winxp, win98, android, ios) | +| `width` | `Any` | Screen width in pixels | +| `height` | `Any` | Screen height in pixels | +| `background` | `Any` | Background color | +| `dock_state` | `Any` | Explicit dock state to set with keys 'pinned_apps', 'recent_apps', 'pinned_folders' | +| `randomize_dock` | `Any` | If True, populate dock_state using macOS icon sets | +| `taskbar_state` | `Any` | Explicit taskbar state to set with keys 'pinned_apps', 'open_apps' | +| `randomize_taskbar` | `Any` | If True, populate taskbar_state using Windows 11 icon sets | + +#### Desktop.launch + +```python +def launch(self, content: str, title: str = 'Window', x: Optional[int] = None, y: Optional[int] = None, width: int = 600, height: int = 400, icon: Optional[str] = None, use_inner_size: bool = False, title_bar_style: str = 'default') -> Window +``` + +Launch a new window on the desktop. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `content` | `Any` | HTML content for the window body | +| `title` | `Any` | Window title | +| `x` | `Any` | X position (auto-calculated if None) | +| `y` | `Any` | Y position (auto-calculated if None) | +| `width` | `Any` | Window width | +| `height` | `Any` | Window height | +| `use_inner_size` | `Any` | Whether to use the inner size of the window (i.e. content size) | + +**Returns:** Window instance + +--- + +## Environment + +A minimal environment wrapper that delegates everything to a provider. + +Functions can be injected directly, or discovered from a module via +`make_from_module` based on cua-bench decorators (`_td_type`, `_td_split`). 
+ +### Constructor + +```python +Environment(self, env_name: Optional[str] = None, split: str = 'train', tasks_config_fn: Optional[Callable[..., Any]] = None, setup_task_fn: Optional[Callable[..., Any]] = None, solve_task_fn: Optional[Callable[..., Any]] = None, evaluate_task_fn: Optional[Callable[..., Any]] = None) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `session` | `Optional[Any]` | | +| `env_name` | `Optional[str]` | | +| `split` | `Optional[str]` | | +| `headless` | `bool` | | +| `print_actions` | `bool` | | +| `bot` | `Optional[Bot]` | | +| `tracing` | `Optional[Tracing]` | | +| `step_count` | `int` | | +| `max_steps` | `Optional[int]` | | +| `tasks_config_fn` | `Any` | | +| `setup_task_fn` | `Any` | | +| `solve_task_fn` | `Any` | | +| `evaluate_task_fn` | `Any` | | +| `tasks` | `Optional[list]` | | +| `current_task` | `Optional[Any]` | | +| `session_name` | `Optional[str]` | | +| `session_config` | `Dict[str, Any]` | | +| `setup_config` | `DesktopSetupConfig` | | +| `page` | `Optional[Any]` | | + +### Methods + +#### Environment.make_from_module + +```python +def make_from_module(cls, module: Any, env_path: str | Path, split: str = 'train') -> 'Environment' +``` + +#### Environment.create_sandbox + +```python +async def create_sandbox(self, provider: str, provider_config: Dict[str, Any] | None = None, setup_config: DesktopSetupConfig | None = None) -> None +``` + +#### Environment.reset + +```python +async def reset(self, task_id: Optional[int] = None, run_id: Optional[str] = None) -> Tuple[bytes, Dict] +``` + +#### Environment.step + +```python +async def step(self, action: Action, dry_run: bool | Literal['before', 'after'] = False) -> bytes +``` + +#### Environment.solve + +```python +async def solve(self) -> bytes +``` + +#### Environment.evaluate + +```python +async def evaluate(self) -> Any +``` + +#### Environment.close + +```python +async def close(self) -> None +``` + +--- + +## BenchmarkResult 
+ +Result of a benchmark run. + +Attributes: + run_id: Unique identifier for this run + task_results: List of individual task results + total_tasks: Total number of tasks in the benchmark + success_count: Number of successful tasks + failed_count: Number of failed tasks + avg_reward: Average reward across all tasks + duration_seconds: Total duration of the benchmark + output_dir: Output directory for results (if any) + +### Constructor + +```python +BenchmarkResult(self, run_id: str, task_results: List[Dict[str, Any]], total_tasks: int, success_count: int, failed_count: int, avg_reward: float, duration_seconds: float, output_dir: Optional[str] = None) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `run_id` | `str` | | +| `task_results` | `List[Dict[str, Any]]` | | +| `total_tasks` | `int` | | +| `success_count` | `int` | | +| `failed_count` | `int` | | +| `avg_reward` | `float` | | +| `duration_seconds` | `float` | | +| `output_dir` | `Optional[str]` | | + +--- + +## TaskResult + +Result of a single task execution. 
+ +Attributes: + task_path: Path to the task + variant_id: Task variant index + success: Whether the task succeeded + reward: Reward from evaluation + steps: Number of steps taken + error: Error message if failed + +### Constructor + +```python +TaskResult(self, task_path: str, variant_id: int, success: bool, reward: float, steps: int, error: Optional[str] = None) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `task_path` | `str` | | +| `variant_id` | `int` | | +| `success` | `bool` | | +| `reward` | `float` | | +| `steps` | `int` | | +| `error` | `Optional[str]` | | + +--- + +## ClickAction + +### Constructor + +```python +ClickAction(self, x: int, y: int) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | | +| `y` | `int` | | + +--- + +## DoneAction + +### Constructor + +```python +DoneAction(self) -> None +``` + +--- + +## DoubleClickAction + +### Constructor + +```python +DoubleClickAction(self, x: int, y: int) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | | +| `y` | `int` | | + +--- + +## DragAction + +### Constructor + +```python +DragAction(self, from_x: int, from_y: int, to_x: int, to_y: int, duration: float = 1.0) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `from_x` | `int` | | +| `from_y` | `int` | | +| `to_x` | `int` | | +| `to_y` | `int` | | +| `duration` | `float` | | + +--- + +## HotkeyAction + +### Constructor + +```python +HotkeyAction(self, keys: List[str]) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `keys` | `List[str]` | | + +--- + +## KeyAction + +### Constructor + +```python +KeyAction(self, key: str) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `key` | `str` | | + +--- + +## MiddleClickAction + +### Constructor 
+ +```python +MiddleClickAction(self, x: int, y: int) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | | +| `y` | `int` | | + +--- + +## MoveToAction + +### Constructor + +```python +MoveToAction(self, x: int, y: int, duration: float = 0.0) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | | +| `y` | `int` | | +| `duration` | `float` | | + +--- + +## RightClickAction + +### Constructor + +```python +RightClickAction(self, x: int, y: int) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | | +| `y` | `int` | | + +--- + +## ScrollAction + +### Constructor + +```python +ScrollAction(self, direction: Literal['up', 'down'] = 'up', amount: int = 100) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `direction` | `Literal['up', 'down']` | | +| `amount` | `int` | | + +--- + +## TypeAction + +### Constructor + +```python +TypeAction(self, text: str) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `text` | `str` | | + +--- + +## WaitAction + +### Constructor + +```python +WaitAction(self, seconds: float = 1.0) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `seconds` | `float` | | + +## repr_to_action + +```python +def repr_to_action(action_repr: str) -> Action +``` + +Parse an action from repr format string. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `action_repr` | `Any` | Action string in repr format, e.g., "ClickAction(x=100, y=200)" | + +**Returns:** Parsed Action object + +**Raises:** + +- `ValueError` - If the action string cannot be parsed + +## interact + +```python +def interact(env_path: str, task_id: int = 0) -> None +``` + +Run an environment interactively with simplified output. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `env_path` | `Any` | Path to the environment directory | +| `task_id` | `Any` | Task ID to run (default: 0) | + +## make + +```python +def make(env_name: str, split: str = 'train') -> Any +``` + +Create an Environment by loading the env's main.py as a module. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `env_name` | `Any` | Path to the environment directory (must contain main.py) | +| `split` | `Any` | Dataset split to use for decorated functions (e.g., 'train', 'test') | + +**Returns:** Environment instance + +## evaluate_task + +```python +def evaluate_task(_arg: Optional[Callable] = None, args = (), kwargs = {}) -> Callable +``` + +Decorator for the function that evaluates a task. + +Can be used as ``@cb.evaluate_task`` or ``@cb.evaluate_task("train")``. +The decorated function receives task_cfg and should return evaluation results. + +## setup_task + +```python +def setup_task(_arg: Optional[Callable] = None, args = (), kwargs = {}) -> Callable +``` + +Decorator for the function that sets up a task. + +Can be used as ``@cb.setup_task`` or ``@cb.setup_task("train")``. +The decorated function receives task_cfg and should initialize the environment. + +## solve_task + +```python +def solve_task(_arg: Optional[Callable] = None, args = (), kwargs = {}) -> Callable +``` + +Decorator for the function that solves a task. + +Can be used as ``@cb.solve_task`` or ``@cb.solve_task("train")``. +The decorated function receives task_cfg and should execute the solution. + +## tasks_config + +```python +def tasks_config(_arg: Optional[Callable] = None, args = (), kwargs = {}) -> Callable +``` + +Decorator for the function that loads tasks. + +Can be used as ``@cb.tasks_config`` or ``@cb.tasks_config("train")``. +The decorated function should return a list of Task objects. 
+ +## run_benchmark + +```python +async def run_benchmark(dataset_path: Path, agent_fn: Optional[Callable[[bytes, Task], Action]] = None, max_steps: int = 100, max_parallel: int = 4, oracle: bool = False, max_variants: Optional[int] = None, task_filter: Optional[str] = None, split: str = 'train') -> BenchmarkResult +``` + +Run a benchmark on a dataset using the gym interface. + +This function runs multiple tasks in parallel using the core gym interface +(make, reset, step, evaluate). + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `dataset_path` | `Any` | Path to the dataset directory | +| `agent_fn` | `Any` | Optional agent function that takes (screenshot, task_config) and returns an Action. Required if oracle=False. | +| `max_steps` | `Any` | Maximum steps per task (default: 100) | +| `max_parallel` | `Any` | Maximum parallel workers (default: 4) | +| `oracle` | `Any` | Run oracle/solver mode (default: False) | +| `max_variants` | `Any` | Maximum variants per task (optional) | +| `task_filter` | `Any` | Glob pattern to filter tasks (optional) | +| `split` | `Any` | Dataset split (default: "train") | + +**Returns:** BenchmarkResult with run statistics and task results + +**Example:** + +```python +# Run oracle benchmark +result = await run_benchmark( + Path("./datasets/cua-bench-basic"), + oracle=True, + max_parallel=8, +) +print(f"Success rate: {result.success_count / result.total_tasks:.2%}") + +# Run with custom agent +def random_agent(screenshot: bytes, task: Task) -> Action: + import random + return random.choice([ + ClickAction(x=random.randint(0, 1920), y=random.randint(0, 1080)), + DoneAction(), + ]) + +result = await run_benchmark( + Path("./datasets/my-dataset"), + agent_fn=random_agent, + max_parallel=4, +) +``` + +## run_interactive + +```python +async def run_interactive(env_path: Path, task_index: int = 0, split: str = 'train', headless: bool = False) -> Tuple[Environment, bytes, Task] +``` + +Run an environment 
interactively using the gym interface. + +This function sets up an environment for interactive use, returning +the environment instance, initial screenshot, and task configuration. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `env_path` | `Any` | Path to the environment directory | +| `task_index` | `Any` | Task variant index (default: 0) | +| `split` | `Any` | Dataset split (default: "train") | +| `headless` | `Any` | Run in headless mode (default: False) | + +**Returns:** Tuple of `(env, screenshot, task_config)`: `env` is the Environment instance (caller should call `env.close()` when done), `screenshot` is the initial screenshot bytes, and `task_config` is the task configuration. + +**Example:** + +```python +env, screenshot, task_cfg = await run_interactive(Path("./task")) +print(f"Task: {task_cfg.description}") + +# Execute actions... +screenshot = await env.step(ClickAction(x=100, y=200)) + +# Evaluate +reward = await env.evaluate() +print(f"Reward: {reward}") + +# Cleanup +await env.close() +``` + +## run_single_task + +```python +async def run_single_task(env_path: Path, task_index: int = 0, split: str = 'train', agent_fn: Optional[Callable[[bytes, Task], Action]] = None, max_steps: int = 100, oracle: bool = False) -> TaskResult +``` + +Run a single task using the gym interface. + +This function uses the core gym interface (make, reset, step, evaluate) +to run a task with either an agent function or the oracle solver. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `env_path` | `Any` | Path to the task environment directory | +| `task_index` | `Any` | Task variant index (default: 0) | +| `split` | `Any` | Dataset split (default: "train") | +| `agent_fn` | `Any` | Optional agent function that takes (screenshot, task_config) and returns an Action. If None and oracle=False, returns after setup.
| +| `max_steps` | `Any` | Maximum steps per task (default: 100) | +| `oracle` | `Any` | Run oracle/solver mode (default: False) | + +**Returns:** TaskResult with execution results + +**Example:** + +```python +# Run with oracle +result = await run_single_task(Path("./task"), oracle=True) + +# Run with custom agent +def my_agent(screenshot: bytes, task: Task) -> Action: + return DoneAction() # Simple agent that immediately finishes + +result = await run_single_task(Path("./task"), agent_fn=my_agent) +``` + +--- + +## tracing + +--- + +## Tracing + +Lightweight trajectory tracing using Hugging Face Datasets. + +Records events with arbitrary JSON metadata and a list of PIL images. +Exposes a datasets.Dataset-compatible interface for saving/pushing. + +### Constructor + +```python +Tracing(self, env: Any) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `env` | `Any` | | +| `trajectory_id` | `Optional[str]` | | +| `dataset` | `Dataset` | Return a HF Dataset built from current rows, constructing lazily. | + +### Methods + +#### Tracing.start + +```python +def start(self, trajectory_id: Optional[str] = None) -> str +``` + +Start a new trajectory. Resets any previously recorded rows. + +Returns the trajectory_id used. 
+ +#### Tracing.record + +```python +def record(self, event_name: str, data_dict: Dict[str, Any], data_images: List[Image.Image | bytes] | None = None) -> None +``` + +#### Tracing.save_to_disk + +```python +def save_to_disk(self, output_dir: str, save_pngs: bool = False, image_dir: Optional[str] = None, filter_events: Optional[List[str]] = None) -> None +``` + +#### Tracing.push_to_hub + +```python +def push_to_hub(self, repo_id: str, private: bool | None = None) -> str +``` + +#### Tracing.bytes_to_image + +```python +def bytes_to_image(png_bytes: bytes) -> Image.Image +``` + +--- + +## actions + +--- + +## ClickAction + +### Constructor + +```python +ClickAction(self, x: int, y: int) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | | +| `y` | `int` | | + +--- + +## DoneAction + +### Constructor + +```python +DoneAction(self) -> None +``` + +--- + +## DoubleClickAction + +### Constructor + +```python +DoubleClickAction(self, x: int, y: int) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | | +| `y` | `int` | | + +--- + +## DragAction + +### Constructor + +```python +DragAction(self, from_x: int, from_y: int, to_x: int, to_y: int, duration: float = 1.0) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `from_x` | `int` | | +| `from_y` | `int` | | +| `to_x` | `int` | | +| `to_y` | `int` | | +| `duration` | `float` | | + +--- + +## HotkeyAction + +### Constructor + +```python +HotkeyAction(self, keys: List[str]) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `keys` | `List[str]` | | + +--- + +## KeyAction + +### Constructor + +```python +KeyAction(self, key: str) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `key` | `str` | | + +--- + +## MiddleClickAction + +### Constructor + +```python 
+MiddleClickAction(self, x: int, y: int) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | | +| `y` | `int` | | + +--- + +## MoveToAction + +### Constructor + +```python +MoveToAction(self, x: int, y: int, duration: float = 0.0) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | | +| `y` | `int` | | +| `duration` | `float` | | + +--- + +## RightClickAction + +### Constructor + +```python +RightClickAction(self, x: int, y: int) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | | +| `y` | `int` | | + +--- + +## ScrollAction + +### Constructor + +```python +ScrollAction(self, direction: Literal['up', 'down'] = 'up', amount: int = 100) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `direction` | `Literal['up', 'down']` | | +| `amount` | `int` | | + +--- + +## TypeAction + +### Constructor + +```python +TypeAction(self, text: str) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `text` | `str` | | + +--- + +## WaitAction + +### Constructor + +```python +WaitAction(self, seconds: float = 1.0) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `seconds` | `float` | | + +### repr_to_action + +```python +def repr_to_action(action_repr: str) -> Action +``` + +Parse an action from repr format string. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `action_repr` | `Any` | Action string in repr format, e.g., "ClickAction(x=100, y=200)" | + +**Returns:** Parsed Action object + +**Raises:** + +- `ValueError` - If the action string cannot be parsed + +### snake_case_to_action + +```python +def snake_case_to_action(action_str: str) -> Action +``` + +Parse an action from snake_case format string. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `action_str` | `Any` | Action string in snake_case format, e.g., "click(0.5, 0.5)" | + +**Returns:** Parsed Action object + +**Raises:** + +- `ValueError` - If the action string cannot be parsed + +### parse_action_string + +```python +def parse_action_string(action_str: str) -> Action +``` + +Parse an action from either repr or snake_case format. + +This is the unified entry point for parsing action strings. +It automatically detects the format and delegates to the appropriate parser. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `action_str` | `Any` | Action string in either repr format, e.g., "ClickAction(x=100, y=200)", or snake_case format, e.g., "click(0.5, 0.5)" | + +**Returns:** Parsed Action object + +**Raises:** + +- `ValueError` - If the action string cannot be parsed in either format + +### action_to_dict + +```python +def action_to_dict(action: Action) -> Dict[str, Any] +``` + +Convert an Action object to a dictionary. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `action` | `Any` | Action object to convert | + +**Returns:** Dictionary representation of the action with 'type' key + +### dict_to_action + +```python +def dict_to_action(action_dict: Dict[str, Any]) -> Action +``` + +Convert a dictionary to an Action object. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `action_dict` | `Any` | Dictionary with 'type' key and action parameters | + +**Returns:** Action object + +**Raises:** + +- `ValueError` - If the action type is unknown + +--- + +## core + +Core classes and functions for cua-bench. + +--- + +## Task + +Represents a single task to be executed.
+ +### Constructor + +```python +Task(self, description: str, task_id: Optional[str] = None, metadata: Optional[dict] = None, computer: Optional[dict] = None) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `description` | `str` | | +| `task_id` | `Optional[str]` | | +| `metadata` | `Optional[dict]` | | +| `computer` | `Optional[dict]` | | + +### make + +```python +def make(env_name: str, split: str = 'train') -> Any +``` + +Create an Environment by loading the env's main.py as a module. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `env_name` | `Any` | Path to the environment directory (must contain main.py) | +| `split` | `Any` | Dataset split to use for decorated functions (e.g., 'train', 'test') | + +**Returns:** Environment instance + +### interact + +```python +def interact(env_path: str, task_id: int = 0) -> None +``` + +Run an environment interactively with simplified output. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `env_path` | `Any` | Path to the environment directory | +| `task_id` | `Any` | Task ID to run (default: 0) | + +--- + +## types + +--- + +## WindowSnapshot + +### Constructor + +```python +WindowSnapshot(self, window_type: Literal['webview', 'process', 'desktop'], pid: Optional[str] = None, url: Optional[str] = None, html: Optional[str] = None, title: str = '', x: int = 0, y: int = 0, width: int = 0, height: int = 0, active: bool = False, minimized: bool = False) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `window_type` | `Literal['webview', 'process', 'desktop']` | | +| `pid` | `Optional[str]` | | +| `url` | `Optional[str]` | | +| `html` | `Optional[str]` | | +| `title` | `str` | | +| `x` | `int` | | +| `y` | `int` | | +| `width` | `int` | | +| `height` | `int` | | +| `active` | `bool` | | +| `minimized` | `bool` | | + +--- + +## Snapshot + +### Constructor 
+ +```python +Snapshot(self, windows: List[WindowSnapshot]) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `windows` | `List[WindowSnapshot]` | | + +--- + +## ClickAction + +### Constructor + +```python +ClickAction(self, x: int, y: int) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | | +| `y` | `int` | | + +--- + +## RightClickAction + +### Constructor + +```python +RightClickAction(self, x: int, y: int) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | | +| `y` | `int` | | + +--- + +## DoubleClickAction + +### Constructor + +```python +DoubleClickAction(self, x: int, y: int) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | | +| `y` | `int` | | + +--- + +## MiddleClickAction + +### Constructor + +```python +MiddleClickAction(self, x: int, y: int) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | | +| `y` | `int` | | + +--- + +## DragAction + +### Constructor + +```python +DragAction(self, from_x: int, from_y: int, to_x: int, to_y: int, duration: float = 1.0) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `from_x` | `int` | | +| `from_y` | `int` | | +| `to_x` | `int` | | +| `to_y` | `int` | | +| `duration` | `float` | | + +--- + +## MoveToAction + +### Constructor + +```python +MoveToAction(self, x: int, y: int, duration: float = 0.0) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | | +| `y` | `int` | | +| `duration` | `float` | | + +--- + +## ScrollAction + +### Constructor + +```python +ScrollAction(self, direction: Literal['up', 'down'] = 'up', amount: int = 100) -> None +``` + +### Attributes + +| Name | Type | Description | 
+|------|------|-------------| +| `direction` | `Literal['up', 'down']` | | +| `amount` | `int` | | + +--- + +## TypeAction + +### Constructor + +```python +TypeAction(self, text: str) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `text` | `str` | | + +--- + +## KeyAction + +### Constructor + +```python +KeyAction(self, key: str) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `key` | `str` | | + +--- + +## HotkeyAction + +### Constructor + +```python +HotkeyAction(self, keys: List[str]) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `keys` | `List[str]` | | + +--- + +## DoneAction + +### Constructor + +```python +DoneAction(self) -> None +``` + +--- + +## WaitAction + +### Constructor + +```python +WaitAction(self, seconds: float = 1.0) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `seconds` | `float` | | + +--- + +## bot + +--- + +## ClickAction + +### Constructor + +```python +ClickAction(self, x: int, y: int) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | | +| `y` | `int` | | + +--- + +## RightClickAction + +### Constructor + +```python +RightClickAction(self, x: int, y: int) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | | +| `y` | `int` | | + +--- + +## Bot + +Helper class for writing trajectories for task solutions. + +### Constructor + +```python +Bot(self, env: Any) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `env` | `Any` | | + +### Methods + +#### Bot.click_element + +```python +def click_element(self, pid: int, selector: str) -> None +``` + +Find element by CSS selector and click its center. 
+ +Uses provider's bench-ui bridge to fetch element rect in screen space +and then dispatches a ClickAction via env.step(). + +#### Bot.right_click_element + +```python +def right_click_element(self, pid: int, selector: str) -> None +``` + +--- + +## utils + +Utility functions for synthetic data generation. + +--- + +## DesktopSetupConfig + +*Inherits from: TypedDict* + +Configuration for desktop setup provided to providers. + +Fields mirror high-level desktop appearance and workspace options. + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `os_type` | `Literal['win11', 'win10', 'win7', 'winxp', 'win98', 'macos', 'linux', 'android', 'ios', 'windows']` | | +| `width` | `int` | | +| `height` | `int` | | +| `background` | `str` | | +| `wallpaper` | `str` | | +| `installed_apps` | `List[str]` | | +| `image` | `str` | | +| `storage` | `str` | | +| `memory` | `str` | | +| `cpu` | `str` | | +| `provider_type` | `str` | | + +--- + +## Environment + +A minimal environment wrapper that delegates everything to a provider. + +Functions can be injected directly, or discovered from a module via +`make_from_module` based on cua-bench decorators (`_td_type`, `_td_split`). 
+ +### Constructor + +```python +Environment(self, env_name: Optional[str] = None, split: str = 'train', tasks_config_fn: Optional[Callable[..., Any]] = None, setup_task_fn: Optional[Callable[..., Any]] = None, solve_task_fn: Optional[Callable[..., Any]] = None, evaluate_task_fn: Optional[Callable[..., Any]] = None) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `session` | `Optional[Any]` | | +| `env_name` | `Optional[str]` | | +| `split` | `Optional[str]` | | +| `headless` | `bool` | | +| `print_actions` | `bool` | | +| `bot` | `Optional[Bot]` | | +| `tracing` | `Optional[Tracing]` | | +| `step_count` | `int` | | +| `max_steps` | `Optional[int]` | | +| `tasks_config_fn` | `Any` | | +| `setup_task_fn` | `Any` | | +| `solve_task_fn` | `Any` | | +| `evaluate_task_fn` | `Any` | | +| `tasks` | `Optional[list]` | | +| `current_task` | `Optional[Any]` | | +| `session_name` | `Optional[str]` | | +| `session_config` | `Dict[str, Any]` | | +| `setup_config` | `DesktopSetupConfig` | | +| `page` | `Optional[Any]` | | + +### Methods + +#### Environment.make_from_module + +```python +def make_from_module(cls, module: Any, env_path: str | Path, split: str = 'train') -> 'Environment' +``` + +#### Environment.create_sandbox + +```python +async def create_sandbox(self, provider: str, provider_config: Dict[str, Any] | None = None, setup_config: DesktopSetupConfig | None = None) -> None +``` + +#### Environment.reset + +```python +async def reset(self, task_id: Optional[int] = None, run_id: Optional[str] = None) -> Tuple[bytes, Dict] +``` + +#### Environment.step + +```python +async def step(self, action: Action, dry_run: bool | Literal['before', 'after'] = False) -> bytes +``` + +#### Environment.solve + +```python +async def solve(self) -> bytes +``` + +#### Environment.evaluate + +```python +async def evaluate(self) -> Any +``` + +#### Environment.close + +```python +async def close(self) -> None +``` + +--- + +## Snapshot + +### 
Constructor + +```python +Snapshot(self, windows: List[WindowSnapshot]) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `windows` | `List[WindowSnapshot]` | | + +### render_snapshot_async + +```python +async def render_snapshot_async(setup_config: Dict[str, Any], snapshot: Dict[str, Any], screenshot_delay: float = 0, provider: Literal['webtop', 'computer'] = 'webtop') -> bytes +``` + +Render a snapshot and return screenshot bytes (async). + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `provider` | `Any` | Provider name ("webtop" or "computer") | +| `setup_config` | `Any` | Configuration dict for create_sandbox setup_config parameter | +| `snapshot` | `Any` | Snapshot dict containing windows and other state | +| `screenshot_delay` | `Any` | Delay in seconds before taking screenshot | + +**Returns:** Screenshot as bytes + +### render_windows_async + +```python +async def render_windows_async(setup_config: Dict[str, Any], windows: List[Dict[str, Any]], screenshot_delay: float = 0, provider: Literal['webtop', 'computer'] = 'webtop', return_snapshot: bool = False, scroll_into_view: Optional[str] = None) -> bytes | Tuple[bytes, Snapshot] +``` + +Render windows and return screenshot bytes (async). 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `provider` | `Any` | Provider name ("webtop" or "computer") | +| `setup_config` | `Any` | Configuration dict for create_sandbox setup_config parameter | +| `windows` | `Any` | List of window dicts to pass directly to launch_window | +| `screenshot_delay` | `Any` | Delay in seconds before taking screenshot | +| `return_snapshot` | `Any` | If True, return tuple of (bytes, Snapshot) instead of just bytes | +| `scroll_into_view` | `Any` | Optional CSS selector for an element to scroll into view | + +**Returns:** Screenshot as bytes, or tuple of (bytes, Snapshot) if return_snapshot=True + +### render_snapshot + +```python +def render_snapshot(setup_config: Dict[str, Any], snapshot: Dict[str, Any], screenshot_delay: float = 0, provider: Literal['webtop', 'computer'] = 'webtop') -> bytes +``` + +Render a snapshot and return screenshot bytes (sync wrapper). + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `provider` | `Any` | Provider name ("webtop" or "computer") | +| `setup_config` | `Any` | Configuration dict for create_sandbox setup_config parameter | +| `snapshot` | `Any` | Snapshot dict containing windows and other state | +| `screenshot_delay` | `Any` | Delay in seconds before taking screenshot | + +**Returns:** Screenshot as bytes + +### render_windows + +```python +def render_windows(setup_config: Dict[str, Any], windows: List[Dict[str, Any]], screenshot_delay: float = 0, provider: Literal['webtop', 'computer'] = 'webtop', return_snapshot: bool = False, scroll_into_view: Optional[str] = None) -> bytes | Tuple[bytes, Snapshot] +``` + +Render windows and return screenshot bytes (sync wrapper). 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `provider` | `Any` | Provider name ("webtop" or "computer") | +| `setup_config` | `Any` | Configuration dict for create_sandbox setup_config parameter | +| `windows` | `Any` | List of window dicts to pass directly to launch_window | +| `screenshot_delay` | `Any` | Delay in seconds before taking screenshot | +| `return_snapshot` | `Any` | If True, return tuple of (bytes, Snapshot) instead of just bytes | +| `scroll_into_view` | `Any` | Optional CSS selector for an element to scroll into view | + +**Returns:** Screenshot as bytes, or tuple of (bytes, Snapshot) if return_snapshot=True + +--- + +## runners + +Benchmark runner functions for cua-bench. + +This module provides programmatic interfaces for running benchmarks and +interactive environments, using the core gym interface (make, reset, step, evaluate). + +--- + +## Task + +Represents a single task to be executed. + +### Constructor + +```python +Task(self, description: str, task_id: Optional[str] = None, metadata: Optional[dict] = None, computer: Optional[dict] = None) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `description` | `str` | | +| `task_id` | `Optional[str]` | | +| `metadata` | `Optional[dict]` | | +| `computer` | `Optional[dict]` | | + +--- + +## Environment + +A minimal environment wrapper that delegates everything to a provider. + +Functions can be injected directly, or discovered from a module via +`make_from_module` based on cua-bench decorators (`_td_type`, `_td_split`). 
+ +### Constructor + +```python +Environment(self, env_name: Optional[str] = None, split: str = 'train', tasks_config_fn: Optional[Callable[..., Any]] = None, setup_task_fn: Optional[Callable[..., Any]] = None, solve_task_fn: Optional[Callable[..., Any]] = None, evaluate_task_fn: Optional[Callable[..., Any]] = None) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `session` | `Optional[Any]` | | +| `env_name` | `Optional[str]` | | +| `split` | `Optional[str]` | | +| `headless` | `bool` | | +| `print_actions` | `bool` | | +| `bot` | `Optional[Bot]` | | +| `tracing` | `Optional[Tracing]` | | +| `step_count` | `int` | | +| `max_steps` | `Optional[int]` | | +| `tasks_config_fn` | `Any` | | +| `setup_task_fn` | `Any` | | +| `solve_task_fn` | `Any` | | +| `evaluate_task_fn` | `Any` | | +| `tasks` | `Optional[list]` | | +| `current_task` | `Optional[Any]` | | +| `session_name` | `Optional[str]` | | +| `session_config` | `Dict[str, Any]` | | +| `setup_config` | `DesktopSetupConfig` | | +| `page` | `Optional[Any]` | | + +### Methods + +#### Environment.make_from_module + +```python +def make_from_module(cls, module: Any, env_path: str | Path, split: str = 'train') -> 'Environment' +``` + +#### Environment.create_sandbox + +```python +async def create_sandbox(self, provider: str, provider_config: Dict[str, Any] | None = None, setup_config: DesktopSetupConfig | None = None) -> None +``` + +#### Environment.reset + +```python +async def reset(self, task_id: Optional[int] = None, run_id: Optional[str] = None) -> Tuple[bytes, Dict] +``` + +#### Environment.step + +```python +async def step(self, action: Action, dry_run: bool | Literal['before', 'after'] = False) -> bytes +``` + +#### Environment.solve + +```python +async def solve(self) -> bytes +``` + +#### Environment.evaluate + +```python +async def evaluate(self) -> Any +``` + +#### Environment.close + +```python +async def close(self) -> None +``` + +--- + +## DoneAction + 
+### Constructor + +```python +DoneAction(self) -> None +``` + +--- + +## BenchmarkResult + +Result of a benchmark run. + +Attributes: + +- `run_id`: Unique identifier for this run +- `task_results`: List of individual task results +- `total_tasks`: Total number of tasks in the benchmark +- `success_count`: Number of successful tasks +- `failed_count`: Number of failed tasks +- `avg_reward`: Average reward across all tasks +- `duration_seconds`: Total duration of the benchmark +- `output_dir`: Output directory for results (if any) + +### Constructor + +```python +BenchmarkResult(self, run_id: str, task_results: List[Dict[str, Any]], total_tasks: int, success_count: int, failed_count: int, avg_reward: float, duration_seconds: float, output_dir: Optional[str] = None) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `run_id` | `str` | Unique identifier for this run | +| `task_results` | `List[Dict[str, Any]]` | List of individual task results | +| `total_tasks` | `int` | Total number of tasks in the benchmark | +| `success_count` | `int` | Number of successful tasks | +| `failed_count` | `int` | Number of failed tasks | +| `avg_reward` | `float` | Average reward across all tasks | +| `duration_seconds` | `float` | Total duration of the benchmark | +| `output_dir` | `Optional[str]` | Output directory for results (if any) | + +--- + +## TaskResult + +Result of a single task execution. + +Attributes: + +- `task_path`: Path to the task +- `variant_id`: Task variant index +- `success`: Whether the task succeeded +- `reward`: Reward from evaluation +- `steps`: Number of steps taken +- `error`: Error message if failed + +### Constructor + +```python +TaskResult(self, task_path: str, variant_id: int, success: bool, reward: float, steps: int, error: Optional[str] = None) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `task_path` | `str` | Path to the task | +| `variant_id` | `int` | Task variant index | +| `success` | `bool` | Whether the task succeeded | +| `reward` | `float` | Reward from evaluation | +| `steps` | `int` | Number of steps taken | +| `error` | `Optional[str]` | Error message if failed | + +### make + +```python +def make(env_name: str, split: str = 'train') -> Any +``` + +Create an Environment by loading the env's main.py as a module. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `env_name` | `Any` | Path to the environment directory (must contain main.py) | +| `split` | `Any` | Dataset split to use for decorated functions (e.g., 'train', 'test') | + +**Returns:** Environment instance + +### run_single_task + +```python +async def run_single_task(env_path: Path, task_index: int = 0, split: str = 'train', agent_fn: Optional[Callable[[bytes, Task], Action]] = None, max_steps: int = 100, oracle: bool = False) -> TaskResult +``` + +Run a single task using the gym interface. + +This function uses the core gym interface (make, reset, step, evaluate) +to run a task with either an agent function or the oracle solver. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `env_path` | `Any` | Path to the task environment directory | +| `task_index` | `Any` | Task variant index (default: 0) | +| `split` | `Any` | Dataset split (default: "train") | +| `agent_fn` | `Any` | Optional agent function that takes (screenshot, task_config) and returns an Action. If None and oracle=False, returns after setup. 
| +| `max_steps` | `Any` | Maximum steps per task (default: 100) | +| `oracle` | `Any` | Run oracle/solver mode (default: False) | + +**Returns:** TaskResult with execution results + +**Example:** + +```python +# Run with oracle +result = await run_single_task(Path("./task"), oracle=True) + +# Run with custom agent +def my_agent(screenshot: bytes, task: Task) -> Action: + return DoneAction() # Simple agent that immediately finishes + +result = await run_single_task(Path("./task"), agent_fn=my_agent) +``` + +### run_benchmark + +```python +async def run_benchmark(dataset_path: Path, agent_fn: Optional[Callable[[bytes, Task], Action]] = None, max_steps: int = 100, max_parallel: int = 4, oracle: bool = False, max_variants: Optional[int] = None, task_filter: Optional[str] = None, split: str = 'train') -> BenchmarkResult +``` + +Run a benchmark on a dataset using the gym interface. + +This function runs multiple tasks in parallel using the core gym interface +(make, reset, step, evaluate). + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `dataset_path` | `Any` | Path to the dataset directory | +| `agent_fn` | `Any` | Optional agent function that takes (screenshot, task_config) and returns an Action. Required if oracle=False. 
| +| `max_steps` | `Any` | Maximum steps per task (default: 100) | +| `max_parallel` | `Any` | Maximum parallel workers (default: 4) | +| `oracle` | `Any` | Run oracle/solver mode (default: False) | +| `max_variants` | `Any` | Maximum variants per task (optional) | +| `task_filter` | `Any` | Glob pattern to filter tasks (optional) | +| `split` | `Any` | Dataset split (default: "train") | + +**Returns:** BenchmarkResult with run statistics and task results + +**Example:** + +```python +# Run oracle benchmark +result = await run_benchmark( + Path("./datasets/cua-bench-basic"), + oracle=True, + max_parallel=8, +) +print(f"Success rate: {result.success_count / result.total_tasks:.2%}") + +# Run with custom agent +def random_agent(screenshot: bytes, task: Task) -> Action: + import random + return random.choice([ + ClickAction(x=random.randint(0, 1920), y=random.randint(0, 1080)), + DoneAction(), + ]) + +result = await run_benchmark( + Path("./datasets/my-dataset"), + agent_fn=random_agent, + max_parallel=4, +) +``` + +### run_interactive + +```python +async def run_interactive(env_path: Path, task_index: int = 0, split: str = 'train', headless: bool = False) -> Tuple[Environment, bytes, Task] +``` + +Run an environment interactively using the gym interface. + +This function sets up an environment for interactive use, returning +the environment instance, initial screenshot, and task configuration. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `env_path` | `Any` | Path to the environment directory | +| `task_index` | `Any` | Task variant index (default: 0) | +| `split` | `Any` | Dataset split (default: "train") | +| `headless` | `Any` | Run in headless mode (default: False) | + +**Returns:** Tuple of (env, screenshot, task_config) - env: Environment instance (caller should call env.close() when done) - screenshot: Initial screenshot bytes - task_config: Task configuration + +**Example:** + +```python +env, screenshot, task_cfg = await run_interactive(Path("./task")) +print(f"Task: {task_cfg.description}") + +# Execute actions... +screenshot = await env.step(ClickAction(x=100, y=200)) + +# Evaluate +reward = await env.evaluate() +print(f"Reward: {reward}") + +# Cleanup +await env.close() +``` + +--- + +## environment + +Simplified, provider-driven environment. + +--- + +## Bot + +Helper class for writing trajectories for task solutions. + +### Constructor + +```python +Bot(self, env: Any) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `env` | `Any` | | + +### Methods + +#### Bot.click_element + +```python +def click_element(self, pid: int, selector: str) -> None +``` + +Find element by CSS selector and click its center. + +Uses provider's bench-ui bridge to fetch element rect in screen space +and then dispatches a ClickAction via env.step(). + +#### Bot.right_click_element + +```python +def right_click_element(self, pid: int, selector: str) -> None +``` + +--- + +## Tracing + +Lightweight trajectory tracing using Hugging Face Datasets. + +Records events with arbitrary JSON metadata and a list of PIL images. +Exposes a datasets.Dataset-compatible interface for saving/pushing. 
+ +### Constructor + +```python +Tracing(self, env: Any) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `env` | `Any` | | +| `trajectory_id` | `Optional[str]` | | +| `dataset` | `Dataset` | Return a HF Dataset built from current rows, constructing lazily. | + +### Methods + +#### Tracing.start + +```python +def start(self, trajectory_id: Optional[str] = None) -> str +``` + +Start a new trajectory. Resets any previously recorded rows. + +Returns the trajectory_id used. + +#### Tracing.record + +```python +def record(self, event_name: str, data_dict: Dict[str, Any], data_images: List[Image.Image | bytes] | None = None) -> None +``` + +#### Tracing.save_to_disk + +```python +def save_to_disk(self, output_dir: str, save_pngs: bool = False, image_dir: Optional[str] = None, filter_events: Optional[List[str]] = None) -> None +``` + +#### Tracing.push_to_hub + +```python +def push_to_hub(self, repo_id: str, private: bool | None = None) -> str +``` + +#### Tracing.bytes_to_image + +```python +def bytes_to_image(png_bytes: bytes) -> Image.Image +``` + +--- + +## MaxStepsExceeded + +*Inherits from: Exception* + +Raised when the environment's max step budget is exhausted. + +--- + +## Environment + +A minimal environment wrapper that delegates everything to a provider. + +Functions can be injected directly, or discovered from a module via +`make_from_module` based on cua-bench decorators (`_td_type`, `_td_split`). 
+ +### Constructor + +```python +Environment(self, env_name: Optional[str] = None, split: str = 'train', tasks_config_fn: Optional[Callable[..., Any]] = None, setup_task_fn: Optional[Callable[..., Any]] = None, solve_task_fn: Optional[Callable[..., Any]] = None, evaluate_task_fn: Optional[Callable[..., Any]] = None) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `session` | `Optional[Any]` | | +| `env_name` | `Optional[str]` | | +| `split` | `Optional[str]` | | +| `headless` | `bool` | | +| `print_actions` | `bool` | | +| `bot` | `Optional[Bot]` | | +| `tracing` | `Optional[Tracing]` | | +| `step_count` | `int` | | +| `max_steps` | `Optional[int]` | | +| `tasks_config_fn` | `Any` | | +| `setup_task_fn` | `Any` | | +| `solve_task_fn` | `Any` | | +| `evaluate_task_fn` | `Any` | | +| `tasks` | `Optional[list]` | | +| `current_task` | `Optional[Any]` | | +| `session_name` | `Optional[str]` | | +| `session_config` | `Dict[str, Any]` | | +| `setup_config` | `DesktopSetupConfig` | | +| `page` | `Optional[Any]` | | + +### Methods + +#### Environment.make_from_module + +```python +def make_from_module(cls, module: Any, env_path: str | Path, split: str = 'train') -> 'Environment' +``` + +#### Environment.create_sandbox + +```python +async def create_sandbox(self, provider: str, provider_config: Dict[str, Any] | None = None, setup_config: DesktopSetupConfig | None = None) -> None +``` + +#### Environment.reset + +```python +async def reset(self, task_id: Optional[int] = None, run_id: Optional[str] = None) -> Tuple[bytes, Dict] +``` + +#### Environment.step + +```python +async def step(self, action: Action, dry_run: bool | Literal['before', 'after'] = False) -> bytes +``` + +#### Environment.solve + +```python +async def solve(self) -> bytes +``` + +#### Environment.evaluate + +```python +async def evaluate(self) -> Any +``` + +#### Environment.close + +```python +async def close(self) -> None +``` + +--- + +## iconify + 
+Iconify icon processing module for cua_bench. + +This module provides functionality to process HTML containing iconify-icon elements +and replace them with inline SVG content fetched from the Iconify API. + +Key features: +- Processes `<iconify-icon icon="prefix:name">` elements +- Supports custom icons.json for icon resolution +- Option to ignore icon set prefixes for randomization +- Caches SVG content for performance +- Preserves element attributes (width, height, class, etc.) + +### process_icons + +```python +def process_icons(html: str, icons_json: Optional[str] = None, ignore_iconset: bool = False) -> str +``` + +Process HTML containing iconify-icon elements and replace them with inline SVGs. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `html` | `Any` | HTML content containing iconify-icon elements | +| `icons_json` | `Any` | Path to custom icons.json file. If None, uses default iconsets/icons.json | +| `ignore_iconset` | `Any` | If True, ignores the iconset prefix and searches for icon name only. Useful for shuffling/randomizing icon sets. For example: `eva:people-outline` becomes `*/people-outline`; `mingcute:ad-circle-line` becomes `*/ad-circle-line` | + +**Returns:** HTML with iconify-icon elements replaced by inline SVG content + +**Example:** + +```python +>>> html = '<iconify-icon icon="eva:people-outline"></iconify-icon>' +>>> process_icons(html) +'<svg ...>...</svg>' + +>>> # With ignore_iconset=True for randomization +>>> process_icons(html, ignore_iconset=True) # May use different iconset +``` + +### clear_cache + +```python +def clear_cache() +``` + +Clear the SVG cache. Useful for testing or memory management. + +### get_cache_size + +```python +def get_cache_size() -> int +``` + +Get the number of cached SVG entries. + +--- + +## main + +Main entry point for cua-bench CLI. + +### main + +```python +def main() +``` + +Main CLI entry point. + +--- + +## desktop + +Desktop environment management for cua-bench. + +--- + +## Window + +Represents a window in the desktop environment. 
+ +### Constructor + +```python +Window(self, x: int, y: int, width: int, height: int, title: str, content: str, focused: bool = False, icon: Optional[str] = None, title_bar_style: str = 'hidden') -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `x` | `int` | | +| `y` | `int` | | +| `width` | `int` | | +| `height` | `int` | | +| `title` | `str` | | +| `content` | `str` | | +| `focused` | `bool` | | +| `icon` | `Optional[str]` | | +| `title_bar_style` | `str` | | + +--- + +## DesktopState + +State of the unified desktop environment. + +### Constructor + +```python +DesktopState(self, os_type: str = 'win11', width: int = 1024, height: int = 768, background: str = '#000', windows: List[Window] = list(), dock_state: Dict[str, List[Dict[str, str]]] = (lambda: {'pinned_apps': [], 'recent_apps': [], 'pinned_folders': []})(), taskbar_state: Dict[str, List[Dict[str, str]]] = (lambda: {'pinned_apps': [], 'open_apps': []})()) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `os_type` | `str` | | +| `width` | `int` | | +| `height` | `int` | | +| `background` | `str` | | +| `windows` | `List[Window]` | | +| `dock_state` | `Dict[str, List[Dict[str, str]]]` | | +| `taskbar_state` | `Dict[str, List[Dict[str, str]]]` | | + +--- + +## Desktop + +Desktop environment manager. 
+ +### Constructor + +```python +Desktop(self, env) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `env` | `Any` | | +| `state` | `Any` | | +| `template` | `Any` | | + +### Methods + +#### Desktop.configure + +```python +def configure(self, os_type: Optional[str] = None, width: Optional[int] = None, height: Optional[int] = None, background: Optional[str] = None, dock_state: Optional[Dict[str, List[Union[str, Dict[str, str]]]]] = None, randomize_dock: bool = True, taskbar_state: Optional[Dict[str, List[Union[str, Dict[str, str]]]]] = None, randomize_taskbar: bool = True) +``` + +Configure desktop appearance. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `os_type` | `Any` | OS appearance (win11, win10, win7, macos, winxp, win98, android, ios) | +| `width` | `Any` | Screen width in pixels | +| `height` | `Any` | Screen height in pixels | +| `background` | `Any` | Background color | +| `dock_state` | `Any` | Explicit dock state to set with keys 'pinned_apps', 'recent_apps', 'pinned_folders' | +| `randomize_dock` | `Any` | If True, populate dock_state using macOS icon sets | +| `taskbar_state` | `Any` | Explicit taskbar state to set with keys 'pinned_apps', 'open_apps' | +| `randomize_taskbar` | `Any` | If True, populate taskbar_state using Windows 11 icon sets | + +#### Desktop.launch + +```python +def launch(self, content: str, title: str = 'Window', x: Optional[int] = None, y: Optional[int] = None, width: int = 600, height: int = 400, icon: Optional[str] = None, use_inner_size: bool = False, title_bar_style: str = 'default') -> Window +``` + +Launch a new window on the desktop. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `content` | `Any` | HTML content for the window body | +| `title` | `Any` | Window title | +| `x` | `Any` | X position (auto-calculated if None) | +| `y` | `Any` | Y position (auto-calculated if None) | +| `width` | `Any` | Window width | +| `height` | `Any` | Window height | +| `use_inner_size` | `Any` | Whether to use the inner size of the window (i.e. content size) | + +**Returns:** Window instance + +--- + +## decorators + +Decorators for defining cua-bench environments. + +### tasks_config + +```python +def tasks_config(_arg: Optional[Callable] = None, *args, **kwargs) -> Callable +``` + +Decorator for the function that loads tasks. + +Can be used as ``@cb.tasks_config`` or ``@cb.tasks_config("train")``. +The decorated function should return a list of Task objects. + +### setup_task + +```python +def setup_task(_arg: Optional[Callable] = None, *args, **kwargs) -> Callable +``` + +Decorator for the function that sets up a task. + +Can be used as ``@cb.setup_task`` or ``@cb.setup_task("train")``. +The decorated function receives task_cfg and should initialize the environment. + +### solve_task + +```python +def solve_task(_arg: Optional[Callable] = None, *args, **kwargs) -> Callable +``` + +Decorator for the function that solves a task. + +Can be used as ``@cb.solve_task`` or ``@cb.solve_task("train")``. +The decorated function receives task_cfg and should execute the solution. + +### evaluate_task + +```python +def evaluate_task(_arg: Optional[Callable] = None, *args, **kwargs) -> Callable +``` + +Decorator for the function that evaluates a task. + +Can be used as ``@cb.evaluate_task`` or ``@cb.evaluate_task("train")``. +The decorated function receives task_cfg and should return evaluation results. + +--- + +## computers + +--- + +## DesktopSession + +*Inherits from: Protocol* + +Desktop session interface for environment backends. 
+ +Usage: + +```python +# Preferred: async context manager +async with get_session("native")(os_type="linux") as session: +    await session.screenshot() + +# Alternative: manual lifecycle +session = get_session("native")(os_type="linux") +await session.start() +try: +    await session.screenshot() +finally: +    await session.close() +``` + +### Constructor + +```python +DesktopSession(self, env: Any) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `page` | `Any` | | +| `vnc_url` | `str` | Return the VNC URL for accessing the desktop environment. | +| `apps` | `'AppsProxy'` | Access registered apps via session.apps.\{app_name\}. | + +### Methods + +#### DesktopSession.start + +```python +async def start(self, config: Optional[DesktopSetupConfig] = None, headless: Optional[bool] = None) -> None +``` + +Start the session and connect to the environment. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `config` | `Any` | Optional configuration to apply before starting. | +| `headless` | `Any` | If False, shows browser/VNC preview. Defaults to True. | + +#### DesktopSession.serve_static + +```python +async def serve_static(self, url_path: str, local_path: str) -> None +``` + +#### DesktopSession.launch_window + +```python +async def launch_window(self, url: Optional[str] = None, html: Optional[str] = None, folder: Optional[str] = None, title: str = 'Window', x: Optional[int] = None, y: Optional[int] = None, width: int = 600, height: int = 400, icon: Optional[str] = None, use_inner_size: bool = False, title_bar_style: str = 'default') -> int | str +``` + +Launch a window and return its process ID. 
+ +#### DesktopSession.get_element_rect + +```python +async def get_element_rect(self, pid: int | str, selector: str, space: Literal['window', 'screen'] = 'window', timeout: float = 0.5) -> dict[str, Any] | None +``` + +#### DesktopSession.execute_javascript + +```python +async def execute_javascript(self, pid: int | str, javascript: str) -> Any +``` + +#### DesktopSession.execute_action + +```python +async def execute_action(self, action: Any) -> None +``` + +#### DesktopSession.screenshot + +```python +async def screenshot(self) -> bytes +``` + +#### DesktopSession.get_snapshot + +```python +async def get_snapshot(self) -> Snapshot +``` + +Return a lightweight snapshot of the desktop state (windows, etc.). + +Implementations should populate the list of open windows with geometry +and metadata. If not supported, raise NotImplementedError. + +#### DesktopSession.close + +```python +async def close(self) -> None +``` + +#### DesktopSession.close_all_windows + +```python +async def close_all_windows(self) -> None +``` + +Close or clear all open windows in the desktop environment. + +#### DesktopSession.click_element + +```python +async def click_element(self, pid: int | str, selector: str) -> None +``` + +Find element by CSS selector and click its center. + +Uses the session's get_element_rect to fetch element rect in screen space +and then dispatches a ClickAction. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `pid` | `Any` | Process ID of the window | +| `selector` | `Any` | CSS selector for the element | + +#### DesktopSession.right_click_element + +```python +async def right_click_element(self, pid: int | str, selector: str) -> None +``` + +Find element by CSS selector and right-click its center. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `pid` | `Any` | Process ID of the window | +| `selector` | `Any` | CSS selector for the element | + +#### DesktopSession.run_command + +```python +async def run_command(self, command: str, timeout: Optional[float] = None, check: bool = True) -> 'CommandResult' +``` + +Execute a shell command on the native desktop environment. + +This method is only available with the native provider (Docker/QEMU). +It will raise NotImplementedError on simulated sessions. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `command` | `Any` | Shell command to execute | +| `timeout` | `Any` | Optional timeout in seconds | +| `check` | `Any` | If True (default), raise an exception if the command fails (non-zero return code). If False, return the result regardless. | + +**Returns:** CommandResult with stdout, stderr, and return_code + +**Raises:** + +- `NotImplementedError` - If called on simulated provider +- `RuntimeError` - If check=True and command returns non-zero exit code + +**Example:** + +```python +result = await session.run_command("ls -la /home/user") +print(result.stdout) +``` + +#### DesktopSession.install_app + +```python +async def install_app(self, app_name: str, with_shortcut: bool = True, **kwargs) -> None +``` + +Install a registered app on the native desktop environment. + +Uses the app registry to find platform-specific install functions. +This method is only available with the native provider (Docker/QEMU). 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `app_name` | `Any` | Name of the app to install (e.g., "godot", "firefox") | +| `with_shortcut` | `Any` | Create desktop shortcut (default True) | +| `**kwargs` | `Any` | App-specific arguments (e.g., version="4.2.1") | + +**Raises:** + +- `ValueError` - If app is not registered +- `NotImplementedError` - If app doesn't support the current platform + +**Example:** + +```python +await session.install_app("godot", version="4.2.1") +await session.install_app("firefox", with_shortcut=True) +``` + +#### DesktopSession.launch_app + +```python +async def launch_app(self, app_name: str, **kwargs) -> None +``` + +Launch a registered app on the native desktop environment. + +Uses the app registry to find platform-specific launch functions. +This method is only available with the native provider (Docker/QEMU). + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `app_name` | `Any` | Name of the app to launch | +| `**kwargs` | `Any` | App-specific arguments (e.g., project_path="/path") | + +**Raises:** + +- `ValueError` - If app is not registered +- `NotImplementedError` - If app doesn't support the current platform + +**Example:** + +```python +await session.launch_app("godot", project_path="~/project", editor=True) +``` + +--- + +## DesktopSetupConfig + +*Inherits from: TypedDict* + +Configuration for desktop setup provided to providers. + +Fields mirror high-level desktop appearance and workspace options. 
+ +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `os_type` | `Literal['win11', 'win10', 'win7', 'winxp', 'win98', 'macos', 'linux', 'android', 'ios', 'windows']` | | +| `width` | `int` | | +| `height` | `int` | | +| `background` | `str` | | +| `wallpaper` | `str` | | +| `installed_apps` | `List[str]` | | +| `image` | `str` | | +| `storage` | `str` | | +| `memory` | `str` | | +| `cpu` | `str` | | +| `provider_type` | `str` | | + +--- + +## RemoteDesktopSession + +Unified desktop session using cua-computer SDK. + +Supports two modes: +1. **Full lifecycle mode** (default): Computer SDK manages container/VM + - Pass config via constructor kwargs or start(config=\{...\}) + - SDK starts container, waits for boot, connects + +2. **Client-only mode**: Connect to pre-existing cua-computer-server + - Pass api_url to connect to existing server + - Used by 2-container architecture, batch execution + +Works with any golden environment type: +- linux-docker: trycua/cua-xfce container +- windows-qemu: Windows 11 VM +- linux-qemu: Linux VM +- android-qemu: Android VM + +Supports full bench_ui integration when bench_ui is installed in the +remote environment, enabling: +- launch_window() with HTML content via pywebview +- execute_javascript() for DOM manipulation +- get_element_rect() for element location queries +- click_element() / right_click_element() for element-based interaction + +### Constructor + +```python +RemoteDesktopSession(self, api_url: str = '', vnc_url: str = '', width: int = 1920, height: int = 1080, os_type: str = 'linux', image: str = '', provider_type: str = 'docker', memory: str = '8GB', cpu: str = '4', name: str = '', storage: str = '', ephemeral: bool = True, headless: bool = True, kwargs = {}) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `DEFAULT_TIMEOUT` | `Any` | | +| `SCREENSHOT_TIMEOUT` | `Any` | | +| `computer` | `Any` | Get the Computer SDK instance for advanced 
operations. | +| `interface` | `Any` | Get the computer interface for direct SDK access. | +| `page` | `Any` | Return underlying page object - not applicable for remote. | +| `vnc_url` | `str` | Return the VNC URL for accessing the environment. | +| `apps` | `'AppsProxy'` | Access registered apps via session.apps.\{app_name\}. | +| `os_type` | `str` | Return the OS type for this session. | + +### Methods + +#### RemoteDesktopSession.step + +```python +async def step(self, action: Action) -> None +``` + +Execute an action (alias for execute_action, for env.step() compatibility). + +#### RemoteDesktopSession.start + +```python +async def start(self, config: Optional[DesktopSetupConfig] = None, headless: Optional[bool] = None) -> None +``` + +Start the session and connect to the environment. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `config` | `Any` | Optional configuration to apply before starting. | +| `headless` | `Any` | If False, opens VNC preview in browser. Defaults to constructor value if not specified. | + +**Example:** + +```python +# Using constructor params (preferred) +async with RemoteDesktopSession(os_type="linux") as session: + await session.screenshot() + +# Or with config dict +session = RemoteDesktopSession() +await session.start(config={"os_type": "linux", "width": 1920}) +``` + +#### RemoteDesktopSession.serve_static + +```python +async def serve_static(self, url_path: str, local_path: str) -> None +``` + +Serve static files - not applicable for remote environments. 
+ +#### RemoteDesktopSession.launch_window + +```python +async def launch_window(self, url: Optional[str] = None, html: Optional[str] = None, folder: Optional[str] = None, title: str = 'Window', x: Optional[int] = None, y: Optional[int] = None, width: int = 600, height: int = 400, icon: Optional[str] = None, use_inner_size: bool = False, title_bar_style: str = 'default') -> int | str +``` + +Launch a window in the remote environment using bench_ui (pywebview). + +Supports: +- url: Open a URL in a pywebview window +- html: Display HTML content in a pywebview window +- folder: Copy folder to remote and serve it in a pywebview window + +**Returns:** Process ID of the pywebview window (int) + +#### RemoteDesktopSession.get_element_rect + +```python +async def get_element_rect(self, pid: int | str, selector: str, space: Literal['window', 'screen'] = 'window', timeout: float = 0.5) -> dict[str, Any] | None +``` + +Get element rect by CSS selector using bench_ui. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `pid` | `Any` | Process ID of the pywebview window | +| `selector` | `Any` | CSS selector for the element | +| `space` | `Any` | Coordinate space - "window" or "screen" | +| `timeout` | `Any` | Maximum time to wait for element | + +**Returns:** Dict with x, y, width, height or None if not found + +#### RemoteDesktopSession.execute_javascript + +```python +async def execute_javascript(self, pid: int | str, javascript: str) -> Any +``` + +Execute JavaScript in a pywebview window using bench_ui. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `pid` | `Any` | Process ID of the pywebview window | +| `javascript` | `Any` | JavaScript code to execute | + +**Returns:** Result of the JavaScript execution + +#### RemoteDesktopSession.execute_action + +```python +async def execute_action(self, action: Action) -> None +``` + +Execute an action on the remote desktop using the SDK. 
+ +#### RemoteDesktopSession.screenshot + +```python +async def screenshot(self) -> bytes +``` + +Capture screenshot from remote environment. + +**Returns:** PNG image bytes + +#### RemoteDesktopSession.get_snapshot + +```python +async def get_snapshot(self) -> Snapshot +``` + +Get snapshot of desktop state with active window info. + +Uses pywinctl on remote to get active window, and if it's a webview +we launched, extracts HTML via snapshot.js. + +#### RemoteDesktopSession.close + +```python +async def close(self) -> None +``` + +Close the session and cleanup resources. + +#### RemoteDesktopSession.close_all_windows + +```python +async def close_all_windows(self) -> None +``` + +Close all windows - best effort. + +#### RemoteDesktopSession.click_element + +```python +async def click_element(self, pid: int | str, selector: str) -> None +``` + +Find element by CSS selector and click its center. + +Uses get_element_rect to fetch element rect in screen space +and then dispatches a ClickAction. + +#### RemoteDesktopSession.right_click_element + +```python +async def right_click_element(self, pid: int | str, selector: str) -> None +``` + +Find element by CSS selector and right-click its center. + +#### RemoteDesktopSession.get_accessibility_tree + +```python +async def get_accessibility_tree(self) -> Dict[str, Any] +``` + +Get the accessibility tree if supported. + +#### RemoteDesktopSession.shell_command + +```python +async def shell_command(self, command: str, check: bool = True) -> Dict[str, Any] +``` + +Execute a shell command. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `command` | `Any` | Shell command to execute | +| `check` | `Any` | If True (default), raise an exception if the command fails (non-zero return code). If False, return the result regardless. 
| + +**Returns:** Command result with stdout/stderr + +**Raises:** + +- `RuntimeError` - If check=True and command returns non-zero exit code + +#### RemoteDesktopSession.read_file + +```python +async def read_file(self, path: str) -> str +``` + +Read a text file from the environment. + +#### RemoteDesktopSession.write_file + +```python +async def write_file(self, path: str, content: str) -> None +``` + +Write a text file to the environment. + +#### RemoteDesktopSession.read_bytes + +```python +async def read_bytes(self, path: str) -> bytes +``` + +Read a file as bytes from the environment. + +#### RemoteDesktopSession.write_bytes + +```python +async def write_bytes(self, path: str, data: bytes) -> None +``` + +Write bytes to a file in the environment. + +#### RemoteDesktopSession.file_exists + +```python +async def file_exists(self, path: str) -> bool +``` + +Check if a file exists in the environment. + +#### RemoteDesktopSession.directory_exists + +```python +async def directory_exists(self, path: str) -> bool +``` + +Check if a directory exists in the environment. + +#### RemoteDesktopSession.list_dir + +```python +async def list_dir(self, path: str) -> list[str] +``` + +List contents of a directory in the environment. + +#### RemoteDesktopSession.run_command + +```python +async def run_command(self, command: str, check: bool = True) -> Dict[str, Any] +``` + +Execute a shell command (alias for shell_command). + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `command` | `Any` | Shell command to execute | +| `check` | `Any` | If True (default), raise an exception if the command fails (non-zero return code). If False, return the result regardless. 
| + +**Returns:** Command result with stdout/stderr + +**Raises:** + +- `RuntimeError` - If check=True and command returns non-zero exit code + +#### RemoteDesktopSession.launch_application + +```python +async def launch_application(self, app_name: str) -> None +``` + +Launch an application by name. + +#### RemoteDesktopSession.check_status + +```python +async def check_status(self) -> bool +``` + +Check if the environment is responsive. + +**Returns:** True if environment is ready, False otherwise + +#### RemoteDesktopSession.wait_until_ready + +```python +async def wait_until_ready(self, timeout: int = 60, poll_interval: float = 2.0) -> bool +``` + +Wait until the environment is ready. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `timeout` | `Any` | Maximum time to wait in seconds | +| `poll_interval` | `Any` | Time between status checks | + +**Returns:** True if environment became ready, False if timeout + +#### RemoteDesktopSession.click + +```python +async def click(self, x: int, y: int) -> None +``` + +Click at coordinates. + +#### RemoteDesktopSession.right_click + +```python +async def right_click(self, x: int, y: int) -> None +``` + +Right-click at coordinates. + +#### RemoteDesktopSession.double_click + +```python +async def double_click(self, x: int, y: int) -> None +``` + +Double-click at coordinates. + +#### RemoteDesktopSession.type + +```python +async def type(self, text: str) -> None +``` + +Type text. + +#### RemoteDesktopSession.key + +```python +async def key(self, key: str) -> None +``` + +Press a key. + +#### RemoteDesktopSession.hotkey + +```python +async def hotkey(self, keys: list[str]) -> None +``` + +Press a key combination. + +#### RemoteDesktopSession.scroll + +```python +async def scroll(self, direction: str = 'down', amount: int = 300) -> None +``` + +Scroll the screen. 
+ +#### RemoteDesktopSession.move_to + +```python +async def move_to(self, x: int, y: int) -> None +``` + +Move cursor to coordinates. + +#### RemoteDesktopSession.drag + +```python +async def drag(self, from_x: int, from_y: int, to_x: int, to_y: int) -> None +``` + +Drag from one position to another. + +#### RemoteDesktopSession.install_app + +```python +async def install_app(self, app_name: str, with_shortcut: bool = True, **kwargs) -> None +``` + +Install a registered app on the native desktop environment. + +Uses the app registry to find platform-specific install functions. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `app_name` | `Any` | Name of the app to install (e.g., "godot", "firefox") | +| `with_shortcut` | `Any` | Create desktop shortcut (default True) | +| `**kwargs` | `Any` | App-specific arguments (e.g., version="4.2.1") | + +**Raises:** + +- `ValueError` - If app is not registered +- `NotImplementedError` - If app doesn't support the current platform + +**Example:** + +```python +await session.install_app("godot", version="4.2.1") +await session.install_app("firefox", with_shortcut=True) +``` + +#### RemoteDesktopSession.launch_app + +```python +async def launch_app(self, app_name: str, **kwargs) -> None +``` + +Launch a registered app on the native desktop environment. + +Uses the app registry to find platform-specific launch functions. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `app_name` | `Any` | Name of the app to launch | +| `**kwargs` | `Any` | App-specific arguments (e.g., project_path="/path") | + +**Raises:** + +- `ValueError` - If app is not registered +- `NotImplementedError` - If app doesn't support the current platform + +**Example:** + +```python +await session.launch_app("godot", project_path="~/project", editor=True) +``` + +### get_session + +```python +def get_session(name: Optional[str] = None) -> type[DesktopSession] +``` + +Return session class by name.
+ +Provider names: + - "simulated" (alias: "webtop"): Playwright-based browser simulation + Fast, no Docker required. UI is HTML/CSS rendering of desktop. + Good for web-app testing, UI benchmarks. + + - "native" (alias: "computer"): Real OS in Docker/QEMU container + Actual desktop environment with real applications. + Requires Docker. Good for real app testing, OS-level tasks. + +### create_remote_session + +```python +def create_remote_session(api_url: str, vnc_url: str = '', os_type: str = 'linux', width: int = 1920, height: int = 1080) -> RemoteDesktopSession +``` + +Create a RemoteDesktopSession. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `api_url` | `Any` | URL of the environment's API endpoint | +| `vnc_url` | `Any` | URL for VNC access | +| `os_type` | `Any` | Operating system type | +| `width` | `Any` | Screen width | +| `height` | `Any` | Screen height | + +**Returns:** Configured RemoteDesktopSession instance + +--- + +## config + +Configuration module for cua-bench. + +--- + +## ConfigLoader + +Load and merge configuration from .cua/ directory. + +### Constructor + +```python +ConfigLoader(self, search_path: Path | None = None) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `CONFIG_DIR_NAME` | `Any` | | +| `CONFIG_FILE_NAME` | `Any` | | +| `AGENTS_FILE_NAME` | `Any` | | +| `search_path` | `Any` | | + +### Methods + +#### ConfigLoader.find_config_dir + +```python +def find_config_dir(self) -> Path | None +``` + +Walk up directory tree to find .cua/ directory. + +**Returns:** Path to .cua/ directory if found, None otherwise. + +#### ConfigLoader.load_config + +```python +def load_config(self) -> CuaConfig | None +``` + +Load .cua/config.yaml if it exists. + +**Returns:** CuaConfig object if config file exists, None otherwise. + +#### ConfigLoader.load_agents + +```python +def load_agents(self) -> list[CustomAgentEntry] +``` + +Load .cua/agents.yaml if it exists. 
+ +**Returns:** List of CustomAgentEntry objects. + +#### ConfigLoader.get_agent_by_name + +```python +def get_agent_by_name(self, name: str) -> CustomAgentEntry | None +``` + +Get a custom agent entry by name. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `name` | `Any` | Agent name to look up. | + +**Returns:** CustomAgentEntry if found, None otherwise. + +#### ConfigLoader.get_effective_config + +```python +def get_effective_config(self, cli_args: dict[str, Any], env_type: str | None = None) -> dict[str, Any] +``` + +Merge configuration sources into effective config. + +Priority (highest to lowest): +1. CLI arguments +2. Environment-specific overrides +3. Agent defaults from agents.yaml +4. Agent config from config.yaml +5. Defaults from config.yaml + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `cli_args` | `Any` | Command line arguments as dictionary. | +| `env_type` | `Any` | Environment type for env-specific overrides (e.g., "webtop", "winarena"). | + +**Returns:** Merged configuration dictionary. + +--- + +## AgentConfig + +Agent configuration from .cua/config.yaml. + +### Constructor + +```python +AgentConfig(self, name: str | None = None, import_path: str | None = None, model: str | None = None, max_steps: int = 100, environments: dict[str, dict[str, Any]] | None = None) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `name` | `str \| None` | | +| `import_path` | `str \| None` | | +| `model` | `str \| None` | | +| `max_steps` | `int` | | +| `environments` | `dict[str, dict[str, Any]] \| None` | | + +### Methods + +#### AgentConfig.from_dict + +```python +def from_dict(cls, data: dict[str, Any]) -> AgentConfig +``` + +Create AgentConfig from dictionary. + +--- + +## AgentsConfig + +Configuration from .cua/agents.yaml.
+ +Supports two formats: +- Legacy: `custom_agents` list +- New: `agents` list (preferred) + +Example .cua/agents.yaml: + agents: + - name: my-agent + image: myregistry/my-agent:latest + defaults: + model: gpt-4o + + - name: dev-agent + import_path: my_agents.dev:DevAgent + +### Constructor + +```python +AgentsConfig(self, custom_agents: list[CustomAgentEntry] = list()) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `custom_agents` | `list[CustomAgentEntry]` | | + +### Methods + +#### AgentsConfig.from_dict + +```python +def from_dict(cls, data: dict[str, Any]) -> AgentsConfig +``` + +Create AgentsConfig from dictionary. + +--- + +## CuaConfig + +Root configuration from .cua/config.yaml. + +### Constructor + +```python +CuaConfig(self, defaults: DefaultsConfig | None = None, agent: AgentConfig | None = None) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `defaults` | `DefaultsConfig \| None` | | +| `agent` | `AgentConfig \| None` | | + +### Methods + +#### CuaConfig.from_dict + +```python +def from_dict(cls, data: dict[str, Any]) -> CuaConfig +``` + +Create CuaConfig from dictionary. + +--- + +## CustomAgentEntry + +Entry for a custom agent in .cua/agents.yaml. + +Agents can be defined in two ways: +1. Docker image (cloud-ready): Specify `image` field with a Docker image +2.
Import path (local dev): Specify `import_path` for Python import + +Examples: + # Docker image agent + - name: my-agent + image: myregistry/my-agent:latest + + # Import path agent (uses default cua-agent image) + - name: dev-agent + import_path: my_agents.dev:DevAgent + + # Built-in agent + - name: cua-agent + builtin: true + +### Constructor + +```python +CustomAgentEntry(self, name: str, image: Optional[str] = None, import_path: Optional[str] = None, builtin: bool = False, command: Optional[list[str]] = None, defaults: dict[str, Any] = dict()) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `name` | `str` | | +| `image` | `Optional[str]` | | +| `import_path` | `Optional[str]` | | +| `builtin` | `bool` | | +| `command` | `Optional[list[str]]` | | +| `defaults` | `dict[str, Any]` | | + +### Methods + +#### CustomAgentEntry.get_image + +```python +def get_image(self) -> str +``` + +Get the Docker image to use for this agent. + +**Returns:** Docker image name. Uses custom image if specified, otherwise returns the default cua-agent image. + +#### CustomAgentEntry.is_docker_agent + +```python +def is_docker_agent(self) -> bool +``` + +Check if this agent is defined as a Docker image. + +**Returns:** True if agent has a custom Docker image specified. + +--- + +## DefaultsConfig + +Default configuration values from .cua/config.yaml. + +### Constructor + +```python +DefaultsConfig(self, model: str | None = None, max_steps: int = 100, output_dir: str = './results') -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `model` | `str \| None` | | +| `max_steps` | `int` | | +| `output_dir` | `str` | | + +### Methods + +#### DefaultsConfig.from_dict + +```python +def from_dict(cls, data: dict[str, Any]) -> DefaultsConfig +``` + +Create DefaultsConfig from dictionary.
+ +### detect_env_type + +```python +def detect_env_type(env_path: str) -> str | None +``` + +Detect environment type from path. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `env_path` | `Any` | Path to the environment. | + +**Returns:** Environment type string ("webtop" or "winarena"), or None if unknown. + +--- + +## runner + +Runner module for 2-container task execution. + +--- + +## TaskResult + +Result of a task execution. + +### Constructor + +```python +TaskResult(self, success: bool, exit_code: int, agent_logs: str, env_logs: str, output_dir: Optional[str] = None, error: Optional[str] = None) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `success` | `bool` | | +| `exit_code` | `int` | | +| `agent_logs` | `str` | | +| `env_logs` | `str` | | +| `output_dir` | `Optional[str]` | | +| `error` | `Optional[str]` | | + +--- + +## TaskRunner + +Orchestrates 2-container task execution. + +Architecture: +- Creates isolated Docker network per task +- Creates task overlay to protect golden image (QEMU types) +- Starts environment container (base image with QCOW2 disk) +- Starts agent container (runs solver) +- Agent connects to env via network hostname +- Waits for agent completion +- Collects results and cleans up (including overlay) + +### Constructor + +```python +TaskRunner(self, agent_image: str = DEFAULT_AGENT_IMAGE, env_hostname: str = 'cua-env', agent_hostname: str = 'cua-agent') +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `agent_image` | `Any` | | +| `env_hostname` | `Any` | | +| `agent_hostname` | `Any` | | + +### Methods + +#### TaskRunner.run_task + +```python +async def run_task(self, env_path: Path, task_index: int, env_type: str, golden_name: Optional[str] = None, agent: Optional[str] = None, agent_image: Optional[str] = None, agent_command: Optional[List[str]] = None, agent_import_path: Optional[str] = None, model: 
Optional[str] = None, max_steps: int = 100, oracle: bool = False, memory: str = '8G', cpus: str = '8', vnc_port: Optional[int] = None, api_port: Optional[int] = None, output_dir: Optional[str] = None, stream_agent_logs: bool = False, timeout: Optional[int] = None, cleanup_before: bool = True, remove_images_after: bool = False, provider_type: Optional[str] = None) -> TaskResult +``` + +Run a task with 2-container architecture. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `env_path` | `Any` | Path to task environment directory | +| `task_index` | `Any` | Task index to run | +| `env_type` | `Any` | Environment type (linux-docker, windows-qemu, etc.) | +| `golden_name` | `Any` | Image name to use (defaults to env_type). See: `cb image list` | +| `agent` | `Any` | Agent name (for built-in agents) | +| `agent_image` | `Any` | Docker image for agent container (overrides default) | +| `agent_command` | `Any` | Custom command for agent container | +| `agent_import_path` | `Any` | Custom agent import path | +| `model` | `Any` | Model to use | +| `max_steps` | `Any` | Maximum agent steps | +| `oracle` | `Any` | Run oracle solution instead of agent | +| `memory` | `Any` | Memory for environment (QEMU only) | +| `cpus` | `Any` | CPUs for environment (QEMU only) | +| `vnc_port` | `Any` | Host port to map VNC (for debugging) | +| `api_port` | `Any` | Host port to map API (for debugging) | +| `output_dir` | `Any` | Output directory for results | +| `stream_agent_logs` | `Any` | Stream agent logs to `<output_dir>/run.log` in real-time (default: False) | +| `timeout` | `Any` | Timeout in seconds (None = no timeout) | +| `cleanup_before` | `Any` | Clean up stale containers before starting (default: True) | +| `remove_images_after` | `Any` | Remove Docker images after task (default: False) Note: This removes Docker images but NOT base VM disk images. | +| `provider_type` | `Any` | Provider type ("simulated", "webtop", "native", "computer", None).
If "simulated" or "webtop", the agent container will use a local Playwright session instead of connecting to a remote environment. | + +**Returns:** TaskResult with execution details + +#### TaskRunner.run_task_interactively + +```python +async def run_task_interactively(self, env_type: str, golden_name: Optional[str] = None, env_path: Optional[Path] = None, task_index: int = 0, memory: str = '8G', cpus: str = '8', vnc_port: Optional[int] = None, api_port: Optional[int] = None, auto_allocate_ports: bool = True, cleanup_before: bool = True) -> tuple[str, str, callable, Optional[dict]] +``` + +Start an environment container interactively (without agent). + +This method starts only the environment container with VNC and API ports +exposed to the host, allowing manual interaction or agent connection. +If env_path is provided, it will also load the task and run the setup. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `env_type` | `Any` | Environment type (linux-docker, windows-qemu, etc.) 
| +| `golden_name` | `Any` | Image name to use (defaults to env_type) | +| `env_path` | `Any` | Path to task directory (optional, for running task setup) | +| `task_index` | `Any` | Task index to run (default: 0) | +| `memory` | `Any` | Memory for environment (QEMU only) | +| `cpus` | `Any` | CPUs for environment (QEMU only) | +| `vnc_port` | `Any` | Host port to map VNC (None = auto-allocate) | +| `api_port` | `Any` | Host port to map API (None = auto-allocate) | +| `auto_allocate_ports` | `Any` | Auto-allocate ports if not specified (default: True) | +| `cleanup_before` | `Any` | Clean up stale containers before starting (default: True) | + +**Returns:** Tuple of (vnc_url, api_url, cleanup_func, task_config, env, session) - vnc_url: URL to access VNC (e.g., http://localhost:8006) - api_url: URL to access API (e.g., http://localhost:5000) - cleanup_func: Async function to call when done to cleanup resources - task_config: Task configuration dict (None if env_path not provided) - env: Environment object (None if env_path not provided) - session: RemoteDesktopSession object (None if env_path not provided) + +**Example:** + +```python +runner = TaskRunner() +vnc_url, api_url, cleanup, task_cfg, env, session = await runner.run_task_interactively( + "linux-docker", + env_path=Path("./my_task"), + task_index=0 +) +print(f"VNC: {vnc_url}") +print(f"Task: {task_cfg.get('description')}") +# ... do interactive work ... +# Evaluate before cleanup +if env and env.evaluate_task_fn: + result = await env.evaluate_task_fn(task_cfg['_task_cfg'], session) + print(f"Result: {result}") +await cleanup() +``` + +#### TaskRunner.cleanup_all + +```python +async def cleanup_all(self) -> None +``` + +Clean up all running tasks. + +#### TaskRunner.force_cleanup + +```python +async def force_cleanup() -> dict +``` + +Force cleanup of all stale cua-bench containers and networks. + +Use this when containers are left behind from previous runs.
+ +**Returns:** Dict with counts: \{"containers": N, "networks": N\} + +--- + +## agents + +--- + +## AgentResult + +Result of agent execution. + +### Constructor + +```python +AgentResult(self, total_input_tokens: int = 0, total_output_tokens: int = 0, failure_mode: FailureMode = FailureMode.UNSET) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `total_input_tokens` | `int` | | +| `total_output_tokens` | `int` | | +| `failure_mode` | `FailureMode` | | + +--- + +## BaseAgent + +*Inherits from: ABC* + +Base class for agents that can perform tasks. + +### Constructor + +```python +BaseAgent(self, kwargs = {}) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `version` | `str \| None` | The version of the agent. Can be any string. | +| `prompt_template` | `str \| None` | The path to a custom prompt template file. | + +### Methods + +#### BaseAgent.name + +```python +def name() -> str +``` + +Return the name of the agent. + +#### BaseAgent.perform_task + +```python +async def perform_task(self, task_description: str, session: DesktopSession, logging_dir: Path | None = None, tracer = None) -> AgentResult +``` + +Perform a task using the agent. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `task_description` | `Any` | The task description/instruction | +| `session` | `Any` | The desktop or mobile session to interact with | +| `logging_dir` | `Any` | Optional directory for logging agent execution | +| `tracer` | `Any` | Optional tracer object for recording agent actions | + +**Returns:** AgentResult with token counts and failure mode + +--- + +## FailureMode + +*Inherits from: Enum* + +Failure mode for agent execution.
+ +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `UNSET` | `Any` | | +| `NONE` | `Any` | | +| `UNKNOWN` | `Any` | | +| `MAX_STEPS_EXCEEDED` | `Any` | | + +--- + +## CuaAgent + +*Inherits from: BaseAgent* + +Agent implementation using the CUA Computer Agent SDK. + +### Constructor + +```python +CuaAgent(self, kwargs = {}) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `model` | `Any` | | +| `max_steps` | `Any` | | + +### Methods + +#### CuaAgent.name + +```python +def name() -> str +``` + +#### CuaAgent.perform_task + +```python +async def perform_task(self, task_description: str, session: DesktopSession, logging_dir: Path | None = None, tracer = None) -> AgentResult +``` + +Perform a task using the CUA Computer Agent. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `task_description` | `Any` | The task description/instruction | +| `session` | `Any` | The desktop session to interact with | +| `logging_dir` | `Any` | Optional directory for logging agent execution | +| `tracer` | `Any` | Optional tracer object for recording agent actions | + +**Returns:** AgentResult with token counts and failure mode + +--- + +## GeminiAgent + +*Inherits from: BaseAgent* + +Agent implementation using Google's Gemini API with Computer Use. + +### Constructor + +```python +GeminiAgent(self, kwargs = {}) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `model` | `Any` | | +| `api_key` | `Any` | | +| `thinking_level` | `Any` | | +| `media_resolution` | `Any` | | +| `max_steps` | `Any` | | + +### Methods + +#### GeminiAgent.name + +```python +def name() -> str +``` + +#### GeminiAgent.perform_task + +```python +async def perform_task(self, task_description: str, session: DesktopSession, logging_dir: Path | None = None, tracer = None) -> AgentResult +``` + +Perform a task using the Gemini Computer Use agent. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `task_description` | `Any` | The task description/instruction | +| `session` | `Any` | The desktop session to interact with | +| `logging_dir` | `Any` | Optional directory for logging agent execution | +| `tracer` | `Any` | Optional tracer object for recording agent actions | + +**Returns:** AgentResult with token counts and failure mode + +### register_agent + +```python +def register_agent(name: str) +``` + +Decorator to register an agent class with a given name. + +### load_agent_from_path + +```python +def load_agent_from_path(import_path: str) -> type[BaseAgent] +``` + +Load an agent class from an import path. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `import_path` | `Any` | Import path in format 'module.path:ClassName' | + +**Returns:** Agent class + +**Raises:** + +- `ValueError` - If import path format is invalid +- `ImportError` - If module cannot be imported +- `AttributeError` - If class is not found in module + +### get_agent + +```python +def get_agent(name: str, config_loader: 'ConfigLoader | None' = None) -> type[BaseAgent] | None +``` + +Get an agent class by name. + +Lookup order: +1. Local registry (.cua/agents.yaml) - if config_loader provided +2. Built-in registry (_AGENT_REGISTRY) + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `name` | `Any` | Agent name to look up | +| `config_loader` | `Any` | Optional ConfigLoader for local registry lookup | + +**Returns:** Agent class if found, None otherwise + +### list_agents + +```python +def list_agents(config_loader: 'ConfigLoader | None' = None) -> list[str] +``` + +List all registered agent names. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `config_loader` | `Any` | Optional ConfigLoader to include local agents | + +**Returns:** List of agent names (local + built-in, deduplicated) + +--- + +## processors + +Snapshot processors for converting batch outputs into various dataset formats. + +--- + +## AgUVisStage1Processor + +*Inherits from: BaseProcessor* + +Processor for aguvis-stage-1 format (action augmentation dataset). + +### Methods + +#### AgUVisStage1Processor.get_dataset_name + +```python +def get_dataset_name(self) -> str +``` + +#### AgUVisStage1Processor.process + +```python +def process(self) -> List[Dict[str, Any]] +``` + +Process snapshots into aguvis-stage-1 format. + +--- + +## BaseProcessor + +*Inherits from: ABC* + +Base class for snapshot processors. + +A processor converts batch dump outputs (screenshots + snapshots) +into a specific dataset format. + +### Constructor + +```python +BaseProcessor(self, args: ProcessorArgs) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `args` | `Any` | | + +### Methods + +#### BaseProcessor.process + +```python +def process(self) -> List[Dict[str, Any]] +``` + +Process the snapshots and return a list of dataset rows. + +**Returns:** List of dictionaries, where each dict is a row in the dataset. The schema depends on the specific processor implementation. + +#### BaseProcessor.get_dataset_name + +```python +def get_dataset_name(self) -> str +``` + +Get the default dataset name for this processor. + +#### BaseProcessor.save_jsonl + +```python +def save_jsonl(self, rows: List[Dict[str, Any]], save_dir: Path, dataset_name: str) -> Path +``` + +Save dataset rows as JSONL file. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `rows` | `Any` | List of dataset row dictionaries | +| `save_dir` | `Any` | Directory to save to | +| `dataset_name` | `Any` | Name of the dataset file (without extension) | + +**Returns:** Path to the saved file + +#### BaseProcessor.save_to_disk + +```python +def save_to_disk(self, rows: List[Dict[str, Any]], save_dir: Path, dataset_name: str) -> Path +``` + +Save dataset rows using HuggingFace's save_to_disk method. + +This method properly handles PIL images and other complex data types +that cannot be serialized to JSON. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `rows` | `Any` | List of dataset row dictionaries | +| `save_dir` | `Any` | Directory to save to | +| `dataset_name` | `Any` | Name of the dataset directory | + +**Returns:** Path to the saved dataset directory + +#### BaseProcessor.push_to_hub + +```python +def push_to_hub(self, rows: List[Dict[str, Any]], repo_id: str, private: bool) -> None +``` + +Push dataset to Hugging Face Hub. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `rows` | `Any` | List of dataset row dictionaries | +| `repo_id` | `Any` | HuggingFace repository ID (e.g., "username/dataset-name") | +| `private` | `Any` | Whether to make the dataset private | + +--- + +## GuiR1Processor + +*Inherits from: BaseProcessor* + +Processor for gui-r1 format (low-level click instructions). + +### Methods + +#### GuiR1Processor.get_dataset_name + +```python +def get_dataset_name(self) -> str +``` + +#### GuiR1Processor.process + +```python +def process(self) -> List[Dict[str, Any]] +``` + +Process snapshots into gui-r1 format. + +### get_processor + +```python +def get_processor(name: str) -> type[BaseProcessor] +``` + +Get a processor class by name. + +--- + +## sessions + +Sessions module for async container management. 
+ +--- + +## SessionProvider + +*Inherits from: ABC* + +Base class for session providers (Docker, CUA Cloud, etc.). + +### Methods + +#### SessionProvider.start_session + +```python +async def start_session(self, session_id: str, env_path: Path, container_script: str, image_uri: Optional[str] = None, output_dir: Optional[str] = None, kwargs = {}) -> Dict[str, Any] +``` + +Start a new session. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `session_id` | `Any` | Unique identifier for the session | +| `env_path` | `Any` | Path to the environment directory | +| `container_script` | `Any` | Script to run in the container | +| `image_uri` | `Any` | Container image to use | +| `output_dir` | `Any` | Directory to save outputs. `**kwargs`: Additional provider-specific arguments | + +**Returns:** Dict containing session metadata (container_id, status, etc.) + +#### SessionProvider.get_session_status + +```python +async def get_session_status(self, session_id: str) -> Dict[str, Any] +``` + +Get the status of a running session. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `session_id` | `Any` | Session identifier | + +**Returns:** Dict containing session status information + +#### SessionProvider.stop_session + +```python +async def stop_session(self, session_id: str) -> None +``` + +Stop a running session. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `session_id` | `Any` | Session identifier | + +#### SessionProvider.get_session_logs + +```python +async def get_session_logs(self, session_id: str, tail: Optional[int] = None) -> str +``` + +Get logs from a session. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `session_id` | `Any` | Session identifier | +| `tail` | `Any` | Number of lines to return from the end (None for all) | + +**Returns:** Log output as string + +### list_sessions + +```python +def list_sessions(provider: Optional[str] = None) -> List[Dict[str, Any]] +``` + +List all stored sessions. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `provider` | `Any` | Optional provider filter ("docker", "cua-cloud", etc.) | + +**Returns:** List of session metadata dicts + +### make + +```python +def make(provider_name: str, env_type: Optional[str] = None) -> SessionProvider +``` + +Create a session provider for the specified provider. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `provider_name` | `Any` | Name of the provider: - "local": Run locally using Docker (webtop) or QEMU/KVM (winarena) - "cloud": Run on CUA Cloud (GCP Batch for webtop, Azure Batch for winarena) - "docker": (legacy) Alias for "local" | +| `env_type` | `Any` | Optional environment type hint ("webtop" or "winarena"). Used by local provider to select appropriate backend. | + +**Returns:** SessionProvider instance + +**Raises:** + +- `ValueError` - If provider is not supported + +--- + +## batch + +Batch integration for cua-bench. + +### execute_batch + +```python +async def execute_batch(job_name: str, env_path: Path, container_script: str, task_count: int = 4, task_parallelism: int = 4, run_local: bool = False, image_uri: Optional[str] = None, auto_cleanup: bool = True, output_dir: Optional[str] = None) -> List[str] +``` + +Execute a batch job for cua-bench environment. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `job_name` | `Any` | Name of the batch job | +| `env_path` | `Any` | Path to the environment directory | +| `container_script` | `Any` | Script to run in the container | +| `task_count` | `Any` | Number of tasks to run | +| `task_parallelism` | `Any` | Max concurrent tasks | +| `run_local` | `Any` | Run locally using Docker instead of GCP | +| `image_uri` | `Any` | Custom container image | +| `auto_cleanup` | `Any` | Clean up resources after completion | + +**Returns:** List of log lines from the job + +### run_local_docker + +```python +async def run_local_docker(env_path: Path, container_script: str, image_uri: Optional[str] = None, output_dir: Optional[str] = None, task_count: int = 1, parallelism: int = 1) -> List[str] +``` + +Run the batch job locally using Docker. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `env_path` | `Any` | Path to environment directory | +| `container_script` | `Any` | Script to run | +| `image_uri` | `Any` | Docker image to use | +| `output_dir` | `Any` | Local directory to mount as /tmp/td_output for results | +| `task_count` | `Any` | Total number of tasks to run | +| `parallelism` | `Any` | Maximum number of concurrent containers | + +**Returns:** List of output lines + +--- + +## workers + +Worker-based gym system for parallel environment management. + +This module provides a FastAPI-based worker system for running CUA-Bench +environments in parallel, enabling efficient RL training and evaluation. + +Components: +- worker_server: FastAPI server wrapping Environment instances +- worker_client: HTTP client for interacting with worker servers +- worker_manager: Utilities for spawning and managing multiple workers +- dataloader: MultiTurnDataloader and ReplayBuffer for RL training + +--- + +## MultiTurnDataloader + +Dataloader for RL training with parallel environment workers. 
+ +Each env_config must contain a 'task_configs' key with a list of task +configurations that the client will use internally. + +### Constructor + +```python +MultiTurnDataloader(self, env_class, env_configs, tokenizer, processor = None, is_multi_modal = True, batch_size = 8, replay_capacity = 10000, replay_reward_discount = 0.9, max_prompt_length = 1024, max_response_length = 1024, only_keep_outcome_in_replay = False) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `num_envs` | `Any` | | +| `batch_size` | `Any` | | +| `replay` | `Any` | | + +### Methods + +#### MultiTurnDataloader.async_step + +```python +def async_step(self, batch_return) +``` + +#### MultiTurnDataloader.sample_from_buffer + +```python +def sample_from_buffer(self, batch_size = None) +``` + +#### MultiTurnDataloader.clear_replay_buffer + +```python +def clear_replay_buffer(self) +``` + +#### MultiTurnDataloader.get_balance_stats + +```python +def get_balance_stats(self) +``` + +#### MultiTurnDataloader.calculate_outcome_reward + +```python +def calculate_outcome_reward(self) +``` + +#### MultiTurnDataloader.print_examples + +```python +def print_examples(self, n = 2) +``` + +#### MultiTurnDataloader.print_stats_in_replay_buffer + +```python +def print_stats_in_replay_buffer(self) +``` + +#### MultiTurnDataloader.running_outcome_reward + +```python +def running_outcome_reward(self) +``` + +#### MultiTurnDataloader.close + +```python +def close(self) +``` + +Close all workers and clean up resources. 
+ +--- + +## ReplayBuffer + +### Constructor + +```python +ReplayBuffer(self, capacity = 10000, gamma = 1.0, only_keep_outcome = False, balance_thres = 0.1) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `capacity` | `Any` | | +| `gamma` | `Any` | | +| `only_keep_outcome` | `Any` | | +| `balance_thres` | `Any` | | +| `ready_buffer` | `Any` | | +| `ready_position` | `Any` | | +| `ready_count` | `Any` | | +| `episode_buffer` | `Any` | | + +### Methods + +#### ReplayBuffer.add + +```python +def add(self, data) +``` + +Add data to the replay buffer + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `data` | `tuple` | A tuple of (worker_id, env_ret, meta_info) | + +#### ReplayBuffer.get_balance_stats + +```python +def get_balance_stats(self) +``` + +#### ReplayBuffer.should_keep + +```python +def should_keep(self, curr_below, curr_above, curr_ret) +``` + +#### ReplayBuffer.sample + +```python +def sample(self, batch_size) +``` + +Sample experiences from the ready buffer + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `batch_size` | `int` | Number of experiences to sample | + +**Returns:** list: List of sampled experiences + +#### ReplayBuffer.clear + +```python +def clear(self) +``` + +Clear both ready buffer and episode buffer + +--- + +## CBEnvWorkerClient + +HTTP client for CUA-Bench worker servers. + +This client manages communication with the worker server, image processing, +observation history tracking, and action normalization. 
+ +Args: + env_config: Configuration dict with keys: + - server_url: URL of the worker server + - task_configs: List of task configs, each with env_path, task_index, split + - img_w: Image width (default: 1920) + - img_h: Image height (default: 1080) + - max_step: Maximum steps per episode (default: 50) + - max_hist: Maximum observation history length (default: 10) + - timeout: Environment timeout in seconds (default: 300) + +### Constructor + +```python +CBEnvWorkerClient(self, env_config) +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `vision_start_token` | `Any` | | +| `vision_end_token` | `Any` | | +| `think_start_token` | `Any` | | +| `think_end_token` | `Any` | | +| `action_start_token` | `Any` | | +| `action_end_token` | `Any` | | +| `valid_fn_names` | `Any` | | +| `vlm_img_w` | `Any` | | +| `vlm_img_h` | `Any` | | +| `dynamic_img_size` | `Any` | | +| `env_config` | `Any` | | +| `server_url` | `Any` | | +| `max_step` | `Any` | | +| `max_hist` | `Any` | | +| `task_configs` | `List[Dict[str, Any]]` | | +| `img_h` | `Any` | | +| `img_w` | `Any` | | +| `timeout` | `Any` | | +| `env_id` | `Any` | | +| `uid` | `Any` | | +| `step_count` | `Any` | | +| `done` | `Any` | | +| `prompt` | `Any` | | + +### Methods + +#### CBEnvWorkerClient.reset + +```python +def reset(self) +``` + +#### CBEnvWorkerClient.reset_attempt + +```python +def reset_attempt(self) +``` + +#### CBEnvWorkerClient.prompt_to_input_obs + +```python +def prompt_to_input_obs(self, prompt) +``` + +#### CBEnvWorkerClient.check_and_fix_action + +```python +def check_and_fix_action(self, action_str) +``` + +Parse action string and return (normalized_str, Action object for server). 
+ +#### CBEnvWorkerClient.reward_shaping + +```python +def reward_shaping(self, reward) +``` + +#### CBEnvWorkerClient.check_and_resize_image + +```python +def check_and_resize_image(self, jpg_string) +``` + +#### CBEnvWorkerClient.step + +```python +def step(self, action) +``` + +#### CBEnvWorkerClient.step_attempt + +```python +def step_attempt(self, action) +``` + +#### CBEnvWorkerClient.render + +```python +def render(self) +``` + +Renders the current state in self.prompt as a sequence of text-image pairs into a single image + +**Returns:** PIL.Image: Combined image showing the instruction and interaction history + +--- + +## WorkerHandle + +Handle for a running worker server. + +Attributes: + worker_id: Unique identifier for this worker + port: Port the worker is listening on + process: Subprocess running the worker + api_url: Full URL for API requests + +### Constructor + +```python +WorkerHandle(self, worker_id: str, port: int, process: subprocess.Popen, api_url: str) -> None +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `worker_id` | `str` | | +| `port` | `int` | | +| `process` | `subprocess.Popen` | | +| `api_url` | `str` | | +| `is_running` | `bool` | Check if the worker process is still running. | + +### Methods + +#### WorkerHandle.health_check + +```python +async def health_check(self, timeout: float = 5.0) -> bool +``` + +Check if the worker is healthy. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `timeout` | `Any` | Request timeout in seconds | + +**Returns:** True if healthy, False otherwise + +#### WorkerHandle.stop + +```python +def stop(self) -> None +``` + +Stop the worker process. + +--- + +## WorkerPool + +Context manager for a pool of worker servers. + +Example: + async with WorkerPool(n_workers=4, allowed_ips=["127.0.0.1"]) as pool: + for url in pool.urls: + client = CBEnvWorkerClient(\{ "server_url": url \}) + # Use client... 
+ +### Constructor + +```python +WorkerPool(self, n_workers: int, allowed_ips: List[str], startup_timeout: float = 30.0, host: str = '0.0.0.0') +``` + +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `n_workers` | `Any` | | +| `allowed_ips` | `Any` | | +| `startup_timeout` | `Any` | | +| `host` | `Any` | | +| `workers` | `List[WorkerHandle]` | Get the list of worker handles. | +| `urls` | `List[str]` | Get the list of worker URLs. | + +### Methods + +#### WorkerPool.health_check_all + +```python +async def health_check_all(self) -> dict +``` + +Check health of all workers. + +**Returns:** Dict mapping worker_id to health status + +### cleanup_workers + +```python +async def cleanup_workers(workers: List[WorkerHandle]) -> None +``` + +Stop all workers. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `workers` | `Any` | List of WorkerHandle objects to stop | + +### create_workers + +```python +async def create_workers(n_workers: int, allowed_ips: List[str], startup_timeout: float = 30.0, host: str = '0.0.0.0') -> List[WorkerHandle] +``` + +Spawn N worker servers on automatically allocated free ports. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `n_workers` | `Any` | Number of worker servers to spawn | +| `allowed_ips` | `Any` | List of IPs allowed to access workers | +| `startup_timeout` | `Any` | Max time to wait for each worker to become healthy | +| `host` | `Any` | Host for workers to bind to | + +**Returns:** List of WorkerHandle objects + +**Raises:** + +- `RuntimeError` - If any worker fails to start + +**Example:** + +```python +workers = await create_workers( + n_workers=4, + allowed_ips=["127.0.0.1", "10.0.0.5"], +) +# Each worker manages up to 2 envs, so 4 workers = 8 parallel envs +``` + +--- + +## telemetry + +Telemetry module for cua-bench. + +This module provides analytics for tracking feature usage, user workflows, +and system performance. 
All telemetry is routed through cua-core's PostHog +infrastructure for consistency across the CUA ecosystem. + +Events tracked: +- Tier 1 (Core): command_invoked, task_execution_started, task_evaluation_completed, batch_job_started +- Tier 2 (High Value): task_step_executed, batch_task_completed, dataset_processing_completed, task_execution_failed + +Usage: + from cua_bench.telemetry import record_event, track_command + + # Track CLI command usage + @track_command + def my_command(args): + ... + + # Track custom events + record_event("custom_event", \{"property": "value"\}) + +Environment Variables: + CUA_TELEMETRY_ENABLED: Set to "false" to disable telemetry (default: "true") + CUA_TELEMETRY_DEBUG: Set to "on" for debug logging + +### flush_telemetry + +```python +def flush_telemetry() -> None +``` + +Flush pending telemetry events. + +Delegates to cua-core's PostHog client. + +### is_telemetry_enabled + +```python +def is_telemetry_enabled() -> bool +``` + +Check if telemetry is enabled. + +Delegates to cua-core's telemetry check. + +### record_event + +```python +def record_event(event_name: str, properties: Optional[Dict[str, Any]] = None) -> None +``` + +Record a telemetry event. + +Routes through cua-core's telemetry infrastructure. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `event_name` | `Any` | Name of the event (e.g., "cb_command_invoked") | +| `properties` | `Any` | Optional dict of event properties | + +### track_batch_job_started + +```python +def track_batch_job_started(dataset_name: str, task_count: int, variant_count: int, parallelism: int = 1, agent: Optional[str] = None, model: Optional[str] = None, run_id: Optional[str] = None, provider_type: Optional[str] = None) -> None +``` + +Track batch job start. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `dataset_name` | `Any` | Name of the dataset | +| `task_count` | `Any` | Number of unique tasks | +| `variant_count` | `Any` | Total variants to run | +| `parallelism` | `Any` | Max parallel workers | +| `agent` | `Any` | Agent name if specified | +| `model` | `Any` | Model name if specified | +| `run_id` | `Any` | Run ID for correlation | +| `provider_type` | `Any` | Provider type | + +### track_batch_task_completed + +```python +def track_batch_task_completed(env_name: str, task_index: int, success: bool, reward: Optional[float] = None, total_steps: int = 0, duration_seconds: float = 0, run_id: Optional[str] = None, error: Optional[str] = None) -> None +``` + +Track individual task completion in batch. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `env_name` | `Any` | Name of the environment/task | +| `task_index` | `Any` | Task variant index | +| `success` | `Any` | Whether task succeeded | +| `reward` | `Any` | Reward/score if available | +| `total_steps` | `Any` | Steps taken | +| `duration_seconds` | `Any` | Task duration | +| `run_id` | `Any` | Run ID for correlation | +| `error` | `Any` | Error message if failed | + +### track_command + +```python +def track_command(func: Callable) -> Callable +``` + +Decorator to track command invocation. + +Usage: + @track_command + def cmd_run_task(args): + ... + +### track_command_async + +```python +def track_command_async(func: Callable) -> Callable +``` + +Async decorator to track command invocation. + +### track_command_invoked + +```python +def track_command_invoked(command: str, subcommand: Optional[str] = None, args: Optional[Dict[str, Any]] = None) -> None +``` + +Track CLI command invocation. + +This is the primary event for understanding feature usage. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `command` | `Any` | Main command (e.g., "run", "interact", "trace") | +| `subcommand` | `Any` | Optional subcommand (e.g., "task", "dataset", "list") | +| `args` | `Any` | Optional sanitized arguments (no sensitive data) | + +### track_dataset_processing_completed + +```python +def track_dataset_processing_completed(processor_mode: str, rows_processed: int, duration_seconds: float, success: bool = True, output_format: Optional[str] = None) -> None +``` + +Track dataset processing completion. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `processor_mode` | `Any` | Processing mode (aguvis-stage-1, gui-r1, etc.) | +| `rows_processed` | `Any` | Number of rows processed | +| `duration_seconds` | `Any` | Processing duration | +| `success` | `Any` | Whether processing succeeded | +| `output_format` | `Any` | Output format (disk, hub, jsonl) | + +### track_task_evaluation_completed + +```python +def track_task_evaluation_completed(env_name: str, task_index: int, result: Any, success: bool, total_steps: int, duration_seconds: float, run_id: Optional[str] = None, agent: Optional[str] = None, model: Optional[str] = None) -> None +``` + +Track task evaluation completion. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `env_name` | `Any` | Name of the environment/task | +| `task_index` | `Any` | Task variant index | +| `result` | `Any` | Evaluation result (reward/score) | +| `success` | `Any` | Whether task was successful | +| `total_steps` | `Any` | Total steps taken | +| `duration_seconds` | `Any` | Total duration in seconds | +| `run_id` | `Any` | Run ID for correlation | +| `agent` | `Any` | Agent name if used | +| `model` | `Any` | Model name if used | + +### track_task_execution_failed + +```python +def track_task_execution_failed(env_name: str, task_index: int, error_type: str, error_message: str, stage: str, run_id: Optional[str] = None) -> None +``` + +Track task execution failure. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `env_name` | `Any` | Name of the environment/task | +| `task_index` | `Any` | Task variant index | +| `error_type` | `Any` | Exception class name | +| `error_message` | `Any` | Error message (truncated) | +| `stage` | `Any` | Stage where error occurred | +| `run_id` | `Any` | Run ID for correlation | + +### track_task_execution_started + +```python +def track_task_execution_started(env_name: str, task_index: int, provider_type: Optional[str] = None, os_type: Optional[str] = None, agent: Optional[str] = None, model: Optional[str] = None, max_steps: Optional[int] = None, execution_mode: str = 'single', run_id: Optional[str] = None) -> None +``` + +Track task execution start. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `env_name` | `Any` | Name of the environment/task | +| `task_index` | `Any` | Task variant index | +| `provider_type` | `Any` | Provider type (simulated, webtop, native, computer) | +| `os_type` | `Any` | OS type (linux, windows, android) | +| `agent` | `Any` | Agent name if specified | +| `model` | `Any` | Model name if specified | +| `max_steps` | `Any` | Max steps budget | +| `execution_mode` | `Any` | Execution mode (single, batch, interactive) | +| `run_id` | `Any` | Run ID for correlation | + +### track_task_step_executed + +```python +def track_task_step_executed(action_type: str, step_count: int, duration_ms: Optional[float] = None, run_id: Optional[str] = None) -> None +``` + +Track individual step execution. + +Note: This should be sampled to avoid high event volume. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `action_type` | `Any` | Type of action (ClickAction, TypeAction, etc.) | +| `step_count` | `Any` | Current step number | +| `duration_ms` | `Any` | Step duration in milliseconds | +| `run_id` | `Any` | Run ID for correlation | + +--- + +## apps + +App Registry for cua-bench. + +A decorator-based API for registering platform-specific app installers and launchers. +Makes it easy for contributors to add support for new applications. 
+ +Example - Defining an app: + + # cua_bench/apps/godot.py + from cua_bench.apps import App, install, launch + + class Godot(App): + name = "godot" + description = "Godot game engine" + + @install("linux") + async def install_linux(session, *, with_shortcut=True, version="4.2.1"): + await session.run_command( + f"cd ~/Desktop && " + f"wget -q https://github.com/godotengine/godot/releases/download/\{version\}-stable/Godot_v\{version\}-stable_linux.x86_64.zip && " + f"unzip -q Godot_v\{version\}-stable_linux.x86_64.zip" + ) + if with_shortcut: + await session.run_command( + "ln -sf ~/Desktop/Godot_v*_linux.x86_64 ~/Desktop/Godot" + ) + + @install("windows") + async def install_windows(session, *, with_shortcut=True, version="4.2.1"): + await session.run_command(f"choco install godot --version=\{version\} -y") + + @launch("linux", "windows") + async def launch_editor(session, *, project_path=None): + cmd = "~/Desktop/Godot" if session.os_type == "linux" else "godot" + if project_path: + cmd += f" --editor --path \{project_path\}" + await session.run_command(f"\{cmd\} &") + +Example - Using in a task: + + @cb.setup_task(split="train") + async def start(task_cfg: cb.Task, session: cb.DesktopSession): + # Install app (auto-selects platform) + await session.install_app("godot", with_shortcut=True, version="4.2.1") + + # Launch app + await session.launch_app("godot", project_path="~/project") + +--- + +## App + +Base class for app definitions. + +Subclass this and define platform-specific methods using decorators: + + class MyApp(App): + name = "myapp" + description = "My application" + + @install("linux") + async def install_linux(session, **kwargs): + ... + + @install("windows") + async def install_windows(session, **kwargs): + ... + + @launch("linux", "windows") + async def launch(session, **kwargs): + ... 
+ +### Attributes + +| Name | Type | Description | +|------|------|-------------| +| `name` | `str` | | +| `description` | `str` | | + +### Methods + +#### App.get_method + +```python +def get_method(self, method_type: str, platform: Platform) -> Optional[AppMethod] +``` + +Get a method for the given type and platform. + +#### App.get_install + +```python +def get_install(self, platform: Platform) -> Optional[AppMethod] +``` + +Get the install method for a platform. + +#### App.get_launch + +```python +def get_launch(self, platform: Platform) -> Optional[AppMethod] +``` + +Get the launch method for a platform. + +#### App.get_uninstall + +```python +def get_uninstall(self, platform: Platform) -> Optional[AppMethod] +``` + +Get the uninstall method for a platform. + +#### App.supported_platforms + +```python +def supported_platforms(self, method_type: str = 'install') -> Set[Platform] +``` + +Get platforms supported for a method type. + +--- + +## AppRegistry + +Registry access for DesktopSession integration. + +This class provides the interface used by DesktopSession to install/launch apps. + +### Methods + +#### AppRegistry.install_app + +```python +async def install_app(session: Any, app_name: str, with_shortcut: bool = True, kwargs = {}) -> None +``` + +Install an app on the session's platform. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `session` | `Any` | DesktopSession instance | +| `app_name` | `Any` | Name of the app to install | +| `with_shortcut` | `Any` | Whether to create desktop shortcut (default True). `**kwargs`: Additional app-specific arguments | + +#### AppRegistry.launch_app + +```python +async def launch_app(session: Any, app_name: str, kwargs = {}) -> None +``` + +Launch an app on the session's platform. 
+ +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `session` | `Any` | DesktopSession instance | +| `app_name` | `Any` | Name of the app to launch. `**kwargs`: App-specific launch arguments | + +#### AppRegistry.uninstall_app + +```python +async def uninstall_app(session: Any, app_name: str, kwargs = {}) -> None +``` + +Uninstall an app from the session's platform. + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `session` | `Any` | DesktopSession instance | +| `app_name` | `Any` | Name of the app to uninstall. `**kwargs`: App-specific arguments | + +### get_app + +```python +def get_app(name: str) -> Optional[App] +``` + +Get a registered app by name. + +### list_apps + +```python +def list_apps() -> List[str] +``` + +List all registered app names. diff --git a/docs/content/docs/cuabench/reference/cli-reference.mdx b/docs/content/docs/cuabench/reference/cli-reference.mdx index 5ba667ba..42e838cd 100644 --- a/docs/content/docs/cuabench/reference/cli-reference.mdx +++ b/docs/content/docs/cuabench/reference/cli-reference.mdx @@ -3,6 +3,10 @@ title: CLI Reference description: Complete reference for all cua-bench CLI commands --- +import { VersionBadge } from '@/components/version-selector'; + + + ## Image Commands Create and manage base images for environments. 
diff --git a/docs/content/docs/cuabench/reference/meta.json b/docs/content/docs/cuabench/reference/meta.json index 362465d2..b1238bf8 100644 --- a/docs/content/docs/cuabench/reference/meta.json +++ b/docs/content/docs/cuabench/reference/meta.json @@ -2,5 +2,5 @@ "title": "Reference", "description": "CLI and API reference", "icon": "FileText", - "pages": ["cli-reference", "sdk-reference"] + "pages": ["cli-reference", "api"] } diff --git a/docs/content/docs/cuabot/install.mdx b/docs/content/docs/cuabot/guide/getting-started/installation.mdx similarity index 95% rename from docs/content/docs/cuabot/install.mdx rename to docs/content/docs/cuabot/guide/getting-started/installation.mdx index 49ed5f58..4fcddf5f 100644 --- a/docs/content/docs/cuabot/install.mdx +++ b/docs/content/docs/cuabot/guide/getting-started/installation.mdx @@ -1,6 +1,6 @@ --- title: Installation -description: Install CuaBot and its dependencies +description: Install Cua-Bot and its dependencies --- import { Tab, Tabs } from 'fumadocs-ui/components/tabs'; @@ -41,7 +41,7 @@ import { Callout } from 'fumadocs-ui/components/callout'; # Install Xpra sudo apt install xpra - # Install CuaBot + # Install Cua-Bot npm install -g cuabot ``` diff --git a/docs/content/docs/cuabot/cuabot.mdx b/docs/content/docs/cuabot/guide/getting-started/introduction.mdx similarity index 91% rename from docs/content/docs/cuabot/cuabot.mdx rename to docs/content/docs/cuabot/guide/getting-started/introduction.mdx index e26e514d..e41b0630 100644 --- a/docs/content/docs/cuabot/cuabot.mdx +++ b/docs/content/docs/cuabot/guide/getting-started/introduction.mdx @@ -1,10 +1,15 @@ --- -title: CuaBot +title: Introduction description: Multi-user computing with AI ---
- cuabot screenshot + cuabot screenshot
**Multi-user computing with AI** @@ -107,7 +112,7 @@ cuabot --help # Show help ## How It Works -CuaBot runs a Docker container with Xpra, streaming individual application windows to your desktop. Your chosen agent runs inside the container with: +Cua-Bot runs a Docker container with Xpra, streaming individual application windows to your desktop. Your chosen agent runs inside the container with: - Full Ubuntu 22.04 environment - Pre-installed: Node.js, Python, browsers, dev tools @@ -117,6 +122,7 @@ CuaBot runs a Docker container with Xpra, streaming individual application windo ## Configuration Config files are stored in `~/.cuabot/`: + - `settings.json` - Default agent and preferences - `server.pid` / `server..pid` - Server process ID - `server.port` / `server..port` - Server port number diff --git a/docs/content/docs/cuabot/guide/getting-started/meta.json b/docs/content/docs/cuabot/guide/getting-started/meta.json new file mode 100644 index 00000000..4e1aaf13 --- /dev/null +++ b/docs/content/docs/cuabot/guide/getting-started/meta.json @@ -0,0 +1,7 @@ +{ + "title": "Getting Started", + "description": "Get started with Cua-Bot", + "icon": "Rocket", + "defaultOpen": true, + "pages": ["introduction", "installation"] +} diff --git a/docs/content/docs/cuabot/guide/meta.json b/docs/content/docs/cuabot/guide/meta.json new file mode 100644 index 00000000..5f712fc7 --- /dev/null +++ b/docs/content/docs/cuabot/guide/meta.json @@ -0,0 +1,6 @@ +{ + "title": "Guide", + "description": "Learn how to use Cua-Bot", + "icon": "Book", + "pages": ["getting-started"] +} diff --git a/docs/content/docs/cuabot/meta.json b/docs/content/docs/cuabot/meta.json index ddbac8ba..2e99986a 100644 --- a/docs/content/docs/cuabot/meta.json +++ b/docs/content/docs/cuabot/meta.json @@ -1,5 +1,5 @@ { - "title": "CuaBot", + "title": "Cua-Bot", "description": "Co-op computer-use for any agent", - "pages": ["cuabot", "install"] + "pages": ["guide", "reference"] } diff --git 
a/docs/content/docs/cuabot/reference/changelog.mdx b/docs/content/docs/cuabot/reference/changelog.mdx new file mode 100644 index 00000000..51a2247c --- /dev/null +++ b/docs/content/docs/cuabot/reference/changelog.mdx @@ -0,0 +1,132 @@ +--- +title: Changelog +description: Release history for Cua-Bot +--- + +{/* +AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY +Generated by: npx tsx scripts/docs-generators/generate-changelog.ts +Last updated: 2026-02-09 +*/} + +# Cua-Bot Changelog + +All notable changes to the Cua-Bot are documented here. + +## 1.0.x + +### v1.0.13 (2026-02-05) + +- Bump cuabot to v1.0.13 by @github-actions[bot] + +Documentation + +- [Getting Started](https://cua.ai/docs/cuabot/cuabot) +- [Installation Guide](https://cua.ai/docs/cuabot/install) + +### v1.0.12 (2026-02-05) + +- Bump cuabot to v1.0.12 by @github-actions[bot] +- fix cursor blocking input, remove start command override for cuabot ([#1020](https://github.com/trycua/cua/pull/1020)) by @ddupont808 + +Documentation + +- [Getting Started](https://cua.ai/docs/cuabot/cuabot) +- [Installation Guide](https://cua.ai/docs/cuabot/install) + +### v1.0.11 (2026-02-04) + +- Bump cuabot to v1.0.11 by @github-actions[bot] + +Documentation + +- [Getting Started](https://cua.ai/docs/cuabot/cuabot) +- [Installation Guide](https://cua.ai/docs/cuabot/install) + +### v1.0.10 (2026-02-04) + +- Bump cuabot to v1.0.10 by @github-actions[bot] + +Documentation + +- [Getting Started](https://cua.ai/docs/cuabot/cuabot) +- [Installation Guide](https://cua.ai/docs/cuabot/install) + +### v1.0.9 (2026-02-04) + +- Bump cuabot to v1.0.9 by @github-actions[bot] +- Fix onboarding step missing on Windows over npx/pnpx + +Documentation + +- [Getting Started](https://cua.ai/docs/cuabot/cuabot) +- [Installation Guide](https://cua.ai/docs/cuabot/install) + +### v1.0.8 (2026-02-04) + +- Bump cuabot to v1.0.8 by @github-actions[bot] +- Add onboarding debug info and fix Xpra window detection on Windows 
([#1014](https://github.com/trycua/cua/pull/1014)) by @ddupont808 + +Documentation + +- [Getting Started](https://cua.ai/docs/cuabot/cuabot) +- [Installation Guide](https://cua.ai/docs/cuabot/install) + +### v1.0.7 (2026-02-04) + +- Bump cuabot to v1.0.7 by @github-actions[bot] + +Documentation + +- [Getting Started](https://cua.ai/docs/cuabot/cuabot) +- [Installation Guide](https://cua.ai/docs/cuabot/install) + +### v1.0.6 (2026-02-04) + +- Bump cuabot to v1.0.6 by @github-actions[bot] +- Bugfixes for Windows + +Documentation + +- [Getting Started](https://cua.ai/docs/cuabot/cuabot) +- [Installation Guide](https://cua.ai/docs/cuabot/install) + +### v1.0.5 (2026-02-04) + +- Bump cuabot to v1.0.5 by @github-actions[bot] +- [Cuabot] add y to npx ([#1009](https://github.com/trycua/cua/pull/1009)) by @ddupont808 +- chore: bump cuabot container to 1.0.4 by @github-actions[bot] + +Documentation + +- [Getting Started](https://cua.ai/docs/cuabot/cuabot) +- [Installation Guide](https://cua.ai/docs/cuabot/install) + +### v1.0.4 (2026-02-04) + +- Initial release or no path-specific changes found + +Documentation + +- [Getting Started](https://cua.ai/docs/cuabot/cuabot) +- [Installation Guide](https://cua.ai/docs/cuabot/install) + +### v1.0.3 (2026-02-04) + +- Bump cuabot to v1.0.3 by @github-actions[bot] +- fix cuabot publish 3 ([#1003](https://github.com/trycua/cua/pull/1003)) by @ddupont808 +- fix cuabot publish 2 ([#1002](https://github.com/trycua/cua/pull/1002)) by @ddupont808 + +Documentation + +- [Getting Started](https://cua.ai/docs/cuabot/cuabot) +- [Installation Guide](https://cua.ai/docs/cuabot/install) + +### v1.0.2 (2026-02-04) + +- fix cuabot publish ([#1001](https://github.com/trycua/cua/pull/1001)) by @ddupont808 +- Bump cuabot to v1.0.2 by @github-actions[bot] + +Documentation + +See [cua.ai/docs/cuabot](https://cua.ai/docs/cuabot) diff --git a/docs/content/docs/cuabot/reference/index.mdx b/docs/content/docs/cuabot/reference/index.mdx new file mode 100644 
index 00000000..90eacfd3 --- /dev/null +++ b/docs/content/docs/cuabot/reference/index.mdx @@ -0,0 +1,295 @@ +--- +title: API Reference +description: TypeScript API reference for the Cua-Bot sandboxed agent framework +--- + +{/* + AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY + Generated by: npx tsx scripts/docs-generators/typescript-sdk.ts + Source: libs/cuabot/src + Version: 1.0.13 +*/} + +import { Callout } from 'fumadocs-ui/components/callout'; +import { VersionHeader } from '@/components/version-selector'; + + + +--- + +## client + +CuaBot Server Client +Connects to the CuaBot server via HTTP + +### CuaBotClient + +#### Constructor + +```typescript +new CuaBotClient(port: number) +``` + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `port` | `number` | | + +#### Methods + +##### CuaBotClient.for + +```typescript +for(): Promise< +``` + +**Returns:** `Promise<` + +##### CuaBotClient.request + +```typescript +request(): Promise +``` + +**Returns:** `Promise<number | null>` + +##### CuaBotClient.status + +```typescript +status(): Promise< +``` + +**Returns:** `Promise<` + +##### CuaBotClient.request + +```typescript +request(): Promise +``` + +**Returns:** `Promise<string>` + +##### CuaBotClient.request + +```typescript +request(): Promise +``` + +##### CuaBotClient.request + +```typescript +request(): Promise +``` + +##### CuaBotClient.request + +```typescript +request(): Promise +``` + +##### CuaBotClient.request + +```typescript +request(): Promise +``` + +##### CuaBotClient.request + +```typescript +request(): Promise +``` + +##### CuaBotClient.request + +```typescript +request(): Promise +``` + +##### CuaBotClient.request + +```typescript +request(): Promise +``` + +##### CuaBotClient.request + +```typescript +request(): Promise +``` + +##### CuaBotClient.request + +```typescript +request(): Promise +``` + +##### CuaBotClient.request + +```typescript +request(): Promise +``` + +##### CuaBotClient.request + +```typescript 
+request(): Promise +``` + +### setSessionName + +```typescript +function setSessionName(name: string | null): void +``` + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `name` | `string | null` | | + +### getSessionName + +```typescript +function getSessionName(): string | null +``` + +**Returns:** `string | null` + +### isServerRunning + +```typescript +async function isServerRunning(): Promise< +``` + +**Returns:** `Promise<` + +### ensureServerRunning + +```typescript +async function ensureServerRunning(): Promise +``` + +**Returns:** `Promise<number>` + +--- + +## settings + +CuaBot Settings Management + +### Settings + +```typescript +interface Settings { + defaultAgent?: string; + telemetryEnabled?: boolean; + aliasIgnored?: boolean; +} +``` + +| Property | Type | Description | +|----------|------|-------------| +| `defaultAgent` | `string` | *(optional)* | +| `telemetryEnabled` | `boolean` | *(optional)* | +| `aliasIgnored` | `boolean` | *(optional)* | + +### AGENTS + +```typescript +const AGENTS: const +``` + +### loadSettings + +```typescript +function loadSettings(): Settings +``` + +**Returns:** `Settings` + +### saveSettings + +```typescript +function saveSettings(settings: Settings): void +``` + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `settings` | `Settings` | | + +### getDefaultAgent + +```typescript +function getDefaultAgent(): string | undefined +``` + +**Returns:** `string | undefined` + +### setDefaultAgent + +```typescript +function setDefaultAgent(agent: string): void +``` + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `agent` | `string` | | + +### getTelemetryEnabled + +```typescript +function getTelemetryEnabled(): boolean +``` + +**Returns:** `boolean` + +### isTelemetryConfigured + +```typescript +function isTelemetryConfigured(): boolean +``` + +**Returns:** `boolean` + +### setTelemetryEnabled + +```typescript +function 
setTelemetryEnabled(enabled: boolean): void +``` + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `enabled` | `boolean` | | + +### getAliasIgnored + +```typescript +function getAliasIgnored(): boolean +``` + +**Returns:** `boolean` + +### setAliasIgnored + +```typescript +function setAliasIgnored(ignored: boolean): void +``` + +**Parameters:** + +| Name | Type | Description | +|------|------|-------------| +| `ignored` | `boolean` | | diff --git a/docs/content/docs/cuabot/reference/meta.json b/docs/content/docs/cuabot/reference/meta.json new file mode 100644 index 00000000..6680696f --- /dev/null +++ b/docs/content/docs/cuabot/reference/meta.json @@ -0,0 +1,6 @@ +{ + "title": "Reference", + "description": "API and CLI reference for Cua-Bot", + "icon": "FileText", + "pages": ["changelog"] +} diff --git a/docs/content/docs/lume/examples/claude-code/homebrew-testing.mdx b/docs/content/docs/lume/examples/claude-code/homebrew-testing.mdx index 13bf5217..bbb25828 100644 --- a/docs/content/docs/lume/examples/claude-code/homebrew-testing.mdx +++ b/docs/content/docs/lume/examples/claude-code/homebrew-testing.mdx @@ -190,11 +190,11 @@ You: Install jq, uninstall it, and verify it's completely removed with no leftov ## Why this matters -| Testing Approach | Clean Environment | Isolated | Resettable | -|-----------------|-------------------|----------|------------| -| Host machine | No | No | No | -| Docker | Yes | Yes | Yes (but no macOS) | -| **macOS VM** | **Yes** | **Yes** | **Yes** | +| Testing Approach | Clean Environment | Isolated | Resettable | +| ---------------- | ----------------- | -------- | ------------------ | +| Host machine | No | No | No | +| Docker | Yes | Yes | Yes (but no macOS) | +| **macOS VM** | **Yes** | **Yes** | **Yes** | Homebrew is macOS-specific. Testing Homebrew formulas requires macOS, and testing them properly requires isolation. 
diff --git a/docs/content/docs/lume/examples/claude-code/sandbox.mdx b/docs/content/docs/lume/examples/claude-code/sandbox.mdx index b332c6bc..2038a826 100644 --- a/docs/content/docs/lume/examples/claude-code/sandbox.mdx +++ b/docs/content/docs/lume/examples/claude-code/sandbox.mdx @@ -89,13 +89,15 @@ Then tell Claude: > Install any dependencies you need, run builds, execute tests—all inside that VM." Claude Code will: + 1. SSH into the sandbox VM 2. Navigate to your shared project 3. Run commands in the isolated environment 4. Report results back to you -Claude Code runs on your host but executes commands via SSH inside the VM. Your host system stays untouched—only the sandbox and shared files are affected. + Claude Code runs on your host but executes commands via SSH inside the VM. Your host system stays + untouched—only the sandbox and shared files are affected. ## Example session @@ -157,18 +159,19 @@ echo "Then tell Claude to SSH into lume@$VM_IP" ## Security considerations -The shared folder is read/write. Claude can modify or delete files in that folder. For sensitive work: -- Share a copy of your project, not the original -- Use git to track changes and revert if needed -- Clone the VM before risky operations + The shared folder is read/write. Claude can modify or delete files in that folder. 
For sensitive + work: share a copy of your project, not the original; use git to track changes and revert if + needed; and clone the VM before risky operations. The VM itself is fully isolated—code running inside cannot: + - Access files outside the shared folder - Modify your host system - Read your host's environment variables or credentials But it can: + - Access the network - Install software inside the VM - Consume CPU/memory diff --git a/docs/content/docs/lume/examples/claude-cowork/numbers-stock-analysis.mdx b/docs/content/docs/lume/examples/claude-cowork/numbers-stock-analysis.mdx index 4e12530d..8680aa5a 100644 --- a/docs/content/docs/lume/examples/claude-cowork/numbers-stock-analysis.mdx +++ b/docs/content/docs/lume/examples/claude-cowork/numbers-stock-analysis.mdx @@ -159,11 +159,11 @@ Try these related tasks: This example shows why macOS sandboxes are valuable: -| Approach | Can use Numbers.app? | Native macOS automation? | -|----------|---------------------|-------------------------| -| Docker | No | No | -| Linux VM | No | No | -| **macOS VM** | **Yes** | **Yes** | +| Approach | Can use Numbers.app? | Native macOS automation? | +| ------------ | -------------------- | ------------------------ | +| Docker | No | No | +| Linux VM | No | No | +| **macOS VM** | **Yes** | **Yes** | Tasks requiring Numbers, Keynote, Pages, or other macOS-only apps need a macOS environment. Lume provides that in an isolated sandbox. diff --git a/docs/content/docs/lume/examples/claude-cowork/sandbox.mdx b/docs/content/docs/lume/examples/claude-cowork/sandbox.mdx index 865c3b25..6096aea9 100644 --- a/docs/content/docs/lume/examples/claude-cowork/sandbox.mdx +++ b/docs/content/docs/lume/examples/claude-cowork/sandbox.mdx @@ -53,7 +53,7 @@ Add the Lume MCP server to your Claude Desktop config: ``` -If `lume` isn't in your PATH, use the full path: `/Users/yourname/.local/bin/lume` + If `lume` isn't in your PATH, use the full path: `/Users/yourname/.local/bin/lume` ### 2. 
Restart Claude Desktop @@ -70,15 +70,15 @@ lume create sandbox --os macos --ipsw latest --unattended tahoe ## Available MCP tools -| Tool | Description | -|------|-------------| -| `lume_list_vms` | List all VMs with status and IP addresses | -| `lume_get_vm` | Get detailed VM info | -| `lume_run_vm` | Start a VM with optional shared directory | -| `lume_stop_vm` | Stop a running VM | -| `lume_clone_vm` | Clone a VM (for golden images) | -| `lume_delete_vm` | Delete a VM | -| `lume_exec` | Execute commands inside the VM | +| Tool | Description | +| ---------------- | ----------------------------------------- | +| `lume_list_vms` | List all VMs with status and IP addresses | +| `lume_get_vm` | Get detailed VM info | +| `lume_run_vm` | Start a VM with optional shared directory | +| `lume_stop_vm` | Stop a running VM | +| `lume_clone_vm` | Clone a VM (for golden images) | +| `lume_delete_vm` | Delete a VM | +| `lume_exec` | Execute commands inside the VM | ## Example workflow @@ -150,12 +150,12 @@ VM 'sandbox' started from clean state. ## Advantages over SSH -| Feature | SSH (Claude Code) | MCP (Claude Cowork) | -|---------|------------------|---------------------| -| VM lifecycle | Manual (`lume run/stop`) | Automated via tools | -| IP lookup | Manual (`lume get`) | Automatic | -| Credentials | User provides | Configured once | -| Reset workflow | Script required | Natural language | +| Feature | SSH (Claude Code) | MCP (Claude Cowork) | +| -------------- | ------------------------ | ------------------- | +| VM lifecycle | Manual (`lume run/stop`) | Automated via tools | +| IP lookup | Manual (`lume get`) | Automatic | +| Credentials | User provides | Configured once | +| Reset workflow | Script required | Natural language | With MCP, Cowork can manage the entire VM lifecycle through conversation. 
diff --git a/docs/content/docs/lume/examples/index.mdx b/docs/content/docs/lume/examples/index.mdx index 5fd35906..d16ebdfe 100644 --- a/docs/content/docs/lume/examples/index.mdx +++ b/docs/content/docs/lume/examples/index.mdx @@ -4,7 +4,7 @@ description: Step-by-step tutorials and use cases for Lume --- import { Card, Cards } from 'fumadocs-ui/components/card'; -import { Bot, Terminal, MessageCircle } from 'lucide-react'; +import { Bot, Terminal } from 'lucide-react'; # Examples @@ -23,12 +23,6 @@ Explore real-world examples and tutorials for using Lume to create and manage ma title="Claude Cowork" description="Use Claude Cowork with Lume MCP connector" /> - } - href="/lume/examples/openclaw" - title="OpenClaw" - description="Run OpenClaw messaging gateway with iMessage support" - /> ## Available Examples @@ -47,12 +41,6 @@ Use Claude Cowork with the Lume MCP connector for native VM management: - **[Sandbox with MCP](/lume/examples/claude-cowork/sandbox)** — Configure the MCP connector for VM lifecycle control - **[Stock Analysis with Numbers](/lume/examples/claude-cowork/numbers-stock-analysis)** — Use macOS-only apps like Numbers.app -### OpenClaw - -Run a unified messaging gateway that bridges WhatsApp, Telegram, iMessage, and more with AI agents: - -- **[OpenClaw Setup](/lume/examples/openclaw)** — Install and configure OpenClaw in a headless macOS VM with iMessage support - ## Coming Soon - **CI/CD with GitHub Actions** - Run macOS builds and tests in ephemeral VMs diff --git a/docs/content/docs/lume/examples/meta.json b/docs/content/docs/lume/examples/meta.json index 72804e2d..82acebbe 100644 --- a/docs/content/docs/lume/examples/meta.json +++ b/docs/content/docs/lume/examples/meta.json @@ -2,5 +2,5 @@ "title": "Examples", "description": "Step-by-step tutorials and use cases", "icon": "Blocks", - "pages": ["claude-code", "claude-cowork", "openclaw"] + "pages": ["claude-code", "claude-cowork"] } diff --git 
a/docs/content/docs/lume/guide/advanced/http-server.mdx b/docs/content/docs/lume/guide/advanced/http-server.mdx index 550546ab..009456d6 100644 --- a/docs/content/docs/lume/guide/advanced/http-server.mdx +++ b/docs/content/docs/lume/guide/advanced/http-server.mdx @@ -16,7 +16,8 @@ lume serve --port 8080 ``` -If you installed Lume with default settings, the server runs as a background service on port 7777. No need to start it manually. + If you installed Lume with default settings, the server runs as a background service on port 7777. + No need to start it manually. ## Base URL @@ -66,6 +67,7 @@ curl -X POST http://localhost:7777/lume/vms \ ``` Response: + ```json { "message": "VM creation started", diff --git a/docs/content/docs/lume/guide/advanced/lumier/building-lumier.mdx b/docs/content/docs/lume/guide/advanced/lumier/building-lumier.mdx index 178b8a8b..81a80384 100644 --- a/docs/content/docs/lume/guide/advanced/lumier/building-lumier.mdx +++ b/docs/content/docs/lume/guide/advanced/lumier/building-lumier.mdx @@ -27,11 +27,11 @@ docker run -it --rm \ ## Customization points -| Location | What to customize | -|----------|-------------------| -| `Dockerfile` | Base image, installed packages | -| `/run/hooks/` | Scripts that run during VM lifecycle | -| `/run/config/constants.sh` | Default settings | +| Location | What to customize | +| -------------------------- | ------------------------------------ | +| `Dockerfile` | Base image, installed packages | +| `/run/hooks/` | Scripts that run during VM lifecycle | +| `/run/config/constants.sh` | Default settings | ### Change the base image diff --git a/docs/content/docs/lume/guide/advanced/lumier/docker-compose.mdx b/docs/content/docs/lume/guide/advanced/lumier/docker-compose.mdx index 17f415ac..bf7ffee8 100644 --- a/docs/content/docs/lume/guide/advanced/lumier/docker-compose.mdx +++ b/docs/content/docs/lume/guide/advanced/lumier/docker-compose.mdx @@ -53,11 +53,11 @@ Access at `http://localhost:8006`. 
## Configuration options -| Field | Description | -|-------|-------------| +| Field | Description | +| ------------------------- | ------------------------------------ | | `restart: unless-stopped` | Auto-restart unless manually stopped | -| `stop_grace_period: 2m` | Wait 2 minutes for graceful shutdown | -| `stop_signal: SIGINT` | Send interrupt signal to stop | +| `stop_grace_period: 2m` | Wait 2 minutes for graceful shutdown | +| `stop_signal: SIGINT` | Send interrupt signal to stop | ## Multiple VMs diff --git a/docs/content/docs/lume/guide/advanced/lumier/docker.mdx b/docs/content/docs/lume/guide/advanced/lumier/docker.mdx index e22cf749..3ea78348 100644 --- a/docs/content/docs/lume/guide/advanced/lumier/docker.mdx +++ b/docs/content/docs/lume/guide/advanced/lumier/docker.mdx @@ -22,7 +22,7 @@ docker run -it --rm \ Open `http://localhost:8006` in your browser. -In ephemeral mode, changes are lost when you stop the container. See below for persistent storage. + In ephemeral mode, changes are lost when you stop the container. See below for persistent storage. 
## Persistent storage @@ -90,19 +90,20 @@ chmod +x shared/lifecycle/on-logon.sh ``` The script runs as the `lume` user and can access: + - `/Users/lume` — home directory - `/Volumes/My Shared Files` — shared folder ## Environment variables -| Variable | Description | Example | -|----------|-------------|---------| -| `VM_NAME` | Name for the VM | `my-vm` | -| `VERSION` | macOS image to use | `ghcr.io/trycua/macos-sequoia-cua:latest` | -| `CPU_CORES` | CPU cores | `4` | -| `RAM_SIZE` | Memory in MB | `8192` | -| `HOST_STORAGE_PATH` | Persistent storage path | `$(pwd)/storage` | -| `HOST_SHARED_PATH` | Shared folder path | `$(pwd)/shared` | +| Variable | Description | Example | +| ------------------- | ----------------------- | ----------------------------------------- | +| `VM_NAME` | Name for the VM | `my-vm` | +| `VERSION` | macOS image to use | `ghcr.io/trycua/macos-sequoia-cua:latest` | +| `CPU_CORES` | CPU cores | `4` | +| `RAM_SIZE` | Memory in MB | `8192` | +| `HOST_STORAGE_PATH` | Persistent storage path | `$(pwd)/storage` | +| `HOST_SHARED_PATH` | Shared folder path | `$(pwd)/shared` | ## Change the port diff --git a/docs/content/docs/lume/guide/advanced/lumier/index.mdx b/docs/content/docs/lume/guide/advanced/lumier/index.mdx index eb9f230d..ae754c9d 100644 --- a/docs/content/docs/lume/guide/advanced/lumier/index.mdx +++ b/docs/content/docs/lume/guide/advanced/lumier/index.mdx @@ -21,17 +21,18 @@ Open `http://localhost:8006` in your browser—you'll see a macOS desktop. ## When to use Lumier vs Lume directly -| Use Lumier when... | Use Lume directly when... | -|--------------------|---------------------------| -| You want browser-based VNC access | You prefer native VNC clients | -| You're distributing VM environments | You're running locally for development | -| You want Docker Compose orchestration | You need maximum performance | -| You're building Docker-based workflows | You want full CLI control | +| Use Lumier when... | Use Lume directly when... 
| +| -------------------------------------- | -------------------------------------- | +| You want browser-based VNC access | You prefer native VNC clients | +| You're distributing VM environments | You're running locally for development | +| You want Docker Compose orchestration | You need maximum performance | +| You're building Docker-based workflows | You want full CLI control | ## How it works -Docker is the delivery mechanism, not an isolation layer. Lumier connects to the Lume service on your host Mac to create real VMs using Apple's Virtualization Framework. + Docker is the delivery mechanism, not an isolation layer. Lumier connects to the Lume service on + your host Mac to create real VMs using Apple's Virtualization Framework. ``` @@ -53,6 +54,7 @@ Docker is the delivery mechanism, not an isolation layer. Lumier connects to the ``` The Docker container provides: + - noVNC server for browser access - Environment variable configuration - Lifecycle hooks for automation diff --git a/docs/content/docs/lume/guide/advanced/lumier/installation.mdx b/docs/content/docs/lume/guide/advanced/lumier/installation.mdx index 2def7c83..b8b8cf2d 100644 --- a/docs/content/docs/lume/guide/advanced/lumier/installation.mdx +++ b/docs/content/docs/lume/guide/advanced/lumier/installation.mdx @@ -32,7 +32,8 @@ curl http://localhost:7777/lume/vms ``` -Lume runs as a background service on port 7777. Lumier connects to this service to create and manage VMs. + Lume runs as a background service on port 7777. Lumier connects to this service to create and + manage VMs. 
## Verify everything works diff --git a/docs/content/docs/lume/guide/advanced/mcp-server.mdx b/docs/content/docs/lume/guide/advanced/mcp-server.mdx index e521f646..a8de20ef 100644 --- a/docs/content/docs/lume/guide/advanced/mcp-server.mdx +++ b/docs/content/docs/lume/guide/advanced/mcp-server.mdx @@ -58,7 +58,8 @@ Or manually edit `~/Library/Application Support/Claude/claude_desktop_config.jso ``` -If editing manually, replace `/Users/yourname/.local/bin/lume` with the actual path to your lume binary. You can find it by running `which lume` in your terminal. + If editing manually, replace `/Users/yourname/.local/bin/lume` with the actual path to your lume + binary. You can find it by running `which lume` in your terminal. ### 2. Restart Claude Desktop @@ -68,6 +69,7 @@ Claude will now have access to Lume's VM management tools. ### 3. Try it out Ask Claude: + - "List my VMs" - "Start the sandbox VM" - "Run `ls -la` in the sandbox VM" @@ -75,35 +77,35 @@ Ask Claude: ## Available tools -| Tool | Description | -|------|-------------| -| `lume_create_vm` | Create a new macOS VM (async, returns immediately) | -| `lume_list_vms` | List all VMs with status, IP addresses, and resource allocation | -| `lume_get_vm` | Get detailed VM info (IP, VNC URL, SSH availability) | -| `lume_run_vm` | Start a VM with optional shared directory | -| `lume_stop_vm` | Stop a running VM gracefully | -| `lume_clone_vm` | Clone a VM (useful for golden images) | -| `lume_delete_vm` | Delete a VM and its files | -| `lume_exec` | Execute commands inside a VM via SSH | +| Tool | Description | +| ---------------- | --------------------------------------------------------------- | +| `lume_create_vm` | Create a new macOS VM (async, returns immediately) | +| `lume_list_vms` | List all VMs with status, IP addresses, and resource allocation | +| `lume_get_vm` | Get detailed VM info (IP, VNC URL, SSH availability) | +| `lume_run_vm` | Start a VM with optional shared directory | +| `lume_stop_vm` | Stop 
a running VM gracefully | +| `lume_clone_vm` | Clone a VM (useful for golden images) | +| `lume_delete_vm` | Delete a VM and its files | +| `lume_exec` | Execute commands inside a VM via SSH | ## Resources The MCP server provides documentation resources that AI agents can read: -| Resource | Description | -|----------|-------------| +| Resource | Description | +| -------------------- | ------------------------------------------------------------------------ | | `lume://usage-guide` | Comprehensive guide with workflows, best practices, and status reference | -| `lume://credentials` | Default SSH credentials for VMs created with unattended setup | +| `lume://credentials` | Default SSH credentials for VMs created with unattended setup | ## Prompts Pre-defined workflow prompts guide AI agents through common tasks: -| Prompt | Description | -|--------|-------------| -| `create-sandbox` | Create a new macOS VM with unattended setup | -| `run-in-sandbox` | Execute a command in an existing VM | -| `reset-sandbox` | Reset a sandbox by cloning from a golden image | +| Prompt | Description | +| ---------------- | ---------------------------------------------- | +| `create-sandbox` | Create a new macOS VM with unattended setup | +| `run-in-sandbox` | Execute a command in an existing VM | +| `reset-sandbox` | Reset a sandbox by cloning from a golden image | ## Creating VMs @@ -111,15 +113,15 @@ The `lume_create_vm` tool creates new macOS VMs asynchronously. 
Since VM creatio ### Parameters -| Parameter | Required | Default | Description | -|-----------|----------|---------|-------------| -| `name` | Yes | - | Name for the new VM | -| `ipsw` | No | `latest` | IPSW path or "latest" to download | -| `unattended` | No | - | Preset name (e.g., "tahoe", "sequoia") for automatic setup | -| `cpu` | No | 4 | Number of CPU cores | -| `memory` | No | `8GB` | Memory size (e.g., "8GB", "16GB") | -| `disk_size` | No | `64GB` | Disk size (e.g., "64GB", "128GB") | -| `storage` | No | - | Storage location name or path | +| Parameter | Required | Default | Description | +| ------------ | -------- | -------- | ---------------------------------------------------------- | +| `name` | Yes | - | Name for the new VM | +| `ipsw` | No | `latest` | IPSW path or "latest" to download | +| `unattended` | No | - | Preset name (e.g., "tahoe", "sequoia") for automatic setup | +| `cpu` | No | 4 | Number of CPU cores | +| `memory` | No | `8GB` | Memory size (e.g., "8GB", "16GB") | +| `disk_size` | No | `64GB` | Disk size (e.g., "64GB", "128GB") | +| `storage` | No | - | Storage location name or path | ### Provisioning status @@ -133,6 +135,7 @@ test stopped - ``` The operation progresses through: + 1. `ipsw_install` — Downloading and installing macOS 2. `unattended_setup` — Running Setup Assistant automation (if `unattended` specified) @@ -171,6 +174,7 @@ brew install hudochenkov/sshpass/sshpass ``` For VMs created with `--unattended tahoe`: + - Username: `lume` - Password: `lume` @@ -208,12 +212,12 @@ Tests completed! 
Here are the results: ## MCP vs HTTP API -| Feature | MCP Server | HTTP Server | -|---------|-----------|-------------| -| Command | `lume serve --mcp` | `lume serve` | -| Transport | stdio (spawned by client) | HTTP (port 7777) | -| Use case | AI agent integration | Scripts, CI/CD, SDKs | -| Lifecycle | On-demand, client-managed | Background daemon | +| Feature | MCP Server | HTTP Server | +| --------- | ------------------------- | -------------------- | +| Command | `lume serve --mcp` | `lume serve` | +| Transport | stdio (spawned by client) | HTTP (port 7777) | +| Use case | AI agent integration | Scripts, CI/CD, SDKs | +| Lifecycle | On-demand, client-managed | Background daemon | Use MCP for AI agent integration. Use HTTP for programmatic access from scripts and applications. @@ -226,6 +230,7 @@ npx @modelcontextprotocol/inspector lume serve --mcp ``` This opens a web UI where you can: + - See all available tools - Test tool calls interactively - Inspect request/response payloads @@ -276,15 +281,18 @@ New VMs created with `--unattended tahoe` have SSH enabled automatically. ## Security considerations -The MCP server gives AI agents direct control over your VMs. Only use it with trusted AI applications. + The MCP server gives AI agents direct control over your VMs. Only use it with trusted AI + applications. 
The MCP server: + - Runs locally (stdio transport, no network exposure) - Has full access to VM lifecycle operations - Can execute commands inside VMs with SSH access Consider: + - Using dedicated sandbox VMs for AI agent tasks - Cloning VMs before letting agents modify them - Reviewing agent actions in Claude's interface diff --git a/docs/content/docs/lume/guide/fundamentals/unattended-setup.mdx b/docs/content/docs/lume/guide/fundamentals/unattended-setup.mdx index 632093ca..4a5d5aad 100644 --- a/docs/content/docs/lume/guide/fundamentals/unattended-setup.mdx +++ b/docs/content/docs/lume/guide/fundamentals/unattended-setup.mdx @@ -14,7 +14,8 @@ lume create my-vm --os macos --ipsw latest --unattended tahoe This creates a VM, installs macOS, and runs through the entire Setup Assistant. When it's done, you have a configured VM with user `lume` (password `lume`) and SSH enabled. -Unattended configs are macOS version-specific. The `tahoe` preset works with macOS Tahoe (15.x). Setup Assistant changes between versions, so configs may need updating for new releases. + Unattended configs are macOS version-specific. The `tahoe` preset works with macOS Tahoe (26.x). + Setup Assistant changes between versions, so configs may need updating for new releases. 
## Prerequisites @@ -72,15 +73,15 @@ Screenshots go to `/tmp/unattended-` by default, or specify `--debug-dir / A config has three parts: ```yaml -boot_wait: 30 # Seconds to wait after boot +boot_wait: 30 # Seconds to wait after boot -boot_commands: # Sequence of automation commands +boot_commands: # Sequence of automation commands - "" - "" - "" - - "" + - '' -health_check: # Optional verification +health_check: # Optional verification type: ssh user: lume password: lume @@ -91,20 +92,21 @@ health_check: # Optional verification ### Wait for text (OCR) ```yaml -- "" # Wait up to 120s (default) -- "" # Custom timeout +- "" # Wait up to 120s (default) +- "" # Custom timeout ``` ### Click text ```yaml -- "" # Click first occurrence -- "" # Click last occurrence -- "" # Click 50px right of text -- "" # Click exact coordinates +- "" # Click first occurrence +- "" # Click last occurrence +- "" # Click 50px right of text +- '' # Click exact coordinates ``` When text appears multiple times (like "Agree" in license text and button), use `index`: + - `index=0` — first (top) - `index=-1` — last (bottom) @@ -131,12 +133,12 @@ When text appears multiple times (like "Agree" in license text and button), use ### Hotkey combinations ```yaml -- "" # Spotlight -- "" # Copy -- "" # Paste -- "" # Quit -- "" # Screenshot -- "" +- '' # Spotlight +- '' # Copy +- '' # Paste +- '' # Quit +- '' # Screenshot +- '' ``` Modifiers: `cmd`/`command`/`super`, `shift`, `alt`/`option`, `ctrl`/`control` @@ -144,8 +146,8 @@ Modifiers: `cmd`/`command`/`super`, `shift`, `alt`/`option`, `ctrl`/`control` ### Delays ```yaml -- "" # Wait 2 seconds -- "" # Decimals work +- '' # Wait 2 seconds +- '' # Decimals work ``` ## Example: Complete Setup Assistant @@ -157,30 +159,30 @@ boot_wait: 30 boot_commands: # Dismiss greeting - - "" - - "" - - "" + - '' + - '' + - '' # Language - "" - "" - - "" - - "" - - "" + - '' + - '' + - '' # Country - "" - - "" + - '' - "" - - "" + - '' - "" - - "" + - '' # Transfer - 
select "Set up as new" - "" - - "" - - "" - - "" + - '' + - '' + - '' - "" # Accessibility @@ -189,13 +191,13 @@ boot_commands: # Account creation - "" - - "" + - '' - "" - - "" + - '' - "" - - "" + - '' - "" - - "" + - '' - "" # Terms @@ -222,9 +224,9 @@ health_check: type: ssh user: lume password: lume - timeout: 30 # Seconds per attempt - retries: 5 # Number of attempts - retry_delay: 10 # Seconds between retries + timeout: 30 # Seconds per attempt + retries: 5 # Number of attempts + retry_delay: 10 # Seconds between retries ``` The automation waits for the VM to become reachable via SSH before declaring success. @@ -240,10 +242,10 @@ The automation waits for the VM to become reachable via SSH before declaring suc **Keyboard navigation** — macOS doesn't Tab through all elements by default. Enable full keyboard access: ```yaml -- "" +- '' - "" -- "" -- "" +- '' +- '' # Then enable "Keyboard navigation" in settings ``` @@ -251,13 +253,13 @@ The automation waits for the VM to become reachable via SSH before declaring suc ## Troubleshooting -| Problem | Solution | -|---------|----------| -| Commands run too early | Increase `boot_wait`, add `` | -| Text not found | Check spelling/case, increase timeout, use coordinates | -| Wrong element clicked | Use `index` to select correct occurrence | -| Hotkeys ignored | Click desktop first to focus | -| Tab skips elements | Enable keyboard navigation in System Settings | +| Problem | Solution | +| ---------------------- | ------------------------------------------------------ | +| Commands run too early | Increase `boot_wait`, add `` | +| Text not found | Check spelling/case, increase timeout, use coordinates | +| Wrong element clicked | Use `index` to select correct occurrence | +| Hotkeys ignored | Click desktop first to focus | +| Tab skips elements | Enable keyboard navigation in System Settings | ## Limitations @@ -270,19 +272,19 @@ The automation waits for the VM to become reachable via SSH before declaring suc ### 
Config fields -| Field | Type | Default | Description | -|-------|------|---------|-------------| -| `boot_wait` | integer | 60 | Seconds to wait before starting | -| `boot_commands` | array | required | List of commands | -| `health_check` | object | optional | Verification config | +| Field | Type | Default | Description | +| --------------- | ------- | -------- | ------------------------------- | +| `boot_wait` | integer | 60 | Seconds to wait before starting | +| `boot_commands` | array | required | List of commands | +| `health_check` | object | optional | Verification config | ### Health check fields -| Field | Type | Default | Description | -|-------|------|---------|-------------| -| `type` | string | required | `ssh` | -| `user` | string | — | SSH username | -| `password` | string | — | SSH password | -| `timeout` | integer | 30 | Seconds per attempt | -| `retries` | integer | 3 | Number of attempts | -| `retry_delay` | integer | 5 | Seconds between retries | +| Field | Type | Default | Description | +| ------------- | ------- | -------- | ----------------------- | +| `type` | string | required | `ssh` | +| `user` | string | — | SSH username | +| `password` | string | — | SSH password | +| `timeout` | integer | 30 | Seconds per attempt | +| `retries` | integer | 3 | Number of attempts | +| `retry_delay` | integer | 5 | Seconds between retries | diff --git a/docs/content/docs/lume/guide/fundamentals/vm-management.mdx b/docs/content/docs/lume/guide/fundamentals/vm-management.mdx index 1110a814..38eda3df 100644 --- a/docs/content/docs/lume/guide/fundamentals/vm-management.mdx +++ b/docs/content/docs/lume/guide/fundamentals/vm-management.mdx @@ -9,15 +9,15 @@ Once you've [created a VM](/lume/guide/getting-started/quickstart), here's how t ## Quick reference -| Task | Command | -|------|---------| -| List VMs | `lume ls` | -| Get details | `lume get my-vm` | -| Run | `lume run my-vm` | +| Task | Command | +| ------------ | ----------------------------- | +| 
List VMs | `lume ls` | +| Get details | `lume get my-vm` | +| Run | `lume run my-vm` | | Run headless | `lume run my-vm --no-display` | -| Stop | `lume stop my-vm` | -| Clone | `lume clone my-vm backup` | -| Delete | `lume delete my-vm` | +| Stop | `lume stop my-vm` | +| Clone | `lume clone my-vm backup` | +| Delete | `lume delete my-vm` | ## Run a VM @@ -73,7 +73,8 @@ Any ARM64 Linux distribution works. Download the ARM64 (aarch64) ISO, not x86_64 - **Fedora** - [fedoraproject.org](https://fedoraproject.org/) -Linux VMs have no licensing restrictions—run as many concurrent instances as your hardware supports. Only macOS VMs are limited to 2 per Mac. + Linux VMs have no licensing restrictions—run as many concurrent instances as your hardware + supports. Only macOS VMs are limited to 2 per Mac. ## Stop a VM @@ -101,7 +102,7 @@ lume get my-vm ## Change VM settings -Stop the VM before changing settings. Disk size can only increase, not decrease. + Stop the VM before changing settings. Disk size can only increase, not decrease. ```bash @@ -207,7 +208,8 @@ lume prune ``` -Enable caching if you frequently pull the same images or images sharing base layers. With caching disabled, each `lume pull` downloads everything fresh. + Enable caching if you frequently pull the same images or images sharing base layers. With caching + disabled, each `lume pull` downloads everything fresh. ## What's next diff --git a/docs/content/docs/lume/guide/getting-started/comparison.mdx b/docs/content/docs/lume/guide/getting-started/comparison.mdx index 5cba87d2..c88ed2ac 100644 --- a/docs/content/docs/lume/guide/getting-started/comparison.mdx +++ b/docs/content/docs/lume/guide/getting-started/comparison.mdx @@ -9,16 +9,16 @@ This page compares Lume with other macOS virtualization tools. 
All of these are ## Quick Comparison -| Feature | Lume | Tart | Lima | UTM | -|---------|------|------|------|-----| -| **License** | MIT | Fair Source | Apache 2.0 | Apache 2.0 | -| **macOS VMs** | Yes | Yes | No | Yes | -| **Linux VMs** | Yes | Yes | Yes | Yes | -| **HTTP API** | Yes | No | No | No | -| **MCP Server** | Yes | No | No | No | -| **Unattended Setup** | Yes (VNC + OCR) | Via Packer | N/A | No | -| **Registry Support** | OCI (GHCR, GCS) | OCI registries | N/A | No | -| **Primary Use Case** | Agent automation, CI/CD | CI/CD | Linux containers | General desktop | +| Feature | Lume | Tart | Lima | UTM | +| -------------------- | ----------------------- | -------------- | ---------------- | --------------- | +| **License** | MIT | Fair Source | Apache 2.0 | Apache 2.0 | +| **macOS VMs** | Yes | Yes | No | Yes | +| **Linux VMs** | Yes | Yes | Yes | Yes | +| **HTTP API** | Yes | No | No | No | +| **MCP Server** | Yes | No | No | No | +| **Unattended Setup** | Yes (VNC + OCR) | Via Packer | N/A | No | +| **Registry Support** | OCI (GHCR, GCS) | OCI registries | N/A | No | +| **Primary Use Case** | Agent automation, CI/CD | CI/CD | Linux containers | General desktop | ## Tart @@ -91,6 +91,7 @@ This page compares Lume with other macOS virtualization tools. 
All of these are ## Summary Choose Lume if you need: + - An HTTP API for programmatic VM control - MCP server integration for AI agents - Fully automated macOS VM provisioning diff --git a/docs/content/docs/lume/guide/getting-started/faq.mdx b/docs/content/docs/lume/guide/getting-started/faq.mdx index 64580d82..1f5a1d2b 100644 --- a/docs/content/docs/lume/guide/getting-started/faq.mdx +++ b/docs/content/docs/lume/guide/getting-started/faq.mdx @@ -141,11 +141,11 @@ lume run linux-vm Check VM status with `lume ls`: -| Status | Description | -|--------|-------------| -| `stopped` | VM is not running and ready to start | -| `running` | VM is running | -| `provisioning (ipsw_install)` | VM is being created—downloading and installing macOS | +| Status | Description | +| --------------------------------- | ----------------------------------------------------- | +| `stopped` | VM is not running and ready to start | +| `running` | VM is running | +| `provisioning (ipsw_install)` | VM is being created—downloading and installing macOS | | `provisioning (unattended_setup)` | VM is being created—running automated Setup Assistant | VMs in `provisioning` status are being created asynchronously. Wait for status to change to `stopped` before starting them. diff --git a/docs/content/docs/lume/guide/getting-started/installation.mdx b/docs/content/docs/lume/guide/getting-started/installation.mdx index f6cb6302..4b683bba 100644 --- a/docs/content/docs/lume/guide/getting-started/installation.mdx +++ b/docs/content/docs/lume/guide/getting-started/installation.mdx @@ -22,6 +22,7 @@ source ~/.zshrc ``` Or restart your terminal after the first command. + ### Verify it worked @@ -47,11 +48,11 @@ If you see version output, you're ready to [create your first VM](/lume/guide/ge The default install includes a background service and auto-updater. 
You can disable either: -| Option | Command | -|--------|---------| -| No background service | `install.sh -- --no-background-service` | -| No auto-updates | `install.sh -- --no-auto-updater` | -| Neither | `install.sh -- --no-background-service --no-auto-updater` | +| Option | Command | +| --------------------- | --------------------------------------------------------- | +| No background service | `install.sh -- --no-background-service` | +| No auto-updates | `install.sh -- --no-auto-updater` | +| Neither | `install.sh -- --no-background-service --no-auto-updater` | Full command with options: @@ -60,7 +61,8 @@ Full command with options: ``` -Without the background service, you'll need to run `lume serve` manually when using tools that rely on the Lume API (like the Computer Use Agent). + Without the background service, you'll need to run `lume serve` manually when using tools that + rely on the Lume API (like the Computer Use Agent). ## Manual installation @@ -104,24 +106,29 @@ rm ~/Library/LaunchAgents/com.trycua.lume_updater.plist ## Uninstall -Remove Lume with a single command: +Remove Lume completely: ```bash -/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/uninstall.sh)" +# Stop and remove background service +launchctl unload ~/Library/LaunchAgents/com.trycua.lume_daemon.plist 2>/dev/null +rm -f ~/Library/LaunchAgents/com.trycua.lume_daemon.plist + +# Stop and remove auto-updater +launchctl unload ~/Library/LaunchAgents/com.trycua.lume_updater.plist 2>/dev/null +rm -f ~/Library/LaunchAgents/com.trycua.lume_updater.plist +rm -f ~/.local/bin/lume-update + +# Optional: Remove cached images (run before removing binary) +lume prune + +# Remove Lume binary +rm -f $(which lume) + +# Optional: Remove VMs and config +rm -rf ~/.lume +rm -rf ~/.config/lume ``` -This stops all services, removes the binary, and cleans up LaunchAgents while preserving your VMs and configuration. 
- -To also remove all VMs and data, use the `--purge` flag: - -```bash -/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/uninstall.sh)" -- --purge -``` - - -The `--purge` flag permanently deletes all VMs in `~/.lume/` and configuration in `~/.config/lume/`. This cannot be undone. - - ## Troubleshooting **`lume: command not found`** — The install location (`~/.local/bin`) isn't in your PATH. Add it and reload: diff --git a/docs/content/docs/lume/guide/getting-started/introduction.mdx b/docs/content/docs/lume/guide/getting-started/introduction.mdx index f2dbe35a..bcbaddeb 100644 --- a/docs/content/docs/lume/guide/getting-started/introduction.mdx +++ b/docs/content/docs/lume/guide/getting-started/introduction.mdx @@ -8,11 +8,14 @@ import { Callout } from 'fumadocs-ui/components/callout'; Lume is a VM runtime for building AI agents, running CI/CD pipelines, and automating macOS. It uses Apple's native Virtualization Framework to run macOS and Linux VMs at near-native speed on Apple Silicon. -Lume is open-source and MIT licensed. If you find it useful, we'd appreciate a [star on GitHub](https://github.com/trycua/cua)! + Lume is open-source and MIT licensed. If you find it useful, we'd appreciate a [star on + GitHub](https://github.com/trycua/cua)! -We're piloting a managed service for customers who want to run cloud macOS sandboxes for CI/CD and agent workloads. [Book a demo](https://cal.com/cua/cua-demo?overlayCalendar=true) if you're interested. + We're piloting a managed service for customers who want to run cloud macOS sandboxes for CI/CD and + agent workloads. [Book a demo](https://cal.com/cua/cua-demo?overlayCalendar=true) if you're + interested. 
```bash @@ -52,7 +55,10 @@ Lume is a thin layer over Apple's [Virtualization Framework](https://developer.a **Building AI agents** — Lume powers the [Cua Computer SDK](/cua/reference/computer-sdk), providing VMs that AI models can interact with through screenshots and input simulation. -Apple's Virtualization Framework—the same technology Lume is built on—powers [Claude Cowork](https://support.claude.com/en/articles/13345190-getting-started-with-cowork), Anthropic's sandboxed environment for Claude Code. It downloads a Linux root filesystem and boots it in an isolated VM where Claude can safely execute commands without access to your broader system. + Apple's Virtualization Framework—the same technology Lume is built on—powers [Claude + Cowork](https://support.claude.com/en/articles/13345190-getting-started-with-cowork), Anthropic's + sandboxed environment for Claude Code. It downloads a Linux root filesystem and boots it in an + isolated VM where Claude can safely execute commands without access to your broader system. ## What Lume doesn't do diff --git a/docs/content/docs/lume/guide/getting-started/quickstart.mdx b/docs/content/docs/lume/guide/getting-started/quickstart.mdx index 19f5b5cd..976c5a46 100644 --- a/docs/content/docs/lume/guide/getting-started/quickstart.mdx +++ b/docs/content/docs/lume/guide/getting-started/quickstart.mdx @@ -38,7 +38,8 @@ lume create my-vm --os macos --ipsw latest ``` -The IPSW file is ~15GB. If you plan to create multiple VMs, downloading manually and reusing the file saves time and bandwidth. + The IPSW file is ~15GB. If you plan to create multiple VMs, downloading manually and reusing the + file saves time and bandwidth. ## Configure VM storage location (optional) @@ -67,7 +68,8 @@ lume create my-vm --os macos --ipsw latest --storage /path/to/vm/directory ``` -VM disk images can grow large (~50GB+). Using an external SSD keeps your internal storage free and can improve performance for I/O-heavy workloads. 
+ VM disk images can grow large (~50GB+). Using an external SSD keeps your internal storage free and + can improve performance for I/O-heavy workloads. ## Create a Linux VM @@ -86,7 +88,8 @@ lume run ubuntu-vm ``` -Linux VMs require ARM64 ISO images (not x86). Ubuntu, Debian, and Fedora all provide ARM64 server and desktop images. + Linux VMs require ARM64 ISO images (not x86). Ubuntu, Debian, and Fedora all provide ARM64 server + and desktop images. ## Run your VM @@ -97,10 +100,6 @@ lume run my-vm A VNC window opens. For macOS VMs, you'll see the Setup Assistant—complete it manually or use [Unattended Setup](/lume/guide/fundamentals/unattended-setup) to automate it. - -**Automatic clipboard sync:** When Remote Login (SSH) is enabled on your VM, Lume automatically syncs clipboard between your Mac and the VM in both directions. Copy on your Mac, paste in the VM—or copy in the VM, paste on your Mac. VMs created with `--unattended` have SSH enabled by default. - - ## Create a VM with custom resources By default, Lume creates VMs with 4 CPU cores, 8GB memory, and a 50GB disk. Override any of these: @@ -112,11 +111,11 @@ lume create dev-vm --os macos --ipsw latest \ --disk-size 100GB ``` -| Resource | Default | Notes | -|----------|---------|-------| -| CPU | 4 cores | Max is your Mac's core count | -| Memory | 8GB | Leave headroom for your host | -| Disk | 50GB | Uses sparse files—only consumes actual usage | +| Resource | Default | Notes | +| -------- | ------- | -------------------------------------------- | +| CPU | 4 cores | Max is your Mac's core count | +| Memory | 8GB | Leave headroom for your host | +| Disk | 50GB | Uses sparse files—only consumes actual usage | ## Change display resolution @@ -128,12 +127,12 @@ lume set my-vm --display 1920x1080 The change takes effect on the next VM start. After starting the VM, go to **System Settings → Displays**, enable **Show all resolutions**, and select your new resolution. 
Common resolutions: -| Resolution | Aspect Ratio | Use Case | -|------------|--------------|----------| -| 1024x768 | 4:3 | Default, low resource usage | -| 1280x800 | 16:10 | MacBook-style | -| 1920x1080 | 16:9 | Full HD | -| 2560x1440 | 16:9 | QHD | +| Resolution | Aspect Ratio | Use Case | +| ---------- | ------------ | --------------------------- | +| 1024x768 | 4:3 | Default, low resource usage | +| 1280x800 | 16:10 | MacBook-style | +| 1920x1080 | 16:9 | Full HD | +| 2560x1440 | 16:9 | QHD | ## Automate the Setup Assistant @@ -143,7 +142,7 @@ Skip manual setup entirely with the `--unattended` flag: lume create my-vm --os macos --ipsw latest --unattended tahoe ``` -The `tahoe` preset runs through the Setup Assistant automatically, creating a user `lume` with password `lume` and enabling Remote Login (SSH). Once complete, you can connect with `lume ssh my-vm`. This takes 10-15 minutes but requires no interaction. +The `tahoe` preset runs through the Setup Assistant automatically, creating a user `lume` with password `lume` and enabling SSH. This takes 10-15 minutes but requires no interaction. For custom configurations, see [Unattended Setup](/lume/guide/fundamentals/unattended-setup). @@ -160,7 +159,7 @@ lume run my-vm --no-display lume run my-vm > /dev/null 2>&1 & ``` -When running headless or in background, connect via `lume ssh my-vm` (if Remote Login is enabled) or a VNC client. Use `lume ls` to see running VMs and their connection details. +When running headless or in background, connect via SSH (if enabled) or VNC client. Use `lume ls` to see running VMs and their connection details. 
## Common operations @@ -181,33 +180,6 @@ lume clone my-vm my-vm-backup lume delete my-vm ``` -## Connect via SSH - -Access your VM from the terminal without needing the VNC window: - -```bash -# Interactive shell -lume ssh my-vm - -# Run a single command -lume ssh my-vm "ls -la" - -# Run multiple commands -lume ssh my-vm "cd /tmp && pwd" -``` - - -**SSH requires Remote Login to be enabled on the VM.** Either: -- Create the VM with `--unattended` (enables SSH automatically with user `lume`, password `lume`) -- Or manually enable it: **System Settings → General → Sharing → Remote Login** - - -Custom credentials: - -```bash -lume ssh my-vm -u myuser -p mypassword -``` - ## Share files with your VM Mount a host directory inside the VM: diff --git a/docs/content/docs/lume/reference/changelog.mdx b/docs/content/docs/lume/reference/changelog.mdx new file mode 100644 index 00000000..abfec79a --- /dev/null +++ b/docs/content/docs/lume/reference/changelog.mdx @@ -0,0 +1,441 @@ +--- +title: Changelog +description: Release history for Lume +--- + +{/* +AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY +Generated by: npx tsx scripts/docs-generators/generate-changelog.ts +Last updated: 2026-02-04 +*/} + +# Lume Changelog + +All notable changes to Lume are documented here. + +## 0.2.x + +### v0.2.76 (2026-02-03) + +Maintenance release. + +### v0.2.75 (2026-02-03) + +Maintenance release. + +### v0.2.74 (2026-02-03) + +Maintenance release. + +### v0.2.73 (2026-02-03) + +Maintenance release. + +### v0.2.72 (2026-02-03) + +Maintenance release. + +### v0.2.71 (2026-02-03) + +Maintenance release. + +### v0.2.70 (2026-02-03) + +Maintenance release. + +### v0.2.69 (2026-02-02) + +Maintenance release. + +### v0.2.68 (2026-02-02) + +Maintenance release. + +### v0.2.67 (2026-02-02) + +Maintenance release. + +### v0.2.66 (2026-02-02) + +Maintenance release. + +### v0.2.65 (2026-02-02) + +Maintenance release. + +### v0.2.64 (2026-02-02) + +Maintenance release.
+ +### v0.2.63 (2026-02-02) + +Maintenance release. + +### v0.2.62 (2026-02-02) + +Maintenance release. + +### v0.2.61 (2026-02-02) + +Maintenance release. + +### v0.2.60 (2026-02-02) + +Maintenance release. + +### v0.2.59 (2026-02-01) + +Maintenance release. + +### v0.2.58 (2026-02-01) + +Maintenance release. + +### v0.2.57 (2026-02-01) + +Maintenance release. + +### v0.2.56 (2026-02-01) + +Maintenance release. + +### v0.2.55 (2026-02-01) + +Maintenance release. + +### v0.2.54 (2026-02-01) + +Maintenance release. + +### v0.2.52 (2026-01-25) + +Maintenance release. + +### v0.2.51 (2026-01-21) + +Maintenance release. + +### v0.2.50 (2026-01-21) + +Maintenance release. + +### v0.2.49 (2026-01-21) + +Maintenance release. + +### v0.2.46 (2026-01-20) + +Maintenance release. + +### v0.2.45 (2026-01-19) + +Maintenance release. + +### v0.2.44 (2026-01-19) + +Maintenance release. + +### v0.2.43 (2026-01-19) + +Maintenance release. + +### v0.2.42 (2026-01-19) + +Maintenance release. + +### v0.2.41 (2026-01-19) + +Maintenance release. + +### v0.2.40 (2026-01-18) + +Maintenance release. + +### v0.2.39 (2026-01-18) + +Maintenance release. + +### v0.2.38 (2026-01-18) + +Maintenance release. + +### v0.2.37 (2026-01-18) + +Maintenance release. + +### v0.2.36 (2026-01-18) + +Maintenance release. + +### v0.2.34 (2026-01-18) + +Maintenance release. + +### v0.2.33 (2026-01-18) + +Maintenance release. + +### v0.2.32 (2026-01-18) + +Maintenance release. + +### v0.2.31 (2026-01-17) + +Maintenance release. + +### v0.2.30 (2026-01-17) + +Maintenance release. + +### v0.2.29 (2026-01-17) + +Maintenance release. + +### v0.2.28 (2026-01-17) + +Maintenance release. + +### v0.2.27 (2026-01-11) + +Maintenance release. + +### v0.2.26 (2026-01-11) + +Maintenance release. + +### v0.2.25 (2026-01-11) + +Maintenance release. 
+ +### v0.2.23 (2025-12-23) + +- Update README badges with sky/emerald colors and larger logo by @f-trycua in https://github.com/trycua/cua/pull/641 +- Add QEMU Ubuntu 22.04 template with CUA computer-server support by @synacktraa in https://github.com/trycua/cua/pull/622 +- Add QEMU Windows 11 template with CUA computer-server support by @synacktraa in https://github.com/trycua/cua/pull/551 +- docs: update intro page with hero image and Geist font by @f-trycua in https://github.com/trycua/cua/pull/642 +- docs: expand CUA description with screenshot-VLM-action loop by @f-trycua in https://github.com/trycua/cua/pull/643 +- F trycua/docs hero image by @f-trycua in https://github.com/trycua/cua/pull/644 +- Introduce cua-bench-ui submodules (webview API, configurable ports, improved python RPC) by @ddupont808 in https://github.com/trycua/cua/pull/645 +- Replace PyQt6 with PySide6 by @ddupont808 in https://github.com/trycua/cua/pull/646 +- Bump cua-bench-ui version in xfce Dockerfile by @ddupont808 in https://github.com/trycua/cua/pull/647 +- Replace qt backend with gtk in xfce Docker image by @ddupont808 in https://github.com/trycua/cua/pull/648 +- fix(docs): broken link references for docs by @sarinali in https://github.com/trycua/cua/pull/652 +- fix(docs): document cloud models by @sarinali in https://github.com/trycua/cua/pull/651 +- Refactor Docker publish workflow to correctly build and publish multi-arch images by @synacktraa in https://github.com/trycua/cua/pull/653 +- Refactor Python quickstart examples to use asyncio by @LucaStngn in https://github.com/trycua/cua/pull/650 +- fix(lume): Resolve install script downloading wrong release by @synacktraa in https://github.com/trycua/cua/pull/655 + +### v0.2.22 (2025-06-17) + +Maintenance release. 
+ +### v0.2.16 (2025-06-16) + +- Add .devcontainer and Dockerfile fixes by @ddupont808 in https://github.com/trycua/cua/pull/287 +- [Agent] Removed extra screenshots in OpenAI/Anthropic tools, fixed image retention bug by @ddupont808 in https://github.com/trycua/cua/pull/291 + +### v0.2.15 (2025-05-23) + +- Lumier computer provider takes non-trycua image by @jklapacz in https://github.com/trycua/cua/pull/181 +- Added Cmake dep to playground.sh by @ddupont808 in https://github.com/trycua/cua/pull/184 + +### v0.2.14 (2025-05-14) + +- [Computer] Add initial VM Provider by @f-trycua in https://github.com/trycua/cua/pull/162 + +### v0.2.13 (2025-05-11) + +- Add clipboard and audio device by @f-trycua in https://github.com/trycua/cua/pull/170 + +### v0.2.12 (2025-05-07) + +- Replaced remaining `os=` occurances with `os_type=` by @ddupont808 in https://github.com/trycua/cua/pull/159 +- Prevent install if sudo by @f-trycua in https://github.com/trycua/cua/pull/161 + +### v0.2.11 (2025-05-06) + +Maintenance release. + +### v0.2.10 (2025-05-06) + +Maintenance release. + +### v0.2.9 (2025-05-05) + +- [Agent] Add standardized agent trajectory logging by @ddupont808 in https://github.com/trycua/cua/pull/155 + +### v0.2.8 (2025-05-02) + +Maintenance release. + +### v0.2.7 (2025-04-30) + +Maintenance release. + +### v0.2.6 (2025-04-29) + +- docs: fix wait action in blog notebook by @FinnBorge in https://github.com/trycua/cua/pull/142 +- [Lumier] Introduce Lumier - Docker-based macOS VM Management Interface by @f-trycua in https://github.com/trycua/cua/pull/144 + +New Contributors + +- @FinnBorge made their first contribution in https://github.com/trycua/cua/pull/142 + +### v0.2.5 (2025-04-25) + +Maintenance release. 
+ +### v0.2.4 (2025-04-24) + +- [Root] Fix root py project toml by @f-trycua in https://github.com/trycua/cua/pull/133 +- Update Mac identifier during clone process by @dp221125 in https://github.com/trycua/cua/pull/130 + +New Contributors + +- @dp221125 made their first contribution in https://github.com/trycua/cua/pull/130 + +### v0.2.3 (2025-04-23) + +Maintenance release. + +### v0.2.2 (2025-04-22) + +Maintenance release. + +### v0.2.1 (2025-04-22) + +Maintenance release. + +### v0.2.0 (2025-04-22) + +- [Computer] Add Gradio UI and fix interaction bugs by @ddupont808 in https://github.com/trycua/cua/pull/116 +- Docs: Add EasyOCR SSL & Lume VM Startup Troubleshooting to FAQ by @trospix in https://github.com/trycua/cua/pull/114 +- [Lume] Fix broken storage tests by @f-trycua in https://github.com/trycua/cua/pull/107 +- docs: add trospix as a contributor for code by @allcontributors in https://github.com/trycua/cua/pull/115 +- docs: add eltociear as a contributor for code by @allcontributors in https://github.com/trycua/cua/pull/121 +- docs: update computer/README.md by @eltociear in https://github.com/trycua/cua/pull/119 +- [Lume] Sparse Push & Pull Optimizations by @f-trycua in https://github.com/trycua/cua/pull/122 + +New Contributors + +- @trospix made their first contribution in https://github.com/trycua/cua/pull/114 +- @eltociear made their first contribution in https://github.com/trycua/cua/pull/119 + +## 0.1.x + +### v0.1.38 (2025-04-15) + +Maintenance release. + +### v0.1.37 (2025-04-15) + +Maintenance release. + +### v0.1.36 (2025-04-15) + +Maintenance release. 
+ +### v0.1.35 (2025-04-15) + +- Update install_mcp_server.sh - remove piping to /dev/null by @rahulkarajgikar in https://github.com/trycua/cua/pull/110 +- docs: add rahulkarajgikar as a contributor for code by @allcontributors in https://github.com/trycua/cua/pull/111 +- Support py >= 3.13 by @f-trycua in https://github.com/trycua/cua/pull/112 + +New Contributors + +- @rahulkarajgikar made their first contribution in https://github.com/trycua/cua/pull/110 + +### v0.1.34 (2025-04-14) + +- [Lume] Add multiple VM locations and configurable cache by @f-trycua in https://github.com/trycua/cua/pull/91 + +### v0.1.33 (2025-04-14) + +- [Agent] Improved Gradio UI by @ddupont808 in https://github.com/trycua/cua/pull/104 + +### v0.1.32 (2025-04-12) + +- [Lume] Optimize VM image assembly by @f-trycua in https://github.com/trycua/cua/pull/102 + +### v0.1.31 (2025-04-11) + +- [Agent] Fix leftover Gradio docs by @f-trycua in https://github.com/trycua/cua/pull/97 +- docs: add RicterZ as a contributor for code by @allcontributors in https://github.com/trycua/cua/pull/98 +- [Lume] Fix reassemble kill by @f-trycua in https://github.com/trycua/cua/pull/99 + +### v0.1.30 (2025-04-07) + +Maintenance release. + +### v0.1.29 (2025-04-07) + +Maintenance release. + +### v0.1.28 (2025-04-07) + +Maintenance release. + +### v0.1.27 (2025-04-07) + +Maintenance release. + +### v0.1.26 (2025-04-06) + +- Fix anthropic format in omni loop by @f-trycua in https://github.com/trycua/cua/pull/94 + +### v0.1.25 (2025-04-06) + +Maintenance release. 
+ +### v0.1.24 (2025-04-06) + +- [Agent] Add Gradio UI & OAI-Compatible Provider by @f-trycua in https://github.com/trycua/cua/pull/93 + +### v0.1.23 (2025-04-04) + +- Add ai-gradio notebook by @f-trycua in https://github.com/trycua/cua/pull/85 +- Update README with gradio by @f-trycua in https://github.com/trycua/cua/pull/86 +- [SOM] Fix overlapping bounding boxes and added GPU/MPS support by @ddupont808 in https://github.com/trycua/cua/pull/87 +- Add Ollama support in Omni parser by @Lizzard1123 in https://github.com/trycua/cua/pull/76 +- docs: add ddupont808 as a contributor for code by @allcontributors in https://github.com/trycua/cua/pull/88 +- docs: add Lizzard1123 as a contributor for code by @allcontributors in https://github.com/trycua/cua/pull/89 +- [Agent] Add ollama support by @f-trycua in https://github.com/trycua/cua/pull/90 + +New Contributors + +- @ddupont808 made their first contribution in https://github.com/trycua/cua/pull/87 +- @Lizzard1123 made their first contribution in https://github.com/trycua/cua/pull/76 + +### v0.1.22 (2025-04-02) + +- Refactor Developer Guide by @f-trycua in https://github.com/trycua/cua/pull/77 +- Add operator blogpost - part 1 by @f-trycua in https://github.com/trycua/cua/pull/80 +- [Agent] Add OpenAI generate summary by @f-trycua in https://github.com/trycua/cua/pull/84 + +### v0.1.21 (2025-03-30) + +Maintenance release. + +### v0.1.20 (2025-03-30) + +- [Lume] Optimize disk image reassembly for performance and storage by @f-trycua in https://github.com/trycua/cua/pull/72 + +### v0.1.19 (2025-03-24) + +- Add `--no-cache` option in lume pull by @aktech in https://github.com/trycua/cua/pull/67 + +### v0.1.18 (2025-03-19) + +Maintenance release. + +### v0.1.17 (2025-03-17) + +Maintenance release. + +### v0.1.16 (2025-03-17) + +Maintenance release. 
+ +### v0.1.14 (2025-03-17) + +- Fix url for clone endpoint by @aktech in https://github.com/trycua/computer/pull/54 diff --git a/docs/content/docs/lume/reference/cli-reference.mdx b/docs/content/docs/lume/reference/cli-reference.mdx index 800ea54a..a8c34273 100644 --- a/docs/content/docs/lume/reference/cli-reference.mdx +++ b/docs/content/docs/lume/reference/cli-reference.mdx @@ -1,5 +1,5 @@ --- -title: Lume CLI Reference +title: CLI Reference description: Command Line Interface reference for Lume --- @@ -7,9 +7,19 @@ description: Command Line Interface reference for Lume AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY Generated by: npx tsx scripts/docs-generators/lume.ts Source: libs/lume/src/Commands/*.swift + Version: 0.2.76 */} import { Callout } from 'fumadocs-ui/components/callout'; +import { VersionHeader } from '@/components/version-selector'; + + A lightweight CLI and local API server to build, run and manage macOS VMs. @@ -17,7 +27,7 @@ A lightweight CLI and local API server to build, run and manage macOS VMs. 
```bash # Run a prebuilt macOS VM -lume run macos-sequoia-vanilla:latest +lume run macos-tahoe-vanilla:latest # Create a custom VM lume create my-vm --cpu 4 --memory 8GB --disk-size 50GB @@ -156,24 +166,6 @@ List virtual machines - `-f, --format` - Output format (json or text) (default: text) - `--storage` - Filter by storage location name -## Remote Access - -### lume ssh - -Connect to a VM via SSH or execute commands remotely - -**Arguments:** - -- `` - Name of the virtual machine -- `` - Command to execute (omit for interactive shell) (optional) - -**Options:** - -- `-u, --user` - SSH username (default: lume) -- `-p, --password` - SSH password (default: lume) -- `--storage` - Storage location name or path -- `-t, --timeout` - Command timeout in seconds (0 for no timeout) (default: 60) - ## Image Management ### lume pull diff --git a/docs/content/docs/lume/reference/http-api.mdx b/docs/content/docs/lume/reference/http-api.mdx index c7be47c4..52473878 100644 --- a/docs/content/docs/lume/reference/http-api.mdx +++ b/docs/content/docs/lume/reference/http-api.mdx @@ -1,5 +1,5 @@ --- -title: Lume HTTP API Reference +title: API Reference description: HTTP API reference for Lume server --- @@ -7,10 +7,20 @@ description: HTTP API reference for Lume server AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY Generated by: npx tsx scripts/docs-generators/lume.ts Source: libs/lume/src/Server/*.swift + Version: 0.2.76 */} import { Callout } from 'fumadocs-ui/components/callout'; import { Tabs, Tab } from 'fumadocs-ui/components/tabs'; +import { VersionHeader } from '@/components/version-selector'; + + HTTP API for managing macOS and Linux virtual machines @@ -589,7 +599,7 @@ Pull a VM image from a container registry curl -X POST "http://localhost:7777/lume/pull" \ -H "Content-Type: application/json" \ -d '{ - "image": "macos-sequoia-vanilla:latest" + "image": "macos-tahoe-vanilla:latest" }' ``` @@ -598,7 +608,7 @@ curl -X POST "http://localhost:7777/lume/pull" \ import requests data = 
{ - "image": "macos-sequoia-vanilla:latest", + "image": "macos-tahoe-vanilla:latest", } response = requests.post("http://localhost:7777/lume/pull", json=data) @@ -611,7 +621,7 @@ const response = await fetch(`http://localhost:7777/lume/pull`, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ - image: "macos-sequoia-vanilla:latest", + image: "macos-tahoe-vanilla:latest", }), }); const data = await response.json(); diff --git a/docs/content/docs/lume/reference/meta.json b/docs/content/docs/lume/reference/meta.json index bb4d7c19..8af3a48a 100644 --- a/docs/content/docs/lume/reference/meta.json +++ b/docs/content/docs/lume/reference/meta.json @@ -2,5 +2,5 @@ "title": "Reference", "description": "CLI and API reference documentation", "icon": "FileText", - "pages": ["cli-reference", "http-api"] + "pages": ["cli-reference", "http-api", "changelog"] } diff --git a/docs/content/docs/lume/reference/v0.2/cli-reference.mdx b/docs/content/docs/lume/reference/v0.2/cli-reference.mdx new file mode 100644 index 00000000..2842b774 --- /dev/null +++ b/docs/content/docs/lume/reference/v0.2/cli-reference.mdx @@ -0,0 +1,295 @@ +--- +title: Lume CLI Reference +description: Command Line Interface reference for Lume +--- + +{/* +AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY +Generated by: npx tsx scripts/docs-generators/lume.ts +Source: libs/lume/src/Commands/*.swift +Version: 0.2.75 +*/} + +import { Callout } from 'fumadocs-ui/components/callout'; +import { VersionHeader } from '@/components/version-selector'; + + + This is documentation for **v0.2**. [View latest version](/lume/reference/cli-reference). + + +
+ + v0.2.75 + + curl -fsSL .../install.sh | bash +
+ +A lightweight CLI and local API server to build, run and manage macOS VMs. + +## Quick Start + +```bash +# Run a prebuilt macOS VM +lume run macos-sequoia-vanilla:latest + +# Create a custom VM +lume create my-vm --cpu 4 --memory 8GB --disk-size 50GB +``` + +## VM Management + +### lume create + +Create a new virtual machine + +**Arguments:** + +- `` - Name for the virtual machine + +**Options:** + +- `--os` - Operating system to install (macOS or linux) (default: macOS) +- `--cpu` - Number of CPU cores (default: 4) +- `--memory` - Memory size (e.g., 8GB) (default: 8GB) +- `--disk-size` - Disk size (e.g., 50GB) (default: 50GB) +- `--display` - Display resolution (e.g., 1024x768) (default: 1024x768) +- `--ipsw` - Path to IPSW file or 'latest' for macOS VMs +- `--storage` - VM storage location to use +- `--unattended` - [Preview] Preset name or path to YAML config file for unattended macOS Setup Assistant automation. Built-in presets: sequoia, tahoe. Only supported for macOS VMs. +- `--debug-dir` - Custom directory for debug screenshots during unattended setup (defaults to unique folder in system temp) +- `--vnc-port` - Port to use for the VNC server during unattended setup. 
Defaults to 0 (auto-assign) (default: 0) + +**Flags:** + +- `--debug` - Enable debug mode for unattended setup - saves screenshots with click coordinates +- `--no-display` - Do not open the VNC client during unattended setup (default: true for unattended) + +### lume clone + +Clone an existing virtual machine + +**Arguments:** + +- `` - Name of the source VM +- `` - Name for the cloned VM + +**Options:** + +- `--source-storage` - Source VM storage location +- `--dest-storage` - Destination VM storage location + +### lume run + +Run a virtual machine + +**Arguments:** + +- `` - Name of the VM or image to run (format: name or name:tag) + +**Options:** + +- `--shared-dir` - Directory to share with the VM (format: path or path:ro or path:rw) +- `--mount` - For Linux VMs only, attach a read-only disk image +- `--usb-storage` - Disk image to attach as USB mass storage device +- `--registry` - Container registry URL (default: ghcr.io) +- `--organization` - Organization to pull from (default: trycua) +- `--vnc-port` - Port for VNC server (0 for auto-assign) (default: 0) +- `--recovery-mode` - For macOS VMs only, boot in recovery mode (default: false) +- `--storage` - VM storage location to use + +**Flags:** + +- `-d, --no-display` - Do not start the VNC client + +### lume stop + +Stop a virtual machine + +**Arguments:** + +- `` - Name of the VM to stop + +**Options:** + +- `--storage` - VM storage location to use + +### lume delete + +Delete a virtual machine + +**Arguments:** + +- `` - Name of the VM to delete + +**Options:** + +- `--storage` - VM storage location to use + +**Flags:** + +- `--force` - Force deletion without confirmation + +## VM Information and Configuration + +### lume get + +Get detailed information about a virtual machine + +**Arguments:** + +- `` - Name of the VM + +**Options:** + +- `-f, --format` - Output format (default: text) +- `--storage` - VM storage location to use + +### lume set + +Set new values for CPU, memory, and disk size of a virtual 
machine + +**Arguments:** + +- `` - Name of the VM + +**Options:** + +- `--cpu` - New number of CPU cores +- `--memory` - New memory size (e.g., 8GB) +- `--disk-size` - New disk size (e.g., 100GB) +- `--display` - New display resolution +- `--storage` - VM storage location to use + +### lume ls + +List virtual machines + +**Options:** + +- `-f, --format` - Output format (json or text) (default: text) +- `--storage` - Filter by storage location name + +## Image Management + +### lume pull + +Pull a macOS image from GitHub Container Registry + +**Arguments:** + +- `` - Image to pull (format: name:tag) +- `` - Name for the resulting VM (optional) + +**Options:** + +- `--registry` - Container registry URL (default: ghcr.io) +- `--organization` - Organization to pull from (default: trycua) +- `--storage` - VM storage location to use + +### lume push + +Push a macOS VM to GitHub Container Registry + +**Arguments:** + +- `` - Name of VM to push +- `` - Image tag (format: name:tag) + +**Options:** + +- `--additional-tags` - Additional tags to push +- `--registry` - Container registry URL (default: ghcr.io) +- `--organization` - Organization to push to (default: trycua) +- `--storage` - VM storage location to use +- `--chunk-size-mb` - Chunk size for upload in MB (default: 512) + +**Flags:** + +- `--verbose` - Enable verbose logging +- `--dry-run` - Prepare files without uploading +- `--reassemble` - Verify integrity by reassembling chunks + +### lume images + +List available macOS images from local cache + +**Options:** + +- `--organization` - Organization to list images for (default: trycua) + +### lume ipsw + +Get macOS restore image IPSW URL + +### lume prune + +Remove cached images + +## Configuration and Server + +### lume serve + +Start the VM management server + +**Options:** + +- `--port` - Port to listen on (default: 7777) + +### lume config + +Get or set lume configuration + +**Subcommands:** + +- `lume config get` - Get current configuration +- `lume config 
storage` - Manage VM storage locations + - `lume config storage add` - Add a new VM storage location + - `lume config storage remove` - Remove a VM storage location + - `lume config storage list` - List all VM storage locations + - `lume config storage default` - Set the default VM storage location +- `lume config cache` - Manage cache settings + - `lume config cache get` - Get current cache directory + - `lume config cache set` - Set cache directory +- `lume config caching` - Manage image caching settings + - `lume config caching get` - Show current caching status + - `lume config caching set` - Enable or disable image caching + +### lume logs + +View lume serve logs + +**Subcommands:** + +- `lume logs info` - View info logs from the daemon + - `-n, --lines` - Number of lines to display +- `lume logs error` - View error logs from the daemon + - `-n, --lines` - Number of lines to display +- `lume logs all` - View both info and error logs + - `-n, --lines` - Number of lines to display + +### lume setup + +[Preview] Run unattended Setup Assistant automation on a macOS VM + +**Arguments:** + +- `` - Name of the virtual machine + +**Options:** + +- `--unattended` - Preset name or path to YAML config file for unattended macOS Setup Assistant automation. Built-in presets: sequoia, tahoe. +- `--storage` - VM storage location to use or direct path to VM location +- `--vnc-port` - Port to use for the VNC server. 
Defaults to 0 (auto-assign) (default: 0) +- `--debug-dir` - Custom directory for debug screenshots (defaults to unique folder in system temp) + +**Flags:** + +- `--no-display` - Do not open the VNC client automatically +- `--debug` - Enable debug mode - saves screenshots with click coordinates + +## Global Options + +These options are available for all commands: + +- `--help` - Show help information +- `--version` - Show version number diff --git a/docs/content/docs/lume/reference/v0.2/http-api.mdx b/docs/content/docs/lume/reference/v0.2/http-api.mdx new file mode 100644 index 00000000..fa9715f6 --- /dev/null +++ b/docs/content/docs/lume/reference/v0.2/http-api.mdx @@ -0,0 +1,1114 @@ +--- +title: Lume HTTP API Reference +description: HTTP API reference for Lume server +--- + +{/* +AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY +Generated by: npx tsx scripts/docs-generators/lume.ts +Source: libs/lume/src/Server/*.swift +Version: 0.2.75 +*/} + +import { Callout } from 'fumadocs-ui/components/callout'; +import { Tabs, Tab } from 'fumadocs-ui/components/tabs'; +import { VersionHeader } from '@/components/version-selector'; + + + This is documentation for **v0.2**. [View latest version](/lume/reference/http-api). + + +
+ + v0.2.75 + + curl -fsSL .../install.sh | bash +
+ +HTTP API for managing macOS and Linux virtual machines + +## Default URL + +``` +http://localhost:7777 +``` + +Start the server with `lume serve` or specify a custom port with `lume serve --port `. + +## VM Management + +### All virtual machines + +List all virtual machines + +`GET: /lume/vms` + +#### Parameters + +| Name | Type | Required | Description | +| ------- | ------ | -------- | ------------------------------- | +| storage | string | No | Filter by storage location name | + +#### Example Request + + + +```bash +curl "http://localhost:7777/lume/vms" +``` + + +```python +import requests + +response = requests.get("http://localhost:7777/lume/vms") +print(response.json()) + +```` + + +```typescript +const response = await fetch(`http://localhost:7777/lume/vms`); +const data = await response.json(); +```` + + + + +#### Response + +- **200**: Success +- **400**: Bad request + +--- + +### Detailed information about a specific virtual machine + +Get detailed information about a specific virtual machine + +`GET: /lume/vms/:name` + +#### Parameters + +| Name | Type | Required | Description | +| ------- | ------ | -------- | -------------------------- | +| name | string | Yes | Name of the VM | +| storage | string | No | VM storage location to use | + +#### Example Request + + + +```bash +curl "http://localhost:7777/lume/vms/{name}" +``` + + +```python +import requests + +response = requests.get("http://localhost:7777/lume/vms/{name}") +print(response.json()) + +```` + + +```typescript +const response = await fetch(`http://localhost:7777/lume/vms/${name}`); +const data = await response.json(); +```` + + + + +#### Response + +- **200**: Success +- **400**: VM not found or invalid request + +--- + +### A new virtual machine + +Create a new virtual machine + +`POST: /lume/vms` + +#### Request Body + +| Name | Type | Required | Description | +| -------- | ------- | -------- | -------------------------------------------- | +| name | string | Yes | Name for the virtual 
machine | +| os | string | Yes | Operating system to install (macOS or linux) | +| cpu | integer | Yes | Number of CPU cores | +| memory | string | Yes | Memory size (e.g., 8GB) | +| diskSize | string | Yes | Disk size (e.g., 50GB) | +| display | string | Yes | Display resolution (e.g., 1024x768) | +| ipsw | string | No | Path to IPSW file or 'latest' for macOS VMs | +| storage | string | No | VM storage location to use | + +#### Example Request + + + +```bash +curl -X POST "http://localhost:7777/lume/vms" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "my-vm", + "os": "macOS", + "cpu": 4, + "memory": "8GB", + "diskSize": "50GB", + "display": "1024x768" +}' +``` + + +```python +import requests + +data = { +"name": "my-vm", +"os": "macOS", +"cpu": 4, +"memory": "8GB", +"diskSize": "50GB", +"display": "1024x768", +} + +response = requests.post("http://localhost:7777/lume/vms", json=data) +print(response.json()) + +```` + + +```typescript +const response = await fetch(`http://localhost:7777/lume/vms`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + name: "my-vm", + os: "macOS", + cpu: 4, + memory: "8GB", + diskSize: "50GB", + display: "1024x768", + }), +}); +const data = await response.json(); +```` + + + + +#### Response + +- **200**: VM created successfully +- **400**: Invalid request body or VM creation failed + +--- + +### A virtual machine and its associated files + +Delete a virtual machine and its associated files + +`DELETE: /lume/vms/:name` + +#### Parameters + +| Name | Type | Required | Description | +| ------- | ------ | -------- | ------------------------ | +| name | string | Yes | Name of the VM to delete | +| storage | string | No | VM storage location | + +#### Example Request + + + +```bash +curl -X DELETE "http://localhost:7777/lume/vms/{name}" +``` + + +```python +import requests + +response = requests.delete("http://localhost:7777/lume/vms/{name}") +print(response.json()) + +```` + + 
+```typescript +const response = await fetch(`http://localhost:7777/lume/vms/${name}`, { + method: "DELETE", +}); +const data = await response.json(); +```` + + + + +#### Response + +- **200**: VM deleted successfully +- **400**: VM not found or deletion failed + +--- + +### A copy of an existing virtual machine + +Create a copy of an existing virtual machine + +`POST: /lume/vms/clone` + +#### Request Body + +| Name | Type | Required | Description | +| -------------- | ------ | -------- | ------------------------------- | +| name | string | Yes | Name of the source VM | +| newName | string | Yes | Name for the cloned VM | +| sourceLocation | string | No | Source VM storage location | +| destLocation | string | No | Destination VM storage location | + +#### Example Request + + + +```bash +curl -X POST "http://localhost:7777/lume/vms/clone" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "my-vm", + "newName": "example" +}' +``` + + +```python +import requests + +data = { +"name": "my-vm", +"newName": "example", +} + +response = requests.post("http://localhost:7777/lume/vms/clone", json=data) +print(response.json()) + +```` + + +```typescript +const response = await fetch(`http://localhost:7777/lume/vms/clone`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + name: "my-vm", + newName: "example", + }), +}); +const data = await response.json(); +```` + + + + +#### Response + +- **200**: VM cloned successfully +- **400**: Clone operation failed + +--- + +### Virtual machine configuration settings + +Update virtual machine configuration settings + +`PATCH: /lume/vms/:name` + +#### Parameters + +| Name | Type | Required | Description | +| ---- | ------ | -------- | ------------------------ | +| name | string | Yes | Name of the VM to update | + +#### Request Body + +| Name | Type | Required | Description | +| -------- | ------- | -------- | ---------------------------- | +| cpu | integer | No | New number of CPU 
cores | +| memory | string | No | New memory size (e.g., 16GB) | +| diskSize | string | No | New disk size (e.g., 100GB) | +| display | string | No | New display resolution | +| storage | string | No | VM storage location | + +#### Example Request + + + +```bash +curl -X PATCH "http://localhost:7777/lume/vms/{name}" \ + -H "Content-Type: application/json" \ + -d '{}' +``` + + +```python +import requests + +data = { +} + +response = requests.patch("http://localhost:7777/lume/vms/{name}", json=data) +print(response.json()) + +```` + + +```typescript +const response = await fetch(`http://localhost:7777/lume/vms/${name}`, { + method: "PATCH", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + }), +}); +const data = await response.json(); +```` + + + + +#### Response + +- **200**: Settings updated successfully +- **400**: Invalid settings or update failed + +--- + +### Start a virtual machine + +Start a virtual machine + +`POST: /lume/vms/:name/run` + +#### Parameters + +| Name | Type | Required | Description | +| ---- | ------ | -------- | ----------------------- | +| name | string | Yes | Name of the VM to start | + +#### Request Body + +| Name | Type | Required | Description | +| ----------------- | ------- | -------- | ----------------------------------------------- | +| noDisplay | boolean | No | Run without VNC display (default: false) | +| sharedDirectories | array | No | Directories to share with the VM | +| recoveryMode | boolean | No | Boot macOS VM in recovery mode (default: false) | +| storage | string | No | VM storage location | + +#### Example Request + + + +```bash +curl -X POST "http://localhost:7777/lume/vms/{name}/run" \ + -H "Content-Type: application/json" \ + -d '{}' +``` + + +```python +import requests + +data = { +} + +response = requests.post("http://localhost:7777/lume/vms/{name}/run", json=data) +print(response.json()) + +```` + + +```typescript +const response = await 
fetch(`http://localhost:7777/lume/vms/${name}/run`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + }), +}); +const data = await response.json(); +```` + + + + +#### Response + +- **202**: VM start initiated (async operation) +- **400**: Invalid request or VM not found + +--- + +### A running virtual machine + +Stop a running virtual machine + +`POST: /lume/vms/:name/stop` + +#### Parameters + +| Name | Type | Required | Description | +| ---- | ------ | -------- | ---------------------- | +| name | string | Yes | Name of the VM to stop | + +#### Request Body + +| Name | Type | Required | Description | +| ------- | ------ | -------- | ------------------- | +| storage | string | No | VM storage location | + +#### Example Request + + + +```bash +curl -X POST "http://localhost:7777/lume/vms/{name}/stop" \ + -H "Content-Type: application/json" \ + -d '{}' +``` + + +```python +import requests + +data = { +} + +response = requests.post("http://localhost:7777/lume/vms/{name}/stop", json=data) +print(response.json()) + +```` + + +```typescript +const response = await fetch(`http://localhost:7777/lume/vms/${name}/stop`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + }), +}); +const data = await response.json(); +```` + + + + +#### Response + +- **200**: VM stopped successfully +- **400**: Stop operation failed + +--- + +## Image Management + +### Available images from local cache + +List available images from local cache + +`GET: /lume/images` + +#### Parameters + +| Name | Type | Required | Description | +| ------------ | ------ | -------- | ------------------------------------------------- | +| organization | string | No | Organization to list images for (default: trycua) | + +#### Example Request + + + +```bash +curl "http://localhost:7777/lume/images" +``` + + +```python +import requests + +response = requests.get("http://localhost:7777/lume/images") 
+print(response.json()) + +```` + + +```typescript +const response = await fetch(`http://localhost:7777/lume/images`); +const data = await response.json(); +```` + + + + +#### Response + +- **200**: Success +- **400**: Failed to list images + +--- + +### The latest macOS restore image (IPSW) URL + +Get the latest macOS restore image (IPSW) URL + +`GET: /lume/ipsw` + +#### Example Request + + + +```bash +curl "http://localhost:7777/lume/ipsw" +``` + + +```python +import requests + +response = requests.get("http://localhost:7777/lume/ipsw") +print(response.json()) + +```` + + +```typescript +const response = await fetch(`http://localhost:7777/lume/ipsw`); +const data = await response.json(); +```` + + + + +#### Response + +- **200**: Success +- **400**: Failed to get IPSW URL + +--- + +### A VM image from a container registry + +Pull a VM image from a container registry + +`POST: /lume/pull` + +#### Request Body + +| Name | Type | Required | Description | +| ------------ | ------ | -------- | ------------------------------------------- | +| image | string | Yes | Image to pull (format: name:tag) | +| name | string | No | Name for the resulting VM | +| registry | string | No | Container registry URL (default: ghcr.io) | +| organization | string | No | Organization to pull from (default: trycua) | +| storage | string | No | VM storage location | + +#### Example Request + + + +```bash +curl -X POST "http://localhost:7777/lume/pull" \ + -H "Content-Type: application/json" \ + -d '{ + "image": "macos-sequoia-vanilla:latest" +}' +``` + + +```python +import requests + +data = { +"image": "macos-sequoia-vanilla:latest", +} + +response = requests.post("http://localhost:7777/lume/pull", json=data) +print(response.json()) + +```` + + +```typescript +const response = await fetch(`http://localhost:7777/lume/pull`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + image: "macos-sequoia-vanilla:latest", + }), +}); +const data = await 
response.json(); +```` + + + + +#### Response + +- **200**: Image pulled successfully +- **400**: Pull operation failed + +--- + +### A VM image to a container registry + +Push a VM image to a container registry + +`POST: /lume/vms/push` + +#### Request Body + +| Name | Type | Required | Description | +| ------------ | ------- | -------- | ------------------------------------------ | +| name | string | Yes | Name of the local VM to push | +| imageName | string | Yes | Base name for the image in the registry | +| tags | array | Yes | List of tags to push | +| registry | string | No | Container registry URL (default: ghcr.io) | +| organization | string | No | Organization to push to (default: trycua) | +| storage | string | No | VM storage location | +| chunkSizeMb | integer | No | Chunk size for upload in MB (default: 512) | + +#### Example Request + + + +```bash +curl -X POST "http://localhost:7777/lume/vms/push" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "my-vm", + "imageName": "example", + "tags": [ + "latest" + ] +}' +``` + + +```python +import requests + +data = { +"name": "my-vm", +"imageName": "example", +"tags": ["latest"], +} + +response = requests.post("http://localhost:7777/lume/vms/push", json=data) +print(response.json()) + +```` + + +```typescript +const response = await fetch(`http://localhost:7777/lume/vms/push`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + name: "my-vm", + imageName: "example", + tags: ["latest"], + }), +}); +const data = await response.json(); +```` + + + + +#### Response + +- **202**: Push initiated (async operation) +- **400**: Invalid request + +--- + +### Cached images to free up disk space + +Remove cached images to free up disk space + +`POST: /lume/prune` + +#### Example Request + + + +```bash +curl -X POST "http://localhost:7777/lume/prune" +``` + + +```python +import requests + +response = requests.post("http://localhost:7777/lume/prune") 
+print(response.json()) + +```` + + +```typescript +const response = await fetch(`http://localhost:7777/lume/prune`, { method: "POST" }); +const data = await response.json(); +```` + + + + +#### Response + +- **200**: Images pruned successfully +- **400**: Prune operation failed + +--- + +## Configuration + +### Current Lume configuration settings + +Get current Lume configuration settings + +`GET: /lume/config` + +#### Example Request + + + +```bash +curl "http://localhost:7777/lume/config" +``` + + +```python +import requests + +response = requests.get("http://localhost:7777/lume/config") +print(response.json()) + +```` + + +```typescript +const response = await fetch(`http://localhost:7777/lume/config`); +const data = await response.json(); +```` + + + + +#### Response + +- **200**: Success +- **400**: Failed to get config + +--- + +### Lume configuration settings + +Update Lume configuration settings + +`POST: /lume/config` + +#### Request Body + +| Name | Type | Required | Description | +| -------------- | ------- | -------- | ------------------------------- | +| homeDirectory | string | No | VM home directory path | +| cacheDirectory | string | No | Cache directory path | +| cachingEnabled | boolean | No | Enable or disable image caching | + +#### Example Request + + + +```bash +curl -X POST "http://localhost:7777/lume/config" \ + -H "Content-Type: application/json" \ + -d '{}' +``` + + +```python +import requests + +data = { +} + +response = requests.post("http://localhost:7777/lume/config", json=data) +print(response.json()) + +```` + + +```typescript +const response = await fetch(`http://localhost:7777/lume/config`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + }), +}); +const data = await response.json(); +```` + + + + +#### Response + +- **200**: Configuration updated successfully +- **400**: Invalid request + +--- + +### All VM storage locations + +List all VM storage locations + +`GET: /lume/config/locations` + +#### 
Example Request + + + +```bash +curl "http://localhost:7777/lume/config/locations" +``` + + +```python +import requests + +response = requests.get("http://localhost:7777/lume/config/locations") +print(response.json()) + +```` + + +```typescript +const response = await fetch(`http://localhost:7777/lume/config/locations`); +const data = await response.json(); +```` + + + + +#### Response + +- **200**: Success +- **400**: Failed to get locations + +--- + +### A new VM storage location + +Add a new VM storage location + +`POST: /lume/config/locations` + +#### Request Body + +| Name | Type | Required | Description | +| ---- | ------ | -------- | ------------------------- | +| name | string | Yes | Storage location name | +| path | string | Yes | Path to storage directory | + +#### Example Request + + + +```bash +curl -X POST "http://localhost:7777/lume/config/locations" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "my-vm", + "path": "/path/to/storage" +}' +``` + + +```python +import requests + +data = { +"name": "my-vm", +"path": "/path/to/storage", +} + +response = requests.post("http://localhost:7777/lume/config/locations", json=data) +print(response.json()) + +```` + + +```typescript +const response = await fetch(`http://localhost:7777/lume/config/locations`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + name: "my-vm", + path: "/path/to/storage", + }), +}); +const data = await response.json(); +```` + + + + +#### Response + +- **200**: Location added successfully +- **400**: Invalid request or location already exists + +--- + +### A VM storage location + +Remove a VM storage location + +`DELETE: /lume/config/locations/:name` + +#### Parameters + +| Name | Type | Required | Description | +| ---- | ------ | -------- | ------------------------------ | +| name | string | Yes | Name of the location to remove | + +#### Example Request + + + +```bash +curl -X DELETE
"http://localhost:7777/lume/config/locations/{name}" +``` + + +```python +import requests + +response = requests.delete("http://localhost:7777/lume/config/locations/{name}") +print(response.json()) + +```` + + +```typescript +const response = await fetch(`http://localhost:7777/lume/config/locations/${name}`, { + method: "DELETE", +}); +const data = await response.json(); +```` + + + + +#### Response + +- **200**: Location removed successfully +- **400**: Location not found or cannot be removed + +--- + +### The default VM storage location + +Set the default VM storage location + +`POST: /lume/config/locations/default/:name` + +#### Parameters + +| Name | Type | Required | Description | +| ---- | ------ | -------- | -------------------------------------- | +| name | string | Yes | Name of the location to set as default | + +#### Example Request + + + +```bash +curl -X POST "http://localhost:7777/lume/config/locations/default/{name}" +``` + + +```python +import requests + +response = requests.post("http://localhost:7777/lume/config/locations/default/{name}") +print(response.json()) + +```` + + +```typescript +const response = await fetch(`http://localhost:7777/lume/config/locations/default/${name}`, { method: "POST" }); +const data = await response.json(); +```` + + + + +#### Response + +- **200**: Default location set successfully +- **400**: Location not found + +--- + +## Logs + +### Lume server logs + +Retrieve Lume server logs + +`GET: /lume/logs` + +#### Parameters + +| Name | Type | Required | Description | +| ----- | ------- | -------- | -------------------------------------------------- | +| type | string | No | Log type: 'info', 'error', or 'all' (default: all) | +| lines | integer | No | Number of lines to return from end of log | + +#### Example Request + + + +```bash +curl "http://localhost:7777/lume/logs" +``` + + +```python +import requests + +response = requests.get("http://localhost:7777/lume/logs") +print(response.json()) + +```` + + +```typescript +const response =
await fetch(`http://localhost:7777/lume/logs`); +const data = await response.json(); +```` + + + + +#### Response + +- **200**: Success +- **400**: Failed to read logs + +--- diff --git a/docs/content/docs/lume/reference/v0.2/meta.json b/docs/content/docs/lume/reference/v0.2/meta.json new file mode 100644 index 00000000..a31a729a --- /dev/null +++ b/docs/content/docs/lume/reference/v0.2/meta.json @@ -0,0 +1,5 @@ +{ + "title": "v0.2", + "description": "Lume v0.2 Reference", + "pages": ["cli-reference", "http-api"] +} diff --git a/docs/next.config.mjs b/docs/next.config.mjs index a495960d..531236fa 100644 --- a/docs/next.config.mjs +++ b/docs/next.config.mjs @@ -50,6 +50,17 @@ const config = { destination: '/get-started/quickstart', permanent: true, }, + // Redirect old /api URLs to SDK landing pages + { + source: '/cua/reference/computer-sdk/api', + destination: '/cua/reference/computer-sdk', + permanent: true, + }, + { + source: '/cua/reference/agent-sdk/api', + destination: '/cua/reference/agent-sdk', + permanent: true, + }, ]; }, images: { diff --git a/docs/package.json b/docs/package.json index fa34dd22..bbb9eb3f 100644 --- a/docs/package.json +++ b/docs/package.json @@ -10,7 +10,10 @@ "docs:generate": "tsx ../scripts/docs-generators/runner.ts", "docs:check": "tsx ../scripts/docs-generators/runner.ts --check", "docs:list": "tsx ../scripts/docs-generators/runner.ts --list", - "docs:generate:lume": "tsx ../scripts/docs-generators/lume.ts" + "docs:generate:lume": "tsx ../scripts/docs-generators/lume.ts", + "docs:generate:python": "tsx ../scripts/docs-generators/python-sdk.ts", + "docs:generate:changelog": "tsx ../scripts/docs-generators/generate-changelog.ts", + "docs:generate:versions": "tsx ../scripts/docs-generators/generate-versioned-docs.ts" }, "dependencies": { "@ai-sdk/anthropic": "^3.0.9", diff --git a/docs/scripts/README.md b/docs/scripts/README.md index fac104e7..e08e3080 100644 --- a/docs/scripts/README.md +++ b/docs/scripts/README.md @@ -4,7 +4,15 @@ 
This directory contains scripts for crawling, indexing, and serving CUA document ## Scripts -- **modal_app.py**: Complete Modal app with scheduled crawling, database generation, and MCP server deployment +### Local Scripts + +- **crawl_docs.py**: Crawls cua.ai/docs using crawl4ai +- **generate_db.py**: Creates LanceDB vector database for semantic search +- **generate_sqlite.py**: Creates SQLite FTS5 database for full-text search + +### Modal Deployment + +- **modal_app.py**: Complete Modal app with scheduled crawling and MCP server deployment ## Installation @@ -17,7 +25,25 @@ uv sync --group docs-scripts ## Usage -### Modal Deployment +### Option 1: Local Development + +#### 1. Crawl Documentation + +```bash +uv run docs/scripts/crawl_docs.py +``` + +#### 2. Generate Databases + +```bash +# Generate vector database for semantic search +uv run docs/scripts/generate_db.py + +# Generate SQLite FTS5 database for full-text search +uv run docs/scripts/generate_sqlite.py +``` + +### Option 2: Modal Deployment (Production) The Modal app provides a production-ready deployment with: diff --git a/docs/scripts/crawl_docs.py b/docs/scripts/crawl_docs.py new file mode 100644 index 00000000..59d3a858 --- /dev/null +++ b/docs/scripts/crawl_docs.py @@ -0,0 +1,261 @@ +""" +Comprehensive crawler for cua.ai/docs using crawl4ai +Recursively crawls all documentation pages and saves content to JSON files +""" + +import asyncio +import json +import re +from pathlib import Path +from urllib.parse import urljoin, urlparse + +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +# Configuration +BASE_URL = "https://cua.ai" +DOCS_URL = f"{BASE_URL}/docs" +OUTPUT_DIR = Path(__file__).parent.parent / "crawled_data" +MAX_CONCURRENT = 5 # Limit concurrent requests to be polite +DELAY_BETWEEN_REQUESTS = 0.5 # seconds + + +class CuaDocsCrawler: + def __init__(self): + self.visited_urls: set[str] = set() + self.to_visit: set[str] = set() + self.failed_urls: set[str] = set() + 
self.all_data: list[dict] = [] + self.semaphore = asyncio.Semaphore(MAX_CONCURRENT) + + def normalize_url(self, url: str) -> str: + """Normalize URL to avoid duplicates""" + parsed = urlparse(url) + # Remove trailing slashes and fragments + path = parsed.path.rstrip("/") + if not path: + path = "" + return f"{parsed.scheme}://{parsed.netloc}{path}" + + def is_valid_url(self, url: str) -> bool: + """Check if URL should be crawled (only /docs pages)""" + parsed = urlparse(url) + + # Only crawl cua.ai pages + if parsed.netloc and parsed.netloc not in ["cua.ai", "www.cua.ai"]: + return False + + # Only crawl /docs paths + if not parsed.path.startswith("/docs"): + return False + + # Skip non-page resources + skip_extensions = [ + ".pdf", + ".png", + ".jpg", + ".jpeg", + ".gif", + ".svg", + ".css", + ".js", + ".ico", + ".woff", + ".woff2", + ".ttf", + ".zip", + ".tar", + ".gz", + ] + if any(parsed.path.lower().endswith(ext) for ext in skip_extensions): + return False + + # Skip external links and anchors + if url.startswith("#") or url.startswith("mailto:") or url.startswith("javascript:"): + return False + + return True + + def extract_links(self, html: str, current_url: str) -> set[str]: + """Extract all internal links from HTML content""" + links = set() + + # Find all href attributes + href_pattern = r'href=["\']([^"\']+)["\']' + matches = re.findall(href_pattern, html, re.IGNORECASE) + + for href in matches: + # Convert relative URLs to absolute + if href.startswith("/"): + full_url = urljoin(BASE_URL, href) + elif href.startswith("http"): + full_url = href + elif not href.startswith("#") and not href.startswith("mailto:"): + full_url = urljoin(current_url, href) + else: + continue + + normalized = self.normalize_url(full_url) + if self.is_valid_url(normalized): + links.add(normalized) + + return links + + def extract_path_info(self, url: str) -> dict: + """Extract meaningful path information from URL""" + parsed = urlparse(url) + path = 
parsed.path.replace("/docs/", "").strip("/") + parts = path.split("/") if path else [] + + return { + "path": path, + "category": parts[0] if parts else "root", + "subcategory": parts[1] if len(parts) > 1 else None, + "page": parts[-1] if parts else "index", + "depth": len(parts), + } + + async def crawl_page(self, crawler: AsyncWebCrawler, url: str) -> dict | None: + """Crawl a single page""" + async with self.semaphore: + try: + print(f"Crawling: {url}") + + config = CrawlerRunConfig( + word_count_threshold=10, + exclude_external_links=True, + ) + + result = await crawler.arun(url=url, config=config) + + if result.success: + # Extract new links from the page + new_links = self.extract_links(result.html, url) + for link in new_links: + if link not in self.visited_urls and link not in self.to_visit: + self.to_visit.add(link) + + path_info = self.extract_path_info(url) + + page_data = { + "url": url, + "title": result.metadata.get("title", "") if result.metadata else "", + "description": ( + result.metadata.get("description", "") if result.metadata else "" + ), + "markdown": result.markdown, + "path_info": path_info, + "links_found": list(new_links), + } + + # Save individual page + self.save_page(url, page_data) + + await asyncio.sleep(DELAY_BETWEEN_REQUESTS) + return page_data + else: + print(f"Failed to crawl {url}: {result.error_message}") + self.failed_urls.add(url) + return None + + except Exception as e: + print(f"Error crawling {url}: {e}") + self.failed_urls.add(url) + return None + + def save_page(self, url: str, data: dict): + """Save page data to a JSON file""" + # Create filename from URL path + parsed = urlparse(url) + path = parsed.path.strip("/") or "index" + filename = path.replace("/", "_") + ".json" + + filepath = OUTPUT_DIR / filename + with open(filepath, "w", encoding="utf-8") as f: + json.dump(data, f, indent=2, ensure_ascii=False) + + async def crawl_all(self): + """Main crawl loop""" + OUTPUT_DIR.mkdir(exist_ok=True) + + # Start with the 
docs URL and key sections based on typical CUA docs structure + seed_urls = [ + DOCS_URL, + f"{DOCS_URL}/cua", + f"{DOCS_URL}/cua/guide", + f"{DOCS_URL}/cua/guide/get-started", + f"{DOCS_URL}/cua/reference", + f"{DOCS_URL}/cua/reference/computer-sdk", + f"{DOCS_URL}/cua-bench", + f"{BASE_URL}/llms.txt", # LLM-optimized content if available + ] + + for url in seed_urls: + normalized = self.normalize_url(url) + if self.is_valid_url(normalized) or url.endswith("llms.txt"): + self.to_visit.add(normalized) + + browser_config = BrowserConfig( + headless=True, + verbose=False, + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + while self.to_visit: + # Get batch of URLs to crawl + batch = [] + while self.to_visit and len(batch) < MAX_CONCURRENT: + url = self.to_visit.pop() + if url not in self.visited_urls: + batch.append(url) + self.visited_urls.add(url) + + if not batch: + break + + # Crawl batch concurrently + tasks = [self.crawl_page(crawler, url) for url in batch] + results = await asyncio.gather(*tasks) + + # Collect successful results + for result in results: + if result: + self.all_data.append(result) + + print(f"Progress: {len(self.visited_urls)} crawled, {len(self.to_visit)} remaining") + + # Save summary + summary = { + "total_pages": len(self.all_data), + "failed_urls": list(self.failed_urls), + "all_urls": list(self.visited_urls), + "categories": self._get_categories(), + } + + with open(OUTPUT_DIR / "_summary.json", "w", encoding="utf-8") as f: + json.dump(summary, f, indent=2) + + # Save all data in one file too + with open(OUTPUT_DIR / "_all_pages.json", "w", encoding="utf-8") as f: + json.dump(self.all_data, f, indent=2, ensure_ascii=False) + + print("\nCrawl complete!") + print(f"Total pages crawled: {len(self.all_data)}") + print(f"Failed URLs: {len(self.failed_urls)}") + print(f"Output saved to: {OUTPUT_DIR.absolute()}") + + def _get_categories(self) -> dict: + """Get summary of categories crawled""" + categories = {} + for page in 
self.all_data: + cat = page.get("path_info", {}).get("category", "unknown") + categories[cat] = categories.get(cat, 0) + 1 + return categories + + +async def main(): + crawler = CuaDocsCrawler() + await crawler.crawl_all() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/scripts/generate_db.py b/docs/scripts/generate_db.py new file mode 100644 index 00000000..c0418e21 --- /dev/null +++ b/docs/scripts/generate_db.py @@ -0,0 +1,261 @@ +""" +Database generator for CUA documentation +Parses crawled JSON data and creates a LanceDB vector database for RAG +""" + +import json +import re +from pathlib import Path +from typing import Optional + +import lancedb +from lancedb.embeddings import get_registry +from lancedb.pydantic import LanceModel, Vector + +# Configuration +CRAWLED_DATA_DIR = Path(__file__).parent.parent / "crawled_data" +DB_PATH = Path(__file__).parent.parent / "docs_db" +CHUNK_SIZE = 1000 # Characters per chunk +CHUNK_OVERLAP = 200 # Overlap between chunks + +# Use sentence-transformers for embeddings +model = get_registry().get("sentence-transformers").create(name="all-MiniLM-L6-v2") + + +class DocChunk(LanceModel): + """Schema for document chunks in the database""" + + text: str = model.SourceField() + vector: Vector(model.ndims()) = model.VectorField() + url: str + title: str + category: str + subcategory: Optional[str] + page: str + chunk_index: int + + +def clean_markdown(markdown: str) -> str: + """Clean markdown content for better chunking""" + # Remove excessive whitespace + text = re.sub(r"\n{3,}", "\n\n", markdown) + # Remove image markdown + text = re.sub(r"!\[.*?\]\(.*?\)", "", text) + # Remove link URLs but keep text + text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text) + # Remove HTML tags + text = re.sub(r"<[^>]+>", "", text) + # Clean up whitespace + text = re.sub(r" {2,}", " ", text) + return text.strip() + + +def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> list[str]: + 
"""Split text into overlapping chunks, respecting sentence boundaries""" + if not text: + return [] + + # Split by paragraphs first + paragraphs = text.split("\n\n") + chunks = [] + current_chunk = "" + + for para in paragraphs: + para = para.strip() + if not para: + continue + + # If adding this paragraph exceeds chunk size, save current and start new + if len(current_chunk) + len(para) + 2 > chunk_size: + if current_chunk: + chunks.append(current_chunk.strip()) + # Start new chunk with overlap from previous + if overlap > 0 and len(current_chunk) > overlap: + # Try to find a sentence boundary for overlap + overlap_text = current_chunk[-overlap:] + sentence_end = overlap_text.rfind(". ") + if sentence_end > 0: + overlap_text = overlap_text[sentence_end + 2 :] + current_chunk = overlap_text + "\n\n" + para + else: + current_chunk = para + else: + # Single paragraph exceeds chunk size, split by sentences + sentences = re.split(r"(?<=[.!?])\s+", para) + for sentence in sentences: + if len(current_chunk) + len(sentence) + 1 > chunk_size: + if current_chunk: + chunks.append(current_chunk.strip()) + # Start new chunk with overlap from previous, similar to paragraph logic + if overlap > 0 and len(current_chunk) > overlap: + overlap_text = current_chunk[-overlap:] + sentence_end = overlap_text.rfind(". 
") + if sentence_end > 0: + overlap_text = overlap_text[sentence_end + 2 :] + current_chunk = (overlap_text + " " + sentence).strip() + else: + current_chunk = sentence.strip() + else: + # No existing chunk; start with this sentence + current_chunk = sentence.strip() + else: + current_chunk = (current_chunk + " " + sentence).strip() + else: + current_chunk = (current_chunk + "\n\n" + para).strip() + + # Don't forget the last chunk + if current_chunk: + chunks.append(current_chunk.strip()) + + return chunks + + +def load_crawled_data() -> list[dict]: + """Load all crawled page data""" + all_pages_file = CRAWLED_DATA_DIR / "_all_pages.json" + + if all_pages_file.exists(): + with open(all_pages_file, "r", encoding="utf-8") as f: + return json.load(f) + + # Fallback: load individual files + pages = [] + for json_file in CRAWLED_DATA_DIR.glob("*.json"): + if json_file.name.startswith("_"): + continue + with open(json_file, "r", encoding="utf-8") as f: + pages.append(json.load(f)) + + return pages + + +def process_pages(pages: list[dict]) -> list[dict]: + """Process pages into document chunks""" + all_chunks = [] + + for page in pages: + markdown = page.get("markdown", "") + if not markdown: + continue + + # Clean the markdown + cleaned_text = clean_markdown(markdown) + if not cleaned_text or len(cleaned_text) < 50: + continue + + # Get path info + path_info = page.get("path_info", {}) + + # Chunk the text + text_chunks = chunk_text(cleaned_text) + + # Ensure non-null values for required fields + url = page.get("url", "") + title = page.get("title") or path_info.get("page", "") or "Untitled" + category = path_info.get("category") or "unknown" + page_name = path_info.get("page") or "" + + for i, chunk_text_content in enumerate(text_chunks): + chunk = { + "text": chunk_text_content, + "url": url, + "title": title, + "category": category, + "subcategory": path_info.get("subcategory"), + "page": page_name, + "chunk_index": i, + } + all_chunks.append(chunk) + + return 
all_chunks + + +def create_database(chunks: list[dict]): + """Create LanceDB database from chunks""" + # Remove existing database + if DB_PATH.exists(): + import shutil + + shutil.rmtree(DB_PATH) + + # Create database + db = lancedb.connect(DB_PATH) + + # Create table with schema + table = db.create_table( + "docs", + schema=DocChunk, + mode="overwrite", + ) + + # Add data in batches + batch_size = 100 + for i in range(0, len(chunks), batch_size): + batch = chunks[i : i + batch_size] + print(f"Adding batch {i // batch_size + 1}/{(len(chunks) + batch_size - 1) // batch_size}") + table.add(batch) + + print(f"Database created at: {DB_PATH}") + print(f"Total chunks: {len(chunks)}") + + return db + + +def test_search(db: lancedb.DBConnection, query: str, limit: int = 5): + """Test search functionality""" + table = db.open_table("docs") + + print(f"\nSearching for: '{query}'") + print("-" * 50) + + results = table.search(query).limit(limit).to_list() + + for i, result in enumerate(results): + print(f"\n{i + 1}. [{result['category']}] {result['title']}") + print(f" URL: {result['url']}") + print(f" Score: {result.get('_distance', 'N/A'):.4f}") + print(f" Preview: {result['text'][:150]}...") + + +def main(): + print("Loading crawled data...") + pages = load_crawled_data() + print(f"Loaded {len(pages)} pages") + + if not pages: + print("No crawled data found. Run crawl_docs.py first.") + return + + print("\nProcessing pages into chunks...") + chunks = process_pages(pages) + print(f"Created {len(chunks)} chunks") + + if not chunks: + print("No chunks created. 
Check your crawled data.") + return + + print("\nCreating database...") + db = create_database(chunks) + + # Test with sample queries + print("\n" + "=" * 50) + print("Testing search functionality") + print("=" * 50) + + test_queries = [ + "how to install CUA", + "computer use agent", + "benchmark evaluation", + "API reference", + ] + + for query in test_queries: + test_search(db, query) + + print("\n" + "=" * 50) + print("Database generation complete!") + print(f"Database location: {DB_PATH}") + + +if __name__ == "__main__": + main() diff --git a/docs/scripts/generate_sqlite.py b/docs/scripts/generate_sqlite.py new file mode 100644 index 00000000..8aac1932 --- /dev/null +++ b/docs/scripts/generate_sqlite.py @@ -0,0 +1,278 @@ +""" +SQLite database generator for CUA documentation +Creates a full-text search enabled SQLite database from crawled data +""" + +import json +import re +import sqlite3 +from pathlib import Path + +from markdown_it import MarkdownIt + +# Configuration +CRAWLED_DATA_DIR = Path(__file__).parent.parent / "crawled_data" +SQLITE_PATH = Path(__file__).parent.parent / "docs_db" / "docs.sqlite" + + +def clean_markdown(markdown: str) -> str: + """ + Extract plain text content from markdown using a proper markdown parser. + + This function uses markdown-it-py to parse the markdown into a token tree + and then extracts only the text content, removing: + - Markdown formatting (bold, italic, headers, etc.) + - Links (keeping only the link text) + - Images (alt text is discarded) + - HTML tags + - Code block language identifiers + + Args: + markdown: Raw markdown content + + Returns: + Plain text content suitable for full-text search + """ + md_parser = MarkdownIt() + tokens = md_parser.parse(markdown) + + text_parts = [] + + def extract_text(token_list): + """Recursively extract text from token tree""" + for token in token_list: + if token.type == "inline" and token.children: + # Process inline content (text, links, formatting, etc.) 
+ for child in token.children: + if child.type == "text": + text_parts.append(child.content) + elif child.type == "code_inline": + text_parts.append(child.content) + elif child.type == "softbreak": + text_parts.append(" ") + elif child.type == "hardbreak": + text_parts.append("\n") + # Skip link markup, images, and formatting tokens + # (link_open, link_close, image, strong_open, strong_close, em_open, em_close, etc.) + elif token.type == "fence" or token.type == "code_block": + # Include code content and add newline after + text_parts.append(token.content) + text_parts.append("\n") + elif token.type == "html_block" or token.type == "html_inline": + # Skip HTML blocks and inline HTML + pass + + # Recursively process nested children + if token.children: + extract_text(token.children) + + # Add spacing after block elements + if token.type in [ + "heading_close", + "paragraph_close", + "list_item_close", + "blockquote_close", + ]: + text_parts.append("\n") + + extract_text(tokens) + + # Join and clean up whitespace + text = "".join(text_parts) + # Normalize multiple newlines to at most double newlines + text = re.sub(r"\n{3,}", "\n\n", text) + # Normalize multiple spaces to single space within lines + text = re.sub(r" {2,}", " ", text) + + return text.strip() + + +def load_crawled_data() -> list[dict]: + """Load all crawled page data""" + all_pages_file = CRAWLED_DATA_DIR / "_all_pages.json" + + if all_pages_file.exists(): + with open(all_pages_file, "r", encoding="utf-8") as f: + return json.load(f) + + pages = [] + for json_file in CRAWLED_DATA_DIR.glob("*.json"): + if json_file.name.startswith("_"): + continue + with open(json_file, "r", encoding="utf-8") as f: + pages.append(json.load(f)) + + return pages + + +def create_database(pages: list[dict]): + """Create SQLite database with FTS5 full-text search""" + # Ensure parent directory exists + SQLITE_PATH.parent.mkdir(parents=True, exist_ok=True) + + # Remove existing database + if SQLITE_PATH.exists(): + 
SQLITE_PATH.unlink() + + conn = sqlite3.connect(SQLITE_PATH) + cursor = conn.cursor() + + # Create main pages table + cursor.execute( + """ + CREATE TABLE pages ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + url TEXT UNIQUE NOT NULL, + title TEXT, + description TEXT, + category TEXT, + subcategory TEXT, + page_name TEXT, + content TEXT, + raw_markdown TEXT + ) + """ + ) + + # Create FTS5 virtual table for full-text search + cursor.execute( + """ + CREATE VIRTUAL TABLE pages_fts USING fts5( + content, + url UNINDEXED, + title UNINDEXED, + category UNINDEXED, + content='pages', + content_rowid='id' + ) + """ + ) + + # Create triggers to keep FTS in sync + cursor.execute( + """ + CREATE TRIGGER pages_ai AFTER INSERT ON pages BEGIN + INSERT INTO pages_fts(rowid, content, url, title, category) + VALUES (new.id, new.content, new.url, new.title, new.category); + END; + """ + ) + + cursor.execute( + """ + CREATE TRIGGER pages_ad AFTER DELETE ON pages BEGIN + DELETE FROM pages_fts WHERE rowid = old.id; + END; + """ + ) + + cursor.execute( + """ + CREATE TRIGGER pages_au AFTER UPDATE ON pages BEGIN + DELETE FROM pages_fts WHERE rowid = old.id; + INSERT INTO pages_fts(rowid, content, url, title, category) + VALUES (new.id, new.content, new.url, new.title, new.category); + END; + """ + ) + + # Insert pages + for page in pages: + markdown = page.get("markdown", "") + if not markdown: + continue + + content = clean_markdown(markdown) + if not content or len(content) < 50: + continue + + path_info = page.get("path_info", {}) + + cursor.execute( + """ + INSERT OR REPLACE INTO pages + (url, title, description, category, subcategory, page_name, content, raw_markdown) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
+ """, + ( + page.get("url", ""), + page.get("title") or path_info.get("page", "") or "Untitled", + page.get("description", ""), + path_info.get("category", "unknown"), + path_info.get("subcategory"), + path_info.get("page", ""), + content, + markdown, + ), + ) + + conn.commit() + + # Get stats + cursor.execute("SELECT COUNT(*) FROM pages") + page_count = cursor.fetchone()[0] + + cursor.execute("SELECT category, COUNT(*) FROM pages GROUP BY category") + categories = cursor.fetchall() + + conn.close() + + print(f"SQLite database created at: {SQLITE_PATH}") + print(f"Total pages: {page_count}") + print("Pages by category:") + for cat, count in categories: + print(f" - {cat}: {count}") + + +def test_search(query: str): + """Test full-text search""" + conn = sqlite3.connect(SQLITE_PATH) + cursor = conn.cursor() + + print(f"\nFTS5 search for: '{query}'") + print("-" * 50) + + cursor.execute( + """ + SELECT url, title, snippet(pages_fts, 0, '>>>', '<<<', '...', 50) as snippet + FROM pages_fts + WHERE pages_fts MATCH ? + ORDER BY rank + LIMIT 5 + """, + (query,), + ) + + results = cursor.fetchall() + for url, title, snippet in results: + print(f"\n{title}") + print(f" URL: {url}") + print(f" Snippet: {snippet}") + + conn.close() + + +def main(): + print("Loading crawled data...") + pages = load_crawled_data() + print(f"Loaded {len(pages)} pages") + + if not pages: + print("No crawled data found. 
Run crawl_docs.py first.") + return + + print("\nCreating SQLite database...") + create_database(pages) + + # Test searches + print("\n" + "=" * 50) + print("Testing FTS5 search") + print("=" * 50) + + test_search("install") + test_search("computer use agent") + test_search("benchmark") + + +if __name__ == "__main__": + main() diff --git a/docs/scripts/modal_app.py b/docs/scripts/modal_app.py index 52457e1b..cb3499d1 100644 --- a/docs/scripts/modal_app.py +++ b/docs/scripts/modal_app.py @@ -1,12 +1,9 @@ """ -Modal app for CUA documentation crawling and database generation +Modal app for CUA documentation crawling and MCP server This app provides: 1. Scheduled daily crawling of cua.ai/docs stored in a Modal volume -2. Database generation (LanceDB vectors + SQLite FTS) for the MCP server - -The MCP server that queries these databases runs as a separate containerized -service (see docs/scripts/docs-mcp-server/). +2. MCP server that serves documentation search over the crawled data Usage: modal deploy docs/scripts/modal_app.py @@ -17,6 +14,7 @@ import json import re import sqlite3 from pathlib import Path +from typing import Optional import modal from markdown_it import MarkdownIt @@ -41,6 +39,8 @@ image = ( "lancedb>=0.4.0", "sentence-transformers>=2.2.0", "pyarrow>=14.0.1", + "fastapi>=0.100.0", + "fastmcp>=2.14.0", "pydantic>=2.0.0", "pandas>=2.0.0", "markdown-it-py>=3.0.0", @@ -1492,6 +1492,384 @@ async def scheduled_code_index(): } +# ============================================================================= +# MCP Server +# ============================================================================= + + +@app.function( + image=image, + volumes={VOLUME_PATH: docs_volume, CODE_VOLUME_PATH: code_volume}, + cpu=1.0, + memory=2048, + keep_warm=1, # Keep one container warm to avoid cold start latency +) +@modal.concurrent(max_inputs=10) +@modal.asgi_app(custom_domains=["docs-mcp.cua.ai"]) +def web(): + """ASGI web endpoint for the MCP server""" + import lancedb + 
from fastmcp import FastMCP + from lancedb.embeddings import get_registry + from starlette.middleware.cors import CORSMiddleware + + # Initialize the MCP server + mcp = FastMCP( + name="CUA Docs & Code", + instructions="""CUA Documentation and Code Server - provides direct read-only query access to Computer Use Agent (CUA) documentation and versioned source code. + +=== AVAILABLE TOOLS === + +Documentation: +- query_docs_db: Execute SQL queries against the documentation SQLite database +- query_docs_vectors: Execute vector similarity searches against the documentation LanceDB + +Code: +- query_code_db: Execute SQL queries against the code search SQLite database +- query_code_vectors: Execute vector similarity searches against the code LanceDB + +All tools are READ-ONLY. Only SELECT queries are allowed for SQL databases. + +=== DOCUMENTATION DATABASE === + +The documentation database contains crawled pages from cua.ai/docs covering: +- CUA SDK: Python library for building computer-use agents +- CUA Bench: Benchmarking framework for evaluating computer-use agents +- Agent Loop: Core execution loop for autonomous agent operation +- Sandboxes: Docker and cloud VM environments for safe agent execution +- Computer interfaces: Screen, mouse, keyboard, and bash interaction APIs + +=== CODE DATABASE === + +The code database contains versioned source code indexed across all git tags. +Components include: agent, computer, mcp-server, som, etc. + +=== WORKFLOW EXAMPLES === + +1. Find documentation about a topic: + - Use query_docs_vectors with a natural language query for semantic search + - Use query_docs_db with FTS5 MATCH for keyword search + +2. Explore code across versions: + - List components: SELECT component, COUNT(DISTINCT version) FROM code_files GROUP BY component + - Search code: Use query_code_db with FTS5 on code_files_fts + - Get file content: SELECT content FROM code_files WHERE component='agent' AND version='0.7.3' AND file_path='...' + +3. 
Semantic code search: + - Use query_code_vectors with natural language queries like "screenshot capture implementation" + +IMPORTANT: Always cite sources - URLs for docs, component@version:path for code.""", + ) + + # Initialize embedding model - load eagerly to avoid cold start on first search + print("Initializing embedding model...") + model = get_registry().get("sentence-transformers").create(name="all-MiniLM-L6-v2") + + # Eagerly initialize database connections at startup to reduce first-request latency + print("Initializing database connections...") + + # Docs LanceDB + _docs_lance_db = None + _docs_lance_table = None + db_path = Path(DB_PATH) + if db_path.exists(): + try: + _docs_lance_db = lancedb.connect(db_path) + _docs_lance_table = _docs_lance_db.open_table("docs") + print(f" Docs LanceDB loaded from {db_path}") + except Exception as e: + print(f" Warning: Could not load docs LanceDB: {e}") + + # Docs SQLite + _docs_sqlite_conn = None + sqlite_path = Path(DB_PATH) / "docs.sqlite" + if sqlite_path.exists(): + try: + _docs_sqlite_conn = sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True) + _docs_sqlite_conn.row_factory = sqlite3.Row + print(f" Docs SQLite loaded from {sqlite_path}") + except Exception as e: + print(f" Warning: Could not load docs SQLite: {e}") + + # Code LanceDB + _code_lance_db = None + _code_lance_table = None + code_lance_path = Path(CODE_DB_PATH) / "code_index.lancedb" + if code_lance_path.exists(): + try: + _code_lance_db = lancedb.connect(code_lance_path) + _code_lance_table = _code_lance_db.open_table("code") + print(f" Code LanceDB loaded from {code_lance_path}") + except Exception as e: + print(f" Warning: Could not load code LanceDB: {e}") + + # Code SQLite + _code_sqlite_conn = None + code_sqlite_path = Path(CODE_DB_PATH) / "code_index.sqlite" + if code_sqlite_path.exists(): + try: + _code_sqlite_conn = sqlite3.connect(f"file:{code_sqlite_path}?mode=ro", uri=True) + _code_sqlite_conn.row_factory = sqlite3.Row + print(f" 
Code SQLite loaded from {code_sqlite_path}") + except Exception as e: + print(f" Warning: Could not load code SQLite: {e}") + + print("Database initialization complete.") + + def get_lance_table(): + """Get LanceDB connection for docs (eagerly loaded)""" + if _docs_lance_table is None: + raise RuntimeError("Database not found. Run crawl and generation functions first.") + return _docs_lance_table + + def get_sqlite_conn(): + """Get read-only SQLite connection for docs (eagerly loaded)""" + if _docs_sqlite_conn is None: + raise RuntimeError("SQLite database not found.") + return _docs_sqlite_conn + + def get_code_lance_table(): + """Get LanceDB connection for the aggregated code database (eagerly loaded).""" + if _code_lance_table is None: + raise RuntimeError( + "Code LanceDB not found. Run generate_code_index_parallel and aggregate_code_databases first." + ) + return _code_lance_table + + def get_code_sqlite_conn(): + """Get read-only SQLite connection for the aggregated code database (eagerly loaded).""" + if _code_sqlite_conn is None: + raise RuntimeError( + "Code SQLite database not found. Run generate_code_index_parallel and aggregate_code_databases first." + ) + return _code_sqlite_conn + + # =================== DOCUMENTATION QUERY TOOLS (READ-ONLY) =================== + + @mcp.tool() + def query_docs_db(sql: str) -> list[dict]: + """ + Execute a SQL query against the documentation database. + The database is READ-ONLY. + + Database Schema: + + Table: pages + - id INTEGER PRIMARY KEY AUTOINCREMENT + - url TEXT NOT NULL UNIQUE -- Full URL of the documentation page + - title TEXT NOT NULL -- Page title + - category TEXT NOT NULL -- Category (e.g., 'cua', 'cuabench', 'llms.txt') + - content TEXT NOT NULL -- Plain text content (markdown stripped) + + Virtual Table: pages_fts (FTS5 full-text search) + - content TEXT -- Full-text indexed content + - url TEXT UNINDEXED + - title TEXT UNINDEXED + - category TEXT UNINDEXED + + Example queries: + + 1. 
List all pages: SELECT url, title, category FROM pages ORDER BY category, title + + 2. Full-text search with snippets: + SELECT p.url, p.title, snippet(pages_fts, 0, '>>>', '<<<', '...', 64) as snippet + FROM pages_fts JOIN pages p ON pages_fts.rowid = p.id + WHERE pages_fts MATCH 'agent loop' ORDER BY rank LIMIT 10 + + 3. Get page content: SELECT url, title, content FROM pages WHERE url LIKE '%quickstart%' + + Args: + sql: SQL query to execute + + Returns: + List of dictionaries, one per row, with column names as keys + """ + conn = get_sqlite_conn() + cursor = conn.cursor() + cursor.execute(sql) + return [dict(row) for row in cursor.fetchall()] + + @mcp.tool() + def query_docs_vectors( + query: str, + limit: int = 10, + where: Optional[str] = None, + select: Optional[list[str]] = None, + ) -> list[dict]: + """ + Execute a vector similarity search against the documentation LanceDB (read-only). + + Schema: + - text TEXT -- The document chunk text + - vector VECTOR -- Embedding vector (all-MiniLM-L6-v2, 384 dimensions) + - url TEXT -- Source URL + - title TEXT -- Document title + - category TEXT -- Category (e.g., 'cua', 'cuabench') + - chunk_index INT -- Index of chunk within document + + Args: + query: Natural language query to embed and search for + limit: Maximum number of results (default: 10, max: 100) + where: Optional SQL-like filter (e.g., "category = 'cua'") + select: Optional list of columns to return (default: all except vector) + + Returns: + List of matching documents with similarity scores (_distance field) + """ + limit = min(max(1, limit), 100) + table = get_lance_table() + + search = table.search(query).limit(limit) + + if where: + search = search.where(where) + if select: + search = search.select(select) + + results = search.to_list() + + formatted = [] + for r in results: + result = {} + for key, value in r.items(): + if key == "vector": + continue + result[key] = value + formatted.append(result) + + return formatted + + # =================== 
CODE QUERY TOOLS (READ-ONLY) =================== + + @mcp.tool() + def query_code_db(sql: str) -> list[dict]: + """ + Execute a SQL query against the code search database. + The database is READ-ONLY. + + Database Schema: + + Table: code_files + - id INTEGER PRIMARY KEY AUTOINCREMENT + - component TEXT NOT NULL -- Component name (e.g., "agent", "computer") + - version TEXT NOT NULL -- Version string (e.g., "0.7.3") + - file_path TEXT NOT NULL -- Path to file + - content TEXT NOT NULL -- Full source code content + - language TEXT NOT NULL -- Programming language + - UNIQUE(component, version, file_path) + + Virtual Table: code_files_fts (FTS5 full-text search) + - content TEXT -- Full-text indexed content + - component TEXT UNINDEXED + - version TEXT UNINDEXED + - file_path TEXT UNINDEXED + + Example queries: + + 1. List components: SELECT component, COUNT(DISTINCT version) as version_count + FROM code_files GROUP BY component ORDER BY component + + 2. List versions: SELECT DISTINCT version FROM code_files + WHERE component = 'agent' ORDER BY version DESC + + 3. Full-text search: + SELECT f.component, f.version, f.file_path, + snippet(code_files_fts, 0, '>>>', '<<<', '...', 64) as snippet + FROM code_files_fts JOIN code_files f ON code_files_fts.rowid = f.id + WHERE code_files_fts MATCH 'ComputerAgent' ORDER BY rank LIMIT 10 + + 4. 
Get file content: SELECT content, language FROM code_files + WHERE component = 'agent' AND version = '0.7.3' AND file_path = 'agent/core.py' + + Args: + sql: SQL query to execute + + Returns: + List of dictionaries, one per row, with column names as keys + """ + conn = get_code_sqlite_conn() + cursor = conn.cursor() + cursor.execute(sql) + return [dict(row) for row in cursor.fetchall()] + + @mcp.tool() + def query_code_vectors( + query: str, + limit: int = 10, + where: Optional[str] = None, + select: Optional[list[str]] = None, + component: Optional[str] = None, + ) -> list[dict]: + """ + Execute a vector similarity search against the code LanceDB (read-only). + + Schema: + - text TEXT -- The source code content + - vector VECTOR -- Embedding vector (all-MiniLM-L6-v2, 384 dimensions) + - component TEXT -- Component name (e.g., "agent", "computer") + - version TEXT -- Version string (e.g., "0.7.3") + - file_path TEXT -- Path to file within the component + - language TEXT -- Programming language + + Args: + query: Natural language query to embed and search for + limit: Maximum number of results (default: 10, max: 100) + where: Optional SQL-like filter (e.g., "version = '0.7.3'") + select: Optional list of columns to return (default: all except vector) + component: Optional component to filter by (if not specified, searches all) + + Returns: + List of matching code files with similarity scores (_distance field) + """ + limit = min(max(1, limit), 100) + table = get_code_lance_table() + + search = table.search(query).limit(limit) + + # Build where clause, adding component filter if specified + where_clauses = [] + if component: + where_clauses.append(f"component = '{component}'") + if where: + where_clauses.append(where) + + if where_clauses: + search = search.where(" AND ".join(where_clauses)) + if select: + search = search.select(select) + + results = search.to_list() + + formatted = [] + for r in results: + result = {} + for key, value in r.items(): + if key == 
"vector": + continue + result[key] = value + formatted.append(result) + + return formatted + + # Create SSE app directly - endpoints at /sse (GET) and /messages (POST) + from starlette.middleware import Middleware + + mcp_app = mcp.http_app( + transport="sse", + middleware=[ + Middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + ], + ) + + return mcp_app + + # ============================================================================= # Local testing functions # ============================================================================= diff --git a/docs/src/components/custom-header.tsx b/docs/src/components/custom-header.tsx index 188f8069..27e8e6f9 100644 --- a/docs/src/components/custom-header.tsx +++ b/docs/src/components/custom-header.tsx @@ -7,7 +7,8 @@ import { usePathname } from 'next/navigation'; import { cn } from 'fumadocs-ui/utils/cn'; import { SearchToggle } from 'fumadocs-ui/components/layout/search-toggle'; import { ThemeToggle } from 'fumadocs-ui/components/layout/theme-toggle'; -import { ChevronsUpDown, Check } from 'lucide-react'; +import { ChevronsUpDown, Check, Menu } from 'lucide-react'; +import { useSidebar } from 'fumadocs-ui/provider'; import LogoBlack from '@/assets/cuala-icon-black.svg'; import LogoWhite from '@/assets/cuala-icon-white.svg'; import CuaBenchLogoBlack from '@/assets/cuabench-logo-black.svg'; @@ -68,32 +69,6 @@ const docsSites = [ }, ], }, - { - name: 'CuaBot', - label: 'Docs', - href: '/cuabot/cuabot', - prefix: '/cuabot', - isDefault: false, - description: 'Co-op computer-use for any agent', - logoBlack: LogoBlack, - logoWhite: LogoWhite, - iconWidth: 24, - iconHeight: 24, - dropdownIconWidth: 20, - dropdownIconHeight: 20, - navTabs: [ - { - name: 'Overview', - href: '/cuabot/cuabot', - prefix: '/cuabot/cuabot', - }, - { - name: 'Install', - href: '/cuabot/install', - prefix: '/cuabot/install', - }, - ], - }, { name: 'Lume', label: 'Docs', @@ 
-125,12 +100,39 @@ const docsSites = [ }, ], }, + { + name: 'Cua-Bot', + label: 'Docs', + href: '/cuabot/guide/getting-started/introduction', + prefix: '/cuabot', + isDefault: false, + description: 'Co-op computer-use for any agent', + logoBlack: LogoBlack, + logoWhite: LogoWhite, + iconWidth: 24, + iconHeight: 24, + dropdownIconWidth: 20, + dropdownIconHeight: 20, + navTabs: [ + { + name: 'Guide', + href: '/cuabot/guide/getting-started/introduction', + prefix: '/cuabot/guide', + }, + { + name: 'Reference', + href: '/cuabot/reference', + prefix: '/cuabot/reference', + }, + ], + }, ]; export function CustomHeader() { const pathname = usePathname(); const [isOpen, setIsOpen] = useState(false); const dropdownRef = useRef(null); + const { open: sidebarOpen, setOpen: setSidebarOpen } = useSidebar(); // Determine current docs site based on pathname const currentSite = @@ -153,26 +155,32 @@ export function CustomHeader() {
{/* Left: Logo and Nav */} -
+
+ {/* Hamburger Menu Button - visible on mobile only, opens native fumadocs sidebar */} + + {/* Docs Switcher - wraps logo, name, and label */}