Merge branch 'main' into feat/cua-bench-submodules

Author: Dillon DuPont
Date: 2025-12-09 15:22:15 -05:00
177 changed files with 12246 additions and 3019 deletions

View File

@@ -1,4 +1,4 @@
name: Bump Version
name: Bump Version & Publish
on:
workflow_dispatch:
@@ -30,6 +30,9 @@ permissions:
jobs:
bump-version:
runs-on: ubuntu-latest
outputs:
agent_version: ${{ steps.agent_version.outputs.version }}
computer_version: ${{ steps.computer_version.outputs.version }}
steps:
- name: Set package directory
id: package
@@ -86,6 +89,46 @@ jobs:
cd ${{ steps.package.outputs.directory }}
bump2version ${{ inputs.bump_type }}
- name: Also bump cua-agent
if: ${{ inputs.service == 'cua-computer' }}
run: |
cd libs/python/agent
bump2version ${{ inputs.bump_type }}
- name: Capture bumped agent version
if: ${{ inputs.service == 'cua-agent' || inputs.service == 'cua-computer' }}
id: agent_version
run: |
cd libs/python/agent
VERSION=$(python -c "import tomllib; from pathlib import Path; data = tomllib.loads(Path('pyproject.toml').read_text()); print(data['project']['version'])")
echo "Agent version: $VERSION"
echo "version=$VERSION" >> "$GITHUB_OUTPUT"
- name: Capture bumped computer version
if: ${{ inputs.service == 'cua-computer' }}
id: computer_version
run: |
cd libs/python/computer
VERSION=$(python -c "import tomllib; from pathlib import Path; data = tomllib.loads(Path('pyproject.toml').read_text()); print(data['project']['version'])")
echo "Computer version: $VERSION"
echo "version=$VERSION" >> "$GITHUB_OUTPUT"
- name: Push changes
run: |
git push origin main --follow-tags
publish-computer:
needs: bump-version
if: ${{ inputs.service == 'cua-computer' }}
uses: ./.github/workflows/pypi-publish-computer.yml
with:
version: ${{ needs.bump-version.outputs.computer_version }}
secrets: inherit
publish-agent:
needs: [bump-version, publish-computer]
if: ${{ always() && (inputs.service == 'cua-agent' || inputs.service == 'cua-computer') && needs.bump-version.result == 'success' && (inputs.service == 'cua-agent' || needs.publish-computer.result == 'success') }}
uses: ./.github/workflows/pypi-publish-agent.yml
with:
version: ${{ needs.bump-version.outputs.agent_version }}
secrets: inherit

View File

@@ -3,7 +3,13 @@ on:
push:
branches:
- "main"
pull_request: {}
paths:
- "libs/lume/**"
- ".github/workflows/ci-lume.yml"
pull_request:
paths:
- "libs/lume/**"
- ".github/workflows/ci-lume.yml"
concurrency:
group: lume-${{ github.workflow }}-${{ github.ref }}

.github/workflows/link-check.yml (new file, 74 lines)
View File

@@ -0,0 +1,74 @@
name: Link Checker
on:
pull_request_target:
branches: [main, master]
push:
branches:
- main
workflow_dispatch:
jobs:
link-check:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Run Lychee link checker
uses: lycheeverse/lychee-action@v2
id: lychee
with:
# Check all markdown files
args: --verbose --no-progress --max-cache-age 1d --accept 200..=299,403 --exclude '^file://' --exclude 'localhost' --exclude '127\.0\.0\.1' '**/*.md'
# Output results to file for parsing
output: lychee-output.md
# Don't fail the build on broken links (warning mode)
fail: false
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Parse link check results
id: parse-results
if: always()
run: |
# Use lychee exit code: 0 = success, >0 = errors found
EXIT_CODE="${{ steps.lychee.outputs.exit_code }}"
echo "Exit code: $EXIT_CODE"
# Show summary if output file exists
if [ -f "lychee-output.md" ]; then
echo "=== Link Check Summary ==="
cat lychee-output.md
fi
# Set status based on exit code
if [ "$EXIT_CODE" = "0" ]; then
echo "STATUS_ICON=✅" >> $GITHUB_ENV
echo "STATUS_TEXT=All links are working" >> $GITHUB_ENV
echo "COLOR=#36a64f" >> $GITHUB_ENV
elif [ "$EXIT_CODE" = "2" ]; then
echo "STATUS_ICON=❌" >> $GITHUB_ENV
echo "STATUS_TEXT=Link checker failed to run" >> $GITHUB_ENV
echo "COLOR=#dc3545" >> $GITHUB_ENV
else
echo "STATUS_ICON=⚠️" >> $GITHUB_ENV
echo "STATUS_TEXT=Found broken links" >> $GITHUB_ENV
echo "COLOR=#ffa500" >> $GITHUB_ENV
fi
- name: Send results to Slack
if: always() && github.ref == 'refs/heads/main'
uses: rtCamp/action-slack-notify@v2
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
SLACK_CHANNEL: ${{ vars.SLACK_CHANNEL }}
SLACK_TITLE: "🔗 Link Check Results"
SLACK_COLOR: ${{ env.COLOR }}
SLACK_MESSAGE: |
*Status:* ${{ env.STATUS_ICON }} ${{ env.STATUS_TEXT }}
*Branch:* `${{ github.ref_name }}`
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}${{ github.event.pull_request.number && format('?pr={0}', github.event.pull_request.number) || '' }}|View broken links>

.github/workflows/npm-publish-cli.yml (new file, 212 lines)
View File

@@ -0,0 +1,212 @@
name: Publish @trycua/cli
on:
workflow_dispatch:
inputs:
version:
description: "Version to publish (default: from package.json)"
required: false
default: ""
jobs:
build-and-publish:
permissions:
id-token: write
contents: write
packages: write
strategy:
matrix:
include:
- target: bun-linux-x64
ext: ""
binary_name: cua-linux-x64
- target: bun-darwin-x64
ext: ""
binary_name: cua-darwin-x64
- target: bun-darwin-arm64
ext: ""
binary_name: cua-darwin-arm64
- target: bun-windows-x64
ext: ".exe"
binary_name: cua-windows-x64
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Setup Bun
uses: oven-sh/setup-bun@v2
with:
bun-version: latest
- name: Get version
id: version
run: |
if [ -n "${{ github.event.inputs.version }}" ]; then
echo "version=${{ github.event.inputs.version }}" >> $GITHUB_OUTPUT
else
VERSION=$(bun -p "require('./libs/typescript/cua-cli/package.json').version")
echo "version=${VERSION}" >> $GITHUB_OUTPUT
fi
- name: Install dependencies
working-directory: ./libs/typescript/cua-cli
run: bun install --frozen-lockfile
- name: Build binary
working-directory: ./libs/typescript/cua-cli
run: |
bun build --compile --minify --sourcemap --target=${{ matrix.target }} index.ts --outfile ${{ matrix.binary_name }}${{ matrix.ext }}
mkdir -p ../../../dist
mv ${{ matrix.binary_name }}${{ matrix.ext }}* ../../../dist/
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: cua-binary-${{ matrix.target }}
path: dist/
if-no-files-found: error
retention-days: 1
publish-npm:
needs: build-and-publish
if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/cua-v')
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Setup Bun
uses: oven-sh/setup-bun@v2
with:
bun-version: latest
- name: Install dependencies
working-directory: ./libs/typescript/cua-cli
run: bun install --frozen-lockfile
- name: Publish to npm
working-directory: ./libs/typescript/cua-cli
env:
NPM_CONFIG_TOKEN: ${{ secrets.NPM_TOKEN }}
run: bun publish --production --access public --tolerate-republish
create-release:
needs: [build-and-publish, publish-npm]
runs-on: ubuntu-latest
permissions:
contents: write
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Setup Bun
uses: oven-sh/setup-bun@v2
with:
bun-version: latest
- name: Get version
id: version
run: |
VERSION=$(bun -p "require('./libs/typescript/cua-cli/package.json').version")
echo "version=${VERSION}" >> $GITHUB_OUTPUT
echo "tag=cua-v${VERSION}" >> $GITHUB_OUTPUT
- name: Download all artifacts
uses: actions/download-artifact@v4
with:
path: dist
merge-multiple: true
- name: Create Release
id: create_release
uses: actions/create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ steps.version.outputs.tag }}
release_name: cua-cli v${{ steps.version.outputs.version }}
body: |
# cua-cli v${{ steps.version.outputs.version }}
## Installation
### Using install script (recommended)
```bash
# For Linux/macOS
curl -fsSL https://cua.ai/cli/install.sh | sh
# For Windows (PowerShell)
irm https://cua.ai/cli/install.ps1 | iex
```
### Using npm/bun
```bash
# Using bun
bun add -g @trycua/cli
# Or using npm
npm install -g @trycua/cli
```
### From source
```bash
git clone -b ${{ steps.version.outputs.tag }} https://github.com/trycua/cua.git
cd cua/libs/typescript/cua-cli
bun install
bun link
bun link cua-cli
```
## Release Assets
- `cua-darwin-arm64`: macOS (Apple Silicon)
- `cua-darwin-x64`: macOS (Intel)
- `cua-linux-x64`: Linux (x86_64)
- `cua-windows-x64.exe`: Windows (x86_64)
draft: false
prerelease: false
- name: Upload Linux Binary
uses: actions/upload-release-asset@v1
with:
upload_url: ${{ steps.create_release.outputs.upload_url }}
asset_path: ./dist/cua-linux-x64
asset_name: cua-linux-x64
asset_content_type: application/octet-stream
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Upload macOS Intel Binary
uses: actions/upload-release-asset@v1
with:
upload_url: ${{ steps.create_release.outputs.upload_url }}
asset_path: ./dist/cua-darwin-x64
asset_name: cua-darwin-x64
asset_content_type: application/octet-stream
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Upload macOS Apple Silicon Binary
uses: actions/upload-release-asset@v1
with:
upload_url: ${{ steps.create_release.outputs.upload_url }}
asset_path: ./dist/cua-darwin-arm64
asset_name: cua-darwin-arm64
asset_content_type: application/octet-stream
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Upload Windows Binary
uses: actions/upload-release-asset@v1
with:
upload_url: ${{ steps.create_release.outputs.upload_url }}
asset_path: ./dist/cua-windows-x64.exe
asset_name: cua-windows-x64.exe
asset_content_type: application/octet-stream
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -31,26 +31,39 @@ jobs:
core_version: ${{ steps.update-deps.outputs.core_version }}
steps:
- uses: actions/checkout@v4
with:
ref: main
fetch-depth: 0
- name: Ensure latest main branch
run: |
git fetch origin main
git reset --hard origin/main
echo "Current HEAD commit:"
git log -1 --oneline
- name: Determine version
id: get-version
run: |
if [ "${{ github.event_name }}" == "push" ]; then
# Check inputs.version first (works for workflow_call regardless of event_name)
if [ -n "${{ inputs.version }}" ]; then
VERSION=${{ inputs.version }}
elif [ "${{ github.event_name }}" == "push" ]; then
# Extract version from tag (for package-specific tags)
if [[ "${{ github.ref }}" =~ ^refs/tags/agent-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then
VERSION=${BASH_REMATCH[1]}
else
echo "Invalid tag format for agent"
echo "ERROR: Invalid tag format for agent"
exit 1
fi
elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
# Use version from workflow dispatch
elif [ -n "${{ github.event.inputs.version }}" ]; then
VERSION=${{ github.event.inputs.version }}
else
# Use version from workflow_call
VERSION=${{ inputs.version }}
echo "ERROR: No version found (inputs.version, event.inputs.version, and tag all empty)"
exit 1
fi
echo "VERSION=$VERSION"
echo "Agent version: $VERSION"
echo "version=$VERSION" >> $GITHUB_OUTPUT
- name: Set up Python

View File

@@ -33,21 +33,39 @@ jobs:
- name: Determine version
id: get-version
run: |
if [ "${{ github.event_name }}" == "push" ]; then
echo "=== Version Detection Debug ==="
echo "Event name: ${{ github.event_name }}"
echo "Workflow call version: ${{ inputs.version }}"
echo "Workflow dispatch version: ${{ github.event.inputs.version }}"
echo "GitHub ref: ${{ github.ref }}"
# Check inputs.version first (works for workflow_call regardless of event_name)
if [ -n "${{ inputs.version }}" ]; then
# Version provided via workflow_call or workflow_dispatch with version input
VERSION=${{ inputs.version }}
echo "Using inputs.version: $VERSION"
elif [ "${{ github.event_name }}" == "push" ]; then
# Extract version from tag (for package-specific tags)
if [[ "${{ github.ref }}" =~ ^refs/tags/computer-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then
VERSION=${BASH_REMATCH[1]}
echo "Extracted from tag: $VERSION"
else
echo "Invalid tag format for computer"
exit 1
fi
elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
# Use version from workflow dispatch
elif [ -n "${{ github.event.inputs.version }}" ]; then
# Use version from workflow_dispatch event inputs
VERSION=${{ github.event.inputs.version }}
echo "Using event.inputs.version: $VERSION"
else
# Use version from workflow_call
VERSION=${{ inputs.version }}
echo "ERROR: No version found!"
echo " - inputs.version is empty"
echo " - event.inputs.version is empty"
echo " - Not a tag push event"
exit 1
fi
echo "=== Final Version ==="
echo "VERSION=$VERSION"
echo "version=$VERSION" >> $GITHUB_OUTPUT

View File

@@ -47,8 +47,16 @@ jobs:
steps:
- uses: actions/checkout@v4
with:
ref: main
fetch-depth: 0 # Full history for release creation
- name: Ensure latest main branch
run: |
git fetch origin main
git reset --hard origin/main
echo "Current HEAD commit:"
git log -1 --oneline
- name: Set up Python
uses: actions/setup-python@v4
with:
@@ -78,7 +86,7 @@ jobs:
# Verify version matches using script (exits with error if mismatch)
python ${GITHUB_WORKSPACE}/.github/scripts/get_pyproject_version.py \
${{ inputs.package_dir }}/pyproject.toml \
${GITHUB_WORKSPACE}/${{ inputs.package_dir }}/pyproject.toml \
${{ inputs.version }}
- name: Initialize PDM in package directory

View File

@@ -49,8 +49,15 @@ jobs:
# Install the package in editable mode with dev dependencies
if [ -f pyproject.toml ]; then
uv pip install --system -e .
# Install test dependencies
uv pip install --system pytest pytest-asyncio pytest-mock pytest-cov
fi
shell: bash
- name: Install test dependencies
run: |
# Install test dependencies from root pyproject.toml if tests directory exists
# The root pyproject.toml has package=false, so we install just the dependency group
if [ -d "libs/python/${{ matrix.package }}/tests" ]; then
uv pip install --system --group test
fi
shell: bash

View File

@@ -4,8 +4,6 @@ name: Test CUA Supporting Models
# Run manually using workflow_dispatch with test_models=true
on:
pull_request_target:
branches: [main, master]
workflow_dispatch:
inputs:
test_models:
@@ -20,7 +18,7 @@ on:
jobs:
# Test all CUA models - runs on PRs, schedules, or when manually triggered
test-all-models:
if: ${{ github.event_name == 'pull_request_target' || github.event_name == 'schedule' || fromJSON(inputs.test_models || 'false') }}
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || fromJSON(inputs.test_models || 'false') }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
@@ -42,13 +40,13 @@ jobs:
- gemini-2.5-computer-use-preview-10-2025
# InternVL
- huggingface-local/OpenGVLab/InternVL3_5-1B
# - huggingface-local/OpenGVLab/InternVL3_5-1B
# - huggingface-local/OpenGVLab/InternVL3_5-2B
# - huggingface-local/OpenGVLab/InternVL3_5-4B
# - huggingface-local/OpenGVLab/InternVL3_5-8B
# UI-TARS (supports full computer-use, can run standalone)
- huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
# - huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
# Note: OpenCUA, GTA, and Holo are grounding-only models
# They only support predict_click(), not agent.run()
@@ -56,7 +54,7 @@ jobs:
# Moondream (typically used in composed agents)
# Format: moondream3+{any-llm-with-tools}
- moondream3+anthropic/claude-sonnet-4-5-20250929 # Claude has VLM + Tools
# - moondream3+anthropic/claude-sonnet-4-5-20250929 # Claude has VLM + Tools
# - moondream3+openai/gpt-4o # GPT-4o has VLM + Tools
# OmniParser (typically used in composed agents)
@@ -68,9 +66,9 @@ jobs:
# Format: {grounding-model}+{any-vlm-with-tools}
# These grounding-only models (OpenCUA, GTA, Holo) must be used in composed form
# since they only support predict_click(), not full agent.run()
- huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929
- huggingface-local/xlangai/OpenCUA-7B+anthropic/claude-sonnet-4-5-20250929
- huggingface-local/Hcompany/Holo1.5-3B+anthropic/claude-sonnet-4-5-20250929
# - huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929
# - huggingface-local/xlangai/OpenCUA-7B+anthropic/claude-sonnet-4-5-20250929
# - huggingface-local/Hcompany/Holo1.5-3B+anthropic/claude-sonnet-4-5-20250929
steps:
- name: Checkout repository
@@ -219,6 +217,7 @@ jobs:
path: |
tests/agent_loop_testing/test_images/
*.log
if-no-files-found: ignore
retention-days: 7
- name: Upload test summary data
@@ -228,6 +227,7 @@ jobs:
# Unique, slash-free artifact name per matrix entry
name: test-summary-${{ env.SAFE_MODEL_NAME }}
path: test_summary/
if-no-files-found: ignore
retention-days: 1
- name: Set default Slack color
@@ -248,7 +248,7 @@ jobs:
# Summary job that aggregates all model test results
test-summary:
if: ${{ always() && (github.event_name == 'pull_request_target' || github.event_name == 'schedule' || fromJSON(inputs.test_models || 'false')) }}
if: ${{ always() && (github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || fromJSON(inputs.test_models || 'false')) }}
needs: test-all-models
runs-on: ubuntu-latest
steps:

View File

@@ -29,4 +29,7 @@ venv/
*.db
*.sqlite
pnpm-lock.yaml
uv.lock
uv.lock
# Docs with complex JSX formatting
docs/content/docs/get-started/quickstart.mdx

View File

@@ -376,6 +376,61 @@ All packages are managed through a single consolidated workflow: [Bump Version](
5. Click "Run workflow" to start the version bump
6. The workflow will automatically commit changes and push to main
## Releasing a New CLI Version
To release a new version of the CUA CLI, follow these steps:
### 1. Update the Version
1. Update the version in `libs/typescript/cua-cli/package.json`
2. Commit the version change with a message like "Bump version to x.y.z"
3. Push the changes to the main branch
### 2. Trigger the Release Workflow
1. Go to the GitHub Actions tab in the repository
2. Select the "Publish @trycua/cli" workflow
3. Click "Run workflow"
4. Optionally, specify a version (e.g., "1.2.3") or leave empty to use the version from package.json
5. Click "Run workflow"
The workflow will:
- Build single-file executables for all supported platforms
- Publish the package to npm
- Create a GitHub release with the version tag (format: `cua-vX.Y.Z`)
- Attach all platform-specific binaries to the release
### 3. Verify the Release
1. Check the GitHub Releases page to ensure the new version is published
2. Verify the npm package was published to the registry
3. Test installation on different platforms:
```bash
# Test Linux/macOS installation
curl -fsSL https://cua.ai/install.sh | sh
# Test Windows installation (PowerShell)
irm https://cua.ai/install.ps1 | iex
```
### 4. Update Documentation
Update any relevant documentation with the new version number, including:
- Example code in documentation
- Any version-specific instructions
- Compatibility matrices
### 5. Announce the Release
- Create a new GitHub release with release notes
- Update the changelog if maintained separately
- Announce in relevant channels (Slack, Discord, etc.)
---
### Rolling Back a Version Bump
If you need to revert a version bump, follow these steps:

README.md (110 lines changed)
View File

@@ -6,15 +6,17 @@
</picture>
[![Python](https://img.shields.io/badge/Python-333333?logo=python&logoColor=white&labelColor=333333)](#)
[![Swift](https://img.shields.io/badge/Swift-F05138?logo=swift&logoColor=white)](#)
[![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
[![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
<br>
<a href="https://trendshift.io/repositories/13685" target="_blank"><img src="https://trendshift.io/api/badge/repositories/13685" alt="trycua%2Fcua | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</div>
**Cua** ("koo-ah") is Docker for [Computer-Use Agents](https://www.oneusefulthing.org/p/when-you-give-a-claude-a-mouse) - it enables AI agents to control full operating systems in virtual containers and deploy them locally or to the cloud.
**Cua** ("koo-ah") is an open-source framework for Computer-Use Agents - enabling AI systems to autonomously operate computers through visual understanding and action execution. Used for research, evaluation, and production deployment of desktop, browser, and mobile automation agents.
## What are Computer-Use Agents?
Computer-Use Agents (CUAs) are AI systems that can autonomously interact with computer interfaces through visual understanding and action execution. Unlike traditional automation tools that rely on brittle selectors or APIs, CUAs use vision-language models to perceive screen content and reason about interface interactions - enabling them to adapt to UI changes and handle complex, multi-step workflows across applications.
<div align="center">
<video src="https://github.com/user-attachments/assets/c619b4ea-bb8e-4382-860e-f3757e36af20" width="600" controls></video>
@@ -22,14 +24,14 @@
With the [Computer SDK](#computer-sdk), you can:
- automate Windows, Linux, and macOS VMs with a consistent, [pyautogui-like API](https://cua.ai/docs/docs/libraries/computer#interface-actions)
- create & manage VMs [locally](https://cua.ai/docs/docs/computer-sdk/computers#cua-local-containers) or using [Cua cloud](https://www.cua.ai/)
- automate Windows, Linux, and macOS VMs with a consistent, [pyautogui-like API](https://cua.ai/docs/computer-sdk/commands)
- create & manage VMs [locally](https://cua.ai/docs/quickstart-devs#using-computer) or using [Cua cloud](https://www.cua.ai/)
With the [Agent SDK](#agent-sdk), you can:
- run computer-use models with a [consistent schema](https://cua.ai/docs/docs/agent-sdk/message-format)
- benchmark on OSWorld-Verified, SheetBench-V2, and more [with a single line of code using HUD](https://cua.ai/docs/docs/agent-sdk/integrations/hud) ([Notebook](https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb))
- combine UI grounding models with any LLM using [composed agents](https://cua.ai/docs/docs/agent-sdk/supported-agents/composed-agents)
- run computer-use models with a [consistent schema](https://cua.ai/docs/agent-sdk/message-format)
- benchmark on OSWorld-Verified (369 tasks), SheetBench-V2, and ScreenSpot [with a single line of code using HUD](https://cua.ai/docs/agent-sdk/integrations/hud) - see [benchmark results](#research--benchmarks) ([Notebook](https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb))
- combine UI grounding models with any LLM using [composed agents](https://cua.ai/docs/agent-sdk/supported-agents/composed-agents)
- use new UI agent models and UI grounding models from the Model Zoo below with just a model string (e.g., `ComputerAgent(model="openai/computer-use-preview")`)
- use API or local inference by changing a prefix (e.g., `openai/`, `openrouter/`, `ollama/`, `huggingface-local/`, `mlx/`, [etc.](https://docs.litellm.ai/docs/providers))
@@ -96,8 +98,8 @@ Core utilities for Cua
# Quick Start
- [Clone a starter template and run the code in <1 min](https://github.com/trycua/agent-template)
- [Get started with the Cua SDKs](https://cua.ai/docs/docs/quickstart-devs)
- [Get started with the Cua CLI](https://cua.ai/docs/docs/quickstart-cli)
- [Get started with the Cua SDKs](https://cua.ai/docs/quickstart-devs)
- [Get started with the Cua CLI](https://cua.ai/docs/quickstart-cli)
# Agent SDK
@@ -115,7 +117,7 @@ from agent import ComputerAgent
# ComputerAgent works with any computer initialized with the Computer SDK
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
tools=[computer],
max_trajectory_budget=5.0
)
@@ -194,12 +196,12 @@ Cua uses the OpenAI Agent response format.
These are the valid model configurations for `ComputerAgent(model="...")`:
| Configuration | Description |
| ---------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- |
| `{computer-use-model}` | A single model to perform all computer-use tasks |
| `{grounding-model}+{any-vlm-with-tools}` | [Composed](https://cua.ai/docs/docs/agent-sdk/supported-agents/composed-agents) with VLM for captioning and grounding LLM for element detection |
| `moondream3+{any-llm-with-tools}` | [Composed](https://cua.ai/docs/docs/agent-sdk/supported-agents/composed-agents) with Moondream3 for captioning and UI element detection |
| `human/human` | A [human-in-the-loop](https://cua.ai/docs/docs/agent-sdk/supported-agents/human-in-the-loop) in place of a model |
| Configuration | Description |
| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `{computer-use-model}` | A single model to perform all computer-use tasks |
| `{grounding-model}+{any-vlm-with-tools}` | [Composed](https://cua.ai/docs/agent-sdk/supported-agents/composed-agents) with VLM for captioning and grounding LLM for element detection |
| `moondream3+{any-llm-with-tools}` | [Composed](https://cua.ai/docs/agent-sdk/supported-agents/composed-agents) with Moondream3 for captioning and UI element detection |
| `human/human` | A [human-in-the-loop](https://cua.ai/docs/agent-sdk/supported-agents/human-in-the-loop) in place of a model |
### Model Capabilities
@@ -209,16 +211,46 @@ The following table shows which capabilities are supported by each model:
| -------------------------------------------------------------------------------------------------------------------------------- | :----------: | :-------: | :---: | :-: |
| [Claude Sonnet/Haiku](https://docs.claude.com/en/docs/agents-and-tools/tool-use/computer-use-tool#how-to-implement-computer-use) | 🖥️ | 🎯 | 🛠️ | 👁️ |
| [OpenAI CU Preview](https://platform.openai.com/docs/models/computer-use-preview) | 🖥️ | 🎯 | | 👁️ |
| [Qwen3 VL](https://huggingface.co/collections/Qwen/qwen3-vl) | 🖥️ | 🎯 | 🛠️ | 👁️ |
| [GLM-V](https://huggingface.co/THUDM/glm-4v-9b) | 🖥️ | 🎯 | 🛠️ | 👁️ |
| [Gemini CU Preview](https://ai.google.dev/gemini-api/docs/computer-use) | 🖥️ | 🎯 | | 👁️ |
| [InternVL](https://huggingface.co/OpenGVLab/InternVL3_5-1B) | 🖥️ | 🎯 | 🛠️ | 👁️ |
| [UI-TARS](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B) | 🖥️ | 🎯 | 🛠️ | 👁️ |
| [UI-TARS-2](https://cua.ai/dashboard/vlm-router) | 🖥️ | 🎯 | 🛠️ | 👁️ |
| [OpenCUA](https://huggingface.co/xlangai/OpenCUA-7B) | | 🎯 | | |
| [GTA](https://huggingface.co/HelloKKMe/GTA1-7B) | | 🎯 | | |
| [Holo](https://huggingface.co/Hcompany/Holo1.5-3B) | | 🎯 | | |
| [Moondream](https://huggingface.co/moondream/moondream3-preview) | | 🎯 | | |
| [OmniParser](https://github.com/microsoft/OmniParser) | | 🎯 | | |
**Legend:**
- 🖥️ **Computer-Use**: Full agentic loop with planning and execution
- 🎯 **Grounding**: UI element detection and click coordinate prediction
- 🛠️ **Tools**: Support for function calling beyond screen interaction
- 👁️ **VLM**: Vision-language understanding
**Composition Examples:**
See more examples on our [composition docs](https://cua.ai/docs/agent-sdk/supported-agents/composed-agents).
```python
# Use OpenAI's GPT-5 for planning with specialized grounding
agent = ComputerAgent(model="huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-5")
# Composition via OmniParser
agent = ComputerAgent(model="omniparser+openai/gpt-4o")
# Combine state-of-the-art grounding with powerful reasoning
agent = ComputerAgent(model="huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929")
# Combine two different vision models for enhanced capabilities
agent = ComputerAgent(model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B+openai/gpt-4o")
# Use the built-in Moondream3 grounding with any planning model.
agent = ComputerAgent(model="moondream3+openai/gpt-4o")
```
### Model IDs
<details>
@@ -229,9 +261,11 @@ The following table shows which capabilities are supported by each model:
| [Claude Sonnet/Haiku](https://docs.claude.com/en/docs/agents-and-tools/tool-use/computer-use-tool#how-to-implement-computer-use) | `anthropic/claude-sonnet-4-5`, `anthropic/claude-haiku-4-5` |
| [OpenAI CU Preview](https://platform.openai.com/docs/models/computer-use-preview) | `openai/computer-use-preview` |
| [GLM-V](https://huggingface.co/THUDM/glm-4v-9b) | `openrouter/z-ai/glm-4.5v`, `huggingface-local/zai-org/GLM-4.5V` |
| [Qwen3 VL](https://huggingface.co/collections/Qwen/qwen3-vl) | `openrouter/qwen/qwen3-vl-235b-a22b-instruct` |
| [Gemini CU Preview](https://ai.google.dev/gemini-api/docs/computer-use) | `gemini-2.5-computer-use-preview` |
| [InternVL](https://huggingface.co/OpenGVLab/InternVL3_5-1B) | `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` |
| [UI-TARS](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B) | `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` |
| [UI-TARS-2](https://cua.ai/dashboard/vlm-router) | `cua/bytedance/ui-tars-2` |
| [OpenCUA](https://huggingface.co/xlangai/OpenCUA-7B) | `huggingface-local/xlangai/OpenCUA-{7B,32B}` |
| [GTA](https://huggingface.co/HelloKKMe/GTA1-7B) | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` |
| [Holo](https://huggingface.co/Hcompany/Holo1.5-3B) | `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` |
@@ -273,7 +307,7 @@ try:
# Click and type
await computer.interface.left_click(100, 100)
await computer.interface.type("Hello!")
await computer.interface.type_text("Hello!")
finally:
await computer.close()
```
@@ -331,6 +365,46 @@ pip install cua-som
Learn more in the [SOM documentation](./libs/python/som/README.md).
# Recent Updates
## 2025
### September 2025
- **Hack the North Competition**: First benchmark-driven hackathon track with guaranteed YC interview prize. Winner achieved 68.3% on OSWorld-Tiny ([Blog Post](https://www.cua.ai/blog/hack-the-north))
- **Global Hackathon Launch**: Ollama × Cua global online competition for creative local/hybrid agents
### August 2025
- **v0.4 Release - Composite Agents**: Mix grounding + planning models with `+` operator (e.g., `"GTA-7B+GPT-4o"`) ([Blog Post](https://www.cua.ai/blog/composite-agents))
- **HUD Integration**: One-line benchmarking on OSWorld-Verified with live trace visualization ([Blog Post](https://www.cua.ai/blog/hud-agent-evals))
- **Human-in-the-Loop**: Interactive agent mode with `human/human` model string
- **Web-Based Computer Use**: Browser-based agent execution ([Blog Post](https://www.cua.ai/blog/bringing-computer-use-to-the-web))
### June 2025
- **Windows Sandbox Support**: Native Windows agent execution ([Blog Post](https://www.cua.ai/blog/windows-sandbox))
- **Containerization Evolution**: From Lume to full Docker support ([Blog Post](https://www.cua.ai/blog/lume-to-containerization))
- **Sandboxed Python Execution**: Secure code execution in agent workflows
### May 2025
- **Cua Cloud Containers**: Production-ready cloud deployment with elastic scaling ([Blog Post](https://www.cua.ai/blog/introducing-cua-cloud-containers))
- **Trajectory Viewer**: Visual debugging tool for agent actions ([Blog Post](https://www.cua.ai/blog/trajectory-viewer))
- **Training Data Collection**: Tools for creating computer-use training datasets ([Blog Post](https://www.cua.ai/blog/training-computer-use-models-trajectories-1))
- **App-Use Framework**: Mobile and desktop app automation capabilities
### April 2025
- **Agent Framework v0.4**: Unified API for 100+ model configurations
- **UI-TARS Integration**: Local inference support for ByteDance's desktop-optimized model
- **Blog Series**: "Build Your Own Operator" tutorials ([Part 1](https://www.cua.ai/blog/build-your-own-operator-on-macos-1) | [Part 2](https://www.cua.ai/blog/build-your-own-operator-on-macos-2))
### March 2025
- **Initial Public Release**: Core Agent SDK and Computer SDK
- **Lume VM Manager**: macOS VM management tool for local development
# Resources
- [Cua Blog](https://www.cua.ai/blog)

View File

@@ -25,7 +25,7 @@ desktop = computer.create_desktop_from_apps(["Safari", "Notes"])
# Your agent can now only see and interact with these apps
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
tools=[desktop]
)
```
@@ -94,7 +94,7 @@ async def main():
# Initialize an agent
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
tools=[desktop]
)
@@ -160,7 +160,7 @@ async def automate_iphone():
# Initialize an agent for iPhone automation
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
tools=[my_iphone]
)

View File

@@ -8,7 +8,7 @@ In this first blogpost, we'll learn how to build our own Computer-Use Operator u
- **computer-use-preview** is OpenAI's specialized language model trained to understand and interact with computer interfaces through screenshots.
- A **Computer-Use Agent** is an AI agent that can control a computer just like a human would - clicking buttons, typing text, and interacting with applications.
Our Operator will run in an isolated macOS VM, by making use of our [cua-computer](https://github.com/trycua/cua/tree/main/libs/computer) package and [lume virtualization CLI](https://github.com/trycua/cua/tree/main/libs/lume).
Our Operator will run in an isolated macOS VM, by making use of our [cua-computer](https://github.com/trycua/cua/tree/main/libs/python/computer) package and [lume virtualization CLI](https://github.com/trycua/cua/tree/main/libs/lume).
Check out what it looks like to use your own Operator from a Gradio app:
@@ -294,7 +294,7 @@ This design keeps everything organized and safe. The AI can only interact with t
### Prerequisites
1. **Lume CLI Setup**
For installing the standalone lume binary, run the following command from a terminal, or download the [latest pkg](https://github.com/trycua/cua/releases/latest/download/lume.pkg.tar.gz).
For installing the standalone lume binary, run the following command from a terminal, or download the [latest pkg](https://github.com/trycua/cua/releases/download/lume-v0.2.22/lume-darwin.pkg.tar.gz).
```bash
sudo /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)"
@@ -567,10 +567,10 @@ In a production setting, you would wrap the action-response cycle in a loop, han
### Next Steps
In the next blogpost, we'll introduce our Agent framework which abstracts away all these tedious implementation steps. This framework provides a higher-level API that handles the interaction loop between OpenAI's computer-use model and the macOS sandbox, allowing you to focus on building sophisticated applications rather than managing the low-level details we've explored here. Can't wait? Check out the [cua-agent](https://github.com/trycua/cua/tree/main/libs/agent) package!
In the next blogpost, we'll introduce our Agent framework which abstracts away all these tedious implementation steps. This framework provides a higher-level API that handles the interaction loop between OpenAI's computer-use model and the macOS sandbox, allowing you to focus on building sophisticated applications rather than managing the low-level details we've explored here. Can't wait? Check out the [cua-agent](https://github.com/trycua/cua/tree/main/libs/python/agent) package!
### Resources
- [OpenAI Computer-Use docs](https://platform.openai.com/docs/guides/tools-computer-use)
- [cua-computer](https://github.com/trycua/cua/tree/main/libs/computer)
- [cua-computer](https://github.com/trycua/cua/tree/main/libs/python/computer)
- [lume](https://github.com/trycua/cua/tree/main/libs/lume)

View File

@@ -145,9 +145,9 @@ While the core concept remains the same across all agent loops, different AI mod
| Agent Loop | Supported Models | Description | Set-Of-Marks |
|:-----------|:-----------------|:------------|:-------------|
| `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA Preview model | Not Required |
| `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use Beta Tools | Not Required |
| `AgentLoop.ANTHROPIC` | • `claude-sonnet-4-5-20250929`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use Beta Tools | Not Required |
| `AgentLoop.UITARS` | • `ByteDance-Seed/UI-TARS-1.5-7B` | Uses ByteDance's UI-TARS 1.5 model | Not Required |
| `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
| `AgentLoop.OMNI` | • `claude-sonnet-4-5-20250929`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
Each loop handles the same basic pattern we implemented manually in Part 1:
@@ -171,7 +171,7 @@ The `cua-agent` framework provides multiple agent loop implementations to abstra
- **AgentLoop.OMNI**: The most flexible option that works with virtually any vision-language model including local and open-source ones. Perfect for cost-effective development or when you need to use models without native computer-use capabilities.
These abstractions allow you to easily switch between providers without changing your application code. All loop implementations are available in the [cua-agent GitHub repository](https://github.com/trycua/cua/tree/main/libs/agent/agent/providers).
These abstractions allow you to easily switch between providers without changing your application code. All loop implementations are available in the [cua-agent GitHub repository](https://github.com/trycua/cua/tree/main/libs/python/agent).
Choosing the right agent loop depends not only on your API access and technical requirements but also on the specific tasks you need to accomplish. To make an informed decision, it's helpful to understand how these underlying models perform across different computing environments from desktop operating systems to web browsers and mobile interfaces.
@@ -191,7 +191,7 @@ The performance of different Computer-Use models varies significantly across tas
- **AgentLoop.OPENAI**: Choose when you have OpenAI Tier 3 access and need the most capable computer-use agent for web-based tasks. Uses the same [OpenAI Computer-Use Loop](https://platform.openai.com/docs/guides/tools-computer-use) as Part 1, delivering strong performance on browser-based benchmarks.
- **AgentLoop.ANTHROPIC**: Ideal for users with Anthropic API access who need strong reasoning capabilities with computer-use abilities. Works with `claude-3-5-sonnet-20240620` and `claude-3-7-sonnet-20250219` models following [Anthropic's Computer-Use tools](https://docs.anthropic.com/en/docs/agents-and-tools/computer-use#understanding-the-multi-agent-loop).
- **AgentLoop.ANTHROPIC**: Ideal for users with Anthropic API access who need strong reasoning capabilities with computer-use abilities. Works with `claude-sonnet-4-5-20250929` and `claude-3-7-sonnet-20250219` models following [Anthropic's Computer-Use tools](https://docs.anthropic.com/en/docs/agents-and-tools/computer-use#understanding-the-multi-agent-loop).
- **AgentLoop.UITARS**: Best for scenarios requiring more powerful OS/desktop, and latency-sensitive automation, as UI-TARS-1.5 leads in OS capabilities benchmarks. Requires running the model locally or accessing it through compatible endpoints (e.g. on Hugging Face).
@@ -268,7 +268,7 @@ from agent import ComputerAgent
async def run_multi_task_workflow():
async with Computer() as macos_computer:
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
tools=[macos_computer]
)
@@ -674,7 +674,7 @@ With the basics covered, you might want to explore:
### Resources
- [cua-agent GitHub repository](https://github.com/trycua/cua/tree/main/libs/agent)
- [cua-agent GitHub repository](https://github.com/trycua/cua/tree/main/libs/python/agent)
- [Agent Notebook Examples](https://github.com/trycua/cua/blob/main/notebooks/agent_nb.ipynb)
- [OpenAI Agent SDK Specification](https://platform.openai.com/docs/api-reference/responses)
- [Anthropic API Documentation](https://docs.anthropic.com/en/api/getting-started)

View File

@@ -0,0 +1,120 @@
# Cloud Windows Sandboxes GA + macOS Preview
If you've been building with our `cua` libraries, you might've hit a limitation with local computer-use sandboxes: to run agents on Windows or macOS, you need to be on that OS - Windows Sandbox for Windows, Apple Virtualization for macOS. The only cross-platform option is Linux on Docker, which limits you to virtualizing Linux environments ([see all local options here](https://cua.ai/docs/computer-sdk/computers)).
Today the story changes - we're announcing general availability of **Cloud Windows Sandboxes** and opening early preview access for **Cloud macOS Sandboxes**.
## Cloud Windows Sandboxes: Now GA
![Cloud Windows Sandboxes](https://github.com/user-attachments/assets/db15f4c4-70a4-425a-a264-82e629074de7)
Cloud Windows Sandboxes are now generally available. You get a full Windows 11 desktop in your browser with Edge and Python pre-installed, working seamlessly with all our [Computer-Use libraries](https://github.com/trycua/cua) for RPA, UI automation, code execution, and agent development.
**What's new with this release:**
- Hot-start under 1 second
- Direct noVNC over HTTPS under our sandbox.cua.ai domain
- 3 sandbox sizes available:
| Size | CPU | RAM | Storage |
| ------ | ------- | ----- | ---------- |
| Small | 2 cores | 8 GB | 128 GB SSD |
| Medium | 4 cores | 16 GB | 128 GB SSD |
| Large | 8 cores | 32 GB | 256 GB SSD |
<div align="center">
<video src="https://github.com/user-attachments/assets/8ab07646-6018-4128-87ce-53180cfea696" width="600" controls></video>
</div>
**Pricing:** Windows Sandboxes start at 8 credits/hour (Small), 15 credits/hour (Medium), or 31 credits/hour (Large).
## Cloud macOS Sandboxes: Now in Preview
Running macOS locally comes with challenges: 30GB golden images, a maximum of 2 sandboxes per host, and unpredictable compatibility issues. With Cloud macOS Sandboxes, we provision bare-metal macOS hosts (M1, M2, M4) on-demand—giving you full desktop access without the overhead of managing local sandboxes.
![macOS Preview Waitlist](https://github.com/user-attachments/assets/343c9a3f-59d8-4b1a-bba8-6af91e8a9cf0)
**Preview access:** Invite-only. [Join the waitlist](https://cua.ai/macos-waitlist) if you're building agents for macOS workflows.
## Getting Started Today
Sign up at [cua.ai/signin](https://cua.ai/signin) and grab your API key from the dashboard. Then connect to a sandbox:
```python
from computer import Computer
computer = Computer(
os_type="windows", # or "macos"
provider_type="cloud",
name="my-sandbox",
api_key="your-api-key"
)
await computer.run()
```
Manage existing sandboxes:
```python
from computer.providers.cloud.provider import CloudProvider
provider = CloudProvider(api_key="your-api-key")
async with provider:
sandboxes = await provider.list_vms()
await provider.run_vm("my-sandbox")
await provider.stop_vm("my-sandbox")
```
Run an agent on Windows to automate a workflow:
```python
from agent import ComputerAgent
agent = ComputerAgent(
model="anthropic/claude-sonnet-4-5-20250929",
tools=[computer],
max_trajectory_budget=5.0
)
response = await agent.run(
"Open Excel, create a sales report with this month's data, and save it to the desktop"
)
```
## FAQs
<details>
<summary><strong>Why not just use local Windows Sandbox?</strong></summary>
Local Windows Sandbox resets on every restart. No persistence, no hot-start, and you need Windows Pro. Our sandboxes persist state, hot-start in under a second, and work from any OS.
</details>
<details>
<summary><strong>What happens to my work when I stop a sandbox?</strong></summary>
Everything persists. Files, installed software, browser profiles—it's all there when you restart. Only pay for runtime, not storage.
</details>
<details>
<summary><strong>How's the latency for UI automation?</strong></summary>
We run in 4 regions so you can pick what's closest. The noVNC connection is optimized for automation, not video streaming. Your agent sees crisp screenshots, not compressed video.
</details>
<details>
<summary><strong>Are there software restrictions?</strong></summary>
No. Full admin access on both platforms. Install whatever you need—Visual Studio, Photoshop, custom enterprise software. It's your sandbox.
</details>
## Need help?
If you hit issues getting either platform working, reach out in [Discord](https://discord.gg/cua-ai). We respond fast and fix based on what people actually use.
---
Get started at [cua.ai](https://cua.ai) or [join the macOS waitlist](https://cua.ai/macos-waitlist).

View File

@@ -14,12 +14,12 @@ This is the kind of problem that makes you wonder if we're building the future o
Agent framework 0.4 solves this by doing something radical: making all these different models speak the same language.
Instead of writing separate code for each model's peculiarities, you now just pick a model with a string like `"anthropic/claude-3-5-sonnet-20241022"` or `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"`, and everything else Just Works™. Behind the scenes, we handle all the coordinate normalization, token parsing, and image preprocessing so you don't have to.
Instead of writing separate code for each model's peculiarities, you now just pick a model with a string like `"anthropic/claude-sonnet-4-5-20250929"` or `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"`, and everything else Just Works™. Behind the scenes, we handle all the coordinate normalization, token parsing, and image preprocessing so you don't have to.
```python
# This works the same whether you're using Anthropic, OpenAI, or that new model you found on Hugging Face
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022", # or any other supported model
model="anthropic/claude-sonnet-4-5-20250929", # or any other supported model
tools=[computer]
)
```

View File

@@ -0,0 +1,172 @@
# Computer Use Agents for Growth Hacking: The Cua-la Strategy
_Published on January 16, 2025 by Sarina Li_
<img src="./assets/esther-and-sarina.JPG" alt="Esther and Sarina at DevFest Toronto">
Growing a developer-focused product is hard. Traditional marketing doesn't work. Booth rentals cost thousands. Sponsorships cost tens of thousands.
So we tried something different at Google DevFest Toronto: show up with backpacks full of cute cua-la keychains and see what happens.
This is the story of how two new hires, a growth engineer and a designer/artist, guerrilla marketed their way through a major tech conference with $200 worth of merch and a post-event automation pipeline.
## Meet the Team
**Sarina** (Growth Engineering): Built the post-event automation pipeline that extracts LinkedIn connections and generates personalized messages while you sleep.
**Esther** (Design + Art): Hand-crafted every piece of artwork, giving life to Cua through illustrations, branding, and yes, extremely cute cua-la keychains.
The thesis: what if we could draw people in with irresistible physical merch, then use computer use agents to handle all the tedious follow-up work?
## The cua-la Strategy
<img src="./assets/cua-at-devfest.JPG" alt="Guerrilla marketing at DevFest Toronto">
Google DevFest Toronto brought together hundreds of developers and AI enthusiasts. We didn't have a booth. We didn't have demos. We showed up with backpacks full of cua-la keychains with the cua.ai logo and started handing them out.
That's it. Pure guerrilla marketing, and the cua-las were absurdly effective.
People would literally crowd around us, not because they were interested in computer use (at first), but because they wanted a cua-la. We'd pitch Cua while handing out keychains, and suddenly we had an engaged audience!
<img src="./assets/devfest-image.JPG" alt="DevFest crowd">
### The Magic Moment
A few people stuck the cua-las on their bags immediately. Then, throughout the event, we started getting approached:
"Wait, are you the Cua girls?"
They'd seen the cua-las on someone's bag, asked about it, and tracked us down! The keychains became walking advertisements.
<img src="./assets/htn-at-devfest.JPG" alt="Hack the North recognition at DevFest">
Even better: two attendees recognized Cua from Hack the North. Our previous event marketing was actually working. People remembered us.
## Part 2: The Automation (Try It Yourself)
After DevFest, we had 20+ new LinkedIn connections. Normally, this means hours of:
- Manually copying names, roles, companies
- Opening each profile to find contact info
- Crafting personalized follow-up messages
- Updating your CRM
Sarina had a better idea: build the automation we wish existed, then open source it.
**The automation is live**: [Post-Event Contact Export cookbook](https://cua.ai/docs/example-usecases/post-event-contact-export)
### How It Works
<video controls width="100%">
<source src="./assets/linkedin-scraping.mp4" type="video/mp4">
LinkedIn scraping automation in action
</video>
The agent navigates LinkedIn like a human would: click profile, extract info, navigate back, repeat. But it does it overnight while you sleep.
The secret sauce: **VM session persistence**. By logging into LinkedIn once through Cua's VM, the session stays alive. No captchas, no bot detection, just smooth automation.
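To give a sense of what the cookbook automates, here is a minimal, illustrative sketch using the Cua Computer and Agent SDKs. The sandbox name, API key, model string, and task prompt are placeholders; the linked cookbook is the actual reference implementation.
```python
import asyncio
from computer import Computer
from agent import ComputerAgent

async def main():
    # Reuse the persistent cloud sandbox where LinkedIn was logged in manually once
    # (sandbox name and API key are placeholders).
    computer = Computer(
        os_type="linux",
        provider_type="cloud",
        name="my-sandbox",
        api_key="your-cua-api-key",
    )
    await computer.run()

    agent = ComputerAgent(
        model="anthropic/claude-sonnet-4-5-20250929",
        tools=[computer],
    )

    # Illustrative prompt; the cookbook breaks this into smaller, more reliable steps.
    response = await agent.run(
        "Open my newest LinkedIn connections, visit each profile, and append "
        "name, role, company, and profile URL as a row in contacts.csv"
    )
    print(response)

asyncio.run(main())
```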
<video controls width="100%">
<source src="./assets/adding-row-csv.mp4" type="video/mp4">
Automatic CSV generation
</video>
Wake up to a clean CSV with:
- First name, last name
- Current role and company
- LinkedIn profile URLs
- Pre-generated messaging links
Then use that data to craft personalized messages. Sarina wrote unique follow-ups for each person, mentioning specific conversations from DevFest.
**Works for any platform**: LinkedIn, X/Twitter, or wherever your connections are. The cookbook includes full setup instructions and customizable code.
## The Results
**Cost Breakdown**
- Booth rental: $0 (didn't have one)
- Sponsorship: $0 (didn't buy one)
- cua-la keychains: ~$200
- Automation: Built by Sarina in a few hours post-event
- **Total spend: $200**
**What We Got**
- People crowding around us for cua-las
- Walking advertisements on bags throughout the event
- Instant brand recognition ("Are you the Cua girls?")
- Two people who remembered us from Hack the North
- 20+ quality connections extracted and messaged within 24 hours
- Several demo requests from personalized follow-ups
**ROI**
Traditional event marketing at this scale: $5-10K minimum for booth + sponsorship.
Our approach: $200 + scrappy execution.
The automation is reusable and will save hours of manual work, and the cua-las created more organic conversations than any booth could have.
## What Didn't Work (Yet)
**cua-la Distribution**
We ran out faster than expected! Next time: bigger bag, or limit to one per person.
**Automation Setup**
The VM login step added friction. "Log in manually first, then run the script" confused some people who wanted to try it themselves. Need better first-run UX.
**Message Personalization**
While the extraction was automated, I still wrote each follow-up message manually. We are looking for ways to better enrich messages with context from the event, which is hard to automate.
## What's Next: NeurIPS 2025
NeurIPS is the biggest AI conference of the year. Thousands of researchers, hundreds of companies.
**The good news**: We still have one giant bag of cua-las left. They're already packed and ready.
**The better news**: We're upgrading the automation.
### The Hypothesis
The cua-las get people interested. The automation ensures we actually follow through.
Most event marketing fails at the follow-up stage. You collect business cards, connect on LinkedIn, and then... nothing. The moment passes. People forget.
With Cua handling the mechanical work (data organization, connection tracking, follow-up scheduling), we can focus on the human part: genuine conversations, valuable introductions, and actually helping people.
## The Framework: Cute Merch + Smart Automation
Traditional event marketing: show up, pitch, collect cards.
Our approach: combine two forces that shouldn't work together but do.
**The Physical Hook**
- Make something people actually want (not another branded pen)
- Hand-crafted, memorable, Instagram-worthy
- Turns attendees into walking billboards
- Creates natural conversation starters
**The Digital Follow-Through**
- Automate the tedious post-event work
- Extract connections while you sleep
- Personalize follow-ups with real context
- Actually close the loop before the moment passes
**Why It Works**
The cua-las get you in the door. The automation ensures you don't waste the opportunity.
Most companies nail one or the other:
- Great merch, terrible follow-up → missed opportunities
- Amazing automation, boring presence → no one cares
Do both, and you create a flywheel: each event builds brand recognition for the next, while automation ensures maximum value from every connection.
See you at NeurIPS 2025!
---
_Want to build your own growth hacking automations? Check out [Cua on GitHub](https://github.com/trycua/cua) or join our [Discord](https://discord.gg/cua) to share your experiments. cua-las not included (yet)._

View File

@@ -0,0 +1,86 @@
# Cua Playground: Agents + Sandboxes in Your Browser
Building computer-use agents means constant iteration—writing code, deploying to a sandbox, testing behavior, debugging issues, then repeating the cycle. Every test requires switching between your code editor, terminal, and VNC viewer. Want to try a different prompt? Edit your code, redeploy, and wait for the agent to restart. It works, but it's slow.
Today we're launching the **Cua Playground**: a browser-based environment for testing computer-use agents without writing code. Send messages to your sandboxes, watch them execute in real-time, and iterate on prompts instantly—all from your dashboard at cua.ai.
![Cua Playground](https://github.com/user-attachments/assets/af1071ba-3df3-4e4b-aafb-df8c3d00b0a5)
**What's new with this release:**
- Instant testing—send messages to any running sandbox directly from your browser
- Real-time execution—watch your agent work with live tool call updates and screenshots
- Multi-model support—test with Claude Sonnet 4.5, Haiku 4.5, and more
- Persistent chat history—conversations save automatically to local storage
The Playground connects to your existing Cua sandboxes—the same ones you use with the Agent SDK. Select a running sandbox and a model, then start chatting. The agent uses computer-use tools (mouse, keyboard, bash, editor) to complete your tasks, and you see every action it takes.
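When you are ready to move a Playground experiment into code, the same interaction maps onto the Agent SDK. A rough sketch, with placeholder sandbox name, API key, and model string:
```python
import asyncio
from computer import Computer
from agent import ComputerAgent

async def main():
    # Connect to the same running cloud sandbox you would pick in the Playground dropdown.
    computer = Computer(
        os_type="linux",
        provider_type="cloud",
        name="my-sandbox",        # placeholder
        api_key="your-cua-api-key",
    )
    await computer.run()

    agent = ComputerAgent(
        model="anthropic/claude-sonnet-4-5-20250929",  # or another supported model
        tools=[computer],
    )

    # Same prompt as step 4 of the quick-start below.
    response = await agent.run("Take a screenshot and describe what you see")
    print(response)

asyncio.run(main())
```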
## Getting Started Today
<div align="center">
<video src="https://github.com/user-attachments/assets/9fef0f30-1024-4833-8b7a-6a2c02d8eb99" width="600" controls></video>
</div>
Sign up at [cua.ai/signin](https://cua.ai/signin) and grab your API key from the dashboard. Then navigate to the Playground:
1. Navigate to Dashboard > Playground
2. Select a sandbox from the dropdown (must be "running" status)
3. Choose a model (we recommend Claude Sonnet 4.5 to start)
4. Send a message: "Take a screenshot and describe what you see"
5. Watch the agent execute computer actions in real-time
Example use cases:
**Prompt Testing**
```
❌ "Check the website"
✅ "Navigate to example.com in Firefox and take a screenshot of the homepage"
```
**Model Comparison**
Run the same task with different models to compare quality, speed, and cost.
**Debugging Agent Behavior**
1. Send: "Find the login button and click it"
2. View tool calls to see each mouse movement
3. Check screenshots to verify the agent found the right element
4. Adjust your prompt based on what you observe
## FAQs
<details>
<summary><strong>Do I need to know how to code?</strong></summary>
No. The Playground is designed for testing agent behavior without writing code. However, for production deployments, you'll need to use the Agent SDK (Python/TypeScript).
</details>
<details>
<summary><strong>Does this replace the Agent SDK?</strong></summary>
No. The Playground is for rapid testing and experimentation. For production deployments, scheduled tasks, or complex workflows, use the Agent SDK.
</details>
<details>
<summary><strong>How much does it cost?</strong></summary>
Playground requests use the same credit system as Agent SDK requests. You're charged for model inference (varies by model) and sandbox runtime (billed per hour while running).
</details>
<details>
<summary><strong>Why is my sandbox not showing up?</strong></summary>
The sandbox must have `status = "running"` to appear in the dropdown. Check Dashboard > Sandboxes to verify status. If stopped, click "Start" and wait ~30 seconds for it to become available.
</details>
## Need help?
If you hit issues getting the Playground working, reach out in [Discord](https://discord.gg/cua-ai). We respond fast and fix based on what people actually use.
---
Get started at [cua.ai](https://cua.ai) or try the Playground at [cua.ai/dashboard/playground](https://cua.ai/dashboard/playground).

181
blog/cua-vlm-router.md Normal file
View File

@@ -0,0 +1,181 @@
# Cua VLM Router: One Provider for All Your Computer-Use Models
If you've been building computer-use agents, you know the reality: every model provider has its own specification and deployment process. Anthropic has one API format, OpenAI another, Google something else entirely. Want to try a Hugging Face model? That's a completely different setup. Self-hosting? Even more complexity. Each provider requires learning its specific API, managing its credentials, and adapting your code to its particular requirements.
Today we're launching the **Cua VLM Router**: a managed inference API that gives you unified access to multiple vision-language model providers through a single API key. We're starting with Anthropic's Claude models (Sonnet 4.5 and Haiku 4.5), some of the most loved and widely used computer-use models in the Cua ecosystem, with more providers coming soon.
![Cua VLM Router Banner](https://github.com/user-attachments/assets/1b978f62-2cae-4cf7-932a-55ac8c8f2e06)
## What You Get
The Cua VLM Router handles the infrastructure so you can focus on building:
**Single API Key**
- One key for all model providers (no juggling multiple credentials)
- Works for both model inference and sandbox access
- Manage everything from one dashboard at cua.ai
**Smart Routing**
- Automatic provider selection for optimal availability and performance
- For Anthropic models, we route to the best provider (Anthropic, AWS Bedrock, or Microsoft Foundry)
- No configuration needed—just specify the model and we handle the rest
**Cost Tracking & Optimization**
- Unified usage dashboard across all models
- Real-time credit balance tracking
- Detailed cost breakdown per request (gateway cost + upstream cost)
**Production-Ready**
- OpenAI-compatible API (drop-in replacement for existing code)
- Full streaming support with Server-Sent Events
- Metadata about routing decisions in every response
## Available Models (Launch)
We're starting with Anthropic's latest Claude models:
| Model | Best For |
| --------------------------------- | ---------------------------------- |
| `cua/anthropic/claude-sonnet-4.5` | General-purpose tasks, recommended |
| `cua/anthropic/claude-haiku-4.5` | Fast responses, cost-effective |
## How It Works
When you request an Anthropic model through Cua, we automatically route to the best available provider—whether that's Anthropic directly, AWS Bedrock, or Microsoft Foundry. You just specify `cua/anthropic/claude-sonnet-4.5`, and we handle the provider selection, failover, and optimization behind the scenes. No need to manage multiple accounts or implement fallback logic yourself.
## Getting Started
Sign up at [cua.ai/signin](https://cua.ai/signin) and create your API key from **Dashboard > API Keys > New API Key** (save it immediately—you won't see it again).
Use it with the Agent SDK, passing the key directly or via the `CUA_API_KEY` environment variable:
```python
import asyncio
from agent import ComputerAgent
from computer import Computer
async def main():
# Initialize cloud computer
computer = Computer(
os_type="linux",
provider_type="cloud",
name="your-container-name",
api_key="your-cua-api-key"
)
# Initialize agent with Claude Sonnet 4.5
agent = ComputerAgent(
tools=[computer],
model="cua/anthropic/claude-sonnet-4.5",
api_key="your-cua-api-key",
instructions="You are a helpful assistant that can control computers",
only_n_most_recent_images=3
)
# Run a task
async for result in agent.run("Open a browser and search for Python tutorials"):
print(result)
if __name__ == "__main__":
asyncio.run(main())
```
## Migration is Simple
Already using Anthropic directly? Just add the `cua/` prefix:
**Before:**
```python
# In your shell: export ANTHROPIC_API_KEY="sk-ant-..."
agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
```
**After:**
```python
# In your shell: export CUA_API_KEY="sk_cua-api01_..."
agent = ComputerAgent(model="cua/anthropic/claude-sonnet-4.5")
```
Same code structure. No other changes needed.
## Direct API Access
The router exposes an OpenAI-compatible API at `https://inference.cua.ai/v1`:
```bash
curl -X POST https://inference.cua.ai/v1/chat/completions \
-H "Authorization: Bearer ${CUA_API_KEY}" \
-H "Content-Type: application/json" \
-d '{
"model": "anthropic/claude-sonnet-4.5",
"messages": [{"role": "user", "content": "Hello!"}],
"stream": true
}'
```
Works with any OpenAI-compatible client library.
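As one example, here is a minimal sketch using the official OpenAI Python client pointed at the router endpoint above; the model id mirrors the curl example, and it assumes `CUA_API_KEY` is set in your environment:

```python
# Minimal sketch: streaming chat completions through the Cua VLM Router
# with the OpenAI Python client. Assumes CUA_API_KEY is set in the environment.
import os

from openai import OpenAI

client = OpenAI(
    base_url="https://inference.cua.ai/v1",
    api_key=os.environ["CUA_API_KEY"],
)

stream = client.chat.completions.create(
    model="anthropic/claude-sonnet-4.5",
    messages=[{"role": "user", "content": "Hello!"}],
    stream=True,
)

# Print tokens as they arrive over Server-Sent Events
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```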
## FAQs
<details>
<summary><strong>Do I still need provider API keys?</strong></summary>
No. Cua manages all provider API keys and infrastructure. You only need one Cua API key for everything—model inference and sandbox access.
</details>
<details>
<summary><strong>How does pricing work?</strong></summary>
Requests are billed in credits, deducted from your Cua account balance. Every response includes both the Cua gateway cost and the actual upstream API cost for transparency.
</details>
<details>
<summary><strong>Can I still use my own Anthropic key (BYOK)?</strong></summary>
Yes. The agent SDK still supports direct provider access. Just use `anthropic/claude-sonnet-4-5-20250929` instead of the `cua/` prefix and set your `ANTHROPIC_API_KEY`. See [Supported Model Providers](https://cua.ai/docs/agent-sdk/supported-model-providers/) for details.
</details>
<details>
<summary><strong>What about other providers?</strong></summary>
We're starting with Anthropic and adding more providers based on what people actually use. Request access to specific models in [Discord](https://discord.gg/cua-ai).
</details>
<details>
<summary><strong>Does streaming work?</strong></summary>
Yes. Set `"stream": true` in your request to receive Server-Sent Events. Works identically to OpenAI's streaming API.
</details>
## What's Next
This is just the beginning. We're actively iterating based on feedback:
- Additional model providers
- Custom model routing rules
- Usage alerts and budget controls
- Team collaboration features
If there's a model or feature you need, let us know in [Discord](https://discord.gg/cua-ai).
## Need Help?
- **Documentation**: [cua.ai/docs/agent-sdk/supported-model-providers/cua-vlm-router](https://cua.ai/docs/agent-sdk/supported-model-providers/cua-vlm-router)
- **Quickstart Guide**: [cua.ai/docs/get-started/quickstart](https://cua.ai/docs/get-started/quickstart)
- **Discord Community**: [discord.gg/cua-ai](https://discord.gg/cua-ai)
---
Get started at [cua.ai](https://cua.ai) or check out the [VLM Router docs](https://cua.ai/docs/agent-sdk/supported-model-providers/cua-vlm-router).

View File

@@ -58,7 +58,7 @@ await run_full_dataset(
# Or test on SheetBench (50 spreadsheet tasks)
await run_full_dataset(
dataset="hud-evals/SheetBench-V2",
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
split="train[:2]"
)
```

264
blog/introducing-cua-cli.md Normal file
View File

@@ -0,0 +1,264 @@
# Introducing the Cua CLI: Manage Cloud Sandboxes from Your Terminal
If you've been using our Cloud Sandboxes, you've probably been managing them through the web dashboard: clicking through forms to create instances, copying credentials, manually starting and stopping sandboxes. It works, but it's not exactly built for power users like you.
Today we're launching the **Cua CLI**: a command-line interface that brings the full power of our Cloud Sandbox platform to your terminal. Create, manage, and connect to Linux, Windows, or macOS sandboxes in seconds—all from a single command.
![Cua CLI Banner](https://github.com/user-attachments/assets/f8358acf-9194-46ee-b9e3-50cfcff5e489)
## What You Can Do
The Cua CLI handles everything you need to work with Cloud Sandboxes:
**Authentication**
- Browser-based OAuth login with automatic credential storage
- Direct API key support for CI/CD pipelines
- Export credentials to `.env` files for SDK integration
**Sandbox Management**
- Create sandboxes with your choice of OS, size, and region
- List all your sandboxes with status and connection details
- Start, stop, restart, and delete sandboxes
- Open remote desktop (VNC) connections directly in your browser
**Two Command Styles**
The CLI supports both flat and grouped command structures—use whichever fits your workflow:
```bash
# Grouped style (explicit & clear)
cua sb ls
cua sb create --os linux --size small --region north-america
cua sb vnc my-sandbox
# Flat style (quick & concise)
cua ls
cua create --os linux --size small --region north-america
cua vnc my-sandbox
```
Both styles work identically. The CLI shows grouped commands in help by default, but all flat commands remain available for backwards compatibility.
## Installation
One command installs everything (includes Bun runtime + Cua CLI):
```bash
# macOS/Linux
curl -LsSf https://cua.ai/cli/install.sh | sh
# Windows
powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
```
Or install via npm if you prefer:
```bash
npm install -g @trycua/cli
```
## Getting Started
Authenticate with your Cua account:
```bash
# Interactive browser login (recommended)
cua auth login
# Or provide your API key directly
cua auth login --api-key sk-your-api-key-here
```
Create a sandbox:
```bash
cua sb create --os linux --size small --region north-america
# Sandbox created and ready: my-sandbox-abc123
# Password: secure-password-here
# Host: my-sandbox-abc123.sandbox.cua.ai
```
List your sandboxes:
```bash
cua sb list
# NAME STATUS HOST
# my-sandbox-abc123 running my-sandbox-abc123.sandbox.cua.ai
# test-windows-456 stopped test-windows-456.sandbox.cua.ai
```
Open a remote desktop:
```bash
cua sb vnc my-sandbox-abc123
# Opens your browser to the VNC interface with password pre-filled
```
## SDK Integration
Export your API key to a `.env` file for seamless SDK integration:
```bash
cd my-project
cua auth env
# Wrote /path/to/my-project/.env
```
Then use it with our Python or TypeScript SDKs:
```python
from computer import Computer
computer = Computer(
os_type="linux",
provider_type="cloud",
name="my-sandbox-abc123",
api_key="your-api-key" # Or load from .env
)
await computer.run()
```
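If you'd rather not hard-code the key, here is a small sketch of loading it from the `.env` written by `cua auth env`; it assumes the file defines a `CUA_API_KEY` entry and that `python-dotenv` is installed:

```python
# Sketch: load credentials from the .env written by `cua auth env`.
# Assumes the file defines CUA_API_KEY and python-dotenv is installed.
import os

from dotenv import load_dotenv
from computer import Computer

load_dotenv()  # reads .env from the current directory

computer = Computer(
    os_type="linux",
    provider_type="cloud",
    name="my-sandbox-abc123",
    api_key=os.environ["CUA_API_KEY"],
)
```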
## Sandbox Sizes & Regions
Create sandboxes in the size and region that fits your needs:
**Sizes:**
- `small` - 2 cores, 8 GB RAM, 128 GB SSD
- `medium` - 4 cores, 16 GB RAM, 128 GB SSD
- `large` - 8 cores, 32 GB RAM, 256 GB SSD
**Regions:**
- `north-america`
- `europe`
- `asia-pacific`
- `south-america`
**OS Options:**
- `linux` - Ubuntu with XFCE desktop
- `windows` - Windows 11 with Edge and Python
- `macos` - macOS (preview access)
## Example Workflows
**Quick Testing Environment**
```bash
# Spin up a sandbox, test something, tear it down
cua sb create --os linux --size small --region north-america
# ... do your testing ...
cua sb delete my-sandbox-abc123
```
**Persistent Development Sandbox**
```bash
# Create a sandbox for long-term use
cua sb create --os linux --size medium --region north-america
# Stop it when not in use (data persists)
cua sb stop my-sandbox-abc123
# Start it again when needed
cua sb start my-sandbox-abc123
```
**CI/CD Integration**
```bash
# Provision sandboxes in your pipeline
export CUA_API_KEY="sk-your-api-key"
cua auth login --api-key "$CUA_API_KEY"
cua sb create --os linux --size large --region north-america
# Run your tests with the Cua Computer SDK
python run_tests.py
# Clean up
cua sb delete my-test-sandbox
```
## Command Aliases
We've added aliases for common commands to speed up your workflow:
```bash
# List aliases
cua list # or: cua ls, cua ps, cua sb list
# VNC aliases
cua vnc # or: cua open, cua sb vnc
```
## FAQs
<details>
<summary><strong>Can I use this in scripts and CI/CD?</strong></summary>
Yes. All commands support non-interactive mode with `--api-key` flags, and the CLI exits with proper status codes for scripting. The flat command style (`cua list`, `cua create`) is particularly useful for quick scripts.
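For instance, a small sketch of gating a pipeline step on the exit code (sandbox options are illustrative, and it assumes you have already run `cua auth login --api-key "$CUA_API_KEY"`):

```bash
# Fail the job early if sandbox creation doesn't succeed
if ! cua create --os linux --size small --region north-america; then
  echo "sandbox creation failed" >&2
  exit 1
fi
```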
</details>
<details>
<summary><strong>Where are my credentials stored?</strong></summary>
API keys are stored locally in a SQLite database at `~/.cua/cli.sqlite`; they never leave your machine. Use `cua auth logout` to clear stored credentials.
</details>
<details>
<summary><strong>What happens to passwords in the output?</strong></summary>
Passwords are hidden by default in `cua list` for security. Use `cua list --show-passwords` to display them when needed.
</details>
<details>
<summary><strong>Can I manage sandboxes created through the web dashboard?</strong></summary>
Yes. The CLI and dashboard share the same API. Any sandbox you create in the dashboard will show up in `cua list`, and vice versa.
</details>
<details>
<summary><strong>How do I update the CLI?</strong></summary>
If you installed via script:
```bash
curl -LsSf https://cua.ai/cli/install.sh | sh
```
If you installed via npm:
```bash
npm install -g @trycua/cli@latest
```
</details>
## What's Next
We're actively iterating based on feedback. Planned features include:
- SSH key management for secure sandbox access
- Template-based sandbox creation
- Batch operations (start/stop multiple sandboxes)
- Custom sandbox configurations
- Snapshot management
If there's a feature you need, let us know in [Discord](https://discord.gg/cua-ai).
## Need Help?
- **Documentation**: [https://cua.ai/docs/libraries/cua-cli/commands](https://cua.ai/docs/libraries/cua-cli/commands)
- **Installation Guide**: [https://cua.ai/docs/libraries/cua-cli/installation](https://cua.ai/docs/libraries/cua-cli/installation)
- **Discord Community**: [https://discord.gg/cua-ai](https://discord.gg/cua-ai)
---
Get started at [cua.ai](https://cua.ai) or check out the [quickstart guide](https://cua.ai/docs/get-started/quickstart).

View File

@@ -90,7 +90,7 @@ lume run macos-sequoia-vanilla:latest
### Lumier: Docker-Style VM Management
[Lumier](https://github.com/trycua/lumier) works differently. It lets you use Docker commands to manage VMs. But here's the key: **Docker is just for packaging, not for isolation**.
[Lumier](https://github.com/trycua/cua/tree/main/libs/lumier) works differently. It lets you use Docker commands to manage VMs. But here's the key: **Docker is just for packaging, not for isolation**.
What makes Lumier useful:

View File

@@ -0,0 +1,658 @@
# NeurIPS 2025: 45 Computer-Use Agent Papers You Should Know About
<img alt="neurips" src="https://github.com/user-attachments/assets/bd649067-bb2c-45f4-827b-087021ec3ad7" />
If you're following the computer-use agent space, you already know that NeurIPS is where the most important work gets presented. But with thousands of papers across every area of machine learning, finding the ones relevant to CUAs means hours of filtering through proceedings, skimming abstracts, and hoping you don't miss something important.
We did that work for you. We're excited to announce that **Cua will be at NeurIPS 2025**, and we've compiled a curated list of **45 papers** focused specifically on Computer-Use Agents—covering benchmarks, safety, grounding, visual reasoning, and agent architectures.
## Why This Matters
Computer-use agents are evolving rapidly. This year's NeurIPS showcases several important developments:
**The benchmark landscape is maturing.** We're seeing comprehensive evaluations across macOS (macOSWorld), professional tools (VideoCAD), and real-world websites (REAL, TheAgentCompany). These aren't toy problems anymore—they're measuring what agents can actually do in production environments.
**Safety is becoming a first-class concern.** Multiple papers (OS-Harm, RiOSWorld, WASP, AgentDAM) are systematically documenting how agents fail when confronted with adversarial inputs, privacy requirements, or misuse scenarios. The findings are sobering: even frontier models often comply with harmful requests.
**Grounding remains the bottleneck.** Papers like GUI-Actor, GUI-G1, and SE-GUI are pushing the state of the art on mapping language to UI actions. The best approaches are achieving significant gains with surprisingly small models and datasets.
**Open-source is catching up.** OpenCUA's 72B model hits 45% on OSWorld-Verified, establishing that community-driven development can compete with proprietary systems.
## Highlights Worth Your Attention
A few papers stand out for their immediate relevance to anyone building or deploying computer-use agents:
- **macOSWorld** reveals a dramatic capability gap: proprietary agents achieve 30%+ success on macOS tasks while open-source models struggle below 5%.
- **TheAgentCompany** simulates a software company where agents browse, code, and communicate. The best agent completes 30% of tasks autonomously.
- **WASP** demonstrates that simple prompt injections can deceive top-tier models, with attacks partially succeeding in up to 86% of cases.
- **GUI-G1** shows that a 3B model can achieve 90.3% on ScreenSpot by fixing issues with chain-of-thought reasoning.
## Summary Statistics
| Category | Count |
|----------|-------|
| Benchmarks & Datasets | 18 |
| Safety & Security | 12 |
| Grounding & Visual Reasoning | 14 |
| Agent Architectures & Training | 11 |
| Adversarial Attacks | 8 |
**Total Papers:** 45
## Meet Us at NeurIPS
We'll be at NeurIPS in San Diego. If you're working on computer-use agents, building applications on top of CUA infrastructure, or just curious about where this space is heading, we'd love to connect.
- **Book a Meeting**: [cal.com/cua/neurips-slot](https://cal.com/cua/neurips-slot)
- **X/Twitter**: [@trycua](https://x.com/trycua)
- **Discord**: [discord.gg/cua-ai](https://discord.gg/cua-ai)
---
# The Papers
## 1. macOSWorld: A Multilingual Interactive Benchmark for GUI Agents
**Summary:** The first comprehensive benchmark for evaluating GUI agents on macOS. Features 202 multilingual interactive tasks across 30 applications (28 macOS-exclusive), with support for 5 languages (English, Chinese, Arabic, Japanese, Russian). Reveals a dramatic gap: proprietary agents achieve 30%+ success rate while open-source models lag below 5%. Also includes safety benchmarking for deception attacks.
**Key Findings:**
- Proprietary computer-use agents lead at above 30% success rate
- Open-source lightweight models struggle below 5%, highlighting need for macOS domain adaptation
- Multilingual benchmarks expose weaknesses, especially in Arabic (28.8% degradation vs English)
- Deception attacks are a general vulnerability requiring immediate attention
**Poster:** https://neurips.cc/virtual/2025/poster/117427
---
## 2. OS-Harm: A Benchmark for Measuring Safety of Computer Use Agents
**Summary:** A comprehensive safety benchmark built on OSWorld for testing computer-use agents across three harm categories: deliberate user misuse, prompt injection attacks, and model misbehavior. Includes 150 tasks spanning harassment, copyright infringement, disinformation, data exfiltration, and more. Proposes an automated judge achieving high agreement with human annotations (0.76-0.79 F1 score).
**Key Findings:**
- All tested models (o4-mini, Claude 3.7 Sonnet, Gemini 2.5 Pro) tend to directly comply with many deliberate misuse queries
- Models are relatively vulnerable to static prompt injections
- Models occasionally perform unsafe actions without explicit malicious prompts
**Poster:** https://neurips.cc/virtual/2025/loc/san-diego/poster/121772
---
## 3. OpenCUA: Open Foundations for Computer-Use Agents
**Summary:** A comprehensive open-source framework for scaling computer-use agent data and foundation models. Introduces AgentNet, the first large-scale computer-use task dataset spanning 3 operating systems and 200+ applications/websites. OpenCUA-72B achieves 45% success rate on OSWorld-Verified, establishing new state-of-the-art among open-source models.
**Key Contributions:**
- Annotation infrastructure for capturing human computer-use demonstrations
- AgentNet: large-scale dataset across 3 OSes and 200+ apps
- Scalable pipeline transforming demonstrations into state-action pairs with reflective Chain-of-Thought reasoning
- Models generalize well across domains and benefit from increased test-time computation
**Poster:** https://neurips.cc/virtual/2025/poster/119771
---
## 4. Mind2Web 2: Evaluating Agentic Search with Agent-as-a-Judge
**Summary:** A benchmark of 130 realistic, high-quality, long-horizon tasks for agentic search systems (like Deep Research), requiring real-time web browsing and extensive information synthesis. Constructed with 1000+ hours of human labor. Introduces Agent-as-a-Judge framework using tree-structured rubric design for automated evaluation.
**Key Findings:**
- OpenAI Deep Research achieves 50-70% of human performance while spending half the time
- First systematic evaluation of ten frontier agentic search systems vs. human performance
- Addresses the challenge of evaluating time-varying, complex answers
**Poster:** https://neurips.cc/virtual/2025/poster/121798
---
## 5. Scaling Computer-Use Grounding via User Interface Decomposition and Synthesis
**Summary:** Addresses GUI grounding—mapping natural language to specific UI actions—as a critical bottleneck in agent development. Introduces OSWorld-G benchmark (564 annotated samples) and Jedi dataset (4 million synthetic examples), the largest computer-use grounding dataset. Improved grounding directly enhances agentic capabilities, boosting OSWorld performance from 23% to 51%.
**Key Contributions:**
- OSWorld-G: comprehensive benchmark for diverse grounding tasks (text matching, element recognition, layout understanding, precise manipulation)
- Jedi: 4M examples through multi-perspective task decoupling
- Demonstrates compositional generalization to novel interfaces
**Poster:** https://neurips.cc/virtual/2025/poster/121759
---
## 6. RiOSWorld: Benchmarking the Risk of Multimodal Computer-Use Agents
**Summary:** Evaluates potential safety risks of MLLM-based agents during real-world computer manipulation. Features 492 risky tasks spanning web, social media, multimedia, OS, email, and office software. Categorizes risks into user-originated and environmental risks, evaluating both risk goal intention and completion.
**Key Findings:**
- Current computer-use agents face significant safety risks in real-world scenarios
- Safety principles designed for dialogue scenarios don't transfer well to computer-use
- Highlights necessity and urgency of safety alignment for computer-use agents
**Poster:** https://neurips.cc/virtual/2025/poster/117273
---
## 7. REAL: Benchmarking Autonomous Agents on Deterministic Simulations of Real Websites
**Summary:** A benchmark featuring high-fidelity, deterministic replicas of 11 widely-used websites across e-commerce, travel, communication, and professional networking. Contains 112 practical tasks requiring both information retrieval and state-changing actions. Enables reproducible evaluation without safety risks.
**Key Findings:**
- Best frontier language models achieve only 41% success rate
- Highlights critical gaps in autonomous web navigation and task completion
- Supports scalable post-training data generation
**Poster:** https://neurips.cc/virtual/2025/poster/121619
---
## 8. SE-GUI: Enhancing Visual Grounding for GUI Agents via Self-Evolutionary Reinforcement Learning
**Summary:** An RL-based framework for GUI grounding incorporating seed data curation, dense policy gradients, and self-evolutionary reinforcement finetuning using attention maps. With only 3K training samples, the 7B model achieves state-of-the-art on three grounding benchmarks, outperforming UI-TARS-72B by 24.2% on ScreenSpot-Pro.
**Key Results:**
- 47.3% accuracy on ScreenSpot-Pro with 7B model
- Outperforms 72B models with fraction of training data
- Demonstrates effectiveness of RL for high-resolution, complex environments
**Poster:** https://neurips.cc/virtual/2025/poster/118788
---
## 9. TRAP: Targeted Redirecting of Agentic Preferences
**Summary:** A generative adversarial framework that manipulates agent decision-making using diffusion-based semantic injections. Combines negative prompt degradation with positive semantic optimization. Without model access, produces visually natural images that induce consistent decision biases in agents.
**Key Findings:**
- Consistently induces decision-level preference redirection on LLaVA-34B, Gemma3, GPT-4o, and Mistral-3.2
- Outperforms baselines (SPSA, Bandit, standard diffusion)
- Exposes vulnerability: autonomous agents can be misled through visually subtle, semantically-guided manipulations
**Poster:** https://neurips.cc/virtual/2025/poster/117547
---
## 10. TheAgentCompany: Benchmarking LLM Agents on Consequential Real World Tasks
**Summary:** An extensible benchmark simulating a small software company environment where AI agents interact like digital workers: browsing the web, writing code, running programs, and communicating with coworkers. Tests agents on real professional tasks with important implications for industry adoption and labor market effects.
**Key Findings:**
- Best agent achieves 30% autonomous task completion
- Simpler tasks are solvable autonomously
- More difficult long-horizon tasks remain beyond current systems' reach
**Poster:** https://neurips.cc/virtual/2025/poster/121705
---
## 11. VideoGameQA-Bench: Evaluating Vision-Language Models for Video Game Quality Assurance
**Summary:** A comprehensive benchmark for VLMs in video game QA, encompassing visual unit testing, visual regression testing, needle-in-a-haystack challenges, glitch detection, and bug report generation for both images and videos. Addresses the need for standardized benchmarks in this labor-intensive domain.
**Key Focus:**
- First benchmark specifically designed for video game QA with VLMs
- Covers wide range of QA activities across images and videos
- Addresses lack of automation in game development workflows
**Poster:** https://neurips.cc/virtual/2025/poster/121740
---
## 12. WASP: Benchmarking Web Agent Security Against Prompt Injection Attacks
**Summary:** End-to-end benchmark for evaluating web agent security against prompt injection attacks. Tests realistic scenarios where even simple, low-effort human-written injections can deceive top-tier AI models including those with advanced reasoning.
**Key Findings:**
- Attacks partially succeed in up to 86% of cases
- State-of-the-art agents often struggle to fully complete attacker goals
- Reveals "security by incompetence"—agents' limitations sometimes prevent full attack success
**Poster:** https://neurips.cc/virtual/2025/poster/121728
---
## 13. AgentDAM: Privacy Leakage Evaluation for Autonomous Web Agents
**Summary:** Measures whether AI web-navigation agents follow the privacy principle of "data minimization"—using sensitive information only when truly necessary to complete a task. Simulates realistic web interaction scenarios end-to-end.
**Key Findings:**
- Agents built on GPT-4, Llama-3, and Claude are prone to inadvertent use of unnecessary sensitive information
- Proposes prompting-based defense that reduces information leakage
- End-to-end benchmarking provides more realistic measure than probing LLMs about privacy
**Poster:** https://neurips.cc/virtual/2025/poster/121443
---
## 14. Embodied Web Agents: Bridging Physical-Digital Realms for Integrated Agent Intelligence
**Summary:** A novel paradigm for AI agents that fluidly bridge embodiment and web-scale reasoning. Creates unified simulation integrating realistic 3D indoor/outdoor environments with functional web interfaces. Tasks include cooking from online recipes, navigating with dynamic map data, and interpreting landmarks using web knowledge.
**Key Contributions:**
- Unified platform combining 3D environments with web interfaces
- Benchmark spanning cooking, navigation, shopping, tourism, and geolocation
- Reveals significant performance gaps between AI systems and humans
**Poster:** https://neurips.cc/virtual/2025/poster/121809
---
## 15. VideoCAD: A Dataset and Model for Learning Long-Horizon 3D CAD UI Interactions from Video
**Summary:** The first attempt to model UI interactions for precision engineering tasks. Features 41K+ annotated video recordings of CAD operations with time horizons up to 20x longer than existing datasets. Proposes VideoCADFormer for learning CAD interactions directly from video.
**Key Contributions:**
- Large-scale synthetic dataset for CAD UI interactions
- VQA benchmark for evaluating spatial reasoning and video understanding
- Reveals challenges in precise action grounding and long-horizon dependencies
**Poster:** https://neurips.cc/virtual/2025/poster/121820
---
## 16. Look Before You Leap: A GUI-Critic-R1 Model for Pre-Operative Error Diagnosis
**Summary:** Introduces a pre-operative critic mechanism that provides feedback before action execution by reasoning about potential outcomes. Proposes Suggestion-aware Group Relative Policy Optimization (S-GRPO) for building the GUI-Critic-R1 model with fully automated data generation.
**Key Results:**
- Significant advantages in critic accuracy compared to current MLLMs
- Improved success rates and operational efficiency on GUI automation benchmarks
- Works across both mobile and web domains
**Poster:** https://neurips.cc/virtual/2025/poster/115566
---
## 17. Grounded Reinforcement Learning for Visual Reasoning (ViGoRL)
**Summary:** A vision-language model trained with RL to explicitly anchor each reasoning step to specific visual coordinates. Introduces multi-turn RL framework enabling dynamic zooming into predicted coordinates during reasoning.
**Key Results:**
- 86.4% on V*Bench for visual search
- Outperforms supervised fine-tuning and conventional RL across spatial reasoning, visual search, and web-based grounding
- Grounding amplifies region exploration, subgoal setting, and visual verification
**Poster:** https://neurips.cc/virtual/2025/poster/120218
---
## 18. GUI-Actor: Coordinate-Free Visual Grounding for GUI Agents
**Summary:** A VLM-based method for coordinate-free GUI grounding using an attention-based action head. Enables proposing one or more action regions in a single forward pass with a grounding verifier for selection.
**Key Results:**
- GUI-Actor-7B achieves 44.6 on ScreenSpot-Pro with Qwen2.5-VL, outperforming UI-TARS-72B (38.1)
- Improved generalization to unseen resolutions and layouts
- Fine-tuning only ~100M parameters achieves SOTA performance
**Poster:** https://neurips.cc/virtual/2025/poster/119841
---
## 19. GUI-G1: Understanding R1-Zero-Like Training for Visual Grounding in GUI Agents
**Summary:** Extensive analysis of the R1-Zero paradigm (online RL + chain-of-thought reasoning) for GUI grounding. Identifies issues: longer reasoning chains lead to worse performance, reward hacking via box size exploitation, and overfitting easy examples.
**Solutions Proposed:**
- Fast Thinking Template for direct answer generation
- Box size constraint in reward function
- Difficulty-aware scaling in RL objective
**Key Results:**
- GUI-G1-3B achieves 90.3% on ScreenSpot and 37.1% on ScreenSpot-Pro
- Outperforms larger UI-TARS-7B with only 3B parameters
**Poster:** https://neurips.cc/virtual/2025/poster/120227
---
## 20. GUI-Reflection: Empowering Multimodal GUI Models with Self-Reflection Behavior
**Summary:** Framework integrating self-reflection and error correction into end-to-end multimodal GUI models through GUI-specific pre-training, offline SFT, and online reflection tuning. Enables self-reflection emergence with fully automated data generation.
**Key Contributions:**
- Scalable pipelines for automatic reflection/correction data from successful trajectories
- GUI-Reflection Task Suite for reflection-oriented abilities
- Diverse environment for online training on mobile devices
- Iterative online reflection tuning algorithm
**Poster:** https://neurips.cc/virtual/2025/poster/115826
---
## 21. InfantAgent-Next: A Multimodal Generalist Agent for Automated Computer Interaction
**Summary:** A generalist agent capable of multimodal computer interaction (text, images, audio, video). Integrates tool-based and pure vision agents within highly modular architecture, enabling collaborative step-by-step task solving.
**Key Results:**
- 7.27-point accuracy gain over Claude-Computer-Use on OSWorld
- Evaluated on pure vision benchmarks (OSWorld), general benchmarks (GAIA), and tool-intensive benchmarks (SWE-Bench)
- Demonstrates value of modular, collaborative agent architecture
**Poster:** https://neurips.cc/virtual/2025/poster/118379
---
## 22. AdvEDM: Fine-grained Adversarial Attack against VLM-based Embodied Agents
**Summary:** A fine-grained adversarial attack framework that modifies VLM perception of only key objects while preserving semantics of remaining regions. Unlike broad semantic disruption, this targeted approach reduces conflicts with task context, making VLMs output valid but incorrect decisions that affect agent actions in the physical world.
**Key Contributions:**
- AdvEDM-R: removes semantics of specific objects from images
- AdvEDM-A: adds semantics of new objects into images
- Demonstrates fine-grained control with excellent attack performance in embodied decision-making tasks
**Poster:** https://neurips.cc/virtual/2025/poster/116436
---
## 23. BLINK-Twice: A Reasoning Benchmark on Visual Perception
**Summary:** A vision-centric reasoning benchmark grounded in challenging perceptual tasks. Unlike prior benchmarks, it moves beyond shallow perception ("see") to require fine-grained observation and analytical reasoning ("observe"). Features natural adversarial image pairs and annotated reasoning chains for process evaluation.
**Key Findings:**
- Tests 20 leading MLLMs including 12 foundation models and 8 reasoning-enhanced models
- Existing reasoning strategies (chain-of-thought, self-criticism) result in unstable and redundant reasoning
- Repeated image observation improves performance across models
- Active visual interaction (as in o3) highlights need for new vision reasoning paradigm
**Poster:** https://neurips.cc/virtual/2025/poster/121522
---
## 24. BadVLA: Backdoor Attacks on Vision-Language-Action Models
**Summary:** First systematic investigation of backdoor vulnerabilities in VLA models. Proposes Objective-Decoupled Optimization with two stages: explicit feature-space separation to isolate trigger representations, and conditional control deviations activated only by triggers.
**Key Findings:**
- Consistently achieves near-100% attack success rates with minimal impact on clean task accuracy
- Robust against common input perturbations, task transfers, and model fine-tuning
- Exposes critical security vulnerabilities in current VLA deployments under Training-as-a-Service paradigm
**Poster:** https://neurips.cc/virtual/2025/poster/115803
---
## 25. Benchmarking Egocentric Multimodal Goal Inference for Assistive Wearable Agents
**Summary:** Benchmark for proactively inferring user goals from multimodal contextual observations for wearable assistant agents (smart glasses). Dataset comprises ~30 hours from 363 participants across 3,482 recordings with visual, audio, digital, and longitudinal context.
**Key Findings:**
- Humans achieve 93% MCQ accuracy; best VLM reaches ~84%
- For open-ended generation, best models produce relevant goals only ~57% of the time
- Smaller models (suited for wearables) achieve ~49% accuracy
- Models benefit from relevant modalities but struggle with noisy ones
**Poster:** https://neurips.cc/virtual/2025/poster/121655
---
## 26. GAM-Agent: Game-Theoretic Multi-Agent Framework for Visual Reasoning
**Summary:** A game-theoretic multi-agent framework formulating reasoning as a non-zero-sum game between base agents (visual perception specialists) and a critical agent (logic/fact verification). Features uncertainty-aware controller for dynamic agent collaboration with multi-round debates.
**Key Results:**
- Boosts small-to-mid scale models (Qwen2.5-VL-7B, InternVL3-14B) by 5-6%
- Enhances strong models like GPT-4o by 2-3%
- Modular, scalable, and generalizable framework
**Poster:** https://neurips.cc/virtual/2025/poster/119144
---
## 27. GRIT: Teaching MLLMs to Think with Images
**Summary:** Introduces Grounded Reasoning with Images and Texts—a method for training MLLMs to generate reasoning chains interleaving natural language with explicit bounding box coordinates. Uses GRPO-GR reinforcement learning with rewards focused on answer accuracy and grounding format.
**Key Contributions:**
- Exceptional data efficiency: requires as few as 20 image-question-answer triplets
- Successfully unifies reasoning and grounding abilities
- Eliminates need for reasoning chain annotations or explicit bounding box labels
**Poster:** https://neurips.cc/virtual/2025/poster/118020
---
## 28. Safe RLHF-V: Safe Reinforcement Learning from Multi-modal Human Feedback
**Summary:** First multimodal safety alignment framework. Introduces BeaverTails-V (first dataset with dual preference annotations for helpfulness and safety), and Beaver-Guard-V (multi-level guardrail system defending against unsafe queries and adversarial attacks).
**Key Results:**
- Guard model improves the precursor model's safety by an average of 40.9% over five filtering rounds
- Safe RLHF-V enhances model safety by 34.2% and helpfulness by 34.3%
- First exploration of multi-modal safety alignment within constrained optimization
**Poster:** https://neurips.cc/virtual/2025/poster/118304
---
## 29. Dropout Decoding: Uncertainty-Guided Token Dropout for LVLM Reliability
**Summary:** An inference-time approach that quantifies visual token uncertainty and selectively masks uncertain tokens. Decomposes uncertainty into aleatoric and epistemic components, focusing on epistemic uncertainty for perception-related errors.
**Key Results:**
- Significantly reduces object hallucinations
- Enhances reliability and quality of LVLM outputs across diverse visual contexts
- Validated on CHAIR, THRONE, and MMBench benchmarks
**Poster:** https://neurips.cc/virtual/2025/poster/118572
---
## 30. FOCUS: Unified Vision-Language Modeling for Interactive Editing
**Summary:** A unified LVLM integrating segmentation-aware perception and controllable object-centric generation. Uses dual-branch visual encoder for global semantic context and fine-grained spatial details, with MoVQGAN-based visual tokenizer for discrete visual tokens.
**Key Contributions:**
- Progressive multi-stage training pipeline
- Segmentation masks jointly optimized as spatial condition prompts
- Bridges segmentation-aware perception with fine-grained visual synthesis
**Poster:** https://neurips.cc/virtual/2025/poster/119062
---
## 31. Fine-Grained Preference Optimization for Spatial Reasoning (SpatialReasoner-R1)
**Summary:** Introduces Multi-Model Monte Carlo Tree Search (M3CTS) for generating diverse Long Chain-of-Thought reasoning trajectories. Proposes fine-grained Direct Preference Optimization (fDPO) with segment-specific preference granularity guided by spatial reward mechanism.
**Key Results:**
- fDPO achieves 4.1% and 9.0% gains over standard DPO on spatial quality and quantity tasks
- SpatialReasoner-R1 sets new SOTA on SpatialRGPT-Bench, outperforming strongest baseline by 9.8%
- Maintains competitive performance on general vision-language tasks
**Poster:** https://neurips.cc/virtual/2025/poster/118573
---
## 32. Reason-RFT: Reinforcement Fine-Tuning for Visual Reasoning
**Summary:** A two-stage reinforcement fine-tuning framework: SFT with curated Chain-of-Thought data activates reasoning potential, followed by RL based on Group Relative Policy Optimization (GRPO) for domain shift adaptability.
**Key Advantages:**
- State-of-the-art results outperforming both open-source and proprietary models
- Robust performance under domain shifts across various tasks
- Excellent data efficiency in few-shot learning scenarios
**Poster:** https://neurips.cc/virtual/2025/poster/118345
---
## 33. Safe + Safe = Unsafe? Exploiting Safe Images to Jailbreak LVLMs
**Summary:** Reveals that safe images can be exploited for jailbreaking when combined with additional safe images and prompts, exploiting LVLMs' universal reasoning capabilities and safety snowball effect. Proposes Safety Snowball Agent (SSA) framework.
**Key Findings:**
- SSA can use nearly any image to induce LVLMs to produce unsafe content
- Achieves high jailbreak success rates against latest LVLMs
- Exploits inherent LVLM properties rather than alignment flaws
**Poster:** https://neurips.cc/virtual/2025/loc/san-diego/poster/116422
---
## 34. MIP against Agent: Malicious Image Patches Hijacking Multimodal OS Agents
**Summary:** Uncovers novel attack vector: Malicious Image Patches (MIPs)—adversarially perturbed screen regions that induce OS agents to perform harmful actions. MIPs can be embedded in wallpapers or shared on social media to exfiltrate sensitive data.
**Key Findings:**
- MIPs generalize across user prompts and screen configurations
- Can hijack multiple OS agents during execution of benign instructions
- Exposes critical security vulnerabilities requiring attention before widespread deployment
**Poster:** https://neurips.cc/virtual/2025/loc/san-diego/poster/117813
---
## 35. CogVLA: Cognition-Aligned Vision-Language-Action Models
**Summary:** A framework leveraging instruction-driven routing and sparsification for VLA efficiency. Features 3-stage progressive architecture inspired by human multimodal coordination: Encoder-FiLM Aggregation Routing, LLM-FiLM Pruning Routing, and V-L-A Coupled Attention.
**Key Results:**
- 97.4% success rate on LIBERO benchmark, 70.0% on real-world robotic tasks
- Reduces training costs by 2.5x and inference latency by 2.8x compared to OpenVLA
- Achieves state-of-the-art performance
**Poster:** https://neurips.cc/virtual/2025/poster/119023
---
## 36. Succeed or Learn Slowly (SoLS): Sample Efficient RL for Mobile App Control
**Summary:** Novel off-policy RL algorithm applying direct policy updates for positive samples and conservative, regularized updates for negative ones. Augmented with Successful Transition Replay (STR) for prioritizing successful interactions.
**Key Results:**
- At least 17% relative increase over existing methods on AndroidWorld benchmark
- Substantially fewer computational resources than GPT-4o-based methods
- 5-60x faster inference
**Poster:** https://neurips.cc/virtual/2025/poster/119910
---
## 37. TAI3: Testing Agent Integrity in Interpreting User Intent
**Summary:** An API-centric stress testing framework that uncovers intent integrity violations in LLM agents. Uses semantic partitioning to organize tasks into meaningful categories, with targeted mutations to expose subtle agent errors while preserving user intent.
**Key Contributions:**
- Datatype-aware strategy memory for retrieving effective mutation patterns
- Lightweight predictor for ranking mutations by error likelihood
- Generalizes to stronger target models using smaller LLMs for test generation
**Poster:** https://neurips.cc/virtual/2025/poster/118952
---
## 38. ThinkAct: Vision-Language-Action Reasoning via Reinforced Visual Latent Planning
**Summary:** A dual-system framework bridging high-level reasoning with low-level action execution. Trains multimodal LLM to generate embodied reasoning plans guided by action-aligned visual rewards, compressed into visual plan latents for downstream action execution.
**Key Capabilities:**
- Few-shot adaptation
- Long-horizon planning
- Self-correction behaviors in complex embodied AI tasks
**Poster:** https://neurips.cc/virtual/2025/poster/119747
---
## 39. Visualization-of-Thought Attack (VoTA) against VLMs
**Summary:** Automated attack framework that constructs chains of images with risky visual thoughts to challenge VLMs. Exploits the conflict between logical processing and safety protocols, leading to unsafe content generation.
**Key Results:**
- Improves average attack success rate by 26.71% (from 63.70% to 90.41%)
- Tested on 9 open-source and 6 commercial VLMs
- Outperforms state-of-the-art methods
**Poster:** https://neurips.cc/virtual/2025/poster/119873
---
## 40. Open CaptchaWorld: Benchmarking MLLM Agents on CAPTCHA Puzzles
**Summary:** First web-based benchmark evaluating MLLM agents on diverse CAPTCHA puzzles. Spans 20 modern CAPTCHA types (225 total) with novel metric: CAPTCHA Reasoning Depth quantifying cognitive and motor steps required.
**Key Findings:**
- Humans achieve 93.3% success rate
- State-of-the-art agents achieve at most 40.0% (Browser-Use OpenAI-o3)
- Highlights significant gap between human and agent capabilities
**Poster:** https://neurips.cc/virtual/2025/poster/121537
---
## 41. Pixel Reasoner: Pixel-Space Reasoning with Curiosity-Driven RL
**Summary:** Introduces pixel-space reasoning framework where VLMs use visual operations (zoom-in, select-frame) to directly inspect and infer from visual evidence. Two-phase training: instruction tuning on synthesized traces, then RL with curiosity-driven rewards.
**Key Results:**
- 84% on V*Bench, 74% on TallyQA-Complex, 84% on InfographicsVQA
- Highest accuracy achieved by any open-source 7B model
- Enables proactive information gathering from complex visual inputs
**Poster:** https://neurips.cc/virtual/2025/poster/117667
---
## 42. BTL-UI: Blink-Think-Link Reasoning Model for GUI Agent
**Summary:** Brain-inspired framework decomposing interactions into three biologically plausible phases: Blink (rapid detection via saccadic-like attention), Think (higher-level reasoning/planning), and Link (executable command generation for motor control).
**Key Innovations:**
- Automated annotation pipeline for blink data
- BTL Reward: first rule-based reward mechanism driven by both process and outcome
- Competitive performance on static GUI understanding and dynamic interaction tasks
**Poster:** https://neurips.cc/virtual/2025/poster/119419
---
## 43. GUI Exploration Lab: Multi-Turn RL for Screen Navigation
**Summary:** Simulation environment engine enabling flexible definition of screens, icons, and navigation graphs with full environment access for agent training/evaluation. Demonstrates progressive training approach from SFT to multi-turn RL.
**Key Findings:**
- Supervised fine-tuning enables memorization of fundamental knowledge
- Single-turn RL enhances generalization to unseen scenarios
- Multi-turn RL encourages exploration strategies through interactive trial and error
**Poster:** https://neurips.cc/virtual/2025/loc/san-diego/poster/117497
---
## 44. GUI-Rise: Structured Reasoning and History Summarization for GUI Navigation
**Summary:** Reasoning-enhanced framework integrating structured reasoning, action prediction, and history summarization. Uses Chain-of-Thought analyses combining progress estimation and decision reasoning, trained via SFT and GRPO with history-aware rewards.
**Key Results:**
- State-of-the-art under identical training data conditions
- Particularly strong in out-of-domain scenarios
- Robust reasoning and generalization across diverse GUI navigation tasks
**Poster:** https://neurips.cc/virtual/2025/poster/117425
---
## 45. UI-Genie: A Self-Improving Framework for MLLM-based Mobile GUI Agents
**Summary:** Self-improving framework addressing trajectory verification and training data scalability. Features UI-Genie-RM (image-text interleaved reward model) and self-improvement pipeline with reward-guided exploration and outcome verification.
**Key Contributions:**
- UI-Genie-RM-517k: first reward-specific dataset for GUI agents
- UI-Genie-Agent-16k: high-quality synthetic trajectories without manual annotation
- State-of-the-art across multiple GUI agent benchmarks through three generations of self-improvement
**Poster:** https://neurips.cc/virtual/2025/poster/119990
---
## What We're Building
At Cua, we're focused on the infrastructure layer for computer-use agents: cloud sandboxes for safe execution, SDKs for agent development, and tools that make it easier to build and deploy agents in production.
If you're experimenting with any of the approaches in these papers, our [Cloud Sandboxes](https://cua.ai) provide isolated Linux, Windows, and macOS environments where you can test agent behavior without risk to real systems.
---
**Start building:** [cua.ai](https://cua.ai)
**Join the community:** [Discord](https://discord.gg/cua-ai)

View File

@@ -378,4 +378,4 @@ Happy coding (safely)!
---
_Want to dive deeper? Check out our [sandboxed functions examples](https://github.com/trycua/cua/blob/main/examples/sandboxed_functions_examples.py) and [virtual environment tests](https://github.com/trycua/cua/blob/main/tests/venv.py) on GitHub. Questions? Come chat with us on Discord!_
_Want to dive deeper? Check out our [sandboxed functions examples](https://github.com/trycua/cua/blob/main/examples/sandboxed_functions_examples.py) and [virtual environment tests](https://github.com/trycua/cua/blob/main/tests/test_venv.py) on GitHub. Questions? Come chat with us on Discord!_

View File

@@ -247,7 +247,7 @@ try:
await computer.interface.right_click(300, 300)
await computer.interface.double_click(400, 400)
await computer.interface.type("Hello, World!")
await computer.interface.type_text("Hello, World!")
await computer.interface.press_key("enter")
await computer.interface.set_clipboard("Test clipboard")
@@ -306,6 +306,6 @@ Now that you know how to create and share trajectories, consider these advanced
### Resources
- [Computer-Use Interface GitHub](https://github.com/trycua/cua/tree/main/libs/computer)
- [Computer-Use Interface GitHub](https://github.com/trycua/cua/tree/main/libs/python/computer)
- [Hugging Face Datasets Documentation](https://huggingface.co/docs/datasets)
- [Example Dataset: ddupont/test-dataset](https://huggingface.co/datasets/ddupont/test-dataset)

View File

@@ -174,7 +174,7 @@ await computer.run()
## Links
- **Docker Provider Docs:** [https://cua.ai/docs/computers/docker](https://cua.ai/docs/computers/docker)
- **Docker Provider Docs:** [https://cua.ai/docs/computers/docker](https://cua.ai/docs/computer-sdk/computers#linux-on-docker)
- **KasmVNC:** [https://github.com/kasmtech/KasmVNC](https://github.com/kasmtech/KasmVNC)
- **Container Source:** [https://github.com/trycua/cua/tree/main/libs/kasm](https://github.com/trycua/cua/tree/main/libs/kasm)
- **Computer SDK:** [https://cua.ai/docs/computer-sdk/computers](https://cua.ai/docs/computer-sdk/computers)

View File

@@ -239,7 +239,7 @@ But for development, prototyping, and learning Windows RPA workflows, **Windows
- [Windows Sandbox Documentation](https://learn.microsoft.com/en-us/windows/security/application-security/application-isolation/windows-sandbox/)
- [Cua GitHub Repository](https://github.com/trycua/cua)
- [Agent UI Documentation](https://github.com/trycua/cua/tree/main/libs/agent)
- [Agent UI Documentation](https://github.com/trycua/cua/tree/main/libs/python/agent)
- [Join our Discord Community](https://discord.gg/cua-ai)
---

View File

@@ -34,7 +34,7 @@ async def take_screenshot():
) as computer:
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
tools=[computer],
max_trajectory_budget=5.0
)
@@ -89,7 +89,7 @@ Use the following environment variables to configure the agent and its access to
```bash
# Computer instance (cloud)
export CUA_CONTAINER_NAME="your-container-name"
export CUA_SANDBOX_NAME="your-sandbox-name"
export CUA_API_KEY="your-cua-api-key"
# LLM API keys
@@ -121,7 +121,7 @@ The output is an AsyncGenerator that yields response chunks.
The `ComputerAgent` constructor provides a wide range of options for customizing agent behavior, tool integration, callbacks, resource management, and more.
- `model` (`str`): Default: **required**
The LLM or agent model to use. Determines which agent loop is selected unless `custom_loop` is provided. (e.g., "claude-3-5-sonnet-20241022", "computer-use-preview", "omni+vertex_ai/gemini-pro")
The LLM or agent model to use. Determines which agent loop is selected unless `custom_loop` is provided. (e.g., "claude-sonnet-4-5-20250929", "computer-use-preview", "omni+vertex_ai/gemini-pro")
- `tools` (`List[Any]`):
List of tools the agent can use (e.g., `Computer`, sandboxed Python functions, etc.).
- `custom_loop` (`Callable`):
@@ -159,7 +159,7 @@ from computer import Computer
from agent.callbacks import ImageRetentionCallback
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
tools=[Computer(...)],
only_n_most_recent_images=3,
callbacks=[ImageRetentionCallback(only_n_most_recent_images=3)],

View File

@@ -13,7 +13,7 @@ Optimize agent costs with budget management and image retention callbacks.
from agent.callbacks import BudgetManagerCallback
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
tools=[computer],
callbacks=[
BudgetManagerCallback(
@@ -30,7 +30,7 @@ agent = ComputerAgent(
```python
# Simple budget limit
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
max_trajectory_budget=5.0 # $5 limit
)
```
@@ -40,7 +40,7 @@ agent = ComputerAgent(
```python
# Advanced budget configuration
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
max_trajectory_budget={
"max_budget": 10.0,
"raise_error": True, # Raise error when exceeded
@@ -55,7 +55,7 @@ agent = ComputerAgent(
from agent.callbacks import ImageRetentionCallback
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
tools=[computer],
callbacks=[
ImageRetentionCallback(only_n_most_recent_images=3)
@@ -67,7 +67,7 @@ agent = ComputerAgent(
```python
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
tools=[computer],
only_n_most_recent_images=3 # Auto-adds ImageRetentionCallback
)
@@ -77,7 +77,7 @@ agent = ComputerAgent(
```python
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
tools=[computer],
max_trajectory_budget=5.0, # Budget limit
only_n_most_recent_images=3, # Image retention

View File

@@ -21,7 +21,7 @@ from agent.callbacks import (
)
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
tools=[computer],
callbacks=[
ImageRetentionCallback(only_n_most_recent_images=3),

View File

@@ -14,7 +14,7 @@ from agent.callbacks import LoggingCallback
import logging
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
tools=[computer],
callbacks=[
LoggingCallback(
@@ -29,7 +29,7 @@ agent = ComputerAgent(
```python
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
tools=[computer],
verbosity=logging.INFO # Auto-adds LoggingCallback
)
@@ -72,7 +72,7 @@ class CustomLogger(AsyncCallbackHandler):
# Use custom logger
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
tools=[computer],
callbacks=[CustomLogger("my_agent")]
)

View File

@@ -13,7 +13,7 @@ The TrajectorySaverCallback records complete agent conversations including messa
from agent.callbacks import TrajectorySaverCallback
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
tools=[computer],
callbacks=[
TrajectorySaverCallback(
@@ -28,7 +28,7 @@ agent = ComputerAgent(
```python
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
trajectory_dir="trajectories", # Auto-save trajectories
tools=[computer]
)

View File

@@ -83,7 +83,7 @@ For long conversations, consider using the `only_n_most_recent_images` parameter
```python
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="cua/anthropic/claude-sonnet-4.5",
tools=[computer],
only_n_most_recent_images=3
)

View File

@@ -16,7 +16,7 @@ def calculate(a: int, b: int) -> int:
# Use with agent
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="cua/anthropic/claude-sonnet-4.5",
tools=[computer, calculate]
)
```
@@ -43,7 +43,7 @@ from computer import Computer
computer = Computer(...)
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="cua/anthropic/claude-sonnet-4.5",
tools=[computer, read_file],
)
```

View File

@@ -1,5 +1,5 @@
---
title: Customizing Your ComputerAgent
title: Customize ComputerAgent
---
<Callout>
@@ -74,7 +74,7 @@ Callbacks provide lifecycle hooks to preprocess messages, postprocess outputs, r
from agent.callbacks import ImageRetentionCallback, TrajectorySaverCallback, BudgetManagerCallback
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="cua/anthropic/claude-sonnet-4.5",
tools=[computer],
callbacks=[
ImageRetentionCallback(only_n_most_recent_images=3),

View File

@@ -1,4 +1,4 @@
{
"title": "Integrations",
"pages": ["hud"]
"pages": ["hud", "observability"]
}

View File

@@ -0,0 +1,66 @@
---
title: Observability
description: Trace CUA execution steps and sessions
---
## Observability
CUA has a native integration with [Laminar](https://laminar.sh/), an open-source platform for tracing, evals, and labeling of autonomous AI agents. Read more about Laminar in the [Laminar docs](https://docs.lmnr.ai/).
## Setup
Register on [Laminar Cloud](https://laminar.sh/) or spin up a [local instance](https://github.com/lmnr-ai/lmnr) and get the key from your project settings. Set the `LMNR_PROJECT_API_KEY` environment variable to your key.
```bash
pip install 'lmnr[all]'
export LMNR_PROJECT_API_KEY=your-key
```
## Usage
Then initialize Laminar at the entry point of your application and register the Laminar LiteLLM callback; every CUA step will be traced automatically.
```python
import asyncio
import os
import litellm
from agent import ComputerAgent
from computer import Computer
from lmnr import Laminar, LaminarLiteLLMCallback # [!code highlight]
Laminar.initialize() # [!code highlight]
litellm.callbacks.append(LaminarLiteLLMCallback()) # [!code highlight]
computer = Computer(
os_type="linux",
provider_type="cloud",
name=os.getenv("CUA_CONTAINER_NAME"),
api_key=os.getenv("CUA_API_KEY"),
)
agent = ComputerAgent(
model="openai/computer-use-preview",
tools=[computer],
)
async def main():
async for step in agent.run("Create a new file called 'test.txt' in the current directory"):
print(step["output"])
if __name__ == "__main__":
asyncio.run(main())
```
## Viewing traces
You can view traces in the Laminar UI from the traces tab in your project. When you select a trace,
you will see all of the agent execution steps, including computer actions, LLM calls, and screenshots.
For each step, you will see the LLM call and the resulting computer action; computer actions are highlighted in yellow in the timeline.
<img
src="/docs/img/laminar_trace_example.png"
alt="Example trace in Laminar showing the litellm.response span and its output."
width="800px"
/>

View File

@@ -10,11 +10,10 @@
"customizing-computeragent",
"callbacks",
"custom-tools",
"custom-computer-handlers",
"prompt-caching",
"usage-tracking",
"telemetry",
"benchmarks",
"migration-guide",
"integrations"
]
}

View File

@@ -7,7 +7,7 @@ This guide lists **breaking changes** when migrating from the original `Computer
## Breaking Changes
- **Initialization:**
- `ComputerAgent` (v0.4.x) uses `model` as a string (e.g. "anthropic/claude-3-5-sonnet-20241022") instead of `LLM` and `AgentLoop` objects.
- `ComputerAgent` (v0.4.x) uses `model` as a string (e.g. "anthropic/claude-sonnet-4-5-20250929") instead of `LLM` and `AgentLoop` objects.
- `tools` is a list (can include multiple computers and decorated functions).
- `callbacks` are now first-class for extensibility (image retention, budget, trajectory, logging, etc).
- **No explicit `loop` parameter:**
@@ -39,7 +39,7 @@ async with Computer() as computer:
```python
async with Computer() as computer:
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
tools=[computer]
)
messages = [{"role": "user", "content": "Take a screenshot"}]

View File

@@ -38,7 +38,7 @@ With the OpenAI provider, prompt caching is handled automatically for prompts of
```python
from agent import ComputerAgent
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
use_prompt_caching=True,
)
```

View File

@@ -32,7 +32,7 @@ Any vision-enabled LiteLLM-compatible model can be used as the planning componen
- Any Allinone CUA (planning-capable). See [Allinone CUAs](./computer-use-agents).
- Any VLM via LiteLLM providers: `anthropic/*`, `openai/*`, `openrouter/*`, `gemini/*`, `vertex_ai/*`, `huggingface-local/*`, `mlx/*`, etc.
- Examples:
- **Anthropic**: `anthropic/claude-3-5-sonnet-20241022`, `anthropic/claude-opus-4-1-20250805`
- **Anthropic**: `anthropic/claude-sonnet-4-5-20250929`, `anthropic/claude-opus-4-1-20250805`
- **OpenAI**: `openai/gpt-5`, `openai/gpt-o3`, `openai/gpt-4o`
- **Google**: `gemini/gemini-1.5-pro`, `vertex_ai/gemini-pro-vision`
- **Local models**: Any Hugging Face vision-language model
@@ -41,7 +41,7 @@ Any vision-enabled LiteLLM-compatible model can be used as the planning componen
### GTA1 + GPT-5
Use Google's Gemini for planning with specialized grounding:
Use OpenAI's GPT-5 for planning with specialized grounding:
```python
agent = ComputerAgent(
@@ -59,7 +59,7 @@ Combine state-of-the-art grounding with powerful reasoning:
```python
agent = ComputerAgent(
"huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-3-5-sonnet-20241022",
"huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929",
tools=[computer]
)
@@ -113,7 +113,7 @@ async for _ in agent.run("Close the settings window, then open the Downloads fol
Composed agents support both capabilities:
```python
agent = ComputerAgent("huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-3-5-sonnet-20241022")
agent = ComputerAgent("huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929")
# Full computer-use agent capabilities
async for _ in agent.run("Complete this online form"):

View File

@@ -29,10 +29,9 @@ Claude models with computer-use capabilities:
- Claude 4.1: `claude-opus-4-1-20250805`
- Claude 4: `claude-opus-4-20250514`, `claude-sonnet-4-20250514`
- Claude 3.7: `claude-3-7-sonnet-20250219`
- Claude 3.5: `claude-3-5-sonnet-20241022`
```python
agent = ComputerAgent("claude-3-5-sonnet-20241022", tools=[computer])
agent = ComputerAgent("claude-sonnet-4-5-20250929", tools=[computer])
async for _ in agent.run("Open Firefox and navigate to github.com"):
pass
```
@@ -78,10 +77,10 @@ async for _ in agent.run("Open Firefox and navigate to github.com"):
Qwen3 VL family:
- `openrouter/qwen/qwen3-vl-235b-a22b-instruct`
- `cua/qwen/qwen3-vl-235b` (via CUA VLM Router - recommended)
```python
agent = ComputerAgent("openrouter/qwen/qwen3-vl-235b-a22b-instruct", tools=[computer])
agent = ComputerAgent("cua/qwen/qwen3-vl-235b", tools=[computer])
async for _ in agent.run("Open Firefox and navigate to github.com"):
pass
```

View File

@@ -11,10 +11,10 @@ All models that support `ComputerAgent.run()` also support `ComputerAgent.predic
### Anthropic CUAs
- Claude 4.5: `claude-sonnet-4-5-20250929`
- Claude 4.1: `claude-opus-4-1-20250805`
- Claude 4: `claude-opus-4-20250514`, `claude-sonnet-4-20250514`
- Claude 3.7: `claude-3-7-sonnet-20250219`
- Claude 3.5: `claude-3-5-sonnet-20241022`
### OpenAI CUA Preview
@@ -61,7 +61,7 @@ Moondream3 is a powerful small model that can perform UI grounding and click pre
```python
# Using any grounding model for click prediction
agent = ComputerAgent("claude-3-5-sonnet-20241022", tools=[computer])
agent = ComputerAgent("claude-sonnet-4-5-20250929", tools=[computer])
# Predict coordinates for specific elements
login_coords = agent.predict_click("find the login button")
@@ -75,7 +75,7 @@ print(f"Menu icon: {menu_coords}")
```python
# OmniParser is just for OCR, so it requires an LLM for predict_click
agent = ComputerAgent("omniparser+anthropic/claude-3-5-sonnet-20241022", tools=[computer])
agent = ComputerAgent("omniparser+anthropic/claude-sonnet-4-5-20250929", tools=[computer])
# Predict click coordinates using composed agent
coords = agent.predict_click("find the submit button")

View File

@@ -0,0 +1,441 @@
---
title: CUA VLM Router
description: Intelligent vision-language model routing with cost optimization and unified access
---
# CUA VLM Router
The **CUA VLM Router** is an intelligent inference API that provides unified access to multiple vision-language model providers through a single API key. It offers cost optimization and detailed observability for production AI applications.
## Overview
Instead of managing multiple API keys and provider-specific code, CUA VLM Router acts as a smart cloud gateway that:
- **Unifies access** to multiple model providers
- **Optimizes costs** through intelligent routing and provider selection
- **Tracks usage** and costs with detailed metadata
- **Provides observability** with routing decisions and attempt logs
- **Manages infrastructure** - no provider API keys to manage yourself
## Quick Start
### 1. Get Your API Key
Sign up at [cua.ai](https://cua.ai/signin) and get your CUA API key from the dashboard.
### 2. Set Environment Variable
```bash
export CUA_API_KEY="sk_cua-api01_..."
```
### 3. Use with Agent SDK
```python
from agent import ComputerAgent
from computer import Computer
computer = Computer(os_type="linux", provider_type="docker")
agent = ComputerAgent(
model="cua/anthropic/claude-sonnet-4.5",
tools=[computer],
max_trajectory_budget=5.0
)
messages = [{"role": "user", "content": "Take a screenshot and tell me what's on screen"}]
async for result in agent.run(messages):
for item in result["output"]:
if item["type"] == "message":
print(item["content"][0]["text"])
```
## Available Models
The CUA VLM Router currently supports these models:
| Model ID | Provider | Description | Best For |
| --------------------------------- | --------- | ----------------- | --------------------------------------- |
| `cua/anthropic/claude-sonnet-4.5` | Anthropic | Claude Sonnet 4.5 | General-purpose tasks, recommended |
| `cua/anthropic/claude-opus-4.5` | Anthropic | Claude Opus 4.5 | Enhanced agentic and computer-use tasks |
| `cua/anthropic/claude-haiku-4.5` | Anthropic | Claude Haiku 4.5 | Fast responses, cost-effective |
| `cua/qwen/qwen3-vl-235b` | Qwen | Qwen3 VL 235B | Large-scale vision-language tasks |
## How It Works
### Intelligent Routing
When you make a request to CUA VLM Router:
1. **Model Resolution**: Your model ID (e.g., `cua/anthropic/claude-sonnet-4.5`) is resolved to the appropriate provider
2. **Provider Selection**: CUA routes your request to the appropriate model provider
3. **Response**: You receive an OpenAI-compatible response with metadata
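Because responses are OpenAI-compatible, you can also call the router directly from any OpenAI-style client. A minimal sketch, assuming the standard `openai` Python package is installed (the Agent SDK does this for you when you use the `cua/` model prefix):

```python
import os
from openai import OpenAI

# Point an OpenAI-compatible client at the CUA VLM Router
client = OpenAI(
    base_url="https://inference.cua.ai/v1",
    api_key=os.environ["CUA_API_KEY"],  # sk_cua-api01_...
)

response = client.chat.completions.create(
    model="anthropic/claude-sonnet-4.5",  # the raw API takes model IDs without the "cua/" prefix
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=100,
)
print(response.choices[0].message.content)
```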
## API Reference
### Base URL
```
https://inference.cua.ai/v1
```
### Authentication
All requests require an API key in the Authorization header:
```bash
Authorization: Bearer sk_cua-api01_...
```
### Endpoints
#### List Available Models
```bash
GET /v1/models
```
**Response:**
```json
{
"data": [
{
"id": "anthropic/claude-sonnet-4.5",
"name": "Claude Sonnet 4.5",
"object": "model",
"owned_by": "cua"
}
],
"object": "list"
}
```
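For example, a small `requests`-based sketch of calling this endpoint (assumes the `requests` package and a `CUA_API_KEY` environment variable):

```python
import os
import requests

# List the models available through the router
resp = requests.get(
    "https://inference.cua.ai/v1/models",
    headers={"Authorization": f"Bearer {os.environ['CUA_API_KEY']}"},
    timeout=30,
)
resp.raise_for_status()
for model in resp.json()["data"]:
    print(model["id"], "-", model["name"])
```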
#### Chat Completions
```bash
POST /v1/chat/completions
Content-Type: application/json
```
**Request:**
```json
{
"model": "anthropic/claude-sonnet-4.5",
"messages": [{ "role": "user", "content": "Hello!" }],
"max_tokens": 100,
"temperature": 0.7,
"stream": false
}
```
**Response:**
```json
{
"id": "gen_...",
"object": "chat.completion",
"created": 1763554838,
"model": "anthropic/claude-sonnet-4.5",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "Hello! How can I help you today?"
},
"finish_reason": "stop"
}
],
"usage": {
"prompt_tokens": 10,
"completion_tokens": 12,
"total_tokens": 22,
"cost": 0.01,
"is_byok": true
}
}
```
#### Streaming
Set `"stream": true` to receive server-sent events:
```bash
curl -X POST https://inference.cua.ai/v1/chat/completions \
-H "Authorization: Bearer sk_cua-api01_..." \
-H "Content-Type: application/json" \
-d '{
"model": "anthropic/claude-sonnet-4.5",
"messages": [{"role": "user", "content": "Count to 5"}],
"stream": true
}'
```
**Response (SSE format):**
```
data: {"id":"gen_...","choices":[{"delta":{"content":"1"}}],"object":"chat.completion.chunk"}
data: {"id":"gen_...","choices":[{"delta":{"content":"\n2"}}],"object":"chat.completion.chunk"}
data: {"id":"gen_...","choices":[{"delta":{"content":"\n3\n4\n5"}}],"object":"chat.completion.chunk"}
data: {"id":"gen_...","choices":[{"delta":{},"finish_reason":"stop"}],"usage":{...}}
```
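An OpenAI-compatible client will parse these events for you. A minimal streaming sketch, assuming the `openai` Python package:

```python
import os
from openai import OpenAI

client = OpenAI(
    base_url="https://inference.cua.ai/v1",
    api_key=os.environ["CUA_API_KEY"],
)

# Each SSE chunk carries a content delta; print tokens as they arrive
stream = client.chat.completions.create(
    model="anthropic/claude-sonnet-4.5",
    messages=[{"role": "user", "content": "Count to 5"}],
    stream=True,
)
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
print()
```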
#### Check Balance
```bash
GET /v1/balance
```
**Response:**
```json
{
"balance": 211689.85,
"currency": "credits"
}
```
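A short sketch for polling your balance (same assumptions as the `/v1/models` sketch above):

```python
import os
import requests

# Check remaining credits before kicking off a long agent run
resp = requests.get(
    "https://inference.cua.ai/v1/balance",
    headers={"Authorization": f"Bearer {os.environ['CUA_API_KEY']}"},
    timeout=30,
)
resp.raise_for_status()
data = resp.json()
print(f"{data['balance']} {data['currency']} remaining")
```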
## Cost Tracking
CUA VLM Router provides detailed cost information in every response:
### Credit System
Requests are billed in **credits**:
- Credits are deducted from your CUA account balance
- Prices vary by model and usage
- CUA manages all provider API keys and infrastructure
### Response Cost Fields
```json
{
"usage": {
"cost": 0.01, // CUA gateway cost in credits
"market_cost": 0.000065 // Actual upstream API cost
}
}
```
**Note:** CUA VLM Router is a fully managed cloud service. If you want to use your own provider API keys directly (BYOK), see the [Supported Model Providers](/agent-sdk/supported-model-providers/) page for direct provider access via the agent SDK.
## Response Metadata
CUA VLM Router includes metadata about routing decisions and costs in the response. This information helps with debugging and monitoring your application's model usage.
## Configuration
### Environment Variables
```bash
# Required: Your CUA API key
export CUA_API_KEY="sk_cua-api01_..."
# Optional: Custom endpoint (defaults to https://inference.cua.ai/v1)
export CUA_BASE_URL="https://custom-endpoint.cua.ai/v1"
```
### Python SDK Configuration
```python
from agent import ComputerAgent
# Using environment variables (recommended)
agent = ComputerAgent(model="cua/anthropic/claude-sonnet-4.5")
# Or explicit configuration
agent = ComputerAgent(
model="cua/anthropic/claude-sonnet-4.5",
# CUA adapter automatically loads from CUA_API_KEY
)
```
## Benefits Over Direct Provider Access
| Feature | CUA VLM Router | Direct Provider (BYOK) |
| -------------------------- | ---------------------------- | --------------------------------- |
| **Single API Key** | ✅ One key for all providers | ❌ Multiple keys to manage |
| **Managed Infrastructure** | ✅ No API key management | ❌ Manage multiple provider keys |
| **Usage Tracking** | ✅ Unified dashboard | ❌ Per-provider tracking |
| **Model Switching** | ✅ Change model string only | ❌ Change code + keys |
| **Setup Complexity** | ✅ One environment variable | ❌ Multiple environment variables |
## Error Handling
### Common Error Responses
#### Insufficient Credits
```json
{
"detail": "Insufficient credits. Current balance: 0.00 credits"
}
```
#### Missing Authorization
```json
{
"detail": "Missing Authorization: Bearer token"
}
```
#### Invalid Model
```json
{
"detail": "Invalid or unavailable model"
}
```
### Best Practices
1. **Check balance periodically** using `/v1/balance`
2. **Handle rate limits** with exponential backoff (see the sketch after this list)
3. **Log generation IDs** for debugging
4. **Set up usage alerts** in your CUA dashboard
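A minimal retry sketch for the rate-limit recommendation above (assumes the `requests` package; treating HTTP 429 as the rate-limit signal is an assumption, adjust to the errors you actually observe):

```python
import os
import time
import requests

def chat_with_backoff(payload: dict, max_retries: int = 5) -> dict:
    """POST to /v1/chat/completions, retrying with exponential backoff on rate limits."""
    url = "https://inference.cua.ai/v1/chat/completions"
    headers = {"Authorization": f"Bearer {os.environ['CUA_API_KEY']}"}
    delay = 1.0
    for _ in range(max_retries):
        resp = requests.post(url, json=payload, headers=headers, timeout=60)
        if resp.status_code != 429:  # assumption: 429 signals a rate limit
            resp.raise_for_status()
            return resp.json()
        time.sleep(delay)
        delay *= 2  # back off: 1s, 2s, 4s, ...
    raise RuntimeError("Rate limited after retries")
```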
## Examples
### Basic Usage
```python
from agent import ComputerAgent
from computer import Computer
computer = Computer(os_type="linux", provider_type="docker")
agent = ComputerAgent(
model="cua/anthropic/claude-sonnet-4.5",
tools=[computer]
)
messages = [{"role": "user", "content": "Open Firefox"}]
async for result in agent.run(messages):
print(result)
```
### Direct API Call (curl)
```bash
curl -X POST https://inference.cua.ai/v1/chat/completions \
-H "Authorization: Bearer ${CUA_API_KEY}" \
-H "Content-Type: application/json" \
-d '{
"model": "anthropic/claude-sonnet-4.5",
"messages": [
{"role": "user", "content": "Explain quantum computing"}
],
"max_tokens": 200
}'
```
### With Custom Parameters
```python
agent = ComputerAgent(
model="cua/anthropic/claude-haiku-4.5",
tools=[computer],
max_trajectory_budget=10.0,
temperature=0.7
)
```
### Using Qwen3 VL 235B
```python
from agent import ComputerAgent
from computer import Computer
computer = Computer(os_type="linux", provider_type="docker")
agent = ComputerAgent(
model="cua/qwen/qwen3-vl-235b",
tools=[computer],
only_n_most_recent_images=3
)
messages = [{"role": "user", "content": "Open a browser and search for Python tutorials"}]
async for result in agent.run(messages):
print(result)
```
### Using Claude Opus 4.5
```python
from agent import ComputerAgent
from computer import Computer
computer = Computer(
os_type="linux",
provider_type="cloud",
name="your-container-name",
api_key="your-cua-api-key"
)
agent = ComputerAgent(
model="cua/anthropic/claude-opus-4.5",
tools=[computer],
instructions="You are a helpful assistant that can control computers",
only_n_most_recent_images=3
)
messages = [{"role": "user", "content": "Open a browser and search for Python tutorials"}]
async for result in agent.run(messages):
print(result)
```
## Migration from Direct Provider Access
Switching from direct provider access (BYOK) to CUA VLM Router is simple:
**Before (Direct Provider Access with BYOK):**
```python
import os
# Required: Provider-specific API key
os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..."
agent = ComputerAgent(
model="anthropic/claude-sonnet-4-5-20250929",
tools=[computer]
)
```
**After (CUA VLM Router - Cloud Service):**
```python
import os
# Required: CUA API key only (no provider keys needed)
os.environ["CUA_API_KEY"] = "sk_cua-api01_..."
agent = ComputerAgent(
model="cua/anthropic/claude-sonnet-4.5", # Add "cua/" prefix
tools=[computer]
)
```
That's it! Same code structure, just different model format. CUA manages all provider infrastructure and credentials for you.
## Support
- **Documentation**: [cua.ai/docs](https://cua.ai/docs)
- **Discord**: [Join our community](https://discord.com/invite/mVnXXpdE85)
- **Issues**: [GitHub Issues](https://github.com/trycua/cua/issues)
## Next Steps
- Explore [Agent Loops](/agent-sdk/agent-loops) to customize agent behavior
- Learn about [Cost Saving Callbacks](/agent-sdk/callbacks/cost-saving)
- Try [Example Use Cases](/example-usecases/form-filling)
- Review [Supported Model Providers](/agent-sdk/supported-model-providers/) for all options

View File

@@ -4,23 +4,51 @@ title: Supported Model Providers
## Supported Models
### Anthropic Claude (Computer Use API)
### CUA VLM Router (Recommended)
Use CUA's cloud inference API for intelligent routing and cost optimization with a single API key. CUA manages all provider infrastructure and credentials for you.
```python
model="cua/anthropic/claude-sonnet-4.5" # Claude Sonnet 4.5 (recommended)
model="cua/anthropic/claude-haiku-4.5" # Claude Haiku 4.5 (faster)
```
**Benefits:**
- Single API key for multiple providers
- Cost tracking and optimization
- Fully managed infrastructure (no provider keys to manage)
[Learn more about CUA VLM Router →](/agent-sdk/supported-model-providers/cua-vlm-router)
---
### Anthropic Claude (Computer Use API - BYOK)
Direct access to Anthropic's Claude models using your own Anthropic API key (BYOK - Bring Your Own Key).
```python
model="anthropic/claude-3-5-sonnet-20241022"
model="anthropic/claude-3-7-sonnet-20250219"
model="anthropic/claude-opus-4-20250514"
model="anthropic/claude-sonnet-4-20250514"
```
### OpenAI Computer Use Preview
**Setup:** Set `ANTHROPIC_API_KEY` environment variable with your Anthropic API key.
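A minimal BYOK sketch (the Docker sandbox setup mirrors other examples in these docs and is an assumption here):

```python
from agent import ComputerAgent
from computer import Computer

# Requires ANTHROPIC_API_KEY to be set in your environment (BYOK)
computer = Computer(os_type="linux", provider_type="docker")
agent = ComputerAgent(
    model="anthropic/claude-sonnet-4-20250514",  # any of the model IDs listed above
    tools=[computer],
)
```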
### OpenAI Computer Use Preview (BYOK)
Direct access to OpenAI's computer use models using your own OpenAI API key (BYOK).
```python
model="openai/computer-use-preview"
```
**Setup:** Set `OPENAI_API_KEY` environment variable with your OpenAI API key.
### UI-TARS (Local or Huggingface Inference)
Run UI-TARS models locally for privacy and offline use.
```python
model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"
model="ollama_chat/0000/ui-tars-1.5-7b"
@@ -28,9 +56,11 @@ model="ollama_chat/0000/ui-tars-1.5-7b"
### Omniparser + Any LLM
Combine Omniparser for UI understanding with any LLM provider.
```python
model="omniparser+ollama_chat/mistral-small3.2"
model="omniparser+vertex_ai/gemini-pro"
model="omniparser+anthropic/claude-3-5-sonnet-20241022"
model="omniparser+anthropic/claude-sonnet-4-5-20250929"
model="omniparser+openai/gpt-4o"
```
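A short usage sketch for a composed model (mirrors the composed-agent examples elsewhere in these docs; the Docker sandbox is an assumption):

```python
import asyncio

from agent import ComputerAgent
from computer import Computer

computer = Computer(os_type="linux", provider_type="docker")

# OmniParser provides UI grounding; the paired LLM does the planning
agent = ComputerAgent(
    model="omniparser+anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
)

async def main():
    async for _ in agent.run("Open Firefox and navigate to github.com"):
        pass

if __name__ == "__main__":
    asyncio.run(main())
```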

View File

@@ -1,84 +1,74 @@
---
title: Telemetry
description: This document explains how telemetry works in CUA libraries and how you can control it.
icon: RadioTower
description: How telemetry works in Cua and how to control it
---
# Telemetry in CUA
# Telemetry
CUA tracks anonymized usage and error report statistics; we ascribe to Posthog's approach as detailed [here](https://posthog.com/blog/open-source-telemetry-ethical). If you would like to opt out of sending anonymized info, you can set `telemetry_enabled` to false.
Cua collects anonymized usage and error statistics. We follow [Posthog's ethical telemetry approach](https://posthog.com/blog/open-source-telemetry-ethical). To opt out, set `telemetry_enabled` to false.
## What telemetry data we collect
## What we collect
CUA libraries collect usage data to help improve our software. We have two categories of telemetry:
### Enabled by default (opt-out)
### Opt-Out Telemetry (Enabled by Default)
- System info: OS, OS version, Python version
- Module initialization: When modules are imported and their versions
- Performance: Agent run durations, step counts, token usage, API costs
- Session tracking: Anonymous session IDs and run IDs
Basic performance metrics and system information that help us understand usage patterns:
### Disabled by default (opt-in)
- **System Information**: Operating system, OS version, Python version
- **Module Initialization**: When modules are imported and their versions
- **Performance Metrics**: Agent run durations, step counts, token usage, and API costs
- **Session Tracking**: Anonymous session IDs and run IDs for performance analysis
### Opt-In Telemetry (Disabled by Default)
**Conversation Trajectory Logging**: Full conversation history including:
**Trajectory logging** captures full conversation history:
- User messages and agent responses
- Computer actions and their outputs
- Reasoning traces from the agent
- Computer actions and outputs
- Agent reasoning traces
**Important**: Trajectory logging is **opt-in only** and must be explicitly enabled.
Must be explicitly enabled.
### We do NOT collect:
### We don't collect
- Personal information or user identifiers
- API keys or credentials
- File contents or application data
- Information about files being accessed
- Actual screenshots or screen contents (unless trajectory logging is enabled)
- Specific text being typed, including user inputs, model outputs, computer outputs, or tool call outputs (unless trajectory logging is enabled)
- Files being accessed
- Screenshots or screen contents (unless trajectory logging is enabled)
- Text being typed, user inputs, model outputs, computer outputs, or tool call outputs (unless trajectory logging is enabled)
## Controlling Telemetry
## How to disable
We are committed to transparency and user control over telemetry. There are two ways to control telemetry:
### Environment variable (global)
### 1. Environment Variable (Global Control)
Telemetry is enabled by default. To disable telemetry, set the `CUA_TELEMETRY_ENABLED` environment variable to a falsy value (`0`, `false`, `no`, or `off`):
Set `CUA_TELEMETRY_ENABLED` to a falsy value (`0`, `false`, `no`, or `off`):
```bash
# Disable telemetry before running your script
export CUA_TELEMETRY_ENABLED=false
# Or as part of the command
CUA_TELEMETRY_ENABLED=false python your_script.py
```
Or from Python:
Or in Python:
```python
import os
os.environ["CUA_TELEMETRY_ENABLED"] = "false"
```
### 2. Instance-Level Control
<Callout type="info">
**Deprecated environment variables:** The environment variables `CUA_TELEMETRY` and
`CUA_TELEMETRY_DISABLED` are deprecated and no longer have any effect. Use `CUA_TELEMETRY_ENABLED`
instead.
</Callout>
#### Computer SDK
### Per instance
**Computer SDK:**
```python
from computer import Computer
# Enable telemetry (default)
computer = Computer(telemetry_enabled=True)
# Disable telemetry
computer = Computer(telemetry_enabled=False)
```
#### Agent SDK
**Agent SDK:**
```python
from agent import ComputerAgent
@@ -86,60 +76,60 @@ import os
# Basic telemetry - performance metrics only (opt-out, enabled by default)
agent = ComputerAgent(
model="claude-3-5-sonnet-20241022",
model="claude-sonnet-4-5-20250929",
telemetry_enabled=True # Default is True
)
# Enable telemetry with full conversation trajectory logging (opt-in)
agent = ComputerAgent(
model="claude-3-5-sonnet-20241022",
model="claude-sonnet-4-5-20250929",
telemetry_enabled={
"log_trajectory": True # Logs full conversation items
}
)
# Disable telemetry completely
# Disable completely
agent = ComputerAgent(
model="claude-3-5-sonnet-20241022",
model="claude-sonnet-4-5-20250929",
telemetry_enabled=False
)
# Disable telemetry completely using environment variables
os.environ["CUA_TELEMETRY_ENABLED"] = "false"
# Enable trajectory logging (opt-in)
agent = ComputerAgent(
model="claude-3-5-sonnet-20241022"
model="claude-sonnet-4-5-20250929",
telemetry_enabled={"log_trajectory": True}
)
```
You can check if telemetry is enabled for an instance:
Check status:
```python
print(computer.telemetry_enabled) # Will print True or False
print(agent.telemetry_enabled) # Will print True, False, or dict
print(computer.telemetry_enabled) # True or False
print(agent.telemetry_enabled) # True, False, or dict
```
Note that telemetry settings must be configured during initialization and cannot be changed after the object is created.
Telemetry settings are configured at initialization and can't be changed afterward.
## Detailed Telemetry Events
## Events collected
### Computer SDK Events
### Computer SDK
| Event Name | Data Collected | Trigger Notes |
| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------- |
| **computer_initialized** | • `os`: Operating system (e.g., 'windows', 'darwin', 'linux')<br />• `os_version`: OS version<br />• `python_version`: Python version | Triggered when a Computer instance is created |
| **module_init** | • `module`: "computer"<br />• `version`: Package version<br />• `python_version`: Full Python version string | Triggered once when the computer package is imported for the first time |
### Agent SDK Events
### Agent SDK
| Event Name | Data Collected | Trigger Notes |
| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------- |
| **module_init** | • `module`: "agent"<br />• `version`: Package version<br />• `python_version`: Full Python version string | Triggered once when the agent package is imported for the first time |
| **agent_session_start** | • `session_id`: Unique UUID for this agent instance<br />• `agent_type`: Class name (e.g., "ComputerAgent")<br />• `model`: Model name (e.g., "claude-3-5-sonnet")<br />• `os`: Operating system<br />• `os_version`: OS version<br />• `python_version`: Python version | Triggered when TelemetryCallback is initialized (agent instantiation) |
| **agent_session_start** | • `session_id`: Unique UUID for this agent instance<br />• `agent_type`: Class name (e.g., "ComputerAgent")<br />• `model`: Model name (e.g., "claude-sonnet-4-5")<br />• `os`: Operating system<br />• `os_version`: OS version<br />• `python_version`: Python version | Triggered when TelemetryCallback is initialized (agent instantiation) |
| **agent_run_start** | • `session_id`: Agent session UUID<br />• `run_id`: Unique UUID for this run<br />• `start_time`: Unix timestamp<br />• `input_context_size`: Character count of input messages<br />• `num_existing_messages`: Count of existing messages<br />• `uploaded_trajectory`: Full conversation items (opt-in) | Triggered at the start of each agent.run() call |
| **agent_run_end** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `end_time`: Unix timestamp<br />• `duration_seconds`: Total run duration<br />• `num_steps`: Total steps taken in this run<br />• `total_usage`: Accumulated token usage and costs<br />• `uploaded_trajectory`: Full conversation items (opt-in) | Triggered at the end of each agent.run() call |
| **agent_step** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `step`: Step number (incremental)<br />• `timestamp`: Unix timestamp<br />• `duration_seconds`: Duration of previous step | Triggered on each agent response/step during a run |
| **agent_usage** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `step`: Current step number<br />• `prompt_tokens`: Tokens in prompt<br />• `completion_tokens`: Tokens in response<br />• `total_tokens`: Total tokens used<br />• `response_cost`: Cost of this API call | Triggered whenever usage information is received from LLM API |
## Transparency
## Questions
We believe in being transparent about the data we collect. If you have any questions about our telemetry practices, please open an issue on our GitHub repository.
Questions about telemetry? Open an issue on our [GitHub repository](https://github.com/trycua/cua).

View File

@@ -1,32 +1,32 @@
---
title: Cloud VM Management
description: Manage your Cua Cloud sandboxes (VMs) via Python SDK or HTTP API
title: Cloud Sandbox Management
description: Manage your Cua Cloud sandboxes via Python SDK or HTTP API
---
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
Using the Cua Cloud API, you can manage your Cua Cloud sandboxes (VMs) with Python or HTTP (curl).
Using the Cua Cloud API, you can manage your Cua Cloud sandboxes with Python or HTTP (curl).
All examples require a CUA API key. You can obtain one from the [Dashboard](https://www.cua.ai/dashboard/keys).
---
## List VMs
## List Sandboxes
<Tabs items={['Python', 'curl']}>
<Tab value="Python">
```python
import os
import asyncio
from computer.providers.cloud.provider import CloudProvider
async def main():
api_key = os.getenv("CUA_API_KEY") or "your-api-key"
# CloudProvider automatically reads CUA_API_KEY from environment
# You can also pass api_key explicitly: CloudProvider(api_key="your-api-key")
# Optional: point to a different API base
# os.environ["CUA_API_BASE"] = "https://api.cua.ai"
provider = CloudProvider(api_key=api_key, verbose=False)
provider = CloudProvider(verbose=False)
async with provider:
vms = await provider.list_vms()
for vm in vms:
@@ -51,7 +51,7 @@ curl -H "Authorization: Bearer $CUA_API_KEY" \
Responses:
- 200: Array of minimal VM objects with fields `{ name, password, status }`
- 200: Array of minimal sandbox objects with fields `{ name, password, status }`
- 401: Unauthorized (missing/invalid API key)
```json
@@ -66,11 +66,11 @@ Responses:
Status values:
- `pending`: VM deployment in progress
- `running`: VM is active and accessible
- `stopped`: VM is stopped but not terminated
- `terminated`: VM has been permanently destroyed
- `failed`: VM deployment or operation failed
- `pending`: Sandbox deployment in progress
- `running`: Sandbox is active and accessible
- `stopped`: Sandbox is stopped but not terminated
- `terminated`: Sandbox has been permanently destroyed
- `failed`: Sandbox deployment or operation failed
---
@@ -80,23 +80,22 @@ Status values:
---
## Start a VM
## Start a Sandbox
Provide the VM name you want to start.
Provide the sandbox name you want to start.
<Tabs items={["Python", "curl"]}>
<Tab value="Python">
```python
import os
import asyncio
from computer.providers.cloud.provider import CloudProvider
async def main():
api_key = os.getenv("CUA_API_KEY") or "your-api-key"
# CloudProvider automatically reads CUA_API_KEY from environment
name = "my-vm-name" # e.g., "m-linux-96lcxd2c2k"
provider = CloudProvider(api_key=api_key)
provider = CloudProvider()
async with provider:
resp = await provider.run_vm(name)
print(resp) # { "name": name, "status": "starting" }
@@ -118,7 +117,7 @@ Responses:
- 204: No Content (start accepted)
- 401: Unauthorized (missing/invalid API key)
- 404: VM not found or not owned by the user
- 404: Sandbox not found or not owned by the user
```text
HTTP/1.1 204 No Content
@@ -129,23 +128,22 @@ HTTP/1.1 204 No Content
---
## Stop a VM
## Stop a Sandbox
Stops the VM asynchronously.
Stops the sandbox asynchronously.
<Tabs items={["Python", "curl"]}>
<Tab value="Python">
```python
import os
import asyncio
from computer.providers.cloud.provider import CloudProvider
async def main():
api_key = os.getenv("CUA_API_KEY") or "your-api-key"
# CloudProvider automatically reads CUA_API_KEY from environment
name = "my-vm-name"
provider = CloudProvider(api_key=api_key)
provider = CloudProvider()
async with provider:
resp = await provider.stop_vm(name)
print(resp) # { "name": name, "status": "stopping" }
@@ -167,7 +165,7 @@ Responses:
- 202: Accepted with `{ "status": "stopping" }`
- 401: Unauthorized (missing/invalid API key)
- 404: VM not found or not owned by the user
- 404: Sandbox not found or not owned by the user
```json
{ "status": "stopping" }
@@ -178,23 +176,22 @@ Responses:
---
## Restart a VM
## Restart a Sandbox
Restarts the VM asynchronously.
Restarts the sandbox asynchronously.
<Tabs items={["Python", "curl"]}>
<Tab value="Python">
```python
import os
import asyncio
from computer.providers.cloud.provider import CloudProvider
async def main():
api_key = os.getenv("CUA_API_KEY") or "your-api-key"
# CloudProvider automatically reads CUA_API_KEY from environment
name = "my-vm-name"
provider = CloudProvider(api_key=api_key)
provider = CloudProvider()
async with provider:
resp = await provider.restart_vm(name)
print(resp) # { "name": name, "status": "restarting" }
@@ -216,7 +213,7 @@ Responses:
- 202: Accepted with `{ "status": "restarting" }`
- 401: Unauthorized (missing/invalid API key)
- 404: VM not found or not owned by the user
- 404: Sandbox not found or not owned by the user
```json
{ "status": "restarting" }
@@ -227,23 +224,22 @@ Responses:
---
## Query a VM by name
## Query a Sandbox by name
Query the computer-server running on the VM. Useful for checking details like status or OS type.
Query the computer-server running on the sandbox. Useful for checking details like status or OS type.
<Tabs items={["Python", "curl"]}>
<Tab value="Python">
```python
import os
import asyncio
from computer.providers.cloud.provider import CloudProvider
async def main():
api_key = os.getenv("CUA_API_KEY") or "your-api-key"
# CloudProvider automatically reads CUA_API_KEY from environment
name = "my-vm-name"
provider = CloudProvider(api_key=api_key)
provider = CloudProvider()
async with provider:
info = await provider.get_vm(name)
print(info)

View File

@@ -18,7 +18,7 @@ Execute shell commands and get detailed results:
# Run shell command
result = await computer.interface.run_command(cmd) # result.stdout, result.stderr, result.returncode
```
</Tab>
<Tab value="TypeScript">
@@ -230,7 +230,7 @@ Control desktop environment features like wallpaper:
env = await computer.interface.get_desktop_environment()
print(env) # "xfce4"
# Set desktop wallpaper to an image file accessible on the VM
# Set desktop wallpaper to an image file accessible on the sandbox
await computer.interface.set_wallpaper("/home/cua/shared/wallpaper.png")
```
@@ -241,7 +241,7 @@ Control desktop environment features like wallpaper:
const env = await computer.interface.getDesktopEnvironment();
print(env) # "xfce4"
// Set desktop wallpaper to an image file accessible on the VM
// Set desktop wallpaper to an image file accessible on the sandbox
await computer.interface.setWallpaper('/home/cua/shared/wallpaper.png');
```

View File

@@ -1,7 +1,12 @@
---
title: Computer UI
title: Computer UI (Deprecated)
---
<Callout type="warn" title="Deprecated">
The Computer UI is deprecated and will be replaced with a revamped playground experience soon. We
recommend using VNC or Screen Sharing for precise control of the computer instead.
</Callout>
The computer module includes a Gradio UI for creating and sharing demonstration data. We make it easy for people to build community datasets for better computer use models with an upload to Huggingface feature.
```bash

View File

@@ -1,29 +1,20 @@
---
title: Cua Computers
title: Computer Types
description: Understanding Cua computer types and connection methods
---
<Callout>
A corresponding{' '}
<a href="https://github.com/trycua/cua/blob/main/notebooks/computer_nb.ipynb" target="_blank">
Jupyter Notebook
</a>{' '}
and{' '}
<a href="https://github.com/trycua/cua/tree/main/examples/computer-example-ts" target="_blank">
NodeJS project
</a>{' '}
are available for this documentation.
</Callout>
{/* prettier-ignore */}
<Callout>A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/computer_nb.ipynb" target="_blank">Jupyter Notebook</a> and <a href="https://github.com/trycua/cua/tree/main/examples/computer-example-ts" target="_blank">NodeJS project</a> are available for this documentation.</Callout>
Before we can automate apps using AI, we need to first connect to a Computer Server to give the AI a safe environment to execute workflows in.
Cua Computers are preconfigured virtual machines running the Computer Server. They can be either macOS, Linux, or Windows. They're found in either a cloud-native container, or on your host desktop.
Cua Computers are preconfigured sandboxes running the Computer Server. They can be either macOS, Linux, or Windows. They're found in either a cloud-native sandbox, or on your host desktop.
## Cloud Sandbox
**Easiest & safest way to get started - works on any host OS**
This is a Cloud Sandbox running the Computer Server. Get a container at [cua.ai](https://cua.ai/).
This is a Cloud Sandbox running the Computer Server. Get a sandbox at [cua.ai](https://cua.ai/).
<Tabs items={['Python', 'TypeScript']}>
<Tab value="Python">
@@ -85,7 +76,7 @@ Cua provides two Docker images for running Linux desktops:
os_type="linux",
provider_type="docker",
image="trycua/cua-xfce:latest",
name="my-xfce-container"
name="my-xfce-sandbox"
)
await computer.run() # Launch & connect to Docker sandbox
@@ -118,7 +109,7 @@ Cua provides two Docker images for running Linux desktops:
os_type="linux",
provider_type="docker",
image="trycua/cua-ubuntu:latest",
name="my-kasm-container"
name="my-kasm-sandbox"
)
await computer.run() # Launch & connect to Docker sandbox
@@ -152,7 +143,7 @@ computer = Computer(
await computer.run() # Launch & connect to Windows Sandbox
```
## macOS VM
## macOS Sandbox
**macOS hosts only - requires Lume CLI**
@@ -162,7 +153,7 @@ await computer.run() # Launch & connect to Windows Sandbox
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)"
```
2. Start a local Cua macOS VM
2. Start a local Cua macOS sandbox
```bash
lume run macos-sequoia-cua:latest

View File

@@ -34,7 +34,7 @@ You can then use this as a tool for your agent:
from agent import ComputerAgent
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="cua/anthropic/claude-sonnet-4.5",
tools=[custom_computer],
)
@@ -122,7 +122,7 @@ class MyCustomComputer(AsyncComputerHandler):
custom_computer = MyCustomComputer()
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="cua/anthropic/claude-sonnet-4.5",
tools=[custom_computer],
)

View File

@@ -1,5 +1,12 @@
{
"title": "Computer SDK",
"description": "Build computer-using agents with the Computer SDK",
"pages": ["computers", "commands", "computer-ui", "tracing-api", "sandboxed-python"]
"pages": [
"computers",
"commands",
"tracing-api",
"sandboxed-python",
"custom-computer-handlers",
"computer-ui"
]
}

View File

@@ -33,7 +33,7 @@ def read_file(location: str) -> str:
return f.read()
async def main():
async with Computer(os_type="linux", provider_type="cloud", name="my-container", api_key="...") as computer:
async with Computer(os_type="linux", provider_type="cloud", name="my-sandbox", api_key="...") as computer:
# Call the sandboxed function (runs remotely)
result = await read_file("/etc/hostname")
print(result)
@@ -60,7 +60,7 @@ await my_computer.venv_install("myenv", ["requests"])
You can use sandboxed functions to interact with macOS applications on a local Cua Computer (requires `os_type="darwin"`). This is particularly useful for automation tasks that involve GUI applications.
```python
# Example: Use sandboxed functions to execute code in a Cua Container
# Example: Use sandboxed functions to execute code in a Cua Sandbox
from computer.helpers import sandboxed
await computer.venv_install("demo_venv", ["macos-pyxa"]) # Install packages in a virtual environment
@@ -71,10 +71,10 @@ def greet_and_print(name):
import PyXA
safari = PyXA.Application("Safari")
html = safari.current_document.source()
print(f"Hello from inside the container, {name}!")
print(f"Hello from inside the sandbox, {name}!")
return {"greeted": name, "safari_html": html}
# When a @sandboxed function is called, it will execute in the container
# When a @sandboxed function is called, it will execute in the sandbox
result = await greet_and_print("Cua")
# Result: {"greeted": "Cua", "safari_html": "<html>...</html>"}
# stdout and stderr are also captured and printed / raised

View File

@@ -7,11 +7,6 @@ description: Record computer interactions for debugging, training, and analysis
The Computer tracing API provides a powerful way to record computer interactions for debugging, training, analysis, and compliance purposes. Inspired by Playwright's tracing functionality, it offers flexible recording options and standardized output formats.
<Callout>
The tracing API addresses GitHub issue #299 by providing a unified recording interface that works
with any Computer usage pattern, not just ComputerAgent.
</Callout>
## Overview
The tracing API allows you to:

View File

@@ -1,9 +1,9 @@
---
title: Form Filling
title: PDF to Form Automation
description: Enhance and Automate Interactions Between Form Filling and Local File Systems
---
import { EditableCodeBlock, EditableValue, S } from '@/components/editable-code-block';
import { Step, Steps } from 'fumadocs-ui/components/steps';
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
## Overview
@@ -12,9 +12,17 @@ Cua can be used to automate interactions between form filling and local file sys
This preset usecase uses [Cua Computer](/computer-sdk/computers) to interact with a web page and local file systems along with [Agent Loops](/agent-sdk/agent-loops) to run the agent in a loop with message history.
## Quickstart
---
Create a `requirements.txt` file with the following dependencies:
<Steps>
<Step>
### Set Up Your Environment
First, install the required dependencies:
Create a `requirements.txt` file:
```text
cua-agent
@@ -22,33 +30,32 @@ cua-computer
python-dotenv>=1.0.0
```
And install:
Install the dependencies:
```bash
pip install -r requirements.txt
```
Create a `.env` file with the following environment variables:
Create a `.env` file with your API keys:
```text
ANTHROPIC_API_KEY=your-api-key
ANTHROPIC_API_KEY=your-anthropic-api-key
CUA_API_KEY=sk_cua-api01...
```
Select the environment you want to run the code in (_click on the underlined values in the code to edit them directly!_):
</Step>
<Tabs items={['☁️ Cloud', '🐳 Docker', '🍎 Lume', '🪟 Windows Sandbox']}>
<Tab value="☁️ Cloud">
<Step>
<EditableCodeBlock
key="cloud-tab"
lang="python"
defaultValues={{
"container-name": "m-linux-...",
"api_key": "sk_cua-api01..."
}}
>
{`import asyncio
### Create Your Form Filling Script
Create a Python file (e.g., `form_filling.py`) and select your environment:
<Tabs items={['Cloud Sandbox', 'Linux on Docker', 'macOS Sandbox', 'Windows Sandbox']}>
<Tab value="Cloud Sandbox">
```python
import asyncio
import logging
import os
import signal
@@ -59,24 +66,24 @@ from computer import Computer, VMProviderType
from dotenv import load_dotenv
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(**name**)
logger = logging.getLogger(__name__)
def handle_sigint(sig, frame):
print("\\n\\nExecution interrupted by user. Exiting gracefully...")
exit(0)
print("\n\nExecution interrupted by user. Exiting gracefully...")
exit(0)
async def fill_application():
try:
async with Computer(
os_type="linux",
provider_type=VMProviderType.CLOUD,
name="`}<EditableValue placeholder="container-name" />{`",
api_key="`}<EditableValue placeholder="api_key" />{`",
verbosity=logging.INFO,
) as computer:
try:
async with Computer(
os_type="linux",
provider_type=VMProviderType.CLOUD,
name="your-sandbox-name", # Replace with your sandbox name
api_key=os.environ["CUA_API_KEY"],
verbosity=logging.INFO,
) as computer:
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="cua/anthropic/claude-sonnet-4.5",
tools=[computer],
only_n_most_recent_images=3,
verbosity=logging.INFO,
@@ -93,7 +100,7 @@ verbosity=logging.INFO,
history = []
for i, task in enumerate(tasks, 1):
print(f"\\n[Task {i}/{len(tasks)}] {task}")
print(f"\n[Task {i}/{len(tasks)}] {task}")
# Add user message to history
history.append({"role": "user", "content": task})
@@ -116,7 +123,7 @@ verbosity=logging.INFO,
print(f"✅ Task {i}/{len(tasks)} completed")
print("\\n🎉 All tasks completed successfully!")
print("\n🎉 All tasks completed successfully!")
except Exception as e:
logger.error(f"Error in fill_application: {e}")
@@ -124,18 +131,18 @@ verbosity=logging.INFO,
raise
def main():
try:
load_dotenv()
try:
load_dotenv()
if "ANTHROPIC_API_KEY" not in os.environ:
raise RuntimeError(
"Please set the ANTHROPIC_API_KEY environment variable.\\n"
"Please set the ANTHROPIC_API_KEY environment variable.\n"
"You can add it to a .env file in the project root."
)
if "CUA_API_KEY" not in os.environ:
raise RuntimeError(
"Please set the CUA_API_KEY environment variable.\\n"
"Please set the CUA_API_KEY environment variable.\n"
"You can add it to a .env file in the project root."
)
@@ -147,22 +154,15 @@ load_dotenv()
logger.error(f"Error running automation: {e}")
traceback.print_exc()
if **name** == "**main**":
main()`}
</EditableCodeBlock>
if __name__ == "__main__":
main()
```
</Tab>
<Tab value="🍎 Lume">
<Tab value="Linux on Docker">
<EditableCodeBlock
key="lume-tab"
lang="python"
defaultValues={{
"container-name": "macos-sequoia-cua:latest"
}}
>
{`import asyncio
```python
import asyncio
import logging
import os
import signal
@@ -173,23 +173,23 @@ from computer import Computer, VMProviderType
from dotenv import load_dotenv
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(**name**)
logger = logging.getLogger(__name__)
def handle_sigint(sig, frame):
print("\\n\\nExecution interrupted by user. Exiting gracefully...")
exit(0)
print("\n\nExecution interrupted by user. Exiting gracefully...")
exit(0)
async def fill_application():
try:
async with Computer(
os_type="macos",
provider_type=VMProviderType.LUME,
name="`}<EditableValue placeholder="container-name" />{`",
verbosity=logging.INFO,
) as computer:
try:
async with Computer(
os_type="linux",
provider_type=VMProviderType.DOCKER,
image="trycua/cua-xfce:latest", # or "trycua/cua-ubuntu:latest"
verbosity=logging.INFO,
) as computer:
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="cua/anthropic/claude-sonnet-4.5",
tools=[computer],
only_n_most_recent_images=3,
verbosity=logging.INFO,
@@ -206,7 +206,7 @@ verbosity=logging.INFO,
history = []
for i, task in enumerate(tasks, 1):
print(f"\\n[Task {i}/{len(tasks)}] {task}")
print(f"\n[Task {i}/{len(tasks)}] {task}")
# Add user message to history
history.append({"role": "user", "content": task})
@@ -229,7 +229,7 @@ verbosity=logging.INFO,
print(f"✅ Task {i}/{len(tasks)} completed")
print("\\n🎉 All tasks completed successfully!")
print("\n🎉 All tasks completed successfully!")
except Exception as e:
logger.error(f"Error in fill_application: {e}")
@@ -237,12 +237,12 @@ verbosity=logging.INFO,
raise
def main():
try:
load_dotenv()
try:
load_dotenv()
if "ANTHROPIC_API_KEY" not in os.environ:
raise RuntimeError(
"Please set the ANTHROPIC_API_KEY environment variable.\\n"
"Please set the ANTHROPIC_API_KEY environment variable.\n"
"You can add it to a .env file in the project root."
)
@@ -254,20 +254,15 @@ load_dotenv()
logger.error(f"Error running automation: {e}")
traceback.print_exc()
if **name** == "**main**":
main()`}
</EditableCodeBlock>
if __name__ == "__main__":
main()
```
</Tab>
<Tab value="🪟 Windows Sandbox">
<Tab value="macOS Sandbox">
<EditableCodeBlock
key="windows-tab"
lang="python"
defaultValues={{}}
>
{`import asyncio
```python
import asyncio
import logging
import os
import signal
@@ -278,22 +273,23 @@ from computer import Computer, VMProviderType
from dotenv import load_dotenv
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(**name**)
logger = logging.getLogger(__name__)
def handle_sigint(sig, frame):
print("\\n\\nExecution interrupted by user. Exiting gracefully...")
exit(0)
print("\n\nExecution interrupted by user. Exiting gracefully...")
exit(0)
async def fill_application():
try:
async with Computer(
os_type="windows",
provider_type=VMProviderType.WINDOWS_SANDBOX,
verbosity=logging.INFO,
) as computer:
try:
async with Computer(
os_type="macos",
provider_type=VMProviderType.LUME,
name="macos-sequoia-cua:latest",
verbosity=logging.INFO,
) as computer:
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="cua/anthropic/claude-sonnet-4.5",
tools=[computer],
only_n_most_recent_images=3,
verbosity=logging.INFO,
@@ -310,7 +306,7 @@ verbosity=logging.INFO,
history = []
for i, task in enumerate(tasks, 1):
print(f"\\n[Task {i}/{len(tasks)}] {task}")
print(f"\n[Task {i}/{len(tasks)}] {task}")
# Add user message to history
history.append({"role": "user", "content": task})
@@ -333,7 +329,7 @@ verbosity=logging.INFO,
print(f"✅ Task {i}/{len(tasks)} completed")
print("\\n🎉 All tasks completed successfully!")
print("\n🎉 All tasks completed successfully!")
except Exception as e:
logger.error(f"Error in fill_application: {e}")
@@ -341,12 +337,12 @@ verbosity=logging.INFO,
raise
def main():
try:
load_dotenv()
try:
load_dotenv()
if "ANTHROPIC_API_KEY" not in os.environ:
raise RuntimeError(
"Please set the ANTHROPIC_API_KEY environment variable.\\n"
"Please set the ANTHROPIC_API_KEY environment variable.\n"
"You can add it to a .env file in the project root."
)
@@ -358,22 +354,15 @@ load_dotenv()
logger.error(f"Error running automation: {e}")
traceback.print_exc()
if **name** == "**main**":
main()`}
</EditableCodeBlock>
if __name__ == "__main__":
main()
```
</Tab>
<Tab value="🐳 Docker">
<Tab value="Windows Sandbox">
<EditableCodeBlock
key="docker-tab"
lang="python"
defaultValues={{
"container-name": "trycua/cua-ubuntu:latest"
}}
>
{`import asyncio
```python
import asyncio
import logging
import os
import signal
@@ -384,23 +373,22 @@ from computer import Computer, VMProviderType
from dotenv import load_dotenv
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(**name**)
logger = logging.getLogger(__name__)
def handle_sigint(sig, frame):
print("\\n\\nExecution interrupted by user. Exiting gracefully...")
exit(0)
print("\n\nExecution interrupted by user. Exiting gracefully...")
exit(0)
async def fill_application():
try:
async with Computer(
os_type="linux",
provider_type=VMProviderType.DOCKER,
name="`}<EditableValue placeholder="container-name" />{`",
verbosity=logging.INFO,
) as computer:
try:
async with Computer(
os_type="windows",
provider_type=VMProviderType.WINDOWS_SANDBOX,
verbosity=logging.INFO,
) as computer:
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="cua/anthropic/claude-sonnet-4.5",
tools=[computer],
only_n_most_recent_images=3,
verbosity=logging.INFO,
@@ -417,7 +405,7 @@ verbosity=logging.INFO,
history = []
for i, task in enumerate(tasks, 1):
print(f"\\n[Task {i}/{len(tasks)}] {task}")
print(f"\n[Task {i}/{len(tasks)}] {task}")
# Add user message to history
history.append({"role": "user", "content": task})
@@ -440,7 +428,7 @@ verbosity=logging.INFO,
print(f"✅ Task {i}/{len(tasks)} completed")
print("\\n🎉 All tasks completed successfully!")
print("\n🎉 All tasks completed successfully!")
except Exception as e:
logger.error(f"Error in fill_application: {e}")
@@ -448,12 +436,12 @@ verbosity=logging.INFO,
raise
def main():
try:
load_dotenv()
try:
load_dotenv()
if "ANTHROPIC_API_KEY" not in os.environ:
raise RuntimeError(
"Please set the ANTHROPIC_API_KEY environment variable.\\n"
"Please set the ANTHROPIC_API_KEY environment variable.\n"
"You can add it to a .env file in the project root."
)
@@ -465,16 +453,42 @@ load_dotenv()
logger.error(f"Error running automation: {e}")
traceback.print_exc()
if **name** == "**main**":
main()`}
</EditableCodeBlock>
if __name__ == "__main__":
main()
```
</Tab>
</Tabs>
</Step>
<Step>
### Run Your Script
Execute your form filling automation:
```bash
python form_filling.py
```
The agent will:
1. Download the PDF resume from Overleaf
2. Extract information from the PDF
3. Fill out the JotForm with the extracted information
Monitor the output to see the agent's progress through each task.
</Step>
</Steps>
---
## Next Steps
- Learn more about [Cua computers](/computer-sdk/computers) and [computer commands](/computer-sdk/commands)
- Read about [Agent loops](/agent-sdk/agent-loops), [tools](/agent-sdk/custom-tools), and [supported model providers](/agent-sdk/supported-model-providers/)
- Experiment with different [Models and Providers](/agent-sdk/supported-model-providers/)
- Join our [Discord community](https://discord.com/invite/mVnXXpdE85) for help

View File

@@ -0,0 +1,640 @@
---
title: GUI Grounding with Gemini 3
description: Using Google's Gemini 3 with OmniParser for Advanced GUI Grounding Tasks
---
import { Step, Steps } from 'fumadocs-ui/components/steps';
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
import { Callout } from 'fumadocs-ui/components/callout';
## Overview
This example demonstrates how to use Google's Gemini 3 models with OmniParser for complex GUI grounding tasks. Gemini 3 Pro achieves exceptional performance on the [ScreenSpot-Pro benchmark](https://github.com/likaixin2000/ScreenSpot-Pro-GUI-Grounding) with **72.7% accuracy** (compared to Claude Sonnet 4.5's 36.2%), making it ideal for precise UI element location and complex navigation tasks.
<img
src="/docs/img/grounding-with-gemini3.gif"
alt="Demo of Gemini 3 with OmniParser performing complex GUI navigation tasks"
width="800px"
/>
<Callout type="info" title="Why Gemini 3 for UI Navigation?">
According to [Google's Gemini 3 announcement](https://blog.google/products/gemini/gemini-3/), Gemini 3 Pro achieves:
- **72.7%** on ScreenSpot-Pro (vs. Gemini 2.5 Pro's 11.4%)
- Industry-leading performance on complex UI navigation tasks
- Advanced multimodal understanding for high-resolution screens
</Callout>
### What You'll Build
This guide shows how to:
- Set up Vertex AI with proper authentication
- Use OmniParser with Gemini 3 for GUI element detection
- Leverage Gemini 3-specific features like `thinking_level` and `media_resolution`
- Create agents that can perform complex multi-step UI interactions
---
<Steps>
<Step>
### Set Up Google Cloud and Vertex AI
Before using Gemini 3 models, you need to enable Vertex AI in Google Cloud Console.
#### 1. Create a Google Cloud Project
1. Go to [Google Cloud Console](https://console.cloud.google.com/)
2. Click **Select a project** → **New Project**
3. Enter a project name and click **Create**
4. Note your **Project ID** (you'll need this later)
#### 2. Enable Vertex AI API
1. Navigate to [Vertex AI API](https://console.cloud.google.com/apis/library/aiplatform.googleapis.com)
2. Select your project
3. Click **Enable**
#### 3. Enable Billing
1. Go to [Billing](https://console.cloud.google.com/billing)
2. Link a billing account to your project
3. Vertex AI offers a [free tier](https://cloud.google.com/vertex-ai/pricing) for testing
#### 4. Create a Service Account
1. Go to [IAM & Admin > Service Accounts](https://console.cloud.google.com/iam-admin/serviceaccounts)
2. Click **Create Service Account**
3. Enter a name (e.g., "cua-gemini-agent")
4. Click **Create and Continue**
5. Grant the **Vertex AI User** role
6. Click **Done**
#### 5. Create and Download Service Account Key
1. Click on your newly created service account
2. Go to **Keys** tab
3. Click **Add Key** → **Create new key**
4. Select **JSON** format
5. Click **Create** (the key file will download automatically)
6. **Important**: Store this key file securely! It contains credentials for accessing your Google Cloud resources
<Callout type="warn">
Never commit your service account JSON key to version control! Add it to `.gitignore` immediately.
</Callout>
</Step>
<Step>
### Install Dependencies
Install the required packages for OmniParser and Gemini 3:
Create a `requirements.txt` file:
```text
cua-agent
cua-computer
cua-som # OmniParser for GUI element detection
litellm>=1.0.0
python-dotenv>=1.0.0
google-cloud-aiplatform>=1.70.0
```
Install the dependencies:
```bash
pip install -r requirements.txt
```
</Step>
<Step>
### Configure Environment Variables
Create a `.env` file in your project root:
```text
# Google Cloud / Vertex AI credentials
GOOGLE_CLOUD_PROJECT=your-project-id
GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-service-account-key.json
# Cua credentials (for cloud sandboxes)
CUA_API_KEY=sk_cua-api01...
CUA_SANDBOX_NAME=your-sandbox-name
```
Replace the values:
- `your-project-id`: Your Google Cloud Project ID from Step 1
- `/path/to/your-service-account-key.json`: Path to the JSON key file you downloaded
- `sk_cua-api01...`: Your Cua API key from the [Cua dashboard](https://cua.dev)
- `your-sandbox-name`: Your sandbox name (if using cloud sandboxes)
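Before wiring this into the agent, you can optionally sanity-check that the credentials resolve. A minimal sketch, assuming `google-auth` is available (it is pulled in by `google-cloud-aiplatform`); the filename `verify_credentials.py` is just a suggestion:
```python
# verify_credentials.py - optional check that Vertex AI credentials resolve
import os

import google.auth
from dotenv import load_dotenv

load_dotenv()

# google.auth.default() reads GOOGLE_APPLICATION_CREDENTIALS and returns the
# credentials together with the project they belong to.
credentials, detected_project = google.auth.default()

configured_project = os.environ.get("GOOGLE_CLOUD_PROJECT")
print(f"Configured project: {configured_project}")
print(f"Detected project:   {detected_project}")

if configured_project and detected_project and configured_project != detected_project:
    print("Warning: GOOGLE_CLOUD_PROJECT does not match the service account's project.")
```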
</Step>
<Step>
### Create Your Complex UI Navigation Script
Create a Python file (e.g., `gemini_ui_navigation.py`):
<Tabs items={['Cloud Sandbox', 'Linux on Docker', 'macOS Sandbox']}>
<Tab value="Cloud Sandbox">
```python
import asyncio
import logging
import os
import signal
import traceback
from agent import ComputerAgent
from computer import Computer, VMProviderType
from dotenv import load_dotenv
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def handle_sigint(sig, frame):
print("\n\nExecution interrupted by user. Exiting gracefully...")
exit(0)
async def complex_ui_navigation():
"""
Demonstrate Gemini 3's exceptional UI grounding capabilities
with complex, multi-step navigation tasks.
"""
try:
async with Computer(
os_type="linux",
provider_type=VMProviderType.CLOUD,
name=os.environ["CUA_SANDBOX_NAME"],
api_key=os.environ["CUA_API_KEY"],
verbosity=logging.INFO,
) as computer:
agent = ComputerAgent(
# Use OmniParser with Gemini 3 Pro for optimal GUI grounding
model="omniparser+vertex_ai/gemini-3-pro-preview",
tools=[computer],
only_n_most_recent_images=3,
verbosity=logging.INFO,
trajectory_dir="trajectories",
use_prompt_caching=False,
max_trajectory_budget=5.0,
# Gemini 3-specific parameters
thinking_level="high", # Enables deeper reasoning (vs "low")
media_resolution="high", # High-resolution image processing (vs "low" or "medium")
)
# Complex GUI grounding tasks inspired by ScreenSpot-Pro benchmark
# These test precise element location in professional UIs
tasks = [
# Task 1: GitHub repository navigation
{
"instruction": (
"Go to github.com/trycua/cua. "
"Find and click on the 'Issues' tab. "
"Then locate and click on the search box within the issues page "
"(not the global GitHub search). "
"Type 'omniparser' and press Enter."
),
"description": "Tests precise UI element distinction in a complex interface",
},
# Task 2: Search for and install Visual Studio Code
{
"instruction": (
"Open your system's app store (e.g., Microsoft Store). "
"Search for 'Visual Studio Code'. "
"In the search results, select 'Visual Studio Code'. "
"Click on 'Install' or 'Get' to begin the installation. "
"If prompted, accept any permissions or confirm the installation. "
"Wait for Visual Studio Code to finish installing."
),
"description": "Tests the ability to search for an application and complete its installation through a step-by-step app store workflow.",
},
]
history = []
for i, task_info in enumerate(tasks, 1):
task = task_info["instruction"]
print(f"\n{'='*60}")
print(f"[Task {i}/{len(tasks)}] {task_info['description']}")
print(f"{'='*60}")
print(f"\nInstruction: {task}\n")
# Add user message to history
history.append({"role": "user", "content": task})
# Run agent with conversation history
async for result in agent.run(history, stream=False):
history += result.get("output", [])
# Print output for debugging
for item in result.get("output", []):
if item.get("type") == "message":
content = item.get("content", [])
for content_part in content:
if content_part.get("text"):
logger.info(f"Agent: {content_part.get('text')}")
elif item.get("type") == "computer_call":
action = item.get("action", {})
action_type = action.get("type", "")
logger.debug(f"Computer Action: {action_type}")
print(f"\n✅ Task {i}/{len(tasks)} completed")
print("\n🎉 All complex UI navigation tasks completed successfully!")
except Exception as e:
logger.error(f"Error in complex_ui_navigation: {e}")
traceback.print_exc()
raise
def main():
try:
load_dotenv()
# Validate required environment variables
required_vars = [
"GOOGLE_CLOUD_PROJECT",
"GOOGLE_APPLICATION_CREDENTIALS",
"CUA_API_KEY",
"CUA_SANDBOX_NAME",
]
missing_vars = [var for var in required_vars if not os.environ.get(var)]
if missing_vars:
raise RuntimeError(
f"Missing required environment variables: {', '.join(missing_vars)}\n"
f"Please check your .env file and ensure all keys are set.\n"
f"See the setup guide for details on configuring Vertex AI credentials."
)
signal.signal(signal.SIGINT, handle_sigint)
asyncio.run(complex_ui_navigation())
except Exception as e:
logger.error(f"Error running automation: {e}")
traceback.print_exc()
if __name__ == "__main__":
main()
```
</Tab>
<Tab value="Linux on Docker">
```python
import asyncio
import logging
import os
import signal
import traceback
from agent import ComputerAgent
from computer import Computer, VMProviderType
from dotenv import load_dotenv
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def handle_sigint(sig, frame):
print("\n\nExecution interrupted by user. Exiting gracefully...")
exit(0)
async def complex_ui_navigation():
"""
Demonstrate Gemini 3's exceptional UI grounding capabilities
with complex, multi-step navigation tasks.
"""
try:
async with Computer(
os_type="linux",
provider_type=VMProviderType.DOCKER,
image="trycua/cua-xfce:latest",
verbosity=logging.INFO,
) as computer:
agent = ComputerAgent(
# Use OmniParser with Gemini 3 Pro for optimal GUI grounding
model="omniparser+vertex_ai/gemini-3-pro-preview",
tools=[computer],
only_n_most_recent_images=3,
verbosity=logging.INFO,
trajectory_dir="trajectories",
use_prompt_caching=False,
max_trajectory_budget=5.0,
# Gemini 3-specific parameters
thinking_level="high", # Enables deeper reasoning (vs "low")
media_resolution="high", # High-resolution image processing (vs "low" or "medium")
)
# Complex GUI grounding tasks inspired by ScreenSpot-Pro benchmark
tasks = [
{
"instruction": (
"Go to github.com/trycua/cua. "
"Find and click on the 'Issues' tab. "
"Then locate and click on the search box within the issues page "
"(not the global GitHub search). "
"Type 'omniparser' and press Enter."
),
"description": "Tests precise UI element distinction in a complex interface",
},
]
history = []
for i, task_info in enumerate(tasks, 1):
task = task_info["instruction"]
print(f"\n{'='*60}")
print(f"[Task {i}/{len(tasks)}] {task_info['description']}")
print(f"{'='*60}")
print(f"\nInstruction: {task}\n")
history.append({"role": "user", "content": task})
async for result in agent.run(history, stream=False):
history += result.get("output", [])
for item in result.get("output", []):
if item.get("type") == "message":
content = item.get("content", [])
for content_part in content:
if content_part.get("text"):
logger.info(f"Agent: {content_part.get('text')}")
elif item.get("type") == "computer_call":
action = item.get("action", {})
action_type = action.get("type", "")
logger.debug(f"Computer Action: {action_type}")
print(f"\n✅ Task {i}/{len(tasks)} completed")
print("\n🎉 All complex UI navigation tasks completed successfully!")
except Exception as e:
logger.error(f"Error in complex_ui_navigation: {e}")
traceback.print_exc()
raise
def main():
try:
load_dotenv()
required_vars = [
"GOOGLE_CLOUD_PROJECT",
"GOOGLE_APPLICATION_CREDENTIALS",
]
missing_vars = [var for var in required_vars if not os.environ.get(var)]
if missing_vars:
raise RuntimeError(
f"Missing required environment variables: {', '.join(missing_vars)}\n"
f"Please check your .env file."
)
signal.signal(signal.SIGINT, handle_sigint)
asyncio.run(complex_ui_navigation())
except Exception as e:
logger.error(f"Error running automation: {e}")
traceback.print_exc()
if __name__ == "__main__":
main()
```
</Tab>
<Tab value="macOS Sandbox">
```python
import asyncio
import logging
import os
import signal
import traceback
from agent import ComputerAgent
from computer import Computer, VMProviderType
from dotenv import load_dotenv
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def handle_sigint(sig, frame):
print("\n\nExecution interrupted by user. Exiting gracefully...")
exit(0)
async def complex_ui_navigation():
"""
Demonstrate Gemini 3's exceptional UI grounding capabilities
with complex, multi-step navigation tasks.
"""
try:
async with Computer(
os_type="macos",
provider_type=VMProviderType.LUME,
name="macos-sequoia-cua:latest",
verbosity=logging.INFO,
) as computer:
agent = ComputerAgent(
# Use OmniParser with Gemini 3 Pro for optimal GUI grounding
model="omniparser+vertex_ai/gemini-3-pro-preview",
tools=[computer],
only_n_most_recent_images=3,
verbosity=logging.INFO,
trajectory_dir="trajectories",
use_prompt_caching=False,
max_trajectory_budget=5.0,
# Gemini 3-specific parameters
thinking_level="high", # Enables deeper reasoning (vs "low")
media_resolution="high", # High-resolution image processing (vs "low" or "medium")
)
# Complex GUI grounding tasks inspired by ScreenSpot-Pro benchmark
tasks = [
{
"instruction": (
"Go to github.com/trycua/cua. "
"Find and click on the 'Issues' tab. "
"Then locate and click on the search box within the issues page "
"(not the global GitHub search). "
"Type 'omniparser' and press Enter."
),
"description": "Tests precise UI element distinction in a complex interface",
},
]
history = []
for i, task_info in enumerate(tasks, 1):
task = task_info["instruction"]
print(f"\n{'='*60}")
print(f"[Task {i}/{len(tasks)}] {task_info['description']}")
print(f"{'='*60}")
print(f"\nInstruction: {task}\n")
history.append({"role": "user", "content": task})
async for result in agent.run(history, stream=False):
history += result.get("output", [])
for item in result.get("output", []):
if item.get("type") == "message":
content = item.get("content", [])
for content_part in content:
if content_part.get("text"):
logger.info(f"Agent: {content_part.get('text')}")
elif item.get("type") == "computer_call":
action = item.get("action", {})
action_type = action.get("type", "")
logger.debug(f"Computer Action: {action_type}")
print(f"\n✅ Task {i}/{len(tasks)} completed")
print("\n🎉 All complex UI navigation tasks completed successfully!")
except Exception as e:
logger.error(f"Error in complex_ui_navigation: {e}")
traceback.print_exc()
raise
def main():
try:
load_dotenv()
required_vars = [
"GOOGLE_CLOUD_PROJECT",
"GOOGLE_APPLICATION_CREDENTIALS",
]
missing_vars = [var for var in required_vars if not os.environ.get(var)]
if missing_vars:
raise RuntimeError(
f"Missing required environment variables: {', '.join(missing_vars)}\n"
f"Please check your .env file."
)
signal.signal(signal.SIGINT, handle_sigint)
asyncio.run(complex_ui_navigation())
except Exception as e:
logger.error(f"Error running automation: {e}")
traceback.print_exc()
if __name__ == "__main__":
main()
```
</Tab>
</Tabs>
</Step>
<Step>
### Run Your Script
Execute your complex UI navigation automation:
```bash
python gemini_ui_navigation.py
```
The agent will:
1. Navigate to GitHub and locate specific UI elements
2. Distinguish between similar elements (e.g., global search vs. issues search)
3. Perform multi-step interactions with visual feedback
4. Use Gemini 3's advanced reasoning for precise element grounding
Monitor the output to see the agent's progress through each task.
</Step>
</Steps>
---
## Understanding Gemini 3-Specific Parameters
### `thinking_level`
Controls the amount of internal reasoning the model performs:
- `"high"`: Deeper reasoning, better for complex UI navigation (recommended for ScreenSpot-like tasks)
- `"low"`: Faster responses, suitable for simpler tasks
### `media_resolution`
Controls vision processing for multimodal inputs:
- `"high"`: Best for complex UIs with many small elements (recommended)
- `"medium"`: Balanced quality and speed
- `"low"`: Faster processing for simple interfaces
<Callout type="info">
For tasks requiring precise GUI element location (like ScreenSpot-Pro), use
`thinking_level="high"` and `media_resolution="high"` for optimal performance.
</Callout>
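These parameters are passed straight through as `ComputerAgent` keyword arguments, exactly as in the full examples above. A trimmed-down sketch (assumes `computer` is an already-connected `Computer` instance):
```python
from agent import ComputerAgent

agent = ComputerAgent(
    model="omniparser+vertex_ai/gemini-3-pro-preview",
    tools=[computer],           # an already-connected Computer instance
    thinking_level="high",      # deeper reasoning for complex navigation
    media_resolution="high",    # high-resolution screenshots for small UI elements
)
```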
---
## Benchmark Performance
Gemini 3 Pro's performance on ScreenSpot-Pro demonstrates its exceptional UI grounding capabilities:
| Model | ScreenSpot-Pro Score |
| ----------------- | -------------------- |
| **Gemini 3 Pro** | **72.7%** |
| Claude Sonnet 4.5 | 36.2% |
| Gemini 2.5 Pro | 11.4% |
| GPT-5.1 | 3.5% |
This makes Gemini 3 the ideal choice for complex UI navigation, element detection, and professional GUI automation tasks.
---
## Troubleshooting
### Authentication Issues
If you encounter authentication errors:
1. Verify your service account JSON key path is correct
2. Ensure the service account has the **Vertex AI User** role
3. Check that the Vertex AI API is enabled in your project
4. Confirm your `GOOGLE_CLOUD_PROJECT` matches your actual project ID
### "Vertex AI API not enabled" Error
Run this command to enable the API:
```bash
gcloud services enable aiplatform.googleapis.com --project=YOUR_PROJECT_ID
```
### Billing Issues
Ensure billing is enabled for your Google Cloud project. Visit the [Billing section](https://console.cloud.google.com/billing) to verify.
---
## Next Steps
- Learn more about [OmniParser agent loops](/agent-sdk/agent-loops)
- Explore [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing)
- Read about [ScreenSpot-Pro benchmark](https://github.com/likaixin2000/ScreenSpot-Pro-GUI-Grounding)
- Check out [Google's Gemini 3 announcement](https://blog.google/products/gemini/gemini-3/)
- Join our [Discord community](https://discord.com/invite/mVnXXpdE85) for help

View File

@@ -1,5 +1,10 @@
{
"title": "Example Use Cases",
"title": "Cookbook",
"description": "Real-world examples of building with Cua",
"pages": ["form-filling"]
"pages": [
"windows-app-behind-vpn",
"form-filling",
"post-event-contact-export",
"gemini-complex-ui-navigation"
]
}

View File

@@ -0,0 +1,474 @@
---
title: Post-Event Contact Export
description: Run overnight contact extraction from LinkedIn, X, or other social platforms after networking events
---
import { Step, Steps } from 'fumadocs-ui/components/steps';
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
## Overview
After networking events, you need to export new connections from LinkedIn, X, or other platforms into your CRM. This automation handles it for you.
**The workflow**: Kick off the script after an event and let it run overnight. Wake up to a clean CSV ready for your CRM or email tool.
This example focuses on LinkedIn but works across platforms. It uses [Cua Computer](/computer-sdk/computers) to interact with web interfaces and [Agent Loops](/agent-sdk/agent-loops) to iterate through connections with conversation history.
### Why Cua is Perfect for This
**Cua's VMs save your session data**, bypassing bot detection entirely:
- **Log in once manually** through the VM browser
- **Session persists** - you appear as a regular user, not a bot
- **No captchas** - the platform treats automation like normal browsing
- **No login code** - script doesn't handle authentication
- **Run overnight** - kick off and forget
Traditional web scraping triggers anti-bot measures immediately. Cua's approach works across all platforms.
### What You Get
The script generates two files with your extracted connections:
**CSV Export** (`linkedin_connections_20250116_143022.csv`):
```csv
first,last,role,company,met_at,linkedin
John,Smith,Software Engineer,Acme Corp,Google Devfest Toronto,https://www.linkedin.com/in/johnsmith
Sarah,Johnson,Product Manager,Tech Inc,Google Devfest Toronto,https://www.linkedin.com/in/sarahjohnson
```
**Messaging Links** (`linkedin_messaging_links_20250116_143022.txt`):
```
LinkedIn Messaging Compose Links
================================================================================
1. https://www.linkedin.com/messaging/compose/?recipient=johnsmith
2. https://www.linkedin.com/messaging/compose/?recipient=sarahjohnson
```
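Because the CSV is a plain, well-formed file, feeding it into a CRM import or a mail-merge script is straightforward. A minimal sketch (the filename is the example timestamp from above; yours will differ per run):
```python
import csv

# Read the generated connections CSV and print one line per contact.
with open("linkedin_connections_20250116_143022.csv", newline="", encoding="utf-8") as f:
    for row in csv.DictReader(f):
        print(f"{row['first']} {row['last']}, {row['role']} at {row['company']} (met at {row['met_at']})")
```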
---
<Steps>
<Step>
### Set Up Your Environment
First, install the required dependencies:
Create a `requirements.txt` file:
```text
cua-agent
cua-computer
python-dotenv>=1.0.0
```
Install the dependencies:
```bash
pip install -r requirements.txt
```
Create a `.env` file with your API keys:
```text
ANTHROPIC_API_KEY=your-anthropic-api-key
CUA_API_KEY=sk_cua-api01...
CUA_CONTAINER_NAME=m-linux-...
```
</Step>
<Step>
### Log Into LinkedIn Manually
**Important**: Before running the script, manually log into LinkedIn through your VM:
1. Access your VM through the Cua dashboard
2. Open a browser and navigate to LinkedIn
3. Log in with your credentials (handle any captchas manually)
4. Close the browser but leave the VM running
5. Your session is now saved and ready for automation!
This one-time manual login bypasses all bot detection.
</Step>
<Step>
### Configure and Create Your Script
Create a Python file (e.g., `contact_export.py`). You can customize:
```python
# Where you met these connections (automatically added to CSV)
MET_AT_REASON = "Google Devfest Toronto"
# Number of contacts to extract (in the main loop)
for contact_num in range(1, 21): # Change 21 to extract more/fewer contacts
```
Select your environment:
<Tabs items={['Cloud Sandbox', 'Linux on Docker', 'macOS Sandbox', 'Windows Sandbox']}>
<Tab value="Cloud Sandbox">
```python
import asyncio
import csv
import logging
import os
import signal
import traceback
from datetime import datetime
from agent import ComputerAgent
from computer import Computer, VMProviderType
from dotenv import load_dotenv
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Configuration: Define where you met these connections
MET_AT_REASON = "Google Devfest Toronto"
def handle_sigint(sig, frame):
print("\n\nExecution interrupted by user. Exiting gracefully...")
exit(0)
def extract_public_id_from_linkedin_url(linkedin_url):
"""Extract public ID from LinkedIn profile URL."""
if not linkedin_url:
return None
url = linkedin_url.split('?')[0].rstrip('/')
if '/in/' in url:
public_id = url.split('/in/')[-1]
return public_id
return None
def extract_contact_from_response(result_output):
"""
Extract contact information from agent's response.
Expects format:
FIRST: value
LAST: value
ROLE: value
COMPANY: value
LINKEDIN: value
"""
contact = {
'first': '',
'last': '',
'role': '',
'company': '',
'met_at': MET_AT_REASON,
'linkedin': ''
}
for item in result_output:
if item.get("type") == "message":
content = item.get("content", [])
for content_part in content:
text = content_part.get("text", "")
if text:
for line in text.split('\n'):
line = line.strip()
line_upper = line.upper()
if line_upper.startswith("FIRST:"):
value = line[6:].strip()
if value and value.upper() != "N/A":
contact['first'] = value
elif line_upper.startswith("LAST:"):
value = line[5:].strip()
if value and value.upper() != "N/A":
contact['last'] = value
elif line_upper.startswith("ROLE:"):
value = line[5:].strip()
if value and value.upper() != "N/A":
contact['role'] = value
elif line_upper.startswith("COMPANY:"):
value = line[8:].strip()
if value and value.upper() != "N/A":
contact['company'] = value
elif line_upper.startswith("LINKEDIN:"):
value = line[9:].strip()
if value and value.upper() != "N/A":
contact['linkedin'] = value
return contact
async def scrape_linkedin_connections():
"""Scrape LinkedIn connections and export to CSV."""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_filename = f"linkedin_connections_{timestamp}.csv"
csv_path = os.path.join(os.getcwd(), csv_filename)
# Initialize CSV file
with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=['first', 'last', 'role', 'company', 'met_at', 'linkedin'])
writer.writeheader()
print(f"\n🚀 Starting LinkedIn connections scraper")
print(f"📁 Output file: {csv_path}")
print(f"📍 Met at: {MET_AT_REASON}")
print("=" * 80)
try:
async with Computer(
os_type="linux",
provider_type=VMProviderType.CLOUD,
name=os.environ["CUA_CONTAINER_NAME"], # Your sandbox name
api_key=os.environ["CUA_API_KEY"],
verbosity=logging.INFO,
) as computer:
agent = ComputerAgent(
model="cua/anthropic/claude-sonnet-4.5",
tools=[computer],
only_n_most_recent_images=3,
verbosity=logging.INFO,
trajectory_dir="trajectories",
use_prompt_caching=True,
max_trajectory_budget=10.0,
)
history = []
# Task 1: Navigate to LinkedIn connections page
navigation_task = (
"STEP 1 - NAVIGATE TO LINKEDIN CONNECTIONS PAGE:\n"
"1. Open a web browser (Chrome or Firefox)\n"
"2. Navigate to https://www.linkedin.com/mynetwork/invite-connect/connections/\n"
"3. Wait for the page to fully load\n"
"4. Confirm you can see the list of connections\n"
"5. Ready to start extracting contacts"
)
print(f"\n[Task 1/21] Navigating to LinkedIn...")
history.append({"role": "user", "content": navigation_task})
async for result in agent.run(history, stream=False):
history += result.get("output", [])
print(f"✅ Navigation completed\n")
# Extract 20 contacts
contacts_extracted = 0
linkedin_urls = []
previous_contact_name = None
for contact_num in range(1, 21):
# Build extraction task
if contact_num == 1:
extraction_task = (
f"STEP {contact_num + 1} - EXTRACT CONTACT {contact_num} OF 20:\n"
f"1. Click on the first connection's profile\n"
f"2. Extract: FIRST, LAST, ROLE, COMPANY, LINKEDIN URL\n"
f"3. Return in exact format:\n"
f"FIRST: [value]\n"
f"LAST: [value]\n"
f"ROLE: [value]\n"
f"COMPANY: [value]\n"
f"LINKEDIN: [value]\n"
f"4. Navigate back to connections list"
)
else:
extraction_task = (
f"STEP {contact_num + 1} - EXTRACT CONTACT {contact_num} OF 20:\n"
f"1. Find '{previous_contact_name}' in the list\n"
f"2. Click on the contact BELOW them\n"
f"3. Extract: FIRST, LAST, ROLE, COMPANY, LINKEDIN URL\n"
f"4. Return in exact format:\n"
f"FIRST: [value]\n"
f"LAST: [value]\n"
f"ROLE: [value]\n"
f"COMPANY: [value]\n"
f"LINKEDIN: [value]\n"
f"5. Navigate back"
)
print(f"[Task {contact_num + 1}/21] Extracting contact {contact_num}/20...")
history.append({"role": "user", "content": extraction_task})
all_output = []
async for result in agent.run(history, stream=False):
output = result.get("output", [])
history += output
all_output.extend(output)
contact_data = extract_contact_from_response(all_output)
has_name = bool(contact_data['first'] and contact_data['last'])
has_linkedin = bool(contact_data['linkedin'] and 'linkedin.com' in contact_data['linkedin'])
if has_name or has_linkedin:
with open(csv_path, 'a', newline='', encoding='utf-8') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=['first', 'last', 'role', 'company', 'met_at', 'linkedin'])
writer.writerow(contact_data)
contacts_extracted += 1
if contact_data['linkedin']:
linkedin_urls.append(contact_data['linkedin'])
if has_name:
previous_contact_name = f"{contact_data['first']} {contact_data['last']}".strip()
name_str = f"{contact_data['first']} {contact_data['last']}" if has_name else "[No name]"
print(f"✅ Contact {contact_num}/20 saved: {name_str}")
else:
print(f"⚠️ Could not extract valid data for contact {contact_num}")
if contact_num % 5 == 0:
print(f"\n📈 Progress: {contacts_extracted}/{contact_num} contacts extracted\n")
# Create messaging links file
messaging_filename = f"linkedin_messaging_links_{timestamp}.txt"
messaging_path = os.path.join(os.getcwd(), messaging_filename)
with open(messaging_path, 'w', encoding='utf-8') as txtfile:
txtfile.write("LinkedIn Messaging Compose Links\n")
txtfile.write("=" * 80 + "\n\n")
for i, linkedin_url in enumerate(linkedin_urls, 1):
public_id = extract_public_id_from_linkedin_url(linkedin_url)
if public_id:
messaging_url = f"https://www.linkedin.com/messaging/compose/?recipient={public_id}"
txtfile.write(f"{i}. {messaging_url}\n")
print("\n" + "="*80)
print("🎉 All tasks completed!")
print(f"📁 CSV file saved to: {csv_path}")
print(f"📊 Total contacts extracted: {contacts_extracted}/20")
print(f"💬 Messaging links saved to: {messaging_path}")
print("="*80)
except Exception as e:
print(f"\n❌ Error: {e}")
traceback.print_exc()
raise
def main():
try:
load_dotenv()
if "ANTHROPIC_API_KEY" not in os.environ:
raise RuntimeError("Please set ANTHROPIC_API_KEY in .env")
if "CUA_API_KEY" not in os.environ:
raise RuntimeError("Please set CUA_API_KEY in .env")
if "CUA_CONTAINER_NAME" not in os.environ:
raise RuntimeError("Please set CUA_CONTAINER_NAME in .env")
signal.signal(signal.SIGINT, handle_sigint)
asyncio.run(scrape_linkedin_connections())
except Exception as e:
print(f"\n❌ Error: {e}")
traceback.print_exc()
if __name__ == "__main__":
main()
```
</Tab>
<Tab value="Linux on Docker">
```python
# Same code as Cloud Sandbox, but change Computer initialization to:
async with Computer(
os_type="linux",
provider_type=VMProviderType.DOCKER,
image="trycua/cua-xfce:latest",
verbosity=logging.INFO,
) as computer:
```
And remove the `CUA_API_KEY` and `CUA_CONTAINER_NAME` requirements from `.env` and the validation checks.
</Tab>
<Tab value="macOS Sandbox">
```python
# Same code as Cloud Sandbox, but change Computer initialization to:
async with Computer(
os_type="macos",
provider_type=VMProviderType.LUME,
name="macos-sequoia-cua:latest",
verbosity=logging.INFO,
) as computer:
```
And remove the `CUA_API_KEY` and `CUA_CONTAINER_NAME` requirements from `.env` and the validation checks.
</Tab>
<Tab value="Windows Sandbox">
```python
# Same code as Cloud Sandbox, but change Computer initialization to:
async with Computer(
os_type="windows",
provider_type=VMProviderType.WINDOWS_SANDBOX,
verbosity=logging.INFO,
) as computer:
```
And remove the `CUA_API_KEY` and `CUA_CONTAINER_NAME` requirements from `.env` and the validation checks.
</Tab>
</Tabs>
</Step>
<Step>
### Run Your Script
Execute your contact extraction automation:
```bash
python contact_export.py
```
The agent will:
1. Navigate to your LinkedIn connections page
2. Extract data from 20 contacts (first name, last name, role, company, LinkedIn URL)
3. Save contacts to a timestamped CSV file
4. Generate messaging compose links for easy follow-up
Monitor the output to see the agent's progress. The script will show a progress update every 5 contacts.
</Step>
</Steps>
---
## How It Works
This script demonstrates a practical workflow for extracting LinkedIn connection data:
1. **Session Persistence** - Manually log into LinkedIn through the VM once, and the VM saves your session
2. **Navigation** - The script navigates to your connections page using your saved authenticated session
3. **Data Extraction** - For each contact, the agent clicks their profile, extracts data, and navigates back
4. **Python Processing** - Python parses responses, validates data, and writes to CSV incrementally
5. **Output Files** - Generates a CSV with contact data and a text file with messaging URLs
## Next Steps
- Learn more about [Cua computers](/computer-sdk/computers) and [computer commands](/computer-sdk/commands)
- Read about [Agent loops](/agent-sdk/agent-loops), [tools](/agent-sdk/custom-tools), and [supported model providers](/agent-sdk/supported-model-providers/)
- Experiment with different [Models and Providers](/agent-sdk/supported-model-providers/)
- Adapt this script for other platforms (Twitter/X, email extraction, etc.)
- Join our [Discord community](https://discord.com/invite/mVnXXpdE85) for help

View File

@@ -0,0 +1,629 @@
---
title: Windows App behind VPN
description: Automate legacy Windows desktop applications behind VPN with Cua
---
import { Step, Steps } from 'fumadocs-ui/components/steps';
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
import { Callout } from 'fumadocs-ui/components/callout';
## Overview
This guide demonstrates how to automate Windows desktop applications (like eGecko HR/payroll systems) that run behind a corporate VPN. This is a common enterprise scenario where legacy desktop applications require manual data entry, report generation, or workflow execution.
**Use cases:**
- HR/payroll processing (employee onboarding, payroll runs, benefits administration)
- Desktop ERP systems behind corporate networks
- Legacy financial applications requiring VPN access
- Compliance reporting from on-premise systems
**Architecture:**
- Client-side Cua agent (Python SDK or Playground UI)
- Windows VM/Sandbox with VPN client configured
- RDP/remote desktop connection to target environment
- Desktop application automation via computer vision and UI control
<Callout type="info">
**Production Deployment**: For production use, consider workflow mining and custom finetuning to
create vertical-specific actions (e.g., "Run payroll", "Onboard employee") instead of generic UI
automation. This provides better audit trails and higher success rates.
</Callout>
---
## Video Demo
<div className="rounded-lg border bg-card text-card-foreground shadow-sm p-4 mb-6">
<video
src="https://github.com/user-attachments/assets/8ab07646-6018-4128-87ce-53180cfea696"
controls
className="w-full rounded"
>
Your browser does not support the video tag.
</video>
<div className="text-sm text-muted-foreground mt-2">
Demo showing Cua automating an eGecko-like desktop application on Windows behind AWS VPN
</div>
</div>
---
<Steps>
<Step>
### Set Up Your Environment
Install the required dependencies:
Create a `requirements.txt` file:
```text
cua-agent
cua-computer
python-dotenv>=1.0.0
```
Install the dependencies:
```bash
pip install -r requirements.txt
```
Create a `.env` file with your API keys:
```text
ANTHROPIC_API_KEY=your-anthropic-api-key
CUA_API_KEY=sk_cua-api01...
CUA_SANDBOX_NAME=your-windows-sandbox
```
</Step>
<Step>
### Configure Windows Sandbox with VPN
<Tabs items={['Cloud Sandbox (Recommended)', 'Windows Sandbox', 'Self-Hosted VM']}>
<Tab value="Cloud Sandbox (Recommended)">
For enterprise deployments, use Cua Cloud Sandbox with pre-configured VPN:
1. Go to [cua.ai/signin](https://cua.ai/signin)
2. Navigate to **Dashboard > Containers > Create Instance**
3. Create a **Windows** sandbox (Medium or Large for desktop apps)
4. Configure VPN settings:
- Upload your AWS VPN Client configuration (`.ovpn` file)
- Or configure VPN credentials directly in the dashboard
5. Note your sandbox name and API key
Your Windows sandbox will launch with VPN automatically connected.
</Tab>
<Tab value="Windows Sandbox">
For local development on Windows 10 Pro/Enterprise or Windows 11:
1. Enable [Windows Sandbox](https://learn.microsoft.com/en-us/windows/security/application-security/application-isolation/windows-sandbox/windows-sandbox-install)
2. Install the `pywinsandbox` dependency:
```bash
pip install -U git+https://github.com/karkason/pywinsandbox.git
```
3. Create a VPN setup script that runs on sandbox startup
4. Configure your desktop application installation within the sandbox
<Callout type="warn">
**Manual VPN Setup**: Windows Sandbox requires manual VPN configuration each time it starts. For
production use, consider Cloud Sandbox or self-hosted VMs with persistent VPN connections.
</Callout>
</Tab>
<Tab value="Self-Hosted VM">
For self-managed infrastructure:
1. Deploy Windows VM on your preferred cloud (AWS, Azure, GCP)
2. Install and configure VPN client (AWS VPN Client, OpenVPN, etc.)
3. Install target desktop application and any dependencies
4. Install `cua-computer-server`:
```bash
pip install cua-computer-server
python -m computer_server
```
5. Configure firewall rules to allow Cua agent connections
</Tab>
</Tabs>
</Step>
<Step>
### Create Your Automation Script
Create a Python file (e.g., `hr_automation.py`):
<Tabs items={['Cloud Sandbox', 'Windows Sandbox', 'Self-Hosted']}>
<Tab value="Cloud Sandbox">
```python
import asyncio
import logging
import os
from agent import ComputerAgent
from computer import Computer, VMProviderType
from dotenv import load_dotenv
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
load_dotenv()
async def automate_hr_workflow():
"""
Automate HR/payroll desktop application workflow.
This example demonstrates:
- Launching Windows desktop application
- Navigating complex desktop UI
- Data entry and form filling
- Report generation and export
"""
try:
# Connect to Windows Cloud Sandbox with VPN
async with Computer(
os_type="windows",
provider_type=VMProviderType.CLOUD,
name=os.environ["CUA_SANDBOX_NAME"],
api_key=os.environ["CUA_API_KEY"],
verbosity=logging.INFO,
) as computer:
# Configure agent with specialized instructions
agent = ComputerAgent(
model="cua/anthropic/claude-sonnet-4.5",
tools=[computer],
only_n_most_recent_images=3,
verbosity=logging.INFO,
trajectory_dir="trajectories",
use_prompt_caching=True,
max_trajectory_budget=10.0,
instructions="""
You are automating a Windows desktop HR/payroll application.
IMPORTANT GUIDELINES:
- Always wait for windows and dialogs to fully load before interacting
- Look for loading indicators and wait for them to disappear
- Verify each action by checking on-screen confirmation messages
- If a button or field is not visible, try scrolling or navigating tabs
- Desktop apps often have nested menus - explore systematically
- Save work frequently using File > Save or Ctrl+S
- Before closing, always verify changes were saved
COMMON UI PATTERNS:
- Menu bar navigation (File, Edit, View, etc.)
- Ribbon interfaces with tabs
- Modal dialogs that block interaction
- Data grids/tables for viewing records
- Form fields with validation
- Status bars showing operation progress
""".strip()
)
# Define workflow tasks
tasks = [
"Launch the HR application from the desktop or start menu",
"Log in with the credentials shown in credentials.txt on the desktop",
"Navigate to Employee Management section",
"Create a new employee record with information from new_hire.xlsx on desktop",
"Verify the employee was created successfully by searching for their name",
"Generate an onboarding report for the new employee",
"Export the report as PDF to the desktop",
"Log out of the application"
]
history = []
for task in tasks:
logger.info(f"\n{'='*60}")
logger.info(f"Task: {task}")
logger.info(f"{'='*60}\n")
history.append({"role": "user", "content": task})
async for result in agent.run(history):
for item in result.get("output", []):
if item.get("type") == "message":
content = item.get("content", [])
for block in content:
if block.get("type") == "text":
response = block.get("text", "")
logger.info(f"Agent: {response}")
history.append({"role": "assistant", "content": response})
logger.info("\nTask completed. Moving to next task...\n")
logger.info("\n" + "="*60)
logger.info("All tasks completed successfully!")
logger.info("="*60)
except Exception as e:
logger.error(f"Error during automation: {e}")
import traceback
traceback.print_exc()
if __name__ == "__main__":
asyncio.run(automate_hr_workflow())
```
</Tab>
<Tab value="Windows Sandbox">
```python
import asyncio
import logging
import os
from agent import ComputerAgent
from computer import Computer, VMProviderType
from dotenv import load_dotenv
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
load_dotenv()
async def automate_hr_workflow():
try:
# Connect to Windows Sandbox
async with Computer(
os_type="windows",
provider_type=VMProviderType.WINDOWS_SANDBOX,
verbosity=logging.INFO,
) as computer:
agent = ComputerAgent(
model="cua/anthropic/claude-sonnet-4.5",
tools=[computer],
only_n_most_recent_images=3,
verbosity=logging.INFO,
trajectory_dir="trajectories",
use_prompt_caching=True,
max_trajectory_budget=10.0,
instructions="""
You are automating a Windows desktop HR/payroll application.
IMPORTANT GUIDELINES:
- Always wait for windows and dialogs to fully load before interacting
- Verify each action by checking on-screen confirmation messages
- Desktop apps often have nested menus - explore systematically
- Save work frequently using File > Save or Ctrl+S
""".strip()
)
tasks = [
"Launch the HR application from the desktop",
"Log in with credentials from credentials.txt on desktop",
"Navigate to Employee Management and create new employee from new_hire.xlsx",
"Generate and export onboarding report as PDF",
"Log out of the application"
]
history = []
for task in tasks:
logger.info(f"\nTask: {task}")
history.append({"role": "user", "content": task})
async for result in agent.run(history):
for item in result.get("output", []):
if item.get("type") == "message":
content = item.get("content", [])
for block in content:
if block.get("type") == "text":
response = block.get("text", "")
logger.info(f"Agent: {response}")
history.append({"role": "assistant", "content": response})
logger.info("\nAll tasks completed!")
except Exception as e:
logger.error(f"Error: {e}")
import traceback
traceback.print_exc()
if __name__ == "__main__":
asyncio.run(automate_hr_workflow())
```
</Tab>
<Tab value="Self-Hosted">
```python
import asyncio
import logging
import os
from agent import ComputerAgent
from computer import Computer
from dotenv import load_dotenv
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
load_dotenv()
async def automate_hr_workflow():
try:
# Connect to self-hosted Windows VM running computer-server
async with Computer(
use_host_computer_server=True,
base_url="http://your-windows-vm-ip:5757", # Update with your VM IP
verbosity=logging.INFO,
) as computer:
agent = ComputerAgent(
model="cua/anthropic/claude-sonnet-4.5",
tools=[computer],
only_n_most_recent_images=3,
verbosity=logging.INFO,
trajectory_dir="trajectories",
use_prompt_caching=True,
max_trajectory_budget=10.0,
instructions="""
You are automating a Windows desktop HR/payroll application.
IMPORTANT GUIDELINES:
- Always wait for windows and dialogs to fully load before interacting
- Verify each action by checking on-screen confirmation messages
- Save work frequently using File > Save or Ctrl+S
""".strip()
)
tasks = [
"Launch the HR application",
"Log in with provided credentials",
"Complete the required HR workflow",
"Generate and export report",
"Log out"
]
history = []
for task in tasks:
logger.info(f"\nTask: {task}")
history.append({"role": "user", "content": task})
async for result in agent.run(history):
for item in result.get("output", []):
if item.get("type") == "message":
content = item.get("content", [])
for block in content:
if block.get("type") == "text":
response = block.get("text", "")
logger.info(f"Agent: {response}")
history.append({"role": "assistant", "content": response})
logger.info("\nAll tasks completed!")
except Exception as e:
logger.error(f"Error: {e}")
import traceback
traceback.print_exc()
if __name__ == "__main__":
asyncio.run(automate_hr_workflow())
```
</Tab>
</Tabs>
</Step>
<Step>
### Run Your Automation
Execute the script:
```bash
python hr_automation.py
```
The agent will:
1. Connect to your Windows environment (with VPN if configured)
2. Launch and navigate the desktop application
3. Execute each workflow step sequentially
4. Verify actions and handle errors
5. Save trajectory logs for audit and debugging
Monitor the console output to see the agent's progress through each task.
</Step>
</Steps>
---
## Key Configuration Options
### Agent Instructions
The `instructions` parameter is critical for reliable desktop automation:
```python
instructions="""
You are automating a Windows desktop HR/payroll application.
IMPORTANT GUIDELINES:
- Always wait for windows and dialogs to fully load before interacting
- Look for loading indicators and wait for them to disappear
- Verify each action by checking on-screen confirmation messages
- If a button or field is not visible, try scrolling or navigating tabs
- Desktop apps often have nested menus - explore systematically
- Save work frequently using File > Save or Ctrl+S
- Before closing, always verify changes were saved
COMMON UI PATTERNS:
- Menu bar navigation (File, Edit, View, etc.)
- Ribbon interfaces with tabs
- Modal dialogs that block interaction
- Data grids/tables for viewing records
- Form fields with validation
- Status bars showing operation progress
APPLICATION-SPECIFIC:
- Login is at top-left corner
- Employee records are under "HR Management" > "Employees"
- Reports are generated via "Tools" > "Reports" > "Generate"
- Always click "Save" before navigating away from a form
""".strip()
```
### Budget Management
For long-running workflows, adjust budget limits:
```python
agent = ComputerAgent(
model="cua/anthropic/claude-sonnet-4.5",
tools=[computer],
max_trajectory_budget=20.0, # Increase for complex workflows
# ... other params
)
```
### Image Retention
Balance context and cost by retaining only recent screenshots:
```python
agent = ComputerAgent(
# ...
only_n_most_recent_images=3, # Keep last 3 screenshots
# ...
)
```
---
## Production Considerations
<Callout type="warn" title="Production Deployment">
For enterprise production deployments, consider these additional steps:
</Callout>
### 1. Workflow Mining
Before deploying, analyze your actual workflows:
- Record user interactions with the application
- Identify common patterns and edge cases
- Map out decision trees and validation requirements
- Document application-specific quirks and timing issues
### 2. Custom Finetuning
Create vertical-specific actions instead of generic UI automation:
```python
# Instead of generic steps:
tasks = ["Click login", "Type username", "Type password", "Click submit"]
# Create semantic actions:
tasks = ["onboard_employee", "run_payroll", "generate_compliance_report"]
```
This provides:
- Better audit trails
- Approval gates at business logic level
- Higher success rates
- Easier maintenance and updates
### 3. Human-in-the-Loop
Add approval gates for critical operations:
```python
agent = ComputerAgent(
model="cua/anthropic/claude-sonnet-4.5",
tools=[computer],
# Add human approval callback for sensitive operations
callbacks=[ApprovalCallback(require_approval_for=["payroll", "termination"])]
)
```
### 4. Deployment Options
Choose your deployment model:
**Managed (Recommended)**
- Cua hosts Windows sandboxes, VPN/RDP stack, and agent runtime
- You get UI/API endpoints for triggering workflows
- Automatic scaling, monitoring, and maintenance
- SLA guarantees and enterprise support
**Self-Hosted**
- You manage Windows VMs, VPN infrastructure, and agent deployment
- Full control over data and security
- Custom network configurations
- On-premise or your preferred cloud
---
## Troubleshooting
### VPN Connection Issues
If the agent cannot reach the application:
1. Verify VPN is connected: Check VPN client status in the Windows sandbox
2. Test network connectivity: Try pinging internal resources
3. Check firewall rules: Ensure RDP and application ports are open
4. Review VPN logs: Look for authentication or routing errors
### Application Not Launching
If the desktop application fails to start:
1. Verify installation: Check the application is installed in the sandbox
2. Check dependencies: Ensure all required DLLs and frameworks are present
3. Review permissions: Application may require admin rights
4. Check logs: Look for error messages in Windows Event Viewer
### UI Element Not Found
If the agent cannot find buttons or fields:
1. Increase wait times: Some applications load slowly
2. Check screen resolution: UI elements may be off-screen
3. Verify DPI scaling: High DPI settings can affect element positions
4. Update instructions: Provide more specific navigation guidance
### Cost Management
If costs are higher than expected:
1. Reduce `max_trajectory_budget`
2. Decrease `only_n_most_recent_images`
3. Use prompt caching: Set `use_prompt_caching=True`
4. Optimize task descriptions: Be more specific to reduce retry attempts
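Combining these levers, a cost-conscious configuration might look like the sketch below (values are illustrative; tune them for your workload, and `computer` is assumed to be connected as in the scripts above):
```python
from agent import ComputerAgent

agent = ComputerAgent(
    model="cua/anthropic/claude-sonnet-4.5",
    tools=[computer],
    max_trajectory_budget=5.0,     # lower hard cap on spend per run
    only_n_most_recent_images=2,   # keep fewer screenshots in context
    use_prompt_caching=True,       # reuse cached prompt prefixes where supported
)
```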
---
## Next Steps
- **Explore custom tools**: Learn how to create [custom tools](/agent-sdk/custom-tools) for application-specific actions
- **Implement callbacks**: Add [monitoring and logging](/agent-sdk/callbacks) for production workflows
- **Join community**: Get help in our [Discord](https://discord.com/invite/mVnXXpdE85)
---
## Related Examples
- [Form Filling](/example-usecases/form-filling) - Web form automation
- [Post-Event Contact Export](/example-usecases/post-event-contact-export) - Data extraction workflows
- [Custom Tools](/agent-sdk/custom-tools) - Building application-specific functions

View File

@@ -0,0 +1,7 @@
{
"title": "Get Started",
"description": "Get started with Cua",
"defaultOpen": true,
"icon": "Rocket",
"pages": ["../index", "quickstart"]
}

View File

@@ -0,0 +1,571 @@
---
title: Quickstart
description: Get started with Cua
---
import { Step, Steps } from 'fumadocs-ui/components/steps';
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
import { Accordion, Accordions } from 'fumadocs-ui/components/accordion';
import { Callout } from 'fumadocs-ui/components/callout';
import { Code, Terminal } from 'lucide-react';
{/* Choose your quickstart path:
<div className="grid grid-cols-1 md:grid-cols-2 gap-6 mt-8 mb-8">
<Card icon={<Code />} href="#developer-quickstart" title="Developer Quickstart">
Build with Python or TypeScript SDKs - full programmatic control
</Card>
<Card icon={<Terminal />} href="#cli-quickstart" title="CLI Quickstart">
Get started quickly with the command-line interface
</Card>
</div> */}
---
## Set Up Your Computer Environment
Choose how you want to run your Cua computer. This will be the environment where your automated tasks will execute.
You can run your Cua computer in the cloud (recommended for easiest setup), locally on macOS with Lume, locally on Windows with a Windows Sandbox, or in a Docker container on any platform. Choose the option that matches your system and needs.
<Tabs items={['Cloud Sandbox', 'Linux on Docker', 'macOS Sandbox', 'Windows Sandbox']}>
<Tab value="Cloud Sandbox">
Create and manage cloud sandboxes that run Linux (Ubuntu), Windows, or macOS.
**First, create your API key:**
1. Go to [cua.ai/signin](https://cua.ai/signin)
2. Navigate to **Dashboard > API Keys > New API Key** to create your API key
3. **Important:** Copy and save your API key immediately - you won't be able to see it again (you'll need to regenerate if lost)
**Then, create your sandbox using either option:**
**Option 1: Via Website**
1. Navigate to **Dashboard > Sandboxes > Create Sandbox**
2. Create a **Small** sandbox, choosing **Linux**, **Windows**, or **macOS**
3. Note your sandbox name
**Option 2: Via CLI**
1. Install the CUA CLI:
```bash
# macOS/Linux
curl -LsSf https://cua.ai/cli/install.sh | sh
# Windows
powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
```
2. Login and create a sandbox:
```bash
cua auth login
cua sb create --os linux --size small --region north-america
```
3. Note your sandbox name and password from the output
Your Cloud Sandbox will be automatically configured and ready to use.
</Tab>
<Tab value="Linux on Docker">
Run Linux desktop locally on macOS, Windows, or Linux hosts.
1. Install Docker Desktop or Docker Engine
2. Pull a CUA Docker image:
```bash
# XFCE (Lightweight) - recommended for most use cases
docker pull --platform=linux/amd64 trycua/cua-xfce:latest
# OR KASM (Full-Featured) - full Ubuntu desktop
docker pull --platform=linux/amd64 trycua/cua-ubuntu:latest
```
</Tab>
<Tab value="macOS Sandbox">
macOS hosts only - requires Lume CLI.
1. Install the Lume CLI:
```bash
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)"
```
2. Start a local Cua sandbox:
```bash
lume run macos-sequoia-cua:latest
```
</Tab>
<Tab value="Windows Sandbox">
Windows hosts only - requires Windows 10 Pro/Enterprise or Windows 11.
1. Enable [Windows Sandbox](https://learn.microsoft.com/en-us/windows/security/application-security/application-isolation/windows-sandbox/windows-sandbox-install)
2. Install the `pywinsandbox` dependency:
```bash
pip install -U git+https://github.com/karkason/pywinsandbox.git
```
3. Windows Sandbox will be automatically configured when you run the CLI
</Tab>
</Tabs>
---
## Developer Quickstart
<Steps>
<Step>
### Using Computer
Connect to your Cua computer and perform basic interactions, such as taking screenshots or simulating user input.
<Tabs items={['Python', 'TypeScript']}>
<Tab value="Python">
Install the Cua computer Python SDK:
```bash
pip install cua-computer
```
Then, connect to your desired computer environment:
<Tabs items={['Cloud Sandbox', 'Linux on Docker', 'macOS Sandbox', 'Windows Sandbox', 'Your host desktop']}>
<Tab value="Cloud Sandbox">
Set your CUA API key (same key used for model inference) and connect to your sandbox:
```python
import os
from computer import Computer
os.environ["CUA_API_KEY"] = "sk_cua-api01_..."
computer = Computer(
os_type="linux", # or "windows" or "macos"
provider_type="cloud",
name="your-sandbox-name" # from CLI or website
)
await computer.run() # Connect to the sandbox
```
</Tab>
<Tab value="Linux on Docker">
```python
from computer import Computer
computer = Computer(
os_type="linux",
provider_type="docker",
image="trycua/cua-xfce:latest" # or "trycua/cua-ubuntu:latest"
)
await computer.run() # Launch & connect to the sandbox
```
</Tab>
<Tab value="macOS Sandbox">
```python
from computer import Computer
computer = Computer(
os_type="macos",
provider_type="lume",
name="macos-sequoia-cua:latest"
)
await computer.run() # Launch & connect to the sandbox
```
</Tab>
<Tab value="Windows Sandbox">
```python
from computer import Computer
computer = Computer(
os_type="windows",
provider_type="windows_sandbox"
)
await computer.run() # Launch & connect to the sandbox
```
</Tab>
<Tab value="Your host desktop">
Install and run `cua-computer-server`:
```bash
pip install cua-computer-server
python -m computer_server
```
Then, use the `Computer` object to connect:
```python
from computer import Computer
computer = Computer(use_host_computer_server=True)
await computer.run() # Connect to the host desktop
```
</Tab>
</Tabs>
Once connected, you can perform interactions:
```python
try:
# Take a screenshot of the computer's current display
screenshot = await computer.interface.screenshot()
# Simulate a left-click at coordinates (100, 100)
await computer.interface.left_click(100, 100)
# Type "Hello!" into the active application
await computer.interface.type_text("Hello!")
finally:
await computer.close()
```
</Tab>
<Tab value="TypeScript">
<Callout type="warn" title="TypeScript SDK Deprecated">
The TypeScript interface is currently deprecated. We're working on version 0.2.0 with improved TypeScript support. In the meantime, please use the Python SDK.
</Callout>
Install the Cua computer TypeScript SDK:
```bash
npm install @trycua/computer
```
Then, connect to your desired computer environment:
<Tabs items={['Cloud Sandbox', 'Linux on Docker', 'macOS Sandbox', 'Windows Sandbox', 'Your host desktop']}>
<Tab value="Cloud Sandbox">
Set your CUA API key (same key used for model inference):
```bash
export CUA_API_KEY="sk_cua-api01_..."
```
Then connect to your sandbox:
```typescript
import { Computer, OSType } from '@trycua/computer';
const computer = new Computer({
osType: OSType.LINUX, // or OSType.WINDOWS or OSType.MACOS
name: "your-sandbox-name" // from CLI or website
});
await computer.run(); // Connect to the sandbox
```
</Tab>
<Tab value="Linux on Docker">
```typescript
import { Computer, OSType, ProviderType } from '@trycua/computer';
const computer = new Computer({
osType: OSType.LINUX,
providerType: ProviderType.DOCKER,
image: "trycua/cua-xfce:latest" // or "trycua/cua-ubuntu:latest"
});
await computer.run(); // Launch & connect to the sandbox
```
</Tab>
<Tab value="macOS Sandbox">
```typescript
import { Computer, OSType, ProviderType } from '@trycua/computer';
const computer = new Computer({
osType: OSType.MACOS,
providerType: ProviderType.LUME,
name: "macos-sequoia-cua:latest"
});
await computer.run(); // Launch & connect to the sandbox
```
</Tab>
<Tab value="Windows Sandbox">
```typescript
import { Computer, OSType, ProviderType } from '@trycua/computer';
const computer = new Computer({
osType: OSType.WINDOWS,
providerType: ProviderType.WINDOWS_SANDBOX
});
await computer.run(); // Launch & connect to the sandbox
```
</Tab>
<Tab value="Your host desktop">
First, install and run `cua-computer-server`:
```bash
pip install cua-computer-server
python -m computer_server
```
Then, use the `Computer` object to connect:
```typescript
import { Computer } from '@trycua/computer';
const computer = new Computer({ useHostComputerServer: true });
await computer.run(); // Connect to the host desktop
```
</Tab>
</Tabs>
Once connected, you can perform interactions:
```typescript
try {
// Take a screenshot of the computer's current display
const screenshot = await computer.interface.screenshot();
// Simulate a left-click at coordinates (100, 100)
await computer.interface.leftClick(100, 100);
// Type "Hello!" into the active application
await computer.interface.typeText("Hello!");
} finally {
await computer.close();
}
```
</Tab>
</Tabs>
Learn more about computers in the [Cua computers documentation](/computer-sdk/computers). You will see how to automate computers with agents in the next step.
</Step>
<Step>
### Using Agent
Utilize an Agent to automate complex tasks by providing it with a goal and allowing it to interact with the computer environment.
Install the Cua agent Python SDK:
```bash
pip install "cua-agent[all]"
```
Choose how you want to access vision-language models for your agent:
<Tabs items={['CUA VLM Router', 'BYOK (Bring Your Own Key)']}>
<Tab value="CUA VLM Router">
Use CUA's inference API to access multiple model providers with a single API key (same key used for sandbox access). CUA VLM Router provides intelligent routing and cost optimization.
**Use the agent with CUA models:**
```python
import os
from agent import ComputerAgent
os.environ["CUA_API_KEY"] = "sk_cua-api01_..."
agent = ComputerAgent(
model="cua/anthropic/claude-sonnet-4.5", # CUA-routed model
tools=[computer],
max_trajectory_budget=5.0
)
messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
async for result in agent.run(messages):
for item in result["output"]:
if item["type"] == "message":
print(item["content"][0]["text"])
```
**Available CUA models:**
- `cua/anthropic/claude-sonnet-4.5` - Claude Sonnet 4.5 (recommended)
- `cua/anthropic/claude-opus-4.5` - Claude Opus 4.5 (enhanced agentic capabilities)
- `cua/anthropic/claude-haiku-4.5` - Claude Haiku 4.5 (faster, cost-effective)
- `cua/qwen/qwen3-vl-235b` - Qwen3 VL 235B (large-scale vision-language tasks)
**Benefits:**
- Single API key for multiple providers
- Cost tracking and optimization
- No need to manage multiple provider keys
</Tab>
<Tab value="BYOK (Bring Your Own Key)">
Use your own API keys from model providers like Anthropic, OpenAI, or others.
**Use the agent with your provider:**
```python
import os
from agent import ComputerAgent
# Set your provider API key
os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..." # For Anthropic
# OR
os.environ["OPENAI_API_KEY"] = "sk-..." # For OpenAI
agent = ComputerAgent(
model="anthropic/claude-sonnet-4-5-20250929", # Direct provider model
tools=[computer],
max_trajectory_budget=5.0
)
messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
async for result in agent.run(messages):
for item in result["output"]:
if item["type"] == "message":
print(item["content"][0]["text"])
```
**Supported providers:**
- `anthropic/claude-*` - Anthropic Claude models
- `openai/gpt-*` - OpenAI GPT models
- `openai/o1-*` - OpenAI o1 models
- `huggingface-local/*` - Local HuggingFace models
- And many more via LiteLLM
See [Supported Models](/agent-sdk/supported-model-providers/) for the complete list.
</Tab>
</Tabs>
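Since the snippets above use top-level `await`, here is a minimal end-to-end sketch that wires the Computer and Agent steps together into one runnable script (the sandbox name and API key are placeholders):
```python
import asyncio
import os

from agent import ComputerAgent
from computer import Computer

async def main():
    os.environ.setdefault("CUA_API_KEY", "sk_cua-api01_...")  # replace with your key
    computer = Computer(os_type="linux", provider_type="cloud", name="your-sandbox-name")
    await computer.run()  # connect to the Cloud Sandbox
    try:
        agent = ComputerAgent(model="cua/anthropic/claude-sonnet-4.5", tools=[computer])
        messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
        async for result in agent.run(messages):
            for item in result["output"]:
                if item["type"] == "message":
                    print(item["content"][0]["text"])
    finally:
        await computer.close()

asyncio.run(main())
```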
Learn more about agents in [Agent Loops](/agent-sdk/agent-loops) and available models in [Supported Models](/agent-sdk/supported-model-providers/).
</Step>
</Steps>
### Next Steps
- Learn more about [Cua computers](/computer-sdk/computers) and [computer commands](/computer-sdk/commands)
- Read about [Agent loops](/agent-sdk/agent-loops), [tools](/agent-sdk/custom-tools), and [supported model providers](/agent-sdk/supported-model-providers/)
- Join our [Discord community](https://discord.com/invite/mVnXXpdE85) for help
- Try the [Form Filling](/example-usecases/form-filling) example use case
{/* ---
## CLI Quickstart
Get started quickly with the CUA CLI - the easiest way to manage cloud sandboxes and run AI agents.
<Steps>
<Step>
### Install the CUA CLI
<Tabs items={['macOS / Linux', 'Windows', 'Bun (Alternative)', 'From Source']}>
<Tab value="macOS / Linux">
```bash
curl -LsSf https://cua.ai/cli/install.sh | sh
```
</Tab>
<Tab value="Windows">
```powershell
powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
```
</Tab>
<Tab value="Bun (Alternative)">
```bash
# Install Bun if you don't have it
curl -fsSL https://bun.sh/install | bash
# Install CUA CLI
bun add -g @trycua/cli
```
</Tab>
<Tab value="From Source">
```bash
# Install Bun (macOS/Linux)
curl -fsSL https://bun.sh/install | bash
# Install Bun (Windows)
# powershell -c "irm bun.sh/install.ps1|iex"
# Clone the repo
git clone https://github.com/trycua/cua
cd cua/libs/typescript/cua-cli
# Install the CLI
bun install
bun link
bun link cua-cli
```
</Tab>
</Tabs>
</Step>
<Step>
### Authenticate with CUA
Login to your CUA account:
```bash
# Interactive browser login (recommended)
cua auth login
# Or provide your API key directly
cua auth login --api-key sk-your-api-key-here
```
If you don't have a CUA account yet, sign up at [cua.ai/signin](https://cua.ai/signin).
</Step>
<Step>
### Create Your First Sandbox
Create a cloud sandbox where your AI agents will run:
```bash
# Create a Linux sandbox (recommended for most use cases)
cua sb create --os linux --size small --region north-america
# Or create a Windows sandbox
cua sb create --os windows --size small --region north-america
```
Your sandbox will be created and you'll see output like:
```
Sandbox created and ready: my-sandbox-abc123
Password: secure-password-here
Host: my-sandbox-abc123.sandbox.cua.ai
```
</Step>
<Step>
### Start Using Your Sandbox
You can now interact with your sandbox in multiple ways:
#### Option 1: Access VNC Desktop
```bash
cua sb vnc my-sandbox-abc123
```
This opens a remote desktop connection to your sandbox.
#### Option 2: List and Manage Sandboxes
```bash
# List all your sandboxes
cua sb list
# Start/stop sandboxes as needed
cua sb stop my-sandbox-abc123
cua sb start my-sandbox-abc123
# Delete sandboxes when done
cua sb delete my-sandbox-abc123
```
</Step>
</Steps>
### What's Next?
- **Explore more commands**: Check out the [complete CLI reference](/libraries/cua-cli/commands)
- **Learn about programming**: Try the [Developer Quickstart](#developer-quickstart) to build custom automations
- **Join the community**: Get help in our [Discord community](https://discord.com/invite/mVnXXpdE85)
---
For running models locally, see [Running Models Locally](/agent-sdk/supported-model-providers/local-models). */}

View File

@@ -1,25 +1,58 @@
---
title: Home
icon: House
title: Introduction
---
import { Monitor, Code, BookOpen } from 'lucide-react';
import { Monitor, Code, BookOpen, Zap, Bot, Boxes, Rocket } from 'lucide-react';
# Welcome!
<div className="rounded-lg border bg-card text-card-foreground shadow-sm px-4 py-2 mb-6">
Cua is an open-source framework for building **Computer-Use Agents** - AI systems that see,
understand, and interact with desktop applications through vision and action, just like humans do.
</div>
Cua is a framework for automating Windows, Mac, and Linux apps powered by computer-using agents (CUAs).
## Why Cua?
Cua makes every stage of computer-using agent development simple:
Cua gives you everything you need to automate any desktop application without brittle selectors or APIs.
- **Development**: Use any LLM provider with liteLLM. The agent SDK makes multiple agent loop providers, trajectory tracing, caching, and budget management easy
- **Containerization**: Cua offers Docker containers pre-installed with everything needed for AI-powered RPA
- **Deployment**: Cua cloud gives you a production-ready cloud environment for your assistants
Some highlights include:
- **Model flexibility** - Connect to 100+ LLM providers through liteLLM's standard interface. Use models from Anthropic, OpenAI, Google, and more - or run them locally with Ollama, Hugging Face, or MLX.
- **Composed agents** - Mix and match grounding models with planning models for optimal performance. Use specialized models like GTA, OpenCUA, or OmniParser for UI element detection paired with powerful reasoning models like Claude or GPT-4.
- **Cross-platform sandboxes** - Run agents safely in isolated environments. Choose from Docker containers, macOS VMs with Lume, Windows Sandbox, or deploy to Cua Cloud with production-ready infrastructure.
- **Computer SDK** - Control any application with a PyAutoGUI-like API. Click, type, scroll, take screenshots, manage windows, read/write files - everything you need for desktop automation.
- **Agent SDK** - Build autonomous agents with trajectory tracing, prompt caching, cost tracking, and budget controls. Test agents on industry-standard benchmarks like OSWorld-Verified with one line of code.
- **Human-in-the-loop** - Pause agent execution and await user input or approval before continuing. Use the `human/human` model string to let humans control the agent directly.
- **Production essentials** - Ship reliable agents with built-in PII anonymization, cost tracking, trajectory logging, and integration with observability platforms like Laminar and HUD.
## What can you build?
- RPA automation that works with any application - even legacy software without APIs.
- Form-filling agents that handle complex multi-step web workflows.
- Testing automation that adapts to UI changes without brittle selectors.
- Data extraction from desktop applications and document processing.
- Cross-application workflows that combine multiple tools and services.
- Research agents that browse, read, and synthesize information from the web.
Explore real-world examples in our [blog posts](https://cua.ai/blog).
## Get started
Follow the [Quickstart guide](/docs/get-started/quickstart) for step-by-step setup with Python or TypeScript.
If you're new to computer-use agents, check out our [tutorials](https://cua.ai/blog), [examples](https://github.com/trycua/cua/tree/main/examples), and [notebooks](https://github.com/trycua/cua/tree/main/notebooks) to start building with Cua today.
<div className="grid grid-cols-1 md:grid-cols-2 gap-6 mt-8">
<Card icon={<Monitor />} href="/quickstart-devs" title="Quickstart (Developers)">
Build with Python—full SDK and agent code examples.
<Card icon={<Rocket />} href="/get-started/quickstart" title="Quickstart">
Get up and running in 3 steps with Python or TypeScript.
</Card>
<Card icon={<BookOpen />} href="/libraries/agent" title="API Reference">
Explore the agent SDK and APIs
<Card icon={<Zap />} href="/agent-sdk/agent-loops" title="Agent Loops">
Learn how agents work and how to build your own.
</Card>
<Card icon={<BookOpen />} href="/computer-sdk/computers" title="Computer SDK">
Control desktop applications with the Computer SDK.
</Card>
<Card icon={<Monitor />} href="/example-usecases/form-filling" title="Example Use Cases">
See Cua in action with real-world examples.
</Card>
</div>
We can't wait to see what you build with Cua ✨

View File

@@ -0,0 +1,360 @@
---
title: Commands
description: Complete reference for all CUA CLI commands
---
import { Tabs, Tab } from 'fumadocs-ui/components/tabs';
import { Callout } from 'fumadocs-ui/components/callout';
## Overview
The CUA CLI provides commands for authentication and sandbox management.
### Command Styles
The CLI supports **two command styles** for flexibility:
**Flat style** (quick & concise):
```bash
cua list
cua create --os linux --size small --region north-america
cua start my-sandbox
```
**Grouped style** (explicit & clear):
```bash
cua sb list # or: cua sandbox list
cua sb create # or: cua sandbox create
cua sb start # or: cua sandbox start
```
Both styles work identically - use whichever you prefer!
### Available Commands
- **Authentication** - `cua auth login`, `cua auth env`, `cua auth logout` (also available as flat commands: `cua login`, `cua env`, `cua logout`)
- **Sandbox Management** - `cua list`, `cua create`, `cua start`, `cua stop`, `cua restart`, `cua delete`, `cua vnc`
## Authentication Commands
### `cua auth login`
Authenticate with your CUA account using browser-based OAuth flow.
```bash
# Interactive browser login
cua auth login
# Direct API key login
cua auth login --api-key sk-your-api-key-here
# Alternative flat style
cua login
cua login --api-key sk-your-api-key-here
```
**Options:**
- `--api-key <key>` - Provide API key directly instead of browser flow
**Example:**
```bash
$ cua auth login
Opening browser for CLI auth...
API key saved
```
### `cua auth env`
Create or update a `.env` file in the current directory with your CUA API key.
```bash
cua auth env
# Alternative flat style
cua env
```
**Example:**
```bash
$ cua auth env
Wrote /path/to/your/project/.env
```
The generated `.env` file will contain:
```
CUA_API_KEY=sk-your-api-key-here
```
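If your project is in Python, the generated file can be loaded before constructing a `Computer`. A minimal sketch, assuming `python-dotenv` is installed; the sandbox name is a placeholder:
```python
import os
from dotenv import load_dotenv  # assumes python-dotenv is installed
from computer import Computer

load_dotenv()  # reads CUA_API_KEY from the .env written by `cua auth env`

computer = Computer(
    os_type="linux",
    provider_type="cloud",
    name="my-sandbox-abc123",          # placeholder: use your sandbox name
    api_key=os.environ["CUA_API_KEY"],
)
# await computer.run() inside an async function to connect
```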
### `cua auth logout`
Remove the stored API key from your system.
```bash
cua auth logout
# Alternative flat style
cua logout
```
**Example:**
```bash
$ cua auth logout
Logged out
```
## Sandbox Commands
### `cua list`
List all your sandboxes with their current status. Passwords are hidden by default for security.
```bash
# List sandboxes (passwords hidden)
cua list
# Show passwords explicitly
cua list --show-passwords
# Alternative aliases
cua ls
cua ps
```
**Example Output (default, passwords hidden):**
```
NAME STATUS HOST
my-dev-sandbox running my-dev-sandbox.sandbox.cua.ai
test-windows stopped test-windows.sandbox.cua.ai
```
**Example Output (with --show-passwords):**
```
NAME STATUS PASSWORD HOST
my-dev-sandbox running secure-pass-123 my-dev-sandbox.sandbox.cua.ai
test-windows stopped another-pass-456 test-windows.sandbox.cua.ai
```
### `cua create`
Create a new sandbox.
```bash
cua create --os <OS> --size <SIZE> --region <REGION>
```
**Required Options:**
- `--os` - Operating system: `linux`, `windows`, `macos`
- `--size` - Sandbox size: `small`, `medium`, `large`
- `--region` - Region: `north-america`, `europe`, `asia-pacific`, `south-america`
**Examples:**
```bash
# Create a small Linux sandbox in North America
cua create --os linux --size small --region north-america
# Create a medium Windows sandbox in Europe
cua create --os windows --size medium --region europe
# Create a large macOS sandbox in Asia Pacific
cua create --os macos --size large --region asia-pacific
```
**Response Types:**
**Immediate (Status 200):**
```bash
Sandbox created and ready: my-new-sandbox-abc123
Password: secure-password-here
Host: my-new-sandbox-abc123.sandbox.cua.ai
```
**Provisioning (Status 202):**
```bash
Sandbox provisioning started: my-new-sandbox-abc123
Job ID: job-xyz789
Use 'cua list' to monitor provisioning progress
```
### `cua start`
Start a stopped sandbox.
```bash
cua start <name>
```
**Example:**
```bash
$ cua start my-dev-sandbox
Start accepted
```
### `cua stop`
Stop a running sandbox.
```bash
cua stop <name>
```
**Example:**
```bash
$ cua stop my-dev-sandbox
stopping
```
### `cua restart`
Restart a sandbox.
```bash
cua restart <name>
```
**Example:**
```bash
$ cua restart my-dev-sandbox
restarting
```
### `cua delete`
Delete a sandbox permanently.
```bash
cua delete <name>
```
**Example:**
```bash
$ cua delete old-test-sandbox
Sandbox deletion initiated: deleting
```
<Callout type="warn">
This action is irreversible. All data on the sandbox will be permanently lost.
</Callout>
### `cua vnc`
Open the VNC interface for a sandbox in your browser.
```bash
cua vnc <name>
# Alternative alias
cua open <name>
```
**Example:**
```bash
$ cua vnc my-dev-sandbox
Opening NoVNC: https://my-dev-sandbox.sandbox.cua.ai/vnc.html?autoconnect=true&password=...
```
This command automatically opens your default browser to the VNC interface with the correct password pre-filled.
## Global Options
### Help
Get help for any command:
```bash
cua --help
cua auth login --help
cua create --help
cua list --help
```
## Error Handling
The CLI provides clear error messages for common issues:
### Authentication Errors
```bash
$ cua list
Unauthorized. Try 'cua auth login' again.
```
### Sandbox Not Found
```bash
$ cua start nonexistent-sandbox
Sandbox not found
```
### Invalid Configuration
```bash
$ cua create --os invalid --size small --region north-america
Invalid request or unsupported configuration
```
## Tips and Best Practices
### 1. Use Descriptive Sandbox Names
```bash
# Create a sandbox; a name is generated for you
cua create --os linux --size small --region north-america
# Check the generated name (rename it or pick a meaningful name in the dashboard)
cua list
# Then use that name consistently in later commands
```
### 2. Environment Management
```bash
# Set up your project with API key
cd my-project
cua auth env
# Now your project has CUA_API_KEY in .env
```
### 3. Quick Sandbox Access
```bash
# Create aliases for frequently used sandboxes
alias dev-sandbox="cua vnc my-development-sandbox"
alias prod-sandbox="cua vnc my-production-sandbox"
```
### 4. Monitoring Provisioning
```bash
# For sandboxes that need provisioning time
cua create --os windows --size large --region europe
# Sandbox provisioning started: my-sandbox-abc123
# Job ID: job-xyz789
# Check status periodically
watch -n 5 cua list
```
## Next Steps
- [Get started with the quickstart guide](/get-started/quickstart#cli-quickstart)
- [Learn about CUA computers](/computer-sdk/computers)
- [Explore agent automation](/agent-sdk/agent-loops)

View File

@@ -0,0 +1,58 @@
---
title: Cua CLI
description: Command-line interface for managing Cua cloud sandboxes and authentication
---
import { Tabs, Tab } from 'fumadocs-ui/components/tabs';
The Cua CLI is a command-line tool that provides an intuitive interface for managing your Cua cloud sandboxes and authentication. It offers a streamlined workflow for creating, managing, and connecting to cloud sandboxes.
## Key Features
- **Authentication Management**: Secure login with browser-based OAuth flow
- **Sandbox Lifecycle**: Create, start, stop, restart, and delete cloud sandboxes
- **Quick Access**: Direct links to VNC and playground interfaces
- **Cross-Platform**: Works on macOS, Linux, and Windows
- **Environment Integration**: Automatic `.env` file generation
## Quick Example
```bash
# Install the CLI (installs Bun + CUA CLI)
curl -LsSf https://cua.ai/cli/install.sh | sh
# Login to your CUA account
cua auth login
# Create a new Linux sandbox
cua sb create --os linux --size small --region north-america
# List your sandboxes
cua sb list
```
## Use Cases
### Development Workflow
- Quickly spin up cloud sandboxes for testing
- Manage multiple sandboxes across different regions
- Integrate with CI/CD pipelines
### Team Collaboration
- Share sandbox configurations and access
- Standardize development environments
- Quick onboarding for new team members
### Automation
- Script sandbox provisioning and management (see the sketch after this list)
- Integrate with deployment workflows
- Automate environment setup
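As a concrete example of the scripting case, the CLI can be driven from any scripting language. The sketch below shells out to the commands documented on this site from Python; the sandbox name is a placeholder for whatever `cua create` reports:
```python
import subprocess

def cua(*args: str) -> str:
    """Run a cua CLI command and return its stdout."""
    completed = subprocess.run(
        ["cua", *args], capture_output=True, text=True, check=True
    )
    return completed.stdout

# Provision a sandbox, then inspect the fleet
print(cua("create", "--os", "linux", "--size", "small", "--region", "north-america"))
print(cua("list"))

# Tear down when finished (replace with the name reported by `cua create`)
print(cua("delete", "my-sandbox-abc123"))
```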
## Next Steps
- [Install the CLI](/libraries/cua-cli/installation)
- [Learn about available commands](/libraries/cua-cli/commands)
- [Get started with the quickstart guide](/get-started/quickstart#cli-quickstart)

View File

@@ -0,0 +1,130 @@
---
title: Installation
description: Install the CUA CLI on your system
---
import { Tabs, Tab } from 'fumadocs-ui/components/tabs';
import { Callout } from 'fumadocs-ui/components/callout';
## Quick Install
The fastest way to install the CUA CLI is using our installation scripts:
<Tabs items={['macOS / Linux', 'Windows']}>
<Tab value="macOS / Linux">```bash curl -LsSf https://cua.ai/cli/install.sh | sh ```</Tab>
<Tab value="Windows">
```powershell powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
```
</Tab>
</Tabs>
These scripts will automatically:
1. Install [Bun](https://bun.sh) (a fast JavaScript runtime)
2. Install the CUA CLI via `bun add -g @trycua/cli`
<Callout type="info">
The installation scripts will automatically detect your system and install the appropriate binary
to your PATH.
</Callout>
## Alternative: Install with Bun
You can also install the CLI directly using Bun:
```bash
# Install Bun if you don't have it
curl -fsSL https://bun.sh/install | bash
# Install CUA CLI
bun add -g @trycua/cli
```
<Callout type="info">
Using Bun provides faster installation and better performance compared to npm. If you don't have
Bun installed, the first command will install it for you.
</Callout>
## Verify Installation
After installation, verify the CLI is working:
```bash
cua --help
```
You should see the CLI help output with available commands.
## First Time Setup
After installation, you'll need to authenticate with your CUA account:
```bash
# Login with browser-based OAuth flow
cua auth login
# Or provide your API key directly
cua auth login --api-key sk-your-api-key-here
```
## Updating
To update to the latest version:
<Tabs items={['Script Install', 'npm Install']}>
<Tab value="Script Install">
Re-run the installation script: ```bash # macOS/Linux curl -LsSf https://cua.ai/cli/install.sh |
sh # Windows powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
```
</Tab>
<Tab value="npm Install">```bash npm update -g @trycua/cli ```</Tab>
</Tabs>
## Uninstalling
<Tabs items={['Script Install', 'npm Install']}>
<Tab value="Script Install">
Remove the binary from your PATH: ```bash # macOS/Linux rm $(which cua) # Windows # Remove from
your PATH or delete the executable ```
</Tab>
<Tab value="npm Install">```bash npm uninstall -g @trycua/cli ```</Tab>
</Tabs>
## Troubleshooting
### Command Not Found
If you get a "command not found" error after installation:
1. **Check your PATH**: Make sure the installation directory is in your PATH
2. **Restart your terminal**: Close and reopen your terminal/command prompt
3. **Manual PATH setup**: Add the installation directory to your PATH manually
### Permission Issues
If you encounter permission issues during installation:
<Tabs items={['macOS / Linux', 'Windows']}>
<Tab value="macOS / Linux">
Try running with sudo (not recommended for the curl method): ```bash # If using npm sudo npm
install -g @trycua/cli ```
</Tab>
<Tab value="Windows">
Run PowerShell as Administrator: ```powershell # Right-click PowerShell and "Run as
Administrator" powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
```
</Tab>
</Tabs>
### Network Issues
If the installation script fails due to network issues:
1. **Check your internet connection**
2. **Try the npm installation method instead**
3. **Check if your firewall is blocking the download**
## Next Steps
- [Learn about CLI commands](/libraries/cua-cli/commands)
- [Follow the quickstart guide](/get-started/quickstart#cli-quickstart)

View File

@@ -0,0 +1,5 @@
{
"title": "CLI",
"description": "Command-line interface for CUA",
"pages": ["index", "installation", "commands"]
}

View File

@@ -5,7 +5,7 @@ description: Installation instructions for the current version of the Lume CLI.
## Quickstart
Install and run a prebuilt macOS VM in two commands:
Install and run a prebuilt macOS sandbox in two commands:
```bash
# Install Lume

View File

@@ -6,6 +6,72 @@ title: Client Integrations
To use with Claude Desktop, add an entry to your Claude Desktop configuration (`claude_desktop_config.json`, typically found in `~/.config/claude-desktop/`):
### Package Installation Method
```json
{
"mcpServers": {
"cua-agent": {
"command": "/bin/bash",
"args": ["~/.cua/start_mcp_server.sh"],
"env": {
"CUA_MODEL_NAME": "anthropic/claude-sonnet-4-20250514",
"ANTHROPIC_API_KEY": "your-anthropic-api-key-here",
"CUA_MAX_IMAGES": "3",
"CUA_USE_HOST_COMPUTER_SERVER": "false"
}
}
}
}
```
### Development Method
If you're working with the CUA source code:
**Standard VM Mode:**
```json
{
"mcpServers": {
"cua-agent": {
"command": "/usr/bin/env",
"args": [
"bash",
"-lc",
"export CUA_MODEL_NAME='anthropic/claude-sonnet-4-20250514'; export ANTHROPIC_API_KEY='your-anthropic-api-key-here'; /path/to/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"
]
}
}
}
```
**Host Computer Control Mode:**
```json
{
"mcpServers": {
"cua-agent": {
"command": "/usr/bin/env",
"args": [
"bash",
"-lc",
"export CUA_MODEL_NAME='anthropic/claude-sonnet-4-20250514'; export ANTHROPIC_API_KEY='your-anthropic-api-key-here'; export CUA_USE_HOST_COMPUTER_SERVER='true'; export CUA_MAX_IMAGES='1'; /path/to/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"
]
}
}
}
```
**Note**: Replace `/path/to/cua` with the absolute path to your CUA repository directory.
**⚠️ Host Computer Control Setup**: When using `CUA_USE_HOST_COMPUTER_SERVER='true'`, you must also:
1. Install computer server dependencies: `python3 -m pip install uvicorn fastapi`
2. Install the computer server: `python3 -m pip install -e libs/python/computer-server --break-system-packages`
3. Start the computer server: `python -m computer_server --log-level debug`
4. The AI will have direct access to your desktop - use with caution!
For more information on MCP with Claude Desktop, see the [official MCP User Guide](https://modelcontextprotocol.io/quickstart/user).
## Cursor Integration
@@ -15,6 +81,43 @@ To use with Cursor, add an MCP configuration file in one of these locations:
- **Project-specific**: Create `.cursor/mcp.json` in your project directory
- **Global**: Create `~/.cursor/mcp.json` in your home directory
Example configuration for Cursor:
```json
{
"mcpServers": {
"cua-agent": {
"command": "/bin/bash",
"args": ["~/.cua/start_mcp_server.sh"],
"env": {
"CUA_MODEL_NAME": "anthropic/claude-sonnet-4-20250514",
"ANTHROPIC_API_KEY": "your-anthropic-api-key-here"
}
}
}
}
```
After configuration, you can simply tell Cursor's Agent to perform computer tasks by explicitly mentioning the CUA agent, such as "Use the computer control tools to open Safari."
For more information on MCP with Cursor, see the [official Cursor MCP documentation](https://docs.cursor.com/context/model-context-protocol).
## Other MCP Clients
The MCP server is compatible with any MCP-compliant client. The server exposes the following tools (a minimal client sketch follows the list):
- `run_cua_task` - Execute single computer tasks
- `run_multi_cua_tasks` - Execute multiple tasks (sequential or concurrent)
- `screenshot_cua` - Capture screenshots
- `get_session_stats` - Monitor session statistics
- `cleanup_session` - Manage session lifecycle
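For clients other than Claude Desktop or Cursor, the server can also be driven programmatically. A rough sketch, assuming the official MCP Python SDK (`mcp` package) is installed; the launch command mirrors the Claude Desktop configuration above, and the task string is only illustrative:
```python
import asyncio
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

# Launch the CUA MCP server the same way the Claude Desktop config does
# (use an absolute path to the script if "~" is not expanded on your system)
server = StdioServerParameters(
    command="/bin/bash",
    args=["~/.cua/start_mcp_server.sh"],
    env={
        "CUA_MODEL_NAME": "anthropic/claude-sonnet-4-20250514",
        "ANTHROPIC_API_KEY": "your-anthropic-api-key-here",
    },
)

async def main() -> None:
    async with stdio_client(server) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            tools = await session.list_tools()
            print([tool.name for tool in tools.tools])  # run_cua_task, screenshot_cua, ...
            result = await session.call_tool(
                "run_cua_task",
                arguments={"task": "Take a screenshot and describe the desktop"},
            )
            print(result)

asyncio.run(main())
```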
### Configuration Options
All MCP clients can configure the server using environment variables:
- `CUA_MODEL_NAME` - Model to use for task execution
- `CUA_MAX_IMAGES` - Maximum images to keep in context
- `CUA_USE_HOST_COMPUTER_SERVER` - Use host system instead of VM
See the [Configuration](/docs/libraries/mcp-server/configuration) page for detailed configuration options.

View File

@@ -4,7 +4,70 @@ title: Configuration
The server is configured using environment variables (can be set in the Claude Desktop config):
| Variable | Description | Default |
| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------ |
| `CUA_MODEL_NAME` | Model string (e.g., "anthropic/claude-3-5-sonnet-20241022", "openai/computer-use-preview", "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", "omniparser+litellm/gpt-4o", "omniparser+ollama_chat/gemma3") | anthropic/claude-3-5-sonnet-20241022 |
| `CUA_MAX_IMAGES` | Maximum number of images to keep in context | 3 |
| Variable | Description | Default |
| ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------- |
| `CUA_MODEL_NAME` | Model string (e.g., "anthropic/claude-sonnet-4-20250514", "openai/computer-use-preview", "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", "omniparser+litellm/gpt-4o", "omniparser+ollama_chat/gemma3") | anthropic/claude-sonnet-4-20250514 |
| `ANTHROPIC_API_KEY` | Your Anthropic API key (required for Anthropic models) | None |
| `CUA_MAX_IMAGES` | Maximum number of images to keep in context | 3 |
| `CUA_USE_HOST_COMPUTER_SERVER` | Target your local desktop instead of a VM. Set to "true" to use your host system. **Warning:** AI models may perform risky actions. | false |
## Model Configuration
The `CUA_MODEL_NAME` environment variable supports various model providers through LiteLLM integration:
### Supported Providers
- **Anthropic**: `anthropic/claude-sonnet-4-20250514`
- **OpenAI**: `openai/computer-use-preview`, `openai/gpt-4o`
- **Local Models**: `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B`
- **Omni + LiteLLM**: `omniparser+litellm/gpt-4o`, `omniparser+litellm/claude-3-haiku`
- **Ollama**: `omniparser+ollama_chat/gemma3`
### Example Configurations
**Claude Desktop Configuration:**
```json
{
"mcpServers": {
"cua-agent": {
"command": "/bin/bash",
"args": ["~/.cua/start_mcp_server.sh"],
"env": {
"CUA_MODEL_NAME": "anthropic/claude-sonnet-4-20250514",
"ANTHROPIC_API_KEY": "your-anthropic-api-key-here",
"CUA_MAX_IMAGES": "5",
"CUA_USE_HOST_COMPUTER_SERVER": "false"
}
}
}
}
```
**Local Model Configuration:**
```json
{
"mcpServers": {
"cua-agent": {
"command": "/bin/bash",
"args": ["~/.cua/start_mcp_server.sh"],
"env": {
"CUA_MODEL_NAME": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
"CUA_MAX_IMAGES": "3"
}
}
}
}
```
## Session Management Configuration
The MCP server automatically manages sessions with the following defaults:
- **Max Concurrent Sessions**: 10
- **Session Timeout**: 10 minutes of inactivity
- **Computer Pool Size**: 5 instances
- **Automatic Cleanup**: Enabled
These settings are optimized for typical usage and don't require configuration for most users.

View File

@@ -7,3 +7,21 @@ github:
---
**cua-mcp-server** is an MCP server for the Computer-Use Agent (CUA), allowing you to run CUA through Claude Desktop or other MCP clients.
## Features
- **Multi-Client Support**: Concurrent sessions with automatic resource management
- **Progress Reporting**: Real-time progress updates during task execution
- **Error Handling**: Robust error recovery with screenshot capture
- **Concurrent Execution**: Run multiple tasks in parallel for improved performance
- **Session Management**: Automatic cleanup and resource pooling
- **LiteLLM Integration**: Support for multiple model providers
- **VM Safety**: Default VM execution with optional host system control
## Quick Start
1. **Install**: `pip install cua-mcp-server`
2. **Configure**: Add to your MCP client configuration
3. **Use**: Ask Claude to perform computer tasks
See the [Installation](/docs/libraries/mcp-server/installation) guide for detailed setup instructions.

View File

@@ -38,19 +38,103 @@ You can then use the script in your MCP configuration like this:
"command": "/bin/bash",
"args": ["~/.cua/start_mcp_server.sh"],
"env": {
"CUA_MODEL_NAME": "anthropic/claude-3-5-sonnet-20241022"
"CUA_MODEL_NAME": "anthropic/claude-sonnet-4-20250514",
"ANTHROPIC_API_KEY": "your-anthropic-api-key-here"
}
}
}
}
```
**Important**: You must include your Anthropic API key for the MCP server to work properly.
## Development Setup
If you're working with the CUA source code directly (like in the CUA repository), you can use the development script instead:
```json
{
"mcpServers": {
"cua-agent": {
"command": "/usr/bin/env",
"args": [
"bash",
"-lc",
"export CUA_MODEL_NAME='anthropic/claude-sonnet-4-20250514'; export ANTHROPIC_API_KEY='your-anthropic-api-key-here'; /path/to/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"
]
}
}
}
```
**For host computer control** (development setup):
1. **Install Computer Server Dependencies**:
```bash
python3 -m pip install uvicorn fastapi
python3 -m pip install -e libs/python/computer-server --break-system-packages
```
2. **Start the Computer Server**:
```bash
cd /path/to/cua
python -m computer_server --log-level debug
```
This will start the computer server on `http://localhost:8000` that controls your actual desktop.
3. **Configure Claude Desktop**:
```json
{
"mcpServers": {
"cua-agent": {
"command": "/usr/bin/env",
"args": [
"bash",
"-lc",
"export CUA_MODEL_NAME='anthropic/claude-sonnet-4-20250514'; export ANTHROPIC_API_KEY='your-anthropic-api-key-here'; export CUA_USE_HOST_COMPUTER_SERVER='true'; export CUA_MAX_IMAGES='1'; /path/to/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"
]
}
}
}
```
**Note**: Replace `/path/to/cua` with the absolute path to your CUA repository directory.
**⚠️ Important**: When using host computer control (`CUA_USE_HOST_COMPUTER_SERVER='true'`), the AI will have direct access to your desktop and can perform actions like opening applications, clicking, typing, and taking screenshots. Make sure you're comfortable with this level of access.
### Troubleshooting
If you get a `/bin/bash: ~/cua/libs/python/mcp-server/scripts/start_mcp_server.sh: No such file or directory` error, try changing the path to the script to be absolute instead of relative.
**Common Issues:**
To see the logs:
1. **"Claude's response was interrupted"** - This usually means:
- Missing API key: Add `ANTHROPIC_API_KEY` to your environment variables
- Invalid model name: Use a valid model like `anthropic/claude-sonnet-4-20250514`
- Check logs for specific error messages
2. **"Missing Anthropic API Key"** - Add your API key to the configuration:
```json
"env": {
"ANTHROPIC_API_KEY": "your-api-key-here"
}
```
3. **"model not found"** - Use a valid model name:
- ✅ `anthropic/claude-sonnet-4-20250514`
4. **Script not found** - If you get a `/bin/bash: ~/cua/libs/python/mcp-server/scripts/start_mcp_server.sh: No such file or directory` error, try changing the path to the script to be absolute instead of relative.
5. **Host Computer Control Issues** - If using `CUA_USE_HOST_COMPUTER_SERVER='true'`:
- **Computer Server not running**: Make sure you've started the computer server with `python -m computer_server --log-level debug`
- **Port 8000 in use**: Check if another process is using port 8000 with `lsof -i :8000`
- **Missing dependencies**: Install `uvicorn` and `fastapi` with `python3 -m pip install uvicorn fastapi`
- **Image size errors**: Use `CUA_MAX_IMAGES='1'` to reduce image context size
**Viewing Logs:**
```bash
tail -n 20 -f ~/Library/Logs/Claude/mcp*.log
```

View File

@@ -12,7 +12,7 @@ This MCP server features comprehensive liteLLM integration, allowing you to use
### Model String Examples:
- **Anthropic**: `"anthropic/claude-3-5-sonnet-20241022"`
- **Anthropic**: `"anthropic/claude-sonnet-4-5-20250929"`
- **OpenAI**: `"openai/computer-use-preview"`
- **UI-TARS**: `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"`
- **Omni + Any LiteLLM**: `"omniparser+litellm/gpt-4o"`, `"omniparser+litellm/claude-3-haiku"`, `"omniparser+ollama_chat/gemma3"`

View File

@@ -6,5 +6,61 @@ title: Tools
The MCP server exposes the following tools to Claude:
1. `run_cua_task` - Run a single Computer-Use Agent task with the given instruction
2. `run_multi_cua_tasks` - Run multiple tasks in sequence
### Core Task Execution Tools
1. **`run_cua_task`** - Run a single Computer-Use Agent task with the given instruction
- `task` (string): The task description for the agent to execute
- `session_id` (string, optional): Session ID for multi-client support. If not provided, a new session will be created
- Returns: Tuple of (combined text output, final screenshot)
2. **`run_multi_cua_tasks`** - Run multiple tasks in sequence or concurrently
- `tasks` (list of strings): List of task descriptions to execute
- `session_id` (string, optional): Session ID for multi-client support. If not provided, a new session will be created
- `concurrent` (boolean, optional): If true, run tasks concurrently. If false, run sequentially (default)
- Returns: List of tuples (combined text output, screenshot) for each task
### Utility Tools
3. **`screenshot_cua`** - Take a screenshot of the current screen
- `session_id` (string, optional): Session ID for multi-client support. If not provided, a new session will be created
- Returns: Screenshot image
4. **`get_session_stats`** - Get statistics about active sessions and resource usage
- Returns: Dictionary with session statistics including total sessions, active tasks, and session details
5. **`cleanup_session`** - Cleanup a specific session and release its resources
- `session_id` (string): The session ID to cleanup
- Returns: Confirmation message
## Session Management
The MCP server supports multi-client sessions with automatic resource management:
- **Session Isolation**: Each client can have its own session with isolated computer instances
- **Resource Pooling**: Computer instances are pooled for efficient resource usage
- **Automatic Cleanup**: Idle sessions are automatically cleaned up after 10 minutes
- **Concurrent Tasks**: Multiple tasks can run concurrently within the same session
- **Progress Reporting**: Real-time progress updates during task execution
## Usage Examples
### Basic Task Execution
```
"Open Chrome and navigate to github.com"
"Create a folder called 'Projects' on my desktop"
```
### Multi-Task Execution
```
"Run these tasks: 1) Open Finder, 2) Navigate to Documents, 3) Create a new folder called 'Work'"
```
### Session Management
```
"Take a screenshot of the current screen"
"Show me the session statistics"
"Cleanup session abc123"
```

View File

@@ -2,7 +2,7 @@
title: Usage
---
## Usage
## Basic Usage
Once configured, you can simply ask Claude to perform computer tasks:
@@ -13,9 +13,157 @@ Once configured, you can simply ask Claude to perform computer tasks:
Claude will automatically use your CUA agent to perform these tasks.
### First-time Usage Notes
## Advanced Features
### Progress Reporting
The MCP server provides real-time progress updates during task execution:
- Task progress is reported as percentages (0-100%)
- Multi-task operations show progress for each individual task
- Progress updates are streamed to the MCP client for real-time feedback
### Error Handling
Robust error handling ensures reliable operation:
- Failed tasks return error messages with screenshots when possible
- Session state is preserved even when individual tasks fail
- Automatic cleanup prevents resource leaks
- Detailed error logging for troubleshooting
### Concurrent Task Execution
For improved performance, multiple tasks can run concurrently:
- Set `concurrent=true` in `run_multi_cua_tasks` for parallel execution (see the sketch after this list)
- Each task runs in its own context with isolated state
- Progress tracking works for both sequential and concurrent modes
- Resource pooling ensures efficient computer instance usage
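As an illustration, the arguments for a concurrent batch map directly onto the documented parameters of `run_multi_cua_tasks`. A sketch, assuming an MCP `ClientSession` named `session` is already connected to the server; the task strings are placeholders:
```python
# Assumes `session` is an initialized mcp.ClientSession connected to the CUA MCP server
result = await session.call_tool(
    "run_multi_cua_tasks",
    arguments={
        "tasks": [
            "Open Chrome",
            "Open Safari",
            "Open Finder",
        ],
        "concurrent": True,  # False (the default) runs the tasks sequentially
    },
)
print(result)
```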
### Session Management
Multi-client support with automatic resource management:
- Each client gets isolated sessions with separate computer instances
- Sessions automatically clean up after 10 minutes of inactivity
- Resource pooling prevents resource exhaustion
- Session statistics available for monitoring
## Target Computer Options
By default, the MCP server runs CUA in a virtual machine for safety. However, you can also configure it to run on your local system.
### Default: Using a VM (Recommended)
The MCP server will automatically start and connect to a VM based on your platform. This is the safest option as AI actions are isolated from your host system.
No additional configuration is needed - this is the default behavior.
### Option: Targeting Your Local Desktop
<Callout type="warn">
**Warning:** When targeting your local system, AI models have direct access to your desktop and
may perform risky actions. Use with caution.
</Callout>
To have the MCP server control your local desktop instead of a VM:
1. **Start the Computer Server on your host:**
```bash
pip install cua-computer-server
python -m computer_server
```
2. **Configure the MCP server to use your host system:**
Add the `CUA_USE_HOST_COMPUTER_SERVER` environment variable to your MCP client configuration:
<Tabs items={['Claude Desktop', 'Other MCP Clients']}>
<Tab value="Claude Desktop">
Update your Claude Desktop config (see [Installation](/docs/libraries/mcp-server/installation)) to include the environment variable:
```json
{
"mcpServers": {
"cua-agent": {
"command": "/bin/bash",
"args": ["~/.cua/start_mcp_server.sh"],
"env": {
"CUA_MODEL_NAME": "anthropic/claude-sonnet-4-5-20250929",
"CUA_USE_HOST_COMPUTER_SERVER": "true"
}
}
}
}
```
</Tab>
<Tab value="Other MCP Clients">
Set the environment variable in your MCP client configuration:
```bash
export CUA_USE_HOST_COMPUTER_SERVER=true
```
Then start your MCP client as usual.
</Tab>
</Tabs>
3. **Restart your MCP client** (e.g., Claude Desktop) to apply the changes.
Now Claude will control your local desktop directly when you ask it to perform computer tasks.
## Usage Examples
### Single Task Execution
```
"Open Safari and navigate to apple.com"
"Create a new folder on the desktop called 'My Projects'"
"Take a screenshot of the current screen"
```
### Multi-Task Execution (Sequential)
```
"Run these tasks in order: 1) Open Finder, 2) Navigate to Documents folder, 3) Create a new folder called 'Work'"
```
### Multi-Task Execution (Concurrent)
```
"Run these tasks simultaneously: 1) Open Chrome, 2) Open Safari, 3) Open Finder"
```
### Session Management
```
"Show me the current session statistics"
"Take a screenshot using session abc123"
"Cleanup session xyz789"
```
### Error Recovery
```
"Try to open a non-existent application and show me the error"
"Find all files with .tmp extension and delete them safely"
```
## First-time Usage Notes
**API Keys**: Ensure you have valid API keys:
- Add your Anthropic API key, or other model provider API key in the Claude Desktop config (as shown above)
- Add your Anthropic API key in the Claude Desktop config (as shown above)
- Or set it as an environment variable in your shell profile
- **Required**: The MCP server needs an API key to authenticate with the model provider
**Model Selection**: Choose the appropriate model for your needs:
- **Claude Sonnet 4**: Latest model with best performance (`anthropic/claude-sonnet-4-20250514`)
- **Computer-Use Preview**: Specialized for computer tasks (`openai/computer-use-preview`)
- **Local Models**: For privacy-sensitive environments
- **Ollama**: For offline usage

View File

@@ -4,11 +4,10 @@
"root": true,
"defaultOpen": true,
"pages": [
"index",
"quickstart-devs",
"quickstart-cli",
"telemetry",
"example-usecases",
"---[Rocket]Get Started---",
"...get-started",
"---[ChefHat]Cookbook---",
"...example-usecases",
"---[BookCopy]Computer Playbook---",
"...computer-sdk",
"---[BookCopy]Agent Playbook---",

View File

@@ -1,343 +0,0 @@
---
title: Quickstart (CLI)
description: Get started with the Cua Agent CLI in 4 steps
icon: Rocket
---
import { Step, Steps } from 'fumadocs-ui/components/steps';
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
import { Accordion, Accordions } from 'fumadocs-ui/components/accordion';
Get up and running with the Cua Agent CLI in 4 simple steps.
<Steps>
<Step>
## Introduction
Cua combines Computer (interface) + Agent (AI) for automating desktop apps. The Agent CLI provides a clean terminal interface to control your remote computer using natural language commands.
</Step>
<Step>
## Set Up Your Computer Environment
Choose how you want to run your Cua computer. **Cloud Sandbox is recommended** for the easiest setup:
<Tabs items={['☁️ Cloud Sandbox (Recommended)', 'Linux on Docker', 'Windows Sandbox', 'macOS VM']}>
<Tab value="☁️ Cloud Sandbox (Recommended)">
**Easiest & safest way to get started - works on any host OS**
1. Go to [cua.ai/signin](https://cua.ai/signin)
2. Navigate to **Dashboard > Containers > Create Instance**
3. Create a **Medium, Ubuntu 22** container
4. Note your container name and API key
Your cloud container will be automatically configured and ready to use.
</Tab>
<Tab value="Linux on Docker">
**Run Linux desktop locally on macOS, Windows, or Linux hosts**
1. Install Docker Desktop or Docker Engine
2. Pull the CUA XFCE container (lightweight desktop)
```bash
docker pull --platform=linux/amd64 trycua/cua-xfce:latest
```
Or use KASM for a full-featured desktop:
```bash
docker pull --platform=linux/amd64 trycua/cua-ubuntu:latest
```
</Tab>
<Tab value="Windows Sandbox">
**Windows hosts only - requires Windows 10 Pro/Enterprise or Windows 11**
1. Enable Windows Sandbox
2. Install pywinsandbox dependency
```bash
pip install -U git+git://github.com/karkason/pywinsandbox.git
```
3. Windows Sandbox will be automatically configured when you run the CLI
</Tab>
<Tab value="macOS VM">
**macOS hosts only - requires Lume CLI**
1. Install lume cli
```bash
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)"
```
2. Start a local Cua macOS VM
```bash
lume run macos-sequoia-cua:latest
```
</Tab>
</Tabs>
</Step>
<Step>
## Install Cua
<Accordions type="single" defaultValue="uv">
<Accordion title="uv (Recommended)" value="uv">
### Install uv
<Tabs items={['macOS / Linux', 'Windows']} persist>
<Tab value="macOS / Linux">
```bash
# Use curl to download the script and execute it with sh:
curl -LsSf https://astral.sh/uv/install.sh | sh
# If your system doesn't have curl, you can use wget:
# wget -qO- https://astral.sh/uv/install.sh | sh
```
</Tab>
<Tab value="Windows">
```powershell
# Use irm to download the script and execute it with iex:
powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
```
</Tab>
</Tabs>
### Install Python 3.12
```bash
uv python install 3.12
# uv will install Cua dependencies automatically when you use --with "cua-agent[cli]"
```
</Accordion>
<Accordion title="conda" value="conda">
### Install conda
<Tabs items={['macOS', 'Linux', 'Windows']} persist>
<Tab value="macOS">
```bash
mkdir -p ~/miniconda3
curl https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh -o ~/miniconda3/miniconda.sh
bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
rm ~/miniconda3/miniconda.sh
source ~/miniconda3/bin/activate
```
</Tab>
<Tab value="Linux">
```bash
mkdir -p ~/miniconda3
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh
bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
rm ~/miniconda3/miniconda.sh
source ~/miniconda3/bin/activate
```
</Tab>
<Tab value="Windows">
```powershell
wget "https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe" -outfile ".\miniconda.exe"
Start-Process -FilePath ".\miniconda.exe" -ArgumentList "/S" -Wait
del .\miniconda.exe
```
</Tab>
</Tabs>
### Create and activate Python 3.12 environment
```bash
conda create -n cua python=3.12
conda activate cua
```
### Install Cua
```bash
pip install "cua-agent[cli]" cua-computer
```
</Accordion>
<Accordion title="pip" value="pip">
### Install Cua
```bash
pip install "cua-agent[cli]" cua-computer
```
</Accordion>
</Accordions>
</Step>
<Step>
## Run Cua CLI
Choose your preferred AI model:
### OpenAI Computer Use Preview
<Tabs items={['uv', 'conda/pip']} persist>
<Tab value="uv">
```bash
uv run --with "cua-agent[cli]" -m agent.cli openai/computer-use-preview
```
</Tab>
<Tab value="conda/pip">
```bash
python -m agent.cli openai/computer-use-preview
```
</Tab>
</Tabs>
### Anthropic Claude
<Tabs items={['uv', 'conda/pip']} persist>
<Tab value="uv">
```bash
uv run --with "cua-agent[cli]" -m agent.cli anthropic/claude-sonnet-4-5-20250929
uv run --with "cua-agent[cli]" -m agent.cli anthropic/claude-opus-4-20250514
uv run --with "cua-agent[cli]" -m agent.cli anthropic/claude-opus-4-1-20250805
uv run --with "cua-agent[cli]" -m agent.cli anthropic/claude-sonnet-4-20250514
uv run --with "cua-agent[cli]" -m agent.cli anthropic/claude-3-5-sonnet-20241022
```
</Tab>
<Tab value="conda/pip">
```bash
python -m agent.cli anthropic/claude-sonnet-4-5-20250929
python -m agent.cli anthropic/claude-opus-4-1-20250805
python -m agent.cli anthropic/claude-opus-4-20250514
python -m agent.cli anthropic/claude-sonnet-4-20250514
python -m agent.cli anthropic/claude-3-5-sonnet-20241022
```
</Tab>
</Tabs>
### Omniparser + LLMs
<Tabs items={['uv', 'conda/pip']} persist>
<Tab value="uv">
```bash
uv run --with "cua-agent[cli]" -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
uv run --with "cua-agent[cli]" -m agent.cli omniparser+openai/gpt-4o
uv run --with "cua-agent[cli]" -m agent.cli omniparser+vertex_ai/gemini-pro
```
</Tab>
<Tab value="conda/pip">
```bash
python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
python -m agent.cli omniparser+openai/gpt-4o
python -m agent.cli omniparser+vertex_ai/gemini-pro
```
</Tab>
</Tabs>
### Local Models
<Tabs items={['uv', 'conda/pip']} persist>
<Tab value="uv">
```bash
# Hugging Face models (local)
uv run --with "cua-agent[cli]" -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
# MLX models (Apple Silicon)
uv run --with "cua-agent[cli]" -m agent.cli mlx/mlx-community/UI-TARS-1.5-7B-6bit
# Ollama models
uv run --with "cua-agent[cli]" -m agent.cli omniparser+ollama_chat/llama3.2:latest
```
</Tab>
<Tab value="conda/pip">
```bash
# Hugging Face models (local)
python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
# MLX models (Apple Silicon)
python -m agent.cli mlx/mlx-community/UI-TARS-1.5-7B-6bit
# Ollama models
python -m agent.cli omniparser+ollama_chat/llama3.2:latest
```
</Tab>
</Tabs>
### Interactive Setup
If you haven't set up environment variables, the CLI will guide you through the setup:
1. **Sandbox Name**: Enter your Cua sandbox name (or get one at [cua.ai](https://cua.ai/))
2. **CUA API Key**: Enter your Cua API key
3. **Provider API Key**: Enter your AI provider API key (OpenAI, Anthropic, etc.)
### Start Chatting
Once connected, you'll see:
```
💻 Connected to your-container-name (model, agent_loop)
Type 'exit' to quit.
>
```
You can ask your agent to perform actions like:
- "Take a screenshot and tell me what's on the screen"
- "Open Firefox and go to github.com"
- "Type 'Hello world' into the terminal"
- "Close the current window"
- "Click on the search button"
</Step>
</Steps>
---
For running models locally, see [Running Models Locally](/agent-sdk/supported-model-providers/local-models).

View File

@@ -1,313 +0,0 @@
---
title: Quickstart
description: Get started with Cua in three steps
icon: Rocket
---
import { Step, Steps } from 'fumadocs-ui/components/steps';
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
This quickstart guides you through setting up your [computer environment](#set-up-your-computer-environment), programmatic control with a [Cua computer](#using-computer), and task automation with a [Cua agent](#using-agent):
<Steps>
<Step>
## Set Up Your Computer Environment
Choose how you want to run your Cua computer. This will be the environment where your automated tasks will execute.
You can run your Cua computer in the cloud (recommended for easiest setup), locally on macOS with Lume, locally on Windows with a Windows Sandbox, or in a Docker container on any platform. Choose the option that matches your system and needs.
<Tabs items={['☁️ Cloud', '🐳 Docker', '🍎 Lume', '🪟 Windows Sandbox']}>
<Tab value="☁️ Cloud">
Cua Cloud Sandbox provides virtual machines that run Ubuntu.
1. Go to [cua.ai/signin](https://cua.ai/signin)
2. Navigate to **Dashboard > Containers > Create Instance**
3. Create a **Medium, Ubuntu 22** sandbox
4. Note your sandbox name and API key
Your Cloud Sandbox will be automatically configured and ready to use.
</Tab>
<Tab value="🍎 Lume">
Lume containers are macOS virtual machines that run on a macOS host machine.
1. Install the Lume CLI:
```bash
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)"
```
2. Start a local Cua sandbox:
```bash
lume run macos-sequoia-cua:latest
```
</Tab>
<Tab value="🪟 Windows Sandbox">
Windows Sandbox provides Windows virtual environments that run on a Windows host machine.
1. Enable [Windows Sandbox](https://learn.microsoft.com/en-us/windows/security/application-security/application-isolation/windows-sandbox/windows-sandbox-install) (requires Windows 10 Pro/Enterprise or Windows 11)
2. Install the `pywinsandbox` dependency:
```bash
pip install -U git+git://github.com/karkason/pywinsandbox.git
```
3. Windows Sandbox will be automatically configured when you run the CLI
</Tab>
<Tab value="🐳 Docker">
Docker provides a way to run Ubuntu containers on any host machine.
1. Install Docker Desktop or Docker Engine:
2. Pull the CUA Ubuntu sandbox:
```bash
docker pull --platform=linux/amd64 trycua/cua-ubuntu:latest
```
</Tab>
</Tabs>
</Step>
<Step>
## Using Computer
Connect to your Cua computer and perform basic interactions, such as taking screenshots or simulating user input.
<Tabs items={['Python', 'TypeScript']}>
<Tab value="Python">
Install the Cua computer Python SDK:
```bash
pip install cua-computer
```
Then, connect to your desired computer environment:
<Tabs items={['☁️ Cloud', '🐳 Docker', '🍎 Lume', '🪟 Windows Sandbox', '🖥️ Host Desktop']}>
<Tab value="☁️ Cloud">
```python
from computer import Computer
computer = Computer(
os_type="linux",
provider_type="cloud",
name="your-sandbox-name",
api_key="your-api-key"
)
await computer.run() # Connect to the sandbox
```
</Tab>
<Tab value="🍎 Lume">
```python
from computer import Computer
computer = Computer(
os_type="macos",
provider_type="lume",
name="macos-sequoia-cua:latest"
)
await computer.run() # Launch & connect to the container
```
</Tab>
<Tab value="🪟 Windows Sandbox">
```python
from computer import Computer
computer = Computer(
os_type="windows",
provider_type="windows_sandbox"
)
await computer.run() # Launch & connect to the container
```
</Tab>
<Tab value="🐳 Docker">
```python
from computer import Computer
computer = Computer(
os_type="linux",
provider_type="docker",
name="trycua/cua-ubuntu:latest"
)
await computer.run() # Launch & connect to the container
```
</Tab>
<Tab value="🖥️ Host Desktop">
Install and run `cua-computer-server`:
```bash
pip install cua-computer-server
python -m computer_server
```
Then, use the `Computer` object to connect:
```python
from computer import Computer
computer = Computer(use_host_computer_server=True)
await computer.run() # Connect to the host desktop
```
</Tab>
</Tabs>
Once connected, you can perform interactions:
```python
try:
# Take a screenshot of the computer's current display
screenshot = await computer.interface.screenshot()
# Simulate a left-click at coordinates (100, 100)
await computer.interface.left_click(100, 100)
# Type "Hello!" into the active application
await computer.interface.type("Hello!")
finally:
await computer.close()
```
</Tab>
<Tab value="TypeScript">
Install the Cua computer TypeScript SDK:
```bash
npm install @trycua/computer
```
Then, connect to your desired computer environment:
<Tabs items={['☁️ Cloud','🐳 Docker', '🍎 Lume', '🪟 Windows Sandbox', '🖥️ Host Desktop']}>
<Tab value="☁️ Cloud">
```typescript
import { Computer, OSType } from '@trycua/computer';
const computer = new Computer({
osType: OSType.LINUX,
name: "your-sandbox-name",
apiKey: "your-api-key"
});
await computer.run(); // Connect to the sandbox
```
</Tab>
<Tab value="🍎 Lume">
```typescript
import { Computer, OSType, ProviderType } from '@trycua/computer';
const computer = new Computer({
osType: OSType.MACOS,
providerType: ProviderType.LUME,
name: "macos-sequoia-cua:latest"
});
await computer.run(); // Launch & connect to the container
```
</Tab>
<Tab value="🪟 Windows Sandbox">
```typescript
import { Computer, OSType, ProviderType } from '@trycua/computer';
const computer = new Computer({
osType: OSType.WINDOWS,
providerType: ProviderType.WINDOWS_SANDBOX
});
await computer.run(); // Launch & connect to the container
```
</Tab>
<Tab value="🐳 Docker">
```typescript
import { Computer, OSType, ProviderType } from '@trycua/computer';
const computer = new Computer({
osType: OSType.LINUX,
providerType: ProviderType.DOCKER,
name: "trycua/cua-ubuntu:latest"
});
await computer.run(); // Launch & connect to the container
```
</Tab>
<Tab value="🖥️ Host Desktop">
First, install and run `cua-computer-server`:
```bash
pip install cua-computer-server
python -m computer_server
```
Then, use the `Computer` object to connect:
```typescript
import { Computer } from '@trycua/computer';
const computer = new Computer({ useHostComputerServer: true });
await computer.run(); // Connect to the host desktop
```
</Tab>
</Tabs>
Once connected, you can perform interactions:
```typescript
try {
// Take a screenshot of the computer's current display
const screenshot = await computer.interface.screenshot();
// Simulate a left-click at coordinates (100, 100)
await computer.interface.leftClick(100, 100);
// Type "Hello!" into the active application
await computer.interface.typeText("Hello!");
} finally {
await computer.close();
}
```
</Tab>
</Tabs>
Learn more about computers in the [Cua computers documentation](/computer-sdk/computers). You will see how to automate computers with agents in the next step.
</Step>
<Step>
## Using Agent
Utilize an Agent to automate complex tasks by providing it with a goal and allowing it to interact with the computer environment.
Install the Cua agent Python SDK:
```bash
pip install "cua-agent[all]"
```
Then, use the `ComputerAgent` object:
```python
from agent import ComputerAgent
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
tools=[computer],
max_trajectory_budget=5.0
)
messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
async for result in agent.run(messages):
for item in result["output"]:
if item["type"] == "message":
print(item["content"][0]["text"])
```
Learn more about agents in [Agent Loops](/agent-sdk/agent-loops) and available models in [Supported Models](/agent-sdk/supported-model-providers/).
</Step>
</Steps>
## Next Steps
- Learn more about [Cua computers](/computer-sdk/computers) and [computer commands](/computer-sdk/commands)
- Read about [Agent loops](/agent-sdk/agent-loops), [tools](/agent-sdk/custom-tools), and [supported model providers](/agent-sdk/supported-model-providers/)
- Join our [Discord community](https://discord.com/invite/mVnXXpdE85) for help
- Try out [Form Filling](/example-usecases/form-filling) preset usecase

View File

@@ -24,6 +24,39 @@ const config = {
basePath: false, // Important: this bypasses the basePath
permanent: false,
},
// Redirect old docs.cua.ai URLs to cua.ai/docs with 301 for SEO
// This handles URLs that Google has indexed from the old domain
{
source: '/:path*',
has: [
{
type: 'host',
value: 'docs.cua.ai',
},
],
destination: 'https://cua.ai/docs/:path*',
permanent: true, // 301 redirect to preserve SEO authority
basePath: false,
},
// Redirects for documentation restructure (PR #568)
// Moved quickstart-devs to get-started section
{
source: '/quickstart-devs',
destination: '/get-started/quickstart',
permanent: true,
},
// Moved telemetry to agent-sdk section
{
source: '/telemetry',
destination: '/agent-sdk/telemetry',
permanent: true,
},
// Removed quickstart-cli, consolidated into main quickstart
{
source: '/quickstart-cli',
destination: '/get-started/quickstart',
permanent: true,
},
];
},
images: {

View File

@@ -9,22 +9,22 @@
"postinstall": "fumadocs-mdx"
},
"dependencies": {
"fumadocs-core": "15.5.1",
"fumadocs-mdx": "11.6.7",
"fumadocs-ui": "15.5.1",
"fumadocs-core": "16.0.8",
"fumadocs-mdx": "13.0.5",
"fumadocs-ui": "16.0.8",
"lucide-react": "^0.525.0",
"mermaid": "^11.8.1",
"next": "15.3.3",
"next": "16.0.1",
"next-themes": "^0.4.6",
"posthog-js": "^1.276.0",
"react": "^19.1.0",
"react-dom": "^19.1.0",
"react": "^19.2.0",
"react-dom": "^19.2.0",
"react-icons": "^5.5.0",
"remark": "^15.0.1",
"remark-gfm": "^4.0.1",
"remark-mdx": "^3.1.0",
"tailwind-merge": "^3.3.1",
"zod": "^3.25.76"
"zod": "^4.1.12"
},
"devDependencies": {
"@tailwindcss/postcss": "^4.1.8",

docs/pnpm-lock.yaml (generated, 1634 changed lines)

File diff suppressed because it is too large.

Binary file not shown (image added, 5.2 MiB).

Binary file not shown (image added, 628 KiB).

View File

@@ -6,12 +6,12 @@ import { z } from 'zod';
export const docs = defineDocs({
docs: {
schema: frontmatterSchema.extend({
macos: z.boolean().optional(),
windows: z.boolean().optional(),
linux: z.boolean().optional(),
pypi: z.string().optional(),
npm: z.string().optional(),
github: z.array(z.string()).optional(),
macos: z.boolean().default(false),
windows: z.boolean().default(false),
linux: z.boolean().default(false),
}),
},
meta: {

View File

@@ -8,15 +8,16 @@ import { cn } from 'fumadocs-ui/utils/cn';
import { ChevronDown, CodeXml, ExternalLink } from 'lucide-react';
import type { Metadata } from 'next';
import Link from 'next/link';
import { notFound, redirect } from 'next/navigation';
import { notFound } from 'next/navigation';
import { PageFeedback } from '@/components/page-feedback';
import { DocActionsMenu } from '@/components/doc-actions-menu';
export default async function Page(props: { params: Promise<{ slug?: string[] }> }) {
const params = await props.params;
const slug = params.slug || [];
const page = source.getPage(slug);
if (!page) notFound(); //redirect('/docs');
if (!page) notFound();
// Detect if this is an API reference page: /api/[section] or /api/[section]/[version]
let apiSection: string | null = null;
@@ -179,9 +180,13 @@ export default async function Page(props: { params: Promise<{ slug?: string[] }>
};
const tocFooter = () => {
// Construct file path from slug
// For root index, use 'index.mdx', otherwise join slug parts
const filePath = slug.length === 0 ? 'index.mdx' : `${slug.join('/')}.mdx`;
return (
<div className="mt-4">
<DocActionsMenu pageUrl={page.url} pageTitle={page.data.title} filePath={page.file.path} />
<DocActionsMenu pageUrl={page.url} pageTitle={page.data.title} filePath={filePath} />
</div>
);
};
@@ -282,9 +287,9 @@ export async function generateMetadata(props: {
const page = source.getPage(params.slug);
if (!page) notFound();
let title = `${page.data.title} | Cua Docs`;
if (page.url.includes('api')) title = `${page.data.title} | Cua API Docs`;
if (page.url.includes('guide')) title = ` Guide: ${page.data.title} | Cua Docs`;
let title = `${page.data.title} | Cua`;
if (page.url.includes('api')) title = `${page.data.title} | Cua API`;
if (page.url.includes('guide')) title = ` Guide: ${page.data.title} | Cua`;
// Canonical URL points to cua.ai to consolidate all SEO authority on main domain
const canonicalUrl = `https://cua.ai${page.url}`;
@@ -368,7 +373,7 @@ export async function generateMetadata(props: {
title,
description: page.data.description,
type: 'article',
siteName: 'Cua Docs',
siteName: 'Cua',
url: canonicalUrl,
},
twitter: {
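
The `tocFooter` change above derives the GitHub file path from the URL slug instead of `page.file.path`. A tiny sketch (illustration only; the example slugs are hypothetical) of what that derivation produces:

function slugToFilePath(slug: string[]): string {
  // Root index maps to index.mdx; everything else joins the slug segments.
  return slug.length === 0 ? 'index.mdx' : `${slug.join('/')}.mdx`;
}

slugToFilePath([]);                          // 'index.mdx'
slugToFilePath(['agent-sdk', 'callbacks']);  // 'agent-sdk/callbacks.mdx'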

View File

@@ -1,3 +1,14 @@
@import 'tailwindcss';
@import 'fumadocs-ui/css/neutral.css';
@import 'fumadocs-ui/css/preset.css';
/* Fix TOC overflow on production builds */
#nd-toc {
overflow-y: auto;
overflow-x: hidden;
}
#nd-toc > div {
overflow-y: auto;
overflow-x: hidden;
}

View File

@@ -34,9 +34,10 @@ export const baseOptions: BaseLayoutProps = {
className="hidden dark:block"
alt="Logo"
/>
Cua Documentation
Cua
</>
),
url: 'https://cua.ai',
},
githubUrl: 'https://github.com/trycua/cua',
links: [

View File

@@ -7,7 +7,7 @@ import posthog from 'posthog-js';
interface DocActionsMenuProps {
pageUrl: string;
pageTitle: string;
filePath: string;
filePath?: string;
}
export function DocActionsMenu({ pageUrl, pageTitle, filePath }: DocActionsMenuProps) {
@@ -15,6 +15,9 @@ export function DocActionsMenu({ pageUrl, pageTitle, filePath }: DocActionsMenuP
const handleCopyMarkdown = async () => {
try {
if (!filePath) {
throw new Error('No file path available');
}
const githubRawUrl = `https://raw.githubusercontent.com/trycua/cua/refs/heads/main/docs/content/docs/${filePath}`;
const response = await fetch(githubRawUrl);
@@ -55,6 +58,9 @@ export function DocActionsMenu({ pageUrl, pageTitle, filePath }: DocActionsMenuP
};
const handleEditGithub = () => {
if (!filePath) {
return;
}
posthog.capture('docs_edit_github_clicked', {
page: pageUrl,
page_title: pageTitle,
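
For reference, a worked example (illustration only, with a hypothetical filePath) of the raw-content URL that handleCopyMarkdown builds once the guard above passes:

const filePath = 'agent-sdk/callbacks.mdx'; // hypothetical
const githubRawUrl = `https://raw.githubusercontent.com/trycua/cua/refs/heads/main/docs/content/docs/${filePath}`;
// -> https://raw.githubusercontent.com/trycua/cua/refs/heads/main/docs/content/docs/agent-sdk/callbacks.mdx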

View File

@@ -56,7 +56,7 @@ export function Footer() {
</li>
<li>
<a
href="/docs/quickstart-devs"
href="/docs/get-started/quickstart"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Quick Start

View File

@@ -0,0 +1,34 @@
export function Hero({ children }: { children: React.ReactNode }) {
return (
<div className="not-prose relative mb-12 overflow-hidden rounded-xl border border-fd-border bg-gradient-to-br from-fd-background via-fd-muted/30 to-fd-muted/50 p-8 shadow-lg md:p-12 lg:p-16">
{/* Background Pattern */}
<div className="pointer-events-none absolute inset-0">
{/* Grid */}
<svg
className="absolute h-full w-full text-fd-foreground"
xmlns="http://www.w3.org/2000/svg"
>
<defs>
<pattern id="hero-grid" width="40" height="40" patternUnits="userSpaceOnUse">
<path
d="M 40 0 L 0 0 0 40"
fill="none"
stroke="currentColor"
strokeWidth="0.5"
opacity="0.1"
/>
</pattern>
</defs>
<rect width="100%" height="100%" fill="url(#hero-grid)" />
</svg>
{/* Subtle glow effects */}
<div className="absolute -right-20 -top-20 h-96 w-96 rounded-full bg-fd-primary/5 blur-3xl" />
<div className="absolute -bottom-32 -left-20 h-96 w-96 rounded-full bg-fd-primary/5 blur-3xl" />
</div>
{/* Content */}
<div className="relative z-10">{children}</div>
</div>
);
}

View File

@@ -12,15 +12,24 @@ const processor = remark()
.use(remarkGfm);
export async function getLLMText(page: InferPageType<typeof source>) {
const processed = await processor.process({
path: page.data._file.absolutePath,
value: page.data.content,
});
const pageData = page.data as any;
const filePath = pageData._file?.absolutePath;
const content = pageData.content || pageData.body || '';
let processed;
if (filePath && typeof content === 'string') {
processed = await processor.process({ path: filePath, value: content });
} else if (typeof content === 'string') {
processed = await processor.process(content);
} else {
// Handle case where content is not available
processed = { value: '' };
}
return `# ${page.data.title}
URL: ${page.url}
${page.data.description}
${page.data.description || ''}
${processed.value}`;
}
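
One way a helper like getLLMText is typically consumed is an llms.txt-style route that concatenates every page. The sketch below is not part of this commit; the import paths and the `source.getPages()` call are assumptions about the fumadocs loader used in this repo.

import { source } from '@/lib/source';           // assumed location of the fumadocs loader
import { getLLMText } from '@/lib/get-llm-text';  // the helper changed above

export const revalidate = false;

export async function GET() {
  const pages = source.getPages();
  const sections = await Promise.all(pages.map((page) => getLLMText(page)));
  return new Response(sections.join('\n\n'), {
    headers: { 'Content-Type': 'text/plain; charset=utf-8' },
  });
}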

View File

@@ -9,6 +9,7 @@ import {
EditableForm,
EditableInput,
} from './components/editable-code-block';
import { Hero } from './components/hero';
// use this function to get MDX components, you will need it for rendering MDX
export function getMDXComponents(components?: MDXComponents): MDXComponents {
@@ -20,6 +21,7 @@ export function getMDXComponents(components?: MDXComponents): MDXComponents {
EditableValue,
EditableForm,
EditableInput,
Hero,
...TabsComponents,
...components,
};

View File

@@ -6,13 +6,19 @@ import { useEffect } from 'react';
import { usePathname, useSearchParams } from 'next/navigation';
if (typeof window !== 'undefined') {
posthog.init(process.env.NEXT_PUBLIC_POSTHOG_API_KEY!, {
api_host: '/docs/api/posthog',
ui_host: process.env.NEXT_PUBLIC_POSTHOG_HOST,
person_profiles: 'always',
capture_pageview: false,
capture_pageleave: true,
});
const apiKey = process.env.NEXT_PUBLIC_POSTHOG_API_KEY;
if (apiKey) {
posthog.init(apiKey, {
api_host: '/docs/api/posthog',
ui_host: process.env.NEXT_PUBLIC_POSTHOG_HOST,
person_profiles: 'always',
capture_pageview: false,
capture_pageleave: true,
});
} else {
console.warn('[PostHog] API key not configured. Analytics will be disabled.');
}
}
export function PHProvider({ children }: { children: React.ReactNode }) {

View File

@@ -13,7 +13,7 @@
"moduleResolution": "bundler",
"resolveJsonModule": true,
"isolatedModules": true,
"jsx": "preserve",
"jsx": "react-jsx",
"incremental": true,
"paths": {
"@/.source": ["./.source/index.ts"],
@@ -25,6 +25,12 @@
}
]
},
"include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
"include": [
"next-env.d.ts",
"**/*.ts",
"**/*.tsx",
".next/types/**/*.ts",
".next/dev/types/**/*.ts"
],
"exclude": ["node_modules"]
}

View File

@@ -45,7 +45,7 @@ async def run_agent_example():
# model="anthropic/claude-opus-4-20250514",
# model="anthropic/claude-sonnet-4-20250514",
# model="anthropic/claude-3-7-sonnet-20250219",
# model="anthropic/claude-3-5-sonnet-20241022",
# model="anthropic/claude-sonnet-4-5-20250929",
# == UI-TARS ==
# model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
# model="mlx/mlx-community/UI-TARS-1.5-7B-6bit",
@@ -53,6 +53,10 @@ async def run_agent_example():
# == Omniparser + Any LLM ==
# model="omniparser+anthropic/claude-opus-4-20250514",
# model="omniparser+ollama_chat/gemma3:12b-it-q4_K_M",
# == Omniparser + Vertex AI Gemini 3 (with thinking_level) ==
# model="omni+vertex_ai/gemini-3-flash",
# thinking_level="high", # or "low"
# media_resolution="medium", # or "low" or "high"
tools=[computer],
only_n_most_recent_images=3,
verbosity=logging.DEBUG,

View File

@@ -9,14 +9,13 @@ from computer.providers.cloud.provider import CloudProvider
async def main() -> None:
api_key = os.getenv("CUA_API_KEY")
if not api_key:
raise RuntimeError("CUA_API_KEY environment variable is not set")
# CloudProvider will automatically read CUA_API_KEY from environment if not provided
# You can still pass api_key explicitly if needed: CloudProvider(api_key="your_key")
api_base = os.getenv("CUA_API_BASE")
if api_base:
print(f"Using API base: {api_base}")
provider = CloudProvider(api_key=api_key, verbose=True)
provider = CloudProvider(verbose=True)
async with provider:
# List all VMs

View File

@@ -34,14 +34,6 @@ This example demonstrates how to control a Cua Cloud Sandbox using the OpenAI `c
- `src/index.ts` — Main example script
- `src/helpers.ts` — Helper for executing actions on the container
## Further Reading
For a step-by-step tutorial and more detailed explanation, see the accompanying blog post:
➡️ [Controlling a Cua Cloud Sandbox with JavaScript](https://placeholder-url-to-blog-post.com)
_(This link will be updated once the article is published.)_
---
If you have questions or issues, please open an issue or contact the maintainers.

View File

@@ -58,7 +58,7 @@ To get set up with Lume for development, read [these instructions](Development.m
- [Installation](https://cua.ai/docs/libraries/lume/installation)
- [Prebuilt Images](https://cua.ai/docs/libraries/lume/prebuilt-images)
- [CLI Reference](https://cua.ai/docs/libraries/lume/cli-reference)
- [HTTP API](https://cuai.ai/docs/libraries/lume/http-api)
- [HTTP API](https://cua.ai/docs/libraries/lume/http-api)
- [FAQ](https://cua.ai/docs/libraries/lume/faq)
## Contributing

View File

@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.4.37
current_version = 0.5.1
commit = True
tag = True
tag_name = agent-v{new_version}

View File

@@ -51,7 +51,7 @@ async def main():
# Create agent
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
tools=[computer],
only_n_most_recent_images=3,
trajectory_dir="trajectories",
@@ -78,7 +78,7 @@ if __name__ == "__main__":
- [Chat History](https://cua.ai/docs/agent-sdk/chat-history)
- [Callbacks](https://cua.ai/docs/agent-sdk/callbacks)
- [Custom Tools](https://cua.ai/docs/agent-sdk/custom-tools)
- [Custom Computer Handlers](https://cua.ai/docs/agent-sdk/custom-computer-handlers)
- [Custom Computer Handlers](https://cua.ai/docs/computer-sdk/custom-computer-handlers)
- [Prompt Caching](https://cua.ai/docs/agent-sdk/prompt-caching)
- [Usage Tracking](https://cua.ai/docs/agent-sdk/usage-tracking)
- [Benchmarks](https://cua.ai/docs/agent-sdk/benchmarks)

Some files were not shown because too many files have changed in this diff.