Mirror of https://github.com/trycua/computer.git (synced 2026-01-01 11:00:31 -06:00)

Commit: Merge branch 'main' into feat/cua-bench-submodules
.github/workflows/bump-version.yml (vendored, 45 lines changed)

@@ -1,4 +1,4 @@
-name: Bump Version
+name: Bump Version & Publish

 on:
   workflow_dispatch:
@@ -30,6 +30,9 @@ permissions:
 jobs:
   bump-version:
     runs-on: ubuntu-latest
+    outputs:
+      agent_version: ${{ steps.agent_version.outputs.version }}
+      computer_version: ${{ steps.computer_version.outputs.version }}
     steps:
       - name: Set package directory
         id: package
@@ -86,6 +89,46 @@ jobs:
           cd ${{ steps.package.outputs.directory }}
           bump2version ${{ inputs.bump_type }}

+      - name: Also bump cua-agent
+        if: ${{ inputs.service == 'cua-computer' }}
+        run: |
+          cd libs/python/agent
+          bump2version ${{ inputs.bump_type }}
+
+      - name: Capture bumped agent version
+        if: ${{ inputs.service == 'cua-agent' || inputs.service == 'cua-computer' }}
+        id: agent_version
+        run: |
+          cd libs/python/agent
+          VERSION=$(python -c "import tomllib; from pathlib import Path; data = tomllib.loads(Path('pyproject.toml').read_text()); print(data['project']['version'])")
+          echo "Agent version: $VERSION"
+          echo "version=$VERSION" >> "$GITHUB_OUTPUT"
+
+      - name: Capture bumped computer version
+        if: ${{ inputs.service == 'cua-computer' }}
+        id: computer_version
+        run: |
+          cd libs/python/computer
+          VERSION=$(python -c "import tomllib; from pathlib import Path; data = tomllib.loads(Path('pyproject.toml').read_text()); print(data['project']['version'])")
+          echo "Computer version: $VERSION"
+          echo "version=$VERSION" >> "$GITHUB_OUTPUT"
+
       - name: Push changes
         run: |
           git push origin main --follow-tags
+
+  publish-computer:
+    needs: bump-version
+    if: ${{ inputs.service == 'cua-computer' }}
+    uses: ./.github/workflows/pypi-publish-computer.yml
+    with:
+      version: ${{ needs.bump-version.outputs.computer_version }}
+    secrets: inherit
+
+  publish-agent:
+    needs: [bump-version, publish-computer]
+    if: ${{ always() && (inputs.service == 'cua-agent' || inputs.service == 'cua-computer') && needs.bump-version.result == 'success' && (inputs.service == 'cua-agent' || needs.publish-computer.result == 'success') }}
+    uses: ./.github/workflows/pypi-publish-agent.yml
+    with:
+      version: ${{ needs.bump-version.outputs.agent_version }}
+    secrets: inherit
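The two "Capture bumped ... version" steps rely on the same inline Python one-liner. Expanded for readability, it is equivalent to this standalone snippet (`tomllib` is stdlib as of Python 3.11):

```python
# Equivalent of the workflow's inline one-liner: read the package version
# from pyproject.toml (requires Python 3.11+ for tomllib)
import tomllib
from pathlib import Path

data = tomllib.loads(Path("pyproject.toml").read_text())
print(data["project"]["version"])  # the workflow appends this to $GITHUB_OUTPUT
```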
.github/workflows/ci-lume.yml (vendored, 8 lines changed)

@@ -3,7 +3,13 @@ on:
   push:
     branches:
       - "main"
-  pull_request: {}
+    paths:
+      - "libs/lume/**"
+      - ".github/workflows/ci-lume.yml"
+  pull_request:
+    paths:
+      - "libs/lume/**"
+      - ".github/workflows/ci-lume.yml"

 concurrency:
   group: lume-${{ github.workflow }}-${{ github.ref }}
.github/workflows/link-check.yml (vendored, new file, 74 lines)

name: Link Checker

on:
  pull_request_target:
    branches: [main, master]
  push:
    branches:
      - main
  workflow_dispatch:

jobs:
  link-check:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Run Lychee link checker
        uses: lycheeverse/lychee-action@v2
        id: lychee
        with:
          # Check all markdown files
          args: --verbose --no-progress --max-cache-age 1d --accept 200..=299,403 --exclude '^file://' --exclude 'localhost' --exclude '127\.0\.0\.1' '**/*.md'
          # Output results to file for parsing
          output: lychee-output.md
          # Don't fail the build on broken links (warning mode)
          fail: false
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Parse link check results
        id: parse-results
        if: always()
        run: |
          # Use lychee exit code: 0 = success, >0 = errors found
          EXIT_CODE="${{ steps.lychee.outputs.exit_code }}"

          echo "Exit code: $EXIT_CODE"

          # Show summary if output file exists
          if [ -f "lychee-output.md" ]; then
            echo "=== Link Check Summary ==="
            cat lychee-output.md
          fi

          # Set status based on exit code
          if [ "$EXIT_CODE" = "0" ]; then
            echo "STATUS_ICON=✅" >> $GITHUB_ENV
            echo "STATUS_TEXT=All links are working" >> $GITHUB_ENV
            echo "COLOR=#36a64f" >> $GITHUB_ENV
          elif [ "$EXIT_CODE" = "2" ]; then
            echo "STATUS_ICON=❌" >> $GITHUB_ENV
            echo "STATUS_TEXT=Link checker failed to run" >> $GITHUB_ENV
            echo "COLOR=#dc3545" >> $GITHUB_ENV
          else
            echo "STATUS_ICON=⚠️" >> $GITHUB_ENV
            echo "STATUS_TEXT=Found broken links" >> $GITHUB_ENV
            echo "COLOR=#ffa500" >> $GITHUB_ENV
          fi

      - name: Send results to Slack
        if: always() && github.ref == 'refs/heads/main'
        uses: rtCamp/action-slack-notify@v2
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
          SLACK_CHANNEL: ${{ vars.SLACK_CHANNEL }}
          SLACK_TITLE: "🔗 Link Check Results"
          SLACK_COLOR: ${{ env.COLOR }}
          SLACK_MESSAGE: |
            *Status:* ${{ env.STATUS_ICON }} ${{ env.STATUS_TEXT }}

            *Branch:* `${{ github.ref_name }}`

            <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}${{ github.event.pull_request.number && format('?pr={0}', github.event.pull_request.number) || '' }}|View broken links>
.github/workflows/npm-publish-cli.yml (vendored, new file, 212 lines)

name: Publish @trycua/cli

on:
  workflow_dispatch:
    inputs:
      version:
        description: "Version to publish (default: from package.json)"
        required: false
        default: ""

jobs:
  build-and-publish:
    permissions:
      id-token: write
      contents: write
      packages: write

    strategy:
      matrix:
        include:
          - target: bun-linux-x64
            ext: ""
            binary_name: cua-linux-x64
          - target: bun-darwin-x64
            ext: ""
            binary_name: cua-darwin-x64
          - target: bun-darwin-arm64
            ext: ""
            binary_name: cua-darwin-arm64
          - target: bun-windows-x64
            ext: ".exe"
            binary_name: cua-windows-x64

    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: latest

      - name: Get version
        id: version
        run: |
          if [ -n "${{ github.event.inputs.version }}" ]; then
            echo "version=${{ github.event.inputs.version }}" >> $GITHUB_OUTPUT
          else
            VERSION=$(bun -p "require('./libs/typescript/cua-cli/package.json').version")
            echo "version=${VERSION}" >> $GITHUB_OUTPUT
          fi

      - name: Install dependencies
        working-directory: ./libs/typescript/cua-cli
        run: bun install --frozen-lockfile

      - name: Build binary
        working-directory: ./libs/typescript/cua-cli
        run: |
          bun build --compile --minify --sourcemap --target=${{ matrix.target }} index.ts --outfile ${{ matrix.binary_name }}${{ matrix.ext }}
          mkdir -p ../../../dist
          mv ${{ matrix.binary_name }}${{ matrix.ext }}* ../../../dist/

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: cua-binary-${{ matrix.target }}
          path: dist/
          if-no-files-found: error
          retention-days: 1

  publish-npm:
    needs: build-and-publish
    if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/cua-v')
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: latest

      - name: Install dependencies
        working-directory: ./libs/typescript/cua-cli
        run: bun install --frozen-lockfile

      - name: Publish to npm
        working-directory: ./libs/typescript/cua-cli
        env:
          NPM_CONFIG_TOKEN: ${{ secrets.NPM_TOKEN }}
        run: bun publish --production --access public --tolerate-republish

  create-release:
    needs: [build-and-publish, publish-npm]
    runs-on: ubuntu-latest
    permissions:
      contents: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: latest

      - name: Get version
        id: version
        run: |
          VERSION=$(bun -p "require('./libs/typescript/cua-cli/package.json').version")
          echo "version=${VERSION}" >> $GITHUB_OUTPUT
          echo "tag=cua-v${VERSION}" >> $GITHUB_OUTPUT

      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          path: dist
          merge-multiple: true

      - name: Create Release
        id: create_release
        uses: actions/create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          tag_name: ${{ steps.version.outputs.tag }}
          release_name: cua-cli v${{ steps.version.outputs.version }}
          body: |
            # cua-cli v${{ steps.version.outputs.version }}

            ## Installation

            ### Using install script (recommended)
            ```bash
            # For Linux/macOS
            curl -fsSL https://cua.ai/cli/install.sh | sh

            # For Windows (PowerShell)
            irm https://cua.ai/cli/install.ps1 | iex
            ```

            ### Using npm/bun
            ```bash
            # Using bun
            bun add -g @trycua/cli

            # Or using npm
            npm install -g @trycua/cli
            ```

            ### From source
            ```bash
            git clone -b ${{ steps.version.outputs.tag }} https://github.com/trycua/cua.git
            cd cua/libs/typescript/cua-cli
            bun install
            bun link
            bun link cua-cli
            ```

            ## Release Assets
            - `cua-darwin-arm64`: macOS (Apple Silicon)
            - `cua-darwin-x64`: macOS (Intel)
            - `cua-linux-x64`: Linux (x86_64)
            - `cua-windows-x64.exe`: Windows (x86_64)
          draft: false
          prerelease: false

      - name: Upload Linux Binary
        uses: actions/upload-release-asset@v1
        with:
          upload_url: ${{ steps.create_release.outputs.upload_url }}
          asset_path: ./dist/cua-linux-x64
          asset_name: cua-linux-x64
          asset_content_type: application/octet-stream
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Upload macOS Intel Binary
        uses: actions/upload-release-asset@v1
        with:
          upload_url: ${{ steps.create_release.outputs.upload_url }}
          asset_path: ./dist/cua-darwin-x64
          asset_name: cua-darwin-x64
          asset_content_type: application/octet-stream
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Upload macOS Apple Silicon Binary
        uses: actions/upload-release-asset@v1
        with:
          upload_url: ${{ steps.create_release.outputs.upload_url }}
          asset_path: ./dist/cua-darwin-arm64
          asset_name: cua-darwin-arm64
          asset_content_type: application/octet-stream
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Upload Windows Binary
        uses: actions/upload-release-asset@v1
        with:
          upload_url: ${{ steps.create_release.outputs.upload_url }}
          asset_path: ./dist/cua-windows-x64.exe
          asset_name: cua-windows-x64.exe
          asset_content_type: application/octet-stream
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
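For reference, the "Get version" step's Bun one-liner and the `cua-vX.Y.Z` tag derivation in `create-release` boil down to the following (an illustrative Python equivalent, not what the workflow actually runs):

```python
# Illustrative Python equivalent of the workflow's version/tag derivation
import json
from pathlib import Path

pkg = json.loads(Path("libs/typescript/cua-cli/package.json").read_text())
version = pkg["version"]
tag = f"cua-v{version}"  # release tag format used by create-release
print(version, tag)
```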
.github/workflows/pypi-publish-agent.yml (vendored, 27 lines changed)

@@ -31,26 +31,39 @@ jobs:
       core_version: ${{ steps.update-deps.outputs.core_version }}
     steps:
       - uses: actions/checkout@v4
         with:
           ref: main
           fetch-depth: 0

+      - name: Ensure latest main branch
+        run: |
+          git fetch origin main
+          git reset --hard origin/main
+          echo "Current HEAD commit:"
+          git log -1 --oneline
+
       - name: Determine version
         id: get-version
         run: |
-          if [ "${{ github.event_name }}" == "push" ]; then
+          # Check inputs.version first (works for workflow_call regardless of event_name)
+          if [ -n "${{ inputs.version }}" ]; then
+            VERSION=${{ inputs.version }}
+          elif [ "${{ github.event_name }}" == "push" ]; then
             # Extract version from tag (for package-specific tags)
             if [[ "${{ github.ref }}" =~ ^refs/tags/agent-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then
               VERSION=${BASH_REMATCH[1]}
             else
-              echo "Invalid tag format for agent"
+              echo "ERROR: Invalid tag format for agent"
               exit 1
             fi
-          elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
-            # Use version from workflow dispatch
+          elif [ -n "${{ github.event.inputs.version }}" ]; then
             VERSION=${{ github.event.inputs.version }}
           else
-            # Use version from workflow_call
-            VERSION=${{ inputs.version }}
+            echo "ERROR: No version found (inputs.version, event.inputs.version, and tag all empty)"
+            exit 1
           fi
+          echo "VERSION=$VERSION"

           echo "Agent version: $VERSION"
           echo "version=$VERSION" >> $GITHUB_OUTPUT

       - name: Set up Python
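The reworked step resolves the version in a fixed order: an explicit `inputs.version` (workflow_call) wins, then an `agent-vX.Y.Z` tag on a push, then the manual dispatch input, and otherwise the job fails. A condensed Python sketch of that precedence (the workflow itself keys off `github.event_name` in bash):

```python
# Sketch of the version-resolution precedence in the "Determine version" step
import re

def resolve_version(inputs_version: str, ref: str, dispatch_version: str) -> str:
    if inputs_version:  # workflow_call input wins
        return inputs_version
    m = re.match(r"^refs/tags/agent-v(\d+\.\d+\.\d+)", ref)
    if ref.startswith("refs/tags/") and not m:
        raise SystemExit("ERROR: Invalid tag format for agent")
    if m:  # version extracted from an agent-vX.Y.Z tag push
        return m.group(1)
    if dispatch_version:  # manual workflow_dispatch input
        return dispatch_version
    raise SystemExit("ERROR: No version found")
```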
.github/workflows/pypi-publish-computer.yml (vendored, 28 lines changed)

@@ -33,21 +33,39 @@ jobs:
       - name: Determine version
         id: get-version
         run: |
-          if [ "${{ github.event_name }}" == "push" ]; then
+          echo "=== Version Detection Debug ==="
+          echo "Event name: ${{ github.event_name }}"
+          echo "Workflow call version: ${{ inputs.version }}"
+          echo "Workflow dispatch version: ${{ github.event.inputs.version }}"
+          echo "GitHub ref: ${{ github.ref }}"
+
+          # Check inputs.version first (works for workflow_call regardless of event_name)
+          if [ -n "${{ inputs.version }}" ]; then
+            # Version provided via workflow_call or workflow_dispatch with version input
+            VERSION=${{ inputs.version }}
+            echo "Using inputs.version: $VERSION"
+          elif [ "${{ github.event_name }}" == "push" ]; then
             # Extract version from tag (for package-specific tags)
             if [[ "${{ github.ref }}" =~ ^refs/tags/computer-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then
               VERSION=${BASH_REMATCH[1]}
+              echo "Extracted from tag: $VERSION"
             else
               echo "Invalid tag format for computer"
               exit 1
             fi
-          elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
-            # Use version from workflow dispatch
+          elif [ -n "${{ github.event.inputs.version }}" ]; then
+            # Use version from workflow_dispatch event inputs
             VERSION=${{ github.event.inputs.version }}
+            echo "Using event.inputs.version: $VERSION"
           else
-            # Use version from workflow_call
-            VERSION=${{ inputs.version }}
+            echo "ERROR: No version found!"
+            echo "  - inputs.version is empty"
+            echo "  - event.inputs.version is empty"
+            echo "  - Not a tag push event"
+            exit 1
           fi

+          echo "=== Final Version ==="
+          echo "VERSION=$VERSION"
           echo "version=$VERSION" >> $GITHUB_OUTPUT
.github/workflows/pypi-reusable-publish.yml (vendored, 10 lines changed)

@@ -47,8 +47,16 @@ jobs:
     steps:
       - uses: actions/checkout@v4
        with:
          ref: main
          fetch-depth: 0 # Full history for release creation

+      - name: Ensure latest main branch
+        run: |
+          git fetch origin main
+          git reset --hard origin/main
+          echo "Current HEAD commit:"
+          git log -1 --oneline
+
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
@@ -78,7 +86,7 @@ jobs:

           # Verify version matches using script (exits with error if mismatch)
           python ${GITHUB_WORKSPACE}/.github/scripts/get_pyproject_version.py \
-            ${{ inputs.package_dir }}/pyproject.toml \
+            ${GITHUB_WORKSPACE}/${{ inputs.package_dir }}/pyproject.toml \
             ${{ inputs.version }}

       - name: Initialize PDM in package directory
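The diff references `.github/scripts/get_pyproject_version.py` but does not include it. Based on the comment ("exits with error if mismatch"), a hypothetical sketch of what such a script does; the actual script may differ:

```python
# Hypothetical sketch of .github/scripts/get_pyproject_version.py (assumed
# behavior: compare pyproject.toml's version against an expected value and
# exit non-zero on mismatch; the real script may differ)
import sys
import tomllib
from pathlib import Path

pyproject_path, expected = sys.argv[1], sys.argv[2]
actual = tomllib.loads(Path(pyproject_path).read_text())["project"]["version"]
if actual != expected:
    sys.exit(f"Version mismatch: pyproject.toml has {actual}, expected {expected}")
print(actual)
```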
.github/workflows/python-tests.yml (vendored, 11 lines changed)

@@ -49,8 +49,15 @@ jobs:
           # Install the package in editable mode with dev dependencies
           if [ -f pyproject.toml ]; then
             uv pip install --system -e .
-            # Install test dependencies
-            uv pip install --system pytest pytest-asyncio pytest-mock pytest-cov
           fi
         shell: bash

+      - name: Install test dependencies
+        run: |
+          # Install test dependencies from root pyproject.toml if tests directory exists
+          # The root pyproject.toml has package=false, so we install just the dependency group
+          if [ -d "libs/python/${{ matrix.package }}/tests" ]; then
+            uv pip install --system --group test
+          fi
+        shell: bash
.github/workflows/test-cua-models.yml (vendored, 20 lines changed)

@@ -4,8 +4,6 @@ name: Test CUA Supporting Models
 # Run manually using workflow_dispatch with test_models=true

 on:
-  pull_request_target:
-    branches: [main, master]
   workflow_dispatch:
     inputs:
       test_models:
@@ -20,7 +18,7 @@ on:
 jobs:
   # Test all CUA models - runs on PRs, schedules, or when manually triggered
   test-all-models:
-    if: ${{ github.event_name == 'pull_request_target' || github.event_name == 'schedule' || fromJSON(inputs.test_models || 'false') }}
+    if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || fromJSON(inputs.test_models || 'false') }}
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
@@ -42,13 +40,13 @@ jobs:
           - gemini-2.5-computer-use-preview-10-2025

           # InternVL
-          - huggingface-local/OpenGVLab/InternVL3_5-1B
+          # - huggingface-local/OpenGVLab/InternVL3_5-1B
           # - huggingface-local/OpenGVLab/InternVL3_5-2B
           # - huggingface-local/OpenGVLab/InternVL3_5-4B
           # - huggingface-local/OpenGVLab/InternVL3_5-8B

           # UI-TARS (supports full computer-use, can run standalone)
-          - huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
+          # - huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B

           # Note: OpenCUA, GTA, and Holo are grounding-only models
           # They only support predict_click(), not agent.run()
@@ -56,7 +54,7 @@ jobs:

           # Moondream (typically used in composed agents)
           # Format: moondream3+{any-llm-with-tools}
-          - moondream3+anthropic/claude-sonnet-4-5-20250929 # Claude has VLM + Tools
+          # - moondream3+anthropic/claude-sonnet-4-5-20250929 # Claude has VLM + Tools
           # - moondream3+openai/gpt-4o # GPT-4o has VLM + Tools

           # OmniParser (typically used in composed agents)
@@ -68,9 +66,9 @@ jobs:
           # Format: {grounding-model}+{any-vlm-with-tools}
           # These grounding-only models (OpenCUA, GTA, Holo) must be used in composed form
           # since they only support predict_click(), not full agent.run()
-          - huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929
-          - huggingface-local/xlangai/OpenCUA-7B+anthropic/claude-sonnet-4-5-20250929
-          - huggingface-local/Hcompany/Holo1.5-3B+anthropic/claude-sonnet-4-5-20250929
+          # - huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929
+          # - huggingface-local/xlangai/OpenCUA-7B+anthropic/claude-sonnet-4-5-20250929
+          # - huggingface-local/Hcompany/Holo1.5-3B+anthropic/claude-sonnet-4-5-20250929

     steps:
       - name: Checkout repository
@@ -219,6 +217,7 @@ jobs:
           path: |
             tests/agent_loop_testing/test_images/
             *.log
+          if-no-files-found: ignore
           retention-days: 7

       - name: Upload test summary data
@@ -228,6 +227,7 @@ jobs:
           # Unique, slash-free artifact name per matrix entry
           name: test-summary-${{ env.SAFE_MODEL_NAME }}
           path: test_summary/
+          if-no-files-found: ignore
           retention-days: 1

       - name: Set default Slack color
@@ -248,7 +248,7 @@ jobs:

   # Summary job that aggregates all model test results
   test-summary:
-    if: ${{ always() && (github.event_name == 'pull_request_target' || github.event_name == 'schedule' || fromJSON(inputs.test_models || 'false')) }}
+    if: ${{ always() && (github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || fromJSON(inputs.test_models || 'false')) }}
     needs: test-all-models
     runs-on: ubuntu-latest
     steps:
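The commented-out matrix entries use the composed-agent format the workflow comments describe (`{grounding-model}+{any-vlm-with-tools}`). This is the same model-string format the README demonstrates with `ComputerAgent`; for example, using one of the matrix entries:

```python
from agent import ComputerAgent

# A composed matrix entry: Moondream3 for grounding + Claude for planning/tools
agent = ComputerAgent(model="moondream3+anthropic/claude-sonnet-4-5-20250929")
```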
(additional file, name not captured)

@@ -29,4 +29,7 @@ venv/
 *.db
 *.sqlite
 pnpm-lock.yaml
 uv.lock
+
+# Docs with complex JSX formatting
+docs/content/docs/get-started/quickstart.mdx
(additional file, name not captured)

@@ -376,6 +376,61 @@ All packages are managed through a single consolidated workflow: [Bump Version](
 5. Click "Run workflow" to start the version bump
 6. The workflow will automatically commit changes and push to main

+## Releasing a New CLI Version
+
+To release a new version of the CUA CLI, follow these steps:
+
+### 1. Update the Version
+
+1. Update the version in `libs/typescript/cua-cli/package.json`
+2. Commit the version change with a message like "Bump version to x.y.z"
+3. Push the changes to the main branch
+
+### 2. Trigger the Release Workflow
+
+1. Go to the GitHub Actions tab in the repository
+2. Select the "Publish @trycua/cli" workflow
+3. Click "Run workflow"
+4. Optionally, specify a version (e.g., "1.2.3") or leave empty to use the version from package.json
+5. Click "Run workflow"
+
+The workflow will:
+
+- Build single-file executables for all supported platforms
+- Publish the package to npm
+- Create a GitHub release with the version tag (format: `cua-vX.Y.Z`)
+- Attach all platform-specific binaries to the release
+
+### 3. Verify the Release
+
+1. Check the GitHub Releases page to ensure the new version is published
+2. Verify the npm package was published to the registry
+3. Test installation on different platforms:
+
+```bash
+# Test Linux/macOS installation
+curl -fsSL https://cua.ai/install.sh | sh
+
+# Test Windows installation (PowerShell)
+irm https://cua.ai/install.ps1 | iex
+```
+
+### 4. Update Documentation
+
+Update any relevant documentation with the new version number, including:
+
+- Example code in documentation
+- Any version-specific instructions
+- Compatibility matrices
+
+### 5. Announce the Release
+
+- Create a new GitHub release with release notes
+- Update the changelog if maintained separately
+- Announce in relevant channels (Slack, Discord, etc.)
+
+---
+
 ### Rolling Back a Version Bump

 If you need to revert a version bump, follow these steps:
README.md (110 lines changed)

@@ -6,15 +6,17 @@
 </picture>

 [](#)
 [](#)
 [](#)
 [](https://discord.com/invite/mVnXXpdE85)
 <br>
 <a href="https://trendshift.io/repositories/13685" target="_blank"><img src="https://trendshift.io/api/badge/repositories/13685" alt="trycua%2Fcua | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>

 </div>

-**Cua** ("koo-ah") is Docker for [Computer-Use Agents](https://www.oneusefulthing.org/p/when-you-give-a-claude-a-mouse) - it enables AI agents to control full operating systems in virtual containers and deploy them locally or to the cloud.
+**Cua** ("koo-ah") is an open-source framework for Computer-Use Agents - enabling AI systems to autonomously operate computers through visual understanding and action execution. Used for research, evaluation, and production deployment of desktop, browser, and mobile automation agents.

 ## What are Computer-Use Agents?

 Computer-Use Agents (CUAs) are AI systems that can autonomously interact with computer interfaces through visual understanding and action execution. Unlike traditional automation tools that rely on brittle selectors or APIs, CUAs use vision-language models to perceive screen content and reason about interface interactions - enabling them to adapt to UI changes and handle complex, multi-step workflows across applications.

 <div align="center">
   <video src="https://github.com/user-attachments/assets/c619b4ea-bb8e-4382-860e-f3757e36af20" width="600" controls></video>
@@ -22,14 +24,14 @@

 With the [Computer SDK](#computer-sdk), you can:

-- automate Windows, Linux, and macOS VMs with a consistent, [pyautogui-like API](https://cua.ai/docs/docs/libraries/computer#interface-actions)
-- create & manage VMs [locally](https://cua.ai/docs/docs/computer-sdk/computers#cua-local-containers) or using [Cua cloud](https://www.cua.ai/)
+- automate Windows, Linux, and macOS VMs with a consistent, [pyautogui-like API](https://cua.ai/docs/computer-sdk/commands)
+- create & manage VMs [locally](https://cua.ai/docs/quickstart-devs#using-computer) or using [Cua cloud](https://www.cua.ai/)

 With the [Agent SDK](#agent-sdk), you can:

-- run computer-use models with a [consistent schema](https://cua.ai/docs/docs/agent-sdk/message-format)
-- benchmark on OSWorld-Verified, SheetBench-V2, and more [with a single line of code using HUD](https://cua.ai/docs/docs/agent-sdk/integrations/hud) ([Notebook](https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb))
-- combine UI grounding models with any LLM using [composed agents](https://cua.ai/docs/docs/agent-sdk/supported-agents/composed-agents)
+- run computer-use models with a [consistent schema](https://cua.ai/docs/agent-sdk/message-format)
+- benchmark on OSWorld-Verified (369 tasks), SheetBench-V2, and ScreenSpot [with a single line of code using HUD](https://cua.ai/docs/agent-sdk/integrations/hud) - see [benchmark results](#research--benchmarks) ([Notebook](https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb))
+- combine UI grounding models with any LLM using [composed agents](https://cua.ai/docs/agent-sdk/supported-agents/composed-agents)
 - use new UI agent models and UI grounding models from the Model Zoo below with just a model string (e.g., `ComputerAgent(model="openai/computer-use-preview")`)
 - use API or local inference by changing a prefix (e.g., `openai/`, `openrouter/`, `ollama/`, `huggingface-local/`, `mlx/`, [etc.](https://docs.litellm.ai/docs/providers))
@@ -96,8 +98,8 @@ Core utilities for Cua
 # Quick Start

 - [Clone a starter template and run the code in <1 min](https://github.com/trycua/agent-template)
-- [Get started with the Cua SDKs](https://cua.ai/docs/docs/quickstart-devs)
-- [Get started with the Cua CLI](https://cua.ai/docs/docs/quickstart-cli)
+- [Get started with the Cua SDKs](https://cua.ai/docs/quickstart-devs)
+- [Get started with the Cua CLI](https://cua.ai/docs/quickstart-cli)

 # Agent SDK
@@ -115,7 +117,7 @@ from agent import ComputerAgent
 # ComputerAgent works with any computer initialized with the Computer SDK

 agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-5-20250929",
     tools=[computer],
     max_trajectory_budget=5.0
 )
@@ -194,12 +196,12 @@ Cua uses the OpenAI Agent response format.

 These are the valid model configurations for `ComputerAgent(model="...")`:

-| Configuration                            | Description                                                                                                                                      |
-| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `{computer-use-model}`                   | A single model to perform all computer-use tasks                                                                                                  |
-| `{grounding-model}+{any-vlm-with-tools}` | [Composed](https://cua.ai/docs/docs/agent-sdk/supported-agents/composed-agents) with VLM for captioning and grounding LLM for element detection  |
-| `moondream3+{any-llm-with-tools}`        | [Composed](https://cua.ai/docs/docs/agent-sdk/supported-agents/composed-agents) with Moondream3 for captioning and UI element detection          |
-| `human/human`                            | A [human-in-the-loop](https://cua.ai/docs/docs/agent-sdk/supported-agents/human-in-the-loop) in place of a model                                 |
+| Configuration                            | Description                                                                                                                                 |
+| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `{computer-use-model}`                   | A single model to perform all computer-use tasks                                                                                             |
+| `{grounding-model}+{any-vlm-with-tools}` | [Composed](https://cua.ai/docs/agent-sdk/supported-agents/composed-agents) with VLM for captioning and grounding LLM for element detection  |
+| `moondream3+{any-llm-with-tools}`        | [Composed](https://cua.ai/docs/agent-sdk/supported-agents/composed-agents) with Moondream3 for captioning and UI element detection          |
+| `human/human`                            | A [human-in-the-loop](https://cua.ai/docs/agent-sdk/supported-agents/human-in-the-loop) in place of a model                                 |

 ### Model Capabilities
@@ -209,16 +211,46 @@ The following table shows which capabilities are supported by each model:
 | -------------------------------------------------------------------------------------------------------------------------------- | :----------: | :-------: | :---: | :-: |
 | [Claude Sonnet/Haiku](https://docs.claude.com/en/docs/agents-and-tools/tool-use/computer-use-tool#how-to-implement-computer-use)  |      🖥️      |    🎯     |  🛠️   | 👁️  |
 | [OpenAI CU Preview](https://platform.openai.com/docs/models/computer-use-preview)                                                 |      🖥️      |    🎯     |       | 👁️  |
 | [Qwen3 VL](https://huggingface.co/collections/Qwen/qwen3-vl)                                                                      |      🖥️      |    🎯     |  🛠️   | 👁️  |
 | [GLM-V](https://huggingface.co/THUDM/glm-4v-9b)                                                                                   |      🖥️      |    🎯     |  🛠️   | 👁️  |
 | [Gemini CU Preview](https://ai.google.dev/gemini-api/docs/computer-use)                                                           |      🖥️      |    🎯     |       | 👁️  |
 | [InternVL](https://huggingface.co/OpenGVLab/InternVL3_5-1B)                                                                       |      🖥️      |    🎯     |  🛠️   | 👁️  |
 | [UI-TARS](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B)                                                                   |      🖥️      |    🎯     |  🛠️   | 👁️  |
 | [UI-TARS-2](https://cua.ai/dashboard/vlm-router)                                                                                  |      🖥️      |    🎯     |  🛠️   | 👁️  |
 | [OpenCUA](https://huggingface.co/xlangai/OpenCUA-7B)                                                                              |              |    🎯     |       |     |
 | [GTA](https://huggingface.co/HelloKKMe/GTA1-7B)                                                                                   |              |    🎯     |       |     |
 | [Holo](https://huggingface.co/Hcompany/Holo1.5-3B)                                                                                |              |    🎯     |       |     |
 | [Moondream](https://huggingface.co/moondream/moondream3-preview)                                                                  |              |    🎯     |       |     |
 | [OmniParser](https://github.com/microsoft/OmniParser)                                                                             |              |    🎯     |       |     |

 **Legend:**

 - 🖥️ **Computer-Use**: Full agentic loop with planning and execution
 - 🎯 **Grounding**: UI element detection and click coordinate prediction
 - 🛠️ **Tools**: Support for function calling beyond screen interaction
 - 👁️ **VLM**: Vision-language understanding

 **Composition Examples:**

 See more examples on our [composition docs](https://cua.ai/docs/agent-sdk/supported-agents/composed-agents).

 ```python
 # Use OpenAI's GPT-5 for planning with specialized grounding
 agent = ComputerAgent(model="huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-5")

 # Composition via OmniParser
 agent = ComputerAgent(model="omniparser+openai/gpt-4o")

 # Combine state-of-the-art grounding with powerful reasoning
 agent = ComputerAgent(model="huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929")

 # Combine two different vision models for enhanced capabilities
 agent = ComputerAgent(model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B+openai/gpt-4o")

 # Use the built-in Moondream3 grounding with any planning model
 agent = ComputerAgent(model="moondream3+openai/gpt-4o")
 ```

 ### Model IDs

 <details>
@@ -229,9 +261,11 @@ The following table shows which capabilities are supported by each model:
 | [Claude Sonnet/Haiku](https://docs.claude.com/en/docs/agents-and-tools/tool-use/computer-use-tool#how-to-implement-computer-use) | `anthropic/claude-sonnet-4-5`, `anthropic/claude-haiku-4-5` |
 | [OpenAI CU Preview](https://platform.openai.com/docs/models/computer-use-preview) | `openai/computer-use-preview` |
 | [GLM-V](https://huggingface.co/THUDM/glm-4v-9b) | `openrouter/z-ai/glm-4.5v`, `huggingface-local/zai-org/GLM-4.5V` |
 | [Qwen3 VL](https://huggingface.co/collections/Qwen/qwen3-vl) | `openrouter/qwen/qwen3-vl-235b-a22b-instruct` |
 | [Gemini CU Preview](https://ai.google.dev/gemini-api/docs/computer-use) | `gemini-2.5-computer-use-preview` |
 | [InternVL](https://huggingface.co/OpenGVLab/InternVL3_5-1B) | `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` |
 | [UI-TARS](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B) | `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` |
 | [UI-TARS-2](https://cua.ai/dashboard/vlm-router) | `cua/bytedance/ui-tars-2` |
 | [OpenCUA](https://huggingface.co/xlangai/OpenCUA-7B) | `huggingface-local/xlangai/OpenCUA-{7B,32B}` |
 | [GTA](https://huggingface.co/HelloKKMe/GTA1-7B) | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` |
 | [Holo](https://huggingface.co/Hcompany/Holo1.5-3B) | `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` |
@@ -273,7 +307,7 @@ try:

     # Click and type
     await computer.interface.left_click(100, 100)
-    await computer.interface.type("Hello!")
+    await computer.interface.type_text("Hello!")
 finally:
     await computer.close()
 ```
@@ -331,6 +365,46 @@ pip install cua-som

 Learn more in the [SOM documentation](./libs/python/som/README.md).

+# Recent Updates
+
+## 2025
+
+### September 2025
+
+- **Hack the North Competition**: First benchmark-driven hackathon track with a guaranteed YC interview prize. Winner achieved 68.3% on OSWorld-Tiny ([Blog Post](https://www.cua.ai/blog/hack-the-north))
+- **Global Hackathon Launch**: Ollama × Cua global online competition for creative local/hybrid agents
+
+### August 2025
+
+- **v0.4 Release - Composite Agents**: Mix grounding + planning models with the `+` operator (e.g., `"GTA-7B+GPT-4o"`) ([Blog Post](https://www.cua.ai/blog/composite-agents))
+- **HUD Integration**: One-line benchmarking on OSWorld-Verified with live trace visualization ([Blog Post](https://www.cua.ai/blog/hud-agent-evals))
+- **Human-in-the-Loop**: Interactive agent mode with the `human/human` model string
+- **Web-Based Computer Use**: Browser-based agent execution ([Blog Post](https://www.cua.ai/blog/bringing-computer-use-to-the-web))
+
+### June 2025
+
+- **Windows Sandbox Support**: Native Windows agent execution ([Blog Post](https://www.cua.ai/blog/windows-sandbox))
+- **Containerization Evolution**: From Lume to full Docker support ([Blog Post](https://www.cua.ai/blog/lume-to-containerization))
+- **Sandboxed Python Execution**: Secure code execution in agent workflows
+
+### May 2025
+
+- **Cua Cloud Containers**: Production-ready cloud deployment with elastic scaling ([Blog Post](https://www.cua.ai/blog/introducing-cua-cloud-containers))
+- **Trajectory Viewer**: Visual debugging tool for agent actions ([Blog Post](https://www.cua.ai/blog/trajectory-viewer))
+- **Training Data Collection**: Tools for creating computer-use training datasets ([Blog Post](https://www.cua.ai/blog/training-computer-use-models-trajectories-1))
+- **App-Use Framework**: Mobile and desktop app automation capabilities
+
+### April 2025
+
+- **Agent Framework v0.4**: Unified API for 100+ model configurations
+- **UI-TARS Integration**: Local inference support for ByteDance's desktop-optimized model
+- **Blog Series**: "Build Your Own Operator" tutorials ([Part 1](https://www.cua.ai/blog/build-your-own-operator-on-macos-1) | [Part 2](https://www.cua.ai/blog/build-your-own-operator-on-macos-2))
+
+### March 2025
+
+- **Initial Public Release**: Core Agent SDK and Computer SDK
+- **Lume VM Manager**: macOS VM management tool for local development
+
 # Resources

 - [Cua Blog](https://www.cua.ai/blog)
(additional file, name not captured)

@@ -25,7 +25,7 @@ desktop = computer.create_desktop_from_apps(["Safari", "Notes"])

 # Your agent can now only see and interact with these apps
 agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-5-20250929",
     tools=[desktop]
 )
 ```
@@ -94,7 +94,7 @@ async def main():

     # Initialize an agent
     agent = ComputerAgent(
-        model="anthropic/claude-3-5-sonnet-20241022",
+        model="anthropic/claude-sonnet-4-5-20250929",
         tools=[desktop]
     )

@@ -160,7 +160,7 @@ async def automate_iphone():

     # Initialize an agent for iPhone automation
     agent = ComputerAgent(
-        model="anthropic/claude-3-5-sonnet-20241022",
+        model="anthropic/claude-sonnet-4-5-20250929",
         tools=[my_iphone]
     )
(additional file, name not captured)

@@ -8,7 +8,7 @@ In this first blogpost, we'll learn how to build our own Computer-Use Operator u
 - **computer-use-preview** is OpenAI's specialized language model trained to understand and interact with computer interfaces through screenshots.
 - A **Computer-Use Agent** is an AI agent that can control a computer just like a human would - clicking buttons, typing text, and interacting with applications.

-Our Operator will run in an isolated macOS VM, by making use of our [cua-computer](https://github.com/trycua/cua/tree/main/libs/computer) package and [lume virtualization CLI](https://github.com/trycua/cua/tree/main/libs/lume).
+Our Operator will run in an isolated macOS VM, making use of our [cua-computer](https://github.com/trycua/cua/tree/main/libs/python/computer) package and [lume virtualization CLI](https://github.com/trycua/cua/tree/main/libs/lume).

 Check out what it looks like to use your own Operator from a Gradio app:

@@ -294,7 +294,7 @@ This design keeps everything organized and safe. The AI can only interact with t
 ### Prerequisites

 1. **Lume CLI Setup**
-   For installing the standalone lume binary, run the following command from a terminal, or download the [latest pkg](https://github.com/trycua/cua/releases/latest/download/lume.pkg.tar.gz).
+   For installing the standalone lume binary, run the following command from a terminal, or download the [latest pkg](https://github.com/trycua/cua/releases/download/lume-v0.2.22/lume-darwin.pkg.tar.gz).

   ```bash
   sudo /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)"
@@ -567,10 +567,10 @@ In a production setting, you would wrap the action-response cycle in a loop, han

 ### Next Steps

-In the next blogpost, we'll introduce our Agent framework which abstracts away all these tedious implementation steps. This framework provides a higher-level API that handles the interaction loop between OpenAI's computer-use model and the macOS sandbox, allowing you to focus on building sophisticated applications rather than managing the low-level details we've explored here. Can't wait? Check out the [cua-agent](https://github.com/trycua/cua/tree/main/libs/agent) package!
+In the next blogpost, we'll introduce our Agent framework, which abstracts away all these tedious implementation steps. This framework provides a higher-level API that handles the interaction loop between OpenAI's computer-use model and the macOS sandbox, allowing you to focus on building sophisticated applications rather than managing the low-level details we've explored here. Can't wait? Check out the [cua-agent](https://github.com/trycua/cua/tree/main/libs/python/agent) package!

 ### Resources

 - [OpenAI Computer-Use docs](https://platform.openai.com/docs/guides/tools-computer-use)
-- [cua-computer](https://github.com/trycua/cua/tree/main/libs/computer)
+- [cua-computer](https://github.com/trycua/cua/tree/main/libs/python/computer)
 - [lume](https://github.com/trycua/cua/tree/main/libs/lume)
(additional file, name not captured)

@@ -145,9 +145,9 @@ While the core concept remains the same across all agent loops, different AI mod
 | Agent Loop | Supported Models | Description | Set-Of-Marks |
 |:-----------|:-----------------|:------------|:-------------|
 | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA Preview model | Not Required |
-| `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use Beta Tools | Not Required |
+| `AgentLoop.ANTHROPIC` | • `claude-sonnet-4-5-20250929`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use Beta Tools | Not Required |
 | `AgentLoop.UITARS` | • `ByteDance-Seed/UI-TARS-1.5-7B` | Uses ByteDance's UI-TARS 1.5 model | Not Required |
-| `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
+| `AgentLoop.OMNI` | • `claude-sonnet-4-5-20250929`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |

 Each loop handles the same basic pattern we implemented manually in Part 1:

@@ -171,7 +171,7 @@ The `cua-agent` framework provides multiple agent loop implementations to abstra

 - **AgentLoop.OMNI**: The most flexible option that works with virtually any vision-language model, including local and open-source ones. Perfect for cost-effective development or when you need to use models without native computer-use capabilities.

-These abstractions allow you to easily switch between providers without changing your application code. All loop implementations are available in the [cua-agent GitHub repository](https://github.com/trycua/cua/tree/main/libs/agent/agent/providers).
+These abstractions allow you to easily switch between providers without changing your application code. All loop implementations are available in the [cua-agent GitHub repository](https://github.com/trycua/cua/tree/main/libs/python/agent).

 Choosing the right agent loop depends not only on your API access and technical requirements but also on the specific tasks you need to accomplish. To make an informed decision, it's helpful to understand how these underlying models perform across different computing environments - from desktop operating systems to web browsers and mobile interfaces.

@@ -191,7 +191,7 @@ The performance of different Computer-Use models varies significantly across tas

 - **AgentLoop.OPENAI**: Choose when you have OpenAI Tier 3 access and need the most capable computer-use agent for web-based tasks. Uses the same [OpenAI Computer-Use Loop](https://platform.openai.com/docs/guides/tools-computer-use) as Part 1, delivering strong performance on browser-based benchmarks.

-- **AgentLoop.ANTHROPIC**: Ideal for users with Anthropic API access who need strong reasoning capabilities with computer-use abilities. Works with `claude-3-5-sonnet-20240620` and `claude-3-7-sonnet-20250219` models following [Anthropic's Computer-Use tools](https://docs.anthropic.com/en/docs/agents-and-tools/computer-use#understanding-the-multi-agent-loop).
+- **AgentLoop.ANTHROPIC**: Ideal for users with Anthropic API access who need strong reasoning capabilities with computer-use abilities. Works with `claude-sonnet-4-5-20250929` and `claude-3-7-sonnet-20250219` models following [Anthropic's Computer-Use tools](https://docs.anthropic.com/en/docs/agents-and-tools/computer-use#understanding-the-multi-agent-loop).

 - **AgentLoop.UITARS**: Best for scenarios requiring more powerful OS/desktop automation and latency-sensitive workloads, as UI-TARS-1.5 leads in OS-capability benchmarks. Requires running the model locally or accessing it through compatible endpoints (e.g. on Hugging Face).

@@ -268,7 +268,7 @@ from agent import ComputerAgent
 async def run_multi_task_workflow():
     async with Computer() as macos_computer:
         agent = ComputerAgent(
-            model="anthropic/claude-3-5-sonnet-20241022",
+            model="anthropic/claude-sonnet-4-5-20250929",
             tools=[macos_computer]
         )

@@ -674,7 +674,7 @@ With the basics covered, you might want to explore:

 ### Resources

-- [cua-agent GitHub repository](https://github.com/trycua/cua/tree/main/libs/agent)
+- [cua-agent GitHub repository](https://github.com/trycua/cua/tree/main/libs/python/agent)
 - [Agent Notebook Examples](https://github.com/trycua/cua/blob/main/notebooks/agent_nb.ipynb)
 - [OpenAI Agent SDK Specification](https://platform.openai.com/docs/api-reference/responses)
 - [Anthropic API Documentation](https://docs.anthropic.com/en/api/getting-started)
blog/cloud-windows-ga-macos-preview.md (new file, 120 lines)

# Cloud Windows Sandboxes GA + macOS Preview

If you've been building with our `cua` libraries, you might've hit a limitation with local computer-use sandboxes: to run agents on Windows or macOS, you need to be on that OS - Windows Sandbox for Windows, Apple Virtualization for macOS. The only cross-platform option is Linux on Docker, which limits you to virtualizing Linux environments ([see all local options here](https://cua.ai/docs/computer-sdk/computers)).

Today the story changes - we're announcing general availability of **Cloud Windows Sandboxes** and opening early preview access for **Cloud macOS Sandboxes**.

## Cloud Windows Sandboxes: Now GA

Cloud Windows Sandboxes are now generally available. You get a full Windows 11 desktop in your browser with Edge and Python pre-installed, working seamlessly with all our [Computer-Use libraries](https://github.com/trycua/cua) for RPA, UI automation, code execution, and agent development.

**What's new with this release:**

- Hot-start in under 1 second
- Direct noVNC over HTTPS under our sandbox.cua.ai domain
- 3 sandbox sizes available:

| Size   | CPU     | RAM   | Storage    |
| ------ | ------- | ----- | ---------- |
| Small  | 2 cores | 8 GB  | 128 GB SSD |
| Medium | 4 cores | 16 GB | 128 GB SSD |
| Large  | 8 cores | 32 GB | 256 GB SSD |

<div align="center">
  <video src="https://github.com/user-attachments/assets/8ab07646-6018-4128-87ce-53180cfea696" width="600" controls></video>
</div>

**Pricing:** Windows Sandboxes start at 8 credits/hour (Small), 15 credits/hour (Medium), or 31 credits/hour (Large).

## Cloud macOS Sandboxes: Now in Preview

Running macOS locally comes with challenges: 30 GB golden images, a maximum of 2 sandboxes per host, and unpredictable compatibility issues. With Cloud macOS Sandboxes, we provision bare-metal macOS hosts (M1, M2, M4) on demand - giving you full desktop access without the overhead of managing local sandboxes.

**Preview access:** Invite-only. [Join the waitlist](https://cua.ai/macos-waitlist) if you're building agents for macOS workflows.

## Getting Started Today

Sign up at [cua.ai/signin](https://cua.ai/signin) and grab your API key from the dashboard. Then connect to a sandbox:

```python
from computer import Computer

computer = Computer(
    os_type="windows",  # or "macos"
    provider_type="cloud",
    name="my-sandbox",
    api_key="your-api-key"
)

await computer.run()
```

Manage existing sandboxes:

```python
from computer.providers.cloud.provider import CloudProvider

provider = CloudProvider(api_key="your-api-key")
async with provider:
    sandboxes = await provider.list_vms()
    await provider.run_vm("my-sandbox")
    await provider.stop_vm("my-sandbox")
```

Run an agent on Windows to automate a workflow:

```python
from agent import ComputerAgent

agent = ComputerAgent(
    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    max_trajectory_budget=5.0
)

response = await agent.run(
    "Open Excel, create a sales report with this month's data, and save it to the desktop"
)
```

## FAQs

<details>
<summary><strong>Why not just use local Windows Sandbox?</strong></summary>

Local Windows Sandbox resets on every restart. No persistence, no hot-start, and you need Windows Pro. Our sandboxes persist state, hot-start in under a second, and work from any OS.

</details>

<details>
<summary><strong>What happens to my work when I stop a sandbox?</strong></summary>

Everything persists. Files, installed software, browser profiles - it's all there when you restart. You only pay for runtime, not storage.

</details>

<details>
<summary><strong>How's the latency for UI automation?</strong></summary>

We run in 4 regions so you can pick whichever is closest. The noVNC connection is optimized for automation, not video streaming. Your agent sees crisp screenshots, not compressed video.

</details>

<details>
<summary><strong>Are there software restrictions?</strong></summary>

No. Full admin access on both platforms. Install whatever you need - Visual Studio, Photoshop, custom enterprise software. It's your sandbox.

</details>

## Need help?

If you hit issues getting either platform working, reach out on [Discord](https://discord.gg/cua-ai). We respond fast and fix based on what people actually use.

---

Get started at [cua.ai](https://cua.ai) or [join the macOS waitlist](https://cua.ai/macos-waitlist).
(additional file, name not captured)

@@ -14,12 +14,12 @@ This is the kind of problem that makes you wonder if we're building the future o

 Agent framework 0.4 solves this by doing something radical: making all these different models speak the same language.

-Instead of writing separate code for each model's peculiarities, you now just pick a model with a string like `"anthropic/claude-3-5-sonnet-20241022"` or `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"`, and everything else Just Works™. Behind the scenes, we handle all the coordinate normalization, token parsing, and image preprocessing so you don't have to.
+Instead of writing separate code for each model's peculiarities, you now just pick a model with a string like `"anthropic/claude-sonnet-4-5-20250929"` or `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"`, and everything else Just Works™. Behind the scenes, we handle all the coordinate normalization, token parsing, and image preprocessing so you don't have to.

 ```python
 # This works the same whether you're using Anthropic, OpenAI, or that new model you found on Hugging Face
 agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022", # or any other supported model
+    model="anthropic/claude-sonnet-4-5-20250929", # or any other supported model
     tools=[computer]
 )
 ```
172
blog/computer-use-agents-for-growth-hacking.md
Normal file
172
blog/computer-use-agents-for-growth-hacking.md
Normal file
@@ -0,0 +1,172 @@
|
||||
# Computer Use Agents for Growth Hacking: The Cua-la Strategy
|
||||
|
||||
_Published on January 16, 2025 by Sarina Li_
|
||||
|
||||
<img src="./assets/esther-and-sarina.JPG" alt="Esther and Sarina at DevFest Toronto">
|
||||
|
||||
Growing a developer-focused product is hard. Traditional marketing doesn't work. Booth rentals cost thousands. Sponsorships cost tens of thousands.
|
||||
|
||||
So we tried something different at Google DevFest Toronto: show up with backpacks full of cute cua-la keychains and see what happens.
|
||||
|
||||
This is the story of how two new hires, a growth engineer and a designer/artist, guerrilla marketed their way through a major tech conference with $200 worth of merch and a post-event automation pipeline.
|
||||
|
||||
## Meet the Team
|
||||
|
||||
**Sarina** (Growth Engineering): Built the post-event automation pipeline that extracts LinkedIn connections and generates personalized messages while you sleep.
|
||||
|
||||
**Esther** (Design + Art): Hand-crafted every piece of artwork, giving life to Cua through illustrations, branding, and yes, extremely cute cua-la keychains.
|
||||
|
||||
The thesis: what if we could draw people in with irresistible physical merch, then use computer use agents to handle all the tedious follow-up work?
|
||||
|
||||
## The cua-la Strategy
|
||||
|
||||
<img src="./assets/cua-at-devfest.JPG" alt="Guerrilla marketing at DevFest Toronto">
|
||||
|
||||
Google DevFest Toronto brought together hundreds of developers and AI enthusiasts. We didn't have a booth. We didn't have demos. We showed up with backpacks full of cua-la keychains with the cua.ai logo and started handing them out.
|
||||
|
||||
That's it. Pure guerrilla marketing, the cua-las were absurdly effective.
|
||||
|
||||
People would literally crowd around us, not because they were interested in computer use (at first), but because they wanted a cua-la. We'd pitch Cua while handing out keychains, and suddenly we had an engaged audience!

<img src="./assets/devfest-image.JPG" alt="DevFest crowd">

### The Magic Moment

A few people stuck the cua-las on their bags immediately. Then, throughout the event, we started getting approached:

"Wait, are you the Cua girls?"

They'd seen the cua-las on someone's bag, asked about it, and tracked us down! The keychains became walking advertisements.

<img src="./assets/htn-at-devfest.JPG" alt="Hack the North recognition at DevFest">

Even better: two attendees recognized Cua from Hack the North. Our previous event marketing was actually working. People remembered us.

## Part 2: The Automation (Try It Yourself)

After DevFest, we had 20+ new LinkedIn connections. Normally, this means hours of:

- Manually copying names, roles, companies
- Opening each profile to find contact info
- Crafting personalized follow-up messages
- Updating your CRM

Sarina had a better idea: build the automation we wish existed, then open-source it.

**The automation is live**: [Post-Event Contact Export cookbook](https://cua.ai/docs/example-usecases/post-event-contact-export)

### How It Works

<video controls width="100%">
  <source src="./assets/linkedin-scraping.mp4" type="video/mp4">
  LinkedIn scraping automation in action
</video>

The agent navigates LinkedIn like a human would: click profile, extract info, navigate back, repeat. But it does it overnight while you sleep.

The secret sauce: **VM session persistence**. By logging into LinkedIn once through Cua's VM, the session stays alive. No captchas, no bot detection, just smooth automation.
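With the Agent SDK, that loop is only a few lines. Here's a minimal sketch; the model, sandbox name, API key, and task prompt are placeholders, and the cookbook linked above has the full, tested version:

```python
import asyncio

from agent import ComputerAgent
from computer import Computer

async def main():
    # Reuse the sandbox where LinkedIn is already logged in,
    # so cookies and session state persist between runs.
    computer = Computer(
        os_type="linux",
        provider_type="cloud",
        name="your-sandbox-name",    # placeholder
        api_key="your-cua-api-key",  # placeholder
    )
    agent = ComputerAgent(
        model="anthropic/claude-sonnet-4-5-20250929",
        tools=[computer],
    )

    # Placeholder task prompt sketching the click/extract/repeat loop
    task = (
        "Open LinkedIn and go to my most recent connections. For each one, "
        "open the profile, record the name, role, company, and profile URL, "
        "then add a row to contacts.csv on the desktop."
    )
    async for result in agent.run(task):
        print(result)

asyncio.run(main())
```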
<video controls width="100%">
  <source src="./assets/adding-row-csv.mp4" type="video/mp4">
  Automatic CSV generation
</video>

Wake up to a clean CSV with:

- First name, last name
- Current role and company
- LinkedIn profile URLs
- Pre-generated messaging links

Then use that data to craft personalized messages. Sarina wrote unique follow-ups for each person, mentioning specific conversations from DevFest.
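From there, turning the CSV into message drafts is ordinary scripting. A hypothetical sketch (the column names are assumptions; check the cookbook for the actual export format):

```python
import csv

# Hypothetical column names -- adjust to the cookbook's actual CSV header.
with open("contacts.csv", newline="") as f:
    for row in csv.DictReader(f):
        draft = (
            f"Hi {row['first_name']}, great meeting you at DevFest Toronto! "
            f"Loved hearing about your work at {row['company']}."
        )
        print(row["profile_url"], "->", draft)
```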
**Works for any platform**: LinkedIn, X/Twitter, or wherever your connections are. The cookbook includes full setup instructions and customizable code.

## The Results

**Cost Breakdown**

- Booth rental: $0 (didn't have one)
- Sponsorship: $0 (didn't buy one)
- cua-la keychains: ~$200
- Automation: Built by Sarina in a few hours post-event
- **Total spend: $200**

**What We Got**

- People crowding around us for cua-las
- Walking advertisements on bags throughout the event
- Instant brand recognition ("Are you the Cua girls?")
- Two people who remembered us from Hack the North
- 20+ quality connections extracted and messaged within 24 hours
- Several demo requests from personalized follow-ups

**ROI**

Traditional event marketing at this scale: $5-10K minimum for booth + sponsorship.

Our approach: $200 + scrappy execution.

The automation is reusable and will save hours of manual work, and the cua-las created more organic conversations than any booth could have.

## What Didn't Work (Yet)

**cua-la Distribution**

We ran out faster than expected! Next time: bigger bag, or limit to one per person.

**Automation Setup**

The VM login step added friction. "Log in manually first, then run the script" confused some people who wanted to try it themselves. We need better first-run UX.

**Message Personalization**

While the extraction was automated, we still wrote each follow-up message manually. We're looking for ways to enrich messages with more context from the event, which is hard to automate.

## What's Next: NeurIPS 2025

NeurIPS is the biggest AI conference of the year. Thousands of researchers, hundreds of companies.

**The good news**: We still have one giant bag of cua-las left. They're already packed and ready.

**The better news**: We're upgrading the automation.

### The Hypothesis

The cua-las get people interested. The automation ensures we actually follow through.

Most event marketing fails at the follow-up stage. You collect business cards, connect on LinkedIn, and then... nothing. The moment passes. People forget.

With Cua handling the mechanical work (data organization, connection tracking, follow-up scheduling), we can focus on the human part: genuine conversations, valuable introductions, and actually helping people.

## The Framework: Cute Merch + Smart Automation

Traditional event marketing: show up, pitch, collect cards.

Our approach: combine two forces that shouldn't work together but do.

**The Physical Hook**

- Make something people actually want (not another branded pen)
- Hand-crafted, memorable, Instagram-worthy
- Turns attendees into walking billboards
- Creates natural conversation starters

**The Digital Follow-Through**

- Automate the tedious post-event work
- Extract connections while you sleep
- Personalize follow-ups with real context
- Actually close the loop before the moment passes

**Why It Works**

The cua-las get you in the door. The automation ensures you don't waste the opportunity.

Most companies nail one or the other:

- Great merch, terrible follow-up → missed opportunities
- Amazing automation, boring presence → no one cares

Do both, and you create a flywheel: each event builds brand recognition for the next, while automation ensures maximum value from every connection.

See you at NeurIPS 2025!

---

_Want to build your own growth hacking automations? Check out [Cua on GitHub](https://github.com/trycua/cua) or join our [Discord](https://discord.gg/cua) to share your experiments. cua-las not included (yet)._
86 blog/cua-playground-preview.md Normal file
@@ -0,0 +1,86 @@
# Cua Playground: Agents + Sandboxes in Your Browser

Building computer-use agents means constant iteration—writing code, deploying to a sandbox, testing behavior, debugging issues, then repeating the cycle. Every test requires switching between your code editor, terminal, and VNC viewer. Want to try a different prompt? Edit your code, redeploy, and wait for the agent to restart. It works, but it's slow.

Today we're launching the **Cua Playground**: a browser-based environment for testing computer-use agents without writing code. Send messages to your sandboxes, watch them execute in real time, and iterate on prompts instantly—all from your dashboard at cua.ai.

![playground-cover](./assets/playground-cover.png)

**What's new with this release:**

- Instant testing—send messages to any running sandbox directly from your browser
- Real-time execution—watch your agent work with live tool call updates and screenshots
- Multi-model support—test with Claude Sonnet 4.5, Haiku 4.5, and more
- Persistent chat history—conversations save automatically to local storage

The Playground connects to your existing Cua sandboxes—the same ones you use with the Agent SDK. Select a running sandbox and a model, then start chatting. The agent uses computer-use tools (mouse, keyboard, bash, editor) to complete your tasks, and you see every action it takes.
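When you're ready to move a Playground experiment into code, the equivalent Agent SDK call looks roughly like this (a minimal sketch; the sandbox name, API key, and model string are placeholders mirroring the SDK examples elsewhere in this commit):

```python
import asyncio

from agent import ComputerAgent
from computer import Computer

async def main():
    # The same sandbox you'd pick from the Playground dropdown
    computer = Computer(
        os_type="linux",
        provider_type="cloud",
        name="your-sandbox-name",    # placeholder
        api_key="your-cua-api-key",  # placeholder
    )
    agent = ComputerAgent(model="cua/anthropic/claude-sonnet-4.5", tools=[computer])

    # The same first message the post suggests sending in the Playground
    async for result in agent.run("Take a screenshot and describe what you see"):
        print(result)

asyncio.run(main())
```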
## Getting Started Today

<div align="center">
  <video src="https://github.com/user-attachments/assets/9fef0f30-1024-4833-8b7a-6a2c02d8eb99" width="600" controls></video>
</div>

Sign up at [cua.ai/signin](https://cua.ai/signin) and grab your API key from the dashboard. Then navigate to the Playground:

1. Navigate to Dashboard > Playground
2. Select a sandbox from the dropdown (it must have "running" status)
3. Choose a model (we recommend Claude Sonnet 4.5 to start)
4. Send a message: "Take a screenshot and describe what you see"
5. Watch the agent execute computer actions in real time

Example use cases:

**Prompt Testing**

```
❌ "Check the website"
✅ "Navigate to example.com in Firefox and take a screenshot of the homepage"
```

**Model Comparison**

Run the same task with different models to compare quality, speed, and cost.

**Debugging Agent Behavior**

1. Send: "Find the login button and click it"
2. View tool calls to see each mouse movement
3. Check screenshots to verify the agent found the right element
4. Adjust your prompt based on what you observe

## FAQs

<details>
<summary><strong>Do I need to know how to code?</strong></summary>

No. The Playground is designed for testing agent behavior without writing code. However, for production deployments, you'll need to use the Agent SDK (Python/TypeScript).

</details>

<details>
<summary><strong>Does this replace the Agent SDK?</strong></summary>

No. The Playground is for rapid testing and experimentation. For production deployments, scheduled tasks, or complex workflows, use the Agent SDK.

</details>

<details>
<summary><strong>How much does it cost?</strong></summary>

Playground requests use the same credit system as Agent SDK requests. You're charged for model inference (varies by model) and sandbox runtime (billed per hour while running).

</details>

<details>
<summary><strong>Why is my sandbox not showing up?</strong></summary>

The sandbox must have `status = "running"` to appear in the dropdown. Check Dashboard > Sandboxes to verify status. If stopped, click "Start" and wait ~30 seconds for it to become available.

</details>

## Need help?

If you hit issues getting the Playground working, reach out in [Discord](https://discord.gg/cua-ai). We respond fast and fix based on what people actually use.

---

Get started at [cua.ai](https://cua.ai) or try the Playground at [cua.ai/dashboard/playground](https://cua.ai/dashboard/playground).
181 blog/cua-vlm-router.md Normal file
@@ -0,0 +1,181 @@
# Cua VLM Router: One Provider for All Your Computer-Use Models

If you've been building computer-use agents, you know the reality: every model provider has its own specification and deployment process. Anthropic has one API format, OpenAI another, Google something else entirely. Want to try a Hugging Face model? That's a completely different setup. Self-hosting? Even more complexity. Each provider requires learning their specific API, managing their credentials, and adapting your code to their particular requirements.

Today we're launching the **Cua VLM Router**: a managed inference API that gives you unified access to multiple vision-language model providers through a single API key. We're starting with Anthropic's Claude models (Sonnet 4.5 and Haiku 4.5), some of the most loved and widely used computer-use models in the Cua ecosystem, with more providers coming soon.

![vlm-router-cover](./assets/vlm-router-cover.png)

## What You Get

The Cua VLM Router handles the infrastructure so you can focus on building:

**Single API Key**

- One key for all model providers (no juggling multiple credentials)
- Works for both model inference and sandbox access
- Manage everything from one dashboard at cua.ai

**Smart Routing**

- Automatic provider selection for optimal availability and performance
- For Anthropic models, we route to the best provider (Anthropic, AWS Bedrock, or Microsoft Foundry)
- No configuration needed—just specify the model and we handle the rest

**Cost Tracking & Optimization**

- Unified usage dashboard across all models
- Real-time credit balance tracking
- Detailed cost breakdown per request (gateway cost + upstream cost)

**Production-Ready**

- OpenAI-compatible API (drop-in replacement for existing code)
- Full streaming support with Server-Sent Events
- Metadata about routing decisions in every response

## Available Models (Launch)

We're starting with Anthropic's latest Claude models:

| Model                             | Best For                           |
| --------------------------------- | ---------------------------------- |
| `cua/anthropic/claude-sonnet-4.5` | General-purpose tasks, recommended |
| `cua/anthropic/claude-haiku-4.5`  | Fast responses, cost-effective     |

## How It Works

When you request an Anthropic model through Cua, we automatically route to the best available provider—whether that's Anthropic directly, AWS Bedrock, or Microsoft Foundry. You just specify `cua/anthropic/claude-sonnet-4.5`, and we handle the provider selection, failover, and optimization behind the scenes. No need to manage multiple accounts or implement fallback logic yourself.

## Getting Started

Sign up at [cua.ai/signin](https://cua.ai/signin) and create your API key from **Dashboard > API Keys > New API Key** (save it immediately—you won't see it again).

Use it with the Agent SDK (make sure to set your environment variable):

```python
import asyncio

from agent import ComputerAgent
from computer import Computer

async def main():
    # Initialize cloud computer
    computer = Computer(
        os_type="linux",
        provider_type="cloud",
        name="your-container-name",
        api_key="your-cua-api-key"
    )

    # Initialize agent with Claude Sonnet 4.5
    agent = ComputerAgent(
        tools=[computer],
        model="cua/anthropic/claude-sonnet-4.5",
        api_key="your-cua-api-key",
        instructions="You are a helpful assistant that can control computers",
        only_n_most_recent_images=3
    )

    # Run a task
    async for result in agent.run("Open a browser and search for Python tutorials"):
        print(result)

if __name__ == "__main__":
    asyncio.run(main())
```

## Migration Is Simple

Already using Anthropic directly? Just add the `cua/` prefix and switch keys:

**Before:**

```python
# with ANTHROPIC_API_KEY="sk-ant-..." set in your environment
agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
```

**After:**

```python
# with CUA_API_KEY="sk_cua-api01_..." set in your environment
agent = ComputerAgent(model="cua/anthropic/claude-sonnet-4.5")
```

Same code structure. No other changes needed.

## Direct API Access

The router exposes an OpenAI-compatible API at `https://inference.cua.ai/v1`:

```bash
curl -X POST https://inference.cua.ai/v1/chat/completions \
  -H "Authorization: Bearer ${CUA_API_KEY}" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "anthropic/claude-sonnet-4.5",
    "messages": [{"role": "user", "content": "Hello!"}],
    "stream": true
  }'
```

Works with any OpenAI-compatible client library.
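For example, with the official `openai` Python package pointed at the router (a minimal sketch, assuming `CUA_API_KEY` is set in your environment):

```python
import os

from openai import OpenAI

# Point the standard OpenAI client at the Cua VLM Router endpoint
client = OpenAI(
    base_url="https://inference.cua.ai/v1",
    api_key=os.environ["CUA_API_KEY"],
)

stream = client.chat.completions.create(
    model="anthropic/claude-sonnet-4.5",
    messages=[{"role": "user", "content": "Hello!"}],
    stream=True,  # Server-Sent Events, same as OpenAI's streaming API
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
```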
## FAQs

<details>
<summary><strong>Do I still need provider API keys?</strong></summary>

No. Cua manages all provider API keys and infrastructure. You only need one Cua API key for everything—model inference and sandbox access.

</details>

<details>
<summary><strong>How does pricing work?</strong></summary>

Requests are billed in credits, deducted from your Cua account balance. Every response includes both the Cua gateway cost and the actual upstream API cost for transparency.

</details>

<details>
<summary><strong>Can I still use my own Anthropic key (BYOK)?</strong></summary>

Yes. The Agent SDK still supports direct provider access. Just use `anthropic/claude-sonnet-4-5-20250929` instead of the `cua/` prefix and set your `ANTHROPIC_API_KEY`. See [Supported Model Providers](https://cua.ai/docs/agent-sdk/supported-model-providers/) for details.

</details>

<details>
<summary><strong>What about other providers?</strong></summary>

We're starting with Anthropic and adding more providers based on what people actually use. Request access to specific models in [Discord](https://discord.gg/cua-ai).

</details>

<details>
<summary><strong>Does streaming work?</strong></summary>

Yes. Set `"stream": true` in your request to receive Server-Sent Events. Works identically to OpenAI's streaming API.

</details>

## What's Next

This is just the beginning. We're actively iterating based on feedback:

- Additional model providers
- Custom model routing rules
- Usage alerts and budget controls
- Team collaboration features

If there's a model or feature you need, let us know in [Discord](https://discord.gg/cua-ai).

## Need Help?

- **Documentation**: [cua.ai/docs/agent-sdk/supported-model-providers/cua-vlm-router](https://cua.ai/docs/agent-sdk/supported-model-providers/cua-vlm-router)
- **Quickstart Guide**: [cua.ai/docs/get-started/quickstart](https://cua.ai/docs/get-started/quickstart)
- **Discord Community**: [discord.gg/cua-ai](https://discord.gg/cua-ai)

---

Get started at [cua.ai](https://cua.ai) or check out the [VLM Router docs](https://cua.ai/docs/agent-sdk/supported-model-providers/cua-vlm-router).
@@ -58,7 +58,7 @@ await run_full_dataset(

# Or test on SheetBench (50 spreadsheet tasks)
await run_full_dataset(
    dataset="hud-evals/SheetBench-V2",
    model="anthropic/claude-sonnet-4-5-20250929",
    split="train[:2]"
)
```
264 blog/introducing-cua-cli.md Normal file
@@ -0,0 +1,264 @@
# Introducing the Cua CLI: Manage Cloud Sandboxes from Your Terminal

If you've been using our Cloud Sandboxes, you've probably been managing them through the web dashboard: clicking through forms to create instances, copying credentials, manually starting and stopping sandboxes. It works, but it's not exactly built for power users like you.

Today we're launching the **Cua CLI**: a command-line interface that brings the full power of our Cloud Sandbox platform to your terminal. Create, manage, and connect to Linux, Windows, or macOS sandboxes in seconds—all from a single command.

![cua-cli-cover](./assets/cua-cli-cover.png)

## What You Can Do

The Cua CLI handles everything you need to work with Cloud Sandboxes:

**Authentication**

- Browser-based OAuth login with automatic credential storage
- Direct API key support for CI/CD pipelines
- Export credentials to `.env` files for SDK integration

**Sandbox Management**

- Create sandboxes with your choice of OS, size, and region
- List all your sandboxes with status and connection details
- Start, stop, restart, and delete sandboxes
- Open remote desktop (VNC) connections directly in your browser

**Two Command Styles**

The CLI supports both flat and grouped command structures—use whichever fits your workflow:

```bash
# Grouped style (explicit & clear)
cua sb ls
cua sb create --os linux --size small --region north-america
cua sb vnc my-sandbox

# Flat style (quick & concise)
cua ls
cua create --os linux --size small --region north-america
cua vnc my-sandbox
```

Both styles work identically. The CLI shows grouped commands in help by default, but all flat commands remain available for backwards compatibility.

## Installation

One command installs everything (includes the Bun runtime + Cua CLI):

```bash
# macOS/Linux
curl -LsSf https://cua.ai/cli/install.sh | sh

# Windows
powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
```

Or install via npm if you prefer:

```bash
npm install -g @trycua/cli
```

## Getting Started

Authenticate with your Cua account:

```bash
# Interactive browser login (recommended)
cua auth login

# Or provide your API key directly
cua auth login --api-key sk-your-api-key-here
```

Create a sandbox:

```bash
cua sb create --os linux --size small --region north-america
# Sandbox created and ready: my-sandbox-abc123
# Password: secure-password-here
# Host: my-sandbox-abc123.sandbox.cua.ai
```

List your sandboxes:

```bash
cua sb list
# NAME               STATUS   HOST
# my-sandbox-abc123  running  my-sandbox-abc123.sandbox.cua.ai
# test-windows-456   stopped  test-windows-456.sandbox.cua.ai
```

Open a remote desktop:

```bash
cua sb vnc my-sandbox-abc123
# Opens your browser to the VNC interface with the password pre-filled
```

## SDK Integration

Export your API key to a `.env` file for seamless SDK integration:

```bash
cd my-project
cua auth env
# Wrote /path/to/my-project/.env
```

Then use it with our Python or TypeScript SDKs:

```python
import asyncio

from computer import Computer

async def main():
    computer = Computer(
        os_type="linux",
        provider_type="cloud",
        name="my-sandbox-abc123",
        api_key="your-api-key"  # or load it from the .env file
    )
    await computer.run()

asyncio.run(main())
```
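To pull the key out of that `.env` file automatically, here's a minimal sketch with `python-dotenv`, assuming the file defines `CUA_API_KEY` (the variable name used later in this post):

```python
import os

from dotenv import load_dotenv  # pip install python-dotenv

load_dotenv()  # reads .env from the current working directory
api_key = os.environ["CUA_API_KEY"]  # assumed key name written by `cua auth env`
```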
## Sandbox Sizes & Regions

Create sandboxes in the size and region that fits your needs:

**Sizes:**

- `small` - 2 cores, 8 GB RAM, 128 GB SSD
- `medium` - 4 cores, 16 GB RAM, 128 GB SSD
- `large` - 8 cores, 32 GB RAM, 256 GB SSD

**Regions:**

- `north-america`
- `europe`
- `asia-pacific`
- `south-america`

**OS Options:**

- `linux` - Ubuntu with XFCE desktop
- `windows` - Windows 11 with Edge and Python
- `macos` - macOS (preview access)

## Example Workflows

**Quick Testing Environment**

```bash
# Spin up a sandbox, test something, tear it down
cua sb create --os linux --size small --region north-america
# ... do your testing ...
cua sb delete my-sandbox-abc123
```

**Persistent Development Sandbox**

```bash
# Create a sandbox for long-term use
cua sb create --os linux --size medium --region north-america

# Stop it when not in use (data persists)
cua sb stop my-sandbox-abc123

# Start it again when needed
cua sb start my-sandbox-abc123
```

**CI/CD Integration**

```bash
# Provision sandboxes in your pipeline
export CUA_API_KEY="sk-your-api-key"
cua auth login --api-key "$CUA_API_KEY"
cua sb create --os linux --size large --region north-america

# Run your tests with the Cua Computer SDK
python run_tests.py

# Clean up
cua sb delete my-test-sandbox
```
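What `run_tests.py` might look like (a minimal sketch; the sandbox name must match the one created above, and the actual checks you run are up to you):

```python
import asyncio
import os

from computer import Computer

async def main():
    # Connect to the sandbox provisioned earlier in the pipeline
    computer = Computer(
        os_type="linux",
        provider_type="cloud",
        name="my-test-sandbox",  # must match the sandbox created above
        api_key=os.environ["CUA_API_KEY"],
    )
    await computer.run()
    # ... drive the sandbox here: screenshots, input, shell commands ...

asyncio.run(main())
```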
## Command Aliases

We've added aliases for common commands to speed up your workflow:

```bash
# List aliases
cua list   # or: cua ls, cua ps, cua sb list

# VNC aliases
cua vnc    # or: cua open, cua sb vnc
```

## FAQs

<details>
<summary><strong>Can I use this in scripts and CI/CD?</strong></summary>

Yes. All commands support non-interactive mode with `--api-key` flags, and the CLI exits with proper status codes for scripting. The flat command style (`cua list`, `cua create`) is particularly useful for quick scripts.

</details>

<details>
<summary><strong>Where are my credentials stored?</strong></summary>

API keys are stored in `~/.cua/cli.sqlite`, a local SQLite database. They never leave your machine. Use `cua auth logout` to clear stored credentials.

</details>

<details>
<summary><strong>What happens to passwords in the output?</strong></summary>

Passwords are hidden by default in `cua list` for security. Use `cua list --show-passwords` to display them when needed.

</details>

<details>
<summary><strong>Can I manage sandboxes created through the web dashboard?</strong></summary>

Yes. The CLI and dashboard share the same API. Any sandbox you create in the dashboard will show up in `cua list`, and vice versa.

</details>

<details>
<summary><strong>How do I update the CLI?</strong></summary>

If you installed via script:

```bash
curl -LsSf https://cua.ai/cli/install.sh | sh
```

If you installed via npm:

```bash
npm install -g @trycua/cli@latest
```

</details>

## What's Next

We're actively iterating based on feedback. Planned features include:

- SSH key management for secure sandbox access
- Template-based sandbox creation
- Batch operations (start/stop multiple sandboxes)
- Custom sandbox configurations
- Snapshot management

If there's a feature you need, let us know in [Discord](https://discord.gg/cua-ai).

## Need Help?

- **Documentation**: [https://cua.ai/docs/libraries/cua-cli/commands](https://cua.ai/docs/libraries/cua-cli/commands)
- **Installation Guide**: [https://cua.ai/docs/libraries/cua-cli/installation](https://cua.ai/docs/libraries/cua-cli/installation)
- **Discord Community**: [https://discord.gg/cua-ai](https://discord.gg/cua-ai)

---

Get started at [cua.ai](https://cua.ai) or check out the [quickstart guide](https://cua.ai/docs/get-started/quickstart).
@@ -90,7 +90,7 @@ lume run macos-sequoia-vanilla:latest

### Lumier: Docker-Style VM Management

[Lumier](https://github.com/trycua/cua/tree/main/libs/lumier) works differently. It lets you use Docker commands to manage VMs. But here's the key: **Docker is just for packaging, not for isolation**.

What makes Lumier useful:
658 blog/neurips-2025-cua-papers.md Normal file
@@ -0,0 +1,658 @@
# NeurIPS 2025: 45 Computer-Use Agent Papers You Should Know About

<img alt="neurips" src="https://github.com/user-attachments/assets/bd649067-bb2c-45f4-827b-087021ec3ad7" />

If you're following the computer-use agent space, you already know that NeurIPS is where the most important work gets presented. But with thousands of papers across every area of machine learning, finding the ones relevant to CUAs means hours of filtering through proceedings, skimming abstracts, and hoping you don't miss something important.

We did that work for you. We're excited to announce that **Cua will be at NeurIPS 2025**, and we've compiled a curated list of **45 papers** focused specifically on Computer-Use Agents—covering benchmarks, safety, grounding, visual reasoning, and agent architectures.

## Why This Matters

Computer-use agents are evolving rapidly. This year's NeurIPS showcases several important developments:

**The benchmark landscape is maturing.** We're seeing comprehensive evaluations across macOS (macOSWorld), professional tools (VideoCAD), and real-world websites (REAL, TheAgentCompany). These aren't toy problems anymore—they're measuring what agents can actually do in production environments.

**Safety is becoming a first-class concern.** Multiple papers (OS-Harm, RiOSWorld, WASP, AgentDAM) are systematically documenting how agents fail when confronted with adversarial inputs, privacy requirements, or misuse scenarios. The findings are sobering: even frontier models often comply with harmful requests.

**Grounding remains the bottleneck.** Papers like GUI-Actor, GUI-G1, and SE-GUI are pushing the state of the art on mapping language to UI actions. The best approaches are achieving significant gains with surprisingly small models and datasets.

**Open-source is catching up.** OpenCUA's 72B model hits 45% on OSWorld-Verified, establishing that community-driven development can compete with proprietary systems.

## Highlights Worth Your Attention

A few papers stand out for their immediate relevance to anyone building or deploying computer-use agents:

- **macOSWorld** reveals a dramatic capability gap: proprietary agents achieve 30%+ success on macOS tasks while open-source models struggle below 5%.
- **TheAgentCompany** simulates a software company where agents browse, code, and communicate. The best agent completes 30% of tasks autonomously.
- **WASP** demonstrates that simple prompt injections deceive top-tier models in 86% of cases.
- **GUI-G1** shows that a 3B model can achieve 90.3% on ScreenSpot by fixing issues with chain-of-thought reasoning.

## Summary Statistics

| Category                       | Count |
| ------------------------------ | ----- |
| Benchmarks & Datasets          | 18    |
| Safety & Security              | 12    |
| Grounding & Visual Reasoning   | 14    |
| Agent Architectures & Training | 11    |
| Adversarial Attacks            | 8     |

**Total Papers:** 45 (categories overlap, so the counts above sum to more than 45)

## Meet Us at NeurIPS

We'll be at NeurIPS in San Diego. If you're working on computer-use agents, building applications on top of CUA infrastructure, or just curious about where this space is heading, we'd love to connect.

- **Book a Meeting**: [cal.com/cua/neurips-slot](https://cal.com/cua/neurips-slot)
- **X/Twitter**: [@trycua](https://x.com/trycua)
- **Discord**: [discord.gg/cua-ai](https://discord.gg/cua-ai)

---
# The Papers

## 1. macOSWorld: A Multilingual Interactive Benchmark for GUI Agents

**Summary:** The first comprehensive benchmark for evaluating GUI agents on macOS. Features 202 multilingual interactive tasks across 30 applications (28 macOS-exclusive), with support for 5 languages (English, Chinese, Arabic, Japanese, Russian). Reveals a dramatic gap: proprietary agents achieve 30%+ success rates while open-source models lag below 5%. Also includes safety benchmarking for deception attacks.

**Key Findings:**

- Proprietary computer-use agents lead at above 30% success rate
- Open-source lightweight models struggle below 5%, highlighting the need for macOS domain adaptation
- Multilingual benchmarks expose weaknesses, especially in Arabic (28.8% degradation vs. English)
- Deception attacks are a general vulnerability requiring immediate attention

**Poster:** https://neurips.cc/virtual/2025/poster/117427

---

## 2. OS-Harm: A Benchmark for Measuring Safety of Computer Use Agents

**Summary:** A comprehensive safety benchmark built on OSWorld for testing computer-use agents across three harm categories: deliberate user misuse, prompt injection attacks, and model misbehavior. Includes 150 tasks spanning harassment, copyright infringement, disinformation, data exfiltration, and more. Proposes an automated judge achieving high agreement with human annotations (0.76-0.79 F1 score).

**Key Findings:**

- All tested models (o4-mini, Claude 3.7 Sonnet, Gemini 2.5 Pro) tend to directly comply with many deliberate misuse queries
- Models are relatively vulnerable to static prompt injections
- Models occasionally perform unsafe actions without explicit malicious prompts

**Poster:** https://neurips.cc/virtual/2025/loc/san-diego/poster/121772

---

## 3. OpenCUA: Open Foundations for Computer-Use Agents

**Summary:** A comprehensive open-source framework for scaling computer-use agent data and foundation models. Introduces AgentNet, the first large-scale computer-use task dataset spanning 3 operating systems and 200+ applications/websites. OpenCUA-72B achieves a 45% success rate on OSWorld-Verified, establishing a new state of the art among open-source models.

**Key Contributions:**

- Annotation infrastructure for capturing human computer-use demonstrations
- AgentNet: large-scale dataset across 3 OSes and 200+ apps
- Scalable pipeline transforming demonstrations into state-action pairs with reflective Chain-of-Thought reasoning
- Models generalize well across domains and benefit from increased test-time computation

**Poster:** https://neurips.cc/virtual/2025/poster/119771

---

## 4. Mind2Web 2: Evaluating Agentic Search with Agent-as-a-Judge

**Summary:** A benchmark of 130 realistic, high-quality, long-horizon tasks for agentic search systems (like Deep Research), requiring real-time web browsing and extensive information synthesis. Constructed with 1000+ hours of human labor. Introduces an Agent-as-a-Judge framework using tree-structured rubric design for automated evaluation.

**Key Findings:**

- OpenAI Deep Research achieves 50-70% of human performance while spending half the time
- First systematic evaluation of ten frontier agentic search systems vs. human performance
- Addresses the challenge of evaluating time-varying, complex answers

**Poster:** https://neurips.cc/virtual/2025/poster/121798

---

## 5. Scaling Computer-Use Grounding via User Interface Decomposition and Synthesis

**Summary:** Addresses GUI grounding—mapping natural language to specific UI actions—as a critical bottleneck in agent development. Introduces the OSWorld-G benchmark (564 annotated samples) and the Jedi dataset (4 million synthetic examples), the largest computer-use grounding dataset. Improved grounding directly enhances agentic capabilities, boosting OSWorld performance from 23% to 51%.

**Key Contributions:**

- OSWorld-G: comprehensive benchmark for diverse grounding tasks (text matching, element recognition, layout understanding, precise manipulation)
- Jedi: 4M examples through multi-perspective task decoupling
- Demonstrates compositional generalization to novel interfaces

**Poster:** https://neurips.cc/virtual/2025/poster/121759

---

## 6. RiOSWorld: Benchmarking the Risk of Multimodal Computer-Use Agents

**Summary:** Evaluates potential safety risks of MLLM-based agents during real-world computer manipulation. Features 492 risky tasks spanning web, social media, multimedia, OS, email, and office software. Categorizes risks into user-originated and environmental risks, evaluating both risk goal intention and completion.

**Key Findings:**

- Current computer-use agents face significant safety risks in real-world scenarios
- Safety principles designed for dialogue scenarios don't transfer well to computer use
- Highlights the necessity and urgency of safety alignment for computer-use agents

**Poster:** https://neurips.cc/virtual/2025/poster/117273

---

## 7. REAL: Benchmarking Autonomous Agents on Deterministic Simulations of Real Websites

**Summary:** A benchmark featuring high-fidelity, deterministic replicas of 11 widely used websites across e-commerce, travel, communication, and professional networking. Contains 112 practical tasks requiring both information retrieval and state-changing actions. Enables reproducible evaluation without safety risks.

**Key Findings:**

- The best frontier language models achieve only a 41% success rate
- Highlights critical gaps in autonomous web navigation and task completion
- Supports scalable post-training data generation

**Poster:** https://neurips.cc/virtual/2025/poster/121619

---

## 8. SE-GUI: Enhancing Visual Grounding for GUI Agents via Self-Evolutionary Reinforcement Learning

**Summary:** An RL-based framework for GUI grounding incorporating seed data curation, dense policy gradients, and self-evolutionary reinforcement fine-tuning using attention maps. With only 3K training samples, the 7B model achieves state-of-the-art results on three grounding benchmarks, outperforming UI-TARS-72B by 24.2% on ScreenSpot-Pro.

**Key Results:**

- 47.3% accuracy on ScreenSpot-Pro with a 7B model
- Outperforms 72B models with a fraction of the training data
- Demonstrates the effectiveness of RL for high-resolution, complex environments

**Poster:** https://neurips.cc/virtual/2025/poster/118788

---

## 9. TRAP: Targeted Redirecting of Agentic Preferences

**Summary:** A generative adversarial framework that manipulates agent decision-making using diffusion-based semantic injections. Combines negative prompt degradation with positive semantic optimization. Without model access, it produces visually natural images that induce consistent decision biases in agents.

**Key Findings:**

- Consistently induces decision-level preference redirection on LLaVA-34B, Gemma3, GPT-4o, and Mistral-3.2
- Outperforms baselines (SPSA, Bandit, standard diffusion)
- Exposes a vulnerability: autonomous agents can be misled through visually subtle, semantically guided manipulations

**Poster:** https://neurips.cc/virtual/2025/poster/117547

---

## 10. TheAgentCompany: Benchmarking LLM Agents on Consequential Real World Tasks

**Summary:** An extensible benchmark simulating a small software company environment where AI agents interact like digital workers: browsing the web, writing code, running programs, and communicating with coworkers. Tests agents on real professional tasks with important implications for industry adoption and labor market effects.

**Key Findings:**

- The best agent achieves 30% autonomous task completion
- Simpler tasks are solvable autonomously
- More difficult long-horizon tasks remain beyond current systems' reach

**Poster:** https://neurips.cc/virtual/2025/poster/121705

---
## 11. VideoGameQA-Bench: Evaluating Vision-Language Models for Video Game Quality Assurance

**Summary:** A comprehensive benchmark for VLMs in video game QA, encompassing visual unit testing, visual regression testing, needle-in-a-haystack challenges, glitch detection, and bug report generation for both images and videos. Addresses the need for standardized benchmarks in this labor-intensive domain.

**Key Focus:**

- First benchmark specifically designed for video game QA with VLMs
- Covers a wide range of QA activities across images and videos
- Addresses the lack of automation in game development workflows

**Poster:** https://neurips.cc/virtual/2025/poster/121740

---

## 12. WASP: Benchmarking Web Agent Security Against Prompt Injection Attacks

**Summary:** An end-to-end benchmark for evaluating web agent security against prompt injection attacks. Tests realistic scenarios where even simple, low-effort, human-written injections can deceive top-tier AI models, including those with advanced reasoning.

**Key Findings:**

- Attacks partially succeed in up to 86% of cases
- State-of-the-art agents often struggle to fully complete attacker goals
- Reveals "security by incompetence"—agents' limitations sometimes prevent full attack success

**Poster:** https://neurips.cc/virtual/2025/poster/121728

---

## 13. AgentDAM: Privacy Leakage Evaluation for Autonomous Web Agents

**Summary:** Measures whether AI web-navigation agents follow the privacy principle of "data minimization"—using sensitive information only when truly necessary to complete a task. Simulates realistic web interaction scenarios end-to-end.

**Key Findings:**

- Agents built on GPT-4, Llama-3, and Claude are prone to inadvertent use of unnecessary sensitive information
- Proposes a prompting-based defense that reduces information leakage
- End-to-end benchmarking provides a more realistic measure than probing LLMs about privacy

**Poster:** https://neurips.cc/virtual/2025/poster/121443

---

## 14. Embodied Web Agents: Bridging Physical-Digital Realms for Integrated Agent Intelligence

**Summary:** A novel paradigm for AI agents that fluidly bridge embodiment and web-scale reasoning. Creates a unified simulation integrating realistic 3D indoor/outdoor environments with functional web interfaces. Tasks include cooking from online recipes, navigating with dynamic map data, and interpreting landmarks using web knowledge.

**Key Contributions:**

- Unified platform combining 3D environments with web interfaces
- Benchmark spanning cooking, navigation, shopping, tourism, and geolocation
- Reveals significant performance gaps between AI systems and humans

**Poster:** https://neurips.cc/virtual/2025/poster/121809

---

## 15. VideoCAD: A Dataset and Model for Learning Long-Horizon 3D CAD UI Interactions from Video

**Summary:** The first attempt to model UI interactions for precision engineering tasks. Features 41K+ annotated video recordings of CAD operations with time horizons up to 20x longer than existing datasets. Proposes VideoCADFormer for learning CAD interactions directly from video.

**Key Contributions:**

- Large-scale synthetic dataset for CAD UI interactions
- VQA benchmark for evaluating spatial reasoning and video understanding
- Reveals challenges in precise action grounding and long-horizon dependencies

**Poster:** https://neurips.cc/virtual/2025/poster/121820

---

## 16. Look Before You Leap: A GUI-Critic-R1 Model for Pre-Operative Error Diagnosis

**Summary:** Introduces a pre-operative critic mechanism that provides feedback before action execution by reasoning about potential outcomes. Proposes Suggestion-aware Group Relative Policy Optimization (S-GRPO) for building the GUI-Critic-R1 model with fully automated data generation.

**Key Results:**

- Significant advantages in critic accuracy compared to current MLLMs
- Improved success rates and operational efficiency on GUI automation benchmarks
- Works across both mobile and web domains

**Poster:** https://neurips.cc/virtual/2025/poster/115566

---

## 17. Grounded Reinforcement Learning for Visual Reasoning (ViGoRL)

**Summary:** A vision-language model trained with RL to explicitly anchor each reasoning step to specific visual coordinates. Introduces a multi-turn RL framework enabling dynamic zooming into predicted coordinates during reasoning.

**Key Results:**

- 86.4% on V*Bench for visual search
- Outperforms supervised fine-tuning and conventional RL across spatial reasoning, visual search, and web-based grounding
- Grounding amplifies region exploration, subgoal setting, and visual verification

**Poster:** https://neurips.cc/virtual/2025/poster/120218

---

## 18. GUI-Actor: Coordinate-Free Visual Grounding for GUI Agents

**Summary:** A VLM-based method for coordinate-free GUI grounding using an attention-based action head. Enables proposing one or more action regions in a single forward pass, with a grounding verifier for selection.

**Key Results:**

- GUI-Actor-7B achieves 44.6 on ScreenSpot-Pro with Qwen2.5-VL, outperforming UI-TARS-72B (38.1)
- Improved generalization to unseen resolutions and layouts
- Fine-tuning only ~100M parameters achieves SOTA performance

**Poster:** https://neurips.cc/virtual/2025/poster/119841

---

## 19. GUI-G1: Understanding R1-Zero-Like Training for Visual Grounding in GUI Agents

**Summary:** An extensive analysis of the R1-Zero paradigm (online RL + chain-of-thought reasoning) for GUI grounding. Identifies issues: longer reasoning chains lead to worse performance, reward hacking via box-size exploitation, and overfitting to easy examples.

**Solutions Proposed:**

- Fast Thinking Template for direct answer generation
- Box size constraint in the reward function
- Difficulty-aware scaling in the RL objective

**Key Results:**

- GUI-G1-3B achieves 90.3% on ScreenSpot and 37.1% on ScreenSpot-Pro
- Outperforms the larger UI-TARS-7B with only 3B parameters

**Poster:** https://neurips.cc/virtual/2025/poster/120227

---

## 20. GUI-Reflection: Empowering Multimodal GUI Models with Self-Reflection Behavior

**Summary:** A framework integrating self-reflection and error correction into end-to-end multimodal GUI models through GUI-specific pre-training, offline SFT, and online reflection tuning. Enables self-reflection to emerge with fully automated data generation.

**Key Contributions:**

- Scalable pipelines for automatic reflection/correction data from successful trajectories
- GUI-Reflection Task Suite for reflection-oriented abilities
- Diverse environment for online training on mobile devices
- Iterative online reflection tuning algorithm

**Poster:** https://neurips.cc/virtual/2025/poster/115826

---

## 21. InfantAgent-Next: A Multimodal Generalist Agent for Automated Computer Interaction

**Summary:** A generalist agent capable of multimodal computer interaction (text, images, audio, video). Integrates tool-based and pure-vision agents within a highly modular architecture, enabling collaborative step-by-step task solving.

**Key Results:**

- A 7.27-point accuracy gain over Claude-Computer-Use on OSWorld
- Evaluated on pure-vision benchmarks (OSWorld), general benchmarks (GAIA), and tool-intensive benchmarks (SWE-Bench)
- Demonstrates the value of a modular, collaborative agent architecture

**Poster:** https://neurips.cc/virtual/2025/poster/118379

---
## 22. AdvEDM: Fine-grained Adversarial Attack against VLM-based Embodied Agents
|
||||
|
||||
**Summary:** A fine-grained adversarial attack framework that modifies VLM perception of only key objects while preserving semantics of remaining regions. Unlike broad semantic disruption, this targeted approach reduces conflicts with task context, making VLMs output valid but incorrect decisions that affect agent actions in the physical world.
|
||||
|
||||
**Key Contributions:**
|
||||
- AdvEDM-R: removes semantics of specific objects from images
|
||||
- AdvEDM-A: adds semantics of new objects into images
|
||||
- Demonstrates fine-grained control with excellent attack performance in embodied decision-making tasks
|
||||
|
||||
**Poster:** https://neurips.cc/virtual/2025/poster/116436
|
||||
|
||||
---
|
||||
|
||||
## 23. BLINK-Twice: A Reasoning Benchmark on Visual Perception
|
||||
|
||||
**Summary:** A vision-centric reasoning benchmark grounded in challenging perceptual tasks. Unlike prior benchmarks, it moves beyond shallow perception ("see") to require fine-grained observation and analytical reasoning ("observe"). Features natural adversarial image pairs and annotated reasoning chains for process evaluation.
|
||||
|
||||
**Key Findings:**
|
||||
- Tests 20 leading MLLMs including 12 foundation models and 8 reasoning-enhanced models
|
||||
- Existing reasoning strategies (chain-of-thought, self-criticism) result in unstable and redundant reasoning
|
||||
- Repeated image observation improves performance across models
|
||||
- Active visual interaction (as in o3) highlights need for new vision reasoning paradigm
|
||||
|
||||
**Poster:** https://neurips.cc/virtual/2025/poster/121522
|
||||
|
||||
---
|
||||
|
||||
## 24. BadVLA: Backdoor Attacks on Vision-Language-Action Models
|
||||
|
||||
**Summary:** First systematic investigation of backdoor vulnerabilities in VLA models. Proposes Objective-Decoupled Optimization with two stages: explicit feature-space separation to isolate trigger representations, and conditional control deviations activated only by triggers.
|
||||
|
||||
**Key Findings:**
|
||||
- Consistently achieves near-100% attack success rates with minimal impact on clean task accuracy
|
||||
- Robust against common input perturbations, task transfers, and model fine-tuning
|
||||
- Exposes critical security vulnerabilities in current VLA deployments under Training-as-a-Service paradigm
|
||||
|
||||
**Poster:** https://neurips.cc/virtual/2025/poster/115803
|
||||
|
||||
---
|
||||
|
||||
## 25. Benchmarking Egocentric Multimodal Goal Inference for Assistive Wearable Agents
|
||||
|
||||
**Summary:** Benchmark for proactively inferring user goals from multimodal contextual observations for wearable assistant agents (smart glasses). Dataset comprises ~30 hours from 363 participants across 3,482 recordings with visual, audio, digital, and longitudinal context.
|
||||
|
||||
**Key Findings:**
|
||||
- Humans achieve 93% MCQ accuracy; best VLM reaches ~84%
|
||||
- For open-ended generation, best models produce relevant goals only ~57% of the time
|
||||
- Smaller models (suited for wearables) achieve ~49% accuracy
|
||||
- Models benefit from relevant modalities but struggle with noisy ones
|
||||
|
||||
**Poster:** https://neurips.cc/virtual/2025/poster/121655
|
||||
|
||||
---
|
||||
|
||||
## 26. GAM-Agent: Game-Theoretic Multi-Agent Framework for Visual Reasoning
|
||||
|
||||
**Summary:** A game-theoretic multi-agent framework formulating reasoning as a non-zero-sum game between base agents (visual perception specialists) and a critical agent (logic/fact verification). Features uncertainty-aware controller for dynamic agent collaboration with multi-round debates.
|
||||
|
||||
**Key Results:**
|
||||
- Boosts small-to-mid scale models (Qwen2.5-VL-7B, InternVL3-14B) by 5-6%
|
||||
- Enhances strong models like GPT-4o by 2-3%
|
||||
- Modular, scalable, and generalizable framework
|
||||
|
||||
**Poster:** https://neurips.cc/virtual/2025/poster/119144
|
||||
|
||||
---
|
||||
|
||||
## 27. GRIT: Teaching MLLMs to Think with Images
|
||||
|
||||
**Summary:** Introduces Grounded Reasoning with Images and Texts—a method for training MLLMs to generate reasoning chains interleaving natural language with explicit bounding box coordinates. Uses GRPO-GR reinforcement learning with rewards focused on answer accuracy and grounding format.
|
||||
|
||||
**Key Contributions:**
|
||||
- Exceptional data efficiency: requires as few as 20 image-question-answer triplets
|
||||
- Successfully unifies reasoning and grounding abilities
|
||||
- Eliminates need for reasoning chain annotations or explicit bounding box labels
|
||||
|
||||
**Poster:** https://neurips.cc/virtual/2025/poster/118020
|
||||
|
||||
---
|
||||
|
||||
## 28. Safe RLHF-V: Safe Reinforcement Learning from Multi-modal Human Feedback
|
||||
|
||||
**Summary:** First multimodal safety alignment framework. Introduces BeaverTails-V (first dataset with dual preference annotations for helpfulness and safety), and Beaver-Guard-V (multi-level guardrail system defending against unsafe queries and adversarial attacks).
|
||||
|
||||
**Key Results:**
|
||||
- Guard model improves precursor model's safety by average of 40.9% over five filtering rounds
|
||||
- Safe RLHF-V enhances model safety by 34.2% and helpfulness by 34.3%
|
||||
- First exploration of multi-modal safety alignment within constrained optimization
|
||||
|
||||
**Poster:** https://neurips.cc/virtual/2025/poster/118304
|
||||
|
||||
---
|
||||
|
||||
## 29. Dropout Decoding: Uncertainty-Guided Token Dropout for LVLM Reliability
|
||||
|
||||
**Summary:** An inference-time approach that quantifies visual token uncertainty and selectively masks uncertain tokens. Decomposes uncertainty into aleatoric and epistemic components, focusing on epistemic uncertainty for perception-related errors.
|
||||
|
||||
**Key Results:**
|
||||
- Significantly reduces object hallucinations
|
||||
- Enhances reliability and quality of LVLM outputs across diverse visual contexts
|
||||
- Validated on CHAIR, THRONE, and MMBench benchmarks
|
||||
|
||||
**Poster:** https://neurips.cc/virtual/2025/poster/118572
|
||||
|
||||
---
|
||||
|
||||
## 30. FOCUS: Unified Vision-Language Modeling for Interactive Editing
|
||||
|
||||
**Summary:** A unified LVLM integrating segmentation-aware perception and controllable object-centric generation. Uses dual-branch visual encoder for global semantic context and fine-grained spatial details, with MoVQGAN-based visual tokenizer for discrete visual tokens.
|
||||
|
||||
**Key Contributions:**
|
||||
- Progressive multi-stage training pipeline
|
||||
- Segmentation masks jointly optimized as spatial condition prompts
|
||||
- Bridges segmentation-aware perception with fine-grained visual synthesis
|
||||
|
||||
**Poster:** https://neurips.cc/virtual/2025/poster/119062
|
||||
|
||||
---
|
||||
|
||||
## 31. Fine-Grained Preference Optimization for Spatial Reasoning (SpatialReasoner-R1)
|
||||
|
||||
**Summary:** Introduces Multi-Model Monte Carlo Tree Search (M3CTS) for generating diverse Long Chain-of-Thought reasoning trajectories. Proposes fine-grained Direct Preference Optimization (fDPO) with segment-specific preference granularity guided by spatial reward mechanism.
|
||||
|
||||
**Key Results:**
|
||||
- fDPO achieves 4.1% and 9.0% gains over standard DPO on spatial quality and quantity tasks
|
||||
- SpatialReasoner-R1 sets new SOTA on SpatialRGPT-Bench, outperforming strongest baseline by 9.8%
|
||||
- Maintains competitive performance on general vision-language tasks
|
||||
|
||||
**Poster:** https://neurips.cc/virtual/2025/poster/118573

---

## 32. Reason-RFT: Reinforcement Fine-Tuning for Visual Reasoning

**Summary:** A two-stage reinforcement fine-tuning framework: SFT on curated Chain-of-Thought data activates reasoning potential, followed by RL based on Group Relative Policy Optimization (GRPO) for adaptability under domain shift.

**Key Advantages:**
- State-of-the-art results outperforming both open-source and proprietary models
- Robust performance under domain shifts across various tasks
- Excellent data efficiency in few-shot learning scenarios

**Poster:** https://neurips.cc/virtual/2025/poster/118345

---

## 33. Safe + Safe = Unsafe? Exploiting Safe Images to Jailbreak LVLMs

**Summary:** Reveals that safe images can be exploited for jailbreaking when combined with additional safe images and prompts, exploiting LVLMs' universal reasoning capabilities and a safety snowball effect. Proposes the Safety Snowball Agent (SSA) framework.

**Key Findings:**
- SSA can use nearly any image to induce LVLMs to produce unsafe content
- Achieves high jailbreak success rates against the latest LVLMs
- Exploits inherent LVLM properties rather than alignment flaws

**Poster:** https://neurips.cc/virtual/2025/loc/san-diego/poster/116422

---

## 34. MIP against Agent: Malicious Image Patches Hijacking Multimodal OS Agents

**Summary:** Uncovers a novel attack vector: Malicious Image Patches (MIPs), adversarially perturbed screen regions that induce OS agents to perform harmful actions. MIPs can be embedded in wallpapers or shared on social media to exfiltrate sensitive data.

**Key Findings:**
- MIPs generalize across user prompts and screen configurations
- Can hijack multiple OS agents during execution of benign instructions
- Exposes critical security vulnerabilities requiring attention before widespread deployment

**Poster:** https://neurips.cc/virtual/2025/loc/san-diego/poster/117813

---

## 35. CogVLA: Cognition-Aligned Vision-Language-Action Models

**Summary:** A framework leveraging instruction-driven routing and sparsification for VLA efficiency. Features a 3-stage progressive architecture inspired by human multimodal coordination: Encoder-FiLM Aggregation Routing, LLM-FiLM Pruning Routing, and V-L-A Coupled Attention.

**Key Results:**
- 97.4% success rate on the LIBERO benchmark, 70.0% on real-world robotic tasks
- Reduces training costs by 2.5x and inference latency by 2.8x compared to OpenVLA
- Achieves state-of-the-art performance

**Poster:** https://neurips.cc/virtual/2025/poster/119023

---

## 36. Succeed or Learn Slowly (SoLS): Sample-Efficient RL for Mobile App Control

**Summary:** A novel off-policy RL algorithm applying direct policy updates for positive samples and conservative, regularized updates for negative ones. Augmented with Successful Transition Replay (STR), which prioritizes successful interactions.

**Key Results:**
- At least a 17% relative improvement over existing methods on the AndroidWorld benchmark
- Substantially fewer computational resources than GPT-4o-based methods
- 5-60x faster inference

**Poster:** https://neurips.cc/virtual/2025/poster/119910
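
The asymmetric update is the part that translates most directly into code. Below is a hedged sketch of that idea only; the paper's exact objective, coefficients, and Successful Transition Replay buffer are not reproduced here.

```python
# Illustrative sketch: direct updates on successes, conservative ones on failures.
import torch

def sols_style_loss(logp, old_logp, advantage, reg_coef=0.1):
    """Plain policy-gradient step when a transition succeeded (advantage > 0);
    add a proximity penalty when it failed, so negative samples move the
    policy slowly ("learn slowly") instead of destabilizing it."""
    ratio = (logp - old_logp).exp()
    pg = -(ratio * advantage)
    proximity = reg_coef * (logp - old_logp).pow(2)  # crude conservatism term
    return torch.where(advantage > 0, pg, pg + proximity).mean()
```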

---

## 37. TAI3: Testing Agent Integrity in Interpreting User Intent

**Summary:** An API-centric stress-testing framework that uncovers intent integrity violations in LLM agents. Uses semantic partitioning to organize tasks into meaningful categories, with targeted mutations that expose subtle agent errors while preserving user intent.

**Key Contributions:**
- Datatype-aware strategy memory for retrieving effective mutation patterns
- Lightweight predictor for ranking mutations by error likelihood
- Generalizes to stronger target models using smaller LLMs for test generation

**Poster:** https://neurips.cc/virtual/2025/poster/118952

---

## 38. ThinkAct: Vision-Language-Action Reasoning via Reinforced Visual Latent Planning

**Summary:** A dual-system framework bridging high-level reasoning with low-level action execution. Trains a multimodal LLM to generate embodied reasoning plans guided by action-aligned visual rewards, compressed into visual plan latents for downstream action execution.

**Key Capabilities:**
- Few-shot adaptation
- Long-horizon planning
- Self-correction behaviors in complex embodied AI tasks

**Poster:** https://neurips.cc/virtual/2025/poster/119747

---

## 39. Visualization-of-Thought Attack (VoTA) against VLMs

**Summary:** An automated attack framework that constructs chains of images with risky visual thoughts to challenge VLMs. Exploits the conflict between logical processing and safety protocols, leading to unsafe content generation.

**Key Results:**
- Improves the average attack success rate by 26.71% (from 63.70% to 90.41%)
- Tested on 9 open-source and 6 commercial VLMs
- Outperforms state-of-the-art methods

**Poster:** https://neurips.cc/virtual/2025/poster/119873

---

## 40. Open CaptchaWorld: Benchmarking MLLM Agents on CAPTCHA Puzzles

**Summary:** The first web-based benchmark evaluating MLLM agents on diverse CAPTCHA puzzles. Spans 20 modern CAPTCHA types (225 puzzles in total) with a novel metric, CAPTCHA Reasoning Depth, quantifying the cognitive and motor steps required.

**Key Findings:**
- Humans achieve a 93.3% success rate
- State-of-the-art agents achieve at most 40.0% (Browser-Use OpenAI-o3)
- Highlights a significant gap between human and agent capabilities

**Poster:** https://neurips.cc/virtual/2025/poster/121537

---

## 41. Pixel Reasoner: Pixel-Space Reasoning with Curiosity-Driven RL

**Summary:** Introduces a pixel-space reasoning framework in which VLMs use visual operations (zoom-in, select-frame) to directly inspect and infer from visual evidence. Two-phase training: instruction tuning on synthesized traces, then RL with curiosity-driven rewards.

**Key Results:**
- 84% on V*Bench, 74% on TallyQA-Complex, 84% on InfographicsVQA
- Highest accuracy achieved by any open-source 7B model
- Enables proactive information gathering from complex visual inputs

**Poster:** https://neurips.cc/virtual/2025/poster/117667

---

## 42. BTL-UI: Blink-Think-Link Reasoning Model for GUI Agent

**Summary:** A brain-inspired framework decomposing interactions into three biologically plausible phases: Blink (rapid detection via saccade-like attention), Think (higher-level reasoning and planning), and Link (executable command generation for motor control).

**Key Innovations:**
- Automated annotation pipeline for blink data
- BTL Reward: the first rule-based reward mechanism driven by both process and outcome
- Competitive performance on static GUI understanding and dynamic interaction tasks

**Poster:** https://neurips.cc/virtual/2025/poster/119419

---

## 43. GUI Exploration Lab: Multi-Turn RL for Screen Navigation

**Summary:** A simulation environment engine enabling flexible definition of screens, icons, and navigation graphs, with full environment access for agent training and evaluation. Demonstrates a progressive training approach from SFT to multi-turn RL.

**Key Findings:**
- Supervised fine-tuning enables memorization of fundamental knowledge
- Single-turn RL enhances generalization to unseen scenarios
- Multi-turn RL encourages exploration strategies through interactive trial and error

**Poster:** https://neurips.cc/virtual/2025/loc/san-diego/poster/117497

---

## 44. GUI-Rise: Structured Reasoning and History Summarization for GUI Navigation

**Summary:** A reasoning-enhanced framework integrating structured reasoning, action prediction, and history summarization. Uses Chain-of-Thought analyses combining progress estimation and decision reasoning, trained via SFT and GRPO with history-aware rewards.

**Key Results:**
- State-of-the-art under identical training data conditions
- Particularly strong in out-of-domain scenarios
- Robust reasoning and generalization across diverse GUI navigation tasks

**Poster:** https://neurips.cc/virtual/2025/poster/117425

---

## 45. UI-Genie: A Self-Improving Framework for MLLM-based Mobile GUI Agents

**Summary:** A self-improving framework addressing trajectory verification and training data scalability. Features UI-Genie-RM (an image-text interleaved reward model) and a self-improvement pipeline with reward-guided exploration and outcome verification.

**Key Contributions:**
- UI-Genie-RM-517k: the first reward-specific dataset for GUI agents
- UI-Genie-Agent-16k: high-quality synthetic trajectories without manual annotation
- State-of-the-art across multiple GUI agent benchmarks through three generations of self-improvement

**Poster:** https://neurips.cc/virtual/2025/poster/119990

---

## What We're Building

At Cua, we're focused on the infrastructure layer for computer-use agents: cloud sandboxes for safe execution, SDKs for agent development, and tools that make it easier to build and deploy agents in production.

If you're experimenting with any of the approaches in these papers, our [Cloud Sandboxes](https://cua.ai) provide isolated Linux, Windows, and macOS environments where you can test agent behavior without risk to real systems.
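
For instance, pointing an agent at a cloud sandbox takes only a few lines with the Python SDKs (a minimal sketch; the sandbox name and API key come from your Cua dashboard):

```python
import asyncio
import os

from agent import ComputerAgent
from computer import Computer

async def main():
    # Run against an isolated cloud sandbox instead of a real machine
    async with Computer(
        os_type="linux",
        provider_type="cloud",
        name=os.getenv("CUA_CONTAINER_NAME"),
        api_key=os.getenv("CUA_API_KEY"),
    ) as computer:
        agent = ComputerAgent(model="cua/anthropic/claude-sonnet-4.5", tools=[computer])
        async for result in agent.run("Open Firefox and take a screenshot"):
            print(result)

asyncio.run(main())
```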

---

**Start building:** [cua.ai](https://cua.ai)

**Join the community:** [Discord](https://discord.gg/cua-ai)
@@ -378,4 +378,4 @@ Happy coding (safely)!

---

_Want to dive deeper? Check out our [sandboxed functions examples](https://github.com/trycua/cua/blob/main/examples/sandboxed_functions_examples.py) and [virtual environment tests](https://github.com/trycua/cua/blob/main/tests/venv.py) on GitHub. Questions? Come chat with us on Discord!_
_Want to dive deeper? Check out our [sandboxed functions examples](https://github.com/trycua/cua/blob/main/examples/sandboxed_functions_examples.py) and [virtual environment tests](https://github.com/trycua/cua/blob/main/tests/test_venv.py) on GitHub. Questions? Come chat with us on Discord!_

@@ -247,7 +247,7 @@ try:
    await computer.interface.right_click(300, 300)
    await computer.interface.double_click(400, 400)

    await computer.interface.type("Hello, World!")
    await computer.interface.type_text("Hello, World!")
    await computer.interface.press_key("enter")

    await computer.interface.set_clipboard("Test clipboard")
@@ -306,6 +306,6 @@ Now that you know how to create and share trajectories, consider these advanced

### Resources

- [Computer-Use Interface GitHub](https://github.com/trycua/cua/tree/main/libs/computer)
- [Computer-Use Interface GitHub](https://github.com/trycua/cua/tree/main/libs/python/computer)
- [Hugging Face Datasets Documentation](https://huggingface.co/docs/datasets)
- [Example Dataset: ddupont/test-dataset](https://huggingface.co/datasets/ddupont/test-dataset)

@@ -174,7 +174,7 @@ await computer.run()

## Links

- **Docker Provider Docs:** [https://cua.ai/docs/computers/docker](https://cua.ai/docs/computers/docker)
- **Docker Provider Docs:** [https://cua.ai/docs/computers/docker](https://cua.ai/docs/computer-sdk/computers#linux-on-docker)
- **KasmVNC:** [https://github.com/kasmtech/KasmVNC](https://github.com/kasmtech/KasmVNC)
- **Container Source:** [https://github.com/trycua/cua/tree/main/libs/kasm](https://github.com/trycua/cua/tree/main/libs/kasm)
- **Computer SDK:** [https://cua.ai/docs/computer-sdk/computers](https://cua.ai/docs/computer-sdk/computers)

@@ -239,7 +239,7 @@ But for development, prototyping, and learning Windows RPA workflows, **Windows

- [Windows Sandbox Documentation](https://learn.microsoft.com/en-us/windows/security/application-security/application-isolation/windows-sandbox/)
- [Cua GitHub Repository](https://github.com/trycua/cua)
- [Agent UI Documentation](https://github.com/trycua/cua/tree/main/libs/agent)
- [Agent UI Documentation](https://github.com/trycua/cua/tree/main/libs/python/agent)
- [Join our Discord Community](https://discord.gg/cua-ai)

---

@@ -34,7 +34,7 @@ async def take_screenshot():
) as computer:

agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    max_trajectory_budget=5.0
)
@@ -89,7 +89,7 @@ Use the following environment variables to configure the agent and its access to

```bash
# Computer instance (cloud)
export CUA_CONTAINER_NAME="your-container-name"
export CUA_SANDBOX_NAME="your-sandbox-name"
export CUA_API_KEY="your-cua-api-key"

# LLM API keys
@@ -121,7 +121,7 @@ The output is an AsyncGenerator that yields response chunks.
The `ComputerAgent` constructor provides a wide range of options for customizing agent behavior, tool integration, callbacks, resource management, and more.

- `model` (`str`): Default: **required**
  The LLM or agent model to use. Determines which agent loop is selected unless `custom_loop` is provided. (e.g., "claude-3-5-sonnet-20241022", "computer-use-preview", "omni+vertex_ai/gemini-pro")
  The LLM or agent model to use. Determines which agent loop is selected unless `custom_loop` is provided. (e.g., "claude-sonnet-4-5-20250929", "computer-use-preview", "omni+vertex_ai/gemini-pro")
- `tools` (`List[Any]`):
  List of tools the agent can use (e.g., `Computer`, sandboxed Python functions, etc.).
- `custom_loop` (`Callable`):
@@ -159,7 +159,7 @@ from computer import Computer
from agent.callbacks import ImageRetentionCallback

agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[Computer(...)],
    only_n_most_recent_images=3,
    callbacks=[ImageRetentionCallback(only_n_most_recent_images=3)],

@@ -13,7 +13,7 @@ Optimize agent costs with budget management and image retention callbacks.
from agent.callbacks import BudgetManagerCallback

agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    callbacks=[
        BudgetManagerCallback(
@@ -30,7 +30,7 @@ agent = ComputerAgent(
```python
# Simple budget limit
agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="anthropic/claude-sonnet-4-5-20250929",
    max_trajectory_budget=5.0  # $5 limit
)
```
@@ -40,7 +40,7 @@ agent = ComputerAgent(
```python
# Advanced budget configuration
agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="anthropic/claude-sonnet-4-5-20250929",
    max_trajectory_budget={
        "max_budget": 10.0,
        "raise_error": True,  # Raise error when exceeded
@@ -55,7 +55,7 @@ agent = ComputerAgent(
from agent.callbacks import ImageRetentionCallback

agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    callbacks=[
        ImageRetentionCallback(only_n_most_recent_images=3)
@@ -67,7 +67,7 @@ agent = ComputerAgent(

```python
agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    only_n_most_recent_images=3  # Auto-adds ImageRetentionCallback
)
@@ -77,7 +77,7 @@ agent = ComputerAgent(

```python
agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    max_trajectory_budget=5.0,  # Budget limit
    only_n_most_recent_images=3,  # Image retention

@@ -21,7 +21,7 @@ from agent.callbacks import (
)

agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    callbacks=[
        ImageRetentionCallback(only_n_most_recent_images=3),

@@ -14,7 +14,7 @@ from agent.callbacks import LoggingCallback
import logging

agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    callbacks=[
        LoggingCallback(
@@ -29,7 +29,7 @@ agent = ComputerAgent(

```python
agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    verbosity=logging.INFO  # Auto-adds LoggingCallback
)
@@ -72,7 +72,7 @@ class CustomLogger(AsyncCallbackHandler):

# Use custom logger
agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    callbacks=[CustomLogger("my_agent")]
)

@@ -13,7 +13,7 @@ The TrajectorySaverCallback records complete agent conversations including messa
from agent.callbacks import TrajectorySaverCallback

agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    callbacks=[
        TrajectorySaverCallback(
@@ -28,7 +28,7 @@ agent = ComputerAgent(

```python
agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="anthropic/claude-sonnet-4-5-20250929",
    trajectory_dir="trajectories",  # Auto-save trajectories
    tools=[computer]
)

@@ -83,7 +83,7 @@ For long conversations, consider using the `only_n_most_recent_images` parameter

```python
agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="cua/anthropic/claude-sonnet-4.5",
    tools=[computer],
    only_n_most_recent_images=3
)

@@ -16,7 +16,7 @@ def calculate(a: int, b: int) -> int:

# Use with agent
agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="cua/anthropic/claude-sonnet-4.5",
    tools=[computer, calculate]
)
```
@@ -43,7 +43,7 @@ from computer import Computer

computer = Computer(...)
agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="cua/anthropic/claude-sonnet-4.5",
    tools=[computer, read_file],
)
```

@@ -1,5 +1,5 @@
---
title: Customizing Your ComputerAgent
title: Customize ComputerAgent
---

<Callout>
@@ -74,7 +74,7 @@ Callbacks provide lifecycle hooks to preprocess messages, postprocess outputs, r
from agent.callbacks import ImageRetentionCallback, TrajectorySaverCallback, BudgetManagerCallback

agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="cua/anthropic/claude-sonnet-4.5",
    tools=[computer],
    callbacks=[
        ImageRetentionCallback(only_n_most_recent_images=3),

@@ -1,4 +1,4 @@
{
  "title": "Integrations",
  "pages": ["hud"]
  "pages": ["hud", "observability"]
}

66
docs/content/docs/agent-sdk/integrations/observability.mdx
Normal file
@@ -0,0 +1,66 @@
---
title: Observability
description: Trace CUA execution steps and sessions
---

## Observability

CUA has a native integration with [Laminar](https://laminar.sh/) – an open-source platform for tracing, evals, and labeling of autonomous AI agents. Read more about Laminar in the [Laminar docs](https://docs.lmnr.ai/).

## Setup

Register on [Laminar Cloud](https://laminar.sh/) or spin up a [local instance](https://github.com/lmnr-ai/lmnr) and get the key from your project settings. Set the `LMNR_PROJECT_API_KEY` environment variable to your key.

```bash
pip install lmnr[all]
export LMNR_PROJECT_API_KEY=your-key
```

## Usage

Then, initialize Laminar at the entry point of your application and register the Laminar LiteLLM callback; all steps of CUA will be traced automatically.

```python
import asyncio
import os

import litellm

from agent import ComputerAgent
from computer import Computer
from lmnr import Laminar, LaminarLiteLLMCallback # [!code highlight]

Laminar.initialize() # [!code highlight]
litellm.callbacks.append(LaminarLiteLLMCallback()) # [!code highlight]

computer = Computer(
    os_type="linux",
    provider_type="cloud",
    name=os.getenv("CUA_CONTAINER_NAME"),
    api_key=os.getenv("CUA_API_KEY"),
)

agent = ComputerAgent(
    model="openai/computer-use-preview",
    tools=[computer],
)

async def main():
    async for step in agent.run("Create a new file called 'test.txt' in the current directory"):
        print(step["output"])

if __name__ == "__main__":
    asyncio.run(main())
```

## Viewing traces

You can view traces in the Laminar UI by going to the traces tab in your project. When you select a trace,
you will see all the agent execution steps, including computer actions, LLM calls, and screenshots.

For each step, you will see the LLM call and the computer action. Computer actions are highlighted in yellow in the timeline.

<img
  src="/docs/img/laminar_trace_example.png"
  alt="Example trace in Laminar showing the litellm.response span and its output."
  width="800px"
/>
@@ -10,11 +10,10 @@
    "customizing-computeragent",
    "callbacks",
    "custom-tools",
    "custom-computer-handlers",
    "prompt-caching",
    "usage-tracking",
    "telemetry",
    "benchmarks",
    "migration-guide",
    "integrations"
  ]
}

@@ -7,7 +7,7 @@ This guide lists **breaking changes** when migrating from the original `Computer
## Breaking Changes

- **Initialization:**
  - `ComputerAgent` (v0.4.x) uses `model` as a string (e.g. "anthropic/claude-3-5-sonnet-20241022") instead of `LLM` and `AgentLoop` objects.
  - `ComputerAgent` (v0.4.x) uses `model` as a string (e.g. "anthropic/claude-sonnet-4-5-20250929") instead of `LLM` and `AgentLoop` objects.
  - `tools` is a list (can include multiple computers and decorated functions).
  - `callbacks` are now first-class for extensibility (image retention, budget, trajectory, logging, etc.).
- **No explicit `loop` parameter:**
@@ -39,7 +39,7 @@ async with Computer() as computer:
```python
async with Computer() as computer:
    agent = ComputerAgent(
        model="anthropic/claude-3-5-sonnet-20241022",
        model="anthropic/claude-sonnet-4-5-20250929",
        tools=[computer]
    )
    messages = [{"role": "user", "content": "Take a screenshot"}]

@@ -38,7 +38,7 @@ With the OpenAI provider, prompt caching is handled automatically for prompts of
```python
from agent import ComputerAgent
agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="anthropic/claude-sonnet-4-5-20250929",
    use_prompt_caching=True,
)
```

@@ -32,7 +32,7 @@ Any vision-enabled LiteLLM-compatible model can be used as the planning componen
- Any All‑in‑one CUA (planning-capable). See [All‑in‑one CUAs](./computer-use-agents).
- Any VLM via LiteLLM providers: `anthropic/*`, `openai/*`, `openrouter/*`, `gemini/*`, `vertex_ai/*`, `huggingface-local/*`, `mlx/*`, etc.
- Examples:
  - **Anthropic**: `anthropic/claude-3-5-sonnet-20241022`, `anthropic/claude-opus-4-1-20250805`
  - **Anthropic**: `anthropic/claude-sonnet-4-5-20250929`, `anthropic/claude-opus-4-1-20250805`
  - **OpenAI**: `openai/gpt-5`, `openai/gpt-o3`, `openai/gpt-4o`
  - **Google**: `gemini/gemini-1.5-pro`, `vertex_ai/gemini-pro-vision`
  - **Local models**: Any Hugging Face vision-language model
@@ -41,7 +41,7 @@ Any vision-enabled LiteLLM-compatible model can be used as the planning componen

### GTA1 + GPT-5

Use Google's Gemini for planning with specialized grounding:
Use OpenAI's GPT-5 for planning with specialized grounding:

```python
agent = ComputerAgent(
@@ -59,7 +59,7 @@ Combine state-of-the-art grounding with powerful reasoning:

```python
agent = ComputerAgent(
    "huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-3-5-sonnet-20241022",
    "huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929",
    tools=[computer]
)

@@ -113,7 +113,7 @@ async for _ in agent.run("Close the settings window, then open the Downloads fol
Composed agents support both capabilities:

```python
agent = ComputerAgent("huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-3-5-sonnet-20241022")
agent = ComputerAgent("huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929")

# Full computer-use agent capabilities
async for _ in agent.run("Complete this online form"):

@@ -29,10 +29,9 @@ Claude models with computer-use capabilities:
- Claude 4.1: `claude-opus-4-1-20250805`
- Claude 4: `claude-opus-4-20250514`, `claude-sonnet-4-20250514`
- Claude 3.7: `claude-3-7-sonnet-20250219`
- Claude 3.5: `claude-3-5-sonnet-20241022`

```python
agent = ComputerAgent("claude-3-5-sonnet-20241022", tools=[computer])
agent = ComputerAgent("claude-sonnet-4-5-20250929", tools=[computer])
async for _ in agent.run("Open Firefox and navigate to github.com"):
    pass
```
@@ -78,10 +77,10 @@ async for _ in agent.run("Open Firefox and navigate to github.com"):

Qwen3 VL family:

- `openrouter/qwen/qwen3-vl-235b-a22b-instruct`
- `cua/qwen/qwen3-vl-235b` (via CUA VLM Router - recommended)

```python
agent = ComputerAgent("openrouter/qwen/qwen3-vl-235b-a22b-instruct", tools=[computer])
agent = ComputerAgent("cua/qwen/qwen3-vl-235b", tools=[computer])
async for _ in agent.run("Open Firefox and navigate to github.com"):
    pass
```

@@ -11,10 +11,10 @@ All models that support `ComputerAgent.run()` also support `ComputerAgent.predic

### Anthropic CUAs

- Claude 4.5: `claude-sonnet-4-5-20250929`
- Claude 4.1: `claude-opus-4-1-20250805`
- Claude 4: `claude-opus-4-20250514`, `claude-sonnet-4-20250514`
- Claude 3.7: `claude-3-7-sonnet-20250219`
- Claude 3.5: `claude-3-5-sonnet-20241022`

### OpenAI CUA Preview

@@ -61,7 +61,7 @@ Moondream3 is a powerful small model that can perform UI grounding and click pre

```python
# Using any grounding model for click prediction
agent = ComputerAgent("claude-3-5-sonnet-20241022", tools=[computer])
agent = ComputerAgent("claude-sonnet-4-5-20250929", tools=[computer])

# Predict coordinates for specific elements
login_coords = agent.predict_click("find the login button")
@@ -75,7 +75,7 @@ print(f"Menu icon: {menu_coords}")

```python
# OmniParser is just for OCR, so it requires an LLM for predict_click
agent = ComputerAgent("omniparser+anthropic/claude-3-5-sonnet-20241022", tools=[computer])
agent = ComputerAgent("omniparser+anthropic/claude-sonnet-4-5-20250929", tools=[computer])

# Predict click coordinates using composed agent
coords = agent.predict_click("find the submit button")

@@ -0,0 +1,441 @@
---
title: CUA VLM Router
description: Intelligent vision-language model routing with cost optimization and unified access
---

# CUA VLM Router

The **CUA VLM Router** is an intelligent inference API that provides unified access to multiple vision-language model providers through a single API key. It offers cost optimization and detailed observability for production AI applications.

## Overview

Instead of managing multiple API keys and provider-specific code, CUA VLM Router acts as a smart cloud gateway that:

- **Unifies access** to multiple model providers
- **Optimizes costs** through intelligent routing and provider selection
- **Tracks usage** and costs with detailed metadata
- **Provides observability** with routing decisions and attempt logs
- **Manages infrastructure** - no provider API keys for you to handle yourself

## Quick Start

### 1. Get Your API Key

Sign up at [cua.ai](https://cua.ai/signin) and get your CUA API key from the dashboard.

### 2. Set Environment Variable

```bash
export CUA_API_KEY="sk_cua-api01_..."
```

### 3. Use with Agent SDK

```python
from agent import ComputerAgent
from computer import Computer

computer = Computer(os_type="linux", provider_type="docker")

agent = ComputerAgent(
    model="cua/anthropic/claude-sonnet-4.5",
    tools=[computer],
    max_trajectory_budget=5.0
)

messages = [{"role": "user", "content": "Take a screenshot and tell me what's on screen"}]

async for result in agent.run(messages):
    for item in result["output"]:
        if item["type"] == "message":
            print(item["content"][0]["text"])
```

## Available Models

The CUA VLM Router currently supports these models:

| Model ID | Provider | Description | Best For |
| --- | --- | --- | --- |
| `cua/anthropic/claude-sonnet-4.5` | Anthropic | Claude Sonnet 4.5 | General-purpose tasks, recommended |
| `cua/anthropic/claude-opus-4.5` | Anthropic | Claude Opus 4.5 | Enhanced agentic and computer-use tasks |
| `cua/anthropic/claude-haiku-4.5` | Anthropic | Claude Haiku 4.5 | Fast responses, cost-effective |
| `cua/qwen/qwen3-vl-235b` | Qwen | Qwen3 VL 235B | Large-scale vision-language tasks |

## How It Works

### Intelligent Routing

When you make a request to CUA VLM Router:

1. **Model Resolution**: Your model ID (e.g., `cua/anthropic/claude-sonnet-4.5`) is resolved to the appropriate provider
2. **Provider Selection**: CUA routes your request to the appropriate model provider
3. **Response**: You receive an OpenAI-compatible response with metadata

## API Reference

### Base URL

```
https://inference.cua.ai/v1
```

### Authentication

All requests require an API key in the Authorization header:

```bash
Authorization: Bearer sk_cua-api01_...
```

### Endpoints

#### List Available Models

```bash
GET /v1/models
```

**Response:**

```json
{
  "data": [
    {
      "id": "anthropic/claude-sonnet-4.5",
      "name": "Claude Sonnet 4.5",
      "object": "model",
      "owned_by": "cua"
    }
  ],
  "object": "list"
}
```
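
For a quick check outside the SDK, the endpoint can be called directly; a minimal sketch with `requests` (the endpoint and auth header are as documented above):

```python
import os
import requests

resp = requests.get(
    "https://inference.cua.ai/v1/models",
    headers={"Authorization": f"Bearer {os.environ['CUA_API_KEY']}"},
    timeout=30,
)
resp.raise_for_status()
for model in resp.json()["data"]:
    print(model["id"])  # e.g. "anthropic/claude-sonnet-4.5"
```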

#### Chat Completions

```bash
POST /v1/chat/completions
Content-Type: application/json
```

**Request:**

```json
{
  "model": "anthropic/claude-sonnet-4.5",
  "messages": [{ "role": "user", "content": "Hello!" }],
  "max_tokens": 100,
  "temperature": 0.7,
  "stream": false
}
```

**Response:**

```json
{
  "id": "gen_...",
  "object": "chat.completion",
  "created": 1763554838,
  "model": "anthropic/claude-sonnet-4.5",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "Hello! How can I help you today?"
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 10,
    "completion_tokens": 12,
    "total_tokens": 22,
    "cost": 0.01,
    "is_byok": true
  }
}
```

#### Streaming

Set `"stream": true` to receive server-sent events:

```bash
curl -X POST https://inference.cua.ai/v1/chat/completions \
  -H "Authorization: Bearer sk_cua-api01_..." \
  -H "Content-Type: application/json" \
  -d '{
    "model": "anthropic/claude-sonnet-4.5",
    "messages": [{"role": "user", "content": "Count to 5"}],
    "stream": true
  }'
```

**Response (SSE format):**

```
data: {"id":"gen_...","choices":[{"delta":{"content":"1"}}],"object":"chat.completion.chunk"}

data: {"id":"gen_...","choices":[{"delta":{"content":"\n2"}}],"object":"chat.completion.chunk"}

data: {"id":"gen_...","choices":[{"delta":{"content":"\n3\n4\n5"}}],"object":"chat.completion.chunk"}

data: {"id":"gen_...","choices":[{"delta":{},"finish_reason":"stop"}],"usage":{...}}
```
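
Since the responses are OpenAI-compatible, the stream can also be consumed with the official `openai` Python client pointed at the router's base URL (a sketch; client version 1.x assumed):

```python
import os
from openai import OpenAI

client = OpenAI(
    base_url="https://inference.cua.ai/v1",
    api_key=os.environ["CUA_API_KEY"],
)

stream = client.chat.completions.create(
    model="anthropic/claude-sonnet-4.5",
    messages=[{"role": "user", "content": "Count to 5"}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
```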

#### Check Balance

```bash
GET /v1/balance
```

**Response:**

```json
{
  "balance": 211689.85,
  "currency": "credits"
}
```
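
The same direct-call pattern works here, for example to warn before starting an expensive run (a sketch with `requests`; the threshold is arbitrary):

```python
import os
import requests

resp = requests.get(
    "https://inference.cua.ai/v1/balance",
    headers={"Authorization": f"Bearer {os.environ['CUA_API_KEY']}"},
    timeout=30,
)
resp.raise_for_status()
balance = resp.json()["balance"]
if balance < 100:
    print(f"Low balance: {balance} credits")
```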

## Cost Tracking

CUA VLM Router provides detailed cost information in every response:

### Credit System

Requests are billed in **credits**:

- Credits are deducted from your CUA account balance
- Prices vary by model and usage
- CUA manages all provider API keys and infrastructure

### Response Cost Fields

```json
{
  "usage": {
    "cost": 0.01, // CUA gateway cost in credits
    "market_cost": 0.000065 // Actual upstream API cost
  }
}
```

**Note:** CUA VLM Router is a fully managed cloud service. If you want to use your own provider API keys directly (BYOK), see the [Supported Model Providers](/agent-sdk/supported-model-providers/) page for direct provider access via the agent SDK.

## Response Metadata

CUA VLM Router includes metadata about routing decisions and costs in the response. This information helps with debugging and monitoring your application's model usage.

## Configuration

### Environment Variables

```bash
# Required: Your CUA API key
export CUA_API_KEY="sk_cua-api01_..."

# Optional: Custom endpoint (defaults to https://inference.cua.ai/v1)
export CUA_BASE_URL="https://custom-endpoint.cua.ai/v1"
```

### Python SDK Configuration

```python
from agent import ComputerAgent

# Using environment variables (recommended)
agent = ComputerAgent(model="cua/anthropic/claude-sonnet-4.5")

# Or explicit configuration
agent = ComputerAgent(
    model="cua/anthropic/claude-sonnet-4.5",
    # CUA adapter automatically loads from CUA_API_KEY
)
```

## Benefits Over Direct Provider Access

| Feature | CUA VLM Router | Direct Provider (BYOK) |
| --- | --- | --- |
| **Single API Key** | ✅ One key for all providers | ❌ Multiple keys to manage |
| **Managed Infrastructure** | ✅ No API key management | ❌ Manage multiple provider keys |
| **Usage Tracking** | ✅ Unified dashboard | ❌ Per-provider tracking |
| **Model Switching** | ✅ Change model string only | ❌ Change code + keys |
| **Setup Complexity** | ✅ One environment variable | ❌ Multiple environment variables |

## Error Handling

### Common Error Responses

#### Insufficient Credits

```json
{
  "detail": "Insufficient credits. Current balance: 0.00 credits"
}
```

#### Missing Authorization

```json
{
  "detail": "Missing Authorization: Bearer token"
}
```

#### Invalid Model

```json
{
  "detail": "Invalid or unavailable model"
}
```

### Best Practices

1. **Check balance periodically** using `/v1/balance`
2. **Handle rate limits** with exponential backoff (see the sketch below)
3. **Log generation IDs** for debugging
4. **Set up usage alerts** in your CUA dashboard
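
For item 2, a retry wrapper with exponential backoff might look like the sketch below; the HTTP 429 status for rate limiting is an assumption about the router, not documented behavior:

```python
import time
import requests

def post_with_backoff(url, headers, payload, max_retries=5):
    """Retry on assumed rate-limit responses, doubling the delay each time."""
    delay = 1.0
    for _ in range(max_retries):
        resp = requests.post(url, headers=headers, json=payload, timeout=60)
        if resp.status_code != 429:  # assumed rate-limit status code
            return resp
        time.sleep(delay)
        delay *= 2  # exponential backoff
    raise RuntimeError("Still rate limited after retries")
```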

## Examples

### Basic Usage

```python
from agent import ComputerAgent
from computer import Computer

computer = Computer(os_type="linux", provider_type="docker")

agent = ComputerAgent(
    model="cua/anthropic/claude-sonnet-4.5",
    tools=[computer]
)

messages = [{"role": "user", "content": "Open Firefox"}]

async for result in agent.run(messages):
    print(result)
```

### Direct API Call (curl)

```bash
curl -X POST https://inference.cua.ai/v1/chat/completions \
  -H "Authorization: Bearer ${CUA_API_KEY}" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "anthropic/claude-sonnet-4.5",
    "messages": [
      {"role": "user", "content": "Explain quantum computing"}
    ],
    "max_tokens": 200
  }'
```

### With Custom Parameters

```python
agent = ComputerAgent(
    model="cua/anthropic/claude-haiku-4.5",
    tools=[computer],
    max_trajectory_budget=10.0,
    temperature=0.7
)
```

### Using Qwen3 VL 235B

```python
from agent import ComputerAgent
from computer import Computer

computer = Computer(os_type="linux", provider_type="docker")

agent = ComputerAgent(
    model="cua/qwen/qwen3-vl-235b",
    tools=[computer],
    only_n_most_recent_images=3
)

messages = [{"role": "user", "content": "Open a browser and search for Python tutorials"}]

async for result in agent.run(messages):
    print(result)
```

### Using Claude Opus 4.5

```python
from agent import ComputerAgent
from computer import Computer

computer = Computer(
    os_type="linux",
    provider_type="cloud",
    name="your-container-name",
    api_key="your-cua-api-key"
)

agent = ComputerAgent(
    model="cua/anthropic/claude-opus-4.5",
    tools=[computer],
    instructions="You are a helpful assistant that can control computers",
    only_n_most_recent_images=3
)

messages = [{"role": "user", "content": "Open a browser and search for Python tutorials"}]

async for result in agent.run(messages):
    print(result)
```

## Migration from Direct Provider Access

Switching from direct provider access (BYOK) to CUA VLM Router is simple:

**Before (Direct Provider Access with BYOK):**

```python
import os
# Required: Provider-specific API key
os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..."

agent = ComputerAgent(
    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer]
)
```

**After (CUA VLM Router - Cloud Service):**

```python
import os
# Required: CUA API key only (no provider keys needed)
os.environ["CUA_API_KEY"] = "sk_cua-api01_..."

agent = ComputerAgent(
    model="cua/anthropic/claude-sonnet-4.5",  # Add "cua/" prefix
    tools=[computer]
)
```

That's it! Same code structure, just a different model string. CUA manages all provider infrastructure and credentials for you.

## Support

- **Documentation**: [cua.ai/docs](https://cua.ai/docs)
- **Discord**: [Join our community](https://discord.com/invite/mVnXXpdE85)
- **Issues**: [GitHub Issues](https://github.com/trycua/cua/issues)

## Next Steps

- Explore [Agent Loops](/agent-sdk/agent-loops) to customize agent behavior
- Learn about [Cost Saving Callbacks](/agent-sdk/callbacks/cost-saving)
- Try [Example Use Cases](/example-usecases/form-filling)
- Review [Supported Model Providers](/agent-sdk/supported-model-providers/) for all options
@@ -4,23 +4,51 @@ title: Supported Model Providers

## Supported Models

### Anthropic Claude (Computer Use API)
### CUA VLM Router (Recommended)

Use CUA's cloud inference API for intelligent routing and cost optimization with a single API key. CUA manages all provider infrastructure and credentials for you.

```python
model="cua/anthropic/claude-sonnet-4.5"  # Claude Sonnet 4.5 (recommended)
model="cua/anthropic/claude-haiku-4.5"  # Claude Haiku 4.5 (faster)
```

**Benefits:**

- Single API key for multiple providers
- Cost tracking and optimization
- Fully managed infrastructure (no provider keys to manage)

[Learn more about CUA VLM Router →](/agent-sdk/supported-model-providers/cua-vlm-router)

---

### Anthropic Claude (Computer Use API - BYOK)

Direct access to Anthropic's Claude models using your own Anthropic API key (BYOK - Bring Your Own Key).

```python
model="anthropic/claude-3-5-sonnet-20241022"
model="anthropic/claude-3-7-sonnet-20250219"
model="anthropic/claude-opus-4-20250514"
model="anthropic/claude-sonnet-4-20250514"
```

### OpenAI Computer Use Preview
**Setup:** Set the `ANTHROPIC_API_KEY` environment variable to your Anthropic API key.

### OpenAI Computer Use Preview (BYOK)

Direct access to OpenAI's computer use models using your own OpenAI API key (BYOK).

```python
model="openai/computer-use-preview"
```

**Setup:** Set the `OPENAI_API_KEY` environment variable to your OpenAI API key.

### UI-TARS (Local or Huggingface Inference)

Run UI-TARS models locally for privacy and offline use.

```python
model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"
model="ollama_chat/0000/ui-tars-1.5-7b"
@@ -28,9 +56,11 @@ model="ollama_chat/0000/ui-tars-1.5-7b"

### Omniparser + Any LLM

Combine Omniparser for UI understanding with any LLM provider.

```python
model="omniparser+ollama_chat/mistral-small3.2"
model="omniparser+vertex_ai/gemini-pro"
model="omniparser+anthropic/claude-3-5-sonnet-20241022"
model="omniparser+anthropic/claude-sonnet-4-5-20250929"
model="omniparser+openai/gpt-4o"
```

@@ -1,84 +1,74 @@
---
title: Telemetry
description: This document explains how telemetry works in CUA libraries and how you can control it.
icon: RadioTower
description: How telemetry works in Cua and how to control it
---

# Telemetry in CUA
# Telemetry

CUA tracks anonymized usage and error report statistics; we subscribe to Posthog's approach as detailed [here](https://posthog.com/blog/open-source-telemetry-ethical). If you would like to opt out of sending anonymized info, you can set `telemetry_enabled` to false.
Cua collects anonymized usage and error statistics. We follow [Posthog's ethical telemetry approach](https://posthog.com/blog/open-source-telemetry-ethical). To opt out, set `telemetry_enabled` to false.

## What telemetry data we collect
## What we collect

CUA libraries collect usage data to help improve our software. We have two categories of telemetry:
### Enabled by default (opt-out)

### Opt-Out Telemetry (Enabled by Default)
- System info: OS, OS version, Python version
- Module initialization: When modules are imported and their versions
- Performance: Agent run durations, step counts, token usage, API costs
- Session tracking: Anonymous session IDs and run IDs

Basic performance metrics and system information that help us understand usage patterns:
### Disabled by default (opt-in)

- **System Information**: Operating system, OS version, Python version
- **Module Initialization**: When modules are imported and their versions
- **Performance Metrics**: Agent run durations, step counts, token usage, and API costs
- **Session Tracking**: Anonymous session IDs and run IDs for performance analysis

### Opt-In Telemetry (Disabled by Default)

**Conversation Trajectory Logging**: Full conversation history including:
**Trajectory logging** captures full conversation history:

- User messages and agent responses
- Computer actions and their outputs
- Reasoning traces from the agent
- Computer actions and outputs
- Agent reasoning traces

**Important**: Trajectory logging is **opt-in only** and must be explicitly enabled.
Must be explicitly enabled.

### We do NOT collect:
### We don't collect

- Personal information or user identifiers
- API keys or credentials
- File contents or application data
- Information about files being accessed
- Actual screenshots or screen contents (unless trajectory logging is enabled)
- Specific text being typed, including user inputs, model outputs, computer outputs, or tool call outputs (unless trajectory logging is enabled)
- Files being accessed
- Screenshots or screen contents (unless trajectory logging is enabled)
- Text being typed, user inputs, model outputs, computer outputs, or tool call outputs (unless trajectory logging is enabled)

## Controlling Telemetry
## How to disable

We are committed to transparency and user control over telemetry. There are two ways to control telemetry:
### Environment variable (global)

### 1. Environment Variable (Global Control)

Telemetry is enabled by default. To disable telemetry, set the `CUA_TELEMETRY_ENABLED` environment variable to a falsy value (`0`, `false`, `no`, or `off`):
Set `CUA_TELEMETRY_ENABLED` to a falsy value (`0`, `false`, `no`, or `off`):

```bash
# Disable telemetry before running your script
export CUA_TELEMETRY_ENABLED=false

# Or as part of the command
CUA_TELEMETRY_ENABLED=0 python your_script.py
```

Or from Python:
Or in Python:

```python
import os
os.environ["CUA_TELEMETRY_ENABLED"] = "false"
```

### 2. Instance-Level Control
<Callout type="info">
  **Deprecated environment variables:** The environment variables `CUA_TELEMETRY` and
  `CUA_TELEMETRY_DISABLED` are deprecated and no longer have any effect. Use `CUA_TELEMETRY_ENABLED`
  instead.
</Callout>

#### Computer SDK
### Per instance

**Computer SDK:**

```python
from computer import Computer

# Enable telemetry (default)
computer = Computer(telemetry_enabled=True)

# Disable telemetry
computer = Computer(telemetry_enabled=False)
```

#### Agent SDK
**Agent SDK:**

```python
from agent import ComputerAgent
@@ -86,60 +76,60 @@ import os

# Basic telemetry - performance metrics only (opt-out, enabled by default)
agent = ComputerAgent(
    model="claude-3-5-sonnet-20241022",
    model="claude-sonnet-4-5-20250929",
    telemetry_enabled=True  # Default is True
)

# Enable telemetry with full conversation trajectory logging (opt-in)
agent = ComputerAgent(
    model="claude-3-5-sonnet-20241022",
    model="claude-sonnet-4-5-20250929",
    telemetry_enabled={
        "log_trajectory": True  # Logs full conversation items
    }
)

# Disable telemetry completely
# Disable completely
agent = ComputerAgent(
    model="claude-3-5-sonnet-20241022",
    model="claude-sonnet-4-5-20250929",
    telemetry_enabled=False
)

# Disable telemetry completely using environment variables
os.environ["CUA_TELEMETRY_ENABLED"] = "false"
# Enable trajectory logging (opt-in)
agent = ComputerAgent(
    model="claude-3-5-sonnet-20241022"
    model="claude-sonnet-4-5-20250929",
    telemetry_enabled={"log_trajectory": True}
)
```

You can check if telemetry is enabled for an instance:
Check status:

```python
print(computer.telemetry_enabled)  # Will print True or False
print(agent.telemetry_enabled)  # Will print True, False, or dict
print(computer.telemetry_enabled)  # True or False
print(agent.telemetry_enabled)  # True, False, or dict
```

Note that telemetry settings must be configured during initialization and cannot be changed after the object is created.
Telemetry settings are configured at initialization and can't be changed afterward.

## Detailed Telemetry Events
## Events collected

### Computer SDK Events
### Computer SDK

| Event Name | Data Collected | Trigger Notes |
| --- | --- | --- |
| **computer_initialized** | • `os`: Operating system (e.g., 'windows', 'darwin', 'linux')<br />• `os_version`: OS version<br />• `python_version`: Python version | Triggered when a Computer instance is created |
| **module_init** | • `module`: "computer"<br />• `version`: Package version<br />• `python_version`: Full Python version string | Triggered once when the computer package is imported for the first time |

### Agent SDK Events
### Agent SDK

| Event Name | Data Collected | Trigger Notes |
| --- | --- | --- |
| **module_init** | • `module`: "agent"<br />• `version`: Package version<br />• `python_version`: Full Python version string | Triggered once when the agent package is imported for the first time |
| **agent_session_start** | • `session_id`: Unique UUID for this agent instance<br />• `agent_type`: Class name (e.g., "ComputerAgent")<br />• `model`: Model name (e.g., "claude-3-5-sonnet")<br />• `os`: Operating system<br />• `os_version`: OS version<br />• `python_version`: Python version | Triggered when TelemetryCallback is initialized (agent instantiation) |
| **agent_session_start** | • `session_id`: Unique UUID for this agent instance<br />• `agent_type`: Class name (e.g., "ComputerAgent")<br />• `model`: Model name (e.g., "claude-sonnet-4-5")<br />• `os`: Operating system<br />• `os_version`: OS version<br />• `python_version`: Python version | Triggered when TelemetryCallback is initialized (agent instantiation) |
| **agent_run_start** | • `session_id`: Agent session UUID<br />• `run_id`: Unique UUID for this run<br />• `start_time`: Unix timestamp<br />• `input_context_size`: Character count of input messages<br />• `num_existing_messages`: Count of existing messages<br />• `uploaded_trajectory`: Full conversation items (opt-in) | Triggered at the start of each agent.run() call |
| **agent_run_end** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `end_time`: Unix timestamp<br />• `duration_seconds`: Total run duration<br />• `num_steps`: Total steps taken in this run<br />• `total_usage`: Accumulated token usage and costs<br />• `uploaded_trajectory`: Full conversation items (opt-in) | Triggered at the end of each agent.run() call |
| **agent_step** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `step`: Step number (incremental)<br />• `timestamp`: Unix timestamp<br />• `duration_seconds`: Duration of previous step | Triggered on each agent response/step during a run |
| **agent_usage** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `step`: Current step number<br />• `prompt_tokens`: Tokens in prompt<br />• `completion_tokens`: Tokens in response<br />• `total_tokens`: Total tokens used<br />• `response_cost`: Cost of this API call | Triggered whenever usage information is received from LLM API |

## Transparency
## Questions

We believe in being transparent about the data we collect. If you have any questions about our telemetry practices, please open an issue on our GitHub repository.
Questions about telemetry? Open an issue on our [GitHub repository](https://github.com/trycua/cua).
@@ -1,32 +1,32 @@
|
||||
---
|
||||
title: Cloud VM Management
|
||||
description: Manage your Cua Cloud sandboxes (VMs) via Python SDK or HTTP API
|
||||
title: Cloud Sandbox Management
|
||||
description: Manage your Cua Cloud sandboxes via Python SDK or HTTP API
|
||||
---
|
||||
|
||||
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
|
||||
|
||||
Using the Cua Cloud API, you can manage your Cua Cloud sandboxes (VMs) with Python or HTTP (curl).
|
||||
Using the Cua Cloud API, you can manage your Cua Cloud sandboxes with Python or HTTP (curl).
|
||||
|
||||
All examples require a CUA API key. You can obtain one from the [Dashboard](https://www.cua.ai/dashboard/keys).
|
||||
|
||||
---
|
||||
|
||||
## List VMs
|
||||
## List Sandboxes
|
||||
|
||||
<Tabs items={['Python', 'curl']}>
|
||||
<Tab value="Python">
|
||||
|
||||
```python
|
||||
import os
|
||||
import asyncio
|
||||
from computer.providers.cloud.provider import CloudProvider
|
||||
|
||||
async def main():
|
||||
api_key = os.getenv("CUA_API_KEY") or "your-api-key"
|
||||
# CloudProvider automatically reads CUA_API_KEY from environment
|
||||
# You can also pass api_key explicitly: CloudProvider(api_key="your-api-key")
|
||||
# Optional: point to a different API base
|
||||
# os.environ["CUA_API_BASE"] = "https://api.cua.ai"
|
||||
|
||||
provider = CloudProvider(api_key=api_key, verbose=False)
|
||||
provider = CloudProvider(verbose=False)
|
||||
async with provider:
|
||||
vms = await provider.list_vms()
|
||||
for vm in vms:
|
||||
@@ -51,7 +51,7 @@ curl -H "Authorization: Bearer $CUA_API_KEY" \

Responses:

- 200: Array of minimal VM objects with fields `{ name, password, status }`
- 200: Array of minimal sandbox objects with fields `{ name, password, status }`
- 401: Unauthorized (missing/invalid API key)

```json
@@ -66,11 +66,11 @@ Responses:

Status values:

- `pending`: VM deployment in progress
- `running`: VM is active and accessible
- `stopped`: VM is stopped but not terminated
- `terminated`: VM has been permanently destroyed
- `failed`: VM deployment or operation failed
- `pending`: Sandbox deployment in progress
- `running`: Sandbox is active and accessible
- `stopped`: Sandbox is stopped but not terminated
- `terminated`: Sandbox has been permanently destroyed
- `failed`: Sandbox deployment or operation failed
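As a minimal sketch of how these status values can drive automation (assuming, per the response fields above, that `list_vms()` returns items with `name` and `status` keys; the helper name is ours):

```python
import asyncio
from computer.providers.cloud.provider import CloudProvider

async def wait_until_running(name: str, poll_seconds: float = 5.0) -> None:
    """Poll list_vms() until the named sandbox reports "running"."""
    provider = CloudProvider()  # reads CUA_API_KEY from environment
    async with provider:
        while True:
            vms = await provider.list_vms()
            status = next((vm["status"] for vm in vms if vm["name"] == name), None)
            if status == "running":
                return
            if status in ("terminated", "failed"):
                raise RuntimeError(f"Sandbox {name} ended in status {status!r}")
            await asyncio.sleep(poll_seconds)
```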
---

@@ -80,23 +80,22 @@ Status values:

---

## Start a VM
## Start a Sandbox

Provide the VM name you want to start.
Provide the sandbox name you want to start.

<Tabs items={["Python", "curl"]}>
  <Tab value="Python">

```python
import os
import asyncio
from computer.providers.cloud.provider import CloudProvider

async def main():
    api_key = os.getenv("CUA_API_KEY") or "your-api-key"
    # CloudProvider automatically reads CUA_API_KEY from environment
    name = "my-vm-name"  # e.g., "m-linux-96lcxd2c2k"

    provider = CloudProvider(api_key=api_key)
    provider = CloudProvider()
    async with provider:
        resp = await provider.run_vm(name)
        print(resp)  # { "name": name, "status": "starting" }
@@ -118,7 +117,7 @@ Responses:

- 204: No Content (start accepted)
- 401: Unauthorized (missing/invalid API key)
- 404: VM not found or not owned by the user
- 404: Sandbox not found or not owned by the user

```text
HTTP/1.1 204 No Content
@@ -129,23 +128,22 @@ HTTP/1.1 204 No Content

---

## Stop a VM
## Stop a Sandbox

Stops the VM asynchronously.
Stops the sandbox asynchronously.

<Tabs items={["Python", "curl"]}>
  <Tab value="Python">

```python
import os
import asyncio
from computer.providers.cloud.provider import CloudProvider

async def main():
    api_key = os.getenv("CUA_API_KEY") or "your-api-key"
    # CloudProvider automatically reads CUA_API_KEY from environment
    name = "my-vm-name"

    provider = CloudProvider(api_key=api_key)
    provider = CloudProvider()
    async with provider:
        resp = await provider.stop_vm(name)
        print(resp)  # { "name": name, "status": "stopping" }
@@ -167,7 +165,7 @@ Responses:

- 202: Accepted with `{ "status": "stopping" }`
- 401: Unauthorized (missing/invalid API key)
- 404: VM not found or not owned by the user
- 404: Sandbox not found or not owned by the user

```json
{ "status": "stopping" }
@@ -178,23 +176,22 @@ Responses:

---

## Restart a VM
## Restart a Sandbox

Restarts the VM asynchronously.
Restarts the sandbox asynchronously.

<Tabs items={["Python", "curl"]}>
  <Tab value="Python">

```python
import os
import asyncio
from computer.providers.cloud.provider import CloudProvider

async def main():
    api_key = os.getenv("CUA_API_KEY") or "your-api-key"
    # CloudProvider automatically reads CUA_API_KEY from environment
    name = "my-vm-name"

    provider = CloudProvider(api_key=api_key)
    provider = CloudProvider()
    async with provider:
        resp = await provider.restart_vm(name)
        print(resp)  # { "name": name, "status": "restarting" }
@@ -216,7 +213,7 @@ Responses:

- 202: Accepted with `{ "status": "restarting" }`
- 401: Unauthorized (missing/invalid API key)
- 404: VM not found or not owned by the user
- 404: Sandbox not found or not owned by the user

```json
{ "status": "restarting" }
@@ -227,23 +224,22 @@ Responses:

---

## Query a VM by name
## Query a Sandbox by name

Query the computer-server running on the VM. Useful for checking details like status or OS type.
Query the computer-server running on the sandbox. Useful for checking details like status or OS type.

<Tabs items={["Python", "curl"]}>
  <Tab value="Python">

```python
import os
import asyncio
from computer.providers.cloud.provider import CloudProvider

async def main():
    api_key = os.getenv("CUA_API_KEY") or "your-api-key"
    # CloudProvider automatically reads CUA_API_KEY from environment
    name = "my-vm-name"

    provider = CloudProvider(api_key=api_key)
    provider = CloudProvider()
    async with provider:
        info = await provider.get_vm(name)
        print(info)

@@ -18,7 +18,7 @@ Execute shell commands and get detailed results:

# Run shell command
result = await computer.interface.run_command(cmd)  # result.stdout, result.stderr, result.returncode
```
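For example, a quick illustrative check of the result fields named above:

```python
# Run a command and inspect the captured streams and exit code
result = await computer.interface.run_command("uname -a")
print(result.stdout)      # command output
print(result.returncode)  # 0 on success
if result.returncode != 0:
    print(result.stderr)  # diagnostics on failure
```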
</Tab>
<Tab value="TypeScript">

@@ -230,7 +230,7 @@ Control desktop environment features like wallpaper:

env = await computer.interface.get_desktop_environment()
print(env)  # "xfce4"

# Set desktop wallpaper to an image file accessible on the VM
# Set desktop wallpaper to an image file accessible on the sandbox
await computer.interface.set_wallpaper("/home/cua/shared/wallpaper.png")
```

@@ -241,7 +241,7 @@ Control desktop environment features like wallpaper:

const env = await computer.interface.getDesktopEnvironment();
console.log(env); // "xfce4"

// Set desktop wallpaper to an image file accessible on the VM
// Set desktop wallpaper to an image file accessible on the sandbox
await computer.interface.setWallpaper('/home/cua/shared/wallpaper.png');
```

@@ -1,7 +1,12 @@
---
title: Computer UI
title: Computer UI (Deprecated)
---

<Callout type="warn" title="Deprecated">
  The Computer UI is deprecated and will be replaced with a revamped playground experience soon. We
  recommend using VNC or Screen Sharing for precise control of the computer instead.
</Callout>

The computer module includes a Gradio UI for creating and sharing demonstration data. We make it easy for people to build community datasets for better computer-use models with an upload-to-Hugging Face feature.

```bash

@@ -1,29 +1,20 @@
---
title: Cua Computers
title: Computer Types
description: Understanding Cua computer types and connection methods
---

<Callout>
  A corresponding{' '}
  <a href="https://github.com/trycua/cua/blob/main/notebooks/computer_nb.ipynb" target="_blank">
    Jupyter Notebook
  </a>{' '}
  and{' '}
  <a href="https://github.com/trycua/cua/tree/main/examples/computer-example-ts" target="_blank">
    NodeJS project
  </a>{' '}
  are available for this documentation.
</Callout>
{/* prettier-ignore */}
<Callout>A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/computer_nb.ipynb" target="_blank">Jupyter Notebook</a> and <a href="https://github.com/trycua/cua/tree/main/examples/computer-example-ts" target="_blank">NodeJS project</a> are available for this documentation.</Callout>

Before we can automate apps using AI, we first need to connect to a Computer Server to give the AI a safe environment to execute workflows in.

Cua Computers are preconfigured virtual machines running the Computer Server. They can be either macOS, Linux, or Windows. They're found in either a cloud-native container, or on your host desktop.
Cua Computers are preconfigured sandboxes running the Computer Server. They can be macOS, Linux, or Windows, and run either in a cloud-native sandbox or on your host desktop.
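For orientation before the per-environment details below, here is a minimal connection sketch (the sandbox name and API key are placeholders; the same `Computer` constructor arguments appear throughout this page):

```python
from computer import Computer

# Cloud Sandbox (works from any host OS); os_type selects the guest OS
computer = Computer(
    os_type="linux",
    provider_type="cloud",
    name="your-sandbox-name",   # placeholder
    api_key="sk_cua-api01...",  # placeholder
)
await computer.run()  # launch & connect to the Computer Server
```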
## Cloud Sandbox

**Easiest & safest way to get started - works on any host OS**

This is a Cloud Sandbox running the Computer Server. Get a container at [cua.ai](https://cua.ai/).
This is a Cloud Sandbox running the Computer Server. Get a sandbox at [cua.ai](https://cua.ai/).

<Tabs items={['Python', 'TypeScript']}>
  <Tab value="Python">
@@ -85,7 +76,7 @@ Cua provides two Docker images for running Linux desktops:
    os_type="linux",
    provider_type="docker",
    image="trycua/cua-xfce:latest",
    name="my-xfce-container"
    name="my-xfce-sandbox"
)

await computer.run()  # Launch & connect to Docker sandbox
@@ -118,7 +109,7 @@ Cua provides two Docker images for running Linux desktops:
    os_type="linux",
    provider_type="docker",
    image="trycua/cua-ubuntu:latest",
    name="my-kasm-container"
    name="my-kasm-sandbox"
)

await computer.run()  # Launch & connect to Docker sandbox
@@ -152,7 +143,7 @@ computer = Computer(
await computer.run()  # Launch & connect to Windows Sandbox
```

## macOS VM
## macOS Sandbox

**macOS hosts only - requires Lume CLI**

@@ -162,7 +153,7 @@ await computer.run()  # Launch & connect to Windows Sandbox
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)"
```

2. Start a local Cua macOS VM
2. Start a local Cua macOS sandbox

```bash
lume run macos-sequoia-cua:latest

@@ -34,7 +34,7 @@ You can then use this as a tool for your agent:
from agent import ComputerAgent

agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="cua/anthropic/claude-sonnet-4.5",
    tools=[custom_computer],
)

@@ -122,7 +122,7 @@ class MyCustomComputer(AsyncComputerHandler):
custom_computer = MyCustomComputer()

agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    model="cua/anthropic/claude-sonnet-4.5",
    tools=[custom_computer],
)

@@ -1,5 +1,12 @@
{
  "title": "Computer SDK",
  "description": "Build computer-using agents with the Computer SDK",
  "pages": ["computers", "commands", "computer-ui", "tracing-api", "sandboxed-python"]
  "pages": [
    "computers",
    "commands",
    "tracing-api",
    "sandboxed-python",
    "custom-computer-handlers",
    "computer-ui"
  ]
}

@@ -33,7 +33,7 @@ def read_file(location: str) -> str:
        return f.read()

async def main():
    async with Computer(os_type="linux", provider_type="cloud", name="my-container", api_key="...") as computer:
    async with Computer(os_type="linux", provider_type="cloud", name="my-sandbox", api_key="...") as computer:
        # Call the sandboxed function (runs remotely)
        result = await read_file("/etc/hostname")
        print(result)
@@ -60,7 +60,7 @@ await my_computer.venv_install("myenv", ["requests"])

You can use sandboxed functions to interact with macOS applications on a local Cua Computer (requires `os_type="darwin"`). This is particularly useful for automation tasks that involve GUI applications.

```python
# Example: Use sandboxed functions to execute code in a Cua Container
# Example: Use sandboxed functions to execute code in a Cua Sandbox
from computer.helpers import sandboxed

await computer.venv_install("demo_venv", ["macos-pyxa"])  # Install packages in a virtual environment
@@ -71,10 +71,10 @@ def greet_and_print(name):
    import PyXA
    safari = PyXA.Application("Safari")
    html = safari.current_document.source()
    print(f"Hello from inside the container, {name}!")
    print(f"Hello from inside the sandbox, {name}!")
    return {"greeted": name, "safari_html": html}

# When a @sandboxed function is called, it will execute in the container
# When a @sandboxed function is called, it will execute in the sandbox
result = await greet_and_print("Cua")
# Result: {"greeted": "Cua", "safari_html": "<html>...</html>"}
# stdout and stderr are also captured and printed / raised

@@ -7,11 +7,6 @@ description: Record computer interactions for debugging, training, and analysis

The Computer tracing API provides a powerful way to record computer interactions for debugging, training, analysis, and compliance purposes. Inspired by Playwright's tracing functionality, it offers flexible recording options and standardized output formats.

<Callout>
  The tracing API addresses GitHub issue #299 by providing a unified recording interface that works
  with any Computer usage pattern, not just ComputerAgent.
</Callout>

## Overview

The tracing API allows you to:
@@ -1,9 +1,9 @@
---
title: Form Filling
title: PDF to Form Automation
description: Enhance and Automate Interactions Between Form Filling and Local File Systems
---

import { EditableCodeBlock, EditableValue, S } from '@/components/editable-code-block';
import { Step, Steps } from 'fumadocs-ui/components/steps';
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';

## Overview
@@ -12,9 +12,17 @@ Cua can be used to automate interactions between form filling and local file sys

This preset usecase uses [Cua Computer](/computer-sdk/computers) to interact with a web page and local file systems along with [Agent Loops](/agent-sdk/agent-loops) to run the agent in a loop with message history.

## Quickstart
---

Create a `requirements.txt` file with the following dependencies:
<Steps>

<Step>

### Set Up Your Environment

First, install the required dependencies:

Create a `requirements.txt` file:

```text
cua-agent
@@ -22,33 +30,32 @@ cua-computer
python-dotenv>=1.0.0
```

And install:
Install the dependencies:

```bash
pip install -r requirements.txt
```

Create a `.env` file with the following environment variables:
Create a `.env` file with your API keys:

```text
ANTHROPIC_API_KEY=your-api-key
ANTHROPIC_API_KEY=your-anthropic-api-key
CUA_API_KEY=sk_cua-api01...
```

Select the environment you want to run the code in (_click on the underlined values in the code to edit them directly!_):
</Step>

<Tabs items={['☁️ Cloud', '🐳 Docker', '🍎 Lume', '🪟 Windows Sandbox']}>
  <Tab value="☁️ Cloud">
<Step>

<EditableCodeBlock
  key="cloud-tab"
  lang="python"
  defaultValues={{
    "container-name": "m-linux-...",
    "api_key": "sk_cua-api01..."
  }}
>
{`import asyncio
### Create Your Form Filling Script

Create a Python file (e.g., `form_filling.py`) and select your environment:

<Tabs items={['Cloud Sandbox', 'Linux on Docker', 'macOS Sandbox', 'Windows Sandbox']}>
  <Tab value="Cloud Sandbox">

```python
import asyncio
import logging
import os
import signal
@@ -59,24 +66,24 @@ from computer import Computer, VMProviderType
from dotenv import load_dotenv

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(**name**)
logger = logging.getLogger(__name__)

def handle_sigint(sig, frame):
    print("\\n\\nExecution interrupted by user. Exiting gracefully...")
    exit(0)
    print("\n\nExecution interrupted by user. Exiting gracefully...")
    exit(0)

async def fill_application():
    try:
        async with Computer(
            os_type="linux",
            provider_type=VMProviderType.CLOUD,
            name="`}<EditableValue placeholder="container-name" />{`",
            api_key="`}<EditableValue placeholder="api_key" />{`",
            verbosity=logging.INFO,
        ) as computer:
    try:
        async with Computer(
            os_type="linux",
            provider_type=VMProviderType.CLOUD,
            name="your-sandbox-name",  # Replace with your sandbox name
            api_key=os.environ["CUA_API_KEY"],
            verbosity=logging.INFO,
        ) as computer:

            agent = ComputerAgent(
                model="anthropic/claude-3-5-sonnet-20241022",
                model="cua/anthropic/claude-sonnet-4.5",
                tools=[computer],
                only_n_most_recent_images=3,
                verbosity=logging.INFO,
@@ -93,7 +100,7 @@ verbosity=logging.INFO,
            history = []

            for i, task in enumerate(tasks, 1):
                print(f"\\n[Task {i}/{len(tasks)}] {task}")
                print(f"\n[Task {i}/{len(tasks)}] {task}")

                # Add user message to history
                history.append({"role": "user", "content": task})
@@ -116,7 +123,7 @@ verbosity=logging.INFO,

                print(f"✅ Task {i}/{len(tasks)} completed")

            print("\\n🎉 All tasks completed successfully!")
            print("\n🎉 All tasks completed successfully!")

    except Exception as e:
        logger.error(f"Error in fill_application: {e}")
@@ -124,18 +131,18 @@ verbosity=logging.INFO,
        raise

def main():
    try:
        load_dotenv()
    try:
        load_dotenv()

        if "ANTHROPIC_API_KEY" not in os.environ:
            raise RuntimeError(
                "Please set the ANTHROPIC_API_KEY environment variable.\\n"
                "Please set the ANTHROPIC_API_KEY environment variable.\n"
                "You can add it to a .env file in the project root."
            )

        if "CUA_API_KEY" not in os.environ:
            raise RuntimeError(
                "Please set the CUA_API_KEY environment variable.\\n"
                "Please set the CUA_API_KEY environment variable.\n"
                "You can add it to a .env file in the project root."
            )

@@ -147,22 +154,15 @@ load_dotenv()
        logger.error(f"Error running automation: {e}")
        traceback.print_exc()

if **name** == "**main**":
    main()`}

</EditableCodeBlock>
if __name__ == "__main__":
    main()
```

</Tab>
<Tab value="🍎 Lume">
<Tab value="Linux on Docker">

<EditableCodeBlock
  key="lume-tab"
  lang="python"
  defaultValues={{
    "container-name": "macos-sequoia-cua:latest"
  }}
>
{`import asyncio
```python
import asyncio
import logging
import os
import signal
@@ -173,23 +173,23 @@ from computer import Computer, VMProviderType
from dotenv import load_dotenv

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(**name**)
logger = logging.getLogger(__name__)

def handle_sigint(sig, frame):
    print("\\n\\nExecution interrupted by user. Exiting gracefully...")
    exit(0)
    print("\n\nExecution interrupted by user. Exiting gracefully...")
    exit(0)

async def fill_application():
    try:
        async with Computer(
            os_type="macos",
            provider_type=VMProviderType.LUME,
            name="`}<EditableValue placeholder="container-name" />{`",
            verbosity=logging.INFO,
        ) as computer:
    try:
        async with Computer(
            os_type="linux",
            provider_type=VMProviderType.DOCKER,
            image="trycua/cua-xfce:latest",  # or "trycua/cua-ubuntu:latest"
            verbosity=logging.INFO,
        ) as computer:

            agent = ComputerAgent(
                model="anthropic/claude-3-5-sonnet-20241022",
                model="cua/anthropic/claude-sonnet-4.5",
                tools=[computer],
                only_n_most_recent_images=3,
                verbosity=logging.INFO,
@@ -206,7 +206,7 @@ verbosity=logging.INFO,
            history = []

            for i, task in enumerate(tasks, 1):
                print(f"\\n[Task {i}/{len(tasks)}] {task}")
                print(f"\n[Task {i}/{len(tasks)}] {task}")

                # Add user message to history
                history.append({"role": "user", "content": task})
@@ -229,7 +229,7 @@ verbosity=logging.INFO,

                print(f"✅ Task {i}/{len(tasks)} completed")

            print("\\n🎉 All tasks completed successfully!")
            print("\n🎉 All tasks completed successfully!")

    except Exception as e:
        logger.error(f"Error in fill_application: {e}")
@@ -237,12 +237,12 @@ verbosity=logging.INFO,
        raise

def main():
    try:
        load_dotenv()
    try:
        load_dotenv()

        if "ANTHROPIC_API_KEY" not in os.environ:
            raise RuntimeError(
                "Please set the ANTHROPIC_API_KEY environment variable.\\n"
                "Please set the ANTHROPIC_API_KEY environment variable.\n"
                "You can add it to a .env file in the project root."
            )

@@ -254,20 +254,15 @@ load_dotenv()
        logger.error(f"Error running automation: {e}")
        traceback.print_exc()

if **name** == "**main**":
    main()`}

</EditableCodeBlock>
if __name__ == "__main__":
    main()
```

</Tab>
<Tab value="🪟 Windows Sandbox">
<Tab value="macOS Sandbox">

<EditableCodeBlock
  key="windows-tab"
  lang="python"
  defaultValues={{}}
>
{`import asyncio
```python
import asyncio
import logging
import os
import signal
@@ -278,22 +273,23 @@ from computer import Computer, VMProviderType
from dotenv import load_dotenv

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(**name**)
logger = logging.getLogger(__name__)

def handle_sigint(sig, frame):
    print("\\n\\nExecution interrupted by user. Exiting gracefully...")
    exit(0)
    print("\n\nExecution interrupted by user. Exiting gracefully...")
    exit(0)

async def fill_application():
    try:
        async with Computer(
            os_type="windows",
            provider_type=VMProviderType.WINDOWS_SANDBOX,
            verbosity=logging.INFO,
        ) as computer:
    try:
        async with Computer(
            os_type="macos",
            provider_type=VMProviderType.LUME,
            name="macos-sequoia-cua:latest",
            verbosity=logging.INFO,
        ) as computer:

            agent = ComputerAgent(
                model="anthropic/claude-3-5-sonnet-20241022",
                model="cua/anthropic/claude-sonnet-4.5",
                tools=[computer],
                only_n_most_recent_images=3,
                verbosity=logging.INFO,
@@ -310,7 +306,7 @@ verbosity=logging.INFO,
            history = []

            for i, task in enumerate(tasks, 1):
                print(f"\\n[Task {i}/{len(tasks)}] {task}")
                print(f"\n[Task {i}/{len(tasks)}] {task}")

                # Add user message to history
                history.append({"role": "user", "content": task})
@@ -333,7 +329,7 @@ verbosity=logging.INFO,

                print(f"✅ Task {i}/{len(tasks)} completed")

            print("\\n🎉 All tasks completed successfully!")
            print("\n🎉 All tasks completed successfully!")

    except Exception as e:
        logger.error(f"Error in fill_application: {e}")
@@ -341,12 +337,12 @@ verbosity=logging.INFO,
        raise

def main():
    try:
        load_dotenv()
    try:
        load_dotenv()

        if "ANTHROPIC_API_KEY" not in os.environ:
            raise RuntimeError(
                "Please set the ANTHROPIC_API_KEY environment variable.\\n"
                "Please set the ANTHROPIC_API_KEY environment variable.\n"
                "You can add it to a .env file in the project root."
            )

@@ -358,22 +354,15 @@ load_dotenv()
        logger.error(f"Error running automation: {e}")
        traceback.print_exc()

if **name** == "**main**":
    main()`}

</EditableCodeBlock>
if __name__ == "__main__":
    main()
```

</Tab>
<Tab value="🐳 Docker">
<Tab value="Windows Sandbox">

<EditableCodeBlock
  key="docker-tab"
  lang="python"
  defaultValues={{
    "container-name": "trycua/cua-ubuntu:latest"
  }}
>
{`import asyncio
```python
import asyncio
import logging
import os
import signal
@@ -384,23 +373,22 @@ from computer import Computer, VMProviderType
from dotenv import load_dotenv

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(**name**)
logger = logging.getLogger(__name__)

def handle_sigint(sig, frame):
    print("\\n\\nExecution interrupted by user. Exiting gracefully...")
    exit(0)
    print("\n\nExecution interrupted by user. Exiting gracefully...")
    exit(0)

async def fill_application():
    try:
        async with Computer(
            os_type="linux",
            provider_type=VMProviderType.DOCKER,
            name="`}<EditableValue placeholder="container-name" />{`",
            verbosity=logging.INFO,
        ) as computer:
    try:
        async with Computer(
            os_type="windows",
            provider_type=VMProviderType.WINDOWS_SANDBOX,
            verbosity=logging.INFO,
        ) as computer:

            agent = ComputerAgent(
                model="anthropic/claude-3-5-sonnet-20241022",
                model="cua/anthropic/claude-sonnet-4.5",
                tools=[computer],
                only_n_most_recent_images=3,
                verbosity=logging.INFO,
@@ -417,7 +405,7 @@ verbosity=logging.INFO,
            history = []

            for i, task in enumerate(tasks, 1):
                print(f"\\n[Task {i}/{len(tasks)}] {task}")
                print(f"\n[Task {i}/{len(tasks)}] {task}")

                # Add user message to history
                history.append({"role": "user", "content": task})
@@ -440,7 +428,7 @@ verbosity=logging.INFO,

                print(f"✅ Task {i}/{len(tasks)} completed")

            print("\\n🎉 All tasks completed successfully!")
            print("\n🎉 All tasks completed successfully!")

    except Exception as e:
        logger.error(f"Error in fill_application: {e}")
@@ -448,12 +436,12 @@ verbosity=logging.INFO,
        raise

def main():
    try:
        load_dotenv()
    try:
        load_dotenv()

        if "ANTHROPIC_API_KEY" not in os.environ:
            raise RuntimeError(
                "Please set the ANTHROPIC_API_KEY environment variable.\\n"
                "Please set the ANTHROPIC_API_KEY environment variable.\n"
                "You can add it to a .env file in the project root."
            )

@@ -465,16 +453,42 @@ load_dotenv()
        logger.error(f"Error running automation: {e}")
        traceback.print_exc()

if **name** == "**main**":
    main()`}

</EditableCodeBlock>
if __name__ == "__main__":
    main()
```

</Tab>
</Tabs>

</Step>

<Step>

### Run Your Script

Execute your form filling automation:

```bash
python form_filling.py
```

The agent will:

1. Download the PDF resume from Overleaf
2. Extract information from the PDF
3. Fill out the JotForm with the extracted information

Monitor the output to see the agent's progress through each task.

</Step>

</Steps>

---

## Next Steps

- Learn more about [Cua computers](/computer-sdk/computers) and [computer commands](/computer-sdk/commands)
- Read about [Agent loops](/agent-sdk/agent-loops), [tools](/agent-sdk/custom-tools), and [supported model providers](/agent-sdk/supported-model-providers/)
- Experiment with different [Models and Providers](/agent-sdk/supported-model-providers/)
- Join our [Discord community](https://discord.com/invite/mVnXXpdE85) for help
@@ -0,0 +1,640 @@
---
title: GUI Grounding with Gemini 3
description: Using Google's Gemini 3 with OmniParser for Advanced GUI Grounding Tasks
---

import { Step, Steps } from 'fumadocs-ui/components/steps';
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
import { Callout } from 'fumadocs-ui/components/callout';

## Overview

This example demonstrates how to use Google's Gemini 3 models with OmniParser for complex GUI grounding tasks. Gemini 3 Pro achieves exceptional performance on the [ScreenSpot-Pro benchmark](https://github.com/likaixin2000/ScreenSpot-Pro-GUI-Grounding) with **72.7% accuracy** (compared to Claude Sonnet 4.5's 36.2%), making it ideal for precise UI element location and complex navigation tasks.

<img
  src="/docs/img/grounding-with-gemini3.gif"
  alt="Demo of Gemini 3 with OmniParser performing complex GUI navigation tasks"
  width="800px"
/>

<Callout type="info" title="Why Gemini 3 for UI Navigation?">
  According to [Google's Gemini 3 announcement](https://blog.google/products/gemini/gemini-3/), Gemini 3 Pro achieves:
  - **72.7%** on ScreenSpot-Pro (vs. Gemini 2.5 Pro's 11.4%)
  - Industry-leading performance on complex UI navigation tasks
  - Advanced multimodal understanding for high-resolution screens
</Callout>

### What You'll Build

This guide shows how to:

- Set up Vertex AI with proper authentication
- Use OmniParser with Gemini 3 for GUI element detection
- Leverage Gemini 3-specific features like `thinking_level` and `media_resolution`
- Create agents that can perform complex multi-step UI interactions

---

<Steps>

<Step>

### Set Up Google Cloud and Vertex AI

Before using Gemini 3 models, you need to enable Vertex AI in Google Cloud Console.

#### 1. Create a Google Cloud Project

1. Go to [Google Cloud Console](https://console.cloud.google.com/)
2. Click **Select a project** → **New Project**
3. Enter a project name and click **Create**
4. Note your **Project ID** (you'll need this later)

#### 2. Enable Vertex AI API

1. Navigate to [Vertex AI API](https://console.cloud.google.com/apis/library/aiplatform.googleapis.com)
2. Select your project
3. Click **Enable**

#### 3. Enable Billing

1. Go to [Billing](https://console.cloud.google.com/billing)
2. Link a billing account to your project
3. Vertex AI offers a [free tier](https://cloud.google.com/vertex-ai/pricing) for testing

#### 4. Create a Service Account

1. Go to [IAM & Admin > Service Accounts](https://console.cloud.google.com/iam-admin/serviceaccounts)
2. Click **Create Service Account**
3. Enter a name (e.g., "cua-gemini-agent")
4. Click **Create and Continue**
5. Grant the **Vertex AI User** role
6. Click **Done**

#### 5. Create and Download a Service Account Key

1. Click on your newly created service account
2. Go to the **Keys** tab
3. Click **Add Key** → **Create new key**
4. Select **JSON** format
5. Click **Create** (the key file will download automatically)
6. **Important**: Store this key file securely! It contains credentials for accessing your Google Cloud resources

<Callout type="warn">
  Never commit your service account JSON key to version control! Add it to `.gitignore` immediately.
</Callout>

</Step>

<Step>

### Install Dependencies

Install the required packages for OmniParser and Gemini 3.

Create a `requirements.txt` file:

```text
cua-agent
cua-computer
cua-som  # OmniParser for GUI element detection
litellm>=1.0.0
python-dotenv>=1.0.0
google-cloud-aiplatform>=1.70.0
```

Install the dependencies:

```bash
pip install -r requirements.txt
```

</Step>

<Step>

### Configure Environment Variables

Create a `.env` file in your project root:

```text
# Google Cloud / Vertex AI credentials
GOOGLE_CLOUD_PROJECT=your-project-id
GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-service-account-key.json

# Cua credentials (for cloud sandboxes)
CUA_API_KEY=sk_cua-api01...
CUA_SANDBOX_NAME=your-sandbox-name
```

Replace the values:

- `your-project-id`: Your Google Cloud Project ID from Step 1
- `/path/to/your-service-account-key.json`: Path to the JSON key file you downloaded
- `sk_cua-api01...`: Your Cua API key from the [Cua dashboard](https://cua.dev)
- `your-sandbox-name`: Your sandbox name (if using cloud sandboxes)

</Step>

<Step>

### Create Your Complex UI Navigation Script

Create a Python file (e.g., `gemini_ui_navigation.py`):

<Tabs items={['Cloud Sandbox', 'Linux on Docker', 'macOS Sandbox']}>
  <Tab value="Cloud Sandbox">

```python
import asyncio
import logging
import os
import signal
import traceback

from agent import ComputerAgent
from computer import Computer, VMProviderType
from dotenv import load_dotenv

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def handle_sigint(sig, frame):
    print("\n\nExecution interrupted by user. Exiting gracefully...")
    exit(0)

async def complex_ui_navigation():
    """
    Demonstrate Gemini 3's exceptional UI grounding capabilities
    with complex, multi-step navigation tasks.
    """
    try:
        async with Computer(
            os_type="linux",
            provider_type=VMProviderType.CLOUD,
            name=os.environ["CUA_SANDBOX_NAME"],
            api_key=os.environ["CUA_API_KEY"],
            verbosity=logging.INFO,
        ) as computer:

            agent = ComputerAgent(
                # Use OmniParser with Gemini 3 Pro for optimal GUI grounding
                model="omniparser+vertex_ai/gemini-3-pro-preview",
                tools=[computer],
                only_n_most_recent_images=3,
                verbosity=logging.INFO,
                trajectory_dir="trajectories",
                use_prompt_caching=False,
                max_trajectory_budget=5.0,
                # Gemini 3-specific parameters
                thinking_level="high",  # Enables deeper reasoning (vs "low")
                media_resolution="high",  # High-resolution image processing (vs "low" or "medium")
            )

            # Complex GUI grounding tasks inspired by the ScreenSpot-Pro benchmark
            # These test precise element location in professional UIs
            tasks = [
                # Task 1: GitHub repository navigation
                {
                    "instruction": (
                        "Go to github.com/trycua/cua. "
                        "Find and click on the 'Issues' tab. "
                        "Then locate and click on the search box within the issues page "
                        "(not the global GitHub search). "
                        "Type 'omniparser' and press Enter."
                    ),
                    "description": "Tests precise UI element distinction in a complex interface",
                },

                # Task 2: Search for and install Visual Studio Code
                {
                    "instruction": (
                        "Open your system's app store (e.g., Microsoft Store). "
                        "Search for 'Visual Studio Code'. "
                        "In the search results, select 'Visual Studio Code'. "
                        "Click on 'Install' or 'Get' to begin the installation. "
                        "If prompted, accept any permissions or confirm the installation. "
                        "Wait for Visual Studio Code to finish installing."
                    ),
                    "description": "Tests the ability to search for an application and complete its installation through a step-by-step app store workflow.",
                },
            ]

            history = []

            for i, task_info in enumerate(tasks, 1):
                task = task_info["instruction"]
                print(f"\n{'='*60}")
                print(f"[Task {i}/{len(tasks)}] {task_info['description']}")
                print(f"{'='*60}")
                print(f"\nInstruction: {task}\n")

                # Add user message to history
                history.append({"role": "user", "content": task})

                # Run agent with conversation history
                async for result in agent.run(history, stream=False):
                    history += result.get("output", [])

                    # Print output for debugging
                    for item in result.get("output", []):
                        if item.get("type") == "message":
                            content = item.get("content", [])
                            for content_part in content:
                                if content_part.get("text"):
                                    logger.info(f"Agent: {content_part.get('text')}")
                        elif item.get("type") == "computer_call":
                            action = item.get("action", {})
                            action_type = action.get("type", "")
                            logger.debug(f"Computer Action: {action_type}")

                print(f"\n✅ Task {i}/{len(tasks)} completed")

            print("\n🎉 All complex UI navigation tasks completed successfully!")

    except Exception as e:
        logger.error(f"Error in complex_ui_navigation: {e}")
        traceback.print_exc()
        raise

def main():
    try:
        load_dotenv()

        # Validate required environment variables
        required_vars = [
            "GOOGLE_CLOUD_PROJECT",
            "GOOGLE_APPLICATION_CREDENTIALS",
            "CUA_API_KEY",
            "CUA_SANDBOX_NAME",
        ]

        missing_vars = [var for var in required_vars if not os.environ.get(var)]
        if missing_vars:
            raise RuntimeError(
                f"Missing required environment variables: {', '.join(missing_vars)}\n"
                f"Please check your .env file and ensure all keys are set.\n"
                f"See the setup guide for details on configuring Vertex AI credentials."
            )

        signal.signal(signal.SIGINT, handle_sigint)

        asyncio.run(complex_ui_navigation())

    except Exception as e:
        logger.error(f"Error running automation: {e}")
        traceback.print_exc()

if __name__ == "__main__":
    main()
```

</Tab>
<Tab value="Linux on Docker">

```python
import asyncio
import logging
import os
import signal
import traceback

from agent import ComputerAgent
from computer import Computer, VMProviderType
from dotenv import load_dotenv

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def handle_sigint(sig, frame):
    print("\n\nExecution interrupted by user. Exiting gracefully...")
    exit(0)

async def complex_ui_navigation():
    """
    Demonstrate Gemini 3's exceptional UI grounding capabilities
    with complex, multi-step navigation tasks.
    """
    try:
        async with Computer(
            os_type="linux",
            provider_type=VMProviderType.DOCKER,
            image="trycua/cua-xfce:latest",
            verbosity=logging.INFO,
        ) as computer:

            agent = ComputerAgent(
                # Use OmniParser with Gemini 3 Pro for optimal GUI grounding
                model="omniparser+vertex_ai/gemini-3-pro-preview",
                tools=[computer],
                only_n_most_recent_images=3,
                verbosity=logging.INFO,
                trajectory_dir="trajectories",
                use_prompt_caching=False,
                max_trajectory_budget=5.0,
                # Gemini 3-specific parameters
                thinking_level="high",  # Enables deeper reasoning (vs "low")
                media_resolution="high",  # High-resolution image processing (vs "low" or "medium")
            )

            # Complex GUI grounding tasks inspired by the ScreenSpot-Pro benchmark
            tasks = [
                {
                    "instruction": (
                        "Go to github.com/trycua/cua. "
                        "Find and click on the 'Issues' tab. "
                        "Then locate and click on the search box within the issues page "
                        "(not the global GitHub search). "
                        "Type 'omniparser' and press Enter."
                    ),
                    "description": "Tests precise UI element distinction in a complex interface",
                },
            ]

            history = []

            for i, task_info in enumerate(tasks, 1):
                task = task_info["instruction"]
                print(f"\n{'='*60}")
                print(f"[Task {i}/{len(tasks)}] {task_info['description']}")
                print(f"{'='*60}")
                print(f"\nInstruction: {task}\n")

                history.append({"role": "user", "content": task})

                async for result in agent.run(history, stream=False):
                    history += result.get("output", [])

                    for item in result.get("output", []):
                        if item.get("type") == "message":
                            content = item.get("content", [])
                            for content_part in content:
                                if content_part.get("text"):
                                    logger.info(f"Agent: {content_part.get('text')}")
                        elif item.get("type") == "computer_call":
                            action = item.get("action", {})
                            action_type = action.get("type", "")
                            logger.debug(f"Computer Action: {action_type}")

                print(f"\n✅ Task {i}/{len(tasks)} completed")

            print("\n🎉 All complex UI navigation tasks completed successfully!")

    except Exception as e:
        logger.error(f"Error in complex_ui_navigation: {e}")
        traceback.print_exc()
        raise

def main():
    try:
        load_dotenv()

        required_vars = [
            "GOOGLE_CLOUD_PROJECT",
            "GOOGLE_APPLICATION_CREDENTIALS",
        ]

        missing_vars = [var for var in required_vars if not os.environ.get(var)]
        if missing_vars:
            raise RuntimeError(
                f"Missing required environment variables: {', '.join(missing_vars)}\n"
                f"Please check your .env file."
            )

        signal.signal(signal.SIGINT, handle_sigint)

        asyncio.run(complex_ui_navigation())

    except Exception as e:
        logger.error(f"Error running automation: {e}")
        traceback.print_exc()

if __name__ == "__main__":
    main()
```

</Tab>
<Tab value="macOS Sandbox">

```python
import asyncio
import logging
import os
import signal
import traceback

from agent import ComputerAgent
from computer import Computer, VMProviderType
from dotenv import load_dotenv

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def handle_sigint(sig, frame):
    print("\n\nExecution interrupted by user. Exiting gracefully...")
    exit(0)

async def complex_ui_navigation():
    """
    Demonstrate Gemini 3's exceptional UI grounding capabilities
    with complex, multi-step navigation tasks.
    """
    try:
        async with Computer(
            os_type="macos",
            provider_type=VMProviderType.LUME,
            name="macos-sequoia-cua:latest",
            verbosity=logging.INFO,
        ) as computer:

            agent = ComputerAgent(
                # Use OmniParser with Gemini 3 Pro for optimal GUI grounding
                model="omniparser+vertex_ai/gemini-3-pro-preview",
                tools=[computer],
                only_n_most_recent_images=3,
                verbosity=logging.INFO,
                trajectory_dir="trajectories",
                use_prompt_caching=False,
                max_trajectory_budget=5.0,
                # Gemini 3-specific parameters
                thinking_level="high",  # Enables deeper reasoning (vs "low")
                media_resolution="high",  # High-resolution image processing (vs "low" or "medium")
            )

            # Complex GUI grounding tasks inspired by the ScreenSpot-Pro benchmark
            tasks = [
                {
                    "instruction": (
                        "Go to github.com/trycua/cua. "
                        "Find and click on the 'Issues' tab. "
                        "Then locate and click on the search box within the issues page "
                        "(not the global GitHub search). "
                        "Type 'omniparser' and press Enter."
                    ),
                    "description": "Tests precise UI element distinction in a complex interface",
                },
            ]

            history = []

            for i, task_info in enumerate(tasks, 1):
                task = task_info["instruction"]
                print(f"\n{'='*60}")
                print(f"[Task {i}/{len(tasks)}] {task_info['description']}")
                print(f"{'='*60}")
                print(f"\nInstruction: {task}\n")

                history.append({"role": "user", "content": task})

                async for result in agent.run(history, stream=False):
                    history += result.get("output", [])

                    for item in result.get("output", []):
                        if item.get("type") == "message":
                            content = item.get("content", [])
                            for content_part in content:
                                if content_part.get("text"):
                                    logger.info(f"Agent: {content_part.get('text')}")
                        elif item.get("type") == "computer_call":
                            action = item.get("action", {})
                            action_type = action.get("type", "")
                            logger.debug(f"Computer Action: {action_type}")

                print(f"\n✅ Task {i}/{len(tasks)} completed")

            print("\n🎉 All complex UI navigation tasks completed successfully!")

    except Exception as e:
        logger.error(f"Error in complex_ui_navigation: {e}")
        traceback.print_exc()
        raise

def main():
    try:
        load_dotenv()

        required_vars = [
            "GOOGLE_CLOUD_PROJECT",
            "GOOGLE_APPLICATION_CREDENTIALS",
        ]

        missing_vars = [var for var in required_vars if not os.environ.get(var)]
        if missing_vars:
            raise RuntimeError(
                f"Missing required environment variables: {', '.join(missing_vars)}\n"
                f"Please check your .env file."
            )

        signal.signal(signal.SIGINT, handle_sigint)

        asyncio.run(complex_ui_navigation())

    except Exception as e:
        logger.error(f"Error running automation: {e}")
        traceback.print_exc()

if __name__ == "__main__":
    main()
```

</Tab>
</Tabs>

</Step>

<Step>

### Run Your Script

Execute your complex UI navigation automation:

```bash
python gemini_ui_navigation.py
```

The agent will:

1. Navigate to GitHub and locate specific UI elements
2. Distinguish between similar elements (e.g., global search vs. issues search)
3. Perform multi-step interactions with visual feedback
4. Use Gemini 3's advanced reasoning for precise element grounding

Monitor the output to see the agent's progress through each task.

</Step>

</Steps>

---

## Understanding Gemini 3-Specific Parameters

### `thinking_level`

Controls the amount of internal reasoning the model performs:

- `"high"`: Deeper reasoning, better for complex UI navigation (recommended for ScreenSpot-like tasks)
- `"low"`: Faster responses, suitable for simpler tasks

### `media_resolution`

Controls vision processing for multimodal inputs:

- `"high"`: Best for complex UIs with many small elements (recommended)
- `"medium"`: Balanced quality and speed
- `"low"`: Faster processing for simple interfaces

<Callout type="info">
  For tasks requiring precise GUI element location (like ScreenSpot-Pro), use
  `thinking_level="high"` and `media_resolution="high"` for optimal performance.
</Callout>
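Putting the two together, a condensed configuration sketch (same model string and parameters as the full examples above; `computer` is a connected `Computer` instance):

```python
from agent import ComputerAgent

agent = ComputerAgent(
    model="omniparser+vertex_ai/gemini-3-pro-preview",
    tools=[computer],         # a connected Computer instance
    thinking_level="high",    # deeper reasoning for complex UIs
    media_resolution="high",  # high-resolution screenshot processing
)
```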
|
||||
|
||||
---
|
||||
|
||||
## Benchmark Performance
|
||||
|
||||
Gemini 3 Pro's performance on ScreenSpot-Pro demonstrates its exceptional UI grounding capabilities:
|
||||
|
||||
| Model | ScreenSpot-Pro Score |
|
||||
| ----------------- | -------------------- |
|
||||
| **Gemini 3 Pro** | **72.7%** |
|
||||
| Claude Sonnet 4.5 | 36.2% |
|
||||
| Gemini 2.5 Pro | 11.4% |
|
||||
| GPT-5.1 | 3.5% |
|
||||
|
||||
This makes Gemini 3 the ideal choice for complex UI navigation, element detection, and professional GUI automation tasks.
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Authentication Issues
|
||||
|
||||
If you encounter authentication errors:
|
||||
|
||||
1. Verify your service account JSON key path is correct
|
||||
2. Ensure the service account has the **Vertex AI User** role
|
||||
3. Check that the Vertex AI API is enabled in your project
|
||||
4. Confirm your `GOOGLE_CLOUD_PROJECT` matches your actual project ID
|
||||
|
||||
### "Vertex AI API not enabled" Error
|
||||
|
||||
Run this command to enable the API:
|
||||
|
||||
```bash
|
||||
gcloud services enable aiplatform.googleapis.com --project=YOUR_PROJECT_ID
|
||||
```
|
||||
|
||||
### Billing Issues
|
||||
|
||||
Ensure billing is enabled for your Google Cloud project. Visit the [Billing section](https://console.cloud.google.com/billing) to verify.
|
||||
|
||||
---
|
||||
|
||||
## Next Steps
|
||||
|
||||
- Learn more about [OmniParser agent loops](/agent-sdk/agent-loops)
|
||||
- Explore [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing)
|
||||
- Read about [ScreenSpot-Pro benchmark](https://github.com/likaixin2000/ScreenSpot-Pro-GUI-Grounding)
|
||||
- Check out [Google's Gemini 3 announcement](https://blog.google/products/gemini/gemini-3/)
|
||||
- Join our [Discord community](https://discord.com/invite/mVnXXpdE85) for help
|
||||
@@ -1,5 +1,10 @@
|
||||
{
|
||||
"title": "Example Use Cases",
|
||||
"title": "Cookbook",
|
||||
"description": "Real-world examples of building with Cua",
|
||||
"pages": ["form-filling"]
|
||||
"pages": [
|
||||
"windows-app-behind-vpn",
|
||||
"form-filling",
|
||||
"post-event-contact-export",
|
||||
"gemini-complex-ui-navigation"
|
||||
]
|
||||
}
|
||||
|
||||
474
docs/content/docs/example-usecases/post-event-contact-export.mdx
Normal file
474
docs/content/docs/example-usecases/post-event-contact-export.mdx
Normal file
@@ -0,0 +1,474 @@
|
||||
---
|
||||
title: Post-Event Contact Export
|
||||
description: Run overnight contact extraction from LinkedIn, X, or other social platforms after networking events
|
||||
---
|
||||
|
||||
import { Step, Steps } from 'fumadocs-ui/components/steps';
|
||||
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
|
||||
|
||||
## Overview
|
||||
|
||||
After networking events, you need to export new connections from LinkedIn, X, or other platforms into your CRM. This automation handles it for you.
|
||||
|
||||
**The workflow**: Kick off the script after an event and let it run overnight. Wake up to a clean CSV ready for your CRM or email tool.
|
||||
|
||||
This example focuses on LinkedIn but works across platforms. It uses [Cua Computer](/computer-sdk/computers) to interact with web interfaces and [Agent Loops](/agent-sdk/agent-loops) to iterate through connections with conversation history.
|
||||
|
||||
### Why Cua is Perfect for This
|
||||
|
||||
**Cua's VMs save your session data**, bypassing bot detection entirely:
|
||||
|
||||
- **Log in once manually** through the VM browser
|
||||
- **Session persists** - you appear as a regular user, not a bot
|
||||
- **No captchas** - the platform treats automation like normal browsing
|
||||
- **No login code** - script doesn't handle authentication
|
||||
- **Run overnight** - kick off and forget
|
||||
|
||||
Traditional web scraping triggers anti-bot measures immediately. Cua's approach works across all platforms.
|
||||
|
||||
### What You Get
|
||||
|
||||
The script generates two files with your extracted connections:
|
||||
|
||||
**CSV Export** (`linkedin_connections_20250116_143022.csv`):
|
||||
|
||||
```csv
|
||||
first,last,role,company,met_at,linkedin
|
||||
John,Smith,Software Engineer,Acme Corp,Google Devfest Toronto,https://www.linkedin.com/in/johnsmith
|
||||
Sarah,Johnson,Product Manager,Tech Inc,Google Devfest Toronto,https://www.linkedin.com/in/sarahjohnson
|
||||
```
|
||||
|
||||
**Messaging Links** (`linkedin_messaging_links_20250116_143022.txt`):
|
||||
|
||||
```
|
||||
LinkedIn Messaging Compose Links
|
||||
================================================================================
|
||||
|
||||
1. https://www.linkedin.com/messaging/compose/?recipient=johnsmith
|
||||
2. https://www.linkedin.com/messaging/compose/?recipient=sarahjohnson
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
<Steps>
|
||||
|
||||
<Step>
|
||||
|
||||
### Set Up Your Environment
|
||||
|
||||
First, install the required dependencies:
|
||||
|
||||
Create a `requirements.txt` file:
|
||||
|
||||
```text
|
||||
cua-agent
|
||||
cua-computer
|
||||
python-dotenv>=1.0.0
|
||||
```
|
||||
|
||||
Install the dependencies:
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
Create a `.env` file with your API keys:
|
||||
|
||||
```text
|
||||
ANTHROPIC_API_KEY=your-anthropic-api-key
|
||||
CUA_API_KEY=sk_cua-api01...
|
||||
CUA_CONTAINER_NAME=m-linux-...
|
||||
```
|
||||
|
||||
</Step>
|
||||
|
||||
<Step>
|
||||
|
||||
### Log Into LinkedIn Manually
|
||||
|
||||
**Important**: Before running the script, manually log into LinkedIn through your VM:
|
||||
|
||||
1. Access your VM through the Cua dashboard
|
||||
2. Open a browser and navigate to LinkedIn
|
||||
3. Log in with your credentials (handle any captchas manually)
|
||||
4. Close the browser but leave the VM running
|
||||
5. Your session is now saved and ready for automation!
|
||||
|
||||
This one-time manual login bypasses all bot detection.
|
||||
|
||||
</Step>
|
||||
|
||||
<Step>
|
||||
|
||||
### Configure and Create Your Script
|
||||
|
||||
Create a Python file (e.g., `contact_export.py`). You can customize:
|
||||
|
||||
```python
|
||||
# Where you met these connections (automatically added to CSV)
|
||||
MET_AT_REASON = "Google Devfest Toronto"
|
||||
|
||||
# Number of contacts to extract (in the main loop)
|
||||
for contact_num in range(1, 21): # Change 21 to extract more/fewer contacts
|
||||
```
|
||||
|
||||
Select your environment:
|
||||
|
||||
<Tabs items={['Cloud Sandbox', 'Linux on Docker', 'macOS Sandbox', 'Windows Sandbox']}>
<Tab value="Cloud Sandbox">

```python
import asyncio
import csv
import logging
import os
import signal
import traceback
from datetime import datetime

from agent import ComputerAgent
from computer import Computer, VMProviderType
from dotenv import load_dotenv

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration: Define where you met these connections
MET_AT_REASON = "Google Devfest Toronto"

def handle_sigint(sig, frame):
    print("\n\nExecution interrupted by user. Exiting gracefully...")
    exit(0)

def extract_public_id_from_linkedin_url(linkedin_url):
    """Extract public ID from LinkedIn profile URL."""
    if not linkedin_url:
        return None

    url = linkedin_url.split('?')[0].rstrip('/')

    if '/in/' in url:
        public_id = url.split('/in/')[-1]
        return public_id

    return None

def extract_contact_from_response(result_output):
    """
    Extract contact information from agent's response.
    Expects format:
    FIRST: value
    LAST: value
    ROLE: value
    COMPANY: value
    LINKEDIN: value
    """
    contact = {
        'first': '',
        'last': '',
        'role': '',
        'company': '',
        'met_at': MET_AT_REASON,
        'linkedin': ''
    }

    for item in result_output:
        if item.get("type") == "message":
            content = item.get("content", [])
            for content_part in content:
                text = content_part.get("text", "")
                if text:
                    for line in text.split('\n'):
                        line = line.strip()
                        line_upper = line.upper()

                        if line_upper.startswith("FIRST:"):
                            value = line[6:].strip()
                            if value and value.upper() != "N/A":
                                contact['first'] = value
                        elif line_upper.startswith("LAST:"):
                            value = line[5:].strip()
                            if value and value.upper() != "N/A":
                                contact['last'] = value
                        elif line_upper.startswith("ROLE:"):
                            value = line[5:].strip()
                            if value and value.upper() != "N/A":
                                contact['role'] = value
                        elif line_upper.startswith("COMPANY:"):
                            value = line[8:].strip()
                            if value and value.upper() != "N/A":
                                contact['company'] = value
                        elif line_upper.startswith("LINKEDIN:"):
                            value = line[9:].strip()
                            if value and value.upper() != "N/A":
                                contact['linkedin'] = value

    return contact

async def scrape_linkedin_connections():
    """Scrape LinkedIn connections and export to CSV."""

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_filename = f"linkedin_connections_{timestamp}.csv"
    csv_path = os.path.join(os.getcwd(), csv_filename)

    # Initialize CSV file
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['first', 'last', 'role', 'company', 'met_at', 'linkedin'])
        writer.writeheader()

    print(f"\n🚀 Starting LinkedIn connections scraper")
    print(f"📁 Output file: {csv_path}")
    print(f"📍 Met at: {MET_AT_REASON}")
    print("=" * 80)

    try:
        async with Computer(
            os_type="linux",
            provider_type=VMProviderType.CLOUD,
            name=os.environ["CUA_CONTAINER_NAME"],  # Your sandbox name
            api_key=os.environ["CUA_API_KEY"],
            verbosity=logging.INFO,
        ) as computer:

            agent = ComputerAgent(
                model="cua/anthropic/claude-sonnet-4.5",
                tools=[computer],
                only_n_most_recent_images=3,
                verbosity=logging.INFO,
                trajectory_dir="trajectories",
                use_prompt_caching=True,
                max_trajectory_budget=10.0,
            )

            history = []

            # Task 1: Navigate to LinkedIn connections page
            navigation_task = (
                "STEP 1 - NAVIGATE TO LINKEDIN CONNECTIONS PAGE:\n"
                "1. Open a web browser (Chrome or Firefox)\n"
                "2. Navigate to https://www.linkedin.com/mynetwork/invite-connect/connections/\n"
                "3. Wait for the page to fully load\n"
                "4. Confirm you can see the list of connections\n"
                "5. Ready to start extracting contacts"
            )

            print(f"\n[Task 1/21] Navigating to LinkedIn...")
            history.append({"role": "user", "content": navigation_task})

            async for result in agent.run(history, stream=False):
                history += result.get("output", [])

            print(f"✅ Navigation completed\n")

            # Extract 20 contacts
            contacts_extracted = 0
            linkedin_urls = []
            previous_contact_name = None

            for contact_num in range(1, 21):
                # Build extraction task
                if contact_num == 1:
                    extraction_task = (
                        f"STEP {contact_num + 1} - EXTRACT CONTACT {contact_num} OF 20:\n"
                        f"1. Click on the first connection's profile\n"
                        f"2. Extract: FIRST, LAST, ROLE, COMPANY, LINKEDIN URL\n"
                        f"3. Return in exact format:\n"
                        f"FIRST: [value]\n"
                        f"LAST: [value]\n"
                        f"ROLE: [value]\n"
                        f"COMPANY: [value]\n"
                        f"LINKEDIN: [value]\n"
                        f"4. Navigate back to connections list"
                    )
                else:
                    extraction_task = (
                        f"STEP {contact_num + 1} - EXTRACT CONTACT {contact_num} OF 20:\n"
                        f"1. Find '{previous_contact_name}' in the list\n"
                        f"2. Click on the contact BELOW them\n"
                        f"3. Extract: FIRST, LAST, ROLE, COMPANY, LINKEDIN URL\n"
                        f"4. Return in exact format:\n"
                        f"FIRST: [value]\n"
                        f"LAST: [value]\n"
                        f"ROLE: [value]\n"
                        f"COMPANY: [value]\n"
                        f"LINKEDIN: [value]\n"
                        f"5. Navigate back"
                    )

                print(f"[Task {contact_num + 1}/21] Extracting contact {contact_num}/20...")
                history.append({"role": "user", "content": extraction_task})

                all_output = []
                async for result in agent.run(history, stream=False):
                    output = result.get("output", [])
                    history += output
                    all_output.extend(output)

                contact_data = extract_contact_from_response(all_output)

                has_name = bool(contact_data['first'] and contact_data['last'])
                has_linkedin = bool(contact_data['linkedin'] and 'linkedin.com' in contact_data['linkedin'])

                if has_name or has_linkedin:
                    with open(csv_path, 'a', newline='', encoding='utf-8') as csvfile:
                        writer = csv.DictWriter(csvfile, fieldnames=['first', 'last', 'role', 'company', 'met_at', 'linkedin'])
                        writer.writerow(contact_data)
                    contacts_extracted += 1

                    if contact_data['linkedin']:
                        linkedin_urls.append(contact_data['linkedin'])

                    if has_name:
                        previous_contact_name = f"{contact_data['first']} {contact_data['last']}".strip()

                    name_str = f"{contact_data['first']} {contact_data['last']}" if has_name else "[No name]"
                    print(f"✅ Contact {contact_num}/20 saved: {name_str}")
                else:
                    print(f"⚠️ Could not extract valid data for contact {contact_num}")

                if contact_num % 5 == 0:
                    print(f"\n📈 Progress: {contacts_extracted}/{contact_num} contacts extracted\n")

            # Create messaging links file
            messaging_filename = f"linkedin_messaging_links_{timestamp}.txt"
            messaging_path = os.path.join(os.getcwd(), messaging_filename)

            with open(messaging_path, 'w', encoding='utf-8') as txtfile:
                txtfile.write("LinkedIn Messaging Compose Links\n")
                txtfile.write("=" * 80 + "\n\n")

                for i, linkedin_url in enumerate(linkedin_urls, 1):
                    public_id = extract_public_id_from_linkedin_url(linkedin_url)
                    if public_id:
                        messaging_url = f"https://www.linkedin.com/messaging/compose/?recipient={public_id}"
                        txtfile.write(f"{i}. {messaging_url}\n")

            print("\n" + "=" * 80)
            print("🎉 All tasks completed!")
            print(f"📁 CSV file saved to: {csv_path}")
            print(f"📊 Total contacts extracted: {contacts_extracted}/20")
            print(f"💬 Messaging links saved to: {messaging_path}")
            print("=" * 80)

    except Exception as e:
        print(f"\n❌ Error: {e}")
        traceback.print_exc()
        raise

def main():
    try:
        load_dotenv()

        if "ANTHROPIC_API_KEY" not in os.environ:
            raise RuntimeError("Please set ANTHROPIC_API_KEY in .env")

        if "CUA_API_KEY" not in os.environ:
            raise RuntimeError("Please set CUA_API_KEY in .env")

        if "CUA_CONTAINER_NAME" not in os.environ:
            raise RuntimeError("Please set CUA_CONTAINER_NAME in .env")

        signal.signal(signal.SIGINT, handle_sigint)

        asyncio.run(scrape_linkedin_connections())

    except Exception as e:
        print(f"\n❌ Error: {e}")
        traceback.print_exc()

if __name__ == "__main__":
    main()
```

</Tab>
<Tab value="Linux on Docker">

```python
# Same code as Cloud Sandbox, but change Computer initialization to:
async with Computer(
    os_type="linux",
    provider_type=VMProviderType.DOCKER,
    image="trycua/cua-xfce:latest",
    verbosity=logging.INFO,
) as computer:
```

And remove the `CUA_API_KEY` and `CUA_CONTAINER_NAME` requirements from `.env` and the validation checks.

</Tab>
<Tab value="macOS Sandbox">

```python
# Same code as Cloud Sandbox, but change Computer initialization to:
async with Computer(
    os_type="macos",
    provider_type=VMProviderType.LUME,
    name="macos-sequoia-cua:latest",
    verbosity=logging.INFO,
) as computer:
```

And remove the `CUA_API_KEY` and `CUA_CONTAINER_NAME` requirements from `.env` and the validation checks.

</Tab>
<Tab value="Windows Sandbox">

```python
# Same code as Cloud Sandbox, but change Computer initialization to:
async with Computer(
    os_type="windows",
    provider_type=VMProviderType.WINDOWS_SANDBOX,
    verbosity=logging.INFO,
) as computer:
```

And remove the `CUA_API_KEY` and `CUA_CONTAINER_NAME` requirements from `.env` and the validation checks.

</Tab>
</Tabs>

</Step>

<Step>

### Run Your Script

Execute your contact extraction automation:

```bash
python contact_export.py
```

The agent will:

1. Navigate to your LinkedIn connections page
2. Extract data from 20 contacts (first name, last name, role, company, LinkedIn URL)
3. Save contacts to a timestamped CSV file
4. Generate messaging compose links for easy follow-up

Monitor the output to see the agent's progress. The script will show a progress update every 5 contacts.

</Step>

</Steps>

---

## How It Works

This script demonstrates a practical workflow for extracting LinkedIn connection data:

1. **Session Persistence** - Manually log into LinkedIn through the VM once, and the VM saves your session
2. **Navigation** - The script navigates to your connections page using your saved authenticated session
3. **Data Extraction** - For each contact, the agent clicks their profile, extracts data, and navigates back
4. **Python Processing** - Python parses responses, validates data, and writes to CSV incrementally
5. **Output Files** - Generates a CSV with contact data and a text file with messaging URLs, derived as shown in the sketch below

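The compose links in step 5 are derived locally from each profile URL, with no extra agent calls. Here is a minimal standalone sketch of that derivation, mirroring `extract_public_id_from_linkedin_url` from the script above:

```python
def compose_link(profile_url: str) -> str | None:
    """Turn a LinkedIn profile URL into a messaging compose URL."""
    url = profile_url.split('?')[0].rstrip('/')  # drop query string and trailing slash
    if '/in/' not in url:
        return None
    public_id = url.split('/in/')[-1]
    return f"https://www.linkedin.com/messaging/compose/?recipient={public_id}"

print(compose_link("https://www.linkedin.com/in/jane-doe-123/?utm_source=share"))
# -> https://www.linkedin.com/messaging/compose/?recipient=jane-doe-123
```
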
## Next Steps

- Learn more about [Cua computers](/computer-sdk/computers) and [computer commands](/computer-sdk/commands)
- Read about [Agent loops](/agent-sdk/agent-loops), [tools](/agent-sdk/custom-tools), and [supported model providers](/agent-sdk/supported-model-providers/)
- Experiment with different [Models and Providers](/agent-sdk/supported-model-providers/)
- Adapt this script for other platforms (Twitter/X, email extraction, etc.)
- Join our [Discord community](https://discord.com/invite/mVnXXpdE85) for help
629
docs/content/docs/example-usecases/windows-app-behind-vpn.mdx
Normal file
629
docs/content/docs/example-usecases/windows-app-behind-vpn.mdx
Normal file
@@ -0,0 +1,629 @@
---
title: Windows App behind VPN
description: Automate legacy Windows desktop applications behind a VPN with Cua
---

import { Step, Steps } from 'fumadocs-ui/components/steps';
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';

## Overview

This guide demonstrates how to automate Windows desktop applications (like eGecko HR/payroll systems) that run behind a corporate VPN. This is a common enterprise scenario where legacy desktop applications require manual data entry, report generation, or workflow execution.

**Use cases:**

- HR/payroll processing (employee onboarding, payroll runs, benefits administration)
- Desktop ERP systems behind corporate networks
- Legacy financial applications requiring VPN access
- Compliance reporting from on-premise systems

**Architecture:**

- Client-side Cua agent (Python SDK or Playground UI)
- Windows VM/Sandbox with a VPN client configured
- RDP/remote desktop connection to the target environment
- Desktop application automation via computer vision and UI control

<Callout type="info">
  **Production Deployment**: For production use, consider workflow mining and custom finetuning to
  create vertical-specific actions (e.g., "Run payroll", "Onboard employee") instead of generic UI
  automation. This provides better audit trails and higher success rates.
</Callout>

---

## Video Demo

<div className="rounded-lg border bg-card text-card-foreground shadow-sm p-4 mb-6">
  <video
    src="https://github.com/user-attachments/assets/8ab07646-6018-4128-87ce-53180cfea696"
    controls
    className="w-full rounded"
  >
    Your browser does not support the video tag.
  </video>
  <div className="text-sm text-muted-foreground mt-2">
    Demo showing Cua automating an eGecko-like desktop application on Windows behind AWS VPN
  </div>
</div>

---

<Steps>

<Step>

### Set Up Your Environment

Install the required dependencies.

First, create a `requirements.txt` file:

```text
cua-agent
cua-computer
python-dotenv>=1.0.0
```

Install the dependencies:

```bash
pip install -r requirements.txt
```

Create a `.env` file with your API keys:

```text
ANTHROPIC_API_KEY=your-anthropic-api-key
CUA_API_KEY=sk_cua-api01...
CUA_SANDBOX_NAME=your-windows-sandbox
```

</Step>

<Step>

### Configure Windows Sandbox with VPN

<Tabs items={['Cloud Sandbox (Recommended)', 'Windows Sandbox', 'Self-Hosted VM']}>
<Tab value="Cloud Sandbox (Recommended)">

For enterprise deployments, use a Cua Cloud Sandbox with a pre-configured VPN:

1. Go to [cua.ai/signin](https://cua.ai/signin)
2. Navigate to **Dashboard > Containers > Create Instance**
3. Create a **Windows** sandbox (Medium or Large for desktop apps)
4. Configure VPN settings:
   - Upload your AWS VPN Client configuration (`.ovpn` file)
   - Or configure VPN credentials directly in the dashboard
5. Note your sandbox name and API key

Your Windows sandbox will launch with the VPN automatically connected.

</Tab>
<Tab value="Windows Sandbox">

For local development on Windows 10 Pro/Enterprise or Windows 11:

1. Enable [Windows Sandbox](https://learn.microsoft.com/en-us/windows/security/application-security/application-isolation/windows-sandbox/windows-sandbox-install)
2. Install the `pywinsandbox` dependency:
   ```bash
   pip install -U git+https://github.com/karkason/pywinsandbox.git
   ```
3. Create a VPN setup script that runs on sandbox startup
4. Configure your desktop application installation within the sandbox

<Callout type="warn">
  **Manual VPN Setup**: Windows Sandbox requires manual VPN configuration each time it starts. For
  production use, consider Cloud Sandbox or self-hosted VMs with persistent VPN connections.
</Callout>

</Tab>
<Tab value="Self-Hosted VM">

For self-managed infrastructure:

1. Deploy a Windows VM on your preferred cloud (AWS, Azure, GCP)
2. Install and configure a VPN client (AWS VPN Client, OpenVPN, etc.)
3. Install the target desktop application and any dependencies
4. Install `cua-computer-server`:
   ```bash
   pip install cua-computer-server
   python -m computer_server
   ```
5. Configure firewall rules to allow Cua agent connections

</Tab>
</Tabs>

</Step>

<Step>

### Create Your Automation Script

Create a Python file (e.g., `hr_automation.py`):

<Tabs items={['Cloud Sandbox', 'Windows Sandbox', 'Self-Hosted']}>
<Tab value="Cloud Sandbox">

```python
import asyncio
import logging
import os

from agent import ComputerAgent
from computer import Computer, VMProviderType
from dotenv import load_dotenv

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

load_dotenv()

async def automate_hr_workflow():
    """
    Automate HR/payroll desktop application workflow.

    This example demonstrates:
    - Launching a Windows desktop application
    - Navigating complex desktop UI
    - Data entry and form filling
    - Report generation and export
    """
    try:
        # Connect to Windows Cloud Sandbox with VPN
        async with Computer(
            os_type="windows",
            provider_type=VMProviderType.CLOUD,
            name=os.environ["CUA_SANDBOX_NAME"],
            api_key=os.environ["CUA_API_KEY"],
            verbosity=logging.INFO,
        ) as computer:

            # Configure agent with specialized instructions
            agent = ComputerAgent(
                model="cua/anthropic/claude-sonnet-4.5",
                tools=[computer],
                only_n_most_recent_images=3,
                verbosity=logging.INFO,
                trajectory_dir="trajectories",
                use_prompt_caching=True,
                max_trajectory_budget=10.0,
                instructions="""
                You are automating a Windows desktop HR/payroll application.

                IMPORTANT GUIDELINES:
                - Always wait for windows and dialogs to fully load before interacting
                - Look for loading indicators and wait for them to disappear
                - Verify each action by checking on-screen confirmation messages
                - If a button or field is not visible, try scrolling or navigating tabs
                - Desktop apps often have nested menus - explore systematically
                - Save work frequently using File > Save or Ctrl+S
                - Before closing, always verify changes were saved

                COMMON UI PATTERNS:
                - Menu bar navigation (File, Edit, View, etc.)
                - Ribbon interfaces with tabs
                - Modal dialogs that block interaction
                - Data grids/tables for viewing records
                - Form fields with validation
                - Status bars showing operation progress
                """.strip()
            )

            # Define workflow tasks
            tasks = [
                "Launch the HR application from the desktop or start menu",
                "Log in with the credentials shown in credentials.txt on the desktop",
                "Navigate to Employee Management section",
                "Create a new employee record with information from new_hire.xlsx on desktop",
                "Verify the employee was created successfully by searching for their name",
                "Generate an onboarding report for the new employee",
                "Export the report as PDF to the desktop",
                "Log out of the application"
            ]

            history = []

            for task in tasks:
                logger.info(f"\n{'='*60}")
                logger.info(f"Task: {task}")
                logger.info(f"{'='*60}\n")

                history.append({"role": "user", "content": task})

                async for result in agent.run(history):
                    for item in result.get("output", []):
                        if item.get("type") == "message":
                            content = item.get("content", [])
                            for block in content:
                                if block.get("type") == "text":
                                    response = block.get("text", "")
                                    logger.info(f"Agent: {response}")
                                    history.append({"role": "assistant", "content": response})

                logger.info("\nTask completed. Moving to next task...\n")

            logger.info("\n" + "=" * 60)
            logger.info("All tasks completed successfully!")
            logger.info("=" * 60)

    except Exception as e:
        logger.error(f"Error during automation: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    asyncio.run(automate_hr_workflow())
```

</Tab>
<Tab value="Windows Sandbox">

```python
import asyncio
import logging

from agent import ComputerAgent
from computer import Computer, VMProviderType
from dotenv import load_dotenv

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

load_dotenv()

async def automate_hr_workflow():
    try:
        # Connect to Windows Sandbox
        async with Computer(
            os_type="windows",
            provider_type=VMProviderType.WINDOWS_SANDBOX,
            verbosity=logging.INFO,
        ) as computer:

            agent = ComputerAgent(
                model="cua/anthropic/claude-sonnet-4.5",
                tools=[computer],
                only_n_most_recent_images=3,
                verbosity=logging.INFO,
                trajectory_dir="trajectories",
                use_prompt_caching=True,
                max_trajectory_budget=10.0,
                instructions="""
                You are automating a Windows desktop HR/payroll application.

                IMPORTANT GUIDELINES:
                - Always wait for windows and dialogs to fully load before interacting
                - Verify each action by checking on-screen confirmation messages
                - Desktop apps often have nested menus - explore systematically
                - Save work frequently using File > Save or Ctrl+S
                """.strip()
            )

            tasks = [
                "Launch the HR application from the desktop",
                "Log in with credentials from credentials.txt on desktop",
                "Navigate to Employee Management and create new employee from new_hire.xlsx",
                "Generate and export onboarding report as PDF",
                "Log out of the application"
            ]

            history = []

            for task in tasks:
                logger.info(f"\nTask: {task}")
                history.append({"role": "user", "content": task})

                async for result in agent.run(history):
                    for item in result.get("output", []):
                        if item.get("type") == "message":
                            content = item.get("content", [])
                            for block in content:
                                if block.get("type") == "text":
                                    response = block.get("text", "")
                                    logger.info(f"Agent: {response}")
                                    history.append({"role": "assistant", "content": response})

            logger.info("\nAll tasks completed!")

    except Exception as e:
        logger.error(f"Error: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    asyncio.run(automate_hr_workflow())
```

</Tab>
<Tab value="Self-Hosted">

```python
import asyncio
import logging

from agent import ComputerAgent
from computer import Computer
from dotenv import load_dotenv

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

load_dotenv()

async def automate_hr_workflow():
    try:
        # Connect to self-hosted Windows VM running computer-server
        async with Computer(
            use_host_computer_server=True,
            base_url="http://your-windows-vm-ip:5757",  # Update with your VM IP
            verbosity=logging.INFO,
        ) as computer:

            agent = ComputerAgent(
                model="cua/anthropic/claude-sonnet-4.5",
                tools=[computer],
                only_n_most_recent_images=3,
                verbosity=logging.INFO,
                trajectory_dir="trajectories",
                use_prompt_caching=True,
                max_trajectory_budget=10.0,
                instructions="""
                You are automating a Windows desktop HR/payroll application.

                IMPORTANT GUIDELINES:
                - Always wait for windows and dialogs to fully load before interacting
                - Verify each action by checking on-screen confirmation messages
                - Save work frequently using File > Save or Ctrl+S
                """.strip()
            )

            tasks = [
                "Launch the HR application",
                "Log in with provided credentials",
                "Complete the required HR workflow",
                "Generate and export report",
                "Log out"
            ]

            history = []

            for task in tasks:
                logger.info(f"\nTask: {task}")
                history.append({"role": "user", "content": task})

                async for result in agent.run(history):
                    for item in result.get("output", []):
                        if item.get("type") == "message":
                            content = item.get("content", [])
                            for block in content:
                                if block.get("type") == "text":
                                    response = block.get("text", "")
                                    logger.info(f"Agent: {response}")
                                    history.append({"role": "assistant", "content": response})

            logger.info("\nAll tasks completed!")

    except Exception as e:
        logger.error(f"Error: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    asyncio.run(automate_hr_workflow())
```

</Tab>
</Tabs>

</Step>

<Step>

### Run Your Automation

Execute the script:

```bash
python hr_automation.py
```

The agent will:

1. Connect to your Windows environment (with VPN if configured)
2. Launch and navigate the desktop application
3. Execute each workflow step sequentially
4. Verify actions and handle errors
5. Save trajectory logs for audit and debugging

Monitor the console output to see the agent's progress through each task.

</Step>

</Steps>

---

## Key Configuration Options

### Agent Instructions

The `instructions` parameter is critical for reliable desktop automation:

```python
instructions="""
You are automating a Windows desktop HR/payroll application.

IMPORTANT GUIDELINES:
- Always wait for windows and dialogs to fully load before interacting
- Look for loading indicators and wait for them to disappear
- Verify each action by checking on-screen confirmation messages
- If a button or field is not visible, try scrolling or navigating tabs
- Desktop apps often have nested menus - explore systematically
- Save work frequently using File > Save or Ctrl+S
- Before closing, always verify changes were saved

COMMON UI PATTERNS:
- Menu bar navigation (File, Edit, View, etc.)
- Ribbon interfaces with tabs
- Modal dialogs that block interaction
- Data grids/tables for viewing records
- Form fields with validation
- Status bars showing operation progress

APPLICATION-SPECIFIC:
- Login is at the top-left corner
- Employee records are under "HR Management" > "Employees"
- Reports are generated via "Tools" > "Reports" > "Generate"
- Always click "Save" before navigating away from a form
""".strip()
```

### Budget Management

For long-running workflows, adjust budget limits:

```python
agent = ComputerAgent(
    model="cua/anthropic/claude-sonnet-4.5",
    tools=[computer],
    max_trajectory_budget=20.0,  # Increase for complex workflows
    # ... other params
)
```

### Image Retention

Balance context and cost by retaining only recent screenshots:

```python
agent = ComputerAgent(
    # ...
    only_n_most_recent_images=3,  # Keep last 3 screenshots
    # ...
)
```

---

## Production Considerations

<Callout type="warn" title="Production Deployment">
  For enterprise production deployments, consider these additional steps:
</Callout>

### 1. Workflow Mining

Before deploying, analyze your actual workflows (the trajectory logs the agent already writes are a useful starting point - see the sketch after this list):

- Record user interactions with the application
- Identify common patterns and edge cases
- Map out decision trees and validation requirements
- Document application-specific quirks and timing issues

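As a first pass, you can inventory the artifacts the agent saved under `trajectory_dir`. This minimal sketch only assumes a `trajectories/` directory exists; the exact file layout depends on your agent version, so adjust the walk to match what you find:

```python
from collections import Counter
from pathlib import Path

def summarize_trajectories(root: str = "trajectories") -> None:
    """Tally saved trajectory artifacts by file extension."""
    counts: Counter = Counter()
    for path in Path(root).rglob("*"):
        if path.is_file():
            counts[path.suffix or "<none>"] += 1
    for suffix, count in counts.most_common():
        print(f"{suffix}: {count} file(s)")

if __name__ == "__main__":
    summarize_trajectories()
```
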
### 2. Custom Finetuning

Create vertical-specific actions instead of generic UI automation:

```python
# Instead of generic steps:
tasks = ["Click login", "Type username", "Type password", "Click submit"]

# Create semantic actions:
tasks = ["onboard_employee", "run_payroll", "generate_compliance_report"]
```

This provides:

- Better audit trails
- Approval gates at the business logic level
- Higher success rates
- Easier maintenance and updates

### 3. Human-in-the-Loop

Add approval gates for critical operations:

```python
agent = ComputerAgent(
    model="cua/anthropic/claude-sonnet-4.5",
    tools=[computer],
    # Add human approval callback for sensitive operations
    callbacks=[ApprovalCallback(require_approval_for=["payroll", "termination"])]
)
```

### 4. Deployment Options

Choose your deployment model:

**Managed (Recommended)**

- Cua hosts Windows sandboxes, the VPN/RDP stack, and the agent runtime
- You get UI/API endpoints for triggering workflows
- Automatic scaling, monitoring, and maintenance
- SLA guarantees and enterprise support

**Self-Hosted**

- You manage Windows VMs, VPN infrastructure, and agent deployment
- Full control over data and security
- Custom network configurations
- On-premise or your preferred cloud

---

## Troubleshooting

### VPN Connection Issues

If the agent cannot reach the application:

1. Verify the VPN is connected: Check the VPN client status in the Windows sandbox
2. Test network connectivity: Try pinging internal resources (see the sketch after this list)
3. Check firewall rules: Ensure RDP and application ports are open
4. Review VPN logs: Look for authentication or routing errors

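A quick way to check item 2 from inside the sandbox is a plain TCP probe. This is a minimal sketch; the host and port are placeholders for an internal resource that should only be reachable while the VPN is up:

```python
import socket

def can_reach(host: str, port: int, timeout: float = 3.0) -> bool:
    """Return True if a TCP connection to host:port succeeds."""
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False

if __name__ == "__main__":
    host, port = "intranet.example.corp", 443  # hypothetical internal endpoint
    print(f"{host}:{port} is", "reachable" if can_reach(host, port) else "unreachable")
```
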
### Application Not Launching

If the desktop application fails to start:

1. Verify installation: Check that the application is installed in the sandbox
2. Check dependencies: Ensure all required DLLs and frameworks are present
3. Review permissions: The application may require admin rights
4. Check logs: Look for error messages in Windows Event Viewer

### UI Element Not Found

If the agent cannot find buttons or fields:

1. Increase wait times: Some applications load slowly (see the sketch after this list)
2. Check screen resolution: UI elements may be off-screen
3. Verify DPI scaling: High DPI settings can affect element positions
4. Update instructions: Provide more specific navigation guidance

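For item 1, an explicit settle-then-verify helper between tasks often helps. Here is a minimal sketch using the `computer.interface` API shown in the quickstart; the 5-second delay is an arbitrary starting point, so tune it to your application:

```python
import asyncio

async def settle_and_snapshot(computer, seconds: float = 5.0) -> bytes:
    """Wait for slow UI to settle, then capture a screenshot to inspect."""
    await asyncio.sleep(seconds)  # let dialogs and loading indicators finish
    return await computer.interface.screenshot()
```
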
### Cost Management

If costs are higher than expected:

1. Reduce `max_trajectory_budget`
2. Decrease `only_n_most_recent_images`
3. Use prompt caching: Set `use_prompt_caching=True`
4. Optimize task descriptions: Be more specific to reduce retry attempts (see the combined sketch after this list)

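Applied together, these settings look like the sketch below. The values are illustrative starting points, not recommendations, and `computer` is the connected Computer instance from earlier steps:

```python
from agent import ComputerAgent

agent = ComputerAgent(
    model="cua/anthropic/claude-sonnet-4.5",
    tools=[computer],
    max_trajectory_budget=5.0,      # 1. lower hard budget cap
    only_n_most_recent_images=2,    # 2. keep fewer screenshots in context
    use_prompt_caching=True,        # 3. reuse cached prompt prefixes
)
```
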
---

## Next Steps

- **Explore custom tools**: Learn how to create [custom tools](/agent-sdk/custom-tools) for application-specific actions
- **Implement callbacks**: Add [monitoring and logging](/agent-sdk/callbacks) for production workflows
- **Join community**: Get help in our [Discord](https://discord.com/invite/mVnXXpdE85)

---

## Related Examples

- [Form Filling](/example-usecases/form-filling) - Web form automation
- [Post-Event Contact Export](/example-usecases/post-event-contact-export) - Data extraction workflows
- [Custom Tools](/agent-sdk/custom-tools) - Building application-specific functions
7
docs/content/docs/get-started/meta.json
Normal file
7
docs/content/docs/get-started/meta.json
Normal file
@@ -0,0 +1,7 @@
{
  "title": "Get Started",
  "description": "Get started with Cua",
  "defaultOpen": true,
  "icon": "Rocket",
  "pages": ["../index", "quickstart"]
}
571
docs/content/docs/get-started/quickstart.mdx
Normal file
571
docs/content/docs/get-started/quickstart.mdx
Normal file
@@ -0,0 +1,571 @@
---
title: Quickstart
description: Get started with Cua
---

import { Step, Steps } from 'fumadocs-ui/components/steps';
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
import { Accordion, Accordions } from 'fumadocs-ui/components/accordion';
import { Code, Terminal } from 'lucide-react';

{/* Choose your quickstart path:

<div className="grid grid-cols-1 md:grid-cols-2 gap-6 mt-8 mb-8">
  <Card icon={<Code />} href="#developer-quickstart" title="Developer Quickstart">
    Build with Python or TypeScript SDKs - full programmatic control
  </Card>
  <Card icon={<Terminal />} href="#cli-quickstart" title="CLI Quickstart">
    Get started quickly with the command-line interface
  </Card>
</div> */}

---

## Set Up Your Computer Environment

Choose how you want to run your Cua computer. This will be the environment where your automated tasks will execute.

You can run your Cua computer in the cloud (recommended for the easiest setup), locally on macOS with Lume, locally on Windows with a Windows Sandbox, or in a Docker container on any platform. Choose the option that matches your system and needs.

<Tabs items={['Cloud Sandbox', 'Linux on Docker', 'macOS Sandbox', 'Windows Sandbox']}>
<Tab value="Cloud Sandbox">

Create and manage cloud sandboxes that run Linux (Ubuntu), Windows, or macOS.

**First, create your API key:**

1. Go to [cua.ai/signin](https://cua.ai/signin)
2. Navigate to **Dashboard > API Keys > New API Key** to create your API key
3. **Important:** Copy and save your API key immediately - you won't be able to see it again (you'll need to regenerate it if lost)

**Then, create your sandbox using either option:**

**Option 1: Via Website**

1. Navigate to **Dashboard > Sandboxes > Create Sandbox**
2. Create a **Small** sandbox, choosing **Linux**, **Windows**, or **macOS**
3. Note your sandbox name

**Option 2: Via CLI**

1. Install the CUA CLI:
   ```bash
   # macOS/Linux
   curl -LsSf https://cua.ai/cli/install.sh | sh

   # Windows
   powershell -ExecutionPolicy Bypass -c "irm https://cua.ai/cli/install.ps1 | iex"
   ```

2. Login and create a sandbox:
   ```bash
   cua auth login
   cua sb create --os linux --size small --region north-america
   ```

3. Note your sandbox name and password from the output

Your Cloud Sandbox will be automatically configured and ready to use.

</Tab>
<Tab value="Linux on Docker">

Run a Linux desktop locally on macOS, Windows, or Linux hosts.

1. Install Docker Desktop or Docker Engine

2. Pull a CUA Docker image:

   ```bash
   # XFCE (Lightweight) - recommended for most use cases
   docker pull --platform=linux/amd64 trycua/cua-xfce:latest

   # OR KASM (Full-Featured) - full Ubuntu desktop
   docker pull --platform=linux/amd64 trycua/cua-ubuntu:latest
   ```

</Tab>
<Tab value="macOS Sandbox">

macOS hosts only - requires the Lume CLI.

1. Install the Lume CLI:

   ```bash
   /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)"
   ```

2. Start a local Cua sandbox:

   ```bash
   lume run macos-sequoia-cua:latest
   ```

</Tab>
<Tab value="Windows Sandbox">

Windows hosts only - requires Windows 10 Pro/Enterprise or Windows 11.

1. Enable [Windows Sandbox](https://learn.microsoft.com/en-us/windows/security/application-security/application-isolation/windows-sandbox/windows-sandbox-install)
2. Install the `pywinsandbox` dependency:

   ```bash
   pip install -U git+https://github.com/karkason/pywinsandbox.git
   ```

3. Windows Sandbox will be automatically configured when you run the CLI

</Tab>
</Tabs>

---

## Developer Quickstart

<Steps>

<Step>

### Using Computer

Connect to your Cua computer and perform basic interactions, such as taking screenshots or simulating user input.

<Tabs items={['Python', 'TypeScript']}>
<Tab value="Python">

Install the Cua computer Python SDK:

```bash
pip install cua-computer
```

Then, connect to your desired computer environment:

<Tabs items={['Cloud Sandbox', 'Linux on Docker', 'macOS Sandbox', 'Windows Sandbox', 'Your host desktop']}>
<Tab value="Cloud Sandbox">

Set your CUA API key (same key used for model inference) and connect to your sandbox:

```python
import os
from computer import Computer

os.environ["CUA_API_KEY"] = "sk_cua-api01_..."

computer = Computer(
    os_type="linux",  # or "windows" or "macos"
    provider_type="cloud",
    name="your-sandbox-name"  # from CLI or website
)
await computer.run()  # Connect to the sandbox
```

</Tab>
<Tab value="Linux on Docker">

```python
from computer import Computer

computer = Computer(
    os_type="linux",
    provider_type="docker",
    image="trycua/cua-xfce:latest"  # or "trycua/cua-ubuntu:latest"
)
await computer.run()  # Launch & connect to the sandbox
```

</Tab>
<Tab value="macOS Sandbox">

```python
from computer import Computer

computer = Computer(
    os_type="macos",
    provider_type="lume",
    name="macos-sequoia-cua:latest"
)
await computer.run()  # Launch & connect to the sandbox
```

</Tab>
<Tab value="Windows Sandbox">

```python
from computer import Computer

computer = Computer(
    os_type="windows",
    provider_type="windows_sandbox"
)
await computer.run()  # Launch & connect to the sandbox
```

</Tab>
<Tab value="Your host desktop">

Install and run `cua-computer-server`:

```bash
pip install cua-computer-server
python -m computer_server
```

Then, use the `Computer` object to connect:

```python
from computer import Computer

computer = Computer(use_host_computer_server=True)
await computer.run()  # Connect to the host desktop
```

</Tab>
</Tabs>

Once connected, you can perform interactions:

```python
try:
    # Take a screenshot of the computer's current display
    screenshot = await computer.interface.screenshot()
    # Simulate a left-click at coordinates (100, 100)
    await computer.interface.left_click(100, 100)
    # Type "Hello!" into the active application
    await computer.interface.type_text("Hello!")
finally:
    await computer.close()
```

</Tab>
<Tab value="TypeScript">

<Callout type="warn" title="TypeScript SDK Deprecated">
  The TypeScript interface is currently deprecated. We're working on version 0.2.0 with improved TypeScript support. In the meantime, please use the Python SDK.
</Callout>

Install the Cua computer TypeScript SDK:

```bash
npm install @trycua/computer
```

Then, connect to your desired computer environment:

<Tabs items={['Cloud Sandbox', 'Linux on Docker', 'macOS Sandbox', 'Windows Sandbox', 'Your host desktop']}>
<Tab value="Cloud Sandbox">

Set your CUA API key (same key used for model inference):

```bash
export CUA_API_KEY="sk_cua-api01_..."
```

Then connect to your sandbox:

```typescript
import { Computer, OSType } from '@trycua/computer';

const computer = new Computer({
  osType: OSType.LINUX, // or OSType.WINDOWS or OSType.MACOS
  name: "your-sandbox-name" // from CLI or website
});
await computer.run(); // Connect to the sandbox
```

</Tab>
<Tab value="Linux on Docker">

```typescript
import { Computer, OSType, ProviderType } from '@trycua/computer';

const computer = new Computer({
  osType: OSType.LINUX,
  providerType: ProviderType.DOCKER,
  image: "trycua/cua-xfce:latest" // or "trycua/cua-ubuntu:latest"
});
await computer.run(); // Launch & connect to the sandbox
```

</Tab>
<Tab value="macOS Sandbox">

```typescript
import { Computer, OSType, ProviderType } from '@trycua/computer';

const computer = new Computer({
  osType: OSType.MACOS,
  providerType: ProviderType.LUME,
  name: "macos-sequoia-cua:latest"
});
await computer.run(); // Launch & connect to the sandbox
```

</Tab>
<Tab value="Windows Sandbox">

```typescript
import { Computer, OSType, ProviderType } from '@trycua/computer';

const computer = new Computer({
  osType: OSType.WINDOWS,
  providerType: ProviderType.WINDOWS_SANDBOX
});
await computer.run(); // Launch & connect to the sandbox
```

</Tab>
<Tab value="Your host desktop">

First, install and run `cua-computer-server`:

```bash
pip install cua-computer-server
python -m computer_server
```

Then, use the `Computer` object to connect:

```typescript
import { Computer } from '@trycua/computer';

const computer = new Computer({ useHostComputerServer: true });
await computer.run(); // Connect to the host desktop
```

</Tab>
</Tabs>

Once connected, you can perform interactions:

```typescript
try {
  // Take a screenshot of the computer's current display
  const screenshot = await computer.interface.screenshot();
  // Simulate a left-click at coordinates (100, 100)
  await computer.interface.leftClick(100, 100);
  // Type "Hello!" into the active application
  await computer.interface.typeText("Hello!");
} finally {
  await computer.close();
}
```

</Tab>
</Tabs>

Learn more about computers in the [Cua computers documentation](/computer-sdk/computers). You will see how to automate computers with agents in the next step.

</Step>

<Step>

### Using Agent

Use an Agent to automate complex tasks by providing it with a goal and allowing it to interact with the computer environment.

Install the Cua agent Python SDK:

```bash
pip install "cua-agent[all]"
```

Choose how you want to access vision-language models for your agent:

<Tabs items={['CUA VLM Router', 'BYOK (Bring Your Own Key)']}>
<Tab value="CUA VLM Router">

Use CUA's inference API to access multiple model providers with a single API key (the same key used for sandbox access). CUA VLM Router provides intelligent routing and cost optimization.

**Use the agent with CUA models:**

```python
import os
from agent import ComputerAgent

os.environ["CUA_API_KEY"] = "sk_cua-api01_..."

agent = ComputerAgent(
    model="cua/anthropic/claude-sonnet-4.5",  # CUA-routed model
    tools=[computer],
    max_trajectory_budget=5.0
)

messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]

async for result in agent.run(messages):
    for item in result["output"]:
        if item["type"] == "message":
            print(item["content"][0]["text"])
```

**Available CUA models:**

- `cua/anthropic/claude-sonnet-4.5` - Claude Sonnet 4.5 (recommended)
- `cua/anthropic/claude-opus-4.5` - Claude Opus 4.5 (enhanced agentic capabilities)
- `cua/anthropic/claude-haiku-4.5` - Claude Haiku 4.5 (faster, cost-effective)
- `cua/qwen/qwen3-vl-235b` - Qwen3 VL 235B (large-scale vision-language tasks)

**Benefits:**

- Single API key for multiple providers
- Cost tracking and optimization
- No need to manage multiple provider keys

</Tab>
<Tab value="BYOK (Bring Your Own Key)">

Use your own API keys from model providers like Anthropic, OpenAI, or others.

**Use the agent with your provider:**

```python
import os
from agent import ComputerAgent

# Set your provider API key
os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..."  # For Anthropic
# OR
os.environ["OPENAI_API_KEY"] = "sk-..."  # For OpenAI

agent = ComputerAgent(
    model="anthropic/claude-sonnet-4-5-20250929",  # Direct provider model
    tools=[computer],
    max_trajectory_budget=5.0
)

messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]

async for result in agent.run(messages):
    for item in result["output"]:
        if item["type"] == "message":
            print(item["content"][0]["text"])
```

**Supported providers:**

- `anthropic/claude-*` - Anthropic Claude models
- `openai/gpt-*` - OpenAI GPT models
- `openai/o1-*` - OpenAI o1 models
- `huggingface-local/*` - Local HuggingFace models
- And many more via LiteLLM

See [Supported Models](/agent-sdk/supported-model-providers/) for the complete list.

</Tab>
</Tabs>

Learn more about agents in [Agent Loops](/agent-sdk/agent-loops) and available models in [Supported Models](/agent-sdk/supported-model-providers/).

</Step>
</Steps>

### Next Steps

- Learn more about [Cua computers](/computer-sdk/computers) and [computer commands](/computer-sdk/commands)
- Read about [Agent loops](/agent-sdk/agent-loops), [tools](/agent-sdk/custom-tools), and [supported model providers](/agent-sdk/supported-model-providers/)
- Join our [Discord community](https://discord.com/invite/mVnXXpdE85) for help
- Try out the [Form Filling](/example-usecases/form-filling) preset use case

{/* ---

## CLI Quickstart

Get started quickly with the CUA CLI - the easiest way to manage cloud sandboxes and run AI agents.

<Steps>
<Step>

### Install the CUA CLI

<Tabs items={['macOS / Linux', 'Windows', 'Bun (Alternative)', 'From Source']}>
<Tab value="macOS / Linux">

```bash
curl -LsSf https://cua.ai/cli/install.sh | sh
```

</Tab>
<Tab value="Windows">

```powershell
powershell -ExecutionPolicy Bypass -c "irm https://cua.ai/cli/install.ps1 | iex"
```

</Tab>
<Tab value="Bun (Alternative)">

```bash
# Install Bun if you don't have it
curl -fsSL https://bun.sh/install | bash

# Install CUA CLI
bun add -g @trycua/cli
```

</Tab>
<Tab value="From Source">

```bash
# Install Bun (macOS/Linux)
curl -fsSL https://bun.sh/install | bash

# Install Bun (Windows)
# powershell -c "irm bun.sh/install.ps1|iex"

# Clone the repo
git clone https://github.com/trycua/cua
cd cua/libs/typescript/cua-cli

# Install the CLI
bun install
bun link
bun link cua-cli
```

</Tab>
</Tabs>

</Step>

<Step>

### Authenticate with CUA

Login to your CUA account:

```bash
# Interactive browser login (recommended)
cua auth login

# Or provide your API key directly
cua auth login --api-key sk-your-api-key-here
```

If you don't have a CUA account yet, sign up at [cua.ai/signin](https://cua.ai/signin).

</Step>

<Step>

### Create Your First Sandbox

Create a cloud sandbox where your AI agents will run:

```bash
# Create a Linux sandbox (recommended for most use cases)
cua sb create --os linux --size small --region north-america

# Or create a Windows sandbox
cua sb create --os windows --size small --region north-america
```

Your sandbox will be created and you'll see output like:

```
Sandbox created and ready: my-sandbox-abc123
Password: secure-password-here
Host: my-sandbox-abc123.sandbox.cua.ai
```

</Step>

<Step>

### Start Using Your Sandbox

You can now interact with your sandbox in multiple ways:

#### Option 1: Access VNC Desktop

```bash
cua sb vnc my-sandbox-abc123
```

This opens a remote desktop connection to your sandbox.

#### Option 2: List and Manage Sandboxes

```bash
# List all your sandboxes
cua sb list

# Start/stop sandboxes as needed
cua sb stop my-sandbox-abc123
cua sb start my-sandbox-abc123

# Delete sandboxes when done
cua sb delete my-sandbox-abc123
```

</Step>

</Steps>

### What's Next?

- **Explore more commands**: Check out the [complete CLI reference](/libraries/cua-cli/commands)
- **Learn about programming**: Try the [Developer Quickstart](#developer-quickstart) to build custom automations
- **Join the community**: Get help in our [Discord community](https://discord.com/invite/mVnXXpdE85)

---

For running models locally, see [Running Models Locally](/agent-sdk/supported-model-providers/local-models). */}
@@ -1,25 +1,58 @@
---
title: Home
icon: House
title: Introduction
---

import { Monitor, Code, BookOpen } from 'lucide-react';
import { Monitor, Code, BookOpen, Zap, Bot, Boxes, Rocket } from 'lucide-react';

# Welcome!
<div className="rounded-lg border bg-card text-card-foreground shadow-sm px-4 py-2 mb-6">
  Cua is an open-source framework for building **Computer-Use Agents** - AI systems that see,
  understand, and interact with desktop applications through vision and action, just like humans do.
</div>

Cua is a framework for automating Windows, Mac, and Linux apps powered by computer-using agents (CUAs).
## Why Cua?

Cua makes every stage of computer-using agent development simple:
Cua gives you everything you need to automate any desktop application without brittle selectors or APIs.

- **Development**: Use any LLM provider with liteLLM. The agent SDK makes multiple agent loop providers, trajectory tracing, caching, and budget management easy
- **Containerization**: Cua offers Docker containers pre-installed with everything needed for AI-powered RPA
- **Deployment**: Cua cloud gives you a production-ready cloud environment for your assistants
Some highlights include:

- **Model flexibility** - Connect to 100+ LLM providers through liteLLM's standard interface. Use models from Anthropic, OpenAI, Google, and more - or run them locally with Ollama, Hugging Face, or MLX.
- **Composed agents** - Mix and match grounding models with planning models for optimal performance. Use specialized models like GTA, OpenCUA, or OmniParser for UI element detection paired with powerful reasoning models like Claude or GPT-4.
- **Cross-platform sandboxes** - Run agents safely in isolated environments. Choose from Docker containers, macOS VMs with Lume, Windows Sandbox, or deploy to Cua Cloud with production-ready infrastructure.
- **Computer SDK** - Control any application with a PyAutoGUI-like API. Click, type, scroll, take screenshots, manage windows, read/write files - everything you need for desktop automation.
- **Agent SDK** - Build autonomous agents with trajectory tracing, prompt caching, cost tracking, and budget controls. Test agents on industry-standard benchmarks like OSWorld-Verified with one line of code.
- **Human-in-the-loop** - Pause agent execution and await user input or approval before continuing. Use the `human/human` model string to let humans control the agent directly.
- **Production essentials** - Ship reliable agents with built-in PII anonymization, cost tracking, trajectory logging, and integration with observability platforms like Laminar and HUD.

## What can you build?

- RPA automation that works with any application - even legacy software without APIs.
- Form-filling agents that handle complex multi-step web workflows.
- Testing automation that adapts to UI changes without brittle selectors.
- Data extraction from desktop applications and document processing.
- Cross-application workflows that combine multiple tools and services.
- Research agents that browse, read, and synthesize information from the web.

Explore real-world examples in our [blog posts](https://cua.ai/blog).

## Get started

Follow the [Quickstart guide](/docs/get-started/quickstart) for step-by-step setup with Python or TypeScript.

If you're new to computer-use agents, check out our [tutorials](https://cua.ai/blog), [examples](https://github.com/trycua/cua/tree/main/examples), and [notebooks](https://github.com/trycua/cua/tree/main/notebooks) to start building with Cua today.

<div className="grid grid-cols-1 md:grid-cols-2 gap-6 mt-8">
  <Card icon={<Monitor />} href="/quickstart-devs" title="Quickstart (Developers)">
    Build with Python—full SDK and agent code examples.
  <Card icon={<Rocket />} href="/get-started/quickstart" title="Quickstart">
    Get up and running in 3 steps with Python or TypeScript.
  </Card>
  <Card icon={<BookOpen />} href="/libraries/agent" title="API Reference">
    Explore the agent SDK and APIs
  <Card icon={<Zap />} href="/agent-sdk/agent-loops" title="Agent Loops">
    Learn how agents work and how to build your own.
  </Card>
  <Card icon={<BookOpen />} href="/computer-sdk/computers" title="Computer SDK">
    Control desktop applications with the Computer SDK.
  </Card>
  <Card icon={<Monitor />} href="/example-usecases/form-filling" title="Example Use Cases">
    See Cua in action with real-world examples.
  </Card>
</div>

We can't wait to see what you build with Cua ✨

360
docs/content/docs/libraries/cua-cli/commands.mdx
Normal file
360
docs/content/docs/libraries/cua-cli/commands.mdx
Normal file
@@ -0,0 +1,360 @@
---
title: Commands
description: Complete reference for all CUA CLI commands
---

import { Tabs, Tab } from 'fumadocs-ui/components/tabs';
import { Callout } from 'fumadocs-ui/components/callout';

## Overview

The CUA CLI provides commands for authentication and sandbox management.

### Command Styles

The CLI supports **two command styles** for flexibility:

**Flat style** (quick & concise):

```bash
cua list
cua create --os linux --size small --region north-america
cua start my-sandbox
```

**Grouped style** (explicit & clear):

```bash
cua sb list    # or: cua sandbox list
cua sb create  # or: cua sandbox create
cua sb start   # or: cua sandbox start
```

Both styles work identically - use whichever you prefer!

### Available Commands

- **Authentication** - `cua auth login`, `cua auth env`, `cua auth logout` (also available as flat commands: `cua login`, `cua env`, `cua logout`)
- **Sandbox Management** - `cua list`, `cua create`, `cua start`, `cua stop`, `cua restart`, `cua delete`, `cua vnc`

## Authentication Commands

### `cua auth login`

Authenticate with your CUA account using a browser-based OAuth flow.

```bash
# Interactive browser login
cua auth login

# Direct API key login
cua auth login --api-key sk-your-api-key-here

# Alternative flat style
cua login
cua login --api-key sk-your-api-key-here
```

**Options:**

- `--api-key <key>` - Provide an API key directly instead of using the browser flow

**Example:**

```bash
$ cua auth login
Opening browser for CLI auth...
API key saved
```

### `cua auth env`

Create or update a `.env` file in the current directory with your CUA API key.

```bash
cua auth env

# Alternative flat style
cua env
```

**Example:**

```bash
$ cua auth env
Wrote /path/to/your/project/.env
```

The generated `.env` file will contain:

```
CUA_API_KEY=sk-your-api-key-here
```
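
The CLI only writes the file; nothing is loaded into your shell automatically. As a minimal sketch of one way to pick the key up in a shell session (any standard dotenv loader works just as well):

```bash
# Export every variable defined in .env into the current shell
set -a
. ./.env
set +a

# Sanity check: the key should now be visible to child processes
printenv CUA_API_KEY
```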

### `cua auth logout`

Remove the stored API key from your system.

```bash
cua auth logout

# Alternative flat style
cua logout
```

**Example:**

```bash
$ cua auth logout
Logged out
```

## Sandbox Commands

### `cua list`

List all your sandboxes with their current status. Passwords are hidden by default for security.

```bash
# List sandboxes (passwords hidden)
cua list

# Show passwords explicitly
cua list --show-passwords

# Alternative aliases
cua ls
cua ps
```

**Example Output (default, passwords hidden):**

```
NAME             STATUS    HOST
my-dev-sandbox   running   my-dev-sandbox.sandbox.cua.ai
test-windows     stopped   test-windows.sandbox.cua.ai
```

**Example Output (with --show-passwords):**

```
NAME             STATUS    PASSWORD           HOST
my-dev-sandbox   running   secure-pass-123    my-dev-sandbox.sandbox.cua.ai
test-windows     stopped   another-pass-456   test-windows.sandbox.cua.ai
```

### `cua create`

Create a new sandbox.

```bash
cua create --os <OS> --size <SIZE> --region <REGION>
```

**Required Options:**

- `--os` - Operating system: `linux`, `windows`, `macos`
- `--size` - Sandbox size: `small`, `medium`, `large`
- `--region` - Region: `north-america`, `europe`, `asia-pacific`, `south-america`

**Examples:**

```bash
# Create a small Linux sandbox in North America
cua create --os linux --size small --region north-america

# Create a medium Windows sandbox in Europe
cua create --os windows --size medium --region europe

# Create a large macOS sandbox in Asia Pacific
cua create --os macos --size large --region asia-pacific
```

**Response Types:**

**Immediate (Status 200):**

```bash
Sandbox created and ready: my-new-sandbox-abc123
Password: secure-password-here
Host: my-new-sandbox-abc123.sandbox.cua.ai
```

**Provisioning (Status 202):**

```bash
Sandbox provisioning started: my-new-sandbox-abc123
Job ID: job-xyz789
Use 'cua list' to monitor provisioning progress
```

### `cua start`

Start a stopped sandbox.

```bash
cua start <name>
```

**Example:**

```bash
$ cua start my-dev-sandbox
Start accepted
```

### `cua stop`

Stop a running sandbox.

```bash
cua stop <name>
```

**Example:**

```bash
$ cua stop my-dev-sandbox
stopping
```

### `cua restart`

Restart a sandbox.

```bash
cua restart <name>
```

**Example:**

```bash
$ cua restart my-dev-sandbox
restarting
```

### `cua delete`

Delete a sandbox permanently.

```bash
cua delete <name>
```

**Example:**

```bash
$ cua delete old-test-sandbox
Sandbox deletion initiated: deleting
```

<Callout type="warn">
This action is irreversible. All data on the sandbox will be permanently lost.
</Callout>

### `cua vnc`

Open the VNC interface for a sandbox in your browser.

```bash
cua vnc <name>

# Alternative alias
cua open <name>
```

**Example:**

```bash
$ cua vnc my-dev-sandbox
Opening NoVNC: https://my-dev-sandbox.sandbox.cua.ai/vnc.html?autoconnect=true&password=...
```

This command automatically opens your default browser to the VNC interface with the correct password pre-filled.

## Global Options

### Help

Get help for any command:

```bash
cua --help
cua auth login --help
cua create --help
cua list --help
```

## Error Handling

The CLI provides clear error messages for common issues:

### Authentication Errors

```bash
$ cua list
Unauthorized. Try 'cua auth login' again.
```

### Sandbox Not Found

```bash
$ cua start nonexistent-sandbox
Sandbox not found
```

### Invalid Configuration

```bash
$ cua create --os invalid --size small --region north-america
Invalid request or unsupported configuration
```

## Tips and Best Practices

### 1. Use Descriptive Sandbox Names

```bash
# Good
cua create --os linux --size small --region north-america
# Then rename or use meaningful names in the dashboard

# Better workflow
cua list  # Check the generated name
# Use that name consistently
```

### 2. Environment Management

```bash
# Set up your project with API key
cd my-project
cua auth env
# Now your project has CUA_API_KEY in .env
```

### 3. Quick Sandbox Access

```bash
# Create aliases for frequently used sandboxes
alias dev-sandbox="cua vnc my-development-sandbox"
alias prod-sandbox="cua vnc my-production-sandbox"
```

### 4. Monitoring Provisioning

```bash
# For sandboxes that need provisioning time
cua create --os windows --size large --region europe
# Sandbox provisioning started: my-sandbox-abc123
# Job ID: job-xyz789

# Check status periodically
watch -n 5 cua list
```

## Next Steps

- [Get started with the quickstart guide](/get-started/quickstart#cli-quickstart)
- [Learn about CUA computers](/computer-sdk/computers)
- [Explore agent automation](/agent-sdk/agent-loops)
58
docs/content/docs/libraries/cua-cli/index.mdx
Normal file
@@ -0,0 +1,58 @@
---
title: Cua CLI
description: Command-line interface for managing Cua cloud sandboxes and authentication
---

import { Tabs, Tab } from 'fumadocs-ui/components/tabs';

The Cua CLI is a command-line tool that provides an intuitive interface for managing your Cua cloud sandboxes and authentication. It offers a streamlined workflow for creating, managing, and connecting to cloud sandboxes.

## Key Features

- **Authentication Management**: Secure login with a browser-based OAuth flow
- **Sandbox Lifecycle**: Create, start, stop, restart, and delete cloud sandboxes
- **Quick Access**: Direct links to VNC and playground interfaces
- **Cross-Platform**: Works on macOS, Linux, and Windows
- **Environment Integration**: Automatic `.env` file generation

## Quick Example

```bash
# Install the CLI (installs Bun + CUA CLI)
curl -LsSf https://cua.ai/cli/install.sh | sh

# Login to your CUA account
cua auth login

# Create a new Linux sandbox
cua sb create --os linux --size small --region north-america

# List your sandboxes
cua sb list
```

## Use Cases

### Development Workflow

- Quickly spin up cloud sandboxes for testing
- Manage multiple sandboxes across different regions
- Integrate with CI/CD pipelines

### Team Collaboration

- Share sandbox configurations and access
- Standardize development environments
- Quick onboarding for new team members

### Automation

- Script sandbox provisioning and management (see the sketch below)
- Integrate with deployment workflows
- Automate environment setup
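
As a minimal sketch of what scripted provisioning could look like (the "running" status string and polling interval are assumptions; adjust them to the output `cua list` actually prints for you):

```bash
#!/usr/bin/env bash
set -euo pipefail

# Provision a sandbox using the documented create flags
cua create --os linux --size small --region north-america

# Poll until a sandbox reports "running" (illustrative check)
until cua list | grep -q "running"; do
  sleep 5
done

echo "Sandbox is ready"
```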

## Next Steps

- [Install the CLI](/libraries/cua-cli/installation)
- [Learn about available commands](/libraries/cua-cli/commands)
- [Get started with the quickstart guide](/get-started/quickstart#cli-quickstart)
130
docs/content/docs/libraries/cua-cli/installation.mdx
Normal file
@@ -0,0 +1,130 @@
---
title: Installation
description: Install the CUA CLI on your system
---

import { Tabs, Tab } from 'fumadocs-ui/components/tabs';
import { Callout } from 'fumadocs-ui/components/callout';

## Quick Install

The fastest way to install the CUA CLI is using our installation scripts:

<Tabs items={['macOS / Linux', 'Windows']}>
<Tab value="macOS / Linux">

```bash
curl -LsSf https://cua.ai/cli/install.sh | sh
```

</Tab>
<Tab value="Windows">

```powershell
powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
```

</Tab>
</Tabs>

These scripts will automatically:

1. Install [Bun](https://bun.sh) (a fast JavaScript runtime)
2. Install the CUA CLI via `bun add -g @trycua/cli`

<Callout type="info">
The installation scripts will automatically detect your system and install the appropriate binary to your PATH.
</Callout>

## Alternative: Install with Bun

You can also install the CLI directly using Bun:

```bash
# Install Bun if you don't have it
curl -fsSL https://bun.sh/install | bash

# Install CUA CLI
bun add -g @trycua/cli
```

<Callout type="info">
Using Bun provides faster installation and better performance compared to npm. If you don't have Bun installed, the first command will install it for you.
</Callout>

## Verify Installation

After installation, verify the CLI is working:

```bash
cua --help
```

You should see the CLI help output with available commands.

## First-Time Setup

After installation, you'll need to authenticate with your CUA account:

```bash
# Login with browser-based OAuth flow
cua auth login

# Or provide your API key directly
cua auth login --api-key sk-your-api-key-here
```

## Updating

To update to the latest version:

<Tabs items={['Script Install', 'npm Install']}>
<Tab value="Script Install">

Re-run the installation script:

```bash
# macOS/Linux
curl -LsSf https://cua.ai/cli/install.sh | sh

# Windows
powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
```

</Tab>
<Tab value="npm Install">

```bash
npm update -g @trycua/cli
```

</Tab>
</Tabs>

## Uninstalling

<Tabs items={['Script Install', 'npm Install']}>
<Tab value="Script Install">

Remove the binary from your PATH:

```bash
# macOS/Linux
rm $(which cua)

# Windows
# Remove from your PATH or delete the executable
```

</Tab>
<Tab value="npm Install">

```bash
npm uninstall -g @trycua/cli
```

</Tab>
</Tabs>

## Troubleshooting

### Command Not Found

If you get a "command not found" error after installation:

1. **Check your PATH**: Make sure the installation directory is in your PATH
2. **Restart your terminal**: Close and reopen your terminal/command prompt
3. **Manual PATH setup**: Add the installation directory to your PATH manually (see the sketch below)
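
For example, if the CLI was installed through Bun, its global bin directory is `~/.bun/bin` by default (the exact location is an assumption; adjust it if your install differs):

```bash
# Make the current session see Bun's global bin directory
export PATH="$HOME/.bun/bin:$PATH"

# Persist it for future shells (bash shown; use ~/.zshrc for zsh)
echo 'export PATH="$HOME/.bun/bin:$PATH"' >> ~/.bashrc
```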

### Permission Issues

If you encounter permission issues during installation:

<Tabs items={['macOS / Linux', 'Windows']}>
<Tab value="macOS / Linux">

Try running with sudo (not recommended for the curl method):

```bash
# If using npm
sudo npm install -g @trycua/cli
```

</Tab>
<Tab value="Windows">

Run PowerShell as Administrator:

```powershell
# Right-click PowerShell and "Run as Administrator"
powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
```

</Tab>
</Tabs>

### Network Issues

If the installation script fails due to network issues:

1. **Check your internet connection**
2. **Try the npm installation method instead**
3. **Check if your firewall is blocking the download**

## Next Steps

- [Learn about CLI commands](/libraries/cua-cli/commands)
- [Follow the quickstart guide](/get-started/quickstart#cli-quickstart)

5
docs/content/docs/libraries/cua-cli/meta.json
Normal file
@@ -0,0 +1,5 @@
{
  "title": "CLI",
  "description": "Command-line interface for CUA",
  "pages": ["index", "installation", "commands"]
}
@@ -5,7 +5,7 @@ description: Installation instructions for the current version of the Lume CLI.

## Quickstart

Install and run a prebuilt macOS VM in two commands:
Install and run a prebuilt macOS sandbox in two commands:

```bash
# Install Lume

@@ -6,6 +6,72 @@ title: Client Integrations

To use with Claude Desktop, add an entry to your Claude Desktop configuration (`claude_desktop_config.json`, typically found in `~/.config/claude-desktop/`):

### Package Installation Method

```json
{
  "mcpServers": {
    "cua-agent": {
      "command": "/bin/bash",
      "args": ["~/.cua/start_mcp_server.sh"],
      "env": {
        "CUA_MODEL_NAME": "anthropic/claude-sonnet-4-20250514",
        "ANTHROPIC_API_KEY": "your-anthropic-api-key-here",
        "CUA_MAX_IMAGES": "3",
        "CUA_USE_HOST_COMPUTER_SERVER": "false"
      }
    }
  }
}
```

### Development Method

If you're working with the CUA source code:

**Standard VM Mode:**

```json
{
  "mcpServers": {
    "cua-agent": {
      "command": "/usr/bin/env",
      "args": [
        "bash",
        "-lc",
        "export CUA_MODEL_NAME='anthropic/claude-sonnet-4-20250514'; export ANTHROPIC_API_KEY='your-anthropic-api-key-here'; /path/to/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"
      ]
    }
  }
}
```

**Host Computer Control Mode:**

```json
{
  "mcpServers": {
    "cua-agent": {
      "command": "/usr/bin/env",
      "args": [
        "bash",
        "-lc",
        "export CUA_MODEL_NAME='anthropic/claude-sonnet-4-20250514'; export ANTHROPIC_API_KEY='your-anthropic-api-key-here'; export CUA_USE_HOST_COMPUTER_SERVER='true'; export CUA_MAX_IMAGES='1'; /path/to/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"
      ]
    }
  }
}
```

**Note**: Replace `/path/to/cua` with the absolute path to your CUA repository directory.

**⚠️ Host Computer Control Setup**: When using `CUA_USE_HOST_COMPUTER_SERVER='true'`, you must also:

1. Install computer server dependencies: `python3 -m pip install uvicorn fastapi`
2. Install the computer server: `python3 -m pip install -e libs/python/computer-server --break-system-packages`
3. Start the computer server: `python -m computer_server --log-level debug`
4. The AI will have direct access to your desktop - use with caution!

For more information on MCP with Claude Desktop, see the [official MCP User Guide](https://modelcontextprotocol.io/quickstart/user).

## Cursor Integration
@@ -15,6 +81,43 @@ To use with Cursor, add an MCP configuration file in one of these locations:
- **Project-specific**: Create `.cursor/mcp.json` in your project directory
- **Global**: Create `~/.cursor/mcp.json` in your home directory

Example configuration for Cursor:

```json
{
  "mcpServers": {
    "cua-agent": {
      "command": "/bin/bash",
      "args": ["~/.cua/start_mcp_server.sh"],
      "env": {
        "CUA_MODEL_NAME": "anthropic/claude-sonnet-4-20250514",
        "ANTHROPIC_API_KEY": "your-anthropic-api-key-here"
      }
    }
  }
}
```

After configuration, you can simply tell Cursor's Agent to perform computer tasks by explicitly mentioning the CUA agent, such as "Use the computer control tools to open Safari."

For more information on MCP with Cursor, see the [official Cursor MCP documentation](https://docs.cursor.com/context/model-context-protocol).

## Other MCP Clients

The MCP server is compatible with any MCP-compliant client. The server exposes the following tools:

- `run_cua_task` - Execute single computer tasks
- `run_multi_cua_tasks` - Execute multiple tasks (sequential or concurrent)
- `screenshot_cua` - Capture screenshots
- `get_session_stats` - Monitor session statistics
- `cleanup_session` - Manage session lifecycle

### Configuration Options

All MCP clients can configure the server using environment variables (a shell sketch follows the list):

- `CUA_MODEL_NAME` - Model to use for task execution
- `CUA_MAX_IMAGES` - Maximum images to keep in context
- `CUA_USE_HOST_COMPUTER_SERVER` - Use host system instead of VM
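
For instance, a client that launches the server from a shell could export them before starting it (values illustrative):

```bash
# Environment for an MCP client launching the CUA server
export CUA_MODEL_NAME="anthropic/claude-sonnet-4-20250514"
export CUA_MAX_IMAGES=3
export CUA_USE_HOST_COMPUTER_SERVER=false
```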

See the [Configuration](/docs/libraries/mcp-server/configuration) page for detailed configuration options.

@@ -4,7 +4,70 @@ title: Configuration

The server is configured using environment variables (can be set in the Claude Desktop config):

| Variable | Description | Default |
| --- | --- | --- |
| `CUA_MODEL_NAME` | Model string (e.g., "anthropic/claude-3-5-sonnet-20241022", "openai/computer-use-preview", "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", "omniparser+litellm/gpt-4o", "omniparser+ollama_chat/gemma3") | anthropic/claude-3-5-sonnet-20241022 |
| `CUA_MAX_IMAGES` | Maximum number of images to keep in context | 3 |
| Variable | Description | Default |
| --- | --- | --- |
| `CUA_MODEL_NAME` | Model string (e.g., "anthropic/claude-sonnet-4-20250514", "openai/computer-use-preview", "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", "omniparser+litellm/gpt-4o", "omniparser+ollama_chat/gemma3") | anthropic/claude-sonnet-4-20250514 |
| `ANTHROPIC_API_KEY` | Your Anthropic API key (required for Anthropic models) | None |
| `CUA_MAX_IMAGES` | Maximum number of images to keep in context | 3 |
| `CUA_USE_HOST_COMPUTER_SERVER` | Target your local desktop instead of a VM. Set to "true" to use your host system. **Warning:** AI models may perform risky actions. | false |

## Model Configuration

The `CUA_MODEL_NAME` environment variable supports various model providers through LiteLLM integration:

### Supported Providers

- **Anthropic**: `anthropic/claude-sonnet-4-20250514`
- **OpenAI**: `openai/computer-use-preview`, `openai/gpt-4o`
- **Local Models**: `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B`
- **Omni + LiteLLM**: `omniparser+litellm/gpt-4o`, `omniparser+litellm/claude-3-haiku`
- **Ollama**: `omniparser+ollama_chat/gemma3`

### Example Configurations

**Claude Desktop Configuration:**

```json
{
  "mcpServers": {
    "cua-agent": {
      "command": "/bin/bash",
      "args": ["~/.cua/start_mcp_server.sh"],
      "env": {
        "CUA_MODEL_NAME": "anthropic/claude-sonnet-4-20250514",
        "ANTHROPIC_API_KEY": "your-anthropic-api-key-here",
        "CUA_MAX_IMAGES": "5",
        "CUA_USE_HOST_COMPUTER_SERVER": "false"
      }
    }
  }
}
```

**Local Model Configuration:**

```json
{
  "mcpServers": {
    "cua-agent": {
      "command": "/bin/bash",
      "args": ["~/.cua/start_mcp_server.sh"],
      "env": {
        "CUA_MODEL_NAME": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
        "CUA_MAX_IMAGES": "3"
      }
    }
  }
}
```

## Session Management Configuration

The MCP server automatically manages sessions with the following defaults:

- **Max Concurrent Sessions**: 10
- **Session Timeout**: 10 minutes of inactivity
- **Computer Pool Size**: 5 instances
- **Automatic Cleanup**: Enabled

These settings are optimized for typical usage and don't require configuration for most users.
@@ -7,3 +7,21 @@ github:
---

**cua-mcp-server** is an MCP server for the Computer-Use Agent (CUA), allowing you to run CUA through Claude Desktop or other MCP clients.

## Features

- **Multi-Client Support**: Concurrent sessions with automatic resource management
- **Progress Reporting**: Real-time progress updates during task execution
- **Error Handling**: Robust error recovery with screenshot capture
- **Concurrent Execution**: Run multiple tasks in parallel for improved performance
- **Session Management**: Automatic cleanup and resource pooling
- **LiteLLM Integration**: Support for multiple model providers
- **VM Safety**: Default VM execution with optional host system control

## Quick Start

1. **Install**: `pip install cua-mcp-server`
2. **Configure**: Add to your MCP client configuration
3. **Use**: Ask Claude to perform computer tasks

See the [Installation](/docs/libraries/mcp-server/installation) guide for detailed setup instructions.
@@ -38,19 +38,103 @@ You can then use the script in your MCP configuration like this:
      "command": "/bin/bash",
      "args": ["~/.cua/start_mcp_server.sh"],
      "env": {
        "CUA_MODEL_NAME": "anthropic/claude-3-5-sonnet-20241022"
        "CUA_MODEL_NAME": "anthropic/claude-sonnet-4-20250514",
        "ANTHROPIC_API_KEY": "your-anthropic-api-key-here"
      }
    }
  }
}
```

**Important**: You must include your Anthropic API key for the MCP server to work properly.

## Development Setup

If you're working with the CUA source code directly (like in the CUA repository), you can use the development script instead:

```json
{
  "mcpServers": {
    "cua-agent": {
      "command": "/usr/bin/env",
      "args": [
        "bash",
        "-lc",
        "export CUA_MODEL_NAME='anthropic/claude-sonnet-4-20250514'; export ANTHROPIC_API_KEY='your-anthropic-api-key-here'; /path/to/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"
      ]
    }
  }
}
```

**For host computer control** (development setup):

1. **Install Computer Server Dependencies**:

```bash
python3 -m pip install uvicorn fastapi
python3 -m pip install -e libs/python/computer-server --break-system-packages
```

2. **Start the Computer Server**:

```bash
cd /path/to/cua
python -m computer_server --log-level debug
```

This will start the computer server on `http://localhost:8000`, which controls your actual desktop.

3. **Configure Claude Desktop**:
```json
{
  "mcpServers": {
    "cua-agent": {
      "command": "/usr/bin/env",
      "args": [
        "bash",
        "-lc",
        "export CUA_MODEL_NAME='anthropic/claude-sonnet-4-20250514'; export ANTHROPIC_API_KEY='your-anthropic-api-key-here'; export CUA_USE_HOST_COMPUTER_SERVER='true'; export CUA_MAX_IMAGES='1'; /path/to/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"
      ]
    }
  }
}
```

**Note**: Replace `/path/to/cua` with the absolute path to your CUA repository directory.

**⚠️ Important**: When using host computer control (`CUA_USE_HOST_COMPUTER_SERVER='true'`), the AI will have direct access to your desktop and can perform actions like opening applications, clicking, typing, and taking screenshots. Make sure you're comfortable with this level of access.

### Troubleshooting

If you get a `/bin/bash: ~/cua/libs/python/mcp-server/scripts/start_mcp_server.sh: No such file or directory` error, try changing the path to the script to be absolute instead of relative.
**Common Issues:**

To see the logs:
1. **"Claude's response was interrupted"** - This usually means:
   - Missing API key: Add `ANTHROPIC_API_KEY` to your environment variables
   - Invalid model name: Use a valid model like `anthropic/claude-sonnet-4-20250514`
   - Check logs for specific error messages

```
2. **"Missing Anthropic API Key"** - Add your API key to the configuration:

```json
"env": {
  "ANTHROPIC_API_KEY": "your-api-key-here"
}
```

3. **"model not found"** - Use a valid model name:
   - ✅ `anthropic/claude-sonnet-4-20250514`

4. **Script not found** - If you get a `/bin/bash: ~/cua/libs/python/mcp-server/scripts/start_mcp_server.sh: No such file or directory` error, try changing the path to the script to be absolute instead of relative.

5. **Host Computer Control Issues** - If using `CUA_USE_HOST_COMPUTER_SERVER='true'`:
   - **Computer Server not running**: Make sure you've started the computer server with `python -m computer_server --log-level debug`
   - **Port 8000 in use**: Check if another process is using port 8000 with `lsof -i :8000`
   - **Missing dependencies**: Install `uvicorn` and `fastapi` with `python3 -m pip install uvicorn fastapi`
   - **Image size errors**: Use `CUA_MAX_IMAGES='1'` to reduce image context size

**Viewing Logs:**

```bash
tail -n 20 -f ~/Library/Logs/Claude/mcp*.log
```
@@ -12,7 +12,7 @@ This MCP server features comprehensive liteLLM integration, allowing you to use

### Model String Examples:

- **Anthropic**: `"anthropic/claude-3-5-sonnet-20241022"`
- **Anthropic**: `"anthropic/claude-sonnet-4-5-20250929"`
- **OpenAI**: `"openai/computer-use-preview"`
- **UI-TARS**: `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"`
- **Omni + Any LiteLLM**: `"omniparser+litellm/gpt-4o"`, `"omniparser+litellm/claude-3-haiku"`, `"omniparser+ollama_chat/gemma3"`

@@ -6,5 +6,61 @@ title: Tools

The MCP server exposes the following tools to Claude:

1. `run_cua_task` - Run a single Computer-Use Agent task with the given instruction
2. `run_multi_cua_tasks` - Run multiple tasks in sequence
### Core Task Execution Tools

1. **`run_cua_task`** - Run a single Computer-Use Agent task with the given instruction
   - `task` (string): The task description for the agent to execute
   - `session_id` (string, optional): Session ID for multi-client support. If not provided, a new session will be created
   - Returns: Tuple of (combined text output, final screenshot)

2. **`run_multi_cua_tasks`** - Run multiple tasks in sequence or concurrently
   - `tasks` (list of strings): List of task descriptions to execute
   - `session_id` (string, optional): Session ID for multi-client support. If not provided, a new session will be created
   - `concurrent` (boolean, optional): If true, run tasks concurrently. If false, run sequentially (default)
   - Returns: List of tuples (combined text output, screenshot) for each task

### Utility Tools

3. **`screenshot_cua`** - Take a screenshot of the current screen
   - `session_id` (string, optional): Session ID for multi-client support. If not provided, a new session will be created
   - Returns: Screenshot image

4. **`get_session_stats`** - Get statistics about active sessions and resource usage
   - Returns: Dictionary with session statistics including total sessions, active tasks, and session details

5. **`cleanup_session`** - Cleanup a specific session and release its resources
   - `session_id` (string): The session ID to cleanup
   - Returns: Confirmation message

## Session Management

The MCP server supports multi-client sessions with automatic resource management:

- **Session Isolation**: Each client can have its own session with isolated computer instances
- **Resource Pooling**: Computer instances are pooled for efficient resource usage
- **Automatic Cleanup**: Idle sessions are automatically cleaned up after 10 minutes
- **Concurrent Tasks**: Multiple tasks can run concurrently within the same session
- **Progress Reporting**: Real-time progress updates during task execution

## Usage Examples

### Basic Task Execution

```
"Open Chrome and navigate to github.com"
"Create a folder called 'Projects' on my desktop"
```

### Multi-Task Execution

```
"Run these tasks: 1) Open Finder, 2) Navigate to Documents, 3) Create a new folder called 'Work'"
```

### Session Management

```
"Take a screenshot of the current screen"
"Show me the session statistics"
"Cleanup session abc123"
```
@@ -2,7 +2,7 @@
title: Usage
---

## Usage
## Basic Usage

Once configured, you can simply ask Claude to perform computer tasks:

@@ -13,9 +13,157 @@ Once configured, you can simply ask Claude to perform computer tasks:

Claude will automatically use your CUA agent to perform these tasks.

### First-time Usage Notes
## Advanced Features

### Progress Reporting

The MCP server provides real-time progress updates during task execution:

- Task progress is reported as percentages (0-100%)
- Multi-task operations show progress for each individual task
- Progress updates are streamed to the MCP client for real-time feedback

### Error Handling

Robust error handling ensures reliable operation:

- Failed tasks return error messages with screenshots when possible
- Session state is preserved even when individual tasks fail
- Automatic cleanup prevents resource leaks
- Detailed error logging for troubleshooting

### Concurrent Task Execution

For improved performance, multiple tasks can run concurrently:

- Set `concurrent=true` in `run_multi_cua_tasks` for parallel execution
- Each task runs in its own context with isolated state
- Progress tracking works for both sequential and concurrent modes
- Resource pooling ensures efficient computer instance usage

### Session Management

Multi-client support with automatic resource management:

- Each client gets isolated sessions with separate computer instances
- Sessions automatically clean up after 10 minutes of inactivity
- Resource pooling prevents resource exhaustion
- Session statistics available for monitoring

## Target Computer Options

By default, the MCP server runs CUA in a virtual machine for safety. However, you can also configure it to run on your local system.

### Default: Using a VM (Recommended)

The MCP server will automatically start and connect to a VM based on your platform. This is the safest option as AI actions are isolated from your host system.

No additional configuration is needed - this is the default behavior.

### Option: Targeting Your Local Desktop

<Callout type="warn">
**Warning:** When targeting your local system, AI models have direct access to your desktop and may perform risky actions. Use with caution.
</Callout>

To have the MCP server control your local desktop instead of a VM:

1. **Start the Computer Server on your host:**

```bash
pip install cua-computer-server
python -m computer_server
```

2. **Configure the MCP server to use your host system:**

Add the `CUA_USE_HOST_COMPUTER_SERVER` environment variable to your MCP client configuration:

<Tabs items={['Claude Desktop', 'Other MCP Clients']}>
<Tab value="Claude Desktop">
Update your Claude Desktop config (see [Installation](/docs/libraries/mcp-server/installation)) to include the environment variable:

```json
{
  "mcpServers": {
    "cua-agent": {
      "command": "/bin/bash",
      "args": ["~/.cua/start_mcp_server.sh"],
      "env": {
        "CUA_MODEL_NAME": "anthropic/claude-sonnet-4-5-20250929",
        "CUA_USE_HOST_COMPUTER_SERVER": "true"
      }
    }
  }
}
```

</Tab>
<Tab value="Other MCP Clients">
Set the environment variable in your MCP client configuration:

```bash
export CUA_USE_HOST_COMPUTER_SERVER=true
```

Then start your MCP client as usual.

</Tab>
</Tabs>

3. **Restart your MCP client** (e.g., Claude Desktop) to apply the changes.

Now Claude will control your local desktop directly when you ask it to perform computer tasks.

## Usage Examples

### Single Task Execution

```
"Open Safari and navigate to apple.com"
"Create a new folder on the desktop called 'My Projects'"
"Take a screenshot of the current screen"
```

### Multi-Task Execution (Sequential)

```
"Run these tasks in order: 1) Open Finder, 2) Navigate to Documents folder, 3) Create a new folder called 'Work'"
```

### Multi-Task Execution (Concurrent)

```
"Run these tasks simultaneously: 1) Open Chrome, 2) Open Safari, 3) Open Finder"
```

### Session Management

```
"Show me the current session statistics"
"Take a screenshot using session abc123"
"Cleanup session xyz789"
```

### Error Recovery

```
"Try to open a non-existent application and show me the error"
"Find all files with .tmp extension and delete them safely"
```

## First-time Usage Notes

**API Keys**: Ensure you have valid API keys:

- Add your Anthropic API key, or other model provider API key in the Claude Desktop config (as shown above)
- Add your Anthropic API key in the Claude Desktop config (as shown above)
- Or set it as an environment variable in your shell profile
- **Required**: The MCP server needs an API key to authenticate with the model provider

**Model Selection**: Choose the appropriate model for your needs:

- **Claude Sonnet 4**: Latest model with best performance (`anthropic/claude-sonnet-4-20250514`)
- **Computer-Use Preview**: Specialized for computer tasks (`openai/computer-use-preview`)
- **Local Models**: For privacy-sensitive environments
- **Ollama**: For offline usage
@@ -4,11 +4,10 @@
  "root": true,
  "defaultOpen": true,
  "pages": [
    "index",
    "quickstart-devs",
    "quickstart-cli",
    "telemetry",
    "example-usecases",
    "---[Rocket]Get Started---",
    "...get-started",
    "---[ChefHat]Cookbook---",
    "...example-usecases",
    "---[BookCopy]Computer Playbook---",
    "...computer-sdk",
    "---[BookCopy]Agent Playbook---",
@@ -1,343 +0,0 @@
---
title: Quickstart (CLI)
description: Get started with the Cua Agent CLI in 4 steps
icon: Rocket
---

import { Step, Steps } from 'fumadocs-ui/components/steps';
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
import { Accordion, Accordions } from 'fumadocs-ui/components/accordion';

Get up and running with the Cua Agent CLI in 4 simple steps.

<Steps>
<Step>

## Introduction

Cua combines Computer (interface) + Agent (AI) for automating desktop apps. The Agent CLI provides a clean terminal interface to control your remote computer using natural language commands.

</Step>

<Step>

## Set Up Your Computer Environment

Choose how you want to run your Cua computer. **Cloud Sandbox is recommended** for the easiest setup:

<Tabs items={['☁️ Cloud Sandbox (Recommended)', 'Linux on Docker', 'Windows Sandbox', 'macOS VM']}>
<Tab value="☁️ Cloud Sandbox (Recommended)">

**Easiest & safest way to get started - works on any host OS**

1. Go to [cua.ai/signin](https://cua.ai/signin)
2. Navigate to **Dashboard > Containers > Create Instance**
3. Create a **Medium, Ubuntu 22** container
4. Note your container name and API key

Your cloud container will be automatically configured and ready to use.

</Tab>
<Tab value="Linux on Docker">

**Run Linux desktop locally on macOS, Windows, or Linux hosts**

1. Install Docker Desktop or Docker Engine

2. Pull the CUA XFCE container (lightweight desktop)

```bash
docker pull --platform=linux/amd64 trycua/cua-xfce:latest
```

Or use KASM for a full-featured desktop:

```bash
docker pull --platform=linux/amd64 trycua/cua-ubuntu:latest
```

</Tab>
<Tab value="Windows Sandbox">

**Windows hosts only - requires Windows 10 Pro/Enterprise or Windows 11**

1. Enable Windows Sandbox
2. Install pywinsandbox dependency

```bash
pip install -U git+git://github.com/karkason/pywinsandbox.git
```

3. Windows Sandbox will be automatically configured when you run the CLI

</Tab>
<Tab value="macOS VM">

**macOS hosts only - requires Lume CLI**

1. Install lume cli

```bash
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)"
```

2. Start a local Cua macOS VM

```bash
lume run macos-sequoia-cua:latest
```

</Tab>
</Tabs>

</Step>

<Step>

## Install Cua

<Accordions type="single" defaultValue="uv">

<Accordion title="uv (Recommended)" value="uv">

### Install uv

<Tabs items={['macOS / Linux', 'Windows']} persist>
<Tab value="macOS / Linux">

```bash
# Use curl to download the script and execute it with sh:
curl -LsSf https://astral.sh/uv/install.sh | sh

# If your system doesn't have curl, you can use wget:
# wget -qO- https://astral.sh/uv/install.sh | sh
```

</Tab>
<Tab value="Windows">

```powershell
# Use irm to download the script and execute it with iex:
powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
```

</Tab>
</Tabs>

### Install Python 3.12

```bash
uv python install 3.12
# uv will install Cua dependencies automatically when you use --with "cua-agent[cli]"
```

</Accordion>

<Accordion title="conda" value="conda">

### Install conda

<Tabs items={['macOS', 'Linux', 'Windows']} persist>
<Tab value="macOS">

```bash
mkdir -p ~/miniconda3
curl https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh -o ~/miniconda3/miniconda.sh
bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
rm ~/miniconda3/miniconda.sh
source ~/miniconda3/bin/activate
```

</Tab>
<Tab value="Linux">

```bash
mkdir -p ~/miniconda3
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh
bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
rm ~/miniconda3/miniconda.sh
source ~/miniconda3/bin/activate
```

</Tab>
<Tab value="Windows">

```powershell
wget "https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe" -outfile ".\miniconda.exe"
Start-Process -FilePath ".\miniconda.exe" -ArgumentList "/S" -Wait
del .\miniconda.exe
```

</Tab>
</Tabs>

### Create and activate Python 3.12 environment

```bash
conda create -n cua python=3.12
conda activate cua
```

### Install Cua

```bash
pip install "cua-agent[cli]" cua-computer
```

</Accordion>

<Accordion title="pip" value="pip">

### Install Cua

```bash
pip install "cua-agent[cli]" cua-computer
```

</Accordion>

</Accordions>

</Step>

<Step>

## Run Cua CLI

Choose your preferred AI model:

### OpenAI Computer Use Preview

<Tabs items={['uv', 'conda/pip']} persist>
<Tab value="uv">

```bash
uv run --with "cua-agent[cli]" -m agent.cli openai/computer-use-preview
```

</Tab>
<Tab value="conda/pip">

```bash
python -m agent.cli openai/computer-use-preview
```

</Tab>
</Tabs>

### Anthropic Claude

<Tabs items={['uv', 'conda/pip']} persist>
<Tab value="uv">

```bash
uv run --with "cua-agent[cli]" -m agent.cli anthropic/claude-sonnet-4-5-20250929
uv run --with "cua-agent[cli]" -m agent.cli anthropic/claude-opus-4-20250514
uv run --with "cua-agent[cli]" -m agent.cli anthropic/claude-opus-4-1-20250805
uv run --with "cua-agent[cli]" -m agent.cli anthropic/claude-sonnet-4-20250514
uv run --with "cua-agent[cli]" -m agent.cli anthropic/claude-3-5-sonnet-20241022
```

</Tab>
<Tab value="conda/pip">

```bash
python -m agent.cli anthropic/claude-sonnet-4-5-20250929
python -m agent.cli anthropic/claude-opus-4-1-20250805
python -m agent.cli anthropic/claude-opus-4-20250514
python -m agent.cli anthropic/claude-sonnet-4-20250514
python -m agent.cli anthropic/claude-3-5-sonnet-20241022
```

</Tab>
</Tabs>

### Omniparser + LLMs

<Tabs items={['uv', 'conda/pip']} persist>
<Tab value="uv">

```bash
uv run --with "cua-agent[cli]" -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
uv run --with "cua-agent[cli]" -m agent.cli omniparser+openai/gpt-4o
uv run --with "cua-agent[cli]" -m agent.cli omniparser+vertex_ai/gemini-pro
```

</Tab>
<Tab value="conda/pip">

```bash
python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
python -m agent.cli omniparser+openai/gpt-4o
python -m agent.cli omniparser+vertex_ai/gemini-pro
```

</Tab>
</Tabs>

### Local Models

<Tabs items={['uv', 'conda/pip']} persist>
<Tab value="uv">

```bash
# Hugging Face models (local)
uv run --with "cua-agent[cli]" -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B

# MLX models (Apple Silicon)
uv run --with "cua-agent[cli]" -m agent.cli mlx/mlx-community/UI-TARS-1.5-7B-6bit

# Ollama models
uv run --with "cua-agent[cli]" -m agent.cli omniparser+ollama_chat/llama3.2:latest
```

</Tab>
<Tab value="conda/pip">

```bash
# Hugging Face models (local)
python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B

# MLX models (Apple Silicon)
python -m agent.cli mlx/mlx-community/UI-TARS-1.5-7B-6bit

# Ollama models
python -m agent.cli omniparser+ollama_chat/llama3.2:latest
```

</Tab>
</Tabs>

### Interactive Setup

If you haven't set up environment variables, the CLI will guide you through the setup:

1. **Sandbox Name**: Enter your Cua sandbox name (or get one at [cua.ai](https://cua.ai/))
2. **CUA API Key**: Enter your Cua API key
3. **Provider API Key**: Enter your AI provider API key (OpenAI, Anthropic, etc.)

### Start Chatting

Once connected, you'll see:

```
💻 Connected to your-container-name (model, agent_loop)
Type 'exit' to quit.

>
```

You can ask your agent to perform actions like:

- "Take a screenshot and tell me what's on the screen"
- "Open Firefox and go to github.com"
- "Type 'Hello world' into the terminal"
- "Close the current window"
- "Click on the search button"

</Step>
</Steps>

---

For running models locally, see [Running Models Locally](/agent-sdk/supported-model-providers/local-models).
@@ -1,313 +0,0 @@
|
||||
---
|
||||
title: Quickstart
|
||||
description: Get started with Cua in three steps
|
||||
icon: Rocket
|
||||
---
|
||||
|
||||
import { Step, Steps } from 'fumadocs-ui/components/steps';
|
||||
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
|
||||
|
||||
This quickstart guides you through setting up your [computer environment](#set-up-your-computer-environment), programmatic control with a [Cua computer](#using-computer), and task automation with a [Cua agent](#using-agent):
|
||||
|
||||
<Steps>
|
||||
|
||||
<Step>
|
||||
|
||||
## Set Up Your Computer Environment
|
||||
|
||||
Choose how you want to run your Cua computer. This will be the environment where your automated tasks will execute.
|
||||
|
||||
You can run your Cua computer in the cloud (recommended for easiest setup), locally on macOS with Lume, locally on Windows with a Windows Sandbox, or in a Docker container on any platform. Choose the option that matches your system and needs.
|
||||
|
||||
<Tabs items={['☁️ Cloud', '🐳 Docker', '🍎 Lume', '🪟 Windows Sandbox']}>
|
||||
<Tab value="☁️ Cloud">
|
||||
|
||||
Cua Cloud Sandbox provides virtual machines that run Ubuntu.
|
||||
|
||||
1. Go to [cua.ai/signin](https://cua.ai/signin)
|
||||
2. Navigate to **Dashboard > Containers > Create Instance**
|
||||
3. Create a **Medium, Ubuntu 22** sandbox
|
||||
4. Note your sandbox name and API key
|
||||
|
||||
Your Cloud Sandbox will be automatically configured and ready to use.
|
||||
|
||||
</Tab>
|
||||
<Tab value="🍎 Lume">
|
||||
|
||||
Lume containers are macOS virtual machines that run on a macOS host machine.
|
||||
|
||||
1. Install the Lume CLI:
|
||||
|
||||
```bash
|
||||
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)"
|
||||
```
|
||||
|
||||
2. Start a local Cua sandbox:
|
||||
|
||||
```bash
|
||||
lume run macos-sequoia-cua:latest
|
||||
```
|
||||
|
||||
</Tab>
|
||||
<Tab value="🪟 Windows Sandbox">
|
||||
|
||||
Windows Sandbox provides Windows virtual environments that run on a Windows host machine.
|
||||
|
||||
1. Enable [Windows Sandbox](https://learn.microsoft.com/en-us/windows/security/application-security/application-isolation/windows-sandbox/windows-sandbox-install) (requires Windows 10 Pro/Enterprise or Windows 11)
|
||||
2. Install the `pywinsandbox` dependency:
|
||||
|
||||
```bash
|
||||
pip install -U git+git://github.com/karkason/pywinsandbox.git
|
||||
```
|
||||
|
||||
3. Windows Sandbox will be automatically configured when you run the CLI
|
||||
|
||||
</Tab>
|
||||
<Tab value="🐳 Docker">
|
||||
|
||||
Docker provides a way to run Ubuntu containers on any host machine.
|
||||
|
||||
1. Install Docker Desktop or Docker Engine:
|
||||
|
||||
2. Pull the CUA Ubuntu sandbox:
|
||||
|
||||
```bash
|
||||
docker pull --platform=linux/amd64 trycua/cua-ubuntu:latest
|
||||
```
|
||||
|
||||
</Tab>
|
||||
</Tabs>
|
||||
|
||||
</Step>
|
||||
|
||||
<Step>
|
||||
|
||||
## Using Computer
|
||||
|
||||
Connect to your Cua computer and perform basic interactions, such as taking screenshots or simulating user input.
|
||||
|
||||
<Tabs items={['Python', 'TypeScript']}>
|
||||
<Tab value="Python">
|
||||
Install the Cua computer Python SDK:
|
||||
```bash
|
||||
pip install cua-computer
|
||||
```
|
||||
|
||||
Then, connect to your desired computer environment:
|
||||
|
||||
<Tabs items={['☁️ Cloud', '🐳 Docker', '🍎 Lume', '🪟 Windows Sandbox', '🖥️ Host Desktop']}>
|
||||
<Tab value="☁️ Cloud">
|
||||
```python
|
||||
from computer import Computer
|
||||
|
||||
computer = Computer(
|
||||
os_type="linux",
|
||||
provider_type="cloud",
|
||||
name="your-sandbox-name",
|
||||
api_key="your-api-key"
|
||||
)
|
||||
await computer.run() # Connect to the sandbox
|
||||
```
|
||||
</Tab>
|
||||
<Tab value="🍎 Lume">
|
||||
```python
|
||||
from computer import Computer
|
||||
|
||||
computer = Computer(
|
||||
os_type="macos",
|
||||
provider_type="lume",
|
||||
name="macos-sequoia-cua:latest"
|
||||
)
|
||||
await computer.run() # Launch & connect to the container
|
||||
```
|
||||
</Tab>
|
||||
<Tab value="🪟 Windows Sandbox">
|
||||
```python
|
||||
from computer import Computer
|
||||
|
||||
computer = Computer(
|
||||
os_type="windows",
|
||||
provider_type="windows_sandbox"
|
||||
)
|
||||
await computer.run() # Launch & connect to the container
|
||||
```
|
||||
</Tab>
|
||||
<Tab value="🐳 Docker">
|
||||
```python
|
||||
from computer import Computer
|
||||
|
||||
computer = Computer(
|
||||
os_type="linux",
|
||||
provider_type="docker",
|
||||
name="trycua/cua-ubuntu:latest"
|
||||
)
|
||||
await computer.run() # Launch & connect to the container
|
||||
```
|
||||
</Tab>
|
||||
<Tab value="🖥️ Host Desktop">
|
||||
Install and run `cua-computer-server`:
|
||||
```bash
|
||||
pip install cua-computer-server
|
||||
python -m computer_server
|
||||
```
|
||||
|
||||
Then, use the `Computer` object to connect:
|
||||
```python
|
||||
from computer import Computer
|
||||
|
||||
computer = Computer(use_host_computer_server=True)
|
||||
await computer.run() # Connect to the host desktop
|
||||
```
|
||||
</Tab>
|
||||
</Tabs>
|
||||
|
||||
Once connected, you can perform interactions:
|
||||
```python
|
||||
try:
|
||||
# Take a screenshot of the computer's current display
|
||||
screenshot = await computer.interface.screenshot()
|
||||
# Simulate a left-click at coordinates (100, 100)
|
||||
await computer.interface.left_click(100, 100)
|
||||
# Type "Hello!" into the active application
|
||||
await computer.interface.type("Hello!")
|
||||
finally:
|
||||
await computer.close()
|
||||
```
|
||||
|
||||
</Tab>
|
||||
<Tab value="TypeScript">
|
||||
Install the Cua computer TypeScript SDK:
|
||||
```bash
|
||||
npm install @trycua/computer
|
||||
```
|
||||
|
||||
Then, connect to your desired computer environment:
|
||||
|
||||
<Tabs items={['☁️ Cloud','🐳 Docker', '🍎 Lume', '🪟 Windows Sandbox', '🖥️ Host Desktop']}>
|
||||
<Tab value="☁️ Cloud">
|
||||
```typescript
|
||||
import { Computer, OSType } from '@trycua/computer';
|
||||
|
||||
const computer = new Computer({
|
||||
osType: OSType.LINUX,
|
||||
name: "your-sandbox-name",
|
||||
apiKey: "your-api-key"
|
||||
});
|
||||
await computer.run(); // Connect to the sandbox
|
||||
```
|
||||
</Tab>
|
||||
<Tab value="🍎 Lume">
|
||||
```typescript
|
||||
import { Computer, OSType, ProviderType } from '@trycua/computer';
|
||||
|
||||
const computer = new Computer({
|
||||
osType: OSType.MACOS,
|
||||
providerType: ProviderType.LUME,
|
||||
name: "macos-sequoia-cua:latest"
|
||||
});
|
||||
await computer.run(); // Launch & connect to the container
|
||||
```
|
||||
</Tab>
|
||||
<Tab value="🪟 Windows Sandbox">
|
||||
```typescript
|
||||
import { Computer, OSType, ProviderType } from '@trycua/computer';
|
||||
|
||||
const computer = new Computer({
|
||||
osType: OSType.WINDOWS,
|
||||
providerType: ProviderType.WINDOWS_SANDBOX
|
||||
});
|
||||
await computer.run(); // Launch & connect to the container
|
||||
```
|
||||
</Tab>
|
||||
<Tab value="🐳 Docker">
|
||||
```typescript
|
||||
import { Computer, OSType, ProviderType } from '@trycua/computer';
|
||||
|
||||
const computer = new Computer({
|
||||
osType: OSType.LINUX,
|
||||
providerType: ProviderType.DOCKER,
|
||||
name: "trycua/cua-ubuntu:latest"
|
||||
});
|
||||
await computer.run(); // Launch & connect to the container
|
||||
```
|
||||
</Tab>
|
||||
<Tab value="🖥️ Host Desktop">
|
||||
First, install and run `cua-computer-server`:
|
||||
```bash
|
||||
pip install cua-computer-server
|
||||
python -m computer_server
|
||||
```
|
||||
|
||||
Then, use the `Computer` object to connect:
|
||||
```typescript
|
||||
import { Computer } from '@trycua/computer';
|
||||
|
||||
const computer = new Computer({ useHostComputerServer: true });
|
||||
await computer.run(); // Connect to the host desktop
|
||||
```
|
||||
</Tab>
|
||||
</Tabs>
|
||||
|
||||
Once connected, you can perform interactions:
|
||||
```typescript
|
||||
try {
|
||||
// Take a screenshot of the computer's current display
|
||||
const screenshot = await computer.interface.screenshot();
|
||||
// Simulate a left-click at coordinates (100, 100)
|
||||
await computer.interface.leftClick(100, 100);
|
||||
// Type "Hello!" into the active application
|
||||
await computer.interface.typeText("Hello!");
|
||||
} finally {
|
||||
await computer.close();
|
||||
}
|
||||
```
|
||||
|
||||
</Tab>
|
||||
</Tabs>
|
||||
|
||||
Learn more about computers in the [Cua computers documentation](/computer-sdk/computers). The next step shows how to automate them with agents.
|
||||
|
||||
</Step>
|
||||
|
||||
<Step>
|
||||
|
||||
## Using Agent
|
||||
|
||||
Use an agent to automate complex tasks: give it a goal and let it interact with the computer environment.
|
||||
|
||||
Install the Cua agent Python SDK:
|
||||
|
||||
```bash
|
||||
pip install "cua-agent[all]"
|
||||
```
|
||||
|
||||
Then, use the `ComputerAgent` object:
|
||||
|
||||
```python
|
||||
from agent import ComputerAgent
|
||||
|
||||
agent = ComputerAgent(
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
tools=[computer],
|
||||
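    # optional budget cap for the run (assumed USD); the agent stops once it is exceeded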
max_trajectory_budget=5.0
|
||||
)
|
||||
|
||||
messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
|
||||
|
||||
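# Stream results as the agent works; each result carries a list of output items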
async for result in agent.run(messages):
|
||||
for item in result["output"]:
|
||||
if item["type"] == "message":
|
||||
print(item["content"][0]["text"])
|
||||
```
|
||||
|
||||
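To put the pieces together end to end, here is a minimal sketch that connects to a computer, hands it to an agent, and streams the replies. It assumes the host-desktop setup from this guide (any provider above works the same way) and that the chosen model's provider key (e.g. `ANTHROPIC_API_KEY`) is set in your environment:

```python
import asyncio

from agent import ComputerAgent
from computer import Computer


async def main():
    # Connect to a computer (host desktop here; swap in any provider from the previous step)
    computer = Computer(use_host_computer_server=True)
    await computer.run()
    try:
        agent = ComputerAgent(
            model="anthropic/claude-3-5-sonnet-20241022",
            tools=[computer],
            max_trajectory_budget=5.0,
        )
        messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
        # Print any assistant messages as the agent streams them back
        async for result in agent.run(messages):
            for item in result["output"]:
                if item["type"] == "message":
                    print(item["content"][0]["text"])
    finally:
        await computer.close()


asyncio.run(main())
```
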
Learn more about agents in [Agent Loops](/agent-sdk/agent-loops) and available models in [Supported Models](/agent-sdk/supported-model-providers/).
|
||||
|
||||
</Step>
|
||||
</Steps>
|
||||
|
||||
## Next Steps
|
||||
|
||||
- Learn more about [Cua computers](/computer-sdk/computers) and [computer commands](/computer-sdk/commands)
|
||||
- Read about [Agent loops](/agent-sdk/agent-loops), [tools](/agent-sdk/custom-tools), and [supported model providers](/agent-sdk/supported-model-providers/)
|
||||
- Join our [Discord community](https://discord.com/invite/mVnXXpdE85) for help
|
||||
- Try the [Form Filling](/example-usecases/form-filling) preset use case
|
||||
@@ -24,6 +24,39 @@ const config = {
|
||||
basePath: false, // Important: this bypasses the basePath
|
||||
permanent: false,
|
||||
},
|
||||
// Redirect old docs.cua.ai URLs to cua.ai/docs with 301 for SEO
|
||||
// This handles URLs that Google has indexed from the old domain
|
||||
{
|
||||
source: '/:path*',
|
||||
has: [
|
||||
{
|
||||
type: 'host',
|
||||
value: 'docs.cua.ai',
|
||||
},
|
||||
],
|
||||
destination: 'https://cua.ai/docs/:path*',
|
||||
permanent: true, // 301 redirect to preserve SEO authority
|
||||
basePath: false,
|
||||
},
|
||||
// Redirects for documentation restructure (PR #568)
|
||||
// Moved quickstart-devs to get-started section
|
||||
{
|
||||
source: '/quickstart-devs',
|
||||
destination: '/get-started/quickstart',
|
||||
permanent: true,
|
||||
},
|
||||
// Moved telemetry to agent-sdk section
|
||||
{
|
||||
source: '/telemetry',
|
||||
destination: '/agent-sdk/telemetry',
|
||||
permanent: true,
|
||||
},
|
||||
// Removed quickstart-cli, consolidated into main quickstart
|
||||
{
|
||||
source: '/quickstart-cli',
|
||||
destination: '/get-started/quickstart',
|
||||
permanent: true,
|
||||
},
|
||||
];
|
||||
},
|
||||
images: {
|
||||
|
||||
@@ -9,22 +9,22 @@
|
||||
"postinstall": "fumadocs-mdx"
|
||||
},
|
||||
"dependencies": {
|
||||
"fumadocs-core": "15.5.1",
|
||||
"fumadocs-mdx": "11.6.7",
|
||||
"fumadocs-ui": "15.5.1",
|
||||
"fumadocs-core": "16.0.8",
|
||||
"fumadocs-mdx": "13.0.5",
|
||||
"fumadocs-ui": "16.0.8",
|
||||
"lucide-react": "^0.525.0",
|
||||
"mermaid": "^11.8.1",
|
||||
"next": "15.3.3",
|
||||
"next": "16.0.1",
|
||||
"next-themes": "^0.4.6",
|
||||
"posthog-js": "^1.276.0",
|
||||
"react": "^19.1.0",
|
||||
"react-dom": "^19.1.0",
|
||||
"react": "^19.2.0",
|
||||
"react-dom": "^19.2.0",
|
||||
"react-icons": "^5.5.0",
|
||||
"remark": "^15.0.1",
|
||||
"remark-gfm": "^4.0.1",
|
||||
"remark-mdx": "^3.1.0",
|
||||
"tailwind-merge": "^3.3.1",
|
||||
"zod": "^3.25.76"
|
||||
"zod": "^4.1.12"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@tailwindcss/postcss": "^4.1.8",
|
||||
|
||||
1634
docs/pnpm-lock.yaml
generated
File diff suppressed because it is too large
BIN
docs/public/img/grounding-with-gemini3.gif
Normal file
Binary file not shown.
After Width: | Height: | Size: 5.2 MiB
BIN
docs/public/img/laminar_trace_example.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 628 KiB
@@ -6,12 +6,12 @@ import { z } from 'zod';
|
||||
export const docs = defineDocs({
|
||||
docs: {
|
||||
schema: frontmatterSchema.extend({
|
||||
macos: z.boolean().optional(),
|
||||
windows: z.boolean().optional(),
|
||||
linux: z.boolean().optional(),
|
||||
pypi: z.string().optional(),
|
||||
npm: z.string().optional(),
|
||||
github: z.array(z.string()).optional(),
|
||||
macos: z.boolean().default(false),
|
||||
windows: z.boolean().default(false),
|
||||
linux: z.boolean().default(false),
|
||||
}),
|
||||
},
|
||||
meta: {
|
||||
|
||||
@@ -8,15 +8,16 @@ import { cn } from 'fumadocs-ui/utils/cn';
|
||||
import { ChevronDown, CodeXml, ExternalLink } from 'lucide-react';
|
||||
import type { Metadata } from 'next';
|
||||
import Link from 'next/link';
|
||||
import { notFound, redirect } from 'next/navigation';
|
||||
import { notFound } from 'next/navigation';
|
||||
import { PageFeedback } from '@/components/page-feedback';
|
||||
import { DocActionsMenu } from '@/components/doc-actions-menu';
|
||||
|
||||
export default async function Page(props: { params: Promise<{ slug?: string[] }> }) {
|
||||
const params = await props.params;
|
||||
const slug = params.slug || [];
|
||||
|
||||
const page = source.getPage(slug);
|
||||
if (!page) notFound(); //redirect('/docs');
|
||||
if (!page) notFound();
|
||||
|
||||
// Detect if this is an API reference page: /api/[section] or /api/[section]/[version]
|
||||
let apiSection: string | null = null;
|
||||
@@ -179,9 +180,13 @@ export default async function Page(props: { params: Promise<{ slug?: string[] }>
|
||||
};
|
||||
|
||||
const tocFooter = () => {
|
||||
// Construct file path from slug
|
||||
// For root index, use 'index.mdx', otherwise join slug parts
|
||||
const filePath = slug.length === 0 ? 'index.mdx' : `${slug.join('/')}.mdx`;
|
||||
|
||||
return (
|
||||
<div className="mt-4">
|
||||
<DocActionsMenu pageUrl={page.url} pageTitle={page.data.title} filePath={page.file.path} />
|
||||
<DocActionsMenu pageUrl={page.url} pageTitle={page.data.title} filePath={filePath} />
|
||||
</div>
|
||||
);
|
||||
};
|
||||
@@ -282,9 +287,9 @@ export async function generateMetadata(props: {
|
||||
const page = source.getPage(params.slug);
|
||||
if (!page) notFound();
|
||||
|
||||
let title = `${page.data.title} | Cua Docs`;
|
||||
if (page.url.includes('api')) title = `${page.data.title} | Cua API Docs`;
|
||||
if (page.url.includes('guide')) title = ` Guide: ${page.data.title} | Cua Docs`;
|
||||
let title = `${page.data.title} | Cua`;
|
||||
if (page.url.includes('api')) title = `${page.data.title} | Cua API`;
|
||||
if (page.url.includes('guide')) title = ` Guide: ${page.data.title} | Cua`;
|
||||
|
||||
// Canonical URL points to cua.ai to consolidate all SEO authority on main domain
|
||||
const canonicalUrl = `https://cua.ai${page.url}`;
|
||||
@@ -368,7 +373,7 @@ export async function generateMetadata(props: {
|
||||
title,
|
||||
description: page.data.description,
|
||||
type: 'article',
|
||||
siteName: 'Cua Docs',
|
||||
siteName: 'Cua',
|
||||
url: canonicalUrl,
|
||||
},
|
||||
twitter: {
|
||||
|
||||
@@ -1,3 +1,14 @@
|
||||
@import 'tailwindcss';
|
||||
@import 'fumadocs-ui/css/neutral.css';
|
||||
@import 'fumadocs-ui/css/preset.css';
|
||||
|
||||
/* Fix TOC overflow on production builds */
|
||||
#nd-toc {
|
||||
overflow-y: auto;
|
||||
overflow-x: hidden;
|
||||
}
|
||||
|
||||
#nd-toc > div {
|
||||
overflow-y: auto;
|
||||
overflow-x: hidden;
|
||||
}
|
||||
|
||||
@@ -34,9 +34,10 @@ export const baseOptions: BaseLayoutProps = {
|
||||
className="hidden dark:block"
|
||||
alt="Logo"
|
||||
/>
|
||||
Cua Documentation
|
||||
Cua
|
||||
</>
|
||||
),
|
||||
url: 'https://cua.ai',
|
||||
},
|
||||
githubUrl: 'https://github.com/trycua/cua',
|
||||
links: [
|
||||
|
||||
@@ -7,7 +7,7 @@ import posthog from 'posthog-js';
|
||||
interface DocActionsMenuProps {
|
||||
pageUrl: string;
|
||||
pageTitle: string;
|
||||
filePath: string;
|
||||
filePath?: string;
|
||||
}
|
||||
|
||||
export function DocActionsMenu({ pageUrl, pageTitle, filePath }: DocActionsMenuProps) {
|
||||
@@ -15,6 +15,9 @@ export function DocActionsMenu({ pageUrl, pageTitle, filePath }: DocActionsMenuP
|
||||
|
||||
const handleCopyMarkdown = async () => {
|
||||
try {
|
||||
if (!filePath) {
|
||||
throw new Error('No file path available');
|
||||
}
|
||||
const githubRawUrl = `https://raw.githubusercontent.com/trycua/cua/refs/heads/main/docs/content/docs/${filePath}`;
|
||||
|
||||
const response = await fetch(githubRawUrl);
|
||||
@@ -55,6 +58,9 @@ export function DocActionsMenu({ pageUrl, pageTitle, filePath }: DocActionsMenuP
|
||||
};
|
||||
|
||||
const handleEditGithub = () => {
|
||||
if (!filePath) {
|
||||
return;
|
||||
}
|
||||
posthog.capture('docs_edit_github_clicked', {
|
||||
page: pageUrl,
|
||||
page_title: pageTitle,
|
||||
|
||||
@@ -56,7 +56,7 @@ export function Footer() {
|
||||
</li>
|
||||
<li>
|
||||
<a
|
||||
href="/docs/quickstart-devs"
|
||||
href="/docs/get-started/quickstart"
|
||||
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
|
||||
>
|
||||
Quick Start
|
||||
|
||||
34
docs/src/components/hero.tsx
Normal file
@@ -0,0 +1,34 @@
|
||||
export function Hero({ children }: { children: React.ReactNode }) {
|
||||
return (
|
||||
<div className="not-prose relative mb-12 overflow-hidden rounded-xl border border-fd-border bg-gradient-to-br from-fd-background via-fd-muted/30 to-fd-muted/50 p-8 shadow-lg md:p-12 lg:p-16">
|
||||
{/* Background Pattern */}
|
||||
<div className="pointer-events-none absolute inset-0">
|
||||
{/* Grid */}
|
||||
<svg
|
||||
className="absolute h-full w-full text-fd-foreground"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
>
|
||||
<defs>
|
||||
<pattern id="hero-grid" width="40" height="40" patternUnits="userSpaceOnUse">
|
||||
<path
|
||||
d="M 40 0 L 0 0 0 40"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
strokeWidth="0.5"
|
||||
opacity="0.1"
|
||||
/>
|
||||
</pattern>
|
||||
</defs>
|
||||
<rect width="100%" height="100%" fill="url(#hero-grid)" />
|
||||
</svg>
|
||||
|
||||
{/* Subtle glow effects */}
|
||||
<div className="absolute -right-20 -top-20 h-96 w-96 rounded-full bg-fd-primary/5 blur-3xl" />
|
||||
<div className="absolute -bottom-32 -left-20 h-96 w-96 rounded-full bg-fd-primary/5 blur-3xl" />
|
||||
</div>
|
||||
|
||||
{/* Content */}
|
||||
<div className="relative z-10">{children}</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
@@ -12,15 +12,24 @@ const processor = remark()
|
||||
.use(remarkGfm);
|
||||
|
||||
export async function getLLMText(page: InferPageType<typeof source>) {
|
||||
const processed = await processor.process({
|
||||
path: page.data._file.absolutePath,
|
||||
value: page.data.content,
|
||||
});
|
||||
const pageData = page.data as any;
|
||||
const filePath = pageData._file?.absolutePath;
|
||||
const content = pageData.content || pageData.body || '';
|
||||
|
||||
let processed;
|
||||
if (filePath && typeof content === 'string') {
|
||||
processed = await processor.process({ path: filePath, value: content });
|
||||
} else if (typeof content === 'string') {
|
||||
processed = await processor.process(content);
|
||||
} else {
|
||||
// Handle case where content is not available
|
||||
processed = { value: '' };
|
||||
}
|
||||
|
||||
return `# ${page.data.title}
|
||||
URL: ${page.url}
|
||||
|
||||
${page.data.description}
|
||||
${page.data.description || ''}
|
||||
|
||||
${processed.value}`;
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ import {
|
||||
EditableForm,
|
||||
EditableInput,
|
||||
} from './components/editable-code-block';
|
||||
import { Hero } from './components/hero';
|
||||
|
||||
// use this function to get MDX components, you will need it for rendering MDX
|
||||
export function getMDXComponents(components?: MDXComponents): MDXComponents {
|
||||
@@ -20,6 +21,7 @@ export function getMDXComponents(components?: MDXComponents): MDXComponents {
|
||||
EditableValue,
|
||||
EditableForm,
|
||||
EditableInput,
|
||||
Hero,
|
||||
...TabsComponents,
|
||||
...components,
|
||||
};
|
||||
|
||||
@@ -6,13 +6,19 @@ import { useEffect } from 'react';
|
||||
import { usePathname, useSearchParams } from 'next/navigation';
|
||||
|
||||
if (typeof window !== 'undefined') {
|
||||
posthog.init(process.env.NEXT_PUBLIC_POSTHOG_API_KEY!, {
|
||||
api_host: '/docs/api/posthog',
|
||||
ui_host: process.env.NEXT_PUBLIC_POSTHOG_HOST,
|
||||
person_profiles: 'always',
|
||||
capture_pageview: false,
|
||||
capture_pageleave: true,
|
||||
});
|
||||
const apiKey = process.env.NEXT_PUBLIC_POSTHOG_API_KEY;
|
||||
|
||||
if (apiKey) {
|
||||
posthog.init(apiKey, {
|
||||
api_host: '/docs/api/posthog',
|
||||
ui_host: process.env.NEXT_PUBLIC_POSTHOG_HOST,
|
||||
person_profiles: 'always',
|
||||
capture_pageview: false,
|
||||
capture_pageleave: true,
|
||||
});
|
||||
} else {
|
||||
console.warn('[PostHog] API key not configured. Analytics will be disabled.');
|
||||
}
|
||||
}
|
||||
|
||||
export function PHProvider({ children }: { children: React.ReactNode }) {
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
"moduleResolution": "bundler",
|
||||
"resolveJsonModule": true,
|
||||
"isolatedModules": true,
|
||||
"jsx": "preserve",
|
||||
"jsx": "react-jsx",
|
||||
"incremental": true,
|
||||
"paths": {
|
||||
"@/.source": ["./.source/index.ts"],
|
||||
@@ -25,6 +25,12 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
"include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
|
||||
"include": [
|
||||
"next-env.d.ts",
|
||||
"**/*.ts",
|
||||
"**/*.tsx",
|
||||
".next/types/**/*.ts",
|
||||
".next/dev/types/**/*.ts"
|
||||
],
|
||||
"exclude": ["node_modules"]
|
||||
}
|
||||
|
||||
@@ -45,7 +45,7 @@ async def run_agent_example():
|
||||
# model="anthropic/claude-opus-4-20250514",
|
||||
# model="anthropic/claude-sonnet-4-20250514",
|
||||
# model="anthropic/claude-3-7-sonnet-20250219",
|
||||
# model="anthropic/claude-3-5-sonnet-20241022",
|
||||
# model="anthropic/claude-sonnet-4-5-20250929",
|
||||
# == UI-TARS ==
|
||||
# model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
|
||||
# model="mlx/mlx-community/UI-TARS-1.5-7B-6bit",
|
||||
@@ -53,6 +53,10 @@ async def run_agent_example():
|
||||
# == Omniparser + Any LLM ==
|
||||
# model="omniparser+anthropic/claude-opus-4-20250514",
|
||||
# model="omniparser+ollama_chat/gemma3:12b-it-q4_K_M",
|
||||
# == Omniparser + Vertex AI Gemini 3 (with thinking_level) ==
|
||||
# model="omni+vertex_ai/gemini-3-flash",
|
||||
# thinking_level="high", # or "low"
|
||||
# media_resolution="medium", # or "low" or "high"
|
||||
tools=[computer],
|
||||
only_n_most_recent_images=3,
|
||||
verbosity=logging.DEBUG,
|
||||
|
||||
@@ -9,14 +9,13 @@ from computer.providers.cloud.provider import CloudProvider
|
||||
|
||||
|
||||
async def main() -> None:
|
||||
api_key = os.getenv("CUA_API_KEY")
|
||||
if not api_key:
|
||||
raise RuntimeError("CUA_API_KEY environment variable is not set")
|
||||
# CloudProvider will automatically read CUA_API_KEY from environment if not provided
|
||||
# You can still pass api_key explicitly if needed: CloudProvider(api_key="your_key")
|
||||
api_base = os.getenv("CUA_API_BASE")
|
||||
if api_base:
|
||||
print(f"Using API base: {api_base}")
|
||||
|
||||
provider = CloudProvider(api_key=api_key, verbose=True)
|
||||
provider = CloudProvider(verbose=True)
|
||||
async with provider:
|
||||
|
||||
# List all VMs
|
||||
|
||||
@@ -34,14 +34,6 @@ This example demonstrates how to control a Cua Cloud Sandbox using the OpenAI `c
|
||||
- `src/index.ts` — Main example script
|
||||
- `src/helpers.ts` — Helper for executing actions on the container
|
||||
|
||||
## Further Reading
|
||||
|
||||
For a step-by-step tutorial and more detailed explanation, see the accompanying blog post:
|
||||
|
||||
➡️ [Controlling a Cua Cloud Sandbox with JavaScript](https://placeholder-url-to-blog-post.com)
|
||||
|
||||
_(This link will be updated once the article is published.)_
|
||||
|
||||
---
|
||||
|
||||
If you have questions or issues, please open an issue or contact the maintainers.
|
||||
|
||||
@@ -58,7 +58,7 @@ To get set up with Lume for development, read [these instructions](Development.m
|
||||
- [Installation](https://cua.ai/docs/libraries/lume/installation)
|
||||
- [Prebuilt Images](https://cua.ai/docs/libraries/lume/prebuilt-images)
|
||||
- [CLI Reference](https://cua.ai/docs/libraries/lume/cli-reference)
|
||||
- [HTTP API](https://cuai.ai/docs/libraries/lume/http-api)
|
||||
- [HTTP API](https://cua.ai/docs/libraries/lume/http-api)
|
||||
- [FAQ](https://cua.ai/docs/libraries/lume/faq)
|
||||
|
||||
## Contributing
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[bumpversion]
|
||||
current_version = 0.4.37
|
||||
current_version = 0.5.1
|
||||
commit = True
|
||||
tag = True
|
||||
tag_name = agent-v{new_version}
|
||||
|
||||
@@ -51,7 +51,7 @@ async def main():
|
||||
|
||||
# Create agent
|
||||
agent = ComputerAgent(
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
model="anthropic/claude-sonnet-4-5-20250929",
|
||||
tools=[computer],
|
||||
only_n_most_recent_images=3,
|
||||
trajectory_dir="trajectories",
|
||||
@@ -78,7 +78,7 @@ if __name__ == "__main__":
|
||||
- [Chat History](https://cua.ai/docs/agent-sdk/chat-history)
|
||||
- [Callbacks](https://cua.ai/docs/agent-sdk/callbacks)
|
||||
- [Custom Tools](https://cua.ai/docs/agent-sdk/custom-tools)
|
||||
- [Custom Computer Handlers](https://cua.ai/docs/agent-sdk/custom-computer-handlers)
|
||||
- [Custom Computer Handlers](https://cua.ai/docs/computer-sdk/custom-computer-handlers)
|
||||
- [Prompt Caching](https://cua.ai/docs/agent-sdk/prompt-caching)
|
||||
- [Usage Tracking](https://cua.ai/docs/agent-sdk/usage-tracking)
|
||||
- [Benchmarks](https://cua.ai/docs/agent-sdk/benchmarks)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff.