Merge branch 'main' into feature/mcp-claude-extension

This commit is contained in:
James Murdza
2025-10-31 17:03:39 -07:00
committed by GitHub
126 changed files with 8356 additions and 3728 deletions

View File

@@ -1,82 +0,0 @@
name: Publish Pylume Package
on:
push:
tags:
- "pylume-v*"
workflow_dispatch:
inputs:
version:
description: "Version to publish (without v prefix)"
required: true
default: "0.1.0"
workflow_call:
inputs:
version:
description: "Version to publish"
required: true
type: string
outputs:
version:
description: "The version that was published"
value: ${{ jobs.determine-version.outputs.version }}
# Adding permissions at workflow level
permissions:
contents: write
jobs:
determine-version:
runs-on: macos-latest
outputs:
version: ${{ steps.get-version.outputs.version }}
steps:
- uses: actions/checkout@v4
- name: Determine version
id: get-version
run: |
if [ "${{ github.event_name }}" == "push" ]; then
# Extract version from tag (for package-specific tags)
if [[ "${{ github.ref }}" =~ ^refs/tags/pylume-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then
VERSION=${BASH_REMATCH[1]}
else
echo "Invalid tag format for pylume"
exit 1
fi
elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
# Use version from workflow dispatch
VERSION=${{ github.event.inputs.version }}
else
# Use version from workflow_call
VERSION=${{ inputs.version }}
fi
echo "VERSION=$VERSION"
echo "version=$VERSION" >> $GITHUB_OUTPUT
validate-version:
runs-on: macos-latest
needs: determine-version
steps:
- uses: actions/checkout@v4
- name: Validate version
id: validate-version
run: |
CODE_VERSION=$(grep '__version__' libs/python/pylume/pylume/__init__.py | cut -d'"' -f2)
if [ "${{ needs.determine-version.outputs.version }}" != "$CODE_VERSION" ]; then
echo "Version mismatch: expected $CODE_VERSION, got ${{ needs.determine-version.outputs.version }}"
exit 1
fi
echo "Version validated: $CODE_VERSION"
publish:
needs: determine-version
uses: ./.github/workflows/pypi-reusable-publish.yml
with:
package_name: "pylume"
package_dir: "libs/python/pylume"
version: ${{ needs.determine-version.outputs.version }}
is_lume_package: true
base_package_name: "pylume"
secrets:
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}

View File

@@ -4,11 +4,11 @@ on:
workflow_call:
inputs:
package_name:
description: "Name of the package (e.g. pylume, computer, agent)"
description: "Name of the package (e.g. computer, agent)"
required: true
type: string
package_dir:
description: "Directory containing the package relative to workspace root (e.g. libs/python/pylume)"
description: "Directory containing the package relative to workspace root (e.g. libs/python/computer)"
required: true
type: string
version:
@@ -21,7 +21,7 @@ on:
type: boolean
default: false
base_package_name:
description: "PyPI package name (e.g. pylume, cua-agent)"
description: "PyPI package name (e.g. cua-agent)"
required: true
type: string
make_latest:

93
.github/workflows/python-tests.yml vendored Normal file
View File

@@ -0,0 +1,93 @@
name: Python Unit Tests
on:
pull_request:
paths:
- "libs/python/**"
- ".github/workflows/python-tests.yml"
push:
branches:
- main
paths:
- "libs/python/**"
- ".github/workflows/python-tests.yml"
workflow_dispatch: # Allow manual trigger
jobs:
test:
name: Test ${{ matrix.package }}
runs-on: ubuntu-latest
strategy:
fail-fast: false # Test all packages even if one fails
matrix:
package:
- core
- agent
- computer
- computer-server
- mcp-server
- pylume
- som
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.12"
- name: Install uv
run: |
pip install uv
- name: Install package and dependencies
run: |
cd libs/python/${{ matrix.package }}
# Install the package in editable mode with dev dependencies
if [ -f pyproject.toml ]; then
uv pip install --system -e .
# Install test dependencies
uv pip install --system pytest pytest-asyncio pytest-mock pytest-cov
fi
shell: bash
- name: Run tests
run: |
cd libs/python/${{ matrix.package }}
if [ -d tests ]; then
python -m pytest tests/ -v --tb=short --cov --cov-report=term --cov-report=xml
else
echo "No tests directory found, skipping tests"
fi
shell: bash
env:
CUA_TELEMETRY_DISABLED: "1" # Disable telemetry during tests
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
if: always()
with:
file: ./libs/python/${{ matrix.package }}/coverage.xml
flags: ${{ matrix.package }}
name: codecov-${{ matrix.package }}
fail_ci_if_error: false
continue-on-error: true
summary:
name: Test Summary
runs-on: ubuntu-latest
needs: test
if: always()
steps:
- name: Check test results
run: |
if [ "${{ needs.test.result }}" == "failure" ]; then
echo "❌ Some tests failed. Please check the logs above."
exit 1
else
echo "✅ All tests passed!"
fi

376
.github/workflows/test-cua-models.yml vendored Normal file
View File

@@ -0,0 +1,376 @@
name: Test CUA Supporting Models
# This workflow tests all supported CUA models with API keys
# Run manually using workflow_dispatch with test_models=true
on:
pull_request_target:
branches: [main, master]
workflow_dispatch:
inputs:
test_models:
description: "Test all supported models (requires API keys)"
required: false
default: true
type: boolean
schedule:
# Runs at 3 PM UTC (8 AM PDT) daily
- cron: "0 15 * * *"
jobs:
# Test all CUA models - runs on PRs, schedules, or when manually triggered
test-all-models:
if: ${{ github.event_name == 'pull_request_target' || github.event_name == 'schedule' || fromJSON(inputs.test_models || 'false') }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
model:
# Claude Sonnet/Haiku
- anthropic/claude-sonnet-4-5-20250929
- anthropic/claude-haiku-4-5-20251001
- anthropic/claude-opus-4-1-20250805
# OpenAI CU Preview
- openai/computer-use-preview
# GLM-V
- openrouter/z-ai/glm-4.5v
# - huggingface-local/zai-org/GLM-4.5V # Requires local model setup
# Gemini CU Preview
# - gemini-2.5-computer-use-preview-10-2025
# InternVL
- huggingface-local/OpenGVLab/InternVL3_5-1B
# - huggingface-local/OpenGVLab/InternVL3_5-2B
# - huggingface-local/OpenGVLab/InternVL3_5-4B
# - huggingface-local/OpenGVLab/InternVL3_5-8B
# UI-TARS (supports full computer-use, can run standalone)
- huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
# Note: OpenCUA, GTA, and Holo are grounding-only models
# They only support predict_click(), not agent.run()
# See composed agents section below for testing them
# Moondream (typically used in composed agents)
# Format: moondream3+{any-llm-with-tools}
- moondream3+anthropic/claude-sonnet-4-5-20250929 # Claude has VLM + Tools
# - moondream3+openai/gpt-4o # GPT-4o has VLM + Tools
# OmniParser (typically used in composed agents)
# Format: omniparser+{any-vlm-with-tools}
- omniparser+anthropic/claude-sonnet-4-5-20250929 # Claude has VLM + Tools
# - omniparser+openai/gpt-4o # GPT-4o has VLM + Tools
# Other grounding models + VLM with tools
# Format: {grounding-model}+{any-vlm-with-tools}
# These grounding-only models (OpenCUA, GTA, Holo) must be used in composed form
# since they only support predict_click(), not full agent.run()
- huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929
- huggingface-local/xlangai/OpenCUA-7B+anthropic/claude-sonnet-4-5-20250929
- huggingface-local/Hcompany/Holo1.5-3B+anthropic/claude-sonnet-4-5-20250929
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up uv and Python
uses: astral-sh/setup-uv@v4
with:
python-version: "3.12"
- name: Cache system packages
uses: actions/cache@v4
with:
path: /var/cache/apt
key: ${{ runner.os }}-apt-${{ hashFiles('**/Dockerfile') }}
restore-keys: |
${{ runner.os }}-apt-
- name: Install system dependencies
timeout-minutes: 20
run: |
sudo apt-get update
sudo apt-get install -y libgl1-mesa-dri libglib2.0-0
- name: Cache Python dependencies (uv)
uses: actions/cache@v4
with:
path: |
~/.cache/uv
.venv
key: ${{ runner.os }}-uv-${{ hashFiles('pyproject.toml', 'uv.lock', 'libs/python/**/pyproject.toml') }}
restore-keys: |
${{ runner.os }}-uv-
- name: Install CUA dependencies (uv)
run: |
# Remove existing venv if it exists (from cache restore) to avoid interactive prompt
rm -rf .venv
uv venv --python 3.12
uv pip install -e libs/python/agent -e libs/python/computer
uv pip install -e libs/python/core
uv pip install "cua-agent[uitars-hf,internvl-hf,opencua-hf,moondream3,omni]"
uv pip install pytest
- name: Cache HuggingFace models
uses: actions/cache@v4
with:
path: ~/.cache/huggingface
key: ${{ runner.os }}-hf-models-v1
restore-keys: |
${{ runner.os }}-hf-models-
# Large cache - models can be several GB each and are reused across runs
- name: Record test start time
run: echo "TEST_START_TIME=$(date +%s)" >> $GITHUB_ENV
env:
# Ensure HuggingFace uses consistent cache location
HF_HOME: ~/.cache/huggingface
- name: Test model with agent loop
id: test_model
timeout-minutes: 20
continue-on-error: true
run: |
cd tests/agent_loop_testing
uv run python agent_test.py --model "${{ matrix.model }}"
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
HF_TOKEN: ${{ secrets.HF_TOKEN }}
- name: Calculate test duration and prepare message
if: always()
run: |
TEST_END_TIME=$(date +%s)
# Handle case where TEST_START_TIME might not be set
if [ -z "$TEST_START_TIME" ]; then
TEST_START_TIME=$TEST_END_TIME
fi
TEST_DURATION=$((TEST_END_TIME - TEST_START_TIME))
# Convert seconds to minutes and seconds
MINUTES=$((TEST_DURATION / 60))
SECONDS=$((TEST_DURATION % 60))
# Format duration
if [ $MINUTES -gt 0 ]; then
DURATION_STR="${MINUTES}m ${SECONDS}s"
else
DURATION_STR="${SECONDS}s"
fi
# Determine status icon based on test step outcome
if [ "${{ steps.test_model.outcome }}" == "success" ]; then
STATUS_ICON="✅"
STATUS_TEXT="PASSED"
SLACK_COLOR="#36a64f"
else
STATUS_ICON="❌"
STATUS_TEXT="FAILED"
SLACK_COLOR="#dc3545"
fi
# Prepare Slack message
echo "TESTS_CONTENT<<EOF" >> $GITHUB_ENV
echo "*CUA Model Test Results*" >> $GITHUB_ENV
echo "" >> $GITHUB_ENV
echo "*Model:* ${{ matrix.model }}" >> $GITHUB_ENV
echo "*Status:* ${STATUS_ICON} ${STATUS_TEXT}" >> $GITHUB_ENV
echo "*Duration:* ${DURATION_STR}" >> $GITHUB_ENV
echo "*Run:* ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
# Set color based on outcome
echo "SLACK_COLOR=${SLACK_COLOR}" >> $GITHUB_ENV
# Save result to JSON file for summary
mkdir -p test_summary
MODEL_NAME="${{ matrix.model }}"
# Sanitize model name for filename
SAFE_MODEL_NAME=$(echo "$MODEL_NAME" | sed 's/[^a-zA-Z0-9]/_/g')
# Determine pass status
if [ "${{ steps.test_model.outcome }}" == "success" ]; then
PASSED_VAL="true"
else
PASSED_VAL="false"
fi
# Create JSON file using printf to avoid YAML parsing issues
printf '{\n "model": "%s",\n "status": "%s",\n "status_icon": "%s",\n "duration": "%s",\n "duration_seconds": %d,\n "passed": %s\n}' \
"${MODEL_NAME}" "${STATUS_TEXT}" "${STATUS_ICON}" "${DURATION_STR}" "${TEST_DURATION}" "${PASSED_VAL}" \
> "test_summary/${SAFE_MODEL_NAME}.json"
# Expose safe model name for subsequent steps (artifact naming)
echo "SAFE_MODEL_NAME=${SAFE_MODEL_NAME}" >> $GITHUB_ENV
- name: Upload test results
if: always()
uses: actions/upload-artifact@v4
with:
name: test-results-${{ matrix.model }}
path: |
tests/agent_loop_testing/test_images/
*.log
retention-days: 7
- name: Upload test summary data
if: always()
uses: actions/upload-artifact@v4
with:
# Unique, slash-free artifact name per matrix entry
name: test-summary-${{ env.SAFE_MODEL_NAME }}
path: test_summary/
retention-days: 1
- name: Set default Slack color
if: always() && env.SLACK_COLOR == ''
run: echo "SLACK_COLOR=#36a64f" >> $GITHUB_ENV
# Individual model notifications disabled - only summary is sent
# - name: Notify Slack with test results
# if: always()
# uses: rtCamp/action-slack-notify@v2
# env:
# SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
# SLACK_CHANNEL: ${{ vars.SLACK_CHANNEL }}
# SLACK_TITLE: CUA Model Test Update
# SLACK_COLOR: ${{ env.SLACK_COLOR }}
# SLACK_MESSAGE: |
# ${{ env.TESTS_CONTENT }}
# Summary job that aggregates all model test results
test-summary:
if: ${{ always() && (github.event_name == 'pull_request_target' || github.event_name == 'schedule' || fromJSON(inputs.test_models || 'false')) }}
needs: test-all-models
runs-on: ubuntu-latest
steps:
- name: Install jq
run: sudo apt-get update && sudo apt-get install -y jq
- name: Download all test summary artifacts
continue-on-error: true
uses: actions/download-artifact@v4
with:
pattern: test-summary-*
merge-multiple: true
path: all_summaries
- name: Generate and send summary
if: always()
shell: bash
run: |
# Create directory if it doesn't exist
mkdir -p all_summaries
# Get list of models being tested in this run from the matrix
# This helps filter out artifacts from previous runs when testing locally
EXPECTED_MODELS="${{ join(matrix.model, ' ') }}"
# Aggregate all results
PASSED_COUNT=0
FAILED_COUNT=0
TOTAL_DURATION=0
SUMMARY_MESSAGE="*🚀 Model Summaries*\n\n"
# Process each JSON file (find all JSON files recursively)
# Save to temp file first to avoid subshell issues
find all_summaries -name "*.json" -type f 2>/dev/null > /tmp/json_files.txt || true
# Use associative array to deduplicate by model name
declare -A processed_models
while IFS= read -r json_file; do
if [ -f "$json_file" ]; then
MODEL=$(jq -r '.model' "$json_file")
# Skip if we've already processed this model
if [ "${processed_models[$MODEL]}" = "1" ]; then
echo "Skipping duplicate model: $MODEL"
continue
fi
# Filter: Only include models that are in the current matrix
# This prevents including artifacts from previous workflow runs
if [ -n "$EXPECTED_MODELS" ]; then
if ! echo "$EXPECTED_MODELS" | grep -q "$MODEL"; then
echo "Skipping model from previous run: $MODEL"
continue
fi
fi
# Mark as processed
processed_models[$MODEL]="1"
STATUS_ICON=$(jq -r '.status_icon' "$json_file")
STATUS=$(jq -r '.status' "$json_file")
DURATION=$(jq -r '.duration' "$json_file")
DURATION_SEC=$(jq -r '.duration_seconds' "$json_file")
PASSED=$(jq -r '.passed' "$json_file")
# Add to summary as clean line format
SUMMARY_MESSAGE="${SUMMARY_MESSAGE}${STATUS_ICON} ${STATUS} - \`${MODEL}\` - ${DURATION}\n"
if [ "$PASSED" = "true" ]; then
PASSED_COUNT=$((PASSED_COUNT + 1))
else
FAILED_COUNT=$((FAILED_COUNT + 1))
fi
TOTAL_DURATION=$((TOTAL_DURATION + DURATION_SEC))
fi
done < /tmp/json_files.txt
# Check if we found any results
TOTAL_COUNT=$((PASSED_COUNT + FAILED_COUNT))
if [ $TOTAL_COUNT -eq 0 ]; then
SUMMARY_MESSAGE="${SUMMARY_MESSAGE}⚠️ No test results found (workflow may have been canceled)\n"
SLACK_COLOR="#ffa500"
else
# Add summary stats
SUMMARY_MESSAGE="${SUMMARY_MESSAGE}\n*Results:* ${PASSED_COUNT} passed, ${FAILED_COUNT} failed out of ${TOTAL_COUNT} models\n"
# Calculate total duration
TOTAL_MIN=$((TOTAL_DURATION / 60))
TOTAL_SEC=$((TOTAL_DURATION % 60))
if [ $TOTAL_MIN -gt 0 ]; then
TOTAL_DURATION_STR="${TOTAL_MIN}m ${TOTAL_SEC}s"
else
TOTAL_DURATION_STR="${TOTAL_SEC}s"
fi
SUMMARY_MESSAGE="${SUMMARY_MESSAGE}*Total Duration:* ${TOTAL_DURATION_STR}\n"
# Determine color based on results
if [ $FAILED_COUNT -eq 0 ]; then
SLACK_COLOR="#36a64f"
elif [ $PASSED_COUNT -eq 0 ]; then
SLACK_COLOR="#dc3545"
else
SLACK_COLOR="#ffa500"
fi
fi
SUMMARY_MESSAGE="${SUMMARY_MESSAGE}*Run:* ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
# Export for use in next step
echo "SUMMARY_MESSAGE<<EOF" >> $GITHUB_ENV
echo -e "${SUMMARY_MESSAGE}" >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
echo "SLACK_COLOR=${SLACK_COLOR}" >> $GITHUB_ENV
- name: Send summary to Slack
if: always()
uses: rtCamp/action-slack-notify@v2
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
SLACK_CHANNEL: ${{ vars.SLACK_CHANNEL }}
SLACK_TITLE: CUA Models Test Summary
SLACK_COLOR: ${{ env.SLACK_COLOR }}
SLACK_MESSAGE: |
${{ env.SUMMARY_MESSAGE }}

4
.gitignore vendored
View File

@@ -202,4 +202,6 @@ storage/
# Trashes
.Trashes
.Trash-1000/
post-provision
post-provision
# Local secrets for act
.secrets

27
.vscode/launch.json vendored
View File

@@ -10,7 +10,7 @@
"python": "${workspaceFolder:cua-root}/.venv/bin/python",
"cwd": "${workspaceFolder:cua-root}",
"env": {
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume"
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
}
},
{
@@ -23,7 +23,7 @@
"python": "${workspaceFolder:cua-root}/.venv/bin/python",
"cwd": "${workspaceFolder:cua-root}",
"env": {
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume"
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
}
},
{
@@ -36,7 +36,7 @@
"python": "${workspaceFolder:cua-root}/.venv/bin/python",
"cwd": "${workspaceFolder:cua-root}",
"env": {
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume"
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
}
},
{
@@ -49,20 +49,7 @@
"python": "${workspaceFolder:cua-root}/.venv/bin/python",
"cwd": "${workspaceFolder:cua-root}",
"env": {
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume"
}
},
{
"name": "Run PyLume Examples",
"type": "debugpy",
"request": "launch",
"program": "examples/pylume_examples.py",
"console": "integratedTerminal",
"justMyCode": true,
"python": "${workspaceFolder:cua-root}/.venv/bin/python",
"cwd": "${workspaceFolder:cua-root}",
"env": {
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume"
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
}
},
{
@@ -84,7 +71,7 @@
"python": "${workspaceFolder:cua-root}/.venv/bin/python",
"cwd": "${workspaceFolder:cua-root}",
"env": {
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume"
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
}
},
{
@@ -106,7 +93,7 @@
"python": "${workspaceFolder:cua-root}/.venv/bin/python",
"cwd": "${workspaceFolder:cua-root}",
"env": {
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume"
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
}
},
{
@@ -119,7 +106,7 @@
"python": "${workspaceFolder:cua-root}/.venv/bin/python",
"cwd": "${workspaceFolder:cua-root}",
"env": {
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume"
"PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som"
}
},
{

View File

@@ -20,10 +20,6 @@
"name": "computer-server",
"path": "../libs/python/computer-server"
},
{
"name": "pylume",
"path": "../libs/python/pylume"
},
{
"name": "core",
"path": "../libs/python/core"
@@ -51,7 +47,6 @@
"${workspaceFolder:cua-root}/libs/python/computer",
"${workspaceFolder:cua-root}/libs/python/agent",
"${workspaceFolder:cua-root}/libs/python/som",
"${workspaceFolder:cua-root}/libs/python/pylume",
"${workspaceFolder:cua-root}/.vscode/typings"
],
"python.envFile": "${workspaceFolder:cua-root}/.env",
@@ -89,10 +84,6 @@
"name": "som",
"depth": 2
},
{
"name": "pylume",
"depth": 2
},
{
"name": "core",
"depth": 2
@@ -103,7 +94,6 @@
"${workspaceFolder:cua-root}/libs/python/computer",
"${workspaceFolder:cua-root}/libs/python/agent",
"${workspaceFolder:cua-root}/libs/python/som",
"${workspaceFolder:cua-root}/libs/python/pylume"
],
"python.languageServer": "None",
"[python]": {

View File

@@ -1,6 +1,6 @@
{
"python-envs.pythonProjects": [],
"python.defaultInterpreterPath": "${workspaceFolder}/.venv/bin/python",
"python.defaultInterpreterPath": "${workspaceFolder}/.venv",
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit",

View File

@@ -5,7 +5,7 @@ ENV PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 \
PIP_NO_CACHE_DIR=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1 \
PYTHONPATH="/app/libs/python/core:/app/libs/python/computer:/app/libs/python/agent:/app/libs/python/som:/app/libs/python/pylume:/app/libs/python/computer-server:/app/libs/python/mcp-server"
PYTHONPATH="/app/libs/python/core:/app/libs/python/computer:/app/libs/python/agent:/app/libs/python/som:/app/libs/python/computer-server:/app/libs/python/mcp-server"
# Install system dependencies for ARM architecture
RUN apt-get update && apt-get install -y --no-install-recommends \

View File

@@ -22,14 +22,14 @@
With the [Computer SDK](#computer-sdk), you can:
- automate Windows, Linux, and macOS VMs with a consistent, [pyautogui-like API](https://docs.trycua.com/docs/libraries/computer#interface-actions)
- create & manage VMs [locally](https://docs.trycua.com/docs/computer-sdk/computers#cua-local-containers) or using [Cua cloud](https://www.trycua.com/)
- automate Windows, Linux, and macOS VMs with a consistent, [pyautogui-like API](https://cua.ai/docs/docs/libraries/computer#interface-actions)
- create & manage VMs [locally](https://cua.ai/docs/docs/computer-sdk/computers#cua-local-containers) or using [Cua cloud](https://www.cua.ai/)
With the [Agent SDK](#agent-sdk), you can:
- run computer-use models with a [consistent schema](https://docs.trycua.com/docs/agent-sdk/message-format)
- benchmark on OSWorld-Verified, SheetBench-V2, and more [with a single line of code using HUD](https://docs.trycua.com/docs/agent-sdk/integrations/hud) ([Notebook](https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb))
- combine UI grounding models with any LLM using [composed agents](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents)
- run computer-use models with a [consistent schema](https://cua.ai/docs/docs/agent-sdk/message-format)
- benchmark on OSWorld-Verified, SheetBench-V2, and more [with a single line of code using HUD](https://cua.ai/docs/docs/agent-sdk/integrations/hud) ([Notebook](https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb))
- combine UI grounding models with any LLM using [composed agents](https://cua.ai/docs/docs/agent-sdk/supported-agents/composed-agents)
- use new UI agent models and UI grounding models from the Model Zoo below with just a model string (e.g., `ComputerAgent(model="openai/computer-use-preview")`)
- use API or local inference by changing a prefix (e.g., `openai/`, `openrouter/`, `ollama/`, `huggingface-local/`, `mlx/`, [etc.](https://docs.litellm.ai/docs/providers))
@@ -96,8 +96,8 @@ Core utilities for Cua
# Quick Start
- [Clone a starter template and run the code in <1 min](https://github.com/trycua/agent-template)
- [Get started with the Cua SDKs](https://docs.trycua.com/docs/quickstart-devs)
- [Get started with the Cua CLI](https://docs.trycua.com/docs/quickstart-cli)
- [Get started with the Cua SDKs](https://cua.ai/docs/docs/quickstart-devs)
- [Get started with the Cua CLI](https://cua.ai/docs/docs/quickstart-cli)
# Agent SDK
@@ -197,9 +197,9 @@ These are the valid model configurations for `ComputerAgent(model="...")`:
| Configuration | Description |
| ---------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- |
| `{computer-use-model}` | A single model to perform all computer-use tasks |
| `{grounding-model}+{any-vlm-with-tools}` | [Composed](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) with VLM for captioning and grounding LLM for element detection |
| `moondream3+{any-llm-with-tools}` | [Composed](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) with Moondream3 for captioning and UI element detection |
| `human/human` | A [human-in-the-loop](https://docs.trycua.com/docs/agent-sdk/supported-agents/human-in-the-loop) in place of a model |
| `{grounding-model}+{any-vlm-with-tools}` | [Composed](https://cua.ai/docs/docs/agent-sdk/supported-agents/composed-agents) with VLM for captioning and grounding LLM for element detection |
| `moondream3+{any-llm-with-tools}` | [Composed](https://cua.ai/docs/docs/agent-sdk/supported-agents/composed-agents) with Moondream3 for captioning and UI element detection |
| `human/human` | A [human-in-the-loop](https://cua.ai/docs/docs/agent-sdk/supported-agents/human-in-the-loop) in place of a model |
### Model Capabilities
@@ -207,17 +207,17 @@ The following table shows which capabilities are supported by each model:
| Model | Computer-Use | Grounding | Tools | VLM |
| -------------------------------------------------------------------------------------------------------------------------------- | :----------: | :-------: | :---: | :-: |
| [Claude Sonnet/Haiku](https://docs.claude.com/en/docs/agents-and-tools/tool-use/computer-use-tool#how-to-implement-computer-use) | | | | |
| [OpenAI CU Preview](https://platform.openai.com/docs/models/computer-use-preview) | | | | |
| [GLM-V](https://huggingface.co/THUDM/glm-4v-9b) | | | | |
| [Gemini CU Preview](https://ai.google.dev/gemini-api/docs/computer-use) | | | | |
| [InternVL](https://huggingface.co/OpenGVLab/InternVL3_5-1B) | | | | |
| [UI-TARS](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B) | | | | |
| [OpenCUA](https://huggingface.co/xlangai/OpenCUA-7B) | | | | |
| [GTA](https://huggingface.co/HelloKKMe/GTA1-7B) | | | | |
| [Holo](https://huggingface.co/Hcompany/Holo1.5-3B) | | | | |
| [Moondream](https://huggingface.co/moondream/moondream3-preview) | | | | |
| [OmniParser](https://github.com/microsoft/OmniParser) | | | | |
| [Claude Sonnet/Haiku](https://docs.claude.com/en/docs/agents-and-tools/tool-use/computer-use-tool#how-to-implement-computer-use) | 🖥️ | 🎯 | 🛠️ | 👁️ |
| [OpenAI CU Preview](https://platform.openai.com/docs/models/computer-use-preview) | 🖥️ | 🎯 | | 👁️ |
| [GLM-V](https://huggingface.co/THUDM/glm-4v-9b) | 🖥️ | 🎯 | 🛠️ | 👁️ |
| [Gemini CU Preview](https://ai.google.dev/gemini-api/docs/computer-use) | 🖥️ | 🎯 | | 👁️ |
| [InternVL](https://huggingface.co/OpenGVLab/InternVL3_5-1B) | 🖥️ | 🎯 | 🛠️ | 👁️ |
| [UI-TARS](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B) | 🖥️ | 🎯 | 🛠️ | 👁️ |
| [OpenCUA](https://huggingface.co/xlangai/OpenCUA-7B) | | 🎯 | | |
| [GTA](https://huggingface.co/HelloKKMe/GTA1-7B) | | 🎯 | | |
| [Holo](https://huggingface.co/Hcompany/Holo1.5-3B) | | 🎯 | | |
| [Moondream](https://huggingface.co/moondream/moondream3-preview) | | 🎯 | | |
| [OmniParser](https://github.com/microsoft/OmniParser) | | 🎯 | | |
### Model IDs
@@ -333,8 +333,8 @@ Learn more in the [SOM documentation](./libs/python/som/README.md).
# Resources
- [Cua Blog](https://www.trycua.com/blog)
- [Cua Docs](https://docs.trycua.com)
- [Cua Blog](https://www.cua.ai/blog)
- [Cua Docs](https://cua.ai/docs)
# Community and Contributions

106
TESTING.md Normal file
View File

@@ -0,0 +1,106 @@
# Testing Guide for CUA
Quick guide to running tests and understanding the test architecture.
## 🚀 Quick Start
```bash
# Install dependencies
pip install pytest pytest-asyncio pytest-mock pytest-cov
# Install package
cd libs/python/core
pip install -e .
# Run tests
export CUA_TELEMETRY_DISABLED=1 # or $env:CUA_TELEMETRY_DISABLED="1" on Windows
pytest tests/ -v
```
## 🧪 Running Tests
```bash
# All packages
pytest libs/python/*/tests/ -v
# Specific package
cd libs/python/core && pytest tests/ -v
# With coverage
pytest tests/ --cov --cov-report=html
# Specific test
pytest tests/test_telemetry.py::TestTelemetryEnabled::test_telemetry_enabled_by_default -v
```
## 🏗️ Test Architecture
**Principles**: SRP (Single Responsibility) + Vertical Slices + Testability
```
libs/python/
├── core/tests/ # Tests ONLY core
├── agent/tests/ # Tests ONLY agent
└── computer/tests/ # Tests ONLY computer
```
Each test file = ONE feature. Each test class = ONE concern.
## Adding New Tests
1. Create `test_*.py` in the appropriate package's `tests/` directory
2. Follow the pattern:
```python
"""Unit tests for my_feature."""
import pytest
from unittest.mock import patch
class TestMyFeature:
"""Test MyFeature class."""
def test_initialization(self):
"""Test that feature initializes."""
from my_package import MyFeature
feature = MyFeature()
assert feature is not None
```
3. Mock external dependencies:
```python
@pytest.fixture
def mock_api():
with patch("my_package.api_client") as mock:
yield mock
```
## 🔄 CI/CD
Tests run automatically on every PR via GitHub Actions (`.github/workflows/python-tests.yml`):
- Matrix strategy: each package tested separately
- Python 3.12
- ~2 minute runtime
## 🐛 Troubleshooting
**ModuleNotFoundError**: Run `pip install -e .` in package directory
**Tests fail in CI but pass locally**: Set `CUA_TELEMETRY_DISABLED=1`
**Async tests error**: Install `pytest-asyncio` and use `@pytest.mark.asyncio`
**Mock not working**: Patch at usage location, not definition:
```python
# ✅ Right
@patch("my_package.module.external_function")
# ❌ Wrong
@patch("external_library.function")
```
---
**Questions?** Check existing tests for examples or open an issue.

View File

@@ -30,7 +30,7 @@ By the end of this tutorial, you'll be able to:
- Node.js 16+ and npm/yarn/pnpm
- Basic JavaScript or TypeScript knowledge
- OpenAI API access (Tier 3+ for computer-use-preview)
- Cua cloud container credits ([get started here](https://trycua.com/pricing))
- Cua cloud container credits ([get started here](https://cua.ai/pricing))
**Estimated Time:** 45-60 minutes
@@ -51,7 +51,7 @@ Luckily, the `@trycua/computer` library can be used in conjunction with other mo
To follow this guide, you'll need access to a Cua cloud container.
Getting access is simple: purchase credits from our [pricing page](https://trycua.com/pricing), then create and provision a new container instance from the [dashboard](https://trycua.com/dashboard/containers). With your container running, you'll be ready to leverage the web SDK and bring automation to your JavaScript or TypeScript applications.
Getting access is simple: purchase credits from our [pricing page](https://cua.ai/pricing), then create and provision a new container instance from the [dashboard](https://cua.ai/dashboard/containers). With your container running, you'll be ready to leverage the web SDK and bring automation to your JavaScript or TypeScript applications.
## Understanding the Flow
@@ -86,7 +86,7 @@ const res = await openai.responses.create({
role: 'user',
content: [
// what we want the ai to do
{ type: 'input_text', text: 'Open firefox and go to trycua.com' },
{ type: 'input_text', text: 'Open firefox and go to cua.ai' },
// first screenshot of the vm
{
type: 'input_image',
@@ -144,7 +144,7 @@ Each response contains:
### Provision a Cua Cloud Container
1. Visit [trycua.com](https://trycua.com), sign up, purchase [credits](https://trycua.com/pricing), and create a new container instance from the [dashboard](https://trycua.com/dashboard).
1. Visit [cua.ai](https://cua.ai), sign up, purchase [credits](https://cua.ai/pricing), and create a new container instance from the [dashboard](https://cua.ai/dashboard).
2. Create an API key from the dashboard — be sure to save it in a secure location before continuing.
3. Start the cloud container from the dashboard.
@@ -281,7 +281,7 @@ let res = await openai.responses.create({
role: 'user',
content: [
// what we want the ai to do
{ type: 'input_text', text: 'open firefox and go to trycua.com' },
{ type: 'input_text', text: 'open firefox and go to cua.ai' },
// current screenshot of the vm
{
type: 'input_image',

View File

@@ -67,7 +67,7 @@ If you try out version 0.4.x, we'd love to hear how it goes. Join us on Discord
## Links
- **Composite Agent Docs:** [https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents)
- **Composite Agent Docs:** [https://cua.ai/docs/agent-sdk/supported-agents/composed-agents](https://cua.ai/docs/agent-sdk/supported-agents/composed-agents)
- **Discord:** [https://discord.gg/cua-ai](https://discord.gg/cua-ai)
Questions or weird edge cases? Ping us on Discord—we're curious to see what you build.

View File

@@ -84,4 +84,4 @@ Bring a team, pick a model stack, and push what agents can do on real computers.
**Contact**
Questions on Hack the North? Email **hackthenorth@trycua.com**.
_P.S. If youre planning ahead, start with the Cua Agent Framework and OSWorld-Verified docs at docs.trycua.com; well share office-hour times in both Discord channels._
_P.S. If you're planning ahead, start with the Cua Agent Framework and OSWorld-Verified docs at cua.ai/docs; we'll share office-hour times in both Discord channels._

View File

@@ -22,7 +22,7 @@ From day one, though, we knew wed have to fight for sign-ups. This was a nich
Unfortunately, Hack the North (HTN) didn't offer an interest form to help us estimate demand, which made capacity planning tricky—especially with early-stage infra. Stress-testing takes foresight, and multimodal language model usage is still costly (~1.5× to 34× the price of comparable text-only models).
On top of that, we were discouraged from external promotion on [lu.ma](http://lu.ma). So we spun up our own sign-up page at **trycua.com/hackathon** and built ad-hoc Discord channels to share track details. We emphasized—repeatedly—that only students already accepted to Hack the North should register.
On top of that, we were discouraged from external promotion on [lu.ma](http://lu.ma). So we spun up our own sign-up page at **cua.ai/hackathon** and built ad-hoc Discord channels to share track details. We emphasized—repeatedly—that only students already accepted to Hack the North should register.
_(Moral: the “measure-zero effect”—no matter how many times you say it, some people won't see it. Plenty of invalid sign-ups still slipped through.)_

View File

@@ -89,5 +89,5 @@ Customize your evaluation with these options:
## Learn more
- Notebook with endtoend examples: https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb
- Docs: https://docs.trycua.com/docs/agent-sdk/integrations/hud
- Docs: https://cua.ai/docs/agent-sdk/integrations/hud
- Live traces: https://app.hud.so

View File

@@ -216,4 +216,4 @@ Ready to put humans back in the loop? The most sophisticated AI system knows whe
---
_Questions about human-in-the-loop agents? Join the conversation in our [Discord community](https://discord.gg/cua-ai) or check out our [documentation](https://docs.trycua.com/docs/agent-sdk/supported-agents/human-in-the-loop)._
_Questions about human-in-the-loop agents? Join the conversation in our [Discord community](https://discord.gg/cua-ai) or check out our [documentation](https://cua.ai/docs/agent-sdk/supported-agents/human-in-the-loop)._

View File

@@ -32,7 +32,7 @@ The result? **Instant deployment** in seconds instead of hours, with no infrastr
### Step 1: Get Your API Key
Sign up at [**trycua.com**](https://trycua.com) to get your API key.
Sign up at [**cua.ai**](https://cua.ai) to get your API key.
```bash
# Set your API key in environment variables
@@ -226,6 +226,6 @@ Stay tuned for updates and join our [**Discord**](https://discord.gg/cua-ai) to
Ready to deploy your Computer-Use Agents in the cloud?
Visit [**trycua.com**](https://trycua.com) to sign up and get your API key. Join our [**Discord community**](https://discord.gg/cua-ai) for support and explore more examples on [**GitHub**](https://github.com/trycua/cua).
Visit [**cua.ai**](https://cua.ai) to sign up and get your API key. Join our [**Discord community**](https://discord.gg/cua-ai) for support and explore more examples on [**GitHub**](https://github.com/trycua/cua).
Happy RPA 2.0! 🚀

View File

@@ -174,7 +174,7 @@ Apple's announcement confirms we're on the right path. Here's what we're looking
- [Apple Containerization Framework](https://github.com/apple/containerization)
- [Lume - Direct VM Management](https://github.com/trycua/cua/tree/main/libs/lume)
- [Lumier - Docker Interface for VMs](https://github.com/trycua/cua/tree/main/libs/lumier)
- [Cua Cloud Sandbox](https://trycua.com)
- [Cua Cloud Sandbox](https://cua.ai)
- [Join our Discord](https://discord.gg/cua-ai)
---

View File

@@ -32,7 +32,7 @@ The viewer allows you to see exactly what your agent observed and how it interac
## Opening Trajectory Viewer in 3 Simple Steps
1. **Visit**: Open your browser and go to [https://www.trycua.com/trajectory-viewer](https://www.trycua.com/trajectory-viewer).
1. **Visit**: Open your browser and go to [https://cua.ai/trajectory-viewer](https://cua.ai/trajectory-viewer).
2. **Upload**: Drag and drop a trajectories folder or click Select Folder.
3. **Explore**: View your agent's trajectories! All data stays in your browser unless you give permission otherwise.

View File

@@ -174,10 +174,10 @@ await computer.run()
## Links
- **Docker Provider Docs:** [https://docs.trycua.com/computers/docker](https://docs.trycua.com/computers/docker)
- **Docker Provider Docs:** [https://cua.ai/docs/computers/docker](https://cua.ai/docs/computers/docker)
- **KasmVNC:** [https://github.com/kasmtech/KasmVNC](https://github.com/kasmtech/KasmVNC)
- **Container Source:** [https://github.com/trycua/cua/tree/main/libs/kasm](https://github.com/trycua/cua/tree/main/libs/kasm)
- **Computer SDK:** [https://docs.trycua.com/docs/computer-sdk/computers](https://docs.trycua.com/docs/computer-sdk/computers)
- **Computer SDK:** [https://cua.ai/docs/computer-sdk/computers](https://cua.ai/docs/computer-sdk/computers)
- **Discord:** [https://discord.gg/cua-ai](https://discord.gg/cua-ai)
Questions or weird edge cases? Ping us on Discord—we're curious to see what you build.

View File

@@ -144,7 +144,11 @@ The `ComputerAgent` constructor provides a wide range of options for customizing
If set (float or dict), adds a budget manager callback that tracks usage costs and stops execution if the budget is exceeded. Dict allows advanced options (e.g., `{ "max_budget": 5.0, "raise_error": True }`).
- `instructions` (`str` | `list[str]`):
System instructions for the agent. Can be a single string or multiple strings in a tuple/list for readability; they are concatenated into one system prompt.
- `**kwargs` (`any`):
- `api_key` (`str`):
Optional API key override for the model provider.
- `api_base` (`str`):
Optional API base URL override for the model provider.
- `**additional_generation_kwargs` (`any`):
Any additional keyword arguments are passed through to the agent loop or model provider.
**Example with advanced options:**
@@ -168,7 +172,9 @@ agent = ComputerAgent(
instructions=(
"You are a helpful computer-using agent"
"Output computer calls until you complete the given task"
)
),
api_key="your-api-key",
api_base="https://your-api-base.com/v1",
)
```

View File

@@ -37,7 +37,7 @@ agent = ComputerAgent(
## View Trajectories Online
View trajectories in the browser at:
**[trycua.com/trajectory-viewer](http://trycua.com/trajectory-viewer)**
**[cua.ai/trajectory-viewer](https://cua.ai/trajectory-viewer)**
The viewer provides:

View File

@@ -13,7 +13,7 @@ All examples require a CUA API key. You can obtain one from the [Dashboard](http
## List VMs
<Tabs items={["Python", "curl"]}>
<Tabs items={['Python', 'curl']}>
<Tab value="Python">
```python
@@ -72,6 +72,8 @@ Status values:
- `terminated`: VM has been permanently destroyed
- `failed`: VM deployment or operation failed
---
</Tab>
</Tabs>

View File

@@ -18,7 +18,7 @@ Execute shell commands and get detailed results:
# Run shell command
result = await computer.interface.run_command(cmd) # result.stdout, result.stderr, result.returncode
```
</Tab>
<Tab value="TypeScript">
@@ -30,6 +30,63 @@ Execute shell commands and get detailed results:
</Tab>
</Tabs>
## Window Management
Control application launching and windows:
<Tabs items={['Python', 'TypeScript']}>
<Tab value="Python">
```python
# Launch applications
await computer.interface.launch("xfce4-terminal")
await computer.interface.launch("libreoffice --writer")
await computer.interface.open("https://www.google.com")
# Window management
windows = await computer.interface.get_application_windows("xfce4-terminal")
window_id = windows[0]
await computer.interface.activate_window(window_id)
window_id = await computer.interface.get_current_window_id() # get the current active window id
await computer.interface.window_size(window_id)
await computer.interface.get_window_title(window_id)
await computer.interface.get_window_position(window_id)
await computer.interface.set_window_size(window_id, 1200, 800)
await computer.interface.set_window_position(window_id, 100, 100)
await computer.interface.maximize_window(window_id)
await computer.interface.minimize_window(window_id)
await computer.interface.close_window(window_id)
```
</Tab>
<Tab value="TypeScript">
```typescript
// Launch applications
await computer.interface.launch("xfce4-terminal");
await computer.interface.launch("libreoffice --writer");
await computer.interface.open("https://www.google.com");
// Window management
const windows = await computer.interface.getApplicationWindows("xfce4-terminal");
let windowId = windows[0];
await computer.interface.activateWindow(windowId);
windowId = await computer.interface.getCurrentWindowId(); // current active window id
await computer.interface.getWindowSize(windowId);
await computer.interface.getWindowName(windowId);
await computer.interface.getWindowPosition(windowId);
await computer.interface.setWindowSize(windowId, 1200, 800);
await computer.interface.setWindowPosition(windowId, 100, 100);
await computer.interface.maximizeWindow(windowId);
await computer.interface.minimizeWindow(windowId);
await computer.interface.closeWindow(windowId);
```
</Tab>
</Tabs>
## Mouse Actions
Precise mouse control and interaction:
@@ -162,6 +219,35 @@ Screen capture and display information:
</Tab>
</Tabs>
## Desktop Actions
Control desktop environment features like wallpaper:
<Tabs items={['Python', 'TypeScript']}>
<Tab value="Python">
```python
# Get current desktop environment (e.g., 'xfce4', 'gnome', 'kde', 'mac', 'windows')
env = await computer.interface.get_desktop_environment()
print(env) # "xfce4"
# Set desktop wallpaper to an image file accessible on the VM
await computer.interface.set_wallpaper("/home/cua/shared/wallpaper.png")
```
</Tab>
<Tab value="TypeScript">
```typescript
// Get current desktop environment
const env = await computer.interface.getDesktopEnvironment();
console.log(env); // "xfce4"
// Set desktop wallpaper to an image file accessible on the VM
await computer.interface.setWallpaper('/home/cua/shared/wallpaper.png');
```
</Tab>
</Tabs>
## Clipboard Actions
System clipboard management:

View File

@@ -23,7 +23,7 @@ Cua Computers are preconfigured virtual machines running the Computer Server. Th
**Easiest & safest way to get started - works on any host OS**
This is a Cloud Sandbox running the Computer Server. Get a container at [trycua.com](https://www.trycua.com/).
This is a Cloud Sandbox running the Computer Server. Get a container at [cua.ai](https://cua.ai/).
<Tabs items={['Python', 'TypeScript']}>
<Tab value="Python">

View File

@@ -1,5 +1,5 @@
{
"title": "Computer SDK",
"description": "Build computer-using agents with the Computer SDK",
"pages": ["computers", "cloud-vm-management", "commands", "computer-ui", "sandboxed-python"]
"pages": ["computers", "commands", "computer-ui", "tracing-api", "sandboxed-python"]
}

View File

@@ -0,0 +1,350 @@
---
title: Computer Tracing API
description: Record computer interactions for debugging, training, and analysis
---
# Computer Tracing API
The Computer tracing API provides a powerful way to record computer interactions for debugging, training, analysis, and compliance purposes. Inspired by Playwright's tracing functionality, it offers flexible recording options and standardized output formats.
<Callout>
The tracing API addresses GitHub issue #299 by providing a unified recording interface that works
with any Computer usage pattern, not just ComputerAgent.
</Callout>
## Overview
The tracing API allows you to:
- Record screenshots at key moments
- Log all API calls and their results
- Capture accessibility tree snapshots
- Add custom metadata
- Export recordings in standardized formats
- Support for both automated and human-in-the-loop workflows
## Basic Usage
### Starting and Stopping Traces
```python
from computer import Computer
computer = Computer(os_type="macos")
await computer.run()
# Start tracing with default options
await computer.tracing.start()
# Perform some operations
await computer.interface.left_click(100, 200)
await computer.interface.type_text("Hello, World!")
await computer.interface.press_key("enter")
# Stop tracing and save
trace_path = await computer.tracing.stop()
print(f"Trace saved to: {trace_path}")
```
### Custom Configuration
```python
# Start tracing with custom configuration
await computer.tracing.start({
'video': False, # Record video frames
'screenshots': True, # Record screenshots (default: True)
'api_calls': True, # Record API calls (default: True)
'accessibility_tree': True, # Record accessibility snapshots
'metadata': True, # Allow custom metadata (default: True)
'name': 'my_custom_trace', # Custom trace name
'path': './my_traces' # Custom output directory
})
# Add custom metadata during tracing
await computer.tracing.add_metadata('user_id', 'user123')
await computer.tracing.add_metadata('test_case', 'login_flow')
# Stop with custom options
trace_path = await computer.tracing.stop({
'path': './exports/trace.zip',
'format': 'zip' # 'zip' or 'dir'
})
```
## Configuration Options
### Start Options
| Option | Type | Default | Description |
| -------------------- | ---- | -------------- | ------------------------------------- |
| `video` | bool | `False` | Record video frames (future feature) |
| `screenshots` | bool | `True` | Capture screenshots after key actions |
| `api_calls` | bool | `True` | Log all interface method calls |
| `accessibility_tree` | bool | `False` | Record accessibility tree snapshots |
| `metadata` | bool | `True` | Enable custom metadata recording |
| `name` | str | auto-generated | Custom name for the trace |
| `path` | str | auto-generated | Custom directory for trace files |
### Stop Options
| Option | Type | Default | Description |
| -------- | ---- | -------------- | ---------------------------------- |
| `path` | str | auto-generated | Custom output path for final trace |
| `format` | str | `'zip'` | Output format: `'zip'` or `'dir'` |
## Use Cases
### Custom Agent Development
```python
from computer import Computer
async def test_custom_agent():
computer = Computer(os_type="linux")
await computer.run()
# Start tracing for this test session
await computer.tracing.start({
'name': 'custom_agent_test',
'screenshots': True,
'accessibility_tree': True
})
# Your custom agent logic here
screenshot = await computer.interface.screenshot()
await computer.interface.left_click(500, 300)
await computer.interface.type_text("test input")
# Add context about what the agent is doing
await computer.tracing.add_metadata('action', 'filling_form')
await computer.tracing.add_metadata('confidence', 0.95)
# Save the trace
trace_path = await computer.tracing.stop()
return trace_path
```
### Training Data Collection
```python
async def collect_training_data():
computer = Computer(os_type="macos")
await computer.run()
tasks = [
"open_browser_and_search",
"create_document",
"send_email"
]
for task in tasks:
# Start a new trace for each task
await computer.tracing.start({
'name': f'training_{task}',
'screenshots': True,
'accessibility_tree': True,
'metadata': True
})
# Add task metadata
await computer.tracing.add_metadata('task_type', task)
await computer.tracing.add_metadata('difficulty', 'beginner')
# Perform the task (automated or human-guided)
await perform_task(computer, task)
# Save this training example
await computer.tracing.stop({
'path': f'./training_data/{task}.zip'
})
```
### Human-in-the-Loop Recording
```python
async def record_human_demonstration():
computer = Computer(os_type="windows")
await computer.run()
# Start recording human demonstration
await computer.tracing.start({
'name': 'human_demo_excel_workflow',
'screenshots': True,
'api_calls': True, # Will capture any programmatic actions
'metadata': True
})
print("Trace recording started. Perform your demonstration...")
print("The system will record all computer interactions.")
# Add metadata about the demonstration
await computer.tracing.add_metadata('demonstrator', 'expert_user')
await computer.tracing.add_metadata('workflow', 'excel_data_analysis')
# Human performs actions manually or through other tools
# Tracing will still capture any programmatic interactions
input("Press Enter when demonstration is complete...")
# Stop and save the demonstration
trace_path = await computer.tracing.stop()
print(f"Human demonstration saved to: {trace_path}")
```
### RPA Debugging
```python
async def debug_rpa_workflow():
computer = Computer(os_type="linux")
await computer.run()
# Start tracing with full debugging info
await computer.tracing.start({
'name': 'rpa_debug_session',
'screenshots': True,
'accessibility_tree': True,
'api_calls': True
})
try:
# Your RPA workflow
await rpa_login_sequence(computer)
await rpa_data_entry(computer)
await rpa_generate_report(computer)
await computer.tracing.add_metadata('status', 'success')
except Exception as e:
# Record the error in the trace
await computer.tracing.add_metadata('error', str(e))
await computer.tracing.add_metadata('status', 'failed')
raise
finally:
# Always save the debug trace
trace_path = await computer.tracing.stop()
print(f"Debug trace saved to: {trace_path}")
```
## Output Format
### Directory Structure
When using `format='dir'`, traces are saved with this structure:
```
trace_20240922_143052_abc123/
├── trace_metadata.json # Overall trace information
├── event_000001_trace_start.json
├── event_000002_api_call.json
├── event_000003_api_call.json
├── 000001_initial_screenshot.png
├── 000002_after_left_click.png
├── 000003_after_type_text.png
└── event_000004_trace_end.json
```
### Metadata Format
The `trace_metadata.json` contains:
```json
{
"trace_id": "trace_20240922_143052_abc123",
"config": {
"screenshots": true,
"api_calls": true,
"accessibility_tree": false,
"metadata": true
},
"start_time": 1695392252.123,
"end_time": 1695392267.456,
"duration": 15.333,
"total_events": 12,
"screenshot_count": 5,
"events": [...] // All events in chronological order
}
```
### Event Format
Individual events follow this structure:
```json
{
"type": "api_call",
"timestamp": 1695392255.789,
"relative_time": 3.666,
"data": {
"method": "left_click",
"args": { "x": 100, "y": 200, "delay": null },
"result": null,
"error": null,
"screenshot": "000002_after_left_click.png",
"success": true
}
}
```
## Integration with ComputerAgent
The tracing API works seamlessly with existing ComputerAgent workflows:
```python
from agent import ComputerAgent
from computer import Computer
# Create computer and start tracing
computer = Computer(os_type="macos")
await computer.run()
await computer.tracing.start({
'name': 'agent_with_tracing',
'screenshots': True,
'metadata': True
})
# Create agent using the same computer
agent = ComputerAgent(
model="openai/computer-use-preview",
tools=[computer]
)
# Agent operations will be automatically traced
async for _ in agent.run("open cua.ai and navigate to docs"):
pass
# Save the combined trace
trace_path = await computer.tracing.stop()
```
## Privacy Considerations
The tracing API is designed with privacy in mind:
- Clipboard content is not recorded (only content length)
- Screenshots can be disabled
- Sensitive text input can be filtered
- Custom metadata allows you to control what information is recorded
## Comparison with ComputerAgent Trajectories
| Feature | ComputerAgent Trajectories | Computer.tracing |
| ---------------------- | -------------------------- | -------------------- |
| **Scope** | ComputerAgent only | Any Computer usage |
| **Flexibility** | Fixed format | Configurable options |
| **Custom Agents** | Not supported | Fully supported |
| **Human-in-the-loop** | Limited | Full support |
| **Real-time Control** | No | Start/stop anytime |
| **Output Format** | Agent-specific | Standardized |
| **Accessibility Data** | No | Optional |
## Best Practices
1. **Start tracing early**: Begin recording before your main workflow to capture the complete session
2. **Use meaningful names**: Provide descriptive trace names for easier organization
3. **Add contextual metadata**: Include information about what you're testing or demonstrating
4. **Handle errors gracefully**: Always stop tracing in a finally block
5. **Choose appropriate options**: Only record what you need to minimize overhead
6. **Organize output**: Use custom paths to organize traces by project or use case
The Computer tracing API provides a powerful foundation for recording, analyzing, and improving computer automation workflows across all use cases.

View File

@@ -30,7 +30,7 @@ Choose how you want to run your Cua computer. **Cloud Sandbox is recommended** f
**Easiest & safest way to get started - works on any host OS**
1. Go to [trycua.com/signin](https://www.trycua.com/signin)
1. Go to [cua.ai/signin](https://cua.ai/signin)
2. Navigate to **Dashboard > Containers > Create Instance**
3. Create a **Medium, Ubuntu 22** container
4. Note your container name and API key
@@ -312,7 +312,7 @@ python -m agent.cli omniparser+ollama_chat/llama3.2:latest
If you haven't set up environment variables, the CLI will guide you through the setup:
1. **Sandbox Name**: Enter your Cua sandbox name (or get one at [trycua.com](https://www.trycua.com/))
1. **Sandbox Name**: Enter your Cua sandbox name (or get one at [cua.ai](https://cua.ai/))
2. **CUA API Key**: Enter your Cua API key
3. **Provider API Key**: Enter your AI provider API key (OpenAI, Anthropic, etc.)

View File

@@ -24,7 +24,7 @@ You can run your Cua computer in the cloud (recommended for easiest setup), loca
Cua Cloud Sandbox provides virtual machines that run Ubuntu.
1. Go to [trycua.com/signin](https://www.trycua.com/signin)
1. Go to [cua.ai/signin](https://cua.ai/signin)
2. Navigate to **Dashboard > Containers > Create Instance**
3. Create a **Medium, Ubuntu 22** sandbox
4. Note your sandbox name and API key

View File

@@ -19,6 +19,7 @@
"posthog-js": "^1.276.0",
"react": "^19.1.0",
"react-dom": "^19.1.0",
"react-icons": "^5.5.0",
"remark": "^15.0.1",
"remark-gfm": "^4.0.1",
"remark-mdx": "^3.1.0",

12
docs/pnpm-lock.yaml generated
View File

@@ -38,6 +38,9 @@ importers:
react-dom:
specifier: ^19.1.0
version: 19.1.0(react@19.1.0)
react-icons:
specifier: ^5.5.0
version: 5.5.0(react@19.1.0)
remark:
specifier: ^15.0.1
version: 15.0.1
@@ -2054,6 +2057,11 @@ packages:
peerDependencies:
react: ^19.1.0
react-icons@5.5.0:
resolution: {integrity: sha512-MEFcXdkP3dLo8uumGI5xN3lDFNsRtrjbOEKDLD7yv76v4wpnEq2Lt2qeHaQOr34I/wPN3s3+N08WkQ+CW37Xiw==}
peerDependencies:
react: '*'
react-medium-image-zoom@5.2.14:
resolution: {integrity: sha512-nfTVYcAUnBzXQpPDcZL+cG/e6UceYUIG+zDcnemL7jtAqbJjVVkA85RgneGtJeni12dTyiRPZVM6Szkmwd/o8w==}
peerDependencies:
@@ -4622,6 +4630,10 @@ snapshots:
react: 19.1.0
scheduler: 0.26.0
react-icons@5.5.0(react@19.1.0):
dependencies:
react: 19.1.0
react-medium-image-zoom@5.2.14(react-dom@19.1.0(react@19.1.0))(react@19.1.0):
dependencies:
react: 19.1.0

View File

@@ -10,6 +10,7 @@ import type { Metadata } from 'next';
import Link from 'next/link';
import { notFound, redirect } from 'next/navigation';
import { PageFeedback } from '@/components/page-feedback';
import { DocActionsMenu } from '@/components/doc-actions-menu';
export default async function Page(props: { params: Promise<{ slug?: string[] }> }) {
const params = await props.params;
@@ -177,14 +178,26 @@ export default async function Page(props: { params: Promise<{ slug?: string[] }>
);
};
const tocFooter = () => {
return (
<div className="mt-4">
<DocActionsMenu pageUrl={page.url} pageTitle={page.data.title} filePath={page.file.path} />
</div>
);
};
return (
<DocsPage toc={page.data.toc} tableOfContent={{ header: tocHeader() }} full={page.data.full}>
<DocsPage
toc={page.data.toc}
tableOfContent={{ header: tocHeader(), footer: tocFooter() }}
full={page.data.full}
>
<div className="flex flex-row w-full items-start">
<div className="flex-1">
<div className="flex flex-row w-full">
<DocsTitle>{page.data.title}</DocsTitle>
<div className="ml-auto">
<div className="ml-auto flex items-center gap-2">
{apiSection && versionItems.length > 1 && (
<Popover>
<PopoverTrigger
@@ -273,15 +286,99 @@ export async function generateMetadata(props: {
if (page.url.includes('api')) title = `${page.data.title} | Cua API Docs`;
if (page.url.includes('guide')) title = ` Guide: ${page.data.title} | Cua Docs`;
// Canonical URL points to cua.ai to consolidate all SEO authority on main domain
const canonicalUrl = `https://cua.ai${page.url}`;
// Extract keywords from the page for SEO
const keywords = [
'computer use agent',
'computer use',
'AI automation',
'visual automation',
page.data.title,
];
// Structured data for better Google indexing (TechArticle schema)
const structuredData = {
'@context': 'https://schema.org',
'@type': 'TechArticle',
headline: page.data.title,
description: page.data.description,
url: canonicalUrl,
publisher: {
'@type': 'Organization',
name: 'Cua',
url: 'https://cua.ai',
logo: {
'@type': 'ImageObject',
url: 'https://cua.ai/cua_logo_black.svg',
},
},
mainEntityOfPage: {
'@type': 'WebPage',
'@id': canonicalUrl,
},
};
// Breadcrumb schema for better site structure understanding
const breadcrumbSchema = {
'@context': 'https://schema.org',
'@type': 'BreadcrumbList',
itemListElement: [
{
'@type': 'ListItem',
position: 1,
name: 'Cua',
item: 'https://cua.ai',
},
{
'@type': 'ListItem',
position: 2,
name: 'Documentation',
item: 'https://cua.ai/docs',
},
{
'@type': 'ListItem',
position: 3,
name: page.data.title,
item: canonicalUrl,
},
],
};
return {
title,
description: page.data.description,
keywords,
authors: [{ name: 'Cua', url: 'https://cua.ai' }],
robots: {
index: true,
follow: true,
googleBot: {
index: true,
follow: true,
'max-image-preview': 'large',
'max-snippet': -1,
},
},
alternates: {
canonical: canonicalUrl,
},
openGraph: {
title,
description: page.data.description,
type: 'article',
siteName: 'Cua Docs',
url: 'https://trycua.com/docs',
url: canonicalUrl,
},
twitter: {
card: 'summary',
title,
description: page.data.description,
creator: '@trycua',
},
other: {
'script:ld+json': JSON.stringify([structuredData, breadcrumbSchema]),
},
};
}

View File

@@ -41,15 +41,15 @@ export const baseOptions: BaseLayoutProps = {
githubUrl: 'https://github.com/trycua/cua',
links: [
{
url: 'https://trycua.com',
text: 'Cua home',
url: 'https://cua.ai',
text: 'Cua Home',
type: 'icon',
icon: <HomeIcon />,
external: false,
external: true,
},
{
url: 'https://discord.com/invite/mVnXXpdE85',
text: 'Cua discord',
text: 'Discord',
type: 'icon',
icon: (
<>
@@ -69,6 +69,7 @@ export const baseOptions: BaseLayoutProps = {
/>
</>
),
external: true,
},
],
};

13
docs/src/app/robots.ts Normal file
View File

@@ -0,0 +1,13 @@
import { MetadataRoute } from 'next';
/**
 * Next.js metadata route: generates /robots.txt for the docs site.
 *
 * Allows all crawlers to index everything (including /llms.txt), and points
 * them at the sitemap hosted on the canonical cua.ai domain.
 */
export default function robots(): MetadataRoute.Robots {
  // One crawl policy applied to every user agent; nothing is disallowed.
  const crawlRules = {
    userAgent: '*',
    allow: ['/', '/llms.txt'],
    disallow: [],
  };

  return {
    rules: crawlRules,
    sitemap: 'https://cua.ai/docs/sitemap.xml',
    host: 'https://cua.ai',
  };
}

32
docs/src/app/sitemap.ts Normal file
View File

@@ -0,0 +1,32 @@
import { MetadataRoute } from 'next';
import { source } from '@/lib/source';
/**
 * Next.js metadata route: generates /sitemap.xml for the docs site.
 *
 * Pulls every page from the fumadocs source, normalizes its URL to live under
 * the /docs prefix on the canonical cua.ai domain, and ensures the /docs
 * landing page is present exactly once with top priority.
 */
export default function sitemap(): MetadataRoute.Sitemap {
  const baseUrl = 'https://cua.ai';

  // All documentation pages known to the fumadocs source.
  const pages = source.getPages();

  // Map pages to sitemap entries, forcing every URL under the /docs prefix.
  const docPages = pages.map((page) => {
    const url = page.url.startsWith('/docs') ? page.url : `/docs${page.url}`;
    return {
      url: `${baseUrl}${url}`,
      lastModified: new Date(),
      changeFrequency: 'weekly' as const,
      // The docs landing page is the highest-priority entry.
      priority: url === '/docs' ? 1.0 : 0.8,
    };
  });

  // Only prepend the main /docs landing page when the source did not already
  // produce it — otherwise the sitemap would contain a duplicate URL.
  const mainDocsUrl = `${baseUrl}/docs`;
  if (docPages.some((entry) => entry.url === mainDocsUrl)) {
    return docPages;
  }

  const mainDocsPage = {
    url: mainDocsUrl,
    lastModified: new Date(),
    changeFrequency: 'weekly' as const,
    priority: 1.0,
  };

  return [mainDocsPage, ...docPages];
}

View File

@@ -0,0 +1,126 @@
'use client';
import { useState } from 'react';
import { SiOpenai, SiAnthropic, SiMarkdown, SiGithub } from 'react-icons/si';
import posthog from 'posthog-js';
interface DocActionsMenuProps {
pageUrl: string;
pageTitle: string;
filePath: string;
}
// Shared styling for every action button in the menu (was duplicated four times).
const ACTION_BUTTON_CLASS =
  'inline-flex gap-3 w-full items-center rounded-md p-1 text-sm hover:bg-fd-accent hover:text-fd-accent-foreground text-left transition-colors px-2 hover:cursor-pointer';

/**
 * Per-page actions menu for a docs page: copy the page as raw markdown,
 * edit it on GitHub, or open it in ChatGPT / Claude with a prefilled prompt.
 * Every action is reported to PostHog for analytics.
 *
 * Props:
 * - pageUrl:   site-relative URL of the docs page (e.g. "/docs/foo").
 * - pageTitle: human-readable page title, embedded in assistant prompts.
 * - filePath:  path of the MDX source under docs/content/docs/ in the repo.
 */
export function DocActionsMenu({ pageUrl, pageTitle, filePath }: DocActionsMenuProps) {
  const [copied, setCopied] = useState(false);

  // Canonical public URL for this page, tagged so inbound traffic is attributable.
  const docUrl = `https://cua.ai${pageUrl}?utm_source=cua.ai/docs`;
  // Prompt handed to external AI assistants (identical for ChatGPT and Claude).
  const assistantPrompt = `I need help understanding this cua.ai documentation page: "${pageTitle}". Please read and help me with: ${docUrl}`;

  // Show the "Copied!" label for two seconds.
  const flashCopied = () => {
    setCopied(true);
    setTimeout(() => setCopied(false), 2000);
  };

  const handleCopyMarkdown = async () => {
    try {
      // Fetch the raw MDX straight from the main branch on GitHub.
      const githubRawUrl = `https://raw.githubusercontent.com/trycua/cua/refs/heads/main/docs/content/docs/${filePath}`;
      const response = await fetch(githubRawUrl);
      if (!response.ok) {
        throw new Error('Failed to fetch markdown');
      }
      const markdown = await response.text();
      await navigator.clipboard.writeText(markdown);
      flashCopied();
      posthog.capture('docs_copy_markdown_clicked', {
        page: pageUrl,
        page_title: pageTitle,
        success: true,
      });
    } catch (error) {
      console.error('Error copying markdown:', error);
      // Fallback: copy the page URL instead of the raw markdown.
      try {
        await navigator.clipboard.writeText(docUrl);
        flashCopied();
      } catch (fallbackError) {
        console.error('Error copying URL:', fallbackError);
      }
      posthog.capture('docs_copy_markdown_clicked', {
        page: pageUrl,
        page_title: pageTitle,
        success: false,
        error: error instanceof Error ? error.message : 'Unknown error',
      });
    }
  };

  const handleEditGithub = () => {
    posthog.capture('docs_edit_github_clicked', {
      page: pageUrl,
      page_title: pageTitle,
    });
    const githubEditUrl = `https://github.com/trycua/cua/edit/main/docs/content/docs/${filePath}`;
    window.open(githubEditUrl, '_blank', 'noopener,noreferrer');
  };

  const handleOpenChatGPT = () => {
    posthog.capture('docs_open_chatgpt_clicked', {
      page: pageUrl,
      page_title: pageTitle,
    });
    const chatgptUrl = `https://chatgpt.com/?q=${encodeURIComponent(assistantPrompt)}`;
    window.open(chatgptUrl, '_blank', 'noopener,noreferrer');
  };

  const handleOpenClaude = () => {
    posthog.capture('docs_open_claude_clicked', {
      page: pageUrl,
      page_title: pageTitle,
    });
    const claudeUrl = `https://claude.ai/new?q=${encodeURIComponent(assistantPrompt)}`;
    window.open(claudeUrl, '_blank', 'noopener,noreferrer');
  };

  return (
    <div className="flex flex-col gap-2">
      <button onClick={handleCopyMarkdown} className={ACTION_BUTTON_CLASS}>
        <SiMarkdown className="w-2 h-4 flex-shrink-0" />
        <span>{copied ? 'Copied!' : 'Copy as markdown'}</span>
      </button>
      <button onClick={handleEditGithub} className={ACTION_BUTTON_CLASS}>
        <SiGithub className="w-4 h-4 flex-shrink-0" />
        <span>Edit on GitHub</span>
      </button>
      <button onClick={handleOpenChatGPT} className={ACTION_BUTTON_CLASS}>
        <SiOpenai className="w-4 h-4 flex-shrink-0" />
        <span>Open in ChatGPT</span>
      </button>
      <button onClick={handleOpenClaude} className={ACTION_BUTTON_CLASS}>
        <SiAnthropic className="w-4 h-4 flex-shrink-0" />
        <span>Open in Claude</span>
      </button>
    </div>
  );
}

View File

@@ -1,15 +1,159 @@
export function Footer() {
return (
<footer className="mt-auto border-t border-fd-border py-4">
<div className="container mx-auto px-4 flex justify-end">
<a
href="https://www.cua.ai/cookie-policy"
target="_blank"
rel="noopener noreferrer"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Cookie Policy
</a>
<footer className="mt-auto border-t border-fd-border py-8">
<div className="container mx-auto px-4">
<div className="grid grid-cols-1 md:grid-cols-4 gap-8 mb-6">
{/* Product Links */}
<div>
<h3 className="font-semibold text-sm mb-3 text-fd-foreground">Product</h3>
<ul className="space-y-2">
<li>
<a
href="https://cua.ai"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Home
</a>
</li>
<li>
<a
href="https://cua.ai/pricing"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Pricing
</a>
</li>
<li>
<a
href="https://cua.ai/#features"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Features
</a>
</li>
</ul>
</div>
{/* Documentation Links */}
<div>
<h3 className="font-semibold text-sm mb-3 text-fd-foreground">Documentation</h3>
<ul className="space-y-2">
<li>
<a
href="/docs"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Getting Started
</a>
</li>
<li>
<a
href="/docs/agent-sdk/agent-loops"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Agent Loops
</a>
</li>
<li>
<a
href="/docs/quickstart-devs"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Quick Start
</a>
</li>
</ul>
</div>
{/* Resources Links */}
<div>
<h3 className="font-semibold text-sm mb-3 text-fd-foreground">Resources</h3>
<ul className="space-y-2">
<li>
<a
href="https://cua.ai/blog"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Blog
</a>
</li>
<li>
<a
href="https://github.com/trycua/cua"
target="_blank"
rel="noopener noreferrer"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
GitHub
</a>
</li>
<li>
<a
href="https://discord.com/invite/mVnXXpdE85"
target="_blank"
rel="noopener noreferrer"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Discord Community
</a>
</li>
</ul>
</div>
{/* Company Links */}
<div>
<h3 className="font-semibold text-sm mb-3 text-fd-foreground">Company</h3>
<ul className="space-y-2">
<li>
<a
href="https://cua.ai/about"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
About
</a>
</li>
<li>
<a
href="mailto:hello@trycua.com"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Contact
</a>
</li>
<li>
<a
href="https://cua.ai/cookie-policy"
target="_blank"
rel="noopener noreferrer"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Cookie Policy
</a>
</li>
</ul>
</div>
</div>
{/* Bottom Bar */}
<div className="pt-6 border-t border-fd-border flex flex-col md:flex-row justify-between items-center gap-4">
<p className="text-sm text-fd-muted-foreground">
© {new Date().getFullYear()} Cua. All rights reserved.
</p>
<div className="flex gap-4">
<a
href="https://cua.ai/privacy"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Privacy Policy
</a>
<a
href="https://cua.ai/terms"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Terms of Service
</a>
</div>
</div>
</div>
</footer>
);

View File

@@ -6,7 +6,7 @@ import 'dotenv/config';
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
const COMPUTER_USE_PROMPT = 'Open firefox and go to trycua.com';
const COMPUTER_USE_PROMPT = 'Open firefox and go to cua.ai';
// Initialize the Computer Connection
const computer = new Computer({

View File

@@ -38,8 +38,8 @@ def load_env_or_fail() -> None:
"""
Build Agent Config
- customize agent behavior, tool integration, callbacks, resource management, and more
- https://docs.trycua.com/docs/agent-sdk/agent-loops#parameters
- https://docs.trycua.com/docs/agent-sdk/supported-model-providers
- https://cua.ai/docs/agent-sdk/agent-loops#parameters
- https://cua.ai/docs/agent-sdk/supported-model-providers
"""
@@ -76,7 +76,7 @@ async def run_hud_eval() -> None:
"""
Customize your hud eval below, check the doc for additional params
- https://docs.trycua.com/docs/agent-sdk/integrations/hud#parameters-1
- https://cua.ai/docs/agent-sdk/integrations/hud#parameters-1
- recommend low max steps (5-10) for testing, then max 100 for benchmarking
    - also select specific tasks to run by splitting the dataset
"""

View File

@@ -0,0 +1,384 @@
"""
Examples demonstrating the Computer.tracing API for recording sessions.
This module shows various use cases for the new Computer.tracing functionality,
including training data collection, debugging, and compliance recording.
"""
import asyncio
import logging
from pathlib import Path
from agent import ComputerAgent
from computer import Computer
async def basic_tracing_example():
    """
    Basic example showing how to use Computer.tracing for recording a simple session.
    """
    print("=== Basic Tracing Example ===")

    # Boot a macOS VM via the lume provider.
    computer = Computer(os_type="macos", provider_type="lume")
    await computer.run()
    try:
        # Record screenshots, API calls, and metadata under the name "basic_session".
        trace_config = {
            "screenshots": True,
            "api_calls": True,
            "metadata": True,
            "name": "basic_session",
        }
        await computer.tracing.start(trace_config)
        print("Tracing started...")

        # A small scripted interaction to generate trace events.
        await computer.interface.move_cursor(100, 100)
        await computer.interface.left_click()
        await computer.interface.type_text("Hello, tracing!")
        await computer.interface.press_key("enter")

        # Attach free-form annotations to the trace.
        for key, value in (
            ("session_type", "basic_demo"),
            ("user_notes", "Testing basic functionality"),
        ):
            await computer.tracing.add_metadata(key, value)

        # Finish recording and bundle the trace into a zip archive.
        trace_path = await computer.tracing.stop({"format": "zip"})
        print(f"Trace saved to: {trace_path}")
    finally:
        await computer.stop()
async def agent_tracing_example():
    """
    Example showing how to use tracing with ComputerAgent for enhanced session recording.
    """
    print("=== Agent with Tracing Example ===")

    # Boot the VM that the agent will drive.
    computer = Computer(os_type="macos", provider_type="lume")
    await computer.run()
    try:
        # Capture everything, including the accessibility tree for training use.
        trace_config = {
            "screenshots": True,
            "api_calls": True,
            "accessibility_tree": True,  # Include accessibility data for training
            "metadata": True,
            "name": "agent_session",
        }
        await computer.tracing.start(trace_config)

        agent = ComputerAgent(
            model="openai/computer-use-preview", tools=[computer], verbosity=logging.INFO
        )

        # Annotate the trace with details about this agent session.
        for key, value in (
            ("agent_model", "openai/computer-use-preview"),
            ("task_type", "web_search"),
        ):
            await computer.tracing.add_metadata(key, value)

        # Stream the agent's messages while it works on the task.
        async for message in agent.run(
            "Open a web browser and search for 'computer use automation'"
        ):
            print(f"Agent: {message}")

        trace_path = await computer.tracing.stop({"format": "zip"})
        print(f"Agent trace saved to: {trace_path}")
    finally:
        await computer.stop()
async def custom_agent_tracing_example():
    """
    Example showing tracing with custom agent implementations.
    """
    print("=== Custom Agent Tracing Example ===")

    computer = Computer(os_type="macos", provider_type="lume")
    await computer.run()
    try:
        # Write the trace to an explicit directory instead of the default location.
        trace_dir = Path.cwd() / "custom_traces" / "my_agent_session"
        trace_config = {
            "screenshots": True,
            "api_calls": True,
            "accessibility_tree": False,
            "metadata": True,
            "path": str(trace_dir),
        }
        await computer.tracing.start(trace_config)

        # Custom agent logic using direct computer calls.
        await computer.tracing.add_metadata("session_type", "custom_agent")
        await computer.tracing.add_metadata("purpose", "RPA_workflow")

        # Take an initial screenshot before the workflow begins.
        screenshot = await computer.interface.screenshot()

        # Simulate an RPA-style workflow.
        await computer.interface.move_cursor(500, 300)
        await computer.interface.left_click()
        await computer.interface.type_text("automation workflow test")

        # Mark a checkpoint in the workflow.
        await computer.tracing.add_metadata("checkpoint", "text_input_complete")

        await computer.interface.hotkey("command", "a")  # Select all
        await computer.interface.hotkey("command", "c")  # Copy

        # Save the trace as a plain directory rather than a zip archive.
        trace_path = await computer.tracing.stop({"format": "dir"})
        print(f"Custom agent trace saved to: {trace_path}")
    finally:
        await computer.stop()
async def training_data_collection_example():
    """
    Example for collecting training data with rich context.
    """
    print("=== Training Data Collection Example ===")

    computer = Computer(os_type="macos", provider_type="lume")
    await computer.run()
    try:
        # Enable every capture channel that is useful for model training.
        await computer.tracing.start(
            {
                "screenshots": True,  # Essential for visual training
                "api_calls": True,  # Capture action sequences
                "accessibility_tree": True,  # Rich semantic context
                "metadata": True,  # Custom annotations
                "name": "training_session",
            }
        )

        # Label the trace so downstream tooling can classify it.
        for key, value in (
            ("data_type", "training"),
            ("task_category", "ui_automation"),
            ("difficulty", "intermediate"),
            ("annotator", "human_expert"),
        ):
            await computer.tracing.add_metadata(key, value)

        # Baseline screenshot before the demonstration begins.
        await computer.interface.screenshot()

        # Step 1: Navigate to application.
        await computer.tracing.add_metadata("step", "1_navigate_to_app")
        await computer.interface.move_cursor(100, 50)
        await computer.interface.left_click()

        # Step 2: Input data.
        await computer.tracing.add_metadata("step", "2_input_data")
        await computer.interface.type_text("training example data")

        # Step 3: Process.
        await computer.tracing.add_metadata("step", "3_process")
        await computer.interface.press_key("tab")
        await computer.interface.press_key("enter")

        # Record the outcome of the demonstration.
        await computer.tracing.add_metadata("success", True)
        await computer.tracing.add_metadata("completion_time", "45_seconds")

        trace_path = await computer.tracing.stop()
        print(f"Training data collected: {trace_path}")
    finally:
        await computer.stop()
async def debugging_session_example():
    """
    Example for debugging agent behavior with detailed tracing.
    """
    print("=== Debugging Session Example ===")

    computer = Computer(os_type="macos", provider_type="lume")
    await computer.run()
    try:
        # Full-fidelity capture so the failure can be replayed and inspected.
        await computer.tracing.start(
            {
                "screenshots": True,
                "api_calls": True,
                "accessibility_tree": True,
                "metadata": True,
                "name": "debug_session",
            }
        )

        # Describe the issue under investigation.
        for key, value in (
            ("session_type", "debugging"),
            ("issue", "click_target_detection"),
            ("expected_behavior", "click_on_button"),
        ):
            await computer.tracing.add_metadata(key, value)

        try:
            # The problematic sequence being debugged.
            await computer.interface.move_cursor(200, 150)
            await computer.interface.left_click()
            # This might fail - trace it either way.
            await computer.interface.type_text("debug test")
            await computer.tracing.add_metadata("action_result", "successful_typing")
        except Exception as e:
            # Record the failure details inside the trace itself.
            await computer.tracing.add_metadata("error_encountered", str(e))
            await computer.tracing.add_metadata("error_type", type(e).__name__)
            print(f"Error occurred: {e}")

        trace_path = await computer.tracing.stop()
        print(f"Debug trace saved: {trace_path}")
        print("Use this trace to analyze the failure and improve the agent")
    finally:
        await computer.stop()
async def human_in_the_loop_example():
    """
    Example for recording mixed human/agent sessions.
    """
    print("=== Human-in-the-Loop Example ===")

    computer = Computer(os_type="macos", provider_type="lume")
    await computer.run()
    try:
        # Record the whole collaboration under a single trace.
        await computer.tracing.start(
            {
                "screenshots": True,
                "api_calls": True,
                "metadata": True,
                "name": "human_agent_collaboration",
            }
        )

        # Phase 1: the agent acts autonomously.
        await computer.tracing.add_metadata("phase", "agent_autonomous")
        await computer.tracing.add_metadata("agent_model", "computer-use-preview")

        await computer.interface.move_cursor(300, 200)
        await computer.interface.left_click()
        await computer.interface.type_text("automated input")

        # Phase 2: a human takes over for the tricky part while tracing continues.
        await computer.tracing.add_metadata("phase", "human_intervention")
        await computer.tracing.add_metadata("intervention_reason", "complex_ui_element")
        print("Human intervention phase - manual actions will be recorded...")

        # Simulate human input (in practice, this would be actual human interaction).
        await computer.interface.move_cursor(500, 400)
        await computer.interface.double_click()
        await computer.tracing.add_metadata("human_action", "double_click_complex_element")

        # Phase 3: the agent finishes the task.
        await computer.tracing.add_metadata("phase", "agent_completion")
        await computer.interface.press_key("enter")

        trace_path = await computer.tracing.stop()
        print(f"Human-agent collaboration trace saved: {trace_path}")
    finally:
        await computer.stop()
async def performance_monitoring_example():
    """
    Example for performance monitoring and analysis.
    """
    print("=== Performance Monitoring Example ===")

    computer = Computer(os_type="macos", provider_type="lume")
    await computer.run()
    try:
        # Disable screenshots to keep tracing overhead minimal during the benchmark.
        await computer.tracing.start(
            {
                "screenshots": False,  # Skip screenshots for performance
                "api_calls": True,
                "metadata": True,
                "name": "performance_test",
            }
        )

        await computer.tracing.add_metadata("test_type", "performance_benchmark")
        await computer.tracing.add_metadata("expected_duration", "< 30 seconds")

        import time

        start_time = time.time()

        # Fire off 10 iterations of 4 actions each (40 actions total).
        for iteration in range(10):
            await computer.tracing.add_metadata("iteration", iteration)
            await computer.interface.move_cursor(100 + iteration * 50, 100)
            await computer.interface.left_click()
            await computer.interface.type_text(f"Test {iteration}")
            await computer.interface.press_key("tab")

        end_time = time.time()

        # Record wall-clock metrics; 40 = 10 iterations x 4 actions each.
        await computer.tracing.add_metadata(
            "actual_duration", f"{end_time - start_time:.2f} seconds"
        )
        await computer.tracing.add_metadata(
            "actions_per_second", f"{40 / (end_time - start_time):.2f}"
        )

        trace_path = await computer.tracing.stop()
        print(f"Performance trace saved: {trace_path}")
    finally:
        await computer.stop()
async def main():
    """
    Run all tracing examples.
    """
    print("Computer.tracing API Examples")
    print("=" * 50)

    # Every demo, executed sequentially; a failure in one does not stop the rest.
    examples = (
        basic_tracing_example,
        agent_tracing_example,
        custom_agent_tracing_example,
        training_data_collection_example,
        debugging_session_example,
        human_in_the_loop_example,
        performance_monitoring_example,
    )
    for example in examples:
        try:
            await example()
            print()
        except Exception as e:
            print(f"Error in {example.__name__}: {e}")
            print()

    print("All examples completed!")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -55,11 +55,11 @@ To get set up with Lume for development, read [these instructions](Development.m
## Docs
- [Installation](https://trycua.com/docs/libraries/lume/installation)
- [Prebuilt Images](https://trycua.com/docs/libraries/lume/prebuilt-images)
- [CLI Reference](https://trycua.com/docs/libraries/lume/cli-reference)
- [HTTP API](https://trycua.com/docs/libraries/lume/http-api)
- [FAQ](https://trycua.com/docs/libraries/lume/faq)
- [Installation](https://cua.ai/docs/libraries/lume/installation)
- [Prebuilt Images](https://cua.ai/docs/libraries/lume/prebuilt-images)
- [CLI Reference](https://cua.ai/docs/libraries/lume/cli-reference)
- [HTTP API](https://cua.ai/docs/libraries/lume/http-api)
- [FAQ](https://cua.ai/docs/libraries/lume/faq)
## Contributing

View File

@@ -58,14 +58,14 @@ docker run -it --rm \
After running the command above, you can access your macOS VM through a web browser (e.g., http://localhost:8006).
> **Note:** With the basic setup above, your VM will be reset when you stop the container (ephemeral mode). This means any changes you make inside the macOS VM will be lost. See [the documentation](https://trycua.com/docs/libraries/lumier/docker) for how to save your VM state.
> **Note:** With the basic setup above, your VM will be reset when you stop the container (ephemeral mode). This means any changes you make inside the macOS VM will be lost. See [the documentation](https://cua.ai/docs/libraries/lumier/docker) for how to save your VM state.
## Docs
- [Installation](https://trycua.com/docs/libraries/lumier/installation)
- [Docker](https://trycua.com/docs/libraries/lumier/docker)
- [Docker Compose](https://trycua.com/docs/libraries/lumier/docker-compose)
- [Building Lumier](https://trycua.com/docs/libraries/lumier/building-lumier)
- [Installation](https://cua.ai/docs/libraries/lumier/installation)
- [Docker](https://cua.ai/docs/libraries/lumier/docker)
- [Docker Compose](https://cua.ai/docs/libraries/lumier/docker-compose)
- [Building Lumier](https://cua.ai/docs/libraries/lumier/building-lumier)
## Credits

View File

@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.4.35
current_version = 0.4.37
commit = True
tag = True
tag_name = agent-v{new_version}

View File

@@ -72,16 +72,16 @@ if __name__ == "__main__":
## Docs
- [Agent Loops](https://trycua.com/docs/agent-sdk/agent-loops)
- [Supported Agents](https://trycua.com/docs/agent-sdk/supported-agents)
- [Supported Models](https://trycua.com/docs/agent-sdk/supported-models)
- [Chat History](https://trycua.com/docs/agent-sdk/chat-history)
- [Callbacks](https://trycua.com/docs/agent-sdk/callbacks)
- [Custom Tools](https://trycua.com/docs/agent-sdk/custom-tools)
- [Custom Computer Handlers](https://trycua.com/docs/agent-sdk/custom-computer-handlers)
- [Prompt Caching](https://trycua.com/docs/agent-sdk/prompt-caching)
- [Usage Tracking](https://trycua.com/docs/agent-sdk/usage-tracking)
- [Benchmarks](https://trycua.com/docs/agent-sdk/benchmarks)
- [Agent Loops](https://cua.ai/docs/agent-sdk/agent-loops)
- [Supported Agents](https://cua.ai/docs/agent-sdk/supported-agents/computer-use-agents)
- [Supported Models](https://cua.ai/docs/agent-sdk/supported-model-providers)
- [Chat History](https://cua.ai/docs/agent-sdk/chat-history)
- [Callbacks](https://cua.ai/docs/agent-sdk/callbacks)
- [Custom Tools](https://cua.ai/docs/agent-sdk/custom-tools)
- [Custom Computer Handlers](https://cua.ai/docs/agent-sdk/custom-computer-handlers)
- [Prompt Caching](https://cua.ai/docs/agent-sdk/prompt-caching)
- [Usage Tracking](https://cua.ai/docs/agent-sdk/usage-tracking)
- [Benchmarks](https://cua.ai/docs/agent-sdk/benchmarks)
## License

View File

@@ -185,7 +185,9 @@ class ComputerAgent:
max_trajectory_budget: Optional[float | dict] = None,
telemetry_enabled: Optional[bool] = True,
trust_remote_code: Optional[bool] = False,
**kwargs,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
**additional_generation_kwargs,
):
"""
Initialize ComputerAgent.
@@ -205,7 +207,9 @@ class ComputerAgent:
max_trajectory_budget: If set, adds BudgetManagerCallback to track usage costs and stop when budget is exceeded
telemetry_enabled: If set, adds TelemetryCallback to track anonymized usage data. Enabled by default.
trust_remote_code: If set, trust remote code when loading local models. Disabled by default.
**kwargs: Additional arguments passed to the agent loop
api_key: Optional API key override for the model provider
api_base: Optional API base URL override for the model provider
**additional_generation_kwargs: Additional arguments passed to the model provider
"""
# If the loop is "human/human", we need to prefix a grounding model fallback
if model in ["human/human", "human"]:
@@ -223,8 +227,10 @@ class ComputerAgent:
self.screenshot_delay = screenshot_delay
self.use_prompt_caching = use_prompt_caching
self.telemetry_enabled = telemetry_enabled
self.kwargs = kwargs
self.kwargs = additional_generation_kwargs
self.trust_remote_code = trust_remote_code
self.api_key = api_key
self.api_base = api_base
# == Add built-in callbacks ==
@@ -593,7 +599,12 @@ class ComputerAgent:
# ============================================================================
async def run(
self, messages: Messages, stream: bool = False, **kwargs
self,
messages: Messages,
stream: bool = False,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
**additional_generation_kwargs,
) -> AsyncGenerator[Dict[str, Any], None]:
"""
Run the agent with the given messages using Computer protocol handler pattern.
@@ -601,7 +612,9 @@ class ComputerAgent:
Args:
messages: List of message dictionaries
stream: Whether to stream the response
**kwargs: Additional arguments
api_key: Optional API key override for the model provider
api_base: Optional API base URL override for the model provider
**additional_generation_kwargs: Additional arguments passed to the model provider
Returns:
AsyncGenerator that yields response chunks
@@ -617,8 +630,12 @@ class ComputerAgent:
await self._initialize_computers()
# Merge kwargs
merged_kwargs = {**self.kwargs, **kwargs}
# Merge kwargs and thread api credentials (run overrides constructor)
merged_kwargs = {**self.kwargs, **additional_generation_kwargs}
if (api_key is not None) or (self.api_key is not None):
merged_kwargs["api_key"] = api_key if api_key is not None else self.api_key
if (api_base is not None) or (self.api_base is not None):
merged_kwargs["api_base"] = api_base if api_base is not None else self.api_base
old_items = self._process_input(messages)
new_items = []
@@ -728,8 +745,14 @@ class ComputerAgent:
if not self.computer_handler:
raise ValueError("Computer tool or image_b64 is required for predict_click")
image_b64 = await self.computer_handler.screenshot()
# Pass along api credentials if available
click_kwargs: Dict[str, Any] = {}
if self.api_key is not None:
click_kwargs["api_key"] = self.api_key
if self.api_base is not None:
click_kwargs["api_base"] = self.api_base
return await self.agent_loop.predict_click(
model=self.model, image_b64=image_b64, instruction=instruction
model=self.model, image_b64=image_b64, instruction=instruction, **click_kwargs
)
return None

View File

@@ -297,6 +297,20 @@ Examples:
help="Maximum number of retries for the LLM API calls",
)
# Provider override credentials
parser.add_argument(
"--api-key",
dest="api_key",
type=str,
help="API key override for the model provider (passed to ComputerAgent)",
)
parser.add_argument(
"--api-base",
dest="api_base",
type=str,
help="API base URL override for the model provider (passed to ComputerAgent)",
)
args = parser.parse_args()
# Check for required environment variables
@@ -307,7 +321,7 @@ Examples:
if not container_name:
if args.provider == "cloud":
print_colored("CUA_CONTAINER_NAME not set.", dim=True)
print_colored("You can get a CUA container at https://www.trycua.com/", dim=True)
print_colored("You can get a CUA container at https://cua.ai/", dim=True)
container_name = input("Enter your CUA container name: ").strip()
if not container_name:
print_colored("❌ Container name is required.")
@@ -380,6 +394,12 @@ Examples:
"max_retries": args.max_retries,
}
# Thread API credentials to agent if provided
if args.api_key:
agent_kwargs["api_key"] = args.api_key
if args.api_base:
agent_kwargs["api_base"] = args.api_base
if args.images > 0:
agent_kwargs["only_n_most_recent_images"] = args.images

View File

@@ -28,8 +28,12 @@ class AsyncComputerHandler(Protocol):
"""Get screen dimensions as (width, height)."""
...
async def screenshot(self) -> str:
"""Take a screenshot and return as base64 string."""
async def screenshot(self, text: Optional[str] = None) -> str:
"""Take a screenshot and return as base64 string.
Args:
text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
"""
...
async def click(self, x: int, y: int, button: str = "left") -> None:

View File

@@ -36,8 +36,12 @@ class cuaComputerHandler(AsyncComputerHandler):
screen_size = await self.interface.get_screen_size()
return screen_size["width"], screen_size["height"]
async def screenshot(self) -> str:
"""Take a screenshot and return as base64 string."""
async def screenshot(self, text: Optional[str] = None) -> str:
"""Take a screenshot and return as base64 string.
Args:
text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
"""
assert self.interface is not None
screenshot_bytes = await self.interface.screenshot()
return base64.b64encode(screenshot_bytes).decode("utf-8")

View File

@@ -122,8 +122,12 @@ class CustomComputerHandler(AsyncComputerHandler):
return self._last_screenshot_size
async def screenshot(self) -> str:
"""Take a screenshot and return as base64 string."""
async def screenshot(self, text: Optional[str] = None) -> str:
"""Take a screenshot and return as base64 string.
Args:
text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
"""
result = await self._call_function(self.functions["screenshot"])
b64_str = self._to_b64_str(result) # type: ignore

View File

@@ -1615,6 +1615,11 @@ Task: Click {instruction}. Output ONLY a click action on the target element.""",
"max_tokens": 100, # Keep response short for click prediction
"headers": {"anthropic-beta": tool_config["beta_flag"]},
}
# Thread optional API params
if "api_key" in kwargs and kwargs.get("api_key") is not None:
api_kwargs["api_key"] = kwargs.get("api_key")
if "api_base" in kwargs and kwargs.get("api_base") is not None:
api_kwargs["api_base"] = kwargs.get("api_base")
# Use liteLLM acompletion
response = await litellm.acompletion(**api_kwargs)

View File

@@ -24,7 +24,7 @@ class AsyncAgentConfig(Protocol):
_on_api_end=None,
_on_usage=None,
_on_screenshot=None,
**kwargs,
**generation_config,
) -> Dict[str, Any]:
"""
Predict the next step based on input items.
@@ -40,7 +40,9 @@ class AsyncAgentConfig(Protocol):
_on_api_end: Callback for API end
_on_usage: Callback for usage tracking
_on_screenshot: Callback for screenshot events
**kwargs: Additional arguments
**generation_config: Additional arguments to pass to the model provider
- api_key: Optional API key for the provider
- api_base: Optional API base URL for the provider
Returns:
Dictionary with "output" (output items) and "usage" array
@@ -49,7 +51,7 @@ class AsyncAgentConfig(Protocol):
@abstractmethod
async def predict_click(
self, model: str, image_b64: str, instruction: str
self, model: str, image_b64: str, instruction: str, **generation_config
) -> Optional[Tuple[int, int]]:
"""
Predict click coordinates based on image and instruction.
@@ -58,6 +60,9 @@ class AsyncAgentConfig(Protocol):
model: Model name to use
image_b64: Base64 encoded image
instruction: Instruction for where to click
**generation_config: Additional arguments to pass to the model provider
- api_key: Optional API key for the provider
- api_base: Optional API base URL for the provider
Returns:
None or tuple with (x, y) coordinates

View File

@@ -762,6 +762,7 @@ class Glm4vConfig(AsyncAgentConfig):
# "skip_special_tokens": False,
# }
}
api_kwargs.update({k: v for k, v in (kwargs or {}).items()})
# Add API callbacks
if _on_api_start:
@@ -852,6 +853,7 @@ Where x,y are coordinates normalized to 0-999 range."""
"skip_special_tokens": False,
},
}
api_kwargs.update({k: v for k, v in (kwargs or {}).items()})
# Call liteLLM
response = await litellm.acompletion(**api_kwargs)

View File

@@ -14,67 +14,73 @@ import litellm
from ..decorators import register_agent
from ..loops.base import AsyncAgentConfig
from ..responses import (
convert_completion_messages_to_responses_items,
convert_responses_items_to_completion_messages,
)
from ..types import AgentCapability, AgentResponse, Messages, Tools
SOM_TOOL_SCHEMA = {
"type": "function",
"name": "computer",
"description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
"parameters": {
"type": "object",
"properties": {
"action": {
"type": "string",
"enum": [
"screenshot",
"click",
"double_click",
"drag",
"type",
"keypress",
"scroll",
"move",
"wait",
"get_current_url",
"get_dimensions",
"get_environment",
],
"description": "The action to perform",
},
"element_id": {
"type": "integer",
"description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)",
},
"start_element_id": {
"type": "integer",
"description": "The ID of the element to start dragging from (required for drag action)",
},
"end_element_id": {
"type": "integer",
"description": "The ID of the element to drag to (required for drag action)",
},
"text": {
"type": "string",
"description": "The text to type (required for type action)",
},
"keys": {
"type": "string",
"description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')",
},
"button": {
"type": "string",
"description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
},
"scroll_x": {
"type": "integer",
"description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
},
"scroll_y": {
"type": "integer",
"description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
"function": {
"name": "computer",
"description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
"parameters": {
"type": "object",
"properties": {
"action": {
"type": "string",
"enum": [
"screenshot",
"click",
"double_click",
"drag",
"type",
"keypress",
"scroll",
"move",
"wait",
"get_current_url",
"get_dimensions",
"get_environment",
],
"description": "The action to perform",
},
"element_id": {
"type": "integer",
"description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)",
},
"start_element_id": {
"type": "integer",
"description": "The ID of the element to start dragging from (required for drag action)",
},
"end_element_id": {
"type": "integer",
"description": "The ID of the element to drag to (required for drag action)",
},
"text": {
"type": "string",
"description": "The text to type (required for type action)",
},
"keys": {
"type": "string",
"description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')",
},
"button": {
"type": "string",
"description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
},
"scroll_x": {
"type": "integer",
"description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
},
"scroll_y": {
"type": "integer",
"description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
},
},
"required": ["action", "element_id"],
},
"required": ["action"],
},
}
@@ -243,18 +249,20 @@ async def replace_computer_call_with_function(
"id": item.get("id"),
"call_id": item.get("call_id"),
"status": "completed",
# Fall back to string representation
"content": f"Used tool: {action_data.get("type")}({json.dumps(fn_args)})",
}
]
elif item_type == "computer_call_output":
# Simple conversion: computer_call_output -> function_call_output
output = item.get("output")
if isinstance(output, dict):
output = [output]
return [
{
"type": "function_call_output",
"call_id": item.get("call_id"),
"content": [item.get("output")],
"output": item.get("output"),
"id": item.get("id"),
"status": "completed",
}
@@ -296,6 +304,13 @@ class OmniparserConfig(AsyncAgentConfig):
llm_model = model.split("+")[-1]
# Get screen dimensions from computer handler
try:
width, height = await computer_handler.get_dimensions()
except Exception:
# Fallback to default dimensions if method fails
width, height = 1024, 768
# Prepare tools for OpenAI API
openai_tools, id2xy = _prepare_tools_for_omniparser(tools)
@@ -309,27 +324,43 @@ class OmniparserConfig(AsyncAgentConfig):
result = parser.parse(image_data)
if _on_screenshot:
await _on_screenshot(result.annotated_image_base64, "annotated_image")
for element in result.elements:
id2xy[element.id] = (
(element.bbox.x1 + element.bbox.x2) / 2,
(element.bbox.y1 + element.bbox.y2) / 2,
)
# handle computer calls -> function calls
new_messages = []
for message in messages:
# Convert OmniParser normalized coordinates (0-1) to absolute pixels, convert to pixels
for element in result.elements:
norm_x = (element.bbox.x1 + element.bbox.x2) / 2
norm_y = (element.bbox.y1 + element.bbox.y2) / 2
pixel_x = int(norm_x * width)
pixel_y = int(norm_y * height)
id2xy[element.id] = (pixel_x, pixel_y)
# Replace the original screenshot with the annotated image
annotated_image_url = f"data:image/png;base64,{result.annotated_image_base64}"
last_computer_call_output["output"]["image_url"] = annotated_image_url
xy2id = {v: k for k, v in id2xy.items()}
messages_with_element_ids = []
for i, message in enumerate(messages):
if not isinstance(message, dict):
message = message.__dict__
new_messages += await replace_computer_call_with_function(message, id2xy) # type: ignore
messages = new_messages
msg_type = message.get("type")
if msg_type == "computer_call" and "action" in message:
action = message.get("action", {})
converted = await replace_computer_call_with_function(message, xy2id) # type: ignore
messages_with_element_ids += converted
completion_messages = convert_responses_items_to_completion_messages(
messages_with_element_ids, allow_images_in_tool_results=False
)
# Prepare API call kwargs
api_kwargs = {
"model": llm_model,
"input": messages,
"messages": completion_messages,
"tools": openai_tools if openai_tools else None,
"stream": stream,
"truncation": "auto",
"num_retries": max_retries,
**kwargs,
}
@@ -340,8 +371,8 @@ class OmniparserConfig(AsyncAgentConfig):
print(str(api_kwargs)[:1000])
# Use liteLLM responses
response = await litellm.aresponses(**api_kwargs)
# Use liteLLM completion
response = await litellm.acompletion(**api_kwargs)
# Call API end hook
if _on_api_end:
@@ -355,12 +386,45 @@ class OmniparserConfig(AsyncAgentConfig):
if _on_usage:
await _on_usage(usage)
# handle som function calls -> xy computer calls
new_output = []
for i in range(len(response.output)): # type: ignore
new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy) # type: ignore
response_dict = response.model_dump() # type: ignore
choice_messages = [choice["message"] for choice in response_dict["choices"]]
responses_items = []
for choice_message in choice_messages:
responses_items.extend(convert_completion_messages_to_responses_items([choice_message]))
return {"output": new_output, "usage": usage}
# Convert element_id → x,y (similar to moondream's convert_computer_calls_desc2xy)
final_output = []
for item in responses_items:
if item.get("type") == "computer_call" and "action" in item:
action = item["action"].copy()
# Handle single element_id
if "element_id" in action:
element_id = action["element_id"]
if element_id in id2xy:
x, y = id2xy[element_id]
action["x"] = x
action["y"] = y
del action["element_id"]
# Handle start_element_id and end_element_id for drag operations
elif "start_element_id" in action and "end_element_id" in action:
start_id = action["start_element_id"]
end_id = action["end_element_id"]
if start_id in id2xy and end_id in id2xy:
start_x, start_y = id2xy[start_id]
end_x, end_y = id2xy[end_id]
action["path"] = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
del action["start_element_id"]
del action["end_element_id"]
converted_item = item.copy()
converted_item["action"] = action
final_output.append(converted_item)
else:
final_output.append(item)
return {"output": final_output, "usage": usage}
async def predict_click(
self, model: str, image_b64: str, instruction: str, **kwargs

View File

@@ -140,7 +140,7 @@ class OpenAIComputerUseConfig:
return output_dict
async def predict_click(
self, model: str, image_b64: str, instruction: str
self, model: str, image_b64: str, instruction: str, **kwargs
) -> Optional[Tuple[int, int]]:
"""
Predict click coordinates based on image and instruction.
@@ -208,6 +208,7 @@ Task: Click {instruction}. Output ONLY a click action on the target element.""",
"reasoning": {"summary": "concise"},
"truncation": "auto",
"max_tokens": 200, # Keep response short for click prediction
**kwargs,
}
# Use liteLLM responses

View File

@@ -773,7 +773,7 @@ class UITARSConfig:
return agent_response
async def predict_click(
self, model: str, image_b64: str, instruction: str
self, model: str, image_b64: str, instruction: str, **kwargs
) -> Optional[Tuple[int, int]]:
"""
Predict click coordinates based on image and instruction.
@@ -819,6 +819,7 @@ class UITARSConfig:
"temperature": 0.0,
"do_sample": False,
}
api_kwargs.update({k: v for k, v in (kwargs or {}).items()})
# Call liteLLM with UITARS model
response = await litellm.acompletion(**api_kwargs)

View File

@@ -4,7 +4,7 @@ build-backend = "pdm.backend"
[project]
name = "cua-agent"
version = "0.4.35"
version = "0.4.37"
description = "CUA (Computer Use) Agent for AI-driven computer interaction"
readme = "README.md"
authors = [

View File

@@ -0,0 +1,84 @@
"""Pytest configuration and shared fixtures for agent package tests.
This file contains shared fixtures and configuration for all agent tests.
Following SRP: This file ONLY handles test setup/teardown.
"""
from unittest.mock import AsyncMock, MagicMock, Mock, patch
import pytest
@pytest.fixture
def mock_litellm():
    """Patch ``litellm.acompletion`` with a canned, network-free response.

    Yields the patched mock so tests can inspect call arguments without
    making real LLM API calls.
    """

    async def _fake_completion(*args, **kwargs):
        # Shape mirrors a minimal OpenAI-style chat-completion payload.
        return {
            "id": "chatcmpl-test123",
            "object": "chat.completion",
            "created": 1234567890,
            "model": kwargs.get("model", "anthropic/claude-3-5-sonnet-20241022"),
            "choices": [
                {
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": "This is a mocked response for testing.",
                    },
                    "finish_reason": "stop",
                }
            ],
            "usage": {
                "prompt_tokens": 10,
                "completion_tokens": 20,
                "total_tokens": 30,
            },
        }

    with patch("litellm.acompletion") as patched:
        patched.side_effect = _fake_completion
        yield patched
@pytest.fixture
def mock_computer():
    """Build a stand-in Computer object for agent tests.

    Every interface call is an AsyncMock, so agent logic can be exercised
    without a real Computer backend.
    """
    fake = AsyncMock()
    fake.interface = AsyncMock()

    # Canned interface behaviour used by agent actions.
    fake.interface.screenshot = AsyncMock(return_value=b"fake_screenshot_data")
    for action in ("left_click", "type", "key"):
        setattr(fake.interface, action, AsyncMock())

    # Support ``async with computer`` usage.
    fake.__aenter__ = AsyncMock(return_value=fake)
    fake.__aexit__ = AsyncMock()
    return fake
@pytest.fixture
def disable_telemetry(monkeypatch):
    """Guarantee no telemetry is emitted during a test.

    Sets the opt-out environment variable for the duration of the test.
    """
    monkeypatch.setenv("CUA_TELEMETRY_DISABLED", "1")
@pytest.fixture
def sample_messages():
    """Return a minimal, valid message list for driving the agent."""
    return [
        {"role": "user", "content": "Take a screenshot and tell me what you see"}
    ]

View File

@@ -0,0 +1,139 @@
"""Unit tests for ComputerAgent class.
This file tests ONLY the ComputerAgent initialization and basic functionality.
Following SRP: This file tests ONE class (ComputerAgent).
All external dependencies (liteLLM, Computer) are mocked.
"""
from unittest.mock import AsyncMock, MagicMock, Mock, patch
import pytest
class TestComputerAgentInitialization:
    """Test ComputerAgent initialization (SRP: Only tests initialization)."""

    @patch("agent.agent.litellm")
    def test_agent_initialization_with_model(self, mock_litellm, disable_telemetry):
        """Constructing the agent with only a model string should succeed."""
        from agent import ComputerAgent

        model_name = "anthropic/claude-3-5-sonnet-20241022"
        agent = ComputerAgent(model=model_name)

        assert agent is not None
        assert hasattr(agent, "model")
        assert agent.model == model_name

    @patch("agent.agent.litellm")
    def test_agent_initialization_with_tools(self, mock_litellm, disable_telemetry, mock_computer):
        """Passing a tools list at construction time should be accepted."""
        from agent import ComputerAgent

        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022", tools=[mock_computer])

        assert agent is not None
        assert hasattr(agent, "tools")

    @patch("agent.agent.litellm")
    def test_agent_initialization_with_max_budget(self, mock_litellm, disable_telemetry):
        """A max trajectory budget should be accepted at construction time."""
        from agent import ComputerAgent

        agent = ComputerAgent(
            model="anthropic/claude-3-5-sonnet-20241022", max_trajectory_budget=5.0
        )

        assert agent is not None

    @patch("agent.agent.litellm")
    def test_agent_requires_model(self, mock_litellm, disable_telemetry):
        """Omitting the required model argument should raise TypeError."""
        from agent import ComputerAgent

        with pytest.raises(TypeError):
            # Intentionally missing the required model parameter.
            ComputerAgent()  # type: ignore[call-arg]
class TestComputerAgentRun:
    """Test ComputerAgent.run() method (SRP: Only tests run logic)."""

    @pytest.mark.asyncio
    @patch("agent.agent.litellm")
    async def test_agent_run_with_messages(self, mock_litellm, disable_telemetry, sample_messages):
        """run() should hand back an async generator for valid messages."""
        from agent import ComputerAgent

        # Canned liteLLM response so no network call is attempted.
        canned_response = {
            "id": "chatcmpl-test",
            "choices": [
                {
                    "message": {"role": "assistant", "content": "Test response"},
                    "finish_reason": "stop",
                }
            ],
            "usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30},
        }
        mock_litellm.acompletion = AsyncMock(return_value=canned_response)

        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
        generator = agent.run(sample_messages)

        assert generator is not None
        # Async-generator protocol marker.
        assert hasattr(generator, "__anext__")

    def test_agent_has_run_method(self, disable_telemetry):
        """The agent should expose a callable run() method."""
        from agent import ComputerAgent

        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")

        assert hasattr(agent, "run")
        assert callable(agent.run)

    def test_agent_has_agent_loop(self, disable_telemetry):
        """The agent_loop attribute should be populated after construction."""
        from agent import ComputerAgent

        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")

        assert hasattr(agent, "agent_loop")
        assert agent.agent_loop is not None
class TestComputerAgentTypes:
    """Test AgentResponse and Messages types (SRP: Only tests type definitions)."""

    def test_messages_type_exists(self):
        """The Messages type should be importable from the package root."""
        from agent import Messages

        assert Messages is not None

    def test_agent_response_type_exists(self):
        """The AgentResponse type should be importable from the package root."""
        from agent import AgentResponse

        assert AgentResponse is not None
class TestComputerAgentIntegration:
    """Test ComputerAgent integration with Computer tool (SRP: Integration within package)."""

    def test_agent_accepts_computer_tool(self, disable_telemetry, mock_computer):
        """An agent constructed with a Computer tool should hold onto it."""
        from agent import ComputerAgent

        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022", tools=[mock_computer])

        assert agent is not None
        assert hasattr(agent, "tools")

View File

@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.1.27
current_version = 0.1.29
commit = True
tag = True
tag_name = computer-server-v{new_version}

View File

@@ -40,7 +40,7 @@ Refer to this notebook for a step-by-step guide on how to use the Computer-Use S
## Docs
- [Commands](https://trycua.com/docs/libraries/computer-server/Commands)
- [REST-API](https://trycua.com/docs/libraries/computer-server/REST-API)
- [WebSocket-API](https://trycua.com/docs/libraries/computer-server/WebSocket-API)
- [Index](https://trycua.com/docs/libraries/computer-server/index)
- [Commands](https://cua.ai/docs/libraries/computer-server/Commands)
- [REST-API](https://cua.ai/docs/libraries/computer-server/REST-API)
- [WebSocket-API](https://cua.ai/docs/libraries/computer-server/WebSocket-API)
- [Index](https://cua.ai/docs/libraries/computer-server/index)

View File

@@ -85,6 +85,102 @@ class BaseFileHandler(ABC):
pass
class BaseDesktopHandler(ABC):
    """Abstract base class for OS-specific desktop handlers.

    Categories:
    - Wallpaper Actions: Methods for wallpaper operations
    - Desktop shortcut actions: Methods for managing desktop shortcuts

    NOTE(review): implementations in this file report results as dicts
    (e.g. {"success": bool, ...}) instead of raising; confirm that
    convention for any new backend.
    """

    # Wallpaper Actions
    @abstractmethod
    async def get_desktop_environment(self) -> Dict[str, Any]:
        """Get the current desktop environment name.

        Returns:
            A status dict describing the detected environment.
        """
        pass

    @abstractmethod
    async def set_wallpaper(self, path: str) -> Dict[str, Any]:
        """Set the desktop wallpaper to the file at path.

        Args:
            path: Filesystem path of the image to use as wallpaper.

        Returns:
            A status dict indicating success or failure.
        """
        pass
class BaseWindowHandler(ABC):
    """Abstract class for OS-specific window management handlers.

    Categories:
    - Window Management: Methods for application/window control

    Window ids are native window handles passed as str or int.
    NOTE(review): the generic implementation returns status dicts
    ({"success": bool, ...}) rather than raising; confirm for new backends.
    """

    # Window Management
    @abstractmethod
    async def open(self, target: str) -> Dict[str, Any]:
        """Open a file or URL with the default application."""
        pass

    @abstractmethod
    async def launch(self, app: str, args: Optional[List[str]] = None) -> Dict[str, Any]:
        """Launch an application with optional arguments."""
        pass

    @abstractmethod
    async def get_current_window_id(self) -> Dict[str, Any]:
        """Get the currently active window ID."""
        pass

    @abstractmethod
    async def get_application_windows(self, app: str) -> Dict[str, Any]:
        """Get windows belonging to an application (by name or bundle)."""
        pass

    @abstractmethod
    async def get_window_name(self, window_id: str) -> Dict[str, Any]:
        """Get the title/name of a window by ID."""
        pass

    @abstractmethod
    async def get_window_size(self, window_id: str | int) -> Dict[str, Any]:
        """Get the size of a window by ID as {width, height}."""
        pass

    @abstractmethod
    async def activate_window(self, window_id: str | int) -> Dict[str, Any]:
        """Bring a window to the foreground by ID."""
        pass

    @abstractmethod
    async def close_window(self, window_id: str | int) -> Dict[str, Any]:
        """Close a window by ID."""
        pass

    @abstractmethod
    async def get_window_position(self, window_id: str | int) -> Dict[str, Any]:
        """Get the top-left position of a window as {x, y}."""
        pass

    @abstractmethod
    async def set_window_size(
        self, window_id: str | int, width: int, height: int
    ) -> Dict[str, Any]:
        """Set the size of a window by ID."""
        pass

    @abstractmethod
    async def set_window_position(self, window_id: str | int, x: int, y: int) -> Dict[str, Any]:
        """Set the position of a window by ID."""
        pass

    @abstractmethod
    async def maximize_window(self, window_id: str | int) -> Dict[str, Any]:
        """Maximize a window by ID."""
        pass

    @abstractmethod
    async def minimize_window(self, window_id: str | int) -> Dict[str, Any]:
        """Minimize a window by ID."""
        pass
class BaseAutomationHandler(ABC):
"""Abstract base class for OS-specific automation handlers.

View File

@@ -4,7 +4,13 @@ from typing import Tuple, Type
from computer_server.diorama.base import BaseDioramaHandler
from .base import BaseAccessibilityHandler, BaseAutomationHandler, BaseFileHandler
from .base import (
BaseAccessibilityHandler,
BaseAutomationHandler,
BaseDesktopHandler,
BaseFileHandler,
BaseWindowHandler,
)
# Conditionally import platform-specific handlers
system = platform.system().lower()
@@ -17,7 +23,7 @@ elif system == "linux":
elif system == "windows":
from .windows import WindowsAccessibilityHandler, WindowsAutomationHandler
from .generic import GenericFileHandler
from .generic import GenericDesktopHandler, GenericFileHandler, GenericWindowHandler
class HandlerFactory:
@@ -49,9 +55,14 @@ class HandlerFactory:
raise RuntimeError(f"Failed to determine current OS: {str(e)}")
@staticmethod
def create_handlers() -> (
Tuple[BaseAccessibilityHandler, BaseAutomationHandler, BaseDioramaHandler, BaseFileHandler]
):
def create_handlers() -> Tuple[
BaseAccessibilityHandler,
BaseAutomationHandler,
BaseDioramaHandler,
BaseFileHandler,
BaseDesktopHandler,
BaseWindowHandler,
]:
"""Create and return appropriate handlers for the current OS.
Returns:
@@ -70,6 +81,8 @@ class HandlerFactory:
MacOSAutomationHandler(),
MacOSDioramaHandler(),
GenericFileHandler(),
GenericDesktopHandler(),
GenericWindowHandler(),
)
elif os_type == "linux":
return (
@@ -77,6 +90,8 @@ class HandlerFactory:
LinuxAutomationHandler(),
BaseDioramaHandler(),
GenericFileHandler(),
GenericDesktopHandler(),
GenericWindowHandler(),
)
elif os_type == "windows":
return (
@@ -84,6 +99,8 @@ class HandlerFactory:
WindowsAutomationHandler(),
BaseDioramaHandler(),
GenericFileHandler(),
GenericDesktopHandler(),
GenericWindowHandler(),
)
else:
raise NotImplementedError(f"OS '{os_type}' is not supported")

View File

@@ -2,15 +2,26 @@
Generic handlers for all OSes.
Includes:
- DesktopHandler
- FileHandler
"""
import base64
import os
import platform
import subprocess
import webbrowser
from pathlib import Path
from typing import Any, Dict, Optional
from .base import BaseFileHandler
from ..utils import wallpaper
from .base import BaseDesktopHandler, BaseFileHandler, BaseWindowHandler
try:
import pywinctl as pwc
except Exception: # pragma: no cover
pwc = None # type: ignore
def resolve_path(path: str) -> Path:
@@ -25,6 +36,233 @@ def resolve_path(path: str) -> Path:
return Path(path).expanduser().resolve()
# ===== Cross-platform Desktop command handlers =====
class GenericDesktopHandler(BaseDesktopHandler):
    """Cross-platform desktop operations backed by the ``wallpaper`` utility.

    Implements:
    - get_desktop_environment: detect current desktop environment
    - set_wallpaper: set desktop wallpaper path
    """

    async def get_desktop_environment(self) -> Dict[str, Any]:
        """Detect the current desktop environment.

        Returns:
            {"success": True, "environment": <name>} on success,
            {"success": False, "error": <message>} on failure.
        """
        try:
            return {"success": True, "environment": wallpaper.get_desktop_environment()}
        except Exception as exc:
            return {"success": False, "error": str(exc)}

    async def set_wallpaper(self, path: str) -> Dict[str, Any]:
        """Set the desktop wallpaper to the image at ``path``.

        The path is expanded/resolved before being handed to the
        wallpaper utility.

        Args:
            path: The file path to set as wallpaper.

        Returns:
            {"success": bool}, with an "error" message on failure.
        """
        try:
            resolved = resolve_path(path)
            applied = wallpaper.set_wallpaper(str(resolved))
            return {"success": bool(applied)}
        except Exception as exc:
            return {"success": False, "error": str(exc)}
# ===== Cross-platform window control command handlers =====
class GenericWindowHandler(BaseWindowHandler):
    """Cross-platform window management using pywinctl where possible.

    Every method returns a dict carrying a "success" flag; failures are
    reported via an "error" message instead of raising.
    """

    async def open(self, target: str) -> Dict[str, Any]:
        """Open a URL in the browser, or a file with the OS default app."""
        try:
            if target.startswith("http://") or target.startswith("https://"):
                return {"success": bool(webbrowser.open(target))}
            path = str(resolve_path(target))
            os_name = platform.system().lower()
            if os_name == "darwin":
                subprocess.Popen(["open", path])
            elif os_name == "linux":
                subprocess.Popen(["xdg-open", path])
            elif os_name == "windows":
                os.startfile(path)  # type: ignore[attr-defined]
            else:
                return {"success": False, "error": f"Unsupported OS: {os_name}"}
            return {"success": True}
        except Exception as exc:
            return {"success": False, "error": str(exc)}

    async def launch(self, app: str, args: Optional[list[str]] = None) -> Dict[str, Any]:
        """Start an application (optionally with args); reports its pid."""
        try:
            if args:
                proc = subprocess.Popen([app, *args])
            else:
                # shell=True allows compound commands like "libreoffice --writer"
                proc = subprocess.Popen(app, shell=True)
            return {"success": True, "pid": proc.pid}
        except Exception as exc:
            return {"success": False, "error": str(exc)}

    def _get_window_by_id(self, window_id: int | str) -> Optional[Any]:
        """Find the pywinctl Window whose native handle matches window_id.

        getAllWindowsDict is keyed by title, so we scan Window objects
        and compare handles as strings.
        """
        if pwc is None:
            raise RuntimeError("pywinctl not available")
        try:
            matches = (w for w in pwc.getAllWindows() if str(w.getHandle()) == str(window_id))
            return next(matches, None)
        except Exception:
            return None

    def _lookup(self, window_id: int | str):
        """Resolve window_id to (window, error_dict); exactly one is None."""
        if pwc is None:
            return None, {"success": False, "error": "pywinctl not available"}
        w = self._get_window_by_id(window_id)
        if not w:
            return None, {"success": False, "error": "Window not found"}
        return w, None

    async def get_current_window_id(self) -> Dict[str, Any]:
        """Report the handle of the currently focused window."""
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            active = pwc.getActiveWindow()
            if not active:
                return {"success": False, "error": "No active window"}
            return {"success": True, "window_id": active.getHandle()}
        except Exception as exc:
            return {"success": False, "error": str(exc)}

    async def get_application_windows(self, app: str) -> Dict[str, Any]:
        """List handles of windows whose title contains app (case-insensitive)."""
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            found = pwc.getWindowsWithTitle(
                app, condition=pwc.Re.CONTAINS, flags=pwc.Re.IGNORECASE
            )
            return {"success": True, "windows": [w.getHandle() for w in found]}
        except Exception as exc:
            return {"success": False, "error": str(exc)}

    async def get_window_name(self, window_id: int | str) -> Dict[str, Any]:
        """Report the title of the window with the given id."""
        try:
            w, err = self._lookup(window_id)
            if err:
                return err
            return {"success": True, "name": w.title}
        except Exception as exc:
            return {"success": False, "error": str(exc)}

    async def get_window_size(self, window_id: int | str) -> Dict[str, Any]:
        """Report the window's size as integer width/height."""
        try:
            w, err = self._lookup(window_id)
            if err:
                return err
            width, height = w.size
            return {"success": True, "width": int(width), "height": int(height)}
        except Exception as exc:
            return {"success": False, "error": str(exc)}

    async def get_window_position(self, window_id: int | str) -> Dict[str, Any]:
        """Report the window's top-left corner as integer x/y."""
        try:
            w, err = self._lookup(window_id)
            if err:
                return err
            x, y = w.position
            return {"success": True, "x": int(x), "y": int(y)}
        except Exception as exc:
            return {"success": False, "error": str(exc)}

    async def set_window_size(
        self, window_id: int | str, width: int, height: int
    ) -> Dict[str, Any]:
        """Resize the window to the given width/height."""
        try:
            w, err = self._lookup(window_id)
            if err:
                return err
            return {"success": bool(w.resizeTo(int(width), int(height)))}
        except Exception as exc:
            return {"success": False, "error": str(exc)}

    async def set_window_position(self, window_id: int | str, x: int, y: int) -> Dict[str, Any]:
        """Move the window so its top-left corner sits at (x, y)."""
        try:
            w, err = self._lookup(window_id)
            if err:
                return err
            return {"success": bool(w.moveTo(int(x), int(y)))}
        except Exception as exc:
            return {"success": False, "error": str(exc)}

    async def maximize_window(self, window_id: int | str) -> Dict[str, Any]:
        """Maximize the window."""
        try:
            w, err = self._lookup(window_id)
            if err:
                return err
            return {"success": bool(w.maximize())}
        except Exception as exc:
            return {"success": False, "error": str(exc)}

    async def minimize_window(self, window_id: int | str) -> Dict[str, Any]:
        """Minimize the window."""
        try:
            w, err = self._lookup(window_id)
            if err:
                return err
            return {"success": bool(w.minimize())}
        except Exception as exc:
            return {"success": False, "error": str(exc)}

    async def activate_window(self, window_id: int | str) -> Dict[str, Any]:
        """Bring the window to the foreground."""
        try:
            w, err = self._lookup(window_id)
            if err:
                return err
            return {"success": bool(w.activate())}
        except Exception as exc:
            return {"success": False, "error": str(exc)}

    async def close_window(self, window_id: int | str) -> Dict[str, Any]:
        """Close the window."""
        try:
            w, err = self._lookup(window_id)
            if err:
                return err
            return {"success": bool(w.close())}
        except Exception as exc:
            return {"success": False, "error": str(exc)}
# ===== Cross-platform file system command handlers =====
class GenericFileHandler(BaseFileHandler):
"""
Generic file handler that provides file system operations for all operating systems.

View File

@@ -75,9 +75,14 @@ except Exception:
except Exception:
package_version = "unknown"
accessibility_handler, automation_handler, diorama_handler, file_handler = (
HandlerFactory.create_handlers()
)
(
accessibility_handler,
automation_handler,
diorama_handler,
file_handler,
desktop_handler,
window_handler,
) = HandlerFactory.create_handlers()
handlers = {
"version": lambda: {"protocol": protocol_version, "package": package_version},
# App-Use commands
@@ -99,6 +104,23 @@ handlers = {
"delete_file": file_handler.delete_file,
"create_dir": file_handler.create_dir,
"delete_dir": file_handler.delete_dir,
# Desktop commands
"get_desktop_environment": desktop_handler.get_desktop_environment,
"set_wallpaper": desktop_handler.set_wallpaper,
# Window management
"open": window_handler.open,
"launch": window_handler.launch,
"get_current_window_id": window_handler.get_current_window_id,
"get_application_windows": window_handler.get_application_windows,
"get_window_name": window_handler.get_window_name,
"get_window_size": window_handler.get_window_size,
"get_window_position": window_handler.get_window_position,
"set_window_size": window_handler.set_window_size,
"set_window_position": window_handler.set_window_position,
"maximize_window": window_handler.maximize_window,
"minimize_window": window_handler.minimize_window,
"activate_window": window_handler.activate_window,
"close_window": window_handler.close_window,
# Mouse commands
"mouse_down": automation_handler.mouse_down,
"mouse_up": automation_handler.mouse_up,

View File

@@ -0,0 +1,3 @@
# Re-export the wallpaper utility module for convenient package-level access.
from . import wallpaper

# Public API of the utils package.
__all__ = ["wallpaper"]

View File

@@ -0,0 +1,321 @@
"""Set the desktop wallpaper."""
import os
import subprocess
import sys
from pathlib import Path
def get_desktop_environment() -> str:
    """Best-effort detection of the current desktop environment name.

    Returns "windows"/"mac" on those platforms. On POSIX it inspects
    DESKTOP_SESSION and a few well-known environment variables, then
    falls back to process sniffing via is_running(), and finally
    returns "unknown".

    Adapted from https://stackoverflow.com/a/21213358/2624876 and the
    Ubuntu-forums threads referenced there.
    """
    if sys.platform in ["win32", "cygwin"]:
        return "windows"
    if sys.platform == "darwin":
        return "mac"

    # Most likely a POSIX system: DESKTOP_SESSION usually names the session.
    session = os.environ.get("DESKTOP_SESSION")
    if session is not None:
        # Lowercase once so every comparison below is case-insensitive.
        session = session.lower()
        known_sessions = {
            "gnome", "unity", "cinnamon", "mate", "xfce4", "lxde",
            "fluxbox", "blackbox", "openbox", "icewm", "jwm",
            "afterstep", "trinity", "kde",
        }
        if session in known_sessions:
            return session
        ## Special cases ##
        # Canonical sets $DESKTOP_SESSION to distro names (e.g. "Lubuntu"
        # rather than "LXDE"); the "xfce" substring test must run first,
        # and "ubuntustudio" must be checked before the plain "ubuntu"
        # prefix.
        if "xfce" in session or session.startswith("xubuntu"):
            return "xfce4"
        for prefix, env_name in (
            ("ubuntustudio", "kde"),
            ("ubuntu", "gnome"),
            ("lubuntu", "lxde"),
            ("kubuntu", "kde"),
            ("razor", "razor-qt"),      # e.g. razorkwin
            ("wmaker", "windowmaker"),  # e.g. wmaker-common
        ):
            if session.startswith(prefix):
                return env_name

    gnome_session_id = os.environ.get("GNOME_DESKTOP_SESSION_ID")
    if os.environ.get("KDE_FULL_SESSION") == "true":
        return "kde"
    elif gnome_session_id:
        # NOTE: when the id contains "deprecated" this chain deliberately
        # ends without probing processes (matches the original behavior).
        if "deprecated" not in gnome_session_id:
            return "gnome2"
    # From http://ubuntuforums.org/showthread.php?t=652320
    elif is_running("xfce-mcs-manage"):
        return "xfce4"
    elif is_running("ksmserver"):
        return "kde"
    return "unknown"
def is_running(process: str) -> bool:
    """Returns whether a process with the given name is (likely) currently running.

    Uses a basic text search over ``ps axw`` output (POSIX) or
    ``tasklist /v`` (Windows), and so may have false positives.

    Args:
        process: Text to look for in the process listing.

    Returns:
        True when some listing line contains ``process``; False when no
        line matches or neither listing tool is available.
    """
    # From http://www.bloggerpolis.com/2011/05/how-to-check-if-a-process-is-running-using-python/
    # and http://richarddingwall.name/2009/06/18/windows-equivalents-of-ps-and-kill-commands/
    for command in (["ps", "axw"], ["tasklist", "/v"]):
        try:
            # subprocess.run (vs. the original bare Popen) waits for the
            # tool and closes its pipes, avoiding zombie processes and a
            # leaked stdout file descriptor.
            listing = subprocess.run(
                command,
                stdout=subprocess.PIPE,
                stderr=subprocess.DEVNULL,
                check=False,
            )
        except OSError:
            # Tool not present on this platform; try the next candidate.
            continue
        # Match on str(line) of each bytes line, mirroring the original
        # `process in str(x)` comparison semantics.
        return any(process in str(line) for line in listing.stdout.splitlines())
    # Neither ps nor tasklist could be executed: report "not running"
    # instead of crashing (the original raised in this case).
    return False
def set_wallpaper(file_loc: str, first_run: bool = True):
    """Sets the wallpaper to the given file location.

    Dispatches on the detected desktop environment and shells out to that
    environment's native tooling (gsettings, xfconf-query, osascript, ...).
    Most commands are fire-and-forget via subprocess.Popen, so individual
    command failures are generally not detected.

    Args:
        file_loc: Path to the image file to use as the wallpaper.
        first_run: When False, suppresses actions that should only happen
            once (the razor-qt config rewrite and the unsupported-environment
            warning).

    Returns:
        False if the desktop environment is unsupported, True otherwise.
    """
    # From https://stackoverflow.com/a/21213504/2624876
    # I have not personally tested most of this. -- @1j01
    # -----------------------------------------
    # Note: There are two common Linux desktop environments where
    # I have not been able to set the desktop background from
    # command line: KDE, Enlightenment
    desktop_env = get_desktop_environment()
    if desktop_env in ["gnome", "unity", "cinnamon"]:
        # Tested on Ubuntu 22 -- @1j01
        uri = Path(file_loc).as_uri()
        SCHEMA = "org.gnome.desktop.background"
        KEY = "picture-uri"
        # Needed for Ubuntu 22 in dark mode
        # Might be better to set only one or the other, depending on the current theme
        # In the settings it will say "This background selection only applies to the dark style"
        # even if it's set for both, arguably referring to the selection that you can make on that page.
        # -- @1j01
        KEY_DARK = "picture-uri-dark"
        try:
            from gi.repository import Gio  # type: ignore

            gsettings = Gio.Settings.new(SCHEMA)  # type: ignore
            gsettings.set_string(KEY, uri)
            gsettings.set_string(KEY_DARK, uri)
        except Exception:
            # Fallback tested on Ubuntu 22 -- @1j01
            args = ["gsettings", "set", SCHEMA, KEY, uri]
            subprocess.Popen(args)
            args = ["gsettings", "set", SCHEMA, KEY_DARK, uri]
            subprocess.Popen(args)
    elif desktop_env == "mate":
        try:  # MATE >= 1.6
            # info from http://wiki.mate-desktop.org/docs:gsettings
            args = ["gsettings", "set", "org.mate.background", "picture-filename", file_loc]
            subprocess.Popen(args)
        except Exception:  # MATE < 1.6
            # From https://bugs.launchpad.net/variety/+bug/1033918
            args = [
                "mateconftool-2",
                "-t",
                "string",
                "--set",
                "/desktop/mate/background/picture_filename",
                file_loc,
            ]
            subprocess.Popen(args)
    elif desktop_env == "gnome2":  # Not tested
        # From https://bugs.launchpad.net/variety/+bug/1033918
        args = [
            "gconftool-2",
            "-t",
            "string",
            "--set",
            "/desktop/gnome/background/picture_filename",
            file_loc,
        ]
        subprocess.Popen(args)
    ## KDE4 is difficult
    ## see http://blog.zx2c4.com/699 for a solution that might work
    elif desktop_env in ["kde3", "trinity"]:
        # From http://ubuntuforums.org/archive/index.php/t-803417.html
        args = ["dcop", "kdesktop", "KBackgroundIface", "setWallpaper", "0", file_loc, "6"]
        subprocess.Popen(args)
    elif desktop_env == "xfce4":
        # Iterate over all wallpaper-related keys and set to file_loc
        try:
            list_proc = subprocess.run(
                ["xfconf-query", "-c", "xfce4-desktop", "-l"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=False,
            )
            keys = []
            if list_proc.stdout:
                for line in list_proc.stdout.splitlines():
                    line = line.strip()
                    if not line:
                        continue
                    # Common keys: .../last-image and .../image-path
                    if "/last-image" in line or "/image-path" in line:
                        keys.append(line)
            # Fallback: known defaults if none were listed
            if not keys:
                keys = [
                    "/backdrop/screen0/monitorVNC-0/workspace0/last-image",
                    "/backdrop/screen0/monitor0/image-path",
                ]
            for key in keys:
                subprocess.run(
                    [
                        "xfconf-query",
                        "-c",
                        "xfce4-desktop",
                        "-p",
                        key,
                        "-s",
                        file_loc,
                    ],
                    check=False,
                )
        except Exception:
            pass
        # Reload xfdesktop to apply changes
        subprocess.Popen(["xfdesktop", "--reload"])
    elif desktop_env == "razor-qt":  # TODO: implement reload of desktop when possible
        if first_run:
            import configparser

            desktop_conf = configparser.ConfigParser()
            # Development version
            desktop_conf_file = os.path.join(get_config_dir("razor"), "desktop.conf")
            if os.path.isfile(desktop_conf_file):
                config_option = R"screens\1\desktops\1\wallpaper"
            else:
                desktop_conf_file = os.path.join(get_home_dir(), ".razor/desktop.conf")
                config_option = R"desktops\1\wallpaper"
            desktop_conf.read(os.path.join(desktop_conf_file))
            try:
                if desktop_conf.has_option("razor", config_option):  # only replacing a value
                    desktop_conf.set("razor", config_option, file_loc)
                    with open(desktop_conf_file, "w", encoding="utf-8", errors="replace") as f:
                        desktop_conf.write(f)
            except Exception:
                pass
        else:
            # TODO: reload desktop when possible
            pass
    elif desktop_env in ["fluxbox", "jwm", "openbox", "afterstep"]:
        # http://fluxbox-wiki.org/index.php/Howto_set_the_background
        # used fbsetbg on jwm too since I am too lazy to edit the XML configuration
        # now where fbsetbg does the job excellent anyway.
        # and I have not figured out how else it can be set on Openbox and AfterSTep
        # but fbsetbg works excellent here too.
        try:
            args = ["fbsetbg", file_loc]
            subprocess.Popen(args)
        except Exception:
            sys.stderr.write("ERROR: Failed to set wallpaper with fbsetbg!\n")
            sys.stderr.write("Please make sure that you have fbsetbg installed.\n")
    elif desktop_env == "icewm":
        # command found at http://urukrama.wordpress.com/2007/12/05/desktop-backgrounds-in-window-managers/
        args = ["icewmbg", file_loc]
        subprocess.Popen(args)
    elif desktop_env == "blackbox":
        # command found at http://blackboxwm.sourceforge.net/BlackboxDocumentation/BlackboxBackground
        args = ["bsetbg", "-full", file_loc]
        subprocess.Popen(args)
    elif desktop_env == "lxde":
        args = ["pcmanfm", "--set-wallpaper", file_loc, "--wallpaper-mode=scaled"]
        subprocess.Popen(args)
    elif desktop_env == "windowmaker":
        # From http://www.commandlinefu.com/commands/view/3857/set-wallpaper-on-windowmaker-in-one-line
        args = ["wmsetbg", "-s", "-u", file_loc]
        subprocess.Popen(args)
    # elif desktop_env == "enlightenment": # I have not been able to make it work on e17. On e16 it would have been something in this direction
    #     args = ["enlightenment_remote", "-desktop-bg-add", "0", "0", "0", "0", file_loc]
    #     subprocess.Popen(args)
    elif desktop_env == "windows":
        # From https://stackoverflow.com/questions/1977694/change-desktop-background
        # Tested on Windows 10. -- @1j01
        import ctypes

        SPI_SETDESKWALLPAPER = 20
        ctypes.windll.user32.SystemParametersInfoW(SPI_SETDESKWALLPAPER, 0, file_loc, 0)  # type: ignore
    elif desktop_env == "mac":
        # From https://stackoverflow.com/questions/431205/how-can-i-programatically-change-the-background-in-mac-os-x
        try:
            # Tested on macOS 10.14.6 (Mojave) -- @1j01
            assert (
                sys.platform == "darwin"
            )  # ignore `Import "appscript" could not be resolved` for other platforms
            from appscript import app, mactypes

            app("Finder").desktop_picture.set(mactypes.File(file_loc))
        except ImportError:
            # Tested on macOS 10.14.6 (Mojave) -- @1j01
            # import subprocess
            # SCRIPT = f"""/usr/bin/osascript<<END
            # tell application "Finder" to set desktop picture to POSIX file "{file_loc}"
            # END"""
            # subprocess.Popen(SCRIPT, shell=True)
            # Safer version, avoiding string interpolation,
            # to protect against command injection (both in the shell and in AppleScript):
            OSASCRIPT = """
            on run (clp)
                if clp's length is not 1 then error "Incorrect Parameters"
                local file_loc
                set file_loc to clp's item 1
                tell application "Finder" to set desktop picture to POSIX file file_loc
            end run
            """
            subprocess.Popen(["osascript", "-e", OSASCRIPT, "--", file_loc])
    else:
        if first_run:  # don't spam the user with the same message over and over again
            sys.stderr.write(
                "Warning: Failed to set wallpaper. Your desktop environment is not supported.\n"
            )
            sys.stderr.write(f"You can try manually to set your wallpaper to {file_loc}\n")
        return False
    return True
def get_config_dir(app_name: str) -> str:
    """Returns the configuration directory for the given application name.

    Resolution order: $XDG_CONFIG_HOME, then %APPDATA% (Windows), then the
    `xdg` library's notion of the config home, then ~/.config as a last
    resort. The app name is appended as a subdirectory in every case.
    """
    xdg_home = os.environ.get("XDG_CONFIG_HOME")
    if xdg_home is not None:
        return os.path.join(xdg_home, app_name)
    appdata = os.environ.get("APPDATA")  # On Windows
    if appdata is not None:
        return os.path.join(appdata, app_name)
    try:
        from xdg import BaseDirectory

        base = BaseDirectory.xdg_config_home
    except ImportError:  # Most likely a Linux/Unix system anyway
        base = os.path.join(get_home_dir(), ".config")
    return os.path.join(base, app_name)
def get_home_dir() -> str:
    """Returns the home directory of the current user.

    Delegates to os.path.expanduser("~"), which honors the platform's
    usual environment variables (e.g. HOME, or USERPROFILE on Windows).
    """
    return os.path.expanduser("~")

View File

@@ -4,7 +4,7 @@ build-backend = "pdm.backend"
[project]
name = "cua-computer-server"
version = "0.1.27"
version = "0.1.29"
description = "Server component for the Computer-Use Interface (CUI) framework powering Cua"
authors = [
@@ -23,13 +23,14 @@ dependencies = [
"aiohttp>=3.9.1",
"pyperclip>=1.9.0",
"websockets>=12.0",
"pywinctl>=0.4.1",
# OS-specific runtime deps
"pyobjc-framework-Cocoa>=10.1; sys_platform == 'darwin'",
"pyobjc-framework-Quartz>=10.1; sys_platform == 'darwin'",
"pyobjc-framework-ApplicationServices>=10.1; sys_platform == 'darwin'",
"python-xlib>=0.33; sys_platform == 'linux'",
"pywin32>=310; sys_platform == 'win32'",
"pip-system-certs; sys_platform == 'win32'",
"python-certifi-win32; sys_platform == 'win32'",
]
[project.optional-dependencies]

View File

@@ -0,0 +1,47 @@
"""Pytest configuration and shared fixtures for computer-server package tests.
This file contains shared fixtures and configuration for all computer-server tests.
Following SRP: This file ONLY handles test setup/teardown.
"""
from unittest.mock import AsyncMock, Mock, patch
import pytest
@pytest.fixture
def mock_websocket():
"""Mock WebSocket connection for testing.
Use this fixture to test WebSocket logic without real connections.
"""
websocket = AsyncMock()
websocket.send = AsyncMock()
websocket.recv = AsyncMock()
websocket.close = AsyncMock()
return websocket
@pytest.fixture
def mock_computer_interface():
"""Mock computer interface for server tests.
Use this fixture to test server logic without real computer operations.
"""
interface = AsyncMock()
interface.screenshot = AsyncMock(return_value=b"fake_screenshot")
interface.left_click = AsyncMock()
interface.type = AsyncMock()
interface.key = AsyncMock()
return interface
@pytest.fixture
def disable_telemetry(monkeypatch):
"""Disable telemetry for tests.
Use this fixture to ensure no telemetry is sent during tests.
"""
monkeypatch.setenv("CUA_TELEMETRY_DISABLED", "1")

View File

@@ -0,0 +1,40 @@
"""Unit tests for computer-server package.
This file tests ONLY basic server functionality.
Following SRP: This file tests server initialization and basic operations.
All external dependencies are mocked.
"""
from unittest.mock import AsyncMock, Mock, patch
import pytest
class TestServerImports:
"""Test server module imports (SRP: Only tests imports)."""
def test_server_module_exists(self):
"""Test that server module can be imported."""
try:
import computer_server
assert computer_server is not None
except ImportError:
pytest.skip("computer_server module not installed")
class TestServerInitialization:
"""Test server initialization (SRP: Only tests initialization)."""
@pytest.mark.asyncio
async def test_server_can_be_imported(self):
"""Basic smoke test: verify server components can be imported."""
try:
from computer_server import server
assert server is not None
except ImportError:
pytest.skip("Server module not available")
except Exception as e:
# Some initialization errors are acceptable in unit tests
pytest.skip(f"Server initialization requires specific setup: {e}")

View File

@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.4.7
current_version = 0.4.11
commit = True
tag = True
tag_name = computer-v{new_version}

View File

@@ -68,7 +68,7 @@ Refer to this notebook for a step-by-step guide on how to use the Computer-Use I
## Docs
- [Computers](https://trycua.com/docs/computer-sdk/computers)
- [Commands](https://trycua.com/docs/computer-sdk/commands)
- [Computer UI](https://trycua.com/docs/computer-sdk/computer-ui)
- [Sandboxed Python](https://trycua.com/docs/computer-sdk/sandboxed-python)
- [Computers](https://cua.ai/docs/computer-sdk/computers)
- [Commands](https://cua.ai/docs/computer-sdk/commands)
- [Computer UI](https://cua.ai/docs/computer-sdk/computer-ui)
- [Sandboxed Python](https://cua.ai/docs/computer-sdk/sandboxed-python)

View File

@@ -17,6 +17,8 @@ from .interface.factory import InterfaceFactory
from .logger import Logger, LogLevel
from .models import Computer as ComputerConfig
from .models import Display
from .tracing import ComputerTracing
from .tracing_wrapper import TracingInterfaceWrapper
SYSTEM_INFO = {
"os": platform.system().lower(),
@@ -208,8 +210,13 @@ class Computer:
# Initialize with proper typing - None at first, will be set in run()
self._interface = None
self._original_interface = None # Keep reference to original interface
self._tracing_wrapper = None # Tracing wrapper for interface
self.use_host_computer_server = use_host_computer_server
# Initialize tracing
self._tracing = ComputerTracing(self)
# Record initialization in telemetry (if enabled)
if telemetry_enabled and is_telemetry_enabled():
record_event("computer_initialized", SYSTEM_INFO)
@@ -259,12 +266,14 @@ class Computer:
# Create the interface with explicit type annotation
from .interface.base import BaseComputerInterface
self._interface = cast(
interface = cast(
BaseComputerInterface,
InterfaceFactory.create_interface_for_os(
os=self.os_type, ip_address=ip_address # type: ignore[arg-type]
),
)
self._interface = interface
self._original_interface = interface
self.logger.info("Waiting for host computer server to be ready...")
await self._interface.wait_for_ready()
@@ -493,7 +502,7 @@ class Computer:
# Pass authentication credentials if using cloud provider
if self.provider_type == VMProviderType.CLOUD and self.api_key and self.config.name:
self._interface = cast(
interface = cast(
BaseComputerInterface,
InterfaceFactory.create_interface_for_os(
os=self.os_type,
@@ -503,13 +512,16 @@ class Computer:
),
)
else:
self._interface = cast(
interface = cast(
BaseComputerInterface,
InterfaceFactory.create_interface_for_os(
os=self.os_type, ip_address=ip_address
),
)
self._interface = interface
self._original_interface = interface
# Wait for the WebSocket interface to be ready
self.logger.info("Connecting to WebSocket interface...")
@@ -866,7 +878,7 @@ class Computer:
"""Get the computer interface for interacting with the VM.
Returns:
The computer interface
The computer interface (wrapped with tracing if tracing is active)
"""
if not hasattr(self, "_interface") or self._interface is None:
error_msg = "Computer interface not initialized. Call run() first."
@@ -876,8 +888,34 @@ class Computer:
)
raise RuntimeError(error_msg)
# Return tracing wrapper if tracing is active and we have an original interface
if (
self._tracing.is_tracing
and hasattr(self, "_original_interface")
and self._original_interface is not None
):
# Create wrapper if it doesn't exist or if the original interface changed
if (
not hasattr(self, "_tracing_wrapper")
or self._tracing_wrapper is None
or self._tracing_wrapper._original_interface != self._original_interface
):
self._tracing_wrapper = TracingInterfaceWrapper(
self._original_interface, self._tracing
)
return self._tracing_wrapper
return self._interface
@property
def tracing(self) -> ComputerTracing:
"""Get the computer tracing instance for recording sessions.
Returns:
ComputerTracing: The tracing instance
"""
return self._tracing
@property
def telemetry_enabled(self) -> bool:
"""Check if telemetry is enabled for this computer instance.

View File

@@ -436,6 +436,189 @@ class BaseComputerInterface(ABC):
"""
pass
# Desktop actions
@abstractmethod
async def get_desktop_environment(self) -> str:
"""Get the current desktop environment.
Returns:
The name of the current desktop environment.
"""
pass
@abstractmethod
async def set_wallpaper(self, path: str) -> None:
"""Set the desktop wallpaper to the specified path.
Args:
path: The file path to set as wallpaper
"""
pass
# Window management
@abstractmethod
async def open(self, target: str) -> None:
"""Open a target using the system's default handler.
Typically opens files, folders, or URLs with the associated application.
Args:
target: The file path, folder path, or URL to open.
"""
pass
@abstractmethod
async def launch(self, app: str, args: List[str] | None = None) -> Optional[int]:
"""Launch an application with optional arguments.
Args:
app: The application executable or bundle identifier.
args: Optional list of arguments to pass to the application.
Returns:
Optional process ID (PID) of the launched application if available, otherwise None.
"""
pass
@abstractmethod
async def get_current_window_id(self) -> int | str:
"""Get the identifier of the currently active/focused window.
Returns:
A window identifier that can be used with other window management methods.
"""
pass
@abstractmethod
async def get_application_windows(self, app: str) -> List[int | str]:
"""Get all window identifiers for a specific application.
Args:
app: The application name, executable, or identifier to query.
Returns:
A list of window identifiers belonging to the specified application.
"""
pass
@abstractmethod
async def get_window_name(self, window_id: int | str) -> str:
"""Get the title/name of a window.
Args:
window_id: The window identifier.
Returns:
The window's title or name string.
"""
pass
@abstractmethod
async def get_window_size(self, window_id: int | str) -> tuple[int, int]:
"""Get the size of a window in pixels.
Args:
window_id: The window identifier.
Returns:
A tuple of (width, height) representing the window size in pixels.
"""
pass
@abstractmethod
async def get_window_position(self, window_id: int | str) -> tuple[int, int]:
"""Get the screen position of a window.
Args:
window_id: The window identifier.
Returns:
A tuple of (x, y) representing the window's top-left corner in screen coordinates.
"""
pass
@abstractmethod
async def set_window_size(self, window_id: int | str, width: int, height: int) -> None:
"""Set the size of a window in pixels.
Args:
window_id: The window identifier.
width: Desired width in pixels.
height: Desired height in pixels.
"""
pass
@abstractmethod
async def set_window_position(self, window_id: int | str, x: int, y: int) -> None:
"""Move a window to a specific position on the screen.
Args:
window_id: The window identifier.
x: X coordinate for the window's top-left corner.
y: Y coordinate for the window's top-left corner.
"""
pass
@abstractmethod
async def maximize_window(self, window_id: int | str) -> None:
"""Maximize a window.
Args:
window_id: The window identifier.
"""
pass
@abstractmethod
async def minimize_window(self, window_id: int | str) -> None:
"""Minimize a window.
Args:
window_id: The window identifier.
"""
pass
@abstractmethod
async def activate_window(self, window_id: int | str) -> None:
"""Bring a window to the foreground and focus it.
Args:
window_id: The window identifier.
"""
pass
@abstractmethod
async def close_window(self, window_id: int | str) -> None:
"""Close a window.
Args:
window_id: The window identifier.
"""
pass
# Convenience aliases
async def get_window_title(self, window_id: int | str) -> str:
"""Convenience alias for get_window_name().
Args:
window_id: The window identifier.
Returns:
The window's title or name string.
"""
return await self.get_window_name(window_id)
async def window_size(self, window_id: int | str) -> tuple[int, int]:
"""Convenience alias for get_window_size().
Args:
window_id: The window identifier.
Returns:
A tuple of (width, height) representing the window size in pixels.
"""
return await self.get_window_size(window_id)
# Shell actions
@abstractmethod
async def run_command(self, command: str) -> CommandResult:
"""Run shell command and return structured result.

View File

@@ -487,6 +487,104 @@ class GenericComputerInterface(BaseComputerInterface):
raise RuntimeError(result.get("error", "Failed to list directory"))
return result.get("files", [])
# Desktop actions
async def get_desktop_environment(self) -> str:
result = await self._send_command("get_desktop_environment")
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to get desktop environment"))
return result.get("environment", "unknown")
async def set_wallpaper(self, path: str) -> None:
result = await self._send_command("set_wallpaper", {"path": path})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to set wallpaper"))
# Window management
async def open(self, target: str) -> None:
result = await self._send_command("open", {"target": target})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to open target"))
async def launch(self, app: str, args: list[str] | None = None) -> int | None:
payload: dict[str, object] = {"app": app}
if args is not None:
payload["args"] = args
result = await self._send_command("launch", payload)
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to launch application"))
return result.get("pid") # type: ignore[return-value]
async def get_current_window_id(self) -> int | str:
result = await self._send_command("get_current_window_id")
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to get current window id"))
return result["window_id"] # type: ignore[return-value]
async def get_application_windows(self, app: str) -> list[int | str]:
result = await self._send_command("get_application_windows", {"app": app})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to get application windows"))
return list(result.get("windows", [])) # type: ignore[return-value]
async def get_window_name(self, window_id: int | str) -> str:
result = await self._send_command("get_window_name", {"window_id": window_id})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to get window name"))
return result.get("name", "") # type: ignore[return-value]
async def get_window_size(self, window_id: int | str) -> tuple[int, int]:
result = await self._send_command("get_window_size", {"window_id": window_id})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to get window size"))
return int(result.get("width", 0)), int(result.get("height", 0))
async def get_window_position(self, window_id: int | str) -> tuple[int, int]:
result = await self._send_command("get_window_position", {"window_id": window_id})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to get window position"))
return int(result.get("x", 0)), int(result.get("y", 0))
async def set_window_size(self, window_id: int | str, width: int, height: int) -> None:
result = await self._send_command(
"set_window_size", {"window_id": window_id, "width": width, "height": height}
)
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to set window size"))
async def set_window_position(self, window_id: int | str, x: int, y: int) -> None:
result = await self._send_command(
"set_window_position", {"window_id": window_id, "x": x, "y": y}
)
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to set window position"))
async def maximize_window(self, window_id: int | str) -> None:
result = await self._send_command("maximize_window", {"window_id": window_id})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to maximize window"))
async def minimize_window(self, window_id: int | str) -> None:
result = await self._send_command("minimize_window", {"window_id": window_id})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to minimize window"))
async def activate_window(self, window_id: int | str) -> None:
result = await self._send_command("activate_window", {"window_id": window_id})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to activate window"))
async def close_window(self, window_id: int | str) -> None:
result = await self._send_command("close_window", {"window_id": window_id})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to close window"))
# Convenience aliases
async def get_window_title(self, window_id: int | str) -> str:
return await self.get_window_name(window_id)
async def window_size(self, window_id: int | str) -> tuple[int, int]:
return await self.get_window_size(window_id)
# Command execution
async def run_command(self, command: str) -> CommandResult:
result = await self._send_command("run_command", {"command": command})

View File

@@ -10,6 +10,8 @@ import subprocess
import urllib.parse
from typing import Any, Dict, List, Optional
from computer.utils import safe_join
# Setup logging
logger = logging.getLogger(__name__)
@@ -59,7 +61,7 @@ def lume_api_get(
# --max-time: Maximum time for the whole operation (20 seconds)
# -f: Fail silently (no output at all) on server errors
# Add single quotes around URL to ensure special characters are handled correctly
cmd = ["curl", "--connect-timeout", "15", "--max-time", "20", "-s", "-f", f"'{api_url}'"]
cmd = ["curl", "--connect-timeout", "15", "--max-time", "20", "-s", "-f", api_url]
# For logging and display, show the properly escaped URL
display_cmd = ["curl", "--connect-timeout", "15", "--max-time", "20", "-s", "-f", api_url]
@@ -71,7 +73,7 @@ def lume_api_get(
# Execute the command - for execution we need to use shell=True to handle URLs with special characters
try:
# Use a single string with shell=True for proper URL handling
shell_cmd = " ".join(cmd)
shell_cmd = safe_join(cmd)
result = subprocess.run(shell_cmd, shell=True, capture_output=True, text=True)
# Handle curl exit codes
@@ -514,7 +516,7 @@ def lume_api_delete(
"-s",
"-X",
"DELETE",
f"'{api_url}'",
api_url,
]
# For logging and display, show the properly escaped URL
@@ -537,7 +539,7 @@ def lume_api_delete(
# Execute the command - for execution we need to use shell=True to handle URLs with special characters
try:
# Use a single string with shell=True for proper URL handling
shell_cmd = " ".join(cmd)
shell_cmd = safe_join(cmd)
result = subprocess.run(shell_cmd, shell=True, capture_output=True, text=True)
# Handle curl exit codes

View File

@@ -0,0 +1,355 @@
"""
Computer tracing functionality for recording sessions.
This module provides a Computer.tracing API inspired by Playwright's tracing functionality,
allowing users to record computer interactions for debugging, training, and analysis.
"""
import asyncio
import base64
import io
import json
import time
import uuid
import zipfile
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from PIL import Image
class ComputerTracing:
"""
Computer tracing class that records computer interactions and saves them to disk.
This class provides a flexible API for recording computer sessions with configurable
options for what to record (screenshots, API calls, video, etc.).
"""
def __init__(self, computer_instance):
"""
Initialize the tracing instance.
Args:
computer_instance: The Computer instance to trace
"""
self._computer = computer_instance
self._is_tracing = False
self._trace_config: Dict[str, Any] = {}
self._trace_data: List[Dict[str, Any]] = []
self._trace_start_time: Optional[float] = None
self._trace_id: Optional[str] = None
self._trace_dir: Optional[Path] = None
self._screenshot_count = 0
@property
def is_tracing(self) -> bool:
"""Check if tracing is currently active."""
return self._is_tracing
async def start(self, config: Optional[Dict[str, Any]] = None) -> None:
"""
Start tracing with the specified configuration.
Args:
config: Tracing configuration dict with options:
- video: bool - Record video frames (default: False)
- screenshots: bool - Record screenshots (default: True)
- api_calls: bool - Record API calls and results (default: True)
- accessibility_tree: bool - Record accessibility tree snapshots (default: False)
- metadata: bool - Record custom metadata (default: True)
- name: str - Custom trace name (default: auto-generated)
- path: str - Custom trace directory path (default: auto-generated)
"""
if self._is_tracing:
raise RuntimeError("Tracing is already active. Call stop() first.")
# Set default configuration
default_config = {
"video": False,
"screenshots": True,
"api_calls": True,
"accessibility_tree": False,
"metadata": True,
"name": None,
"path": None,
}
self._trace_config = {**default_config, **(config or {})}
# Generate trace ID and directory
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
self._trace_id = (
self._trace_config.get("name") or f"trace_{timestamp}_{str(uuid.uuid4())[:8]}"
)
if self._trace_config.get("path"):
self._trace_dir = Path(self._trace_config["path"])
else:
self._trace_dir = Path.cwd() / "traces" / self._trace_id
# Create trace directory
self._trace_dir.mkdir(parents=True, exist_ok=True)
# Initialize trace data
self._trace_data = []
self._trace_start_time = time.time()
self._screenshot_count = 0
self._is_tracing = True
# Record initial metadata
await self._record_event(
"trace_start",
{
"trace_id": self._trace_id,
"config": self._trace_config,
"timestamp": self._trace_start_time,
"computer_info": {
"os_type": self._computer.os_type,
"provider_type": str(self._computer.provider_type),
"image": self._computer.image,
},
},
)
# Take initial screenshot if enabled
if self._trace_config.get("screenshots"):
await self._take_screenshot("initial_screenshot")
async def stop(self, options: Optional[Dict[str, Any]] = None) -> str:
"""
Stop tracing and save the trace data.
Args:
options: Stop options dict with:
- path: str - Custom output path for the trace archive
- format: str - Output format ('zip' or 'dir', default: 'zip')
Returns:
str: Path to the saved trace file or directory
"""
if not self._is_tracing:
raise RuntimeError("Tracing is not active. Call start() first.")
if self._trace_start_time is None or self._trace_dir is None or self._trace_id is None:
raise RuntimeError("Tracing state is invalid.")
# Record final metadata
await self._record_event(
"trace_end",
{
"timestamp": time.time(),
"duration": time.time() - self._trace_start_time,
"total_events": len(self._trace_data),
"screenshot_count": self._screenshot_count,
},
)
# Take final screenshot if enabled
if self._trace_config.get("screenshots"):
await self._take_screenshot("final_screenshot")
# Save trace metadata
metadata_path = self._trace_dir / "trace_metadata.json"
with open(metadata_path, "w") as f:
json.dump(
{
"trace_id": self._trace_id,
"config": self._trace_config,
"start_time": self._trace_start_time,
"end_time": time.time(),
"duration": time.time() - self._trace_start_time,
"total_events": len(self._trace_data),
"screenshot_count": self._screenshot_count,
"events": self._trace_data,
},
f,
indent=2,
default=str,
)
# Determine output format and path
output_format = options.get("format", "zip") if options else "zip"
custom_path = options.get("path") if options else None
if output_format == "zip":
# Create zip file
if custom_path:
zip_path = Path(custom_path)
else:
zip_path = self._trace_dir.parent / f"{self._trace_id}.zip"
await self._create_zip_archive(zip_path)
output_path = str(zip_path)
else:
# Return directory path
if custom_path:
# Move directory to custom path
custom_dir = Path(custom_path)
if custom_dir.exists():
import shutil
shutil.rmtree(custom_dir)
self._trace_dir.rename(custom_dir)
output_path = str(custom_dir)
else:
output_path = str(self._trace_dir)
# Reset tracing state
self._is_tracing = False
self._trace_config = {}
self._trace_data = []
self._trace_start_time = None
self._trace_id = None
self._screenshot_count = 0
return output_path
async def _record_event(self, event_type: str, data: Dict[str, Any]) -> None:
"""
Record a trace event.
Args:
event_type: Type of event (e.g., 'click', 'type', 'screenshot')
data: Event data
"""
if not self._is_tracing or self._trace_start_time is None or self._trace_dir is None:
return
event = {
"type": event_type,
"timestamp": time.time(),
"relative_time": time.time() - self._trace_start_time,
"data": data,
}
self._trace_data.append(event)
# Save event to individual file for large traces
event_file = self._trace_dir / f"event_{len(self._trace_data):06d}_{event_type}.json"
with open(event_file, "w") as f:
json.dump(event, f, indent=2, default=str)
async def _take_screenshot(self, name: str = "screenshot") -> Optional[str]:
"""
Take a screenshot and save it to the trace.
Args:
name: Name for the screenshot
Returns:
Optional[str]: Path to the saved screenshot, or None if screenshots disabled
"""
if (
not self._trace_config.get("screenshots")
or not self._computer.interface
or self._trace_dir is None
):
return None
try:
screenshot_bytes = await self._computer.interface.screenshot()
self._screenshot_count += 1
screenshot_filename = f"{self._screenshot_count:06d}_{name}.png"
screenshot_path = self._trace_dir / screenshot_filename
with open(screenshot_path, "wb") as f:
f.write(screenshot_bytes)
return str(screenshot_path)
except Exception as e:
# Log error but don't fail the trace
if hasattr(self._computer, "logger"):
self._computer.logger.warning(f"Failed to take screenshot: {e}")
return None
async def _create_zip_archive(self, zip_path: Path) -> None:
"""
Create a zip archive of the trace directory.
Args:
zip_path: Path where to save the zip file
"""
if self._trace_dir is None:
raise RuntimeError("Trace directory is not set")
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
for file_path in self._trace_dir.rglob("*"):
if file_path.is_file():
arcname = file_path.relative_to(self._trace_dir)
zipf.write(file_path, arcname)
async def record_api_call(
    self,
    method: str,
    args: Dict[str, Any],
    result: Any = None,
    error: Optional[Exception] = None,
) -> None:
    """
    Record an API call event, optionally capturing a screenshot and an
    accessibility tree after UI-mutating actions.

    Args:
        method: The method name that was called
        args: Arguments passed to the method
        result: Result returned by the method
        error: Exception raised by the method, if any
    """
    if not self._trace_config.get("api_calls"):
        return

    # Actions that visibly change the screen; worth snapshotting afterwards.
    ui_actions = {
        "left_click",
        "right_click",
        "double_click",
        "type_text",
        "press_key",
        "hotkey",
    }

    screenshot_path = None
    if method in ui_actions:
        if self._trace_config.get("screenshots"):
            screenshot_path = await self._take_screenshot(f"after_{method}")
        if self._trace_config.get("accessibility_tree"):
            await self.record_accessibility_tree()

    await self._record_event(
        "api_call",
        {
            "method": method,
            "args": args,
            "result": str(result) if result is not None else None,
            "error": str(error) if error else None,
            "screenshot": screenshot_path,
            "success": error is None,
        },
    )
async def record_accessibility_tree(self) -> None:
    """Record the current accessibility tree if that capture is enabled."""
    if not self._trace_config.get("accessibility_tree") or not self._computer.interface:
        return
    try:
        tree = await self._computer.interface.get_accessibility_tree()
        await self._record_event("accessibility_tree", {"tree": tree})
    except Exception as e:
        # Capture failures are logged (when a logger exists) but never raised.
        if hasattr(self._computer, "logger"):
            self._computer.logger.warning(f"Failed to record accessibility tree: {e}")
async def add_metadata(self, key: str, value: Any) -> None:
    """
    Attach a custom key/value metadata entry to the active trace.

    Args:
        key: Metadata key
        value: Metadata value
    """
    # Respect the per-trace toggle for metadata collection.
    if self._trace_config.get("metadata"):
        await self._record_event("metadata", {"key": key, "value": value})

View File

@@ -0,0 +1,334 @@
"""
Tracing wrapper for computer interface that records API calls.
"""
from typing import Any, Dict, List, Optional, Tuple
from .interface.base import BaseComputerInterface
class TracingInterfaceWrapper:
    """
    Wrapper class that intercepts computer interface calls and records them for tracing.

    Every overridden method forwards to the wrapped interface and then — on
    success or failure alike — reports the call (sanitized args, optional
    sanitized result, and any exception) to the tracing instance. Attributes
    not overridden here fall through to the original interface.
    """

    def __init__(self, original_interface: "BaseComputerInterface", tracing_instance):
        """
        Initialize the tracing wrapper.

        Args:
            original_interface: The original computer interface
            tracing_instance: The ComputerTracing instance
        """
        self._original_interface = original_interface
        self._tracing = tracing_instance

    def __getattr__(self, name):
        """
        Delegate attribute access to the original interface if not found in wrapper.
        """
        return getattr(self._original_interface, name)

    async def _record_call(
        self,
        method_name: str,
        args: Dict[str, Any],
        result: Any = None,
        error: Optional[Exception] = None,
    ):
        """
        Record an API call for tracing.

        Args:
            method_name: Name of the method called
            args: Arguments passed to the method
            result: Result returned by the method
            error: Exception raised, if any
        """
        if self._tracing.is_tracing:
            await self._tracing.record_api_call(method_name, args, result, error)

    async def _traced(self, method_name: str, args: Dict[str, Any], call, record_result=None):
        """
        Await ``call()`` and record the invocation even if it raises.

        Centralizes the try/except/finally pattern shared by every traced method.

        Args:
            method_name: Interface method name being traced.
            args: Sanitized argument mapping stored in the trace.
            call: Zero-argument callable returning the awaitable real call.
            record_result: Optional callable mapping the raw result to the value
                stored in the trace; when omitted, no result is recorded.

        Returns:
            Whatever the underlying interface call returns.
        """
        result = None
        error: Optional[Exception] = None
        try:
            result = await call()
            return result
        except Exception as e:
            error = e
            raise
        finally:
            recorded = record_result(result) if record_result else None
            await self._record_call(method_name, args, recorded, error)

    # Mouse Actions
    async def left_click(
        self, x: Optional[int] = None, y: Optional[int] = None, delay: Optional[float] = None
    ) -> None:
        """Perform a left mouse button click."""
        return await self._traced(
            "left_click",
            {"x": x, "y": y, "delay": delay},
            lambda: self._original_interface.left_click(x, y, delay),
        )

    async def right_click(
        self, x: Optional[int] = None, y: Optional[int] = None, delay: Optional[float] = None
    ) -> None:
        """Perform a right mouse button click."""
        return await self._traced(
            "right_click",
            {"x": x, "y": y, "delay": delay},
            lambda: self._original_interface.right_click(x, y, delay),
        )

    async def double_click(
        self, x: Optional[int] = None, y: Optional[int] = None, delay: Optional[float] = None
    ) -> None:
        """Perform a double left mouse button click."""
        return await self._traced(
            "double_click",
            {"x": x, "y": y, "delay": delay},
            lambda: self._original_interface.double_click(x, y, delay),
        )

    async def move_cursor(self, x: int, y: int, delay: Optional[float] = None) -> None:
        """Move the cursor to the specified screen coordinates."""
        return await self._traced(
            "move_cursor",
            {"x": x, "y": y, "delay": delay},
            lambda: self._original_interface.move_cursor(x, y, delay),
        )

    async def drag_to(
        self,
        x: int,
        y: int,
        button: str = "left",
        duration: float = 0.5,
        delay: Optional[float] = None,
    ) -> None:
        """Drag from current position to specified coordinates."""
        return await self._traced(
            "drag_to",
            {"x": x, "y": y, "button": button, "duration": duration, "delay": delay},
            lambda: self._original_interface.drag_to(x, y, button, duration, delay),
        )

    async def drag(
        self,
        path: List[Tuple[int, int]],
        button: str = "left",
        duration: float = 0.5,
        delay: Optional[float] = None,
    ) -> None:
        """Drag the cursor along a path of coordinates."""
        return await self._traced(
            "drag",
            {"path": path, "button": button, "duration": duration, "delay": delay},
            lambda: self._original_interface.drag(path, button, duration, delay),
        )

    # Keyboard Actions
    async def key_down(self, key: str, delay: Optional[float] = None) -> None:
        """Press and hold a key."""
        return await self._traced(
            "key_down",
            {"key": key, "delay": delay},
            lambda: self._original_interface.key_down(key, delay),
        )

    async def key_up(self, key: str, delay: Optional[float] = None) -> None:
        """Release a previously pressed key."""
        return await self._traced(
            "key_up",
            {"key": key, "delay": delay},
            lambda: self._original_interface.key_up(key, delay),
        )

    async def type_text(self, text: str, delay: Optional[float] = None) -> None:
        """Type the specified text string."""
        return await self._traced(
            "type_text",
            {"text": text, "delay": delay},
            lambda: self._original_interface.type_text(text, delay),
        )

    async def press_key(self, key: str, delay: Optional[float] = None) -> None:
        """Press and release a single key."""
        return await self._traced(
            "press_key",
            {"key": key, "delay": delay},
            lambda: self._original_interface.press_key(key, delay),
        )

    async def hotkey(self, *keys: str, delay: Optional[float] = None) -> None:
        """Press multiple keys simultaneously (keyboard shortcut)."""
        return await self._traced(
            "hotkey",
            {"keys": keys, "delay": delay},
            lambda: self._original_interface.hotkey(*keys, delay=delay),
        )

    # Scrolling Actions
    async def scroll(self, x: int, y: int, delay: Optional[float] = None) -> None:
        """Scroll the mouse wheel by specified amounts."""
        return await self._traced(
            "scroll",
            {"x": x, "y": y, "delay": delay},
            lambda: self._original_interface.scroll(x, y, delay),
        )

    async def scroll_down(self, clicks: int = 1, delay: Optional[float] = None) -> None:
        """Scroll down by the specified number of clicks."""
        return await self._traced(
            "scroll_down",
            {"clicks": clicks, "delay": delay},
            lambda: self._original_interface.scroll_down(clicks, delay),
        )

    async def scroll_up(self, clicks: int = 1, delay: Optional[float] = None) -> None:
        """Scroll up by the specified number of clicks."""
        return await self._traced(
            "scroll_up",
            {"clicks": clicks, "delay": delay},
            lambda: self._original_interface.scroll_up(clicks, delay),
        )

    # Screen Actions
    async def screenshot(self) -> bytes:
        """Take a screenshot."""
        # For screenshots, we don't want to include the raw bytes in the trace;
        # only a marker string is recorded.
        return await self._traced(
            "screenshot",
            {},
            lambda: self._original_interface.screenshot(),
            record_result=lambda r: "screenshot_taken" if r else None,
        )

    async def get_screen_size(self) -> Dict[str, int]:
        """Get the screen dimensions."""
        return await self._traced(
            "get_screen_size",
            {},
            lambda: self._original_interface.get_screen_size(),
            record_result=lambda r: r,
        )

    async def get_cursor_position(self) -> Dict[str, int]:
        """Get the current cursor position on screen."""
        return await self._traced(
            "get_cursor_position",
            {},
            lambda: self._original_interface.get_cursor_position(),
            record_result=lambda r: r,
        )

    # Clipboard Actions
    async def copy_to_clipboard(self) -> str:
        """Get the current clipboard content."""
        # Don't include clipboard content in trace for privacy — record length only.
        return await self._traced(
            "copy_to_clipboard",
            {},
            lambda: self._original_interface.copy_to_clipboard(),
            record_result=lambda r: f"content_length_{len(r)}" if r else None,
        )

    async def set_clipboard(self, text: str) -> None:
        """Set the clipboard content to the specified text."""
        # Don't include clipboard content in trace for privacy — record length only.
        return await self._traced(
            "set_clipboard",
            {"text_length": len(text)},
            lambda: self._original_interface.set_clipboard(text),
        )

View File

@@ -1207,7 +1207,7 @@ def create_gradio_ui():
label="Container Name",
placeholder="Enter your container name",
visible=False,
info="Get your container from [trycua.com](https://trycua.com/)",
info="Get your container from [cua.ai](https://cua.ai/)",
)
# Check if CUA_API_KEY is set in environment

View File

@@ -1,7 +1,10 @@
import base64
import io
import os
import shlex
from typing import Any, Dict, Optional, Tuple
import mslex
from PIL import Image, ImageDraw
@@ -104,3 +107,25 @@ def parse_vm_info(vm_info: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Parse VM info from pylume response."""
if not vm_info:
return None
def safe_join(argv: list[str]) -> str:
    """
    Return a platform-correct string that safely quotes `argv` for shell execution.

    - On POSIX: uses `shlex.join`.
    - On Windows: uses `mslex.join`.

    Args:
        argv: iterable of argument strings (non-string items are coerced to str).

    Returns:
        A safely quoted command-line string appropriate for the current platform that protects against
        shell injection vulnerabilities.
    """
    # Coerce every element to str so callers may pass ints, Paths, etc.,
    # as the docstring promises.
    parts = [str(arg) for arg in argv]
    if os.name == "nt":
        # On Windows, use mslex for proper cmd.exe-style quoting
        return mslex.join(parts)
    # On POSIX systems, shlex provides POSIX-shell quoting
    return shlex.join(parts)

View File

@@ -4,7 +4,7 @@ build-backend = "pdm.backend"
[project]
name = "cua-computer"
version = "0.4.10"
version = "0.4.11"
description = "Computer-Use Interface (CUI) framework powering Cua"
readme = "README.md"
authors = [
@@ -16,7 +16,8 @@ dependencies = [
"websockets>=12.0",
"aiohttp>=3.9.0",
"cua-core>=0.1.0,<0.2.0",
"pydantic>=2.11.1"
"pydantic>=2.11.1",
"mslex>=1.3.0",
]
requires-python = ">=3.12"
@@ -47,4 +48,4 @@ source-includes = ["tests/", "README.md", "LICENSE"]
[tool.pytest.ini_options]
asyncio_mode = "auto"
testpaths = ["tests"]
python_files = "test_*.py"
python_files = "test_*.py"

View File

@@ -0,0 +1,69 @@
"""Pytest configuration and shared fixtures for computer package tests.
This file contains shared fixtures and configuration for all computer tests.
Following SRP: This file ONLY handles test setup/teardown.
"""
from unittest.mock import AsyncMock, MagicMock, Mock, patch
import pytest
@pytest.fixture
def mock_interface():
    """Mock computer interface for testing.

    Use this fixture to test Computer logic without real OS calls.
    """
    iface = AsyncMock()
    # Calls that produce data get pre-seeded return values.
    iface.screenshot = AsyncMock(return_value=b"fake_screenshot")
    iface.get_screen_size = AsyncMock(return_value=(1920, 1080))
    # Pure-action methods just need awaitable no-ops.
    for action in (
        "left_click",
        "right_click",
        "middle_click",
        "double_click",
        "type",
        "key",
        "move_mouse",
        "scroll",
    ):
        setattr(iface, action, AsyncMock())
    return iface
@pytest.fixture
def mock_cloud_provider():
    """Mock cloud provider for testing.

    Use this fixture to test cloud provider logic without real API calls.
    """
    provider = AsyncMock()
    provider.configure_mock(
        start=AsyncMock(),
        stop=AsyncMock(),
        get_status=AsyncMock(return_value="running"),
        execute_command=AsyncMock(return_value="command output"),
    )
    return provider
@pytest.fixture
def mock_local_provider():
    """Mock local provider for testing.

    Use this fixture to test local provider logic without real VM operations.
    """
    provider = AsyncMock()
    provider.configure_mock(
        start=AsyncMock(),
        stop=AsyncMock(),
        get_status=AsyncMock(return_value="running"),
        execute_command=AsyncMock(return_value="command output"),
    )
    return provider
@pytest.fixture
def disable_telemetry(monkeypatch):
    """Disable telemetry for tests.

    Use this fixture to ensure no telemetry is sent during tests.

    NOTE(review): the core telemetry tests toggle CUA_TELEMETRY_ENABLED /
    CUA_TELEMETRY instead of this variable — confirm the telemetry module
    actually honors CUA_TELEMETRY_DISABLED.
    """
    monkeypatch.setenv("CUA_TELEMETRY_DISABLED", "1")

View File

@@ -0,0 +1,67 @@
"""Unit tests for Computer class.
This file tests ONLY the Computer class initialization and context manager.
Following SRP: This file tests ONE class (Computer).
All external dependencies (providers, interfaces) are mocked.
"""
from unittest.mock import AsyncMock, MagicMock, Mock, patch
import pytest
class TestComputerImport:
    """Test Computer module imports (SRP: Only tests imports)."""

    def test_computer_class_exists(self):
        """Test that Computer class can be imported."""
        from computer import Computer as computer_cls

        assert computer_cls is not None

    def test_vm_provider_type_exists(self):
        """Test that VMProviderType enum can be imported."""
        from computer import VMProviderType as provider_enum

        assert provider_enum is not None
class TestComputerInitialization:
    """Test Computer initialization (SRP: Only tests initialization)."""

    def test_computer_class_can_be_imported(self, disable_telemetry):
        """Test that Computer class can be imported without errors."""
        from computer import Computer

        assert Computer is not None

    def test_computer_has_required_methods(self, disable_telemetry):
        """Test that Computer exposes the async context-manager hooks."""
        from computer import Computer

        for required in ("__aenter__", "__aexit__"):
            assert hasattr(Computer, required)
class TestComputerContextManager:
    """Test Computer context manager protocol (SRP: Only tests context manager)."""

    def test_computer_is_async_context_manager(self, disable_telemetry):
        """Both async context-manager hooks must exist and be callable."""
        from computer import Computer

        for name in ("__aenter__", "__aexit__"):
            hook = getattr(Computer, name, None)
            assert hook is not None
            assert callable(hook)
class TestComputerInterface:
    """Test Computer.interface property (SRP: Only tests interface access)."""

    def test_computer_class_structure(self, disable_telemetry):
        """Computer must itself be a class, not a module-level instance."""
        from computer import Computer

        assert isinstance(Computer, type)

View File

@@ -0,0 +1,43 @@
"""Pytest configuration and shared fixtures for core package tests.
This file contains shared fixtures and configuration for all core tests.
Following SRP: This file ONLY handles test setup/teardown.
"""
from unittest.mock import AsyncMock, Mock, patch
import pytest
@pytest.fixture
def mock_httpx_client():
    """Mock httpx.AsyncClient for API calls.

    Use this fixture to avoid making real HTTP requests during tests.
    """
    with patch("httpx.AsyncClient") as client_cls:
        # The instance returned by `async with httpx.AsyncClient() as c:` ...
        instance = AsyncMock()
        client_cls.return_value.__aenter__.return_value = instance
        yield instance
@pytest.fixture
def mock_posthog():
    """Mock PostHog client for telemetry tests.

    Use this fixture to avoid sending real telemetry during tests.
    """
    with patch("posthog.Posthog") as posthog_cls:
        instance = Mock()
        posthog_cls.return_value = instance
        yield instance
@pytest.fixture
def disable_telemetry(monkeypatch):
    """Disable telemetry for tests that don't need it.

    Use this fixture to ensure telemetry is disabled during tests.

    NOTE(review): the telemetry unit tests in this package use
    CUA_TELEMETRY_ENABLED / CUA_TELEMETRY — confirm CUA_TELEMETRY_DISABLED
    is also honored by the telemetry module.
    """
    monkeypatch.setenv("CUA_TELEMETRY_DISABLED", "1")
    # Yield so the monkeypatched env var stays active for the test body.
    yield

View File

@@ -0,0 +1,255 @@
"""Unit tests for core telemetry functionality.
This file tests ONLY telemetry logic, following SRP.
All external dependencies (PostHog, file system) are mocked.
"""
import os
from pathlib import Path
from unittest.mock import MagicMock, Mock, mock_open, patch
import pytest
class TestTelemetryEnabled:
    """Test telemetry enable/disable logic (SRP: Only tests enable/disable)."""

    @staticmethod
    def _enabled():
        """Import lazily so monkeypatched env vars are in place first."""
        from core.telemetry import is_telemetry_enabled

        return is_telemetry_enabled()

    def test_telemetry_enabled_by_default(self, monkeypatch):
        """Telemetry defaults to on when no env vars are set."""
        monkeypatch.delenv("CUA_TELEMETRY", raising=False)
        monkeypatch.delenv("CUA_TELEMETRY_ENABLED", raising=False)
        assert self._enabled() is True

    def test_telemetry_disabled_with_legacy_flag(self, monkeypatch):
        """Legacy CUA_TELEMETRY=off disables telemetry."""
        monkeypatch.setenv("CUA_TELEMETRY", "off")
        assert self._enabled() is False

    def test_telemetry_disabled_with_new_flag(self, monkeypatch):
        """CUA_TELEMETRY_ENABLED=false disables telemetry."""
        monkeypatch.setenv("CUA_TELEMETRY_ENABLED", "false")
        assert self._enabled() is False

    @pytest.mark.parametrize("value", ["0", "false", "no", "off"])
    def test_telemetry_disabled_with_various_values(self, monkeypatch, value):
        """All conventional falsey strings disable telemetry."""
        monkeypatch.setenv("CUA_TELEMETRY_ENABLED", value)
        assert self._enabled() is False

    @pytest.mark.parametrize("value", ["1", "true", "yes", "on"])
    def test_telemetry_enabled_with_various_values(self, monkeypatch, value):
        """All conventional truthy strings keep telemetry enabled."""
        monkeypatch.setenv("CUA_TELEMETRY_ENABLED", value)
        assert self._enabled() is True
class TestPostHogTelemetryClient:
    """Test PostHogTelemetryClient class (SRP: Only tests client logic)."""

    @staticmethod
    def _mock_default_storage(mock_path):
        """Wire the mocked Path so the storage dir appears not to exist yet.

        Shared by every test that does not need a specific installation-id
        file; returns the storage-dir mock for further customization.
        """
        mock_storage_dir = MagicMock()
        mock_storage_dir.exists.return_value = False
        mock_path.return_value.parent.parent = MagicMock()
        mock_path.return_value.parent.parent.__truediv__.return_value = mock_storage_dir
        return mock_storage_dir

    @staticmethod
    def _mock_id_file_storage(mock_path, mock_id_file):
        """Wire the mocked Path so `storage_dir / <id file>` yields mock_id_file."""
        mock_storage_dir = MagicMock()
        mock_storage_dir.__truediv__.return_value = mock_id_file
        mock_core_dir = MagicMock()
        mock_core_dir.__truediv__.return_value = mock_storage_dir
        mock_path.return_value.parent.parent = mock_core_dir

    @patch("core.telemetry.posthog.posthog")
    @patch("core.telemetry.posthog.Path")
    def test_client_initialization(self, mock_path, mock_posthog, disable_telemetry):
        """Test that client initializes correctly."""
        from core.telemetry.posthog import PostHogTelemetryClient

        self._mock_default_storage(mock_path)
        # Reset singleton so each test constructs a fresh client.
        PostHogTelemetryClient.destroy_client()
        client = PostHogTelemetryClient()
        assert client is not None
        assert hasattr(client, "installation_id")
        assert hasattr(client, "initialized")
        assert hasattr(client, "queued_events")

    @patch("core.telemetry.posthog.posthog")
    @patch("core.telemetry.posthog.Path")
    def test_installation_id_generation(self, mock_path, mock_posthog, disable_telemetry):
        """Test that installation ID is generated if not exists."""
        from core.telemetry.posthog import PostHogTelemetryClient

        mock_id_file = MagicMock()
        mock_id_file.exists.return_value = False
        self._mock_id_file_storage(mock_path, mock_id_file)
        PostHogTelemetryClient.destroy_client()
        client = PostHogTelemetryClient()
        # Should have generated a new UUID
        assert client.installation_id is not None
        assert len(client.installation_id) == 36  # UUID format

    @patch("core.telemetry.posthog.posthog")
    @patch("core.telemetry.posthog.Path")
    def test_installation_id_persistence(self, mock_path, mock_posthog, disable_telemetry):
        """Test that installation ID is read from file if exists."""
        from core.telemetry.posthog import PostHogTelemetryClient

        existing_id = "test-installation-id-123"
        mock_id_file = MagicMock()
        mock_id_file.exists.return_value = True
        mock_id_file.read_text.return_value = existing_id
        self._mock_id_file_storage(mock_path, mock_id_file)
        PostHogTelemetryClient.destroy_client()
        client = PostHogTelemetryClient()
        assert client.installation_id == existing_id

    @patch("core.telemetry.posthog.posthog")
    @patch("core.telemetry.posthog.Path")
    def test_record_event_when_disabled(self, mock_path, mock_posthog, monkeypatch):
        """Test that events are not recorded when telemetry is disabled."""
        from core.telemetry.posthog import PostHogTelemetryClient

        # Disable telemetry explicitly using the correct environment variable
        monkeypatch.setenv("CUA_TELEMETRY_ENABLED", "false")
        self._mock_default_storage(mock_path)
        PostHogTelemetryClient.destroy_client()
        client = PostHogTelemetryClient()
        client.record_event("test_event", {"key": "value"})
        # PostHog capture should not be called at all when telemetry is disabled
        mock_posthog.capture.assert_not_called()

    @patch("core.telemetry.posthog.posthog")
    @patch("core.telemetry.posthog.Path")
    def test_record_event_when_enabled(self, mock_path, mock_posthog, monkeypatch):
        """Test that events are recorded when telemetry is enabled."""
        from core.telemetry.posthog import PostHogTelemetryClient

        monkeypatch.setenv("CUA_TELEMETRY_ENABLED", "true")
        self._mock_default_storage(mock_path)
        PostHogTelemetryClient.destroy_client()
        client = PostHogTelemetryClient()
        client.initialized = True  # Pretend it's initialized
        client.record_event("test_event", {"key": "value"})
        # PostHog capture should be called
        assert mock_posthog.capture.call_count >= 1

    @patch("core.telemetry.posthog.posthog")
    @patch("core.telemetry.posthog.Path")
    def test_singleton_pattern(self, mock_path, mock_posthog, disable_telemetry):
        """Test that get_client returns the same instance."""
        from core.telemetry.posthog import PostHogTelemetryClient

        self._mock_default_storage(mock_path)
        PostHogTelemetryClient.destroy_client()
        client1 = PostHogTelemetryClient.get_client()
        client2 = PostHogTelemetryClient.get_client()
        assert client1 is client2
class TestRecordEvent:
    """Test the public record_event function (SRP: Only tests public API)."""

    @patch("core.telemetry.posthog.PostHogTelemetryClient")
    def test_record_event_calls_client(self, mock_client_class, disable_telemetry):
        """record_event forwards name and properties to the singleton client."""
        from core.telemetry import record_event

        client = Mock()
        mock_client_class.get_client.return_value = client
        record_event("test_event", {"key": "value"})
        client.record_event.assert_called_once_with("test_event", {"key": "value"})

    @patch("core.telemetry.posthog.PostHogTelemetryClient")
    def test_record_event_without_properties(self, mock_client_class, disable_telemetry):
        """record_event defaults missing properties to an empty dict."""
        from core.telemetry import record_event

        client = Mock()
        mock_client_class.get_client.return_value = client
        record_event("test_event")
        client.record_event.assert_called_once_with("test_event", {})
class TestDestroyTelemetryClient:
    """Test client destruction (SRP: Only tests cleanup)."""

    @patch("core.telemetry.posthog.PostHogTelemetryClient")
    def test_destroy_client_calls_class_method(self, mock_client_class):
        """destroy_telemetry_client delegates to the class-level destructor."""
        from core.telemetry import destroy_telemetry_client

        destroy_telemetry_client()
        mock_client_class.destroy_client.assert_called_once()

View File

@@ -129,12 +129,12 @@ See [desktop-extension/README.md](desktop-extension/README.md) for more details.
## Documentation
- Installation: https://trycua.com/docs/libraries/mcp-server/installation
- Configuration: https://trycua.com/docs/libraries/mcp-server/configuration
- Usage: https://trycua.com/docs/libraries/mcp-server/usage
- Tools: https://trycua.com/docs/libraries/mcp-server/tools
- Client Integrations: https://trycua.com/docs/libraries/mcp-server/client-integrations
- LLM Integrations: https://trycua.com/docs/libraries/mcp-server/llm-integrations
- Installation: https://cua.ai/docs/libraries/mcp-server/installation
- Configuration: https://cua.ai/docs/libraries/mcp-server/configuration
- Usage: https://cua.ai/docs/libraries/mcp-server/usage
- Tools: https://cua.ai/docs/libraries/mcp-server/tools
- Client Integrations: https://cua.ai/docs/libraries/mcp-server/client-integrations
- LLM Integrations: https://cua.ai/docs/libraries/mcp-server/llm-integrations
## Troubleshooting

View File

@@ -0,0 +1,51 @@
"""Pytest configuration and shared fixtures for mcp-server package tests.
This file contains shared fixtures and configuration for all mcp-server tests.
Following SRP: This file ONLY handles test setup/teardown.
"""
from unittest.mock import AsyncMock, Mock, patch
import pytest
@pytest.fixture
def mock_mcp_context():
    """Mock MCP context for testing.

    Use this fixture to test MCP server logic without real MCP connections.
    """
    ctx = AsyncMock()
    ctx.request_context = AsyncMock()
    # The session itself is synchronous; only its notifier is awaited.
    session = Mock()
    session.send_resource_updated = AsyncMock()
    ctx.session = session
    return ctx
@pytest.fixture
def mock_computer():
    """Mock Computer instance for MCP server tests.

    Use this fixture to test MCP logic without real Computer operations.
    """
    computer = AsyncMock()
    interface = AsyncMock()
    interface.screenshot = AsyncMock(return_value=b"fake_screenshot")
    interface.left_click = AsyncMock()
    interface.type = AsyncMock()
    computer.interface = interface
    # Support `async with computer:` by yielding the mock itself.
    computer.__aenter__ = AsyncMock(return_value=computer)
    computer.__aexit__ = AsyncMock()
    return computer
@pytest.fixture
def disable_telemetry(monkeypatch):
    """Disable telemetry for tests.

    Use this fixture to ensure no telemetry is sent during tests.

    NOTE(review): other packages' telemetry tests toggle CUA_TELEMETRY_ENABLED —
    confirm the telemetry module honors CUA_TELEMETRY_DISABLED as well.
    """
    monkeypatch.setenv("CUA_TELEMETRY_DISABLED", "1")

View File

@@ -0,0 +1,44 @@
"""Unit tests for mcp-server package.
This file tests ONLY basic MCP server functionality.
Following SRP: This file tests MCP server initialization.
All external dependencies are mocked.
"""
from unittest.mock import AsyncMock, Mock, patch
import pytest
class TestMCPServerImports:
    """Test MCP server module imports (SRP: Only tests imports)."""

    def test_mcp_server_module_exists(self):
        """Test that mcp_server module can be imported."""
        try:
            import mcp_server
        except ImportError:
            pytest.skip("mcp_server module not installed")
        except SystemExit:
            pytest.skip("MCP dependencies (mcp.server.fastmcp) not available")
        else:
            assert mcp_server is not None
class TestMCPServerInitialization:
    """Test MCP server initialization (SRP: Only tests initialization)."""

    @pytest.mark.asyncio
    async def test_mcp_server_can_be_imported(self):
        """Basic smoke test: verify MCP server components can be imported."""
        try:
            from mcp_server import server
        except ImportError:
            pytest.skip("MCP server module not available")
        except SystemExit:
            pytest.skip("MCP dependencies (mcp.server.fastmcp) not available")
        except Exception as e:
            # Some initialization errors are acceptable in unit tests
            pytest.skip(f"MCP server initialization requires specific setup: {e}")
        else:
            assert server is not None

View File

@@ -1,10 +0,0 @@
[bumpversion]
current_version = 0.2.1
commit = True
tag = True
tag_name = pylume-v{new_version}
message = Bump pylume to v{new_version}
[bumpversion:file:pylume/__init__.py]
search = __version__ = "{current_version}"
replace = __version__ = "{new_version}"

View File

@@ -1,46 +0,0 @@
<div align="center">
<h1>
<div class="image-wrapper" style="display: inline-block;">
<picture>
<source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="https://raw.githubusercontent.com/trycua/cua/main/img/logo_white.png" style="display: block; margin: auto;">
<source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="https://raw.githubusercontent.com/trycua/cua/main/img/logo_black.png" style="display: block; margin: auto;">
<img alt="pylume logo">
</picture>
</div>
[![Python](https://img.shields.io/badge/Python-333333?logo=python&logoColor=white&labelColor=333333)](#)
[![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
[![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
[![PyPI](https://img.shields.io/pypi/v/pylume?color=333333)](https://pypi.org/project/pylume/)
</h1>
</div>
**pylume** is a lightweight Python library based on [lume](https://github.com/trycua/lume) to create, run and manage macOS and Linux virtual machines (VMs) natively on Apple Silicon.
```bash
pip install pylume
```
## Usage
Please refer to this [Notebook](./samples/nb.ipynb) for a quickstart. More details about the underlying API used by pylume are available [here](https://github.com/trycua/lume/docs/API-Reference.md).
## Prebuilt Images
Pre-built images are available on [ghcr.io/trycua](https://github.com/orgs/trycua/packages).
These images come pre-configured with an SSH server and auto-login enabled.
## Contributing
We welcome and greatly appreciate contributions to pylume! Whether you're improving documentation, adding new features, fixing bugs, or adding new VM images, your efforts help make pylume better for everyone.
Join our [Discord community](https://discord.com/invite/mVnXXpdE85) to discuss ideas or get assistance.
## License
pylume is open-sourced under the MIT License - see the [LICENSE](LICENSE) file for details.
## Stargazers over time
[![Stargazers over time](https://starchart.cc/trycua/pylume.svg?variant=adaptive)](https://starchart.cc/trycua/pylume)

View File

@@ -1,9 +0,0 @@
"""
PyLume Python SDK - A client library for managing macOS VMs with PyLume.
"""
from pylume.exceptions import *
from pylume.models import *
from pylume.pylume import *
__version__ = "0.1.0"

View File

@@ -1,59 +0,0 @@
"""
PyLume Python SDK - A client library for managing macOS VMs with PyLume.
Example:
>>> from pylume import PyLume, VMConfig
>>> client = PyLume()
>>> config = VMConfig(name="my-vm", cpu=4, memory="8GB", disk_size="64GB")
>>> client.create_vm(config)
>>> client.run_vm("my-vm")
"""
# Import exceptions then all models
from .exceptions import (
LumeConfigError,
LumeConnectionError,
LumeError,
LumeImageError,
LumeNotFoundError,
LumeServerError,
LumeTimeoutError,
LumeVMError,
)
from .models import (
CloneSpec,
ImageInfo,
ImageList,
ImageRef,
SharedDirectory,
VMConfig,
VMRunOpts,
VMStatus,
VMUpdateOpts,
)
# Import main class last to avoid circular imports
from .pylume import PyLume
__version__ = "0.2.1"
__all__ = [
"PyLume",
"VMConfig",
"VMStatus",
"VMRunOpts",
"VMUpdateOpts",
"ImageRef",
"CloneSpec",
"SharedDirectory",
"ImageList",
"ImageInfo",
"LumeError",
"LumeServerError",
"LumeConnectionError",
"LumeTimeoutError",
"LumeNotFoundError",
"LumeConfigError",
"LumeVMError",
"LumeImageError",
]

View File

@@ -1,119 +0,0 @@
import asyncio
import json
import shlex
import subprocess
from typing import Any, Dict, Optional
from .exceptions import (
LumeConfigError,
LumeConnectionError,
LumeError,
LumeNotFoundError,
LumeServerError,
LumeTimeoutError,
)
class LumeClient:
    """Minimal async HTTP client for the lume server.

    Requests are issued through the ``curl`` CLI rather than an HTTP
    library, so the package carries no extra runtime dependencies.
    """

    def __init__(self, base_url: str, timeout: float = 60.0, debug: bool = False):
        """Create a client.

        Args:
            base_url: Server base URL, e.g. ``http://localhost:7777/lume``.
            timeout: Per-request timeout in seconds (curl ``-m`` flag).
            debug: When True, request details are printed to stdout.
        """
        self.base_url = base_url
        self.timeout = timeout
        self.debug = debug

    def _log_debug(self, message: str, **kwargs) -> None:
        """Log debug information if debug mode is enabled."""
        if self.debug:
            print(f"DEBUG: {message}")
            if kwargs:
                print(json.dumps(kwargs, indent=2))

    def _build_url(self, path: str, params: Optional[Dict[str, Any]] = None) -> str:
        """Join the base URL, a path, and optional query parameters.

        Query parameters are percent-encoded so values containing spaces,
        ``&`` or ``=`` cannot corrupt the request URL (previously they were
        interpolated verbatim).
        """
        import urllib.parse  # local import keeps module-level deps unchanged

        url = f"{self.base_url}{path}"
        if params:
            url = f"{url}?{urllib.parse.urlencode(params)}"
        return url

    async def _run_curl(
        self,
        method: str,
        path: str,
        data: Optional[Dict[str, Any]] = None,
        params: Optional[Dict[str, Any]] = None,
    ) -> Any:
        """Execute a curl command and return the parsed JSON response.

        Raises:
            LumeConnectionError: curl itself failed (network/DNS error).
            LumeNotFoundError: server answered 404.
            LumeConfigError: server answered 400.
            LumeServerError: server answered 5xx.
            LumeError: any other status >= 400.
            LumeTimeoutError: the request timed out.
        """
        url = self._build_url(path, params)
        # -w appends the HTTP status code to the body; -m bounds request time.
        cmd = ["curl", "-X", method, "-s", "-w", "%{http_code}", "-m", str(self.timeout)]
        if data is not None:
            cmd.extend(["-H", "Content-Type: application/json", "-d", json.dumps(data)])
        cmd.append(url)
        self._log_debug(f"Running curl command: {' '.join(map(shlex.quote, cmd))}")
        try:
            process = await asyncio.create_subprocess_exec(
                *cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            stdout, stderr = await process.communicate()
            if process.returncode != 0:
                raise LumeConnectionError(f"Curl command failed: {stderr.decode()}")
            # The last 3 characters are the status code
            response = stdout.decode()
            status_code = int(response[-3:])
            response_body = response[:-3]  # Remove status code from response
            if status_code >= 400:
                if status_code == 404:
                    raise LumeNotFoundError(f"Resource not found: {path}")
                elif status_code == 400:
                    raise LumeConfigError(f"Invalid request: {response_body}")
                elif status_code >= 500:
                    raise LumeServerError(f"Server error: {response_body}")
                else:
                    raise LumeError(f"Request failed with status {status_code}: {response_body}")
            return json.loads(response_body) if response_body.strip() else None
        except asyncio.TimeoutError:
            raise LumeTimeoutError(f"Request timed out after {self.timeout} seconds")

    async def get(self, path: str, params: Optional[Dict[str, Any]] = None) -> Any:
        """Make a GET request."""
        return await self._run_curl("GET", path, params=params)

    async def post(
        self, path: str, data: Optional[Dict[str, Any]] = None, timeout: Optional[float] = None
    ) -> Any:
        """Make a POST request, optionally overriding the timeout for this call."""
        old_timeout = self.timeout
        if timeout is not None:
            self.timeout = timeout
        try:
            return await self._run_curl("POST", path, data=data)
        finally:
            # Always restore the client-wide timeout.
            self.timeout = old_timeout

    async def patch(self, path: str, data: Dict[str, Any]) -> None:
        """Make a PATCH request."""
        await self._run_curl("PATCH", path, data=data)

    async def delete(self, path: str) -> None:
        """Make a DELETE request."""
        await self._run_curl("DELETE", path)

    def print_curl(self, method: str, path: str, data: Optional[Dict[str, Any]] = None) -> None:
        """Print equivalent curl command for debugging."""
        curl_cmd = f"""curl -X {method} \\
    '{self.base_url}{path}'"""
        if data:
            curl_cmd += f" \\\n  -H 'Content-Type: application/json' \\\n  -d '{json.dumps(data)}'"
        print("\nEquivalent curl command:")
        print(curl_cmd)
        print()

    async def close(self) -> None:
        """Close the client resources."""
        pass  # No shared resources to clean up

View File

@@ -1,54 +0,0 @@
from typing import Optional
class LumeError(Exception):
    """Root of the PyLume exception hierarchy; catch this to handle any
    library-raised failure."""
class LumeServerError(LumeError):
    """Raised when the PyLume server reports a failure.

    Attributes:
        status_code: HTTP status code, when one was received.
        response_text: Raw response body, when available.
    """

    def __init__(
        self, message: str, status_code: Optional[int] = None, response_text: Optional[str] = None
    ):
        super().__init__(message)
        self.status_code = status_code
        self.response_text = response_text
class LumeConnectionError(LumeError):
    """Raised when the client cannot reach the PyLume server."""
class LumeTimeoutError(LumeError):
    """Raised when a request to the PyLume server exceeds its time limit."""
class LumeNotFoundError(LumeError):
    """Raised when a requested resource does not exist on the server."""
class LumeConfigError(LumeError):
    """Raised when supplied configuration is invalid."""
class LumeVMError(LumeError):
    """Raised when a VM operation fails."""
class LumeImageError(LumeError):
    """Raised when an image operation fails."""

Binary file not shown.

View File

@@ -1,265 +0,0 @@
import re
from typing import Any, Dict, List, Literal, Optional
from pydantic import BaseModel, ConfigDict, Field, RootModel, computed_field, validator
class DiskInfo(BaseModel):
    """Information about disk storage allocation.
    Attributes:
        total: Total disk space in bytes
        allocated: Currently allocated disk space in bytes
    """
    # Both values are raw byte counts as returned by the lume API.
    total: int
    allocated: int
class VMConfig(BaseModel):
    """Configuration for creating a new VM.

    Note: Memory and disk sizes should be specified with units (e.g., "4GB", "64GB")

    Attributes:
        name: Name of the virtual machine
        os: Operating system type, either "macOS" or "linux"
        cpu: Number of CPU cores to allocate
        memory: Amount of memory to allocate with units
        disk_size: Size of the disk to create with units
        display: Display resolution in format "widthxheight"
        ipsw: IPSW path or 'latest' for macOS VMs, None for other OS types
    """

    name: str
    os: Literal["macOS", "linux"] = "macOS"
    cpu: int = Field(default=2, ge=1)
    memory: str = "4GB"
    disk_size: str = Field(default="64GB", alias="diskSize")  # serialized as diskSize
    display: str = "1024x768"
    ipsw: Optional[str] = Field(default=None, description="IPSW path or 'latest', for macOS VMs")

    # populate_by_name lets callers pass the snake_case field name
    # (disk_size=...) as well as the wire alias (diskSize=...).  The previous
    # `class Config: populate_by_alias = True` is not a recognized pydantic
    # option, so field-name population silently did not work.
    model_config = ConfigDict(populate_by_name=True)
class SharedDirectory(BaseModel):
    """Configuration for a shared directory.

    Attributes:
        host_path: Path to the directory on the host system
        read_only: Whether the directory should be mounted as read-only
    """

    host_path: str = Field(..., alias="hostPath")  # Allow host_path but serialize as hostPath
    read_only: bool = False

    # Accept both snake_case field names and camelCase aliases; the generator
    # mirrors the one used elsewhere in this module (VMRunOpts) so the models
    # stay consistent.  Replaces the deprecated inner `class Config`.
    model_config = ConfigDict(
        populate_by_name=True,
        alias_generator=lambda s: "".join(
            word.capitalize() if i else word for i, word in enumerate(s.split("_"))
        ),
    )
class VMRunOpts(BaseModel):
    """Configuration for running a VM.
    Args:
        no_display: Whether to not display the VNC client
        shared_directories: List of directories to share with the VM
    """
    no_display: bool = Field(default=False, alias="noDisplay")
    shared_directories: Optional[list[SharedDirectory]] = Field(
        default=None, alias="sharedDirectories"
    )
    # Accept both snake_case field names and the camelCase wire aliases.
    model_config = ConfigDict(
        populate_by_name=True,
        alias_generator=lambda s: "".join(
            word.capitalize() if i else word for i, word in enumerate(s.split("_"))
        ),
    )
    def model_dump(self, **kwargs):
        """Export model data with proper field name conversion.
        Converts shared directory fields to match API expectations when using aliases.
        Args:
            **kwargs: Keyword arguments passed to parent model_dump method
        Returns:
            dict: Model data with properly formatted field names
        """
        data = super().model_dump(**kwargs)
        # Convert shared directory fields to match API expectations
        if self.shared_directories and "by_alias" in kwargs and kwargs["by_alias"]:
            # Re-serialize the nested models by hand so their keys are
            # camelCase regardless of how pydantic dumped them above.
            data["sharedDirectories"] = [
                {"hostPath": d.host_path, "readOnly": d.read_only} for d in self.shared_directories
            ]
            # Remove the snake_case version if it exists
            data.pop("shared_directories", None)
        return data
class VMStatus(BaseModel):
    """Status information for a virtual machine.

    Attributes:
        name: Name of the virtual machine
        status: Current status of the VM
        os: Operating system type
        cpu_count: Number of CPU cores allocated
        memory_size: Amount of memory allocated in bytes
        disk_size: Disk storage information
        vnc_url: URL for VNC connection if available
        ip_address: IP address of the VM if available
    """

    name: str
    status: str
    os: Literal["macOS", "linux"]
    cpu_count: int = Field(alias="cpuCount")
    memory_size: int = Field(alias="memorySize")  # API returns memory size in bytes
    disk_size: DiskInfo = Field(alias="diskSize")
    vnc_url: Optional[str] = Field(default=None, alias="vncUrl")
    ip_address: Optional[str] = Field(default=None, alias="ipAddress")

    # Accept both the camelCase wire aliases and the snake_case field names.
    # The previous `class Config: populate_by_alias = True` is not a
    # recognized pydantic option, so it had no effect.
    model_config = ConfigDict(populate_by_name=True)

    @computed_field
    @property
    def state(self) -> str:
        """Convenience alias for ``status``."""
        return self.status

    @computed_field
    @property
    def cpu(self) -> int:
        """Convenience alias for ``cpu_count``."""
        return self.cpu_count

    @computed_field
    @property
    def memory(self) -> str:
        """Memory allocation formatted as "<n>GB" (truncated, not rounded)."""
        # Convert bytes to GB
        gb = self.memory_size / (1024 * 1024 * 1024)
        return f"{int(gb)}GB"
class VMUpdateOpts(BaseModel):
    """Options for updating VM configuration.
    Attributes:
        cpu: Number of CPU cores to update to
        memory: Amount of memory to update to with units
        disk_size: Size of disk to update to with units
    """
    cpu: Optional[int] = None
    memory: Optional[str] = None
    # NOTE(review): unlike VMConfig.disk_size this field has no "diskSize"
    # alias, so a by_alias dump emits "disk_size" — confirm the server
    # accepts that key for updates.
    disk_size: Optional[str] = None
class ImageRef(BaseModel):
    """Reference to a VM image.
    Attributes:
        image: Name of the image
        tag: Tag version of the image
        registry: Registry hostname where image is stored
        organization: Organization or namespace in the registry
    """
    image: str
    tag: str = "latest"
    registry: Optional[str] = "ghcr.io"
    organization: Optional[str] = "trycua"
    def model_dump(self, **kwargs):
        """Override model_dump to return just the image:tag format.
        NOTE(review): returning a plain string breaks pydantic's usual
        dict-dump contract; callers in this package rely on it, but generic
        serialization of this model will not behave as expected.
        Args:
            **kwargs: Keyword arguments (ignored)
        Returns:
            str: Image reference in "image:tag" format
        """
        return f"{self.image}:{self.tag}"
class CloneSpec(BaseModel):
    """Specification for cloning a VM.

    Attributes:
        name: Name of the source VM to clone
        new_name: Name for the new cloned VM
    """

    name: str
    new_name: str = Field(alias="newName")  # serialized as newName on the wire

    # populate_by_name accepts both new_name= and newName=.  The previous
    # `class Config: populate_by_alias = True` is not a recognized pydantic
    # option, so it had no effect.
    model_config = ConfigDict(populate_by_name=True)
class ImageInfo(BaseModel):
    """Model for individual image information.
    Attributes:
        imageId: Unique identifier for the image
    """
    # camelCase (not snake_case) to match the registry API response verbatim.
    imageId: str
class ImageList(RootModel):
    """Response model for the images endpoint.

    Wraps a plain list of ImageInfo objects and exposes list-like
    iteration, indexing and length.
    """

    root: List[ImageInfo]

    def __iter__(self):
        """Yield the contained ImageInfo objects in order."""
        yield from self.root

    def __getitem__(self, item):
        """Return the ImageInfo at ``item`` (or a sub-list for a slice)."""
        return self.root[item]

    def __len__(self):
        """Return how many images the list holds."""
        return len(self.root)

View File

@@ -1,315 +0,0 @@
import asyncio
import json
import os
import re
import signal
import subprocess
import sys
import time
from functools import wraps
from typing import Any, Callable, List, Optional, TypeVar, Union
from .client import LumeClient
from .exceptions import (
LumeConfigError,
LumeConnectionError,
LumeError,
LumeImageError,
LumeNotFoundError,
LumeServerError,
LumeTimeoutError,
LumeVMError,
)
from .models import (
CloneSpec,
ImageList,
ImageRef,
SharedDirectory,
VMConfig,
VMRunOpts,
VMStatus,
VMUpdateOpts,
)
from .server import LumeServer
# Type variable for the decorator
T = TypeVar("T")

def ensure_server(func: Callable[..., T]) -> Callable[..., T]:
    """Decorator ensuring the lume server is up and the HTTP client exists
    before the wrapped coroutine method runs."""

    @wraps(func)
    async def _wrapped(self: "PyLume", *args: Any, **kwargs: Any) -> T:
        # Server startup is asynchronous, so it must be awaited here.
        await self.server.ensure_running()
        # Lazily create the HTTP client on first use.
        await self._init_client()
        return await func(self, *args, **kwargs)  # type: ignore

    return _wrapped  # type: ignore
class PyLume:
    """High-level async client for the lume VM manager.

    Owns a LumeServer (process lifecycle) and a LumeClient (HTTP transport).
    Public VM methods are wrapped with @ensure_server, so the server is
    started and the client created lazily on first use.  Also usable as an
    async context manager.
    """

    def __init__(
        self,
        debug: bool = False,
        server_start_timeout: int = 60,
        port: Optional[int] = None,
        use_existing_server: bool = False,
        host: str = "localhost",
    ):
        """Initialize the async PyLume client.
        Args:
            debug: Enable debug logging
            server_start_timeout: Timeout in seconds to wait for server to start
            port: Port number for the lume server. Required when use_existing_server is True.
            use_existing_server: If True, will try to connect to an existing server on the specified port
                instead of starting a new one.
            host: Host to use for connections (e.g., "localhost", "127.0.0.1", "host.docker.internal")
        Raises:
            LumeConfigError: if use_existing_server is True but no port was given.
        """
        if use_existing_server and port is None:
            raise LumeConfigError("Port must be specified when using an existing server")
        self.server = LumeServer(
            debug=debug,
            server_start_timeout=server_start_timeout,
            port=port,
            use_existing_server=use_existing_server,
            host=host,
        )
        # Created lazily by _init_client() once the server's base URL is known.
        self.client = None

    async def __aenter__(self) -> "PyLume":
        """Async context manager entry."""
        if self.server.use_existing_server:
            # Just ensure base_url is set for existing server
            if self.server.requested_port is None:
                raise LumeConfigError("Port must be specified when using an existing server")
            if not self.server.base_url:
                self.server.port = self.server.requested_port
                self.server.base_url = f"http://{self.server.host}:{self.server.port}/lume"
        # Ensure the server is running (will connect to existing or start new as needed)
        await self.server.ensure_running()
        # Initialize the client
        await self._init_client()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Async context manager exit."""
        if self.client is not None:
            await self.client.close()
        await self.server.stop()

    async def _init_client(self) -> None:
        """Initialize the client if not already initialized."""
        if self.client is None:
            if self.server.base_url is None:
                raise RuntimeError("Server base URL not set")
            self.client = LumeClient(self.server.base_url, debug=self.server.debug)

    def _log_debug(self, message: str, **kwargs) -> None:
        """Log debug information if debug mode is enabled."""
        if self.server.debug:
            print(f"DEBUG: {message}")
            if kwargs:
                print(json.dumps(kwargs, indent=2))

    async def _handle_api_error(self, e: Exception, operation: str) -> None:
        """Handle API errors and raise appropriate custom exceptions.

        Maps transport failures to connection/timeout errors, then maps an
        HTTP status (read from the exception's ``status`` attribute, default
        500) to the matching Lume* exception.
        """
        if isinstance(e, subprocess.SubprocessError):
            raise LumeConnectionError(f"Failed to connect to PyLume server: {str(e)}")
        elif isinstance(e, asyncio.TimeoutError):
            raise LumeTimeoutError(f"Request timed out: {str(e)}")
        if not hasattr(e, "status") and not isinstance(e, subprocess.CalledProcessError):
            raise LumeServerError(f"Unknown error during {operation}: {str(e)}")
        status_code = getattr(e, "status", 500)
        response_text = str(e)
        self._log_debug(
            f"{operation} request failed", status_code=status_code, response_text=response_text
        )
        if status_code == 404:
            raise LumeNotFoundError(f"Resource not found during {operation}")
        elif status_code == 400:
            raise LumeConfigError(f"Invalid configuration for {operation}: {response_text}")
        elif status_code >= 500:
            raise LumeServerError(
                f"Server error during {operation}",
                status_code=status_code,
                response_text=response_text,
            )
        else:
            raise LumeServerError(
                f"Error during {operation}", status_code=status_code, response_text=response_text
            )

    async def _read_output(self) -> None:
        """Read and log server output.

        NOTE(review): appears to be legacy — LumeServer redirects the child
        process's stdout/stderr to a file, so the pipe attributes read here
        may be None; also ``.decode`` after ``.strip`` assumes bytes output.
        Confirm whether this helper is still called anywhere.
        """
        try:
            while True:
                if not self.server.server_process or self.server.server_process.poll() is not None:
                    self._log_debug("Server process ended")
                    break
                # Read stdout without blocking
                if self.server.server_process.stdout:
                    while True:
                        line = self.server.server_process.stdout.readline()
                        if not line:
                            break
                        line = line.strip()
                        self._log_debug(f"Server stdout: {line}")
                        if "Server started" in line.decode("utf-8"):
                            self._log_debug("Detected server started message")
                            return
                # Read stderr without blocking
                if self.server.server_process.stderr:
                    while True:
                        line = self.server.server_process.stderr.readline()
                        if not line:
                            break
                        line = line.strip()
                        self._log_debug(f"Server stderr: {line}")
                        if "error" in line.decode("utf-8").lower():
                            raise RuntimeError(f"Server error: {line}")
                await asyncio.sleep(0.1)  # Small delay to prevent CPU spinning
        except Exception as e:
            self._log_debug(f"Error in output reader: {str(e)}")
            raise

    @ensure_server
    async def create_vm(self, spec: Union[VMConfig, dict]) -> None:
        """Create a VM with the given configuration."""
        # Ensure client is initialized
        await self._init_client()
        if isinstance(spec, VMConfig):
            spec = spec.model_dump(by_alias=True, exclude_none=True)
        # Suppress optional attribute access errors
        self.client.print_curl("POST", "/vms", spec)  # type: ignore[attr-defined]
        await self.client.post("/vms", spec)  # type: ignore[attr-defined]

    @ensure_server
    async def run_vm(self, name: str, opts: Optional[Union[VMRunOpts, dict]] = None) -> None:
        """Run a VM."""
        if opts is None:
            opts = VMRunOpts(no_display=False)  # type: ignore[attr-defined]
        elif isinstance(opts, dict):
            opts = VMRunOpts(**opts)
        payload = opts.model_dump(by_alias=True, exclude_none=True)
        self.client.print_curl("POST", f"/vms/{name}/run", payload)  # type: ignore[attr-defined]
        await self.client.post(f"/vms/{name}/run", payload)  # type: ignore[attr-defined]

    @ensure_server
    async def list_vms(self) -> List[VMStatus]:
        """List all VMs."""
        data = await self.client.get("/vms")  # type: ignore[attr-defined]
        return [VMStatus.model_validate(vm) for vm in data]

    @ensure_server
    async def get_vm(self, name: str) -> VMStatus:
        """Get VM details."""
        data = await self.client.get(f"/vms/{name}")  # type: ignore[attr-defined]
        return VMStatus.model_validate(data)

    @ensure_server
    async def update_vm(self, name: str, params: Union[VMUpdateOpts, dict]) -> None:
        """Update VM settings."""
        if isinstance(params, dict):
            params = VMUpdateOpts(**params)
        payload = params.model_dump(by_alias=True, exclude_none=True)
        self.client.print_curl("PATCH", f"/vms/{name}", payload)  # type: ignore[attr-defined]
        await self.client.patch(f"/vms/{name}", payload)  # type: ignore[attr-defined]

    @ensure_server
    async def stop_vm(self, name: str) -> None:
        """Stop a VM."""
        await self.client.post(f"/vms/{name}/stop")  # type: ignore[attr-defined]

    @ensure_server
    async def delete_vm(self, name: str) -> None:
        """Delete a VM."""
        await self.client.delete(f"/vms/{name}")  # type: ignore[attr-defined]

    @ensure_server
    async def pull_image(
        self, spec: Union[ImageRef, dict, str], name: Optional[str] = None
    ) -> None:
        """Pull a VM image.

        Accepts an ImageRef, a dict, or a plain "image[:tag]" string; missing
        registry/organization default to ghcr.io/trycua.
        """
        await self._init_client()
        if isinstance(spec, str):
            if ":" in spec:
                image_str = spec
            else:
                image_str = f"{spec}:latest"
            registry = "ghcr.io"
            organization = "trycua"
        elif isinstance(spec, dict):
            image = spec.get("image", "")
            tag = spec.get("tag", "latest")
            image_str = f"{image}:{tag}"
            registry = spec.get("registry", "ghcr.io")
            organization = spec.get("organization", "trycua")
        else:
            image_str = f"{spec.image}:{spec.tag}"
            registry = spec.registry
            organization = spec.organization
        payload = {
            "image": image_str,
            "name": name,
            "registry": registry,
            "organization": organization,
        }
        self.client.print_curl("POST", "/pull", payload)  # type: ignore[attr-defined]
        # Pulls can be slow; use a long per-call timeout.
        await self.client.post("/pull", payload, timeout=300.0)  # type: ignore[attr-defined]

    @ensure_server
    async def clone_vm(self, name: str, new_name: str) -> None:
        """Clone a VM with the given name to a new VM with new_name."""
        config = CloneSpec(name=name, newName=new_name)
        self.client.print_curl("POST", "/vms/clone", config.model_dump())  # type: ignore[attr-defined]
        await self.client.post("/vms/clone", config.model_dump())  # type: ignore[attr-defined]

    @ensure_server
    async def get_latest_ipsw_url(self) -> str:
        """Get the latest IPSW URL."""
        await self._init_client()
        data = await self.client.get("/ipsw")  # type: ignore[attr-defined]
        return data["url"]

    @ensure_server
    async def get_images(self, organization: Optional[str] = None) -> ImageList:
        """Get list of available images."""
        await self._init_client()
        params = {"organization": organization} if organization else None
        data = await self.client.get("/images", params)  # type: ignore[attr-defined]
        return ImageList(root=data)

    async def close(self) -> None:
        """Close the client and stop the server."""
        if self.client is not None:
            await self.client.close()
            self.client = None
        # Brief pause so in-flight requests settle before the server stops.
        await asyncio.sleep(1)
        await self.server.stop()

    async def _ensure_client(self) -> None:
        """Ensure client is initialized."""
        if self.client is None:
            await self._init_client()

View File

@@ -1,481 +0,0 @@
import asyncio
import json
import logging
import os
import random
import shlex
import signal
import socket
import subprocess
import sys
import tempfile
import time
from logging import getLogger
from typing import Optional
from .exceptions import LumeConnectionError
class LumeServer:
def __init__(
    self,
    debug: bool = False,
    server_start_timeout: int = 60,
    port: Optional[int] = None,
    use_existing_server: bool = False,
    host: str = "localhost",
):
    """Initialize the LumeServer.
    Args:
        debug: Enable debug logging
        server_start_timeout: Timeout in seconds to wait for server to start
        port: Specific port to use for the server
        use_existing_server: If True, will try to connect to an existing server
            instead of starting a new one
        host: Host to use for connections (e.g., "localhost", "127.0.0.1", "host.docker.internal")
    """
    self.debug = debug
    self.server_start_timeout = server_start_timeout
    self.server_process = None  # Popen handle once a server is spawned
    self.output_file = None  # temp file capturing server stdout/stderr
    self.requested_port = port  # caller-pinned port; None means pick one
    self.port = None  # actual port, set when the server starts
    self.base_url = None  # e.g. http://localhost:<port>/lume, set later
    self.use_existing_server = use_existing_server
    self.host = host
    # Configure logging
    self.logger = getLogger("pylume.server")
    if not self.logger.handlers:
        handler = logging.StreamHandler()
        formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)
    self.logger.setLevel(logging.DEBUG if debug else logging.INFO)
    self.logger.debug(f"Server initialized with host: {self.host}")
def _check_port_available(self, port: int) -> bool:
"""Check if a port is available."""
try:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.settimeout(0.5)
result = s.connect_ex(("127.0.0.1", port))
if result == 0: # Port is in use on localhost
return False
except:
pass
# Check the specified host (e.g., "host.docker.internal") if it's not a localhost alias
if self.host not in ["localhost", "127.0.0.1"]:
try:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.settimeout(0.5)
result = s.connect_ex((self.host, port))
if result == 0: # Port is in use on host
return False
except:
pass
return True
def _get_server_port(self) -> int:
"""Get an available port for the server."""
# Use requested port if specified
if self.requested_port is not None:
if not self._check_port_available(self.requested_port):
raise RuntimeError(f"Requested port {self.requested_port} is not available")
return self.requested_port
# Find a free port
for _ in range(10): # Try up to 10 times
port = random.randint(49152, 65535)
if self._check_port_available(port):
return port
raise RuntimeError("Could not find an available port")
async def _ensure_server_running(self) -> None:
    """Ensure the lume server is running, start it if it's not.

    Flow: probe ``{base_url}/vms`` via curl; if unreachable, spawn the
    bundled ``lume`` binary, then poll both its log file and the HTTP
    endpoint until ready or ``server_start_timeout`` elapses.  On any
    failure path the child process and temp log file are cleaned up.
    """
    try:
        self.logger.debug("Checking if lume server is running...")
        # Try to connect to the server with a short timeout
        cmd = ["curl", "-s", "-w", "%{http_code}", "-m", "5", f"{self.base_url}/vms"]
        process = await asyncio.create_subprocess_exec(
            *cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        stdout, stderr = await process.communicate()
        if process.returncode == 0:
            response = stdout.decode()
            # curl -w appends the status code as the last 3 characters.
            status_code = int(response[-3:])
            if status_code == 200:
                self.logger.debug("PyLume server is running")
                return
        self.logger.debug("PyLume server not running, attempting to start it")
        # Server not running, try to start it
        lume_path = os.path.join(os.path.dirname(__file__), "lume")
        if not os.path.exists(lume_path):
            raise RuntimeError(f"Could not find lume binary at {lume_path}")
        # Make sure the file is executable
        os.chmod(lume_path, 0o755)
        # Create a temporary file for server output
        self.output_file = tempfile.NamedTemporaryFile(mode="w+", delete=False)
        self.logger.debug(f"Using temporary file for server output: {self.output_file.name}")
        # Start the server
        self.logger.debug(f"Starting lume server with: {lume_path} serve --port {self.port}")
        # Start server in background using subprocess.Popen
        try:
            self.server_process = subprocess.Popen(
                [lume_path, "serve", "--port", str(self.port)],
                stdout=self.output_file,
                stderr=self.output_file,
                cwd=os.path.dirname(lume_path),
                start_new_session=True,  # Run in new session to avoid blocking
            )
        except Exception as e:
            self.output_file.close()
            os.unlink(self.output_file.name)
            raise RuntimeError(f"Failed to start lume server process: {str(e)}")
        # Wait for server to start
        self.logger.debug(
            f"Waiting up to {self.server_start_timeout} seconds for server to start..."
        )
        start_time = time.time()
        server_ready = False
        last_size = 0
        while time.time() - start_time < self.server_start_timeout:
            if self.server_process.poll() is not None:
                # Process has terminated
                self.output_file.seek(0)
                output = self.output_file.read()
                self.output_file.close()
                os.unlink(self.output_file.name)
                error_msg = (
                    f"Server process terminated unexpectedly.\n"
                    f"Exit code: {self.server_process.returncode}\n"
                    f"Output: {output}"
                )
                raise RuntimeError(error_msg)
            # Check output file for server ready message
            self.output_file.seek(0, os.SEEK_END)
            size = self.output_file.tell()
            if size > last_size:  # Only read if there's new content
                self.output_file.seek(last_size)
                new_output = self.output_file.read()
                if new_output.strip():  # Only log non-empty output
                    self.logger.debug(f"Server output: {new_output.strip()}")
                last_size = size
                if "Server started" in new_output:
                    server_ready = True
                    self.logger.debug("Server startup detected")
                    break
            # Try to connect to the server periodically
            try:
                cmd = ["curl", "-s", "-w", "%{http_code}", "-m", "5", f"{self.base_url}/vms"]
                process = await asyncio.create_subprocess_exec(
                    *cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
                )
                stdout, stderr = await process.communicate()
                if process.returncode == 0:
                    response = stdout.decode()
                    status_code = int(response[-3:])
                    if status_code == 200:
                        server_ready = True
                        self.logger.debug("Server is responding to requests")
                        break
            except:
                pass  # Server not ready yet
            await asyncio.sleep(1.0)
        if not server_ready:
            # Cleanup if server didn't start
            if self.server_process:
                self.server_process.terminate()
                try:
                    self.server_process.wait(timeout=5)
                except subprocess.TimeoutExpired:
                    self.server_process.kill()
            self.output_file.close()
            os.unlink(self.output_file.name)
            raise RuntimeError(
                f"Failed to start lume server after {self.server_start_timeout} seconds. "
                "Check the debug output for more details."
            )
        # Give the server a moment to fully initialize
        await asyncio.sleep(2.0)
        # Verify server is responding
        try:
            cmd = ["curl", "-s", "-w", "%{http_code}", "-m", "10", f"{self.base_url}/vms"]
            process = await asyncio.create_subprocess_exec(
                *cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            stdout, stderr = await process.communicate()
            if process.returncode != 0:
                raise RuntimeError(f"Curl command failed: {stderr.decode()}")
            response = stdout.decode()
            status_code = int(response[-3:])
            if status_code != 200:
                raise RuntimeError(f"Server returned status code {status_code}")
            self.logger.debug("PyLume server started successfully")
        except Exception as e:
            self.logger.debug(f"Server verification failed: {str(e)}")
            if self.server_process:
                self.server_process.terminate()
                try:
                    self.server_process.wait(timeout=5)
                except subprocess.TimeoutExpired:
                    self.server_process.kill()
            self.output_file.close()
            os.unlink(self.output_file.name)
            raise RuntimeError(f"Server started but is not responding: {str(e)}")
        self.logger.debug("Server startup completed successfully")
    except Exception as e:
        raise RuntimeError(f"Failed to start lume server: {str(e)}")
async def _start_server(self) -> None:
    """Start the lume server using the lume executable.

    Picks a port, spawns the bundled binary with output redirected to a
    temp file, then waits for it to become responsive.  Cleans up on
    failure via _cleanup().
    """
    self.logger.debug("Starting PyLume server")
    # Get absolute path to lume executable in the same directory as this file
    lume_path = os.path.join(os.path.dirname(__file__), "lume")
    if not os.path.exists(lume_path):
        raise RuntimeError(f"Could not find lume binary at {lume_path}")
    try:
        # Make executable
        os.chmod(lume_path, 0o755)
        # Get and validate port
        self.port = self._get_server_port()
        self.base_url = f"http://{self.host}:{self.port}/lume"
        # Set up output handling
        self.output_file = tempfile.NamedTemporaryFile(mode="w+", delete=False)
        # Start the server process with the lume executable
        env = os.environ.copy()
        env["RUST_BACKTRACE"] = "1"  # Enable backtrace for better error reporting
        # NOTE(review): no bind-address flag is passed despite an earlier
        # comment about 0.0.0.0 — the server listens on its own default;
        # confirm whether external connections were intended to work here.
        self.server_process = subprocess.Popen(
            [lume_path, "serve", "--port", str(self.port)],
            stdout=self.output_file,
            stderr=subprocess.STDOUT,
            cwd=os.path.dirname(lume_path),  # Run from same directory as executable
            env=env,
        )
        # Wait for server to initialize
        await asyncio.sleep(2)
        await self._wait_for_server()
    except Exception as e:
        await self._cleanup()
        raise RuntimeError(f"Failed to start lume server process: {str(e)}")
async def _tail_log(self) -> None:
    """Read and display server log output in debug mode.

    Tracks a read offset so every appended line is seen exactly once.  The
    previous implementation seeked to end-of-file before each read, which
    discarded any output written between polls and usually printed nothing.
    """
    pos = 0
    while True:
        try:
            self.output_file.seek(pos)  # type: ignore[attr-defined]
            line = self.output_file.readline()  # type: ignore[attr-defined]
            pos = self.output_file.tell()  # type: ignore[attr-defined]
            if line:
                line = line.strip()
                if line:
                    print(f"SERVER: {line}")
                continue  # drain any remaining buffered lines before sleeping
            if self.server_process.poll() is not None:  # type: ignore[attr-defined]
                print("Server process ended")
                break
            await asyncio.sleep(0.1)
        except Exception as e:
            print(f"Error reading log: {e}")
            await asyncio.sleep(0.1)
async def _wait_for_server(self) -> None:
    """Wait for server to start and become responsive with increased timeout.

    Polls _verify_server() once per second until success or until
    ``server_start_timeout`` elapses; if the child process exits first,
    raises with its captured output.  Cleans up on every failure path.
    """
    start_time = time.time()
    while time.time() - start_time < self.server_start_timeout:
        if self.server_process.poll() is not None:  # type: ignore[attr-defined]
            # Child died: surface its output as the error message.
            error_msg = await self._get_error_output()
            await self._cleanup()
            raise RuntimeError(error_msg)
        try:
            await self._verify_server()
            self.logger.debug("Server is now responsive")
            return
        except Exception as e:
            self.logger.debug(f"Server not ready yet: {str(e)}")
            await asyncio.sleep(1.0)
    await self._cleanup()
    raise RuntimeError(f"Server failed to start after {self.server_start_timeout} seconds")
async def _verify_server(self) -> None:
    """Verify server is responding to requests.

    Issues ``curl`` against the /vms endpoint and requires an HTTP 200;
    any other outcome raises RuntimeError.
    """
    try:
        cmd = [
            "curl",
            "-s",
            "-w",
            "%{http_code}",  # appends the status code to the body
            "-m",
            "10",
            f"http://{self.host}:{self.port}/lume/vms",
        ]
        process = await asyncio.create_subprocess_exec(
            *cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        stdout, stderr = await process.communicate()
        if process.returncode != 0:
            raise RuntimeError(f"Curl command failed: {stderr.decode()}")
        response = stdout.decode()
        # The last 3 characters are the status code (see -w above).
        status_code = int(response[-3:])
        if status_code != 200:
            raise RuntimeError(f"Server returned status code {status_code}")
        self.logger.debug("PyLume server started successfully")
    except Exception as e:
        raise RuntimeError(f"Server not responding: {str(e)}")
async def _get_error_output(self) -> str:
"""Get error output from the server process."""
if not self.output_file:
return "No output available"
self.output_file.seek(0)
output = self.output_file.read()
return (
f"Server process terminated unexpectedly.\n"
f"Exit code: {self.server_process.returncode}\n" # type: ignore[attr-defined]
f"Output: {output}"
)
async def _cleanup(self) -> None:
    """Clean up all server resources.

    Best-effort: terminates the managed server process (escalating to
    kill() after a 5 s grace period) and closes/removes the temporary
    output file.  Failures are logged and ignored so cleanup itself
    never raises.
    """
    if self.server_process:
        try:
            self.server_process.terminate()
            try:
                self.server_process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                # Graceful shutdown timed out; force-kill the process.
                self.server_process.kill()
        except Exception as e:
            # Fixed: the original bare `except:` also swallowed
            # KeyboardInterrupt/SystemExit; catch Exception and log instead.
            self.logger.debug(f"Error terminating server process: {e}")
        self.server_process = None
    # Clean up output file
    if self.output_file:
        try:
            self.output_file.close()
            os.unlink(self.output_file.name)
        except Exception as e:
            self.logger.debug(f"Error cleaning up output file: {e}")
        self.output_file = None
async def ensure_running(self) -> None:
    """Ensure the server is running.

    If use_existing_server is True, will only try to connect to an existing server.
    Otherwise will:
    1. Try to connect to an existing server on the specified port
    2. If that fails and not in Docker, start a new server
    3. If in Docker and no existing server is found, raise an error

    Raises:
        RuntimeError: if a required port is missing, a remote/existing
            server cannot be reached, or a new server cannot be started
            (e.g. inside Docker).
    """
    # First check if we're in Docker: either the /.dockerenv sentinel
    # exists or PID 1's cgroup mentions docker.
    # Fixed: the original called open("/proc/1/cgroup").read() inline,
    # leaking the file handle; use a context manager instead.
    in_docker = os.path.exists("/.dockerenv")
    if not in_docker and os.path.exists("/proc/1/cgroup"):
        with open("/proc/1/cgroup", "r") as cgroup_file:
            in_docker = "docker" in cgroup_file.read()

    # If using a non-localhost host like host.docker.internal, set up the connection details
    if self.host not in ["localhost", "127.0.0.1"]:
        if self.requested_port is None:
            raise RuntimeError("Port must be specified when using a remote host")
        self.port = self.requested_port
        self.base_url = f"http://{self.host}:{self.port}/lume"
        self.logger.debug(f"Using remote host server at {self.base_url}")

        # Try to verify the server is accessible
        try:
            await self._verify_server()
            self.logger.debug("Successfully connected to remote server")
            return
        except Exception as e:
            if self.use_existing_server or in_docker:
                # If explicitly requesting an existing server or in Docker, we can't start a new one
                raise RuntimeError(
                    f"Failed to connect to remote server at {self.base_url}: {str(e)}"
                )
            else:
                self.logger.debug(f"Remote server not available at {self.base_url}: {str(e)}")
                # Fall back to localhost for starting a new server
                self.host = "localhost"

    # If explicitly using an existing server, verify it's running
    if self.use_existing_server:
        if self.requested_port is None:
            raise RuntimeError("Port must be specified when using an existing server")
        self.port = self.requested_port
        self.base_url = f"http://{self.host}:{self.port}/lume"
        try:
            await self._verify_server()
            self.logger.debug("Successfully connected to existing server")
        except Exception as e:
            raise RuntimeError(
                f"Failed to connect to existing server at {self.base_url}: {str(e)}"
            )
    else:
        # Try to connect to an existing server first
        if self.requested_port is not None:
            self.port = self.requested_port
            self.base_url = f"http://{self.host}:{self.port}/lume"
            try:
                await self._verify_server()
                self.logger.debug("Successfully connected to existing server")
                return
            except Exception:
                self.logger.debug(f"No existing server found at {self.base_url}")

        # If in Docker and can't connect to existing server, raise an error
        # NOTE(review): nesting reconstructed from the diff view (original
        # indentation was stripped) — confirm this check sits after the
        # existing-server probe, before the new-server start.
        if in_docker:
            raise RuntimeError(
                f"Failed to connect to server at {self.base_url} and cannot start a new server in Docker"
            )

        # Start a new server
        self.logger.debug("Starting a new server instance")
        await self._start_server()
async def stop(self) -> None:
    """Stop the server if we're managing it.

    Servers attached via use_existing_server are left untouched.
    """
    if self.use_existing_server:
        return
    self.logger.debug("Stopping lume server...")
    await self._cleanup()

View File

@@ -1,51 +0,0 @@
# Packaging metadata for the pylume SDK, built with PDM (pdm-backend).
[build-system]
build-backend = "pdm.backend"
requires = ["pdm-backend"]

[project]
authors = [{ name = "TryCua", email = "gh@trycua.com" }]
# NOTE(review): classifiers advertise Python 3.10 and 3.11, but
# requires-python below is ">=3.12" — one of the two looks stale.
# Confirm the intended minimum version and reconcile.
classifiers = [
  "Intended Audience :: Developers",
  "License :: OSI Approved :: MIT License",
  "Operating System :: MacOS :: MacOS X",
  "Programming Language :: Python :: 3",
  "Programming Language :: Python :: 3.10",
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12",
]
dependencies = ["pydantic>=2.11.1"]
description = "Python SDK for lume - run macOS and Linux VMs on Apple Silicon"
# Version is not declared here; it is read from pylume/__init__.py
# (see [tool.pdm.version] below).
dynamic = ["version"]
keywords = ["apple-silicon", "macos", "virtualization", "vm"]
license = { text = "MIT" }
name = "pylume"
readme = "README.md"
requires-python = ">=3.12"

# Pull __version__ from the package source so it is defined in one place.
[tool.pdm.version]
path = "pylume/__init__.py"
source = "file"

[project.urls]
homepage = "https://github.com/trycua/pylume"
repository = "https://github.com/trycua/pylume"

[tool.pdm]
distribution = true

[tool.pdm.dev-dependencies]
dev = [
  "black>=23.0.0",
  "isort>=5.12.0",
  "pytest-asyncio>=0.23.0",
  "pytest>=7.0.0",
]

[tool.pytest.ini_options]
asyncio_mode = "auto"
python_files = "test_*.py"
testpaths = ["tests"]

# Files shipped in the sdist/wheel beyond the package itself.
[tool.pdm.build]
includes = ["pylume/"]
source-includes = ["LICENSE", "README.md", "tests/"]

View File

@@ -0,0 +1,23 @@
"""Pytest configuration for pylume tests.
This module provides test fixtures for the pylume package.
Note: This package has macOS-specific dependencies and will skip tests
if the required modules are not available.
"""
from unittest.mock import Mock, patch
import pytest
@pytest.fixture
def mock_subprocess():
    """Patch subprocess.run with a Mock reporting success and empty output."""
    with patch("subprocess.run") as patched_run:
        patched_run.return_value = Mock(returncode=0, stdout="", stderr="")
        yield patched_run
@pytest.fixture
def mock_requests():
    """Patch requests.get and requests.post; yield both mocks keyed by verb."""
    with patch("requests.get") as patched_get:
        with patch("requests.post") as patched_post:
            yield {"get": patched_get, "post": patched_post}

Some files were not shown because too many files have changed in this diff Show More