Merge branch 'main' into feat/cua-bench-submodules

This commit is contained in:
Dillon DuPont
2025-12-09 15:25:46 -05:00
117 changed files with 6147 additions and 4331 deletions

View File

@@ -0,0 +1,29 @@
---
# Builds and publishes the CUA Linux container image by delegating to the
# shared reusable Docker publish workflow.
name: Build and Publish CUA Linux Container

on:
  push:
    branches:
      - main
    # Release builds are driven by semver tags with this prefix.
    tags:
      - "docker-cua-linux-v*.*.*"
    # Only trigger when the image sources or the involved workflows change.
    # NOTE(review): GitHub ANDs `paths` with the push event; combining `tags`
    # with `paths` can prevent tag pushes from triggering — confirm intended.
    paths:
      - "libs/qemu-docker/linux/**"
      - ".github/workflows/docker-publish-cua-linux.yml"
      - ".github/workflows/docker-reusable-publish.yml"
  pull_request:
    paths:
      - "libs/qemu-docker/linux/**"
      - ".github/workflows/docker-publish-cua-linux.yml"
      - ".github/workflows/docker-reusable-publish.yml"

jobs:
  publish:
    # All build/push logic lives in the reusable workflow; this file only
    # supplies the image-specific inputs and secrets.
    uses: ./.github/workflows/docker-reusable-publish.yml
    with:
      image_name: cua-linux
      context_dir: libs/qemu-docker/linux
      dockerfile_path: Dockerfile
      tag_prefix: docker-cua-linux-v
      docker_hub_org: trycua
    secrets:
      DOCKER_HUB_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }}

View File

@@ -0,0 +1,29 @@
---
# Builds and publishes the CUA Windows container image by delegating to the
# shared reusable Docker publish workflow.
name: Build and Publish CUA Windows Container

on:
  push:
    branches:
      - main
    # Release builds are driven by semver tags with this prefix.
    tags:
      - "docker-cua-windows-v*.*.*"
    # Only trigger when the image sources or the involved workflows change.
    # NOTE(review): GitHub ANDs `paths` with the push event; combining `tags`
    # with `paths` can prevent tag pushes from triggering — confirm intended.
    paths:
      - "libs/qemu-docker/windows/**"
      - ".github/workflows/docker-publish-cua-windows.yml"
      - ".github/workflows/docker-reusable-publish.yml"
  pull_request:
    paths:
      - "libs/qemu-docker/windows/**"
      - ".github/workflows/docker-publish-cua-windows.yml"
      - ".github/workflows/docker-reusable-publish.yml"

jobs:
  publish:
    # All build/push logic lives in the reusable workflow; this file only
    # supplies the image-specific inputs and secrets.
    uses: ./.github/workflows/docker-reusable-publish.yml
    with:
      image_name: cua-windows
      context_dir: libs/qemu-docker/windows
      dockerfile_path: Dockerfile
      tag_prefix: docker-cua-windows-v
      docker_hub_org: trycua
    secrets:
      DOCKER_HUB_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }}

View File

@@ -2,8 +2,7 @@ name: Lint & Format Check
on:
pull_request:
branches:
- main
push:
branches:
- main

1
.gitignore vendored
View File

@@ -1,3 +1,4 @@
**/image/setup.iso
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

View File

@@ -15,6 +15,8 @@ repos:
name: TypeScript type check
entry: node ./scripts/typescript-typecheck.js
language: node
files: \.(ts|tsx)$
pass_filenames: false
- repo: https://github.com/PyCQA/isort
rev: 7.0.0

View File

@@ -1,14 +1,22 @@
<div align="center">
<picture>
<source media="(prefers-color-scheme: dark)" alt="Cua logo" height="150" srcset="img/logo_white.png">
<source media="(prefers-color-scheme: light)" alt="Cua logo" height="150" srcset="img/logo_black.png">
<img alt="Cua logo" height="150" src="img/logo_black.png">
</picture>
<a href="https://cua.ai" target="_blank" rel="noopener noreferrer">
<picture>
<source media="(prefers-color-scheme: dark)" alt="Cua logo" width="150" srcset="img/logo_white.png">
<source media="(prefers-color-scheme: light)" alt="Cua logo" width="150" srcset="img/logo_black.png">
<img alt="Cua logo" width="500" src="img/logo_black.png">
</picture>
</a>
[![Python](https://img.shields.io/badge/Python-333333?logo=python&logoColor=white&labelColor=333333)](#)
[![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
<br>
<p align="center">Build and deploy AI agents that can reason, plan and act on any computer</p>
<p align="center">
<a href="https://cua.ai" target="_blank" rel="noopener noreferrer"><img src="https://img.shields.io/badge/cua.ai-0ea5e9" alt="cua.ai"></a>
<a href="https://discord.com/invite/cua-ai" target="_blank" rel="noopener noreferrer"><img src="https://img.shields.io/badge/Discord-Join%20Server-10b981?logo=discord&logoColor=white" alt="Discord"></a>
<a href="https://x.com/trycua" target="_blank" rel="noopener noreferrer"><img src="https://img.shields.io/twitter/follow/trycua?style=social" alt="Twitter"></a>
<a href="https://cua.ai/docs" target="_blank" rel="noopener noreferrer"><img src="https://img.shields.io/badge/Docs-0ea5e9.svg" alt="Documentation"></a>
<br>
<a href="https://trendshift.io/repositories/13685" target="_blank"><img src="https://trendshift.io/api/badge/repositories/13685" alt="trycua%2Fcua | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</p>
</div>
@@ -101,6 +109,10 @@ Core utilities for Cua
- [Get started with the Cua SDKs](https://cua.ai/docs/quickstart-devs)
- [Get started with the Cua CLI](https://cua.ai/docs/quickstart-cli)
## Python Version Compatibility
Cua packages require **Python 3.12 or 3.13**. Python 3.14 is not currently supported due to dependency compatibility issues (pydantic-core/PyO3 compatibility). If you encounter build errors on Python 3.14, please use Python 3.13 or earlier.
# Agent SDK
Install the agent SDK:

View File

@@ -21,7 +21,6 @@ The Playground connects to your existing Cua sandboxes—the same ones you use w
<video src="https://github.com/user-attachments/assets/9fef0f30-1024-4833-8b7a-6a2c02d8eb99" width="600" controls></video>
</div>
Sign up at [cua.ai/signin](https://cua.ai/signin) and grab your API key from the dashboard. Then navigate to the Playground:
1. Navigate to Dashboard > Playground
@@ -33,6 +32,7 @@ Sign up at [cua.ai/signin](https://cua.ai/signin) and grab your API key from the
Example use cases:
**Prompt Testing**
```
❌ "Check the website"
✅ "Navigate to example.com in Firefox and take a screenshot of the homepage"
@@ -42,6 +42,7 @@ Example use cases:
Run the same task with different models to compare quality, speed, and cost.
**Debugging Agent Behavior**
1. Send: "Find the login button and click it"
2. View tool calls to see each mouse movement
3. Check screenshots to verify the agent found the right element

View File

@@ -51,7 +51,6 @@ When you request an Anthropic model through Cua, we automatically route to the b
Sign up at [cua.ai/signin](https://cua.ai/signin) and create your API key from **Dashboard > API Keys > New API Key** (save it immediately—you won't see it again).
Use it with the Agent SDK (make sure to set your environment variable):
```python

View File

@@ -29,13 +29,13 @@ A few papers stand out for their immediate relevance to anyone building or deplo
## Summary Statistics
| Category | Count |
|----------|-------|
| Benchmarks & Datasets | 18 |
| Safety & Security | 12 |
| Grounding & Visual Reasoning | 14 |
| Agent Architectures & Training | 11 |
| Adversarial Attacks | 8 |
| Category | Count |
| ------------------------------ | ----- |
| Benchmarks & Datasets | 18 |
| Safety & Security | 12 |
| Grounding & Visual Reasoning | 14 |
| Agent Architectures & Training | 11 |
| Adversarial Attacks | 8 |
**Total Papers:** 45
@@ -56,6 +56,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** The first comprehensive benchmark for evaluating GUI agents on macOS. Features 202 multilingual interactive tasks across 30 applications (28 macOS-exclusive), with support for 5 languages (English, Chinese, Arabic, Japanese, Russian). Reveals a dramatic gap: proprietary agents achieve 30%+ success rate while open-source models lag below 5%. Also includes safety benchmarking for deception attacks.
**Key Findings:**
- Proprietary computer-use agents lead at above 30% success rate
- Open-source lightweight models struggle below 5%, highlighting need for macOS domain adaptation
- Multilingual benchmarks expose weaknesses, especially in Arabic (28.8% degradation vs English)
@@ -70,6 +71,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** A comprehensive safety benchmark built on OSWorld for testing computer-use agents across three harm categories: deliberate user misuse, prompt injection attacks, and model misbehavior. Includes 150 tasks spanning harassment, copyright infringement, disinformation, data exfiltration, and more. Proposes an automated judge achieving high agreement with human annotations (0.76-0.79 F1 score).
**Key Findings:**
- All tested models (o4-mini, Claude 3.7 Sonnet, Gemini 2.5 Pro) tend to directly comply with many deliberate misuse queries
- Models are relatively vulnerable to static prompt injections
- Models occasionally perform unsafe actions without explicit malicious prompts
@@ -83,6 +85,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** A comprehensive open-source framework for scaling computer-use agent data and foundation models. Introduces AgentNet, the first large-scale computer-use task dataset spanning 3 operating systems and 200+ applications/websites. OpenCUA-72B achieves 45% success rate on OSWorld-Verified, establishing new state-of-the-art among open-source models.
**Key Contributions:**
- Annotation infrastructure for capturing human computer-use demonstrations
- AgentNet: large-scale dataset across 3 OSes and 200+ apps
- Scalable pipeline transforming demonstrations into state-action pairs with reflective Chain-of-Thought reasoning
@@ -97,6 +100,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** A benchmark of 130 realistic, high-quality, long-horizon tasks for agentic search systems (like Deep Research), requiring real-time web browsing and extensive information synthesis. Constructed with 1000+ hours of human labor. Introduces Agent-as-a-Judge framework using tree-structured rubric design for automated evaluation.
**Key Findings:**
- OpenAI Deep Research achieves 50-70% of human performance while spending half the time
- First systematic evaluation of ten frontier agentic search systems vs. human performance
- Addresses the challenge of evaluating time-varying, complex answers
@@ -110,6 +114,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** Addresses GUI grounding—mapping natural language to specific UI actions—as a critical bottleneck in agent development. Introduces OSWorld-G benchmark (564 annotated samples) and Jedi dataset (4 million synthetic examples), the largest computer-use grounding dataset. Improved grounding directly enhances agentic capabilities, boosting OSWorld performance from 23% to 51%.
**Key Contributions:**
- OSWorld-G: comprehensive benchmark for diverse grounding tasks (text matching, element recognition, layout understanding, precise manipulation)
- Jedi: 4M examples through multi-perspective task decoupling
- Demonstrates compositional generalization to novel interfaces
@@ -123,6 +128,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** Evaluates potential safety risks of MLLM-based agents during real-world computer manipulation. Features 492 risky tasks spanning web, social media, multimedia, OS, email, and office software. Categorizes risks into user-originated and environmental risks, evaluating both risk goal intention and completion.
**Key Findings:**
- Current computer-use agents face significant safety risks in real-world scenarios
- Safety principles designed for dialogue scenarios don't transfer well to computer-use
- Highlights necessity and urgency of safety alignment for computer-use agents
@@ -136,6 +142,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** A benchmark featuring high-fidelity, deterministic replicas of 11 widely-used websites across e-commerce, travel, communication, and professional networking. Contains 112 practical tasks requiring both information retrieval and state-changing actions. Enables reproducible evaluation without safety risks.
**Key Findings:**
- Best frontier language models achieve only 41% success rate
- Highlights critical gaps in autonomous web navigation and task completion
- Supports scalable post-training data generation
@@ -149,6 +156,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** An RL-based framework for GUI grounding incorporating seed data curation, dense policy gradients, and self-evolutionary reinforcement finetuning using attention maps. With only 3K training samples, the 7B model achieves state-of-the-art on three grounding benchmarks, outperforming UI-TARS-72B by 24.2% on ScreenSpot-Pro.
**Key Results:**
- 47.3% accuracy on ScreenSpot-Pro with 7B model
- Outperforms 72B models with fraction of training data
- Demonstrates effectiveness of RL for high-resolution, complex environments
@@ -162,6 +170,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** A generative adversarial framework that manipulates agent decision-making using diffusion-based semantic injections. Combines negative prompt degradation with positive semantic optimization. Without model access, produces visually natural images that induce consistent decision biases in agents.
**Key Findings:**
- Consistently induces decision-level preference redirection on LLaVA-34B, Gemma3, GPT-4o, and Mistral-3.2
- Outperforms baselines (SPSA, Bandit, standard diffusion)
- Exposes vulnerability: autonomous agents can be misled through visually subtle, semantically-guided manipulations
@@ -175,6 +184,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** An extensible benchmark simulating a small software company environment where AI agents interact like digital workers: browsing the web, writing code, running programs, and communicating with coworkers. Tests agents on real professional tasks with important implications for industry adoption and labor market effects.
**Key Findings:**
- Best agent achieves 30% autonomous task completion
- Simpler tasks are solvable autonomously
- More difficult long-horizon tasks remain beyond current systems' reach
@@ -188,6 +198,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** A comprehensive benchmark for VLMs in video game QA, encompassing visual unit testing, visual regression testing, needle-in-a-haystack challenges, glitch detection, and bug report generation for both images and videos. Addresses the need for standardized benchmarks in this labor-intensive domain.
**Key Focus:**
- First benchmark specifically designed for video game QA with VLMs
- Covers wide range of QA activities across images and videos
- Addresses lack of automation in game development workflows
@@ -201,6 +212,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** End-to-end benchmark for evaluating web agent security against prompt injection attacks. Tests realistic scenarios where even simple, low-effort human-written injections can deceive top-tier AI models including those with advanced reasoning.
**Key Findings:**
- Attacks partially succeed in up to 86% of cases
- State-of-the-art agents often struggle to fully complete attacker goals
- Reveals "security by incompetence"—agents' limitations sometimes prevent full attack success
@@ -214,6 +226,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** Measures whether AI web-navigation agents follow the privacy principle of "data minimization"—using sensitive information only when truly necessary to complete a task. Simulates realistic web interaction scenarios end-to-end.
**Key Findings:**
- Agents built on GPT-4, Llama-3, and Claude are prone to inadvertent use of unnecessary sensitive information
- Proposes prompting-based defense that reduces information leakage
- End-to-end benchmarking provides more realistic measure than probing LLMs about privacy
@@ -227,6 +240,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** A novel paradigm for AI agents that fluidly bridge embodiment and web-scale reasoning. Creates unified simulation integrating realistic 3D indoor/outdoor environments with functional web interfaces. Tasks include cooking from online recipes, navigating with dynamic map data, and interpreting landmarks using web knowledge.
**Key Contributions:**
- Unified platform combining 3D environments with web interfaces
- Benchmark spanning cooking, navigation, shopping, tourism, and geolocation
- Reveals significant performance gaps between AI systems and humans
@@ -240,6 +254,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** The first attempt to model UI interactions for precision engineering tasks. Features 41K+ annotated video recordings of CAD operations with time horizons up to 20x longer than existing datasets. Proposes VideoCADFormer for learning CAD interactions directly from video.
**Key Contributions:**
- Large-scale synthetic dataset for CAD UI interactions
- VQA benchmark for evaluating spatial reasoning and video understanding
- Reveals challenges in precise action grounding and long-horizon dependencies
@@ -253,6 +268,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** Introduces a pre-operative critic mechanism that provides feedback before action execution by reasoning about potential outcomes. Proposes Suggestion-aware Group Relative Policy Optimization (S-GRPO) for building the GUI-Critic-R1 model with fully automated data generation.
**Key Results:**
- Significant advantages in critic accuracy compared to current MLLMs
- Improved success rates and operational efficiency on GUI automation benchmarks
- Works across both mobile and web domains
@@ -266,7 +282,8 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** A vision-language model trained with RL to explicitly anchor each reasoning step to specific visual coordinates. Introduces multi-turn RL framework enabling dynamic zooming into predicted coordinates during reasoning.
**Key Results:**
- 86.4% on V*Bench for visual search
- 86.4% on V\*Bench for visual search
- Outperforms supervised fine-tuning and conventional RL across spatial reasoning, visual search, and web-based grounding
- Grounding amplifies region exploration, subgoal setting, and visual verification
@@ -279,6 +296,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** A VLM-based method for coordinate-free GUI grounding using an attention-based action head. Enables proposing one or more action regions in a single forward pass with a grounding verifier for selection.
**Key Results:**
- GUI-Actor-7B achieves 44.6 on ScreenSpot-Pro with Qwen2.5-VL, outperforming UI-TARS-72B (38.1)
- Improved generalization to unseen resolutions and layouts
- Fine-tuning only ~100M parameters achieves SOTA performance
@@ -292,11 +310,13 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** Extensive analysis of the R1-Zero paradigm (online RL + chain-of-thought reasoning) for GUI grounding. Identifies issues: longer reasoning chains lead to worse performance, reward hacking via box size exploitation, and overfitting easy examples.
**Solutions Proposed:**
- Fast Thinking Template for direct answer generation
- Box size constraint in reward function
- Difficulty-aware scaling in RL objective
**Key Results:**
- GUI-G1-3B achieves 90.3% on ScreenSpot and 37.1% on ScreenSpot-Pro
- Outperforms larger UI-TARS-7B with only 3B parameters
@@ -309,6 +329,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** Framework integrating self-reflection and error correction into end-to-end multimodal GUI models through GUI-specific pre-training, offline SFT, and online reflection tuning. Enables self-reflection emergence with fully automated data generation.
**Key Contributions:**
- Scalable pipelines for automatic reflection/correction data from successful trajectories
- GUI-Reflection Task Suite for reflection-oriented abilities
- Diverse environment for online training on mobile devices
@@ -323,6 +344,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** A generalist agent capable of multimodal computer interaction (text, images, audio, video). Integrates tool-based and pure vision agents within highly modular architecture, enabling collaborative step-by-step task solving.
**Key Results:**
- 7.27 accuracy gain over Claude-Computer-Use on OSWorld
- Evaluated on pure vision benchmarks (OSWorld), general benchmarks (GAIA), and tool-intensive benchmarks (SWE-Bench)
- Demonstrates value of modular, collaborative agent architecture
@@ -336,6 +358,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** A fine-grained adversarial attack framework that modifies VLM perception of only key objects while preserving semantics of remaining regions. Unlike broad semantic disruption, this targeted approach reduces conflicts with task context, making VLMs output valid but incorrect decisions that affect agent actions in the physical world.
**Key Contributions:**
- AdvEDM-R: removes semantics of specific objects from images
- AdvEDM-A: adds semantics of new objects into images
- Demonstrates fine-grained control with excellent attack performance in embodied decision-making tasks
@@ -349,6 +372,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** A vision-centric reasoning benchmark grounded in challenging perceptual tasks. Unlike prior benchmarks, it moves beyond shallow perception ("see") to require fine-grained observation and analytical reasoning ("observe"). Features natural adversarial image pairs and annotated reasoning chains for process evaluation.
**Key Findings:**
- Tests 20 leading MLLMs including 12 foundation models and 8 reasoning-enhanced models
- Existing reasoning strategies (chain-of-thought, self-criticism) result in unstable and redundant reasoning
- Repeated image observation improves performance across models
@@ -363,6 +387,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** First systematic investigation of backdoor vulnerabilities in VLA models. Proposes Objective-Decoupled Optimization with two stages: explicit feature-space separation to isolate trigger representations, and conditional control deviations activated only by triggers.
**Key Findings:**
- Consistently achieves near-100% attack success rates with minimal impact on clean task accuracy
- Robust against common input perturbations, task transfers, and model fine-tuning
- Exposes critical security vulnerabilities in current VLA deployments under Training-as-a-Service paradigm
@@ -376,6 +401,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** Benchmark for proactively inferring user goals from multimodal contextual observations for wearable assistant agents (smart glasses). Dataset comprises ~30 hours from 363 participants across 3,482 recordings with visual, audio, digital, and longitudinal context.
**Key Findings:**
- Humans achieve 93% MCQ accuracy; best VLM reaches ~84%
- For open-ended generation, best models produce relevant goals only ~57% of the time
- Smaller models (suited for wearables) achieve ~49% accuracy
@@ -390,6 +416,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** A game-theoretic multi-agent framework formulating reasoning as a non-zero-sum game between base agents (visual perception specialists) and a critical agent (logic/fact verification). Features uncertainty-aware controller for dynamic agent collaboration with multi-round debates.
**Key Results:**
- Boosts small-to-mid scale models (Qwen2.5-VL-7B, InternVL3-14B) by 5-6%
- Enhances strong models like GPT-4o by 2-3%
- Modular, scalable, and generalizable framework
@@ -403,6 +430,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** Introduces Grounded Reasoning with Images and Texts—a method for training MLLMs to generate reasoning chains interleaving natural language with explicit bounding box coordinates. Uses GRPO-GR reinforcement learning with rewards focused on answer accuracy and grounding format.
**Key Contributions:**
- Exceptional data efficiency: requires as few as 20 image-question-answer triplets
- Successfully unifies reasoning and grounding abilities
- Eliminates need for reasoning chain annotations or explicit bounding box labels
@@ -416,6 +444,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** First multimodal safety alignment framework. Introduces BeaverTails-V (first dataset with dual preference annotations for helpfulness and safety), and Beaver-Guard-V (multi-level guardrail system defending against unsafe queries and adversarial attacks).
**Key Results:**
- Guard model improves precursor model's safety by average of 40.9% over five filtering rounds
- Safe RLHF-V enhances model safety by 34.2% and helpfulness by 34.3%
- First exploration of multi-modal safety alignment within constrained optimization
@@ -429,6 +458,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** An inference-time approach that quantifies visual token uncertainty and selectively masks uncertain tokens. Decomposes uncertainty into aleatoric and epistemic components, focusing on epistemic uncertainty for perception-related errors.
**Key Results:**
- Significantly reduces object hallucinations
- Enhances reliability and quality of LVLM outputs across diverse visual contexts
- Validated on CHAIR, THRONE, and MMBench benchmarks
@@ -442,6 +472,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** A unified LVLM integrating segmentation-aware perception and controllable object-centric generation. Uses dual-branch visual encoder for global semantic context and fine-grained spatial details, with MoVQGAN-based visual tokenizer for discrete visual tokens.
**Key Contributions:**
- Progressive multi-stage training pipeline
- Segmentation masks jointly optimized as spatial condition prompts
- Bridges segmentation-aware perception with fine-grained visual synthesis
@@ -455,6 +486,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** Introduces Multi-Model Monte Carlo Tree Search (M3CTS) for generating diverse Long Chain-of-Thought reasoning trajectories. Proposes fine-grained Direct Preference Optimization (fDPO) with segment-specific preference granularity guided by spatial reward mechanism.
**Key Results:**
- fDPO achieves 4.1% and 9.0% gains over standard DPO on spatial quality and quantity tasks
- SpatialReasoner-R1 sets new SOTA on SpatialRGPT-Bench, outperforming strongest baseline by 9.8%
- Maintains competitive performance on general vision-language tasks
@@ -468,6 +500,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** A two-stage reinforcement fine-tuning framework: SFT with curated Chain-of-Thought data activates reasoning potential, followed by RL based on Group Relative Policy Optimization (GRPO) for domain shift adaptability.
**Key Advantages:**
- State-of-the-art results outperforming both open-source and proprietary models
- Robust performance under domain shifts across various tasks
- Excellent data efficiency in few-shot learning scenarios
@@ -481,6 +514,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** Reveals that safe images can be exploited for jailbreaking when combined with additional safe images and prompts, exploiting LVLMs' universal reasoning capabilities and safety snowball effect. Proposes Safety Snowball Agent (SSA) framework.
**Key Findings:**
- SSA can use nearly any image to induce LVLMs to produce unsafe content
- Achieves high jailbreak success rates against latest LVLMs
- Exploits inherent LVLM properties rather than alignment flaws
@@ -494,6 +528,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** Uncovers novel attack vector: Malicious Image Patches (MIPs)—adversarially perturbed screen regions that induce OS agents to perform harmful actions. MIPs can be embedded in wallpapers or shared on social media to exfiltrate sensitive data.
**Key Findings:**
- MIPs generalize across user prompts and screen configurations
- Can hijack multiple OS agents during execution of benign instructions
- Exposes critical security vulnerabilities requiring attention before widespread deployment
@@ -507,6 +542,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** A framework leveraging instruction-driven routing and sparsification for VLA efficiency. Features 3-stage progressive architecture inspired by human multimodal coordination: Encoder-FiLM Aggregation Routing, LLM-FiLM Pruning Routing, and V-L-A Coupled Attention.
**Key Results:**
- 97.4% success rate on LIBERO benchmark, 70.0% on real-world robotic tasks
- Reduces training costs by 2.5x and inference latency by 2.8x compared to OpenVLA
- Achieves state-of-the-art performance
@@ -520,6 +556,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** Novel off-policy RL algorithm applying direct policy updates for positive samples and conservative, regularized updates for negative ones. Augmented with Successful Transition Replay (STR) for prioritizing successful interactions.
**Key Results:**
- At least 17% relative increase over existing methods on AndroidWorld benchmark
- Substantially fewer computational resources than GPT-4o-based methods
- 5-60x faster inference
@@ -533,6 +570,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** An API-centric stress testing framework that uncovers intent integrity violations in LLM agents. Uses semantic partitioning to organize tasks into meaningful categories, with targeted mutations to expose subtle agent errors while preserving user intent.
**Key Contributions:**
- Datatype-aware strategy memory for retrieving effective mutation patterns
- Lightweight predictor for ranking mutations by error likelihood
- Generalizes to stronger target models using smaller LLMs for test generation
@@ -546,6 +584,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** A dual-system framework bridging high-level reasoning with low-level action execution. Trains multimodal LLM to generate embodied reasoning plans guided by action-aligned visual rewards, compressed into visual plan latents for downstream action execution.
**Key Capabilities:**
- Few-shot adaptation
- Long-horizon planning
- Self-correction behaviors in complex embodied AI tasks
@@ -559,6 +598,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** Automated attack framework that constructs chains of images with risky visual thoughts to challenge VLMs. Exploits the conflict between logical processing and safety protocols, leading to unsafe content generation.
**Key Results:**
- Improves average attack success rate by 26.71% (from 63.70% to 90.41%)
- Tested on 9 open-source and 6 commercial VLMs
- Outperforms state-of-the-art methods
@@ -572,6 +612,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** First web-based benchmark evaluating MLLM agents on diverse CAPTCHA puzzles. Spans 20 modern CAPTCHA types (225 total) with novel metric: CAPTCHA Reasoning Depth quantifying cognitive and motor steps required.
**Key Findings:**
- Humans achieve 93.3% success rate
- State-of-the-art agents achieve at most 40.0% (Browser-Use OpenAI-o3)
- Highlights significant gap between human and agent capabilities
@@ -585,7 +626,8 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** Introduces pixel-space reasoning framework where VLMs use visual operations (zoom-in, select-frame) to directly inspect and infer from visual evidence. Two-phase training: instruction tuning on synthesized traces, then RL with curiosity-driven rewards.
**Key Results:**
- 84% on V*Bench, 74% on TallyQA-Complex, 84% on InfographicsVQA
- 84% on V\*Bench, 74% on TallyQA-Complex, 84% on InfographicsVQA
- Highest accuracy achieved by any open-source 7B model
- Enables proactive information gathering from complex visual inputs
@@ -598,6 +640,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** Brain-inspired framework decomposing interactions into three biologically plausible phases: Blink (rapid detection via saccadic-like attention), Think (higher-level reasoning/planning), and Link (executable command generation for motor control).
**Key Innovations:**
- Automated annotation pipeline for blink data
- BTL Reward: first rule-based reward mechanism driven by both process and outcome
- Competitive performance on static GUI understanding and dynamic interaction tasks
@@ -611,6 +654,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** Simulation environment engine enabling flexible definition of screens, icons, and navigation graphs with full environment access for agent training/evaluation. Demonstrates progressive training approach from SFT to multi-turn RL.
**Key Findings:**
- Supervised fine-tuning enables memorization of fundamental knowledge
- Single-turn RL enhances generalization to unseen scenarios
- Multi-turn RL encourages exploration strategies through interactive trial and error
@@ -624,6 +668,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** Reasoning-enhanced framework integrating structured reasoning, action prediction, and history summarization. Uses Chain-of-Thought analyses combining progress estimation and decision reasoning, trained via SFT and GRPO with history-aware rewards.
**Key Results:**
- State-of-the-art under identical training data conditions
- Particularly strong in out-of-domain scenarios
- Robust reasoning and generalization across diverse GUI navigation tasks
@@ -637,6 +682,7 @@ We'll be at NeurIPS in San Diego. If you're working on computer-use agents, buil
**Summary:** Self-improving framework addressing trajectory verification and training data scalability. Features UI-Genie-RM (image-text interleaved reward model) and self-improvement pipeline with reward-guided exploration and outcome verification.
**Key Contributions:**
- UI-Genie-RM-517k: first reward-specific dataset for GUI agents
- UI-Genie-Agent-16k: high-quality synthetic trajectories without manual annotation
- State-of-the-art across multiple GUI agent benchmarks through three generations of self-improvement

View File

@@ -4,11 +4,7 @@ description: Supported computer-using agent loops and models
---
<Callout>
A corresponding{' '}
<a href="https://github.com/trycua/cua/blob/main/notebooks/agent_nb.ipynb" target="_blank">
Jupyter Notebook
</a>{' '}
is available for this documentation.
A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/agent_nb.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.
</Callout>
An agent can be thought of as a loop - it generates actions, executes them, and repeats until done:

View File

@@ -3,14 +3,7 @@ title: Customize ComputerAgent
---
<Callout>
A corresponding{' '}
<a
href="https://github.com/trycua/cua/blob/main/notebooks/customizing_computeragent.ipynb"
target="_blank"
>
Jupyter Notebook
</a>{' '}
is available for this documentation.
A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/customizing_computeragent.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.
</Callout>
The `ComputerAgent` interface provides an easy proxy to any computer-using model configuration, and it is a powerful framework for extending and building your own agentic systems.

View File

@@ -4,11 +4,7 @@ description: Use ComputerAgent with HUD for benchmarking and evaluation
---
<Callout>
A corresponding{' '}
<a href="https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb" target="_blank">
Jupyter Notebook
</a>{' '}
is available for this documentation.
A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.
</Callout>
The HUD integration allows an agent to be benchmarked using the [HUD framework](https://www.hud.so/). Through the HUD integration, the agent controls a computer inside HUD, where tests are run to evaluate the success of each task.

View File

@@ -4,12 +4,12 @@ title: Configuration
The server is configured using environment variables (can be set in the Claude Desktop config):
| Variable | Description | Default |
| ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------- |
| Variable | Description | Default |
| ------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------- |
| `CUA_MODEL_NAME` | Model string (e.g., "anthropic/claude-sonnet-4-20250514", "openai/computer-use-preview", "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", "omniparser+litellm/gpt-4o", "omniparser+ollama_chat/gemma3") | anthropic/claude-sonnet-4-20250514 |
| `ANTHROPIC_API_KEY` | Your Anthropic API key (required for Anthropic models) | None |
| `CUA_MAX_IMAGES` | Maximum number of images to keep in context | 3 |
| `CUA_USE_HOST_COMPUTER_SERVER` | Target your local desktop instead of a VM. Set to "true" to use your host system. **Warning:** AI models may perform risky actions. | false |
| `ANTHROPIC_API_KEY` | Your Anthropic API key (required for Anthropic models) | None |
| `CUA_MAX_IMAGES` | Maximum number of images to keep in context | 3 |
| `CUA_USE_HOST_COMPUTER_SERVER` | Target your local desktop instead of a VM. Set to "true" to use your host system. **Warning:** AI models may perform risky actions. | false |
## Model Configuration
@@ -17,7 +17,7 @@ The `CUA_MODEL_NAME` environment variable supports various model providers throu
### Supported Providers
- **Anthropic**: `anthropic/claude-sonnet-4-20250514`,
- **Anthropic**: `anthropic/claude-sonnet-4-20250514`,
- **OpenAI**: `openai/computer-use-preview`, `openai/gpt-4o`
- **Local Models**: `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B`
- **Omni + LiteLLM**: `omniparser+litellm/gpt-4o`, `omniparser+litellm/claude-3-haiku`

View File

@@ -0,0 +1,17 @@
---
title: MCP Server
description: Run Cua agents through Claude Desktop and other MCP clients
---
The MCP Server exposes Cua agents as tools for [Model Context Protocol](https://modelcontextprotocol.io/) clients like Claude Desktop. This lets you ask Claude to perform computer tasks directly from the chat interface.
```bash
pip install cua-mcp-server
```
## Key Features
- **Claude Desktop integration** - Use Cua agents directly in Claude's chat
- **Multi-client support** - Concurrent sessions with automatic resource management
- **Progress reporting** - Real-time updates during task execution
- **VM safety** - Runs in sandboxed VMs by default

View File

@@ -14,6 +14,7 @@
"usage-tracking",
"telemetry",
"benchmarks",
"integrations"
"integrations",
"mcp-server"
]
}

View File

@@ -1,5 +1,5 @@
---
title: Commands
title: Command Reference
description: Complete reference for all CUA CLI commands
---
@@ -35,7 +35,7 @@ Both styles work identically - use whichever you prefer!
### Available Commands
- **Authentication** - `cua auth login`, `cua auth env`, `cua auth logout` (also available as flat commands: `cua login`, `cua env`, `cua logout`)
- **Sandbox Management** - `cua list`, `cua create`, `cua start`, `cua stop`, `cua restart`, `cua delete`, `cua vnc`
- **Sandbox Management** - `cua list`, `cua create`, `cua get`, `cua start`, `cua stop`, `cua restart`, `cua delete`, `cua vnc`
## Authentication Commands
@@ -188,6 +188,79 @@ Job ID: job-xyz789
Use 'cua list' to monitor provisioning progress
```
### `cua get`
Get detailed information about a specific sandbox, including computer-server health status.
```bash
cua get <name>
# With additional options
cua get <name> --json
cua get <name> --show-passwords
cua get <name> --show-vnc-url
```
**Options:**
- `--json` - Output all details in JSON format
- `--show-passwords` - Include password in output
- `--show-vnc-url` - Include computed NoVNC URL
**Example Output (default):**
```bash
$ cua get my-dev-sandbox
Name: my-dev-sandbox
Status: running
Host: my-dev-sandbox.containers.cloud.trycua.com
OS Type: linux
Computer Server Version: 0.1.30
Computer Server Status: healthy
```
**Example Output (with --show-passwords and --show-vnc-url):**
```bash
$ cua get my-dev-sandbox --show-passwords --show-vnc-url
Name: my-dev-sandbox
Status: running
Host: my-dev-sandbox.containers.cloud.trycua.com
Password: secure-pass-123
OS Type: linux
Computer Server Version: 0.1.30
Computer Server Status: healthy
VNC URL: https://my-dev-sandbox.containers.cloud.trycua.com/vnc.html?autoconnect=true&password=secure-pass-123
```
**Example Output (JSON format):**
```bash
$ cua get my-dev-sandbox --json
{
"name": "my-dev-sandbox",
"status": "running",
"host": "my-dev-sandbox.containers.cloud.trycua.com",
"os_type": "linux",
"computer_server_version": "0.1.30",
"computer_server_status": "healthy"
}
```
**Computer Server Health Check:**
The `cua get` command automatically probes the computer-server when the sandbox is running:
- Checks OS type via `https://{host}:8443/status`
- Checks version via `https://{host}:8443/cmd`
- Shows "Computer Server Status: healthy" when both probes succeed
- Uses a 3-second timeout for each probe
<Callout type="info">
The computer server status is only checked for running sandboxes. Stopped or suspended sandboxes
will not show computer server information.
</Callout>
### `cua start`
Start a stopped sandbox.

View File

@@ -0,0 +1,68 @@
---
title: Getting Started
description: Install and set up the CUA CLI
---
import { Tabs, Tab } from 'fumadocs-ui/components/tabs';
import { Callout } from 'fumadocs-ui/components/callout';
The Cua CLI is a command-line tool for managing your Cua cloud sandboxes. Create, start, stop, and connect to sandboxes directly from your terminal.
## Installation
<Tabs items={['macOS / Linux', 'Windows']}>
<Tab value="macOS / Linux">
```bash
curl -LsSf https://cua.ai/cli/install.sh | sh
```
</Tab>
<Tab value="Windows">
```powershell
powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
```
</Tab>
</Tabs>
This installs [Bun](https://bun.sh) and the CUA CLI. Verify with:
```bash
cua --help
```
## Authentication
Login to your CUA account:
```bash
# Browser-based login
cua auth login
# Or with API key
cua auth login --api-key sk-your-api-key-here
```
Generate a `.env` file for your project:
```bash
cua auth env
```
## Quick Start
```bash
# Create a sandbox
cua create --os linux --size small --region north-america
# List sandboxes
cua list
# Open VNC in browser
cua vnc my-sandbox
# Stop a sandbox
cua stop my-sandbox
```
## Next Steps
- [Command Reference](/cli-playbook/commands) - Full list of available commands

View File

@@ -0,0 +1,5 @@
{
"title": "Cloud CLI",
"description": "Command-line interface for CUA Cloud",
"pages": ["index", "commands"]
}

View File

@@ -5,7 +5,7 @@ description: Computer commands and interface methods
This page describes the set of supported **commands** you can use to control a Cua Computer directly via the Python SDK.
These commands map to the same actions available in the [Computer Server API Commands Reference](../libraries/computer-server/Commands), and provide low-level, async access to system operations from your agent or automation code.
These commands map to the same actions available in the [Computer Server API Commands Reference](/computer-sdk/computer-server/Commands), and provide low-level, async access to system operations from your agent or automation code.
## Shell Actions

View File

@@ -0,0 +1,15 @@
---
title: Computer Server
description: HTTP/WebSocket server for remote computer control
---
The Computer Server is an HTTP and WebSocket server that runs inside each Cua sandbox (VM or container). It exposes APIs for remote computer control - allowing the Computer SDK and agents to execute actions like clicking, typing, taking screenshots, and running commands on the sandboxed environment.
When you use `Computer(provider_type="cloud")` or any other provider, the Computer SDK communicates with this server running inside the sandbox to execute your automation commands.
## Key Features
- **REST API** - Execute commands, take screenshots, manage files
- **WebSocket API** - Real-time streaming for continuous interaction
- **Cross-platform** - Runs on Linux, macOS, and Windows sandboxes
- **Secure** - Isolated inside the sandbox environment

View File

@@ -0,0 +1,4 @@
{
"title": "Computer Server",
"pages": ["index", "Commands", "REST-API", "WebSocket-API"]
}

View File

@@ -1,5 +1,5 @@
---
title: Computer UI (Deprecated)
title: Computer UI
---
<Callout type="warn" title="Deprecated">

View File

@@ -7,6 +7,7 @@
"tracing-api",
"sandboxed-python",
"custom-computer-handlers",
"computer-ui"
"computer-ui",
"computer-server"
]
}

View File

@@ -4,14 +4,7 @@ slug: sandboxed-python
---
<Callout>
A corresponding{' '}
<a
href="https://github.com/trycua/cua/blob/main/examples/sandboxed_functions_examples.py"
target="_blank"
>
Python example
</a>{' '}
is available for this documentation.
A corresponding <a href="https://github.com/trycua/cua/blob/main/examples/sandboxed_functions_examples.py" target="_blank">Python example</a> is available for this documentation.
</Callout>
You can run Python functions securely inside a sandboxed virtual environment on a remote Cua Computer. This is useful for executing untrusted user code, isolating dependencies, or providing a safe environment for automation tasks.

View File

@@ -1,9 +1,9 @@
---
title: Computer Tracing API
title: Tracing
description: Record computer interactions for debugging, training, and analysis
---
# Computer Tracing API
# Tracing
The Computer tracing API provides a powerful way to record computer interactions for debugging, training, analysis, and compliance purposes. Inspired by Playwright's tracing functionality, it offers flexible recording options and standardized output formats.

View File

@@ -19,8 +19,6 @@ import { Code, Terminal } from 'lucide-react';
</Card>
</div> */}
---
## Set Up Your Computer Environment
Choose how you want to run your Cua computer. This will be the environment where your automated tasks will execute.
@@ -43,7 +41,7 @@ You can run your Cua computer in the cloud (recommended for easiest setup), loca
**Option 1: Via Website**
1. Navigate to **Dashboard > Sandboxes > Create Sandbox**
2. Create a **Small** sandbox, choosing **Linux**, **Windows**, or **macOS**
2. Create a sandbox, choosing **Linux**, **Windows**, or **macOS**
3. Note your sandbox name
**Option 2: Via CLI**
@@ -122,6 +120,10 @@ You can run your Cua computer in the cloud (recommended for easiest setup), loca
## Developer Quickstart
<Callout type="warn" title="Python Version Compatibility">
Cua packages require **Python 3.12 or 3.13**. Python 3.14 is not currently supported due to dependency compatibility issues (pydantic-core/PyO3 compatibility). If you encounter build errors on Python 3.14, please use Python 3.13 or earlier.
</Callout>
<Steps>
<Step>

View File

@@ -4,55 +4,46 @@ title: Introduction
import { Monitor, Code, BookOpen, Zap, Bot, Boxes, Rocket } from 'lucide-react';
<div className="rounded-lg border bg-card text-card-foreground shadow-sm px-4 py-2 mb-6">
Cua is an open-source framework for building **Computer-Use Agents** - AI systems that see,
understand, and interact with desktop applications through vision and action, just like humans do.
<div className="not-prose -mt-2 mb-6">
<p className="text-fd-primary font-semibold text-sm mb-1">Welcome</p>
<h1 className="text-3xl font-bold tracking-tight md:text-4xl">Welcome to Cua</h1>
</div>
## Why Cua?
**Cua** is an open-source framework for building, deploying and evaluating Computer-Use Agents - AI systems that autonomously interact with computer interfaces by understanding visual elements and executing actions. Cua provides SDKs for easy integration with 100+ vision-language models (VLMs), supporting everything from simple task automation to complex multi-step workflows across Windows, Linux, and macOS environments.
Cua gives you everything you need to automate any desktop application without brittle selectors or APIs.
Some highlights include:
- **Model flexibility** - Connect to 100+ LLM providers through liteLLM's standard interface. Use models from Anthropic, OpenAI, Google, and more - or run them locally with Ollama, Hugging Face, or MLX.
- **Composed agents** - Mix and match grounding models with planning models for optimal performance. Use specialized models like GTA, OpenCUA, or OmniParser for UI element detection paired with powerful reasoning models like Claude or GPT-4.
- **Cross-platform sandboxes** - Run agents safely in isolated environments. Choose from Docker containers, macOS VMs with Lume, Windows Sandbox, or deploy to Cua Cloud with production-ready infrastructure.
- **Computer SDK** - Control any application with a PyAutoGUI-like API. Click, type, scroll, take screenshots, manage windows, read/write files - everything you need for desktop automation.
- **Agent SDK** - Build autonomous agents with trajectory tracing, prompt caching, cost tracking, and budget controls. Test agents on industry-standard benchmarks like OSWorld-Verified with one line of code.
- **Human-in-the-loop** - Pause agent execution and await user input or approval before continuing. Use the `human/human` model string to let humans control the agent directly.
- **Production essentials** - Ship reliable agents with built-in PII anonymization, cost tracking, trajectory logging, and integration with observability platforms like Laminar and HUD.
## What can you build?
- RPA automation that works with any application - even legacy software without APIs.
- Form-filling agents that handle complex multi-step web workflows.
- Testing automation that adapts to UI changes without brittle selectors.
- Data extraction from desktop applications and document processing.
- Cross-application workflows that combine multiple tools and services.
- Research agents that browse, read, and synthesize information from the web.
Explore real-world examples in our [blog posts](https://cua.ai/blog).
## Get started
Follow the [Quickstart guide](/docs/get-started/quickstart) for step-by-step setup with Python or TypeScript.
If you're new to computer-use agents, check out our [tutorials](https://cua.ai/blog), [examples](https://github.com/trycua/cua/tree/main/examples), and [notebooks](https://github.com/trycua/cua/tree/main/notebooks) to start building with Cua today.
<div className="grid grid-cols-1 md:grid-cols-2 gap-6 mt-8">
<Card icon={<Rocket />} href="/get-started/quickstart" title="Quickstart">
Get up and running in 3 steps with Python or TypeScript.
</Card>
<Card icon={<Zap />} href="/agent-sdk/agent-loops" title="Agent Loops">
Learn how agents work and how to build your own.
</Card>
<Card icon={<BookOpen />} href="/computer-sdk/computers" title="Computer SDK">
Control desktop applications with the Computer SDK.
</Card>
<Card icon={<Monitor />} href="/example-usecases/form-filling" title="Example Use Cases">
See Cua in action with real-world examples.
</Card>
<div className="not-prose relative rounded-xl overflow-hidden my-8 w-full">
<img src="/docs/img/hero.png" alt="Cua" className="w-full h-auto rounded-xl" />
</div>
We can't wait to see what you build with Cua ✨
## What is a Computer-Use Agent?
Computer-Use Agents (CUAs) are AI systems that can autonomously interact with computer interfaces through visual understanding and action execution. They work by capturing screenshots, feeding them to a vision-language model (VLM), and letting the model determine the next action to take - such as clicking, typing, or scrolling - in a continuous loop until the task is complete.
## What is a Computer-Use Sandbox?
Computer-Use Sandboxes are isolated, controlled environments where AI agents can safely interact with computer interfaces. They provide a secure execution space where agents can perform actions such as clicking, typing, and running code, test automation workflows, and learn from interactions — all without affecting production systems.
## Key Features
With the **Computer SDK**, you can:
- Automate **Windows, Linux, and macOS** sandboxes with a consistent, pyautogui-like API
- Create & manage sandboxes locally or using **Cua Cloud**
With the **Agent SDK**, you can:
- Run computer-use models with a consistent schema
- Benchmark on **OSWorld-Verified**, **SheetBench-V2**, and **ScreenSpot**
- Combine UI grounding models with any LLM using **composed agents**
- Use **100+ models** via API or local inference (Claude, GPT-4, Gemini, Ollama, MLX)
## Get Started
Follow the [Quickstart guide](/get-started/quickstart) for step-by-step setup with Python or TypeScript.
Check out our [tutorials](https://cua.ai/blog), [examples](https://github.com/trycua/cua/tree/main/examples), and [notebooks](https://github.com/trycua/cua/tree/main/notebooks) to start building with Cua today.
<div className="grid grid-cols-2 md:grid-cols-4 gap-2 mt-4 text-sm">
<Card icon={<Rocket className="w-4 h-4" />} href="/get-started/quickstart" title="Quickstart" />
<Card icon={<Zap className="w-4 h-4" />} href="/agent-sdk/agent-loops" title="Agent Loops" />
<Card icon={<BookOpen className="w-4 h-4" />} href="/computer-sdk/computers" title="Computer SDK" />
<Card icon={<Monitor className="w-4 h-4" />} href="/example-usecases/form-filling" title="Examples" />
</div>

View File

@@ -1,21 +0,0 @@
---
title: Agent
description: Reference for the current version of the Agent library.
pypi: cua-agent
github:
- https://github.com/trycua/cua/tree/main/libs/python/agent
---
The Agent library provides the ComputerAgent class and tools for building AI agents that automate workflows on Cua Computers.
## Agent Loops
See the [Agent Loops](../agent-sdk/agent-loops) documentation for how agents process information and take actions.
## Chat History
See the [Chat History](../agent-sdk/chat-history) documentation for managing conversational context and turn-by-turn interactions.
## Callbacks
See the [Callbacks](../agent-sdk/callbacks) documentation for extending and customizing agent behavior with custom hooks.

View File

@@ -1,24 +0,0 @@
---
title: Computer Server
description: Reference for the current version of the Computer Server library.
pypi: cua-computer-server
github:
- https://github.com/trycua/cua/tree/main/libs/python/computer-server
---
<Callout>
A corresponding{' '}
<a
href="https://github.com/trycua/cua/blob/main/notebooks/computer_server_nb.ipynb"
target="_blank"
>
Jupyter Notebook
</a>{' '}
is available for this documentation.
</Callout>
The Computer Server API reference documentation is currently under development.
## Overview
The Computer Server provides WebSocket and REST API endpoints for remote computer control and automation.

View File

@@ -1,23 +0,0 @@
---
title: Computer
description: Reference for the current version of the Computer library.
pypi: cua-computer
npm: '@trycua/computer'
github:
- https://github.com/trycua/cua/tree/main/libs/python/computer
- https://github.com/trycua/cua/tree/main/libs/typescript/computer
---
The Computer library provides a Computer class for controlling and automating containers running the Computer Server.
## Connecting to Computers
See the [Cua Computers](../computer-sdk/computers) documentation for how to connect to different computer types (cloud, local, or host desktop).
## Computer Commands
See the [Commands](../computer-sdk/commands) documentation for all supported commands and interface methods (Shell, Mouse, Keyboard, File System, etc.).
## Sandboxed Python Functions
See the [Sandboxed Python](../computer-sdk/sandboxed-python) documentation for running Python functions securely in isolated environments on a remote Cua Computer.

View File

@@ -1,13 +0,0 @@
---
title: Core
description: Reference for the current version of the Core library.
pypi: cua-core
npm: '@trycua/core'
github:
- https://github.com/trycua/cua/tree/main/libs/python/core
- https://github.com/trycua/cua/tree/main/libs/typescript/core
---
## Overview
The Core library provides foundational utilities and shared functionality across the CUA ecosystem.

View File

@@ -1,58 +0,0 @@
---
title: Cua CLI
description: Command-line interface for managing Cua cloud sandboxes and authentication
---
import { Tabs, Tab } from 'fumadocs-ui/components/tabs';
The Cua CLI is a command-line tool that provides an intuitive interface for managing your Cua cloud sandboxes and authentication. It offers a streamlined workflow for creating, managing, and connecting to cloud sandboxes.
## Key Features
- **Authentication Management**: Secure login with browser-based OAuth flow
- **Sandbox Lifecycle**: Create, start, stop, restart, and delete cloud sandboxes
- **Quick Access**: Direct links to VNC and playground interfaces
- **Cross-Platform**: Works on macOS, Linux, and Windows
- **Environment Integration**: Automatic `.env` file generation
## Quick Example
```bash
# Install the CLI (installs Bun + CUA CLI)
curl -LsSf https://cua.ai/cli/install.sh | sh
# Login to your CUA account
cua auth login
# Create a new Linux sandbox
cua sb create --os linux --size small --region north-america
# List your sandboxes
cua sb list
```
## Use Cases
### Development Workflow
- Quickly spin up cloud sandboxes for testing
- Manage multiple sandboxes across different regions
- Integrate with CI/CD pipelines
### Team Collaboration
- Share sandbox configurations and access
- Standardize development environments
- Quick onboarding for new team members
### Automation
- Script sandbox provisioning and management
- Integrate with deployment workflows
- Automate environment setup
## Next Steps
- [Install the CLI](/libraries/cua-cli/installation)
- [Learn about available commands](/libraries/cua-cli/commands)
- [Get started with the quickstart guide](/get-started/quickstart#cli-quickstart)

View File

@@ -1,130 +0,0 @@
---
title: Installation
description: Install the CUA CLI on your system
---
import { Tabs, Tab } from 'fumadocs-ui/components/tabs';
import { Callout } from 'fumadocs-ui/components/callout';
## Quick Install
The fastest way to install the CUA CLI is using our installation scripts:
<Tabs items={['macOS / Linux', 'Windows']}>
<Tab value="macOS / Linux">```bash curl -LsSf https://cua.ai/cli/install.sh | sh ```</Tab>
<Tab value="Windows">
```powershell powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
```
</Tab>
</Tabs>
These scripts will automatically:
1. Install [Bun](https://bun.sh) (a fast JavaScript runtime)
2. Install the CUA CLI via `bun add -g @trycua/cli`
<Callout type="info">
The installation scripts will automatically detect your system and install the appropriate binary
to your PATH.
</Callout>
## Alternative: Install with Bun
You can also install the CLI directly using Bun:
```bash
# Install Bun if you don't have it
curl -fsSL https://bun.sh/install | bash
# Install CUA CLI
bun add -g @trycua/cli
```
<Callout type="info">
Using Bun provides faster installation and better performance compared to npm. If you don't have
Bun installed, the first command will install it for you.
</Callout>
## Verify Installation
After installation, verify the CLI is working:
```bash
cua --help
```
You should see the CLI help output with available commands.
## First Time Setup
After installation, you'll need to authenticate with your CUA account:
```bash
# Login with browser-based OAuth flow
cua auth login
# Or provide your API key directly
cua auth login --api-key sk-your-api-key-here
```
## Updating
To update to the latest version:
<Tabs items={['Script Install', 'npm Install']}>
<Tab value="Script Install">
Re-run the installation script: ```bash # macOS/Linux curl -LsSf https://cua.ai/cli/install.sh |
sh # Windows powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
```
</Tab>
<Tab value="npm Install">```bash npm update -g @trycua/cli ```</Tab>
</Tabs>
## Uninstalling
<Tabs items={['Script Install', 'npm Install']}>
<Tab value="Script Install">
Remove the binary from your PATH: ```bash # macOS/Linux rm $(which cua) # Windows # Remove from
your PATH or delete the executable ```
</Tab>
<Tab value="npm Install">```bash npm uninstall -g @trycua/cli ```</Tab>
</Tabs>
## Troubleshooting
### Command Not Found
If you get a "command not found" error after installation:
1. **Check your PATH**: Make sure the installation directory is in your PATH
2. **Restart your terminal**: Close and reopen your terminal/command prompt
3. **Manual PATH setup**: Add the installation directory to your PATH manually
### Permission Issues
If you encounter permission issues during installation:
<Tabs items={['macOS / Linux', 'Windows']}>
<Tab value="macOS / Linux">
Try running with sudo (not recommended for the curl method): ```bash # If using npm sudo npm
install -g @trycua/cli ```
</Tab>
<Tab value="Windows">
Run PowerShell as Administrator: ```powershell # Right-click PowerShell and "Run as
Administrator" powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
```
</Tab>
</Tabs>
### Network Issues
If the installation script fails due to network issues:
1. **Check your internet connection**
2. **Try the npm installation method instead**
3. **Check if your firewall is blocking the download**
## Next Steps
- [Learn about CLI commands](/libraries/cua-cli/commands)
- [Follow the quickstart guide](/get-started/quickstart#cli-quickstart)

View File

@@ -1,5 +0,0 @@
{
"title": "CLI",
"description": "Command-line interface for CUA",
"pages": ["index", "installation", "commands"]
}

View File

@@ -1,27 +0,0 @@
---
title: MCP Server
description: Reference for the current version of the MCP Server library.
pypi: cua-mcp-server
github:
- https://github.com/trycua/cua/tree/main/libs/python/mcp-server
---
**cua-mcp-server** is an MCP server for the Computer-Use Agent (CUA), allowing you to run CUA through Claude Desktop or other MCP clients.
## Features
- **Multi-Client Support**: Concurrent sessions with automatic resource management
- **Progress Reporting**: Real-time progress updates during task execution
- **Error Handling**: Robust error recovery with screenshot capture
- **Concurrent Execution**: Run multiple tasks in parallel for improved performance
- **Session Management**: Automatic cleanup and resource pooling
- **LiteLLM Integration**: Support for multiple model providers
- **VM Safety**: Default VM execution with optional host system control
## Quick Start
1. **Install**: `pip install cua-mcp-server`
2. **Configure**: Add to your MCP client configuration
3. **Use**: Ask Claude to perform computer tasks
See the [Installation](/docs/libraries/mcp-server/installation) guide for detailed setup instructions.

View File

@@ -1,78 +0,0 @@
---
title: Configuration
---
### Detection Parameters
#### Box Threshold (0.3)
Controls the confidence threshold for accepting detections:
<img
src="/docs/img/som_box_threshold.png"
alt="Illustration of confidence thresholds in object detection, with a high-confidence detection accepted and a low-confidence detection rejected."
width="500px"
/>
- Higher values (0.3) yield more precise but fewer detections - Lower values (0.01) catch more
potential icons but increase false positives - Default is 0.3 for optimal precision/recall balance
#### IOU Threshold (0.1)
Controls how overlapping detections are merged:
<img
src="/docs/img/som_iou_threshold.png"
alt="Diagram showing Intersection over Union (IOU) with low overlap between two boxes kept separate and high overlap leading to merging."
width="500px"
/>
- Lower values (0.1) more aggressively remove overlapping boxes - Higher values (0.5) allow more
overlapping detections - Default is 0.1 to handle densely packed UI elements
### OCR Configuration
- **Engine**: EasyOCR
- Primary choice for all platforms
- Fast initialization and processing
- Built-in English language support
- GPU acceleration when available
- **Settings**:
- Timeout: 5 seconds
- Confidence threshold: 0.5
- Paragraph mode: Disabled
- Language: English only
## Performance
### Hardware Acceleration
#### MPS (Metal Performance Shaders)
- Multi-scale detection (640px, 1280px, 1920px)
- Test-time augmentation enabled
- Half-precision (FP16)
- Average detection time: ~0.4s
- Best for production use when available
#### CPU
- Single-scale detection (1280px)
- Full-precision (FP32)
- Average detection time: ~1.3s
- Reliable fallback option
### Example Output Structure
```
examples/output/
├── {timestamp}_no_ocr/
│ ├── annotated_images/
│ │ └── screenshot_analyzed.png
│ ├── screen_details.txt
│ └── summary.json
└── {timestamp}_ocr/
├── annotated_images/
│ └── screenshot_analyzed.png
├── screen_details.txt
└── summary.json
```

View File

@@ -1,66 +0,0 @@
---
title: Set-of-Mark
description: Reference for the current version of the Set-of-Mark library.
pypi: cua-som
github:
- https://github.com/trycua/cua/tree/main/libs/python/som
---
<Callout>
A corresponding{' '}
<a href="https://github.com/trycua/cua/blob/main/examples/som_examples.py" target="_blank">
Python example
</a>{' '}
is available for this documentation.
</Callout>
## Overview
The SOM library provides visual element detection and interaction capabilities. It is based on the [Set-of-Mark](https://arxiv.org/abs/2310.11441) research paper and the [OmniParser](https://github.com/microsoft/OmniParser) model.
## API Documentation
### OmniParser Class
```python
class OmniParser:
def __init__(self, device: str = "auto"):
"""Initialize the parser with automatic device detection"""
def parse(
self,
image: PIL.Image,
box_threshold: float = 0.3,
iou_threshold: float = 0.1,
use_ocr: bool = True,
ocr_engine: str = "easyocr"
) -> ParseResult:
"""Parse UI elements from an image"""
```
### ParseResult Object
```python
@dataclass
class ParseResult:
elements: List[UIElement] # Detected elements
visualized_image: PIL.Image # Annotated image
processing_time: float # Time in seconds
def to_dict(self) -> dict:
"""Convert to JSON-serializable dictionary"""
def filter_by_type(self, elem_type: str) -> List[UIElement]:
"""Filter elements by type ('icon' or 'text')"""
```
### UIElement
```python
class UIElement(BaseModel):
id: Optional[int] = Field(None) # Element ID (1-indexed)
type: Literal["icon", "text"] # Element type
bbox: BoundingBox # Bounding box coordinates { x1, y1, x2, y2 }
interactivity: bool = Field(default=False) # Whether the element is interactive
confidence: float = Field(default=1.0) # Detection confidence
```

View File

@@ -0,0 +1,5 @@
{
"title": "macOS VM CLI",
"description": "CLI tools for macOS virtualization",
"pages": ["lume", "lumier"]
}

View File

@@ -10,9 +10,11 @@
"...example-usecases",
"---[BookCopy]Computer Playbook---",
"...computer-sdk",
"---[BookCopy]Agent Playbook---",
"---[Bot]Agent Playbook---",
"...agent-sdk",
"---[CodeXml]API Reference---",
"...libraries"
"---[Terminal]Cloud CLI Playbook---",
"...cli-playbook",
"---[Terminal]macOS VM CLI Playbook---",
"...macos-vm-cli-playbook"
]
}

View File

@@ -10,11 +10,11 @@
},
"dependencies": {
"fumadocs-core": "16.0.8",
"fumadocs-mdx": "13.0.5",
"fumadocs-mdx": "13.0.8",
"fumadocs-ui": "16.0.8",
"lucide-react": "^0.525.0",
"mermaid": "^11.8.1",
"next": "16.0.1",
"mermaid": "^11.12.1",
"next": "16.0.7",
"next-themes": "^0.4.6",
"posthog-js": "^1.276.0",
"react": "^19.2.0",
@@ -42,6 +42,9 @@
"@tailwindcss/oxide",
"esbuild",
"sharp"
]
],
"overrides": {
"js-yaml@>=4.0.0 <4.1.1": ">=4.1.1"
}
}
}

261
docs/pnpm-lock.yaml generated
View File

@@ -4,28 +4,31 @@ settings:
autoInstallPeers: true
excludeLinksFromLockfile: false
overrides:
js-yaml@>=4.0.0 <4.1.1: '>=4.1.1'
importers:
.:
dependencies:
fumadocs-core:
specifier: 16.0.8
version: 16.0.8(@types/react@19.1.8)(lucide-react@0.525.0(react@19.2.0))(next@16.0.1(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(react-dom@19.2.0(react@19.2.0))(react@19.2.0)
version: 16.0.8(@types/react@19.1.8)(lucide-react@0.525.0(react@19.2.0))(next@16.0.7(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(react-dom@19.2.0(react@19.2.0))(react@19.2.0)
fumadocs-mdx:
specifier: 13.0.5
version: 13.0.5(fumadocs-core@16.0.8(@types/react@19.1.8)(lucide-react@0.525.0(react@19.2.0))(next@16.0.1(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(next@16.0.1(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(react@19.2.0)
specifier: 13.0.8
version: 13.0.8(fumadocs-core@16.0.8(@types/react@19.1.8)(lucide-react@0.525.0(react@19.2.0))(next@16.0.7(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(next@16.0.7(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(react@19.2.0)
fumadocs-ui:
specifier: 16.0.8
version: 16.0.8(@types/react-dom@19.1.6(@types/react@19.1.8))(@types/react@19.1.8)(lucide-react@0.525.0(react@19.2.0))(next@16.0.1(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(react-dom@19.2.0(react@19.2.0))(react@19.2.0)(tailwindcss@4.1.10)
version: 16.0.8(@types/react-dom@19.1.6(@types/react@19.1.8))(@types/react@19.1.8)(lucide-react@0.525.0(react@19.2.0))(next@16.0.7(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(react-dom@19.2.0(react@19.2.0))(react@19.2.0)(tailwindcss@4.1.10)
lucide-react:
specifier: ^0.525.0
version: 0.525.0(react@19.2.0)
mermaid:
specifier: ^11.8.1
version: 11.8.1
specifier: ^11.12.1
version: 11.12.1
next:
specifier: 16.0.1
version: 16.0.1(react-dom@19.2.0(react@19.2.0))(react@19.2.0)
specifier: 16.0.7
version: 16.0.7(react-dom@19.2.0(react@19.2.0))(react@19.2.0)
next-themes:
specifier: ^0.4.6
version: 0.4.6(react-dom@19.2.0(react@19.2.0))(react@19.2.0)
@@ -98,9 +101,6 @@ packages:
'@antfu/install-pkg@1.1.0':
resolution: {integrity: sha512-MGQsmw10ZyI+EJo45CdSER4zEb+p31LpDAFp2Z3gkSd1yqVZGi0Ebx++YTEMonJy4oChEMLsxZ64j8FH6sSqtQ==}
'@antfu/utils@8.1.1':
resolution: {integrity: sha512-Mex9nXf9vR6AhcXmMrlz/HVgYYZpVGJ6YlPgwl7UnaFpnshXs6EK/oa5Gpf3CzENMjkvEx2tQtntGnb7UtSTOQ==}
'@braintree/sanitize-url@7.1.1':
resolution: {integrity: sha512-i1L7noDNxtFyL5DmZafWy1wRVhGehQmzZaz1HiN5e7iylJMSZR7ekOV7NsIqa5qBldlLrsKv4HbgFUVlQrz8Mw==}
@@ -299,8 +299,8 @@ packages:
'@iconify/types@2.0.0':
resolution: {integrity: sha512-+wluvCrRhXrhyOmRDJ3q8mux9JkKy5SJ/v8ol2tu4FVjyYvtEzkc/3pK15ET6RKg4b4w4BmTk1+gsCUhf21Ykg==}
'@iconify/utils@2.3.0':
resolution: {integrity: sha512-GmQ78prtwYW6EtzXRU1rY+KwOKfz32PD7iJh6Iyqw68GiKuoZ2A6pRtzWONz5VQJbp50mEjXh/7NkumtrAgRKA==}
'@iconify/utils@3.1.0':
resolution: {integrity: sha512-Zlzem1ZXhI1iHeeERabLNzBHdOa4VhQbqAcOQaMKuTuyZCpwKbC2R4Dd0Zo3g9EAc+Y4fiarO8HIHRAth7+skw==}
'@img/colour@1.0.0':
resolution: {integrity: sha512-A5P/LfWGFSl6nsckYtjw9da+19jB8hkJ6ACTGcDfEJ0aE+l2n2El7dsVM7UVHZQ9s2lmYMWlrS21YLy2IR1LUw==}
@@ -464,56 +464,56 @@ packages:
'@mdx-js/mdx@3.1.1':
resolution: {integrity: sha512-f6ZO2ifpwAQIpzGWaBQT2TXxPv6z3RBzQKpVftEWN78Vl/YweF1uwussDx8ECAXVtr3Rs89fKyG9YlzUs9DyGQ==}
'@mermaid-js/parser@0.6.1':
resolution: {integrity: sha512-lCQNpV8R4lgsGcjX5667UiuDLk2micCtjtxR1YKbBXvN5w2v+FeLYoHrTSSrjwXdMcDYvE4ZBPvKT31dfeSmmA==}
'@mermaid-js/parser@0.6.3':
resolution: {integrity: sha512-lnjOhe7zyHjc+If7yT4zoedx2vo4sHaTmtkl1+or8BRTnCtDmcTpAjpzDSfCZrshM5bCoz0GyidzadJAH1xobA==}
'@next/env@16.0.1':
resolution: {integrity: sha512-LFvlK0TG2L3fEOX77OC35KowL8D7DlFF45C0OvKMC4hy8c/md1RC4UMNDlUGJqfCoCS2VWrZ4dSE6OjaX5+8mw==}
'@next/env@16.0.7':
resolution: {integrity: sha512-gpaNgUh5nftFKRkRQGnVi5dpcYSKGcZZkQffZ172OrG/XkrnS7UBTQ648YY+8ME92cC4IojpI2LqTC8sTDhAaw==}
'@next/swc-darwin-arm64@16.0.1':
resolution: {integrity: sha512-R0YxRp6/4W7yG1nKbfu41bp3d96a0EalonQXiMe+1H9GTHfKxGNCGFNWUho18avRBPsO8T3RmdWuzmfurlQPbg==}
'@next/swc-darwin-arm64@16.0.7':
resolution: {integrity: sha512-LlDtCYOEj/rfSnEn/Idi+j1QKHxY9BJFmxx7108A6D8K0SB+bNgfYQATPk/4LqOl4C0Wo3LACg2ie6s7xqMpJg==}
engines: {node: '>= 10'}
cpu: [arm64]
os: [darwin]
'@next/swc-darwin-x64@16.0.1':
resolution: {integrity: sha512-kETZBocRux3xITiZtOtVoVvXyQLB7VBxN7L6EPqgI5paZiUlnsgYv4q8diTNYeHmF9EiehydOBo20lTttCbHAg==}
'@next/swc-darwin-x64@16.0.7':
resolution: {integrity: sha512-rtZ7BhnVvO1ICf3QzfW9H3aPz7GhBrnSIMZyr4Qy6boXF0b5E3QLs+cvJmg3PsTCG2M1PBoC+DANUi4wCOKXpA==}
engines: {node: '>= 10'}
cpu: [x64]
os: [darwin]
'@next/swc-linux-arm64-gnu@16.0.1':
resolution: {integrity: sha512-hWg3BtsxQuSKhfe0LunJoqxjO4NEpBmKkE+P2Sroos7yB//OOX3jD5ISP2wv8QdUwtRehMdwYz6VB50mY6hqAg==}
'@next/swc-linux-arm64-gnu@16.0.7':
resolution: {integrity: sha512-mloD5WcPIeIeeZqAIP5c2kdaTa6StwP4/2EGy1mUw8HiexSHGK/jcM7lFuS3u3i2zn+xH9+wXJs6njO7VrAqww==}
engines: {node: '>= 10'}
cpu: [arm64]
os: [linux]
'@next/swc-linux-arm64-musl@16.0.1':
resolution: {integrity: sha512-UPnOvYg+fjAhP3b1iQStcYPWeBFRLrugEyK/lDKGk7kLNua8t5/DvDbAEFotfV1YfcOY6bru76qN9qnjLoyHCQ==}
'@next/swc-linux-arm64-musl@16.0.7':
resolution: {integrity: sha512-+ksWNrZrthisXuo9gd1XnjHRowCbMtl/YgMpbRvFeDEqEBd523YHPWpBuDjomod88U8Xliw5DHhekBC3EOOd9g==}
engines: {node: '>= 10'}
cpu: [arm64]
os: [linux]
'@next/swc-linux-x64-gnu@16.0.1':
resolution: {integrity: sha512-Et81SdWkcRqAJziIgFtsFyJizHoWne4fzJkvjd6V4wEkWTB4MX6J0uByUb0peiJQ4WeAt6GGmMszE5KrXK6WKg==}
'@next/swc-linux-x64-gnu@16.0.7':
resolution: {integrity: sha512-4WtJU5cRDxpEE44Ana2Xro1284hnyVpBb62lIpU5k85D8xXxatT+rXxBgPkc7C1XwkZMWpK5rXLXTh9PFipWsA==}
engines: {node: '>= 10'}
cpu: [x64]
os: [linux]
'@next/swc-linux-x64-musl@16.0.1':
resolution: {integrity: sha512-qBbgYEBRrC1egcG03FZaVfVxrJm8wBl7vr8UFKplnxNRprctdP26xEv9nJ07Ggq4y1adwa0nz2mz83CELY7N6Q==}
'@next/swc-linux-x64-musl@16.0.7':
resolution: {integrity: sha512-HYlhqIP6kBPXalW2dbMTSuB4+8fe+j9juyxwfMwCe9kQPPeiyFn7NMjNfoFOfJ2eXkeQsoUGXg+O2SE3m4Qg2w==}
engines: {node: '>= 10'}
cpu: [x64]
os: [linux]
'@next/swc-win32-arm64-msvc@16.0.1':
resolution: {integrity: sha512-cPuBjYP6I699/RdbHJonb3BiRNEDm5CKEBuJ6SD8k3oLam2fDRMKAvmrli4QMDgT2ixyRJ0+DTkiODbIQhRkeQ==}
'@next/swc-win32-arm64-msvc@16.0.7':
resolution: {integrity: sha512-EviG+43iOoBRZg9deGauXExjRphhuYmIOJ12b9sAPy0eQ6iwcPxfED2asb/s2/yiLYOdm37kPaiZu8uXSYPs0Q==}
engines: {node: '>= 10'}
cpu: [arm64]
os: [win32]
'@next/swc-win32-x64-msvc@16.0.1':
resolution: {integrity: sha512-XeEUJsE4JYtfrXe/LaJn3z1pD19fK0Q6Er8Qoufi+HqvdO4LEPyCxLUt4rxA+4RfYo6S9gMlmzCMU2F+AatFqQ==}
'@next/swc-win32-x64-msvc@16.0.7':
resolution: {integrity: sha512-gniPjy55zp5Eg0896qSrf3yB1dw4F/3s8VK1ephdsZZ129j2n6e1WqCbE2YgcKhW9hPB9TVZENugquWJD5x0ug==}
engines: {node: '>= 10'}
cpu: [x64]
os: [win32]
@@ -1239,9 +1239,6 @@ packages:
confbox@0.1.8:
resolution: {integrity: sha512-RMtmw0iFkeR4YV+fUOSucriAQNb9g8zFR52MWCtl+cCZOFRNL6zeB395vPzFhEjjn4fMxXudmELnl/KF/WrK6w==}
confbox@0.2.2:
resolution: {integrity: sha512-1NB+BKqhtNipMsov4xI/NnhCKp9XG9NamYp5PVm9klAT0fsrNPjaFICsCFhNhwZJKNh7zB/3q8qXz0E9oaMNtQ==}
core-js@3.46.0:
resolution: {integrity: sha512-vDMm9B0xnqqZ8uSBpZ8sNtRtOdmfShrvT6h2TuQGLs0Is+cR0DYbj/KWP6ALVNbWPpqA/qPLoOuppJN07humpA==}
@@ -1412,11 +1409,11 @@ packages:
resolution: {integrity: sha512-e1U46jVP+w7Iut8Jt8ri1YsPOvFpg46k+K8TpCb0P+zjCkjkPnV7WzfDJzMHy1LnA+wj5pLT1wjO901gLXeEhA==}
engines: {node: '>=12'}
dagre-d3-es@7.0.11:
resolution: {integrity: sha512-tvlJLyQf834SylNKax8Wkzco/1ias1OPw8DcUMDE7oUIoSEW25riQVuiu/0OWEFqT0cxHT3Pa9/D82Jr47IONw==}
dagre-d3-es@7.0.13:
resolution: {integrity: sha512-efEhnxpSuwpYOKRm/L5KbqoZmNNukHa/Flty4Wp62JRvgH2ojwVgPgdYyr4twpieZnyRDdIH7PY2mopX26+j2Q==}
dayjs@1.11.13:
resolution: {integrity: sha512-oaMBel6gjolK862uaPQOVTA7q3TZhuSvuMQAAglQDOWYO9A91IrAOUJEyKVlqJlHE0vq5p5UXxzdPfMH/x6xNg==}
dayjs@1.11.19:
resolution: {integrity: sha512-t5EcLVS6QPBNqM2z8fakk/NKel+Xzshgt8FFKAn+qwlD1pzZWxh0nVCrvFK7ZDb6XucZeF9z8C7CBWTRIVApAw==}
debug@4.4.1:
resolution: {integrity: sha512-KcKCqiftBJcZr++7ykoDIEwSa3XWowTfNPo92BYxjXiyYEVrUQh2aLyhxBCwww+heortUFxEJYcRzosstTEBYQ==}
@@ -1497,9 +1494,6 @@ packages:
estree-walker@3.0.3:
resolution: {integrity: sha512-7RUKfXgSMMkzt6ZuXmqapOurLGPPfgj6l9uRZ7lRGolvk0y2yocc35LdcxKC5PQZdn2DMqioAQ2NoWcrTKmm6g==}
exsolve@1.0.7:
resolution: {integrity: sha512-VO5fQUzZtI6C+vx4w/4BWJpg3s/5l+6pRQEHzFRM8WFi4XffSP1Z+4qi7GbjWbvRQEbdIco5mIMq+zX4rPuLrw==}
extend@3.0.2:
resolution: {integrity: sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==}
@@ -1553,8 +1547,8 @@ packages:
waku:
optional: true
fumadocs-mdx@13.0.5:
resolution: {integrity: sha512-ERhPxQzoTwEdtuel5dN5OmUItOhGGXTLR1uCjiGPABYeVkc57vAexyTRQSYZMxGlcfjkJaYqt3qY1p5j7i4g7A==}
fumadocs-mdx@13.0.8:
resolution: {integrity: sha512-UbUwH0iGvYbytnxhmfd7tWJKFK8L0mrbTAmrQYnpg6Wi/h8afNMJmbHBOzVcaEWJKeFipZ1CGDAsNA2fztwXNg==}
hasBin: true
peerDependencies:
'@fumadocs/mdx-remote': ^1.4.0
@@ -1595,10 +1589,6 @@ packages:
github-slugger@2.0.0:
resolution: {integrity: sha512-IaOQ9puYtjrkq7Y0Ygl9KDZnrf/aiUJYUpVf89y8kyaxbRG7Y1SrX/jaumrv81vc61+kiMempujsM3Yw7w5qcw==}
globals@15.15.0:
resolution: {integrity: sha512-7ACyT3wmyp3I61S4fG682L0VA2RGD9otkqGJIwNUMF1SWUombIIk+af1unuDYgMm082aHYwD+mzJvv9Iu8dsgg==}
engines: {node: '>=18'}
graceful-fs@4.2.11:
resolution: {integrity: sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==}
@@ -1662,8 +1652,8 @@ packages:
resolution: {integrity: sha512-rg9zJN+G4n2nfJl5MW3BMygZX56zKPNVEYYqq7adpmMh4Jn2QNEwhvQlFy6jPVdcod7txZtKHWnyZiA3a0zP7A==}
hasBin: true
js-yaml@4.1.0:
resolution: {integrity: sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==}
js-yaml@4.1.1:
resolution: {integrity: sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==}
hasBin: true
katex@0.16.22:
@@ -1673,9 +1663,6 @@ packages:
khroma@2.1.0:
resolution: {integrity: sha512-Ls993zuzfayK269Svk9hzpeGUKob/sIgZzyHYdjQoAdQetRKpOLj+k/QQQ/6Qi0Yz65mlROrfd+Ev+1+7dz9Kw==}
kolorist@1.8.0:
resolution: {integrity: sha512-Y+60/zizpJ3HRH8DCss+q95yr6145JXZo46OTpFvDZWLfRCE4qChOyk1b26nMaNpfHHgxagk9dXT5OP0Tfe+dQ==}
langium@3.3.1:
resolution: {integrity: sha512-QJv/h939gDpvT+9SiLVlY7tZC3xB2qK57v0J04Sh9wpMb6MP1q8gB21L3WIo8T5P1MSMg3Ep14L7KkDCFG3y4w==}
engines: {node: '>=16.0.0'}
@@ -1750,10 +1737,6 @@ packages:
resolution: {integrity: sha512-xi6IyHML+c9+Q3W0S4fCQJOym42pyurFiJUHEcEyHS0CeKzia4yZDEsLlqOFykxOdHpNy0NmvVO31vcSqAxJCg==}
engines: {node: '>= 12.0.0'}
local-pkg@1.1.1:
resolution: {integrity: sha512-WunYko2W1NcdfAFpuLUoucsgULmgDBRkdxHxWQ7mK0cQqwPiy8E1enjuRBrhLtZkB5iScJ1XIPdhVEFK8aOLSg==}
engines: {node: '>=14'}
lodash-es@4.17.21:
resolution: {integrity: sha512-mKnC+QJ9pWVzv+C4/U3rRsHapFfHvQFoFB92e52xeyGMcX6/OlIl78je1u8vePzYZSkkogMPJ2yjxxsb89cxyw==}
@@ -1782,9 +1765,9 @@ packages:
markdown-table@3.0.4:
resolution: {integrity: sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw==}
marked@15.0.12:
resolution: {integrity: sha512-8dD6FusOQSrpv9Z1rdNMdlSgQOIP880DHqnohobOmYLElGEqAL/JvxvuxZO16r4HtjTlfPRDC1hbvxC9dPN2nA==}
engines: {node: '>= 18'}
marked@16.4.2:
resolution: {integrity: sha512-TI3V8YYWvkVf3KJe1dRkpnjs68JUPyEa5vjKrp1XEEJUAOaQc+Qj+L1qWbPd0SJuAdQkFU0h73sXXqwDYxsiDA==}
engines: {node: '>= 20'}
hasBin: true
mdast-util-find-and-replace@3.0.2:
@@ -1835,8 +1818,8 @@ packages:
mdast-util-to-string@4.0.0:
resolution: {integrity: sha512-0H44vDimn51F0YwvxSJSm0eCDOJTRlmN0R1yBh4HLj9wiV1Dn0QoXGbvFAWj2hSItVTlCmBF1hqKlIyUBVFLPg==}
mermaid@11.8.1:
resolution: {integrity: sha512-VSXJLqP1Sqw5sGr273mhvpPRhXwE6NlmMSqBZQw+yZJoAJkOIPPn/uT3teeCBx60Fkt5zEI3FrH2eVT0jXRDzw==}
mermaid@11.12.1:
resolution: {integrity: sha512-UlIZrRariB11TY1RtTgUWp65tphtBv4CSq7vyS2ZZ2TgoMjs2nloq+wFqxiwcxlhHUvs7DPGgMjs2aeQxz5h9g==}
micromark-core-commonmark@2.0.3:
resolution: {integrity: sha512-RDBrHEMSxVFLg6xvnXmb1Ayr2WzLAWjeSATAoxwKYJV94TeNavgoIdA0a9ytzDSVzBy2YKFK+emCPOEibLeCrg==}
@@ -1956,8 +1939,8 @@ packages:
engines: {node: '>=10'}
hasBin: true
mlly@1.7.4:
resolution: {integrity: sha512-qmdSIPC4bDJXgZTCR7XosJiNKySV7O215tsPtDN9iEO/7q/76b/ijtgRu/+epFXSJhijtTCCGp3DWS549P3xKw==}
mlly@1.8.0:
resolution: {integrity: sha512-l8D9ODSRWLe2KHJSifWGwBqpTZXIXTeo8mlKjY+E2HAakaTeNpqAyBZ8GSqLzHgw4XmHmC8whvpjJNMbFZN7/g==}
ms@2.1.3:
resolution: {integrity: sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==}
@@ -1977,8 +1960,8 @@ packages:
react: ^16.8 || ^17 || ^18 || ^19 || ^19.0.0-rc
react-dom: ^16.8 || ^17 || ^18 || ^19 || ^19.0.0-rc
next@16.0.1:
resolution: {integrity: sha512-e9RLSssZwd35p7/vOa+hoDFggUZIUbZhIUSLZuETCwrCVvxOs87NamoUzT+vbcNAL8Ld9GobBnWOA6SbV/arOw==}
next@16.0.7:
resolution: {integrity: sha512-3mBRJyPxT4LOxAJI6IsXeFtKfiJUbjCLgvXO02fV8Wy/lIhPvP94Fe7dGhUgHXcQy4sSuYwQNcOLhIfOm0rL0A==}
engines: {node: '>=20.9.0'}
hasBin: true
peerDependencies:
@@ -2008,8 +1991,8 @@ packages:
oniguruma-to-es@4.3.3:
resolution: {integrity: sha512-rPiZhzC3wXwE59YQMRDodUwwT9FZ9nNBwQQfsd1wfdtlKEyCdRV0avrTcSZ5xlIvGRVPd/cx6ZN45ECmS39xvg==}
package-manager-detector@1.3.0:
resolution: {integrity: sha512-ZsEbbZORsyHuO00lY1kV3/t72yp6Ysay6Pd17ZAlNGuGwmWDLCJxFpRs0IzfXfj1o4icJOkUEioexFHzyPurSQ==}
package-manager-detector@1.6.0:
resolution: {integrity: sha512-61A5ThoTiDG/C8s8UMZwSorAGwMJ0ERVGj2OjoW5pAalsNOg15+iQiPzrLJ4jhZ1HJzmC2PIHT2oEiH3R5fzNA==}
parse-entities@4.0.2:
resolution: {integrity: sha512-GG2AQYWoLgL877gQIKeRPGO1xF9+eG1ujIb5soS5gPvLQ1y2o8FL90w2QWNdf9I361Mpp7726c+lj3U0qK1uGw==}
@@ -2033,9 +2016,6 @@ packages:
pkg-types@1.3.1:
resolution: {integrity: sha512-/Jm5M4RvtBFVkKWRu2BLUTNP8/M2a+UwuAX+ae4770q1qVGtfjG+WTCupoZixokjmHiry8uI+dlY8KXYV5HVVQ==}
pkg-types@2.2.0:
resolution: {integrity: sha512-2SM/GZGAEkPp3KWORxQZns4M+WSeXbC2HEvmOIJe3Cmiv6ieAJvdVhDldtHqM5J1Y7MrR1XhkBT/rMlhh9FdqQ==}
points-on-curve@0.2.0:
resolution: {integrity: sha512-0mYKnYYe9ZcqMCWhUjItv/oHjvgEsfKvnUTg8sAtnHr3GVy7rGkXCb6d5cSyqrWqL4k81b9CPg3urd+T7aop3A==}
@@ -2076,9 +2056,6 @@ packages:
property-information@7.1.0:
resolution: {integrity: sha512-TwEZ+X+yCJmYfL7TPUOcvBZ4QfoT5YenQiJuX//0th53DE6w0xxLEtfK3iyryQFddXuvkIk51EEgrJQ0WJkOmQ==}
quansync@0.2.10:
resolution: {integrity: sha512-t41VRkMYbkHyCYmOvx/6URnN80H7k4X0lLdBMGsz+maAwrJQYB1djpV6vHrQIBE0WBSGqhtEHrK9U3DWWH8v7A==}
react-dom@19.2.0:
resolution: {integrity: sha512-UlbRu4cAiGaIewkPyiRGJk0imDN2T3JjieT6spoL2UeSf5od4n5LB/mQ4ejmxhCFT1tYe8IvaFulzynWovsEFQ==}
peerDependencies:
@@ -2258,8 +2235,9 @@ packages:
resolution: {integrity: sha512-5S7Va8hKfV7W5U6g3aYxXmlPoZVAwUMy9AOKyF2fVuZa2UD3qZjg578OrLRt8PcNN1PleVaL/5/yYATNL0ICUw==}
engines: {node: '>=18'}
tinyexec@1.0.1:
resolution: {integrity: sha512-5uC6DDlmeqiOwCPmK9jMSdOuZTh8bU39Ys6yidB+UTt5hfZUPGAypSgFRiEp+jbi9qH40BLDvy85jIU88wKSqw==}
tinyexec@1.0.2:
resolution: {integrity: sha512-W/KYk+NFhkmsYpuHq5JykngiOCnxeVL8v8dFnqxSD8qEEdRfXk1SDM6JzNqcERbcGYj9tMrDQBYV9cjgnunFIg==}
engines: {node: '>=18'}
tinyglobby@0.2.15:
resolution: {integrity: sha512-j2Zq4NyQYG5XMST4cbs02Ak8iJUdxRM0XI5QyxXuZOzKOINmWurp3smXu3y5wDcJrptwpSjgXHzIQxR0omXljQ==}
@@ -2390,10 +2368,8 @@ snapshots:
'@antfu/install-pkg@1.1.0':
dependencies:
package-manager-detector: 1.3.0
tinyexec: 1.0.1
'@antfu/utils@8.1.1': {}
package-manager-detector: 1.6.0
tinyexec: 1.0.2
'@braintree/sanitize-url@7.1.1': {}
@@ -2520,18 +2496,11 @@ snapshots:
'@iconify/types@2.0.0': {}
'@iconify/utils@2.3.0':
'@iconify/utils@3.1.0':
dependencies:
'@antfu/install-pkg': 1.1.0
'@antfu/utils': 8.1.1
'@iconify/types': 2.0.0
debug: 4.4.1
globals: 15.15.0
kolorist: 1.8.0
local-pkg: 1.1.1
mlly: 1.7.4
transitivePeerDependencies:
- supports-color
mlly: 1.8.0
'@img/colour@1.0.0':
optional: true
@@ -2681,34 +2650,34 @@ snapshots:
transitivePeerDependencies:
- supports-color
'@mermaid-js/parser@0.6.1':
'@mermaid-js/parser@0.6.3':
dependencies:
langium: 3.3.1
'@next/env@16.0.1': {}
'@next/env@16.0.7': {}
'@next/swc-darwin-arm64@16.0.1':
'@next/swc-darwin-arm64@16.0.7':
optional: true
'@next/swc-darwin-x64@16.0.1':
'@next/swc-darwin-x64@16.0.7':
optional: true
'@next/swc-linux-arm64-gnu@16.0.1':
'@next/swc-linux-arm64-gnu@16.0.7':
optional: true
'@next/swc-linux-arm64-musl@16.0.1':
'@next/swc-linux-arm64-musl@16.0.7':
optional: true
'@next/swc-linux-x64-gnu@16.0.1':
'@next/swc-linux-x64-gnu@16.0.7':
optional: true
'@next/swc-linux-x64-musl@16.0.1':
'@next/swc-linux-x64-musl@16.0.7':
optional: true
'@next/swc-win32-arm64-msvc@16.0.1':
'@next/swc-win32-arm64-msvc@16.0.7':
optional: true
'@next/swc-win32-x64-msvc@16.0.1':
'@next/swc-win32-x64-msvc@16.0.7':
optional: true
'@orama/orama@3.1.16': {}
@@ -3426,8 +3395,6 @@ snapshots:
confbox@0.1.8: {}
confbox@0.2.2: {}
core-js@3.46.0: {}
cose-base@1.0.3:
@@ -3621,12 +3588,12 @@ snapshots:
d3-transition: 3.0.1(d3-selection@3.0.0)
d3-zoom: 3.0.0
dagre-d3-es@7.0.11:
dagre-d3-es@7.0.13:
dependencies:
d3: 7.9.0
lodash-es: 4.17.21
dayjs@1.11.13: {}
dayjs@1.11.19: {}
debug@4.4.1:
dependencies:
@@ -3744,8 +3711,6 @@ snapshots:
dependencies:
'@types/estree': 1.0.8
exsolve@1.0.7: {}
extend@3.0.2: {}
fdir@6.5.0(picomatch@4.0.3):
@@ -3754,7 +3719,7 @@ snapshots:
fflate@0.4.8: {}
fumadocs-core@16.0.8(@types/react@19.1.8)(lucide-react@0.525.0(react@19.2.0))(next@16.0.1(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(react-dom@19.2.0(react@19.2.0))(react@19.2.0):
fumadocs-core@16.0.8(@types/react@19.1.8)(lucide-react@0.525.0(react@19.2.0))(next@16.0.7(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(react-dom@19.2.0(react@19.2.0))(react@19.2.0):
dependencies:
'@formatjs/intl-localematcher': 0.6.2
'@orama/orama': 3.1.16
@@ -3777,39 +3742,39 @@ snapshots:
optionalDependencies:
'@types/react': 19.1.8
lucide-react: 0.525.0(react@19.2.0)
next: 16.0.1(react-dom@19.2.0(react@19.2.0))(react@19.2.0)
next: 16.0.7(react-dom@19.2.0(react@19.2.0))(react@19.2.0)
react: 19.2.0
react-dom: 19.2.0(react@19.2.0)
transitivePeerDependencies:
- supports-color
fumadocs-mdx@13.0.5(fumadocs-core@16.0.8(@types/react@19.1.8)(lucide-react@0.525.0(react@19.2.0))(next@16.0.1(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(next@16.0.1(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(react@19.2.0):
fumadocs-mdx@13.0.8(fumadocs-core@16.0.8(@types/react@19.1.8)(lucide-react@0.525.0(react@19.2.0))(next@16.0.7(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(next@16.0.7(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(react@19.2.0):
dependencies:
'@mdx-js/mdx': 3.1.1
'@standard-schema/spec': 1.0.0
chokidar: 4.0.3
esbuild: 0.25.12
estree-util-value-to-estree: 3.5.0
fumadocs-core: 16.0.8(@types/react@19.1.8)(lucide-react@0.525.0(react@19.2.0))(next@16.0.1(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(react-dom@19.2.0(react@19.2.0))(react@19.2.0)
js-yaml: 4.1.0
fumadocs-core: 16.0.8(@types/react@19.1.8)(lucide-react@0.525.0(react@19.2.0))(next@16.0.7(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(react-dom@19.2.0(react@19.2.0))(react@19.2.0)
js-yaml: 4.1.1
lru-cache: 11.2.2
mdast-util-to-markdown: 2.1.2
picocolors: 1.1.1
picomatch: 4.0.3
remark-mdx: 3.1.1
tinyexec: 1.0.1
tinyexec: 1.0.2
tinyglobby: 0.2.15
unified: 11.0.5
unist-util-remove-position: 5.0.0
unist-util-visit: 5.0.0
zod: 4.1.12
optionalDependencies:
next: 16.0.1(react-dom@19.2.0(react@19.2.0))(react@19.2.0)
next: 16.0.7(react-dom@19.2.0(react@19.2.0))(react@19.2.0)
react: 19.2.0
transitivePeerDependencies:
- supports-color
fumadocs-ui@16.0.8(@types/react-dom@19.1.6(@types/react@19.1.8))(@types/react@19.1.8)(lucide-react@0.525.0(react@19.2.0))(next@16.0.1(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(react-dom@19.2.0(react@19.2.0))(react@19.2.0)(tailwindcss@4.1.10):
fumadocs-ui@16.0.8(@types/react-dom@19.1.6(@types/react@19.1.8))(@types/react@19.1.8)(lucide-react@0.525.0(react@19.2.0))(next@16.0.7(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(react-dom@19.2.0(react@19.2.0))(react@19.2.0)(tailwindcss@4.1.10):
dependencies:
'@radix-ui/react-accordion': 1.2.12(@types/react-dom@19.1.6(@types/react@19.1.8))(@types/react@19.1.8)(react-dom@19.2.0(react@19.2.0))(react@19.2.0)
'@radix-ui/react-collapsible': 1.1.12(@types/react-dom@19.1.6(@types/react@19.1.8))(@types/react@19.1.8)(react-dom@19.2.0(react@19.2.0))(react@19.2.0)
@@ -3822,7 +3787,7 @@ snapshots:
'@radix-ui/react-slot': 1.2.4(@types/react@19.1.8)(react@19.2.0)
'@radix-ui/react-tabs': 1.1.13(@types/react-dom@19.1.6(@types/react@19.1.8))(@types/react@19.1.8)(react-dom@19.2.0(react@19.2.0))(react@19.2.0)
class-variance-authority: 0.7.1
fumadocs-core: 16.0.8(@types/react@19.1.8)(lucide-react@0.525.0(react@19.2.0))(next@16.0.1(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(react-dom@19.2.0(react@19.2.0))(react@19.2.0)
fumadocs-core: 16.0.8(@types/react@19.1.8)(lucide-react@0.525.0(react@19.2.0))(next@16.0.7(react-dom@19.2.0(react@19.2.0))(react@19.2.0))(react-dom@19.2.0(react@19.2.0))(react@19.2.0)
lodash.merge: 4.6.2
next-themes: 0.4.6(react-dom@19.2.0(react@19.2.0))(react@19.2.0)
postcss-selector-parser: 7.1.0
@@ -3833,7 +3798,7 @@ snapshots:
tailwind-merge: 3.3.1
optionalDependencies:
'@types/react': 19.1.8
next: 16.0.1(react-dom@19.2.0(react@19.2.0))(react@19.2.0)
next: 16.0.7(react-dom@19.2.0(react@19.2.0))(react@19.2.0)
tailwindcss: 4.1.10
transitivePeerDependencies:
- '@mixedbread/sdk'
@@ -3850,8 +3815,6 @@ snapshots:
github-slugger@2.0.0: {}
globals@15.15.0: {}
graceful-fs@4.2.11: {}
hachure-fill@0.5.2: {}
@@ -3948,7 +3911,7 @@ snapshots:
jiti@2.4.2: {}
js-yaml@4.1.0:
js-yaml@4.1.1:
dependencies:
argparse: 2.0.1
@@ -3958,8 +3921,6 @@ snapshots:
khroma@2.1.0: {}
kolorist@1.8.0: {}
langium@3.3.1:
dependencies:
chevrotain: 11.0.3
@@ -4017,12 +3978,6 @@ snapshots:
lightningcss-win32-arm64-msvc: 1.30.1
lightningcss-win32-x64-msvc: 1.30.1
local-pkg@1.1.1:
dependencies:
mlly: 1.7.4
pkg-types: 2.2.0
quansync: 0.2.10
lodash-es@4.17.21: {}
lodash.merge@4.6.2: {}
@@ -4043,7 +3998,7 @@ snapshots:
markdown-table@3.0.4: {}
marked@15.0.12: {}
marked@16.4.2: {}
mdast-util-find-and-replace@3.0.2:
dependencies:
@@ -4208,30 +4163,28 @@ snapshots:
dependencies:
'@types/mdast': 4.0.4
mermaid@11.8.1:
mermaid@11.12.1:
dependencies:
'@braintree/sanitize-url': 7.1.1
'@iconify/utils': 2.3.0
'@mermaid-js/parser': 0.6.1
'@iconify/utils': 3.1.0
'@mermaid-js/parser': 0.6.3
'@types/d3': 7.4.3
cytoscape: 3.32.1
cytoscape-cose-bilkent: 4.1.0(cytoscape@3.32.1)
cytoscape-fcose: 2.2.0(cytoscape@3.32.1)
d3: 7.9.0
d3-sankey: 0.12.3
dagre-d3-es: 7.0.11
dayjs: 1.11.13
dagre-d3-es: 7.0.13
dayjs: 1.11.19
dompurify: 3.2.6
katex: 0.16.22
khroma: 2.1.0
lodash-es: 4.17.21
marked: 15.0.12
marked: 16.4.2
roughjs: 4.6.6
stylis: 4.3.6
ts-dedent: 2.2.0
uuid: 11.1.0
transitivePeerDependencies:
- supports-color
micromark-core-commonmark@2.0.3:
dependencies:
@@ -4505,7 +4458,7 @@ snapshots:
mkdirp@3.0.1: {}
mlly@1.7.4:
mlly@1.8.0:
dependencies:
acorn: 8.15.0
pathe: 2.0.3
@@ -4523,9 +4476,9 @@ snapshots:
react: 19.2.0
react-dom: 19.2.0(react@19.2.0)
next@16.0.1(react-dom@19.2.0(react@19.2.0))(react@19.2.0):
next@16.0.7(react-dom@19.2.0(react@19.2.0))(react@19.2.0):
dependencies:
'@next/env': 16.0.1
'@next/env': 16.0.7
'@swc/helpers': 0.5.15
caniuse-lite: 1.0.30001724
postcss: 8.4.31
@@ -4533,14 +4486,14 @@ snapshots:
react-dom: 19.2.0(react@19.2.0)
styled-jsx: 5.1.6(react@19.2.0)
optionalDependencies:
'@next/swc-darwin-arm64': 16.0.1
'@next/swc-darwin-x64': 16.0.1
'@next/swc-linux-arm64-gnu': 16.0.1
'@next/swc-linux-arm64-musl': 16.0.1
'@next/swc-linux-x64-gnu': 16.0.1
'@next/swc-linux-x64-musl': 16.0.1
'@next/swc-win32-arm64-msvc': 16.0.1
'@next/swc-win32-x64-msvc': 16.0.1
'@next/swc-darwin-arm64': 16.0.7
'@next/swc-darwin-x64': 16.0.7
'@next/swc-linux-arm64-gnu': 16.0.7
'@next/swc-linux-arm64-musl': 16.0.7
'@next/swc-linux-x64-gnu': 16.0.7
'@next/swc-linux-x64-musl': 16.0.7
'@next/swc-win32-arm64-msvc': 16.0.7
'@next/swc-win32-x64-msvc': 16.0.7
sharp: 0.34.5
transitivePeerDependencies:
- '@babel/core'
@@ -4556,7 +4509,7 @@ snapshots:
regex: 6.0.1
regex-recursion: 6.0.2
package-manager-detector@1.3.0: {}
package-manager-detector@1.6.0: {}
parse-entities@4.0.2:
dependencies:
@@ -4581,13 +4534,7 @@ snapshots:
pkg-types@1.3.1:
dependencies:
confbox: 0.1.8
mlly: 1.7.4
pathe: 2.0.3
pkg-types@2.2.0:
dependencies:
confbox: 0.2.2
exsolve: 1.0.7
mlly: 1.8.0
pathe: 2.0.3
points-on-curve@0.2.0: {}
@@ -4628,8 +4575,6 @@ snapshots:
property-information@7.1.0: {}
quansync@0.2.10: {}
react-dom@19.2.0(react@19.2.0):
dependencies:
react: 19.2.0
@@ -4886,7 +4831,7 @@ snapshots:
mkdirp: 3.0.1
yallist: 5.0.0
tinyexec@1.0.1: {}
tinyexec@1.0.2: {}
tinyglobby@0.2.15:
dependencies:

BIN
docs/public/img/bg-dark.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 277 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 418 KiB

BIN
docs/public/img/hero.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.2 MiB

View File

@@ -200,7 +200,7 @@ export default async function Page(props: { params: Promise<{ slug?: string[] }>
<div className="flex flex-row w-full items-start">
<div className="flex-1">
<div className="flex flex-row w-full">
<DocsTitle>{page.data.title}</DocsTitle>
{slug.length > 0 && <DocsTitle>{page.data.title}</DocsTitle>}
<div className="ml-auto flex items-center gap-2">
{apiSection && versionItems.length > 1 && (

View File

@@ -2,6 +2,34 @@
@import 'fumadocs-ui/css/neutral.css';
@import 'fumadocs-ui/css/preset.css';
/* Custom Sky + Emerald theme */
@theme {
--color-fd-primary: hsl(199, 89%, 48%); /* sky-500 */
--color-fd-primary-foreground: hsl(0, 0%, 100%);
--color-fd-ring: hsl(199, 89%, 48%); /* sky-500 */
--color-fd-muted: hsl(160, 84%, 95%); /* emerald-50 */
--color-fd-accent: hsl(152, 76%, 92%); /* emerald-100 */
--font-sans: var(--font-geist-sans);
--font-mono: var(--font-geist-mono);
}
.dark {
--color-fd-primary: hsl(199, 89%, 48%); /* sky-500 */
--color-fd-primary-foreground: hsl(0, 0%, 100%);
--color-fd-ring: hsl(199, 89%, 48%); /* sky-500 */
--color-fd-muted: hsl(199, 89%, 14%); /* sky-950 */
--color-fd-accent: hsl(199, 89%, 20%); /* sky dark */
}
.dark body {
background-image: linear-gradient(
rgba(14, 165, 233, 0.1),
transparent 20rem,
transparent
);
background-repeat: no-repeat;
}
/* Fix TOC overflow on production builds */
#nd-toc {
overflow-y: auto;

View File

@@ -1,6 +1,6 @@
import './global.css';
import { RootProvider } from 'fumadocs-ui/provider';
import { Inter } from 'next/font/google';
import { Geist, Geist_Mono } from 'next/font/google';
import type { ReactNode } from 'react';
import { PHProvider, PostHogPageView } from '@/providers/posthog-provider';
import { AnalyticsTracker } from '@/components/analytics-tracker';
@@ -8,13 +8,19 @@ import { CookieConsent } from '@/components/cookie-consent';
import { Footer } from '@/components/footer';
import { Suspense } from 'react';
const inter = Inter({
const geist = Geist({
subsets: ['latin'],
variable: '--font-geist-sans',
});
const geistMono = Geist_Mono({
subsets: ['latin'],
variable: '--font-geist-mono',
});
export default function Layout({ children }: { children: ReactNode }) {
return (
<html lang="en" className={inter.className} suppressHydrationWarning>
<html lang="en" className={`${geist.variable} ${geistMono.variable} font-sans`} suppressHydrationWarning>
<head>
<link rel="icon" href="/docs/favicon.ico" sizes="any" />
</head>

View File

@@ -0,0 +1,119 @@
"""
Browser Tool Example
Demonstrates how to use the BrowserTool to control a browser programmatically
via the computer server. The browser runs visibly on the XFCE desktop so visual
agents can see it.
Prerequisites:
- Computer server running (Docker container or local)
- For Docker: Container should be running with browser tool support
- For local: Playwright and Firefox must be installed
Usage:
python examples/browser_tool_example.py
"""
import asyncio
import logging
import sys
from pathlib import Path
# Add the libs path to sys.path
libs_path = Path(__file__).parent.parent / "libs" / "python"
sys.path.insert(0, str(libs_path))
from agent.tools.browser_tool import BrowserTool
# Import Computer interface and BrowserTool
from computer import Computer
# Configure logging to see what's happening
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
async def test_browser_tool():
    """Exercise the BrowserTool end to end against a running computer server.

    Steps through screenshot capture, navigation, a web search, scrolling,
    clicking, and typing, logging each result. Requires a reachable computer
    server (see the module docstring for prerequisites).
    """
    # Initialize the computer interface.
    # For local testing, use provider_type="docker";
    # for provider_type="cloud", provide name and api_key.
    computer = Computer(provider_type="docker", os_type="linux", image="cua-xfce:dev")
    await computer.run()

    # Initialize the browser tool with the computer interface
    browser = BrowserTool(interface=computer)

    logger.info("Testing Browser Tool...")

    try:
        # Test 0: Take a screenshot (pre-init)
        logger.info("Test 0: Taking a screenshot...")
        screenshot_bytes = await browser.screenshot()
        screenshot_path = Path(__file__).parent / "browser_screenshot_init.png"
        with open(screenshot_path, "wb") as f:
            f.write(screenshot_bytes)
        logger.info(f"Screenshot captured: {len(screenshot_bytes)} bytes")

        # Test 1: Visit a URL
        logger.info("Test 1: Visiting a URL...")
        result = await browser.visit_url("https://www.trycua.com")
        logger.info(f"Visit URL result: {result}")
        # Wait a bit for the page to load
        await asyncio.sleep(2)

        # Test 2: Take a screenshot of the loaded page
        logger.info("Test 2: Taking a screenshot...")
        screenshot_bytes = await browser.screenshot()
        screenshot_path = Path(__file__).parent / "browser_screenshot.png"
        with open(screenshot_path, "wb") as f:
            f.write(screenshot_bytes)
        logger.info(f"Screenshot captured: {len(screenshot_bytes)} bytes")
        await asyncio.sleep(1)

        # Test 3: Visit bot detector
        logger.info("Test 3: Visiting bot detector...")
        result = await browser.visit_url("https://bot-detector.rebrowser.net/")
        logger.info(f"Visit URL result: {result}")

        # Test 4: Web search (renumbered; logs previously reused "Test 2")
        logger.info("Test 4: Performing a web search...")
        result = await browser.web_search("Python programming")
        logger.info(f"Web search result: {result}")
        await asyncio.sleep(2)

        # Test 5: Scroll
        logger.info("Test 5: Scrolling the page...")
        result = await browser.scroll(delta_x=0, delta_y=500)
        logger.info(f"Scroll result: {result}")
        await asyncio.sleep(1)

        # Test 6: Click (example coordinates - adjust based on your screen)
        logger.info("Test 6: Clicking at coordinates...")
        result = await browser.click(x=500, y=300)
        logger.info(f"Click result: {result}")
        await asyncio.sleep(1)

        # Test 7: Type text (if there's a focused input field)
        logger.info("Test 7: Typing text...")
        result = await browser.type("Hello from BrowserTool!")
        logger.info(f"Type result: {result}")

        logger.info("All tests completed!")
    except Exception as e:
        logger.error(f"Error during testing: {e}", exc_info=True)


if __name__ == "__main__":
    asyncio.run(test_browser_tool())

View File

@@ -8,6 +8,7 @@ from . import (
composed_grounded,
gelato,
gemini,
generic_vlm,
glm45v,
gta1,
holo,
@@ -16,7 +17,6 @@ from . import (
omniparser,
openai,
opencua,
generic_vlm,
uiins,
uitars,
uitars2,
@@ -24,19 +24,19 @@ from . import (
__all__ = [
"anthropic",
"openai",
"uitars",
"omniparser",
"gta1",
"composed_grounded",
"glm45v",
"opencua",
"internvl",
"holo",
"moondream3",
"gelato",
"gemini",
"generic_vlm",
"glm45v",
"gta1",
"holo",
"internvl",
"moondream3",
"omniparser",
"openai",
"opencua",
"uiins",
"gelato",
"uitars",
"uitars2",
]

View File

@@ -442,7 +442,7 @@ def get_all_element_descriptions(responses_items: List[Dict[str, Any]]) -> List[
# Conversion functions between responses_items and completion messages formats
def convert_responses_items_to_completion_messages(
messages: List[Dict[str, Any]],
messages: List[Dict[str, Any]],
allow_images_in_tool_results: bool = True,
send_multiple_user_images_per_parallel_tool_results: bool = False,
) -> List[Dict[str, Any]]:
@@ -573,25 +573,33 @@ def convert_responses_items_to_completion_messages(
"computer_call_output",
]
# Send tool message + separate user message with image (OpenAI compatible)
completion_messages += [
{
"role": "tool",
"tool_call_id": call_id,
"content": "[Execution completed. See screenshot below]",
},
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": output.get("image_url")}}
],
},
] if send_multiple_user_images_per_parallel_tool_results or (not is_next_message_image_result) else [
{
"role": "tool",
"tool_call_id": call_id,
"content": "[Execution completed. See screenshot below]",
},
]
completion_messages += (
[
{
"role": "tool",
"tool_call_id": call_id,
"content": "[Execution completed. See screenshot below]",
},
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": output.get("image_url")},
}
],
},
]
if send_multiple_user_images_per_parallel_tool_results
or (not is_next_message_image_result)
else [
{
"role": "tool",
"tool_call_id": call_id,
"content": "[Execution completed. See screenshot below]",
},
]
)
else:
# Handle text output as tool response
completion_messages.append(

View File

@@ -0,0 +1,6 @@
"""Tools for agent interactions."""
from .browser_tool import BrowserTool
__all__ = ["BrowserTool"]

View File

@@ -0,0 +1,135 @@
"""
Browser Tool for agent interactions.
Allows agents to control a browser programmatically via Playwright.
"""
import logging
from typing import TYPE_CHECKING, Optional
if TYPE_CHECKING:
from computer.interface import GenericComputerInterface
logger = logging.getLogger(__name__)
class BrowserTool:
    """
    Async wrapper exposing browser control through the computer SDK.

    Implements the Fara/Magentic-One agent surface (visit_url, click, type,
    scroll, web_search, screenshot) by forwarding every call to the attached
    interface's ``playwright_exec`` endpoint.
    """

    def __init__(self, interface: "GenericComputerInterface"):
        """
        Args:
            interface: A GenericComputerInterface instance that provides
                playwright_exec.
        """
        self.interface = interface
        self.logger = logging.getLogger(__name__)

    async def _execute_command(self, command: str, params: dict) -> dict:
        """
        Forward one browser command to the computer interface.

        Args:
            command: Command name understood by the server.
            params: JSON-serializable command parameters.

        Returns:
            The server's response dict; transport failures are converted into
            ``{"success": False, "error": ...}`` instead of raising.
        """
        try:
            response = await self.interface.playwright_exec(command, params)
            if not response.get("success"):
                self.logger.error(
                    f"Browser command '{command}' failed: {response.get('error', 'Unknown error')}"
                )
            return response
        except Exception as e:
            self.logger.error(f"Error executing browser command '{command}': {e}")
            return {"success": False, "error": str(e)}

    async def visit_url(self, url: str) -> dict:
        """Navigate the browser to ``url``; returns the server response dict."""
        return await self._execute_command("visit_url", {"url": url})

    async def click(self, x: int, y: int) -> dict:
        """Click the page at coordinates ``(x, y)``; returns the response dict."""
        return await self._execute_command("click", {"x": x, "y": y})

    async def type(self, text: str) -> dict:
        """Type ``text`` into the currently focused element; returns the response dict."""
        return await self._execute_command("type", {"text": text})

    async def scroll(self, delta_x: int, delta_y: int) -> dict:
        """Scroll the page by the given horizontal/vertical deltas."""
        return await self._execute_command("scroll", {"delta_x": delta_x, "delta_y": delta_y})

    async def web_search(self, query: str) -> dict:
        """Open a Google search for ``query``; returns the response dict."""
        return await self._execute_command("web_search", {"query": query})

    async def screenshot(self) -> bytes:
        """
        Capture the current browser page as PNG bytes.

        Returns:
            Screenshot image data as bytes (PNG format).

        Raises:
            RuntimeError: If the server reports failure or returns no image.
        """
        import base64

        response = await self._execute_command("screenshot", {})
        if response.get("success") and response.get("screenshot"):
            # Server ships the PNG as base64 text; decode back to raw bytes.
            return base64.b64decode(response["screenshot"])
        error = response.get("error", "Unknown error")
        raise RuntimeError(f"Failed to take screenshot: {error}")

View File

@@ -24,7 +24,7 @@ dependencies = [
"certifi>=2024.2.2",
"litellm>=1.74.12"
]
requires-python = ">=3.12"
requires-python = ">=3.12,<3.14"
[project.optional-dependencies]
openai = []

View File

@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.1.30
current_version = 0.1.31
commit = True
tag = True
tag_name = computer-server-v{new_version}

View File

@@ -0,0 +1,361 @@
"""
Browser manager using Playwright for programmatic browser control.
This allows agents to control a browser that runs visibly on the XFCE desktop.
"""
import asyncio
import logging
import os
from typing import Any, Dict, Optional
try:
from playwright.async_api import Browser, BrowserContext, Page, async_playwright
except ImportError:
async_playwright = None
Browser = None
BrowserContext = None
Page = None
logger = logging.getLogger(__name__)
class BrowserManager:
    """
    Manages a Playwright browser instance that runs visibly on the XFCE desktop.

    Uses a persistent Firefox context so cookies and sessions survive across
    commands, and transparently re-initializes the browser when the context,
    page, or Playwright driver is found dead (e.g. the window was closed).
    """

    def __init__(self):
        """Initialize the BrowserManager; nothing launches until first use."""
        self.playwright = None  # Playwright driver handle
        self.browser: Optional[Browser] = None  # kept for close(); persistent context does not set it
        self.context: Optional[BrowserContext] = None  # persistent Firefox context
        self.page: Optional[Page] = None  # currently active page
        self._initialized = False
        self._initialization_error: Optional[str] = None  # last launch failure, for reporting
        self._lock = asyncio.Lock()  # serializes (re)initialization

    async def _reset_driver(self) -> None:
        """Drop all browser state and stop the Playwright driver (best effort)."""
        self._initialized = False
        self.context = None
        self.page = None
        if self.playwright:
            try:
                await self.playwright.stop()
            except Exception:
                # Driver may already be dead; nothing more we can do.
                pass
            self.playwright = None

    async def _ensure_initialized(self):
        """Ensure a live browser context and page exist, launching Firefox if needed.

        Safe to call repeatedly; detects a closed/dead context and rebuilds it.

        Raises:
            RuntimeError: If playwright is not installed.
            Exception: If the browser fails to launch (message also recorded
                in ``self._initialization_error``).
        """
        # Fast path: probe the existing context without taking the lock.
        if self._initialized:
            try:
                if self.context:
                    # Accessing .pages raises if the context is closed.
                    _ = self.context.pages
                    return
                else:
                    self._initialized = False
                    logger.warning("Browser context was closed, will reinitialize...")
            except Exception as e:
                logger.warning(f"Browser context is dead ({e}), will reinitialize...")
                await self._reset_driver()

        async with self._lock:
            # Double-check after acquiring the lock (another task may have
            # initialized or torn down the browser in the meantime).
            if self._initialized:
                try:
                    if self.context:
                        _ = self.context.pages
                        return
                except Exception:
                    await self._reset_driver()

            if async_playwright is None:
                raise RuntimeError(
                    "playwright is not installed. Please install it with: pip install playwright && playwright install --with-deps firefox"
                )

            try:
                # The browser must appear on the X display the visual agent watches.
                display = os.environ.get("DISPLAY", ":1")
                logger.info(f"Initializing browser with DISPLAY={display}")

                self.playwright = await async_playwright().start()

                # Persistent profile directory keeps cookies/sessions across runs.
                user_data_dir = os.path.join(os.path.expanduser("~"), ".playwright-firefox")
                os.makedirs(user_data_dir, exist_ok=True)

                # launch_persistent_context returns a BrowserContext, not a Browser.
                # headless=False is CRITICAL so the visual agent can see it.
                # (No --kiosk so the desktop remains visible.)
                self.context = await self.playwright.firefox.launch_persistent_context(
                    user_data_dir=user_data_dir,
                    headless=False,  # CRITICAL: visible for visual agent
                    viewport={"width": 1024, "height": 768},
                )

                # Mask navigator.webdriver so the browser is less detectable
                # as automation.
                await self.context.add_init_script(
                    """const defaultGetter = Object.getOwnPropertyDescriptor(
  Navigator.prototype,
  "webdriver"
).get;
defaultGetter.apply(navigator);
defaultGetter.toString();
Object.defineProperty(Navigator.prototype, "webdriver", {
  set: undefined,
  enumerable: true,
  configurable: true,
  get: new Proxy(defaultGetter, {
    apply: (target, thisArg, args) => {
      Reflect.apply(target, thisArg, args);
      return false;
    },
  }),
});
const patchedGetter = Object.getOwnPropertyDescriptor(
  Navigator.prototype,
  "webdriver"
).get;
patchedGetter.apply(navigator);
patchedGetter.toString();"""
                )

                # Reuse the context's initial page if one exists.
                pages = self.context.pages
                self.page = pages[0] if pages else await self.context.new_page()

                self._initialized = True
                logger.info("Browser initialized successfully")
            except Exception as e:
                logger.error(f"Failed to initialize browser: {e}")
                import traceback

                logger.error(traceback.format_exc())
                # Record for execute_command's error report, then re-raise.
                self._initialization_error = str(e)
                raise

    async def _execute_command_impl(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]:
        """Dispatch a single browser command against the current page.

        Args:
            cmd: One of visit_url, click, type, scroll, web_search, screenshot.
            params: Command-specific parameters.

        Returns:
            A dict with at least ``success``; failures carry ``error``.
        """
        if cmd == "visit_url":
            url = params.get("url")
            if not url:
                return {"success": False, "error": "url parameter is required"}
            await self.page.goto(url, wait_until="domcontentloaded", timeout=30000)
            return {"success": True, "url": self.page.url}

        elif cmd == "click":
            x = params.get("x")
            y = params.get("y")
            if x is None or y is None:
                return {"success": False, "error": "x and y parameters are required"}
            await self.page.mouse.click(x, y)
            return {"success": True}

        elif cmd == "type":
            text = params.get("text")
            if text is None:
                return {"success": False, "error": "text parameter is required"}
            await self.page.keyboard.type(text)
            return {"success": True}

        elif cmd == "scroll":
            delta_x = params.get("delta_x", 0)
            delta_y = params.get("delta_y", 0)
            await self.page.mouse.wheel(delta_x, delta_y)
            return {"success": True}

        elif cmd == "web_search":
            query = params.get("query")
            if not query:
                return {"success": False, "error": "query parameter is required"}
            # URL-encode the query so spaces, '&', '#', '+' etc. cannot
            # corrupt the search URL (previously interpolated verbatim).
            from urllib.parse import quote_plus

            search_url = f"https://www.google.com/search?q={quote_plus(query)}"
            await self.page.goto(search_url, wait_until="domcontentloaded", timeout=30000)
            return {"success": True, "url": self.page.url}

        elif cmd == "screenshot":
            # Return the page screenshot as base64-encoded PNG text.
            import base64

            screenshot_bytes = await self.page.screenshot(type="png")
            screenshot_b64 = base64.b64encode(screenshot_bytes).decode("utf-8")
            return {"success": True, "screenshot": screenshot_b64}

        else:
            return {"success": False, "error": f"Unknown command: {cmd}"}

    async def execute_command(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]:
        """
        Execute a browser command with automatic recovery.

        Args:
            cmd: Command name (visit_url, click, type, scroll, web_search,
                screenshot)
            params: Command parameters

        Returns:
            Result dictionary with success status and any data
        """
        max_retries = 2
        for attempt in range(max_retries):
            try:
                await self._ensure_initialized()
            except Exception as e:
                error_msg = getattr(self, "_initialization_error", None) or str(e)
                logger.error(f"Browser initialization failed: {error_msg}")
                return {
                    "success": False,
                    "error": f"Browser initialization failed: {error_msg}. "
                    f"Make sure Playwright and Firefox are installed, and DISPLAY is set correctly.",
                }

            # Probe whether the cached page handle is still usable.
            page_valid = False
            try:
                if self.page is not None and not self.page.is_closed():
                    _ = self.page.url  # raises if the page handle is dead
                    page_valid = True
            except Exception as e:
                logger.warning(f"Page is invalid: {e}, will get a new page...")
                self.page = None

            # Recover a page: reuse any still-open tab, else create a new one.
            if not page_valid or self.page is None:
                try:
                    if self.context:
                        pages = self.context.pages
                        if pages:
                            for p in pages:
                                try:
                                    if not p.is_closed():
                                        self.page = p
                                        logger.info("Reusing existing open page")
                                        page_valid = True
                                        break
                                except Exception:
                                    continue
                        if not page_valid:
                            self.page = await self.context.new_page()
                            logger.info("Created new page")
                except Exception as e:
                    logger.error(f"Failed to get new page: {e}, browser may be closed")
                    # Whole browser is gone — tear down and retry if allowed.
                    await self._reset_driver()
                    if attempt < max_retries - 1:
                        logger.info("Browser was closed, retrying with fresh initialization...")
                        continue
                    else:
                        return {
                            "success": False,
                            "error": f"Browser was closed and cannot be recovered: {e}",
                        }

            try:
                return await self._execute_command_impl(cmd, params)
            except Exception as e:
                error_str = str(e)
                logger.error(f"Error executing command {cmd}: {e}")
                # "closed"/"target"/"context" in the message indicates the
                # browser died mid-command — reset and retry once.
                if any(keyword in error_str.lower() for keyword in ["closed", "target", "context"]):
                    logger.warning(
                        f"Browser/page was closed during command execution (attempt {attempt + 1}/{max_retries})"
                    )
                    await self._reset_driver()
                    if attempt < max_retries - 1:
                        logger.info("Retrying command after browser reinitialization...")
                        continue
                    else:
                        return {
                            "success": False,
                            "error": f"Command failed after {max_retries} attempts: {error_str}",
                        }
                else:
                    # Genuine command failure — report without retrying.
                    import traceback

                    logger.error(traceback.format_exc())
                    return {"success": False, "error": error_str}

        # Defensive fallback; the loop always returns or continues above.
        return {"success": False, "error": "Command failed after all retries"}

    async def close(self):
        """Close the browser and cleanup resources."""
        async with self._lock:
            try:
                if self.context:
                    await self.context.close()
                    self.context = None
                if self.browser:
                    await self.browser.close()
                    self.browser = None
                if self.playwright:
                    await self.playwright.stop()
                    self.playwright = None
                self.page = None
                self._initialized = False
                logger.info("Browser closed successfully")
            except Exception as e:
                logger.error(f"Error closing browser: {e}")
# Global instance
_browser_manager: Optional[BrowserManager] = None


def get_browser_manager() -> BrowserManager:
    """Return the process-wide BrowserManager, creating it lazily on first call."""
    global _browser_manager
    manager = _browser_manager
    if manager is None:
        manager = BrowserManager()
        _browser_manager = manager
    return manager

View File

@@ -55,6 +55,34 @@ from .base import BaseAccessibilityHandler, BaseAutomationHandler
logger = logging.getLogger(__name__)
# Trigger accessibility permissions prompt on macOS
try:
    # Source - https://stackoverflow.com/a/17134
    # Posted by Andreas
    # Retrieved 2025-12-03, License - CC BY-SA 4.0
    # Attempt to create and post a mouse event to trigger the permissions prompt
    # This will cause macOS to show "Python would like to control this computer using accessibility features"
    # Re-post the cursor's current position as a synthetic mouse-moved event;
    # posting any HID event is what makes macOS surface the prompt.
    current_pos = CGEventGetLocation(CGEventCreate(None))
    p = CGPoint()
    p.x = current_pos.x
    p.y = current_pos.y
    me = CGEventCreateMouseEvent(None, kCGEventMouseMoved, p, 0)
    if me:
        CGEventPost(kCGHIDEventTap, me)
        CFRelease(me)
except Exception as e:
    # Best effort only: the prompt is a side effect; failure is non-fatal.
    logger.debug(f"Failed to trigger accessibility permissions prompt: {e}")

# Trigger screen recording prompt on macOS
try:
    # Taking any screenshot makes macOS request Screen Recording permission.
    import pyautogui

    pyautogui.screenshot()
except Exception as e:
    logger.debug(f"Failed to trigger screenshot permissions prompt: {e}")
# Constants for accessibility API
kAXErrorSuccess = 0
kAXRoleAttribute = "AXRole"

View File

@@ -25,6 +25,7 @@ from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse
from .handlers.factory import HandlerFactory
from .browser import get_browser_manager
# Authentication session TTL (in seconds). Override via env var CUA_AUTH_TTL_SECONDS. Default: 60s
AUTH_SESSION_TTL_SECONDS: int = int(os.environ.get("CUA_AUTH_TTL_SECONDS", "60"))
@@ -749,5 +750,71 @@ async def agent_response_endpoint(
return JSONResponse(content=payload, headers=headers)
@app.post("/playwright_exec")
async def playwright_exec_endpoint(
request: Request,
container_name: Optional[str] = Header(None, alias="X-Container-Name"),
api_key: Optional[str] = Header(None, alias="X-API-Key"),
):
"""
Execute Playwright browser commands.
Headers:
- X-Container-Name: Container name for cloud authentication
- X-API-Key: API key for cloud authentication
Body:
{
"command": "visit_url|click|type|scroll|web_search",
"params": {...}
}
"""
# Parse request body
try:
body = await request.json()
command = body.get("command")
params = body.get("params", {})
except Exception as e:
raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")
if not command:
raise HTTPException(status_code=400, detail="Command is required")
# Check if CONTAINER_NAME is set (indicating cloud provider)
server_container_name = os.environ.get("CONTAINER_NAME")
# If cloud provider, perform authentication
if server_container_name:
logger.info(
f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication..."
)
# Validate required headers
if not container_name:
raise HTTPException(status_code=401, detail="Container name required")
if not api_key:
raise HTTPException(status_code=401, detail="API key required")
# Validate with AuthenticationManager
is_authenticated = await auth_manager.auth(container_name, api_key)
if not is_authenticated:
raise HTTPException(status_code=401, detail="Authentication failed")
# Get browser manager and execute command
try:
browser_manager = get_browser_manager()
result = await browser_manager.execute_command(command, params)
if result.get("success"):
return JSONResponse(content=result)
else:
raise HTTPException(status_code=400, detail=result.get("error", "Command failed"))
except Exception as e:
logger.error(f"Error executing playwright command: {str(e)}")
logger.error(traceback.format_exc())
raise HTTPException(status_code=500, detail=str(e))
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8000)

View File

@@ -4,7 +4,7 @@ build-backend = "pdm.backend"
[project]
name = "cua-computer-server"
version = "0.1.30"
version = "0.1.31"
description = "Server component for the Computer-Use Interface (CUI) framework powering Cua"
authors = [
@@ -12,7 +12,7 @@ authors = [
]
readme = "README.md"
license = { text = "MIT" }
requires-python = ">=3.12"
requires-python = ">=3.12,<3.14"
dependencies = [
"fastapi>=0.111.0",
"uvicorn[standard]>=0.27.0",
@@ -24,6 +24,7 @@ dependencies = [
"pyperclip>=1.9.0",
"websockets>=12.0",
"pywinctl>=0.4.1",
"playwright>=1.40.0",
# OS-specific runtime deps
"pyobjc-framework-Cocoa>=10.1; sys_platform == 'darwin'",
"pyobjc-framework-Quartz>=10.1; sys_platform == 'darwin'",

View File

@@ -969,6 +969,35 @@ class Computer:
"""
return await self.interface.to_screenshot_coordinates(x, y)
async def playwright_exec(self, command: str, params: Optional[Dict] = None) -> Dict[str, Any]:
    """
    Run a Playwright browser command on the remote computer.

    Thin delegate to the interface's ``playwright_exec``.

    Args:
        command: Browser command name — one of ``visit_url``, ``click``,
            ``type``, ``scroll``, ``web_search``.
        params: Command-specific parameters, e.g. ``{"url": ...}`` for
            ``visit_url`` or ``{"x": ..., "y": ...}`` for ``click``.

    Returns:
        Dict containing the command result.

    Examples:
        await computer.playwright_exec("visit_url", {"url": "https://example.com"})
        await computer.playwright_exec("click", {"x": 100, "y": 200})
        await computer.playwright_exec("type", {"text": "Hello, world!"})
        await computer.playwright_exec("scroll", {"delta_x": 0, "delta_y": -100})
        await computer.playwright_exec("web_search", {"query": "computer use agent"})
    """
    result = await self.interface.playwright_exec(command, params)
    return result
# Add virtual environment management functions to computer interface
async def venv_install(self, venv_name: str, requirements: list[str]):
"""Install packages in a virtual environment.

View File

@@ -667,6 +667,56 @@ class GenericComputerInterface(BaseComputerInterface):
return screenshot_x, screenshot_y
# Playwright browser control
async def playwright_exec(self, command: str, params: Optional[Dict] = None) -> Dict[str, Any]:
    """
    Execute a Playwright browser command.

    Args:
        command: The browser command to execute (visit_url, click, type, scroll, web_search)
        params: Command parameters

    Returns:
        Dict containing the command result; HTTP or transport failures are
        returned as ``{"success": False, "error": ...}`` rather than raised.

    Examples:
        # Navigate to a URL
        await interface.playwright_exec("visit_url", {"url": "https://example.com"})

        # Click at coordinates
        await interface.playwright_exec("click", {"x": 100, "y": 200})

        # Type text
        await interface.playwright_exec("type", {"text": "Hello, world!"})

        # Scroll
        await interface.playwright_exec("scroll", {"delta_x": 0, "delta_y": -100})

        # Web search
        await interface.playwright_exec("web_search", {"query": "computer use agent"})
    """
    # An API key implies a cloud deployment served over HTTPS on 8443;
    # otherwise plain HTTP on 8000.
    # NOTE(review): the protocol/port pairing is hard-coded — confirm it
    # matches all deployment targets.
    protocol = "https" if self.api_key else "http"
    port = "8443" if self.api_key else "8000"
    url = f"{protocol}://{self.ip_address}:{port}/playwright_exec"

    payload = {"command": command, "params": params or {}}

    # Cloud authentication headers; the server only checks them when it
    # runs with CONTAINER_NAME set.
    headers = {"Content-Type": "application/json"}
    if self.api_key:
        headers["X-API-Key"] = self.api_key
    if self.vm_name:
        headers["X-Container-Name"] = self.vm_name

    try:
        # Fresh session per call; non-200 responses are folded into an
        # in-band error dict instead of raising.
        async with aiohttp.ClientSession() as session:
            async with session.post(url, json=payload, headers=headers) as response:
                if response.status == 200:
                    return await response.json()
                else:
                    error_text = await response.text()
                    return {"success": False, "error": error_text}
    except Exception as e:
        return {"success": False, "error": str(e)}
# Websocket Methods
async def _keep_alive(self):
"""Keep the WebSocket connection alive with automatic reconnection."""

View File

@@ -45,7 +45,9 @@ class CloudProvider(BaseVMProvider):
# Fall back to environment variable if api_key not provided
if api_key is None:
api_key = os.getenv("CUA_API_KEY")
assert api_key, "api_key required for CloudProvider (provide via parameter or CUA_API_KEY environment variable)"
assert (
api_key
), "api_key required for CloudProvider (provide via parameter or CUA_API_KEY environment variable)"
self.api_key = api_key
self.verbose = verbose
self.api_base = (api_base or DEFAULT_API_BASE).rstrip("/")

View File

@@ -19,7 +19,7 @@ dependencies = [
"pydantic>=2.11.1",
"mslex>=1.3.0",
]
requires-python = ">=3.12"
requires-python = ">=3.12,<3.14"
[project.optional-dependencies]
lume = [

View File

@@ -15,7 +15,7 @@ dependencies = [
"httpx>=0.24.0",
"posthog>=3.20.0"
]
requires-python = ">=3.12"
requires-python = ">=3.12,<3.14"
[tool.pdm]
distribution = true

View File

@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
name = "cua-mcp-server"
description = "MCP Server for Computer-Use Agent (CUA)"
readme = "README.md"
requires-python = ">=3.12"
requires-python = ">=3.12,<3.14"
version = "0.1.15"
authors = [
{name = "TryCua", email = "gh@trycua.com"}

View File

@@ -24,7 +24,7 @@ dependencies = [
"typing-extensions>=4.9.0",
"pydantic>=2.6.3"
]
requires-python = ">=3.12"
requires-python = ">=3.12,<3.14"
readme = "README.md"
license = {text = "AGPL-3.0-or-later"}
keywords = ["computer-vision", "ocr", "ui-analysis", "icon-detection"]

View File

@@ -0,0 +1,14 @@
# QEMU Docker Containers
Docker containers running desktop operating systems via QEMU/KVM for Computer-Using Agents (CUA).
## Structure
```
qemu-docker/
├── linux/     # Ubuntu 22.04 container with CUA computer-server
└── windows/   # Windows 11 container with CUA computer-server
```

## Linux Container

See [linux/README.md](linux/README.md) for complete documentation on the Ubuntu QEMU container.

## Windows Container

See [windows/README.md](windows/README.md) for complete documentation on the Windows 11 QEMU container.

View File

@@ -0,0 +1,14 @@
# Base image providing QEMU/KVM and the VM boot tooling.
FROM trycua/qemu-local:latest

# OEM setup scripts executed inside the guest during first-boot installation.
COPY src/vm/setup/. /oem/
# Container entrypoint wrapper (starts the VM and waits for the CUA server).
COPY --chmod=755 src/entry.sh /entry.sh

# Default VM resources; override with `docker run -e ...`.
ENV RAM_SIZE="8G"
ENV CPU_CORES="8"
ENV DISK_SIZE="64G"
# Extra QEMU arguments: expose a QMP control socket on port 7200.
ENV ARGUMENTS="-qmp tcp:0.0.0.0:7200,server,nowait"

# 5000: CUA computer-server API; 8006: noVNC web interface.
EXPOSE 5000 8006

ENTRYPOINT ["/entry.sh"]

View File

@@ -0,0 +1,146 @@
# CUA Linux Container
Containerized Ubuntu 22.04 LTS virtual desktop for Computer-Using Agents (CUA). Utilizes QEMU/KVM with Ubuntu Desktop and computer-server pre-installed for remote computer control.
## Features
- Ubuntu 22.04 LTS Desktop running in QEMU/KVM
- Automated installation via cloud-init autoinstall
- Pre-installed CUA computer-server for remote computer control
- Support for custom OEM scripts during setup
- noVNC access for visual desktop interaction
## Quick Start
### 1. Download Ubuntu Server ISO
**Download Ubuntu 22.04 LTS Server ISO:**
1. Visit & download the [server ISO](https://releases.ubuntu.com/22.04/ubuntu-22.04.5-live-server-amd64.iso)
2. After downloading, rename the file to `setup.iso`
3. Copy it to the directory `src/vm/image/`
This ISO is used for automated Ubuntu installation with cloud-init on first run.
### 2. Build the Image
```bash
docker build -t cua-linux:dev .
```
### 3. First Run - Create Golden Image
On first run, the container will install Ubuntu from scratch and create a golden image. This takes 15-30 minutes.
```bash
# Create storage directory
mkdir -p ./storage
# Run with the renamed setup.iso (see step 1) to create the golden image
docker run -it --rm \
--device=/dev/kvm \
--name cua-linux \
--mount type=bind,source=$(pwd)/src/vm/image/setup.iso,target=/custom.iso \
--cap-add NET_ADMIN \
-v $(pwd)/storage:/storage \
-p 8006:8006 \
-p 5000:5000 \
-e RAM_SIZE=8G \
-e CPU_CORES=4 \
-e DISK_SIZE=64G \
cua-linux:dev
```
**What happens during first run:**
1. Ubuntu 22.04 Server installs automatically using cloud-init autoinstall
2. Minimal desktop environment is installed with auto-login enabled
3. OEM setup scripts install Python 3, create venv, and install CUA computer-server
4. systemd service created for CUA server (runs automatically on login)
5. X11 access configured for GUI automation
6. Golden image is saved to `/storage` directory
7. Container exits after setup completes
### 4. Subsequent Runs - Use Golden Image
After the golden image is created, subsequent runs boot much faster (30 sec - 2 min):
```bash
# Run without the setup ISO - uses existing golden image
docker run -it --rm \
--device=/dev/kvm \
--name cua-linux \
--cap-add NET_ADMIN \
-v $(pwd)/storage:/storage \
-p 8006:8006 \
-p 5000:5000 \
-e RAM_SIZE=8G \
-e CPU_CORES=4 \
cua-linux:dev
```
**Access points:**
- **Computer Server API**: `http://localhost:5000`
- **noVNC Browser**: `http://localhost:8006`
## Container Configuration
### Ports
- **5000**: CUA computer-server API endpoint
- **8006**: noVNC web interface for visual desktop access
### Environment Variables
- `RAM_SIZE`: RAM allocated to Ubuntu VM (default: "8G", recommended: "8G" for WSL2)
- `CPU_CORES`: CPU cores allocated to VM (default: "8")
- `DISK_SIZE`: VM disk size (default: "64G", minimum: "32G")
### Volumes
- `/storage`: Persistent VM storage (golden image, disk)
- `/custom.iso`: Mount point for ubuntu.iso (only needed for first run)
- `/oem`: Optional mount point for custom OEM scripts (built-in scripts included in image)
## Architecture
```
┌─────────────────────────────────────────────────────────┐
│ Docker Container (Linux host) │
│ │
│ • Port forwarding: localhost:5000 → EMULATOR_IP:5000 │
│ • Exposes: 5000 (API), 8006 (noVNC) │
│ │
│ ┌────────────────────────────────────────────────────┐ │
│ │ QEMU VM (Ubuntu 22.04) │ │
│ │ │ │
│ │ • CUA computer-server listens on 5000 │ │
│ │ │ │
│ └────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────┘
```
**Communication Flow:**
1. External client → `localhost:5000` (host)
2. Docker port mapping → Container's `localhost:5000`
3. Container detects VM IP and waits for server to be ready
4. CUA computer-server in Ubuntu VM processes request
## Development
### Modifying Setup Scripts
Setup scripts are in `src/vm/setup/`:
- `install.sh`: Entry point called after cloud-init installation (runs OEM setup)
- `setup.sh`: Main setup orchestration (copies scripts to /opt/oem)
- `setup-cua-server.sh`: CUA server installation with isolated venv and systemd service
After modifying, rebuild the image:
```bash
docker build -t cua-linux:dev .
```

View File

@@ -0,0 +1,61 @@
#!/bin/bash
# Container entrypoint: boots the Ubuntu VM, waits for the CUA
# computer-server inside it to answer on port 5000, then either shuts
# down (initial golden-image build) or keeps the container alive.

# Forward termination signals to the VM process so QEMU can shut down cleanly.
cleanup() {
    echo "Received signal, shutting down gracefully..."
    if [ -n "$VM_PID" ]; then
        kill -TERM "$VM_PID" 2>/dev/null
        wait "$VM_PID" 2>/dev/null
    fi
    exit 0
}

# Install trap for signals
trap cleanup SIGTERM SIGINT SIGHUP SIGQUIT

# Start the VM in the background
echo "Starting Ubuntu VM..."
/usr/bin/tini -s /run/entry.sh &
VM_PID=$!

echo "Live stream accessible at localhost:8006"
echo "Waiting for Ubuntu to boot and CUA computer-server to start..."

VM_IP=""
while true; do
    # Derive the VM's IP from the dnsmasq DHCP range the base image configures.
    if [ -z "$VM_IP" ]; then
        VM_IP=$(ps aux | grep dnsmasq | grep -oP '(?<=--dhcp-range=)[0-9.]+' | head -1)
        if [ -n "$VM_IP" ]; then
            echo "Detected VM IP: $VM_IP"
        else
            echo "Waiting for VM to start..."
            sleep 5
            continue
        fi
    fi
    # Probe the computer-server status endpoint. --max-time prevents a
    # half-open connection from stalling this loop indefinitely; on any
    # curl failure the printed http_code is "000", which fails the check.
    response=$(curl --max-time 5 --write-out '%{http_code}' --silent --output /dev/null "$VM_IP:5000/status")
    if [ "${response:-0}" -eq 200 ]; then
        break
    fi
    echo "Waiting for CUA computer-server to be ready. This might take a while..."
    sleep 5
done

echo "VM is up and running, and the CUA Computer Server is ready!"
echo "Computer server accessible at localhost:5000"

# A custom ISO mounted at / means this is the first (golden-image) run:
# once the server is confirmed ready, preparation is done and we stop.
CUSTOM_ISO=$(find / -maxdepth 1 -type f -iname "*.iso" -print -quit 2>/dev/null || true)
if [ -n "$CUSTOM_ISO" ]; then
    echo "Preparation complete. Shutting down gracefully..."
    cleanup
fi

# Keep container alive for golden image boots
echo "Container running. Press Ctrl+C to stop."
tail -f /dev/null

View File

@@ -0,0 +1,7 @@
> Add your Ubuntu 22.04 live server setup.iso to this folder
**Download Ubuntu 22.04 LTS Server ISO:**
1. Visit & download the [server ISO](https://releases.ubuntu.com/22.04/ubuntu-22.04.5-live-server-amd64.iso)
2. After downloading, rename the file to `setup.iso`
3. Copy it to the current directory.

View File

@@ -0,0 +1,26 @@
#!/bin/bash
# OEM Installation Entry Point for Linux
# This script is called by the OEM systemd service on first boot.
# It delegates all real work to setup.sh and mirrors output to a log file.
# pipefail: without it, `bash setup.sh | tee` reports tee's (always 0)
# exit status, so a setup.sh failure would be silently swallowed.
set -eo pipefail

SCRIPT_DIR="/opt/oem"
LOG_FILE="$SCRIPT_DIR/setup.log"

# Log a timestamped message to stdout and the setup log.
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

log "=== Starting OEM Setup ==="

# Run main setup script
if [ -f "$SCRIPT_DIR/setup.sh" ]; then
    log "Running setup.sh..."
    # Capture setup.sh's real exit code (not tee's) so it can be logged
    # and propagated to the systemd service.
    rc=0
    bash "$SCRIPT_DIR/setup.sh" 2>&1 | tee -a "$LOG_FILE" || rc=$?
    log "setup.sh completed with exit code: $rc"
    if [ "$rc" -ne 0 ]; then
        exit "$rc"
    fi
else
    log "ERROR: setup.sh not found at $SCRIPT_DIR/setup.sh"
    exit 1
fi

log "=== OEM Setup Completed ==="

View File

@@ -0,0 +1,135 @@
#!/bin/bash
# Setup CUA Computer Server on Linux
# Creates a system-level systemd service to run computer server in background
#
# Invoked from setup.sh during OEM first-boot provisioning. Installs
# Python, builds an isolated venv under /opt/cua-server, generates a
# self-restarting launcher script, grants local X11 access, and installs
# a systemd unit tied to graphical.target.
set -e
# "docker" is the VM's login user created by the unattended install.
USER_NAME="docker"
USER_HOME="/home/$USER_NAME"
SCRIPT_DIR="/opt/oem"
CUA_DIR="/opt/cua-server"
VENV_DIR="$CUA_DIR/venv"
SERVICE_NAME="cua-computer-server"
LOG_FILE="$SCRIPT_DIR/setup.log"
# Log a timestamped message to stdout and the shared OEM setup log.
log() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}
log "=== Installing CUA Computer Server ==="
# Install Python 3 and venv
# NOTE(review): assumes `apt-get update` already ran (setup.sh does) — confirm
log "Installing Python 3 and dependencies..."
sudo apt-get install -y python3 python3-venv python3-pip python3-tk python3-dev
# Create CUA directory
log "Creating CUA directory at $CUA_DIR..."
sudo mkdir -p "$CUA_DIR"
sudo chown "$USER_NAME:$USER_NAME" "$CUA_DIR"
# Create virtual environment (idempotent: skip if a venv already exists,
# e.g. when this script is re-run after a partial setup)
if [ -f "$VENV_DIR/bin/python" ]; then
log "Existing venv detected; skipping creation"
else
log "Creating Python virtual environment at $VENV_DIR..."
python3 -m venv "$VENV_DIR"
log "Virtual environment created successfully"
fi
# Activate and install packages (venv binaries are invoked directly;
# no `source activate` needed)
log "Upgrading pip, setuptools, and wheel..."
"$VENV_DIR/bin/pip" install --upgrade pip setuptools wheel
log "Installing cua-computer-server..."
"$VENV_DIR/bin/pip" install --upgrade cua-computer-server
log "cua-computer-server installed successfully"
# Open firewall for port 5000 (if ufw is available); `|| true` keeps
# setup going if ufw is present but inactive
if command -v ufw &> /dev/null; then
log "Opening firewall for port 5000..."
sudo ufw allow 5000/tcp || true
log "Firewall rule added"
fi
# Create start script with auto-restart. The quoted 'EOF' delimiter keeps
# $VARS in the generated script literal (expanded at runtime, not now).
START_SCRIPT="$CUA_DIR/start-server.sh"
log "Creating start script at $START_SCRIPT..."
cat > "$START_SCRIPT" << 'EOF'
#!/bin/bash
# CUA Computer Server Start Script with auto-restart
CUA_DIR="/opt/cua-server"
VENV_DIR="$CUA_DIR/venv"
LOG_FILE="$CUA_DIR/server.log"
start_server() {
echo "$(date '+%Y-%m-%d %H:%M:%S') Updating cua-computer-server..." >> "$LOG_FILE"
"$VENV_DIR/bin/pip" install --upgrade cua-computer-server >> "$LOG_FILE" 2>&1
echo "$(date '+%Y-%m-%d %H:%M:%S') Starting CUA Computer Server on port 5000..." >> "$LOG_FILE"
"$VENV_DIR/bin/python" -m computer_server --port 5000 >> "$LOG_FILE" 2>&1
return $?
}
while true; do
start_server
EXIT_CODE=$?
echo "$(date '+%Y-%m-%d %H:%M:%S') Server exited with code: $EXIT_CODE. Restarting in 5s..." >> "$LOG_FILE"
sleep 5
done
EOF
chmod +x "$START_SCRIPT"
log "Start script created"
# Create xhost script for X11 access so the service user can drive the
# display; runs for every X session via Xsession.d
log "Creating xhost script..."
sudo tee /etc/X11/Xsession.d/99xauth > /dev/null << 'EOF'
#!/bin/sh
# Grant local X11 access for CUA Computer Server
export DISPLAY=:0
xhost +local: 2>/dev/null || true
EOF
sudo chmod +x /etc/X11/Xsession.d/99xauth
log "X11 access script created"
# Create system-level systemd service. Unquoted EOF here: $START_SCRIPT,
# $USER_NAME, $USER_HOME ARE expanded into the unit file now.
log "Creating systemd system service..."
sudo tee /etc/systemd/system/$SERVICE_NAME.service > /dev/null << EOF
[Unit]
Description=CUA Computer Server
After=graphical.target
[Service]
Type=simple
ExecStart=$START_SCRIPT
Restart=always
RestartSec=5
Environment=PYTHONUNBUFFERED=1
Environment=DISPLAY=:0
Environment=XAUTHORITY=$USER_HOME/.Xauthority
User=$USER_NAME
WorkingDirectory=$CUA_DIR
[Install]
WantedBy=graphical.target
EOF
log "Systemd service created at /etc/systemd/system/$SERVICE_NAME.service"
# Ensure proper ownership of CUA directory
log "Setting ownership of $CUA_DIR to $USER_NAME..."
sudo chown -R "$USER_NAME:$USER_NAME" "$CUA_DIR"
# Enable and start the service
log "Enabling systemd service..."
sudo systemctl daemon-reload
sudo systemctl enable "$SERVICE_NAME.service"
log "Starting CUA Computer Server service..."
# `|| true`: the service may not start until the graphical target is up;
# don't abort setup over it
sudo systemctl start "$SERVICE_NAME.service" || true
log "=== CUA Computer Server setup completed ==="
log "Service status: $(sudo systemctl is-active $SERVICE_NAME.service 2>/dev/null || echo 'unknown')"

View File

@@ -0,0 +1,33 @@
#!/bin/bash
# Main Setup Script for Linux
# Installs dependencies and sets up CUA Computer Server.
# Invoked from install.sh during OEM first-boot provisioning.
# pipefail: without it, `bash setup-cua-server.sh | tee` takes tee's
# (always 0) exit status and a server-setup failure would go unnoticed.
set -eo pipefail

SCRIPT_DIR="/opt/oem"
LOG_FILE="$SCRIPT_DIR/setup.log"

# Log a timestamped message to stdout and the setup log.
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

log "=== Running Main Setup ==="

# First boot is unattended: make sure apt never blocks on a prompt.
export DEBIAN_FRONTEND=noninteractive

# Update package lists
log "Updating package lists..."
sudo apt-get update

# Install Git
log "Installing Git..."
sudo apt-get install -y git

# Setup CUA Computer Server
log "Setting up CUA Computer Server..."
if [ -f "$SCRIPT_DIR/setup-cua-server.sh" ]; then
    bash "$SCRIPT_DIR/setup-cua-server.sh" 2>&1 | tee -a "$LOG_FILE"
    log "CUA Computer Server setup completed."
else
    log "ERROR: setup-cua-server.sh not found at $SCRIPT_DIR/setup-cua-server.sh"
fi

log "=== Main Setup Completed ==="

View File

@@ -0,0 +1,15 @@
# CUA Windows container image: extends the trycua windows-local base
# with OEM setup scripts and a custom entrypoint.
FROM trycua/windows-local:latest
# /oem scripts are picked up inside the Windows guest during first-boot setup.
COPY src/vm/setup/. /oem/
# Container entrypoint: boots the VM and waits for the computer-server.
COPY --chmod=755 src/entry.sh /entry.sh
# Default VM resources; overridable at `docker run` time with -e.
ENV RAM_SIZE="8G"
ENV CPU_CORES="8"
ENV VERSION="win11x64-enterprise-eval"
ENV DISK_SIZE="30G"
# Extra QEMU args: expose a QMP control socket on port 7200.
ENV ARGUMENTS="-qmp tcp:0.0.0.0:7200,server,nowait"
# 5000: CUA computer-server API; 8006: noVNC web UI
EXPOSE 5000 8006
ENTRYPOINT ["/entry.sh"]

View File

@@ -0,0 +1,159 @@
# CUA Windows Container
Containerized Windows 11 virtual desktop for Computer-Using Agents (CUA). Utilizes QEMU/KVM with Windows 11 and computer-server pre-installed for remote computer control.
## Features
- Windows 11 Enterprise running in QEMU/KVM
- Pre-installed CUA computer-server for remote computer control
- Caddy reverse proxy (port 9222 → 1337) for browser automation
- noVNC access for visual desktop interaction
- Automated setup via unattended installation
- Support for both dev (shared folder) and azure (OEM folder) deployment modes
- Python 3.12 with isolated virtual environment for CUA computer-server
- Services run hidden in background via Windows scheduled tasks
- Essential tools pre-installed (Chrome, LibreOffice, VLC, GIMP, VSCode, Thunderbird)
## Quick Start
### 1. Download and Prepare setup.iso
**Download Windows 11 Evaluation ISO:**
1. Visit [Microsoft Evaluation Center](https://info.microsoft.com/ww-landing-windows-11-enterprise.html)
2. Accept the Terms of Service
3. Download **Windows 11 Enterprise Evaluation (90-day trial, English, United States)** ISO file [~6GB]
4. After downloading, rename the file to `setup.iso`
5. Copy it to the directory `src/vm/image/`
This ISO is used for automated Windows installation on first run.
### 2. Build the Image
```bash
docker build -t cua-windows:dev .
```
### 3. First Run - Create Golden Image
On first run, the container will install Windows from scratch and create a golden image. This takes 15-30 minutes.
```bash
# Create storage directory
mkdir -p ./storage
# Run with setup.iso to create golden image
docker run -it --rm \
--device=/dev/kvm \
--platform linux/amd64 \
--name cua-windows \
--mount type=bind,source=$(pwd)/src/vm/image/setup.iso,target=/custom.iso \
--cap-add NET_ADMIN \
-v $(pwd)/storage:/storage \
-p 8006:8006 \
-p 5000:5000 \
-e RAM_SIZE=8G \
-e CPU_CORES=4 \
-e DISK_SIZE=20G \
cua-windows:dev
```
**What happens during first run:**
1. Windows 11 installs automatically using unattended configuration
2. Setup scripts install Python 3.12, Git, and CUA computer-server in isolated venv
3. Windows scheduled tasks created for CUA server and Caddy proxy (run hidden in background)
4. Golden image is saved to `/storage` directory
5. Container exits after setup completes
### 4. Subsequent Runs - Use Golden Image
After the golden image is created, subsequent runs boot much faster (30 sec - 2 min):
```bash
# Run without setup.iso - uses existing golden image
docker run -it --rm \
--device=/dev/kvm \
--platform linux/amd64 \
--name cua-windows \
--cap-add NET_ADMIN \
-v $(pwd)/storage:/storage \
-p 8006:8006 \
-p 5000:5000 \
-e RAM_SIZE=8G \
-e CPU_CORES=4 \
cua-windows:dev
```
**Access points:**
- **Computer Server API**: `http://localhost:5000`
- **noVNC Browser**: `http://localhost:8006`
## Container Configuration
### Ports
- **5000**: CUA computer-server API endpoint
- **8006**: noVNC web interface for visual desktop access
### Environment Variables
- `RAM_SIZE`: RAM allocated to Windows VM (default: "8G", recommended: "8G" for WSL2)
- `CPU_CORES`: CPU cores allocated to VM (default: "8")
- `DISK_SIZE`: VM disk size (default: "30G", minimum: "20G")
- `VERSION`: Windows version (default: "win11x64-enterprise-eval")
### Volumes
- `/storage`: Persistent VM storage (golden image, disk, firmware)
- `/custom.iso`: Mount point for setup.iso (only needed for first run)
## Architecture
```
┌─────────────────────────────────────────────────────────┐
│ Docker Container (Linux host) │
│ │
│ • Port forwarding: localhost:5000 → EMULATOR_IP:5000 │
│ • Exposes: 5000 (API), 8006 (noVNC) │
│ │
│ ┌────────────────────────────────────────────────────┐ │
│ │ QEMU VM (Windows 11) │ │
│ │ │ │
│ │ • CUA computer-server listens on 5000 │ │
│ │ │ │
│ └────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────┘
```
**Communication Flow:**
1. External client → `localhost:5000` (host)
2. Docker port mapping → Container's `localhost:5000`
3. socat port forwarding → `20.20.20.21:5000` (VM)
4. CUA computer-server in Windows VM processes request
## Development
### Modifying Setup Scripts
Setup scripts are in `src/vm/setup/`:
- `install.bat`: Entry point called by Windows setup
- `setup.ps1`: Main setup orchestration (installs software, configures Windows)
- `setup-cua-server.ps1`: CUA server installation with isolated venv
- `on-logon.ps1`: Runs on user logon (starts scheduled tasks)
- `setup-utils.psm1`: Helper functions for setup
After modifying, rebuild the image:
```bash
docker build -t cua-windows:dev .
```
## Credits
- Built on [Dockur Windows](https://github.com/dockur/windows) base image
- Inspired by [Windows Agent Arena](https://github.com/microsoft/WindowsAgentArena)

View File

@@ -0,0 +1,67 @@
#!/bin/bash
# Container entrypoint: boots the Windows VM, waits for the CUA
# computer-server inside it to answer on port 5000, then either shuts
# down (initial golden-image build) or keeps the container alive.

# Forward termination signals to the VM process so QEMU can shut down cleanly.
cleanup() {
    echo "Received signal, shutting down gracefully..."
    if [ -n "$VM_PID" ]; then
        kill -TERM "$VM_PID" 2>/dev/null
        wait "$VM_PID" 2>/dev/null
    fi
    exit 0
}

# Install trap for signals
trap cleanup SIGTERM SIGINT SIGHUP SIGQUIT

# Create windows.boot file if it doesn't exist (required for proper boot).
# Two [ ] tests joined by && instead of the obsolescent `-a` operator.
if [ -d "/storage" ] && [ ! -f "/storage/windows.boot" ]; then
    echo "Creating windows.boot file in /storage..."
    touch /storage/windows.boot
fi

# Start the VM in the background
echo "Starting Windows VM..."
/usr/bin/tini -s /run/entry.sh &
VM_PID=$!

echo "Live stream accessible at localhost:8006"
echo "Waiting for Windows to boot and CUA computer-server to start..."

VM_IP=""
while true; do
    # Derive the VM's IP from the dnsmasq DHCP range the base image configures.
    if [ -z "$VM_IP" ]; then
        VM_IP=$(ps aux | grep dnsmasq | grep -oP '(?<=--dhcp-range=)[0-9.]+' | head -1)
        if [ -n "$VM_IP" ]; then
            echo "Detected VM IP: $VM_IP"
        else
            echo "Waiting for VM to start..."
            sleep 5
            continue
        fi
    fi
    # Probe the computer-server status endpoint. --max-time prevents a
    # half-open connection from stalling this loop indefinitely; on any
    # curl failure the printed http_code is "000", which fails the check.
    response=$(curl --max-time 5 --write-out '%{http_code}' --silent --output /dev/null "$VM_IP:5000/status")
    if [ "${response:-0}" -eq 200 ]; then
        break
    fi
    echo "Waiting for CUA computer-server to be ready. This might take a while..."
    sleep 5
done

echo "VM is up and running, and the CUA Computer Server is ready!"
echo "Computer server accessible at localhost:5000"

# A custom ISO mounted at / means this is the first (golden-image) run:
# once the server is confirmed ready, preparation is done and we stop.
CUSTOM_ISO=$(find / -maxdepth 1 -type f -iname "*.iso" -print -quit 2>/dev/null || true)
if [ -n "$CUSTOM_ISO" ]; then
    echo "Preparation complete. Shutting down gracefully..."
    cleanup
fi

# Keep container alive for golden image boots
echo "Container running. Press Ctrl+C to stop."
tail -f /dev/null

View File

@@ -0,0 +1,9 @@
> Add your Win11E setup.iso to this folder
**Download Windows 11 Evaluation ISO:**
1. Visit [Microsoft Evaluation Center](https://info.microsoft.com/ww-landing-windows-11-enterprise.html)
2. Accept the Terms of Service
3. Download **Windows 11 Enterprise Evaluation (90-day trial, English, United States)** ISO file [~6GB]
4. After downloading, rename the file to `setup.iso`
5. Copy it to the current directory.

View File

@@ -0,0 +1,31 @@
@echo off
:: OEM entry point: invoked during Windows unattended setup on first boot.
:: Delegates all real work to setup.ps1 and mirrors output to a log file.
SET ScriptFolder=C:\OEM
SET LogFile=%ScriptFolder%\ps_script_log.txt
echo Running PowerShell script... > %LogFile%
:: Check for PowerShell availability; abort with a nonzero code if missing
where powershell >> %LogFile% 2>&1
if %ERRORLEVEL% neq 0 (
echo PowerShell is not available! >> %LogFile%
echo PowerShell is not available!
exit /b 1
)
:: Add a 30-second delay (presumably to let first-boot services settle
:: before setup runs — confirm)
echo Waiting for 30 seconds before continuing... >> %LogFile%
timeout /t 30 /nobreak >> %LogFile% 2>&1
:: Run PowerShell script with ExecutionPolicy Bypass and log errors.
:: NOTE(review): a setup.ps1 failure is logged but not propagated — the
:: script still exits 0; confirm callers don't rely on the exit code.
echo Running setup.ps1... >> %LogFile%
powershell -ExecutionPolicy Bypass -File "%ScriptFolder%\setup.ps1" >> %LogFile% 2>&1
if %ERRORLEVEL% neq 0 (
echo An error occurred. See %LogFile% for details.
) else (
echo PowerShell script has completed successfully.
)
echo PowerShell script has completed.

Some files were not shown because too many files have changed in this diff Show More