From 87698101fcf553b74a0dfde9275d40c1b2f6091d Mon Sep 17 00:00:00 2001 From: Sarina Li Date: Wed, 19 Nov 2025 16:09:03 -0500 Subject: [PATCH 01/15] add gemini 3 support with omni parser + docs --- .../gemini-complex-ui-navigation.mdx | 632 ++++++++++++++++++ docs/content/docs/example-usecases/meta.json | 7 +- libs/python/agent/agent/loops/omniparser.py | 16 + 3 files changed, 654 insertions(+), 1 deletion(-) create mode 100644 docs/content/docs/example-usecases/gemini-complex-ui-navigation.mdx diff --git a/docs/content/docs/example-usecases/gemini-complex-ui-navigation.mdx b/docs/content/docs/example-usecases/gemini-complex-ui-navigation.mdx new file mode 100644 index 00000000..619576e8 --- /dev/null +++ b/docs/content/docs/example-usecases/gemini-complex-ui-navigation.mdx @@ -0,0 +1,632 @@ +--- +title: GUI Grounding with Gemini 3 +description: Using Google's Gemini 3 with OmniParser for Advanced GUI Grounding Tasks +--- + +import { Step, Steps } from 'fumadocs-ui/components/steps'; +import { Tab, Tabs } from 'fumadocs-ui/components/tabs'; +import { Callout } from 'fumadocs-ui/components/callout'; + +## Overview + +This example demonstrates how to use Google's Gemini 3 models with OmniParser for complex GUI grounding tasks. Gemini 3 Pro achieves exceptional performance on the [ScreenSpot-Pro benchmark](https://github.com/likaixin2000/ScreenSpot-Pro-GUI-Grounding) with a **72.7% accuracy** (compared to Claude Sonnet 4.5's 36.2%), making it ideal for precise UI element location and complex navigation tasks. + + + According to [Google's Gemini 3 announcement](https://blog.google/products/gemini/gemini-3/), + Gemini 3 Pro achieves: - **72.7%** on ScreenSpot-Pro (vs. Gemini 2.5 Pro's 11.4%) - + Industry-leading performance on complex UI navigation tasks - Advanced multimodal understanding + for high-resolution screens + + +### What You'll Build + +This guide shows how to: + +- Set up Vertex AI with proper authentication +- Use OmniParser with Gemini 3 for GUI element detection +- Leverage Gemini 3-specific features like `thinking_level` and `media_resolution` +- Create agents that can perform complex multi-step UI interactions + +--- + + + + + +### Set Up Google Cloud and Vertex AI + +Before using Gemini 3 models, you need to enable Vertex AI in Google Cloud Console. + +#### 1. Create a Google Cloud Project + +1. Go to [Google Cloud Console](https://console.cloud.google.com/) +2. Click **Select a project** β†’ **New Project** +3. Enter a project name and click **Create** +4. Note your **Project ID** (you'll need this later) + +#### 2. Enable Vertex AI API + +1. Navigate to [Vertex AI API](https://console.cloud.google.com/apis/library/aiplatform.googleapis.com) +2. Select your project +3. Click **Enable** + +#### 3. Enable Billing + +1. Go to [Billing](https://console.cloud.google.com/billing) +2. Link a billing account to your project +3. Vertex AI offers a [free tier](https://cloud.google.com/vertex-ai/pricing) for testing + +#### 4. Create a Service Account + +1. Go to [IAM & Admin > Service Accounts](https://console.cloud.google.com/iam-admin/serviceaccounts) +2. Click **Create Service Account** +3. Enter a name (e.g., "cua-gemini-agent") +4. Click **Create and Continue** +5. Grant the **Vertex AI User** role +6. Click **Done** + +#### 5. Create and Download Service Account Key + +1. Click on your newly created service account +2. Go to **Keys** tab +3. Click **Add Key** β†’ **Create new key** +4. Select **JSON** format +5. Click **Create** (the key file will download automatically) +6. 
**Important**: Store this key file securely! It contains credentials for accessing your Google Cloud resources + + + Never commit your service account JSON key to version control! Add it to `.gitignore` immediately. + + + + + + +### Install Dependencies + +Install the required packages for OmniParser and Gemini 3: + +Create a `requirements.txt` file: + +```text +cua-agent +cua-computer +cua-som # OmniParser for GUI element detection +litellm>=1.0.0 +python-dotenv>=1.0.0 +google-cloud-aiplatform>=1.70.0 +``` + +Install the dependencies: + +```bash +pip install -r requirements.txt +``` + + + + + +### Configure Environment Variables + +Create a `.env` file in your project root: + +```text +# Google Cloud / Vertex AI credentials +GOOGLE_CLOUD_PROJECT=your-project-id +GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-service-account-key.json + +# Cua credentials (for cloud sandboxes) +CUA_API_KEY=sk_cua-api01... +CUA_SANDBOX_NAME=your-sandbox-name +``` + +Replace the values: + +- `your-project-id`: Your Google Cloud Project ID from Step 1 +- `/path/to/your-service-account-key.json`: Path to the JSON key file you downloaded +- `sk_cua-api01...`: Your Cua API key from the [Cua dashboard](https://cua.dev) +- `your-sandbox-name`: Your sandbox name (if using cloud sandboxes) + + + + + +### Create Your Complex UI Navigation Script + +Create a Python file (e.g., `gemini_ui_navigation.py`): + + + + +```python +import asyncio +import logging +import os +import signal +import traceback + +from agent import ComputerAgent +from computer import Computer, VMProviderType +from dotenv import load_dotenv + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def handle_sigint(sig, frame): + print("\n\nExecution interrupted by user. Exiting gracefully...") + exit(0) + +async def complex_ui_navigation(): + """ + Demonstrate Gemini 3's exceptional UI grounding capabilities + with complex, multi-step navigation tasks. + """ + try: + async with Computer( + os_type="linux", + provider_type=VMProviderType.CLOUD, + name=os.environ["CUA_SANDBOX_NAME"], + api_key=os.environ["CUA_API_KEY"], + verbosity=logging.INFO, + ) as computer: + + agent = ComputerAgent( + # Use OmniParser with Gemini 3 Pro for optimal GUI grounding + model="omniparser+vertex_ai/gemini-3-pro-preview", + tools=[computer], + only_n_most_recent_images=3, + verbosity=logging.INFO, + trajectory_dir="trajectories", + use_prompt_caching=False, + max_trajectory_budget=5.0, + # Gemini 3-specific parameters + thinking_level="high", # Enables deeper reasoning (vs "low") + media_resolution="high", # High-resolution image processing (vs "low" or "medium") + ) + + # Complex GUI grounding tasks inspired by ScreenSpot-Pro benchmark + # These test precise element location in professional UIs + tasks = [ + # Task 1: GitHub repository navigation + { + "instruction": ( + "Go to github.com/trycua/cua. " + "Find and click on the 'Issues' tab. " + "Then locate and click on the search box within the issues page " + "(not the global GitHub search). " + "Type 'omniparser' and press Enter." + ), + "description": "Tests precise UI element distinction in a complex interface", + }, + + # Task 2: Wikipedia multi-step interaction + { + "instruction": ( + "Open wikipedia.org. " + "Search for 'artificial intelligence'. " + "On the results page, find and click the 'Edit' button (not 'Edit source'). " + "Then locate the donation banner at the top and close it by clicking the X button." 
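                        # Spelling out which element *not* to click (e.g. "not 'Edit source'",
                        # "not the global GitHub search") helps the grounding model tell
                        # visually similar UI elements apart.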
+ ), + "description": "Tests element disambiguation and multi-step reasoning", + }, + ] + + history = [] + + for i, task_info in enumerate(tasks, 1): + task = task_info["instruction"] + print(f"\n{'='*60}") + print(f"[Task {i}/{len(tasks)}] {task_info['description']}") + print(f"{'='*60}") + print(f"\nInstruction: {task}\n") + + # Add user message to history + history.append({"role": "user", "content": task}) + + # Run agent with conversation history + async for result in agent.run(history, stream=False): + history += result.get("output", []) + + # Print output for debugging + for item in result.get("output", []): + if item.get("type") == "message": + content = item.get("content", []) + for content_part in content: + if content_part.get("text"): + logger.info(f"Agent: {content_part.get('text')}") + elif item.get("type") == "computer_call": + action = item.get("action", {}) + action_type = action.get("type", "") + logger.debug(f"Computer Action: {action_type}") + + print(f"\nβœ… Task {i}/{len(tasks)} completed") + + print("\nπŸŽ‰ All complex UI navigation tasks completed successfully!") + + except Exception as e: + logger.error(f"Error in complex_ui_navigation: {e}") + traceback.print_exc() + raise + +def main(): + try: + load_dotenv() + + # Validate required environment variables + required_vars = [ + "GOOGLE_CLOUD_PROJECT", + "GOOGLE_APPLICATION_CREDENTIALS", + "CUA_API_KEY", + "CUA_SANDBOX_NAME", + ] + + missing_vars = [var for var in required_vars if not os.environ.get(var)] + if missing_vars: + raise RuntimeError( + f"Missing required environment variables: {', '.join(missing_vars)}\n" + f"Please check your .env file and ensure all keys are set.\n" + f"See the setup guide for details on configuring Vertex AI credentials." + ) + + signal.signal(signal.SIGINT, handle_sigint) + + asyncio.run(complex_ui_navigation()) + + except Exception as e: + logger.error(f"Error running automation: {e}") + traceback.print_exc() + +if __name__ == "__main__": + main() +``` + + + + +```python +import asyncio +import logging +import os +import signal +import traceback + +from agent import ComputerAgent +from computer import Computer, VMProviderType +from dotenv import load_dotenv + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def handle_sigint(sig, frame): + print("\n\nExecution interrupted by user. Exiting gracefully...") + exit(0) + +async def complex_ui_navigation(): + """ + Demonstrate Gemini 3's exceptional UI grounding capabilities + with complex, multi-step navigation tasks. + """ + try: + async with Computer( + os_type="linux", + provider_type=VMProviderType.DOCKER, + image="trycua/cua-xfce:latest", + verbosity=logging.INFO, + ) as computer: + + agent = ComputerAgent( + # Use OmniParser with Gemini 3 Pro for optimal GUI grounding + model="omniparser+vertex_ai/gemini-3-pro-preview", + tools=[computer], + only_n_most_recent_images=3, + verbosity=logging.INFO, + trajectory_dir="trajectories", + use_prompt_caching=False, + max_trajectory_budget=5.0, + # Gemini 3-specific parameters + thinking_level="high", # Enables deeper reasoning (vs "low") + media_resolution="high", # High-resolution image processing (vs "low" or "medium") + ) + + # Complex GUI grounding tasks inspired by ScreenSpot-Pro benchmark + tasks = [ + { + "instruction": ( + "Go to github.com/trycua/cua. " + "Find and click on the 'Issues' tab. " + "Then locate and click on the search box within the issues page " + "(not the global GitHub search). " + "Type 'omniparser' and press Enter." 
+ ), + "description": "Tests precise UI element distinction in a complex interface", + }, + ] + + history = [] + + for i, task_info in enumerate(tasks, 1): + task = task_info["instruction"] + print(f"\n{'='*60}") + print(f"[Task {i}/{len(tasks)}] {task_info['description']}") + print(f"{'='*60}") + print(f"\nInstruction: {task}\n") + + history.append({"role": "user", "content": task}) + + async for result in agent.run(history, stream=False): + history += result.get("output", []) + + for item in result.get("output", []): + if item.get("type") == "message": + content = item.get("content", []) + for content_part in content: + if content_part.get("text"): + logger.info(f"Agent: {content_part.get('text')}") + elif item.get("type") == "computer_call": + action = item.get("action", {}) + action_type = action.get("type", "") + logger.debug(f"Computer Action: {action_type}") + + print(f"\nβœ… Task {i}/{len(tasks)} completed") + + print("\nπŸŽ‰ All complex UI navigation tasks completed successfully!") + + except Exception as e: + logger.error(f"Error in complex_ui_navigation: {e}") + traceback.print_exc() + raise + +def main(): + try: + load_dotenv() + + required_vars = [ + "GOOGLE_CLOUD_PROJECT", + "GOOGLE_APPLICATION_CREDENTIALS", + ] + + missing_vars = [var for var in required_vars if not os.environ.get(var)] + if missing_vars: + raise RuntimeError( + f"Missing required environment variables: {', '.join(missing_vars)}\n" + f"Please check your .env file." + ) + + signal.signal(signal.SIGINT, handle_sigint) + + asyncio.run(complex_ui_navigation()) + + except Exception as e: + logger.error(f"Error running automation: {e}") + traceback.print_exc() + +if __name__ == "__main__": + main() +``` + + + + +```python +import asyncio +import logging +import os +import signal +import traceback + +from agent import ComputerAgent +from computer import Computer, VMProviderType +from dotenv import load_dotenv + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def handle_sigint(sig, frame): + print("\n\nExecution interrupted by user. Exiting gracefully...") + exit(0) + +async def complex_ui_navigation(): + """ + Demonstrate Gemini 3's exceptional UI grounding capabilities + with complex, multi-step navigation tasks. + """ + try: + async with Computer( + os_type="macos", + provider_type=VMProviderType.LUME, + name="macos-sequoia-cua:latest", + verbosity=logging.INFO, + ) as computer: + + agent = ComputerAgent( + # Use OmniParser with Gemini 3 Pro for optimal GUI grounding + model="omniparser+vertex_ai/gemini-3-pro-preview", + tools=[computer], + only_n_most_recent_images=3, + verbosity=logging.INFO, + trajectory_dir="trajectories", + use_prompt_caching=False, + max_trajectory_budget=5.0, + # Gemini 3-specific parameters + thinking_level="high", # Enables deeper reasoning (vs "low") + media_resolution="high", # High-resolution image processing (vs "low" or "medium") + ) + + # Complex GUI grounding tasks inspired by ScreenSpot-Pro benchmark + tasks = [ + { + "instruction": ( + "Go to github.com/trycua/cua. " + "Find and click on the 'Issues' tab. " + "Then locate and click on the search box within the issues page " + "(not the global GitHub search). " + "Type 'omniparser' and press Enter." 
+ ), + "description": "Tests precise UI element distinction in a complex interface", + }, + ] + + history = [] + + for i, task_info in enumerate(tasks, 1): + task = task_info["instruction"] + print(f"\n{'='*60}") + print(f"[Task {i}/{len(tasks)}] {task_info['description']}") + print(f"{'='*60}") + print(f"\nInstruction: {task}\n") + + history.append({"role": "user", "content": task}) + + async for result in agent.run(history, stream=False): + history += result.get("output", []) + + for item in result.get("output", []): + if item.get("type") == "message": + content = item.get("content", []) + for content_part in content: + if content_part.get("text"): + logger.info(f"Agent: {content_part.get('text')}") + elif item.get("type") == "computer_call": + action = item.get("action", {}) + action_type = action.get("type", "") + logger.debug(f"Computer Action: {action_type}") + + print(f"\nβœ… Task {i}/{len(tasks)} completed") + + print("\nπŸŽ‰ All complex UI navigation tasks completed successfully!") + + except Exception as e: + logger.error(f"Error in complex_ui_navigation: {e}") + traceback.print_exc() + raise + +def main(): + try: + load_dotenv() + + required_vars = [ + "GOOGLE_CLOUD_PROJECT", + "GOOGLE_APPLICATION_CREDENTIALS", + ] + + missing_vars = [var for var in required_vars if not os.environ.get(var)] + if missing_vars: + raise RuntimeError( + f"Missing required environment variables: {', '.join(missing_vars)}\n" + f"Please check your .env file." + ) + + signal.signal(signal.SIGINT, handle_sigint) + + asyncio.run(complex_ui_navigation()) + + except Exception as e: + logger.error(f"Error running automation: {e}") + traceback.print_exc() + +if __name__ == "__main__": + main() +``` + + + + + + + + +### Run Your Script + +Execute your complex UI navigation automation: + +```bash +python gemini_ui_navigation.py +``` + +The agent will: + +1. Navigate to GitHub and locate specific UI elements +2. Distinguish between similar elements (e.g., global search vs. issues search) +3. Perform multi-step interactions with visual feedback +4. Use Gemini 3's advanced reasoning for precise element grounding + +Monitor the output to see the agent's progress through each task. + + + + + +--- + +## Understanding Gemini 3-Specific Parameters + +### `thinking_level` + +Controls the amount of internal reasoning the model performs: + +- `"high"`: Deeper reasoning, better for complex UI navigation (recommended for ScreenSpot-like tasks) +- `"low"`: Faster responses, suitable for simpler tasks + +### `media_resolution` + +Controls vision processing for multimodal inputs: + +- `"high"`: Best for complex UIs with many small elements (recommended) +- `"medium"`: Balanced quality and speed +- `"low"`: Faster processing for simple interfaces + + + For tasks requiring precise GUI element location (like ScreenSpot-Pro), use + `thinking_level="high"` and `media_resolution="high"` for optimal performance. + + +--- + +## Benchmark Performance + +Gemini 3 Pro's performance on ScreenSpot-Pro demonstrates its exceptional UI grounding capabilities: + +| Model | ScreenSpot-Pro Score | +| ----------------- | -------------------- | +| **Gemini 3 Pro** | **72.7%** | +| Claude Sonnet 4.5 | 36.2% | +| Gemini 2.5 Pro | 11.4% | +| GPT-5.1 | 3.5% | + +This makes Gemini 3 the ideal choice for complex UI navigation, element detection, and professional GUI automation tasks. + +--- + +## Troubleshooting + +### Authentication Issues + +If you encounter authentication errors: + +1. Verify your service account JSON key path is correct +2. 
Ensure the service account has the **Vertex AI User** role +3. Check that the Vertex AI API is enabled in your project +4. Confirm your `GOOGLE_CLOUD_PROJECT` matches your actual project ID + +### "Vertex AI API not enabled" Error + +Run this command to enable the API: + +```bash +gcloud services enable aiplatform.googleapis.com --project=YOUR_PROJECT_ID +``` + +### Billing Issues + +Ensure billing is enabled for your Google Cloud project. Visit the [Billing section](https://console.cloud.google.com/billing) to verify. + +--- + +## Next Steps + +- Learn more about [OmniParser agent loops](/agent-sdk/agent-loops) +- Explore [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) +- Read about [ScreenSpot-Pro benchmark](https://github.com/likaixin2000/ScreenSpot-Pro-GUI-Grounding) +- Check out [Google's Gemini 3 announcement](https://blog.google/products/gemini/gemini-3/) +- Join our [Discord community](https://discord.com/invite/mVnXXpdE85) for help diff --git a/docs/content/docs/example-usecases/meta.json b/docs/content/docs/example-usecases/meta.json index bfc88f1c..ca970219 100644 --- a/docs/content/docs/example-usecases/meta.json +++ b/docs/content/docs/example-usecases/meta.json @@ -1,5 +1,10 @@ { "title": "Cookbook", "description": "Real-world examples of building with Cua", - "pages": ["windows-app-behind-vpn", "form-filling", "post-event-contact-export"] + "pages": [ + "windows-app-behind-vpn", + "form-filling", + "post-event-contact-export", + "gemini-complex-ui-navigation" + ] } diff --git a/libs/python/agent/agent/loops/omniparser.py b/libs/python/agent/agent/loops/omniparser.py index e15dfc5b..f671dce2 100644 --- a/libs/python/agent/agent/loops/omniparser.py +++ b/libs/python/agent/agent/loops/omniparser.py @@ -365,6 +365,22 @@ class OmniparserConfig(AsyncAgentConfig): **kwargs, } + # Add Vertex AI specific parameters if using vertex_ai models + if llm_model.startswith("vertex_ai/"): + import os + + # Pass vertex_project and vertex_location to liteLLM + if "vertex_project" not in api_kwargs: + api_kwargs["vertex_project"] = os.getenv("GOOGLE_CLOUD_PROJECT") + if "vertex_location" not in api_kwargs: + api_kwargs["vertex_location"] = "global" + + # Pass through Gemini 3-specific parameters if provided + if "thinking_level" in kwargs: + api_kwargs["thinking_level"] = kwargs["thinking_level"] + if "media_resolution" in kwargs: + api_kwargs["media_resolution"] = kwargs["media_resolution"] + # Call API start hook if _on_api_start: await _on_api_start(api_kwargs) From c9751302ddda66cf99fcd98d526a0a0eeb407cb7 Mon Sep 17 00:00:00 2001 From: Sarina Li Date: Wed, 19 Nov 2025 17:37:06 -0500 Subject: [PATCH 02/15] add gif for demo --- blog/cua-vlm-router.md | 13 ++-- blog/introducing-cua-cli.md | 10 +++ docs/content/docs/agent-sdk/agent-loops.mdx | 6 +- .../agent-sdk/customizing-computeragent.mdx | 9 ++- .../docs/agent-sdk/integrations/hud.mdx | 6 +- .../agent-sdk/integrations/observability.mdx | 6 +- .../cua-vlm-router.mdx | 57 +++++++++------- .../supported-model-providers/index.mdx | 1 + docs/content/docs/agent-sdk/telemetry.mdx | 1 + .../content/docs/computer-sdk/computer-ui.mdx | 3 +- .../docs/computer-sdk/sandboxed-python.mdx | 9 ++- .../docs/example-usecases/form-filling.mdx | 1 + .../gemini-complex-ui-navigation.mdx | 6 ++ .../post-event-contact-export.mdx | 1 + .../windows-app-behind-vpn.mdx | 20 +++++- docs/content/docs/index.mdx | 3 +- .../docs/libraries/computer-server/index.mdx | 9 ++- .../docs/libraries/cua-cli/commands.mdx | 26 +++++++- 
docs/content/docs/libraries/cua-cli/index.mdx | 3 + .../docs/libraries/cua-cli/installation.mdx | 61 +++++------------- .../mcp-server/client-integrations.mdx | 9 ++- .../libraries/mcp-server/configuration.mdx | 16 +++-- .../libraries/mcp-server/installation.mdx | 11 +++- .../docs/libraries/mcp-server/tools.mdx | 3 + .../docs/libraries/mcp-server/usage.mdx | 36 ++++++++--- docs/content/docs/libraries/som/index.mdx | 6 +- docs/public/img/grounding-with-gemini3.gif | Bin 0 -> 5461762 bytes examples/agent_examples.py | 4 ++ 28 files changed, 233 insertions(+), 103 deletions(-) create mode 100644 docs/public/img/grounding-with-gemini3.gif diff --git a/blog/cua-vlm-router.md b/blog/cua-vlm-router.md index 44be5585..b796e810 100644 --- a/blog/cua-vlm-router.md +++ b/blog/cua-vlm-router.md @@ -4,7 +4,6 @@ If you've been building computer-use agents, you know the reality: every model p Today we're launching the **Cua VLM Router**: a managed inference API that gives you unified access to multiple vision-language model providers through a single API key. We're starting with Anthropic's Claude models (Sonnet 4.5 and Haiku 4.5)β€”some of the most loved and widely-used computer-use models in the Cua ecosystem - with more providers coming soon. - ![Cua VLM Router Banner](https://github.com/user-attachments/assets/1b978f62-2cae-4cf7-932a-55ac8c8f2e06) ## What You Get @@ -12,21 +11,25 @@ Today we're launching the **Cua VLM Router**: a managed inference API that gives The Cua VLM Router handles the infrastructure so you can focus on building: **Single API Key** + - One key for all model providers (no juggling multiple credentials) - Works for both model inference and sandbox access - Manage everything from one dashboard at cua.ai **Smart Routing** + - Automatic provider selection for optimal availability and performance - For Anthropic models, we route to the best provider (Anthropic, AWS Bedrock, or Microsoft Foundry) - No configuration neededβ€”just specify the model and we handle the rest **Cost Tracking & Optimization** + - Unified usage dashboard across all models - Real-time credit balance tracking - Detailed cost breakdown per request (gateway cost + upstream cost) **Production-Ready** + - OpenAI-compatible API (drop-in replacement for existing code) - Full streaming support with Server-Sent Events - Metadata about routing decisions in every response @@ -35,10 +38,10 @@ The Cua VLM Router handles the infrastructure so you can focus on building: We're starting with Anthropic's latest Claude models: -| Model | Best For | -|-------|----------| +| Model | Best For | +| --------------------------------- | ---------------------------------- | | `cua/anthropic/claude-sonnet-4.5` | General-purpose tasks, recommended | -| `cua/anthropic/claude-haiku-4.5` | Fast responses, cost-effective | +| `cua/anthropic/claude-haiku-4.5` | Fast responses, cost-effective | ## How It Works @@ -85,12 +88,14 @@ async for result in agent.run(messages): Already using Anthropic directly? Just add the `cua/` prefix: **Before:** + ```python export ANTHROPIC_API_KEY="sk-ant-..." agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929") ``` **After:** + ```python export CUA_API_KEY="sk_cua-api01_..." 
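# Only the model string changes (add the "cua/" prefix); the rest of the agent code stays the same.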
agent = ComputerAgent(model="cua/anthropic/claude-sonnet-4.5") diff --git a/blog/introducing-cua-cli.md b/blog/introducing-cua-cli.md index 85f88b7b..7c3742e6 100644 --- a/blog/introducing-cua-cli.md +++ b/blog/introducing-cua-cli.md @@ -11,11 +11,13 @@ Today we're launching the **Cua CLI**: a command-line interface that brings the The Cua CLI handles everything you need to work with Cloud Sandboxes: **Authentication** + - Browser-based OAuth login with automatic credential storage - Direct API key support for CI/CD pipelines - Export credentials to `.env` files for SDK integration **Sandbox Management** + - Create sandboxes with your choice of OS, size, and region - List all your sandboxes with status and connection details - Start, stop, restart, and delete sandboxes @@ -123,17 +125,20 @@ await computer.run() Create sandboxes in the size and region that fits your needs: **Sizes:** + - `small` - 2 cores, 8 GB RAM, 128 GB SSD - `medium` - 4 cores, 16 GB RAM, 128 GB SSD - `large` - 8 cores, 32 GB RAM, 256 GB SSD **Regions:** + - `north-america` - `europe` - `asia-pacific` - `south-america` **OS Options:** + - `linux` - Ubuntu with XFCE desktop - `windows` - Windows 11 with Edge and Python - `macos` - macOS (preview access) @@ -141,6 +146,7 @@ Create sandboxes in the size and region that fits your needs: ## Example Workflows **Quick Testing Environment** + ```bash # Spin up a sandbox, test something, tear it down cua sb create --os linux --size small --region north-america @@ -149,6 +155,7 @@ cua sb delete my-sandbox-abc123 ``` **Persistent Development Sandbox** + ```bash # Create a sandbox for long-term use cua sb create --os linux --size medium --region north-america @@ -221,11 +228,13 @@ Yes. The CLI and dashboard share the same API. Any sandbox you create in the das How do I update the CLI? If you installed via script: + ```bash curl -LsSf https://cua.ai/cli/install.sh | sh ``` If you installed via npm: + ```bash npm install -g @trycua/cli@latest ``` @@ -235,6 +244,7 @@ npm install -g @trycua/cli@latest ## What's Next We're actively iterating based on feedback. Planned features include: + - SSH key management for secure sandbox access - Template-based sandbox creation - Batch operations (start/stop multiple sandboxes) diff --git a/docs/content/docs/agent-sdk/agent-loops.mdx b/docs/content/docs/agent-sdk/agent-loops.mdx index 49d7e897..2885a5c5 100644 --- a/docs/content/docs/agent-sdk/agent-loops.mdx +++ b/docs/content/docs/agent-sdk/agent-loops.mdx @@ -4,7 +4,11 @@ description: Supported computer-using agent loops and models --- - A corresponding Jupyter Notebook is available for this documentation. + A corresponding{' '} + + Jupyter Notebook + {' '} + is available for this documentation. An agent can be thought of as a loop - it generates actions, executes them, and repeats until done: diff --git a/docs/content/docs/agent-sdk/customizing-computeragent.mdx b/docs/content/docs/agent-sdk/customizing-computeragent.mdx index a89b9269..158495e0 100644 --- a/docs/content/docs/agent-sdk/customizing-computeragent.mdx +++ b/docs/content/docs/agent-sdk/customizing-computeragent.mdx @@ -3,7 +3,14 @@ title: Customize ComputerAgent --- - A corresponding Jupyter Notebook is available for this documentation. + A corresponding{' '} + + Jupyter Notebook + {' '} + is available for this documentation. The `ComputerAgent` interface provides an easy proxy to any computer-using model configuration, and it is a powerful framework for extending and building your own agentic systems. 
diff --git a/docs/content/docs/agent-sdk/integrations/hud.mdx b/docs/content/docs/agent-sdk/integrations/hud.mdx index 9575ebf6..7bfcbdea 100644 --- a/docs/content/docs/agent-sdk/integrations/hud.mdx +++ b/docs/content/docs/agent-sdk/integrations/hud.mdx @@ -4,7 +4,11 @@ description: Use ComputerAgent with HUD for benchmarking and evaluation --- - A corresponding Jupyter Notebook is available for this documentation. + A corresponding{' '} + + Jupyter Notebook + {' '} + is available for this documentation. The HUD integration allows an agent to be benchmarked using the [HUD framework](https://www.hud.so/). Through the HUD integration, the agent controls a computer inside HUD, where tests are run to evaluate the success of each task. diff --git a/docs/content/docs/agent-sdk/integrations/observability.mdx b/docs/content/docs/agent-sdk/integrations/observability.mdx index 3b1a316e..44db0ea4 100644 --- a/docs/content/docs/agent-sdk/integrations/observability.mdx +++ b/docs/content/docs/agent-sdk/integrations/observability.mdx @@ -59,4 +59,8 @@ you will see all the agent execution steps, including computer actions, LLM call For each step, you will see the LLM call, the computer action. The computer actions are highlighted in the timeline in yellow. -Example trace in Laminar showing the litellm.response span and its output. \ No newline at end of file +Example trace in Laminar showing the litellm.response span and its output. diff --git a/docs/content/docs/agent-sdk/supported-model-providers/cua-vlm-router.mdx b/docs/content/docs/agent-sdk/supported-model-providers/cua-vlm-router.mdx index a0ce1340..4b99e43f 100644 --- a/docs/content/docs/agent-sdk/supported-model-providers/cua-vlm-router.mdx +++ b/docs/content/docs/agent-sdk/supported-model-providers/cua-vlm-router.mdx @@ -55,10 +55,10 @@ async for result in agent.run(messages): The CUA VLM Router currently supports these models: -| Model ID | Provider | Description | Best For | -|----------|----------|-------------|----------| +| Model ID | Provider | Description | Best For | +| --------------------------------- | --------- | ----------------- | ---------------------------------- | | `cua/anthropic/claude-sonnet-4.5` | Anthropic | Claude Sonnet 4.5 | General-purpose tasks, recommended | -| `cua/anthropic/claude-haiku-4.5` | Anthropic | Claude Haiku 4.5 | Fast responses, cost-effective | +| `cua/anthropic/claude-haiku-4.5` | Anthropic | Claude Haiku 4.5 | Fast responses, cost-effective | ## How It Works @@ -95,6 +95,7 @@ GET /v1/models ``` **Response:** + ```json { "data": [ @@ -117,12 +118,11 @@ Content-Type: application/json ``` **Request:** + ```json { "model": "anthropic/claude-sonnet-4.5", - "messages": [ - {"role": "user", "content": "Hello!"} - ], + "messages": [{ "role": "user", "content": "Hello!" }], "max_tokens": 100, "temperature": 0.7, "stream": false @@ -130,20 +130,23 @@ Content-Type: application/json ``` **Response:** + ```json { "id": "gen_...", "object": "chat.completion", "created": 1763554838, "model": "anthropic/claude-sonnet-4.5", - "choices": [{ - "index": 0, - "message": { - "role": "assistant", - "content": "Hello! How can I help you today?" - }, - "finish_reason": "stop" - }], + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "Hello! How can I help you today?" 
+ }, + "finish_reason": "stop" + } + ], "usage": { "prompt_tokens": 10, "completion_tokens": 12, @@ -170,6 +173,7 @@ curl -X POST https://inference.cua.ai/v1/chat/completions \ ``` **Response (SSE format):** + ``` data: {"id":"gen_...","choices":[{"delta":{"content":"1"}}],"object":"chat.completion.chunk"} @@ -187,6 +191,7 @@ GET /v1/balance ``` **Response:** + ```json { "balance": 211689.85, @@ -201,6 +206,7 @@ CUA VLM Router provides detailed cost information in every response: ### Credit System Requests are billed in **credits**: + - Credits are deducted from your CUA account balance - Prices vary by model and usage - CUA manages all provider API keys and infrastructure @@ -210,8 +216,8 @@ Requests are billed in **credits**: ```json { "usage": { - "cost": 0.01, // CUA gateway cost in credits - "market_cost": 0.000065 // Actual upstream API cost + "cost": 0.01, // CUA gateway cost in credits + "market_cost": 0.000065 // Actual upstream API cost } } ``` @@ -251,19 +257,20 @@ agent = ComputerAgent( ## Benefits Over Direct Provider Access -| Feature | CUA VLM Router | Direct Provider (BYOK) | -|---------|---------------|------------------------| -| **Single API Key** | βœ… One key for all providers | ❌ Multiple keys to manage | -| **Managed Infrastructure** | βœ… No API key management | ❌ Manage multiple provider keys | -| **Usage Tracking** | βœ… Unified dashboard | ❌ Per-provider tracking | -| **Model Switching** | βœ… Change model string only | ❌ Change code + keys | -| **Setup Complexity** | βœ… One environment variable | ❌ Multiple environment variables | +| Feature | CUA VLM Router | Direct Provider (BYOK) | +| -------------------------- | ---------------------------- | --------------------------------- | +| **Single API Key** | βœ… One key for all providers | ❌ Multiple keys to manage | +| **Managed Infrastructure** | βœ… No API key management | ❌ Manage multiple provider keys | +| **Usage Tracking** | βœ… Unified dashboard | ❌ Per-provider tracking | +| **Model Switching** | βœ… Change model string only | ❌ Change code + keys | +| **Setup Complexity** | βœ… One environment variable | ❌ Multiple environment variables | ## Error Handling ### Common Error Responses #### Invalid API Key + ```json { "detail": "Insufficient credits. 
Current balance: 0.00 credits" @@ -271,6 +278,7 @@ agent = ComputerAgent( ``` #### Missing Authorization + ```json { "detail": "Missing Authorization: Bearer token" @@ -278,6 +286,7 @@ agent = ComputerAgent( ``` #### Invalid Model + ```json { "detail": "Invalid or unavailable model" @@ -343,6 +352,7 @@ agent = ComputerAgent( Switching from direct provider access (BYOK) to CUA VLM Router is simple: **Before (Direct Provider Access with BYOK):** + ```python import os # Required: Provider-specific API key @@ -355,6 +365,7 @@ agent = ComputerAgent( ``` **After (CUA VLM Router - Cloud Service):** + ```python import os # Required: CUA API key only (no provider keys needed) diff --git a/docs/content/docs/agent-sdk/supported-model-providers/index.mdx b/docs/content/docs/agent-sdk/supported-model-providers/index.mdx index ed06fd7f..97587e5c 100644 --- a/docs/content/docs/agent-sdk/supported-model-providers/index.mdx +++ b/docs/content/docs/agent-sdk/supported-model-providers/index.mdx @@ -14,6 +14,7 @@ model="cua/anthropic/claude-haiku-4.5" # Claude Haiku 4.5 (faster) ``` **Benefits:** + - Single API key for multiple providers - Cost tracking and optimization - Fully managed infrastructure (no provider keys to manage) diff --git a/docs/content/docs/agent-sdk/telemetry.mdx b/docs/content/docs/agent-sdk/telemetry.mdx index a045f351..d1dfb60f 100644 --- a/docs/content/docs/agent-sdk/telemetry.mdx +++ b/docs/content/docs/agent-sdk/telemetry.mdx @@ -19,6 +19,7 @@ Cua collects anonymized usage and error statistics. We follow [Posthog's ethical ### Disabled by default (opt-in) **Trajectory logging** captures full conversation history: + - User messages and agent responses - Computer actions and outputs - Agent reasoning traces diff --git a/docs/content/docs/computer-sdk/computer-ui.mdx b/docs/content/docs/computer-sdk/computer-ui.mdx index 9739398b..a51ef60d 100644 --- a/docs/content/docs/computer-sdk/computer-ui.mdx +++ b/docs/content/docs/computer-sdk/computer-ui.mdx @@ -3,7 +3,8 @@ title: Computer UI (Deprecated) --- - The Computer UI is deprecated and will be replaced with a revamped playground experience soon. We recommend using VNC or Screen Sharing for precise control of the computer instead. + The Computer UI is deprecated and will be replaced with a revamped playground experience soon. We + recommend using VNC or Screen Sharing for precise control of the computer instead. The computer module includes a Gradio UI for creating and sharing demonstration data. We make it easy for people to build community datasets for better computer use models with an upload to Huggingface feature. diff --git a/docs/content/docs/computer-sdk/sandboxed-python.mdx b/docs/content/docs/computer-sdk/sandboxed-python.mdx index e66ad34c..bb1c1e9c 100644 --- a/docs/content/docs/computer-sdk/sandboxed-python.mdx +++ b/docs/content/docs/computer-sdk/sandboxed-python.mdx @@ -4,7 +4,14 @@ slug: sandboxed-python --- - A corresponding Python example is available for this documentation. + A corresponding{' '} + + Python example + {' '} + is available for this documentation. You can run Python functions securely inside a sandboxed virtual environment on a remote Cua Computer. This is useful for executing untrusted user code, isolating dependencies, or providing a safe environment for automation tasks. 
diff --git a/docs/content/docs/example-usecases/form-filling.mdx b/docs/content/docs/example-usecases/form-filling.mdx index 817d0dd4..fd365a0f 100644 --- a/docs/content/docs/example-usecases/form-filling.mdx +++ b/docs/content/docs/example-usecases/form-filling.mdx @@ -473,6 +473,7 @@ python form_filling.py ``` The agent will: + 1. Download the PDF resume from Overleaf 2. Extract information from the PDF 3. Fill out the JotForm with the extracted information diff --git a/docs/content/docs/example-usecases/gemini-complex-ui-navigation.mdx b/docs/content/docs/example-usecases/gemini-complex-ui-navigation.mdx index 619576e8..646b4060 100644 --- a/docs/content/docs/example-usecases/gemini-complex-ui-navigation.mdx +++ b/docs/content/docs/example-usecases/gemini-complex-ui-navigation.mdx @@ -11,6 +11,12 @@ import { Callout } from 'fumadocs-ui/components/callout'; This example demonstrates how to use Google's Gemini 3 models with OmniParser for complex GUI grounding tasks. Gemini 3 Pro achieves exceptional performance on the [ScreenSpot-Pro benchmark](https://github.com/likaixin2000/ScreenSpot-Pro-GUI-Grounding) with a **72.7% accuracy** (compared to Claude Sonnet 4.5's 36.2%), making it ideal for precise UI element location and complex navigation tasks. +Demo of Gemini 3 with OmniParser performing complex GUI navigation tasks + According to [Google's Gemini 3 announcement](https://blog.google/products/gemini/gemini-3/), Gemini 3 Pro achieves: - **72.7%** on ScreenSpot-Pro (vs. Gemini 2.5 Pro's 11.4%) - diff --git a/docs/content/docs/example-usecases/post-event-contact-export.mdx b/docs/content/docs/example-usecases/post-event-contact-export.mdx index fc6685d7..8324f5cd 100644 --- a/docs/content/docs/example-usecases/post-event-contact-export.mdx +++ b/docs/content/docs/example-usecases/post-event-contact-export.mdx @@ -441,6 +441,7 @@ python contact_export.py ``` The agent will: + 1. Navigate to your LinkedIn connections page 2. Extract data from 20 contacts (first name, last name, role, company, LinkedIn URL) 3. Save contacts to a timestamped CSV file diff --git a/docs/content/docs/example-usecases/windows-app-behind-vpn.mdx b/docs/content/docs/example-usecases/windows-app-behind-vpn.mdx index 82411cc3..3e910987 100644 --- a/docs/content/docs/example-usecases/windows-app-behind-vpn.mdx +++ b/docs/content/docs/example-usecases/windows-app-behind-vpn.mdx @@ -11,19 +11,23 @@ import { Tab, Tabs } from 'fumadocs-ui/components/tabs'; This guide demonstrates how to automate Windows desktop applications (like eGecko HR/payroll systems) that run behind corporate VPN. This is a common enterprise scenario where legacy desktop applications require manual data entry, report generation, or workflow execution. **Use cases:** + - HR/payroll processing (employee onboarding, payroll runs, benefits administration) - Desktop ERP systems behind corporate networks - Legacy financial applications requiring VPN access - Compliance reporting from on-premise systems **Architecture:** + - Client-side Cua agent (Python SDK or Playground UI) - Windows VM/Sandbox with VPN client configured - RDP/remote desktop connection to target environment - Desktop application automation via computer vision and UI control - **Production Deployment**: For production use, consider workflow mining and custom finetuning to create vertical-specific actions (e.g., "Run payroll", "Onboard employee") instead of generic UI automation. This provides better audit trails and higher success rates. 
+ **Production Deployment**: For production use, consider workflow mining and custom finetuning to + create vertical-specific actions (e.g., "Run payroll", "Onboard employee") instead of generic UI + automation. This provides better audit trails and higher success rates. --- @@ -31,7 +35,11 @@ This guide demonstrates how to automate Windows desktop applications (like eGeck ## Video Demo
-