mirror of https://github.com/trycua/computer.git
synced 2026-01-04 04:19:57 -06:00

Merge branch 'main' into feat/generic-vlm-provider
@@ -242,7 +242,7 @@ agent = ComputerAgent(model="huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-5")

agent = ComputerAgent(model="omniparser+openai/gpt-4o")

# Combine state-of-the-art grounding with powerful reasoning
-agent = ComputerAgent(model="huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-3-5-sonnet-20241022")
+agent = ComputerAgent(model="huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929")

# Combine two different vision models for enhanced capabilities
agent = ComputerAgent(model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B+openai/gpt-4o")

@@ -25,7 +25,7 @@ desktop = computer.create_desktop_from_apps(["Safari", "Notes"])

# Your agent can now only see and interact with these apps
agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[desktop]
)
```

@@ -94,7 +94,7 @@ async def main():

    # Initialize an agent
    agent = ComputerAgent(
-        model="anthropic/claude-3-5-sonnet-20241022",
+        model="anthropic/claude-sonnet-4-5-20250929",
        tools=[desktop]
    )

@@ -160,7 +160,7 @@ async def automate_iphone():

    # Initialize an agent for iPhone automation
    agent = ComputerAgent(
-        model="anthropic/claude-3-5-sonnet-20241022",
+        model="anthropic/claude-sonnet-4-5-20250929",
        tools=[my_iphone]
    )

@@ -145,9 +145,9 @@ While the core concept remains the same across all agent loops, different AI mod

| Agent Loop | Supported Models | Description | Set-Of-Marks |
|:-----------|:-----------------|:------------|:-------------|
| `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA Preview model | Not Required |
-| `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use Beta Tools | Not Required |
+| `AgentLoop.ANTHROPIC` | • `claude-sonnet-4-5-20250929`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use Beta Tools | Not Required |
| `AgentLoop.UITARS` | • `ByteDance-Seed/UI-TARS-1.5-7B` | Uses ByteDance's UI-TARS 1.5 model | Not Required |
-| `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
+| `AgentLoop.OMNI` | • `claude-sonnet-4-5-20250929`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |

Each loop handles the same basic pattern we implemented manually in Part 1:

@@ -191,7 +191,7 @@ The performance of different Computer-Use models varies significantly across tas

- **AgentLoop.OPENAI**: Choose when you have OpenAI Tier 3 access and need the most capable computer-use agent for web-based tasks. Uses the same [OpenAI Computer-Use Loop](https://platform.openai.com/docs/guides/tools-computer-use) as Part 1, delivering strong performance on browser-based benchmarks.

-- **AgentLoop.ANTHROPIC**: Ideal for users with Anthropic API access who need strong reasoning capabilities with computer-use abilities. Works with `claude-3-5-sonnet-20240620` and `claude-3-7-sonnet-20250219` models following [Anthropic's Computer-Use tools](https://docs.anthropic.com/en/docs/agents-and-tools/computer-use#understanding-the-multi-agent-loop).
+- **AgentLoop.ANTHROPIC**: Ideal for users with Anthropic API access who need strong reasoning capabilities with computer-use abilities. Works with `claude-sonnet-4-5-20250929` and `claude-3-7-sonnet-20250219` models following [Anthropic's Computer-Use tools](https://docs.anthropic.com/en/docs/agents-and-tools/computer-use#understanding-the-multi-agent-loop).

- **AgentLoop.UITARS**: Best for scenarios requiring more powerful OS/desktop automation and latency-sensitive workloads, as UI-TARS-1.5 leads in OS capabilities benchmarks. Requires running the model locally or accessing it through compatible endpoints (e.g. on Hugging Face).

@@ -14,12 +14,12 @@ This is the kind of problem that makes you wonder if we're building the future o

Agent framework 0.4 solves this by doing something radical: making all these different models speak the same language.

-Instead of writing separate code for each model's peculiarities, you now just pick a model with a string like `"anthropic/claude-3-5-sonnet-20241022"` or `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"`, and everything else Just Works™. Behind the scenes, we handle all the coordinate normalization, token parsing, and image preprocessing so you don't have to.
+Instead of writing separate code for each model's peculiarities, you now just pick a model with a string like `"anthropic/claude-sonnet-4-5-20250929"` or `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"`, and everything else Just Works™. Behind the scenes, we handle all the coordinate normalization, token parsing, and image preprocessing so you don't have to.

```python
# This works the same whether you're using Anthropic, OpenAI, or that new model you found on Hugging Face
agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022", # or any other supported model
+    model="anthropic/claude-sonnet-4-5-20250929", # or any other supported model
    tools=[computer]
)
```
@@ -8,13 +8,13 @@ Growing a developer-focused product is hard. Traditional marketing doesn't work.

So we tried something different at Google DevFest Toronto: show up with backpacks full of cute cua-la keychains and see what happens.

-This is the story of how two new hires—a growth engineer and a designer/artist—guerrilla marketed their way through a major tech conference with $200 worth of merch and a post-event automation pipeline.
+This is the story of how two new hires, a growth engineer and a designer/artist, guerrilla marketed their way through a major tech conference with $200 worth of merch and a post-event automation pipeline.

## Meet the Team

**Sarina** (Growth Engineering): Built the post-event automation pipeline that extracts LinkedIn connections and generates personalized messages while you sleep.

-**Esther** (Design + Art): Hand-crafted every piece of artwork, giving life to CUA through illustrations, branding, and yes, extremely cute cua-la keychains.
+**Esther** (Design + Art): Hand-crafted every piece of artwork, giving life to Cua through illustrations, branding, and yes, extremely cute cua-la keychains.

The thesis: what if we could draw people in with irresistible physical merch, then use computer use agents to handle all the tedious follow-up work?
@@ -24,11 +24,9 @@ The thesis: what if we could draw people in with irresistible physical merch, th

Google DevFest Toronto brought together hundreds of developers and AI enthusiasts. We didn't have a booth. We didn't have demos. We showed up with backpacks full of cua-la keychains with the cua.ai logo and started handing them out.

-That's it. Pure guerrilla marketing.
+That's it. Pure guerrilla marketing, and the cua-las were absurdly effective.

-The cua-las were absurdly effective.
-
-People would literally crowd around us—not because they were interested in computer use (at first), but because they wanted a cua-la. We'd pitch CUA while handing out keychains, and suddenly we had an engaged audience. No booth required.
+People would literally crowd around us, not because they were interested in computer use (at first), but because they wanted a cua-la. We'd pitch Cua while handing out keychains, and suddenly we had an engaged audience!

<img src="./assets/devfest-image.JPG" alt="DevFest crowd">
@@ -36,13 +34,13 @@ People would literally crowd around us—not because they were interested in com

A few people stuck the cua-las on their bags immediately. Then, throughout the event, we started getting approached:

-"Wait, are you the CUA girls?"
+"Wait, are you the Cua girls?"

-They'd seen the cua-las on someone's bag, asked about it, and tracked us down. The keychains became walking advertisements.
+They'd seen the cua-las on someone's bag, asked about it, and tracked us down! The keychains became walking advertisements.

<img src="./assets/htn-at-devfest.JPG" alt="Hack the North recognition at DevFest">

-Even better: two attendees recognized CUA from Hack the North. Our previous event marketing was actually working. People remembered us.
+Even better: two attendees recognized Cua from Hack the North. Our previous event marketing was actually working. People remembered us.

## Part 2: The Automation (Try It Yourself)
@@ -64,9 +62,9 @@ Sarina had a better idea: build the automation we wish existed, then open source

LinkedIn scraping automation in action
</video>

-The agent navigates LinkedIn like a human would—click profile, extract info, navigate back, repeat. But it does it overnight while you sleep.
+The agent navigates LinkedIn like a human would: click profile, extract info, navigate back, repeat. But it does it overnight while you sleep.

-The secret sauce: **VM session persistence**. By logging into LinkedIn once through CUA's VM, the session stays alive. No captchas, no bot detection, just smooth automation.
+The secret sauce: **VM session persistence**. By logging into LinkedIn once through Cua's VM, the session stays alive. No captchas, no bot detection, just smooth automation.
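For the curious, that loop is only a few lines with the Agent SDK. The sketch below is illustrative only: the prompt, CSV name, and connection list are invented for this example and are not the actual DevFest pipeline.

```python
# Illustrative sketch -- the prompt, CSV name, and names below are
# invented for this example; this is not the real DevFest pipeline.
import asyncio

from agent import ComputerAgent
from computer import Computer


async def follow_up(connections: list[str]) -> None:
    async with Computer() as computer:
        agent = ComputerAgent(
            model="anthropic/claude-sonnet-4-5-20250929",
            tools=[computer],
        )
        for name in connections:
            # One run per connection: open the profile, extract info,
            # record it, then move on (the "click, extract, repeat" loop).
            async for _ in agent.run(
                f"Open the LinkedIn profile of {name}, copy their headline, "
                "and append a row with their name and headline to connections.csv"
            ):
                pass


asyncio.run(follow_up(["Jane Doe", "John Smith"]))
```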
<video controls width="100%">
  <source src="./assets/adding-row-csv.mp4" type="video/mp4">
@@ -98,7 +96,7 @@ Then use that data to craft personalized messages. Sarina wrote unique follow-up

- People crowding around us for cua-las
- Walking advertisements on bags throughout the event
-- Instant brand recognition ("Are you the CUA girls?")
+- Instant brand recognition ("Are you the Cua girls?")
- Two people who remembered us from Hack the North
- 20+ quality connections extracted and messaged within 24 hours
- Several demo requests from personalized follow-ups

@@ -119,11 +117,11 @@ We ran out faster than expected! Next time: bigger bag, or limit to one per pers

The VM login step added friction. "Log in manually first, then run the script" confused some people who wanted to try it themselves. Need better first-run UX.

**Message Personalization**
-While the extraction was automated, Sarina still wrote each follow-up message manually. The automation saved the data collection part, but not the creative writing part. (Though this probably led to better messages.)
+While the extraction was automated, I still wrote each follow-up message manually. We're looking for ways to better enrich messages with context from the event, which is hard to automate.

## What's Next: NeurIPS 2025

-NeurIPS is the biggest AI conference of the year. Thousands of researchers, hundreds of companies, and endless networking opportunities.
+NeurIPS is the biggest AI conference of the year. Thousands of researchers, hundreds of companies.

**The good news**: We still have one giant bag of cua-las left. They're already packed and ready.
@@ -135,11 +133,11 @@ The cua-las get people interested. The automation ensures we actually follow thr

Most event marketing fails at the follow-up stage. You collect business cards, connect on LinkedIn, and then... nothing. The moment passes. People forget.

-With CUA handling the mechanical work (data organization, connection tracking, follow-up scheduling), we can focus on the human part: genuine conversations, valuable introductions, and actually helping people.
+With Cua handling the mechanical work (data organization, connection tracking, follow-up scheduling), we can focus on the human part: genuine conversations, valuable introductions, and actually helping people.

## The Framework: Cute Merch + Smart Automation

-Traditional event marketing: show up, pitch, collect cards, never follow up.
+Traditional event marketing: show up, pitch, collect cards.

Our approach: combine two forces that shouldn't work together but do.

@@ -167,19 +165,8 @@ Most companies nail one or the other:

Do both, and you create a flywheel: each event builds brand recognition for the next, while automation ensures maximum value from every connection.

-## The Meta Lesson
-
-We built CUA to build CUA. Every automation we create for growth becomes:
-
-1. A real-world test of the product
-2. Documentation of what works (and what doesn't)
-3. An example for others to copy
-4. Marketing material that's actually useful
-
-Esther hand-draws artwork that makes people smile. Sarina builds automations that save time. Together, they're proving that developer tools can be both powerful and delightful.
-
-See you at NeurIPS 2025. We'll be the ones with the cua-las.
+See you at NeurIPS 2025!

---

-_Want to build your own growth hacking automations? Check out [CUA on GitHub](https://github.com/trycua/cua) or join our [Discord](https://discord.gg/cua) to share your experiments. cua-las not included (yet)._
+_Want to build your own growth hacking automations? Check out [Cua on GitHub](https://github.com/trycua/cua) or join our [Discord](https://discord.gg/cua) to share your experiments. cua-las not included (yet)._
blog/cua-playground-preview.md (new file, 86 lines)

@@ -0,0 +1,86 @@
# Cua Playground: Agents + Sandboxes in Your Browser

Building computer-use agents means constant iteration—writing code, deploying to a sandbox, testing behavior, debugging issues, then repeating the cycle. Every test requires switching between your code editor, terminal, and VNC viewer. Want to try a different prompt? Edit your code, redeploy, and wait for the agent to restart. It works, but it's slow.

Today we're launching the **Cua Playground**: a browser-based environment for testing computer-use agents without writing code. Send messages to your sandboxes, watch them execute in real-time, and iterate on prompts instantly—all from your dashboard at cua.ai.



**What's new with this release:**

- Instant testing—send messages to any running sandbox directly from your browser
- Real-time execution—watch your agent work with live tool call updates and screenshots
- Multi-model support—test with Claude Sonnet 4.5, Haiku 4.5, and more
- Persistent chat history—conversations save automatically to local storage

The Playground connects to your existing Cua sandboxes—the same ones you use with the Agent SDK. Select a running sandbox and a model, then start chatting. The agent uses computer-use tools (mouse, keyboard, bash, editor) to complete your tasks, and you see every action it takes.

## Getting Started Today

<div align="center">
  <video src="https://github.com/user-attachments/assets/9fef0f30-1024-4833-8b7a-6a2c02d8eb99" width="600" controls></video>
</div>

Sign up at [cua.ai/signin](https://cua.ai/signin) and grab your API key from the dashboard. Then navigate to the Playground:

1. Navigate to Dashboard > Playground
2. Select a sandbox from the dropdown (must be "running" status)
3. Choose a model (we recommend Claude Sonnet 4.5 to start)
4. Send a message: "Take a screenshot and describe what you see"
5. Watch the agent execute computer actions in real-time

Example use cases:

**Prompt Testing**

```
❌ "Check the website"
✅ "Navigate to example.com in Firefox and take a screenshot of the homepage"
```

**Model Comparison**
Run the same task with different models to compare quality, speed, and cost.

**Debugging Agent Behavior**

1. Send: "Find the login button and click it"
2. View tool calls to see each mouse movement
3. Check screenshots to verify the agent found the right element
4. Adjust your prompt based on what you observe

## FAQs

<details>
<summary><strong>Do I need to know how to code?</strong></summary>

No. The Playground is designed for testing agent behavior without writing code. However, for production deployments, you'll need to use the Agent SDK (Python/TypeScript).

</details>

<details>
<summary><strong>Does this replace the Agent SDK?</strong></summary>

No. The Playground is for rapid testing and experimentation. For production deployments, scheduled tasks, or complex workflows, use the Agent SDK.

</details>

<details>
<summary><strong>How much does it cost?</strong></summary>

Playground requests use the same credit system as Agent SDK requests. You're charged for model inference (varies by model) and sandbox runtime (billed per hour while running).

</details>

<details>
<summary><strong>Why is my sandbox not showing up?</strong></summary>

The sandbox must have `status = "running"` to appear in the dropdown. Check Dashboard > Sandboxes to verify status. If stopped, click "Start" and wait ~30 seconds for it to become available.

</details>

## Need help?

If you hit issues getting the Playground working, reach out in [Discord](https://discord.gg/cua-ai). We respond fast and fix based on what people actually use.

---

Get started at [cua.ai](https://cua.ai) or try the Playground at [cua.ai/dashboard/playground](https://cua.ai/dashboard/playground).
@@ -4,7 +4,6 @@ If you've been building computer-use agents, you know the reality: every model p

Today we're launching the **Cua VLM Router**: a managed inference API that gives you unified access to multiple vision-language model providers through a single API key. We're starting with Anthropic's Claude models (Sonnet 4.5 and Haiku 4.5), some of the most loved and widely used computer-use models in the Cua ecosystem, with more providers coming soon.



## What You Get

@@ -12,21 +11,25 @@ Today we're launching the **Cua VLM Router**: a managed inference API that gives

The Cua VLM Router handles the infrastructure so you can focus on building:

**Single API Key**

- One key for all model providers (no juggling multiple credentials)
- Works for both model inference and sandbox access
- Manage everything from one dashboard at cua.ai

**Smart Routing**

- Automatic provider selection for optimal availability and performance
- For Anthropic models, we route to the best provider (Anthropic, AWS Bedrock, or Microsoft Foundry)
- No configuration needed—just specify the model and we handle the rest

**Cost Tracking & Optimization**

- Unified usage dashboard across all models
- Real-time credit balance tracking
- Detailed cost breakdown per request (gateway cost + upstream cost)

**Production-Ready**

- OpenAI-compatible API (drop-in replacement for existing code)
- Full streaming support with Server-Sent Events
- Metadata about routing decisions in every response

@@ -35,10 +38,10 @@ The Cua VLM Router handles the infrastructure so you can focus on building:

We're starting with Anthropic's latest Claude models:

-| Model | Best For |
-|-------|----------|
+| Model                             | Best For                           |
+| --------------------------------- | ---------------------------------- |
| `cua/anthropic/claude-sonnet-4.5` | General-purpose tasks, recommended |
-| `cua/anthropic/claude-haiku-4.5` | Fast responses, cost-effective |
+| `cua/anthropic/claude-haiku-4.5`  | Fast responses, cost-effective     |

## How It Works
|
||||
@@ -48,36 +51,38 @@ When you request an Anthropic model through Cua, we automatically route to the b
|
||||
|
||||
Sign up at [cua.ai/signin](https://cua.ai/signin) and create your API key from **Dashboard > API Keys > New API Key** (save it immediately—you won't see it again).
|
||||
|
||||
Set your environment variable:
|
||||
|
||||
```bash
|
||||
export CUA_API_KEY="sk_cua-api01_..."
|
||||
```
|
||||
|
||||
Use it with the Agent SDK:
|
||||
Use it with the Agent SDK (make sure to set your environment variable):
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from agent import ComputerAgent
|
||||
from computer import Computer
|
||||
|
||||
computer = Computer(
|
||||
async def main():
|
||||
# Initialize cloud computer
|
||||
computer = Computer(
|
||||
os_type="linux",
|
||||
provider_type="cloud",
|
||||
name="your-sandbox-name"
|
||||
)
|
||||
name="your-container-name",
|
||||
api_key="your-cua-api-key"
|
||||
)
|
||||
|
||||
agent = ComputerAgent(
|
||||
model="cua/anthropic/claude-sonnet-4.5", # Cua-routed model
|
||||
# Initialize agent with Claude Sonnet 4.5
|
||||
agent = ComputerAgent(
|
||||
tools=[computer],
|
||||
max_trajectory_budget=5.0
|
||||
)
|
||||
model="cua/anthropic/claude-sonnet-4.5",
|
||||
api_key="your-cua-api-key",
|
||||
instructions="You are a helpful assistant that can control computers",
|
||||
only_n_most_recent_images=3
|
||||
)
|
||||
|
||||
messages = [{"role": "user", "content": "Take a screenshot and analyze what's on screen"}]
|
||||
# Run a task
|
||||
async for result in agent.run("Open a browser and search for Python tutorials"):
|
||||
print(result)
|
||||
|
||||
async for result in agent.run(messages):
|
||||
for item in result["output"]:
|
||||
if item["type"] == "message":
|
||||
print(item["content"][0]["text"])
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
## Migration is Simple
|
||||
@@ -85,12 +90,14 @@ async for result in agent.run(messages):
|
||||
Already using Anthropic directly? Just add the `cua/` prefix:
|
||||
|
||||
**Before:**
|
||||
|
||||
```python
|
||||
export ANTHROPIC_API_KEY="sk-ant-..."
|
||||
agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
|
||||
```
|
||||
|
||||
**After:**
|
||||
|
||||
```python
|
||||
export CUA_API_KEY="sk_cua-api01_..."
|
||||
agent = ComputerAgent(model="cua/anthropic/claude-sonnet-4.5")
|
||||
|
||||
@@ -58,7 +58,7 @@ await run_full_dataset(

# Or test on SheetBench (50 spreadsheet tasks)
await run_full_dataset(
    dataset="hud-evals/SheetBench-V2",
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-5-20250929",
    split="train[:2]"
)
```
@@ -11,11 +11,13 @@ Today we're launching the **Cua CLI**: a command-line interface that brings the

The Cua CLI handles everything you need to work with Cloud Sandboxes:

**Authentication**

- Browser-based OAuth login with automatic credential storage
- Direct API key support for CI/CD pipelines
- Export credentials to `.env` files for SDK integration

**Sandbox Management**

- Create sandboxes with your choice of OS, size, and region
- List all your sandboxes with status and connection details
- Start, stop, restart, and delete sandboxes

@@ -123,17 +125,20 @@ await computer.run()

Create sandboxes in the size and region that fits your needs:

**Sizes:**

- `small` - 2 cores, 8 GB RAM, 128 GB SSD
- `medium` - 4 cores, 16 GB RAM, 128 GB SSD
- `large` - 8 cores, 32 GB RAM, 256 GB SSD

**Regions:**

- `north-america`
- `europe`
- `asia-pacific`
- `south-america`

**OS Options:**

- `linux` - Ubuntu with XFCE desktop
- `windows` - Windows 11 with Edge and Python
- `macos` - macOS (preview access)

@@ -141,6 +146,7 @@ Create sandboxes in the size and region that fits your needs:

## Example Workflows

**Quick Testing Environment**

```bash
# Spin up a sandbox, test something, tear it down
cua sb create --os linux --size small --region north-america

@@ -149,6 +155,7 @@ cua sb delete my-sandbox-abc123

```

**Persistent Development Sandbox**

```bash
# Create a sandbox for long-term use
cua sb create --os linux --size medium --region north-america

@@ -221,11 +228,13 @@ Yes. The CLI and dashboard share the same API. Any sandbox you create in the das

<summary><strong>How do I update the CLI?</strong></summary>

If you installed via script:

```bash
curl -LsSf https://cua.ai/cli/install.sh | sh
```

If you installed via npm:

```bash
npm install -g @trycua/cli@latest
```

@@ -235,6 +244,7 @@ npm install -g @trycua/cli@latest

## What's Next

We're actively iterating based on feedback. Planned features include:

- SSH key management for secure sandbox access
- Template-based sandbox creation
- Batch operations (start/stop multiple sandboxes)
@@ -4,7 +4,11 @@ description: Supported computer-using agent loops and models

---

<Callout>
-  A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/agent_nb.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.
+  A corresponding{' '}
+  <a href="https://github.com/trycua/cua/blob/main/notebooks/agent_nb.ipynb" target="_blank">
+    Jupyter Notebook
+  </a>{' '}
+  is available for this documentation.
</Callout>

An agent can be thought of as a loop - it generates actions, executes them, and repeats until done:

@@ -30,7 +34,7 @@ async def take_screenshot():

) as computer:

    agent = ComputerAgent(
-        model="anthropic/claude-3-5-sonnet-20241022",
+        model="anthropic/claude-sonnet-4-5-20250929",
        tools=[computer],
        max_trajectory_budget=5.0
    )

@@ -117,7 +121,7 @@ The output is an AsyncGenerator that yields response chunks.

The `ComputerAgent` constructor provides a wide range of options for customizing agent behavior, tool integration, callbacks, resource management, and more.

- `model` (`str`): Default: **required**
-  The LLM or agent model to use. Determines which agent loop is selected unless `custom_loop` is provided. (e.g., "claude-3-5-sonnet-20241022", "computer-use-preview", "omni+vertex_ai/gemini-pro")
+  The LLM or agent model to use. Determines which agent loop is selected unless `custom_loop` is provided. (e.g., "claude-sonnet-4-5-20250929", "computer-use-preview", "omni+vertex_ai/gemini-pro")
- `tools` (`List[Any]`):
  List of tools the agent can use (e.g., `Computer`, sandboxed Python functions, etc.).
- `custom_loop` (`Callable`):
@@ -155,7 +159,7 @@ from computer import Computer

from agent.callbacks import ImageRetentionCallback

agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[Computer(...)],
    only_n_most_recent_images=3,
    callbacks=[ImageRetentionCallback(only_n_most_recent_images=3)],

@@ -13,7 +13,7 @@ Optimize agent costs with budget management and image retention callbacks.

from agent.callbacks import BudgetManagerCallback

agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    callbacks=[
        BudgetManagerCallback(

@@ -30,7 +30,7 @@ agent = ComputerAgent(

```python
# Simple budget limit
agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-5-20250929",
    max_trajectory_budget=5.0 # $5 limit
)
```

@@ -40,7 +40,7 @@ agent = ComputerAgent(

```python
# Advanced budget configuration
agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-5-20250929",
    max_trajectory_budget={
        "max_budget": 10.0,
        "raise_error": True, # Raise error when exceeded

@@ -55,7 +55,7 @@ agent = ComputerAgent(

from agent.callbacks import ImageRetentionCallback

agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    callbacks=[
        ImageRetentionCallback(only_n_most_recent_images=3)

@@ -67,7 +67,7 @@ agent = ComputerAgent(

```python
agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    only_n_most_recent_images=3 # Auto-adds ImageRetentionCallback
)

@@ -77,7 +77,7 @@ agent = ComputerAgent(

```python
agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    max_trajectory_budget=5.0, # Budget limit
    only_n_most_recent_images=3, # Image retention

@@ -21,7 +21,7 @@ from agent.callbacks import (

)

agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    callbacks=[
        ImageRetentionCallback(only_n_most_recent_images=3),

@@ -14,7 +14,7 @@ from agent.callbacks import LoggingCallback

import logging

agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    callbacks=[
        LoggingCallback(

@@ -29,7 +29,7 @@ agent = ComputerAgent(

```python
agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    verbosity=logging.INFO # Auto-adds LoggingCallback
)

@@ -72,7 +72,7 @@ class CustomLogger(AsyncCallbackHandler):

# Use custom logger
agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    callbacks=[CustomLogger("my_agent")]
)

@@ -13,7 +13,7 @@ The TrajectorySaverCallback records complete agent conversations including messa

from agent.callbacks import TrajectorySaverCallback

agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    callbacks=[
        TrajectorySaverCallback(

@@ -28,7 +28,7 @@ agent = ComputerAgent(

```python
agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-5-20250929",
    trajectory_dir="trajectories", # Auto-save trajectories
    tools=[computer]
)
@@ -3,7 +3,14 @@ title: Customize ComputerAgent

---

<Callout>
-  A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/customizing_computeragent.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.
+  A corresponding{' '}
+  <a
+    href="https://github.com/trycua/cua/blob/main/notebooks/customizing_computeragent.ipynb"
+    target="_blank"
+  >
+    Jupyter Notebook
+  </a>{' '}
+  is available for this documentation.
</Callout>

The `ComputerAgent` interface provides an easy proxy to any computer-using model configuration, and it is a powerful framework for extending and building your own agentic systems.
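As a rough illustration of that extensibility, here is a sketch of a custom callback. The `AsyncCallbackHandler` base class appears in the logging docs in this same changeset, but the import path and the hook name below are assumptions; check `agent.callbacks` for the real API.

```python
# Sketch only: AsyncCallbackHandler is referenced elsewhere in these docs,
# but this import path and the on_run_end hook name are assumptions.
from agent import ComputerAgent
from agent.callbacks import AsyncCallbackHandler


class RunCounter(AsyncCallbackHandler):
    """Hypothetical callback that counts completed runs."""

    def __init__(self):
        self.completed_runs = 0

    async def on_run_end(self, *args, **kwargs):  # assumed hook name
        self.completed_runs += 1


agent = ComputerAgent(
    model="anthropic/claude-sonnet-4-5-20250929",
    callbacks=[RunCounter()],
)
```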
@@ -4,7 +4,11 @@ description: Use ComputerAgent with HUD for benchmarking and evaluation

---

<Callout>
-  A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.
+  A corresponding{' '}
+  <a href="https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb" target="_blank">
+    Jupyter Notebook
+  </a>{' '}
+  is available for this documentation.
</Callout>

The HUD integration allows an agent to be benchmarked using the [HUD framework](https://www.hud.so/). Through the HUD integration, the agent controls a computer inside HUD, where tests are run to evaluate the success of each task.
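For orientation, a benchmark run looks roughly like the SheetBench snippet earlier in this changeset. A minimal sketch, assuming `run_full_dataset` lives in the agent's HUD integration module (the import path is an assumption; the call shape mirrors the documented example):

```python
# Minimal sketch: the call shape matches the SheetBench example in this
# changeset; the import path is an assumption.
import asyncio

from agent.integrations.hud import run_full_dataset


async def main():
    await run_full_dataset(
        dataset="hud-evals/SheetBench-V2",
        model="anthropic/claude-sonnet-4-5-20250929",
        split="train[:2]",  # start small before running the full benchmark
    )


asyncio.run(main())
```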
@@ -59,4 +59,8 @@ you will see all the agent execution steps, including computer actions, LLM call

For each step, you will see the LLM call and the computer action. The computer actions are highlighted in the timeline in yellow.

-<img src="/docs/img/laminar_trace_example.png" alt="Example trace in Laminar showing the litellm.response span and its output." width="800px" />
+<img
+  src="/docs/img/laminar_trace_example.png"
+  alt="Example trace in Laminar showing the litellm.response span and its output."
+  width="800px"
+/>
@@ -7,7 +7,7 @@ This guide lists **breaking changes** when migrating from the original `Computer

## Breaking Changes

- **Initialization:**
-  - `ComputerAgent` (v0.4.x) uses `model` as a string (e.g. "anthropic/claude-3-5-sonnet-20241022") instead of `LLM` and `AgentLoop` objects.
+  - `ComputerAgent` (v0.4.x) uses `model` as a string (e.g. "anthropic/claude-sonnet-4-5-20250929") instead of `LLM` and `AgentLoop` objects.
  - `tools` is a list (can include multiple computers and decorated functions).
  - `callbacks` are now first-class for extensibility (image retention, budget, trajectory, logging, etc).
- **No explicit `loop` parameter:**

@@ -39,7 +39,7 @@ async with Computer() as computer:

```python
async with Computer() as computer:
    agent = ComputerAgent(
-        model="anthropic/claude-3-5-sonnet-20241022",
+        model="anthropic/claude-sonnet-4-5-20250929",
        tools=[computer]
    )
    messages = [{"role": "user", "content": "Take a screenshot"}]

@@ -38,7 +38,7 @@ With the OpenAI provider, prompt caching is handled automatically for prompts of

```python
from agent import ComputerAgent
agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022",
+    model="anthropic/claude-sonnet-4-5-20250929",
    use_prompt_caching=True,
)
```

@@ -32,7 +32,7 @@ Any vision-enabled LiteLLM-compatible model can be used as the planning componen

- Any All‑in‑one CUA (planning-capable). See [All‑in‑one CUAs](./computer-use-agents).
- Any VLM via LiteLLM providers: `anthropic/*`, `openai/*`, `openrouter/*`, `gemini/*`, `vertex_ai/*`, `huggingface-local/*`, `mlx/*`, etc.
- Examples:
-  - **Anthropic**: `anthropic/claude-3-5-sonnet-20241022`, `anthropic/claude-opus-4-1-20250805`
+  - **Anthropic**: `anthropic/claude-sonnet-4-5-20250929`, `anthropic/claude-opus-4-1-20250805`
  - **OpenAI**: `openai/gpt-5`, `openai/gpt-o3`, `openai/gpt-4o`
  - **Google**: `gemini/gemini-1.5-pro`, `vertex_ai/gemini-pro-vision`
  - **Local models**: Any Hugging Face vision-language model

@@ -59,7 +59,7 @@ Combine state-of-the-art grounding with powerful reasoning:

```python
agent = ComputerAgent(
-    "huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-3-5-sonnet-20241022",
+    "huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929",
    tools=[computer]
)
@@ -113,7 +113,7 @@ async for _ in agent.run("Close the settings window, then open the Downloads fol

Composed agents support both capabilities:

```python
-agent = ComputerAgent("huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-3-5-sonnet-20241022")
+agent = ComputerAgent("huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929")

# Full computer-use agent capabilities
async for _ in agent.run("Complete this online form"):

@@ -29,10 +29,9 @@ Claude models with computer-use capabilities:

- Claude 4.1: `claude-opus-4-1-20250805`
- Claude 4: `claude-opus-4-20250514`, `claude-sonnet-4-20250514`
- Claude 3.7: `claude-3-7-sonnet-20250219`
-- Claude 3.5: `claude-3-5-sonnet-20241022`

```python
-agent = ComputerAgent("claude-3-5-sonnet-20241022", tools=[computer])
+agent = ComputerAgent("claude-sonnet-4-5-20250929", tools=[computer])
async for _ in agent.run("Open Firefox and navigate to github.com"):
    pass
```

@@ -11,10 +11,10 @@ All models that support `ComputerAgent.run()` also support `ComputerAgent.predic

### Anthropic CUAs

+- Claude 4.5: `claude-sonnet-4-5-20250929`
- Claude 4.1: `claude-opus-4-1-20250805`
- Claude 4: `claude-opus-4-20250514`, `claude-sonnet-4-20250514`
- Claude 3.7: `claude-3-7-sonnet-20250219`
-- Claude 3.5: `claude-3-5-sonnet-20241022`

### OpenAI CUA Preview

@@ -61,7 +61,7 @@ Moondream3 is a powerful small model that can perform UI grounding and click pre

```python
# Using any grounding model for click prediction
-agent = ComputerAgent("claude-3-5-sonnet-20241022", tools=[computer])
+agent = ComputerAgent("claude-sonnet-4-5-20250929", tools=[computer])

# Predict coordinates for specific elements
login_coords = agent.predict_click("find the login button")

@@ -75,7 +75,7 @@ print(f"Menu icon: {menu_coords}")

```python
# OmniParser is just for OCR, so it requires an LLM for predict_click
-agent = ComputerAgent("omniparser+anthropic/claude-3-5-sonnet-20241022", tools=[computer])
+agent = ComputerAgent("omniparser+anthropic/claude-sonnet-4-5-20250929", tools=[computer])

# Predict click coordinates using composed agent
coords = agent.predict_click("find the submit button")
@@ -55,10 +55,10 @@ async for result in agent.run(messages):

The CUA VLM Router currently supports these models:

-| Model ID | Provider | Description | Best For |
-|----------|----------|-------------|----------|
+| Model ID                          | Provider  | Description       | Best For                           |
+| --------------------------------- | --------- | ----------------- | ---------------------------------- |
| `cua/anthropic/claude-sonnet-4.5` | Anthropic | Claude Sonnet 4.5 | General-purpose tasks, recommended |
-| `cua/anthropic/claude-haiku-4.5` | Anthropic | Claude Haiku 4.5 | Fast responses, cost-effective |
+| `cua/anthropic/claude-haiku-4.5`  | Anthropic | Claude Haiku 4.5  | Fast responses, cost-effective     |

## How It Works

@@ -95,6 +95,7 @@ GET /v1/models

```

**Response:**

```json
{
  "data": [

@@ -117,12 +118,11 @@ Content-Type: application/json

```

**Request:**

```json
{
  "model": "anthropic/claude-sonnet-4.5",
-  "messages": [
-    {"role": "user", "content": "Hello!"}
-  ],
+  "messages": [{ "role": "user", "content": "Hello!" }],
  "max_tokens": 100,
  "temperature": 0.7,
  "stream": false
@@ -130,20 +130,23 @@ Content-Type: application/json

```

**Response:**

```json
{
  "id": "gen_...",
  "object": "chat.completion",
  "created": 1763554838,
  "model": "anthropic/claude-sonnet-4.5",
-  "choices": [{
-    "index": 0,
-    "message": {
-      "role": "assistant",
-      "content": "Hello! How can I help you today?"
-    },
-    "finish_reason": "stop"
-  }],
+  "choices": [
+    {
+      "index": 0,
+      "message": {
+        "role": "assistant",
+        "content": "Hello! How can I help you today?"
+      },
+      "finish_reason": "stop"
+    }
+  ],
  "usage": {
    "prompt_tokens": 10,
    "completion_tokens": 12,
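To tie the request and response shapes together, here is a small Python sketch against the same endpoint using `requests`; the endpoint, bearer auth, and payload mirror the examples on this page, and error handling is kept minimal:

```python
# Minimal sketch: POST the request body shown above to the documented
# chat completions endpoint and print the assistant's reply.
import os

import requests

resp = requests.post(
    "https://inference.cua.ai/v1/chat/completions",
    headers={"Authorization": f"Bearer {os.environ['CUA_API_KEY']}"},
    json={
        "model": "anthropic/claude-sonnet-4.5",
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 100,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```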
@@ -170,6 +173,7 @@ curl -X POST https://inference.cua.ai/v1/chat/completions \

```

**Response (SSE format):**

```
data: {"id":"gen_...","choices":[{"delta":{"content":"1"}}],"object":"chat.completion.chunk"}

@@ -187,6 +191,7 @@ GET /v1/balance

```

**Response:**

```json
{
  "balance": 211689.85,

@@ -201,6 +206,7 @@ CUA VLM Router provides detailed cost information in every response:

### Credit System

Requests are billed in **credits**:

- Credits are deducted from your CUA account balance
- Prices vary by model and usage
- CUA manages all provider API keys and infrastructure
@@ -210,8 +216,8 @@ Requests are billed in **credits**:

```json
{
  "usage": {
-    "cost": 0.01,            // CUA gateway cost in credits
-    "market_cost": 0.000065  // Actual upstream API cost
+    "cost": 0.01, // CUA gateway cost in credits
+    "market_cost": 0.000065 // Actual upstream API cost
  }
}
```
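In client code, reading those two fields is straightforward; a small sketch (the `response` dict below is illustrative, standing in for a parsed API response):

```python
# Illustrative: stand-in for a parsed response with the fields shown above.
response = {
    "usage": {
        "prompt_tokens": 10,
        "completion_tokens": 12,
        "cost": 0.01,             # CUA gateway cost in credits
        "market_cost": 0.000065,  # actual upstream API cost
    }
}

usage = response["usage"]
print(f"tokens: {usage['prompt_tokens']} in / {usage['completion_tokens']} out")
print(f"billed: {usage['cost']} credits (upstream ${usage['market_cost']:.6f})")
```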
@@ -251,19 +257,20 @@ agent = ComputerAgent(

## Benefits Over Direct Provider Access

-| Feature | CUA VLM Router | Direct Provider (BYOK) |
-|---------|---------------|------------------------|
-| **Single API Key** | ✅ One key for all providers | ❌ Multiple keys to manage |
-| **Managed Infrastructure** | ✅ No API key management | ❌ Manage multiple provider keys |
-| **Usage Tracking** | ✅ Unified dashboard | ❌ Per-provider tracking |
-| **Model Switching** | ✅ Change model string only | ❌ Change code + keys |
-| **Setup Complexity** | ✅ One environment variable | ❌ Multiple environment variables |
+| Feature                    | CUA VLM Router               | Direct Provider (BYOK)            |
+| -------------------------- | ---------------------------- | --------------------------------- |
+| **Single API Key**         | ✅ One key for all providers  | ❌ Multiple keys to manage         |
+| **Managed Infrastructure** | ✅ No API key management      | ❌ Manage multiple provider keys   |
+| **Usage Tracking**         | ✅ Unified dashboard          | ❌ Per-provider tracking           |
+| **Model Switching**        | ✅ Change model string only   | ❌ Change code + keys              |
+| **Setup Complexity**       | ✅ One environment variable   | ❌ Multiple environment variables  |

## Error Handling

### Common Error Responses

#### Invalid API Key

```json
{
  "detail": "Insufficient credits. Current balance: 0.00 credits"

@@ -271,6 +278,7 @@ agent = ComputerAgent(

```

#### Missing Authorization

```json
{
  "detail": "Missing Authorization: Bearer token"

@@ -278,6 +286,7 @@ agent = ComputerAgent(

```

#### Invalid Model

```json
{
  "detail": "Invalid or unavailable model"

@@ -343,6 +352,7 @@ agent = ComputerAgent(

Switching from direct provider access (BYOK) to CUA VLM Router is simple:

**Before (Direct Provider Access with BYOK):**

```python
import os
# Required: Provider-specific API key

@@ -355,6 +365,7 @@ agent = ComputerAgent(

```

**After (CUA VLM Router - Cloud Service):**

```python
import os
# Required: CUA API key only (no provider keys needed)
@@ -14,6 +14,7 @@ model="cua/anthropic/claude-haiku-4.5" # Claude Haiku 4.5 (faster)

```

**Benefits:**

- Single API key for multiple providers
- Cost tracking and optimization
- Fully managed infrastructure (no provider keys to manage)

@@ -27,7 +28,6 @@ model="cua/anthropic/claude-haiku-4.5" # Claude Haiku 4.5 (faster)

Direct access to Anthropic's Claude models using your own Anthropic API key (BYOK - Bring Your Own Key).

```python
-model="anthropic/claude-3-5-sonnet-20241022"
model="anthropic/claude-3-7-sonnet-20250219"
model="anthropic/claude-opus-4-20250514"
model="anthropic/claude-sonnet-4-20250514"

@@ -61,6 +61,6 @@ Combine Omniparser for UI understanding with any LLM provider.

```python
model="omniparser+ollama_chat/mistral-small3.2"
model="omniparser+vertex_ai/gemini-pro"
-model="omniparser+anthropic/claude-3-5-sonnet-20241022"
+model="omniparser+anthropic/claude-sonnet-4-5-20250929"
model="omniparser+openai/gpt-4o"
```

@@ -19,6 +19,7 @@ Cua collects anonymized usage and error statistics. We follow [Posthog's ethical

### Disabled by default (opt-in)

**Trajectory logging** captures full conversation history:

- User messages and agent responses
- Computer actions and outputs
- Agent reasoning traces

@@ -117,7 +118,7 @@ Telemetry settings are configured at initialization and can't be changed afterwa

| Event Name | Data Collected | Trigger Notes |
| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------- |
| **module_init** | • `module`: "agent"<br />• `version`: Package version<br />• `python_version`: Full Python version string | Triggered once when the agent package is imported for the first time |
-| **agent_session_start** | • `session_id`: Unique UUID for this agent instance<br />• `agent_type`: Class name (e.g., "ComputerAgent")<br />• `model`: Model name (e.g., "claude-3-5-sonnet")<br />• `os`: Operating system<br />• `os_version`: OS version<br />• `python_version`: Python version | Triggered when TelemetryCallback is initialized (agent instantiation) |
+| **agent_session_start** | • `session_id`: Unique UUID for this agent instance<br />• `agent_type`: Class name (e.g., "ComputerAgent")<br />• `model`: Model name (e.g., "claude-sonnet-4-5")<br />• `os`: Operating system<br />• `os_version`: OS version<br />• `python_version`: Python version | Triggered when TelemetryCallback is initialized (agent instantiation) |
| **agent_run_start** | • `session_id`: Agent session UUID<br />• `run_id`: Unique UUID for this run<br />• `start_time`: Unix timestamp<br />• `input_context_size`: Character count of input messages<br />• `num_existing_messages`: Count of existing messages<br />• `uploaded_trajectory`: Full conversation items (opt-in) | Triggered at the start of each agent.run() call |
| **agent_run_end** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `end_time`: Unix timestamp<br />• `duration_seconds`: Total run duration<br />• `num_steps`: Total steps taken in this run<br />• `total_usage`: Accumulated token usage and costs<br />• `uploaded_trajectory`: Full conversation items (opt-in) | Triggered at the end of each agent.run() call |
| **agent_step** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `step`: Step number (incremental)<br />• `timestamp`: Unix timestamp<br />• `duration_seconds`: Duration of previous step | Triggered on each agent response/step during a run |
@@ -3,7 +3,8 @@ title: Computer UI (Deprecated)

---

<Callout type="warn" title="Deprecated">
-  The Computer UI is deprecated and will be replaced with a revamped playground experience soon. We recommend using VNC or Screen Sharing for precise control of the computer instead.
+  The Computer UI is deprecated and will be replaced with a revamped playground experience soon. We
+  recommend using VNC or Screen Sharing for precise control of the computer instead.
</Callout>

The computer module includes a Gradio UI for creating and sharing demonstration data. We make it easy for people to build community datasets for better computer use models with an upload to Huggingface feature.
@@ -4,7 +4,14 @@ slug: sandboxed-python

---

<Callout>
-  A corresponding <a href="https://github.com/trycua/cua/blob/main/examples/sandboxed_functions_examples.py" target="_blank">Python example</a> is available for this documentation.
+  A corresponding{' '}
+  <a
+    href="https://github.com/trycua/cua/blob/main/examples/sandboxed_functions_examples.py"
+    target="_blank"
+  >
+    Python example
+  </a>{' '}
+  is available for this documentation.
</Callout>

You can run Python functions securely inside a sandboxed virtual environment on a remote Cua Computer. This is useful for executing untrusted user code, isolating dependencies, or providing a safe environment for automation tasks.
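As a quick taste, here is a minimal sketch of a sandboxed function in the spirit of the linked example file; treat the `computer.helpers` import, the `sandboxed` decorator, the `set_default_computer` helper, and the venv name as assumptions if your version differs:

```python
# Sketch only: helper names follow the linked sandboxed_functions example;
# treat the import path, decorator, and venv name as assumptions.
import asyncio

from computer import Computer
from computer.helpers import sandboxed, set_default_computer


@sandboxed("demo_venv")
def greet(name: str) -> str:
    # Executes inside the remote computer's virtual environment,
    # not in the local process.
    return f"Hello, {name}!"


async def main():
    async with Computer() as computer:
        set_default_computer(computer)  # route sandboxed calls here
        print(await greet("Cua"))


asyncio.run(main())
```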
@@ -473,6 +473,7 @@ python form_filling.py

```

The agent will:

1. Download the PDF resume from Overleaf
2. Extract information from the PDF
3. Fill out the JotForm with the extracted information
@@ -0,0 +1,640 @@

---
title: GUI Grounding with Gemini 3
description: Using Google's Gemini 3 with OmniParser for Advanced GUI Grounding Tasks
---

import { Step, Steps } from 'fumadocs-ui/components/steps';
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
import { Callout } from 'fumadocs-ui/components/callout';

## Overview

This example demonstrates how to use Google's Gemini 3 models with OmniParser for complex GUI grounding tasks. Gemini 3 Pro achieves exceptional performance on the [ScreenSpot-Pro benchmark](https://github.com/likaixin2000/ScreenSpot-Pro-GUI-Grounding) with a **72.7% accuracy** (compared to Claude Sonnet 4.5's 36.2%), making it ideal for precise UI element location and complex navigation tasks.

<img
  src="/docs/img/grounding-with-gemini3.gif"
  alt="Demo of Gemini 3 with OmniParser performing complex GUI navigation tasks"
  width="800px"
/>

<Callout type="info" title="Why Gemini 3 for UI Navigation?">
  According to [Google's Gemini 3 announcement](https://blog.google/products/gemini/gemini-3/), Gemini 3 Pro achieves:

  - **72.7%** on ScreenSpot-Pro (vs. Gemini 2.5 Pro's 11.4%)
  - Industry-leading performance on complex UI navigation tasks
  - Advanced multimodal understanding for high-resolution screens
</Callout>

### What You'll Build

This guide shows how to:

- Set up Vertex AI with proper authentication
- Use OmniParser with Gemini 3 for GUI element detection
- Leverage Gemini 3-specific features like `thinking_level` and `media_resolution`
- Create agents that can perform complex multi-step UI interactions

---

<Steps>

<Step>

### Set Up Google Cloud and Vertex AI

Before using Gemini 3 models, you need to enable Vertex AI in Google Cloud Console.

#### 1. Create a Google Cloud Project

1. Go to [Google Cloud Console](https://console.cloud.google.com/)
2. Click **Select a project** → **New Project**
3. Enter a project name and click **Create**
4. Note your **Project ID** (you'll need this later)

#### 2. Enable Vertex AI API

1. Navigate to [Vertex AI API](https://console.cloud.google.com/apis/library/aiplatform.googleapis.com)
2. Select your project
3. Click **Enable**

#### 3. Enable Billing

1. Go to [Billing](https://console.cloud.google.com/billing)
2. Link a billing account to your project
3. Vertex AI offers a [free tier](https://cloud.google.com/vertex-ai/pricing) for testing

#### 4. Create a Service Account

1. Go to [IAM & Admin > Service Accounts](https://console.cloud.google.com/iam-admin/serviceaccounts)
2. Click **Create Service Account**
3. Enter a name (e.g., "cua-gemini-agent")
4. Click **Create and Continue**
5. Grant the **Vertex AI User** role
6. Click **Done**

#### 5. Create and Download Service Account Key

1. Click on your newly created service account
2. Go to **Keys** tab
3. Click **Add Key** → **Create new key**
4. Select **JSON** format
5. Click **Create** (the key file will download automatically)
6. **Important**: Store this key file securely! It contains credentials for accessing your Google Cloud resources

<Callout type="warn">
  Never commit your service account JSON key to version control! Add it to `.gitignore` immediately.
</Callout>

</Step>
<Step>

### Install Dependencies

Install the required packages for OmniParser and Gemini 3.

Create a `requirements.txt` file:

```text
cua-agent
cua-computer
cua-som  # OmniParser for GUI element detection
litellm>=1.0.0
python-dotenv>=1.0.0
google-cloud-aiplatform>=1.70.0
```

Install the dependencies:

```bash
pip install -r requirements.txt
```

</Step>
<Step>

### Configure Environment Variables

Create a `.env` file in your project root:

```text
# Google Cloud / Vertex AI credentials
GOOGLE_CLOUD_PROJECT=your-project-id
GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-service-account-key.json

# Cua credentials (for cloud sandboxes)
CUA_API_KEY=sk_cua-api01...
CUA_SANDBOX_NAME=your-sandbox-name
```

Replace the values:

- `your-project-id`: Your Google Cloud Project ID from Step 1
- `/path/to/your-service-account-key.json`: Path to the JSON key file you downloaded
- `sk_cua-api01...`: Your Cua API key from the [Cua dashboard](https://cua.dev)
- `your-sandbox-name`: Your sandbox name (if using cloud sandboxes)
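Before moving on, it can help to confirm the variables are actually visible to Python; a quick check using `python-dotenv` from the requirements above:

```python
# Quick sanity check that the .env file is being picked up.
import os

from dotenv import load_dotenv

load_dotenv()
for var in ("GOOGLE_CLOUD_PROJECT", "GOOGLE_APPLICATION_CREDENTIALS",
            "CUA_API_KEY", "CUA_SANDBOX_NAME"):
    print(f"{var}: {'set' if os.getenv(var) else 'MISSING'}")
```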
</Step>
<Step>

### Create Your Complex UI Navigation Script

Create a Python file (e.g., `gemini_ui_navigation.py`):

<Tabs items={['Cloud Sandbox', 'Linux on Docker', 'macOS Sandbox']}>
<Tab value="Cloud Sandbox">

```python
import asyncio
import logging
import os
import signal
import traceback

from agent import ComputerAgent
from computer import Computer, VMProviderType
from dotenv import load_dotenv

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def handle_sigint(sig, frame):
    print("\n\nExecution interrupted by user. Exiting gracefully...")
    exit(0)


async def complex_ui_navigation():
    """
    Demonstrate Gemini 3's exceptional UI grounding capabilities
    with complex, multi-step navigation tasks.
    """
    try:
        async with Computer(
            os_type="linux",
            provider_type=VMProviderType.CLOUD,
            name=os.environ["CUA_SANDBOX_NAME"],
            api_key=os.environ["CUA_API_KEY"],
            verbosity=logging.INFO,
        ) as computer:

            agent = ComputerAgent(
                # Use OmniParser with Gemini 3 Pro for optimal GUI grounding
                model="omniparser+vertex_ai/gemini-3-pro-preview",
                tools=[computer],
                only_n_most_recent_images=3,
                verbosity=logging.INFO,
                trajectory_dir="trajectories",
                use_prompt_caching=False,
                max_trajectory_budget=5.0,
                # Gemini 3-specific parameters
                thinking_level="high",  # Enables deeper reasoning (vs "low")
                media_resolution="high",  # High-resolution image processing (vs "low" or "medium")
            )

            # Complex GUI grounding tasks inspired by the ScreenSpot-Pro benchmark.
            # These test precise element location in professional UIs.
            tasks = [
                # Task 1: GitHub repository navigation
                {
                    "instruction": (
                        "Go to github.com/trycua/cua. "
                        "Find and click on the 'Issues' tab. "
                        "Then locate and click on the search box within the issues page "
                        "(not the global GitHub search). "
                        "Type 'omniparser' and press Enter."
                    ),
                    "description": "Tests precise UI element distinction in a complex interface",
                },
                # Task 2: Search for and install Visual Studio Code
                {
                    "instruction": (
                        "Open your system's app store (e.g., Microsoft Store). "
                        "Search for 'Visual Studio Code'. "
                        "In the search results, select 'Visual Studio Code'. "
                        "Click on 'Install' or 'Get' to begin the installation. "
                        "If prompted, accept any permissions or confirm the installation. "
                        "Wait for Visual Studio Code to finish installing."
                    ),
                    "description": "Tests the ability to search for an application and complete its installation through a step-by-step app store workflow.",
                },
            ]

            history = []

            for i, task_info in enumerate(tasks, 1):
                task = task_info["instruction"]
                print(f"\n{'='*60}")
                print(f"[Task {i}/{len(tasks)}] {task_info['description']}")
                print(f"{'='*60}")
                print(f"\nInstruction: {task}\n")

                # Add user message to history
                history.append({"role": "user", "content": task})

                # Run agent with conversation history
                async for result in agent.run(history, stream=False):
                    history += result.get("output", [])

                    # Print output for debugging
                    for item in result.get("output", []):
                        if item.get("type") == "message":
                            content = item.get("content", [])
                            for content_part in content:
                                if content_part.get("text"):
                                    logger.info(f"Agent: {content_part.get('text')}")
                        elif item.get("type") == "computer_call":
action = item.get("action", {})
|
||||
action_type = action.get("type", "")
|
||||
logger.debug(f"Computer Action: {action_type}")
|
||||
|
||||
print(f"\n✅ Task {i}/{len(tasks)} completed")
|
||||
|
||||
print("\n🎉 All complex UI navigation tasks completed successfully!")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in complex_ui_navigation: {e}")
|
||||
traceback.print_exc()
|
||||
raise
|
||||
|
||||
def main():
|
||||
try:
|
||||
load_dotenv()
|
||||
|
||||
# Validate required environment variables
|
||||
required_vars = [
|
||||
"GOOGLE_CLOUD_PROJECT",
|
||||
"GOOGLE_APPLICATION_CREDENTIALS",
|
||||
"CUA_API_KEY",
|
||||
"CUA_SANDBOX_NAME",
|
||||
]
|
||||
|
||||
missing_vars = [var for var in required_vars if not os.environ.get(var)]
|
||||
if missing_vars:
|
||||
raise RuntimeError(
|
||||
f"Missing required environment variables: {', '.join(missing_vars)}\n"
|
||||
f"Please check your .env file and ensure all keys are set.\n"
|
||||
f"See the setup guide for details on configuring Vertex AI credentials."
|
||||
)
|
||||
|
||||
signal.signal(signal.SIGINT, handle_sigint)
|
||||
|
||||
asyncio.run(complex_ui_navigation())
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error running automation: {e}")
|
||||
traceback.print_exc()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
```
|
||||
|
||||
</Tab>
<Tab value="Linux on Docker">

```python
import asyncio
import logging
import os
import signal
import sys
import traceback

from agent import ComputerAgent
from computer import Computer, VMProviderType
from dotenv import load_dotenv

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def handle_sigint(sig, frame):
    print("\n\nExecution interrupted by user. Exiting gracefully...")
    sys.exit(0)


async def complex_ui_navigation():
    """
    Demonstrate Gemini 3's exceptional UI grounding capabilities
    with complex, multi-step navigation tasks.
    """
    try:
        async with Computer(
            os_type="linux",
            provider_type=VMProviderType.DOCKER,
            image="trycua/cua-xfce:latest",
            verbosity=logging.INFO,
        ) as computer:

            agent = ComputerAgent(
                # Use OmniParser with Gemini 3 Pro for optimal GUI grounding
                model="omniparser+vertex_ai/gemini-3-pro-preview",
                tools=[computer],
                only_n_most_recent_images=3,
                verbosity=logging.INFO,
                trajectory_dir="trajectories",
                use_prompt_caching=False,
                max_trajectory_budget=5.0,
                # Gemini 3-specific parameters
                thinking_level="high",  # Enables deeper reasoning (vs "low")
                media_resolution="high",  # High-resolution image processing (vs "low" or "medium")
            )

            # Complex GUI grounding tasks inspired by the ScreenSpot-Pro benchmark
            tasks = [
                {
                    "instruction": (
                        "Go to github.com/trycua/cua. "
                        "Find and click on the 'Issues' tab. "
                        "Then locate and click on the search box within the issues page "
                        "(not the global GitHub search). "
                        "Type 'omniparser' and press Enter."
                    ),
                    "description": "Tests precise UI element distinction in a complex interface",
                },
            ]

            history = []

            for i, task_info in enumerate(tasks, 1):
                task = task_info["instruction"]
                print(f"\n{'='*60}")
                print(f"[Task {i}/{len(tasks)}] {task_info['description']}")
                print(f"{'='*60}")
                print(f"\nInstruction: {task}\n")

                history.append({"role": "user", "content": task})

                async for result in agent.run(history, stream=False):
                    history += result.get("output", [])

                    for item in result.get("output", []):
                        if item.get("type") == "message":
                            content = item.get("content", [])
                            for content_part in content:
                                if content_part.get("text"):
                                    logger.info(f"Agent: {content_part.get('text')}")
                        elif item.get("type") == "computer_call":
                            action = item.get("action", {})
                            action_type = action.get("type", "")
                            logger.debug(f"Computer Action: {action_type}")

                print(f"\n✅ Task {i}/{len(tasks)} completed")

            print("\n🎉 All complex UI navigation tasks completed successfully!")

    except Exception as e:
        logger.error(f"Error in complex_ui_navigation: {e}")
        traceback.print_exc()
        raise


def main():
    try:
        load_dotenv()

        required_vars = [
            "GOOGLE_CLOUD_PROJECT",
            "GOOGLE_APPLICATION_CREDENTIALS",
        ]

        missing_vars = [var for var in required_vars if not os.environ.get(var)]
        if missing_vars:
            raise RuntimeError(
                f"Missing required environment variables: {', '.join(missing_vars)}\n"
                f"Please check your .env file."
            )

        signal.signal(signal.SIGINT, handle_sigint)

        asyncio.run(complex_ui_navigation())

    except Exception as e:
        logger.error(f"Error running automation: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()
```

</Tab>
<Tab value="macOS Sandbox">

```python
import asyncio
import logging
import os
import signal
import sys
import traceback

from agent import ComputerAgent
from computer import Computer, VMProviderType
from dotenv import load_dotenv

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def handle_sigint(sig, frame):
    print("\n\nExecution interrupted by user. Exiting gracefully...")
    sys.exit(0)


async def complex_ui_navigation():
    """
    Demonstrate Gemini 3's exceptional UI grounding capabilities
    with complex, multi-step navigation tasks.
    """
    try:
        async with Computer(
            os_type="macos",
            provider_type=VMProviderType.LUME,
            name="macos-sequoia-cua:latest",
            verbosity=logging.INFO,
        ) as computer:

            agent = ComputerAgent(
                # Use OmniParser with Gemini 3 Pro for optimal GUI grounding
                model="omniparser+vertex_ai/gemini-3-pro-preview",
                tools=[computer],
                only_n_most_recent_images=3,
                verbosity=logging.INFO,
                trajectory_dir="trajectories",
                use_prompt_caching=False,
                max_trajectory_budget=5.0,
                # Gemini 3-specific parameters
                thinking_level="high",  # Enables deeper reasoning (vs "low")
                media_resolution="high",  # High-resolution image processing (vs "low" or "medium")
            )

            # Complex GUI grounding tasks inspired by the ScreenSpot-Pro benchmark
            tasks = [
                {
                    "instruction": (
                        "Go to github.com/trycua/cua. "
                        "Find and click on the 'Issues' tab. "
                        "Then locate and click on the search box within the issues page "
                        "(not the global GitHub search). "
                        "Type 'omniparser' and press Enter."
                    ),
                    "description": "Tests precise UI element distinction in a complex interface",
                },
            ]

            history = []

            for i, task_info in enumerate(tasks, 1):
                task = task_info["instruction"]
                print(f"\n{'='*60}")
                print(f"[Task {i}/{len(tasks)}] {task_info['description']}")
                print(f"{'='*60}")
                print(f"\nInstruction: {task}\n")

                history.append({"role": "user", "content": task})

                async for result in agent.run(history, stream=False):
                    history += result.get("output", [])

                    for item in result.get("output", []):
                        if item.get("type") == "message":
                            content = item.get("content", [])
                            for content_part in content:
                                if content_part.get("text"):
                                    logger.info(f"Agent: {content_part.get('text')}")
                        elif item.get("type") == "computer_call":
                            action = item.get("action", {})
                            action_type = action.get("type", "")
                            logger.debug(f"Computer Action: {action_type}")

                print(f"\n✅ Task {i}/{len(tasks)} completed")

            print("\n🎉 All complex UI navigation tasks completed successfully!")

    except Exception as e:
        logger.error(f"Error in complex_ui_navigation: {e}")
        traceback.print_exc()
        raise


def main():
    try:
        load_dotenv()

        required_vars = [
            "GOOGLE_CLOUD_PROJECT",
            "GOOGLE_APPLICATION_CREDENTIALS",
        ]

        missing_vars = [var for var in required_vars if not os.environ.get(var)]
        if missing_vars:
            raise RuntimeError(
                f"Missing required environment variables: {', '.join(missing_vars)}\n"
                f"Please check your .env file."
            )

        signal.signal(signal.SIGINT, handle_sigint)

        asyncio.run(complex_ui_navigation())

    except Exception as e:
        logger.error(f"Error running automation: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()
```

</Tab>
</Tabs>

</Step>

<Step>

### Run Your Script

Execute your complex UI navigation automation:

```bash
python gemini_ui_navigation.py
```

The agent will:

1. Navigate to GitHub and locate specific UI elements
2. Distinguish between similar elements (e.g., global search vs. issues search)
3. Perform multi-step interactions with visual feedback
4. Use Gemini 3's advanced reasoning for precise element grounding

Monitor the output to see the agent's progress through each task.
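
Each run also writes screenshots and message logs under the directory set by `trajectory_dir` (`trajectories/` in the scripts above). A quick way to see what each run captured; this is an illustrative sketch, and the exact file layout inside a run folder depends on your SDK version:

```python
# inspect_trajectories.py - illustrative helper, not part of the Cua SDK
from pathlib import Path

root = Path("trajectories")
for run_dir in sorted(p for p in root.iterdir() if p.is_dir()):
    artifacts = list(run_dir.rglob("*"))
    print(f"{run_dir.name}: {len(artifacts)} saved artifacts")
```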

</Step>

</Steps>

---

## Understanding Gemini 3-Specific Parameters

### `thinking_level`

Controls the amount of internal reasoning the model performs:

- `"high"`: Deeper reasoning, better for complex UI navigation (recommended for ScreenSpot-like tasks)
- `"low"`: Faster responses, suitable for simpler tasks

### `media_resolution`

Controls vision processing for multimodal inputs:

- `"high"`: Best for complex UIs with many small elements (recommended)
- `"medium"`: Balanced quality and speed
- `"low"`: Faster processing for simple interfaces

<Callout type="info">
For tasks requiring precise GUI element location (like ScreenSpot-Pro), use
`thinking_level="high"` and `media_resolution="high"` for optimal performance.
</Callout>
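
Both parameters are plain keyword arguments on `ComputerAgent` and are forwarded with the request to Vertex AI, so you can tune the speed/accuracy trade-off per agent. An illustrative contrast (other required arguments such as `tools` are omitted for brevity):

```python
from agent import ComputerAgent

# Faster, cheaper configuration for simple, uncluttered interfaces
fast_agent = ComputerAgent(
    model="omniparser+vertex_ai/gemini-3-pro-preview",
    thinking_level="low",       # minimal internal reasoning
    media_resolution="medium",  # lighter vision processing
)

# Slower, more precise configuration for dense professional UIs
precise_agent = ComputerAgent(
    model="omniparser+vertex_ai/gemini-3-pro-preview",
    thinking_level="high",      # deeper reasoning before each action
    media_resolution="high",    # resolves small icons and crowded toolbars
)
```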

---

## Benchmark Performance

Gemini 3 Pro's performance on ScreenSpot-Pro demonstrates its exceptional UI grounding capabilities:

| Model             | ScreenSpot-Pro Score |
| ----------------- | -------------------- |
| **Gemini 3 Pro**  | **72.7%**            |
| Claude Sonnet 4.5 | 36.2%                |
| Gemini 2.5 Pro    | 11.4%                |
| GPT-5.1           | 3.5%                 |

This makes Gemini 3 the ideal choice for complex UI navigation, element detection, and professional GUI automation tasks.

---

## Troubleshooting

### Authentication Issues

If you encounter authentication errors:

1. Verify your service account JSON key path is correct
2. Ensure the service account has the **Vertex AI User** role
3. Check that the Vertex AI API is enabled in your project
4. Confirm your `GOOGLE_CLOUD_PROJECT` matches your actual project ID
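
To confirm which identity your key file resolves to (useful when checking role bindings in IAM), you can print the service account email. A small sketch using `google-auth`; the filename is illustrative:

```python
# whoami_vertex.py - illustrative check, not part of the Cua SDK
import os

from google.oauth2 import service_account

key_path = os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
creds = service_account.Credentials.from_service_account_file(key_path)
print(f"Active service account: {creds.service_account_email}")
```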

### "Vertex AI API not enabled" Error

Run this command to enable the API:

```bash
gcloud services enable aiplatform.googleapis.com --project=YOUR_PROJECT_ID
```

### Billing Issues

Ensure billing is enabled for your Google Cloud project. Visit the [Billing section](https://console.cloud.google.com/billing) to verify.

---

## Next Steps

- Learn more about [OmniParser agent loops](/agent-sdk/agent-loops)
- Explore [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing)
- Read about the [ScreenSpot-Pro benchmark](https://github.com/likaixin2000/ScreenSpot-Pro-GUI-Grounding)
- Check out [Google's Gemini 3 announcement](https://blog.google/products/gemini/gemini-3/)
- Join our [Discord community](https://discord.com/invite/mVnXXpdE85) for help

@@ -1,5 +1,10 @@
{
  "title": "Cookbook",
  "description": "Real-world examples of building with Cua",
  "pages": ["windows-app-behind-vpn", "form-filling", "post-event-contact-export"]
  "pages": [
    "windows-app-behind-vpn",
    "form-filling",
    "post-event-contact-export",
    "gemini-complex-ui-navigation"
  ]
}

@@ -441,6 +441,7 @@ python contact_export.py
```

The agent will:

1. Navigate to your LinkedIn connections page
2. Extract data from 20 contacts (first name, last name, role, company, LinkedIn URL)
3. Save contacts to a timestamped CSV file

@@ -11,19 +11,23 @@ import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
This guide demonstrates how to automate Windows desktop applications (like eGecko HR/payroll systems) that run behind corporate VPN. This is a common enterprise scenario where legacy desktop applications require manual data entry, report generation, or workflow execution.

**Use cases:**

- HR/payroll processing (employee onboarding, payroll runs, benefits administration)
- Desktop ERP systems behind corporate networks
- Legacy financial applications requiring VPN access
- Compliance reporting from on-premise systems

**Architecture:**

- Client-side Cua agent (Python SDK or Playground UI)
- Windows VM/Sandbox with VPN client configured
- RDP/remote desktop connection to target environment
- Desktop application automation via computer vision and UI control

<Callout type="info">
**Production Deployment**: For production use, consider workflow mining and custom finetuning to create vertical-specific actions (e.g., "Run payroll", "Onboard employee") instead of generic UI automation. This provides better audit trails and higher success rates.
**Production Deployment**: For production use, consider workflow mining and custom finetuning to
create vertical-specific actions (e.g., "Run payroll", "Onboard employee") instead of generic UI
automation. This provides better audit trails and higher success rates.
</Callout>

---

@@ -31,7 +35,11 @@ This guide demonstrates how to automate Windows desktop applications (like eGeck
## Video Demo

<div className="rounded-lg border bg-card text-card-foreground shadow-sm p-4 mb-6">
  <video src="https://github.com/user-attachments/assets/8ab07646-6018-4128-87ce-53180cfea696" controls className="w-full rounded">
  <video
    src="https://github.com/user-attachments/assets/8ab07646-6018-4128-87ce-53180cfea696"
    controls
    className="w-full rounded"
  >
    Your browser does not support the video tag.
  </video>
  <div className="text-sm text-muted-foreground mt-2">

@@ -106,7 +114,8 @@ For local development on Windows 10 Pro/Enterprise or Windows 11:
4. Configure your desktop application installation within the sandbox

<Callout type="warn">
**Manual VPN Setup**: Windows Sandbox requires manual VPN configuration each time it starts. For production use, consider Cloud Sandbox or self-hosted VMs with persistent VPN connections.
**Manual VPN Setup**: Windows Sandbox requires manual VPN configuration each time it starts. For
production use, consider Cloud Sandbox or self-hosted VMs with persistent VPN connections.
</Callout>

</Tab>

@@ -421,6 +430,7 @@ python hr_automation.py
```

The agent will:

1. Connect to your Windows environment (with VPN if configured)
2. Launch and navigate the desktop application
3. Execute each workflow step sequentially

@@ -506,6 +516,7 @@ agent = ComputerAgent(
### 1. Workflow Mining

Before deploying, analyze your actual workflows:

- Record user interactions with the application
- Identify common patterns and edge cases
- Map out decision trees and validation requirements

@@ -524,6 +535,7 @@ tasks = ["onboard_employee", "run_payroll", "generate_compliance_report"]
```

This provides:

- Better audit trails
- Approval gates at business logic level
- Higher success rates

@@ -547,12 +559,14 @@ agent = ComputerAgent(
Choose your deployment model:

**Managed (Recommended)**

- Cua hosts Windows sandboxes, VPN/RDP stack, and agent runtime
- You get UI/API endpoints for triggering workflows
- Automatic scaling, monitoring, and maintenance
- SLA guarantees and enterprise support

**Self-Hosted**

- You manage Windows VMs, VPN infrastructure, and agent deployment
- Full control over data and security
- Custom network configurations

@@ -5,7 +5,8 @@ title: Introduction
import { Monitor, Code, BookOpen, Zap, Bot, Boxes, Rocket } from 'lucide-react';

<div className="rounded-lg border bg-card text-card-foreground shadow-sm px-4 py-2 mb-6">
Cua is an open-source framework for building **Computer-Use Agents** - AI systems that see, understand, and interact with desktop applications through vision and action, just like humans do.
Cua is an open-source framework for building **Computer-Use Agents** - AI systems that see,
understand, and interact with desktop applications through vision and action, just like humans do.
</div>

## Why Cua?

@@ -7,7 +7,14 @@ github:
---

<Callout>
A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/computer_server_nb.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.
A corresponding{' '}
<a
  href="https://github.com/trycua/cua/blob/main/notebooks/computer_server_nb.ipynb"
  target="_blank"
>
  Jupyter Notebook
</a>{' '}
is available for this documentation.
</Callout>

The Computer Server API reference documentation is currently under development.

@@ -15,6 +15,7 @@ The CUA CLI provides commands for authentication and sandbox management.
The CLI supports **two command styles** for flexibility:

**Flat style** (quick & concise):

```bash
cua list
cua create --os linux --size small --region north-america
@@ -22,6 +23,7 @@ cua start my-sandbox
```

**Grouped style** (explicit & clear):

```bash
cua sb list      # or: cua sandbox list
cua sb create    # or: cua sandbox create
@@ -54,9 +56,11 @@ cua login --api-key sk-your-api-key-here
```

**Options:**

- `--api-key <key>` - Provide API key directly instead of browser flow

**Example:**

```bash
$ cua auth login
Opening browser for CLI auth...
@@ -75,12 +79,14 @@ cua env
```

**Example:**

```bash
$ cua auth env
Wrote /path/to/your/project/.env
```

The generated `.env` file will contain:

```
CUA_API_KEY=sk-your-api-key-here
```

@@ -97,6 +103,7 @@ cua logout
```

**Example:**

```bash
$ cua auth logout
Logged out
@@ -121,6 +128,7 @@ cua ps
```

**Example Output (default, passwords hidden):**

```
NAME             STATUS    HOST
my-dev-sandbox   running   my-dev-sandbox.sandbox.cua.ai
@@ -128,6 +136,7 @@ test-windows stopped test-windows.sandbox.cua.ai
```

**Example Output (with --show-passwords):**

```
NAME             STATUS    PASSWORD          HOST
my-dev-sandbox   running   secure-pass-123   my-dev-sandbox.sandbox.cua.ai
@@ -143,11 +152,13 @@ cua create --os <OS> --size <SIZE> --region <REGION>
```

**Required Options:**

- `--os` - Operating system: `linux`, `windows`, `macos`
- `--size` - Sandbox size: `small`, `medium`, `large`
- `--region` - Region: `north-america`, `europe`, `asia-pacific`, `south-america`

**Examples:**

```bash
# Create a small Linux sandbox in North America
cua create --os linux --size small --region north-america
@@ -162,6 +173,7 @@ cua create --os macos --size large --region asia-pacific
**Response Types:**

**Immediate (Status 200):**

```bash
Sandbox created and ready: my-new-sandbox-abc123
Password: secure-password-here
@@ -169,6 +181,7 @@ Host: my-new-sandbox-abc123.sandbox.cua.ai
```

**Provisioning (Status 202):**

```bash
Sandbox provisioning started: my-new-sandbox-abc123
Job ID: job-xyz789
@@ -184,6 +197,7 @@ cua start <name>
```

**Example:**

```bash
$ cua start my-dev-sandbox
Start accepted
@@ -198,6 +212,7 @@ cua stop <name>
```

**Example:**

```bash
$ cua stop my-dev-sandbox
stopping
@@ -212,6 +227,7 @@ cua restart <name>
```

**Example:**

```bash
$ cua restart my-dev-sandbox
restarting
@@ -226,6 +242,7 @@ cua delete <name>
```

**Example:**

```bash
$ cua delete old-test-sandbox
Sandbox deletion initiated: deleting
@@ -247,6 +264,7 @@ cua open <name>
```

**Example:**

```bash
$ cua vnc my-dev-sandbox
Opening NoVNC: https://my-dev-sandbox.sandbox.cua.ai/vnc.html?autoconnect=true&password=...
@@ -254,7 +272,6 @@ Opening NoVNC: https://my-dev-sandbox.sandbox.cua.ai/vnc.html?autoconnect=true&p

This command automatically opens your default browser to the VNC interface with the correct password pre-filled.


## Global Options

### Help
@@ -273,18 +290,21 @@ cua list --help
The CLI provides clear error messages for common issues:

### Authentication Errors

```bash
$ cua list
Unauthorized. Try 'cua auth login' again.
```

### Sandbox Not Found

```bash
$ cua start nonexistent-sandbox
Sandbox not found
```

### Invalid Configuration

```bash
$ cua create --os invalid --configuration small --region north-america
Invalid request or unsupported configuration
@@ -293,6 +313,7 @@ Invalid request or unsupported configuration
## Tips and Best Practices

### 1. Use Descriptive Sandbox Names

```bash
# Good
cua create --os linux --size small --region north-america
@@ -304,6 +325,7 @@ cua list # Check the generated name
```

### 2. Environment Management

```bash
# Set up your project with API key
cd my-project
@@ -312,6 +334,7 @@ cua auth env
```

### 3. Quick Sandbox Access

```bash
# Create aliases for frequently used sandboxes
alias dev-sandbox="cua vnc my-development-sandbox"
@@ -319,6 +342,7 @@ alias prod-sandbox="cua vnc my-production-sandbox"
```

### 4. Monitoring Provisioning

```bash
# For sandboxes that need provisioning time
cua create --os windows --size large --region europe

@@ -34,16 +34,19 @@ cua sb list
## Use Cases

### Development Workflow

- Quickly spin up cloud sandboxes for testing
- Manage multiple sandboxes across different regions
- Integrate with CI/CD pipelines

### Team Collaboration

- Share sandbox configurations and access
- Standardize development environments
- Quick onboarding for new team members

### Automation

- Script sandbox provisioning and management
- Integrate with deployment workflows
- Automate environment setup

@@ -11,24 +11,21 @@ import { Callout } from 'fumadocs-ui/components/callout';
The fastest way to install the CUA CLI is using our installation scripts:

<Tabs items={['macOS / Linux', 'Windows']}>
<Tab value="macOS / Linux">
```bash
curl -LsSf https://cua.ai/cli/install.sh | sh
```
</Tab>
<Tab value="macOS / Linux">```bash curl -LsSf https://cua.ai/cli/install.sh | sh ```</Tab>
<Tab value="Windows">
```powershell
powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
```powershell powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
```
</Tab>
</Tabs>

These scripts will automatically:

1. Install [Bun](https://bun.sh) (a fast JavaScript runtime)
2. Install the CUA CLI via `bun add -g @trycua/cli`

<Callout type="info">
The installation scripts will automatically detect your system and install the appropriate binary to your PATH.
The installation scripts will automatically detect your system and install the appropriate binary
to your PATH.
</Callout>

## Alternative: Install with Bun
@@ -44,8 +41,8 @@ bun add -g @trycua/cli
```

<Callout type="info">
Using Bun provides faster installation and better performance compared to npm.
If you don't have Bun installed, the first command will install it for you.
Using Bun provides faster installation and better performance compared to npm. If you don't have
Bun installed, the first command will install it for you.
</Callout>

## Verify Installation
@@ -76,40 +73,21 @@ To update to the latest version:

<Tabs items={['Script Install', 'npm Install']}>
<Tab value="Script Install">
Re-run the installation script:
```bash
# macOS/Linux
curl -LsSf https://cua.ai/cli/install.sh | sh

# Windows
powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
```
</Tab>
<Tab value="npm Install">
```bash
npm update -g @trycua/cli
Re-run the installation script: ```bash # macOS/Linux curl -LsSf https://cua.ai/cli/install.sh |
sh # Windows powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
```
</Tab>
<Tab value="npm Install">```bash npm update -g @trycua/cli ```</Tab>
</Tabs>

## Uninstalling

<Tabs items={['Script Install', 'npm Install']}>
<Tab value="Script Install">
Remove the binary from your PATH:
```bash
# macOS/Linux
rm $(which cua)

# Windows
# Remove from your PATH or delete the executable
```
</Tab>
<Tab value="npm Install">
```bash
npm uninstall -g @trycua/cli
```
Remove the binary from your PATH: ```bash # macOS/Linux rm $(which cua) # Windows # Remove from
your PATH or delete the executable ```
</Tab>
<Tab value="npm Install">```bash npm uninstall -g @trycua/cli ```</Tab>
</Tabs>

## Troubleshooting
@@ -128,17 +106,12 @@ If you encounter permission issues during installation:

<Tabs items={['macOS / Linux', 'Windows']}>
<Tab value="macOS / Linux">
Try running with sudo (not recommended for the curl method):
```bash
# If using npm
sudo npm install -g @trycua/cli
```
Try running with sudo (not recommended for the curl method): ```bash # If using npm sudo npm
install -g @trycua/cli ```
</Tab>
<Tab value="Windows">
Run PowerShell as Administrator:
```powershell
# Right-click PowerShell and "Run as Administrator"
powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
Run PowerShell as Administrator: ```powershell # Right-click PowerShell and "Run as
Administrator" powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
```
</Tab>
</Tabs>

@@ -30,13 +30,15 @@ To use with Claude Desktop, add an entry to your Claude Desktop configuration (`
If you're working with the CUA source code:

**Standard VM Mode:**

```json
{
  "mcpServers": {
    "cua-agent": {
      "command": "/usr/bin/env",
      "args": [
        "bash", "-lc",
        "bash",
        "-lc",
        "export CUA_MODEL_NAME='anthropic/claude-sonnet-4-20250514'; export ANTHROPIC_API_KEY='your-anthropic-api-key-here'; /path/to/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"
      ]
    }
@@ -45,13 +47,15 @@ If you're working with the CUA source code:
```

**Host Computer Control Mode:**

```json
{
  "mcpServers": {
    "cua-agent": {
      "command": "/usr/bin/env",
      "args": [
        "bash", "-lc",
        "bash",
        "-lc",
        "export CUA_MODEL_NAME='anthropic/claude-sonnet-4-20250514'; export ANTHROPIC_API_KEY='your-anthropic-api-key-here'; export CUA_USE_HOST_COMPUTER_SERVER='true'; export CUA_MAX_IMAGES='1'; /path/to/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"
      ]
    }
@@ -62,6 +66,7 @@ If you're working with the CUA source code:
**Note**: Replace `/path/to/cua` with the absolute path to your CUA repository directory.

**⚠️ Host Computer Control Setup**: When using `CUA_USE_HOST_COMPUTER_SERVER='true'`, you must also:

1. Install computer server dependencies: `python3 -m pip install uvicorn fastapi`
2. Install the computer server: `python3 -m pip install -e libs/python/computer-server --break-system-packages`
3. Start the computer server: `python -m computer_server --log-level debug`

@@ -4,19 +4,20 @@ title: Configuration
The server is configured using environment variables (can be set in the Claude Desktop config):

| Variable | Description | Default |
|----------|-------------|---------|
| `CUA_MODEL_NAME` | Model string (e.g., "anthropic/claude-sonnet-4-20250514", "anthropic/claude-3-5-sonnet-20240620", "openai/computer-use-preview", "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", "omniparser+litellm/gpt-4o", "omniparser+ollama_chat/gemma3") | anthropic/claude-sonnet-4-20250514 |
| `ANTHROPIC_API_KEY` | Your Anthropic API key (required for Anthropic models) | None |
| `CUA_MAX_IMAGES` | Maximum number of images to keep in context | 3 |
| `CUA_USE_HOST_COMPUTER_SERVER` | Target your local desktop instead of a VM. Set to "true" to use your host system. **Warning:** AI models may perform risky actions. | false |
| Variable | Description | Default |
| ------------------------------ | ----------- | ------- |
| `CUA_MODEL_NAME` | Model string (e.g., "anthropic/claude-sonnet-4-20250514", "openai/computer-use-preview", "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", "omniparser+litellm/gpt-4o", "omniparser+ollama_chat/gemma3") | anthropic/claude-sonnet-4-20250514 |
| `ANTHROPIC_API_KEY` | Your Anthropic API key (required for Anthropic models) | None |
| `CUA_MAX_IMAGES` | Maximum number of images to keep in context | 3 |
| `CUA_USE_HOST_COMPUTER_SERVER` | Target your local desktop instead of a VM. Set to "true" to use your host system. **Warning:** AI models may perform risky actions. | false |

## Model Configuration

The `CUA_MODEL_NAME` environment variable supports various model providers through LiteLLM integration:

### Supported Providers

- **Anthropic**: `anthropic/claude-sonnet-4-20250514`, `anthropic/claude-3-5-sonnet-20240620`, `anthropic/claude-3-haiku-20240307`
- **Anthropic**: `anthropic/claude-sonnet-4-20250514`
- **OpenAI**: `openai/computer-use-preview`, `openai/gpt-4o`
- **Local Models**: `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B`
- **Omni + LiteLLM**: `omniparser+litellm/gpt-4o`, `omniparser+litellm/claude-3-haiku`
@@ -25,6 +26,7 @@ The `CUA_MODEL_NAME` environment variable supports various model providers throu
### Example Configurations

**Claude Desktop Configuration:**

```json
{
  "mcpServers": {
@@ -43,6 +45,7 @@ The `CUA_MODEL_NAME` environment variable supports various model providers throu
```

**Local Model Configuration:**

```json
{
  "mcpServers": {
@@ -61,6 +64,7 @@ The `CUA_MODEL_NAME` environment variable supports various model providers throu
## Session Management Configuration

The MCP server automatically manages sessions with the following defaults:

- **Max Concurrent Sessions**: 10
- **Session Timeout**: 10 minutes of inactivity
- **Computer Pool Size**: 5 instances

@@ -58,7 +58,8 @@ If you're working with the CUA source code directly (like in the CUA repository)
"cua-agent": {
  "command": "/usr/bin/env",
  "args": [
    "bash", "-lc",
    "bash",
    "-lc",
    "export CUA_MODEL_NAME='anthropic/claude-sonnet-4-20250514'; export ANTHROPIC_API_KEY='your-anthropic-api-key-here'; /path/to/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"
  ]
}
@@ -69,16 +70,19 @@ If you're working with the CUA source code directly (like in the CUA repository)
**For host computer control** (development setup):

1. **Install Computer Server Dependencies**:

   ```bash
   python3 -m pip install uvicorn fastapi
   python3 -m pip install -e libs/python/computer-server --break-system-packages
   ```

2. **Start the Computer Server**:

   ```bash
   cd /path/to/cua
   python -m computer_server --log-level debug
   ```

   This will start the computer server on `http://localhost:8000` that controls your actual desktop.

3. **Configure Claude Desktop**:
@@ -88,7 +92,8 @@ If you're working with the CUA source code directly (like in the CUA repository)
"cua-agent": {
  "command": "/usr/bin/env",
  "args": [
    "bash", "-lc",
    "bash",
    "-lc",
    "export CUA_MODEL_NAME='anthropic/claude-sonnet-4-20250514'; export ANTHROPIC_API_KEY='your-anthropic-api-key-here'; export CUA_USE_HOST_COMPUTER_SERVER='true'; export CUA_MAX_IMAGES='1'; /path/to/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"
  ]
}
@@ -110,6 +115,7 @@ If you're working with the CUA source code directly (like in the CUA repository)
   - Check logs for specific error messages

2. **"Missing Anthropic API Key"** - Add your API key to the configuration:

   ```json
   "env": {
     "ANTHROPIC_API_KEY": "your-api-key-here"
@@ -118,8 +124,6 @@ If you're working with the CUA source code directly (like in the CUA repository)

3. **"model not found"** - Use a valid model name:
   - ✅ `anthropic/claude-sonnet-4-20250514`
   - ✅ `anthropic/claude-3-5-sonnet-20240620`
   - ❌ `anthropic/claude-3-5-sonnet-20241022` (doesn't exist)

4. **Script not found** - If you get a `/bin/bash: ~/cua/libs/python/mcp-server/scripts/start_mcp_server.sh: No such file or directory` error, try changing the path to the script to be absolute instead of relative.

@@ -130,6 +134,7 @@ If you're working with the CUA source code directly (like in the CUA repository)
- **Image size errors**: Use `CUA_MAX_IMAGES='1'` to reduce image context size

**Viewing Logs:**

```bash
tail -n 20 -f ~/Library/Logs/Claude/mcp*.log
```

@@ -12,7 +12,7 @@ This MCP server features comprehensive liteLLM integration, allowing you to use

### Model String Examples:

- **Anthropic**: `"anthropic/claude-3-5-sonnet-20241022"`
- **Anthropic**: `"anthropic/claude-sonnet-4-5-20250929"`
- **OpenAI**: `"openai/computer-use-preview"`
- **UI-TARS**: `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"`
- **Omni + Any LiteLLM**: `"omniparser+litellm/gpt-4o"`, `"omniparser+litellm/claude-3-haiku"`, `"omniparser+ollama_chat/gemma3"`

@@ -45,17 +45,20 @@ The MCP server supports multi-client sessions with automatic resource management
## Usage Examples

### Basic Task Execution

```
"Open Chrome and navigate to github.com"
"Create a folder called 'Projects' on my desktop"
```

### Multi-Task Execution

```
"Run these tasks: 1) Open Finder, 2) Navigate to Documents, 3) Create a new folder called 'Work'"
```

### Session Management

```
"Take a screenshot of the current screen"
"Show me the session statistics"

@@ -16,27 +16,35 @@ Claude will automatically use your CUA agent to perform these tasks.
## Advanced Features

### Progress Reporting

The MCP server provides real-time progress updates during task execution:

- Task progress is reported as percentages (0-100%)
- Multi-task operations show progress for each individual task
- Progress updates are streamed to the MCP client for real-time feedback

### Error Handling

Robust error handling ensures reliable operation:

- Failed tasks return error messages with screenshots when possible
- Session state is preserved even when individual tasks fail
- Automatic cleanup prevents resource leaks
- Detailed error logging for troubleshooting

### Concurrent Task Execution

For improved performance, multiple tasks can run concurrently:

- Set `concurrent=true` in `run_multi_cua_tasks` for parallel execution
- Each task runs in its own context with isolated state
- Progress tracking works for both sequential and concurrent modes
- Resource pooling ensures efficient computer instance usage

### Session Management

Multi-client support with automatic resource management:

- Each client gets isolated sessions with separate computer instances
- Sessions automatically clean up after 10 minutes of inactivity
- Resource pooling prevents resource exhaustion

@@ -55,7 +63,8 @@ No additional configuration is needed - this is the default behavior.
### Option: Targeting Your Local Desktop

<Callout type="warn">
**Warning:** When targeting your local system, AI models have direct access to your desktop and may perform risky actions. Use with caution.
**Warning:** When targeting your local system, AI models have direct access to your desktop and
may perform risky actions. Use with caution.
</Callout>

To have the MCP server control your local desktop instead of a VM:
@@ -82,13 +91,14 @@ Add the `CUA_USE_HOST_COMPUTER_SERVER` environment variable to your MCP client c
"command": "/bin/bash",
"args": ["~/.cua/start_mcp_server.sh"],
"env": {
  "CUA_MODEL_NAME": "anthropic/claude-3-5-sonnet-20241022",
  "CUA_MODEL_NAME": "anthropic/claude-sonnet-4-5-20250929",
  "CUA_USE_HOST_COMPUTER_SERVER": "true"
}
}
}
}
```

</Tab>
<Tab value="Other MCP Clients">
Set the environment variable in your MCP client configuration:
@@ -98,6 +108,7 @@ Add the `CUA_USE_HOST_COMPUTER_SERVER` environment variable to your MCP client c
```

Then start your MCP client as usual.

</Tab>
</Tabs>

@@ -108,6 +119,7 @@ Now Claude will control your local desktop directly when you ask it to perform c
## Usage Examples

### Single Task Execution

```
"Open Safari and navigate to apple.com"
"Create a new folder on the desktop called 'My Projects'"
@@ -115,16 +127,19 @@ Now Claude will control your local desktop directly when you ask it to perform c
```

### Multi-Task Execution (Sequential)

```
"Run these tasks in order: 1) Open Finder, 2) Navigate to Documents folder, 3) Create a new folder called 'Work'"
```

### Multi-Task Execution (Concurrent)

```
"Run these tasks simultaneously: 1) Open Chrome, 2) Open Safari, 3) Open Finder"
```

### Session Management

```
"Show me the current session statistics"
"Take a screenshot using session abc123"
@@ -132,6 +147,7 @@ Now Claude will control your local desktop directly when you ask it to perform c
```

### Error Recovery

```
"Try to open a non-existent application and show me the error"
"Find all files with .tmp extension and delete them safely"
@@ -140,13 +156,14 @@ Now Claude will control your local desktop directly when you ask it to perform c
## First-time Usage Notes

**API Keys**: Ensure you have valid API keys:
- Add your Anthropic API key in the Claude Desktop config (as shown above)
- Or set it as an environment variable in your shell profile
- **Required**: The MCP server needs an API key to authenticate with the model provider

- Add your Anthropic API key in the Claude Desktop config (as shown above)
- Or set it as an environment variable in your shell profile
- **Required**: The MCP server needs an API key to authenticate with the model provider

**Model Selection**: Choose the appropriate model for your needs:
- **Claude Sonnet 4**: Latest model with best performance (`anthropic/claude-sonnet-4-20250514`)
- **Claude 3.5 Sonnet**: Reliable performance (`anthropic/claude-3-5-sonnet-20240620`)
- **Computer-Use Preview**: Specialized for computer tasks (`openai/computer-use-preview`)
- **Local Models**: For privacy-sensitive environments
- **Ollama**: For offline usage

- **Claude Sonnet 4**: Latest model with best performance (`anthropic/claude-sonnet-4-20250514`)
- **Computer-Use Preview**: Specialized for computer tasks (`openai/computer-use-preview`)
- **Local Models**: For privacy-sensitive environments
- **Ollama**: For offline usage

@@ -7,7 +7,11 @@ github:
---

<Callout>
A corresponding <a href="https://github.com/trycua/cua/blob/main/examples/som_examples.py" target="_blank">Python example</a> is available for this documentation.
A corresponding{' '}
<a href="https://github.com/trycua/cua/blob/main/examples/som_examples.py" target="_blank">
  Python example
</a>{' '}
is available for this documentation.
</Callout>

## Overview

BIN docs/public/img/grounding-with-gemini3.gif (new file; 5.2 MiB, binary file not shown)

@@ -53,6 +53,10 @@ async def run_agent_example():
        # == Omniparser + Any LLM ==
        # model="omniparser+anthropic/claude-opus-4-20250514",
        # model="omniparser+ollama_chat/gemma3:12b-it-q4_K_M",
        # == Omniparser + Vertex AI Gemini 3 (with thinking_level) ==
        # model="omni+vertex_ai/gemini-3-flash",
        # thinking_level="high",  # or "low"
        # media_resolution="medium",  # or "low" or "high"
        tools=[computer],
        only_n_most_recent_images=3,
        verbosity=logging.DEBUG,

@@ -51,7 +51,7 @@ async def main():

        # Create agent
        agent = ComputerAgent(
            model="anthropic/claude-3-5-sonnet-20241022",
            model="anthropic/claude-sonnet-4-5-20250929",
            tools=[computer],
            only_n_most_recent_images=3,
            trajectory_dir="trajectories",

@@ -189,7 +189,7 @@ class ComputerAgent:
        Initialize ComputerAgent.

        Args:
            model: Model name (e.g., "claude-3-5-sonnet-20241022", "computer-use-preview", "omni+vertex_ai/gemini-pro")
            model: Model name (e.g., "claude-sonnet-4-5-20250929", "computer-use-preview", "omni+vertex_ai/gemini-pro")
            tools: List of tools (computer objects, decorated functions, etc.)
            custom_loop: Custom agent loop function to use instead of auto-selection
            only_n_most_recent_images: If set, only keep the N most recent images in message history. Adds ImageRetentionCallback automatically.

@@ -7,7 +7,7 @@ Usage:

Examples:
    python -m agent.cli openai/computer-use-preview
    python -m agent.cli anthropic/claude-sonnet-4-5-20250929
    python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
    python -m agent.cli omniparser+anthropic/claude-sonnet-4-5-20250929
"""

try:

@@ -233,7 +233,7 @@ async def main():
Examples:
    python -m agent.cli openai/computer-use-preview
    python -m agent.cli anthropic/claude-sonnet-4-5-20250929
    python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
    python -m agent.cli omniparser+anthropic/claude-sonnet-4-5-20250929
    python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
""",
)

@@ -671,11 +671,12 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
        # Handle custom function tools (not computer tools)
        if tool_name != "computer":
            from ..responses import make_function_call_item
            responses_items.append(make_function_call_item(
                function_name=tool_name,
                arguments=tool_input,
                call_id=call_id
            ))

            responses_items.append(
                make_function_call_item(
                    function_name=tool_name, arguments=tool_input, call_id=call_id
                )
            )
            continue

        # Computer tool - process actions

@@ -883,16 +884,17 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
        # Handle custom function tools
        if tool_name != "computer":
            from ..responses import make_function_call_item

            # tool_call.function.arguments is a JSON string, need to parse it
            try:
                args_dict = json.loads(tool_call.function.arguments)
            except json.JSONDecodeError:
                args_dict = {}
            responses_items.append(make_function_call_item(
                function_name=tool_name,
                arguments=args_dict,
                call_id=tool_call.id
            ))
            responses_items.append(
                make_function_call_item(
                    function_name=tool_name, arguments=args_dict, call_id=tool_call.id
                )
            )
            continue

        # Handle computer tool

@@ -20,6 +20,7 @@ from ..loops.base import AsyncAgentConfig
from ..responses import (
    convert_completion_messages_to_responses_items,
    convert_responses_items_to_completion_messages,
    make_reasoning_item,
)
from ..types import AgentCapability

@@ -373,13 +374,23 @@ class GenericVlmConfig(AsyncAgentConfig):
        if _on_usage:
            await _on_usage(usage)

        # Parse tool call from text; then convert to responses items via fake tool_calls
        # Extract response data
        resp_dict = response.model_dump()  # type: ignore
        choice = (resp_dict.get("choices") or [{}])[0]
        content_text = ((choice.get("message") or {}).get("content")) or ""
        tool_call = _parse_tool_call_from_text(content_text)
        message = choice.get("message") or {}
        content_text = message.get("content") or ""
        tool_calls_array = message.get("tool_calls") or []
        reasoning_text = message.get("reasoning") or ""

        output_items: List[Dict[str, Any]] = []

        # Add reasoning if present (Ollama Cloud format)
        if reasoning_text:
            output_items.append(make_reasoning_item(reasoning_text))

        # Priority 1: Try to parse tool call from content text (OpenRouter format)
        tool_call = _parse_tool_call_from_text(content_text)

        if tool_call and isinstance(tool_call, dict):
            fn_name = tool_call.get("name") or "computer"
            raw_args = tool_call.get("arguments") or {}
@@ -405,8 +416,50 @@ class GenericVlmConfig(AsyncAgentConfig):
                ],
            }
            output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))
        elif tool_calls_array:
            # Priority 2: Use tool_calls field if present (Ollama Cloud format)
            # Process and unnormalize coordinates in tool calls
            processed_tool_calls = []
            for tc in tool_calls_array:
                function = tc.get("function", {})
                fn_name = function.get("name", "computer")
                args_str = function.get("arguments", "{}")

                try:
                    args = json.loads(args_str)

                    # Unnormalize coordinates if present
                    if "coordinate" in args and last_rw is not None and last_rh is not None:
                        args = await _unnormalize_coordinate(args, (last_rw, last_rh))

                    # Convert Qwen format to Computer Calls format if this is a computer tool
                    if fn_name == "computer":
                        converted_action = convert_qwen_tool_args_to_computer_action(args)
                        if converted_action:
                            args = converted_action

                    processed_tool_calls.append(
                        {
                            "type": tc.get("type", "function"),
                            "id": tc.get("id", "call_0"),
                            "function": {
                                "name": fn_name,
                                "arguments": json.dumps(args),
                            },
                        }
                    )
                except json.JSONDecodeError:
                    # Keep original if parsing fails
                    processed_tool_calls.append(tc)

            fake_cm = {
                "role": "assistant",
                "content": content_text if content_text else "",
                "tool_calls": processed_tool_calls,
            }
            output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))
        else:
            # Fallback: just return assistant text
            # No tool calls found in either format, return text response
            fake_cm = {"role": "assistant", "content": content_text}
            output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))

@@ -365,6 +365,22 @@ class OmniparserConfig(AsyncAgentConfig):
            **kwargs,
        }

        # Add Vertex AI specific parameters if using vertex_ai models
        if llm_model.startswith("vertex_ai/"):
            import os

            # Pass vertex_project and vertex_location to liteLLM
            if "vertex_project" not in api_kwargs:
                api_kwargs["vertex_project"] = os.getenv("GOOGLE_CLOUD_PROJECT")
            if "vertex_location" not in api_kwargs:
                api_kwargs["vertex_location"] = "global"

            # Pass through Gemini 3-specific parameters if provided
            if "thinking_level" in kwargs:
                api_kwargs["thinking_level"] = kwargs["thinking_level"]
            if "media_resolution" in kwargs:
                api_kwargs["media_resolution"] = kwargs["media_resolution"]

        # Call API start hook
        if _on_api_start:
            await _on_api_start(api_kwargs)
@@ -5,13 +5,14 @@ UITARS-2 agent loop implementation using LiteLLM.
- Calls litellm.acompletion
- Parses <seed:tool_call> ... </seed:tool_call> outputs back into Responses items (computer actions)
"""

from __future__ import annotations

import re
from typing import Any, Dict, List, Optional, Tuple
import base64
import io
import json
import re
from typing import Any, Dict, List, Optional, Tuple

import litellm
from litellm.responses.litellm_completion_transformation.transformation import (
@@ -20,37 +21,45 @@ from litellm.responses.litellm_completion_transformation.transformation import (

from ..decorators import register_agent
from .omniparser import get_last_computer_call_output  # type: ignore

try:
    from PIL import Image  # type: ignore
except Exception:  # pragma: no cover
    Image = None  # type: ignore
from ..responses import (
    convert_responses_items_to_completion_messages,
    make_click_item,
    make_double_click_item,
    make_drag_item,
    make_function_call_item,
    make_keypress_item,
    make_screenshot_item,
    make_move_item,
    make_output_text_item,
    make_reasoning_item,
    make_screenshot_item,
    make_scroll_item,
    make_type_item,
    make_wait_item,
    convert_responses_items_to_completion_messages,
)
from ..types import AgentCapability


TOOL_SCHEMAS: List[Dict[str, Any]] = [
    {"type": "function", "name": "open_computer", "parameters": {}, "description": "Open computer."},
    {
        "type": "function",
        "name": "open_computer",
        "parameters": {},
        "description": "Open computer.",
    },
    {
        "type": "function",
        "name": "click",
        "parameters": {
            "type": "object",
            "properties": {
                "point": {"type": "string", "description": "Click coordinates. The format is: <point>x y</point>"}
                "point": {
                    "type": "string",
                    "description": "Click coordinates. The format is: <point>x y</point>",
                }
            },
            "required": ["point"],
        },
@@ -62,7 +71,10 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
        "parameters": {
            "type": "object",
            "properties": {
                "point": {"type": "string", "description": "Click coordinates. The format is: <point>x y</point>"}
                "point": {
                    "type": "string",
                    "description": "Click coordinates. The format is: <point>x y</point>",
                }
            },
            "required": ["point"],
        },
@@ -74,7 +86,10 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
        "parameters": {
            "type": "object",
            "properties": {
                "point": {"type": "string", "description": "Click coordinates. The format is: <point>x y</point>"}
                "point": {
                    "type": "string",
                    "description": "Click coordinates. The format is: <point>x y</point>",
                }
            },
            "required": ["point"],
        },
@@ -106,7 +121,10 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
        "parameters": {
            "type": "object",
            "properties": {
                "point": {"type": "string", "description": "Target coordinates. The format is: <point>x y</point>"}
                "point": {
                    "type": "string",
                    "description": "Target coordinates. The format is: <point>x y</point>",
                }
            },
            "required": ["point"],
        },
@@ -117,7 +135,12 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
        "name": "hotkey",
        "parameters": {
            "type": "object",
            "properties": {"key": {"type": "string", "description": "Hotkeys you want to press. Split keys with a space and use lowercase."}},
            "properties": {
                "key": {
                    "type": "string",
                    "description": "Hotkeys you want to press. Split keys with a space and use lowercase.",
                }
            },
            "required": ["key"],
        },
        "description": "Press hotkey.",
@@ -227,9 +250,7 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
        "name": "wait",
        "parameters": {
            "type": "object",
            "properties": {
                "time": {"type": "integer", "description": "Wait time in seconds."}
            },
            "properties": {"time": {"type": "integer", "description": "Wait time in seconds."}},
            "required": [],
        },
        "description": "Wait for a while.",
@@ -268,7 +289,12 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
        },
        "description": "Type content.",
    },
    {"type": "function", "name": "take_screenshot", "parameters": {}, "description": "Take screenshot."},
    {
        "type": "function",
        "name": "take_screenshot",
        "parameters": {},
        "description": "Take screenshot.",
    },
]
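`_format_tool_schemas_json_lines` is used below to build the system prompt but is not shown in this diff; a plausible sketch, assuming each schema is emitted as one compact JSON object per line:

```python
import json
from typing import Any, Dict, List


def format_tool_schemas_json_lines(schemas: List[Dict[str, Any]]) -> str:
    # One JSON line per tool schema, concatenated into the prompt body.
    return "\n".join(json.dumps(s, ensure_ascii=False) for s in schemas)
```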
@@ -319,7 +345,9 @@ _PROMPT_SUFFIX = (
SYSTEM_PROMPT = _PROMPT_PREFIX + _format_tool_schemas_json_lines(TOOL_SCHEMAS) + _PROMPT_SUFFIX


def _extract_function_schemas_from_tools(tools: Optional[List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
def _extract_function_schemas_from_tools(
    tools: Optional[List[Dict[str, Any]]],
) -> List[Dict[str, Any]]:
    schemas: List[Dict[str, Any]] = []
    if not tools:
        return schemas
@@ -330,12 +358,14 @@ def _extract_function_schemas_from_tools(tools: Optional[List[Dict[str, Any]]])
        params = fn.get("parameters", {})
        desc = fn.get("description", "")
        if name:
            schemas.append({
                "type": "function",
                "name": name,
                "parameters": params if isinstance(params, dict) else {},
                "description": desc,
            })
            schemas.append(
                {
                    "type": "function",
                    "name": name,
                    "parameters": params if isinstance(params, dict) else {},
                    "description": desc,
                }
            )
    return schemas


@@ -392,7 +422,9 @@ def _denormalize_xy_from_uitars(nx: float, ny: float, width: int, height: int) -
    return x, y


def _map_computer_action_to_function(action: Dict[str, Any], width: int, height: int) -> Optional[Dict[str, Any]]:
def _map_computer_action_to_function(
    action: Dict[str, Any], width: int, height: int
) -> Optional[Dict[str, Any]]:
    """Map a computer action item to a UITARS function + parameters dict of strings.
    Returns dict like {"function": name, "parameters": {..}} or None if unknown.
    """
@@ -404,7 +436,10 @@ def _map_computer_action_to_function(action: Dict[str, Any], width: int, height:
            return None
        nx, ny = _normalize_xy_to_uitars(int(x), int(y), width, height)
        if btn == "right":
            return {"function": "right_single", "parameters": {"point": f"<point>{nx} {ny}</point>"}}
            return {
                "function": "right_single",
                "parameters": {"point": f"<point>{nx} {ny}</point>"},
            }
        return {"function": "click", "parameters": {"point": f"<point>{nx} {ny}</point>"}}
    if atype == "double_click":
        x, y = action.get("x"), action.get("y")
@@ -434,8 +469,19 @@ def _map_computer_action_to_function(action: Dict[str, Any], width: int, height:
        nx, ny = _normalize_xy_to_uitars(int(x), int(y), width, height)
        sx, sy = action.get("scroll_x", 0), action.get("scroll_y", 0)
        # Our parser used positive sy for up
        direction = "up" if sy and sy > 0 else ("down" if sy and sy < 0 else ("right" if sx and sx > 0 else ("left" if sx and sx < 0 else "down")))
        return {"function": "scroll", "parameters": {"direction": direction, "point": f"<point>{nx} {ny}</point>"}}
        direction = (
            "up"
            if sy and sy > 0
            else (
                "down"
                if sy and sy < 0
                else ("right" if sx and sx > 0 else ("left" if sx and sx < 0 else "down"))
            )
        )
        return {
            "function": "scroll",
            "parameters": {"direction": direction, "point": f"<point>{nx} {ny}</point>"},
        }
    if atype == "drag":
        path = action.get("path", [])
        if isinstance(path, list) and len(path) >= 2:
@@ -461,7 +507,9 @@ def _map_computer_action_to_function(action: Dict[str, Any], width: int, height:
    return None


def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int) -> List[Dict[str, Any]]:
def _to_uitars_messages(
    messages: List[Dict[str, Any]], width: int, height: int
) -> List[Dict[str, Any]]:
    """Convert responses items into completion messages tailored for UI-TARS.

    - User content is passed through similar to convert_responses_items_to_completion_messages
@@ -505,7 +553,9 @@ def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int)
            completion_content = []
            for item in content:
                if item.get("type") == "input_image":
                    completion_content.append({"type": "image_url", "image_url": {"url": item.get("image_url")}})
                    completion_content.append(
                        {"type": "image_url", "image_url": {"url": item.get("image_url")}}
                    )
                elif item.get("type") in ("input_text", "text"):
                    completion_content.append({"type": "text", "text": item.get("text")})
            uitars_messages.append({"role": "user", "content": completion_content})
@@ -517,7 +567,11 @@ def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int)
        if mtype == "reasoning":
            # Responses reasoning stores summary list
            summary = msg.get("summary", [])
            texts = [s.get("text", "") for s in summary if isinstance(s, dict) and s.get("type") == "summary_text"]
            texts = [
                s.get("text", "")
                for s in summary
                if isinstance(s, dict) and s.get("type") == "summary_text"
            ]
            if texts:
                pending_think = "\n".join([t for t in texts if t])
            continue
@@ -546,9 +600,15 @@ def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int)
                pending_think, pending_functions = None, []
            content = msg.get("content", [])
            if isinstance(content, list):
                texts = [c.get("text", "") for c in content if isinstance(c, dict) and c.get("type") in ("output_text", "text")]
                texts = [
                    c.get("text", "")
                    for c in content
                    if isinstance(c, dict) and c.get("type") in ("output_text", "text")
                ]
                if texts:
                    uitars_messages.append({"role": "assistant", "content": "\n".join([t for t in texts if t])})
                    uitars_messages.append(
                        {"role": "assistant", "content": "\n".join([t for t in texts if t])}
                    )
            elif isinstance(content, str) and content:
                uitars_messages.append({"role": "assistant", "content": content})
            continue
@@ -581,8 +641,12 @@ def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int)

    return uitars_messages


def _to_response_items(
    actions: List[Dict[str, Any]], tool_names: Optional[set[str]] = None, width: Optional[int] = None, height: Optional[int] = None
    actions: List[Dict[str, Any]],
    tool_names: Optional[set[str]] = None,
    width: Optional[int] = None,
    height: Optional[int] = None,
) -> List[Any]:
    """Map parsed actions into Responses items (computer actions + optional reasoning)."""
    items: List[Any] = []
@@ -736,8 +800,12 @@ class UITARS2Config:

        # Build dynamic system prompt by concatenating built-in schemas and provided function tools
        provided_fn_schemas = _extract_function_schemas_from_tools(tools)
        combined_schemas = TOOL_SCHEMAS + provided_fn_schemas if provided_fn_schemas else TOOL_SCHEMAS
        dynamic_system_prompt = _PROMPT_PREFIX + _format_tool_schemas_json_lines(combined_schemas) + _PROMPT_SUFFIX
        combined_schemas = (
            TOOL_SCHEMAS + provided_fn_schemas if provided_fn_schemas else TOOL_SCHEMAS
        )
        dynamic_system_prompt = (
            _PROMPT_PREFIX + _format_tool_schemas_json_lines(combined_schemas) + _PROMPT_SUFFIX
        )

        # Prepend system prompt (based on training prompts + provided tools)
        litellm_messages: List[Dict[str, Any]] = [
@@ -829,7 +897,10 @@ class UITARS2Config:
                "role": "user",
                "content": [
                    {"type": "text", "text": "Please return a single click action."},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                    },
                ],
            },
        ]
@@ -841,7 +912,9 @@ class UITARS2Config:
            "temperature": kwargs.get("temperature", 0.0),
            "do_sample": kwargs.get("temperature", 0.0) > 0.0,
        }
        api_kwargs.update({k: v for k, v in (kwargs or {}).items() if k not in ["max_tokens", "temperature"]})
        api_kwargs.update(
            {k: v for k, v in (kwargs or {}).items() if k not in ["max_tokens", "temperature"]}
        )

        response = await litellm.acompletion(**api_kwargs)
        # Extract response content
@@ -852,7 +925,11 @@ class UITARS2Config:
        msg = choices[0].get("message", {})
        content_text = msg.get("content", "")
        if isinstance(content_text, list):
            text_parts = [p.get("text", "") for p in content_text if isinstance(p, dict) and p.get("type") == "text"]
            text_parts = [
                p.get("text", "")
                for p in content_text
                if isinstance(p, dict) and p.get("type") == "text"
            ]
            content_text = "\n".join([t for t in text_parts if t])
        if not isinstance(content_text, str):
            return None
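The `<point>x y</point>` parameters above depend on `_normalize_xy_to_uitars` and `_denormalize_xy_from_uitars`, whose bodies fall outside this diff. A sketch of the round-trip, assuming the 0-1000 virtual grid commonly used by UI-TARS-style models (an assumption; the real rounding may differ):

```python
def normalize_xy(x: int, y: int, width: int, height: int) -> tuple[int, int]:
    # Pixel coordinates -> model-space coordinates on a 0-1000 grid.
    return round(x * 1000 / width), round(y * 1000 / height)


def denormalize_xy(nx: float, ny: float, width: int, height: int) -> tuple[int, int]:
    # Model-space coordinates back to pixels for the actual screen size.
    return round(nx * width / 1000), round(ny * height / 1000)


# Example: a click at (640, 360) on a 1280x720 screen maps to <point>500 500</point>.
```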
@@ -22,14 +22,14 @@ async def test_http_endpoint():

    # Example 1: Simple text request
    simple_request = {
        "model": "anthropic/claude-3-5-sonnet-20241022",
        "model": "anthropic/claude-sonnet-4-5-20250929",
        "input": "Tell me a three sentence bedtime story about a unicorn.",
        "env": {"ANTHROPIC_API_KEY": anthropic_api_key},
    }

    # Example 2: Multi-modal request with image
    multimodal_request = {
        "model": "anthropic/claude-3-5-sonnet-20241022",
        "model": "anthropic/claude-sonnet-4-5-20250929",
        "input": [
            {
                "role": "user",
@@ -47,7 +47,7 @@ async def test_http_endpoint():

    # Example 3: Request with custom agent and computer kwargs
    custom_request = {
        "model": "anthropic/claude-3-5-sonnet-20241022",
        "model": "anthropic/claude-sonnet-4-5-20250929",
        "input": "Take a screenshot and tell me what you see",
        "env": {"ANTHROPIC_API_KEY": anthropic_api_key},
    }
@@ -95,7 +95,7 @@ def curl_examples():
        """curl http://localhost:8000/responses \\
  -H "Content-Type: application/json" \\
  -d '{
    "model": "anthropic/claude-3-5-sonnet-20241022",
    "model": "anthropic/claude-sonnet-4-5-20250929",
    "input": "Tell me a three sentence bedtime story about a unicorn."
  }'"""
    )
@@ -105,7 +105,7 @@ def curl_examples():
        """curl http://localhost:8000/responses \\
  -H "Content-Type: application/json" \\
  -d '{
    "model": "anthropic/claude-3-5-sonnet-20241022",
    "model": "anthropic/claude-sonnet-4-5-20250929",
    "input": [
      {
        "role": "user",
@@ -126,7 +126,7 @@ def curl_examples():
        """curl http://localhost:8000/responses \\
  -H "Content-Type: application/json" \\
  -d '{
    "model": "anthropic/claude-3-5-sonnet-20241022",
    "model": "anthropic/claude-sonnet-4-5-20250929",
    "input": "Take a screenshot and tell me what you see",
    "agent_kwargs": {
      "save_trajectory": true,
@@ -166,7 +166,7 @@ async def test_p2p_client():

    # Send a test request
    request = {
        "model": "anthropic/claude-3-5-sonnet-20241022",
        "model": "anthropic/claude-sonnet-4-5-20250929",
        "input": "Hello from P2P client!",
    }
    await connection.send(json.dumps(request))
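A Python equivalent of the requests exercised above, assuming the agent proxy is listening locally on port 8000 as in the curl examples:

```python
import asyncio

import httpx


async def main() -> None:
    async with httpx.AsyncClient(timeout=120) as client:
        resp = await client.post(
            "http://localhost:8000/responses",
            json={
                "model": "anthropic/claude-sonnet-4-5-20250929",
                "input": "Tell me a three sentence bedtime story about a unicorn.",
            },
        )
        print(resp.json())


asyncio.run(main())
```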
@@ -6,9 +6,9 @@ with an advanced UI for model selection and configuration.

Supported Agent Models:
- OpenAI: openai/computer-use-preview
- Anthropic: anthropic/claude-3-5-sonnet-20241022, anthropic/claude-3-7-sonnet-20250219
- Anthropic: anthropic/claude-sonnet-4-5-20250929, anthropic/claude-3-7-sonnet-20250219
- UI-TARS: huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
- Omniparser: omniparser+anthropic/claude-3-5-sonnet-20241022, omniparser+ollama_chat/gemma3
- Omniparser: omniparser+anthropic/claude-sonnet-4-5-20250929, omniparser+ollama_chat/gemma3

Requirements:
- Mac with Apple Silicon (M1/M2/M3/M4), Linux, or Windows
@@ -116,14 +116,12 @@ MODEL_MAPPINGS = {
        "Anthropic: Claude 4 Opus (20250514)": "anthropic/claude-opus-4-20250514",
        "Anthropic: Claude 4 Sonnet (20250514)": "anthropic/claude-sonnet-4-20250514",
        "Anthropic: Claude 3.7 Sonnet (20250219)": "anthropic/claude-3-7-sonnet-20250219",
        "Anthropic: Claude 3.5 Sonnet (20241022)": "anthropic/claude-3-5-sonnet-20241022",
    },
    "omni": {
        "default": "omniparser+openai/gpt-4o",
        "OMNI: OpenAI GPT-4o": "omniparser+openai/gpt-4o",
        "OMNI: OpenAI GPT-4o mini": "omniparser+openai/gpt-4o-mini",
        "OMNI: Claude 3.7 Sonnet (20250219)": "omniparser+anthropic/claude-3-7-sonnet-20250219",
        "OMNI: Claude 3.5 Sonnet (20241022)": "omniparser+anthropic/claude-3-5-sonnet-20241022",
    },
    "uitars": {
        "default": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B" if is_mac else "ui-tars",

@@ -44,13 +44,11 @@ def create_gradio_ui() -> gr.Blocks:
        "Anthropic: Claude 4 Opus (20250514)",
        "Anthropic: Claude 4 Sonnet (20250514)",
        "Anthropic: Claude 3.7 Sonnet (20250219)",
        "Anthropic: Claude 3.5 Sonnet (20241022)",
    ]
    omni_models = [
        "OMNI: OpenAI GPT-4o",
        "OMNI: OpenAI GPT-4o mini",
        "OMNI: Claude 3.7 Sonnet (20250219)",
        "OMNI: Claude 3.5 Sonnet (20241022)",
    ]

    # Check if API keys are available

@@ -102,7 +102,7 @@ async def main():
        # model="anthropic/claude-opus-4-20250514",
        # model="anthropic/claude-sonnet-4-20250514",
        # model="anthropic/claude-3-7-sonnet-20250219",
        # model="anthropic/claude-3-5-sonnet-20241022",
        # model="anthropic/claude-sonnet-4-5-20250929",
        # == UI-TARS ==
        # model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
        # TODO: add local mlx provider
@@ -24,7 +24,7 @@ def mock_litellm():
        "id": "chatcmpl-test123",
        "object": "chat.completion",
        "created": 1234567890,
        "model": kwargs.get("model", "anthropic/claude-3-5-sonnet-20241022"),
        "model": kwargs.get("model", "anthropic/claude-sonnet-4-5-20250929"),
        "choices": [
            {
                "index": 0,

@@ -18,18 +18,18 @@ class TestComputerAgentInitialization:
        """Test that agent can be initialized with a model string."""
        from agent import ComputerAgent

        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")

        assert agent is not None
        assert hasattr(agent, "model")
        assert agent.model == "anthropic/claude-3-5-sonnet-20241022"
        assert agent.model == "anthropic/claude-sonnet-4-5-20250929"

    @patch("agent.agent.litellm")
    def test_agent_initialization_with_tools(self, mock_litellm, disable_telemetry, mock_computer):
        """Test that agent can be initialized with tools."""
        from agent import ComputerAgent

        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022", tools=[mock_computer])
        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929", tools=[mock_computer])

        assert agent is not None
        assert hasattr(agent, "tools")
@@ -41,7 +41,7 @@ class TestComputerAgentInitialization:

        budget = 5.0
        agent = ComputerAgent(
            model="anthropic/claude-3-5-sonnet-20241022", max_trajectory_budget=budget
            model="anthropic/claude-sonnet-4-5-20250929", max_trajectory_budget=budget
        )

        assert agent is not None
@@ -79,7 +79,7 @@ class TestComputerAgentRun:

        mock_litellm.acompletion = AsyncMock(return_value=mock_response)

        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")

        # Run should return an async generator
        result_generator = agent.run(sample_messages)
@@ -92,7 +92,7 @@ class TestComputerAgentRun:
        """Test that agent has run method available."""
        from agent import ComputerAgent

        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")

        # Verify run method exists
        assert hasattr(agent, "run")
@@ -102,7 +102,7 @@ class TestComputerAgentRun:
        """Test that agent has agent_loop initialized."""
        from agent import ComputerAgent

        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")

        # Verify agent_loop is initialized
        assert hasattr(agent, "agent_loop")
@@ -132,7 +132,7 @@ class TestComputerAgentIntegration:
        """Test that agent can be initialized with Computer tool."""
        from agent import ComputerAgent

        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022", tools=[mock_computer])
        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929", tools=[mock_computer])

        # Verify agent accepted the tool
        assert agent is not None
@@ -133,7 +133,7 @@ await cleanup_session(ctx, "session-to-cleanup")

### Environment Variables

- `CUA_MODEL_NAME`: Model to use (default: `anthropic/claude-3-5-sonnet-20241022`)
- `CUA_MODEL_NAME`: Model to use (default: `anthropic/claude-sonnet-4-5-20250929`)
- `CUA_MAX_IMAGES`: Maximum images to keep (default: `3`)

### Session Manager Configuration

@@ -44,7 +44,7 @@ Add this to your MCP client configuration:
      "args": [
        "bash",
        "-lc",
        "export CUA_MODEL_NAME='anthropic/claude-3-5-sonnet-20241022'; ~/.cua/start_mcp_server.sh"
        "export CUA_MODEL_NAME='anthropic/claude-sonnet-4-5-20250929'; ~/.cua/start_mcp_server.sh"
      ]
    }
  }

@@ -156,7 +156,7 @@ def serve() -> FastMCP:

    try:
        # Get model name
        model_name = os.getenv("CUA_MODEL_NAME", "anthropic/claude-3-5-sonnet-20241022")
        model_name = os.getenv("CUA_MODEL_NAME", "anthropic/claude-sonnet-4-5-20250929")
        logger.info(f"Using model: {model_name}")

        # Create agent with the new v0.4.x API

@@ -168,7 +168,7 @@ def print_usage_examples():
      "command": "/bin/bash",
      "args": ["~/.cua/start_mcp_server.sh"],
      "env": {
        "CUA_MODEL_NAME": "anthropic/claude-3-5-sonnet-20241022"
        "CUA_MODEL_NAME": "anthropic/claude-sonnet-4-5-20250929"
      }
    }
  }
@@ -192,7 +192,7 @@ Step 2: Configure MCP client:
      "command": "/bin/bash",
      "args": ["~/.cua/start_mcp_server.sh"],
      "env": {
        "CUA_MODEL_NAME": "anthropic/claude-3-5-sonnet-20241022",
        "CUA_MODEL_NAME": "anthropic/claude-sonnet-4-5-20250929",
        "CUA_USE_HOST_COMPUTER_SERVER": "true"
      }
    }
@@ -32,7 +32,7 @@ const peerClient = new AgentClient('peer://my-agent-proxy');

// Send a simple text request
const response = await client.responses.create({
  model: 'anthropic/claude-3-5-sonnet-20241022',
  model: 'anthropic/claude-sonnet-4-5-20250929',
  input: 'Write a one-sentence bedtime story about a unicorn.',
  // Optional per-request env overrides
  env: {
@@ -47,7 +47,7 @@ console.log(response.output);

```typescript
const response = await client.responses.create({
  model: 'anthropic/claude-3-5-sonnet-20241022',
  model: 'anthropic/claude-sonnet-4-5-20250929',
  input: [
    {
      role: 'user',
@@ -74,7 +74,7 @@ const client = new AgentClient('https://localhost:8000', {
});

const response = await client.responses.create({
  model: 'anthropic/claude-3-5-sonnet-20241022',
  model: 'anthropic/claude-sonnet-4-5-20250929',
  input: 'Hello, world!',
  agent_kwargs: {
    save_trajectory: true,

@@ -42,7 +42,7 @@ A simple HTML page that demonstrates using the CUA Agent Client in a browser env

4. **Configure and test:**
   - Enter an agent URL (e.g., `https://localhost:8000` or `peer://some-peer-id`)
   - Enter a model name (e.g., `anthropic/claude-3-5-sonnet-20241022`)
   - Enter a model name (e.g., `anthropic/claude-sonnet-4-5-20250929`)
   - Type a message and click "Send Message" or press Enter
   - View the response in the output textarea

@@ -53,7 +53,7 @@ A simple HTML page that demonstrates using the CUA Agent Client in a browser env

**Example Models:**

- `anthropic/claude-3-5-sonnet-20241022`
- `anthropic/claude-sonnet-4-5-20250929`
- `openai/gpt-4`
- `huggingface-local/microsoft/UI-TARS-7B`
@@ -1,6 +1,6 @@
{
  "name": "@trycua/cli",
  "version": "0.1.4",
  "version": "0.1.5",
  "packageManager": "bun@1.1.38",
  "description": "Command-line interface for CUA cloud sandboxes and authentication",
  "type": "module",

@@ -17,7 +17,9 @@ export async function runCli() {
      '  cua sb <command>   Create and manage cloud sandboxes\n' +
      '    list             View all your sandboxes\n' +
      '    create           Provision a new sandbox\n' +
      '    start/stop       Control sandbox state\n' +
      '    start            Start or resume a sandbox\n' +
      '    stop             Stop a sandbox (preserves disk)\n' +
      '    suspend          Suspend a sandbox (preserves memory)\n' +
      '    vnc              Open remote desktop\n' +
      '\n' +
      'Documentation: https://docs.cua.ai/libraries/cua-cli/commands'
@@ -191,6 +191,41 @@ const restartHandler = async (argv: Record<string, unknown>) => {
  process.exit(1);
};

const suspendHandler = async (argv: Record<string, unknown>) => {
  const token = await ensureApiKeyInteractive();
  const name = String((argv as any).name);
  const res = await http(`/v1/vms/${encodeURIComponent(name)}/suspend`, {
    token,
    method: 'POST',
  });
  if (res.status === 202) {
    const body = (await res.json().catch(() => ({}))) as {
      status?: string;
    };
    console.log(body.status ?? 'suspending');
    return;
  }
  if (res.status === 404) {
    console.error('Sandbox not found');
    process.exit(1);
  }
  if (res.status === 401) {
    clearApiKey();
    console.error("Unauthorized. Try 'cua login' again.");
    process.exit(1);
  }
  if (res.status === 400 || res.status === 500) {
    const body = (await res.json().catch(() => ({}))) as { error?: string };
    console.error(
      body.error ??
        "Suspend not supported for this VM. Use 'cua sb stop' instead."
    );
    process.exit(1);
  }
  console.error(`Unexpected status: ${res.status}`);
  process.exit(1);
};

const openHandler = async (argv: Record<string, unknown>) => {
  const token = await ensureApiKeyInteractive();
  const name = String((argv as any).name);
@@ -296,6 +331,13 @@ export function registerSandboxCommands(y: Argv) {
        y.positional('name', { type: 'string', describe: 'Sandbox name' }),
      restartHandler
    )
    .command(
      'suspend <name>',
      'Suspend a sandbox, preserving memory state (use start to resume)',
      (y) =>
        y.positional('name', { type: 'string', describe: 'Sandbox name' }),
      suspendHandler
    )
    .command(
      ['vnc <name>', 'open <name>'],
      'Open remote desktop (VNC) connection in your browser',
@@ -378,6 +420,13 @@ export function registerSandboxCommands(y: Argv) {
        y.positional('name', { type: 'string', describe: 'Sandbox name' }),
      handler: restartHandler,
    } as any)
    .command({
      command: 'suspend <name>',
      describe: false as any, // Hide from help
      builder: (y: Argv) =>
        y.positional('name', { type: 'string', describe: 'Sandbox name' }),
      handler: suspendHandler,
    } as any)
    .command({
      command: ['vnc <name>', 'open <name>'],
      describe: false as any, // Hide from help

@@ -16,6 +16,8 @@ export type SandboxStatus =
  | 'pending'
  | 'running'
  | 'stopped'
  | 'suspended'
  | 'suspending'
  | 'terminated'
  | 'failed';
export type SandboxItem = {
@@ -203,7 +203,7 @@
    "\n",
    "Examples:\n",
    "- `openai/computer-use-preview+ollama/gemma3:4b`\n",
    "- `anthropic/claude-3-5-sonnet-20241022+ollama/gemma3:4b`\n"
    "- `anthropic/claude-sonnet-4-5-20250929+ollama/gemma3:4b`\n"
   ]
  },
  {
@@ -217,7 +217,7 @@
    "import logging\n",
    "\n",
    "agent_composed = ComputerAgent(\n",
    "    model=\"anthropic/claude-3-5-sonnet-20241022+ollama/gemma3:4b\",\n",
    "    model=\"anthropic/claude-sonnet-4-5-20250929+ollama/gemma3:4b\",\n",
    "    tools=[computer],\n",
    "    trajectory_dir=\"trajectories\",\n",
    "    only_n_most_recent_images=3,\n",
@@ -234,7 +234,20 @@
   "cell_type": "markdown",
   "id": "section-3-conceptual",
   "metadata": {},
   "source": "## 3) Customize your agent 🛠️\n\nFor a few customization options, see: https://cua.ai/docs/agent-sdk/customizing-computeragent\n\nLevels of customization you can explore:\n\n1) Simple — Prompt engineering\n2) Easy — Tools\n3) Intermediate — Callbacks\n4) Expert — Custom agent via `register_agent` (see `libs/python/agent/agent/decorators.py` → `register_agent`)\n\nor, incorporate the ComputerAgent into your own agent framework!"
   "source": [
    "## 3) Customize your agent 🛠️\n",
    "\n",
    "For a few customization options, see: https://cua.ai/docs/agent-sdk/customizing-computeragent\n",
    "\n",
    "Levels of customization you can explore:\n",
    "\n",
    "1) Simple — Prompt engineering\n",
    "2) Easy — Tools\n",
    "3) Intermediate — Callbacks\n",
    "4) Expert — Custom agent via `register_agent` (see `libs/python/agent/agent/decorators.py` → `register_agent`)\n",
    "\n",
    "or, incorporate the ComputerAgent into your own agent framework!"
   ]
  },
  {
   "cell_type": "markdown",
@@ -274,4 +287,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
}

@@ -184,7 +184,7 @@ if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="Test CUA Agent with mock computer")
    parser.add_argument(
        "--model", default="anthropic/claude-sonnet-4-20250514", help="CUA model to test"
        "--model", default="anthropic/claude-sonnet-4-5-20250929", help="CUA model to test"
    )
    args = parser.parse_args()
uv.lock (generated)
@@ -861,7 +861,7 @@ wheels = [

[[package]]
name = "cua-agent"
version = "0.4.39"
version = "0.4.53"
source = { editable = "libs/python/agent" }
dependencies = [
    { name = "aiohttp" },
@@ -885,7 +885,6 @@ all = [
    { name = "einops" },
    { name = "google-genai" },
    { name = "gradio" },
    { name = "hud-python" },
    { name = "mlx-vlm", marker = "sys_platform == 'darwin'" },
    { name = "pillow" },
    { name = "python-dotenv" },
@@ -975,7 +974,6 @@ requires-dist = [
    { name = "gradio", marker = "extra == 'all'", specifier = ">=5.23.3" },
    { name = "gradio", marker = "extra == 'ui'", specifier = ">=5.23.3" },
    { name = "httpx", specifier = ">=0.27.0" },
    { name = "hud-python", marker = "extra == 'all'", specifier = "==0.4.52" },
    { name = "hud-python", marker = "extra == 'hud'", specifier = "==0.4.52" },
    { name = "litellm", specifier = ">=1.74.12" },
    { name = "mlx-vlm", marker = "sys_platform == 'darwin' and extra == 'all'", specifier = ">=0.1.27" },
@@ -1015,7 +1013,7 @@ provides-extras = ["openai", "anthropic", "qwen", "omni", "uitars", "uitars-mlx"

[[package]]
name = "cua-computer"
version = "0.4.12"
version = "0.4.17"
source = { editable = "libs/python/computer" }
dependencies = [
    { name = "aiohttp" },