mirror of
https://github.com/trycua/computer.git
synced 2026-01-07 14:00:04 -06:00
Merge pull request #376 from trycua/feat/move-agent-sdk-readmes-to-docs
Move Agent SDK READMEs to docs
This commit is contained in:
@@ -22,7 +22,7 @@ agent = ComputerAgent(
|
||||
tools=[computer]
|
||||
)
|
||||
|
||||
prompt = "open github, navigate to trycua/cua"
|
||||
prompt = "Take a screenshot and tell me what you see"
|
||||
|
||||
async for result in agent.run(prompt):
|
||||
if result["output"][-1]["type"] == "message":
|
||||
@@ -30,3 +30,142 @@ async for result in agent.run(prompt):
|
||||
```
|
||||
|
||||
For a list of supported models and configurations, see the [Supported Agents](./supported-agents/computer-use-agents) page.
|
||||
|
||||
### Response Format
|
||||
|
||||
```python
|
||||
{
|
||||
"output": [
|
||||
{
|
||||
"type": "message",
|
||||
"role": "assistant",
|
||||
"content": [{"type": "output_text", "text": "I can see..."}]
|
||||
},
|
||||
{
|
||||
"type": "computer_call",
|
||||
"action": {"type": "screenshot"},
|
||||
"call_id": "call_123"
|
||||
},
|
||||
{
|
||||
"type": "computer_call_output",
|
||||
"call_id": "call_123",
|
||||
"output": {"image_url": "data:image/png;base64,..."}
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 150,
|
||||
"completion_tokens": 75,
|
||||
"total_tokens": 225,
|
||||
"response_cost": 0.01,
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Environment Variables
|
||||
|
||||
Use the following environment variables to configure the agent and its access to cloud computers and LLM providers:
|
||||
|
||||
```bash
|
||||
# Computer instance (cloud)
|
||||
export CUA_CONTAINER_NAME="your-container-name"
|
||||
export CUA_API_KEY="your-cua-api-key"
|
||||
|
||||
# LLM API keys
|
||||
export ANTHROPIC_API_KEY="your-anthropic-key"
|
||||
export OPENAI_API_KEY="your-openai-key"
|
||||
```
|
||||
|
||||
### Input and output
|
||||
|
||||
The input passed to `ComputerAgent.run` can be either a prompt string or a list of message dictionaries:
|
||||
|
||||
```python
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Take a screenshot and describe what you see"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "I'll take a screenshot for you."
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
The output is an AsyncGenerator that yields response chunks.
|
||||
|
||||
### Parameters
|
||||
|
||||
The `ComputerAgent` constructor provides a wide range of options for customizing agent behavior, tool integration, callbacks, resource management, and more.
|
||||
|
||||
- `model` (`str`): Default: **required**
|
||||
The LLM or agent model to use (e.g., "claude-3-5-sonnet-20241022", "computer-use-preview", "omni+vertex_ai/gemini-pro"). Determines which agent loop is selected unless `custom_loop` is provided.
|
||||
- `tools` (`List[Any]`):
|
||||
List of tools the agent can use (e.g., `Computer`, sandboxed Python functions, etc.).
|
||||
- `custom_loop` (`Callable`):
|
||||
Optional custom agent loop function. If provided, overrides automatic loop selection.
|
||||
- `only_n_most_recent_images` (`int`):
|
||||
If set, only the N most recent images are kept in the message history. Useful for limiting memory usage. Automatically adds `ImageRetentionCallback`.
|
||||
- `callbacks` (`List[Any]`):
|
||||
List of callback instances for advanced preprocessing, postprocessing, logging, or custom hooks. See [Callbacks & Extensibility](#callbacks--extensibility).
|
||||
- `verbosity` (`int`):
|
||||
Logging level (e.g., `logging.INFO`). If set, adds a logging callback.
|
||||
- `trajectory_dir` (`str`):
|
||||
Directory path to save full trajectory data, including screenshots and responses. Adds `TrajectorySaverCallback`.
|
||||
- `max_retries` (`int`): Default: `3`
|
||||
Maximum number of retries for failed API calls (default: 3).
|
||||
- `screenshot_delay` (`float` | `int`): Default: `0.5`
|
||||
Delay (in seconds) before taking screenshots (default: 0.5).
|
||||
- `use_prompt_caching` (`bool`): Default: `False`
|
||||
Enables prompt caching for repeated prompts (mainly for Anthropic models).
|
||||
- `max_trajectory_budget` (`float` | `dict`):
|
||||
If set (float or dict), adds a budget manager callback that tracks usage costs and stops execution if the budget is exceeded. Dict allows advanced options (e.g., `{ "max_budget": 5.0, "raise_error": True }`).
|
||||
- `**kwargs` (`any`):
|
||||
Any additional keyword arguments are passed through to the agent loop or model provider.
|
||||
|
||||
**Example with advanced options:**
|
||||
|
||||
```python
|
||||
from agent import ComputerAgent
|
||||
from computer import Computer
|
||||
from agent.callbacks import ImageRetentionCallback
|
||||
|
||||
agent = ComputerAgent(
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
tools=[Computer(...)],
|
||||
only_n_most_recent_images=3,
|
||||
callbacks=[ImageRetentionCallback(only_n_most_recent_images=3)],
|
||||
verbosity=logging.INFO,
|
||||
trajectory_dir="trajectories",
|
||||
max_retries=5,
|
||||
screenshot_delay=1.0,
|
||||
use_prompt_caching=True,
|
||||
max_trajectory_budget={"max_budget": 5.0, "raise_error": True}
|
||||
)
|
||||
```
|
||||
|
||||
### Streaming Responses
|
||||
|
||||
```python
|
||||
async for result in agent.run(messages, stream=True):
|
||||
# Process streaming chunks
|
||||
for item in result["output"]:
|
||||
if item["type"] == "message":
|
||||
print(item["content"][0]["text"], end="", flush=True)
|
||||
elif item["type"] == "computer_call":
|
||||
action = item["action"]
|
||||
print(f"\n[Action: {action['type']}]")
|
||||
```
|
||||
|
||||
### Error Handling
|
||||
|
||||
```python
|
||||
try:
|
||||
async for result in agent.run(messages):
|
||||
# Process results
|
||||
pass
|
||||
except BudgetExceededException:
|
||||
print("Budget limit exceeded")
|
||||
except Exception as e:
|
||||
print(f"Agent error: {e}")
|
||||
```
|
||||
@@ -42,11 +42,4 @@ Called when responses are received from agent loop.
|
||||
- `on_screenshot(screenshot, name)` - When screenshots are taken
|
||||
|
||||
### 10. `on_run_end(kwargs, old_items, new_items)`
|
||||
Called when agent run completes. Finalize tracking, save trajectories.
|
||||
|
||||
## Built-in Callbacks
|
||||
|
||||
- **ImageRetentionCallback**: Limits recent images in context
|
||||
- **BudgetManagerCallback**: Stops execution when budget exceeded
|
||||
- **TrajectorySaverCallback**: Saves conversation trajectories
|
||||
- **LoggingCallback**: Logs agent activities
|
||||
Called when agent run completes. Finalize tracking, save trajectories.
|
||||
@@ -28,19 +28,23 @@ agent = ComputerAgent(
|
||||
## Budget Manager Shorthand
|
||||
|
||||
```python
|
||||
# Simple budget limit
|
||||
agent = ComputerAgent(
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
tools=[computer],
|
||||
max_trajectory_budget=5.0 # Auto-adds BudgetManagerCallback
|
||||
max_trajectory_budget=5.0 # $5 limit
|
||||
)
|
||||
```
|
||||
|
||||
**Or with options:**
|
||||
```python
|
||||
# Advanced budget configuration
|
||||
agent = ComputerAgent(
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
tools=[computer],
|
||||
max_trajectory_budget={"max_budget": 5.0, "raise_error": True}
|
||||
max_trajectory_budget={
|
||||
"max_budget": 10.0,
|
||||
"raise_error": True, # Raise error when exceeded
|
||||
"reset_after_each_run": False # Persistent across runs
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
@@ -4,12 +4,61 @@ title: Callbacks
|
||||
|
||||
Callbacks in the Agent SDK provide hooks into the agent's lifecycle, allowing for custom functionality to be executed at various stages of an agent's run. They enable extensibility by allowing developers to integrate their own logic for tasks such as logging, cost management, and data anonymization.
|
||||
|
||||
This section details the various callbacks available in the Agent SDK.
|
||||
## Usage
|
||||
|
||||
## Available Callbacks
|
||||
You can add preprocessing and postprocessing hooks using callbacks, or write your own by subclassing `AsyncCallbackHandler`.
|
||||
|
||||
- [Agent Lifecycle](agent-lifecycle.mdx)
|
||||
- [Cost Saving](cost-saving.mdx)
|
||||
- [Logging](logging.mdx)
|
||||
- [PII Anonymization](pii-anonymization.mdx)
|
||||
- [Trajectories](trajectories.mdx)
|
||||
### Built-in Callbacks
|
||||
|
||||
Built-in callbacks can be used as follows:
|
||||
|
||||
```python
|
||||
from agent.callbacks import (
|
||||
ImageRetentionCallback,
|
||||
TrajectorySaverCallback,
|
||||
BudgetManagerCallback,
|
||||
LoggingCallback
|
||||
)
|
||||
|
||||
agent = ComputerAgent(
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
tools=[computer],
|
||||
callbacks=[
|
||||
ImageRetentionCallback(only_n_most_recent_images=3),
|
||||
TrajectorySaverCallback(trajectory_dir="trajectories"),
|
||||
BudgetManagerCallback(max_budget=10.0, raise_error=True),
|
||||
LoggingCallback(level=logging.INFO)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
The following built-in callbacks are available:
|
||||
|
||||
- [BudgetManagerCallback](callbacks/cost-saving): Stops execution when budget exceeded
|
||||
- [LoggingCallback](callbacks/logging): Logs agent activities
|
||||
- **ImageRetentionCallback**: Limits recent images in context
|
||||
- **TrajectorySaverCallback**: Saves conversation trajectories
|
||||
- [PII Anonymization](callbacks/pii-anonymization)
|
||||
|
||||
### Custom Callbacks
|
||||
|
||||
Create custom callbacks using knowledge of the callback lifecycle as described in [Agent Lifecycle](callbacks/agent-lifecycle).
|
||||
|
||||
```python
|
||||
from agent.callbacks.base import AsyncCallbackHandler
|
||||
|
||||
class CustomCallback(AsyncCallbackHandler):
|
||||
async def on_llm_start(self, messages):
|
||||
"""Preprocess messages before LLM call"""
|
||||
# Add custom preprocessing logic
|
||||
return messages
|
||||
|
||||
async def on_llm_end(self, messages):
|
||||
"""Postprocess messages after LLM call"""
|
||||
# Add custom postprocessing logic
|
||||
return messages
|
||||
|
||||
async def on_usage(self, usage):
|
||||
"""Track usage information"""
|
||||
print(f"Tokens used: {usage.total_tokens}")
|
||||
```
|
||||
|
||||
@@ -29,8 +29,8 @@ agent = ComputerAgent(
|
||||
```python
|
||||
agent = ComputerAgent(
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
tools=[computer],
|
||||
trajectory_dir="trajectories" # Auto-adds TrajectorySaverCallback
|
||||
trajectory_dir="trajectories", # Auto-save trajectories
|
||||
tools=[computer]
|
||||
)
|
||||
```
|
||||
|
||||
@@ -46,6 +46,12 @@ The viewer provides:
|
||||
|
||||
## Trajectory Structure
|
||||
|
||||
Trajectories are saved with:
|
||||
- Complete conversation history
|
||||
- Usage statistics and costs
|
||||
- Timestamps and metadata
|
||||
- Screenshots and computer actions
|
||||
|
||||
Each trajectory contains:
|
||||
- **metadata.json**: Run info, timestamps, usage stats (`total_tokens`, `response_cost`)
|
||||
- **turn_000/**: Turn-by-turn conversation history (api calls, responses, computer calls, screenshots)
|
||||
|
||||
@@ -1,11 +1,29 @@
|
||||
---
|
||||
title: Sandboxed Tools
|
||||
slug: sandboxed-tools
|
||||
title: Custom Tools
|
||||
slug: custom-tools
|
||||
---
|
||||
|
||||
The Agent SDK supports defining custom Python tools that run securely in sandboxed environments on remote Cua Computers. This enables safe execution of user-defined functions, isolation of dependencies, and robust automation workflows.
|
||||
|
||||
## Example: Defining a Sandboxed Tool
|
||||
## Custom Tools
|
||||
|
||||
Define a custom tool for an agent:
|
||||
|
||||
```python
|
||||
def calculate(a: int, b: int) -> int:
|
||||
"""Calculate the sum of two integers"""
|
||||
return a + b
|
||||
|
||||
# Use with agent
|
||||
agent = ComputerAgent(
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
tools=[computer, calculate]
|
||||
)
|
||||
```
|
||||
|
||||
## Sandboxed Tools
|
||||
|
||||
Define a sandboxed tool:
|
||||
|
||||
```python
|
||||
from computer.helpers import sandboxed
|
||||
@@ -4,11 +4,11 @@
|
||||
"pages": [
|
||||
"agent-loops",
|
||||
"supported-agents",
|
||||
"supported-model-providers",
|
||||
"chat-history",
|
||||
"callbacks",
|
||||
"sandboxed-tools",
|
||||
"custom-tools",
|
||||
"custom-computer-handlers",
|
||||
"local-models",
|
||||
"prompt-caching",
|
||||
"usage-tracking",
|
||||
"benchmarks",
|
||||
|
||||
@@ -0,0 +1,32 @@
|
||||
---
|
||||
title: Supported Model Providers
|
||||
---
|
||||
|
||||
## Supported Models
|
||||
|
||||
### Anthropic Claude (Computer Use API)
|
||||
```python
|
||||
model="anthropic/claude-3-5-sonnet-20241022"
|
||||
model="anthropic/claude-3-7-sonnet-20250219"
|
||||
model="anthropic/claude-opus-4-20250514"
|
||||
model="anthropic/claude-sonnet-4-20250514"
|
||||
```
|
||||
|
||||
### OpenAI Computer Use Preview
|
||||
```python
|
||||
model="openai/computer-use-preview"
|
||||
```
|
||||
|
||||
### UI-TARS (Local or Huggingface Inference)
|
||||
```python
|
||||
model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"
|
||||
model="ollama_chat/0000/ui-tars-1.5-7b"
|
||||
```
|
||||
|
||||
### Omniparser + Any LLM
|
||||
```python
|
||||
model="omniparser+ollama_chat/mistral-small3.2"
|
||||
model="omniparser+vertex_ai/gemini-pro"
|
||||
model="omniparser+anthropic/claude-3-5-sonnet-20241022"
|
||||
model="omniparser+openai/gpt-4o"
|
||||
```
|
||||
@@ -8,109 +8,14 @@ github:
|
||||
|
||||
The Agent library provides the ComputerAgent class and tools for building AI agents that automate workflows on Cua Computers.
|
||||
|
||||
## Reference
|
||||
## Agent Loops
|
||||
|
||||
### Basic Usage
|
||||
See the [Agent Loops](../agent-sdk/agent-loops) documentation for how agents process information and take actions.
|
||||
|
||||
```python
|
||||
from agent import ComputerAgent
|
||||
from computer import Computer
|
||||
## Chat History
|
||||
|
||||
computer = Computer() # Connect to a cua container
|
||||
agent = ComputerAgent(
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
tools=[computer]
|
||||
)
|
||||
See the [Chat History](../agent-sdk/chat-history) documentation for managing conversational context and turn-by-turn interactions.
|
||||
|
||||
prompt = "open github, navigate to trycua/cua"
|
||||
## Callbacks
|
||||
|
||||
async for result in agent.run(prompt):
|
||||
print("Agent:", result["output"][-1]["content"][0]["text"])
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### ComputerAgent Constructor Options
|
||||
|
||||
The `ComputerAgent` constructor provides a wide range of options for customizing agent behavior, tool integration, callbacks, resource management, and more.
|
||||
|
||||
| Parameter | Type | Default | Description |
|
||||
| --------------------------- | ----------------- | ------------ | ---------------------------------------------------------------------------------------------------- |
|
||||
| `model` | `str` | **required** | Model name (e.g., "claude-3-5-sonnet-20241022", "computer-use-preview", "omni+vertex_ai/gemini-pro") |
|
||||
| `tools` | `List[Any]` | `None` | List of tools (e.g., computer objects, decorated functions) |
|
||||
| `custom_loop` | `Callable` | `None` | Custom agent loop function (overrides auto-selection) |
|
||||
| `only_n_most_recent_images` | `int` | `None` | If set, only keep the N most recent images in message history (adds ImageRetentionCallback) |
|
||||
| `callbacks` | `List[Any]` | `None` | List of AsyncCallbackHandler instances for preprocessing/postprocessing |
|
||||
| `verbosity` | `int` | `None` | Logging level (`logging.DEBUG`, `logging.INFO`, etc.; adds LoggingCallback) |
|
||||
| `trajectory_dir` | `str` | `None` | Directory to save trajectory data (adds TrajectorySaverCallback) |
|
||||
| `max_retries` | `int` | `3` | Maximum number of retries for failed API calls |
|
||||
| `screenshot_delay` | `float` \| `int` | `0.5` | Delay before screenshots (seconds) |
|
||||
| `use_prompt_caching` | `bool` | `False` | Use prompt caching to avoid reprocessing the same prompt (mainly for Anthropic) |
|
||||
| `max_trajectory_budget` | `float` \| `dict` | `None` | If set, adds BudgetManagerCallback to track usage costs and stop when budget is exceeded |
|
||||
| `**kwargs` | _any_ | | Additional arguments passed to the agent loop |
|
||||
|
||||
#### Parameter Details
|
||||
|
||||
- **model**: The LLM or agent model to use. Determines which agent loop is selected unless `custom_loop` is provided.
|
||||
- **tools**: List of tools the agent can use (e.g., `Computer`, sandboxed Python functions, etc.).
|
||||
- **custom_loop**: Optional custom agent loop function. If provided, overrides automatic loop selection.
|
||||
- **only_n_most_recent_images**: If set, only the N most recent images are kept in the message history. Useful for limiting memory usage. Automatically adds `ImageRetentionCallback`.
|
||||
- **callbacks**: List of callback instances for advanced preprocessing, postprocessing, logging, or custom hooks. See [Callbacks & Extensibility](#callbacks--extensibility).
|
||||
- **verbosity**: Logging level (e.g., `logging.INFO`). If set, adds a logging callback.
|
||||
- **trajectory_dir**: Directory path to save full trajectory data, including screenshots and responses. Adds `TrajectorySaverCallback`.
|
||||
- **max_retries**: Maximum number of retries for failed API calls (default: 3).
|
||||
- **screenshot_delay**: Delay (in seconds) before taking screenshots (default: 0.5).
|
||||
- **use_prompt_caching**: Enables prompt caching for repeated prompts (mainly for Anthropic models).
|
||||
- **max_trajectory_budget**: If set (float or dict), adds a budget manager callback that tracks usage costs and stops execution if the budget is exceeded. Dict allows advanced options (e.g., `{ "max_budget": 5.0, "raise_error": True }`).
|
||||
- **\*\*kwargs**: Any additional keyword arguments are passed through to the agent loop or model provider.
|
||||
|
||||
**Example with advanced options:**
|
||||
|
||||
```python
|
||||
from agent import ComputerAgent
|
||||
from computer import Computer
|
||||
from agent.callbacks import ImageRetentionCallback
|
||||
|
||||
agent = ComputerAgent(
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
tools=[Computer(...)],
|
||||
only_n_most_recent_images=3,
|
||||
callbacks=[ImageRetentionCallback(only_n_most_recent_images=3)],
|
||||
verbosity=logging.INFO,
|
||||
trajectory_dir="trajectories",
|
||||
max_retries=5,
|
||||
screenshot_delay=1.0,
|
||||
use_prompt_caching=True,
|
||||
max_trajectory_budget={"max_budget": 5.0, "raise_error": True}
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Message Array (Multi-turn)
|
||||
|
||||
```python
|
||||
messages = [
|
||||
{"role": "user", "content": "go to trycua on gh"},
|
||||
# ... (reasoning, computer_call, computer_call_output, etc)
|
||||
]
|
||||
async for result in agent.run(messages):
|
||||
# Handle output, tool invocations, screenshots, etc.
|
||||
print("Agent:", result["output"][-1]["content"][0]["text"])
|
||||
messages += result["output"] # Add agent output to message array
|
||||
...
|
||||
```
|
||||
|
||||
### Callbacks & Extensibility
|
||||
|
||||
You can add preprocessing and postprocessing hooks using callbacks, or write your own by subclassing `AsyncCallbackHandler`:
|
||||
|
||||
```python
|
||||
from agent.callbacks import ImageRetentionCallback, PIIAnonymizationCallback
|
||||
|
||||
agent = ComputerAgent(
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
tools=[computer],
|
||||
callbacks=[ImageRetentionCallback(only_n_most_recent_images=3)]
|
||||
)
|
||||
```
|
||||
See the [Callbacks](../agent-sdk/callbacks) documentation for extending and customizing agent behavior with custom hooks.
|
||||
|
||||
@@ -87,6 +87,16 @@ Choose how you want to run your cua computer. **Cloud containers are recommended
|
||||
<Tab value="Python">
|
||||
```bash
|
||||
pip install "cua-agent[all]" cua-computer
|
||||
|
||||
# or install specific providers
|
||||
pip install "cua-agent[openai]" # OpenAI computer-use-preview support
|
||||
pip install "cua-agent[anthropic]" # Anthropic Claude support
|
||||
pip install "cua-agent[omni]" # Omniparser + any LLM support
|
||||
pip install "cua-agent[uitars]" # UI-TARS
|
||||
pip install "cua-agent[uitars-mlx]" # UI-TARS + MLX support
|
||||
pip install "cua-agent[uitars-hf]" # UI-TARS + Huggingface support
|
||||
pip install "cua-agent[glm45v-hf]" # GLM-4.5V + Huggingface support
|
||||
pip install "cua-agent[ui]" # Gradio UI support
|
||||
```
|
||||
</Tab>
|
||||
<Tab value="TypeScript">
|
||||
|
||||
@@ -29,16 +29,6 @@
|
||||
|
||||
```bash
|
||||
pip install "cua-agent[all]"
|
||||
|
||||
# or install specific providers
|
||||
pip install "cua-agent[openai]" # OpenAI computer-use-preview support
|
||||
pip install "cua-agent[anthropic]" # Anthropic Claude support
|
||||
pip install "cua-agent[omni]" # Omniparser + any LLM support
|
||||
pip install "cua-agent[uitars]" # UI-TARS
|
||||
pip install "cua-agent[uitars-mlx]" # UI-TARS + MLX support
|
||||
pip install "cua-agent[uitars-hf]" # UI-TARS + Huggingface support
|
||||
pip install "cua-agent[glm45v-hf]" # GLM-4.5V + Huggingface support
|
||||
pip install "cua-agent[ui]" # Gradio UI support
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
@@ -79,303 +69,18 @@ if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
## Supported Models
|
||||
## Docs
|
||||
|
||||
### Anthropic Claude (Computer Use API)
|
||||
```python
|
||||
model="anthropic/claude-3-5-sonnet-20241022"
|
||||
model="anthropic/claude-3-7-sonnet-20250219"
|
||||
model="anthropic/claude-opus-4-20250514"
|
||||
model="anthropic/claude-sonnet-4-20250514"
|
||||
```
|
||||
|
||||
### OpenAI Computer Use Preview
|
||||
```python
|
||||
model="openai/computer-use-preview"
|
||||
```
|
||||
|
||||
### UI-TARS (Local or Huggingface Inference)
|
||||
```python
|
||||
model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"
|
||||
model="ollama_chat/0000/ui-tars-1.5-7b"
|
||||
```
|
||||
|
||||
### Omniparser + Any LLM
|
||||
```python
|
||||
model="omniparser+ollama_chat/mistral-small3.2"
|
||||
model="omniparser+vertex_ai/gemini-pro"
|
||||
model="omniparser+anthropic/claude-3-5-sonnet-20241022"
|
||||
model="omniparser+openai/gpt-4o"
|
||||
```
|
||||
|
||||
## Custom Tools
|
||||
|
||||
Define custom tools using decorated functions:
|
||||
|
||||
```python
|
||||
from computer.helpers import sandboxed
|
||||
|
||||
@sandboxed()
|
||||
def read_file(location: str) -> str:
|
||||
"""Read contents of a file
|
||||
|
||||
Parameters
|
||||
----------
|
||||
location : str
|
||||
Path to the file to read
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
Contents of the file or error message
|
||||
"""
|
||||
try:
|
||||
with open(location, 'r') as f:
|
||||
return f.read()
|
||||
except Exception as e:
|
||||
return f"Error reading file: {str(e)}"
|
||||
|
||||
def calculate(a: int, b: int) -> int:
|
||||
"""Calculate the sum of two integers"""
|
||||
return a + b
|
||||
|
||||
# Use with agent
|
||||
agent = ComputerAgent(
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
tools=[computer, read_file, calculate]
|
||||
)
|
||||
```
|
||||
|
||||
## Callbacks System
|
||||
|
||||
agent provides a comprehensive callback system for extending functionality:
|
||||
|
||||
### Built-in Callbacks
|
||||
|
||||
```python
|
||||
from agent.callbacks import (
|
||||
ImageRetentionCallback,
|
||||
TrajectorySaverCallback,
|
||||
BudgetManagerCallback,
|
||||
LoggingCallback
|
||||
)
|
||||
|
||||
agent = ComputerAgent(
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
tools=[computer],
|
||||
callbacks=[
|
||||
ImageRetentionCallback(only_n_most_recent_images=3),
|
||||
TrajectorySaverCallback(trajectory_dir="trajectories"),
|
||||
BudgetManagerCallback(max_budget=10.0, raise_error=True),
|
||||
LoggingCallback(level=logging.INFO)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
### Custom Callbacks
|
||||
|
||||
```python
|
||||
from agent.callbacks.base import AsyncCallbackHandler
|
||||
|
||||
class CustomCallback(AsyncCallbackHandler):
|
||||
async def on_llm_start(self, messages):
|
||||
"""Preprocess messages before LLM call"""
|
||||
# Add custom preprocessing logic
|
||||
return messages
|
||||
|
||||
async def on_llm_end(self, messages):
|
||||
"""Postprocess messages after LLM call"""
|
||||
# Add custom postprocessing logic
|
||||
return messages
|
||||
|
||||
async def on_usage(self, usage):
|
||||
"""Track usage information"""
|
||||
print(f"Tokens used: {usage.total_tokens}")
|
||||
```
|
||||
|
||||
## Budget Management
|
||||
|
||||
Control costs with built-in budget management:
|
||||
|
||||
```python
|
||||
# Simple budget limit
|
||||
agent = ComputerAgent(
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
max_trajectory_budget=5.0 # $5 limit
|
||||
)
|
||||
|
||||
# Advanced budget configuration
|
||||
agent = ComputerAgent(
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
max_trajectory_budget={
|
||||
"max_budget": 10.0,
|
||||
"raise_error": True, # Raise error when exceeded
|
||||
"reset_after_each_run": False # Persistent across runs
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
## Trajectory Management
|
||||
|
||||
Save and replay agent conversations:
|
||||
|
||||
```python
|
||||
agent = ComputerAgent(
|
||||
model="anthropic/claude-3-5-sonnet-20241022",
|
||||
trajectory_dir="trajectories", # Auto-save trajectories
|
||||
tools=[computer]
|
||||
)
|
||||
|
||||
# Trajectories are saved with:
|
||||
# - Complete conversation history
|
||||
# - Usage statistics and costs
|
||||
# - Timestamps and metadata
|
||||
# - Screenshots and computer actions
|
||||
```
|
||||
|
||||
## Configuration Options
|
||||
|
||||
### ComputerAgent Parameters
|
||||
|
||||
- `model`: Model identifier (required)
|
||||
- `tools`: List of computer objects and decorated functions
|
||||
- `callbacks`: List of callback handlers for extensibility
|
||||
- `only_n_most_recent_images`: Limit recent images to prevent context overflow
|
||||
- `verbosity`: Logging level (logging.INFO, logging.DEBUG, etc.)
|
||||
- `trajectory_dir`: Directory to save conversation trajectories
|
||||
- `max_retries`: Maximum API call retries (default: 3)
|
||||
- `screenshot_delay`: Delay between actions and screenshots (default: 0.5s)
|
||||
- `use_prompt_caching`: Enable prompt caching for supported models
|
||||
- `max_trajectory_budget`: Budget limit configuration
|
||||
|
||||
### Environment Variables
|
||||
|
||||
```bash
|
||||
# Computer instance (cloud)
|
||||
export CUA_CONTAINER_NAME="your-container-name"
|
||||
export CUA_API_KEY="your-cua-api-key"
|
||||
|
||||
# LLM API keys
|
||||
export ANTHROPIC_API_KEY="your-anthropic-key"
|
||||
export OPENAI_API_KEY="your-openai-key"
|
||||
```
|
||||
|
||||
## Advanced Usage
|
||||
|
||||
### Streaming Responses
|
||||
|
||||
```python
|
||||
async for result in agent.run(messages, stream=True):
|
||||
# Process streaming chunks
|
||||
for item in result["output"]:
|
||||
if item["type"] == "message":
|
||||
print(item["content"][0]["text"], end="", flush=True)
|
||||
elif item["type"] == "computer_call":
|
||||
action = item["action"]
|
||||
print(f"\n[Action: {action['type']}]")
|
||||
```
|
||||
|
||||
### Interactive Chat Loop
|
||||
|
||||
```python
|
||||
history = []
|
||||
while True:
|
||||
user_input = input("> ")
|
||||
if user_input.lower() in ['quit', 'exit']:
|
||||
break
|
||||
|
||||
history.append({"role": "user", "content": user_input})
|
||||
|
||||
async for result in agent.run(history):
|
||||
history += result["output"]
|
||||
|
||||
# Display assistant responses
|
||||
for item in result["output"]:
|
||||
if item["type"] == "message":
|
||||
print(item["content"][0]["text"])
|
||||
```
|
||||
|
||||
### Error Handling
|
||||
|
||||
```python
|
||||
try:
|
||||
async for result in agent.run(messages):
|
||||
# Process results
|
||||
pass
|
||||
except BudgetExceededException:
|
||||
print("Budget limit exceeded")
|
||||
except Exception as e:
|
||||
print(f"Agent error: {e}")
|
||||
```
|
||||
|
||||
## API Reference
|
||||
|
||||
### ComputerAgent.run()
|
||||
|
||||
```python
|
||||
async def run(
|
||||
self,
|
||||
messages: Messages,
|
||||
stream: bool = False,
|
||||
**kwargs
|
||||
) -> AsyncGenerator[Dict[str, Any], None]:
|
||||
"""
|
||||
Run the agent with the given messages.
|
||||
|
||||
Args:
|
||||
messages: List of message dictionaries
|
||||
stream: Whether to stream the response
|
||||
**kwargs: Additional arguments
|
||||
|
||||
Returns:
|
||||
AsyncGenerator that yields response chunks
|
||||
"""
|
||||
```
|
||||
|
||||
### Message Format
|
||||
|
||||
```python
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Take a screenshot and describe what you see"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "I'll take a screenshot for you."
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
### Response Format
|
||||
|
||||
```python
|
||||
{
|
||||
"output": [
|
||||
{
|
||||
"type": "message",
|
||||
"role": "assistant",
|
||||
"content": [{"type": "output_text", "text": "I can see..."}]
|
||||
},
|
||||
{
|
||||
"type": "computer_call",
|
||||
"action": {"type": "screenshot"},
|
||||
"call_id": "call_123"
|
||||
},
|
||||
{
|
||||
"type": "computer_call_output",
|
||||
"call_id": "call_123",
|
||||
"output": {"image_url": "data:image/png;base64,..."}
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 150,
|
||||
"completion_tokens": 75,
|
||||
"total_tokens": 225,
|
||||
"response_cost": 0.01,
|
||||
}
|
||||
}
|
||||
```
|
||||
- [Agent Loops](https://trycua.com/docs/agent-sdk/agent-loops)
|
||||
- [Supported Agents](https://trycua.com/docs/agent-sdk/supported-agents)
|
||||
- [Supported Model Providers](https://trycua.com/docs/agent-sdk/supported-model-providers)
|
||||
- [Chat History](https://trycua.com/docs/agent-sdk/chat-history)
|
||||
- [Callbacks](https://trycua.com/docs/agent-sdk/callbacks)
|
||||
- [Custom Tools](https://trycua.com/docs/agent-sdk/custom-tools)
|
||||
- [Custom Computer Handlers](https://trycua.com/docs/agent-sdk/custom-computer-handlers)
|
||||
- [Prompt Caching](https://trycua.com/docs/agent-sdk/prompt-caching)
|
||||
- [Usage Tracking](https://trycua.com/docs/agent-sdk/usage-tracking)
|
||||
- [Benchmarks](https://trycua.com/docs/agent-sdk/benchmarks)
|
||||
|
||||
## License
|
||||
|
||||
|
||||
Reference in New Issue
Block a user