From 52005c592fc4c7140dce451cb3730de6ad870eb5 Mon Sep 17 00:00:00 2001
From: Dillon DuPont
Date: Fri, 25 Jul 2025 17:06:09 -0400
Subject: [PATCH] Updated README

---
 libs/python/agent2/README.md      | 450 ++++++++++++++++++++++--------
 libs/python/agent2/example.py     |   4 +-
 libs/python/agent2/pyproject.toml |  44 ++-
 3 files changed, 371 insertions(+), 127 deletions(-)

diff --git a/libs/python/agent2/README.md b/libs/python/agent2/README.md
index 31115835..b76bb673 100644
--- a/libs/python/agent2/README.md
+++ b/libs/python/agent2/README.md
@@ -1,33 +1,120 @@
-# Agent2 - Computer Use Agent
+<div align="center">
+<picture>
+  <!-- light/dark logo sources omitted -->
+  <img alt="Shows my svg">
+</picture>
-**agent2** is a clean Computer-Use framework with liteLLM integration for running agentic workflows on macOS and Linux.
+
+  [![Python](https://img.shields.io/badge/Python-333333?logo=python&logoColor=white&labelColor=333333)](#)
+  [![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
+  [![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
+  [![PyPI](https://img.shields.io/pypi/v/cua-computer?color=333333)](https://pypi.org/project/cua-computer/)
 
+</div>
-## Key Features
+**cua-agent** is a general Computer-Use framework with liteLLM integration for running agentic workflows on macOS, Windows, and Linux sandboxes. It provides a unified interface for computer-use agents across multiple LLM providers, plus an extensible callback system.
 
-- **Docstring-based Tools**: Define tools using standard Python docstrings (no decorators needed)
-- **Regex Model Matching**: Agent loops can match models using regex patterns
-- **liteLLM Integration**: All completions use liteLLM's `.responses()` method
-- **Streaming Support**: Built-in streaming with asyncio.Queue and cancellation support
-- **Computer Tools**: Direct integration with computer interface for clicks, typing, etc.
-- **Custom Tools**: Easy Python function tools with comprehensive docstrings
+## Features
+
+- **Safe Computer-Use/Tool-Use**: Runs agents against sandboxed desktops via the Computer SDK
+- **Multi-Agent Support**: Agent loops for Anthropic Claude, OpenAI computer-use-preview, UI-TARS, and Omniparser + any LLM
+- **Multi-API Support**: Use any of the 100+ LLM APIs supported by liteLLM, including local models (`huggingface-local/`, `ollama_chat/`, `mlx/`)
+- **Cross-Platform**: Works on Windows, macOS, and Linux with cloud and local computer instances
+- **Extensible Callbacks**: Built-in support for image retention, cache control, PII anonymization, budget limits, and trajectory tracking
 
 ## Install
 
 ```bash
-pip install "cua-agent2[all]"
+pip install "cua-agent[all]"
 
 # or install specific providers
-pip install "cua-agent2[anthropic]" # Anthropic support
-pip install "cua-agent2[openai]" # OpenAI computer-use-preview support
+pip install "cua-agent[openai]" # OpenAI computer-use-preview support
+pip install "cua-agent[anthropic]" # Anthropic Claude support
+pip install "cua-agent[omni]" # Omniparser + any LLM support
+pip install "cua-agent[uitars]" # UI-TARS support
+pip install "cua-agent[uitars-mlx]" # UI-TARS + MLX support
+pip install "cua-agent[uitars-hf]" # UI-TARS + Hugging Face support
+pip install "cua-agent[ui]" # Gradio UI support
 ```
 
-## Usage
-
-### Define Tools
+## Quick Start
 
 ```python
-# No imports needed for tools - just define functions with comprehensive docstrings
+import asyncio
+import os
+from agent import ComputerAgent
+from computer import Computer
 
+async def main():
+    # Set up the sandboxed computer instance
+    async with Computer(
+        os_type="linux",
+        provider_type="cloud",
+        name=os.getenv("CUA_CONTAINER_NAME"),
+        api_key=os.getenv("CUA_API_KEY")
+    ) as computer:
+
+        # Create the agent
+        agent = ComputerAgent(
+            model="anthropic/claude-3-5-sonnet-20241022",
+            tools=[computer],
+            only_n_most_recent_images=3,
+            trajectory_dir="trajectories",
+            max_trajectory_budget=5.0  # $5 budget limit
+        )
+
+        # Run the agent
+        messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
+
+        async for result in agent.run(messages):
+            for item in result["output"]:
+                if item["type"] == "message":
+                    print(item["content"][0]["text"])
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Supported Models
+
+### Anthropic Claude (Computer Use API)
+```python
+model="anthropic/claude-3-5-sonnet-20241022"
+model="anthropic/claude-3-5-sonnet-20240620"
+model="anthropic/claude-opus-4-20250514"
+model="anthropic/claude-sonnet-4-20250514"
+```
+
+### OpenAI Computer Use Preview
+```python
+model="openai/computer-use-preview"
+```
+
+### UI-TARS (Local or Hugging Face Inference)
+```python
+model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"
+model="ollama_chat/0000/ui-tars-1.5-7b"
+```
+
+### Omniparser + Any LLM
+```python
+model="omniparser+ollama_chat/mistral-small3.2"
+model="omniparser+vertex_ai/gemini-pro"
+model="omniparser+anthropic/claude-3-5-sonnet-20241022"
+model="omniparser+openai/gpt-4o"
+```
+
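+Any of these strings can be passed directly as the `model` argument. For example, to drive the agent with a local UI-TARS checkpoint (a sketch reusing the `computer` instance from the Quick Start):
+
+```python
+agent = ComputerAgent(
+    model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
+    tools=[computer]
+)
+```
+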
+## Custom Tools
+
+Define custom tools as plain Python functions; adding the `@sandboxed()` decorator runs a tool inside the sandboxed computer rather than on the host:
+
+```python
+from computer.helpers import sandboxed
+
+@sandboxed()
 def read_file(location: str) -> str:
     """Read contents of a file
 
@@ -39,113 +126,256 @@ def read_file(location: str) -> str:
     Returns
     -------
     str
-        Contents of the file
+        Contents of the file or error message
     """
-    with open(location, 'r') as f:
-        return f.read()
+    try:
+        with open(location, 'r') as f:
+            return f.read()
+    except Exception as e:
+        return f"Error reading file: {str(e)}"
 
-def search_web(query: str) -> str:
-    """Search the web for information
+def calculate(a: int, b: int) -> int:
+    """Calculate the sum of two integers"""
+    return a + b
+
+# Use with the agent (`computer` as created in the Quick Start)
+agent = ComputerAgent(
+    model="anthropic/claude-3-5-sonnet-20241022",
+    tools=[computer, read_file, calculate]
+)
+```
+
+## Callbacks System
+
+**cua-agent** provides a comprehensive callback system for extending agent functionality:
+
+### Built-in Callbacks
+
+```python
+import logging
+
+from agent.callbacks import (
+    ImageRetentionCallback,
+    TrajectorySaverCallback,
+    BudgetManagerCallback,
+    LoggingCallback
+)
+
+agent = ComputerAgent(
+    model="anthropic/claude-3-5-sonnet-20241022",
+    tools=[computer],
+    callbacks=[
+        ImageRetentionCallback(only_n_most_recent_images=3),
+        TrajectorySaverCallback(trajectory_dir="trajectories"),
+        BudgetManagerCallback(max_budget=10.0, raise_error=True),
+        LoggingCallback(level=logging.INFO)
+    ]
+)
+```
+
+### Custom Callbacks
+
+```python
+from agent.callbacks.base import AsyncCallbackHandler
+
+class CustomCallback(AsyncCallbackHandler):
+    async def on_llm_start(self, messages):
+        """Preprocess messages before the LLM call"""
+        # Add custom preprocessing logic
+        return messages
-
-    Parameters
-    ----------
-    query : str
-        Search query to look for
+
+    async def on_llm_end(self, messages):
+        """Postprocess messages after the LLM call"""
+        # Add custom postprocessing logic
+        return messages
+
+    async def on_usage(self, usage):
+        """Track usage information"""
+        print(f"Tokens used: {usage.total_tokens}")
+```
+
+## Budget Management
+
+Control costs with built-in budget management:
+
+```python
+# Simple budget limit
+agent = ComputerAgent(
+    model="anthropic/claude-3-5-sonnet-20241022",
+    max_trajectory_budget=5.0  # $5 limit
+)
+
+# Advanced budget configuration
+agent = ComputerAgent(
+    model="anthropic/claude-3-5-sonnet-20241022",
+    max_trajectory_budget={
+        "max_budget": 10.0,
+        "raise_error": True,  # Raise an error when the budget is exceeded
+        "reset_after_each_run": False  # Budget persists across runs
+    }
+)
+```
+
+## Trajectory Management
+
+Save and replay agent conversations:
+
+```python
+agent = ComputerAgent(
+    model="anthropic/claude-3-5-sonnet-20241022",
+    trajectory_dir="trajectories",  # Auto-save trajectories
+    tools=[computer]
+)
+
+# Trajectories are saved with:
+# - Complete conversation history
+# - Usage statistics and costs
+# - Timestamps and metadata
+# - Screenshots and computer actions
+```
+
+## Configuration Options
+
+### ComputerAgent Parameters
+
+- `model`: Model identifier (required)
+- `tools`: List of computer objects and tool functions
+- `callbacks`: List of callback handlers for extensibility
+- `only_n_most_recent_images`: Keep only the N most recent screenshots to prevent context overflow
+- `verbosity`: Logging level (`logging.INFO`, `logging.DEBUG`, etc.)
+- `trajectory_dir`: Directory for saving conversation trajectories
+- `max_retries`: Maximum API call retries (default: 3)
+- `screenshot_delay`: Delay between actions and screenshots (default: 0.5s)
+- `use_prompt_caching`: Enable prompt caching for supported models
+- `max_trajectory_budget`: Budget limit configuration
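+
+All of the above are keyword arguments to the `ComputerAgent` constructor. A minimal sketch combining several of them (values are illustrative; `computer` is the instance from the Quick Start):
+
+```python
+import logging
+
+agent = ComputerAgent(
+    model="anthropic/claude-3-5-sonnet-20241022",
+    tools=[computer],
+    only_n_most_recent_images=3,   # keep the context window small
+    verbosity=logging.INFO,
+    trajectory_dir="trajectories",
+    max_retries=3,
+    screenshot_delay=0.5,
+    use_prompt_caching=True,
+    max_trajectory_budget=5.0
+)
+```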
+
+### Environment Variables
+
+```bash
+# Computer instance (cloud)
+export CUA_CONTAINER_NAME="your-container-name"
+export CUA_API_KEY="your-cua-api-key"
+
+# LLM API keys
+export ANTHROPIC_API_KEY="your-anthropic-key"
+export OPENAI_API_KEY="your-openai-key"
+```
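+
+Because `python-dotenv` is a dependency, these variables can also live in a local `.env` file and be loaded before constructing the `Computer` (a sketch; where you keep the `.env` file is up to you):
+
+```python
+from dotenv import load_dotenv
+
+load_dotenv()  # copies CUA_* and API-key entries from .env into the process environment
+```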
+
+## Advanced Usage
+
+### Streaming Responses
+
+```python
+async for result in agent.run(messages, stream=True):
+    # Process streaming chunks
+    for item in result["output"]:
+        if item["type"] == "message":
+            print(item["content"][0]["text"], end="", flush=True)
+        elif item["type"] == "computer_call":
+            action = item["action"]
+            print(f"\n[Action: {action['type']}]")
+```
+
+### Interactive Chat Loop
+
+```python
+history = []
+while True:
+    user_input = input("> ")
+    if user_input.lower() in ['quit', 'exit']:
+        break
-
-    Returns
-    -------
-    str
-        Search results
+
+    history.append({"role": "user", "content": user_input})
+
+    async for result in agent.run(history):
+        history += result["output"]
+
+        # Display assistant responses
+        for item in result["output"]:
+            if item["type"] == "message":
+                print(item["content"][0]["text"])
+```
+
+### Error Handling
+
+```python
+try:
+    async for result in agent.run(messages):
+        # Process results
+        pass
+except BudgetExceededException:
+    # Raised by the budget manager when max_trajectory_budget
+    # is exceeded with raise_error=True
+    print("Budget limit exceeded")
+except Exception as e:
+    print(f"Agent error: {e}")
+```
+
+## API Reference
+
+### ComputerAgent.run()
+
+```python
+async def run(
+    self,
+    messages: Messages,
+    stream: bool = False,
+    **kwargs
+) -> AsyncGenerator[Dict[str, Any], None]:
+    """
+    Run the agent with the given messages.
+
+    Args:
+        messages: List of message dictionaries
+        stream: Whether to stream the response
+        **kwargs: Additional arguments
+
+    Returns:
+        AsyncGenerator that yields response chunks
     """
-    return f"Search results for: {query}"
 ```
 
-### Define Agent Loops
+### Message Format
 
 ```python
-from agent2 import agent_loop
-from agent2.types import Messages
-
-@agent_loop(models=r"claude-3.*", priority=10)
-async def custom_claude_loop(messages: Messages, model: str, stream: bool = False, tools: Optional[List[Dict[str, Any]]] = None, **kwargs):
-    """Custom agent loop for Claude models."""
-    # Map computer tools to Claude format
-    anthropic_tools = _prepare_tools_for_anthropic(tools)
-
-    # Your custom logic here
-    response = await litellm.aresponses(
-        model=model,
-        messages=messages,
-        stream=stream,
-        tools=anthropic_tools,
-        **kwargs
-    )
-
-    if stream:
-        async for chunk in response:
-            yield chunk
-    else:
-        yield response
-
-@agent_loop(models=r"omni+.*", priority=10)
-async def custom_omni_loop(messages: Messages, model: str, stream: bool = False, tools: Optional[List[Dict[str, Any]]] = None, **kwargs):
-    """Custom agent loop for Omni models."""
-    # Map computer tools to Claude format
-    omni_tools, som_prompt = _prepare_tools_for_omni(tools)
-
-    # Your custom logic here
-    response = await litellm.aresponses(
-        model=model.replace("omni+", ""),
-        messages=som_prompt,
-        stream=stream,
-        tools=omni_tools,
-        **kwargs
-    )
-
-    if stream:
-        async for chunk in response:
-            yield chunk
-    else:
-        yield response
+messages = [
+    {
+        "role": "user",
+        "content": "Take a screenshot and describe what you see"
+    },
+    {
+        "role": "assistant",
+        "content": "I'll take a screenshot for you."
+    }
+]
 ```
 
-### Use ComputerAgent
+### Response Format
 
 ```python
-from agent2 import ComputerAgent
-from computer import Computer
-
-async def main():
-    with Computer() as computer:
-        agent = ComputerAgent(
-            model="claude-3-5-sonnet-20241022",
-            tools=[computer, read_file, search_web]
-        )
-
-        messages = [{"role": "user", "content": "Save a picture of a cat to my desktop."}]
-
-        async for chunk in agent.run(messages, stream=True):
-            print(chunk)
-
-        omni_agent = ComputerAgent(
-            model="omni+vertex_ai/gemini-pro",
-            tools=[computer, read_file, search_web]
-        )
-
-        messages = [{"role": "user", "content": "Save a picture of a cat to my desktop."}]
-
-        async for chunk in omni_agent.run(messages, stream=True):
-            print(chunk)
+{
+    "output": [
+        {
+            "type": "message",
+            "role": "assistant",
+            "content": [{"type": "output_text", "text": "I can see..."}]
+        },
+        {
+            "type": "computer_call",
+            "action": {"type": "screenshot"},
+            "call_id": "call_123"
+        },
+        {
+            "type": "computer_call_output",
+            "call_id": "call_123",
+            "output": {"image_url": "data:image/png;base64,..."}
+        }
+    ],
+    "usage": {
+        "prompt_tokens": 150,
+        "completion_tokens": 75,
+        "total_tokens": 225,
+        "response_cost": 0.01
+    }
+}
 ```
 
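+The `usage` block is reported per result, so total spend can be tallied while a run streams (a minimal sketch using the fields shown above):
+
+```python
+total_tokens = 0
+total_cost = 0.0
+
+async for result in agent.run(messages):
+    usage = result.get("usage", {})
+    total_tokens += usage.get("total_tokens", 0)
+    total_cost += usage.get("response_cost", 0.0)
+
+print(f"Used {total_tokens} tokens (${total_cost:.4f})")
+```
+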
-## Supported Agent Loops
+## License
 
-- **Anthropic**: Claude models with computer use
-- **Computer-Use-Preview**: OpenAI's computer use preview models
-
-## Architecture
-
-- Agent loops are automatically selected based on model regex matching
-- Computer tools are mapped to model-specific schemas
-- All completions use `litellm.responses()` for consistency
-- Streaming is handled with asyncio.Queue for cancellation support
+MIT License - see LICENSE file for details.
\ No newline at end of file
diff --git a/libs/python/agent2/example.py b/libs/python/agent2/example.py
index 21a8da46..38f4fbcd 100644
--- a/libs/python/agent2/example.py
+++ b/libs/python/agent2/example.py
@@ -93,7 +93,7 @@ async def main():
 
         # Supported models:
         # == OpenAI CUA (computer-use-preview) ==
-        # model="openai/computer-use-preview",
+        model="openai/computer-use-preview",
 
         # == Anthropic CUA (Claude > 3.5) ==
         # model="anthropic/claude-opus-4-20250514",
@@ -109,7 +109,7 @@ async def main():
 
         # == Omniparser + Any LLM ==
         # model="omniparser+..."
-        model="omniparser+anthropic/claude-opus-4-20250514",
+        # model="omniparser+anthropic/claude-opus-4-20250514",
 
         tools=[computer],
         only_n_most_recent_images=3,
diff --git a/libs/python/agent2/pyproject.toml b/libs/python/agent2/pyproject.toml
index 7d209bfa..5656c40a 100644
--- a/libs/python/agent2/pyproject.toml
+++ b/libs/python/agent2/pyproject.toml
@@ -3,9 +3,9 @@ requires = ["pdm-backend"]
 build-backend = "pdm.backend"
 
 [project]
-name = "cua-agent2"
-version = "0.1.0"
-description = "CUA Agent2 - Decorator-based Computer Use Agent with liteLLM integration"
+name = "cua-agent"
+version = "0.4.0"
+description = "CUA (Computer Use) Agent for AI-driven computer interaction"
 readme = "README.md"
 authors = [
     { name = "TryCua", email = "gh@trycua.com" }
@@ -19,27 +19,41 @@ dependencies = [
     "pydantic>=2.6.4",
     "rich>=13.7.1",
     "python-dotenv>=1.0.1",
-    "cua-computer>=0.3.0,<0.4.0",
+    "cua-computer>=0.3.0,<0.5.0",
     "cua-core>=0.1.0,<0.2.0",
     "certifi>=2024.2.2",
-    "litellm>=1.0.0"
+    "litellm>=1.74.8"
 ]
 requires-python = ">=3.11"
 
 [project.optional-dependencies]
-anthropic = [
-    "anthropic>=0.49.0",
-    "boto3>=1.35.81",
+openai = []
+anthropic = []
+omni = [
+    "ultralytics>=8.0.0",
+    "cua-som>=0.1.0,<0.2.0",
 ]
-openai = [
-    "openai>=1.14.0",
-    "httpx>=0.27.0",
+uitars = []
+uitars-mlx = [
+    "mlx-vlm>=0.1.27; sys_platform == 'darwin'"
+]
+uitars-hf = [
+    "transformers>=4.54.0"
+]
+ui = [
+    "gradio>=5.23.3",
+    "python-dotenv>=1.0.1",
 ]
 all = [
-    "anthropic>=0.49.0",
-    "boto3>=1.35.81",
-    "openai>=1.14.0",
-    "httpx>=0.27.0",
+    # omni requirements
+    "ultralytics>=8.0.0",
+    "cua-som>=0.1.0,<0.2.0",
+    # uitars requirements
+    "mlx-vlm>=0.1.27; sys_platform == 'darwin'",
+    "transformers>=4.54.0",
+    # ui requirements
+    "gradio>=5.23.3",
+    "python-dotenv>=1.0.1",
 ]
 
 [tool.uv]