renamed to agent

This commit is contained in:
Dillon DuPont
2025-07-25 19:01:20 -04:00
parent 52005c592f
commit 6f177d7f6a
29 changed files with 1320 additions and 25 deletions

View File

@@ -147,12 +147,12 @@ agent = ComputerAgent(
## Callbacks System
Agent2 provides a comprehensive callback system for extending functionality:
agent provides a comprehensive callback system for extending functionality:
### Built-in Callbacks
```python
from agent2.callbacks import (
from agent.callbacks import (
ImageRetentionCallback,
TrajectorySaverCallback,
BudgetManagerCallback,
@@ -174,7 +174,7 @@ agent = ComputerAgent(
### Custom Callbacks
```python
from agent2.callbacks.base import AsyncCallbackHandler
from agent.callbacks.base import AsyncCallbackHandler
class CustomCallback(AsyncCallbackHandler):
async def on_llm_start(self, messages):

View File

@@ -1,5 +1,5 @@
"""
Agent2 - Decorator-based Computer Use Agent with liteLLM integration
agent - Decorator-based Computer Use Agent with liteLLM integration
"""
from .decorators import agent_loop

View File

@@ -0,0 +1,21 @@
"""
Entry point for running agent CLI module.
Usage:
python -m agent.cli <model_string>
"""
import sys
import asyncio
from .cli import main, print_colored

if __name__ == "__main__":
    # This file runs for `python -m agent ...`; the CLI is selected by passing
    # the literal sub-command `cli` as the first argument.
    if len(sys.argv) > 1 and sys.argv[1] == "cli":
        # Drop 'cli' so the CLI's argument parser only sees its own arguments.
        sys.argv.pop(1)
        try:
            asyncio.run(main())
        except (KeyboardInterrupt, EOFError):
            # Same exit handling as the entry point at the bottom of cli.py:
            # Ctrl-C / Ctrl-D ends the session with a farewell, not a traceback.
            print_colored("\n\n👋 Goodbye!")
    else:
        print("Usage: python -m agent.cli <model_string>")
        print("Example: python -m agent.cli openai/computer-use-preview")
        sys.exit(1)

View File

@@ -1,5 +1,5 @@
"""
Adapters package for agent2 - Custom LLM adapters for LiteLLM
Adapters package for agent - Custom LLM adapters for LiteLLM
"""
from .huggingfacelocal_adapter import HuggingFaceLocalAdapter

View File

@@ -192,7 +192,7 @@ class ComputerAgent:
]
# == Initialize computer agent ==
# Find the appropriate agent loop
if custom_loop:
self.agent_loop = custom_loop
@@ -204,16 +204,26 @@ class ComputerAgent:
self.agent_loop = loop_info.func
self.agent_loop_info = loop_info
# Process tools and create tool schemas
self.tool_schemas = self._process_tools()
self.tool_schemas = []
self.computer_handler = None
# Find computer tool and create interface adapter
computer_handler = None
for schema in self.tool_schemas:
if schema["type"] == "computer":
computer_handler = OpenAIComputerHandler(schema["computer"].interface)
break
self.computer_handler = computer_handler
async def _initialize_computers(self):
    """Start pending tools and build the tool schemas on first use.

    Returns immediately when ``self.tool_schemas`` is already populated.
    Otherwise every tool that exposes a falsy ``_initialized`` attribute is
    awaited via ``tool.run()``, the schemas are rebuilt with
    ``self._process_tools()``, and ``self.computer_handler`` is set to a
    handler wrapping the first schema of type ``"computer"`` (or ``None``
    when there is no such schema).
    """
    if self.tool_schemas:
        return

    # Tools without an `_initialized` attribute are treated as ready.
    for tool in self.tools:
        if not getattr(tool, "_initialized", True):
            await tool.run()

    # Process tools and create tool schemas
    self.tool_schemas = self._process_tools()

    # Only the first computer schema gets an interface adapter.
    self.computer_handler = next(
        (
            OpenAIComputerHandler(entry["computer"].interface)
            for entry in self.tool_schemas
            if entry["type"] == "computer"
        ),
        None,
    )
def _process_input(self, input: Messages) -> List[Dict[str, Any]]:
"""Process input messages and create schemas for the agent loop"""
@@ -484,6 +494,9 @@ class ComputerAgent:
Returns:
AsyncGenerator that yields response chunks
"""
await self._initialize_computers()
# Merge kwargs
merged_kwargs = {**self.kwargs, **kwargs}

View File

@@ -54,10 +54,10 @@ class LoggingCallback(AsyncCallbackHandler):
Initialize the logging callback.
Args:
logger: Logger instance to use. If None, creates a logger named 'agent2.ComputerAgent'
logger: Logger instance to use. If None, creates a logger named 'agent.ComputerAgent'
level: Logging level (logging.DEBUG, logging.INFO, etc.)
"""
self.logger = logger or logging.getLogger('agent2.ComputerAgent')
self.logger = logger or logging.getLogger('agent.ComputerAgent')
self.level = level
# Set up logger if it doesn't have handlers

View File

@@ -0,0 +1,290 @@
"""
CLI chat interface for agent - Computer Use Agent
Usage:
python -m agent.cli <model_string>
Examples:
python -m agent.cli openai/computer-use-preview
python -m agent.cli anthropic/claude-3-5-sonnet-20241022
python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
"""
import asyncio
import argparse
import os
import sys
import json
from typing import List, Dict, Any
import dotenv
from yaspin import yaspin
# Load environment variables
dotenv.load_dotenv()
# Color codes for terminal output
class Colors:
    """ANSI escape sequences used to style terminal output."""

    # Attribute controls
    RESET = "\033[0m"
    BOLD = "\033[1m"
    DIM = "\033[2m"

    # Foreground (text) colors
    RED = "\033[31m"
    GREEN = "\033[32m"
    YELLOW = "\033[33m"
    BLUE = "\033[34m"
    MAGENTA = "\033[35m"
    CYAN = "\033[36m"
    WHITE = "\033[37m"
    GRAY = "\033[90m"

    # Background colors
    BG_RED = "\033[41m"
    BG_GREEN = "\033[42m"
    BG_YELLOW = "\033[43m"
    BG_BLUE = "\033[44m"
def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = False, end: str = "\n"):
    """Print *text* wrapped in ANSI styling codes.

    Args:
        text: The text to print.
        color: Escape sequence (e.g. ``Colors.RED``) placed before the text;
            empty for no color.
        bold: Prepend ``Colors.BOLD`` when true.
        dim: Prepend ``Colors.DIM`` when true.
        end: Passed through to ``print`` as its ``end`` argument.

    ``Colors.RESET`` is always appended after the text.
    """
    codes = []
    if bold:
        codes.append(Colors.BOLD)
    if dim:
        codes.append(Colors.DIM)
    if color:
        codes.append(color)
    print(f"{''.join(codes)}{text}{Colors.RESET}", end=end)
def print_action(action_type: str, details: Dict[str, Any]):
    """Print a one-line, dimmed summary of a computer action.

    Args:
        action_type: Name of the action (e.g. "click", "type", "key", "scroll").
        details: Action payload; only the keys relevant to *action_type*
            ("x"/"y", "text" or "key") are rendered.
    """
    if action_type in ("click", "scroll") and "x" in details and "y" in details:
        # Pointer actions are rendered with their coordinates.
        args_str = f"({details['x']}, {details['y']})"
    elif action_type == "type" and "text" in details:
        typed = details["text"]
        # Keep long typed text to 50 characters including the ellipsis.
        if len(typed) > 50:
            typed = typed[:47] + "..."
        args_str = f'"{typed}"'
    elif action_type == "key" and "key" in details:
        args_str = f"'{details['key']}'"
    else:
        args_str = ""

    print_colored(f"🛠️ {action_type}{args_str}", dim=True)
def print_welcome(model: str, agent_loop: str, container_name: str):
    """Print the connection banner followed by the exit hint.

    Args:
        model: Model string shown in the banner.
        agent_loop: Name of the agent loop shown in the banner.
        container_name: Name of the container the session is connected to.
    """
    banner = f"Connected to {container_name} ({model}, {agent_loop})"
    print_colored(banner)
    print_colored("Type 'exit' to quit.", dim=True)
async def ainput(prompt: str = ""):
    """Read a line with the built-in ``input`` without blocking the event loop.

    The blocking call is handed to ``asyncio.to_thread``; the entered line
    is returned once it completes.
    """
    line = await asyncio.to_thread(input, prompt)
    return line
async def chat_loop(agent, model: str, container_name: str):
    """Run the interactive prompt/response loop until the user exits.

    Each non-empty line is appended to a running message history, which is
    passed to ``agent.run``. Every streamed result's "output" items are added
    to the history and rendered: message text is printed, computer and
    function calls are summarised and reflected in the spinner text, and
    non-blank function outputs are printed dimmed.

    Args:
        agent: Object exposing ``agent_loop`` and an async-iterable ``run(history)``.
        model: Model string, shown in the welcome banner.
        container_name: Container name, shown in the welcome banner.
    """
    print_welcome(model, agent.agent_loop.__name__, container_name)
    history = []

    while True:
        print_colored("> ", end="")
        user_input = await ainput()

        if user_input.lower() in ("exit", "quit", "q"):
            print_colored("\n👋 Goodbye!")
            break
        if not user_input:
            continue

        history.append({"role": "user", "content": user_input})

        with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
            # The spinner stays hidden until an action or function call shows it.
            spinner.hide()

            async for result in agent.run(history):
                output_items = result.get("output", [])
                history.extend(output_items)

                for item in output_items:
                    kind = item.get("type")

                    if kind == "message":
                        # Print every non-blank text part of the agent's reply.
                        for part in item.get("content", []):
                            text = (part.get("text") or "").strip()
                            if text:
                                spinner.hide()
                                print_colored(text)

                    elif kind == "computer_call":
                        action = item.get("action", {})
                        action_type = action.get("type", "")
                        if action_type:
                            # Hide while printing, then show with the new status.
                            spinner.hide()
                            print_action(action_type, action)
                            spinner.text = f"Performing {action_type}..."
                            spinner.show()

                    elif kind == "function_call":
                        function_name = item.get("name", "")
                        spinner.hide()
                        print_colored(f"🔧 Calling function: {function_name}", dim=True)
                        spinner.text = f"Calling {function_name}..."
                        spinner.show()

                    elif kind == "function_call_output":
                        output = item.get("output", "")
                        if output and output.strip():
                            spinner.hide()
                            print_colored(f"📤 {output}", dim=True)

            # Leave the terminal clean before the next prompt.
            spinner.hide()
async def main():
    """Main CLI function.

    Parses the command line, collects the CUA container name / API key and
    the model provider's API key (prompting for any that are not set in the
    environment), connects to the cloud computer, builds a ``ComputerAgent``
    and hands control to ``chat_loop``.

    Exits with status 1 when a required value is left empty at its prompt
    or when the ``agent`` / ``computer`` packages cannot be imported.
    """
    parser = argparse.ArgumentParser(
        description="CUA Agent CLI - Interactive computer use assistant",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python -m agent.cli openai/computer-use-preview
  python -m agent.cli anthropic/claude-3-5-sonnet-20241022
  python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
  python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
        """
    )
    parser.add_argument(
        "model",
        help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')"
    )
    parser.add_argument(
        "--images",
        type=int,
        default=3,
        help="Number of recent images to keep in context (default: 3)"
    )
    parser.add_argument(
        "--trajectory",
        action="store_true",
        help="Save trajectory for debugging"
    )
    parser.add_argument(
        "--budget",
        type=float,
        help="Maximum budget for the session (in dollars)"
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose logging"
    )
    args = parser.parse_args()

    # Check for required environment variables
    container_name = os.getenv("CUA_CONTAINER_NAME")
    cua_api_key = os.getenv("CUA_API_KEY")

    # Prompt for missing environment variables
    if not container_name:
        print_colored("CUA_CONTAINER_NAME not set.", dim=True)
        print_colored("You can get a CUA container at https://www.trycua.com/", dim=True)
        container_name = input("Enter your CUA container name: ").strip()
        if not container_name:
            print_colored("❌ Container name is required.")
            sys.exit(1)

    if not cua_api_key:
        print_colored("CUA_API_KEY not set.", dim=True)
        cua_api_key = input("Enter your CUA API key: ").strip()
        if not cua_api_key:
            print_colored("❌ API key is required.")
            sys.exit(1)

    # Map each LLM provider prefix to the API key it needs.
    provider_api_keys = {
        "openai/": "OPENAI_API_KEY",
        "anthropic/": "ANTHROPIC_API_KEY",
    }

    # Composed models look like "omniparser+<provider>/<model>"; the key that
    # matters is the one of the provider after the "+". (A dict literal cannot
    # map "omniparser+" to two different keys -- the last entry would win --
    # so the composition prefix is stripped before matching instead.)
    llm_model = args.model
    if llm_model.startswith("omniparser+"):
        llm_model = llm_model[len("omniparser+"):]

    # Find matching provider and check for API key
    for prefix, env_var in provider_api_keys.items():
        if llm_model.startswith(prefix):
            if not os.getenv(env_var):
                print_colored(f"{env_var} not set.", dim=True)
                api_key = input(f"Enter your {env_var.replace('_', ' ').title()}: ").strip()
                if not api_key:
                    print_colored(f"{env_var.replace('_', ' ').title()} is required.")
                    sys.exit(1)
                # Set the environment variable for the session
                os.environ[env_var] = api_key
            break

    # Import here to avoid import errors if dependencies are missing
    try:
        from agent import ComputerAgent
        from computer import Computer
    except ImportError as e:
        print_colored(f"❌ Import error: {e}", Colors.RED, bold=True)
        print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
        sys.exit(1)

    # Create computer instance
    async with Computer(
        os_type="linux",
        provider_type="cloud",
        name=container_name,
        api_key=cua_api_key
    ) as computer:
        # Create agent
        agent_kwargs = {
            "model": args.model,
            "tools": [computer],
            "only_n_most_recent_images": args.images,
            "verbosity": 20 if args.verbose else 30,  # logging.INFO vs logging.WARNING
        }

        if args.trajectory:
            agent_kwargs["trajectory_dir"] = "trajectories"

        if args.budget:
            agent_kwargs["max_trajectory_budget"] = {
                "max_budget": args.budget,
                "raise_error": True,
                "reset_after_each_run": False
            }

        agent = ComputerAgent(**agent_kwargs)

        # Start chat loop
        await chat_loop(agent, args.model, container_name)


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except (KeyboardInterrupt, EOFError):
        # Ctrl-C / Ctrl-D at a prompt is the normal way to leave the session.
        print_colored("\n\n👋 Goodbye!")

View File

@@ -1,5 +1,5 @@
"""
Decorators for agent2 - agent_loop decorator
Decorators for agent - agent_loop decorator
"""
import asyncio

View File

@@ -1,5 +1,5 @@
"""
Agent loops for agent2
Agent loops for agent
"""
# Import the loops to register them

View File

@@ -1,5 +1,5 @@
"""
Type definitions for agent2
Type definitions for agent
"""
from typing import Dict, List, Any, Optional, Callable, Protocol, Literal

View File

@@ -0,0 +1,7 @@
"""
UI components for agent
"""
from .gradio import test_cua, create_gradio_ui
__all__ = ["test_cua", "create_gradio_ui"]

View File

@@ -0,0 +1,8 @@
"""
Gradio UI for agent
"""
from .app import test_cua
from .ui_components import create_gradio_ui
__all__ = ["test_cua", "create_gradio_ui"]

View File

@@ -0,0 +1,248 @@
"""
Advanced Gradio UI for Computer-Use Agent (cua-agent)
This is a Gradio interface for the Computer-Use Agent v0.4.x (cua-agent)
with an advanced UI for model selection and configuration.
Supported Agent Models:
- OpenAI: openai/computer-use-preview
- Anthropic: anthropic/claude-3-5-sonnet-20241022, anthropic/claude-3-7-sonnet-20250219
- UI-TARS: huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
- Omniparser: omniparser+anthropic/claude-3-5-sonnet-20241022, omniparser+ollama_chat/gemma3
Requirements:
- Mac with Apple Silicon (M1/M2/M3/M4), Linux, or Windows
- macOS 14 (Sonoma) or newer / Ubuntu 20.04+
- Python 3.11+
- Lume CLI installed (https://github.com/trycua/cua)
- OpenAI or Anthropic API key
"""
import os
import asyncio
import logging
import json
import platform
from pathlib import Path
from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple, Union
import gradio as gr
from gradio.components.chatbot import MetadataDict
from typing import cast
# Import from agent package
from agent import ComputerAgent
from agent.types import Messages, AgentResponse
from computer import Computer
# Global variables
# Module-level singletons; the factory functions in this module rebind them.
global_agent = None
global_computer = None

# Settings are read from / written to this path.
SETTINGS_FILE = Path(".gradio_settings.json")

import dotenv

# Report whether load_dotenv() found and loaded a .env file.
print(
    f"DEBUG - Loaded environment variables from {dotenv.find_dotenv()}"
    if dotenv.load_dotenv()
    else "DEBUG - No .env file found"
)
# --- Settings Load/Save Functions ---
def load_settings() -> Dict[str, Any]:
    """Load the saved settings from ``SETTINGS_FILE``.

    Returns:
        The parsed settings when the file exists and contains a JSON object;
        otherwise an empty dict. Read and parse failures are reported with a
        printed warning rather than raised.
    """
    if not SETTINGS_FILE.exists():
        return {}

    try:
        with SETTINGS_FILE.open("r") as handle:
            loaded = json.load(handle)
        # Anything other than a JSON object is ignored.
        if isinstance(loaded, dict):
            print(f"DEBUG - Loaded settings from {SETTINGS_FILE}")
            return loaded
    except (json.JSONDecodeError, IOError) as e:
        print(f"Warning: Could not load settings from {SETTINGS_FILE}: {e}")
    return {}
def save_settings(settings: Dict[str, Any]):
    """Save settings to ``SETTINGS_FILE`` as indented JSON.

    The "provider_api_key" entry is never written to disk. The caller's
    dict is not modified; a filtered copy is serialized instead.

    Args:
        settings: Mapping of setting names to JSON-serializable values.

    I/O failures are reported with a printed warning rather than raised.
    """
    # Filter into a new dict so the secret is dropped without mutating the argument.
    persistable = {key: value for key, value in settings.items() if key != "provider_api_key"}

    # Serialize before opening the file so a serialization error cannot leave
    # a truncated settings file behind.
    payload = json.dumps(persistable, indent=4)

    try:
        with open(SETTINGS_FILE, "w") as f:
            f.write(payload)
        print(f"DEBUG - Saved settings to {SETTINGS_FILE}")
    except IOError as e:
        print(f"Warning: Could not save settings to {SETTINGS_FILE}: {e}")
# Custom Screenshot Handler for Gradio chat
class GradioChatScreenshotHandler:
    """Appends screenshots to a Gradio chatbot history as inline images."""

    def __init__(self, chatbot_history: List[gr.ChatMessage]):
        """Keep a reference to the chat history list that screenshots are appended to."""
        self.chatbot_history = chatbot_history
        print("GradioChatScreenshotHandler initialized")

    async def on_screenshot(self, screenshot_base64: str, action_type: str = "") -> None:
        """Append an assistant message embedding the screenshot.

        Args:
            screenshot_base64: Base64 image data, embedded in a PNG data URI.
            action_type: Label of the action that produced the screenshot;
                used in the image alt text and the message title.
        """
        if self.chatbot_history is None:
            return

        message = gr.ChatMessage(
            role="assistant",
            content=f"![Screenshot after {action_type}](data:image/png;base64,{screenshot_base64})",
            metadata={"title": f"🖥️ Screenshot - {action_type}", "status": "done"},
        )
        self.chatbot_history.append(message)
# Detect platform capabilities
# True when running on macOS.
is_mac = platform.system().lower() == "darwin"

# Lume counts as available on macOS, or anywhere PYLUME_HOST is set to
# something other than its "localhost" default.
is_lume_available = is_mac or (os.getenv("PYLUME_HOST", "localhost") != "localhost")

print("PYLUME_HOST: ", os.getenv("PYLUME_HOST", "localhost"))
print("is_mac: ", is_mac)
print("Lume available: ", is_lume_available)
# Map model names to agent model strings
MODEL_MAPPINGS = {
    # Outer keys are lower-cased loop provider names (get_model_string looks
    # them up with loop_provider.lower()). Inner keys are model display names;
    # the "default" entry is returned when the display name is not listed.
    "openai": {
        "default": "openai/computer-use-preview",
        "OpenAI: Computer-Use Preview": "openai/computer-use-preview",
    },
    "anthropic": {
        "default": "anthropic/claude-3-7-sonnet-20250219",
        "Anthropic: Claude 4 Opus (20250514)": "anthropic/claude-opus-4-20250514",
        "Anthropic: Claude 4 Sonnet (20250514)": "anthropic/claude-sonnet-4-20250514",
        "Anthropic: Claude 3.7 Sonnet (20250219)": "anthropic/claude-3-7-sonnet-20250219",
        "Anthropic: Claude 3.5 Sonnet (20240620)": "anthropic/claude-3-5-sonnet-20240620",
    },
    # "omniparser+<provider>/<model>" strings
    "omni": {
        "default": "omniparser+openai/gpt-4o",
        "OMNI: OpenAI GPT-4o": "omniparser+openai/gpt-4o",
        "OMNI: OpenAI GPT-4o mini": "omniparser+openai/gpt-4o-mini",
        "OMNI: Claude 3.7 Sonnet (20250219)": "omniparser+anthropic/claude-3-7-sonnet-20250219",
        "OMNI: Claude 3.5 Sonnet (20240620)": "omniparser+anthropic/claude-3-5-sonnet-20240620",
    },
    "uitars": {
        # The local Hugging Face model is the default only on macOS.
        # NOTE(review): confirm "ui-tars" is a model string the agent accepts.
        "default": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B" if is_mac else "ui-tars",
        "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
    },
}
def get_model_string(model_name: str, loop_provider: str) -> str:
    """Translate a UI model selection into an agent model string.

    Args:
        model_name: Display name chosen in the UI.
        loop_provider: Agent loop provider name (matched case-insensitively
            against ``MODEL_MAPPINGS``, except for the literal "OMNI-OLLAMA").

    Returns:
        - "custom_oaicompat" / "custom_ollama" for the two custom-model choices;
        - "omniparser+ollama_chat/<model>" for "OMNI: Ollama <model>" names,
          or the llama3 variant when only the provider is "OMNI-OLLAMA";
        - otherwise the ``MODEL_MAPPINGS`` entry for the provider (falling
          back to the "openai" table and to each table's "default").
    """
    custom_placeholders = {
        "Custom model (OpenAI compatible API)": "custom_oaicompat",
        "Custom model (ollama)": "custom_ollama",
    }
    if model_name in custom_placeholders:
        return custom_placeholders[model_name]

    ollama_prefix = "OMNI: Ollama "
    if model_name.startswith(ollama_prefix):
        return f"omniparser+ollama_chat/{model_name[len(ollama_prefix):]}"
    if loop_provider == "OMNI-OLLAMA":
        return "omniparser+ollama_chat/llama3"

    # Map based on loop provider
    provider_table = MODEL_MAPPINGS.get(loop_provider.lower(), MODEL_MAPPINGS["openai"])
    return provider_table.get(model_name, provider_table["default"])
def get_ollama_models() -> List[str]:
    """List locally available Ollama models as OMNI display names.

    Runs ``ollama list`` and turns the first column of every non-empty row
    after the header into "OMNI: Ollama <name>".

    Returns:
        The display names, or an empty list when the command exits non-zero,
        prints no model rows, or any exception occurs (which is logged).
    """
    try:
        import subprocess

        completed = subprocess.run(["ollama", "list"], capture_output=True, text=True)
        if completed.returncode != 0:
            return []

        # The first line of the listing is the column header.
        rows = completed.stdout.strip().split("\n")[1:]
        return [
            f"OMNI: Ollama {fields[0]}"
            for fields in map(str.split, rows)
            if fields
        ]
    except Exception as e:
        logging.error(f"Error getting Ollama models: {e}")
        return []
def create_computer_instance(
    verbosity: int = logging.INFO,
    os_type: str = "macos",
    provider_type: str = "lume",
    name: Optional[str] = None,
    api_key: Optional[str] = None
) -> Computer:
    """Return the module-wide ``Computer``, creating it on first call.

    Args:
        verbosity: Passed to ``Computer`` as ``verbosity``.
        os_type: Passed to ``Computer`` as ``os_type``.
        provider_type: Passed to ``Computer`` as ``provider_type``.
        name: Computer name; ``None`` or empty is passed on as "".
        api_key: Passed to ``Computer`` as ``api_key``.

    The arguments are only used when no instance exists yet; later calls
    return the existing instance unchanged.
    NOTE(review): this means a changed OS/provider is ignored after the first
    call -- confirm that is intended.
    """
    global global_computer

    if global_computer is not None:
        return global_computer

    global_computer = Computer(
        verbosity=verbosity,
        os_type=os_type,
        provider_type=provider_type,
        name=name or "",
        api_key=api_key,
    )
    return global_computer
def create_agent(
    model_string: str,
    save_trajectory: bool = True,
    only_n_most_recent_images: int = 3,
    verbosity: int = logging.INFO,
    custom_model_name: Optional[str] = None,
    computer_os: str = "macos",
    computer_provider: str = "lume",
    computer_name: Optional[str] = None,
    computer_api_key: Optional[str] = None,
    max_trajectory_budget: Optional[float] = None,
) -> ComputerAgent:
    """Build a ``ComputerAgent`` and store it as the module-wide agent.

    Args:
        model_string: Agent model string, or one of the placeholders
            "custom_oaicompat" / "custom_ollama".
        save_trajectory: When true, trajectories go to the "trajectories" directory.
        only_n_most_recent_images: Forwarded to the agent unchanged.
        verbosity: Forwarded to both the computer and the agent.
        custom_model_name: Replaces a placeholder *model_string*; for
            "custom_ollama" it is prefixed with "omniparser+ollama_chat/".
        computer_os, computer_provider, computer_name, computer_api_key:
            Forwarded to ``create_computer_instance``.
        max_trajectory_budget: When truthy, sets a budget that raises on overrun.

    Returns:
        The newly created agent (also bound to ``global_agent``).
    """
    global global_agent

    # Create the computer
    computer = create_computer_instance(
        verbosity=verbosity,
        os_type=computer_os,
        provider_type=computer_provider,
        name=computer_name,
        api_key=computer_api_key,
    )

    # Placeholders are only resolved when a custom model name was supplied.
    if custom_model_name:
        if model_string == "custom_oaicompat":
            model_string = custom_model_name
        elif model_string == "custom_ollama":
            model_string = f"omniparser+ollama_chat/{custom_model_name}"

    agent_kwargs = dict(
        model=model_string,
        tools=[computer],
        only_n_most_recent_images=only_n_most_recent_images,
        verbosity=verbosity,
    )
    if save_trajectory:
        agent_kwargs["trajectory_dir"] = "trajectories"
    if max_trajectory_budget:
        agent_kwargs["max_trajectory_budget"] = {
            "max_budget": max_trajectory_budget,
            "raise_error": True,
        }

    global_agent = ComputerAgent(**agent_kwargs)
    return global_agent
def test_cua():
    """Build the Gradio UI and launch it locally in a browser tab.

    The UI factory is imported inside the function rather than at module top.
    """
    from agent.ui.gradio.ui_components import create_gradio_ui

    print("Starting Gradio app for CUA Agent...")
    create_gradio_ui().launch(share=False, inbrowser=True)


if __name__ == "__main__":
    test_cua()

View File

@@ -0,0 +1,703 @@
"""
UI Components for the Gradio interface
"""
import os
import asyncio
import logging
import json
import platform
from pathlib import Path
from typing import Dict, List, Optional, Any, cast
import gradio as gr
from gradio.components.chatbot import MetadataDict
from .app import (
load_settings, save_settings, create_agent, get_model_string,
get_ollama_models, GradioChatScreenshotHandler, global_agent, global_computer
)
def create_gradio_ui() -> gr.Blocks:
"""Create a Gradio UI for the Computer-Use Agent."""
# Load settings
saved_settings = load_settings()
# Check for API keys
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
cua_api_key = os.environ.get("CUA_API_KEY", "")
# Model choices
openai_models = ["OpenAI: Computer-Use Preview"]
anthropic_models = [
"Anthropic: Claude 4 Opus (20250514)",
"Anthropic: Claude 4 Sonnet (20250514)",
"Anthropic: Claude 3.7 Sonnet (20250219)",
"Anthropic: Claude 3.5 Sonnet (20240620)",
]
omni_models = [
"OMNI: OpenAI GPT-4o",
"OMNI: OpenAI GPT-4o mini",
"OMNI: Claude 3.7 Sonnet (20250219)",
"OMNI: Claude 3.5 Sonnet (20240620)"
]
# Check if API keys are available
has_openai_key = bool(openai_api_key)
has_anthropic_key = bool(anthropic_api_key)
has_cua_key = bool(cua_api_key)
# Get Ollama models for OMNI
ollama_models = get_ollama_models()
if ollama_models:
omni_models += ollama_models
# Detect platform
is_mac = platform.system().lower() == "darwin"
# Format model choices
provider_to_models = {
"OPENAI": openai_models,
"ANTHROPIC": anthropic_models,
"OMNI": omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
"UITARS": ([
"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
] if is_mac else []) + ["Custom model (OpenAI compatible API)"],
}
# Apply saved settings
initial_loop = saved_settings.get("agent_loop", "OMNI")
available_models_for_loop = provider_to_models.get(initial_loop, [])
saved_model_choice = saved_settings.get("model_choice")
if saved_model_choice and saved_model_choice in available_models_for_loop:
initial_model = saved_model_choice
else:
if initial_loop == "OPENAI":
initial_model = openai_models[0] if openai_models else "No models available"
elif initial_loop == "ANTHROPIC":
initial_model = anthropic_models[0] if anthropic_models else "No models available"
else: # OMNI
initial_model = omni_models[0] if omni_models else "Custom model (OpenAI compatible API)"
initial_custom_model = saved_settings.get("custom_model", "Qwen2.5-VL-7B-Instruct")
initial_provider_base_url = saved_settings.get("provider_base_url", "http://localhost:1234/v1")
initial_save_trajectory = saved_settings.get("save_trajectory", True)
initial_recent_images = saved_settings.get("recent_images", 3)
# Example prompts
example_messages = [
"Create a Python virtual environment, install pandas and matplotlib, then plot stock data",
"Open a PDF in Preview, add annotations, and save it as a compressed version",
"Open Safari, search for 'macOS automation tools', and save the first three results as bookmarks",
"Configure SSH keys and set up a connection to a remote server",
]
def generate_python_code(agent_loop_choice, model_name, tasks, recent_images=3, save_trajectory=True, computer_os="linux", computer_provider="cloud", container_name="", cua_cloud_api_key="", max_budget=None):
"""Generate Python code for the current configuration and tasks."""
tasks_str = ""
for task in tasks:
if task and task.strip():
tasks_str += f' "{task}",\n'
model_string = get_model_string(model_name, agent_loop_choice)
computer_args = []
if computer_os != "macos":
computer_args.append(f'os_type="{computer_os}"')
if computer_provider != "lume":
computer_args.append(f'provider_type="{computer_provider}"')
if container_name:
computer_args.append(f'name="{container_name}"')
if cua_cloud_api_key:
computer_args.append(f'api_key="{cua_cloud_api_key}"')
computer_args_str = ", ".join(computer_args)
if computer_args_str:
computer_args_str = f"({computer_args_str})"
else:
computer_args_str = "()"
code = f'''import asyncio
from computer import Computer
from agent import ComputerAgent
async def main():
async with Computer{computer_args_str} as computer:
agent = ComputerAgent(
model="{model_string}",
tools=[computer],
only_n_most_recent_images={recent_images},'''
if save_trajectory:
code += '''
trajectory_dir="trajectories",'''
if max_budget:
code += f'''
max_trajectory_budget={{"max_budget": {max_budget}, "raise_error": True}},'''
code += '''
)
'''
if tasks_str:
code += f'''
# Prompts for the computer-use agent
tasks = [
{tasks_str.rstrip()}
]
for task in tasks:
print(f"Executing task: {{task}}")
messages = [{{"role": "user", "content": task}}]
async for result in agent.run(messages):
for item in result["output"]:
if item["type"] == "message":
print(item["content"][0]["text"])'''
else:
code += f'''
# Execute a single task
task = "Search for information about CUA on GitHub"
print(f"Executing task: {{task}}")
messages = [{{"role": "user", "content": task}}]
async for result in agent.run(messages):
for item in result["output"]:
if item["type"] == "message":
print(item["content"][0]["text"])'''
code += '''
if __name__ == "__main__":
asyncio.run(main())'''
return code
# Create the Gradio interface
with gr.Blocks(title="Computer-Use Agent") as demo:
with gr.Row():
# Left column for settings
with gr.Column(scale=1):
# Logo
gr.HTML(
"""
<div style="display: flex; justify-content: center; margin-bottom: 0.5em">
<img alt="CUA Logo" style="width: 80px;"
src="https://github.com/trycua/cua/blob/main/img/logo_black.png?raw=true" />
</div>
"""
)
# Python code accordion
with gr.Accordion("Python Code", open=False):
code_display = gr.Code(
language="python",
value=generate_python_code(initial_loop, "gpt-4o", []),
interactive=False,
)
with gr.Accordion("Computer Configuration", open=True):
computer_os = gr.Radio(
choices=["macos", "linux", "windows"],
label="Operating System",
value="macos",
info="Select the operating system for the computer",
)
is_windows = platform.system().lower() == "windows"
is_mac = platform.system().lower() == "darwin"
providers = ["cloud"]
if is_mac:
providers += ["lume"]
if is_windows:
providers += ["winsandbox"]
computer_provider = gr.Radio(
choices=providers,
label="Provider",
value="lume" if is_mac else "cloud",
info="Select the computer provider",
)
container_name = gr.Textbox(
label="Container Name",
placeholder="Enter container name (optional)",
value=os.environ.get("CUA_CONTAINER_NAME", ""),
info="Optional name for the container",
)
cua_cloud_api_key = gr.Textbox(
label="CUA Cloud API Key",
placeholder="Enter your CUA Cloud API key",
value=os.environ.get("CUA_API_KEY", ""),
type="password",
info="Required for cloud provider",
visible=(not has_cua_key)
)
with gr.Accordion("Agent Configuration", open=True):
agent_loop = gr.Dropdown(
choices=["OPENAI", "ANTHROPIC", "OMNI", "UITARS"],
label="Agent Loop",
value=initial_loop,
info="Select the agent loop provider",
)
# Model selection dropdowns
with gr.Group() as model_selection_group:
openai_model_choice = gr.Dropdown(
choices=openai_models,
label="OpenAI Model",
value=openai_models[0] if openai_models else "No models available",
info="Select OpenAI model",
interactive=True,
visible=(initial_loop == "OPENAI")
)
anthropic_model_choice = gr.Dropdown(
choices=anthropic_models,
label="Anthropic Model",
value=anthropic_models[0] if anthropic_models else "No models available",
info="Select Anthropic model",
interactive=True,
visible=(initial_loop == "ANTHROPIC")
)
omni_model_choice = gr.Dropdown(
choices=omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
label="OMNI Model",
value=omni_models[0] if omni_models else "Custom model (OpenAI compatible API)",
info="Select OMNI model or choose a custom model option",
interactive=True,
visible=(initial_loop == "OMNI")
)
uitars_model_choice = gr.Dropdown(
choices=provider_to_models.get("UITARS", ["No models available"]),
label="UITARS Model",
value=provider_to_models.get("UITARS", ["No models available"])[0] if provider_to_models.get("UITARS") else "No models available",
info="Select UITARS model",
interactive=True,
visible=(initial_loop == "UITARS")
)
model_choice = gr.Textbox(visible=False)
# API key inputs
with gr.Group(visible=not has_openai_key and (initial_loop == "OPENAI" or initial_loop == "OMNI")) as openai_key_group:
openai_api_key_input = gr.Textbox(
label="OpenAI API Key",
placeholder="Enter your OpenAI API key",
value=os.environ.get("OPENAI_API_KEY", ""),
interactive=True,
type="password",
info="Required for OpenAI models"
)
with gr.Group(visible=not has_anthropic_key and (initial_loop == "ANTHROPIC" or initial_loop == "OMNI")) as anthropic_key_group:
anthropic_api_key_input = gr.Textbox(
label="Anthropic API Key",
placeholder="Enter your Anthropic API key",
value=os.environ.get("ANTHROPIC_API_KEY", ""),
interactive=True,
type="password",
info="Required for Anthropic models"
)
# API key handlers
def set_openai_api_key(key):
if key and key.strip():
os.environ["OPENAI_API_KEY"] = key.strip()
print(f"DEBUG - Set OpenAI API key environment variable")
return key
def set_anthropic_api_key(key):
if key and key.strip():
os.environ["ANTHROPIC_API_KEY"] = key.strip()
print(f"DEBUG - Set Anthropic API key environment variable")
return key
openai_api_key_input.change(
fn=set_openai_api_key,
inputs=[openai_api_key_input],
outputs=[openai_api_key_input],
queue=False
)
anthropic_api_key_input.change(
fn=set_anthropic_api_key,
inputs=[anthropic_api_key_input],
outputs=[anthropic_api_key_input],
queue=False
)
# UI update function
def update_ui(loop=None, openai_model=None, anthropic_model=None, omni_model=None, uitars_model=None):
loop = loop or agent_loop.value
model_value = None
if loop == "OPENAI" and openai_model:
model_value = openai_model
elif loop == "ANTHROPIC" and anthropic_model:
model_value = anthropic_model
elif loop == "OMNI" and omni_model:
model_value = omni_model
elif loop == "UITARS" and uitars_model:
model_value = uitars_model
openai_visible = (loop == "OPENAI")
anthropic_visible = (loop == "ANTHROPIC")
omni_visible = (loop == "OMNI")
uitars_visible = (loop == "UITARS")
show_openai_key = not has_openai_key and (loop == "OPENAI" or (loop == "OMNI" and model_value and "OpenAI" in model_value and "Custom" not in model_value))
show_anthropic_key = not has_anthropic_key and (loop == "ANTHROPIC" or (loop == "OMNI" and model_value and "Claude" in model_value and "Custom" not in model_value))
is_custom_openai_api = model_value == "Custom model (OpenAI compatible API)"
is_custom_ollama = model_value == "Custom model (ollama)"
is_any_custom = is_custom_openai_api or is_custom_ollama
model_choice_value = model_value if model_value else ""
return [
gr.update(visible=openai_visible),
gr.update(visible=anthropic_visible),
gr.update(visible=omni_visible),
gr.update(visible=uitars_visible),
gr.update(visible=show_openai_key),
gr.update(visible=show_anthropic_key),
gr.update(visible=is_any_custom),
gr.update(visible=is_custom_openai_api),
gr.update(visible=is_custom_openai_api),
gr.update(value=model_choice_value)
]
# Custom model inputs
custom_model = gr.Textbox(
label="Custom Model Name",
placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct or llama3)",
value=initial_custom_model,
visible=(initial_model == "Custom model (OpenAI compatible API)" or initial_model == "Custom model (ollama)"),
interactive=True,
)
provider_base_url = gr.Textbox(
label="Provider Base URL",
placeholder="Enter provider base URL (e.g., http://localhost:1234/v1)",
value=initial_provider_base_url,
visible=(initial_model == "Custom model (OpenAI compatible API)"),
interactive=True,
)
provider_api_key = gr.Textbox(
label="Provider API Key",
placeholder="Enter provider API key (if required)",
value="",
visible=(initial_model == "Custom model (OpenAI compatible API)"),
interactive=True,
type="password",
)
# Connect UI update events
for dropdown in [agent_loop, omni_model_choice, uitars_model_choice, openai_model_choice, anthropic_model_choice]:
dropdown.change(
fn=update_ui,
inputs=[agent_loop, openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice],
outputs=[
openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice,
openai_key_group, anthropic_key_group,
custom_model, provider_base_url, provider_api_key,
model_choice
],
queue=False
)
save_trajectory = gr.Checkbox(
label="Save Trajectory",
value=initial_save_trajectory,
info="Save the agent's trajectory for debugging",
interactive=True,
)
recent_images = gr.Slider(
label="Recent Images",
minimum=1,
maximum=10,
value=initial_recent_images,
step=1,
info="Number of recent images to keep in context",
interactive=True,
)
max_budget = gr.Number(
label="Max Budget ($)",
value=lambda: None,
minimum=-1,
maximum=100.0,
step=0.1,
info="Optional budget limit for trajectory (0 = no limit)",
interactive=True,
)
# Right column for chat interface
with gr.Column(scale=2):
gr.Markdown(
"Ask me to perform tasks in a virtual environment.<br>Built with <a href='https://github.com/trycua/cua' target='_blank'>github.com/trycua/cua</a>."
)
chatbot_history = gr.Chatbot(type="messages")
msg = gr.Textbox(
placeholder="Ask me to perform tasks in a virtual environment"
)
clear = gr.Button("Clear")
cancel_button = gr.Button("Cancel", variant="stop")
# Add examples
example_group = gr.Examples(examples=example_messages, inputs=msg)
# Chat submission function
def chat_submit(message, history):
    """Record the user's message in the transcript and clear the input box.

    Args:
        message: Text taken from the input textbox.
        history: Current chat transcript; it is mutated in place.

    Returns:
        A ``("", history)`` pair: the empty string is the new textbox value
        and ``history`` is the same list with the user message appended.
    """
    user_turn = gr.ChatMessage(role="user", content=message)
    history.append(user_turn)
    return "", history
# Cancel function
async def cancel_agent_task(history):
    """Append a cancellation notice to the chat transcript.

    Only the transcript is modified here; nothing is called on the agent
    object itself. Which notice is added depends on whether the module-level
    ``global_agent`` is truthy.

    Args:
        history: Current chat transcript; it is mutated in place.

    Returns:
        The same ``history`` list with one assistant notice appended.
    """
    global global_agent
    if not global_agent:
        notice = gr.ChatMessage(role="assistant", content="No active agent task to cancel", metadata={"title": " Info"})
    else:
        print("DEBUG - Cancelling agent task")
        notice = gr.ChatMessage(role="assistant", content="Task cancelled by user", metadata={"title": "❌ Cancelled"})
    history.append(notice)
    return history
# Process response function
async def process_response(
    history,
    openai_model_value,
    anthropic_model_value,
    omni_model_value,
    uitars_model_value,
    custom_model_value,
    agent_loop_choice,
    save_traj,
    recent_imgs,
    custom_url_value=None,
    custom_api_key=None,
    openai_key_input=None,
    anthropic_key_input=None,
    computer_os="linux",
    computer_provider="cloud",
    container_name="",
    cua_cloud_api_key="",
    max_budget_value=None,
):
    """Run the agent on the latest user message and stream the transcript.

    This is an async generator: it yields ``history`` after every agent
    result so the chat widget can refresh while the agent is working.

    Args:
        history: Chat transcript; the last entry must be the user message
            (read via ``history[-1]["content"]``). Mutated in place.
        openai_model_value, anthropic_model_value, omni_model_value,
        uitars_model_value: Per-loop model dropdown values; the one matching
            ``agent_loop_choice`` is used.
        custom_model_value: Model name used when a "Custom model (...)"
            entry is selected.
        agent_loop_choice: "OPENAI", "ANTHROPIC", "OMNI" or "UITARS"; any
            other value selects the placeholder "No models available".
        save_traj: Forwarded to ``create_agent`` as ``save_trajectory``.
        recent_imgs: Forwarded as ``only_n_most_recent_images``.
        custom_url_value: Only persisted in the saved settings here; it is
            not passed to ``create_agent``.
        custom_api_key: Accepted for the positional UI wiring; not used in
            this function.
        openai_key_input, anthropic_key_input, cua_cloud_api_key: When
            non-empty, exported to ``OPENAI_API_KEY``, ``ANTHROPIC_API_KEY``
            and ``CUA_API_KEY`` respectively.
        computer_os, computer_provider, container_name: Forwarded to
            ``create_agent`` and persisted in the saved settings.
        max_budget_value: Forwarded as ``max_trajectory_budget`` only when
            it is greater than 0; otherwise ``None`` is passed.

    Yields:
        The ``history`` list after each update. Errors are caught, printed
        with a traceback and reported as an assistant "Error: ..." message.
    """
    # Without this declaration the assignment below would bind a local name
    # and cancel_agent_task, which reads the module-level global_agent, could
    # never observe the agent created for this run.
    global global_agent

    # Debug dump helper, imported once instead of on every streamed result.
    from pprint import pprint

    if not history:
        yield history
        return

    # Get the last user message
    last_user_message = history[-1]["content"]

    # Get the appropriate model value based on the agent loop
    if agent_loop_choice == "OPENAI":
        model_choice_value = openai_model_value
    elif agent_loop_choice == "ANTHROPIC":
        model_choice_value = anthropic_model_value
    elif agent_loop_choice == "OMNI":
        model_choice_value = omni_model_value
    elif agent_loop_choice == "UITARS":
        model_choice_value = uitars_model_value
    else:
        model_choice_value = "No models available"

    # Determine if this is a custom model selection
    is_custom_model_selected = model_choice_value in [
        "Custom model (OpenAI compatible API)",
        "Custom model (ollama)",
    ]

    # Determine the model name string to analyze
    if is_custom_model_selected:
        model_string_to_analyze = custom_model_value
    else:
        model_string_to_analyze = model_choice_value

    try:
        # Get the model string
        model_string = get_model_string(model_string_to_analyze, agent_loop_choice)

        # Set API keys if provided
        if openai_key_input:
            os.environ["OPENAI_API_KEY"] = openai_key_input
        if anthropic_key_input:
            os.environ["ANTHROPIC_API_KEY"] = anthropic_key_input
        if cua_cloud_api_key:
            os.environ["CUA_API_KEY"] = cua_cloud_api_key

        # Save settings
        current_settings = {
            "agent_loop": agent_loop_choice,
            "model_choice": model_choice_value,
            "custom_model": custom_model_value,
            "provider_base_url": custom_url_value,
            "save_trajectory": save_traj,
            "recent_images": recent_imgs,
            "computer_os": computer_os,
            "computer_provider": computer_provider,
            "container_name": container_name,
        }
        save_settings(current_settings)

        # Create agent
        global_agent = create_agent(
            model_string=model_string,
            save_trajectory=save_traj,
            only_n_most_recent_images=recent_imgs,
            custom_model_name=custom_model_value if is_custom_model_selected else None,
            computer_os=computer_os,
            computer_provider=computer_provider,
            computer_name=container_name,
            computer_api_key=cua_cloud_api_key,
            verbosity=logging.DEBUG,
            max_trajectory_budget=max_budget_value if max_budget_value and max_budget_value > 0 else None,
        )

        if global_agent is None:
            history.append(
                gr.ChatMessage(
                    role="assistant",
                    content="Failed to create agent. Check API keys and configuration.",
                )
            )
            yield history
            return

        # Create message list for agent
        messages = [{"role": "user", "content": last_user_message}]

        # Stream responses from the agent
        async for result in global_agent.run(messages):
            print("DEBUG - Agent response ------- START")
            pprint(result)
            print("DEBUG - Agent response ------- END")

            # Process the result output
            for item in result.get("output", []):
                item_type = item.get("type")
                if item_type == "message":
                    for content_part in item.get("content", []):
                        if content_part.get("text"):
                            history.append(gr.ChatMessage(
                                role=item.get("role", "assistant"),
                                content=content_part.get("text", ""),
                                metadata=content_part.get("metadata", {}),
                            ))
                elif item_type == "computer_call":
                    action = item.get("action", {})
                    action_type = action.get("type", "")
                    if action_type:
                        action_title = f"🛠️ Performing {action_type}"
                        # Compare against None: 0 is a valid screen coordinate
                        # and must not suppress the position in the title.
                        x, y = action.get("x"), action.get("y")
                        if x is not None and y is not None:
                            action_title += f" at ({x}, {y})"
                        history.append(gr.ChatMessage(
                            role="assistant",
                            content=f"```json\n{json.dumps(action)}\n```",
                            metadata={"title": action_title},
                        ))
                elif item_type == "function_call":
                    function_name = item.get("name", "")
                    arguments = item.get("arguments", "{}")
                    history.append(gr.ChatMessage(
                        role="assistant",
                        content=f"🔧 Calling function: {function_name}\n```json\n{arguments}\n```",
                        metadata={"title": f"Function Call: {function_name}"},
                    ))
                elif item_type == "function_call_output":
                    output = item.get("output", "")
                    history.append(gr.ChatMessage(
                        role="assistant",
                        content=f"📤 Function output:\n```\n{output}\n```",
                        metadata={"title": "Function Output"},
                    ))

            yield history
    except Exception as e:
        import traceback
        traceback.print_exc()
        history.append(gr.ChatMessage(role="assistant", content=f"Error: {e}"))
        yield history
    finally:
        # The run is over (finished, failed or closed by the consumer), so no
        # agent is active any more; clear the shared reference so the cancel
        # handler does not report a stale task.
        global_agent = None
# Connect the submit button.
# Two chained steps: chat_submit (queue=False) appends the user message and
# clears the textbox, then process_response (queue=True) streams agent output
# into the chatbot.
submit_event = msg.submit(
fn=chat_submit,
inputs=[msg, chatbot_history],
outputs=[msg, chatbot_history],
queue=False,
).then(
fn=process_response,
# Inputs are passed positionally: the order below must stay in step with
# the parameter order of process_response.
inputs=[
chatbot_history,
openai_model_choice,
anthropic_model_choice,
omni_model_choice,
uitars_model_choice,
custom_model,
agent_loop,
save_trajectory,
recent_images,
provider_base_url,
provider_api_key,
openai_api_key_input,
anthropic_api_key_input,
computer_os,
computer_provider,
container_name,
cua_cloud_api_key,
max_budget,
],
outputs=[chatbot_history],
queue=True,
)
# Clear button functionality: the handler returns None as the chatbot's value.
clear.click(lambda: None, None, chatbot_history, queue=False)
# Connect cancel button.
# NOTE(review): submit_event is not referenced again in this section and this
# click passes no `cancels=[submit_event]`, so the handler only appends a
# notice to the transcript - confirm whether the running process_response
# generator is meant to be cancelled here.
cancel_button.click(
cancel_agent_task,
[chatbot_history],
[chatbot_history],
queue=False
)
# Code display update function
def update_code_display(agent_loop, model_choice_val, custom_model_val, chat_history, recent_images_val, save_trajectory_val, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget_val):
    """Build the code snippet for the current UI configuration.

    Collects the content of every dict entry in ``chat_history`` whose role
    is "user" and passes it, together with the settings, to
    ``generate_python_code``.

    Args:
        agent_loop: Selected agent loop, forwarded unchanged.
        model_choice_val: Selected model; when falsy, ``custom_model_val``
            is used, and when that is falsy too, "gpt-4o".
        custom_model_val: Custom model name used as the first fallback.
        chat_history: Chat transcript; may be empty or None.
        recent_images_val, save_trajectory_val, computer_os,
        computer_provider, container_name, cua_cloud_api_key,
        max_budget_val: Forwarded unchanged.

    Returns:
        Whatever ``generate_python_code`` returns for these arguments.
    """
    user_prompts = [
        entry.get("content", "")
        for entry in (chat_history or [])
        if isinstance(entry, dict) and entry.get("role") == "user"
    ]
    effective_model = model_choice_val or custom_model_val or "gpt-4o"
    return generate_python_code(
        agent_loop,
        effective_model,
        user_prompts,
        recent_images_val,
        save_trajectory_val,
        computer_os,
        computer_provider,
        container_name,
        cua_cloud_api_key,
        max_budget_val,
    )
# Update code display when configuration changes.
# Every listed component triggers update_code_display on change. The inputs
# list is passed positionally and follows that function's parameter order;
# the result is written to code_display.
for component in [agent_loop, model_choice, custom_model, chatbot_history, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget]:
component.change(
update_code_display,
inputs=[agent_loop, model_choice, custom_model, chatbot_history, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget],
outputs=[code_display]
)
# Hand the assembled UI object back to the caller.
return demo

View File

@@ -1,12 +1,12 @@
"""
Example usage of the agent2 library with docstring-based tool definitions.
Example usage of the agent library with docstring-based tool definitions.
"""
import asyncio
import logging
from agent2 import agent_loop, ComputerAgent
from agent2.types import Messages
from agent import agent_loop, ComputerAgent
from agent.types import Messages
from computer import Computer
from computer.helpers import sandboxed

View File

@@ -4,7 +4,7 @@ build-backend = "pdm.backend"
[project]
name = "cua-agent"
version = "0.4.0"
version = "0.4.0b1"
description = "CUA (Computer Use) Agent for AI-driven computer interaction"
readme = "README.md"
authors = [
@@ -44,6 +44,9 @@ ui = [
"gradio>=5.23.3",
"python-dotenv>=1.0.1",
]
cli = [
"yaspin>=3.1.0",
]
all = [
# omni requirements
"ultralytics>=8.0.0",
@@ -54,6 +57,8 @@ all = [
# ui requirements
"gradio>=5.23.3",
"python-dotenv>=1.0.1",
# cli requirements
"yaspin>=3.1.0",
]
[tool.uv]
@@ -63,4 +68,4 @@ constraint-dependencies = ["fastrtc>0.43.0", "mlx-audio>0.2.3"]
distribution = true
[tool.pdm.build]
includes = ["agent2/"]
includes = ["agent/"]