Mirror of https://github.com/trycua/computer.git (synced 2026-01-10 07:20:10 -06:00)

Commit: renamed to agent
@@ -147,12 +147,12 @@ agent = ComputerAgent(

## Callbacks System

Agent2 provides a comprehensive callback system for extending functionality:
agent provides a comprehensive callback system for extending functionality:

### Built-in Callbacks

```python
from agent2.callbacks import (
from agent.callbacks import (
    ImageRetentionCallback,
    TrajectorySaverCallback,
    BudgetManagerCallback,
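# Editor's note (not part of the diff): judging from the rest of this commit,
# these built-in callbacks appear to back ComputerAgent's convenience kwargs.
# A hedged sketch of that correspondence, with the kwargs taken from cli.py and
# generate_python_code() later in this diff:
#
#   agent = ComputerAgent(
#       model="openai/computer-use-preview",
#       tools=[computer],
#       only_n_most_recent_images=3,       # image retention
#       trajectory_dir="trajectories",     # trajectory saving
#       max_trajectory_budget={"max_budget": 5.0, "raise_error": True},  # budget management
#   )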
@@ -174,7 +174,7 @@ agent = ComputerAgent(
### Custom Callbacks

```python
from agent2.callbacks.base import AsyncCallbackHandler
from agent.callbacks.base import AsyncCallbackHandler

class CustomCallback(AsyncCallbackHandler):
    async def on_llm_start(self, messages):
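# Editor's note (not part of the diff): a minimal illustrative custom callback,
# assuming on_llm_start receives the message list and may return it unchanged
# or modified, mirroring the truncated snippet above:
class MessageCountCallback(AsyncCallbackHandler):
    async def on_llm_start(self, messages):
        # Log how many messages are about to be sent to the LLM.
        print(f"Sending {len(messages)} messages to the LLM")
        return messages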
@@ -1,5 +1,5 @@
"""
Agent2 - Decorator-based Computer Use Agent with liteLLM integration
agent - Decorator-based Computer Use Agent with liteLLM integration
"""

from .decorators import agent_loop
libs/python/agent2/agent/__main__.py (new file, 21 lines)
@@ -0,0 +1,21 @@
"""
Entry point for running agent CLI module.

Usage:
    python -m agent.cli <model_string>
"""

import sys
import asyncio
from .cli import main

if __name__ == "__main__":
    # Check if 'cli' is specified as the module
    if len(sys.argv) > 1 and sys.argv[1] == "cli":
        # Remove 'cli' from arguments and run CLI
        sys.argv.pop(1)
        asyncio.run(main())
    else:
        print("Usage: python -m agent.cli <model_string>")
        print("Example: python -m agent.cli openai/computer-use-preview")
        sys.exit(1)
@@ -1,5 +1,5 @@
"""
Adapters package for agent2 - Custom LLM adapters for LiteLLM
Adapters package for agent - Custom LLM adapters for LiteLLM
"""

from .huggingfacelocal_adapter import HuggingFaceLocalAdapter
@@ -192,7 +192,7 @@ class ComputerAgent:
        ]

        # == Initialize computer agent ==

        # Find the appropriate agent loop
        if custom_loop:
            self.agent_loop = custom_loop
@@ -204,16 +204,26 @@
            self.agent_loop = loop_info.func
            self.agent_loop_info = loop_info

        # Process tools and create tool schemas
        self.tool_schemas = self._process_tools()
        self.tool_schemas = []
        self.computer_handler = None

        # Find computer tool and create interface adapter
        computer_handler = None
        for schema in self.tool_schemas:
            if schema["type"] == "computer":
                computer_handler = OpenAIComputerHandler(schema["computer"].interface)
                break
        self.computer_handler = computer_handler
    async def _initialize_computers(self):
        """Initialize computer objects"""
        if not self.tool_schemas:
            for tool in self.tools:
                if hasattr(tool, '_initialized') and not tool._initialized:
                    await tool.run()

            # Process tools and create tool schemas
            self.tool_schemas = self._process_tools()

            # Find computer tool and create interface adapter
            computer_handler = None
            for schema in self.tool_schemas:
                if schema["type"] == "computer":
                    computer_handler = OpenAIComputerHandler(schema["computer"].interface)
                    break
            self.computer_handler = computer_handler

    def _process_input(self, input: Messages) -> List[Dict[str, Any]]:
        """Process input messages and create schemas for the agent loop"""
@@ -484,6 +494,9 @@ class ComputerAgent:
        Returns:
            AsyncGenerator that yields response chunks
        """

        await self._initialize_computers()

        # Merge kwargs
        merged_kwargs = {**self.kwargs, **kwargs}
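Editor's note (not part of the diff): with this change, tool schemas and the computer handler are no longer built in `__init__`; instead `run()` awaits `_initialize_computers()`, which calls `tool.run()` on any computer that is not yet initialized and only then builds the schemas. A minimal usage sketch under that assumption, mirroring the pattern used in `cli.py` and the Gradio code generator later in this commit (the container name and API key are placeholders):

```python
import asyncio
from agent import ComputerAgent
from computer import Computer

async def main():
    async with Computer(os_type="linux", provider_type="cloud",
                        name="my-container", api_key="cua-api-key") as computer:
        # No explicit schema/handler setup is needed here; the first call to
        # agent.run() performs the lazy initialization shown in the hunk above.
        agent = ComputerAgent(model="openai/computer-use-preview", tools=[computer])
        messages = [{"role": "user", "content": "Take a screenshot and describe the desktop"}]
        async for result in agent.run(messages):
            for item in result["output"]:
                if item["type"] == "message":
                    print(item["content"][0]["text"])

asyncio.run(main())
```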
@@ -54,10 +54,10 @@ class LoggingCallback(AsyncCallbackHandler):
        Initialize the logging callback.

        Args:
            logger: Logger instance to use. If None, creates a logger named 'agent2.ComputerAgent'
            logger: Logger instance to use. If None, creates a logger named 'agent.ComputerAgent'
            level: Logging level (logging.DEBUG, logging.INFO, etc.)
        """
        self.logger = logger or logging.getLogger('agent2.ComputerAgent')
        self.logger = logger or logging.getLogger('agent.ComputerAgent')
        self.level = level

        # Set up logger if it doesn't have handlers
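Editor's note (not part of the diff): given the `__init__` signature visible in this hunk, constructing the callback with an application logger would presumably look like the following sketch (the import path is an assumption, mirroring the callbacks import shown earlier):

```python
import logging
from agent.callbacks import LoggingCallback  # assumed import path

callback = LoggingCallback(logger=logging.getLogger("my_app.agent"), level=logging.DEBUG)
```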
libs/python/agent2/agent/cli.py (new file, 290 lines)
@@ -0,0 +1,290 @@
"""
CLI chat interface for agent - Computer Use Agent

Usage:
    python -m agent.cli <model_string>

Examples:
    python -m agent.cli openai/computer-use-preview
    python -m agent.cli anthropic/claude-3-5-sonnet-20241022
    python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
"""

import asyncio
import argparse
import os
import sys
import json
from typing import List, Dict, Any
import dotenv
from yaspin import yaspin

# Load environment variables
dotenv.load_dotenv()

# Color codes for terminal output
class Colors:
    RESET = '\033[0m'
    BOLD = '\033[1m'
    DIM = '\033[2m'

    # Text colors
    RED = '\033[31m'
    GREEN = '\033[32m'
    YELLOW = '\033[33m'
    BLUE = '\033[34m'
    MAGENTA = '\033[35m'
    CYAN = '\033[36m'
    WHITE = '\033[37m'
    GRAY = '\033[90m'

    # Background colors
    BG_RED = '\033[41m'
    BG_GREEN = '\033[42m'
    BG_YELLOW = '\033[43m'
    BG_BLUE = '\033[44m'


def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = False, end: str = "\n"):
    """Print colored text to terminal."""
    prefix = ""
    if bold:
        prefix += Colors.BOLD
    if dim:
        prefix += Colors.DIM
    if color:
        prefix += color

    print(f"{prefix}{text}{Colors.RESET}", end=end)


def print_action(action_type: str, details: Dict[str, Any]):
    """Print computer action with nice formatting."""
    # Format action details
    args_str = ""
    if action_type == "click" and "x" in details and "y" in details:
        args_str = f"({details['x']}, {details['y']})"
    elif action_type == "type" and "text" in details:
        text = details["text"]
        if len(text) > 50:
            text = text[:47] + "..."
        args_str = f'"{text}"'
    elif action_type == "key" and "key" in details:
        args_str = f"'{details['key']}'"
    elif action_type == "scroll" and "x" in details and "y" in details:
        args_str = f"({details['x']}, {details['y']})"

    print_colored(f"🛠️ {action_type}{args_str}", dim=True)


def print_welcome(model: str, agent_loop: str, container_name: str):
    """Print welcome message."""
    print_colored(f"Connected to {container_name} ({model}, {agent_loop})")
    print_colored("Type 'exit' to quit.", dim=True)

async def ainput(prompt: str = ""):
    return await asyncio.to_thread(input, prompt)

async def chat_loop(agent, model: str, container_name: str):
    """Main chat loop with the agent."""
    print_welcome(model, agent.agent_loop.__name__, container_name)

    history = []

    while True:
        # Get user input with prompt
        print_colored("> ", end="")
        user_input = await ainput()

        if user_input.lower() in ['exit', 'quit', 'q']:
            print_colored("\n👋 Goodbye!")
            break

        if not user_input:
            continue

        # Add user message to history
        history.append({"role": "user", "content": user_input})

        # Stream responses from the agent with spinner
        with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
            spinner.hide()

            async for result in agent.run(history):
                # Add agent responses to history
                history.extend(result.get("output", []))

                # Process and display the output
                for item in result.get("output", []):
                    if item.get("type") == "message":
                        # Display agent text response
                        content = item.get("content", [])
                        for content_part in content:
                            if content_part.get("text"):
                                text = content_part.get("text", "").strip()
                                if text:
                                    spinner.hide()
                                    print_colored(text)

                    elif item.get("type") == "computer_call":
                        # Display computer action
                        action = item.get("action", {})
                        action_type = action.get("type", "")
                        if action_type:
                            spinner.hide()
                            print_action(action_type, action)
                            spinner.text = f"Performing {action_type}..."
                            spinner.show()

                    elif item.get("type") == "function_call":
                        # Display function call
                        function_name = item.get("name", "")
                        spinner.hide()
                        print_colored(f"🔧 Calling function: {function_name}", dim=True)
                        spinner.text = f"Calling {function_name}..."
                        spinner.show()

                    elif item.get("type") == "function_call_output":
                        # Display function output (dimmed)
                        output = item.get("output", "")
                        if output and len(output.strip()) > 0:
                            spinner.hide()
                            print_colored(f"📤 {output}", dim=True)

            spinner.hide()


async def main():
    """Main CLI function."""
    parser = argparse.ArgumentParser(
        description="CUA Agent CLI - Interactive computer use assistant",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python -m agent.cli openai/computer-use-preview
  python -m agent.cli anthropic/claude-3-5-sonnet-20241022
  python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
  python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
        """
    )

    parser.add_argument(
        "model",
        help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')"
    )

    parser.add_argument(
        "--images",
        type=int,
        default=3,
        help="Number of recent images to keep in context (default: 3)"
    )

    parser.add_argument(
        "--trajectory",
        action="store_true",
        help="Save trajectory for debugging"
    )

    parser.add_argument(
        "--budget",
        type=float,
        help="Maximum budget for the session (in dollars)"
    )

    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose logging"
    )

    args = parser.parse_args()

    # Check for required environment variables
    container_name = os.getenv("CUA_CONTAINER_NAME")
    cua_api_key = os.getenv("CUA_API_KEY")

    # Prompt for missing environment variables
    if not container_name:
        print_colored("CUA_CONTAINER_NAME not set.", dim=True)
        print_colored("You can get a CUA container at https://www.trycua.com/", dim=True)
        container_name = input("Enter your CUA container name: ").strip()
        if not container_name:
            print_colored("❌ Container name is required.")
            sys.exit(1)

    if not cua_api_key:
        print_colored("CUA_API_KEY not set.", dim=True)
        cua_api_key = input("Enter your CUA API key: ").strip()
        if not cua_api_key:
            print_colored("❌ API key is required.")
            sys.exit(1)

    # Check for provider-specific API keys based on model
    provider_api_keys = {
        "openai/": "OPENAI_API_KEY",
        "anthropic/": "ANTHROPIC_API_KEY",
        "omniparser+": "OPENAI_API_KEY",
        "omniparser+": "ANTHROPIC_API_KEY",
    }

    # Find matching provider and check for API key
    for prefix, env_var in provider_api_keys.items():
        if args.model.startswith(prefix):
            if not os.getenv(env_var):
                print_colored(f"{env_var} not set.", dim=True)
                api_key = input(f"Enter your {env_var.replace('_', ' ').title()}: ").strip()
                if not api_key:
                    print_colored(f"❌ {env_var.replace('_', ' ').title()} is required.")
                    sys.exit(1)
                # Set the environment variable for the session
                os.environ[env_var] = api_key
            break

    # Import here to avoid import errors if dependencies are missing
    try:
        from agent import ComputerAgent
        from computer import Computer
    except ImportError as e:
        print_colored(f"❌ Import error: {e}", Colors.RED, bold=True)
        print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
        sys.exit(1)

    # Create computer instance
    async with Computer(
        os_type="linux",
        provider_type="cloud",
        name=container_name,
        api_key=cua_api_key
    ) as computer:

        # Create agent
        agent_kwargs = {
            "model": args.model,
            "tools": [computer],
            "only_n_most_recent_images": args.images,
            "verbosity": 20 if args.verbose else 30,  # DEBUG vs WARNING
        }

        if args.trajectory:
            agent_kwargs["trajectory_dir"] = "trajectories"

        if args.budget:
            agent_kwargs["max_trajectory_budget"] = {
                "max_budget": args.budget,
                "raise_error": True,
                "reset_after_each_run": False
            }

        agent = ComputerAgent(**agent_kwargs)

        # Start chat loop
        await chat_loop(agent, args.model, container_name)


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except (KeyboardInterrupt, EOFError) as _:
        print_colored("\n\n👋 Goodbye!")
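Editor's note (not part of the diff): for a concrete sense of how the CLI flags above translate into ComputerAgent arguments, this is the `agent_kwargs` dict `main()` would assemble for a hypothetical `python -m agent.cli openai/computer-use-preview --images 5 --trajectory --budget 2.5 --verbose` invocation:

```python
# Illustrative only; values follow the flag handling in main() above.
agent_kwargs = {
    "model": "openai/computer-use-preview",
    "tools": [computer],               # the cloud Computer built from CUA_CONTAINER_NAME / CUA_API_KEY
    "only_n_most_recent_images": 5,    # --images 5
    "verbosity": 20,                   # --verbose => 20, otherwise 30; the source comments these as
                                       # DEBUG vs WARNING, though numerically 20 is logging.INFO
    "trajectory_dir": "trajectories",  # --trajectory
    "max_trajectory_budget": {         # --budget 2.5
        "max_budget": 2.5,
        "raise_error": True,
        "reset_after_each_run": False,
    },
}
```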
@@ -1,5 +1,5 @@
"""
Decorators for agent2 - agent_loop decorator
Decorators for agent - agent_loop decorator
"""

import asyncio
@@ -1,5 +1,5 @@
"""
Agent loops for agent2
Agent loops for agent
"""

# Import the loops to register them
@@ -1,5 +1,5 @@
"""
Type definitions for agent2
Type definitions for agent
"""

from typing import Dict, List, Any, Optional, Callable, Protocol, Literal
libs/python/agent2/agent/ui/__init__.py (new file, 7 lines)
@@ -0,0 +1,7 @@
"""
UI components for agent
"""

from .gradio import test_cua, create_gradio_ui

__all__ = ["test_cua", "create_gradio_ui"]
libs/python/agent2/agent/ui/gradio/__init__.py (new file, 8 lines)
@@ -0,0 +1,8 @@
"""
Gradio UI for agent
"""

from .app import test_cua
from .ui_components import create_gradio_ui

__all__ = ["test_cua", "create_gradio_ui"]
libs/python/agent2/agent/ui/gradio/app.py (new file, 248 lines)
@@ -0,0 +1,248 @@
"""
Advanced Gradio UI for Computer-Use Agent (cua-agent)

This is a Gradio interface for the Computer-Use Agent v0.4.x (cua-agent)
with an advanced UI for model selection and configuration.

Supported Agent Models:
- OpenAI: openai/computer-use-preview
- Anthropic: anthropic/claude-3-5-sonnet-20241022, anthropic/claude-3-7-sonnet-20250219
- UI-TARS: huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
- Omniparser: omniparser+anthropic/claude-3-5-sonnet-20241022, omniparser+ollama_chat/gemma3

Requirements:
- Mac with Apple Silicon (M1/M2/M3/M4), Linux, or Windows
- macOS 14 (Sonoma) or newer / Ubuntu 20.04+
- Python 3.11+
- Lume CLI installed (https://github.com/trycua/cua)
- OpenAI or Anthropic API key
"""

import os
import asyncio
import logging
import json
import platform
from pathlib import Path
from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple, Union
import gradio as gr
from gradio.components.chatbot import MetadataDict
from typing import cast

# Import from agent package
from agent import ComputerAgent
from agent.types import Messages, AgentResponse
from computer import Computer

# Global variables
global_agent = None
global_computer = None
SETTINGS_FILE = Path(".gradio_settings.json")


import dotenv
if dotenv.load_dotenv():
    print(f"DEBUG - Loaded environment variables from {dotenv.find_dotenv()}")
else:
    print("DEBUG - No .env file found")

# --- Settings Load/Save Functions ---
def load_settings() -> Dict[str, Any]:
    """Loads settings from the JSON file."""
    if SETTINGS_FILE.exists():
        try:
            with open(SETTINGS_FILE, "r") as f:
                settings = json.load(f)
                if isinstance(settings, dict):
                    print(f"DEBUG - Loaded settings from {SETTINGS_FILE}")
                    return settings
        except (json.JSONDecodeError, IOError) as e:
            print(f"Warning: Could not load settings from {SETTINGS_FILE}: {e}")
    return {}


def save_settings(settings: Dict[str, Any]):
    """Saves settings to the JSON file."""
    settings.pop("provider_api_key", None)
    try:
        with open(SETTINGS_FILE, "w") as f:
            json.dump(settings, f, indent=4)
        print(f"DEBUG - Saved settings to {SETTINGS_FILE}")
    except IOError as e:
        print(f"Warning: Could not save settings to {SETTINGS_FILE}: {e}")


# Custom Screenshot Handler for Gradio chat
class GradioChatScreenshotHandler:
    """Custom handler that adds screenshots to the Gradio chatbot."""

    def __init__(self, chatbot_history: List[gr.ChatMessage]):
        self.chatbot_history = chatbot_history
        print("GradioChatScreenshotHandler initialized")

    async def on_screenshot(self, screenshot_base64: str, action_type: str = "") -> None:
        """Add screenshot to chatbot when a screenshot is taken."""
        image_markdown = f""

        if self.chatbot_history is not None:
            self.chatbot_history.append(
                gr.ChatMessage(
                    role="assistant",
                    content=image_markdown,
                    metadata={"title": f"🖥️ Screenshot - {action_type}", "status": "done"},
                )
            )


# Detect platform capabilities
is_mac = platform.system().lower() == "darwin"
is_lume_available = is_mac or (os.environ.get("PYLUME_HOST", "localhost") != "localhost")

print("PYLUME_HOST: ", os.environ.get("PYLUME_HOST", "localhost"))
print("is_mac: ", is_mac)
print("Lume available: ", is_lume_available)

# Map model names to agent model strings
MODEL_MAPPINGS = {
    "openai": {
        "default": "openai/computer-use-preview",
        "OpenAI: Computer-Use Preview": "openai/computer-use-preview",
    },
    "anthropic": {
        "default": "anthropic/claude-3-7-sonnet-20250219",
        "Anthropic: Claude 4 Opus (20250514)": "anthropic/claude-opus-4-20250514",
        "Anthropic: Claude 4 Sonnet (20250514)": "anthropic/claude-sonnet-4-20250514",
        "Anthropic: Claude 3.7 Sonnet (20250219)": "anthropic/claude-3-7-sonnet-20250219",
        "Anthropic: Claude 3.5 Sonnet (20240620)": "anthropic/claude-3-5-sonnet-20240620",
    },
    "omni": {
        "default": "omniparser+openai/gpt-4o",
        "OMNI: OpenAI GPT-4o": "omniparser+openai/gpt-4o",
        "OMNI: OpenAI GPT-4o mini": "omniparser+openai/gpt-4o-mini",
        "OMNI: Claude 3.7 Sonnet (20250219)": "omniparser+anthropic/claude-3-7-sonnet-20250219",
        "OMNI: Claude 3.5 Sonnet (20240620)": "omniparser+anthropic/claude-3-5-sonnet-20240620",
    },
    "uitars": {
        "default": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B" if is_mac else "ui-tars",
        "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
    },
}


def get_model_string(model_name: str, loop_provider: str) -> str:
    """Determine the agent model string based on the input."""
    if model_name == "Custom model (OpenAI compatible API)":
        return "custom_oaicompat"
    elif model_name == "Custom model (ollama)":
        return "custom_ollama"
    elif loop_provider == "OMNI-OLLAMA" or model_name.startswith("OMNI: Ollama "):
        if model_name.startswith("OMNI: Ollama "):
            ollama_model = model_name.split("OMNI: Ollama ", 1)[1]
            return f"omniparser+ollama_chat/{ollama_model}"
        return "omniparser+ollama_chat/llama3"

    # Map based on loop provider
    mapping = MODEL_MAPPINGS.get(loop_provider.lower(), MODEL_MAPPINGS["openai"])
    return mapping.get(model_name, mapping["default"])


def get_ollama_models() -> List[str]:
    """Get available models from Ollama if installed."""
    try:
        import subprocess
        result = subprocess.run(["ollama", "list"], capture_output=True, text=True)
        if result.returncode == 0:
            lines = result.stdout.strip().split("\n")
            if len(lines) < 2:
                return []
            models = []
            for line in lines[1:]:
                parts = line.split()
                if parts:
                    model_name = parts[0]
                    models.append(f"OMNI: Ollama {model_name}")
            return models
        return []
    except Exception as e:
        logging.error(f"Error getting Ollama models: {e}")
        return []


def create_computer_instance(
    verbosity: int = logging.INFO,
    os_type: str = "macos",
    provider_type: str = "lume",
    name: Optional[str] = None,
    api_key: Optional[str] = None
) -> Computer:
    """Create or get the global Computer instance."""
    global global_computer
    if global_computer is None:
        global_computer = Computer(
            verbosity=verbosity,
            os_type=os_type,
            provider_type=provider_type,
            name=name if name else "",
            api_key=api_key
        )
    return global_computer


def create_agent(
    model_string: str,
    save_trajectory: bool = True,
    only_n_most_recent_images: int = 3,
    verbosity: int = logging.INFO,
    custom_model_name: Optional[str] = None,
    computer_os: str = "macos",
    computer_provider: str = "lume",
    computer_name: Optional[str] = None,
    computer_api_key: Optional[str] = None,
    max_trajectory_budget: Optional[float] = None,
) -> ComputerAgent:
    """Create or update the global agent with the specified parameters."""
    global global_agent

    # Create the computer
    computer = create_computer_instance(
        verbosity=verbosity,
        os_type=computer_os,
        provider_type=computer_provider,
        name=computer_name,
        api_key=computer_api_key
    )

    # Handle custom models
    if model_string == "custom_oaicompat" and custom_model_name:
        model_string = custom_model_name
    elif model_string == "custom_ollama" and custom_model_name:
        model_string = f"omniparser+ollama_chat/{custom_model_name}"

    # Create agent kwargs
    agent_kwargs = {
        "model": model_string,
        "tools": [computer],
        "only_n_most_recent_images": only_n_most_recent_images,
        "verbosity": verbosity,
    }

    if save_trajectory:
        agent_kwargs["trajectory_dir"] = "trajectories"

    if max_trajectory_budget:
        agent_kwargs["max_trajectory_budget"] = {"max_budget": max_trajectory_budget, "raise_error": True}

    global_agent = ComputerAgent(**agent_kwargs)
    return global_agent


def test_cua():
    """Standalone function to launch the Gradio app."""
    from agent.ui.gradio.ui_components import create_gradio_ui
    print(f"Starting Gradio app for CUA Agent...")
    demo = create_gradio_ui()
    demo.launch(share=False, inbrowser=True)


if __name__ == "__main__":
    test_cua()
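Editor's note (not part of the diff): a few illustrative inputs and the strings `get_model_string()` returns under the `MODEL_MAPPINGS` above, shown for clarity:

```python
# Illustrative only - derived from MODEL_MAPPINGS and get_model_string() above.
assert get_model_string("OpenAI: Computer-Use Preview", "OPENAI") == "openai/computer-use-preview"
assert get_model_string("OMNI: Ollama llama3", "OMNI") == "omniparser+ollama_chat/llama3"
assert get_model_string("Custom model (ollama)", "OMNI") == "custom_ollama"
# Unrecognized names fall back to the loop provider's "default" entry:
assert get_model_string("some-unrecognized-name", "ANTHROPIC") == "anthropic/claude-3-7-sonnet-20250219"
```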
libs/python/agent2/agent/ui/gradio/ui_components.py (new file, 703 lines)
@@ -0,0 +1,703 @@
"""
UI Components for the Gradio interface
"""

import os
import asyncio
import logging
import json
import platform
from pathlib import Path
from typing import Dict, List, Optional, Any, cast
import gradio as gr
from gradio.components.chatbot import MetadataDict

from .app import (
    load_settings, save_settings, create_agent, get_model_string,
    get_ollama_models, GradioChatScreenshotHandler, global_agent, global_computer
)


def create_gradio_ui() -> gr.Blocks:
    """Create a Gradio UI for the Computer-Use Agent."""

    # Load settings
    saved_settings = load_settings()

    # Check for API keys
    openai_api_key = os.environ.get("OPENAI_API_KEY", "")
    anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
    cua_api_key = os.environ.get("CUA_API_KEY", "")

    # Model choices
    openai_models = ["OpenAI: Computer-Use Preview"]
    anthropic_models = [
        "Anthropic: Claude 4 Opus (20250514)",
        "Anthropic: Claude 4 Sonnet (20250514)",
        "Anthropic: Claude 3.7 Sonnet (20250219)",
        "Anthropic: Claude 3.5 Sonnet (20240620)",
    ]
    omni_models = [
        "OMNI: OpenAI GPT-4o",
        "OMNI: OpenAI GPT-4o mini",
        "OMNI: Claude 3.7 Sonnet (20250219)",
        "OMNI: Claude 3.5 Sonnet (20240620)"
    ]

    # Check if API keys are available
    has_openai_key = bool(openai_api_key)
    has_anthropic_key = bool(anthropic_api_key)
    has_cua_key = bool(cua_api_key)

    # Get Ollama models for OMNI
    ollama_models = get_ollama_models()
    if ollama_models:
        omni_models += ollama_models

    # Detect platform
    is_mac = platform.system().lower() == "darwin"

    # Format model choices
    provider_to_models = {
        "OPENAI": openai_models,
        "ANTHROPIC": anthropic_models,
        "OMNI": omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
        "UITARS": ([
            "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
        ] if is_mac else []) + ["Custom model (OpenAI compatible API)"],
    }

    # Apply saved settings
    initial_loop = saved_settings.get("agent_loop", "OMNI")
    available_models_for_loop = provider_to_models.get(initial_loop, [])
    saved_model_choice = saved_settings.get("model_choice")
    if saved_model_choice and saved_model_choice in available_models_for_loop:
        initial_model = saved_model_choice
    else:
        if initial_loop == "OPENAI":
            initial_model = openai_models[0] if openai_models else "No models available"
        elif initial_loop == "ANTHROPIC":
            initial_model = anthropic_models[0] if anthropic_models else "No models available"
        else:  # OMNI
            initial_model = omni_models[0] if omni_models else "Custom model (OpenAI compatible API)"

    initial_custom_model = saved_settings.get("custom_model", "Qwen2.5-VL-7B-Instruct")
    initial_provider_base_url = saved_settings.get("provider_base_url", "http://localhost:1234/v1")
    initial_save_trajectory = saved_settings.get("save_trajectory", True)
    initial_recent_images = saved_settings.get("recent_images", 3)

    # Example prompts
    example_messages = [
        "Create a Python virtual environment, install pandas and matplotlib, then plot stock data",
        "Open a PDF in Preview, add annotations, and save it as a compressed version",
        "Open Safari, search for 'macOS automation tools', and save the first three results as bookmarks",
        "Configure SSH keys and set up a connection to a remote server",
    ]

    def generate_python_code(agent_loop_choice, model_name, tasks, recent_images=3, save_trajectory=True, computer_os="linux", computer_provider="cloud", container_name="", cua_cloud_api_key="", max_budget=None):
        """Generate Python code for the current configuration and tasks."""
        tasks_str = ""
        for task in tasks:
            if task and task.strip():
                tasks_str += f'        "{task}",\n'

        model_string = get_model_string(model_name, agent_loop_choice)

        computer_args = []
        if computer_os != "macos":
            computer_args.append(f'os_type="{computer_os}"')
        if computer_provider != "lume":
            computer_args.append(f'provider_type="{computer_provider}"')
        if container_name:
            computer_args.append(f'name="{container_name}"')
        if cua_cloud_api_key:
            computer_args.append(f'api_key="{cua_cloud_api_key}"')

        computer_args_str = ", ".join(computer_args)
        if computer_args_str:
            computer_args_str = f"({computer_args_str})"
        else:
            computer_args_str = "()"

        code = f'''import asyncio
from computer import Computer
from agent import ComputerAgent

async def main():
    async with Computer{computer_args_str} as computer:
        agent = ComputerAgent(
            model="{model_string}",
            tools=[computer],
            only_n_most_recent_images={recent_images},'''

        if save_trajectory:
            code += '''
            trajectory_dir="trajectories",'''

        if max_budget:
            code += f'''
            max_trajectory_budget={{"max_budget": {max_budget}, "raise_error": True}},'''

        code += '''
        )
'''

        if tasks_str:
            code += f'''
        # Prompts for the computer-use agent
        tasks = [
{tasks_str.rstrip()}
        ]

        for task in tasks:
            print(f"Executing task: {{task}}")
            messages = [{{"role": "user", "content": task}}]
            async for result in agent.run(messages):
                for item in result["output"]:
                    if item["type"] == "message":
                        print(item["content"][0]["text"])'''
        else:
            code += f'''
        # Execute a single task
        task = "Search for information about CUA on GitHub"
        print(f"Executing task: {{task}}")
        messages = [{{"role": "user", "content": task}}]
        async for result in agent.run(messages):
            for item in result["output"]:
                if item["type"] == "message":
                    print(item["content"][0]["text"])'''

        code += '''

if __name__ == "__main__":
    asyncio.run(main())'''

        return code

    # Create the Gradio interface
    with gr.Blocks(title="Computer-Use Agent") as demo:
        with gr.Row():
            # Left column for settings
            with gr.Column(scale=1):
                # Logo
                gr.HTML(
                    """
                    <div style="display: flex; justify-content: center; margin-bottom: 0.5em">
                        <img alt="CUA Logo" style="width: 80px;"
                             src="https://github.com/trycua/cua/blob/main/img/logo_black.png?raw=true" />
                    </div>
                    """
                )

                # Python code accordion
                with gr.Accordion("Python Code", open=False):
                    code_display = gr.Code(
                        language="python",
                        value=generate_python_code(initial_loop, "gpt-4o", []),
                        interactive=False,
                    )

                with gr.Accordion("Computer Configuration", open=True):
                    computer_os = gr.Radio(
                        choices=["macos", "linux", "windows"],
                        label="Operating System",
                        value="macos",
                        info="Select the operating system for the computer",
                    )

                    is_windows = platform.system().lower() == "windows"
                    is_mac = platform.system().lower() == "darwin"

                    providers = ["cloud"]
                    if is_mac:
                        providers += ["lume"]
                    if is_windows:
                        providers += ["winsandbox"]

                    computer_provider = gr.Radio(
                        choices=providers,
                        label="Provider",
                        value="lume" if is_mac else "cloud",
                        info="Select the computer provider",
                    )

                    container_name = gr.Textbox(
                        label="Container Name",
                        placeholder="Enter container name (optional)",
                        value=os.environ.get("CUA_CONTAINER_NAME", ""),
                        info="Optional name for the container",
                    )

                    cua_cloud_api_key = gr.Textbox(
                        label="CUA Cloud API Key",
                        placeholder="Enter your CUA Cloud API key",
                        value=os.environ.get("CUA_API_KEY", ""),
                        type="password",
                        info="Required for cloud provider",
                        visible=(not has_cua_key)
                    )

                with gr.Accordion("Agent Configuration", open=True):
                    agent_loop = gr.Dropdown(
                        choices=["OPENAI", "ANTHROPIC", "OMNI", "UITARS"],
                        label="Agent Loop",
                        value=initial_loop,
                        info="Select the agent loop provider",
                    )

                    # Model selection dropdowns
                    with gr.Group() as model_selection_group:
                        openai_model_choice = gr.Dropdown(
                            choices=openai_models,
                            label="OpenAI Model",
                            value=openai_models[0] if openai_models else "No models available",
                            info="Select OpenAI model",
                            interactive=True,
                            visible=(initial_loop == "OPENAI")
                        )

                        anthropic_model_choice = gr.Dropdown(
                            choices=anthropic_models,
                            label="Anthropic Model",
                            value=anthropic_models[0] if anthropic_models else "No models available",
                            info="Select Anthropic model",
                            interactive=True,
                            visible=(initial_loop == "ANTHROPIC")
                        )

                        omni_model_choice = gr.Dropdown(
                            choices=omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
                            label="OMNI Model",
                            value=omni_models[0] if omni_models else "Custom model (OpenAI compatible API)",
                            info="Select OMNI model or choose a custom model option",
                            interactive=True,
                            visible=(initial_loop == "OMNI")
                        )

                        uitars_model_choice = gr.Dropdown(
                            choices=provider_to_models.get("UITARS", ["No models available"]),
                            label="UITARS Model",
                            value=provider_to_models.get("UITARS", ["No models available"])[0] if provider_to_models.get("UITARS") else "No models available",
                            info="Select UITARS model",
                            interactive=True,
                            visible=(initial_loop == "UITARS")
                        )

                    model_choice = gr.Textbox(visible=False)

                    # API key inputs
                    with gr.Group(visible=not has_openai_key and (initial_loop == "OPENAI" or initial_loop == "OMNI")) as openai_key_group:
                        openai_api_key_input = gr.Textbox(
                            label="OpenAI API Key",
                            placeholder="Enter your OpenAI API key",
                            value=os.environ.get("OPENAI_API_KEY", ""),
                            interactive=True,
                            type="password",
                            info="Required for OpenAI models"
                        )

                    with gr.Group(visible=not has_anthropic_key and (initial_loop == "ANTHROPIC" or initial_loop == "OMNI")) as anthropic_key_group:
                        anthropic_api_key_input = gr.Textbox(
                            label="Anthropic API Key",
                            placeholder="Enter your Anthropic API key",
                            value=os.environ.get("ANTHROPIC_API_KEY", ""),
                            interactive=True,
                            type="password",
                            info="Required for Anthropic models"
                        )

                    # API key handlers
                    def set_openai_api_key(key):
                        if key and key.strip():
                            os.environ["OPENAI_API_KEY"] = key.strip()
                            print(f"DEBUG - Set OpenAI API key environment variable")
                        return key

                    def set_anthropic_api_key(key):
                        if key and key.strip():
                            os.environ["ANTHROPIC_API_KEY"] = key.strip()
                            print(f"DEBUG - Set Anthropic API key environment variable")
                        return key

                    openai_api_key_input.change(
                        fn=set_openai_api_key,
                        inputs=[openai_api_key_input],
                        outputs=[openai_api_key_input],
                        queue=False
                    )

                    anthropic_api_key_input.change(
                        fn=set_anthropic_api_key,
                        inputs=[anthropic_api_key_input],
                        outputs=[anthropic_api_key_input],
                        queue=False
                    )

                    # UI update function
                    def update_ui(loop=None, openai_model=None, anthropic_model=None, omni_model=None, uitars_model=None):
                        loop = loop or agent_loop.value

                        model_value = None
                        if loop == "OPENAI" and openai_model:
                            model_value = openai_model
                        elif loop == "ANTHROPIC" and anthropic_model:
                            model_value = anthropic_model
                        elif loop == "OMNI" and omni_model:
                            model_value = omni_model
                        elif loop == "UITARS" and uitars_model:
                            model_value = uitars_model

                        openai_visible = (loop == "OPENAI")
                        anthropic_visible = (loop == "ANTHROPIC")
                        omni_visible = (loop == "OMNI")
                        uitars_visible = (loop == "UITARS")

                        show_openai_key = not has_openai_key and (loop == "OPENAI" or (loop == "OMNI" and model_value and "OpenAI" in model_value and "Custom" not in model_value))
                        show_anthropic_key = not has_anthropic_key and (loop == "ANTHROPIC" or (loop == "OMNI" and model_value and "Claude" in model_value and "Custom" not in model_value))

                        is_custom_openai_api = model_value == "Custom model (OpenAI compatible API)"
                        is_custom_ollama = model_value == "Custom model (ollama)"
                        is_any_custom = is_custom_openai_api or is_custom_ollama

                        model_choice_value = model_value if model_value else ""

                        return [
                            gr.update(visible=openai_visible),
                            gr.update(visible=anthropic_visible),
                            gr.update(visible=omni_visible),
                            gr.update(visible=uitars_visible),
                            gr.update(visible=show_openai_key),
                            gr.update(visible=show_anthropic_key),
                            gr.update(visible=is_any_custom),
                            gr.update(visible=is_custom_openai_api),
                            gr.update(visible=is_custom_openai_api),
                            gr.update(value=model_choice_value)
                        ]

                    # Custom model inputs
                    custom_model = gr.Textbox(
                        label="Custom Model Name",
                        placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct or llama3)",
                        value=initial_custom_model,
                        visible=(initial_model == "Custom model (OpenAI compatible API)" or initial_model == "Custom model (ollama)"),
                        interactive=True,
                    )

                    provider_base_url = gr.Textbox(
                        label="Provider Base URL",
                        placeholder="Enter provider base URL (e.g., http://localhost:1234/v1)",
                        value=initial_provider_base_url,
                        visible=(initial_model == "Custom model (OpenAI compatible API)"),
                        interactive=True,
                    )

                    provider_api_key = gr.Textbox(
                        label="Provider API Key",
                        placeholder="Enter provider API key (if required)",
                        value="",
                        visible=(initial_model == "Custom model (OpenAI compatible API)"),
                        interactive=True,
                        type="password",
                    )

                    # Connect UI update events
                    for dropdown in [agent_loop, omni_model_choice, uitars_model_choice, openai_model_choice, anthropic_model_choice]:
                        dropdown.change(
                            fn=update_ui,
                            inputs=[agent_loop, openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice],
                            outputs=[
                                openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice,
                                openai_key_group, anthropic_key_group,
                                custom_model, provider_base_url, provider_api_key,
                                model_choice
                            ],
                            queue=False
                        )

                    save_trajectory = gr.Checkbox(
                        label="Save Trajectory",
                        value=initial_save_trajectory,
                        info="Save the agent's trajectory for debugging",
                        interactive=True,
                    )

                    recent_images = gr.Slider(
                        label="Recent Images",
                        minimum=1,
                        maximum=10,
                        value=initial_recent_images,
                        step=1,
                        info="Number of recent images to keep in context",
                        interactive=True,
                    )

                    max_budget = gr.Number(
                        label="Max Budget ($)",
                        value=lambda: None,
                        minimum=-1,
                        maximum=100.0,
                        step=0.1,
                        info="Optional budget limit for trajectory (0 = no limit)",
                        interactive=True,
                    )

            # Right column for chat interface
            with gr.Column(scale=2):
                gr.Markdown(
                    "Ask me to perform tasks in a virtual environment.<br>Built with <a href='https://github.com/trycua/cua' target='_blank'>github.com/trycua/cua</a>."
                )

                chatbot_history = gr.Chatbot(type="messages")
                msg = gr.Textbox(
                    placeholder="Ask me to perform tasks in a virtual environment"
                )
                clear = gr.Button("Clear")
                cancel_button = gr.Button("Cancel", variant="stop")

                # Add examples
                example_group = gr.Examples(examples=example_messages, inputs=msg)

                # Chat submission function
                def chat_submit(message, history):
                    history.append(gr.ChatMessage(role="user", content=message))
                    return "", history

                # Cancel function
                async def cancel_agent_task(history):
                    global global_agent
                    if global_agent:
                        print("DEBUG - Cancelling agent task")
                        history.append(gr.ChatMessage(role="assistant", content="Task cancelled by user", metadata={"title": "❌ Cancelled"}))
                    else:
                        history.append(gr.ChatMessage(role="assistant", content="No active agent task to cancel", metadata={"title": "ℹ️ Info"}))
                    return history

                # Process response function
                async def process_response(
                    history,
                    openai_model_value,
                    anthropic_model_value,
                    omni_model_value,
                    uitars_model_value,
                    custom_model_value,
                    agent_loop_choice,
                    save_traj,
                    recent_imgs,
                    custom_url_value=None,
                    custom_api_key=None,
                    openai_key_input=None,
                    anthropic_key_input=None,
                    computer_os="linux",
                    computer_provider="cloud",
                    container_name="",
                    cua_cloud_api_key="",
                    max_budget_value=None,
                ):
                    if not history:
                        yield history
                        return

                    # Get the last user message
                    last_user_message = history[-1]["content"]

                    # Get the appropriate model value based on the agent loop
                    if agent_loop_choice == "OPENAI":
                        model_choice_value = openai_model_value
                    elif agent_loop_choice == "ANTHROPIC":
                        model_choice_value = anthropic_model_value
                    elif agent_loop_choice == "OMNI":
                        model_choice_value = omni_model_value
                    elif agent_loop_choice == "UITARS":
                        model_choice_value = uitars_model_value
                    else:
                        model_choice_value = "No models available"

                    # Determine if this is a custom model selection
                    is_custom_model_selected = model_choice_value in ["Custom model (OpenAI compatible API)", "Custom model (ollama)"]

                    # Determine the model name string to analyze
                    if is_custom_model_selected:
                        model_string_to_analyze = custom_model_value
                    else:
                        model_string_to_analyze = model_choice_value

                    try:
                        # Get the model string
                        model_string = get_model_string(model_string_to_analyze, agent_loop_choice)

                        # Set API keys if provided
                        if openai_key_input:
                            os.environ["OPENAI_API_KEY"] = openai_key_input
                        if anthropic_key_input:
                            os.environ["ANTHROPIC_API_KEY"] = anthropic_key_input
                        if cua_cloud_api_key:
                            os.environ["CUA_API_KEY"] = cua_cloud_api_key

                        # Save settings
                        current_settings = {
                            "agent_loop": agent_loop_choice,
                            "model_choice": model_choice_value,
                            "custom_model": custom_model_value,
                            "provider_base_url": custom_url_value,
                            "save_trajectory": save_traj,
                            "recent_images": recent_imgs,
                            "computer_os": computer_os,
                            "computer_provider": computer_provider,
                            "container_name": container_name,
                        }
                        save_settings(current_settings)

                        # Create agent
                        global_agent = create_agent(
                            model_string=model_string,
                            save_trajectory=save_traj,
                            only_n_most_recent_images=recent_imgs,
                            custom_model_name=custom_model_value if is_custom_model_selected else None,
                            computer_os=computer_os,
                            computer_provider=computer_provider,
                            computer_name=container_name,
                            computer_api_key=cua_cloud_api_key,
                            verbosity=logging.DEBUG,
                            max_trajectory_budget=max_budget_value if max_budget_value and max_budget_value > 0 else None,
                        )

                        if global_agent is None:
                            history.append(
                                gr.ChatMessage(
                                    role="assistant",
                                    content="Failed to create agent. Check API keys and configuration.",
                                )
                            )
                            yield history
                            return

                        # Create message list for agent
                        messages = [{"role": "user", "content": last_user_message}]

                        # Stream responses from the agent
                        async for result in global_agent.run(messages):
                            print(f"DEBUG - Agent response ------- START")
                            from pprint import pprint
                            pprint(result)
                            print(f"DEBUG - Agent response ------- END")

                            # Process the result output
                            for item in result.get("output", []):
                                if item.get("type") == "message":
                                    content = item.get("content", [])
                                    for content_part in content:
                                        if content_part.get("text"):
                                            history.append(gr.ChatMessage(
                                                role=item.get("role", "assistant"),
                                                content=content_part.get("text", ""),
                                                metadata=content_part.get("metadata", {})
                                            ))
                                elif item.get("type") == "computer_call":
                                    action = item.get("action", {})
                                    action_type = action.get("type", "")
                                    if action_type:
                                        action_title = f"🛠️ Performing {action_type}"
                                        if action.get("x") and action.get("y"):
                                            action_title += f" at ({action['x']}, {action['y']})"
                                        history.append(gr.ChatMessage(
                                            role="assistant",
                                            content=f"```json\n{json.dumps(action)}\n```",
                                            metadata={"title": action_title}
                                        ))
                                elif item.get("type") == "function_call":
                                    function_name = item.get("name", "")
                                    arguments = item.get("arguments", "{}")
                                    history.append(gr.ChatMessage(
                                        role="assistant",
                                        content=f"🔧 Calling function: {function_name}\n```json\n{arguments}\n```",
                                        metadata={"title": f"Function Call: {function_name}"}
                                    ))
                                elif item.get("type") == "function_call_output":
                                    output = item.get("output", "")
                                    history.append(gr.ChatMessage(
                                        role="assistant",
                                        content=f"📤 Function output:\n```\n{output}\n```",
                                        metadata={"title": "Function Output"}
                                    ))

                            yield history

                    except Exception as e:
                        import traceback
                        traceback.print_exc()
                        history.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}"))
                        yield history

                # Connect the submit button
                submit_event = msg.submit(
                    fn=chat_submit,
                    inputs=[msg, chatbot_history],
                    outputs=[msg, chatbot_history],
                    queue=False,
                ).then(
                    fn=process_response,
                    inputs=[
                        chatbot_history,
                        openai_model_choice,
                        anthropic_model_choice,
                        omni_model_choice,
                        uitars_model_choice,
                        custom_model,
                        agent_loop,
                        save_trajectory,
                        recent_images,
                        provider_base_url,
                        provider_api_key,
                        openai_api_key_input,
                        anthropic_api_key_input,
                        computer_os,
                        computer_provider,
                        container_name,
                        cua_cloud_api_key,
                        max_budget,
                    ],
                    outputs=[chatbot_history],
                    queue=True,
                )

                # Clear button functionality
                clear.click(lambda: None, None, chatbot_history, queue=False)

                # Connect cancel button
                cancel_button.click(
                    cancel_agent_task,
                    [chatbot_history],
                    [chatbot_history],
                    queue=False
                )

                # Code display update function
                def update_code_display(agent_loop, model_choice_val, custom_model_val, chat_history, recent_images_val, save_trajectory_val, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget_val):
                    messages = []
                    if chat_history:
                        for msg in chat_history:
                            if isinstance(msg, dict) and msg.get("role") == "user":
                                messages.append(msg.get("content", ""))

                    return generate_python_code(
                        agent_loop,
                        model_choice_val or custom_model_val or "gpt-4o",
                        messages,
                        recent_images_val,
                        save_trajectory_val,
                        computer_os,
                        computer_provider,
                        container_name,
                        cua_cloud_api_key,
                        max_budget_val
                    )

                # Update code display when configuration changes
                for component in [agent_loop, model_choice, custom_model, chatbot_history, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget]:
                    component.change(
                        update_code_display,
                        inputs=[agent_loop, model_choice, custom_model, chatbot_history, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget],
                        outputs=[code_display]
                    )

    return demo
@@ -1,12 +1,12 @@
"""
Example usage of the agent2 library with docstring-based tool definitions.
Example usage of the agent library with docstring-based tool definitions.
"""

import asyncio
import logging

from agent2 import agent_loop, ComputerAgent
from agent2.types import Messages
from agent import agent_loop, ComputerAgent
from agent.types import Messages
from computer import Computer
from computer.helpers import sandboxed
@@ -4,7 +4,7 @@ build-backend = "pdm.backend"

[project]
name = "cua-agent"
version = "0.4.0"
version = "0.4.0b1"
description = "CUA (Computer Use) Agent for AI-driven computer interaction"
readme = "README.md"
authors = [
@@ -44,6 +44,9 @@ ui = [
    "gradio>=5.23.3",
    "python-dotenv>=1.0.1",
]
cli = [
    "yaspin>=3.1.0",
]
all = [
    # omni requirements
    "ultralytics>=8.0.0",
@@ -54,6 +57,8 @@ all = [
    # ui requirements
    "gradio>=5.23.3",
    "python-dotenv>=1.0.1",
    # cli requirements
    "yaspin>=3.1.0",
]

[tool.uv]
@@ -63,4 +68,4 @@ constraint-dependencies = ["fastrtc>0.43.0", "mlx-audio>0.2.3"]
distribution = true

[tool.pdm.build]
includes = ["agent2/"]
includes = ["agent/"]