Files
computer/libs/python/agent2/agent/ui/gradio/app.py
2025-07-28 10:17:04 -04:00

249 lines
8.6 KiB
Python

"""
Advanced Gradio UI for Computer-Use Agent (cua-agent)
This is a Gradio interface for the Computer-Use Agent v0.4.x (cua-agent)
with an advanced UI for model selection and configuration.
Supported Agent Models:
- OpenAI: openai/computer-use-preview
- Anthropic: anthropic/claude-3-5-sonnet-20241022, anthropic/claude-3-7-sonnet-20250219
- UI-TARS: huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
- Omniparser: omniparser+anthropic/claude-3-5-sonnet-20241022, omniparser+ollama_chat/gemma3
Requirements:
- Mac with Apple Silicon (M1/M2/M3/M4), Linux, or Windows
- macOS 14 (Sonoma) or newer / Ubuntu 20.04+
- Python 3.11+
- Lume CLI installed (https://github.com/trycua/cua)
- OpenAI or Anthropic API key
"""
import os
import asyncio
import logging
import json
import platform
from pathlib import Path
from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple, Union
import gradio as gr
from gradio.components.chatbot import MetadataDict
from typing import cast
# Import from agent package
from agent import ComputerAgent
from agent.types import Messages, AgentResponse
from computer import Computer
# Global variables
# Most recently constructed agent; create_agent() rebinds this on every call.
global_agent = None
# Lazily created Computer; create_computer_instance() builds it on first use
# and returns the same object on every later call.
global_computer = None
# Relative path, so it resolves against the process's current working
# directory. Read by load_settings() and written by save_settings().
SETTINGS_FILE = Path(".gradio_settings.json")
import dotenv

# Load variables from a .env file at import time and report the outcome on
# stdout. The condition runs first, so find_dotenv() is only called when
# load_dotenv() reported success.
print(
    f"DEBUG - Loaded environment variables from {dotenv.find_dotenv()}"
    if dotenv.load_dotenv()
    else "DEBUG - No .env file found"
)
# --- Settings Load/Save Functions ---
def load_settings() -> Dict[str, Any]:
    """Load persisted UI settings from ``SETTINGS_FILE``.

    This is best-effort: a missing, unreadable, undecodable or malformed
    settings file never raises.

    Returns:
        The parsed settings when the file contains a JSON object. An empty
        dict when the file is missing, cannot be read or decoded, is not
        valid JSON, or its top-level value is not an object.
    """
    try:
        # Pin UTF-8 so the result does not depend on the platform's locale
        # encoding.
        with open(SETTINGS_FILE, "r", encoding="utf-8") as f:
            settings = json.load(f)
    except FileNotFoundError:
        # Nothing saved yet is the normal first-run case; stay quiet.
        return {}
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError,
        # so a corrupt or binary file degrades to defaults instead of
        # crashing startup.
        print(f"Warning: Could not load settings from {SETTINGS_FILE}: {e}")
        return {}

    if isinstance(settings, dict):
        print(f"DEBUG - Loaded settings from {SETTINGS_FILE}")
        return settings
    # Valid JSON, but not an object (e.g. a list): treat as "no settings".
    return {}
def save_settings(settings: Dict[str, Any]):
    """Persist UI settings to ``SETTINGS_FILE`` as indented JSON.

    The ``provider_api_key`` entry is never written to disk. The caller's
    dict is left unmodified; the key is filtered out of a copy.

    Args:
        settings: Settings to save. Values must be JSON-serializable.

    Raises:
        TypeError: If a value cannot be serialized to JSON. Serialization
            happens before the file is opened, so a failure here does not
            truncate an existing settings file.
    """
    persistable = {
        key: value
        for key, value in settings.items()
        if key != "provider_api_key"
    }
    # Serialize first: opening with "w" truncates the file immediately, so a
    # serialization error after that point would destroy the old settings.
    payload = json.dumps(persistable, indent=4)
    try:
        with open(SETTINGS_FILE, "w", encoding="utf-8") as f:
            f.write(payload)
            print(f"DEBUG - Saved settings to {SETTINGS_FILE}")
    except OSError as e:
        # Saving is best-effort; report and carry on.
        print(f"Warning: Could not save settings to {SETTINGS_FILE}: {e}")
# # Custom Screenshot Handler for Gradio chat
# class GradioChatScreenshotHandler:
# """Custom handler that adds screenshots to the Gradio chatbot."""
# def __init__(self, chatbot_history: List[gr.ChatMessage]):
# self.chatbot_history = chatbot_history
# print("GradioChatScreenshotHandler initialized")
# async def on_screenshot(self, screenshot_base64: str, action_type: str = "") -> None:
# """Add screenshot to chatbot when a screenshot is taken."""
# image_markdown = f"![Screenshot after {action_type}](data:image/png;base64,{screenshot_base64})"
# if self.chatbot_history is not None:
# self.chatbot_history.append(
# gr.ChatMessage(
# role="assistant",
# content=image_markdown,
# metadata={"title": f"🖥️ Screenshot - {action_type}", "status": "done"},
# )
# )
# Detect platform capabilities
is_mac = "darwin" == platform.system().lower()
# Lume counts as available on macOS, or on any OS when PYLUME_HOST is set to
# something other than the default "localhost".
if is_mac:
    is_lume_available = True
else:
    is_lume_available = os.getenv("PYLUME_HOST", "localhost") != "localhost"
print("PYLUME_HOST: ", os.getenv("PYLUME_HOST", "localhost"))
print("is_mac: ", is_mac)
print("Lume available: ", is_lume_available)
# Map model names to agent model strings
# Outer keys are lower-case provider names; get_model_string() looks them up
# with loop_provider.lower(). Inner keys are the model names passed to
# get_model_string(); the "default" entry is what it returns when the given
# name is not present in that provider's table.
MODEL_MAPPINGS = {
    "openai": {
        "default": "openai/computer-use-preview",
        "OpenAI: Computer-Use Preview": "openai/computer-use-preview",
    },
    "anthropic": {
        "default": "anthropic/claude-3-7-sonnet-20250219",
        "Anthropic: Claude 4 Opus (20250514)": "anthropic/claude-opus-4-20250514",
        "Anthropic: Claude 4 Sonnet (20250514)": "anthropic/claude-sonnet-4-20250514",
        "Anthropic: Claude 3.7 Sonnet (20250219)": "anthropic/claude-3-7-sonnet-20250219",
        "Anthropic: Claude 3.5 Sonnet (20240620)": "anthropic/claude-3-5-sonnet-20240620",
    },
    # "omni" entries prefix the underlying model with "omniparser+".
    "omni": {
        "default": "omniparser+openai/gpt-4o",
        "OMNI: OpenAI GPT-4o": "omniparser+openai/gpt-4o",
        "OMNI: OpenAI GPT-4o mini": "omniparser+openai/gpt-4o-mini",
        "OMNI: Claude 3.7 Sonnet (20250219)": "omniparser+anthropic/claude-3-7-sonnet-20250219",
        "OMNI: Claude 3.5 Sonnet (20240620)": "omniparser+anthropic/claude-3-5-sonnet-20240620",
    },
    "uitars": {
        # The default depends on the host OS: the huggingface-local model
        # string on macOS, the bare "ui-tars" string everywhere else.
        "default": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B" if is_mac else "ui-tars",
        "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
    },
}
def get_model_string(model_name: str, loop_provider: str) -> str:
    """Resolve a UI model selection to an agent model string.

    Resolution order:

    1. The two "Custom model (...)" choices return the placeholder strings
       ``"custom_oaicompat"`` / ``"custom_ollama"``; create_agent() swaps
       these for the user-supplied custom model name.
    2. Ollama selections (a name starting with ``"OMNI: Ollama "``, or the
       ``OMNI-OLLAMA`` provider) return ``omniparser+ollama_chat/<model>``,
       falling back to ``llama3`` when no model name is present.
    3. Everything else is looked up in ``MODEL_MAPPINGS`` by provider. An
       unknown provider uses the ``"openai"`` table, and an unknown model
       name uses that table's ``"default"`` entry.

    Args:
        model_name: Model name as chosen in the UI.
        loop_provider: Provider name; matched case-insensitively.

    Returns:
        The model string to pass to the agent.
    """
    custom_placeholders = {
        "Custom model (OpenAI compatible API)": "custom_oaicompat",
        "Custom model (ollama)": "custom_ollama",
    }
    if model_name in custom_placeholders:
        return custom_placeholders[model_name]

    # Normalize once so the OMNI-OLLAMA check and the table lookup agree on
    # case; previously "omni-ollama" fell through to the OpenAI default.
    provider_key = loop_provider.lower()

    ollama_prefix = "OMNI: Ollama "
    if model_name.startswith(ollama_prefix):
        ollama_model = model_name[len(ollama_prefix):].strip()
        # An empty suffix would yield the unusable "omniparser+ollama_chat/".
        return f"omniparser+ollama_chat/{ollama_model or 'llama3'}"
    if provider_key == "omni-ollama":
        return "omniparser+ollama_chat/llama3"

    # Map based on loop provider
    mapping = MODEL_MAPPINGS.get(provider_key, MODEL_MAPPINGS["openai"])
    return mapping.get(model_name, mapping["default"])
def get_ollama_models() -> List[str]:
    """List locally available Ollama models as UI model choices.

    Runs ``ollama list`` and converts the first column of every row after
    the header line into ``"OMNI: Ollama <name>"``.

    This is best-effort and never raises.

    Returns:
        One entry per listed model, in the order printed. An empty list
        when the ``ollama`` executable is not found, the command fails or
        times out, or it prints no model rows.
    """
    import subprocess

    try:
        result = subprocess.run(
            ["ollama", "list"],
            capture_output=True,
            text=True,
            # Bound the wait so an unresponsive Ollama cannot hang the UI.
            timeout=10,
        )
    except FileNotFoundError:
        # Ollama is optional; not having it installed is not an error.
        logging.debug("Ollama executable not found; no Ollama models listed")
        return []
    except Exception as e:
        # Includes subprocess.TimeoutExpired and other launch failures.
        logging.error(f"Error getting Ollama models: {e}")
        return []

    if result.returncode != 0:
        return []

    # First line is the column header; slicing also covers empty output.
    rows = result.stdout.strip().split("\n")[1:]
    return [
        f"OMNI: Ollama {fields[0]}"
        for fields in (row.split() for row in rows)
        if fields
    ]
def create_computer_instance(
    verbosity: int = logging.INFO,
    os_type: str = "macos",
    provider_type: str = "lume",
    name: Optional[str] = None,
    api_key: Optional[str] = None
) -> Computer:
    """Return the shared Computer, constructing it on first use.

    The arguments are only consulted when no instance exists yet. Once
    ``global_computer`` is set, every later call returns that same object
    and ignores whatever arguments were passed.

    Args:
        verbosity: Forwarded to ``Computer`` as ``verbosity``.
        os_type: Forwarded to ``Computer`` as ``os_type``.
        provider_type: Forwarded to ``Computer`` as ``provider_type``.
        name: Forwarded as ``name``; a falsy value becomes ``""``.
        api_key: Forwarded to ``Computer`` as ``api_key``.

    Returns:
        The module-wide ``Computer`` instance.
    """
    global global_computer

    # Reuse the existing instance whenever there is one.
    if global_computer is not None:
        return global_computer

    global_computer = Computer(
        verbosity=verbosity,
        os_type=os_type,
        provider_type=provider_type,
        name=name or "",
        api_key=api_key,
    )
    return global_computer
def create_agent(
    model_string: str,
    save_trajectory: bool = True,
    only_n_most_recent_images: int = 3,
    verbosity: int = logging.INFO,
    custom_model_name: Optional[str] = None,
    computer_os: str = "macos",
    computer_provider: str = "lume",
    computer_name: Optional[str] = None,
    computer_api_key: Optional[str] = None,
    max_trajectory_budget: Optional[float] = None,
) -> ComputerAgent:
    """Build a new ComputerAgent and store it in ``global_agent``.

    Args:
        model_string: Agent model string, or one of the placeholders
            ``"custom_oaicompat"`` / ``"custom_ollama"``.
        save_trajectory: When true, ``trajectory_dir="trajectories"`` is
            passed to the agent.
        only_n_most_recent_images: Forwarded to the agent unchanged.
        verbosity: Forwarded to both the computer and the agent.
        custom_model_name: Replaces a placeholder ``model_string``. Used
            as-is for ``custom_oaicompat`` and prefixed with
            ``omniparser+ollama_chat/`` for ``custom_ollama``. When empty,
            a placeholder is passed through untouched.
        computer_os: Forwarded to create_computer_instance() as ``os_type``.
        computer_provider: Forwarded as ``provider_type``.
        computer_name: Forwarded as ``name``.
        computer_api_key: Forwarded as ``api_key``.
        max_trajectory_budget: When truthy, passed to the agent as
            ``{"max_budget": <value>, "raise_error": True}``. ``None`` and
            ``0`` both mean no budget is passed.

    Returns:
        The newly created agent (also bound to ``global_agent``).
    """
    global global_agent

    # Obtain the computer the agent will use as its only tool.
    computer = create_computer_instance(
        verbosity=verbosity,
        os_type=computer_os,
        provider_type=computer_provider,
        name=computer_name,
        api_key=computer_api_key,
    )

    # Swap placeholder model strings for the user-supplied custom model.
    if custom_model_name:
        if model_string == "custom_oaicompat":
            model_string = custom_model_name
        elif model_string == "custom_ollama":
            model_string = f"omniparser+ollama_chat/{custom_model_name}"

    # Required arguments first; optional ones are added only when enabled.
    agent_options: Dict[str, Any] = dict(
        model=model_string,
        tools=[computer],
        only_n_most_recent_images=only_n_most_recent_images,
        verbosity=verbosity,
    )
    if save_trajectory:
        agent_options["trajectory_dir"] = "trajectories"
    if max_trajectory_budget:
        agent_options["max_trajectory_budget"] = {
            "max_budget": max_trajectory_budget,
            "raise_error": True,
        }

    global_agent = ComputerAgent(**agent_options)
    return global_agent
def launch_ui():
    """Build the Gradio UI and launch it.

    Prints a startup message, creates the UI via ``create_gradio_ui()`` and
    calls ``launch(share=False, inbrowser=True)`` on the result.
    """
    # NOTE(review): imported inside the function rather than at module top,
    # presumably to avoid a circular import with ui_components; confirm.
    from agent.ui.gradio.ui_components import create_gradio_ui

    print(f"Starting Gradio app for CUA Agent...")
    create_gradio_ui().launch(share=False, inbrowser=True)
# Launch the UI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    launch_ui()