diff --git a/libs/python/agent2/README.md b/libs/python/agent2/README.md
deleted file mode 100644
index 0c5595e1..00000000
--- a/libs/python/agent2/README.md
+++ /dev/null
@@ -1,381 +0,0 @@
-[Discord](https://discord.com/invite/mVnXXpdE85) · [PyPI](https://pypi.org/project/cua-computer/)
-
-**cua-agent** is a general Computer-Use framework with liteLLM integration for running agentic workflows on macOS, Windows, and Linux sandboxes. It provides a unified interface for computer-use agents across multiple LLM providers, along with an extensible callback system for preprocessing, postprocessing, and cost control.
-
-## Features
-
-- **Safe Computer-Use/Tool-Use**: uses the Computer SDK for sandboxed desktops
-- **Multi-Agent Support**: Anthropic Claude, OpenAI computer-use-preview, UI-TARS, Omniparser + any LLM
-- **Multi-API Support**: Take advantage of liteLLM supporting 100+ LLMs / model APIs, including local models (`huggingface-local/`, `ollama_chat/`, `mlx/`)
-- **Cross-Platform**: Works on Windows, macOS, and Linux with cloud and local computer instances
-- **Extensible Callbacks**: Built-in support for image retention, cache control, PII anonymization, budget limits, and trajectory tracking
-
-## Install
-
-```bash
-pip install "cua-agent[all]"
-
-# or install specific providers
-pip install "cua-agent[openai]" # OpenAI computer-use-preview support
-pip install "cua-agent[anthropic]" # Anthropic Claude support
-pip install "cua-agent[omni]" # Omniparser + any LLM support
-pip install "cua-agent[uitars]" # UI-TARS
-pip install "cua-agent[uitars-mlx]" # UI-TARS + MLX support
-pip install "cua-agent[uitars-hf]" # UI-TARS + Huggingface support
-pip install "cua-agent[ui]" # Gradio UI support
-```
-
-## Quick Start
-
-```python
-import asyncio
-import os
-from agent import ComputerAgent
-from computer import Computer
-
-async def main():
- # Set up computer instance
- async with Computer(
- os_type="linux",
- provider_type="cloud",
- name=os.getenv("CUA_CONTAINER_NAME"),
- api_key=os.getenv("CUA_API_KEY")
- ) as computer:
-
- # Create agent
- agent = ComputerAgent(
- model="anthropic/claude-3-5-sonnet-20241022",
- tools=[computer],
- only_n_most_recent_images=3,
- trajectory_dir="trajectories",
- max_trajectory_budget=5.0 # $5 budget limit
- )
-
- # Run agent
- messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
-
- async for result in agent.run(messages):
- for item in result["output"]:
- if item["type"] == "message":
- print(item["content"][0]["text"])
-
-if __name__ == "__main__":
- asyncio.run(main())
-```
-
-## Supported Models
-
-### Anthropic Claude (Computer Use API)
-```python
-model="anthropic/claude-3-5-sonnet-20241022"
-model="anthropic/claude-3-5-sonnet-20240620"
-model="anthropic/claude-opus-4-20250514"
-model="anthropic/claude-sonnet-4-20250514"
-```
-
-### OpenAI Computer Use Preview
-```python
-model="openai/computer-use-preview"
-```
-
-### UI-TARS (Local or Huggingface Inference)
-```python
-model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"
-model="ollama_chat/0000/ui-tars-1.5-7b"
-```
-
-### Omniparser + Any LLM
-```python
-model="omniparser+ollama_chat/mistral-small3.2"
-model="omniparser+vertex_ai/gemini-pro"
-model="omniparser+anthropic/claude-3-5-sonnet-20241022"
-model="omniparser+openai/gpt-4o"
-```
-
-## Custom Tools
-
-Define custom tools using decorated functions:
-
-```python
-from computer.helpers import sandboxed
-
-@sandboxed()
-def read_file(location: str) -> str:
- """Read contents of a file
-
- Parameters
- ----------
- location : str
- Path to the file to read
-
- Returns
- -------
- str
- Contents of the file or error message
- """
- try:
- with open(location, 'r') as f:
- return f.read()
- except Exception as e:
- return f"Error reading file: {str(e)}"
-
-def calculate(a: int, b: int) -> int:
- """Calculate the sum of two integers"""
- return a + b
-
-# Use with agent
-agent = ComputerAgent(
- model="anthropic/claude-3-5-sonnet-20241022",
- tools=[computer, read_file, calculate]
-)
-```
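-
-Schemas for plain functions are extracted from their docstrings via `litellm.utils.function_to_dict` (the same helper `agent/agent.py` uses internally), so a structured docstring like the one on `read_file` becomes a richer JSON schema. To inspect what the LLM will see for a tool:
-
-```python
-import litellm.utils
-
-# Print the JSON schema derived from the tool's signature and docstring
-print(litellm.utils.function_to_dict(read_file))
-```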
-
-## Callbacks System
-
-ComputerAgent provides a comprehensive callback system for extending functionality:
-
-### Built-in Callbacks
-
-```python
-import logging
-
-from agent.callbacks import (
- ImageRetentionCallback,
- TrajectorySaverCallback,
- BudgetManagerCallback,
- LoggingCallback
-)
-
-agent = ComputerAgent(
- model="anthropic/claude-3-5-sonnet-20241022",
- tools=[computer],
- callbacks=[
- ImageRetentionCallback(only_n_most_recent_images=3),
- TrajectorySaverCallback(trajectory_dir="trajectories"),
- BudgetManagerCallback(max_budget=10.0, raise_error=True),
- LoggingCallback(level=logging.INFO)
- ]
-)
-```
-
-### Custom Callbacks
-
-```python
-from agent.callbacks.base import AsyncCallbackHandler
-
-class CustomCallback(AsyncCallbackHandler):
- async def on_llm_start(self, messages):
- """Preprocess messages before LLM call"""
- # Add custom preprocessing logic
- return messages
-
- async def on_llm_end(self, messages):
- """Postprocess messages after LLM call"""
- # Add custom postprocessing logic
- return messages
-
- async def on_usage(self, usage):
- """Track usage information"""
-        print(f"Tokens used: {usage['total_tokens']}")  # usage arrives as a dict
-```
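-
-Custom callbacks are registered through the same `callbacks` parameter as the built-ins:
-
-```python
-agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022",
-    tools=[computer],
-    callbacks=[CustomCallback()]
-)
-```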
-
-## Budget Management
-
-Control costs with built-in budget management:
-
-```python
-# Simple budget limit
-agent = ComputerAgent(
- model="anthropic/claude-3-5-sonnet-20241022",
- max_trajectory_budget=5.0 # $5 limit
-)
-
-# Advanced budget configuration
-agent = ComputerAgent(
- model="anthropic/claude-3-5-sonnet-20241022",
- max_trajectory_budget={
- "max_budget": 10.0,
- "raise_error": True, # Raise error when exceeded
- "reset_after_each_run": False # Persistent across runs
- }
-)
-```
-
-## Trajectory Management
-
-Save and replay agent conversations:
-
-```python
-agent = ComputerAgent(
- model="anthropic/claude-3-5-sonnet-20241022",
- trajectory_dir="trajectories", # Auto-save trajectories
- tools=[computer]
-)
-
-# Trajectories are saved with:
-# - Complete conversation history
-# - Usage statistics and costs
-# - Timestamps and metadata
-# - Screenshots and computer actions
-```
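-
-As a minimal sketch for inspecting saved runs (assuming `TrajectorySaverCallback` persists JSON files under `trajectory_dir`; check the directory for the exact layout):
-
-```python
-import json
-from pathlib import Path
-
-# Hypothetical inspection loop over saved trajectory artifacts
-for path in sorted(Path("trajectories").rglob("*.json")):
-    data = json.loads(path.read_text())
-    print(path, "->", list(data.keys()) if isinstance(data, dict) else type(data).__name__)
-```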
-
-## Configuration Options
-
-### ComputerAgent Parameters
-
-- `model`: Model identifier (required)
-- `tools`: List of computer objects and decorated functions
-- `callbacks`: List of callback handlers for extensibility
-- `only_n_most_recent_images`: Limit recent images to prevent context overflow
-- `verbosity`: Logging level (logging.INFO, logging.DEBUG, etc.)
-- `trajectory_dir`: Directory to save conversation trajectories
-- `max_retries`: Maximum API call retries (default: 3)
-- `screenshot_delay`: Delay between actions and screenshots (default: 0.5s)
-- `use_prompt_caching`: Enable prompt caching for supported models
-- `max_trajectory_budget`: Budget limit configuration
-
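-A constructor call exercising these options (values are illustrative):
-
-```python
-import logging
-from agent import ComputerAgent
-
-agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022",
-    tools=[computer],                    # computer object from the Quick Start
-    only_n_most_recent_images=3,         # adds ImageRetentionCallback
-    verbosity=logging.INFO,              # adds LoggingCallback
-    trajectory_dir="trajectories",       # adds TrajectorySaverCallback
-    max_retries=3,
-    screenshot_delay=0.5,
-    use_prompt_caching=False,
-    max_trajectory_budget={"max_budget": 10.0, "raise_error": True},  # adds BudgetManagerCallback
-)
-```
-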
-### Environment Variables
-
-```bash
-# Computer instance (cloud)
-export CUA_CONTAINER_NAME="your-container-name"
-export CUA_API_KEY="your-cua-api-key"
-
-# LLM API keys
-export ANTHROPIC_API_KEY="your-anthropic-key"
-export OPENAI_API_KEY="your-openai-key"
-```
-
-## Advanced Usage
-
-### Streaming Responses
-
-```python
-async for result in agent.run(messages, stream=True):
- # Process streaming chunks
- for item in result["output"]:
- if item["type"] == "message":
- print(item["content"][0]["text"], end="", flush=True)
- elif item["type"] == "computer_call":
- action = item["action"]
- print(f"\n[Action: {action['type']}]")
-```
-
-### Interactive Chat Loop
-
-```python
-history = []
-while True:
- user_input = input("> ")
- if user_input.lower() in ['quit', 'exit']:
- break
-
- history.append({"role": "user", "content": user_input})
-
- async for result in agent.run(history):
- history += result["output"]
-
- # Display assistant responses
- for item in result["output"]:
- if item["type"] == "message":
- print(item["content"][0]["text"])
-```
-
-### Error Handling
-
-```python
-from agent.callbacks.budget_manager import BudgetExceededError
-
-try:
-    async for result in agent.run(messages):
-        # Process results
-        pass
-except BudgetExceededError:  # raised when raise_error=True on the budget manager
-    print("Budget limit exceeded")
-except Exception as e:
- print(f"Agent error: {e}")
-```
-
-## API Reference
-
-### ComputerAgent.run()
-
-```python
-async def run(
- self,
- messages: Messages,
- stream: bool = False,
- **kwargs
-) -> AsyncGenerator[Dict[str, Any], None]:
- """
- Run the agent with the given messages.
-
- Args:
- messages: List of message dictionaries
- stream: Whether to stream the response
- **kwargs: Additional arguments
-
- Returns:
- AsyncGenerator that yields response chunks
- """
-```
-
-### Message Format
-
-```python
-messages = [
- {
- "role": "user",
- "content": "Take a screenshot and describe what you see"
- },
- {
- "role": "assistant",
- "content": "I'll take a screenshot for you."
- }
-]
-```
-
-### Response Format
-
-```python
-{
- "output": [
- {
- "type": "message",
- "role": "assistant",
- "content": [{"type": "output_text", "text": "I can see..."}]
- },
- {
- "type": "computer_call",
- "action": {"type": "screenshot"},
- "call_id": "call_123"
- },
- {
- "type": "computer_call_output",
- "call_id": "call_123",
- "output": {"image_url": "data:image/png;base64,..."}
- }
- ],
- "usage": {
- "prompt_tokens": 150,
- "completion_tokens": 75,
- "total_tokens": 225,
- "response_cost": 0.01,
- }
-}
-```
-
-## License
-
-MIT License - see LICENSE file for details.
\ No newline at end of file
diff --git a/libs/python/agent2/agent/__init__.py b/libs/python/agent2/agent/__init__.py
deleted file mode 100644
index 2de7b7cf..00000000
--- a/libs/python/agent2/agent/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-"""
-agent - Decorator-based Computer Use Agent with liteLLM integration
-"""
-
-from .decorators import agent_loop
-from .agent import ComputerAgent
-from .types import Messages, AgentResponse
-
-# Import loops to register them
-from . import loops
-
-__all__ = [
- "agent_loop",
- "ComputerAgent",
- "Messages",
- "AgentResponse"
-]
-
-__version__ = "0.4.0b3"
diff --git a/libs/python/agent2/agent/__main__.py b/libs/python/agent2/agent/__main__.py
deleted file mode 100644
index 1b4d6697..00000000
--- a/libs/python/agent2/agent/__main__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-"""
-Entry point for running agent CLI module.
-
-Usage:
-    python -m agent cli <model>
-"""
-
-import sys
-import asyncio
-from .cli import main
-
-if __name__ == "__main__":
- # Check if 'cli' is specified as the module
- if len(sys.argv) > 1 and sys.argv[1] == "cli":
- # Remove 'cli' from arguments and run CLI
- sys.argv.pop(1)
- asyncio.run(main())
- else:
-        print("Usage: python -m agent cli <model>")
-        print("Example: python -m agent cli openai/computer-use-preview")
- sys.exit(1)
diff --git a/libs/python/agent2/agent/adapters/__init__.py b/libs/python/agent2/agent/adapters/__init__.py
deleted file mode 100644
index 2d9abbe3..00000000
--- a/libs/python/agent2/agent/adapters/__init__.py
+++ /dev/null
@@ -1,9 +0,0 @@
-"""
-Adapters package for agent - Custom LLM adapters for LiteLLM
-"""
-
-from .huggingfacelocal_adapter import HuggingFaceLocalAdapter
-
-__all__ = [
- "HuggingFaceLocalAdapter",
-]
diff --git a/libs/python/agent2/agent/adapters/huggingfacelocal_adapter.py b/libs/python/agent2/agent/adapters/huggingfacelocal_adapter.py
deleted file mode 100644
index f8706868..00000000
--- a/libs/python/agent2/agent/adapters/huggingfacelocal_adapter.py
+++ /dev/null
@@ -1,229 +0,0 @@
-import asyncio
-import warnings
-from typing import Iterator, AsyncIterator, Dict, List, Any, Optional
-from litellm.types.utils import GenericStreamingChunk, ModelResponse
-from litellm.llms.custom_llm import CustomLLM
-from litellm import completion, acompletion
-
-# Try to import HuggingFace dependencies
-try:
- import torch
- from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
- HF_AVAILABLE = True
-except ImportError:
- HF_AVAILABLE = False
-
-
-class HuggingFaceLocalAdapter(CustomLLM):
- """HuggingFace Local Adapter for running vision-language models locally."""
-
- def __init__(self, device: str = "auto", **kwargs):
- """Initialize the adapter.
-
- Args:
- device: Device to load model on ("auto", "cuda", "cpu", etc.)
- **kwargs: Additional arguments
- """
- super().__init__()
- self.device = device
- self.models = {} # Cache for loaded models
- self.processors = {} # Cache for loaded processors
-
- def _load_model_and_processor(self, model_name: str):
- """Load model and processor if not already cached.
-
- Args:
- model_name: Name of the model to load
-
- Returns:
- Tuple of (model, processor)
- """
- if model_name not in self.models:
- # Load model
- model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
- model_name,
- torch_dtype=torch.float16,
- device_map=self.device,
- attn_implementation="sdpa"
- )
-
- # Load processor
- processor = AutoProcessor.from_pretrained(model_name)
-
- # Cache them
- self.models[model_name] = model
- self.processors[model_name] = processor
-
- return self.models[model_name], self.processors[model_name]
-
- def _convert_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
- """Convert OpenAI format messages to HuggingFace format.
-
- Args:
- messages: Messages in OpenAI format
-
- Returns:
- Messages in HuggingFace format
- """
- converted_messages = []
-
- for message in messages:
- converted_message = {
- "role": message["role"],
- "content": []
- }
-
- content = message.get("content", [])
- if isinstance(content, str):
- # Simple text content
- converted_message["content"].append({
- "type": "text",
- "text": content
- })
- elif isinstance(content, list):
- # Multi-modal content
- for item in content:
- if item.get("type") == "text":
- converted_message["content"].append({
- "type": "text",
- "text": item.get("text", "")
- })
- elif item.get("type") == "image_url":
- # Convert image_url format to image format
- image_url = item.get("image_url", {}).get("url", "")
- converted_message["content"].append({
- "type": "image",
- "image": image_url
- })
-
- converted_messages.append(converted_message)
-
- return converted_messages
-
- def _generate(self, **kwargs) -> str:
- """Generate response using the local HuggingFace model.
-
- Args:
- **kwargs: Keyword arguments containing messages and model info
-
- Returns:
- Generated text response
- """
- if not HF_AVAILABLE:
- raise ImportError(
- "HuggingFace transformers dependencies not found. "
- "Please install with: pip install \"cua-agent[uitars-hf]\""
- )
-
- # Extract messages and model from kwargs
- messages = kwargs.get('messages', [])
- model_name = kwargs.get('model', 'ByteDance-Seed/UI-TARS-1.5-7B')
- max_new_tokens = kwargs.get('max_tokens', 128)
-
- # Warn about ignored kwargs
- ignored_kwargs = set(kwargs.keys()) - {'messages', 'model', 'max_tokens'}
- if ignored_kwargs:
- warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}")
-
- # Load model and processor
- model, processor = self._load_model_and_processor(model_name)
-
- # Convert messages to HuggingFace format
- hf_messages = self._convert_messages(messages)
-
- # Apply chat template and tokenize
- inputs = processor.apply_chat_template(
- hf_messages,
- add_generation_prompt=True,
- tokenize=True,
- return_dict=True,
- return_tensors="pt"
- )
-
-        # Move inputs to the GPU when available (the model was placed via device_map)
- if torch.cuda.is_available() and self.device != "cpu":
- inputs = inputs.to("cuda")
-
- # Generate response
- with torch.no_grad():
- generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
-
- # Trim input tokens from output
- generated_ids_trimmed = [
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
- ]
-
- # Decode output
- output_text = processor.batch_decode(
- generated_ids_trimmed,
- skip_special_tokens=True,
- clean_up_tokenization_spaces=False
- )
-
- return output_text[0] if output_text else ""
-
- def completion(self, *args, **kwargs) -> ModelResponse:
- """Synchronous completion method.
-
- Returns:
- ModelResponse with generated text
- """
- generated_text = self._generate(**kwargs)
-
- return completion(
- model=f"huggingface-local/{kwargs['model']}",
- mock_response=generated_text,
- )
-
- async def acompletion(self, *args, **kwargs) -> ModelResponse:
- """Asynchronous completion method.
-
- Returns:
- ModelResponse with generated text
- """
- # Run _generate in thread pool to avoid blocking
- generated_text = await asyncio.to_thread(self._generate, **kwargs)
-
- return await acompletion(
- model=f"huggingface-local/{kwargs['model']}",
- mock_response=generated_text,
- )
-
- def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
- """Synchronous streaming method.
-
- Returns:
- Iterator of GenericStreamingChunk
- """
- generated_text = self._generate(**kwargs)
-
- generic_streaming_chunk: GenericStreamingChunk = {
- "finish_reason": "stop",
- "index": 0,
- "is_finished": True,
- "text": generated_text,
- "tool_use": None,
- "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
- }
-
- yield generic_streaming_chunk
-
- async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
- """Asynchronous streaming method.
-
- Returns:
- AsyncIterator of GenericStreamingChunk
- """
- # Run _generate in thread pool to avoid blocking
- generated_text = await asyncio.to_thread(self._generate, **kwargs)
-
- generic_streaming_chunk: GenericStreamingChunk = {
- "finish_reason": "stop",
- "index": 0,
- "is_finished": True,
- "text": generated_text,
- "tool_use": None,
- "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
- }
-
- yield generic_streaming_chunk
\ No newline at end of file
diff --git a/libs/python/agent2/agent/agent.py b/libs/python/agent2/agent/agent.py
deleted file mode 100644
index ba86a632..00000000
--- a/libs/python/agent2/agent/agent.py
+++ /dev/null
@@ -1,577 +0,0 @@
-"""
-ComputerAgent - Main agent class that selects and runs agent loops
-"""
-
-import asyncio
-from typing import Dict, List, Any, Optional, AsyncGenerator, Union, cast, Callable, Set
-
-from litellm.responses.utils import Usage
-from .types import Messages, Computer
-from .decorators import find_agent_loop
-from .computer_handler import OpenAIComputerHandler, acknowledge_safety_check_callback, check_blocklisted_url
-import json
-import litellm
-import litellm.utils
-import inspect
-from .adapters import HuggingFaceLocalAdapter
-from .callbacks import ImageRetentionCallback, LoggingCallback, TrajectorySaverCallback, BudgetManagerCallback
-
-def get_json(obj: Any, max_depth: int = 10) -> Any:
-    """Serialize an arbitrary object into JSON-compatible data, guarding against
-    circular references and excessive nesting depth."""
-    def custom_serializer(o: Any, depth: int = 0, seen: Optional[Set[int]] = None) -> Any:
- if seen is None:
- seen = set()
-
- # Use model_dump() if available
- if hasattr(o, 'model_dump'):
- return o.model_dump()
-
- # Check depth limit
- if depth > max_depth:
-            return f"<max depth {max_depth} exceeded: {type(o).__name__}>"
-
- # Check for circular references using object id
- obj_id = id(o)
- if obj_id in seen:
-            return f"<circular reference: {type(o).__name__}>"
-
- # Handle Computer objects
-        if hasattr(o, '__class__') and 'computer' in o.__class__.__name__.lower():
-            return f"<{o.__class__.__name__} object>"
-
- # Handle objects with __dict__
- if hasattr(o, '__dict__'):
- seen.add(obj_id)
- try:
- result = {}
- for k, v in o.__dict__.items():
- if v is not None:
- # Recursively serialize with updated depth and seen set
- serialized_value = custom_serializer(v, depth + 1, seen.copy())
- result[k] = serialized_value
- return result
- finally:
- seen.discard(obj_id)
-
- # Handle common types that might contain nested objects
- elif isinstance(o, dict):
- seen.add(obj_id)
- try:
- return {
- k: custom_serializer(v, depth + 1, seen.copy())
- for k, v in o.items()
- if v is not None
- }
- finally:
- seen.discard(obj_id)
-
- elif isinstance(o, (list, tuple, set)):
- seen.add(obj_id)
- try:
- return [
- custom_serializer(item, depth + 1, seen.copy())
- for item in o
- if item is not None
- ]
- finally:
- seen.discard(obj_id)
-
- # For basic types that json.dumps can handle
- elif isinstance(o, (str, int, float, bool)) or o is None:
- return o
-
- # Fallback to string representation
- else:
- return str(o)
-
- def remove_nones(obj: Any) -> Any:
- if isinstance(obj, dict):
- return {k: remove_nones(v) for k, v in obj.items() if v is not None}
- elif isinstance(obj, list):
- return [remove_nones(item) for item in obj if item is not None]
- return obj
-
- # Serialize with circular reference and depth protection
- serialized = custom_serializer(obj)
-
- # Convert to JSON string and back to ensure JSON compatibility
- json_str = json.dumps(serialized)
- parsed = json.loads(json_str)
-
- # Final cleanup of any remaining None values
- return remove_nones(parsed)
-
-def sanitize_message(msg: Any) -> Any:
- """Return a copy of the message with image_url omitted for computer_call_output messages."""
- if msg.get("type") == "computer_call_output":
- output = msg.get("output", {})
- if isinstance(output, dict):
- sanitized = msg.copy()
- sanitized["output"] = {**output, "image_url": "[omitted]"}
- return sanitized
- return msg
-
-class ComputerAgent:
- """
- Main agent class that automatically selects the appropriate agent loop
- based on the model and executes tool calls.
- """
-
- def __init__(
- self,
- model: str,
- tools: Optional[List[Any]] = None,
- custom_loop: Optional[Callable] = None,
- only_n_most_recent_images: Optional[int] = None,
- callbacks: Optional[List[Any]] = None,
- verbosity: Optional[int] = None,
- trajectory_dir: Optional[str] = None,
- max_retries: Optional[int] = 3,
- screenshot_delay: Optional[float | int] = 0.5,
- use_prompt_caching: Optional[bool] = False,
- max_trajectory_budget: Optional[float | dict] = None,
- **kwargs
- ):
- """
- Initialize ComputerAgent.
-
- Args:
- model: Model name (e.g., "claude-3-5-sonnet-20241022", "computer-use-preview", "omni+vertex_ai/gemini-pro")
- tools: List of tools (computer objects, decorated functions, etc.)
- custom_loop: Custom agent loop function to use instead of auto-selection
- only_n_most_recent_images: If set, only keep the N most recent images in message history. Adds ImageRetentionCallback automatically.
- callbacks: List of AsyncCallbackHandler instances for preprocessing/postprocessing
- verbosity: Logging level (logging.DEBUG, logging.INFO, etc.). If set, adds LoggingCallback automatically
- trajectory_dir: If set, saves trajectory data (screenshots, responses) to this directory. Adds TrajectorySaverCallback automatically.
- max_retries: Maximum number of retries for failed API calls
- screenshot_delay: Delay before screenshots in seconds
- use_prompt_caching: If set, use prompt caching to avoid reprocessing the same prompt. Intended for use with anthropic providers.
- max_trajectory_budget: If set, adds BudgetManagerCallback to track usage costs and stop when budget is exceeded
- **kwargs: Additional arguments passed to the agent loop
- """
- self.model = model
- self.tools = tools or []
- self.custom_loop = custom_loop
- self.only_n_most_recent_images = only_n_most_recent_images
- self.callbacks = callbacks or []
- self.verbosity = verbosity
- self.trajectory_dir = trajectory_dir
- self.max_retries = max_retries
- self.screenshot_delay = screenshot_delay
- self.use_prompt_caching = use_prompt_caching
- self.kwargs = kwargs
-
- # == Add built-in callbacks ==
-
- # Add logging callback if verbosity is set
- if self.verbosity is not None:
- self.callbacks.append(LoggingCallback(level=self.verbosity))
-
- # Add image retention callback if only_n_most_recent_images is set
- if self.only_n_most_recent_images:
- self.callbacks.append(ImageRetentionCallback(self.only_n_most_recent_images))
-
- # Add trajectory saver callback if trajectory_dir is set
- if self.trajectory_dir:
- self.callbacks.append(TrajectorySaverCallback(self.trajectory_dir))
-
- # Add budget manager if max_trajectory_budget is set
- if max_trajectory_budget:
- if isinstance(max_trajectory_budget, dict):
- self.callbacks.append(BudgetManagerCallback(**max_trajectory_budget))
- else:
- self.callbacks.append(BudgetManagerCallback(max_trajectory_budget))
-
- # == Enable local model providers w/ LiteLLM ==
-
- # Register local model providers
- hf_adapter = HuggingFaceLocalAdapter(
- device="auto"
- )
- litellm.custom_provider_map = [
- {"provider": "huggingface-local", "custom_handler": hf_adapter}
- ]
-
- # == Initialize computer agent ==
-
- # Find the appropriate agent loop
- if custom_loop:
- self.agent_loop = custom_loop
- self.agent_loop_info = None
- else:
- loop_info = find_agent_loop(model)
- if not loop_info:
- raise ValueError(f"No agent loop found for model: {model}")
- self.agent_loop = loop_info.func
- self.agent_loop_info = loop_info
-
- self.tool_schemas = []
- self.computer_handler = None
-
- async def _initialize_computers(self):
- """Initialize computer objects"""
- if not self.tool_schemas:
- for tool in self.tools:
- if hasattr(tool, '_initialized') and not tool._initialized:
- await tool.run()
-
- # Process tools and create tool schemas
- self.tool_schemas = self._process_tools()
-
- # Find computer tool and create interface adapter
- computer_handler = None
- for schema in self.tool_schemas:
- if schema["type"] == "computer":
- computer_handler = OpenAIComputerHandler(schema["computer"].interface)
- break
- self.computer_handler = computer_handler
-
- def _process_input(self, input: Messages) -> List[Dict[str, Any]]:
- """Process input messages and create schemas for the agent loop"""
- if isinstance(input, str):
- return [{"role": "user", "content": input}]
- return [get_json(msg) for msg in input]
-
- def _process_tools(self) -> List[Dict[str, Any]]:
- """Process tools and create schemas for the agent loop"""
- schemas = []
-
- for tool in self.tools:
- # Check if it's a computer object (has interface attribute)
- if hasattr(tool, 'interface'):
- # This is a computer tool - will be handled by agent loop
- schemas.append({
- "type": "computer",
- "computer": tool
- })
- elif callable(tool):
- # Use litellm.utils.function_to_dict to extract schema from docstring
- try:
- function_schema = litellm.utils.function_to_dict(tool)
- schemas.append({
- "type": "function",
- "function": function_schema
- })
- except Exception as e:
- print(f"Warning: Could not process tool {tool}: {e}")
- else:
- print(f"Warning: Unknown tool type: {tool}")
-
- return schemas
-
- def _get_tool(self, name: str) -> Optional[Callable]:
- """Get a tool by name"""
- for tool in self.tools:
- if hasattr(tool, '__name__') and tool.__name__ == name:
- return tool
- elif hasattr(tool, 'func') and tool.func.__name__ == name:
- return tool
- return None
-
- # ============================================================================
- # AGENT RUN LOOP LIFECYCLE HOOKS
- # ============================================================================
-
- async def _on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
- """Initialize run tracking by calling callbacks."""
- for callback in self.callbacks:
- if hasattr(callback, 'on_run_start'):
- await callback.on_run_start(kwargs, old_items)
-
- async def _on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
- """Finalize run tracking by calling callbacks."""
- for callback in self.callbacks:
- if hasattr(callback, 'on_run_end'):
- await callback.on_run_end(kwargs, old_items, new_items)
-
- async def _on_run_continue(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> bool:
- """Check if run should continue by calling callbacks."""
- for callback in self.callbacks:
- if hasattr(callback, 'on_run_continue'):
- should_continue = await callback.on_run_continue(kwargs, old_items, new_items)
- if not should_continue:
- return False
- return True
-
- async def _on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
- """Prepare messages for the LLM call by applying callbacks."""
- result = messages
- for callback in self.callbacks:
- if hasattr(callback, 'on_llm_start'):
- result = await callback.on_llm_start(result)
- return result
-
- async def _on_llm_end(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
- """Postprocess messages after the LLM call by applying callbacks."""
- result = messages
- for callback in self.callbacks:
- if hasattr(callback, 'on_llm_end'):
- result = await callback.on_llm_end(result)
- return result
-
- async def _on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
- """Called when responses are received."""
- for callback in self.callbacks:
- if hasattr(callback, 'on_responses'):
- await callback.on_responses(get_json(kwargs), get_json(responses))
-
- async def _on_computer_call_start(self, item: Dict[str, Any]) -> None:
- """Called when a computer call is about to start."""
- for callback in self.callbacks:
- if hasattr(callback, 'on_computer_call_start'):
- await callback.on_computer_call_start(get_json(item))
-
- async def _on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
- """Called when a computer call has completed."""
- for callback in self.callbacks:
- if hasattr(callback, 'on_computer_call_end'):
- await callback.on_computer_call_end(get_json(item), get_json(result))
-
- async def _on_function_call_start(self, item: Dict[str, Any]) -> None:
- """Called when a function call is about to start."""
- for callback in self.callbacks:
- if hasattr(callback, 'on_function_call_start'):
- await callback.on_function_call_start(get_json(item))
-
- async def _on_function_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
- """Called when a function call has completed."""
- for callback in self.callbacks:
- if hasattr(callback, 'on_function_call_end'):
- await callback.on_function_call_end(get_json(item), get_json(result))
-
- async def _on_text(self, item: Dict[str, Any]) -> None:
- """Called when a text message is encountered."""
- for callback in self.callbacks:
- if hasattr(callback, 'on_text'):
- await callback.on_text(get_json(item))
-
- async def _on_api_start(self, kwargs: Dict[str, Any]) -> None:
- """Called when an LLM API call is about to start."""
- for callback in self.callbacks:
- if hasattr(callback, 'on_api_start'):
- await callback.on_api_start(get_json(kwargs))
-
- async def _on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
- """Called when an LLM API call has completed."""
- for callback in self.callbacks:
- if hasattr(callback, 'on_api_end'):
- await callback.on_api_end(get_json(kwargs), get_json(result))
-
- async def _on_usage(self, usage: Dict[str, Any]) -> None:
- """Called when usage information is received."""
- for callback in self.callbacks:
- if hasattr(callback, 'on_usage'):
- await callback.on_usage(get_json(usage))
-
- async def _on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
- """Called when a screenshot is taken."""
- for callback in self.callbacks:
- if hasattr(callback, 'on_screenshot'):
- await callback.on_screenshot(screenshot, name)
-
- # ============================================================================
- # AGENT OUTPUT PROCESSING
- # ============================================================================
-
- async def _handle_item(self, item: Any, computer: Optional[Computer] = None) -> List[Dict[str, Any]]:
- """Handle each item; may cause a computer action + screenshot."""
-
- item_type = item.get("type", None)
-
- if item_type == "message":
- await self._on_text(item)
- # # Print messages
- # if item.get("content"):
- # for content_item in item.get("content"):
- # if content_item.get("text"):
- # print(content_item.get("text"))
- return []
-
- if item_type == "computer_call":
- await self._on_computer_call_start(item)
- if not computer:
- raise ValueError("Computer handler is required for computer calls")
-
- # Perform computer actions
- action = item.get("action")
- action_type = action.get("type")
-
- # Extract action arguments (all fields except 'type')
- action_args = {k: v for k, v in action.items() if k != "type"}
-
- # print(f"{action_type}({action_args})")
-
- # Execute the computer action
- computer_method = getattr(computer, action_type, None)
- if computer_method:
- await computer_method(**action_args)
- else:
- print(f"Unknown computer action: {action_type}")
- return []
-
- # Take screenshot after action
- if self.screenshot_delay and self.screenshot_delay > 0:
- await asyncio.sleep(self.screenshot_delay)
- screenshot_base64 = await computer.screenshot()
- await self._on_screenshot(screenshot_base64, "screenshot_after")
-
- # Handle safety checks
- pending_checks = item.get("pending_safety_checks", [])
- acknowledged_checks = []
- for check in pending_checks:
- check_message = check.get("message", str(check))
- if acknowledge_safety_check_callback(check_message):
- acknowledged_checks.append(check)
- else:
- raise ValueError(f"Safety check failed: {check_message}")
-
- # Create call output
- call_output = {
- "type": "computer_call_output",
- "call_id": item.get("call_id"),
- "acknowledged_safety_checks": acknowledged_checks,
- "output": {
- "type": "input_image",
- "image_url": f"data:image/png;base64,{screenshot_base64}",
- },
- }
-
- # Additional URL safety checks for browser environments
- if await computer.get_environment() == "browser":
- current_url = await computer.get_current_url()
- call_output["output"]["current_url"] = current_url
- check_blocklisted_url(current_url)
-
- result = [call_output]
- await self._on_computer_call_end(item, result)
- return result
-
- if item_type == "function_call":
- await self._on_function_call_start(item)
- # Perform function call
- function = self._get_tool(item.get("name"))
- if not function:
-            raise ValueError(f"Function {item.get('name')} not found")
-
- args = json.loads(item.get("arguments"))
-
- # Execute function - use asyncio.to_thread for non-async functions
- if inspect.iscoroutinefunction(function):
- result = await function(**args)
- else:
- result = await asyncio.to_thread(function, **args)
-
- # Create function call output
- call_output = {
- "type": "function_call_output",
- "call_id": item.get("call_id"),
- "output": str(result),
- }
-
- result = [call_output]
- await self._on_function_call_end(item, result)
- return result
-
- return []
-
- # ============================================================================
- # MAIN AGENT LOOP
- # ============================================================================
-
- async def run(
- self,
- messages: Messages,
- stream: bool = False,
- **kwargs
- ) -> AsyncGenerator[Dict[str, Any], None]:
- """
- Run the agent with the given messages using Computer protocol handler pattern.
-
- Args:
- messages: List of message dictionaries
- stream: Whether to stream the response
- **kwargs: Additional arguments
-
- Returns:
- AsyncGenerator that yields response chunks
- """
-
- await self._initialize_computers()
-
- # Merge kwargs
- merged_kwargs = {**self.kwargs, **kwargs}
-
- old_items = self._process_input(messages)
- new_items = []
-
- # Initialize run tracking
- run_kwargs = {
- "messages": messages,
- "stream": stream,
- "model": self.model,
- "agent_loop": self.agent_loop.__name__,
- **merged_kwargs
- }
- await self._on_run_start(run_kwargs, old_items)
-
-        while not new_items or new_items[-1].get("role") != "assistant":
- # Lifecycle hook: Check if we should continue based on callbacks (e.g., budget manager)
- should_continue = await self._on_run_continue(run_kwargs, old_items, new_items)
- if not should_continue:
- break
-
- # Lifecycle hook: Prepare messages for the LLM call
- # Use cases:
- # - PII anonymization
- # - Image retention policy
- combined_messages = old_items + new_items
- preprocessed_messages = await self._on_llm_start(combined_messages)
-
- loop_kwargs = {
- "messages": preprocessed_messages,
- "model": self.model,
- "tools": self.tool_schemas,
- "stream": False,
- "computer_handler": self.computer_handler,
- "max_retries": self.max_retries,
- "use_prompt_caching": self.use_prompt_caching,
- **merged_kwargs
- }
-
- # Run agent loop iteration
- result = await self.agent_loop(
- **loop_kwargs,
- _on_api_start=self._on_api_start,
- _on_api_end=self._on_api_end,
- _on_usage=self._on_usage,
- _on_screenshot=self._on_screenshot,
- )
- result = get_json(result)
-
- # Lifecycle hook: Postprocess messages after the LLM call
- # Use cases:
- # - PII deanonymization (if you want tool calls to see PII)
- result["output"] = await self._on_llm_end(result.get("output", []))
- await self._on_responses(loop_kwargs, result)
-
- # Yield agent response
- yield result
-
- # Add agent response to new_items
- new_items += result.get("output")
-
- # Handle computer actions
- for item in result.get("output"):
- partial_items = await self._handle_item(item, self.computer_handler)
- new_items += partial_items
-
- # Yield partial response
- yield {
- "output": partial_items,
- "usage": Usage(
- prompt_tokens=0,
- completion_tokens=0,
- total_tokens=0,
- )
- }
-
-        await self._on_run_end(run_kwargs, old_items, new_items)
\ No newline at end of file
diff --git a/libs/python/agent2/agent/callbacks/__init__.py b/libs/python/agent2/agent/callbacks/__init__.py
deleted file mode 100644
index 6f364b1d..00000000
--- a/libs/python/agent2/agent/callbacks/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-"""
-Callback system for ComputerAgent preprocessing and postprocessing hooks.
-"""
-
-from .base import AsyncCallbackHandler
-from .image_retention import ImageRetentionCallback
-from .logging import LoggingCallback
-from .trajectory_saver import TrajectorySaverCallback
-from .budget_manager import BudgetManagerCallback
-
-__all__ = [
- "AsyncCallbackHandler",
- "ImageRetentionCallback",
- "LoggingCallback",
- "TrajectorySaverCallback",
- "BudgetManagerCallback",
-]
diff --git a/libs/python/agent2/agent/callbacks/base.py b/libs/python/agent2/agent/callbacks/base.py
deleted file mode 100644
index 01688077..00000000
--- a/libs/python/agent2/agent/callbacks/base.py
+++ /dev/null
@@ -1,153 +0,0 @@
-"""
-Base callback handler interface for ComputerAgent preprocessing and postprocessing hooks.
-"""
-
-from abc import ABC, abstractmethod
-from typing import List, Dict, Any, Optional, Union
-
-
-class AsyncCallbackHandler(ABC):
- """
- Base class for async callback handlers that can preprocess messages before
- the agent loop and postprocess output after the agent loop.
- """
-
- async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
- """Called at the start of an agent run loop."""
- pass
-
- async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
- """Called at the end of an agent run loop."""
- pass
-
- async def on_run_continue(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> bool:
- """Called during agent run loop to determine if execution should continue.
-
- Args:
- kwargs: Run arguments
- old_items: Original messages
- new_items: New messages generated during run
-
- Returns:
- True to continue execution, False to stop
- """
- return True
-
- async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
- """
- Called before messages are sent to the agent loop.
-
- Args:
- messages: List of message dictionaries to preprocess
-
- Returns:
- List of preprocessed message dictionaries
- """
- return messages
-
- async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
- """
- Called after the agent loop returns output.
-
- Args:
- output: List of output message dictionaries to postprocess
-
- Returns:
- List of postprocessed output dictionaries
- """
- return output
-
- async def on_computer_call_start(self, item: Dict[str, Any]) -> None:
- """
- Called when a computer call is about to start.
-
- Args:
- item: The computer call item dictionary
- """
- pass
-
- async def on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
- """
- Called when a computer call has completed.
-
- Args:
- item: The computer call item dictionary
- result: The result of the computer call
- """
- pass
-
- async def on_function_call_start(self, item: Dict[str, Any]) -> None:
- """
- Called when a function call is about to start.
-
- Args:
- item: The function call item dictionary
- """
- pass
-
- async def on_function_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
- """
- Called when a function call has completed.
-
- Args:
- item: The function call item dictionary
- result: The result of the function call
- """
- pass
-
- async def on_text(self, item: Dict[str, Any]) -> None:
- """
- Called when a text message is encountered.
-
- Args:
- item: The message item dictionary
- """
- pass
-
- async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
- """
- Called when an API call is about to start.
-
- Args:
- kwargs: The kwargs being passed to the API call
- """
- pass
-
- async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
- """
- Called when an API call has completed.
-
- Args:
- kwargs: The kwargs that were passed to the API call
- result: The result of the API call
- """
- pass
-
- async def on_usage(self, usage: Dict[str, Any]) -> None:
- """
- Called when usage information is received.
-
- Args:
- usage: The usage information
- """
- pass
-
- async def on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
- """
- Called when a screenshot is taken.
-
- Args:
- screenshot: The screenshot image
- name: The name of the screenshot
- """
- pass
-
- async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
- """
- Called when responses are received.
-
- Args:
- kwargs: The kwargs being passed to the agent loop
- responses: The responses received
- """
- pass
\ No newline at end of file
diff --git a/libs/python/agent2/agent/callbacks/budget_manager.py b/libs/python/agent2/agent/callbacks/budget_manager.py
deleted file mode 100644
index bc17c695..00000000
--- a/libs/python/agent2/agent/callbacks/budget_manager.py
+++ /dev/null
@@ -1,44 +0,0 @@
-from typing import Dict, List, Any
-from .base import AsyncCallbackHandler
-
-class BudgetExceededError(Exception):
- """Exception raised when budget is exceeded."""
- pass
-
-class BudgetManagerCallback(AsyncCallbackHandler):
- """Budget manager callback that tracks usage costs and can stop execution when budget is exceeded."""
-
- def __init__(self, max_budget: float, reset_after_each_run: bool = True, raise_error: bool = False):
- """
- Initialize BudgetManagerCallback.
-
- Args:
- max_budget: Maximum budget allowed
- reset_after_each_run: Whether to reset budget after each run
- raise_error: Whether to raise an error when budget is exceeded
- """
- self.max_budget = max_budget
- self.reset_after_each_run = reset_after_each_run
- self.raise_error = raise_error
- self.total_cost = 0.0
-
- async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
- """Reset budget if configured to do so."""
- if self.reset_after_each_run:
- self.total_cost = 0.0
-
- async def on_usage(self, usage: Dict[str, Any]) -> None:
- """Track usage costs."""
- if "response_cost" in usage:
- self.total_cost += usage["response_cost"]
-
- async def on_run_continue(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> bool:
- """Check if budget allows continuation."""
- if self.total_cost >= self.max_budget:
- if self.raise_error:
- raise BudgetExceededError(f"Budget exceeded: ${self.total_cost} >= ${self.max_budget}")
- else:
- print(f"Budget exceeded: ${self.total_cost} >= ${self.max_budget}")
- return False
- return True
-
\ No newline at end of file
diff --git a/libs/python/agent2/agent/callbacks/image_retention.py b/libs/python/agent2/agent/callbacks/image_retention.py
deleted file mode 100644
index d91754b1..00000000
--- a/libs/python/agent2/agent/callbacks/image_retention.py
+++ /dev/null
@@ -1,139 +0,0 @@
-"""
-Image retention callback handler that limits the number of recent images in message history.
-"""
-
-from typing import List, Dict, Any, Optional
-from .base import AsyncCallbackHandler
-
-
-class ImageRetentionCallback(AsyncCallbackHandler):
- """
- Callback handler that applies image retention policy to limit the number
- of recent images in message history to prevent context window overflow.
- """
-
- def __init__(self, only_n_most_recent_images: Optional[int] = None):
- """
- Initialize the image retention callback.
-
- Args:
- only_n_most_recent_images: If set, only keep the N most recent images in message history
- """
- self.only_n_most_recent_images = only_n_most_recent_images
-
- async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
- """
- Apply image retention policy to messages before sending to agent loop.
-
- Args:
- messages: List of message dictionaries
-
- Returns:
- List of messages with image retention policy applied
- """
- if self.only_n_most_recent_images is None:
- return messages
-
- return self._apply_image_retention(messages)
-
- def _apply_image_retention(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
- """Apply image retention policy to keep only the N most recent images.
-
- Removes computer_call_output items with image_url and their corresponding computer_call items,
- keeping only the most recent N image pairs based on only_n_most_recent_images setting.
-
- Args:
- messages: List of message dictionaries
-
- Returns:
- Filtered list of messages with image retention applied
- """
- if self.only_n_most_recent_images is None:
- return messages
-
- # First pass: Assign call_id to reasoning items based on the next computer_call
- messages_with_call_ids = []
- for i, msg in enumerate(messages):
- msg_copy = msg.copy() if isinstance(msg, dict) else msg
-
- # If this is a reasoning item without a call_id, find the next computer_call
- if (msg_copy.get("type") == "reasoning" and
- not msg_copy.get("call_id")):
- # Look ahead for the next computer_call
- for j in range(i + 1, len(messages)):
- next_msg = messages[j]
- if (next_msg.get("type") == "computer_call" and
- next_msg.get("call_id")):
- msg_copy["call_id"] = next_msg.get("call_id")
- break
-
- messages_with_call_ids.append(msg_copy)
-
- # Find all computer_call_output items with images and their call_ids
- image_call_ids = []
- for msg in reversed(messages_with_call_ids): # Process in reverse to get most recent first
- if (msg.get("type") == "computer_call_output" and
- isinstance(msg.get("output"), dict) and
- "image_url" in msg.get("output", {})):
- call_id = msg.get("call_id")
- if call_id and call_id not in image_call_ids:
- image_call_ids.append(call_id)
- if len(image_call_ids) >= self.only_n_most_recent_images:
- break
-
- # Keep the most recent N image call_ids (reverse to get chronological order)
- keep_call_ids = set(image_call_ids[:self.only_n_most_recent_images])
-
- # Filter messages: remove computer_call, computer_call_output, and reasoning for old images
- filtered_messages = []
- for msg in messages_with_call_ids:
- msg_type = msg.get("type")
- call_id = msg.get("call_id")
-
- # Remove old computer_call items
- if msg_type == "computer_call" and call_id not in keep_call_ids:
- # Check if this call_id corresponds to an image call
- has_image_output = any(
- m.get("type") == "computer_call_output" and
- m.get("call_id") == call_id and
- isinstance(m.get("output"), dict) and
- "image_url" in m.get("output", {})
- for m in messages_with_call_ids
- )
- if has_image_output:
- continue # Skip this computer_call
-
- # Remove old computer_call_output items with images
- if (msg_type == "computer_call_output" and
- call_id not in keep_call_ids and
- isinstance(msg.get("output"), dict) and
- "image_url" in msg.get("output", {})):
- continue # Skip this computer_call_output
-
- # Remove old reasoning items that are paired with removed computer calls
- if (msg_type == "reasoning" and
- call_id and call_id not in keep_call_ids):
- # Check if this call_id corresponds to an image call that's being removed
- has_image_output = any(
- m.get("type") == "computer_call_output" and
- m.get("call_id") == call_id and
- isinstance(m.get("output"), dict) and
- "image_url" in m.get("output", {})
- for m in messages_with_call_ids
- )
- if has_image_output:
- continue # Skip this reasoning item
-
- filtered_messages.append(msg)
-
- # Clean up: Remove call_id from reasoning items before returning
- final_messages = []
- for msg in filtered_messages:
- if msg.get("type") == "reasoning" and "call_id" in msg:
- # Create a copy without call_id for reasoning items
- cleaned_msg = {k: v for k, v in msg.items() if k != "call_id"}
- final_messages.append(cleaned_msg)
- else:
- final_messages.append(msg)
-
- return final_messages
\ No newline at end of file
diff --git a/libs/python/agent2/agent/callbacks/logging.py b/libs/python/agent2/agent/callbacks/logging.py
deleted file mode 100644
index af171925..00000000
--- a/libs/python/agent2/agent/callbacks/logging.py
+++ /dev/null
@@ -1,247 +0,0 @@
-"""
-Logging callback for ComputerAgent that provides configurable logging of agent lifecycle events.
-"""
-
-import json
-import logging
-from typing import Dict, List, Any, Optional, Union
-from .base import AsyncCallbackHandler
-
-
-def sanitize_image_urls(data: Any) -> Any:
- """
- Recursively search for 'image_url' keys and set their values to '[omitted]'.
-
- Args:
- data: Any data structure (dict, list, or primitive type)
-
- Returns:
- A deep copy of the data with all 'image_url' values replaced with '[omitted]'
- """
- if isinstance(data, dict):
- # Create a copy of the dictionary
- sanitized = {}
- for key, value in data.items():
- if key == "image_url":
- sanitized[key] = "[omitted]"
- else:
- # Recursively sanitize the value
- sanitized[key] = sanitize_image_urls(value)
- return sanitized
-
- elif isinstance(data, list):
- # Recursively sanitize each item in the list
- return [sanitize_image_urls(item) for item in data]
-
- else:
- # For primitive types (str, int, bool, None, etc.), return as-is
- return data
-
-
-class LoggingCallback(AsyncCallbackHandler):
- """
- Callback handler that logs agent lifecycle events with configurable verbosity.
-
- Logging levels:
- - DEBUG: All events including API calls, message preprocessing, and detailed outputs
- - INFO: Major lifecycle events (start/end, messages, outputs)
- - WARNING: Only warnings and errors
- - ERROR: Only errors
- """
-
- def __init__(self, logger: Optional[logging.Logger] = None, level: int = logging.INFO):
- """
- Initialize the logging callback.
-
- Args:
- logger: Logger instance to use. If None, creates a logger named 'agent.ComputerAgent'
- level: Logging level (logging.DEBUG, logging.INFO, etc.)
- """
- self.logger = logger or logging.getLogger('agent.ComputerAgent')
-        self.level = level
-        self.total_usage = {}  # reset in on_run_start; initialized here so on_usage never sees it unset
-
- # Set up logger if it doesn't have handlers
- if not self.logger.handlers:
- handler = logging.StreamHandler()
- formatter = logging.Formatter(
- '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
- )
- handler.setFormatter(formatter)
- self.logger.addHandler(handler)
- self.logger.setLevel(level)
-
- def _update_usage(self, usage: Dict[str, Any]) -> None:
- """Update total usage statistics."""
- def add_dicts(target: Dict[str, Any], source: Dict[str, Any]) -> None:
- for key, value in source.items():
- if isinstance(value, dict):
- if key not in target:
- target[key] = {}
- add_dicts(target[key], value)
- else:
- if key not in target:
- target[key] = 0
- target[key] += value
- add_dicts(self.total_usage, usage)
-
- async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
- """Called before the run starts."""
- self.total_usage = {}
-
- async def on_usage(self, usage: Dict[str, Any]) -> None:
- """Called when usage information is received."""
- self._update_usage(usage)
-
- async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
- """Called after the run ends."""
- def format_dict(d, indent=0):
- lines = []
- prefix = f" - {' ' * indent}"
- for key, value in d.items():
- if isinstance(value, dict):
- lines.append(f"{prefix}{key}:")
- lines.extend(format_dict(value, indent + 1))
- elif isinstance(value, float):
- lines.append(f"{prefix}{key}: ${value:.4f}")
- else:
- lines.append(f"{prefix}{key}: {value}")
- return lines
-
- formatted_output = "\n".join(format_dict(self.total_usage))
- self.logger.info(f"Total usage:\n{formatted_output}")
-
- async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
- """Called before LLM processing starts."""
- if self.logger.isEnabledFor(logging.INFO):
- self.logger.info(f"LLM processing started with {len(messages)} messages")
- if self.logger.isEnabledFor(logging.DEBUG):
- sanitized_messages = [sanitize_image_urls(msg) for msg in messages]
- self.logger.debug(f"LLM input messages: {json.dumps(sanitized_messages, indent=2)}")
- return messages
-
- async def on_llm_end(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
- """Called after LLM processing ends."""
- if self.logger.isEnabledFor(logging.DEBUG):
- sanitized_messages = [sanitize_image_urls(msg) for msg in messages]
- self.logger.debug(f"LLM output: {json.dumps(sanitized_messages, indent=2)}")
- return messages
-
- async def on_computer_call_start(self, item: Dict[str, Any]) -> None:
- """Called when a computer call starts."""
- action = item.get("action", {})
- action_type = action.get("type", "unknown")
- action_args = {k: v for k, v in action.items() if k != "type"}
-
- # INFO level logging for the action
- self.logger.info(f"Computer: {action_type}({action_args})")
-
- # DEBUG level logging for full details
- if self.logger.isEnabledFor(logging.DEBUG):
- self.logger.debug(f"Computer call started: {json.dumps(action, indent=2)}")
-
- async def on_computer_call_end(self, item: Dict[str, Any], result: Any) -> None:
- """Called when a computer call ends."""
- if self.logger.isEnabledFor(logging.DEBUG):
- action = item.get("action", "unknown")
- self.logger.debug(f"Computer call completed: {json.dumps(action, indent=2)}")
- if result:
- sanitized_result = sanitize_image_urls(result)
- self.logger.debug(f"Computer call result: {json.dumps(sanitized_result, indent=2)}")
-
- async def on_function_call_start(self, item: Dict[str, Any]) -> None:
- """Called when a function call starts."""
- name = item.get("name", "unknown")
- arguments = item.get("arguments", "{}")
-
- # INFO level logging for the function call
- self.logger.info(f"Function: {name}({arguments})")
-
- # DEBUG level logging for full details
- if self.logger.isEnabledFor(logging.DEBUG):
- self.logger.debug(f"Function call started: {name}")
-
- async def on_function_call_end(self, item: Dict[str, Any], result: Any) -> None:
- """Called when a function call ends."""
- # INFO level logging for function output (similar to function_call_output)
- if result:
- # Handle both list and direct result formats
- if isinstance(result, list) and len(result) > 0:
- output = result[0].get("output", str(result)) if isinstance(result[0], dict) else str(result[0])
- else:
- output = str(result)
-
- # Truncate long outputs
- if len(output) > 100:
- output = output[:100] + "..."
-
- self.logger.info(f"Output: {output}")
-
- # DEBUG level logging for full details
- if self.logger.isEnabledFor(logging.DEBUG):
- name = item.get("name", "unknown")
- self.logger.debug(f"Function call completed: {name}")
- if result:
- self.logger.debug(f"Function call result: {json.dumps(result, indent=2)}")
-
- async def on_text(self, item: Dict[str, Any]) -> None:
- """Called when a text message is encountered."""
- # Get the role to determine if it's Agent or User
- role = item.get("role", "unknown")
- content_items = item.get("content", [])
-
- # Process content items to build display text
- text_parts = []
- for content_item in content_items:
- content_type = content_item.get("type", "output_text")
- if content_type == "output_text":
- text_content = content_item.get("text", "")
- if not text_content.strip():
- text_parts.append("[empty]")
- else:
- # Truncate long text and add ellipsis
- if len(text_content) > 2048:
- text_parts.append(text_content[:2048] + "...")
- else:
- text_parts.append(text_content)
- else:
- # Non-text content, show as [type]
- text_parts.append(f"[{content_type}]")
-
- # Join all text parts
- display_text = ''.join(text_parts) if text_parts else "[empty]"
-
- # Log with appropriate level and format
- if role == "assistant":
- self.logger.info(f"Agent: {display_text}")
- elif role == "user":
- self.logger.info(f"User: {display_text}")
- else:
- # Fallback for unknown roles, use debug level
- if self.logger.isEnabledFor(logging.DEBUG):
- self.logger.debug(f"Text message ({role}): {display_text}")
-
- async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
- """Called when an API call is about to start."""
- if self.logger.isEnabledFor(logging.DEBUG):
- model = kwargs.get("model", "unknown")
- self.logger.debug(f"API call starting for model: {model}")
- # Log sanitized messages if present
- if "messages" in kwargs:
- sanitized_messages = sanitize_image_urls(kwargs["messages"])
- self.logger.debug(f"API call messages: {json.dumps(sanitized_messages, indent=2)}")
- elif "input" in kwargs:
- sanitized_input = sanitize_image_urls(kwargs["input"])
- self.logger.debug(f"API call input: {json.dumps(sanitized_input, indent=2)}")
-
- async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
- """Called when an API call has completed."""
- if self.logger.isEnabledFor(logging.DEBUG):
- model = kwargs.get("model", "unknown")
- self.logger.debug(f"API call completed for model: {model}")
- self.logger.debug(f"API call result: {json.dumps(sanitize_image_urls(result), indent=2)}")
-
- async def on_screenshot(self, item: Union[str, bytes], name: str = "screenshot") -> None:
- """Called when a screenshot is taken."""
-        if self.logger.isEnabledFor(logging.DEBUG):
-            # For base64 strings this is the encoded length, not the decoded image size
-            image_size = len(item) / 1024
-            self.logger.debug(f"Screenshot captured: {name} {image_size:.2f} KB")
\ No newline at end of file
diff --git a/libs/python/agent2/agent/callbacks/pii_anonymization.py b/libs/python/agent2/agent/callbacks/pii_anonymization.py
deleted file mode 100644
index f5c31a61..00000000
--- a/libs/python/agent2/agent/callbacks/pii_anonymization.py
+++ /dev/null
@@ -1,259 +0,0 @@
-"""
-PII anonymization callback handler using Microsoft Presidio for text and image redaction.
-"""
-
-from typing import List, Dict, Any, Optional, Tuple
-from .base import AsyncCallbackHandler
-import base64
-import io
-import logging
-
-try:
- from presidio_analyzer import AnalyzerEngine
- from presidio_anonymizer import AnonymizerEngine, DeanonymizeEngine
- from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
- from presidio_image_redactor import ImageRedactorEngine
- from PIL import Image
- PRESIDIO_AVAILABLE = True
-except ImportError:
- PRESIDIO_AVAILABLE = False
-
-logger = logging.getLogger(__name__)
-
-class PIIAnonymizationCallback(AsyncCallbackHandler):
- """
- Callback handler that anonymizes PII in text and images using Microsoft Presidio.
-
- This handler:
- 1. Anonymizes PII in messages before sending to the agent loop
- 2. Deanonymizes PII in tool calls and message outputs after the agent loop
- 3. Redacts PII from images in computer_call_output messages
- """
-
- def __init__(
- self,
- anonymize_text: bool = True,
- anonymize_images: bool = True,
- entities_to_anonymize: Optional[List[str]] = None,
- anonymization_operator: str = "replace",
- image_redaction_color: Tuple[int, int, int] = (255, 192, 203) # Pink
- ):
- """
- Initialize the PII anonymization callback.
-
- Args:
- anonymize_text: Whether to anonymize text content
- anonymize_images: Whether to redact images
- entities_to_anonymize: List of entity types to anonymize (None for all)
- anonymization_operator: Presidio operator to use ("replace", "mask", "redact", etc.)
- image_redaction_color: RGB color for image redaction
- """
- if not PRESIDIO_AVAILABLE:
- raise ImportError(
- "Presidio is not available. Install with: "
- "pip install presidio-analyzer presidio-anonymizer presidio-image-redactor"
- )
-
- self.anonymize_text = anonymize_text
- self.anonymize_images = anonymize_images
- self.entities_to_anonymize = entities_to_anonymize
- self.anonymization_operator = anonymization_operator
- self.image_redaction_color = image_redaction_color
-
- # Initialize Presidio engines
- self.analyzer = AnalyzerEngine()
- self.anonymizer = AnonymizerEngine()
- self.deanonymizer = DeanonymizeEngine()
- self.image_redactor = ImageRedactorEngine()
-
- # Store anonymization mappings for deanonymization
- self.anonymization_mappings: Dict[str, Any] = {}
-
- async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
- """
- Anonymize PII in messages before sending to agent loop.
-
- Args:
- messages: List of message dictionaries
-
- Returns:
- List of messages with PII anonymized
- """
- if not self.anonymize_text and not self.anonymize_images:
- return messages
-
- anonymized_messages = []
- for msg in messages:
- anonymized_msg = await self._anonymize_message(msg)
- anonymized_messages.append(anonymized_msg)
-
- return anonymized_messages
-
- async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
- """
- Deanonymize PII in tool calls and message outputs after agent loop.
-
- Args:
- output: List of output dictionaries
-
- Returns:
- List of output with PII deanonymized for tool calls
- """
- if not self.anonymize_text:
- return output
-
- deanonymized_output = []
- for item in output:
- # Only deanonymize tool calls and computer_call messages
- if item.get("type") in ["computer_call", "computer_call_output"]:
- deanonymized_item = await self._deanonymize_item(item)
- deanonymized_output.append(deanonymized_item)
- else:
- deanonymized_output.append(item)
-
- return deanonymized_output
-
- async def _anonymize_message(self, message: Dict[str, Any]) -> Dict[str, Any]:
- """Anonymize PII in a single message."""
- msg_copy = message.copy()
-
- # Anonymize text content
- if self.anonymize_text:
- msg_copy = await self._anonymize_text_content(msg_copy)
-
- # Redact images in computer_call_output
- if self.anonymize_images and msg_copy.get("type") == "computer_call_output":
- msg_copy = await self._redact_image_content(msg_copy)
-
- return msg_copy
-
- async def _anonymize_text_content(self, message: Dict[str, Any]) -> Dict[str, Any]:
- """Anonymize text content in a message."""
- msg_copy = message.copy()
-
- # Handle content array
- content = msg_copy.get("content", [])
- if isinstance(content, str):
- anonymized_text, _ = await self._anonymize_text(content)
- msg_copy["content"] = anonymized_text
- elif isinstance(content, list):
- anonymized_content = []
- for item in content:
- if isinstance(item, dict) and item.get("type") == "text":
- text = item.get("text", "")
- anonymized_text, _ = await self._anonymize_text(text)
- item_copy = item.copy()
- item_copy["text"] = anonymized_text
- anonymized_content.append(item_copy)
- else:
- anonymized_content.append(item)
- msg_copy["content"] = anonymized_content
-
- return msg_copy
-
- async def _redact_image_content(self, message: Dict[str, Any]) -> Dict[str, Any]:
- """Redact PII from images in computer_call_output messages."""
- msg_copy = message.copy()
- output = msg_copy.get("output", {})
-
- if isinstance(output, dict) and "image_url" in output:
- try:
- # Extract base64 image data
- image_url = output["image_url"]
- if image_url.startswith("data:image/"):
- # Parse data URL
- header, data = image_url.split(",", 1)
- image_data = base64.b64decode(data)
-
- # Load image with PIL
- image = Image.open(io.BytesIO(image_data))
-
- # Redact PII from image
- redacted_image = self.image_redactor.redact(image, self.image_redaction_color)
-
- # Convert back to base64
- buffer = io.BytesIO()
- redacted_image.save(buffer, format="PNG")
- redacted_data = base64.b64encode(buffer.getvalue()).decode()
-
- # Update image URL
- output_copy = output.copy()
- output_copy["image_url"] = f"data:image/png;base64,{redacted_data}"
- msg_copy["output"] = output_copy
-
- except Exception as e:
- logger.warning(f"Failed to redact image: {e}")
-
- return msg_copy
-
- async def _deanonymize_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
- """Deanonymize PII in tool calls and computer outputs."""
- item_copy = item.copy()
-
-        # Handle computer_call action arguments (stored under "action", matching
-        # the computer_call items produced elsewhere in this package)
-        if item.get("type") == "computer_call":
-            action = item_copy.get("action", {})
-            if isinstance(action, dict):
-                deanonymized_action = {}
-                for key, value in action.items():
-                    if isinstance(value, str):
-                        deanonymized_value, _ = await self._deanonymize_text(value)
-                        deanonymized_action[key] = deanonymized_value
-                    else:
-                        deanonymized_action[key] = value
-                item_copy["action"] = deanonymized_action
-
- return item_copy
-
- async def _anonymize_text(self, text: str) -> Tuple[str, List[RecognizerResult]]:
- """Anonymize PII in text and return the anonymized text and results."""
- if not text.strip():
- return text, []
-
- try:
- # Analyze text for PII
- analyzer_results = self.analyzer.analyze(
- text=text,
- entities=self.entities_to_anonymize,
- language="en"
- )
-
- if not analyzer_results:
- return text, []
-
- # Anonymize the text
- anonymized_result = self.anonymizer.anonymize(
- text=text,
- analyzer_results=analyzer_results,
- operators={entity_type: OperatorConfig(self.anonymization_operator)
- for entity_type in set(result.entity_type for result in analyzer_results)}
- )
-
- # Store mapping for deanonymization
- mapping_key = str(hash(text))
- self.anonymization_mappings[mapping_key] = {
- "original": text,
- "anonymized": anonymized_result.text,
- "results": analyzer_results
- }
-
- return anonymized_result.text, analyzer_results
-
- except Exception as e:
- logger.warning(f"Failed to anonymize text: {e}")
- return text, []
-
- async def _deanonymize_text(self, text: str) -> Tuple[str, bool]:
- """Attempt to deanonymize text using stored mappings."""
- try:
- # Look for matching anonymized text in mappings
- for mapping_key, mapping in self.anonymization_mappings.items():
- if mapping["anonymized"] == text:
- return mapping["original"], True
-
- # If no mapping found, return original text
- return text, False
-
- except Exception as e:
- logger.warning(f"Failed to deanonymize text: {e}")
- return text, False
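-
-
-# Usage sketch (assumes ComputerAgent accepts a `callbacks` list of
-# AsyncCallbackHandler instances, and `computer` is a connected Computer tool):
-#
-#     from agent import ComputerAgent
-#
-#     agent = ComputerAgent(
-#         model="anthropic/claude-3-5-sonnet-20241022",
-#         tools=[computer],
-#         callbacks=[PIIAnonymizationCallback(anonymization_operator="replace")],
-#     )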
diff --git a/libs/python/agent2/agent/callbacks/trajectory_saver.py b/libs/python/agent2/agent/callbacks/trajectory_saver.py
deleted file mode 100644
index b59563d5..00000000
--- a/libs/python/agent2/agent/callbacks/trajectory_saver.py
+++ /dev/null
@@ -1,305 +0,0 @@
-"""
-Trajectory saving callback handler for ComputerAgent.
-"""
-
-import os
-import json
-import uuid
-from datetime import datetime
-import base64
-from pathlib import Path
-from typing import List, Dict, Any, Optional, Union, override
-from PIL import Image, ImageDraw
-import io
-from .base import AsyncCallbackHandler
-
-def sanitize_image_urls(data: Any) -> Any:
- """
- Recursively search for 'image_url' keys and set their values to '[omitted]'.
-
- Args:
- data: Any data structure (dict, list, or primitive type)
-
- Returns:
- A deep copy of the data with all 'image_url' values replaced with '[omitted]'
- """
- if isinstance(data, dict):
- # Create a copy of the dictionary
- sanitized = {}
- for key, value in data.items():
- if key == "image_url":
- sanitized[key] = "[omitted]"
- else:
- # Recursively sanitize the value
- sanitized[key] = sanitize_image_urls(value)
- return sanitized
-
- elif isinstance(data, list):
- # Recursively sanitize each item in the list
- return [sanitize_image_urls(item) for item in data]
-
- else:
- # For primitive types (str, int, bool, None, etc.), return as-is
- return data
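-
-# Example (sketch) of sanitizing a computer_call_output item before writing it out:
-#
-#     >>> sanitize_image_urls({"output": {"type": "input_image",
-#     ...                                 "image_url": "data:image/png;base64,iVBOR..."}})
-#     {'output': {'type': 'input_image', 'image_url': '[omitted]'}}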
-
-
-class TrajectorySaverCallback(AsyncCallbackHandler):
- """
- Callback handler that saves agent trajectories to disk.
-
- Saves each run as a separate trajectory with unique ID, and each turn
- within the trajectory gets its own folder with screenshots and responses.
- """
-
- def __init__(self, trajectory_dir: str):
- """
- Initialize trajectory saver.
-
- Args:
- trajectory_dir: Base directory to save trajectories
- """
- self.trajectory_dir = Path(trajectory_dir)
- self.trajectory_id: Optional[str] = None
- self.current_turn: int = 0
- self.current_artifact: int = 0
- self.model: Optional[str] = None
- self.total_usage: Dict[str, Any] = {}
-
- # Ensure trajectory directory exists
- self.trajectory_dir.mkdir(parents=True, exist_ok=True)
-
- def _get_turn_dir(self) -> Path:
- """Get the directory for the current turn."""
- if not self.trajectory_id:
- raise ValueError("Trajectory not initialized - call _on_run_start first")
-
- # format: trajectory_id/turn_000
- turn_dir = self.trajectory_dir / self.trajectory_id / f"turn_{self.current_turn:03d}"
- turn_dir.mkdir(parents=True, exist_ok=True)
- return turn_dir
-
- def _save_artifact(self, name: str, artifact: Union[str, bytes, Dict[str, Any]]) -> None:
- """Save an artifact to the current turn directory."""
- turn_dir = self._get_turn_dir()
- if isinstance(artifact, bytes):
- # format: turn_000/0000_name.png
- artifact_filename = f"{self.current_artifact:04d}_{name}"
- artifact_path = turn_dir / f"{artifact_filename}.png"
- with open(artifact_path, "wb") as f:
- f.write(artifact)
- else:
- # format: turn_000/0000_name.json
- artifact_filename = f"{self.current_artifact:04d}_{name}"
- artifact_path = turn_dir / f"{artifact_filename}.json"
- with open(artifact_path, "w") as f:
- json.dump(sanitize_image_urls(artifact), f, indent=2)
- self.current_artifact += 1
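-
-    # Resulting on-disk layout (illustrative, for model
-    # "anthropic/claude-3-5-sonnet-20241022"):
-    #
-    #   trajectories/
-    #     2024-01-01_claude-3-5-sonne_120000_ab12/
-    #       metadata.json
-    #       turn_000/
-    #         0000_api_start.json
-    #         0001_agent_response.json
-    #         0002_screenshot.png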
-
- def _update_usage(self, usage: Dict[str, Any]) -> None:
- """Update total usage statistics."""
- def add_dicts(target: Dict[str, Any], source: Dict[str, Any]) -> None:
- for key, value in source.items():
- if isinstance(value, dict):
- if key not in target:
- target[key] = {}
- add_dicts(target[key], value)
- else:
- if key not in target:
- target[key] = 0
- target[key] += value
- add_dicts(self.total_usage, usage)
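-
-        # Example (illustrative): {"input_tokens": 10, "cost": 0.01} followed by
-        # {"input_tokens": 5, "cost": 0.02} leaves self.total_usage as
-        # {"input_tokens": 15, "cost": 0.03}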
-
- @override
- async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
- """Initialize trajectory tracking for a new run."""
- model = kwargs.get("model", "unknown")
- model_name_short = model.split("+")[-1].split("/")[-1].lower()[:16]
- if "+" in model:
- model_name_short = model.split("+")[0].lower()[:4] + "_" + model_name_short
-
- # id format: yyyy-mm-dd_model_hhmmss_uuid[:4]
- now = datetime.now()
- self.trajectory_id = f"{now.strftime('%Y-%m-%d')}_{model_name_short}_{now.strftime('%H%M%S')}_{str(uuid.uuid4())[:4]}"
- self.current_turn = 0
- self.current_artifact = 0
- self.model = model
- self.total_usage = {}
-
- # Create trajectory directory
- trajectory_path = self.trajectory_dir / self.trajectory_id
- trajectory_path.mkdir(parents=True, exist_ok=True)
-
- # Save trajectory metadata
- metadata = {
- "trajectory_id": self.trajectory_id,
-            "created_at": now.isoformat(),
- "status": "running",
- "kwargs": kwargs,
- }
-
- with open(trajectory_path / "metadata.json", "w") as f:
- json.dump(metadata, f, indent=2)
-
- @override
- async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
- """Finalize run tracking by updating metadata with completion status, usage, and new items."""
- if not self.trajectory_id:
- return
-
- # Update metadata with completion status, total usage, and new items
- trajectory_path = self.trajectory_dir / self.trajectory_id
- metadata_path = trajectory_path / "metadata.json"
-
- # Read existing metadata
- if metadata_path.exists():
- with open(metadata_path, "r") as f:
- metadata = json.load(f)
- else:
- metadata = {}
-
- # Update metadata with completion info
- metadata.update({
- "status": "completed",
-            "completed_at": datetime.now().isoformat(),
- "total_usage": self.total_usage,
- "new_items": sanitize_image_urls(new_items),
- "total_turns": self.current_turn
- })
-
- # Save updated metadata
- with open(metadata_path, "w") as f:
- json.dump(metadata, f, indent=2)
-
- @override
- async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
- if not self.trajectory_id:
- return
-
- self._save_artifact("api_start", { "kwargs": kwargs })
-
- @override
- async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
- """Save API call result."""
- if not self.trajectory_id:
- return
-
- self._save_artifact("api_result", { "kwargs": kwargs, "result": result })
-
- @override
- async def on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
-        """Save a screenshot."""
-        if not self.trajectory_id:
-            return
-        if isinstance(screenshot, str):
-            screenshot = base64.b64decode(screenshot)
-        self._save_artifact(name, screenshot)
-
- @override
- async def on_usage(self, usage: Dict[str, Any]) -> None:
- """Called when usage information is received."""
- self._update_usage(usage)
-
- @override
- async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
- """Save responses to the current turn directory and update usage statistics."""
- if not self.trajectory_id:
- return
-
-        # Save responses
- response_data = {
-            "timestamp": datetime.now().isoformat(),
- "model": self.model,
- "kwargs": kwargs,
- "response": responses
- }
-
- self._save_artifact("agent_response", response_data)
-
- # Increment turn counter
- self.current_turn += 1
-
- def _draw_crosshair_on_image(self, image_bytes: bytes, x: int, y: int) -> bytes:
- """
- Draw a red dot and crosshair at the specified coordinates on the image.
-
- Args:
- image_bytes: The original image as bytes
- x: X coordinate for the crosshair
- y: Y coordinate for the crosshair
-
- Returns:
- Modified image as bytes with red dot and crosshair
- """
- # Open the image
- image = Image.open(io.BytesIO(image_bytes))
- draw = ImageDraw.Draw(image)
-
- # Draw crosshair lines (red, 2px thick)
- crosshair_size = 20
- line_width = 2
- color = "red"
-
- # Horizontal line
- draw.line([(x - crosshair_size, y), (x + crosshair_size, y)], fill=color, width=line_width)
- # Vertical line
- draw.line([(x, y - crosshair_size), (x, y + crosshair_size)], fill=color, width=line_width)
-
- # Draw center dot (filled circle)
- dot_radius = 3
- draw.ellipse([(x - dot_radius, y - dot_radius), (x + dot_radius, y + dot_radius)], fill=color)
-
- # Convert back to bytes
- output = io.BytesIO()
- image.save(output, format='PNG')
- return output.getvalue()
-
- @override
- async def on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
- """
- Called when a computer call has completed.
- Saves screenshots and computer call output.
- """
- if not self.trajectory_id:
- return
-
- self._save_artifact("computer_call_result", { "item": item, "result": result })
-
- # Check if action has x/y coordinates and there's a screenshot in the result
- action = item.get("action", {})
- if "x" in action and "y" in action:
- # Look for screenshot in the result
- for result_item in result:
- if (result_item.get("type") == "computer_call_output" and
- result_item.get("output", {}).get("type") == "input_image"):
-
- image_url = result_item["output"]["image_url"]
-
- # Extract base64 image data
- if image_url.startswith("data:image/"):
- # Format: data:image/png;base64,
- base64_data = image_url.split(",", 1)[1]
- else:
- # Assume it's just base64 data
- base64_data = image_url
-
- try:
- # Decode the image
- image_bytes = base64.b64decode(base64_data)
-
- # Draw crosshair at the action coordinates
- annotated_image = self._draw_crosshair_on_image(
- image_bytes,
- int(action["x"]),
- int(action["y"])
- )
-
- # Save as screenshot_action
- self._save_artifact("screenshot_action", annotated_image)
-
- except Exception as e:
- # If annotation fails, just log and continue
- print(f"Failed to annotate screenshot: {e}")
-
- break # Only process the first screenshot found
-
- # Increment turn counter
- self.current_turn += 1
\ No newline at end of file
diff --git a/libs/python/agent2/agent/cli.py b/libs/python/agent2/agent/cli.py
deleted file mode 100644
index 8656e86f..00000000
--- a/libs/python/agent2/agent/cli.py
+++ /dev/null
@@ -1,297 +0,0 @@
-"""
-CLI chat interface for agent - Computer Use Agent
-
-Usage:
- python -m agent.cli
-
-Examples:
- python -m agent.cli openai/computer-use-preview
- python -m agent.cli anthropic/claude-3-5-sonnet-20241022
- python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
-"""
-
-try:
- import asyncio
- import argparse
- import os
- import sys
- import json
- from typing import List, Dict, Any
- import dotenv
- from yaspin import yaspin
-except ImportError:
- if __name__ == "__main__":
- raise ImportError(
- "CLI dependencies not found. "
- "Please install with: pip install \"cua-agent[cli]\""
- )
-
-# Load environment variables
-dotenv.load_dotenv()
-
-# Color codes for terminal output
-class Colors:
- RESET = '\033[0m'
- BOLD = '\033[1m'
- DIM = '\033[2m'
-
- # Text colors
- RED = '\033[31m'
- GREEN = '\033[32m'
- YELLOW = '\033[33m'
- BLUE = '\033[34m'
- MAGENTA = '\033[35m'
- CYAN = '\033[36m'
- WHITE = '\033[37m'
- GRAY = '\033[90m'
-
- # Background colors
- BG_RED = '\033[41m'
- BG_GREEN = '\033[42m'
- BG_YELLOW = '\033[43m'
- BG_BLUE = '\033[44m'
-
-
-def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = False, end: str = "\n"):
- """Print colored text to terminal."""
- prefix = ""
- if bold:
- prefix += Colors.BOLD
- if dim:
- prefix += Colors.DIM
- if color:
- prefix += color
-
- print(f"{prefix}{text}{Colors.RESET}", end=end)
-
-
-def print_action(action_type: str, details: Dict[str, Any]):
- """Print computer action with nice formatting."""
- # Format action details
- args_str = ""
- if action_type == "click" and "x" in details and "y" in details:
- args_str = f"({details['x']}, {details['y']})"
- elif action_type == "type" and "text" in details:
- text = details["text"]
- if len(text) > 50:
- text = text[:47] + "..."
- args_str = f'"{text}"'
- elif action_type == "key" and "key" in details:
- args_str = f"'{details['key']}'"
- elif action_type == "scroll" and "x" in details and "y" in details:
- args_str = f"({details['x']}, {details['y']})"
-
-    print_colored(f"🛠️ {action_type}{args_str}", dim=True)
-
-
-def print_welcome(model: str, agent_loop: str, container_name: str):
- """Print welcome message."""
- print_colored(f"Connected to {container_name} ({model}, {agent_loop})")
- print_colored("Type 'exit' to quit.", dim=True)
-
-async def ainput(prompt: str = ""):
- return await asyncio.to_thread(input, prompt)
-
-async def chat_loop(agent, model: str, container_name: str):
- """Main chat loop with the agent."""
- print_welcome(model, agent.agent_loop.__name__, container_name)
-
- history = []
-
- while True:
- # Get user input with prompt
- print_colored("> ", end="")
- user_input = await ainput()
-
- if user_input.lower() in ['exit', 'quit', 'q']:
-            print_colored("\n👋 Goodbye!")
- break
-
- if not user_input:
- continue
-
- # Add user message to history
- history.append({"role": "user", "content": user_input})
-
- # Stream responses from the agent with spinner
- with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
- spinner.hide()
-
- async for result in agent.run(history):
- # Add agent responses to history
- history.extend(result.get("output", []))
-
- # Process and display the output
- for item in result.get("output", []):
- if item.get("type") == "message":
- # Display agent text response
- content = item.get("content", [])
- for content_part in content:
- if content_part.get("text"):
- text = content_part.get("text", "").strip()
- if text:
- spinner.hide()
- print_colored(text)
-
- elif item.get("type") == "computer_call":
- # Display computer action
- action = item.get("action", {})
- action_type = action.get("type", "")
- if action_type:
- spinner.hide()
- print_action(action_type, action)
- spinner.text = f"Performing {action_type}..."
- spinner.show()
-
- elif item.get("type") == "function_call":
- # Display function call
- function_name = item.get("name", "")
- spinner.hide()
-                        print_colored(f"🔧 Calling function: {function_name}", dim=True)
- spinner.text = f"Calling {function_name}..."
- spinner.show()
-
- elif item.get("type") == "function_call_output":
- # Display function output (dimmed)
- output = item.get("output", "")
- if output and len(output.strip()) > 0:
- spinner.hide()
-                            print_colored(f"📤 {output}", dim=True)
-
- spinner.hide()
-
-
-async def main():
- """Main CLI function."""
- parser = argparse.ArgumentParser(
- description="CUA Agent CLI - Interactive computer use assistant",
- formatter_class=argparse.RawDescriptionHelpFormatter,
- epilog="""
-Examples:
- python -m agent.cli openai/computer-use-preview
- python -m agent.cli anthropic/claude-3-5-sonnet-20241022
- python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
- python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
- """
- )
-
- parser.add_argument(
- "model",
- help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')"
- )
-
- parser.add_argument(
- "--images",
- type=int,
- default=3,
- help="Number of recent images to keep in context (default: 3)"
- )
-
- parser.add_argument(
- "--trajectory",
- action="store_true",
- help="Save trajectory for debugging"
- )
-
- parser.add_argument(
- "--budget",
- type=float,
- help="Maximum budget for the session (in dollars)"
- )
-
- parser.add_argument(
- "--verbose",
- action="store_true",
- help="Enable verbose logging"
- )
-
- args = parser.parse_args()
-
- # Check for required environment variables
- container_name = os.getenv("CUA_CONTAINER_NAME")
- cua_api_key = os.getenv("CUA_API_KEY")
-
- # Prompt for missing environment variables
- if not container_name:
- print_colored("CUA_CONTAINER_NAME not set.", dim=True)
- print_colored("You can get a CUA container at https://www.trycua.com/", dim=True)
- container_name = input("Enter your CUA container name: ").strip()
- if not container_name:
-            print_colored("❌ Container name is required.")
- sys.exit(1)
-
- if not cua_api_key:
- print_colored("CUA_API_KEY not set.", dim=True)
- cua_api_key = input("Enter your CUA API key: ").strip()
- if not cua_api_key:
-            print_colored("❌ API key is required.")
- sys.exit(1)
-
-    # Check for provider-specific API keys based on model; omniparser models are
-    # keyed by their underlying provider
-    provider_api_keys = {
-        "openai/": "OPENAI_API_KEY",
-        "anthropic/": "ANTHROPIC_API_KEY",
-        "omniparser+openai/": "OPENAI_API_KEY",
-        "omniparser+anthropic/": "ANTHROPIC_API_KEY",
-    }
-
- # Find matching provider and check for API key
- for prefix, env_var in provider_api_keys.items():
- if args.model.startswith(prefix):
- if not os.getenv(env_var):
- print_colored(f"{env_var} not set.", dim=True)
- api_key = input(f"Enter your {env_var.replace('_', ' ').title()}: ").strip()
- if not api_key:
-                    print_colored(f"❌ {env_var.replace('_', ' ').title()} is required.")
- sys.exit(1)
- # Set the environment variable for the session
- os.environ[env_var] = api_key
- break
-
- # Import here to avoid import errors if dependencies are missing
- try:
- from agent import ComputerAgent
- from computer import Computer
- except ImportError as e:
-        print_colored(f"❌ Import error: {e}", Colors.RED, bold=True)
- print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
- sys.exit(1)
-
- # Create computer instance
- async with Computer(
- os_type="linux",
- provider_type="cloud",
- name=container_name,
- api_key=cua_api_key
- ) as computer:
-
- # Create agent
- agent_kwargs = {
- "model": args.model,
- "tools": [computer],
- "only_n_most_recent_images": args.images,
-        "verbosity": 20 if args.verbose else 30,  # INFO vs WARNING
- }
-
- if args.trajectory:
- agent_kwargs["trajectory_dir"] = "trajectories"
-
- if args.budget:
- agent_kwargs["max_trajectory_budget"] = {
- "max_budget": args.budget,
- "raise_error": True,
- "reset_after_each_run": False
- }
-
- agent = ComputerAgent(**agent_kwargs)
-
- # Start chat loop
- await chat_loop(agent, args.model, container_name)
-
-
-if __name__ == "__main__":
- try:
- asyncio.run(main())
-    except (KeyboardInterrupt, EOFError):
-        print_colored("\n\n👋 Goodbye!")
\ No newline at end of file
diff --git a/libs/python/agent2/agent/computer_handler.py b/libs/python/agent2/agent/computer_handler.py
deleted file mode 100644
index 4a9f0186..00000000
--- a/libs/python/agent2/agent/computer_handler.py
+++ /dev/null
@@ -1,107 +0,0 @@
-"""
-Computer handler implementation for OpenAI computer-use-preview protocol.
-"""
-
-import base64
-from typing import Dict, List, Any, Literal
-from .types import Computer
-
-
-class OpenAIComputerHandler:
- """Computer handler that implements the Computer protocol using the computer interface."""
-
- def __init__(self, computer_interface):
- """Initialize with a computer interface (from tool schema)."""
- self.interface = computer_interface
-
- async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
- """Get the current environment type."""
- # For now, return a default - this could be enhanced to detect actual environment
- return "windows"
-
-                "description": "The mouse button to use for click action (left, right, wheel, back, forward). Default: left",
- """Get screen dimensions as (width, height)."""
- screen_size = await self.interface.get_screen_size()
- return screen_size["width"], screen_size["height"]
-
- async def screenshot(self) -> str:
- """Take a screenshot and return as base64 string."""
- screenshot_bytes = await self.interface.screenshot()
- return base64.b64encode(screenshot_bytes).decode('utf-8')
-
- async def click(self, x: int, y: int, button: str = "left") -> None:
- """Click at coordinates with specified button."""
- if button == "left":
- await self.interface.left_click(x, y)
- elif button == "right":
- await self.interface.right_click(x, y)
- else:
- # Default to left click for unknown buttons
- await self.interface.left_click(x, y)
-
- async def double_click(self, x: int, y: int) -> None:
- """Double click at coordinates."""
- await self.interface.double_click(x, y)
-
- async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
- """Scroll at coordinates with specified scroll amounts."""
- await self.interface.move_cursor(x, y)
- await self.interface.scroll(scroll_x, scroll_y)
-
- async def type(self, text: str) -> None:
- """Type text."""
- await self.interface.type_text(text)
-
- async def wait(self, ms: int = 1000) -> None:
- """Wait for specified milliseconds."""
- import asyncio
- await asyncio.sleep(ms / 1000.0)
-
- async def move(self, x: int, y: int) -> None:
- """Move cursor to coordinates."""
- await self.interface.move_cursor(x, y)
-
- async def keypress(self, keys: List[str]) -> None:
- """Press key combination."""
- if len(keys) == 1:
- await self.interface.press_key(keys[0])
- else:
- # Handle key combinations
- await self.interface.hotkey(*keys)
-
- async def drag(self, path: List[Dict[str, int]]) -> None:
- """Drag along specified path."""
- if not path:
- return
-
- # Start drag from first point
- start = path[0]
- await self.interface.mouse_down(start["x"], start["y"])
-
- # Move through path
- for point in path[1:]:
- await self.interface.move_cursor(point["x"], point["y"])
-
- # End drag at last point
- end = path[-1]
- await self.interface.mouse_up(end["x"], end["y"])
-
- async def get_current_url(self) -> str:
- """Get current URL (for browser environments)."""
- # This would need to be implemented based on the specific browser interface
- # For now, return empty string
- return ""
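-
-    # Usage sketch (assumes a Computer SDK interface exposing the methods used
-    # above, e.g. `computer.interface`):
-    #
-    #     handler = OpenAIComputerHandler(computer.interface)
-    #     image_b64 = await handler.screenshot()
-    #     await handler.click(100, 200, button="left")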
-
-
-def acknowledge_safety_check_callback(message: str) -> bool:
- """Safety check callback for user acknowledgment."""
- response = input(
- f"Safety Check Warning: {message}\nDo you want to acknowledge and proceed? (y/n): "
- ).lower()
- return response.strip() == "y"
-
-
-def check_blocklisted_url(url: str) -> None:
- """Check if URL is blocklisted (placeholder implementation)."""
- # This would contain actual URL checking logic
- pass
diff --git a/libs/python/agent2/agent/decorators.py b/libs/python/agent2/agent/decorators.py
deleted file mode 100644
index 0b31c25a..00000000
--- a/libs/python/agent2/agent/decorators.py
+++ /dev/null
@@ -1,90 +0,0 @@
-"""
-Decorators for agent - agent_loop decorator
-"""
-
-import asyncio
-import inspect
-from typing import Dict, List, Any, Callable, Optional
-from functools import wraps
-
-from .types import AgentLoopInfo
-
-# Global registry
-_agent_loops: List[AgentLoopInfo] = []
-
-def agent_loop(models: str, priority: int = 0):
- """
- Decorator to register an agent loop function.
-
- Args:
- models: Regex pattern to match supported models
- priority: Priority for loop selection (higher = more priority)
- """
- def decorator(func: Callable):
- # Validate function signature
- sig = inspect.signature(func)
- required_params = {'messages', 'model'}
- func_params = set(sig.parameters.keys())
-
- if not required_params.issubset(func_params):
- missing = required_params - func_params
- raise ValueError(f"Agent loop function must have parameters: {missing}")
-
- # Register the loop
- loop_info = AgentLoopInfo(
- func=func,
- models_regex=models,
- priority=priority
- )
- _agent_loops.append(loop_info)
-
- # Sort by priority (highest first)
- _agent_loops.sort(key=lambda x: x.priority, reverse=True)
-
- @wraps(func)
- async def wrapper(*args, **kwargs):
-            # Run the loop in a separate task and relay its result through a queue,
-            # so cancelling this wrapper can also cancel the inner task
- queue = asyncio.Queue()
- task = None
-
- try:
- # Create a task that can be cancelled
- async def run_loop():
- try:
- result = await func(*args, **kwargs)
- await queue.put(('result', result))
- except Exception as e:
- await queue.put(('error', e))
-
- task = asyncio.create_task(run_loop())
-
- # Wait for result or cancellation
- event_type, data = await queue.get()
-
- if event_type == 'error':
- raise data
- return data
-
- except asyncio.CancelledError:
- if task:
- task.cancel()
- try:
- await task
- except asyncio.CancelledError:
- pass
- raise
-
- return wrapper
-
- return decorator
-
-def get_agent_loops() -> List[AgentLoopInfo]:
- """Get all registered agent loops"""
- return _agent_loops.copy()
-
-def find_agent_loop(model: str) -> Optional[AgentLoopInfo]:
- """Find the best matching agent loop for a model"""
- for loop_info in _agent_loops:
- if loop_info.matches_model(model):
- return loop_info
- return None
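-
-
-# Usage sketch (illustrative custom loop; `messages` and `model` are required
-# parameter names, and the return shape mirrors the loops in agent/loops/):
-#
-#     @agent_loop(models=r"my-provider/.*", priority=10)
-#     async def my_loop(messages, model, tools=None, **kwargs):
-#         ...  # call the model and translate its output to responses_items
-#         return {"output": [], "usage": {}}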
diff --git a/libs/python/agent2/agent/loops/__init__.py b/libs/python/agent2/agent/loops/__init__.py
deleted file mode 100644
index aa159411..00000000
--- a/libs/python/agent2/agent/loops/__init__.py
+++ /dev/null
@@ -1,11 +0,0 @@
-"""
-Agent loops for agent
-"""
-
-# Import the loops to register them
-from . import anthropic
-from . import openai
-from . import uitars
-from . import omniparser
-
-__all__ = ["anthropic", "openai", "uitars", "omniparser"]
diff --git a/libs/python/agent2/agent/loops/anthropic.py b/libs/python/agent2/agent/loops/anthropic.py
deleted file mode 100644
index 23a587f5..00000000
--- a/libs/python/agent2/agent/loops/anthropic.py
+++ /dev/null
@@ -1,728 +0,0 @@
-"""
-Anthropic hosted tools agent loop implementation using liteLLM
-"""
-
-import asyncio
-import json
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional
-import litellm
-from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
-
-from ..decorators import agent_loop
-from ..types import Messages, AgentResponse, Tools
-from ..responses import (
- make_reasoning_item,
- make_output_text_item,
- make_click_item,
- make_double_click_item,
- make_drag_item,
- make_keypress_item,
- make_move_item,
- make_scroll_item,
- make_type_item,
- make_wait_item,
- make_input_image_item,
- make_screenshot_item
-)
-
-# Model version mapping to tool version and beta flag
-MODEL_TOOL_MAPPING = [
- # Claude 4 models
- {
- "pattern": r"claude-4|claude-opus-4|claude-sonnet-4",
- "tool_version": "computer_20250124",
- "beta_flag": "computer-use-2025-01-24"
- },
- # Claude 3.7 models
- {
- "pattern": r"claude-3\.?7|claude-3-7",
- "tool_version": "computer_20250124",
- "beta_flag": "computer-use-2025-01-24"
- },
- # Claude 3.5 models (fallback)
- {
- "pattern": r"claude-3\.?5|claude-3-5",
- "tool_version": "computer_20241022",
- "beta_flag": "computer-use-2024-10-22"
- }
-]
-
-def _get_tool_config_for_model(model: str) -> Dict[str, str]:
- """Get tool version and beta flag for the given model."""
- import re
-
- for mapping in MODEL_TOOL_MAPPING:
- if re.search(mapping["pattern"], model, re.IGNORECASE):
- return {
- "tool_version": mapping["tool_version"],
- "beta_flag": mapping["beta_flag"]
- }
-
- # Default to Claude 3.5 configuration
- return {
- "tool_version": "computer_20241022",
- "beta_flag": "computer-use-2024-10-22"
- }
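-
-# Example: _get_tool_config_for_model("anthropic/claude-sonnet-4-20250514")
-# returns {"tool_version": "computer_20250124", "beta_flag": "computer-use-2025-01-24"}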
-
-def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str) -> Dict[str, Any]:
- """Map a computer tool to Anthropic's hosted tool schema."""
- return {
- "type": tool_version,
- "function": {
- "name": "computer",
- "parameters": {
- "display_height_px": getattr(computer_tool, 'display_height', 768),
- "display_width_px": getattr(computer_tool, 'display_width', 1024),
- "display_number": getattr(computer_tool, 'display_number', 1),
- },
- },
- }
-
-def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str) -> Tools:
- """Prepare tools for Anthropic API format."""
- tool_config = _get_tool_config_for_model(model)
- anthropic_tools = []
-
- for schema in tool_schemas:
- if schema["type"] == "computer":
- # Map computer tool to Anthropic format
- anthropic_tools.append(_map_computer_tool_to_anthropic(
- schema["computer"],
- tool_config["tool_version"]
- ))
- elif schema["type"] == "function":
- # Function tools - convert to Anthropic format
- function_schema = schema["function"]
- anthropic_tools.append({
- "type": "function",
- "function": {
- "name": function_schema["name"],
- "description": function_schema.get("description", ""),
- "parameters": function_schema.get("parameters", {})
- }
- })
-
- return anthropic_tools
-
-def _convert_responses_items_to_completion_messages(messages: Messages) -> List[Dict[str, Any]]:
- """Convert responses_items message format to liteLLM completion format."""
- completion_messages = []
-
- for message in messages:
- msg_type = message.get("type")
- role = message.get("role")
-
- # Handle user messages (both with and without explicit type)
- if role == "user" or msg_type == "user":
- content = message.get("content", "")
- if isinstance(content, list):
- # Multi-modal content - convert input_image to image format
- converted_content = []
- for item in content:
- if isinstance(item, dict) and item.get("type") == "input_image":
- # Convert input_image to Anthropic image format
- image_url = item.get("image_url", "")
- if image_url and image_url != "[omitted]":
- # Extract base64 data from data URL
- if "," in image_url:
- base64_data = image_url.split(",")[-1]
- else:
- base64_data = image_url
-
- converted_content.append({
- "type": "image",
- "source": {
- "type": "base64",
- "media_type": "image/png",
- "data": base64_data
- }
- })
- else:
- # Keep other content types as-is
- converted_content.append(item)
-
- completion_messages.append({
- "role": "user",
- "content": converted_content if converted_content else content
- })
- else:
- # Text content
- completion_messages.append({
- "role": "user",
- "content": content
- })
-
- # Handle assistant messages
- elif role == "assistant":
- content = message.get("content", [])
- if isinstance(content, str):
- content = [{ "type": "output_text", "text": content }]
-
- content = "\n".join(item.get("text", "") for item in content)
- completion_messages.append({
- "role": "assistant",
- "content": content
- })
-
- elif msg_type == "reasoning":
- # Reasoning becomes part of assistant message
- summary = message.get("summary", [])
- reasoning_text = ""
-
- if isinstance(summary, list) and summary:
- # Extract text from summary items
- for item in summary:
- if isinstance(item, dict) and item.get("type") == "summary_text":
- reasoning_text = item.get("text", "")
- break
- else:
- # Fallback to direct reasoning field
- reasoning_text = message.get("reasoning", "")
-
- if reasoning_text:
- completion_messages.append({
- "role": "assistant",
- "content": reasoning_text
- })
-
- elif msg_type == "computer_call":
- # Computer call becomes tool use in assistant message
- action = message.get("action", {})
- action_type = action.get("type")
- call_id = message.get("call_id", "call_1")
-
- tool_use_content = []
-
- if action_type == "click":
- tool_use_content.append({
- "type": "tool_use",
- "id": call_id,
- "name": "computer",
- "input": {
- "action": "click",
- "coordinate": [action.get("x", 0), action.get("y", 0)]
- }
- })
- elif action_type == "type":
- tool_use_content.append({
- "type": "tool_use",
- "id": call_id,
- "name": "computer",
- "input": {
- "action": "type",
- "text": action.get("text", "")
- }
- })
- elif action_type == "key":
- tool_use_content.append({
- "type": "tool_use",
- "id": call_id,
- "name": "computer",
- "input": {
- "action": "key",
- "key": action.get("key", "")
- }
- })
-            elif action_type == "wait":
-                tool_use_content.append({
-                    "type": "tool_use",
-                    "id": call_id,
-                    "name": "computer",
-                    "input": {
-                        "action": "wait"
-                    }
-                })
- elif action_type == "screenshot":
- tool_use_content.append({
- "type": "tool_use",
- "id": call_id,
- "name": "computer",
- "input": {
- "action": "screenshot"
- }
- })
-
- # Convert tool_use_content to OpenAI tool_calls format
- openai_tool_calls = []
- for tool_use in tool_use_content:
- openai_tool_calls.append({
- "id": tool_use["id"],
- "type": "function",
- "function": {
- "name": tool_use["name"],
- "arguments": json.dumps(tool_use["input"])
- }
- })
-
- # If the last completion message is an assistant message, extend the tool_calls
- if completion_messages and completion_messages[-1].get("role") == "assistant":
- if "tool_calls" not in completion_messages[-1]:
- completion_messages[-1]["tool_calls"] = []
- completion_messages[-1]["tool_calls"].extend(openai_tool_calls)
- else:
- # Create new assistant message with tool calls
- completion_messages.append({
- "role": "assistant",
- "content": None,
- "tool_calls": openai_tool_calls
- })
-
- elif msg_type == "computer_call_output":
- # Computer call output becomes OpenAI function result
- output = message.get("output", {})
- call_id = message.get("call_id", "call_1")
-
- if output.get("type") == "input_image":
- # Screenshot result - convert to OpenAI format with image_url content
- image_url = output.get("image_url", "")
- completion_messages.append({
- "role": "function",
- "name": "computer",
- "tool_call_id": call_id,
- "content": [{
- "type": "image_url",
- "image_url": {
- "url": image_url
- }
- }]
- })
- else:
- # Text result - convert to OpenAI format
- completion_messages.append({
- "role": "function",
- "name": "computer",
- "tool_call_id": call_id,
- "content": str(output)
- })
-
- return completion_messages
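-
-# Example (illustrative): a responses_items click becomes an OpenAI-style
-# assistant tool call:
-#
-#     {"type": "computer_call", "call_id": "c1",
-#      "action": {"type": "click", "x": 10, "y": 20}}
-#     -> {"role": "assistant", "content": None, "tool_calls": [{"id": "c1",
-#         "type": "function", "function": {"name": "computer",
-#         "arguments": '{"action": "click", "coordinate": [10, 20]}'}}]}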
-
-def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]]:
- """Convert liteLLM completion response to responses_items message format."""
- responses_items = []
-
- if not response or not hasattr(response, 'choices') or not response.choices:
- return responses_items
-
- choice = response.choices[0]
- message = choice.message
-
- # Handle text content
- if hasattr(message, 'content') and message.content:
- if isinstance(message.content, str):
- responses_items.append(make_output_text_item(message.content))
- elif isinstance(message.content, list):
- for content_item in message.content:
- if isinstance(content_item, dict):
- if content_item.get("type") == "text":
- responses_items.append(make_output_text_item(content_item.get("text", "")))
- elif content_item.get("type") == "tool_use":
- # Convert tool use to computer call
- tool_input = content_item.get("input", {})
- action_type = tool_input.get("action")
- call_id = content_item.get("id")
-
- # Action reference:
- # https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/computer-use-tool#available-actions
-
- # Basic actions (all versions)
- if action_type == "screenshot":
- responses_items.append(make_screenshot_item(call_id=call_id))
- elif action_type == "left_click":
- coordinate = tool_input.get("coordinate", [0, 0])
- responses_items.append(make_click_item(
- x=coordinate[0] if len(coordinate) > 0 else 0,
- y=coordinate[1] if len(coordinate) > 1 else 0,
- call_id=call_id
- ))
- elif action_type == "type":
- responses_items.append(make_type_item(
- text=tool_input.get("text", ""),
- call_id=call_id
- ))
- elif action_type == "key":
- responses_items.append(make_keypress_item(
- key=tool_input.get("key", ""),
- call_id=call_id
- ))
- elif action_type == "mouse_move":
- # Mouse move - create a custom action item
- coordinate = tool_input.get("coordinate", [0, 0])
- responses_items.append({
- "type": "computer_call",
- "call_id": call_id,
- "action": {
- "type": "mouse_move",
- "x": coordinate[0] if len(coordinate) > 0 else 0,
- "y": coordinate[1] if len(coordinate) > 1 else 0
- }
- })
-
- # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
- elif action_type == "scroll":
- coordinate = tool_input.get("coordinate", [0, 0])
- responses_items.append(make_scroll_item(
- x=coordinate[0] if len(coordinate) > 0 else 0,
- y=coordinate[1] if len(coordinate) > 1 else 0,
- direction=tool_input.get("scroll_direction", "down"),
- amount=tool_input.get("scroll_amount", 3),
- call_id=call_id
- ))
- elif action_type == "left_click_drag":
- start_coord = tool_input.get("start_coordinate", [0, 0])
- end_coord = tool_input.get("end_coordinate", [0, 0])
- responses_items.append(make_drag_item(
- start_x=start_coord[0] if len(start_coord) > 0 else 0,
- start_y=start_coord[1] if len(start_coord) > 1 else 0,
- end_x=end_coord[0] if len(end_coord) > 0 else 0,
- end_y=end_coord[1] if len(end_coord) > 1 else 0,
- call_id=call_id
- ))
- elif action_type == "right_click":
- coordinate = tool_input.get("coordinate", [0, 0])
- responses_items.append(make_click_item(
- x=coordinate[0] if len(coordinate) > 0 else 0,
- y=coordinate[1] if len(coordinate) > 1 else 0,
- button="right",
- call_id=call_id
- ))
- elif action_type == "middle_click":
- coordinate = tool_input.get("coordinate", [0, 0])
- responses_items.append(make_click_item(
- x=coordinate[0] if len(coordinate) > 0 else 0,
- y=coordinate[1] if len(coordinate) > 1 else 0,
- button="wheel",
- call_id=call_id
- ))
- elif action_type == "double_click":
- coordinate = tool_input.get("coordinate", [0, 0])
- responses_items.append(make_double_click_item(
- x=coordinate[0] if len(coordinate) > 0 else 0,
- y=coordinate[1] if len(coordinate) > 1 else 0,
- call_id=call_id
- ))
- elif action_type == "triple_click":
- # coordinate = tool_input.get("coordinate", [0, 0])
- # responses_items.append({
- # "type": "computer_call",
- # "call_id": call_id,
- # "action": {
- # "type": "triple_click",
- # "x": coordinate[0] if len(coordinate) > 0 else 0,
- # "y": coordinate[1] if len(coordinate) > 1 else 0
- # }
- # })
- raise NotImplementedError("triple_click")
- elif action_type == "left_mouse_down":
- # coordinate = tool_input.get("coordinate", [0, 0])
- # responses_items.append({
- # "type": "computer_call",
- # "call_id": call_id,
- # "action": {
- # "type": "mouse_down",
- # "button": "left",
- # "x": coordinate[0] if len(coordinate) > 0 else 0,
- # "y": coordinate[1] if len(coordinate) > 1 else 0
- # }
- # })
- raise NotImplementedError("left_mouse_down")
- elif action_type == "left_mouse_up":
- # coordinate = tool_input.get("coordinate", [0, 0])
- # responses_items.append({
- # "type": "computer_call",
- # "call_id": call_id,
- # "action": {
- # "type": "mouse_up",
- # "button": "left",
- # "x": coordinate[0] if len(coordinate) > 0 else 0,
- # "y": coordinate[1] if len(coordinate) > 1 else 0
- # }
- # })
- raise NotImplementedError("left_mouse_up")
- elif action_type == "hold_key":
- # responses_items.append({
- # "type": "computer_call",
- # "call_id": call_id,
- # "action": {
- # "type": "key_hold",
- # "key": tool_input.get("key", "")
- # }
- # })
- raise NotImplementedError("hold_key")
- elif action_type == "wait":
- responses_items.append(make_wait_item(
- call_id=call_id
- ))
- else:
- raise ValueError(f"Unknown action type: {action_type}")
-
- # Handle tool calls (alternative format)
- if hasattr(message, 'tool_calls') and message.tool_calls:
- for tool_call in message.tool_calls:
- if tool_call.function.name == "computer":
- try:
- args = json.loads(tool_call.function.arguments)
- action_type = args.get("action")
- call_id = tool_call.id
-
- # Basic actions (all versions)
- if action_type == "screenshot":
- responses_items.append(make_screenshot_item(
- call_id=call_id
- ))
- elif action_type in ["click", "left_click"]:
- coordinate = args.get("coordinate", [0, 0])
- responses_items.append(make_click_item(
- x=coordinate[0] if len(coordinate) > 0 else 0,
- y=coordinate[1] if len(coordinate) > 1 else 0,
- call_id=call_id
- ))
- elif action_type == "type":
- responses_items.append(make_type_item(
- text=args.get("text", ""),
- call_id=call_id
- ))
- elif action_type == "key":
- responses_items.append(make_keypress_item(
- key=args.get("key", ""),
- call_id=call_id
- ))
- elif action_type == "mouse_move":
- coordinate = args.get("coordinate", [0, 0])
- responses_items.append(make_move_item(
- x=coordinate[0] if len(coordinate) > 0 else 0,
- y=coordinate[1] if len(coordinate) > 1 else 0,
- call_id=call_id
- ))
-
- # Enhanced actions (computer_20250124) Available in Claude 4 and Claude Sonnet 3.7
- elif action_type == "scroll":
- coordinate = args.get("coordinate", [0, 0])
- direction = args.get("scroll_direction", "down")
- amount = args.get("scroll_amount", 3)
- scroll_x = amount if direction == "left" else \
- -amount if direction == "right" else 0
- scroll_y = amount if direction == "up" else \
- -amount if direction == "down" else 0
- responses_items.append(make_scroll_item(
- x=coordinate[0] if len(coordinate) > 0 else 0,
- y=coordinate[1] if len(coordinate) > 1 else 0,
- scroll_x=scroll_x,
- scroll_y=scroll_y,
- call_id=call_id
- ))
- elif action_type == "left_click_drag":
- start_coord = args.get("start_coordinate", [0, 0])
- end_coord = args.get("end_coordinate", [0, 0])
- responses_items.append(make_drag_item(
- start_x=start_coord[0] if len(start_coord) > 0 else 0,
- start_y=start_coord[1] if len(start_coord) > 1 else 0,
- end_x=end_coord[0] if len(end_coord) > 0 else 0,
- end_y=end_coord[1] if len(end_coord) > 1 else 0,
- call_id=call_id
- ))
- elif action_type == "right_click":
- coordinate = args.get("coordinate", [0, 0])
- responses_items.append(make_click_item(
- x=coordinate[0] if len(coordinate) > 0 else 0,
- y=coordinate[1] if len(coordinate) > 1 else 0,
- button="right",
- call_id=call_id
- ))
- elif action_type == "middle_click":
- coordinate = args.get("coordinate", [0, 0])
- responses_items.append(make_click_item(
- x=coordinate[0] if len(coordinate) > 0 else 0,
- y=coordinate[1] if len(coordinate) > 1 else 0,
-                            button="wheel",
- call_id=call_id
- ))
- elif action_type == "double_click":
- coordinate = args.get("coordinate", [0, 0])
- responses_items.append(make_double_click_item(
- x=coordinate[0] if len(coordinate) > 0 else 0,
- y=coordinate[1] if len(coordinate) > 1 else 0,
- call_id=call_id
- ))
- elif action_type == "triple_click":
- raise NotImplementedError("triple_click")
- elif action_type == "left_mouse_down":
- raise NotImplementedError("left_mouse_down")
- elif action_type == "left_mouse_up":
- raise NotImplementedError("left_mouse_up")
- elif action_type == "hold_key":
- raise NotImplementedError("hold_key")
- elif action_type == "wait":
- responses_items.append(make_wait_item(
- call_id=call_id
- ))
- except json.JSONDecodeError:
- print("Failed to decode tool call arguments")
- # Skip malformed tool calls
- continue
-
- return responses_items
-
-def _add_cache_control(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
- """Add cache control to completion messages"""
- num_writes = 0
- for message in completion_messages:
- message["cache_control"] = { "type": "ephemeral" }
- num_writes += 1
- # Cache control has a maximum of 4 blocks
- if num_writes >= 4:
- break
-
- return completion_messages
-
-def _combine_completion_messages(completion_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
- """Combine completion messages with the same role"""
- if not completion_messages:
- return completion_messages
-
- combined_messages = []
-
- for message in completion_messages:
- # If this is the first message or role is different from last, add as new message
- if not combined_messages or combined_messages[-1]["role"] != message["role"]:
- # Ensure content is a list format and normalize text content
- new_message = message.copy()
- new_message["content"] = _normalize_content(message.get("content", ""))
-
- # Copy tool_calls if present
- if "tool_calls" in message:
- new_message["tool_calls"] = message["tool_calls"].copy()
-
- combined_messages.append(new_message)
- else:
- # Same role as previous message, combine them
- last_message = combined_messages[-1]
-
- # Combine content
- current_content = _normalize_content(message.get("content", ""))
- last_message["content"].extend(current_content)
-
- # Combine tool_calls if present
- if "tool_calls" in message:
- if "tool_calls" not in last_message:
- last_message["tool_calls"] = []
- last_message["tool_calls"].extend(message["tool_calls"])
-
- # Post-process to merge consecutive text blocks
- for message in combined_messages:
- message["content"] = _merge_consecutive_text(message["content"])
-
- return combined_messages
-
-def _normalize_content(content) -> List[Dict[str, Any]]:
- """Normalize content to list format"""
- if isinstance(content, str):
- if content.strip(): # Only add non-empty strings
- return [{"type": "text", "text": content}]
- else:
- return []
- elif isinstance(content, list):
- return content.copy()
- else:
- return []
-
-def _merge_consecutive_text(content_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
- """Merge consecutive text blocks with newlines"""
- if not content_list:
- return content_list
-
- merged = []
-
- for item in content_list:
- if (item.get("type") == "text" and
- merged and
- merged[-1].get("type") == "text"):
- # Merge with previous text block
- merged[-1]["text"] += "\n" + item["text"]
- else:
- merged.append(item.copy())
-
- return merged
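-
-# Example (illustrative): consecutive assistant messages are combined and their
-# text blocks merged with newlines:
-#
-#     [{"role": "assistant", "content": "a"}, {"role": "assistant", "content": "b"}]
-#     -> [{"role": "assistant", "content": [{"type": "text", "text": "a\nb"}]}]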
-
-@agent_loop(models=r".*claude-.*", priority=5)
-async def anthropic_hosted_tools_loop(
- messages: Messages,
- model: str,
- tools: Optional[List[Dict[str, Any]]] = None,
- max_retries: Optional[int] = None,
- stream: bool = False,
- computer_handler=None,
- use_prompt_caching: Optional[bool] = False,
- _on_api_start=None,
- _on_api_end=None,
- _on_usage=None,
- _on_screenshot=None,
- **kwargs
-) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
- """
- Anthropic hosted tools agent loop using liteLLM acompletion.
-
- Supports Anthropic's computer use models with hosted tools.
- """
- tools = tools or []
-
- # Get tool configuration for this model
- tool_config = _get_tool_config_for_model(model)
-
- # Prepare tools for Anthropic API
- anthropic_tools = _prepare_tools_for_anthropic(tools, model)
-
- # Convert responses_items messages to completion format
- completion_messages = _convert_responses_items_to_completion_messages(messages)
- if use_prompt_caching:
- # First combine messages to reduce number of blocks
- completion_messages = _combine_completion_messages(completion_messages)
- # Then add cache control, anthropic requires explicit "cache_control" dicts
- completion_messages = _add_cache_control(completion_messages)
-
- # Prepare API call kwargs
- api_kwargs = {
- "model": model,
- "messages": completion_messages,
- "tools": anthropic_tools if anthropic_tools else None,
- "stream": stream,
- "num_retries": max_retries,
- **kwargs
- }
-
- # Add beta header for computer use
- if anthropic_tools:
- api_kwargs["headers"] = {
- "anthropic-beta": tool_config["beta_flag"]
- }
-
- # Call API start hook
- if _on_api_start:
- await _on_api_start(api_kwargs)
-
- # Use liteLLM acompletion
- response = await litellm.acompletion(**api_kwargs)
-
- # Call API end hook
- if _on_api_end:
- await _on_api_end(api_kwargs, response)
-
- # Convert response to responses_items format
- responses_items = _convert_completion_to_responses_items(response)
-
- # Extract usage information
- responses_usage = {
- **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
- "response_cost": response._hidden_params.get("response_cost", 0.0),
- }
- if _on_usage:
- await _on_usage(responses_usage)
-
- # Create agent response
- agent_response = {
- "output": responses_items,
- "usage": responses_usage
- }
-
- return agent_response
diff --git a/libs/python/agent2/agent/loops/omniparser.py b/libs/python/agent2/agent/loops/omniparser.py
deleted file mode 100644
index f0e7832a..00000000
--- a/libs/python/agent2/agent/loops/omniparser.py
+++ /dev/null
@@ -1,339 +0,0 @@
-"""
-OpenAI computer-use-preview agent loop implementation using liteLLM
-"""
-
-import asyncio
-import json
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
-import litellm
-import inspect
-import base64
-
-from ..decorators import agent_loop
-from ..types import Messages, AgentResponse, Tools
-
-SOM_TOOL_SCHEMA = {
- "type": "function",
- "name": "computer",
- "description": "Control a computer by taking screenshots and interacting with UI elements. This tool shows screenshots with numbered elements overlaid on them. Each UI element has been assigned a unique ID number that you can see in the image. Use the element's ID number to interact with any element instead of pixel coordinates.",
- "parameters": {
- "type": "object",
- "properties": {
- "action": {
- "type": "string",
- "enum": [
- "screenshot",
- "click",
- "double_click",
- "drag",
- "type",
- "keypress",
- "scroll",
- "move",
- "wait",
- "get_current_url",
- "get_dimensions",
- "get_environment"
- ],
- "description": "The action to perform"
- },
- "element_id": {
- "type": "integer",
- "description": "The ID of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)"
- },
- "start_element_id": {
- "type": "integer",
- "description": "The ID of the element to start dragging from (required for drag action)"
- },
- "end_element_id": {
- "type": "integer",
- "description": "The ID of the element to drag to (required for drag action)"
- },
- "text": {
- "type": "string",
- "description": "The text to type (required for type action)"
- },
- "keys": {
- "type": "string",
- "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')"
- },
- "button": {
- "type": "string",
- "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
- },
- "scroll_x": {
- "type": "integer",
- "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
- },
- "scroll_y": {
- "type": "integer",
- "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
- },
- },
- "required": [
- "action"
- ]
- }
-}
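-
-# Example arguments a model might emit against SOM_TOOL_SCHEMA (illustrative):
-#   {"action": "click", "element_id": 12, "button": "left"}
-#   {"action": "drag", "start_element_id": 3, "end_element_id": 7}
-#   {"action": "type", "text": "hello world"}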
-
-OMNIPARSER_AVAILABLE = False
-try:
- from som import OmniParser
- OMNIPARSER_AVAILABLE = True
-except ImportError:
- pass
-OMNIPARSER_SINGLETON = None
-
-def get_parser():
- global OMNIPARSER_SINGLETON
- if OMNIPARSER_SINGLETON is None:
- OMNIPARSER_SINGLETON = OmniParser()
- return OMNIPARSER_SINGLETON
-
-def get_last_computer_call_output(messages: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
- """Get the last computer_call_output message from a messages list.
-
- Args:
- messages: List of messages to search through
-
- Returns:
- The last computer_call_output message dict, or None if not found
- """
- for message in reversed(messages):
- if isinstance(message, dict) and message.get("type") == "computer_call_output":
- return message
- return None
-
-def _prepare_tools_for_omniparser(tool_schemas: List[Dict[str, Any]]) -> Tuple[Tools, dict]:
- """Prepare tools for OpenAI API format"""
- omniparser_tools = []
- id2xy = dict()
-
- for schema in tool_schemas:
- if schema["type"] == "computer":
- omniparser_tools.append(SOM_TOOL_SCHEMA)
- if "id2xy" in schema:
- id2xy = schema["id2xy"]
- else:
- schema["id2xy"] = id2xy
- elif schema["type"] == "function":
- # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
- # Schema should be: {type, name, description, parameters}
- omniparser_tools.append({ "type": "function", **schema["function"] })
-
- return omniparser_tools, id2xy
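-
-# Sketch (illustrative): the returned id2xy dict is the same object stored on
-# the computer tool schema, so element IDs resolved from a parsed screenshot
-# later map back to coordinates:
-#   tools, id2xy = _prepare_tools_for_omniparser([{"type": "computer"}])
-#   id2xy[12] = (0.4, 0.6)  # filled in after OmniParser annotates a screenshot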
-
-async def replace_function_with_computer_call(item: Dict[str, Any], id2xy: Dict[int, Tuple[float, float]]):
- item_type = item.get("type")
-
- def _get_xy(element_id: Optional[int]) -> Union[Tuple[float, float], Tuple[None, None]]:
- if element_id is None:
- return (None, None)
- return id2xy.get(element_id, (None, None))
-
- if item_type == "function_call":
- fn_name = item.get("name")
- fn_args = json.loads(item.get("arguments", "{}"))
-
- item_id = item.get("id")
- call_id = item.get("call_id")
-
- if fn_name == "computer":
- action = fn_args.get("action")
- element_id = fn_args.get("element_id")
- start_element_id = fn_args.get("start_element_id")
- end_element_id = fn_args.get("end_element_id")
- text = fn_args.get("text")
- keys = fn_args.get("keys")
- button = fn_args.get("button")
- scroll_x = fn_args.get("scroll_x")
- scroll_y = fn_args.get("scroll_y")
-
- x, y = _get_xy(element_id)
- start_x, start_y = _get_xy(start_element_id)
- end_x, end_y = _get_xy(end_element_id)
-
- action_args = {
- "type": action,
- "x": x,
- "y": y,
- "start_x": start_x,
- "start_y": start_y,
- "end_x": end_x,
- "end_y": end_y,
- "text": text,
- "keys": keys,
- "button": button,
- "scroll_x": scroll_x,
- "scroll_y": scroll_y
- }
- # Remove None values to keep the JSON clean
- action_args = {k: v for k, v in action_args.items() if v is not None}
-
- return [{
- "type": "computer_call",
- "action": action_args,
- "id": item_id,
- "call_id": call_id,
- "status": "completed"
- }]
-
- return [item]
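-
-# e.g. a model function_call (illustrative)
-#   {"type": "function_call", "name": "computer",
-#    "arguments": '{"action": "click", "element_id": 3}', "call_id": "c1"}
-# with id2xy == {3: (0.5, 0.25)} becomes
-#   {"type": "computer_call", "action": {"type": "click", "x": 0.5, "y": 0.25},
-#    "call_id": "c1", "status": "completed"}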
-
-async def replace_computer_call_with_function(item: Dict[str, Any], xy2id: Dict[Tuple[float, float], int]):
- """
- Convert computer_call back to function_call format.
- Also handles computer_call_output -> function_call_output conversion.
-
- Args:
- item: The item to convert
- xy2id: Mapping from (x, y) coordinates to element IDs
- """
- item_type = item.get("type")
-
- def _get_element_id(x: Optional[float], y: Optional[float]) -> Optional[int]:
- """Get element ID from coordinates, return None if coordinates are None"""
- if x is None or y is None:
- return None
- return xy2id.get((x, y))
-
- if item_type == "computer_call":
- action_data = item.get("action", {})
-
- # Extract coordinates and convert back to element IDs
- element_id = _get_element_id(action_data.get("x"), action_data.get("y"))
- start_element_id = _get_element_id(action_data.get("start_x"), action_data.get("start_y"))
- end_element_id = _get_element_id(action_data.get("end_x"), action_data.get("end_y"))
-
- # Build function arguments
- fn_args = {
- "action": action_data.get("type"),
- "element_id": element_id,
- "start_element_id": start_element_id,
- "end_element_id": end_element_id,
- "text": action_data.get("text"),
- "keys": action_data.get("keys"),
- "button": action_data.get("button"),
- "scroll_x": action_data.get("scroll_x"),
- "scroll_y": action_data.get("scroll_y")
- }
-
- # Remove None values to keep the JSON clean
- fn_args = {k: v for k, v in fn_args.items() if v is not None}
-
- return [{
- "type": "function_call",
- "name": "computer",
- "arguments": json.dumps(fn_args),
- "id": item.get("id"),
- "call_id": item.get("call_id"),
- "status": "completed",
-
- # Fall back to string representation
-            "content": f"Used tool: {action_data.get('type')}({json.dumps(fn_args)})"
- }]
-
- elif item_type == "computer_call_output":
- # Simple conversion: computer_call_output -> function_call_output
- return [{
- "type": "function_call_output",
- "call_id": item.get("call_id"),
- "content": [item.get("output")],
- "id": item.get("id"),
- "status": "completed"
- }]
-
- return [item]
-
-
-@agent_loop(models=r"omniparser\+.*|omni\+.*", priority=10)
-async def omniparser_loop(
- messages: Messages,
- model: str,
- tools: Optional[List[Dict[str, Any]]] = None,
- max_retries: Optional[int] = None,
- stream: bool = False,
- computer_handler=None,
- use_prompt_caching: Optional[bool] = False,
- _on_api_start=None,
- _on_api_end=None,
- _on_usage=None,
- _on_screenshot=None,
- **kwargs
-) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
- """
-    OmniParser (set-of-marks) agent loop using liteLLM responses.
-
-    Grounds any liteLLM-compatible model with OmniParser-annotated screenshots.
- """
- if not OMNIPARSER_AVAILABLE:
- raise ValueError("omniparser loop requires som to be installed. Install it with `pip install cua-som`.")
-
- tools = tools or []
-
- llm_model = model.split('+')[-1]
-
- # Prepare tools for OpenAI API
- openai_tools, id2xy = _prepare_tools_for_omniparser(tools)
-
- # Find last computer_call_output
- last_computer_call_output = get_last_computer_call_output(messages)
- if last_computer_call_output:
- image_url = last_computer_call_output.get("output", {}).get("image_url", "")
- image_data = image_url.split(",")[-1]
- if image_data:
- parser = get_parser()
- result = parser.parse(image_data)
- if _on_screenshot:
- await _on_screenshot(result.annotated_image_base64, "annotated_image")
- for element in result.elements:
- id2xy[element.id] = ((element.bbox.x1 + element.bbox.x2) / 2, (element.bbox.y1 + element.bbox.y2) / 2)
-
- # handle computer calls -> function calls
- new_messages = []
- for message in messages:
- if not isinstance(message, dict):
- message = message.__dict__
- new_messages += await replace_computer_call_with_function(message, id2xy)
- messages = new_messages
-
- # Prepare API call kwargs
- api_kwargs = {
- "model": llm_model,
- "input": messages,
- "tools": openai_tools if openai_tools else None,
- "stream": stream,
- "reasoning": {"summary": "concise"},
- "truncation": "auto",
- "num_retries": max_retries,
- **kwargs
- }
-
- # Call API start hook
- if _on_api_start:
- await _on_api_start(api_kwargs)
-
- # Use liteLLM responses
- response = await litellm.aresponses(**api_kwargs)
-
- # Call API end hook
- if _on_api_end:
- await _on_api_end(api_kwargs, response)
-
- # Extract usage information
- response.usage = {
- **response.usage.model_dump(),
- "response_cost": response._hidden_params.get("response_cost", 0.0),
- }
- if _on_usage:
- await _on_usage(response.usage)
-
- # handle som function calls -> xy computer calls
- new_output = []
- for i in range(len(response.output)):
- new_output += await replace_function_with_computer_call(response.output[i].model_dump(), id2xy)
- response.output = new_output
-
- return response
diff --git a/libs/python/agent2/agent/loops/openai.py b/libs/python/agent2/agent/loops/openai.py
deleted file mode 100644
index 84b79d1f..00000000
--- a/libs/python/agent2/agent/loops/openai.py
+++ /dev/null
@@ -1,95 +0,0 @@
-"""
-OpenAI computer-use-preview agent loop implementation using liteLLM
-"""
-
-import asyncio
-import json
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional
-import litellm
-
-from ..decorators import agent_loop
-from ..types import Messages, AgentResponse, Tools
-
-def _map_computer_tool_to_openai(computer_tool: Any) -> Dict[str, Any]:
- """Map a computer tool to OpenAI's computer-use-preview tool schema"""
- return {
- "type": "computer_use_preview",
- "display_width": getattr(computer_tool, 'display_width', 1024),
- "display_height": getattr(computer_tool, 'display_height', 768),
- "environment": getattr(computer_tool, 'environment', "linux") # mac, windows, linux, browser
- }
-
-
-def _prepare_tools_for_openai(tool_schemas: List[Dict[str, Any]]) -> Tools:
- """Prepare tools for OpenAI API format"""
- openai_tools = []
-
- for schema in tool_schemas:
- if schema["type"] == "computer":
- # Map computer tool to OpenAI format
- openai_tools.append(_map_computer_tool_to_openai(schema["computer"]))
- elif schema["type"] == "function":
- # Function tools use OpenAI-compatible schema directly (liteLLM expects this format)
- # Schema should be: {type, name, description, parameters}
- openai_tools.append({ "type": "function", **schema["function"] })
-
- return openai_tools
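-
-# Example result (illustrative; "my_tool" is a hypothetical function tool):
-#   [{"type": "computer_use_preview", "display_width": 1024,
-#     "display_height": 768, "environment": "linux"},
-#    {"type": "function", "name": "my_tool", "description": "...", "parameters": {...}}]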
-
-
-@agent_loop(models=r".*computer-use-preview.*", priority=10)
-async def openai_computer_use_loop(
- messages: Messages,
- model: str,
- tools: Optional[List[Dict[str, Any]]] = None,
- max_retries: Optional[int] = None,
- stream: bool = False,
- computer_handler=None,
- use_prompt_caching: Optional[bool] = False,
- _on_api_start=None,
- _on_api_end=None,
- _on_usage=None,
- _on_screenshot=None,
- **kwargs
-) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
- """
- OpenAI computer-use-preview agent loop using liteLLM responses.
-
- Supports OpenAI's computer use preview models.
- """
- tools = tools or []
-
- # Prepare tools for OpenAI API
- openai_tools = _prepare_tools_for_openai(tools)
-
- # Prepare API call kwargs
- api_kwargs = {
- "model": model,
- "input": messages,
- "tools": openai_tools if openai_tools else None,
- "stream": stream,
- "reasoning": {"summary": "concise"},
- "truncation": "auto",
- "num_retries": max_retries,
- **kwargs
- }
-
- # Call API start hook
- if _on_api_start:
- await _on_api_start(api_kwargs)
-
- # Use liteLLM responses
- response = await litellm.aresponses(**api_kwargs)
-
- # Call API end hook
- if _on_api_end:
- await _on_api_end(api_kwargs, response)
-
- # Extract usage information
- response.usage = {
- **response.usage.model_dump(),
- "response_cost": response._hidden_params.get("response_cost", 0.0),
- }
- if _on_usage:
- await _on_usage(response.usage)
-
- return response
diff --git a/libs/python/agent2/agent/loops/uitars.py b/libs/python/agent2/agent/loops/uitars.py
deleted file mode 100644
index e82e005d..00000000
--- a/libs/python/agent2/agent/loops/uitars.py
+++ /dev/null
@@ -1,688 +0,0 @@
-"""
-UITARS agent loop implementation using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B
-"""
-
-import asyncio
-import json
-import base64
-import math
-import re
-import ast
-from typing import Dict, List, Any, AsyncGenerator, Union, Optional
-from io import BytesIO
-from PIL import Image
-import litellm
-from litellm.types.utils import ModelResponse
-from litellm.responses.litellm_completion_transformation.transformation import LiteLLMCompletionResponsesConfig
-from litellm.responses.utils import Usage
-from openai.types.responses.response_computer_tool_call_param import ActionType, ResponseComputerToolCallParam
-from openai.types.responses.response_input_param import ComputerCallOutput
-from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
-from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary
-
-from ..decorators import agent_loop
-from ..types import Messages, AgentResponse, Tools
-from ..responses import (
- make_reasoning_item,
- make_output_text_item,
- make_click_item,
- make_double_click_item,
- make_drag_item,
- make_keypress_item,
- make_scroll_item,
- make_type_item,
- make_wait_item,
- make_input_image_item
-)
-
-# Constants from reference code
-IMAGE_FACTOR = 28
-MIN_PIXELS = 100 * 28 * 28
-MAX_PIXELS = 16384 * 28 * 28
-MAX_RATIO = 200
-
-FINISH_WORD = "finished"
-WAIT_WORD = "wait"
-ENV_FAIL_WORD = "error_env"
-CALL_USER = "call_user"
-
-# Action space prompt for UITARS
-UITARS_ACTION_SPACE = """
-click(start_box='<|box_start|>(x1,y1)<|box_end|>')
-left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
-right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
-drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
-hotkey(key='')
-type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
-scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
-wait() #Sleep for 5s and take a screenshot to check for any changes.
-finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
-"""
-
-UITARS_PROMPT_TEMPLATE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
-
-## Output Format
-```
-Thought: ...
-Action: ...
-```
-
-## Action Space
-{action_space}
-
-## Note
-- Use {language} in `Thought` part.
-- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
-
-## User Instruction
-{instruction}
-"""
-
-
-def round_by_factor(number: float, factor: int) -> int:
- """Returns the closest integer to 'number' that is divisible by 'factor'."""
- return round(number / factor) * factor
-
-
-def ceil_by_factor(number: float, factor: int) -> int:
- """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
- return math.ceil(number / factor) * factor
-
-
-def floor_by_factor(number: float, factor: int) -> int:
- """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
- return math.floor(number / factor) * factor
-
-
-def smart_resize(
- height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
-) -> tuple[int, int]:
- """
- Rescales the image so that the following conditions are met:
- 1. Both dimensions (height and width) are divisible by 'factor'.
- 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
- 3. The aspect ratio of the image is maintained as closely as possible.
- """
- if max(height, width) / min(height, width) > MAX_RATIO:
- raise ValueError(
- f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
- )
- h_bar = max(factor, round_by_factor(height, factor))
- w_bar = max(factor, round_by_factor(width, factor))
- if h_bar * w_bar > max_pixels:
- beta = math.sqrt((height * width) / max_pixels)
- h_bar = floor_by_factor(height / beta, factor)
- w_bar = floor_by_factor(width / beta, factor)
- elif h_bar * w_bar < min_pixels:
- beta = math.sqrt(min_pixels / (height * width))
- h_bar = ceil_by_factor(height * beta, factor)
- w_bar = ceil_by_factor(width * beta, factor)
- return h_bar, w_bar
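-
-# e.g. a 1920x1080 screenshot keeps its aspect ratio while snapping both sides
-# to multiples of IMAGE_FACTOR (values follow from the rules above):
-#   smart_resize(1080, 1920)  # -> (1092, 1932)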
-
-
-def escape_single_quotes(text):
- """Escape single quotes in text for safe string formatting."""
-    pattern = r"(?<!\\)'"
-    return re.sub(pattern, r"\\'", text)
-
-
-def parse_action(action_str: str) -> Optional[Dict[str, Any]]:
-    """Parse an action string such as "click(start_box='(1,2)')" into its
-    function name and keyword arguments using the ast module."""
-    try:
-        node = ast.parse(action_str, mode="eval")
-        if not isinstance(node, ast.Expression) or not isinstance(node.body, ast.Call):
-            raise ValueError("Not a function call")
-        call = node.body
-        if isinstance(call.func, ast.Name):
-            func_name = call.func.id
-        elif isinstance(call.func, ast.Attribute):
-            func_name = call.func.attr
-        else:
-            func_name = None
-        kwargs = {}
-        for kw in call.keywords:
-            # Keyword values are expected to be simple literals (strings/numbers)
-            kwargs[kw.arg] = kw.value.value if isinstance(kw.value, ast.Constant) else None
-        return {"function": func_name, "args": kwargs}
-    except Exception:
-        return None
-
-
-def parse_uitars_response(text: str, image_width: int, image_height: int) -> List[Dict[str, Any]]:
- """Parse UITARS model response into structured actions."""
- text = text.strip()
-
- # Extract thought
- thought = None
- if text.startswith("Thought:"):
- thought_match = re.search(r"Thought: (.+?)(?=\s*Action:|$)", text, re.DOTALL)
- if thought_match:
- thought = thought_match.group(1).strip()
-
- # Extract action
- if "Action:" not in text:
- raise ValueError("No Action found in response")
-
- action_str = text.split("Action:")[-1].strip()
-
- # Handle special case for type actions
- if "type(content" in action_str:
- def escape_quotes(match):
- return match.group(1)
-
- pattern = r"type\(content='(.*?)'\)"
- content = re.sub(pattern, escape_quotes, action_str)
- action_str = escape_single_quotes(content)
- action_str = "type(content='" + action_str + "')"
-
-
- # Parse the action
- parsed_action = parse_action(action_str.replace("\n", "\\n").lstrip())
- if parsed_action is None:
- raise ValueError(f"Action can't parse: {action_str}")
-
- action_type = parsed_action["function"]
- params = parsed_action["args"]
-
- # Process parameters
- action_inputs = {}
- for param_name, param in params.items():
- if param == "":
- continue
- param = str(param).lstrip()
- action_inputs[param_name.strip()] = param
-
- # Handle coordinate parameters
- if "start_box" in param_name or "end_box" in param_name:
- # Parse coordinates like '(x,y)' or '(x1,y1,x2,y2)'
- numbers = param.replace("(", "").replace(")", "").split(",")
- float_numbers = [float(num.strip()) / 1000 for num in numbers] # Normalize to 0-1 range
-
- if len(float_numbers) == 2:
- # Single point, duplicate for box format
- float_numbers = [float_numbers[0], float_numbers[1], float_numbers[0], float_numbers[1]]
-
- action_inputs[param_name.strip()] = str(float_numbers)
-
- return [{
- "thought": thought,
- "action_type": action_type,
- "action_inputs": action_inputs,
- "text": text
- }]
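-
-# Illustrative parse of a hypothetical model reply:
-#   parse_uitars_response(
-#       "Thought: open the browser\nAction: click(start_box='(500,500)')", 1920, 1080)
-# yields one entry with action_type "click" and
-#   action_inputs == {"start_box": "[0.5, 0.5, 0.5, 0.5]"}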
-
-
-def convert_to_computer_actions(parsed_responses: List[Dict[str, Any]], image_width: int, image_height: int) -> List[ResponseComputerToolCallParam | ResponseOutputMessageParam]:
- """Convert parsed UITARS responses to computer actions."""
- computer_actions = []
-
- for response in parsed_responses:
- action_type = response.get("action_type")
- action_inputs = response.get("action_inputs", {})
-
- if action_type == "finished":
- finished_text = action_inputs.get("content", "Task completed successfully.")
- computer_actions.append(make_output_text_item(finished_text))
- break
-
- elif action_type == "wait":
- computer_actions.append(make_wait_item())
-
- elif action_type == "call_user":
- computer_actions.append(make_output_text_item("I need assistance from the user to proceed with this task."))
-
- elif action_type in ["click", "left_single"]:
- start_box = action_inputs.get("start_box")
- if start_box:
-                coords = ast.literal_eval(start_box)
- x = int((coords[0] + coords[2]) / 2 * image_width)
- y = int((coords[1] + coords[3]) / 2 * image_height)
-
- computer_actions.append(make_click_item(x, y, "left"))
-
- elif action_type == "double_click":
- start_box = action_inputs.get("start_box")
- if start_box:
-                coords = ast.literal_eval(start_box)
- x = int((coords[0] + coords[2]) / 2 * image_width)
- y = int((coords[1] + coords[3]) / 2 * image_height)
-
- computer_actions.append(make_double_click_item(x, y))
-
- elif action_type == "right_click":
- start_box = action_inputs.get("start_box")
- if start_box:
-                coords = ast.literal_eval(start_box)
- x = int((coords[0] + coords[2]) / 2 * image_width)
- y = int((coords[1] + coords[3]) / 2 * image_height)
-
- computer_actions.append(make_click_item(x, y, "right"))
-
- elif action_type == "type":
- content = action_inputs.get("content", "")
- computer_actions.append(make_type_item(content))
-
- elif action_type == "hotkey":
- key = action_inputs.get("key", "")
- keys = key.split()
- computer_actions.append(make_keypress_item(keys))
-
- elif action_type == "press":
- key = action_inputs.get("key", "")
- computer_actions.append(make_keypress_item([key]))
-
- elif action_type == "scroll":
- start_box = action_inputs.get("start_box")
- direction = action_inputs.get("direction", "down")
-
- if start_box:
-                coords = ast.literal_eval(start_box)
- x = int((coords[0] + coords[2]) / 2 * image_width)
- y = int((coords[1] + coords[3]) / 2 * image_height)
- else:
- x, y = image_width // 2, image_height // 2
-
- scroll_y = 5 if "up" in direction.lower() else -5
- computer_actions.append(make_scroll_item(x, y, 0, scroll_y))
-
- elif action_type == "drag":
- start_box = action_inputs.get("start_box")
- end_box = action_inputs.get("end_box")
-
- if start_box and end_box:
-                start_coords = ast.literal_eval(start_box)
-                end_coords = ast.literal_eval(end_box)
-
- start_x = int((start_coords[0] + start_coords[2]) / 2 * image_width)
- start_y = int((start_coords[1] + start_coords[3]) / 2 * image_height)
- end_x = int((end_coords[0] + end_coords[2]) / 2 * image_width)
- end_y = int((end_coords[1] + end_coords[3]) / 2 * image_height)
-
- path = [{"x": start_x, "y": start_y}, {"x": end_x, "y": end_y}]
- computer_actions.append(make_drag_item(path))
-
- return computer_actions
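-
-# Continuing the example above: a normalized box centered at (0.5, 0.5) on a
-# 1920x1080 screen becomes a pixel-space click item:
-#   convert_to_computer_actions(parsed, 1920, 1080)  # ~ [make_click_item(960, 540, "left")]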
-
-
-def pil_to_base64(image: Image.Image) -> str:
- """Convert PIL image to base64 string."""
- buffer = BytesIO()
- image.save(buffer, format="PNG")
- return base64.b64encode(buffer.getvalue()).decode("utf-8")
-
-
-def process_image_for_uitars(image_data: str, max_pixels: int = MAX_PIXELS, min_pixels: int = MIN_PIXELS) -> tuple[Image.Image, int, int]:
- """Process image for UITARS model input."""
- # Decode base64 image
- if image_data.startswith('data:image'):
- image_data = image_data.split(',')[1]
-
- image_bytes = base64.b64decode(image_data)
- image = Image.open(BytesIO(image_bytes))
-
- original_width, original_height = image.size
-
- # Resize image according to UITARS requirements
- if image.width * image.height > max_pixels:
- resize_factor = math.sqrt(max_pixels / (image.width * image.height))
- width = int(image.width * resize_factor)
- height = int(image.height * resize_factor)
- image = image.resize((width, height))
-
- if image.width * image.height < min_pixels:
- resize_factor = math.sqrt(min_pixels / (image.width * image.height))
- width = math.ceil(image.width * resize_factor)
- height = math.ceil(image.height * resize_factor)
- image = image.resize((width, height))
-
- if image.mode != "RGB":
- image = image.convert("RGB")
-
- return image, original_width, original_height
-
-
-def sanitize_message(msg: Any) -> Any:
-    """Return a copy of the message with image_url omitted from content parts."""
- if isinstance(msg, dict):
- result = {}
- for key, value in msg.items():
- if key == "content" and isinstance(value, list):
- result[key] = [
- {k: v for k, v in item.items() if k != "image_url"} if isinstance(item, dict) else item
- for item in value
- ]
- else:
- result[key] = value
- return result
- elif isinstance(msg, list):
- return [sanitize_message(item) for item in msg]
- else:
- return msg
-
-
-def convert_uitars_messages_to_litellm(messages: Messages) -> List[Dict[str, Any]]:
- """
- Convert UITARS internal message format back to LiteLLM format.
-
- This function processes reasoning, computer_call, and computer_call_output messages
- and converts them to the appropriate LiteLLM assistant message format.
-
- Args:
- messages: List of UITARS internal messages
-
- Returns:
- List of LiteLLM formatted messages
- """
- litellm_messages = []
- current_assistant_content = []
-
- for message in messages:
- if isinstance(message, dict):
- message_type = message.get("type")
-
- if message_type == "reasoning":
- # Extract reasoning text from summary
- summary = message.get("summary", [])
- if summary and isinstance(summary, list):
- for summary_item in summary:
- if isinstance(summary_item, dict) and summary_item.get("type") == "summary_text":
- reasoning_text = summary_item.get("text", "")
- if reasoning_text:
- current_assistant_content.append(f"Thought: {reasoning_text}")
-
- elif message_type == "computer_call":
- # Convert computer action to UITARS action format
- action = message.get("action", {})
- action_type = action.get("type")
-
- if action_type == "click":
- x, y = action.get("x", 0), action.get("y", 0)
- button = action.get("button", "left")
- if button == "left":
- action_text = f"Action: click(start_box='({x},{y})')"
- elif button == "right":
- action_text = f"Action: right_single(start_box='({x},{y})')"
- else:
- action_text = f"Action: click(start_box='({x},{y})')"
-
- elif action_type == "double_click":
- x, y = action.get("x", 0), action.get("y", 0)
- action_text = f"Action: left_double(start_box='({x},{y})')"
-
- elif action_type == "drag":
- start_x, start_y = action.get("start_x", 0), action.get("start_y", 0)
- end_x, end_y = action.get("end_x", 0), action.get("end_y", 0)
- action_text = f"Action: drag(start_box='({start_x},{start_y})', end_box='({end_x},{end_y})')"
-
- elif action_type == "key":
- key = action.get("key", "")
- action_text = f"Action: hotkey(key='{key}')"
-
- elif action_type == "type":
- text = action.get("text", "")
- # Escape single quotes in the text
- escaped_text = escape_single_quotes(text)
- action_text = f"Action: type(content='{escaped_text}')"
-
- elif action_type == "scroll":
- x, y = action.get("x", 0), action.get("y", 0)
- direction = action.get("direction", "down")
- action_text = f"Action: scroll(start_box='({x},{y})', direction='{direction}')"
-
- elif action_type == "wait":
- action_text = "Action: wait()"
-
- else:
- # Fallback for unknown action types
- action_text = f"Action: {action_type}({action})"
-
- current_assistant_content.append(action_text)
-
- # When we hit a computer_call_output, finalize the current assistant message
- if current_assistant_content:
- litellm_messages.append({
- "role": "assistant",
- "content": [{"type": "text", "text": "\n".join(current_assistant_content)}]
- })
- current_assistant_content = []
-
- elif message_type == "computer_call_output":
- # Add screenshot from computer call output
- output = message.get("output", {})
- if isinstance(output, dict) and output.get("type") == "input_image":
- image_url = output.get("image_url", "")
- if image_url:
- litellm_messages.append({
- "role": "user",
- "content": [{"type": "image_url", "image_url": {"url": image_url}}]
- })
-
- elif message.get("role") == "user":
-                # Plain user messages are skipped here; uitars_loop builds its
-                # own instruction message from the conversation separately
-                pass
-
- # Add any remaining assistant content
- if current_assistant_content:
- litellm_messages.append({
- "role": "assistant",
-            "content": [{"type": "text", "text": "\n".join(current_assistant_content)}]
- })
-
- return litellm_messages
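-
-# Round-trip sketch: a computer_call item such as
-#   {"type": "computer_call", "action": {"type": "click", "x": 100, "y": 200}}
-# is rendered back into UITARS text form inside a single assistant message:
-#   "Action: click(start_box='(100,200)')"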
-
-@agent_loop(models=r"(?i).*ui-?tars.*", priority=10)
-async def uitars_loop(
- messages: Messages,
- model: str,
- tools: Optional[List[Dict[str, Any]]] = None,
- max_retries: Optional[int] = None,
- stream: bool = False,
- computer_handler=None,
- use_prompt_caching: Optional[bool] = False,
- _on_api_start=None,
- _on_api_end=None,
- _on_usage=None,
- _on_screenshot=None,
- **kwargs
-) -> Union[AgentResponse, AsyncGenerator[Dict[str, Any], None]]:
- """
- UITARS agent loop using liteLLM for ByteDance-Seed/UI-TARS-1.5-7B model.
-
- Supports UITARS vision-language models for computer control.
- """
- tools = tools or []
-
- # Create response items
- response_items = []
-
- # Find computer tool for screen dimensions
- computer_tool = None
- for tool_schema in tools:
- if tool_schema["type"] == "computer":
- computer_tool = tool_schema["computer"]
- break
-
- # Get screen dimensions
- screen_width, screen_height = 1024, 768
- if computer_tool:
- try:
- screen_width, screen_height = await computer_tool.get_dimensions()
-        except Exception:
- pass
-
- # Process messages to extract instruction and image
- instruction = ""
- image_data = None
-
- # Convert messages to list if string
- if isinstance(messages, str):
- messages = [{"role": "user", "content": messages}]
-
- # Extract instruction and latest screenshot
- for message in reversed(messages):
- if isinstance(message, dict):
- content = message.get("content", "")
-
- # Handle different content formats
- if isinstance(content, str):
- if not instruction and message.get("role") == "user":
- instruction = content
- elif isinstance(content, list):
- for item in content:
- if isinstance(item, dict):
- if item.get("type") == "text" and not instruction:
- instruction = item.get("text", "")
- elif item.get("type") == "image_url" and not image_data:
- image_url = item.get("image_url", {})
- if isinstance(image_url, dict):
- image_data = image_url.get("url", "")
- else:
- image_data = image_url
-
- # Also check for computer_call_output with screenshots
- if message.get("type") == "computer_call_output" and not image_data:
- output = message.get("output", {})
- if isinstance(output, dict) and output.get("type") == "input_image":
- image_data = output.get("image_url", "")
-
- if instruction and image_data:
- break
-
- if not instruction:
- instruction = "Help me complete this task by analyzing the screen and taking appropriate actions."
-
- # Create prompt
- user_prompt = UITARS_PROMPT_TEMPLATE.format(
- instruction=instruction,
- action_space=UITARS_ACTION_SPACE,
- language="English"
- )
-
- # Convert conversation history to LiteLLM format
- history_messages = convert_uitars_messages_to_litellm(messages)
-
- # Prepare messages for liteLLM
- litellm_messages = [
- {
- "role": "system",
- "content": "You are a helpful assistant."
- }
- ]
-
- # Add current user instruction with screenshot
- current_user_message = {
- "role": "user",
- "content": [
- {"type": "text", "text": user_prompt},
- ]
- }
- litellm_messages.append(current_user_message)
-
- # Process image for UITARS
- if not image_data:
- # Take screenshot if none found in messages
- if computer_handler:
- image_data = await computer_handler.screenshot()
-            if _on_screenshot:
-                await _on_screenshot(image_data, "screenshot_before")
-
- # Add screenshot to output items so it can be retained in history
- response_items.append(make_input_image_item(image_data))
- else:
- raise ValueError("No screenshot found in messages and no computer_handler provided")
- processed_image, original_width, original_height = process_image_for_uitars(image_data)
- encoded_image = pil_to_base64(processed_image)
-
- # Add conversation history
- if history_messages:
- litellm_messages.extend(history_messages)
- else:
- litellm_messages.append({
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_image}"}}
- ]
- })
-
- # Prepare API call kwargs
- api_kwargs = {
- "model": model,
- "messages": litellm_messages,
- "max_tokens": kwargs.get("max_tokens", 500),
- "temperature": kwargs.get("temperature", 0.0),
- "do_sample": kwargs.get("temperature", 0.0) > 0.0,
- "num_retries": max_retries,
- **{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]}
- }
-
- # Call API start hook
- if _on_api_start:
- await _on_api_start(api_kwargs)
-
- # Call liteLLM with UITARS model
- response = await litellm.acompletion(**api_kwargs)
-
- # Call API end hook
- if _on_api_end:
- await _on_api_end(api_kwargs, response)
-
- # Extract response content
- response_content = response.choices[0].message.content.strip() # type: ignore
-
- # Parse UITARS response
- parsed_responses = parse_uitars_response(response_content, original_width, original_height)
-
- # Convert to computer actions
- computer_actions = convert_to_computer_actions(parsed_responses, original_width, original_height)
-
- # Add computer actions to response items
- thought = parsed_responses[0].get("thought", "")
- if thought:
- response_items.append(make_reasoning_item(thought))
- response_items.extend(computer_actions)
-
- # Extract usage information
- response_usage = {
- **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(response.usage).model_dump(),
- "response_cost": response._hidden_params.get("response_cost", 0.0),
- }
- if _on_usage:
- await _on_usage(response_usage)
-
- # Create agent response
- agent_response = {
- "output": response_items,
- "usage": response_usage
- }
-
- return agent_response
\ No newline at end of file
diff --git a/libs/python/agent2/agent/responses.py b/libs/python/agent2/agent/responses.py
deleted file mode 100644
index 2d7e85d0..00000000
--- a/libs/python/agent2/agent/responses.py
+++ /dev/null
@@ -1,207 +0,0 @@
-"""
-Functions for making various Responses API items from different types of responses.
-Based on the OpenAI spec for Responses API items.
-"""
-
-import base64
-import json
-import uuid
-from typing import List, Dict, Any, Literal, Union, Optional
-
-from openai.types.responses.response_computer_tool_call_param import (
- ResponseComputerToolCallParam,
- ActionClick,
- ActionDoubleClick,
- ActionDrag,
- ActionDragPath,
- ActionKeypress,
- ActionMove,
- ActionScreenshot,
- ActionScroll,
- ActionType as ActionTypeAction,
- ActionWait,
- PendingSafetyCheck
-)
-
-from openai.types.responses.response_function_tool_call_param import ResponseFunctionToolCallParam
-from openai.types.responses.response_output_text_param import ResponseOutputTextParam
-from openai.types.responses.response_reasoning_item_param import ResponseReasoningItemParam, Summary
-from openai.types.responses.response_output_message_param import ResponseOutputMessageParam
-from openai.types.responses.easy_input_message_param import EasyInputMessageParam
-from openai.types.responses.response_input_image_param import ResponseInputImageParam
-
-def random_id():
- return str(uuid.uuid4())
-
-# User message items
-def make_input_image_item(image_data: Union[str, bytes]) -> EasyInputMessageParam:
- return EasyInputMessageParam(
- content=[
- ResponseInputImageParam(
- type="input_image",
- image_url=f"data:image/png;base64,{base64.b64encode(image_data).decode('utf-8') if isinstance(image_data, bytes) else image_data}"
- )
- ],
- role="user",
- type="message"
- )
-
-# Text items
-def make_reasoning_item(reasoning: str) -> ResponseReasoningItemParam:
- return ResponseReasoningItemParam(
- id=random_id(),
- summary=[
- Summary(text=reasoning, type="summary_text")
- ],
- type="reasoning"
- )
-
-def make_output_text_item(content: str) -> ResponseOutputMessageParam:
- return ResponseOutputMessageParam(
- id=random_id(),
- content=[
- ResponseOutputTextParam(
- text=content,
- type="output_text",
- annotations=[]
- )
- ],
- role="assistant",
- status="completed",
- type="message"
- )
-
-# Function call items
-def make_function_call_item(function_name: str, arguments: Dict[str, Any], call_id: Optional[str] = None) -> ResponseFunctionToolCallParam:
- return ResponseFunctionToolCallParam(
- id=random_id(),
- call_id=call_id if call_id else random_id(),
- name=function_name,
- arguments=json.dumps(arguments),
- status="completed",
- type="function_call"
- )
-
-# Computer tool call items
-def make_click_item(x: int, y: int, button: Literal["left", "right", "wheel", "back", "forward"] = "left", call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
- return ResponseComputerToolCallParam(
- id=random_id(),
- call_id=call_id if call_id else random_id(),
- action=ActionClick(
- button=button,
- type="click",
- x=x,
- y=y
- ),
- pending_safety_checks=[],
- status="completed",
- type="computer_call"
- )
-
-def make_double_click_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
- return ResponseComputerToolCallParam(
- id=random_id(),
- call_id=call_id if call_id else random_id(),
- action=ActionDoubleClick(
- type="double_click",
- x=x,
- y=y
- ),
- pending_safety_checks=[],
- status="completed",
- type="computer_call"
- )
-
-def make_drag_item(path: List[Dict[str, int]], call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
- drag_path = [ActionDragPath(x=point["x"], y=point["y"]) for point in path]
- return ResponseComputerToolCallParam(
- id=random_id(),
- call_id=call_id if call_id else random_id(),
- action=ActionDrag(
- path=drag_path,
- type="drag"
- ),
- pending_safety_checks=[],
- status="completed",
- type="computer_call"
- )
-
-def make_keypress_item(keys: List[str], call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
- return ResponseComputerToolCallParam(
- id=random_id(),
- call_id=call_id if call_id else random_id(),
- action=ActionKeypress(
- keys=keys,
- type="keypress"
- ),
- pending_safety_checks=[],
- status="completed",
- type="computer_call"
- )
-
-def make_move_item(x: int, y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
- return ResponseComputerToolCallParam(
- id=random_id(),
- call_id=call_id if call_id else random_id(),
- action=ActionMove(
- type="move",
- x=x,
- y=y
- ),
- pending_safety_checks=[],
- status="completed",
- type="computer_call"
- )
-
-def make_screenshot_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
- return ResponseComputerToolCallParam(
- id=random_id(),
- call_id=call_id if call_id else random_id(),
- action=ActionScreenshot(
- type="screenshot"
- ),
- pending_safety_checks=[],
- status="completed",
- type="computer_call"
- )
-
-def make_scroll_item(x: int, y: int, scroll_x: int, scroll_y: int, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
- return ResponseComputerToolCallParam(
- id=random_id(),
- call_id=call_id if call_id else random_id(),
- action=ActionScroll(
- scroll_x=scroll_x,
- scroll_y=scroll_y,
- type="scroll",
- x=x,
- y=y
- ),
- pending_safety_checks=[],
- status="completed",
- type="computer_call"
- )
-
-def make_type_item(text: str, call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
- return ResponseComputerToolCallParam(
- id=random_id(),
- call_id=call_id if call_id else random_id(),
- action=ActionTypeAction(
- text=text,
- type="type"
- ),
- pending_safety_checks=[],
- status="completed",
- type="computer_call"
- )
-
-def make_wait_item(call_id: Optional[str] = None) -> ResponseComputerToolCallParam:
- return ResponseComputerToolCallParam(
- id=random_id(),
- call_id=call_id if call_id else random_id(),
- action=ActionWait(
- type="wait"
- ),
- pending_safety_checks=[],
- status="completed",
- type="computer_call"
- )
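-
-# Usage sketch (illustrative): build a short action sequence
-#   items = [
-#       make_reasoning_item("Click the submit button, then wait."),
-#       make_click_item(640, 360),
-#       make_wait_item(),
-#   ]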
diff --git a/libs/python/agent2/agent/types.py b/libs/python/agent2/agent/types.py
deleted file mode 100644
index 2b07a6cf..00000000
--- a/libs/python/agent2/agent/types.py
+++ /dev/null
@@ -1,79 +0,0 @@
-"""
-Type definitions for agent
-"""
-
-from typing import Dict, List, Any, Optional, Callable, Protocol, Literal
-from pydantic import BaseModel
-import re
-from litellm import ResponseInputParam, ResponsesAPIResponse, ToolParam
-from collections.abc import Iterable
-
-# Agent input types
-Messages = str | ResponseInputParam
-Tools = Optional[Iterable[ToolParam]]
-
-# Agent output types
-AgentResponse = ResponsesAPIResponse
-
-# Agent loop registration
-class AgentLoopInfo(BaseModel):
- """Information about a registered agent loop"""
- func: Callable
- models_regex: str
- priority: int = 0
-
- def matches_model(self, model: str) -> bool:
- """Check if this loop matches the given model"""
- return bool(re.match(self.models_regex, model))
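-
-    # e.g. AgentLoopInfo(func=lambda: None, models_regex=r".*claude-.*")
-    #     .matches_model("anthropic/claude-3-5-sonnet-20241022") -> True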
-
-# Computer tool interface
-class Computer(Protocol):
- """Protocol defining the interface for computer interactions."""
-
- async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
- """Get the current environment type."""
- ...
-
- async def get_dimensions(self) -> tuple[int, int]:
- """Get screen dimensions as (width, height)."""
- ...
-
- async def screenshot(self) -> str:
- """Take a screenshot and return as base64 string."""
- ...
-
- async def click(self, x: int, y: int, button: str = "left") -> None:
- """Click at coordinates with specified button."""
- ...
-
- async def double_click(self, x: int, y: int) -> None:
- """Double click at coordinates."""
- ...
-
- async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
- """Scroll at coordinates with specified scroll amounts."""
- ...
-
- async def type(self, text: str) -> None:
- """Type text."""
- ...
-
- async def wait(self, ms: int = 1000) -> None:
- """Wait for specified milliseconds."""
- ...
-
- async def move(self, x: int, y: int) -> None:
- """Move cursor to coordinates."""
- ...
-
- async def keypress(self, keys: List[str]) -> None:
- """Press key combination."""
- ...
-
- async def drag(self, path: List[Dict[str, int]]) -> None:
- """Drag along specified path."""
- ...
-
- async def get_current_url(self) -> str:
- """Get current URL (for browser environments)."""
- ...
diff --git a/libs/python/agent2/agent/ui/__init__.py b/libs/python/agent2/agent/ui/__init__.py
deleted file mode 100644
index ae5ced7a..00000000
--- a/libs/python/agent2/agent/ui/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-"""
-UI components for agent
-"""
-
-from .gradio import launch_ui, create_gradio_ui
-
-__all__ = ["launch_ui", "create_gradio_ui"]
diff --git a/libs/python/agent2/agent/ui/__main__.py b/libs/python/agent2/agent/ui/__main__.py
deleted file mode 100644
index 4ac782a5..00000000
--- a/libs/python/agent2/agent/ui/__main__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from .gradio import launch_ui
-
-if __name__ == "__main__":
- launch_ui()
\ No newline at end of file
diff --git a/libs/python/agent2/agent/ui/gradio/__init__.py b/libs/python/agent2/agent/ui/gradio/__init__.py
deleted file mode 100644
index de2d351e..00000000
--- a/libs/python/agent2/agent/ui/gradio/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-"""
-Gradio UI for agent
-"""
-
-from .app import launch_ui
-from .ui_components import create_gradio_ui
-
-__all__ = ["launch_ui", "create_gradio_ui"]
diff --git a/libs/python/agent2/agent/ui/gradio/app.py b/libs/python/agent2/agent/ui/gradio/app.py
deleted file mode 100644
index 13c0786f..00000000
--- a/libs/python/agent2/agent/ui/gradio/app.py
+++ /dev/null
@@ -1,248 +0,0 @@
-"""
-Advanced Gradio UI for Computer-Use Agent (cua-agent)
-
-This is a Gradio interface for the Computer-Use Agent v0.4.x (cua-agent)
-with an advanced UI for model selection and configuration.
-
-Supported Agent Models:
-- OpenAI: openai/computer-use-preview
-- Anthropic: anthropic/claude-3-5-sonnet-20241022, anthropic/claude-3-7-sonnet-20250219
-- UI-TARS: huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
-- Omniparser: omniparser+anthropic/claude-3-5-sonnet-20241022, omniparser+ollama_chat/gemma3
-
-Requirements:
- - Mac with Apple Silicon (M1/M2/M3/M4), Linux, or Windows
- - macOS 14 (Sonoma) or newer / Ubuntu 20.04+
- - Python 3.11+
- - Lume CLI installed (https://github.com/trycua/cua)
- - OpenAI or Anthropic API key
-"""
-
-import os
-import asyncio
-import logging
-import json
-import platform
-from pathlib import Path
-from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple, Union
-import gradio as gr
-from gradio.components.chatbot import MetadataDict
-from typing import cast
-
-# Import from agent package
-from agent import ComputerAgent
-from agent.types import Messages, AgentResponse
-from computer import Computer
-
-# Global variables
-global_agent = None
-global_computer = None
-SETTINGS_FILE = Path(".gradio_settings.json")
-
-
-import dotenv
-if dotenv.load_dotenv():
- print(f"DEBUG - Loaded environment variables from {dotenv.find_dotenv()}")
-else:
- print("DEBUG - No .env file found")
-
-# --- Settings Load/Save Functions ---
-def load_settings() -> Dict[str, Any]:
- """Loads settings from the JSON file."""
- if SETTINGS_FILE.exists():
- try:
- with open(SETTINGS_FILE, "r") as f:
- settings = json.load(f)
- if isinstance(settings, dict):
- print(f"DEBUG - Loaded settings from {SETTINGS_FILE}")
- return settings
- except (json.JSONDecodeError, IOError) as e:
- print(f"Warning: Could not load settings from {SETTINGS_FILE}: {e}")
- return {}
-
-
-def save_settings(settings: Dict[str, Any]):
- """Saves settings to the JSON file."""
- settings.pop("provider_api_key", None)
- try:
- with open(SETTINGS_FILE, "w") as f:
- json.dump(settings, f, indent=4)
- print(f"DEBUG - Saved settings to {SETTINGS_FILE}")
- except IOError as e:
- print(f"Warning: Could not save settings to {SETTINGS_FILE}: {e}")
-
-
-# # Custom Screenshot Handler for Gradio chat
-# class GradioChatScreenshotHandler:
-# """Custom handler that adds screenshots to the Gradio chatbot."""
-
-# def __init__(self, chatbot_history: List[gr.ChatMessage]):
-# self.chatbot_history = chatbot_history
-# print("GradioChatScreenshotHandler initialized")
-
-# async def on_screenshot(self, screenshot_base64: str, action_type: str = "") -> None:
-# """Add screenshot to chatbot when a screenshot is taken."""
-#         image_markdown = f"![Screenshot](data:image/png;base64,{screenshot_base64})"
-
-# if self.chatbot_history is not None:
-# self.chatbot_history.append(
-# gr.ChatMessage(
-# role="assistant",
-# content=image_markdown,
-# metadata={"title": f"đĨī¸ Screenshot - {action_type}", "status": "done"},
-# )
-# )
-
-
-# Detect platform capabilities
-is_mac = platform.system().lower() == "darwin"
-is_lume_available = is_mac or (os.environ.get("PYLUME_HOST", "localhost") != "localhost")
-
-print("PYLUME_HOST: ", os.environ.get("PYLUME_HOST", "localhost"))
-print("is_mac: ", is_mac)
-print("Lume available: ", is_lume_available)
-
-# Map model names to agent model strings
-MODEL_MAPPINGS = {
- "openai": {
- "default": "openai/computer-use-preview",
- "OpenAI: Computer-Use Preview": "openai/computer-use-preview",
- },
- "anthropic": {
- "default": "anthropic/claude-3-7-sonnet-20250219",
- "Anthropic: Claude 4 Opus (20250514)": "anthropic/claude-opus-4-20250514",
- "Anthropic: Claude 4 Sonnet (20250514)": "anthropic/claude-sonnet-4-20250514",
- "Anthropic: Claude 3.7 Sonnet (20250219)": "anthropic/claude-3-7-sonnet-20250219",
- "Anthropic: Claude 3.5 Sonnet (20240620)": "anthropic/claude-3-5-sonnet-20240620",
- },
- "omni": {
- "default": "omniparser+openai/gpt-4o",
- "OMNI: OpenAI GPT-4o": "omniparser+openai/gpt-4o",
- "OMNI: OpenAI GPT-4o mini": "omniparser+openai/gpt-4o-mini",
- "OMNI: Claude 3.7 Sonnet (20250219)": "omniparser+anthropic/claude-3-7-sonnet-20250219",
- "OMNI: Claude 3.5 Sonnet (20240620)": "omniparser+anthropic/claude-3-5-sonnet-20240620",
- },
- "uitars": {
- "default": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B" if is_mac else "ui-tars",
- "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
- },
-}
-
-
-def get_model_string(model_name: str, loop_provider: str) -> str:
- """Determine the agent model string based on the input."""
- if model_name == "Custom model (OpenAI compatible API)":
- return "custom_oaicompat"
- elif model_name == "Custom model (ollama)":
- return "custom_ollama"
- elif loop_provider == "OMNI-OLLAMA" or model_name.startswith("OMNI: Ollama "):
- if model_name.startswith("OMNI: Ollama "):
- ollama_model = model_name.split("OMNI: Ollama ", 1)[1]
- return f"omniparser+ollama_chat/{ollama_model}"
- return "omniparser+ollama_chat/llama3"
-
- # Map based on loop provider
- mapping = MODEL_MAPPINGS.get(loop_provider.lower(), MODEL_MAPPINGS["openai"])
- return mapping.get(model_name, mapping["default"])
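-
-# e.g. get_model_string("OMNI: Ollama llama3", "OMNI")
-#   -> "omniparser+ollama_chat/llama3"
-# and get_model_string("OpenAI: Computer-Use Preview", "OPENAI")
-#   -> "openai/computer-use-preview"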
-
-
-def get_ollama_models() -> List[str]:
- """Get available models from Ollama if installed."""
- try:
- import subprocess
- result = subprocess.run(["ollama", "list"], capture_output=True, text=True)
- if result.returncode == 0:
- lines = result.stdout.strip().split("\n")
- if len(lines) < 2:
- return []
- models = []
- for line in lines[1:]:
- parts = line.split()
- if parts:
- model_name = parts[0]
- models.append(f"OMNI: Ollama {model_name}")
- return models
- return []
- except Exception as e:
- logging.error(f"Error getting Ollama models: {e}")
- return []
-
-
-def create_computer_instance(
- verbosity: int = logging.INFO,
- os_type: str = "macos",
- provider_type: str = "lume",
- name: Optional[str] = None,
- api_key: Optional[str] = None
-) -> Computer:
- """Create or get the global Computer instance."""
- global global_computer
- if global_computer is None:
- global_computer = Computer(
- verbosity=verbosity,
- os_type=os_type,
- provider_type=provider_type,
- name=name if name else "",
- api_key=api_key
- )
- return global_computer
-
-
-def create_agent(
- model_string: str,
- save_trajectory: bool = True,
- only_n_most_recent_images: int = 3,
- verbosity: int = logging.INFO,
- custom_model_name: Optional[str] = None,
- computer_os: str = "macos",
- computer_provider: str = "lume",
- computer_name: Optional[str] = None,
- computer_api_key: Optional[str] = None,
- max_trajectory_budget: Optional[float] = None,
-) -> ComputerAgent:
- """Create or update the global agent with the specified parameters."""
- global global_agent
-
- # Create the computer
- computer = create_computer_instance(
- verbosity=verbosity,
- os_type=computer_os,
- provider_type=computer_provider,
- name=computer_name,
- api_key=computer_api_key
- )
-
- # Handle custom models
- if model_string == "custom_oaicompat" and custom_model_name:
- model_string = custom_model_name
- elif model_string == "custom_ollama" and custom_model_name:
- model_string = f"omniparser+ollama_chat/{custom_model_name}"
-
- # Create agent kwargs
- agent_kwargs = {
- "model": model_string,
- "tools": [computer],
- "only_n_most_recent_images": only_n_most_recent_images,
- "verbosity": verbosity,
- }
-
- if save_trajectory:
- agent_kwargs["trajectory_dir"] = "trajectories"
-
- if max_trajectory_budget:
- agent_kwargs["max_trajectory_budget"] = {"max_budget": max_trajectory_budget, "raise_error": True}
-
- global_agent = ComputerAgent(**agent_kwargs)
- return global_agent
-
-
-def launch_ui():
- """Standalone function to launch the Gradio app."""
- from agent.ui.gradio.ui_components import create_gradio_ui
-    print("Starting Gradio app for CUA Agent...")
- demo = create_gradio_ui()
- demo.launch(share=False, inbrowser=True)
-
-
-if __name__ == "__main__":
- launch_ui()
diff --git a/libs/python/agent2/agent/ui/gradio/ui_components.py b/libs/python/agent2/agent/ui/gradio/ui_components.py
deleted file mode 100644
index dfcceb4e..00000000
--- a/libs/python/agent2/agent/ui/gradio/ui_components.py
+++ /dev/null
@@ -1,721 +0,0 @@
-"""
-UI Components for the Gradio interface
-"""
-
-import os
-import asyncio
-import logging
-import json
-import platform
-from pathlib import Path
-from typing import Dict, List, Optional, Any, cast
-import gradio as gr
-from gradio.components.chatbot import MetadataDict
-
-from .app import (
- load_settings, save_settings, create_agent, get_model_string,
- get_ollama_models, global_agent, global_computer
-)
-
-# Global messages array to maintain conversation history
-global_messages = []
-
-
-def create_gradio_ui() -> gr.Blocks:
- """Create a Gradio UI for the Computer-Use Agent."""
-
- # Load settings
- saved_settings = load_settings()
-
- # Check for API keys
- openai_api_key = os.environ.get("OPENAI_API_KEY", "")
- anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
- cua_api_key = os.environ.get("CUA_API_KEY", "")
-
- # Model choices
- openai_models = ["OpenAI: Computer-Use Preview"]
- anthropic_models = [
- "Anthropic: Claude 4 Opus (20250514)",
- "Anthropic: Claude 4 Sonnet (20250514)",
- "Anthropic: Claude 3.7 Sonnet (20250219)",
- "Anthropic: Claude 3.5 Sonnet (20240620)",
- ]
- omni_models = [
- "OMNI: OpenAI GPT-4o",
- "OMNI: OpenAI GPT-4o mini",
- "OMNI: Claude 3.7 Sonnet (20250219)",
- "OMNI: Claude 3.5 Sonnet (20240620)"
- ]
-
- # Check if API keys are available
- has_openai_key = bool(openai_api_key)
- has_anthropic_key = bool(anthropic_api_key)
- has_cua_key = bool(cua_api_key)
-
- # Get Ollama models for OMNI
- ollama_models = get_ollama_models()
- if ollama_models:
- omni_models += ollama_models
-
- # Detect platform
- is_mac = platform.system().lower() == "darwin"
-
- # Format model choices
- provider_to_models = {
- "OPENAI": openai_models,
- "ANTHROPIC": anthropic_models,
- "OMNI": omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
- "UITARS": ([
- "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
- ] if is_mac else []) + ["Custom model (OpenAI compatible API)"],
- }
-
- # Apply saved settings
- initial_loop = saved_settings.get("agent_loop", "OMNI")
- available_models_for_loop = provider_to_models.get(initial_loop, [])
- saved_model_choice = saved_settings.get("model_choice")
- if saved_model_choice and saved_model_choice in available_models_for_loop:
- initial_model = saved_model_choice
- else:
- if initial_loop == "OPENAI":
- initial_model = openai_models[0] if openai_models else "No models available"
- elif initial_loop == "ANTHROPIC":
- initial_model = anthropic_models[0] if anthropic_models else "No models available"
- else: # OMNI
- initial_model = omni_models[0] if omni_models else "Custom model (OpenAI compatible API)"
-
- initial_custom_model = saved_settings.get("custom_model", "Qwen2.5-VL-7B-Instruct")
- initial_provider_base_url = saved_settings.get("provider_base_url", "http://localhost:1234/v1")
- initial_save_trajectory = saved_settings.get("save_trajectory", True)
- initial_recent_images = saved_settings.get("recent_images", 3)
-
- # Example prompts
- example_messages = [
- "Create a Python virtual environment, install pandas and matplotlib, then plot stock data",
- "Open a PDF in Preview, add annotations, and save it as a compressed version",
- "Open Safari, search for 'macOS automation tools', and save the first three results as bookmarks",
- "Configure SSH keys and set up a connection to a remote server",
- ]
-
- def generate_python_code(agent_loop_choice, model_name, tasks, recent_images=3, save_trajectory=True, computer_os="linux", computer_provider="cloud", container_name="", cua_cloud_api_key="", max_budget=None):
- """Generate Python code for the current configuration and tasks."""
- tasks_str = ""
- for task in tasks:
- if task and task.strip():
- tasks_str += f' "{task}",\n'
-
- model_string = get_model_string(model_name, agent_loop_choice)
-
- computer_args = []
- if computer_os != "macos":
- computer_args.append(f'os_type="{computer_os}"')
- if computer_provider != "lume":
- computer_args.append(f'provider_type="{computer_provider}"')
- if container_name:
- computer_args.append(f'name="{container_name}"')
- if cua_cloud_api_key:
- computer_args.append(f'api_key="{cua_cloud_api_key}"')
-
- computer_args_str = ", ".join(computer_args)
- if computer_args_str:
- computer_args_str = f"({computer_args_str})"
- else:
- computer_args_str = "()"
-
- code = f'''import asyncio
-from computer import Computer
-from agent import ComputerAgent
-
-async def main():
- async with Computer{computer_args_str} as computer:
- agent = ComputerAgent(
- model="{model_string}",
- tools=[computer],
- only_n_most_recent_images={recent_images},'''
-
- if save_trajectory:
- code += '''
- trajectory_dir="trajectories",'''
-
- if max_budget:
- code += f'''
- max_trajectory_budget={{"max_budget": {max_budget}, "raise_error": True}},'''
-
- code += '''
- )
- '''
-
- if tasks_str:
- code += f'''
- # Prompts for the computer-use agent
- tasks = [
-{tasks_str.rstrip()}
- ]
-
- for task in tasks:
- print(f"Executing task: {{task}}")
- messages = [{{"role": "user", "content": task}}]
- async for result in agent.run(messages):
- for item in result["output"]:
- if item["type"] == "message":
- print(item["content"][0]["text"])'''
- else:
- code += f'''
- # Execute a single task
- task = "Search for information about CUA on GitHub"
- print(f"Executing task: {{task}}")
- messages = [{{"role": "user", "content": task}}]
- async for result in agent.run(messages):
- for item in result["output"]:
- if item["type"] == "message":
- print(item["content"][0]["text"])'''
-
- code += '''
-
-if __name__ == "__main__":
- asyncio.run(main())'''
-
- return code
-
- # Create the Gradio interface
- with gr.Blocks(title="Computer-Use Agent") as demo:
- with gr.Row():
- # Left column for settings
- with gr.Column(scale=1):
- # Logo
- gr.HTML(
- """
-
-                    <img src="https://github.com/trycua/cua/blob/main/img/logo_white.png?raw=true" alt="CUA logo" />
-
- """
- )
-
- # Python code accordion
- with gr.Accordion("Python Code", open=False):
- code_display = gr.Code(
- language="python",
-                        value=generate_python_code(initial_loop, initial_model, []),
- interactive=False,
- )
-
- with gr.Accordion("Computer Configuration", open=True):
- computer_os = gr.Radio(
- choices=["macos", "linux", "windows"],
- label="Operating System",
- value="macos",
- info="Select the operating system for the computer",
- )
-
- is_windows = platform.system().lower() == "windows"
- is_mac = platform.system().lower() == "darwin"
-
- providers = ["cloud"]
- if is_mac:
- providers += ["lume"]
- if is_windows:
- providers += ["winsandbox"]
-
- computer_provider = gr.Radio(
- choices=providers,
- label="Provider",
- value="lume" if is_mac else "cloud",
- info="Select the computer provider",
- )
-
- container_name = gr.Textbox(
- label="Container Name",
- placeholder="Enter container name (optional)",
- value=os.environ.get("CUA_CONTAINER_NAME", ""),
- info="Optional name for the container",
- )
-
- cua_cloud_api_key = gr.Textbox(
- label="CUA Cloud API Key",
- placeholder="Enter your CUA Cloud API key",
- value=os.environ.get("CUA_API_KEY", ""),
- type="password",
- info="Required for cloud provider",
- visible=(not has_cua_key)
- )
-
- with gr.Accordion("Agent Configuration", open=True):
- agent_loop = gr.Dropdown(
- choices=["OPENAI", "ANTHROPIC", "OMNI", "UITARS"],
- label="Agent Loop",
- value=initial_loop,
- info="Select the agent loop provider",
- )
-
- # Model selection dropdowns
- with gr.Group() as model_selection_group:
- openai_model_choice = gr.Dropdown(
- choices=openai_models,
- label="OpenAI Model",
- value=openai_models[0] if openai_models else "No models available",
- info="Select OpenAI model",
- interactive=True,
- visible=(initial_loop == "OPENAI")
- )
-
- anthropic_model_choice = gr.Dropdown(
- choices=anthropic_models,
- label="Anthropic Model",
- value=anthropic_models[0] if anthropic_models else "No models available",
- info="Select Anthropic model",
- interactive=True,
- visible=(initial_loop == "ANTHROPIC")
- )
-
- omni_model_choice = gr.Dropdown(
- choices=omni_models + ["Custom model (OpenAI compatible API)", "Custom model (ollama)"],
- label="OMNI Model",
- value=omni_models[0] if omni_models else "Custom model (OpenAI compatible API)",
- info="Select OMNI model or choose a custom model option",
- interactive=True,
- visible=(initial_loop == "OMNI")
- )
-
- uitars_model_choice = gr.Dropdown(
- choices=provider_to_models.get("UITARS", ["No models available"]),
- label="UITARS Model",
- value=provider_to_models.get("UITARS", ["No models available"])[0] if provider_to_models.get("UITARS") else "No models available",
- info="Select UITARS model",
- interactive=True,
- visible=(initial_loop == "UITARS")
- )
-
- model_choice = gr.Textbox(visible=False)
-
- # API key inputs
- with gr.Group(visible=not has_openai_key and (initial_loop == "OPENAI" or initial_loop == "OMNI")) as openai_key_group:
- openai_api_key_input = gr.Textbox(
- label="OpenAI API Key",
- placeholder="Enter your OpenAI API key",
- value=os.environ.get("OPENAI_API_KEY", ""),
- interactive=True,
- type="password",
- info="Required for OpenAI models"
- )
-
- with gr.Group(visible=not has_anthropic_key and (initial_loop == "ANTHROPIC" or initial_loop == "OMNI")) as anthropic_key_group:
- anthropic_api_key_input = gr.Textbox(
- label="Anthropic API Key",
- placeholder="Enter your Anthropic API key",
- value=os.environ.get("ANTHROPIC_API_KEY", ""),
- interactive=True,
- type="password",
- info="Required for Anthropic models"
- )
-
- # API key handlers
- def set_openai_api_key(key):
- if key and key.strip():
- os.environ["OPENAI_API_KEY"] = key.strip()
- print(f"DEBUG - Set OpenAI API key environment variable")
- return key
-
- def set_anthropic_api_key(key):
- if key and key.strip():
- os.environ["ANTHROPIC_API_KEY"] = key.strip()
- print(f"DEBUG - Set Anthropic API key environment variable")
- return key
-
- openai_api_key_input.change(
- fn=set_openai_api_key,
- inputs=[openai_api_key_input],
- outputs=[openai_api_key_input],
- queue=False
- )
-
- anthropic_api_key_input.change(
- fn=set_anthropic_api_key,
- inputs=[anthropic_api_key_input],
- outputs=[anthropic_api_key_input],
- queue=False
- )
-
- # UI update function
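-                # Shows the model dropdown for the selected loop, toggles the API-key
-                # groups, reveals the custom-model fields when a custom option is picked,
-                # and mirrors the active choice into the hidden model_choice textbox.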
- def update_ui(loop=None, openai_model=None, anthropic_model=None, omni_model=None, uitars_model=None):
- loop = loop or agent_loop.value
-
- model_value = None
- if loop == "OPENAI" and openai_model:
- model_value = openai_model
- elif loop == "ANTHROPIC" and anthropic_model:
- model_value = anthropic_model
- elif loop == "OMNI" and omni_model:
- model_value = omni_model
- elif loop == "UITARS" and uitars_model:
- model_value = uitars_model
-
- openai_visible = (loop == "OPENAI")
- anthropic_visible = (loop == "ANTHROPIC")
- omni_visible = (loop == "OMNI")
- uitars_visible = (loop == "UITARS")
-
- show_openai_key = not has_openai_key and (loop == "OPENAI" or (loop == "OMNI" and model_value and "OpenAI" in model_value and "Custom" not in model_value))
- show_anthropic_key = not has_anthropic_key and (loop == "ANTHROPIC" or (loop == "OMNI" and model_value and "Claude" in model_value and "Custom" not in model_value))
-
- is_custom_openai_api = model_value == "Custom model (OpenAI compatible API)"
- is_custom_ollama = model_value == "Custom model (ollama)"
- is_any_custom = is_custom_openai_api or is_custom_ollama
-
- model_choice_value = model_value if model_value else ""
-
- return [
- gr.update(visible=openai_visible),
- gr.update(visible=anthropic_visible),
- gr.update(visible=omni_visible),
- gr.update(visible=uitars_visible),
- gr.update(visible=show_openai_key),
- gr.update(visible=show_anthropic_key),
- gr.update(visible=is_any_custom),
- gr.update(visible=is_custom_openai_api),
- gr.update(visible=is_custom_openai_api),
- gr.update(value=model_choice_value)
- ]
-
- # Custom model inputs
- custom_model = gr.Textbox(
- label="Custom Model Name",
- placeholder="Enter custom model name (e.g., Qwen2.5-VL-7B-Instruct or llama3)",
- value=initial_custom_model,
- visible=(initial_model == "Custom model (OpenAI compatible API)" or initial_model == "Custom model (ollama)"),
- interactive=True,
- )
-
- provider_base_url = gr.Textbox(
- label="Provider Base URL",
- placeholder="Enter provider base URL (e.g., http://localhost:1234/v1)",
- value=initial_provider_base_url,
- visible=(initial_model == "Custom model (OpenAI compatible API)"),
- interactive=True,
- )
-
- provider_api_key = gr.Textbox(
- label="Provider API Key",
- placeholder="Enter provider API key (if required)",
- value="",
- visible=(initial_model == "Custom model (OpenAI compatible API)"),
- interactive=True,
- type="password",
- )
-
- # Connect UI update events
- for dropdown in [agent_loop, omni_model_choice, uitars_model_choice, openai_model_choice, anthropic_model_choice]:
- dropdown.change(
- fn=update_ui,
- inputs=[agent_loop, openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice],
- outputs=[
- openai_model_choice, anthropic_model_choice, omni_model_choice, uitars_model_choice,
- openai_key_group, anthropic_key_group,
- custom_model, provider_base_url, provider_api_key,
- model_choice
- ],
- queue=False
- )
-
- save_trajectory = gr.Checkbox(
- label="Save Trajectory",
- value=initial_save_trajectory,
- info="Save the agent's trajectory for debugging",
- interactive=True,
- )
-
- recent_images = gr.Slider(
- label="Recent Images",
- minimum=1,
- maximum=10,
- value=initial_recent_images,
- step=1,
- info="Number of recent images to keep in context",
- interactive=True,
- )
-
- max_budget = gr.Number(
- label="Max Budget ($)",
- value=lambda: None,
- minimum=-1,
- maximum=100.0,
- step=0.1,
- info="Optional budget limit for trajectory (0 = no limit)",
- interactive=True,
- )
-
- # Right column for chat interface
- with gr.Column(scale=2):
- gr.Markdown(
- "Ask me to perform tasks in a virtual environment.
Built with github.com/trycua/cua."
- )
-
- chatbot_history = gr.Chatbot(type="messages")
- msg = gr.Textbox(
- placeholder="Ask me to perform tasks in a virtual environment"
- )
- clear = gr.Button("Clear")
- cancel_button = gr.Button("Cancel", variant="stop")
-
- # Add examples
- example_group = gr.Examples(examples=example_messages, inputs=msg)
-
- # Chat submission function
- def chat_submit(message, history):
- history.append(gr.ChatMessage(role="user", content=message))
- return "", history
-
- # Cancel function
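-            # Note: this only appends a status message to the chat; it does not
-            # actually interrupt the running agent task.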
- async def cancel_agent_task(history):
- global global_agent
- if global_agent:
- print("DEBUG - Cancelling agent task")
- history.append(gr.ChatMessage(role="assistant", content="Task cancelled by user", metadata={"title": "â Cancelled"}))
- else:
- history.append(gr.ChatMessage(role="assistant", content="No active agent task to cancel", metadata={"title": "âšī¸ Info"}))
- return history
-
- # Process response function
- async def process_response(
- history,
- openai_model_value,
- anthropic_model_value,
- omni_model_value,
- uitars_model_value,
- custom_model_value,
- agent_loop_choice,
- save_traj,
- recent_imgs,
- custom_url_value=None,
- custom_api_key=None,
- openai_key_input=None,
- anthropic_key_input=None,
- computer_os="linux",
- computer_provider="cloud",
- container_name="",
- cua_cloud_api_key="",
- max_budget_value=None,
- ):
- if not history:
- yield history
- return
-
- # Get the last user message
- last_user_message = history[-1]["content"]
-
- # Get the appropriate model value based on the agent loop
- if agent_loop_choice == "OPENAI":
- model_choice_value = openai_model_value
- elif agent_loop_choice == "ANTHROPIC":
- model_choice_value = anthropic_model_value
- elif agent_loop_choice == "OMNI":
- model_choice_value = omni_model_value
- elif agent_loop_choice == "UITARS":
- model_choice_value = uitars_model_value
- else:
- model_choice_value = "No models available"
-
- # Determine if this is a custom model selection
- is_custom_model_selected = model_choice_value in ["Custom model (OpenAI compatible API)", "Custom model (ollama)"]
-
- # Determine the model name string to analyze
- if is_custom_model_selected:
- model_string_to_analyze = custom_model_value
- else:
- model_string_to_analyze = model_choice_value
-
- try:
- # Get the model string
- model_string = get_model_string(model_string_to_analyze, agent_loop_choice)
-
- # Set API keys if provided
- if openai_key_input:
- os.environ["OPENAI_API_KEY"] = openai_key_input
- if anthropic_key_input:
- os.environ["ANTHROPIC_API_KEY"] = anthropic_key_input
- if cua_cloud_api_key:
- os.environ["CUA_API_KEY"] = cua_cloud_api_key
-
- # Save settings
- current_settings = {
- "agent_loop": agent_loop_choice,
- "model_choice": model_choice_value,
- "custom_model": custom_model_value,
- "provider_base_url": custom_url_value,
- "save_trajectory": save_traj,
- "recent_images": recent_imgs,
- "computer_os": computer_os,
- "computer_provider": computer_provider,
- "container_name": container_name,
- }
- save_settings(current_settings)
-
-                    # Create agent (bind to the module-level handle so cancel_agent_task can see it)
-                    global global_agent
-                    global_agent = create_agent(
- model_string=model_string,
- save_trajectory=save_traj,
- only_n_most_recent_images=recent_imgs,
- custom_model_name=custom_model_value if is_custom_model_selected else None,
- computer_os=computer_os,
- computer_provider=computer_provider,
- computer_name=container_name,
- computer_api_key=cua_cloud_api_key,
- verbosity=logging.DEBUG,
- max_trajectory_budget=max_budget_value if max_budget_value and max_budget_value > 0 else None,
- )
-
- if global_agent is None:
- history.append(
- gr.ChatMessage(
- role="assistant",
- content="Failed to create agent. Check API keys and configuration.",
- )
- )
- yield history
- return
-
- # Add user message to global history
- global global_messages
- global_messages.append({"role": "user", "content": last_user_message})
-
- # Stream responses from the agent
- async for result in global_agent.run(global_messages):
- global_messages += result.get("output", [])
- # print(f"DEBUG - Agent response ------- START")
- # from pprint import pprint
- # pprint(result)
- # print(f"DEBUG - Agent response ------- END")
-
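-                        # Output items handled below: "message" (assistant text),
-                        # "computer_call" (a UI action), "function_call" / "function_call_output"
-                        # (custom tool round-trips), and "computer_call_output" (screenshot URL).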
- # Process the result output
- for item in result.get("output", []):
- if item.get("type") == "message":
- content = item.get("content", [])
- for content_part in content:
- if content_part.get("text"):
- history.append(gr.ChatMessage(
- role=item.get("role", "assistant"),
- content=content_part.get("text", ""),
- metadata=content_part.get("metadata", {})
- ))
- elif item.get("type") == "computer_call":
- action = item.get("action", {})
- action_type = action.get("type", "")
- if action_type:
- action_title = f"đ ī¸ Performing {action_type}"
- if action.get("x") and action.get("y"):
- action_title += f" at ({action['x']}, {action['y']})"
- history.append(gr.ChatMessage(
- role="assistant",
- content=f"```json\n{json.dumps(action)}\n```",
- metadata={"title": action_title}
- ))
- elif item.get("type") == "function_call":
- function_name = item.get("name", "")
- arguments = item.get("arguments", "{}")
- history.append(gr.ChatMessage(
- role="assistant",
- content=f"đ§ Calling function: {function_name}\n```json\n{arguments}\n```",
- metadata={"title": f"Function Call: {function_name}"}
- ))
- elif item.get("type") == "function_call_output":
- output = item.get("output", "")
- history.append(gr.ChatMessage(
- role="assistant",
- content=f"đ¤ Function output:\n```\n{output}\n```",
- metadata={"title": "Function Output"}
- ))
- elif item.get("type") == "computer_call_output":
- output = item.get("output", {}).get("image_url", "")
-                                image_markdown = f"![Computer output]({output})"
- history.append(gr.ChatMessage(
- role="assistant",
- content=image_markdown,
- metadata={"title": "đĨī¸ Computer Output"}
- ))
-
- yield history
-
- except Exception as e:
- import traceback
- traceback.print_exc()
- history.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}"))
- yield history
-
- # Connect the submit button
- submit_event = msg.submit(
- fn=chat_submit,
- inputs=[msg, chatbot_history],
- outputs=[msg, chatbot_history],
- queue=False,
- ).then(
- fn=process_response,
- inputs=[
- chatbot_history,
- openai_model_choice,
- anthropic_model_choice,
- omni_model_choice,
- uitars_model_choice,
- custom_model,
- agent_loop,
- save_trajectory,
- recent_images,
- provider_base_url,
- provider_api_key,
- openai_api_key_input,
- anthropic_api_key_input,
- computer_os,
- computer_provider,
- container_name,
- cua_cloud_api_key,
- max_budget,
- ],
- outputs=[chatbot_history],
- queue=True,
- )
-
- # Clear button functionality
- def clear_chat():
- global global_messages
- global_messages.clear()
- return None
-
- clear.click(clear_chat, None, chatbot_history, queue=False)
-
- # Connect cancel button
- cancel_button.click(
- cancel_agent_task,
- [chatbot_history],
- [chatbot_history],
- queue=False
- )
-
- # Code display update function
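-            # Regenerates the Python snippet whenever any configuration control or
-            # the chat history changes (wired via the change handlers below).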
- def update_code_display(agent_loop, model_choice_val, custom_model_val, chat_history, recent_images_val, save_trajectory_val, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget_val):
- messages = []
- if chat_history:
- for msg in chat_history:
- if isinstance(msg, dict) and msg.get("role") == "user":
- messages.append(msg.get("content", ""))
-
- return generate_python_code(
- agent_loop,
- model_choice_val or custom_model_val or "gpt-4o",
- messages,
- recent_images_val,
- save_trajectory_val,
- computer_os,
- computer_provider,
- container_name,
- cua_cloud_api_key,
- max_budget_val
- )
-
- # Update code display when configuration changes
- for component in [agent_loop, model_choice, custom_model, chatbot_history, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget]:
- component.change(
- update_code_display,
- inputs=[agent_loop, model_choice, custom_model, chatbot_history, recent_images, save_trajectory, computer_os, computer_provider, container_name, cua_cloud_api_key, max_budget],
- outputs=[code_display]
- )
-
- return demo
diff --git a/libs/python/agent2/example.py b/libs/python/agent2/example.py
deleted file mode 100644
index f686b790..00000000
--- a/libs/python/agent2/example.py
+++ /dev/null
@@ -1,148 +0,0 @@
-"""
-Example usage of the agent library with docstring-based tool definitions.
-"""
-
-import asyncio
-import logging
-
-from agent import agent_loop, ComputerAgent
-from agent.types import Messages
-from computer import Computer
-from computer.helpers import sandboxed
-
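-# @sandboxed() marks the tool to run inside the sandboxed computer rather than
-# on the host; the undecorated tools below (save_note, calculate) run host-side.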
-@sandboxed()
-def read_file(location: str) -> str:
- """Read contents of a file
-
- Parameters
- ----------
- location : str
- Path to the file to read
-
- Returns
- -------
- str
- Contents of the file or error message
- """
- try:
- with open(location, 'r') as f:
- return f.read()
- except Exception as e:
- return f"Error reading file: {str(e)}"
-
-def save_note(content: str, filename: str = "note.txt") -> str:
- """Save content to a note file
-
- Parameters
- ----------
- content : str
- Content to save to the file
- filename : str, optional
- Name of the file to save to (default is "note.txt")
-
- Returns
- -------
- str
- Success or error message
- """
- try:
- with open(filename, 'w') as f:
- f.write(content)
- return f"Saved note to {filename}"
- except Exception as e:
- return f"Error saving note: {str(e)}"
-
-def calculate(a: int, b: int) -> int:
- """Calculate the sum of two integers
-
- Parameters
- ----------
- a : int
- First integer
- b : int
- Second integer
-
- Returns
- -------
- int
- Sum of the two integers
- """
- return a + b
-
-async def main():
- """Example usage of ComputerAgent with different models"""
-
- # Example 1: Using Claude with computer and custom tools
- print("=== Example 1: Claude with Computer ===")
-
- import os
- import dotenv
- import json
- dotenv.load_dotenv()
-
- assert os.getenv("CUA_CONTAINER_NAME") is not None, "CUA_CONTAINER_NAME is not set"
- assert os.getenv("CUA_API_KEY") is not None, "CUA_API_KEY is not set"
-
- async with Computer(
- os_type="linux",
- provider_type="cloud",
- name=os.getenv("CUA_CONTAINER_NAME") or "",
- api_key=os.getenv("CUA_API_KEY") or ""
- ) as computer:
- agent = ComputerAgent(
- # Supported models:
-
- # == OpenAI CUA (computer-use-preview) ==
- model="openai/computer-use-preview",
-
- # == Anthropic CUA (Claude > 3.5) ==
- # model="anthropic/claude-opus-4-20250514",
- # model="anthropic/claude-sonnet-4-20250514",
- # model="anthropic/claude-3-7-sonnet-20250219",
- # model="anthropic/claude-3-5-sonnet-20240620",
-
- # == UI-TARS ==
- # model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
- # TODO: add local mlx provider
- # model="mlx-community/UI-TARS-1.5-7B-6bit",
- # model="ollama_chat/0000/ui-tars-1.5-7b",
-
- # == Omniparser + Any LLM ==
- # model="omniparser+..."
- # model="omniparser+anthropic/claude-opus-4-20250514",
-
- tools=[computer],
- only_n_most_recent_images=3,
- verbosity=logging.INFO,
- trajectory_dir="trajectories",
- use_prompt_caching=True,
- max_trajectory_budget={ "max_budget": 1.0, "raise_error": True, "reset_after_each_run": False },
- )
-
- history = []
- while True:
- user_input = input("> ")
- history.append({"role": "user", "content": user_input})
-
- # Non-streaming usage
- async for result in agent.run(history, stream=False):
- history += result["output"]
-
- # # Print output
- # for item in result["output"]:
- # if item["type"] == "message":
- # print(item["content"][0]["text"])
- # elif item["type"] == "computer_call":
- # action = item["action"]
- # action_type = action["type"]
- # action_args = {k: v for k, v in action.items() if k != "type"}
- # print(f"{action_type}({action_args})")
- # elif item["type"] == "function_call":
- # action = item["name"]
- # action_args = item["arguments"]
- # print(f"{action}({action_args})")
- # elif item["type"] == "function_call_output":
- # print("===>", item["output"])
-
-if __name__ == "__main__":
- asyncio.run(main())
\ No newline at end of file
diff --git a/libs/python/agent2/pyproject.toml b/libs/python/agent2/pyproject.toml
deleted file mode 100644
index 4d4e721e..00000000
--- a/libs/python/agent2/pyproject.toml
+++ /dev/null
@@ -1,71 +0,0 @@
-[build-system]
-requires = ["pdm-backend"]
-build-backend = "pdm.backend"
-
-[project]
-name = "cua-agent"
-version = "0.4.0b4"
-description = "CUA (Computer Use) Agent for AI-driven computer interaction"
-readme = "README.md"
-authors = [
- { name = "TryCua", email = "gh@trycua.com" }
-]
-dependencies = [
- "httpx>=0.27.0",
- "aiohttp>=3.9.3",
- "asyncio",
- "anyio>=4.4.1",
- "typing-extensions>=4.12.2",
- "pydantic>=2.6.4",
- "rich>=13.7.1",
- "python-dotenv>=1.0.1",
- "cua-computer>=0.3.0,<0.5.0",
- "cua-core>=0.1.0,<0.2.0",
- "certifi>=2024.2.2",
- "litellm>=1.74.8"
-]
-requires-python = ">=3.11"
-
-[project.optional-dependencies]
-openai = []
-anthropic = []
-omni = [
- "ultralytics>=8.0.0",
- "cua-som>=0.1.0,<0.2.0",
-]
-uitars = []
-uitars-mlx = [
- "mlx-vlm>=0.1.27; sys_platform == 'darwin'"
-]
-uitars-hf = [
- "transformers>=4.54.0"
-]
-ui = [
- "gradio>=5.23.3",
- "python-dotenv>=1.0.1",
-]
-cli = [
- "yaspin>=3.1.0",
-]
-all = [
- # omni requirements
- "ultralytics>=8.0.0",
- "cua-som>=0.1.0,<0.2.0",
- # uitars requirements
- "mlx-vlm>=0.1.27; sys_platform == 'darwin'",
- "transformers>=4.54.0",
- # ui requirements
- "gradio>=5.23.3",
- "python-dotenv>=1.0.1",
- # cli requirements
- "yaspin>=3.1.0",
-]
-
-[tool.uv]
-constraint-dependencies = ["fastrtc>0.43.0", "mlx-audio>0.2.3"]
-
-[tool.pdm]
-distribution = true
-
-[tool.pdm.build]
-includes = ["agent/"]