Mirror of https://github.com/trycua/lume.git (synced 2026-01-06 04:20:03 -06:00)

Merge branch 'main' into models/opencua
@@ -29,16 +29,6 @@
|
||||
|
||||
```bash
|
||||
pip install "cua-agent[all]"
|
||||
|
||||
# or install specific providers
|
||||
pip install "cua-agent[openai]" # OpenAI computer-use-preview support
|
||||
pip install "cua-agent[anthropic]" # Anthropic Claude support
|
||||
pip install "cua-agent[omni]" # Omniparser + any LLM support
|
||||
pip install "cua-agent[uitars]" # UI-TARS
|
||||
pip install "cua-agent[uitars-mlx]" # UI-TARS + MLX support
|
||||
pip install "cua-agent[uitars-hf]" # UI-TARS + Huggingface support
|
||||
pip install "cua-agent[glm45v-hf]" # GLM-4.5V + Huggingface support
|
||||
pip install "cua-agent[ui]" # Gradio UI support
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
@@ -79,303 +69,18 @@ if __name__ == "__main__":
    asyncio.run(main())
```

## Supported Models
## Docs

### Anthropic Claude (Computer Use API)

```python
model="anthropic/claude-3-5-sonnet-20241022"
model="anthropic/claude-3-7-sonnet-20250219"
model="anthropic/claude-opus-4-20250514"
model="anthropic/claude-sonnet-4-20250514"
```

### OpenAI Computer Use Preview

```python
model="openai/computer-use-preview"
```

### UI-TARS (Local or Huggingface Inference)

```python
model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"
model="ollama_chat/0000/ui-tars-1.5-7b"
```

### Omniparser + Any LLM

```python
model="omniparser+ollama_chat/mistral-small3.2"
model="omniparser+vertex_ai/gemini-pro"
model="omniparser+anthropic/claude-3-5-sonnet-20241022"
model="omniparser+openai/gpt-4o"
```

## Custom Tools

Define custom tools using decorated functions:

```python
from computer.helpers import sandboxed

@sandboxed()
def read_file(location: str) -> str:
    """Read contents of a file

    Parameters
    ----------
    location : str
        Path to the file to read

    Returns
    -------
    str
        Contents of the file or error message
    """
    try:
        with open(location, 'r') as f:
            return f.read()
    except Exception as e:
        return f"Error reading file: {str(e)}"

def calculate(a: int, b: int) -> int:
    """Calculate the sum of two integers"""
    return a + b

# Use with agent
agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    tools=[computer, read_file, calculate]
)
```

## Callbacks System

The agent provides a comprehensive callback system for extending functionality:

### Built-in Callbacks

```python
import logging

from agent.callbacks import (
    ImageRetentionCallback,
    TrajectorySaverCallback,
    BudgetManagerCallback,
    LoggingCallback
)

agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    tools=[computer],
    callbacks=[
        ImageRetentionCallback(only_n_most_recent_images=3),
        TrajectorySaverCallback(trajectory_dir="trajectories"),
        BudgetManagerCallback(max_budget=10.0, raise_error=True),
        LoggingCallback(level=logging.INFO)
    ]
)
```

### Custom Callbacks

```python
from agent.callbacks.base import AsyncCallbackHandler

class CustomCallback(AsyncCallbackHandler):
    async def on_llm_start(self, messages):
        """Preprocess messages before LLM call"""
        # Add custom preprocessing logic
        return messages

    async def on_llm_end(self, messages):
        """Postprocess messages after LLM call"""
        # Add custom postprocessing logic
        return messages

    async def on_usage(self, usage):
        """Track usage information"""
        print(f"Tokens used: {usage.total_tokens}")
```
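
A minimal usage sketch (an editorial addition, not from the source): a custom callback is passed through the same `callbacks` list as the built-ins, assuming the `computer` tool object from Quick Start is in scope.

```python
agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    tools=[computer],
    callbacks=[CustomCallback()],  # mixes freely with built-in callbacks
)
```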

## Budget Management

Control costs with built-in budget management:

```python
# Simple budget limit
agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    max_trajectory_budget=5.0  # $5 limit
)

# Advanced budget configuration
agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    max_trajectory_budget={
        "max_budget": 10.0,
        "raise_error": True,           # Raise error when exceeded
        "reset_after_each_run": False  # Persistent across runs
    }
)
```

## Trajectory Management

Save and replay agent conversations:

```python
agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    trajectory_dir="trajectories",  # Auto-save trajectories
    tools=[computer]
)

# Trajectories are saved with:
# - Complete conversation history
# - Usage statistics and costs
# - Timestamps and metadata
# - Screenshots and computer actions
```

## Configuration Options

### ComputerAgent Parameters

The main constructor parameters are listed below; a combined sketch follows the list.

- `model`: Model identifier (required)
- `tools`: List of computer objects and decorated functions
- `callbacks`: List of callback handlers for extensibility
- `only_n_most_recent_images`: Limit recent images to prevent context overflow
- `verbosity`: Logging level (logging.INFO, logging.DEBUG, etc.)
- `trajectory_dir`: Directory to save conversation trajectories
- `max_retries`: Maximum API call retries (default: 3)
- `screenshot_delay`: Delay between actions and screenshots (default: 0.5s)
- `use_prompt_caching`: Enable prompt caching for supported models
- `max_trajectory_budget`: Budget limit configuration
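
A minimal sketch combining several of these parameters (the values are illustrative, not defaults from the source; `computer` is the tool object from Quick Start):

```python
import logging

agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    tools=[computer],
    only_n_most_recent_images=3,
    verbosity=logging.INFO,
    trajectory_dir="trajectories",
    max_retries=3,
    screenshot_delay=0.5,
    use_prompt_caching=False,
    max_trajectory_budget=5.0,
)
```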

### Environment Variables

```bash
# Computer instance (cloud)
export CUA_CONTAINER_NAME="your-container-name"
export CUA_API_KEY="your-cua-api-key"

# LLM API keys
export ANTHROPIC_API_KEY="your-anthropic-key"
export OPENAI_API_KEY="your-openai-key"
```

## Advanced Usage

### Streaming Responses

```python
async for result in agent.run(messages, stream=True):
    # Process streaming chunks
    for item in result["output"]:
        if item["type"] == "message":
            print(item["content"][0]["text"], end="", flush=True)
        elif item["type"] == "computer_call":
            action = item["action"]
            print(f"\n[Action: {action['type']}]")
```

### Interactive Chat Loop

```python
history = []
while True:
    user_input = input("> ")
    if user_input.lower() in ['quit', 'exit']:
        break

    history.append({"role": "user", "content": user_input})

    async for result in agent.run(history):
        history += result["output"]

        # Display assistant responses
        for item in result["output"]:
            if item["type"] == "message":
                print(item["content"][0]["text"])
```

### Error Handling

```python
try:
    async for result in agent.run(messages):
        # Process results
        pass
except BudgetExceededException:
    print("Budget limit exceeded")
except Exception as e:
    print(f"Agent error: {e}")
```

## API Reference

### ComputerAgent.run()

```python
async def run(
    self,
    messages: Messages,
    stream: bool = False,
    **kwargs
) -> AsyncGenerator[Dict[str, Any], None]:
    """
    Run the agent with the given messages.

    Args:
        messages: List of message dictionaries
        stream: Whether to stream the response
        **kwargs: Additional arguments

    Returns:
        AsyncGenerator that yields response chunks
    """
```

### Message Format

```python
messages = [
    {
        "role": "user",
        "content": "Take a screenshot and describe what you see"
    },
    {
        "role": "assistant",
        "content": "I'll take a screenshot for you."
    }
]
```

### Response Format

```python
{
    "output": [
        {
            "type": "message",
            "role": "assistant",
            "content": [{"type": "output_text", "text": "I can see..."}]
        },
        {
            "type": "computer_call",
            "action": {"type": "screenshot"},
            "call_id": "call_123"
        },
        {
            "type": "computer_call_output",
            "call_id": "call_123",
            "output": {"image_url": "data:image/png;base64,..."}
        }
    ],
    "usage": {
        "prompt_tokens": 150,
        "completion_tokens": 75,
        "total_tokens": 225,
        "response_cost": 0.01
    }
}
```
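
A hedged sketch of consuming this shape, summing `response_cost` across results; it assumes every result carries the `usage` dict shown above, which may not hold for all providers.

```python
total_cost = 0.0
async for result in agent.run(messages):
    # "usage" follows the Response Format above; default to 0.0 when absent
    total_cost += result.get("usage", {}).get("response_cost", 0.0)
print(f"Total cost: ${total_cost:.4f}")
```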

- [Agent Loops](https://trycua.com/docs/agent-sdk/agent-loops)
- [Supported Agents](https://trycua.com/docs/agent-sdk/supported-agents)
- [Supported Models](https://trycua.com/docs/agent-sdk/supported-models)
- [Chat History](https://trycua.com/docs/agent-sdk/chat-history)
- [Callbacks](https://trycua.com/docs/agent-sdk/callbacks)
- [Custom Tools](https://trycua.com/docs/agent-sdk/custom-tools)
- [Custom Computer Handlers](https://trycua.com/docs/agent-sdk/custom-computer-handlers)
- [Prompt Caching](https://trycua.com/docs/agent-sdk/prompt-caching)
- [Usage Tracking](https://trycua.com/docs/agent-sdk/usage-tracking)
- [Benchmarks](https://trycua.com/docs/agent-sdk/benchmarks)

## License
@@ -4,8 +4,10 @@ Adapters package for agent - Custom LLM adapters for LiteLLM

from .huggingfacelocal_adapter import HuggingFaceLocalAdapter
from .human_adapter import HumanAdapter
from .mlxvlm_adapter import MLXVLMAdapter

__all__ = [
    "HuggingFaceLocalAdapter",
    "HumanAdapter",
    "MLXVLMAdapter",
]

359  libs/python/agent/agent/adapters/mlxvlm_adapter.py  Normal file
@@ -0,0 +1,359 @@
import asyncio
import functools
import warnings
import io
import base64
import math
import re
from concurrent.futures import ThreadPoolExecutor
from typing import Iterator, AsyncIterator, Dict, List, Any, Optional, Tuple, cast
from PIL import Image
from litellm.types.utils import GenericStreamingChunk, ModelResponse
from litellm.llms.custom_llm import CustomLLM
from litellm import completion, acompletion

# Try to import MLX dependencies
try:
    import mlx.core as mx
    from mlx_vlm import load, generate
    from mlx_vlm.prompt_utils import apply_chat_template
    from mlx_vlm.utils import load_config
    from transformers.tokenization_utils import PreTrainedTokenizer
    MLX_AVAILABLE = True
except ImportError:
    MLX_AVAILABLE = False

# Constants for smart_resize
IMAGE_FACTOR = 28
MIN_PIXELS = 100 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
MAX_RATIO = 200

def round_by_factor(number: float, factor: int) -> int:
    """Returns the closest integer to 'number' that is divisible by 'factor'."""
    return round(number / factor) * factor

def ceil_by_factor(number: float, factor: int) -> int:
    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
    return math.ceil(number / factor) * factor

def floor_by_factor(number: float, factor: int) -> int:
    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
    return math.floor(number / factor) * factor

def smart_resize(
    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
) -> tuple[int, int]:
    """
    Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.
    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
    3. The aspect ratio of the image is maintained as closely as possible.
    """
    if max(height, width) / min(height, width) > MAX_RATIO:
        raise ValueError(
            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
        )
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)
    return h_bar, w_bar
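
# Worked example (editorial note, not part of the source): a 1920x1080
# screenshot has 2,073,600 pixels, already within [MIN_PIXELS, MAX_PIXELS],
# so smart_resize(1080, 1920) only snaps each side to the nearest multiple
# of 28 and returns (1092, 1932).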


class MLXVLMAdapter(CustomLLM):
    """MLX VLM Adapter for running vision-language models locally using MLX."""

    def __init__(self, **kwargs):
        """Initialize the adapter.

        Args:
            **kwargs: Additional arguments
        """
        super().__init__()

        self.models = {}  # Cache for loaded models
        self.processors = {}  # Cache for loaded processors
        self.configs = {}  # Cache for loaded configs
        self._executor = ThreadPoolExecutor(max_workers=1)  # Single thread pool

    def _load_model_and_processor(self, model_name: str):
        """Load model and processor if not already cached.

        Args:
            model_name: Name of the model to load

        Returns:
            Tuple of (model, processor, config)
        """
        if not MLX_AVAILABLE:
            raise ImportError("MLX VLM dependencies not available. Please install mlx-vlm.")

        if model_name not in self.models:
            # Load model and processor
            model_obj, processor = load(
                model_name,
                processor_kwargs={"min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS}
            )
            config = load_config(model_name)

            # Cache them
            self.models[model_name] = model_obj
            self.processors[model_name] = processor
            self.configs[model_name] = config

        return self.models[model_name], self.processors[model_name], self.configs[model_name]

    def _process_coordinates(self, text: str, original_size: Tuple[int, int], model_size: Tuple[int, int]) -> str:
        """Process coordinates in box tokens based on image resizing using smart_resize approach.

        Args:
            text: Text containing box tokens
            original_size: Original image size (width, height)
            model_size: Model processed image size (width, height)

        Returns:
            Text with processed coordinates
        """
        # Find all box tokens
        box_pattern = r"<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>"

        def process_coords(match):
            model_x, model_y = int(match.group(1)), int(match.group(2))
            # Scale coordinates from model space to original image space
            # Both original_size and model_size are in (width, height) format
            new_x = int(model_x * original_size[0] / model_size[0])  # Width
            new_y = int(model_y * original_size[1] / model_size[1])  # Height
            return f"<|box_start|>({new_x},{new_y})<|box_end|>"

        return re.sub(box_pattern, process_coords, text)
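
    # Worked example (editorial note, not part of the source): with
    # original_size (1920, 1080) and model_size (1932, 1092), the model-space
    # token "<|box_start|>(966,546)<|box_end|>" maps back to (960, 540) in
    # the original screenshot.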

    def _convert_messages(self, messages: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[Image.Image], Dict[int, Tuple[int, int]], Dict[int, Tuple[int, int]]]:
        """Convert OpenAI format messages to MLX VLM format and extract images.

        Args:
            messages: Messages in OpenAI format

        Returns:
            Tuple of (processed_messages, images, original_sizes, model_sizes)
        """
        processed_messages = []
        images = []
        original_sizes = {}  # Track original sizes of images for coordinate mapping
        model_sizes = {}  # Track model processed sizes
        image_index = 0

        for message in messages:
            processed_message = {
                "role": message["role"],
                "content": []
            }

            content = message.get("content", [])
            if isinstance(content, str):
                # Simple text content
                processed_message["content"] = content
            elif isinstance(content, list):
                # Multi-modal content
                processed_content = []
                for item in content:
                    if item.get("type") == "text":
                        processed_content.append({
                            "type": "text",
                            "text": item.get("text", "")
                        })
                    elif item.get("type") == "image_url":
                        image_url = item.get("image_url", {}).get("url", "")
                        pil_image = None

                        if image_url.startswith("data:image/"):
                            # Extract base64 data
                            base64_data = image_url.split(',')[1]
                            # Convert base64 to PIL Image
                            image_data = base64.b64decode(base64_data)
                            pil_image = Image.open(io.BytesIO(image_data))
                        else:
                            # Handle file path or URL
                            pil_image = Image.open(image_url)

                        # Store original image size for coordinate mapping
                        original_size = pil_image.size
                        original_sizes[image_index] = original_size

                        # Use smart_resize to determine model size
                        # Note: smart_resize expects (height, width) but PIL gives (width, height)
                        height, width = original_size[1], original_size[0]
                        new_height, new_width = smart_resize(height, width)
                        # Store model size in (width, height) format for consistent coordinate processing
                        model_sizes[image_index] = (new_width, new_height)

                        # Resize the image using the calculated dimensions from smart_resize
                        resized_image = pil_image.resize((new_width, new_height))
                        images.append(resized_image)

                        # Add image placeholder to content
                        processed_content.append({
                            "type": "image"
                        })

                        image_index += 1

                processed_message["content"] = processed_content

            processed_messages.append(processed_message)

        return processed_messages, images, original_sizes, model_sizes

    def _generate(self, **kwargs) -> str:
        """Generate response using the local MLX VLM model.

        Args:
            **kwargs: Keyword arguments containing messages and model info

        Returns:
            Generated text response
        """
        messages = kwargs.get('messages', [])
        model_name = kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit')
        max_tokens = kwargs.get('max_tokens', 128)

        # Warn about ignored kwargs
        ignored_kwargs = set(kwargs.keys()) - {'messages', 'model', 'max_tokens'}
        if ignored_kwargs:
            warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}")

        # Load model and processor
        model, processor, config = self._load_model_and_processor(model_name)

        # Convert messages and extract images
        processed_messages, images, original_sizes, model_sizes = self._convert_messages(messages)

        # Process user text input with box coordinates after image processing
        # Swap original_size and model_size arguments for inverse transformation
        for msg_idx, msg in enumerate(processed_messages):
            if msg.get("role") == "user" and isinstance(msg.get("content"), str):
                content = msg.get("content", "")
                if "<|box_start|>" in content and original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes:
                    orig_size = original_sizes[0]
                    model_size = model_sizes[0]
                    # Swap arguments to perform inverse transformation for user input
                    processed_messages[msg_idx]["content"] = self._process_coordinates(content, model_size, orig_size)

        try:
            # Format prompt according to model requirements using the processor directly
            prompt = processor.apply_chat_template(
                processed_messages,
                tokenize=False,
                add_generation_prompt=True,
                return_tensors='pt'
            )
            tokenizer = cast(PreTrainedTokenizer, processor)

            # Generate response
            text_content, usage = generate(
                model,
                tokenizer,
                str(prompt),
                images,  # type: ignore
                verbose=False,
                max_tokens=max_tokens
            )

        except Exception as e:
            raise RuntimeError(f"Error generating response: {str(e)}") from e

        # Process coordinates in the response back to original image space
        if original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes:
            # Get original image size and model size (using the first image)
            orig_size = original_sizes[0]
            model_size = model_sizes[0]

            # Check if output contains box tokens that need processing
            if "<|box_start|>" in text_content:
                # Process coordinates from model space back to original image space
                text_content = self._process_coordinates(text_content, orig_size, model_size)

        return text_content

    def completion(self, *args, **kwargs) -> ModelResponse:
        """Synchronous completion method.

        Returns:
            ModelResponse with generated text
        """
        generated_text = self._generate(**kwargs)

        result = completion(
            model=f"mlx/{kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit')}",
            mock_response=generated_text,
        )
        return cast(ModelResponse, result)

    async def acompletion(self, *args, **kwargs) -> ModelResponse:
        """Asynchronous completion method.

        Returns:
            ModelResponse with generated text
        """
        # Run _generate in thread pool to avoid blocking
        loop = asyncio.get_event_loop()
        generated_text = await loop.run_in_executor(
            self._executor,
            functools.partial(self._generate, **kwargs)
        )

        result = await acompletion(
            model=f"mlx/{kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit')}",
            mock_response=generated_text,
        )
        return cast(ModelResponse, result)

    def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
        """Synchronous streaming method.

        Returns:
            Iterator of GenericStreamingChunk
        """
        generated_text = self._generate(**kwargs)

        generic_streaming_chunk: GenericStreamingChunk = {
            "finish_reason": "stop",
            "index": 0,
            "is_finished": True,
            "text": generated_text,
            "tool_use": None,
            "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
        }

        yield generic_streaming_chunk

    async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
        """Asynchronous streaming method.

        Returns:
            AsyncIterator of GenericStreamingChunk
        """
        # Run _generate in thread pool to avoid blocking
        loop = asyncio.get_event_loop()
        generated_text = await loop.run_in_executor(
            self._executor,
            functools.partial(self._generate, **kwargs)
        )

        generic_streaming_chunk: GenericStreamingChunk = {
            "finish_reason": "stop",
            "index": 0,
            "is_finished": True,
            "text": generated_text,
            "tool_use": None,
            "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
        }

        yield generic_streaming_chunk
@@ -3,6 +3,7 @@ ComputerAgent - Main agent class that selects and runs agent loops
"""

import asyncio
from pathlib import Path
from typing import Dict, List, Any, Optional, AsyncGenerator, Union, cast, Callable, Set, Tuple

from litellm.responses.utils import Usage
@@ -22,6 +23,7 @@ import inspect
from .adapters import (
    HuggingFaceLocalAdapter,
    HumanAdapter,
    MLXVLMAdapter,
)
from .callbacks import (
    ImageRetentionCallback,
@@ -29,6 +31,7 @@ from .callbacks import (
    TrajectorySaverCallback,
    BudgetManagerCallback,
    TelemetryCallback,
    OperatorNormalizerCallback
)
from .computers import (
    AsyncComputerHandler,
@@ -160,7 +163,7 @@ class ComputerAgent:
        only_n_most_recent_images: Optional[int] = None,
        callbacks: Optional[List[Any]] = None,
        verbosity: Optional[int] = None,
        trajectory_dir: Optional[str] = None,
        trajectory_dir: Optional[str | Path | dict] = None,
        max_retries: Optional[int] = 3,
        screenshot_delay: Optional[float | int] = 0.5,
        use_prompt_caching: Optional[bool] = False,
@@ -187,7 +190,11 @@ class ComputerAgent:
            telemetry_enabled: If set, adds TelemetryCallback to track anonymized usage data. Enabled by default.
            trust_remote_code: If set, trust remote code when loading local models. Disabled by default.
            **kwargs: Additional arguments passed to the agent loop
        """
        """
        # If the loop is "human/human", we need to prefix a grounding model fallback
        if model in ["human/human", "human"]:
            model = "openai/computer-use-preview+human/human"

        self.model = model
        self.tools = tools or []
        self.custom_loop = custom_loop
@@ -204,6 +211,9 @@ class ComputerAgent:

        # == Add built-in callbacks ==

        # Prepend operator normalizer callback
        self.callbacks.insert(0, OperatorNormalizerCallback())

        # Add telemetry callback if telemetry_enabled is set
        if self.telemetry_enabled:
            if isinstance(self.telemetry_enabled, bool):
@@ -221,7 +231,10 @@ class ComputerAgent:

        # Add trajectory saver callback if trajectory_dir is set
        if self.trajectory_dir:
            self.callbacks.append(TrajectorySaverCallback(self.trajectory_dir))
            if isinstance(self.trajectory_dir, dict):
                self.callbacks.append(TrajectorySaverCallback(**self.trajectory_dir))
            elif isinstance(self.trajectory_dir, (str, Path)):
                self.callbacks.append(TrajectorySaverCallback(str(self.trajectory_dir)))

        # Add budget manager if max_trajectory_budget is set
        if max_trajectory_budget:
@@ -238,9 +251,11 @@ class ComputerAgent:
            trust_remote_code=self.trust_remote_code or False
        )
        human_adapter = HumanAdapter()
        mlx_adapter = MLXVLMAdapter()
        litellm.custom_provider_map = [
            {"provider": "huggingface-local", "custom_handler": hf_adapter},
            {"provider": "human", "custom_handler": human_adapter}
            {"provider": "human", "custom_handler": human_adapter},
            {"provider": "mlx", "custom_handler": mlx_adapter}
        ]
        litellm.suppress_debug_info = True
@@ -8,6 +8,7 @@ from .logging import LoggingCallback
from .trajectory_saver import TrajectorySaverCallback
from .budget_manager import BudgetManagerCallback
from .telemetry import TelemetryCallback
from .operator_validator import OperatorNormalizerCallback

__all__ = [
    "AsyncCallbackHandler",
@@ -16,4 +17,5 @@ __all__ = [
    "TrajectorySaverCallback",
    "BudgetManagerCallback",
    "TelemetryCallback",
    "OperatorNormalizerCallback",
]
@@ -50,90 +50,41 @@ class ImageRetentionCallback(AsyncCallbackHandler):
        """
        if self.only_n_most_recent_images is None:
            return messages

        # First pass: Assign call_id to reasoning items based on the next computer_call
        messages_with_call_ids = []
        for i, msg in enumerate(messages):
            msg_copy = msg.copy() if isinstance(msg, dict) else msg

            # If this is a reasoning item without a call_id, find the next computer_call
            if (msg_copy.get("type") == "reasoning" and
                not msg_copy.get("call_id")):
                # Look ahead for the next computer_call
                for j in range(i + 1, len(messages)):
                    next_msg = messages[j]
                    if (next_msg.get("type") == "computer_call" and
                        next_msg.get("call_id")):
                        msg_copy["call_id"] = next_msg.get("call_id")
                        break

            messages_with_call_ids.append(msg_copy)

        # Find all computer_call_output items with images and their call_ids
        image_call_ids = []
        for msg in reversed(messages_with_call_ids):  # Process in reverse to get most recent first
            if (msg.get("type") == "computer_call_output" and
                isinstance(msg.get("output"), dict) and
                "image_url" in msg.get("output", {})):
                call_id = msg.get("call_id")
                if call_id and call_id not in image_call_ids:
                    image_call_ids.append(call_id)
                    if len(image_call_ids) >= self.only_n_most_recent_images:
                        break

        # Keep the most recent N image call_ids (reverse to get chronological order)
        keep_call_ids = set(image_call_ids[:self.only_n_most_recent_images])

        # Filter messages: remove computer_call, computer_call_output, and reasoning for old images
        filtered_messages = []
        for msg in messages_with_call_ids:
            msg_type = msg.get("type")
            call_id = msg.get("call_id")

            # Remove old computer_call items
            if msg_type == "computer_call" and call_id not in keep_call_ids:
                # Check if this call_id corresponds to an image call
                has_image_output = any(
                    m.get("type") == "computer_call_output" and
                    m.get("call_id") == call_id and
                    isinstance(m.get("output"), dict) and
                    "image_url" in m.get("output", {})
                    for m in messages_with_call_ids
                )
                if has_image_output:
                    continue  # Skip this computer_call

            # Remove old computer_call_output items with images
            if (msg_type == "computer_call_output" and
                call_id not in keep_call_ids and
                isinstance(msg.get("output"), dict) and
                "image_url" in msg.get("output", {})):
                continue  # Skip this computer_call_output

            # Remove old reasoning items that are paired with removed computer calls
            if (msg_type == "reasoning" and
                call_id and call_id not in keep_call_ids):
                # Check if this call_id corresponds to an image call that's being removed
                has_image_output = any(
                    m.get("type") == "computer_call_output" and
                    m.get("call_id") == call_id and
                    isinstance(m.get("output"), dict) and
                    "image_url" in m.get("output", {})
                    for m in messages_with_call_ids
                )
                if has_image_output:
                    continue  # Skip this reasoning item

            filtered_messages.append(msg)

        # Clean up: Remove call_id from reasoning items before returning
        final_messages = []
        for msg in filtered_messages:
            if msg.get("type") == "reasoning" and "call_id" in msg:
                # Create a copy without call_id for reasoning items
                cleaned_msg = {k: v for k, v in msg.items() if k != "call_id"}
                final_messages.append(cleaned_msg)
            else:
                final_messages.append(msg)

        return final_messages

        # Gather indices of all computer_call_output messages that contain an image_url
        output_indices: List[int] = []
        for idx, msg in enumerate(messages):
            if msg.get("type") == "computer_call_output":
                out = msg.get("output")
                if isinstance(out, dict) and ("image_url" in out):
                    output_indices.append(idx)

        # Nothing to trim
        if len(output_indices) <= self.only_n_most_recent_images:
            return messages

        # Determine which outputs to keep (most recent N)
        keep_output_indices = set(output_indices[-self.only_n_most_recent_images:])

        # Build set of indices to remove in one pass
        to_remove: set[int] = set()

        for idx in output_indices:
            if idx in keep_output_indices:
                continue  # keep this screenshot and its context

            to_remove.add(idx)  # remove the computer_call_output itself

            # Remove the immediately preceding computer_call with matching call_id (if present)
            call_id = messages[idx].get("call_id")
            prev_idx = idx - 1
            if prev_idx >= 0 and messages[prev_idx].get("type") == "computer_call" and messages[prev_idx].get("call_id") == call_id:
                to_remove.add(prev_idx)
                # Check a single reasoning immediately before that computer_call
                r_idx = prev_idx - 1
                if r_idx >= 0 and messages[r_idx].get("type") == "reasoning":
                    to_remove.add(r_idx)

        # Construct filtered list
        filtered = [m for i, m in enumerate(messages) if i not in to_remove]
        return filtered
138  libs/python/agent/agent/callbacks/operator_validator.py  Normal file
@@ -0,0 +1,138 @@
"""
OperatorValidatorCallback

Ensures agent output actions conform to expected schemas by fixing common issues:
- click: add default button='left' if missing
- keypress: wrap keys string into a list
- etc.

This runs in on_llm_end, which receives the output array (AgentMessage[] as dicts).
The purpose is to avoid spending another LLM call to fix broken computer call syntax when possible.
"""
from __future__ import annotations

from typing import Any, Dict, List

from .base import AsyncCallbackHandler


class OperatorNormalizerCallback(AsyncCallbackHandler):
    """Normalizes common hallucinations / errors in computer call syntax."""

    async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        # Mutate in-place as requested, but still return the list for chaining
        for item in output or []:
            if item.get("type") != "computer_call":
                continue
            action = item.get("action")
            if not isinstance(action, dict):
                continue

            # rename mouse click actions to "click"
            for mouse_btn in ["left", "right", "wheel", "back", "forward"]:
                if action.get("type", "") == f"{mouse_btn}_click":
                    action["type"] = "click"
                    action["button"] = mouse_btn
            # rename hotkey actions to "keypress"
            for alias in ["hotkey", "key", "press", "key_press"]:
                if action.get("type", "") == alias:
                    action["type"] = "keypress"
            # assume click actions
            if "button" in action and "type" not in action:
                action["type"] = "click"
            if "click" in action and "type" not in action:
                action["type"] = "click"
            if ("scroll_x" in action or "scroll_y" in action) and "type" not in action:
                action["type"] = "scroll"
            if "text" in action and "type" not in action:
                action["type"] = "type"

            action_type = action.get("type")
            def _keep_keys(action: Dict[str, Any], keys_to_keep: List[str]):
                """Keep only the provided keys on action; delete everything else.
                Always ensures required 'type' is present if listed in keys_to_keep.
                """
                for key in list(action.keys()):
                    if key not in keys_to_keep:
                        del action[key]
            # rename "coordinate" to "x", "y"
            if "coordinate" in action:
                action["x"] = action["coordinate"][0]
                action["y"] = action["coordinate"][1]
                del action["coordinate"]
            if action_type == "click":
                # convert "click" to "button"
                if "button" not in action and "click" in action:
                    action["button"] = action["click"]
                    del action["click"]
                # default button to "left"
                action["button"] = action.get("button", "left")
            # add default scroll x, y if missing
            if action_type == "scroll":
                action["scroll_x"] = action.get("scroll_x", 0)
                action["scroll_y"] = action.get("scroll_y", 0)
            # ensure keys arg is a list (normalize aliases first)
            if action_type == "keypress":
                keys = action.get("keys")
                for keys_alias in ["keypress", "key", "press", "key_press", "text"]:
                    if keys_alias in action:
                        action["keys"] = action[keys_alias]
                        del action[keys_alias]
                keys = action.get("keys")
                if isinstance(keys, str):
                    action["keys"] = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys]
            required_keys_by_type = {
                # OpenAI actions
                "click": ["type", "button", "x", "y"],
                "double_click": ["type", "x", "y"],
                "drag": ["type", "path"],
                "keypress": ["type", "keys"],
                "move": ["type", "x", "y"],
                "screenshot": ["type"],
                "scroll": ["type", "scroll_x", "scroll_y", "x", "y"],
                "type": ["type", "text"],
                "wait": ["type"],
                # Anthropic actions
                "left_mouse_down": ["type", "x", "y"],
                "left_mouse_up": ["type", "x", "y"],
                "triple_click": ["type", "button", "x", "y"],
            }
            keep = required_keys_by_type.get(action_type or "")
            if keep:
                _keep_keys(action, keep)

        # # Second pass: if an assistant message is immediately followed by a computer_call,
        # # replace the assistant message itself with a reasoning message with summary text.
        # if isinstance(output, list):
        #     for i, item in enumerate(output):
        #         # AssistantMessage shape: { type: 'message', role: 'assistant', content: OutputContent[] }
        #         if item.get("type") == "message" and item.get("role") == "assistant":
        #             next_idx = i + 1
        #             if next_idx >= len(output):
        #                 continue
        #             next_item = output[next_idx]
        #             if not isinstance(next_item, dict):
        #                 continue
        #             if next_item.get("type") != "computer_call":
        #                 continue
        #             contents = item.get("content") or []
        #             # Extract text from OutputContent[]
        #             text_parts: List[str] = []
        #             if isinstance(contents, list):
        #                 for c in contents:
        #                     if isinstance(c, dict) and c.get("type") == "output_text" and isinstance(c.get("text"), str):
        #                         text_parts.append(c["text"])
        #             text_content = "\n".join(text_parts).strip()
        #             # Replace assistant message with reasoning message
        #             output[i] = {
        #                 "type": "reasoning",
        #                 "summary": [
        #                     {
        #                         "type": "summary_text",
        #                         "text": text_content,
        #                     }
        #                 ],
        #             }

        return output
@@ -11,6 +11,8 @@ from pathlib import Path
from typing import List, Dict, Any, Optional, Union, override
from PIL import Image, ImageDraw
import io
from copy import deepcopy

from .base import AsyncCallbackHandler

def sanitize_image_urls(data: Any) -> Any:
@@ -43,6 +45,64 @@ def sanitize_image_urls(data: Any) -> Any:
    return data


def extract_computer_call_outputs(items: List[Dict[str, Any]], screenshot_dir: Optional[Path]) -> List[Dict[str, Any]]:
    """
    Save any base64-encoded screenshots from computer_call_output entries to files and
    replace their image_url with the saved file path when a call_id is present.

    Only operates if screenshot_dir is provided and exists; otherwise returns items unchanged.

    Args:
        items: List of message/result dicts potentially containing computer_call_output entries
        screenshot_dir: Directory to write screenshots into

    Returns:
        A new list with updated image_url fields when applicable.
    """
    if not items:
        return items
    if not screenshot_dir or not screenshot_dir.exists():
        return items

    updated: List[Dict[str, Any]] = []
    for item in items:
        # work on a shallow copy; deep copy nested 'output' if we modify it
        msg = dict(item)
        try:
            if msg.get("type") == "computer_call_output":
                call_id = msg.get("call_id")
                output = msg.get("output", {})
                image_url = output.get("image_url")
                if call_id and isinstance(image_url, str) and image_url.startswith("data:"):
                    # derive extension from MIME type e.g. data:image/png;base64,
                    try:
                        ext = image_url.split(";", 1)[0].split("/")[-1]
                        if not ext:
                            ext = "png"
                    except Exception:
                        ext = "png"
                    out_path = screenshot_dir / f"{call_id}.{ext}"
                    # write file if it doesn't exist
                    if not out_path.exists():
                        try:
                            b64_payload = image_url.split(",", 1)[1]
                            img_bytes = base64.b64decode(b64_payload)
                            out_path.parent.mkdir(parents=True, exist_ok=True)
                            with open(out_path, "wb") as f:
                                f.write(img_bytes)
                        except Exception:
                            # if anything fails, skip modifying this message
                            pass
                    # update image_url to file path
                    new_output = dict(output)
                    new_output["image_url"] = str(out_path)
                    msg["output"] = new_output
        except Exception:
            # do not block on malformed entries; keep original
            pass
        updated.append(msg)
    return updated

class TrajectorySaverCallback(AsyncCallbackHandler):
    """
    Callback handler that saves agent trajectories to disk.
@@ -51,7 +111,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
    within the trajectory gets its own folder with screenshots and responses.
    """

    def __init__(self, trajectory_dir: str, reset_on_run: bool = True):
    def __init__(self, trajectory_dir: str, reset_on_run: bool = True, screenshot_dir: Optional[str] = None):
        """
        Initialize trajectory saver.

@@ -67,10 +127,12 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
        self.model: Optional[str] = None
        self.total_usage: Dict[str, Any] = {}
        self.reset_on_run = reset_on_run
        # Optional directory to store extracted screenshots from metadata/new_items
        self.screenshot_dir: Optional[Path] = Path(screenshot_dir) if screenshot_dir else None

        # Ensure trajectory directory exists
        self.trajectory_dir.mkdir(parents=True, exist_ok=True)

    def _get_turn_dir(self) -> Path:
        """Get the directory for the current turn."""
        if not self.trajectory_id:
@@ -94,6 +156,10 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
        # format: turn_000/0000_name.json
        artifact_filename = f"{self.current_artifact:04d}_{name}"
        artifact_path = turn_dir / f"{artifact_filename}.json"
        # add created_at
        if isinstance(artifact, dict):
            artifact = artifact.copy()
            artifact["created_at"] = str(uuid.uuid1().time)
        with open(artifact_path, "w") as f:
            json.dump(sanitize_image_urls(artifact), f, indent=2)
        self.current_artifact += 1
@@ -135,12 +201,21 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
        trajectory_path = self.trajectory_dir / self.trajectory_id
        trajectory_path.mkdir(parents=True, exist_ok=True)

        # Save trajectory metadata
        # Save trajectory metadata (optionally extract screenshots to screenshot_dir)
        kwargs_to_save = kwargs.copy()
        try:
            if "messages" in kwargs_to_save:
                kwargs_to_save["messages"] = extract_computer_call_outputs(
                    kwargs_to_save["messages"], self.screenshot_dir
                )
        except Exception:
            # If extraction fails, fall back to original messages
            pass
        metadata = {
            "trajectory_id": self.trajectory_id,
            "created_at": str(uuid.uuid1().time),
            "status": "running",
            "kwargs": kwargs,
            "kwargs": kwargs_to_save,
        }

        with open(trajectory_path / "metadata.json", "w") as f:
@@ -167,11 +242,18 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
            metadata = {}

        # Update metadata with completion info
        # Optionally extract screenshots from new_items before persisting
        new_items_to_save = new_items
        try:
            new_items_to_save = extract_computer_call_outputs(new_items, self.screenshot_dir)
        except Exception:
            pass

        metadata.update({
            "status": "completed",
            "completed_at": str(uuid.uuid1().time),
            "total_usage": self.total_usage,
            "new_items": sanitize_image_urls(new_items),
            "new_items": new_items_to_save,
            "total_turns": self.current_turn
        })
@@ -15,6 +15,11 @@ class HumanCompletionUI:
        self.current_call_id: Optional[str] = None
        self.refresh_interval = 2.0  # seconds
        self.last_image = None  # Store the last image for display
        # Track current interactive action controls
        self.current_action_type: str = "click"
        self.current_button: str = "left"
        self.current_scroll_x: int = 0
        self.current_scroll_y: int = -120

    def format_messages_for_chatbot(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Format messages for display in gr.Chatbot with type='messages'."""
@@ -196,7 +201,9 @@ class HumanCompletionUI:
                gr.update(choices=["latest"], value="latest"),  # dropdown
                gr.update(value=None),  # image (no image)
                gr.update(value=[]),  # chatbot (empty messages)
                gr.update(interactive=False)  # submit button
                gr.update(interactive=False),  # submit button
                gr.update(visible=False),  # click_actions_group hidden
                gr.update(visible=False),  # actions_group hidden
            )

        # Sort pending calls by created_at to get oldest first
@@ -237,7 +244,9 @@ class HumanCompletionUI:
            gr.update(choices=choices, value="latest"),
            gr.update(value=self.last_image),
            gr.update(value=conversation),
            gr.update(interactive=bool(choices))
            gr.update(interactive=bool(choices)),
            gr.update(visible=True),  # click_actions_group visible when there is a call
            gr.update(visible=True),  # actions_group visible when there is a call
        )

    def on_call_selected(self, selected_choice):
@@ -246,7 +255,9 @@ class HumanCompletionUI:
            return (
                gr.update(value=None),  # no image
                gr.update(value=[]),  # empty chatbot
                gr.update(interactive=False)
                gr.update(interactive=False),
                gr.update(visible=False),  # click_actions_group hidden
                gr.update(visible=False),  # actions_group hidden
            )

        pending_calls = self.get_pending_calls()
@@ -254,7 +265,9 @@ class HumanCompletionUI:
            return (
                gr.update(value=None),  # no image
                gr.update(value=[]),  # empty chatbot
                gr.update(interactive=False)
                gr.update(interactive=False),
                gr.update(visible=False),  # click_actions_group hidden
                gr.update(visible=False),  # actions_group hidden
            )

        # Handle "latest" option
@@ -286,7 +299,9 @@ class HumanCompletionUI:
            return (
                gr.update(value=None),  # no image
                gr.update(value=[]),  # empty chatbot
                gr.update(interactive=False)
                gr.update(interactive=False),
                gr.update(visible=False),  # click_actions_group hidden
                gr.update(visible=False),  # actions_group hidden
            )

        conversation = self.format_messages_for_chatbot(selected_call.get("messages", []))
@@ -297,7 +312,9 @@ class HumanCompletionUI:
        return (
            gr.update(value=self.last_image),
            gr.update(value=conversation),
            gr.update(interactive=True)
            gr.update(interactive=True),
            gr.update(visible=True),  # click_actions_group visible
            gr.update(visible=True),  # actions_group visible
        )

    def submit_response(self, response_text: str):
@@ -368,6 +385,10 @@ class HumanCompletionUI:
        """Submit a hotkey action."""
        return self.submit_action("keypress", keys=keys)

    def submit_wait_action(self) -> str:
        """Submit a wait action with no kwargs."""
        return self.submit_action("wait")

    def submit_description_click(self, description: str, action_type: str = "click", button: str = "left") -> str:
        """Submit a description-based action."""
        if action_type == "click":
@@ -407,7 +428,7 @@ def create_ui():
    """Create the Gradio interface."""
    ui_handler = HumanCompletionUI()

    with gr.Blocks(title="Human-in-the-Loop Agent Tool") as demo:
    with gr.Blocks(title="Human-in-the-Loop Agent Tool", fill_width=True) as demo:
        gr.Markdown("# 🤖 Human-in-the-Loop Agent Tool")
        gr.Markdown("Review AI conversation requests and provide human responses.")

@@ -415,29 +436,42 @@ def create_ui():
            with gr.Column(scale=2):
                with gr.Group():
                    screenshot_image = gr.Image(
                        label="Screenshot",
                        label="Interactive Screenshot",
                        interactive=False,
                        height=600
                    )

                    # Action type selection for image clicks
                    with gr.Row():
                        action_type_radio = gr.Radio(
                            label="Action Type",
                            choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down"],
                            value="click",
                            scale=2
                        )
                        action_button_radio = gr.Radio(
                            label="Button (for click only)",
                            choices=["left", "right", "wheel", "back", "forward"],
                            value="left",
                            visible=True,
                            scale=1
                        )
                    # Action type selection for image clicks (wrapped for visibility control)
                    with gr.Group(visible=False) as click_actions_group:
                        with gr.Row():
                            action_type_radio = gr.Dropdown(
                                label="Interactive Action",
                                choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down", "scroll"],
                                value="click",
                                scale=2
                            )
                            action_button_radio = gr.Dropdown(
                                label="Button",
                                choices=["left", "right", "wheel", "back", "forward"],
                                value="left",
                                visible=True,
                                scale=1
                            )
                            scroll_x_input = gr.Number(
                                label="scroll_x",
                                value=0,
                                visible=False,
                                scale=1
                            )
                            scroll_y_input = gr.Number(
                                label="scroll_y",
                                value=-120,
                                visible=False,
                                scale=1
                            )

                conversation_chatbot = gr.Chatbot(
                    label="Messages",
                    label="Conversation",
                    type="messages",
                    height=500,
                    show_copy_button=True
@@ -446,99 +480,97 @@ def create_ui():
|
||||
with gr.Column(scale=1):
|
||||
with gr.Group():
|
||||
call_dropdown = gr.Dropdown(
|
||||
label="Select a pending call",
|
||||
label="Select a pending conversation request",
|
||||
choices=["latest"],
|
||||
interactive=True,
|
||||
value="latest"
|
||||
)
|
||||
refresh_btn = gr.Button("🔄 Refresh", variant="secondary")
|
||||
status_display = gr.Textbox(
|
||||
label="Status",
|
||||
interactive=False,
|
||||
value="Ready to receive requests..."
|
||||
)
|
||||
|
||||
with gr.Group():
|
||||
response_text = gr.Textbox(
|
||||
label="Response",
|
||||
label="Message",
|
||||
lines=3,
|
||||
placeholder="Enter your response here..."
|
||||
placeholder="Enter your message here..."
|
||||
)
|
||||
submit_btn = gr.Button("📤 Submit Response", variant="primary", interactive=False)
|
||||
submit_btn = gr.Button("📤 Submit Message", variant="primary", interactive=False)
|
||||
|
||||
# Action Accordions
|
||||
with gr.Accordion("🖱️ Click Actions", open=False):
|
||||
with gr.Group():
|
||||
with gr.Row():
|
||||
click_x = gr.Number(label="X", value=0, minimum=0)
|
||||
click_y = gr.Number(label="Y", value=0, minimum=0)
|
||||
with gr.Row():
|
||||
click_action_type = gr.Dropdown(
|
||||
label="Action Type",
|
||||
choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down"],
|
||||
value="click"
|
||||
)
|
||||
click_button = gr.Dropdown(
|
||||
label="Button (for click only)",
|
||||
choices=["left", "right", "wheel", "back", "forward"],
|
||||
value="left"
|
||||
)
|
||||
click_submit_btn = gr.Button("Submit Action")
|
||||
|
||||
with gr.Accordion("📝 Type Action", open=False):
|
||||
with gr.Group():
|
||||
type_text = gr.Textbox(
|
||||
label="Text to Type",
|
||||
placeholder="Enter text to type..."
|
||||
)
|
||||
type_submit_btn = gr.Button("Submit Type")
|
||||
|
||||
with gr.Accordion("⌨️ Keypress Action", open=False):
|
||||
with gr.Group():
|
||||
keypress_text = gr.Textbox(
|
||||
label="Keys",
|
||||
placeholder="e.g., ctrl+c, alt+tab"
|
||||
)
|
||||
keypress_submit_btn = gr.Button("Submit Keypress")
|
||||
|
||||
with gr.Accordion("🎯 Description Action", open=False):
|
||||
with gr.Group():
|
||||
description_text = gr.Textbox(
    label="Element Description",
    placeholder="e.g., 'Privacy and security option in left sidebar'"
)
with gr.Row():
    description_action_type = gr.Dropdown(
        label="Action Type",
        choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down"],
        value="click"
    )
    description_button = gr.Radio(
        label="Button (for click only)",
        choices=["left", "right", "wheel", "back", "forward"],
        value="left"
    )
description_submit_btn = gr.Button("Submit Description Action")

status_display = gr.Textbox(
    label="Status",
    interactive=False,
    value="Ready to receive calls..."
)

# Action Accordions (wrapped for visibility control)
with gr.Group(visible=False) as actions_group:
    with gr.Tabs():
        with gr.Tab("🖱️ Click Actions"):
            with gr.Group():
                description_text = gr.Textbox(
                    label="Element Description",
                    placeholder="e.g., 'Privacy and security option in left sidebar'"
                )
                with gr.Row():
                    description_action_type = gr.Dropdown(
                        label="Action",
                        choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down"],
                        value="click"
                    )
                    description_button = gr.Dropdown(
                        label="Button",
                        choices=["left", "right", "wheel", "back", "forward"],
                        value="left"
                    )
                description_submit_btn = gr.Button("Submit Click Action")

        with gr.Tab("📝 Type Action"):
            with gr.Group():
                type_text = gr.Textbox(
                    label="Text to Type",
                    placeholder="Enter text to type..."
                )
                type_submit_btn = gr.Button("Submit Type")

        with gr.Tab("⌨️ Keypress Action"):
            with gr.Group():
                keypress_text = gr.Textbox(
                    label="Keys",
                    placeholder="e.g., ctrl+c, alt+tab"
                )
                keypress_submit_btn = gr.Button("Submit Keypress")

        with gr.Tab("🧰 Misc Actions"):
            with gr.Group():
                misc_action_dropdown = gr.Dropdown(
                    label="Action",
                    choices=["wait"],
                    value="wait"
                )
                misc_submit_btn = gr.Button("Submit Action")

# Event handlers
refresh_btn.click(
    fn=ui_handler.refresh_pending_calls,
-   outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
+   outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
)

call_dropdown.change(
    fn=ui_handler.on_call_selected,
    inputs=[call_dropdown],
-   outputs=[screenshot_image, conversation_chatbot, submit_btn]
+   outputs=[screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
)

def handle_image_click(evt: gr.SelectData):
    if evt.index is not None:
        x, y = evt.index
-       action_type = action_type_radio.value or "click"
-       button = action_button_radio.value or "left"
-       result = ui_handler.submit_click_action(x, y, action_type, button)
+       action_type = ui_handler.current_action_type or "click"
+       button = ui_handler.current_button or "left"
+       if action_type == "scroll":
+           sx_i = int(ui_handler.current_scroll_x or 0)
+           sy_i = int(ui_handler.current_scroll_y or 0)
+           # Submit a scroll action with x,y position and scroll deltas
+           result = ui_handler.submit_action("scroll", x=x, y=y, scroll_x=sx_i, scroll_y=sy_i)
+       else:
+           result = ui_handler.submit_click_action(x, y, action_type, button)
        ui_handler.wait_for_pending_calls()
        return result
    return "No coordinates selected"
@@ -548,7 +580,7 @@ def create_ui():
        outputs=[status_display]
    ).then(
        fn=ui_handler.wait_for_pending_calls,
-       outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
+       outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
    )

    # Response submission
@@ -558,27 +590,52 @@ def create_ui():
        outputs=[response_text, status_display]
    ).then(
        fn=ui_handler.refresh_pending_calls,
-       outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
+       outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
    )

-   # Toggle button radio visibility based on action type
-   def toggle_button_visibility(action_type):
-       return gr.update(visible=(action_type == "click"))
+   # Toggle visibility of controls based on action type
+   def toggle_action_controls(action_type):
+       # Button visible only for click
+       button_vis = gr.update(visible=(action_type == "click"))
+       # Scroll inputs visible only for scroll
+       scroll_x_vis = gr.update(visible=(action_type == "scroll"))
+       scroll_y_vis = gr.update(visible=(action_type == "scroll"))
+       # Update state
+       ui_handler.current_action_type = action_type or "click"
+       return button_vis, scroll_x_vis, scroll_y_vis

    action_type_radio.change(
-       fn=toggle_button_visibility,
+       fn=toggle_action_controls,
        inputs=[action_type_radio],
-       outputs=[action_button_radio]
+       outputs=[action_button_radio, scroll_x_input, scroll_y_input]
    )

-   # Action accordion handlers
-   click_submit_btn.click(
-       fn=ui_handler.submit_click_action,
-       inputs=[click_x, click_y, click_action_type, click_button],
-       outputs=[status_display]
-   ).then(
-       fn=ui_handler.wait_for_pending_calls,
-       outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
+   # Keep other control values in ui_handler state
+   def on_button_change(val):
+       ui_handler.current_button = (val or "left")
+   action_button_radio.change(
+       fn=on_button_change,
+       inputs=[action_button_radio]
    )

+   def on_scroll_x_change(val):
+       try:
+           ui_handler.current_scroll_x = int(val) if val is not None else 0
+       except Exception:
+           ui_handler.current_scroll_x = 0
+   scroll_x_input.change(
+       fn=on_scroll_x_change,
+       inputs=[scroll_x_input]
+   )

+   def on_scroll_y_change(val):
+       try:
+           ui_handler.current_scroll_y = int(val) if val is not None else 0
+       except Exception:
+           ui_handler.current_scroll_y = 0
+   scroll_y_input.change(
+       fn=on_scroll_y_change,
+       inputs=[scroll_y_input]
+   )

    type_submit_btn.click(
@@ -587,7 +644,7 @@ def create_ui():
        outputs=[status_display]
    ).then(
        fn=ui_handler.wait_for_pending_calls,
-       outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
+       outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
    )

    keypress_submit_btn.click(
@@ -596,7 +653,7 @@ def create_ui():
        outputs=[status_display]
    ).then(
        fn=ui_handler.wait_for_pending_calls,
-       outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
+       outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
    )

    def handle_description_submit(description, action_type, button):
@@ -612,13 +669,30 @@ def create_ui():
        outputs=[status_display]
    ).then(
        fn=ui_handler.wait_for_pending_calls,
-       outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
+       outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
    )

+   # Misc action handler
+   def handle_misc_submit(selected_action):
+       if selected_action == "wait":
+           result = ui_handler.submit_wait_action()
+           ui_handler.wait_for_pending_calls()
+           return result
+       return f"Unsupported misc action: {selected_action}"

+   misc_submit_btn.click(
+       fn=handle_misc_submit,
+       inputs=[misc_action_dropdown],
+       outputs=[status_display]
+   ).then(
+       fn=ui_handler.wait_for_pending_calls,
+       outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
+   )

    # Load initial data
    demo.load(
        fn=ui_handler.refresh_pending_calls,
-       outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn]
+       outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
    )

    return demo
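`create_ui()` returns a `gr.Blocks` app; a hedged sketch of a typical entry point (host and port values are placeholders, not from this diff):

```python
# Hypothetical launcher for the human-in-the-loop UI above.
if __name__ == "__main__":
    demo = create_ui()
    # queue() lets the chained .then() callbacks run without blocking the page.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)
```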
@@ -1,77 +1,228 @@
-"""HUD integration for ComputerAgent."""
+"""HUD integration: Generic HuggingFace dataset evaluation runner (CUA proxy).

-import logging
-from typing import Any, Optional, Dict
-from hud import run_job as hud_run_job
+This module exposes two helpers to evaluate HUD-compatible datasets using
+HUD's OperatorAgent, while proxying model calls through our ComputerAgent via
+`FakeAsyncOpenAI` (see `agent/integrations/hud/agent.py`).

-from .agent import ComputerAgent
-from .adapter import ComputerAgentAdapter
-from .computer_handler import HUDComputerHandler
+Exports:
+- run_single_task(dataset_name, *, agent_type="cua-proxy", model=None, allowed_tools=None)
+- run_full_dataset(dataset_name, *, agent_type="cua-proxy", model=None, allowed_tools=None, max_concurrent=30, max_steps=50)
+"""
+import time
+from typing import Any, Optional

+from PIL import Image
+from datasets import load_dataset, Dataset
+from hud.agents import OperatorAgent
+from hud.datasets import Task, run_dataset
+from hud.tools.computer.settings import computer_settings
+from hud import trace

+from agent.agent import ComputerAgent as BaseComputerAgent
+from .proxy import FakeAsyncOpenAI


-async def run_job(
-    model: str,
-    task_or_taskset: Any,
-    job_name: str,
-    # Job kwargs
-    auto_reply_question: bool = False,
-    adapter_cls: Any = None,
-    adapter_kwargs: Optional[Dict[str, Any]] = None,
-    max_steps_per_task: int = 20,
-    run_parallel: bool = True,
-    job_metadata: Optional[Dict[str, Any]] = None,
-    show_progress: bool = True,
-    max_concurrent_env_creations: Optional[int] = 30,  # Limits gym.make calls
-    max_concurrent_agent_predictions: Optional[int] = None,  # No limit on LLM calls
-    max_concurrent_tasks: Optional[int] = 30,  # Limits overall task concurrency
-    **agent_kwargs: Any
-) -> Any:
+# ---------------------------------------------------------------------------
+# Proxy OperatorAgent
+# ---------------------------------------------------------------------------


+class ProxyOperatorAgent(OperatorAgent):
+    """OperatorAgent that proxies model calls through our ComputerAgent.

+    Accepts the same config keys we pass via hud.run_dataset `agent_config`:
+    - model: str | None
+    - allowed_tools: list[str] | None
+    Additional kwargs are forwarded to OperatorAgent (if any are supported).
    """
-    Run a job using ComputerAgent with the specified model.

+    def __init__(
+        self,
+        *,
+        model: str | None = None,
+        allowed_tools: list[str] | None = None,
+        trajectory_dir: str | dict | None = None,
+        # === ComputerAgent kwargs ===
+        tools: list[Any] | None = None,
+        custom_loop: Any | None = None,
+        only_n_most_recent_images: int | None = None,
+        callbacks: list[Any] | None = None,
+        verbosity: int | None = None,
+        max_retries: int | None = 3,
+        screenshot_delay: float | int = 0.5,
+        use_prompt_caching: bool | None = False,
+        max_trajectory_budget: float | dict | None = None,
+        telemetry_enabled: bool | None = True,
+        **kwargs: Any,
+    ) -> None:
+        model = model or "computer-use-preview"
+        allowed_tools = allowed_tools or ["openai_computer"]

+        computer_shim = {
+            'screenshot': lambda: Image.new('RGB', (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)),
+            'environment': 'linux',
+            'dimensions': (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)
+        }
+        # Build tools ensuring the computer_shim is included
+        agent_tools: list[Any] = [computer_shim]
+        if tools:
+            agent_tools.extend(tools)

+        computer_agent = BaseComputerAgent(
+            model=model,
+            tools=agent_tools,
+            custom_loop=custom_loop,
+            only_n_most_recent_images=only_n_most_recent_images,
+            callbacks=callbacks,
+            verbosity=verbosity,
+            trajectory_dir=trajectory_dir,
+            max_retries=max_retries,
+            screenshot_delay=screenshot_delay,
+            use_prompt_caching=use_prompt_caching,
+            max_trajectory_budget=max_trajectory_budget,
+            telemetry_enabled=telemetry_enabled,
+        )
+        model_client = FakeAsyncOpenAI(computer_agent)

+        super().__init__(
+            model_client=model_client,  # type: ignore[arg-type]
+            model=model,
+            allowed_tools=allowed_tools,
+            **kwargs,
+        )


+# ---------------------------------------------------------------------------
+# Single-task runner
+# ---------------------------------------------------------------------------


+async def run_single_task(
+    dataset: str | Dataset | list[dict[str, Any]],
+    *,
+    task_id: int = 0,
+    model: str | None = None,
+    allowed_tools: list[str] | None = None,
+    # === ComputerAgent kwargs ===
+    tools: list[Any] | None = None,
+    custom_loop: Any | None = None,
+    only_n_most_recent_images: int | None = None,
+    callbacks: list[Any] | None = None,
+    verbosity: int | None = None,
+    trajectory_dir: str | dict | None = None,
+    max_retries: int | None = 3,
+    screenshot_delay: float | int = 0.5,
+    use_prompt_caching: bool | None = False,
+    max_trajectory_budget: float | dict | None = None,
+    telemetry_enabled: bool | None = True,
+) -> None:
+    """Load one task from the dataset and execute it with Operator+CUA proxy."""

+    # Load dataset and pick a sample
+    if isinstance(dataset, str):
+        dataset = load_dataset(dataset, split="train")  # type: ignore[arg-type]
+    elif isinstance(dataset, list):
+        dataset = dataset
+    else:
+        dataset = dataset["train"]

-    Args:
-        model: Model string for ComputerAgent (e.g., "anthropic/claude-3-5-sonnet-20241022")
-        task_or_taskset: Task or TaskSet to run
-        job_name: Name for the job
-        auto_reply_question: Whether to auto-reply to questions
-        adapter_cls: Custom adapter class (defaults to ComputerAgentAdapter)
-        adapter_kwargs: Additional kwargs for the adapter
-        max_steps_per_task: Maximum steps per task
-        run_parallel: Whether to run tasks in parallel
-        job_metadata: Additional metadata for the job
-        show_progress: Whether to show progress
-        max_concurrent_env_creations: Max concurrent environment creations
-        max_concurrent_agent_predictions: Max concurrent agent predictions
-        max_concurrent_tasks: Max concurrent tasks
-        **agent_kwargs: Additional kwargs to pass to ComputerAgent

-    Returns:
-        Job instance from HUD
-    """
-    # combine verbose and verbosity kwargs
-    if "verbose" in agent_kwargs:
-        agent_kwargs["verbosity"] = logging.INFO
-        del agent_kwargs["verbose"]
-    verbose = True if agent_kwargs.get("verbosity", logging.WARNING) > logging.INFO else False

-    # run job
-    return await hud_run_job(
-        agent_cls=ComputerAgent,
-        agent_kwargs={"model": model, **agent_kwargs},
-        task_or_taskset=task_or_taskset,
-        job_name=job_name,
-        auto_reply_question=auto_reply_question,
-        adapter_cls=adapter_cls,
-        adapter_kwargs=adapter_kwargs,
-        max_steps_per_task=max_steps_per_task,
-        run_parallel=run_parallel,
-        job_metadata=job_metadata,
-        show_progress=show_progress,
-        verbose=verbose,
-        max_concurrent_env_creations=max_concurrent_env_creations,
-        max_concurrent_agent_predictions=max_concurrent_agent_predictions,
-        max_concurrent_tasks=max_concurrent_tasks
+    sample_task = dataset[task_id]  # type: ignore[index]
+    task_prompt = sample_task.get("prompt", f"Task {sample_task.get('id', 0)}")  # type: ignore[attr-defined]

+    with trace(name=task_prompt):
+        task = Task(**sample_task)  # type: ignore[arg-type]

+        agent = ProxyOperatorAgent(
+            model=model,
+            allowed_tools=allowed_tools,
+            # === ComputerAgent kwargs passthrough ===
+            tools=tools,
+            custom_loop=custom_loop,
+            only_n_most_recent_images=only_n_most_recent_images,
+            callbacks=callbacks,
+            verbosity=verbosity,
+            trajectory_dir=trajectory_dir,
+            max_retries=max_retries,
+            screenshot_delay=screenshot_delay,
+            use_prompt_caching=use_prompt_caching,
+            max_trajectory_budget=max_trajectory_budget,
+            telemetry_enabled=telemetry_enabled,
+        )
+        print(f"Running: {task_prompt}")
+        result = await agent.run(task, max_steps=10)
+        print(f"✅ Reward: {getattr(result, 'reward')}")


+# ---------------------------------------------------------------------------
+# Full-dataset runner
+# ---------------------------------------------------------------------------


+async def run_full_dataset(
+    dataset: str | Dataset | list[dict[str, Any]],
+    *,
+    job_name: Optional[str] = None,
+    model: str | None = None,
+    allowed_tools: list[str] | None = None,
+    max_concurrent: int = 30,
+    max_steps: int = 50,
+    split: str = "train",
+    trajectory_dir: str | dict | None = None,
+    # === ComputerAgent kwargs ===
+    tools: list[Any] | None = None,
+    custom_loop: Any | None = None,
+    only_n_most_recent_images: int | None = 5,
+    callbacks: list[Any] | None = None,
+    verbosity: int | None = None,
+    max_retries: int | None = 3,
+    screenshot_delay: float | int = 0.5,
+    use_prompt_caching: bool | None = False,
+    max_trajectory_budget: float | dict | None = None,
+    telemetry_enabled: bool | None = True,
+) -> list[Any]:
+    """Run evaluation across the entire dataset using hud.datasets.run_dataset."""

+    # We pass OperatorAgent as the class and provide a config that injects our
+    # FakeAsyncOpenAI per agent instantiation.

+    if isinstance(dataset, str):
+        dataset_name = dataset.split('/')[-1]
+        job_name = job_name or f"Evaluation {dataset_name}"
+        dataset = load_dataset(dataset, split=split)  # type: ignore[arg-type]
+    else:
+        dataset_name = "custom"
+        job_name = job_name or f"Evaluation {time.strftime('%H:%M %Y-%m-%d')}"

+    # Execute evaluation
+    return await run_dataset(
+        name=job_name,
+        dataset=dataset,
+        agent_class=ProxyOperatorAgent,
+        agent_config={
+            "model": model,
+            "allowed_tools": allowed_tools,
+            "trajectory_dir": trajectory_dir,
+            # === ComputerAgent kwargs passthrough ===
+            "tools": tools,
+            "custom_loop": custom_loop,
+            "only_n_most_recent_images": only_n_most_recent_images,
+            "callbacks": callbacks,
+            "verbosity": verbosity,
+            "max_retries": max_retries,
+            "screenshot_delay": screenshot_delay,
+            "use_prompt_caching": use_prompt_caching,
+            "max_trajectory_budget": max_trajectory_budget,
+            "telemetry_enabled": telemetry_enabled,
+        },
+        max_concurrent=max_concurrent,
+        metadata={"dataset": dataset_name},
+        max_steps=max_steps,
+        auto_respond=True,
+    )


-__all__ = ["ComputerAgent", "ComputerAgentAdapter", "HUDComputerHandler", "run_job"]
+__all__ = [
+    "run_single_task",
+    "run_full_dataset",
+    "ProxyOperatorAgent",
+]
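A hedged usage sketch for the two runners above; the dataset name is a placeholder, and omitting `model` falls back to the proxy's `computer-use-preview` default:

```python
import asyncio

from agent.integrations.hud import run_full_dataset, run_single_task

async def main() -> None:
    # Smoke-test a single task first (task_id indexes into the split).
    await run_single_task("hud-evals/example-dataset", task_id=0)

    # Then fan out over the whole split with bounded concurrency.
    results = await run_full_dataset(
        "hud-evals/example-dataset",
        max_concurrent=10,
        max_steps=25,
    )
    print(f"Completed {len(results)} tasks")

asyncio.run(main())
```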
@@ -1,121 +0,0 @@
"""HUD Adapter for ComputerAgent integration."""

from __future__ import annotations

from typing import Any, ClassVar

from hud.adapters.common import CLA, Adapter
from hud.adapters.common.types import (
    CLAButton,
    CLAKey,
    ClickAction,
    CustomAction,
    DragAction,
    MoveAction,
    Point,
    PressAction,
    ResponseAction,
    ScreenshotFetch,
    ScrollAction,
    TypeAction,
    WaitAction,
)


class ComputerAgentAdapter(Adapter):
    """Adapter for ComputerAgent to work with HUD."""

    KEY_MAP: ClassVar[dict[str, CLAKey]] = {
        "return": "enter",
        "arrowup": "up",
        "arrowdown": "down",
        "arrowleft": "left",
        "arrowright": "right",
        "cmd": "ctrl",
        "super": "win",
        "meta": "win",
    }

    BUTTON_MAP: ClassVar[dict[str, CLAButton]] = {
        "wheel": "middle",
        "middle": "middle",
    }

    def __init__(self) -> None:
        super().__init__()
        # ComputerAgent default dimensions (can be overridden)
        self.agent_width = 1024
        self.agent_height = 768

    def _map_key(self, key: str) -> CLAKey:
        """Map a key to its standardized form."""
        return self.KEY_MAP.get(key.lower(), key.lower())  # type: ignore

    def convert(self, data: Any) -> CLA:
        """Convert a ComputerAgent action to a HUD action."""
        try:
            action_type = data.get("type")

            if action_type == "click":
                x, y = data.get("x", 0), data.get("y", 0)
                button = data.get("button", "left")
                button = self.BUTTON_MAP.get(button, button)
                if button is None:
                    button = "left"
                converted_action = ClickAction(point=Point(x=x, y=y), button=button)

            elif action_type == "double_click":
                x, y = data.get("x", 0), data.get("y", 0)
                converted_action = ClickAction(point=Point(x=x, y=y), button="left", pattern=[100])

            elif action_type == "scroll":
                x, y = int(data.get("x", 0)), int(data.get("y", 0))
                scroll_x = int(data.get("scroll_x", 0))
                scroll_y = int(data.get("scroll_y", 0))
                converted_action = ScrollAction(
                    point=Point(x=x, y=y), scroll=Point(x=scroll_x, y=scroll_y)
                )

            elif action_type == "type":
                text = data.get("text", "")
                converted_action = TypeAction(text=text, enter_after=False)

            elif action_type == "wait":
                ms = data.get("ms", 1000)
                converted_action = WaitAction(time=ms)

            elif action_type == "move":
                x, y = data.get("x", 0), data.get("y", 0)
                converted_action = MoveAction(point=Point(x=x, y=y))

            elif action_type == "keypress":
                keys = data.get("keys", [])
                if isinstance(keys, str):
                    keys = [keys]
                converted_action = PressAction(keys=[self._map_key(k) for k in keys])

            elif action_type == "drag":
                path = data.get("path", [])
                points = [Point(x=p.get("x", 0), y=p.get("y", 0)) for p in path]
                converted_action = DragAction(path=points)

            elif action_type == "screenshot":
                converted_action = ScreenshotFetch()

            elif action_type == "response":
                converted_action = ResponseAction(text=data.get("text", ""))

            elif action_type == "custom":
                converted_action = CustomAction(action=data.get("action", ""))

            else:
                raise ValueError(f"Unsupported action type: {action_type}")

            # Add reasoning and logs if available
            converted_action.reasoning = data.get("reasoning", "")
            converted_action.logs = data.get("logs", "")

            return converted_action

        except Exception as e:
            raise ValueError(f"Invalid action: {data}. Error: {e!s}") from e
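Since this adapter file is removed by the commit, the snippet below is purely illustrative of the mapping it performed, based on the KEY_MAP and BUTTON_MAP shown above:

```python
# Illustrative only: how the removed adapter normalized raw action dicts.
adapter = ComputerAgentAdapter()

# BUTTON_MAP remaps "wheel" to HUD's "middle" button.
click = adapter.convert({"type": "click", "x": 120, "y": 48, "button": "wheel"})

# KEY_MAP normalizes key aliases: "cmd" -> "ctrl", "arrowleft" -> "left".
press = adapter.convert({"type": "keypress", "keys": ["cmd", "arrowleft"]})
```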
@@ -1,373 +0,0 @@
"""HUD ComputerAgent wrapper for OSWorld benchmarking."""

import logging
from typing import Any, Literal, Optional, Union, List, Dict
import asyncio

from agent import ComputerAgent as BaseComputerAgent
from agent.responses import make_failed_tool_call_items
from hud.adapters import Adapter
from hud.agent.base import Agent
from hud.utils.common import Observation
from hud.adapters.common.types import LogType
from hud.types import Gym

from .adapter import ComputerAgentAdapter
from .computer_handler import HUDComputerHandler

logger = logging.getLogger(__name__)

BASE_SYSTEM_PROMPT = """
You are an autonomous computer-using agent. Follow these guidelines:

1. Be decisive and complete tasks without asking for confirmation unless absolutely necessary.
2. Use the computer tools to complete the task and do not stop until the task is complete.
3. Do NOT ask questions like "Should I proceed?" or "Would you like me to continue?" - just proceed with the task.
4. When you find what you're looking for (e.g., a file to upload), proceed with the action directly.
5. Only stop when the task is fully complete or if you encounter an error that prevents completion.
6. Trust that the user wants you to complete the entire task they've requested.
7. You must say "Task completed" when the task is complete.

Remember: You have been given permission to complete the requested task autonomously.
""".strip()

class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
    """
    A ComputerAgent wrapper for HUD integration.

    This agent wraps the base ComputerAgent to work with HUD environments,
    providing the same interface as OperatorAgent but using ComputerAgent internally.
    """

    transfer_gyms: dict[Gym, Gym] = {"qa": "hud-browser"}

    def __init__(
        self,
        model: str = "anthropic/claude-3-5-sonnet-20241022",
        environment: Literal["windows", "mac", "linux", "browser"] = "linux",
        adapter: Optional[Adapter] = None,
        name: Optional[str] = None,
        **kwargs: Any,
    ):
        """
        Initialize the ComputerAgent for HUD.

        Args:
            model: The model string for ComputerAgent (e.g., "anthropic/claude-3-5-sonnet-20241022")
            environment: The environment type (windows, mac, linux, browser)
            adapter: The adapter to use for preprocessing and postprocessing
            name: The name of the agent
            **kwargs: Additional arguments passed to ComputerAgent
        """
        # Create adapter if not provided
        adapter = adapter or ComputerAgentAdapter()

        if name is None:
            name = f"computeragent-{model.split('/')[-1]}"

        # Initialize the base Agent class without client (we'll create it later)
        super().__init__(client=None, adapter=adapter, name=name)

        self.model = model
        self.environment = environment
        self.kwargs = kwargs

        # Default dimensions
        self.width = 1024
        self.height = 768

        # Update dimensions if adapter is provided
        if self.adapter:
            self.width = self.adapter.agent_width
            self.height = self.adapter.agent_height

        # Create HUD computer handler
        self.hud_computer = HUDComputerHandler(
            environment=environment,
            dimensions=(self.width, self.height)
        )

        # Handle trajectory_dir by adding TrajectorySaverCallback
        trajectory_dir = kwargs.pop("trajectory_dir", None)
        callbacks = kwargs.get("callbacks", [])

        if trajectory_dir:
            from agent.callbacks.trajectory_saver import TrajectorySaverCallback
            trajectory_callback = TrajectorySaverCallback(trajectory_dir, reset_on_run=False)
            callbacks = callbacks + [trajectory_callback]
            kwargs["callbacks"] = callbacks

        # Initialize ComputerAgent with HUD computer handler
        self.computer_agent = BaseComputerAgent(
            model=model,
            tools=[self.hud_computer],
            **kwargs
        )

        # Set the client to the computer_agent for compatibility
        self.client = self.computer_agent

        # State tracking
        self.conversation_history: List[Dict[str, Any]] = []
        self.initial_prompt: Optional[str] = None

        # System prompt for computer use tasks
        self.base_system_prompt = BASE_SYSTEM_PROMPT

    async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
        """
        Fetch a response from ComputerAgent based on the observation.

        Args:
            observation: The preprocessed observation, attributes:
                screenshot: Base64 encoded PNG string of the screen
                text: Text observation, if available

        Returns:
            tuple[list[dict[str, Any]], bool, list[LogType] | None]: A tuple containing the list of raw actions,
            boolean indicating if the agent believes the task is complete.
        """
        try:
            # Update the computer handler with the current screenshot
            if observation.screenshot:
                self.hud_computer.update_screenshot(observation.screenshot)

            # Set up action callback to capture actions
            captured_actions = []
            action_done = False

            async def action_callback(action: Dict[str, Any]) -> None:
                """Callback to capture actions from ComputerAgent."""
                nonlocal captured_actions, action_done
                captured_actions.append(action)

            # Set the action callback
            self.hud_computer.set_action_callback(action_callback)

            # Prepare the message for ComputerAgent
            if not self.conversation_history:
                # First interaction - use the observation text as initial prompt
                if observation.text:
                    self.initial_prompt = observation.text
                    message = f"{self.base_system_prompt}\n\nTask: {observation.text}"
                else:
                    message = f"{self.base_system_prompt}\n\nPlease analyze the current screen and determine what action to take."

                input_content = [
                    {"type": "input_text", "text": message}
                ]

                # Add screenshot if present
                if observation.screenshot:
                    input_content.append(
                        {
                            "type": "input_image",
                            "image_url": f"data:image/png;base64,{observation.screenshot}",
                        }
                    )

                self.conversation_history.append({"role": "user", "content": input_content})
            else:
                # Subsequent interactions - check if last action was computer_call
                # If so, add computer_call_output with screenshot instead of user message
                last_computer_calls = []
                for msg in reversed(self.conversation_history):
                    if msg.get("type") == "computer_call":
                        call_id = msg.get("call_id")
                        if call_id:
                            # Check if this call_id already has a computer_call_output
                            has_output = any(
                                m.get("type") == "computer_call_output" and m.get("call_id") == call_id
                                for m in self.conversation_history
                            )
                            if not has_output:
                                last_computer_calls.append(call_id)

                if last_computer_calls:
                    if not observation.screenshot:
                        print("No screenshot found, taking screenshot")
                        screenshot_b64 = await self.hud_computer.screenshot()
                    # Add computer_call_output for each unresponded computer_call
                    for call_id in reversed(last_computer_calls):  # Maintain order
                        self.conversation_history.append({
                            "type": "computer_call_output",
                            "call_id": call_id,
                            "output": {
                                "type": "input_image",
                                "image_url": f"data:image/png;base64,{screenshot_b64}"
                            }
                        })
                else:
                    # No computer_call found, add regular user message
                    message = "Continue with the task based on the current screen state."
                    input_content = [
                        {"type": "input_text", "text": message}
                    ]

                    # Add screenshot if present
                    if observation.screenshot:
                        input_content.append(
                            {
                                "type": "input_image",
                                "image_url": f"data:image/png;base64,{observation.screenshot}",
                            }
                        )

                    self.conversation_history.append({"role": "user", "content": input_content})

            # If the last message is a reasoning message, change it to output_text
            if (self.conversation_history and
                self.conversation_history[-1].get("type") == "reasoning" and
                self.conversation_history[-1].get("summary")):

                reasoning_msg = self.conversation_history[-1]
                summary_texts = []

                # Extract all summary_text entries
                for summary_item in reasoning_msg["summary"]:
                    if summary_item.get("type") == "summary_text":
                        summary_texts.append(summary_item.get("text", ""))

                # Convert to message format with output_text
                if summary_texts:
                    converted_message = {
                        "type": "message",
                        "role": "assistant",
                        "content": [
                            {
                                "text": " ".join(summary_texts),
                                "type": "output_text"
                            }
                        ]
                    }

                    # Replace the reasoning message with the converted message
                    self.conversation_history[-1] = converted_message

            # Run ComputerAgent
            try:
                new_items = []

                # ComputerAgent.run returns an async generator
                try:
                    async for result in self.computer_agent.run(self.conversation_history, stream=False):
                        # if the result has computer_call_output, immediately exit
                        if result.get("output", []) and result.get("output", [])[-1].get("type") == "computer_call_output":
                            break
                        # otherwise add agent output to conversation history
                        new_items += result["output"]
                except Exception as e:
                    # if the last message is reasoning, change it to output_text
                    if new_items and new_items[-1].get("type") == "reasoning":
                        new_items[-1] = {
                            "type": "message",
                            "role": "assistant",
                            "content": [
                                {
                                    "text": new_items[-1].get("summary", [{}])[0].get("text", ""),
                                    "type": "output_text"
                                }
                            ]
                        }
                    # Check if there are any computer_call items in new_items
                    computer_calls = [item for item in new_items if item.get("type") == "computer_call"]
                    if computer_calls:
                        # Remove computer_call items from new_items
                        new_items = [item for item in new_items if item.get("type") != "computer_call"]

                        # Add failed tool call items for each computer call
                        for computer_call in computer_calls:
                            tool_input = computer_call.get("action", {})
                            call_id = computer_call.get("call_id")
                            new_items.extend(make_failed_tool_call_items(
                                tool_name="computer",
                                tool_kwargs=tool_input,
                                error_message=repr(e),
                                call_id=call_id
                            ))
                    else:
                        # add error message to conversation history (fallback for non-computer-call errors)
                        new_items.append({
                            "type": "user",
                            "content": [
                                {
                                    "type": "input_text",
                                    "text": f"Error during previous attempted action: {repr(e)}"
                                }
                            ]
                        })

                # Check if we captured any actions
                if captured_actions:
                    # Extract reasoning from the conversation history
                    reasoning = ""
                    # Look for the latest reasoning message
                    for msg in reversed(new_items):
                        if msg.get("type") == "reasoning" and msg.get("summary"):
                            reasoning = " ".join([s.get("text", "") for s in msg["summary"] if s.get("type") == "summary_text"])
                            break
                        elif msg.get("type") == "message" and msg.get("role") == "assistant":
                            content = msg.get("content", [])
                            if isinstance(content, list):
                                reasoning = " ".join([c.get("text", "") for c in content if c.get("type") == "output_text"])
                            break

                    # update conversation history
                    self.conversation_history += new_items

                    # Add reasoning and logs to each action
                    for action in captured_actions:
                        action["reasoning"] = reasoning
                        action["logs"] = {"conversation_length": len(self.conversation_history)}

                    return captured_actions, False

                # Check if the last message is "Task completed"
                response_text = ""
                for msg in reversed(new_items):
                    if msg.get("type") == "message" and msg.get("role") == "assistant":
                        content = msg.get("content", [])
                        for c in content:
                            if c.get("type") == "output_text":
                                response_text = c.get("text", response_text)
                                break
                        break

                done = "task completed" in response_text.lower()

                # update conversation history
                self.conversation_history += new_items

                response_action = {
                    "type": "response",
                    "text": response_text,
                    "reasoning": response_text,
                    "logs": {"conversation_length": len(self.conversation_history)}
                }

                # Check if this indicates task completion or failure
                if "task is infeasible" in response_text.lower():
                    response_action = {"type": "custom", "action": "FAIL"}
                    done = True

                return [response_action], done
            except Exception as e:
                logger.error(f"Error running ComputerAgent: {e}")
                # Return an error response
                error_action = {
                    "type": "response",
                    "text": f"Error occurred: {str(e)}",
                    "reasoning": f"ComputerAgent encountered an error: {str(e)}",
                    "logs": {"error": str(e)}
                }
                return [error_action], True

        except Exception as e:
            logger.error(f"Error in fetch_response: {e}")
            error_action = {
                "type": "response",
                "text": f"Error in agent processing: {str(e)}",
                "reasoning": f"Agent processing error: {str(e)}",
                "logs": {"error": str(e)}
            }
            return [error_action], True
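The history-repair logic in `fetch_response` above pairs each unanswered `computer_call` with a `computer_call_output`; a minimal standalone sketch of that invariant check (item shapes follow the dicts used above):

```python
from typing import Any, Dict, List

def unanswered_call_ids(history: List[Dict[str, Any]]) -> List[str]:
    """Return call_ids of computer_call items with no matching output yet."""
    answered = {
        m.get("call_id")
        for m in history
        if m.get("type") == "computer_call_output"
    }
    return [
        m["call_id"]
        for m in history
        if m.get("type") == "computer_call" and m.get("call_id") not in answered
    ]

history = [
    {"type": "computer_call", "call_id": "call_1", "action": {"type": "click"}},
    {"type": "computer_call_output", "call_id": "call_1", "output": {}},
    {"type": "computer_call", "call_id": "call_2", "action": {"type": "type"}},
]
assert unanswered_call_ids(history) == ["call_2"]
```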
@@ -1,187 +0,0 @@
"""HUD Computer Handler for ComputerAgent integration."""

import base64
from io import BytesIO
from typing import Literal, Optional, Any, Dict, Callable
from PIL import Image

from agent.computers import AsyncComputerHandler


class HUDComputerHandler(AsyncComputerHandler):
    """Computer handler that interfaces with HUD environment."""

    def __init__(
        self,
        environment: Literal["windows", "mac", "linux", "browser"] = "linux",
        dimensions: tuple[int, int] = (1024, 768),
        screenshot_callback: Optional[Callable] = None,
        action_callback: Optional[Callable] = None,
    ):
        """
        Initialize HUD computer handler.

        Args:
            environment: The environment type for HUD
            dimensions: Screen dimensions as (width, height)
            screenshot_callback: Optional callback to get screenshots from HUD environment
            action_callback: Optional callback to execute actions in HUD environment
        """
        super().__init__()
        self._environment = environment
        self._dimensions = dimensions
        self._screenshot_callback = screenshot_callback
        self._action_callback = action_callback

        # Store the last screenshot for reuse
        self._last_screenshot: Optional[str] = None

    def set_screenshot_callback(self, callback: Callable) -> None:
        """Set the screenshot callback."""
        self._screenshot_callback = callback

    def set_action_callback(self, callback: Callable) -> None:
        """Set the action callback."""
        self._action_callback = callback

    def update_screenshot(self, screenshot: str) -> None:
        """Update the stored screenshot (base64 string)."""
        self._last_screenshot = screenshot

    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
        """Get the current environment type."""
        return self._environment  # type: ignore

    async def get_dimensions(self) -> tuple[int, int]:
        """Get screen dimensions as (width, height)."""
        return self._dimensions

    async def screenshot(self) -> str:
        """Take a screenshot and return as base64 string."""
        if self._screenshot_callback:
            screenshot = await self._screenshot_callback()
            if isinstance(screenshot, str):
                self._last_screenshot = screenshot
                return screenshot
            elif isinstance(screenshot, Image.Image):
                # Convert PIL Image to base64
                buffer = BytesIO()
                screenshot.save(buffer, format="PNG")
                screenshot_b64 = base64.b64encode(buffer.getvalue()).decode()
                self._last_screenshot = screenshot_b64
                return screenshot_b64
            elif isinstance(screenshot, bytes):
                screenshot_b64 = base64.b64encode(screenshot).decode()
                self._last_screenshot = screenshot_b64
                return screenshot_b64

        # Return last screenshot if available, otherwise create a blank one
        if self._last_screenshot:
            return self._last_screenshot

        # Create a blank screenshot as fallback
        blank_image = Image.new('RGB', self._dimensions, color='white')
        buffer = BytesIO()
        blank_image.save(buffer, format="PNG")
        screenshot_b64 = base64.b64encode(buffer.getvalue()).decode()
        self._last_screenshot = screenshot_b64
        return screenshot_b64

    async def click(self, x: int, y: int, button: str = "left") -> None:
        """Click at coordinates with specified button."""
        if self._action_callback:
            await self._action_callback({
                "type": "click",
                "x": x,
                "y": y,
                "button": button
            })

    async def double_click(self, x: int, y: int) -> None:
        """Double click at coordinates."""
        if self._action_callback:
            await self._action_callback({
                "type": "double_click",
                "x": x,
                "y": y
            })

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Scroll at coordinates with specified scroll amounts."""
        if self._action_callback:
            await self._action_callback({
                "type": "scroll",
                "x": x,
                "y": y,
                "scroll_x": scroll_x,
                "scroll_y": scroll_y
            })

    async def type(self, text: str) -> None:
        """Type text."""
        if self._action_callback:
            await self._action_callback({
                "type": "type",
                "text": text
            })

    async def wait(self, ms: int = 1000) -> None:
        """Wait for specified milliseconds."""
        if self._action_callback:
            await self._action_callback({
                "type": "wait",
                "ms": ms
            })

    async def move(self, x: int, y: int) -> None:
        """Move cursor to coordinates."""
        if self._action_callback:
            await self._action_callback({
                "type": "move",
                "x": x,
                "y": y
            })

    async def keypress(self, keys: list[str] | str) -> None:
        """Press key combination."""
        if isinstance(keys, str):
            keys = [keys]
        if self._action_callback:
            await self._action_callback({
                "type": "keypress",
                "keys": keys
            })

    async def drag(self, path: list[dict[str, int]]) -> None:
        """Drag along a path of points."""
        if self._action_callback:
            await self._action_callback({
                "type": "drag",
                "path": path
            })

    async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse down at coordinates."""
        if self._action_callback:
            await self._action_callback({
                "type": "left_mouse_down",
                "x": x,
                "y": y
            })

    async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse up at coordinates."""
        if self._action_callback:
            await self._action_callback({
                "type": "left_mouse_up",
                "x": x,
                "y": y
            })

    async def get_current_url(self) -> str:
        """Get the current URL."""
        if self._action_callback:
            return await self._action_callback({
                "type": "get_current_url"
            })
        return ""
libs/python/agent/agent/integrations/hud/proxy.py (new file, 183 lines)
@@ -0,0 +1,183 @@
"""HUD ComputerAgent wrapper and Fake AsyncOpenAI client.

Provides FakeAsyncOpenAI that adapts our ComputerAgent to the OpenAI Responses
interface needed by HUD's OperatorAgent. It implements only `responses.create`
and returns an OpenAI Response object with `id` and `output` fields, where `output` is a list of
OpenAI-like response blocks. We intentionally only support a single-step call
by consuming the first yielded result from `ComputerAgent.run()`.
"""

import traceback
import time
import uuid
from typing import Any, Dict, List, Optional

from agent.agent import ComputerAgent as BaseComputerAgent

# OpenAI Responses typed models (required)
from openai.types.responses import (
    Response,
    ResponseInputParam,
    ResponseOutputItem,
    ResponseComputerToolCall,
    ResponseOutputMessage,
    ResponseOutputText,
    ResponseReasoningItem,
    ResponseUsage,
)

def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> List[ResponseOutputItem]:
    """Map our agent output items to OpenAI ResponseOutputItem typed models.

    Only a subset is supported: computer_call, assistant message (text), and reasoning.
    Unknown types are ignored.
    """
    blocks: List[ResponseOutputItem] = []
    for item in output_items or []:
        t = item.get("type")
        if t == "computer_call":
            comp = ResponseComputerToolCall.model_validate({
                "id": item.get("id") or f"cu_{uuid.uuid4().hex}",
                "type": "computer_call",
                "call_id": item["call_id"],
                "action": item["action"],
                "pending_safety_checks": item.get("pending_safety_checks", []),
                "status": "completed",
            })
            blocks.append(comp)
            # we will exit early here as the responses api only supports a single step
            break
        elif t == "message" and item.get("role") == "assistant":
            content_blocks: List[ResponseOutputText] = []
            for c in item.get("content", []) or []:
                content_blocks.append(
                    ResponseOutputText.model_validate({
                        "type": "output_text",
                        "text": c["text"],
                        "annotations": [],
                    })
                )
            if content_blocks:
                msg = ResponseOutputMessage.model_validate({
                    "id": item.get("id") or f"msg_{uuid.uuid4()}",
                    "type": "message",
                    "role": "assistant",
                    "status": "completed",
                    "content": [ct.model_dump() for ct in content_blocks],
                })
                blocks.append(msg)
        elif t == "reasoning":
            reasoning = ResponseReasoningItem.model_validate({
                "id": item.get("id") or f"rsn_{uuid.uuid4()}",
                "type": "reasoning",
                "summary": item["summary"],
            })
            blocks.append(reasoning)
        # Unhandled types are ignored
    return blocks

def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]:
    out: List[Dict[str, Any]] = []
    for it in list(items):
        if hasattr(it, "model_dump"):
            out.append(it.model_dump())  # type: ignore[attr-defined]
        elif isinstance(it, dict):
            out.append(it)
        else:
            # Strict: rely on default __dict__ if present
            out.append(dict(it))  # may raise if not mapping
    return out

class FakeAsyncOpenAI:
    """Minimal fake OpenAI client with only `responses.create` implemented.

    It uses a provided `ComputerAgent` instance to produce a single-step
    response compatible with HUD's OperatorAgent loop.
    """

    def __init__(self, computer_agent: BaseComputerAgent) -> None:
        self._agent = computer_agent
        self.responses = self._Responses(self)

    class _Responses:
        def __init__(self, parent: "FakeAsyncOpenAI") -> None:
            # Caches for cross-call context when using previous_response_id
            self.blocks_cache: Dict[str, ResponseInputParam | ResponseOutputItem] = {}
            self.context_cache: Dict[str, List[str]] = {}
            self.agent = parent._agent

        async def create(
            self,
            *,
            model: str,
            input: ResponseInputParam,
            tools: Optional[List[Dict[str, Any]]] = None,
            instructions: Optional[str] = None,
            previous_response_id: Optional[str] = None,
            max_retries: int = 5,
            **_: Any,
        ) -> Any:
            for attempt in range(max_retries):
                # Prepend cached blocks from previous_response_id to input
                full_input = input
                if previous_response_id is not None:
                    prev_block_ids = self.context_cache[previous_response_id]
                    prev_blocks = [self.blocks_cache[b_id] for b_id in prev_block_ids]
                    full_input = _to_plain_dict_list(prev_blocks + input)

                # Pre-pend instructions message
                effective_input = full_input
                if instructions:
                    effective_input = [{
                        "role": "user",
                        "content": instructions,
                    }] + full_input

                # Run a single iteration of the ComputerAgent
                agent_result: Optional[Dict[str, Any]] = None
                async for result in self.agent.run(effective_input):  # type: ignore[arg-type]
                    agent_result = result
                    break
                assert agent_result is not None, "Agent failed to produce result"

                output = _map_agent_output_to_openai_blocks(agent_result["output"])
                usage = agent_result["usage"]

                # Cache conversation context using the last response id
                block_ids: List[str] = []
                blocks_to_cache = full_input + output
                for b in blocks_to_cache:
                    bid = getattr(b, "id", None) or f"tmp-{hash(repr(b))}"
                    self.blocks_cache[bid] = b  # type: ignore[assignment]
                    block_ids.append(bid)
                response_id = agent_result.get("id") or f"fake-{int(time.time()*1000)}"
                self.context_cache[response_id] = block_ids

                try:
                    return Response.model_validate({
                        "id": response_id,
                        "created_at": time.time(),
                        "object": "response",
                        "model": model,
                        "output": output,
                        "parallel_tool_calls": False,
                        "tool_choice": "auto",
                        "tools": [],
                        "previous_response_id": previous_response_id,
                        "usage": ResponseUsage.model_validate({
                            "input_tokens": usage.get("input_tokens", 0),
                            "output_tokens": usage.get("output_tokens", 0),
                            "total_tokens": usage.get("total_tokens", 0),
                            "input_tokens_details": usage.get("input_tokens_details", {"cached_tokens": 0}),
                            "output_tokens_details": usage.get("output_tokens_details", {"reasoning_tokens": 0}),
                        }),
                    })
                except Exception as e:
                    print(f"Error while validating agent response (attempt {attempt + 1}/{max_retries}): ", e)
                    if attempt == max_retries - 1:
                        print(traceback.format_exc())
                        raise e

__all__ = [
    "FakeAsyncOpenAI",
]
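A hedged usage sketch for the fake client above; the `computer_shim` mirrors the dict-based tool used in the HUD integration earlier in this diff, and the model name is a placeholder:

```python
import asyncio
from PIL import Image

from agent.agent import ComputerAgent

async def main() -> None:
    computer_shim = {
        "screenshot": lambda: Image.new("RGB", (1024, 768)),
        "environment": "linux",
        "dimensions": (1024, 768),
    }
    agent = ComputerAgent(model="openai/computer-use-preview", tools=[computer_shim])
    client = FakeAsyncOpenAI(agent)

    # Only responses.create exists; each call runs exactly one agent step.
    response = await client.responses.create(
        model="computer-use-preview",
        input=[{"role": "user", "content": "Open the settings page"}],
    )
    print(response.id, [block.type for block in response.output])

asyncio.run(main())
```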
@@ -132,23 +132,22 @@ def _convert_responses_items_to_completion_messages(messages: Messages) -> List[
            converted_content = []
            for item in content:
                if isinstance(item, dict) and item.get("type") == "input_image":
-                   # Convert input_image to Anthropic image format
+                   # Convert input_image to OpenAI image format
                    image_url = item.get("image_url", "")
                    if image_url and image_url != "[omitted]":
-                       # Extract base64 data from data URL
-                       if "," in image_url:
-                           base64_data = image_url.split(",")[-1]
-                       else:
-                           base64_data = image_url
-
                        converted_content.append({
-                           "type": "image",
-                           "source": {
-                               "type": "base64",
-                               "media_type": "image/png",
-                               "data": base64_data
+                           "type": "image_url",
+                           "image_url": {
+                               "url": image_url
                            }
                        })
                elif isinstance(item, dict) and item.get("type") == "input_text":
                    # Convert input_text to OpenAI text format
                    text = item.get("text", "")
                    converted_content.append({
                        "type": "text",
                        "text": text
                    })
                else:
                    # Keep other content types as-is
                    converted_content.append(item)
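A before/after of the conversion above, assuming the data-URL form used elsewhere in this diff (base64 payload truncated for illustration):

```python
# Responses-style input item (before)
item = {
    "type": "input_image",
    "image_url": "data:image/png;base64,iVBORw0KGgo...",
}

# Completions-style content block (after): the nested {"url": ...} shape is
# what OpenAI-compatible chat endpoints expect, and the data URL now passes
# through untouched instead of being unpacked into raw base64.
converted = {
    "type": "image_url",
    "image_url": {"url": item["image_url"]},
}
```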
@@ -1530,7 +1529,18 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
                "content": [
                    {
                        "type": "text",
-                       "text": f"You are a UI grounding expert. Look at the image and {instruction}. Output ONLY a click action on the target element. No explanations, confirmations, or additional text."
+                       "text": f"""You are a UI grounding expert. Follow these guidelines:
+
+1. NEVER ask for confirmation. Complete all tasks autonomously.
+2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
+3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
+4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
+5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
+6. The user has already given you permission by running this agent. No further confirmation is needed.
+7. Be decisive and action-oriented. Complete the requested task fully.
+
+Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
+Task: Click {instruction}. Output ONLY a click action on the target element."""
                    },
                    {
                        "type": "image_url",
@@ -48,11 +48,11 @@ GROUNDED_COMPUTER_TOOL_SCHEMA = {
                "get_dimensions",
                "get_environment"
            ],
-           "description": "The action to perform"
+           "description": "The action to perform (required for all actions)"
        },
        "element_description": {
            "type": "string",
-           "description": "Description of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)"
+           "description": "Description of the element to interact with (required for click, double_click, move, scroll actions)"
        },
        "start_element_description": {
            "type": "string",
@@ -67,20 +67,30 @@ GROUNDED_COMPUTER_TOOL_SCHEMA = {
            "description": "The text to type (required for type action)"
        },
        "keys": {
-           "type": "string",
-           "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')"
+           "type": "array",
+           "items": {
+               "type": "string"
+           },
+           "description": "Key(s) to press (required for keypress action)"
        },
        "button": {
            "type": "string",
-           "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
+           "enum": [
+               "left",
+               "right",
+               "wheel",
+               "back",
+               "forward"
+           ],
+           "description": "The mouse button to use for click action (required for click and double_click action)",
        },
        "scroll_x": {
            "type": "integer",
-           "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
+           "description": "Horizontal scroll amount for scroll action (required for scroll action)",
        },
        "scroll_y": {
            "type": "integer",
-           "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
+           "description": "Vertical scroll amount for scroll action (required for scroll action)",
        },
    },
    "required": [
@@ -266,13 +276,15 @@ class ComposedGroundedConfig(AsyncAgentConfig):
        grounding_agent = grounding_agent_conf.agent_class()

        for desc in element_descriptions:
-           coords = await grounding_agent.predict_click(
-               model=grounding_model,
-               image_b64=last_image_b64,
-               instruction=desc
-           )
-           if coords:
-               self.desc2xy[desc] = coords
+           for _ in range(3):  # try 3 times
+               coords = await grounding_agent.predict_click(
+                   model=grounding_model,
+                   image_b64=last_image_b64,
+                   instruction=desc
+               )
+               if coords:
+                   self.desc2xy[desc] = coords
+                   break

        # Step 6: Convert computer calls from descriptions back to xy coordinates
        final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)
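The change above wraps each grounding query in a bounded retry; an equivalent standalone sketch, where `predict_click` stands in for any async callable that can return `None`:

```python
from typing import Awaitable, Callable, Optional, Tuple

async def predict_with_retries(
    predict_click: Callable[[], Awaitable[Optional[Tuple[int, int]]]],
    attempts: int = 3,
) -> Optional[Tuple[int, int]]:
    # Grounding models occasionally return no coordinates; retrying the same
    # query a few times is a cheap way to avoid dropping the action entirely.
    for _ in range(attempts):
        coords = await predict_click()
        if coords:
            return coords
    return None
```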
@@ -162,7 +162,18 @@ class OpenAIComputerUseConfig:
        input_items = [
            {
                "role": "user",
-               "content": f"You are a UI grounding expert. Look at the image and {instruction}. Output ONLY a click action on the target element. No explanations, confirmations, or additional text."
+               "content": f"""You are a UI grounding expert. Follow these guidelines:
+
+1. NEVER ask for confirmation. Complete all tasks autonomously.
+2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
+3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
+4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
+5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
+6. The user has already given you permission by running this agent. No further confirmation is needed.
+7. Be decisive and action-oriented. Complete the requested task fully.
+
+Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
+Task: Click {instruction}. Output ONLY a click action on the target element."""
            },
            {
                "role": "user",
@@ -200,7 +211,7 @@ class OpenAIComputerUseConfig:
        "stream": False,
        "reasoning": {"summary": "concise"},
        "truncation": "auto",
-       "max_tokens": 100  # Keep response short for click prediction
+       "max_tokens": 200  # Keep response short for click prediction
    }

    # Use liteLLM responses
@@ -217,11 +228,8 @@ class OpenAIComputerUseConfig:
            isinstance(item.get("action"), dict)):

            action = item["action"]
-           if action.get("type") == "click":
-               x = action.get("x")
-               y = action.get("y")
-               if x is not None and y is not None:
-                   return (int(x), int(y))
+           if action.get("x") is not None and action.get("y") is not None:
+               return (int(action.get("x")), int(action.get("y")))

    return None
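The simplification above accepts any `computer_call` whose action carries x/y, not just `type == "click"`. A standalone sketch of the extraction with a quick check:

```python
from typing import Any, Dict, List, Optional, Tuple

def extract_click(output: List[Dict[str, Any]]) -> Optional[Tuple[int, int]]:
    """Return (x, y) from the first computer_call that carries coordinates."""
    for item in output:
        if item.get("type") == "computer_call" and isinstance(item.get("action"), dict):
            action = item["action"]
            if action.get("x") is not None and action.get("y") is not None:
                return (int(action["x"]), int(action["y"]))
    return None

output = [{"type": "computer_call", "action": {"type": "click", "x": 512, "y": 384}}]
assert extract_click(output) == (512, 384)
```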
@@ -228,15 +228,24 @@ def parse_uitars_response(text: str, image_width: int, image_height: int) -> Lis

    # Handle coordinate parameters
    if "start_box" in param_name or "end_box" in param_name:
-       # Parse coordinates like '(x,y)' or '(x1,y1,x2,y2)'
-       numbers = param.replace("(", "").replace(")", "").split(",")
-       float_numbers = [float(num.strip()) / 1000 for num in numbers]  # Normalize to 0-1 range
+       # Parse coordinates like '<|box_start|>(x,y)<|box_end|>' or '(x,y)'
+       # First, remove special tokens
+       clean_param = param.replace("<|box_start|>", "").replace("<|box_end|>", "")
+       # Then remove parentheses and split
+       numbers = clean_param.replace("(", "").replace(")", "").split(",")

-       if len(float_numbers) == 2:
-           # Single point, duplicate for box format
-           float_numbers = [float_numbers[0], float_numbers[1], float_numbers[0], float_numbers[1]]
-
-       action_inputs[param_name.strip()] = str(float_numbers)
+       try:
+           float_numbers = [float(num.strip()) / 1000 for num in numbers]  # Normalize to 0-1 range
+
+           if len(float_numbers) == 2:
+               # Single point, duplicate for box format
+               float_numbers = [float_numbers[0], float_numbers[1], float_numbers[0], float_numbers[1]]
+
+           action_inputs[param_name.strip()] = str(float_numbers)
+       except ValueError as e:
+           # If parsing fails, keep the original parameter value
+           print(f"Warning: Could not parse coordinates '{param}': {e}")
+           action_inputs[param_name.strip()] = param

    return [{
        "thought": thought,
|
||||
|
||||
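The same parsing logic as a standalone function, assuming UI-TARS-style points on a 0-1000 grid wrapped in `<|box_start|>`/`<|box_end|>` tokens, as the comments in the hunk indicate; `parse_box` is an illustrative name:

```python
from typing import List

def parse_box(param: str) -> List[float]:
    # Strip the model's special tokens, then the parentheses.
    clean = param.replace("<|box_start|>", "").replace("<|box_end|>", "")
    numbers = clean.replace("(", "").replace(")", "").split(",")
    floats = [float(n.strip()) / 1000 for n in numbers]  # normalize to 0-1 range
    if len(floats) == 2:
        # A single point becomes a degenerate box (x, y, x, y).
        floats = [floats[0], floats[1], floats[0], floats[1]]
    return floats

print(parse_box("<|box_start|>(500,250)<|box_end|>"))  # [0.5, 0.25, 0.5, 0.25]
```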
192 libs/python/agent/agent/proxy/examples.py Normal file
@@ -0,0 +1,192 @@
"""
Example usage of the proxy server and client requests.
"""
import dotenv
dotenv.load_dotenv()

import asyncio
import json
import os
import aiohttp
from typing import Dict, Any


async def test_http_endpoint():
    """Test the HTTP /responses endpoint."""

    anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
    assert isinstance(anthropic_api_key, str), "ANTHROPIC_API_KEY environment variable must be set"

    # Example 1: Simple text request
    simple_request = {
        "model": "anthropic/claude-3-5-sonnet-20241022",
        "input": "Tell me a three sentence bedtime story about a unicorn.",
        "env": {
            "ANTHROPIC_API_KEY": anthropic_api_key
        }
    }

    # Example 2: Multi-modal request with image
    multimodal_request = {
        "model": "anthropic/claude-3-5-sonnet-20241022",
        "input": [
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": "what is in this image?"},
                    {
                        "type": "input_image",
                        "image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
                    }
                ]
            }
        ],
        "env": {
            "ANTHROPIC_API_KEY": anthropic_api_key
        }
    }

    # Example 3: Request with custom agent and computer kwargs
    custom_request = {
        "model": "anthropic/claude-3-5-sonnet-20241022",
        "input": "Take a screenshot and tell me what you see",
        "env": {
            "ANTHROPIC_API_KEY": anthropic_api_key
        }
    }

    # Test requests
    base_url = "https://m-linux-96lcxd2c2k.containers.cloud.trycua.com:8443"
    # base_url = "http://localhost:8000"
    api_key = os.getenv("CUA_API_KEY")
    assert isinstance(api_key, str), "CUA_API_KEY environment variable must be set"

    async with aiohttp.ClientSession() as session:
        for i, request_data in enumerate([
            simple_request,
            # multimodal_request,
            custom_request
        ], 1):
            print(f"\n--- Test {i} ---")
            print(f"Request: {json.dumps(request_data, indent=2)}")

            try:
                print(f"Sending request to {base_url}/responses")
                async with session.post(
                    f"{base_url}/responses",
                    json=request_data,
                    headers={"Content-Type": "application/json", "X-API-Key": api_key}
                ) as response:
                    result = await response.json()
                    print(f"Status: {response.status}")
                    print(f"Response: {json.dumps(result, indent=2)}")

            except Exception as e:
                print(f"Error: {e}")


def curl_examples():
    """Print curl command examples."""

    print("=== CURL Examples ===\n")

    print("1. Simple text request:")
    print("""curl http://localhost:8000/responses \\
  -H "Content-Type: application/json" \\
  -d '{
    "model": "anthropic/claude-3-5-sonnet-20241022",
    "input": "Tell me a three sentence bedtime story about a unicorn."
  }'""")

    print("\n2. Multi-modal request with image:")
    print("""curl http://localhost:8000/responses \\
  -H "Content-Type: application/json" \\
  -d '{
    "model": "anthropic/claude-3-5-sonnet-20241022",
    "input": [
      {
        "role": "user",
        "content": [
          {"type": "input_text", "text": "what is in this image?"},
          {
            "type": "input_image",
            "image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
          }
        ]
      }
    ]
  }'""")

    print("\n3. Request with custom configuration:")
    print("""curl http://localhost:8000/responses \\
  -H "Content-Type: application/json" \\
  -d '{
    "model": "anthropic/claude-3-5-sonnet-20241022",
    "input": "Take a screenshot and tell me what you see",
    "agent_kwargs": {
      "save_trajectory": true,
      "verbosity": 20
    },
    "computer_kwargs": {
      "os_type": "linux",
      "provider_type": "cloud"
    }
  }'""")


async def test_p2p_client():
    """Example P2P client using peerjs-python."""
    try:
        from peerjs import Peer, PeerOptions, ConnectionEventType
        from aiortc import RTCConfiguration, RTCIceServer

        # Set up client peer
        options = PeerOptions(
            host="0.peerjs.com",
            port=443,
            secure=True,
            config=RTCConfiguration(
                iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")]
            )
        )

        client_peer = Peer(id="test-client", peer_options=options)
        await client_peer.start()

        # Connect to proxy server
        connection = client_peer.connect("computer-agent-proxy")

        @connection.on(ConnectionEventType.Open)
        async def connection_open():
            print("Connected to proxy server")

            # Send a test request
            request = {
                "model": "anthropic/claude-3-5-sonnet-20241022",
                "input": "Hello from P2P client!"
            }
            await connection.send(json.dumps(request))

        @connection.on(ConnectionEventType.Data)
        async def connection_data(data):
            print(f"Received response: {data}")
            await client_peer.destroy()

        # Wait for connection
        await asyncio.sleep(10)

    except ImportError:
        print("P2P dependencies not available. Install peerjs-python for P2P testing.")
    except Exception as e:
        print(f"P2P test error: {e}")


if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1 and sys.argv[1] == "curl":
        curl_examples()
    elif len(sys.argv) > 1 and sys.argv[1] == "p2p":
        asyncio.run(test_p2p_client())
    else:
        asyncio.run(test_http_endpoint())
248 libs/python/agent/agent/proxy/handlers.py Normal file
@@ -0,0 +1,248 @@
"""
Request handlers for the proxy endpoints.
"""

import asyncio
import json
import logging
import os
from contextlib import contextmanager
from typing import Dict, Any, List, Union, Optional

from ..agent import ComputerAgent
from computer import Computer

logger = logging.getLogger(__name__)


class ResponsesHandler:
    """Handler for /responses endpoint that processes agent requests."""

    def __init__(self):
        self.computer = None
        self.agent = None
        # Simple in-memory caches
        self._computer_cache: Dict[str, Any] = {}
        self._agent_cache: Dict[str, Any] = {}

    async def setup_computer_agent(
        self,
        model: str,
        agent_kwargs: Optional[Dict[str, Any]] = None,
        computer_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Set up (and cache) computer and agent instances.

        Caching keys:
        - Computer cache key: computer_kwargs
        - Agent cache key: {"model": model, **agent_kwargs}
        """
        agent_kwargs = agent_kwargs or {}
        computer_kwargs = computer_kwargs or {}

        def _stable_key(obj: Dict[str, Any]) -> str:
            try:
                return json.dumps(obj, sort_keys=True, separators=(",", ":"))
            except Exception:
                # Fallback: stringify non-serializable values
                safe_obj = {}
                for k, v in obj.items():
                    try:
                        json.dumps(v)
                        safe_obj[k] = v
                    except Exception:
                        safe_obj[k] = str(v)
                return json.dumps(safe_obj, sort_keys=True, separators=(",", ":"))

        # Determine if custom tools are supplied; if so, skip computer setup entirely
        has_custom_tools = bool(agent_kwargs.get("tools"))

        computer = None
        if not has_custom_tools:
            # ---------- Computer setup (with cache) ----------
            comp_key = _stable_key(computer_kwargs)

            computer = self._computer_cache.get(comp_key)
            if computer is None:
                # Default computer configuration
                default_c_config = {
                    "os_type": "linux",
                    "provider_type": "cloud",
                    "name": os.getenv("CUA_CONTAINER_NAME"),
                    "api_key": os.getenv("CUA_API_KEY"),
                }
                default_c_config.update(computer_kwargs)
                computer = Computer(**default_c_config)
                await computer.__aenter__()
                self._computer_cache[comp_key] = computer
                logger.info(f"Computer created and cached with key={comp_key} config={default_c_config}")
            else:
                logger.info(f"Reusing cached computer for key={comp_key}")

        # Bind current computer reference (None if custom tools supplied)
        self.computer = computer

        # ---------- Agent setup (with cache) ----------
        # Build agent cache key from {model} + agent_kwargs (excluding tools unless explicitly passed)
        agent_kwargs_for_key = dict(agent_kwargs)
        agent_key_payload = {"model": model, **agent_kwargs_for_key}
        agent_key = _stable_key(agent_key_payload)

        agent = self._agent_cache.get(agent_key)
        if agent is None:
            # Default agent configuration
            default_a_config: Dict[str, Any] = {"model": model}
            if not has_custom_tools:
                default_a_config["tools"] = [computer]
            # Apply user overrides, but keep tools unless user explicitly sets
            if agent_kwargs:
                if not has_custom_tools:
                    agent_kwargs.setdefault("tools", [computer])
                default_a_config.update(agent_kwargs)
            # JSON-derived kwargs may have loose types; ignore static arg typing here
            agent = ComputerAgent(**default_a_config)  # type: ignore[arg-type]
            self._agent_cache[agent_key] = agent
            logger.info(f"Agent created and cached with key={agent_key} model={model}")
        else:
            # Ensure cached agent uses the current computer tool (in case object differs)
            # Only update if tools not explicitly provided in agent_kwargs
            if not has_custom_tools:
                try:
                    agent.tools = [computer]
                except Exception:
                    pass
            logger.info(f"Reusing cached agent for key={agent_key}")

        # Bind current agent reference
        self.agent = agent

    async def process_request(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a /responses request and return the result.

        Args:
            request_data: Dictionary containing model, input, and optional kwargs

        Returns:
            Dictionary with the agent's response
        """
        try:
            # Extract request parameters
            model = request_data.get("model")
            input_data = request_data.get("input")
            agent_kwargs = request_data.get("agent_kwargs", {})
            computer_kwargs = request_data.get("computer_kwargs", {})
            env_overrides = request_data.get("env", {}) or {}

            if not model:
                raise ValueError("Model is required")
            if not input_data:
                raise ValueError("Input is required")

            # Apply env overrides for the duration of this request
            with self._env_overrides(env_overrides):
                # Set up (and possibly reuse) computer and agent via caches
                await self.setup_computer_agent(model, agent_kwargs, computer_kwargs)

                # Defensive: ensure agent is initialized for type checkers
                agent = self.agent
                if agent is None:
                    raise RuntimeError("Agent failed to initialize")

                # Convert input to messages format
                messages = self._convert_input_to_messages(input_data)

                # Run agent and get first result
                async for result in agent.run(messages):
                    # Return the first result and break
                    return {
                        "success": True,
                        "result": result,
                        "model": model
                    }

                # If no results were yielded
                return {
                    "success": False,
                    "error": "No results from agent",
                    "model": model
                }

        except Exception as e:
            logger.error(f"Error processing request: {e}")
            return {
                "success": False,
                "error": str(e),
                "model": request_data.get("model", "unknown")
            }

    def _convert_input_to_messages(self, input_data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
        """Convert input data to messages format."""
        if isinstance(input_data, str):
            # Simple string input
            return [{"role": "user", "content": input_data}]
        elif isinstance(input_data, list):
            # Already in messages format
            messages = []
            for msg in input_data:
                # Convert content array format if needed
                if isinstance(msg.get("content"), list):
                    content_parts = []
                    for part in msg["content"]:
                        if part.get("type") == "input_text":
                            content_parts.append({"type": "text", "text": part["text"]})
                        elif part.get("type") == "input_image":
                            content_parts.append({
                                "type": "image_url",
                                "image_url": {"url": part["image_url"]}
                            })
                        else:
                            content_parts.append(part)
                    messages.append({
                        "role": msg["role"],
                        "content": content_parts
                    })
                else:
                    messages.append(msg)
            return messages
        else:
            raise ValueError("Input must be string or list of messages")

    async def cleanup(self):
        """Clean up resources."""
        if self.computer:
            try:
                await self.computer.__aexit__(None, None, None)
            except Exception as e:
                logger.error(f"Error cleaning up computer: {e}")
            finally:
                self.computer = None
                self.agent = None

    @staticmethod
    @contextmanager
    def _env_overrides(env: Dict[str, str]):
        """Temporarily apply environment variable overrides for the current process.

        Restores previous values after the context exits.

        Args:
            env: Mapping of env var names to override for this request.
        """
        if not env:
            # No-op context
            yield
            return

        original: Dict[str, Optional[str]] = {}
        try:
            for k, v in env.items():
                original[k] = os.environ.get(k)
                os.environ[k] = str(v)
            yield
        finally:
            for k, old in original.items():
                if old is None:
                    # Was not set before
                    os.environ.pop(k, None)
                else:
                    os.environ[k] = old
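The `_env_overrides` helper above carries the per-request credential trick: API keys from the request body are applied process-wide and then restored. A minimal sketch of the same pattern, with illustrative names:

```python
import os
from contextlib import contextmanager
from typing import Dict, Optional

@contextmanager
def env_overrides(env: Dict[str, str]):
    original: Dict[str, Optional[str]] = {}
    try:
        for k, v in env.items():
            original[k] = os.environ.get(k)  # remember prior value (or None)
            os.environ[k] = str(v)
        yield
    finally:
        for k, old in original.items():
            if old is None:
                os.environ.pop(k, None)  # was unset before; unset again
            else:
                os.environ[k] = old

with env_overrides({"ANTHROPIC_API_KEY": "sk-test"}):
    assert os.environ["ANTHROPIC_API_KEY"] == "sk-test"
```

Because this mutates process-global state, concurrent requests carrying different keys can race; serializing requests or passing credentials per client call would avoid that.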
@@ -30,7 +30,6 @@ requires-python = ">=3.12"
openai = []
anthropic = []
omni = [
    "ultralytics>=8.0.0",
    "cua-som>=0.1.0,<0.2.0",
]
uitars = []
@@ -62,12 +61,9 @@ cli = [
    "yaspin>=3.1.0",
]
hud = [
    "hud-python==0.2.10",
    "hud-python>=0.4.12,<0.5.0",
]
all = [
    # omni requirements
    "ultralytics>=8.0.0",
    "cua-som>=0.1.0,<0.2.0",
    # uitars requirements
    "mlx-vlm>=0.1.27; sys_platform == 'darwin'",
    "accelerate",
@@ -82,7 +78,7 @@ all = [
    # cli requirements
    "yaspin>=3.1.0",
    # hud requirements
    "hud-python==0.2.10",
    "hud-python>=0.4.12,<0.5.0",
]

[tool.uv]
@@ -35,4 +35,11 @@ pip install cua-computer-server

Refer to this notebook for a step-by-step guide on how to use the Computer-Use Server on the host system or VM:

- [Computer-Use Server](../../notebooks/computer_server_nb.ipynb)
- [Computer-Use Server](../../notebooks/computer_server_nb.ipynb)

## Docs

- [Commands](https://trycua.com/docs/libraries/computer-server/Commands)
- [REST-API](https://trycua.com/docs/libraries/computer-server/REST-API)
- [WebSocket-API](https://trycua.com/docs/libraries/computer-server/WebSocket-API)
- [Index](https://trycua.com/docs/libraries/computer-server/index)
@@ -6,11 +6,26 @@ class DioramaComputer:
    Implements _initialized, run(), and __aenter__ for agent compatibility.
    """
    def __init__(self, diorama):
        """
        Initialize the DioramaComputer with a diorama instance.

        Args:
            diorama: The diorama instance to wrap with a computer-like interface.
        """
        self.diorama = diorama
        self.interface = self.diorama.interface
        self._initialized = False

    async def __aenter__(self):
        """
        Async context manager entry method for compatibility with ComputerAgent.

        Ensures an event loop is running and marks the instance as initialized.
        Creates a new event loop if none is currently running.

        Returns:
            DioramaComputer: The initialized instance.
        """
        # Ensure the event loop is running (for compatibility)
        try:
            asyncio.get_running_loop()
@@ -20,6 +35,15 @@ class DioramaComputer:
        return self

    async def run(self):
        """
        Run method stub for compatibility with ComputerAgent interface.

        Ensures the instance is initialized before returning. If not already
        initialized, calls __aenter__ to perform initialization.

        Returns:
            DioramaComputer: The initialized instance.
        """
        # This is a stub for compatibility
        if not self._initialized:
            await self.__aenter__()
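DioramaComputer's `run()` is a lazy-initialization shim: it delegates to `__aenter__` so callers that never enter the context manager still get an initialized object. A compact sketch of the idea, with illustrative names:

```python
import asyncio

class LazyShim:
    def __init__(self):
        self._initialized = False

    async def __aenter__(self):
        self._initialized = True  # the real code also verifies the event loop
        return self

    async def run(self):
        if not self._initialized:
            await self.__aenter__()  # initialize on first use
        return self

async def main():
    shim = LazyShim()
    await shim.run()           # works without `async with`
    assert shim._initialized

asyncio.run(main())
```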
@@ -167,7 +167,7 @@ class BaseAutomationHandler(ABC):
        pass

    @abstractmethod
    async def hotkey(self, *keys: str) -> Dict[str, Any]:
    async def hotkey(self, keys: List[str]) -> Dict[str, Any]:
        """Press a combination of keys together."""
        pass
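The signature change above, from variadic `*keys` to an explicit `List[str]`, shows up at call sites as a single list argument; a list also serializes cleanly in JSON request bodies. A hedged sketch with a stand-in class (not the library's handler):

```python
import asyncio
from typing import Any, Dict, List

class Handler:  # illustrative stand-in for the abstract handler in the diff
    async def hotkey(self, keys: List[str]) -> Dict[str, Any]:
        # New style: one list argument, e.g. hotkey(["ctrl", "c"]).
        # The old variadic style would have been hotkey("ctrl", "c").
        return {"success": True, "keys": keys}

print(asyncio.run(Handler().hotkey(["ctrl", "c"])))
```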
@@ -88,6 +88,7 @@ class LinuxAccessibilityHandler(BaseAccessibilityHandler):
class LinuxAutomationHandler(BaseAutomationHandler):
    """Linux implementation of automation handler using pyautogui."""
    keyboard = KeyboardController()
    mouse = MouseController()

    # Mouse Actions
    async def mouse_down(self, x: Optional[int] = None, y: Optional[int] = None, button: str = "left") -> Dict[str, Any]:
@@ -217,7 +218,7 @@ class LinuxAutomationHandler(BaseAutomationHandler):
    # Scrolling Actions
    async def scroll(self, x: int, y: int) -> Dict[str, Any]:
        try:
            pyautogui.scroll(x, y)
            self.mouse.scroll(x, y)
            return {"success": True}
        except Exception as e:
            return {"success": False, "error": str(e)}
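The swap above matters because `pyautogui.scroll(clicks, x=None, y=None)` treats its first argument as vertical scroll clicks and the remaining ones as a cursor position, so `pyautogui.scroll(x, y)` silently misreads the arguments; pynput's `Controller.scroll(dx, dy)` takes true horizontal and vertical deltas. A minimal sketch (it needs a desktop session to actually run):

```python
from pynput.mouse import Controller

mouse = Controller()
mouse.scroll(0, -2)  # scroll down two steps
mouse.scroll(1, 0)   # scroll right one step
```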
@@ -77,13 +77,37 @@ NSApplicationActivationOptions = {
}

def CFAttributeToPyObject(attrValue):
    """Convert Core Foundation attribute values to Python objects.

    Args:
        attrValue: Core Foundation attribute value to convert

    Returns:
        Converted Python object or None if conversion fails
    """
    def list_helper(list_value):
        """Helper function to convert CF arrays to Python lists.

        Args:
            list_value: Core Foundation array to convert

        Returns:
            Python list containing converted items
        """
        list_builder = []
        for item in list_value:
            list_builder.append(CFAttributeToPyObject(item))
        return list_builder

    def number_helper(number_value):
        """Helper function to convert CF numbers to Python numbers.

        Args:
            number_value: Core Foundation number to convert

        Returns:
            Python int or float, or None if conversion fails
        """
        success, int_value = Foundation.CFNumberGetValue(  # type: ignore
            number_value, Foundation.kCFNumberIntType, None  # type: ignore
        )
@@ -98,6 +122,14 @@ def CFAttributeToPyObject(attrValue):
        return None

    def axuielement_helper(element_value):
        """Helper function to handle AX UI elements.

        Args:
            element_value: Accessibility UI element to process

        Returns:
            The element value unchanged
        """
        return element_value

    cf_attr_type = Foundation.CFGetTypeID(attrValue)  # type: ignore
@@ -131,6 +163,15 @@ def CFAttributeToPyObject(attrValue):


def element_attribute(element, attribute):
    """Get an attribute value from an accessibility element.

    Args:
        element: The accessibility element
        attribute: The attribute name to retrieve

    Returns:
        The attribute value or None if not found
    """
    if attribute == kAXChildrenAttribute:
        err, value = AXUIElementCopyAttributeValues(element, attribute, 0, 999, None)
        if err == kAXErrorSuccess:
@@ -148,6 +189,15 @@ def element_attribute(element, attribute):


def element_value(element, type):
    """Extract a typed value from an accessibility element.

    Args:
        element: The accessibility element containing the value
        type: The expected value type

    Returns:
        The extracted value or None if extraction fails
    """
    err, value = AXValueGetValue(element, type, None)
    if err == True:
        return value
@@ -155,7 +205,18 @@ def element_value(element, type):


class UIElement:
    """Represents a UI element in the accessibility tree with position, size, and hierarchy information."""

    def __init__(self, element, offset_x=0, offset_y=0, max_depth=None, parents_visible_bbox=None):
        """Initialize a UIElement from an accessibility element.

        Args:
            element: The accessibility element to wrap
            offset_x: X offset for position calculations
            offset_y: Y offset for position calculations
            max_depth: Maximum depth to traverse for children
            parents_visible_bbox: Parent's visible bounding box for clipping
        """
        self.ax_element = element
        self.content_identifier = ""
        self.identifier = ""
@@ -235,6 +296,11 @@ class UIElement:
        self.calculate_hashes()

    def _set_bboxes(self, parents_visible_bbox):
        """Set bounding box and visible bounding box for the element.

        Args:
            parents_visible_bbox: Parent's visible bounding box for intersection calculation
        """
        if not self.absolute_position or not self.size:
            self.bbox = None
            self.visible_bbox = None
@@ -265,6 +331,17 @@ class UIElement:
            self.visible_bbox = self.bbox

    def _get_children(self, element, start_position, offset_x, offset_y):
        """Get child elements from the accessibility element.

        Args:
            element: The parent accessibility element
            start_position: Starting position for offset calculations
            offset_x: X offset for child positioning
            offset_y: Y offset for child positioning

        Returns:
            List of UIElement children
        """
        children = element_attribute(element, kAXChildrenAttribute)
        visible_children = element_attribute(element, kAXVisibleChildrenAttribute)
        found_children = []
@@ -288,10 +365,16 @@ class UIElement:
        return result

    def calculate_hashes(self):
        """Calculate unique identifiers for the element and its content."""
        self.identifier = self.component_hash()
        self.content_identifier = self.children_content_hash(self.children)

    def component_hash(self):
        """Generate a hash identifier for this component based on its properties.

        Returns:
            MD5 hash string of component properties
        """
        if self.position is None or self.size is None:
            return ""
        position_string = f"{self.position.x:.0f};{self.position.y:.0f}"
@@ -304,6 +387,14 @@ class UIElement:
        return self.hash_from_string(position_string + size_string + enabled_string + role_string)

    def hash_from_string(self, string):
        """Generate MD5 hash from a string.

        Args:
            string: Input string to hash

        Returns:
            MD5 hash hexdigest or empty string if input is None/empty
        """
        if string is None or string == "":
            return ""
        from hashlib import md5
@@ -311,6 +402,14 @@ class UIElement:
        return md5(string.encode()).hexdigest()

    def children_content_hash(self, children):
        """Generate a hash representing the content and structure of child elements.

        Args:
            children: List of child UIElement objects

        Returns:
            Combined hash of children content and structure
        """
        if len(children) == 0:
            return ""
        all_content_hashes = []
@@ -326,7 +425,20 @@ class UIElement:
        return self.hash_from_string(content_hash.join(content_structure_hash))

    def to_dict(self):
        """Convert the UIElement to a dictionary representation.

        Returns:
            Dictionary containing all element properties and children
        """
        def children_to_dict(children):
            """Convert list of children to dictionary format.

            Args:
                children: List of UIElement children to convert

            Returns:
                List of dictionaries representing the children
            """
            result = []
            for child in children:
                result.append(child.to_dict())
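The hashing methods above derive a stable identity for a UI element from its geometry, enabled state, and role, so the same on-screen control can be re-identified across accessibility-tree snapshots. A self-contained sketch of that idea; the parameter names are illustrative:

```python
from hashlib import md5

def component_hash(x: float, y: float, w: float, h: float, enabled: bool, role: str) -> str:
    # Mirror the structure above: position + size + enabled + role, then MD5.
    position_string = f"{x:.0f};{y:.0f}"
    size_string = f"{w:.0f};{h:.0f}"
    payload = position_string + size_string + str(enabled) + role
    return md5(payload.encode()).hexdigest()

print(component_hash(10, 20, 200, 44, True, "AXButton"))
```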
@@ -375,6 +487,12 @@ from AppKit import NSWorkspace, NSRunningApplication
from pathlib import Path

def get_all_windows_zorder():
    """Get all windows in the system with their z-order information.

    Returns:
        List of window dictionaries sorted by z-index, containing window properties
        like id, name, pid, owner, bounds, layer, and opacity
    """
    window_list = Quartz.CGWindowListCopyWindowInfo(
        Quartz.kCGWindowListOptionOnScreenOnly,
        Quartz.kCGNullWindowID
@@ -425,6 +543,14 @@ def get_all_windows_zorder():
    return windows

def get_app_info(app):
    """Extract information from an NSRunningApplication object.

    Args:
        app: NSRunningApplication instance

    Returns:
        Dictionary containing app name, bundle ID, PID, and status flags
    """
    return {
        "name": app.localizedName(),
        "bundle_id": app.bundleIdentifier(),
@@ -435,6 +561,14 @@ def get_app_info(app):
    }

def get_menubar_items(active_app_pid=None):
    """Get menubar items for the active application.

    Args:
        active_app_pid: Process ID of the active application, or None to use frontmost app

    Returns:
        List of menubar item dictionaries with title, bounds, index, and app_pid
    """
    menubar_items = []
    if active_app_pid is None:
        frontmost_app = NSWorkspace.sharedWorkspace().frontmostApplication()
@@ -473,6 +607,12 @@ def get_menubar_items(active_app_pid=None):
    return menubar_items

def get_dock_items():
    """Get all items in the macOS Dock.

    Returns:
        List of dock item dictionaries with title, description, bounds, index,
        type, role, and subrole information
    """
    dock_items = []
    dock_pid = None
    running_apps = NSWorkspace.sharedWorkspace().runningApplications()
@@ -538,7 +678,14 @@ def get_dock_items():
    return dock_items

class MacOSAccessibilityHandler(BaseAccessibilityHandler):
    """Handler for macOS accessibility features and UI element inspection."""

    def get_desktop_state(self):
        """Get the current state of the desktop including windows, apps, menubar, and dock.

        Returns:
            Dictionary containing applications, windows, menubar_items, and dock_items
        """
        windows = [w for w in get_all_windows_zorder() if w.get("is_on_screen")]
        running_apps = self.get_running_apps()
        applications = []
@@ -586,7 +733,14 @@ class MacOSAccessibilityHandler(BaseAccessibilityHandler):
        }

    def get_application_windows(self, pid: int):
        """Get all windows for a specific application."""
        """Get all windows for a specific application.

        Args:
            pid: Process ID of the application

        Returns:
            List of accessibility window elements or empty list if none found
        """
        try:
            app = AXUIElementCreateApplication(pid)
            err, windows = AXUIElementCopyAttributeValue(app, kAXWindowsAttribute, None)
@@ -598,7 +752,11 @@ class MacOSAccessibilityHandler(BaseAccessibilityHandler):
            return []

    def get_all_windows(self):
        """Get all visible windows in the system."""
        """Get all visible windows in the system.

        Returns:
            List of window dictionaries with app information and window details
        """
        try:
            windows = []
            running_apps = self.get_running_apps()
@@ -632,16 +790,38 @@ class MacOSAccessibilityHandler(BaseAccessibilityHandler):
            return []

    def get_running_apps(self):
        """Get all currently running applications.

        Returns:
            List of NSRunningApplication objects
        """
        # From NSWorkspace.runningApplications docs: https://developer.apple.com/documentation/appkit/nsworkspace/runningapplications
        # "Similar to the NSRunningApplication class’s properties, this property will only change when the main run loop runs in a common mode"
        # "Similar to the NSRunningApplication class's properties, this property will only change when the main run loop runs in a common mode"
        # So we need to run the main run loop to get the latest running applications
        Foundation.CFRunLoopRunInMode(Foundation.kCFRunLoopDefaultMode, 0.1, False)  # type: ignore
        return NSWorkspace.sharedWorkspace().runningApplications()

    def get_ax_attribute(self, element, attribute):
        """Get an accessibility attribute from an element.

        Args:
            element: The accessibility element
            attribute: The attribute name to retrieve

        Returns:
            The attribute value or None if not found
        """
        return element_attribute(element, attribute)

    def serialize_node(self, element):
        """Create a serializable dictionary representation of an accessibility element.

        Args:
            element: The accessibility element to serialize

        Returns:
            Dictionary containing element properties like role, title, value, position, and size
        """
        # Create a serializable dictionary representation of an accessibility element
        result = {}

@@ -669,7 +849,12 @@ class MacOSAccessibilityHandler(BaseAccessibilityHandler):

        return result

    async def get_accessibility_tree(self) -> Dict[str, Any]:
    async def get_accessibility_tree(self) -> Dict[str, Any]:
        """Get the complete accessibility tree for the current desktop state.

        Returns:
            Dictionary containing success status and desktop state information
        """
        try:
            desktop_state = self.get_desktop_state()
            return {
@@ -683,10 +868,28 @@ class MacOSAccessibilityHandler(BaseAccessibilityHandler):
    async def find_element(
        self, role: Optional[str] = None, title: Optional[str] = None, value: Optional[str] = None
    ) -> Dict[str, Any]:
        """Find an accessibility element matching the specified criteria.

        Args:
            role: The accessibility role to match (optional)
            title: The title to match (optional)
            value: The value to match (optional)

        Returns:
            Dictionary containing success status and the found element or error message
        """
        try:
            system = AXUIElementCreateSystemWide()

            def match_element(element):
                """Check if an element matches the search criteria.

                Args:
                    element: The accessibility element to check

                Returns:
                    True if element matches all specified criteria, False otherwise
                """
                if role and self.get_ax_attribute(element, kAXRoleAttribute) != role:
                    return False
                if title and self.get_ax_attribute(element, kAXTitleAttribute) != title:
@@ -696,6 +899,14 @@ class MacOSAccessibilityHandler(BaseAccessibilityHandler):
                return True

            def search_tree(element):
                """Recursively search the accessibility tree for matching elements.

                Args:
                    element: The accessibility element to search from

                Returns:
                    Serialized element dictionary if match found, None otherwise
                """
                if match_element(element):
                    return self.serialize_node(element)

@@ -714,11 +925,23 @@ class MacOSAccessibilityHandler(BaseAccessibilityHandler):
            return {"success": False, "error": str(e)}

class MacOSAutomationHandler(BaseAutomationHandler):
    """Handler for macOS automation including mouse, keyboard, and screen operations."""

    # Mouse Actions
    mouse = MouseController()
    keyboard = KeyboardController()

    async def mouse_down(self, x: Optional[int] = None, y: Optional[int] = None, button: str = "left") -> Dict[str, Any]:
        """Press and hold a mouse button at the specified coordinates.

        Args:
            x: X coordinate (optional, uses current position if None)
            y: Y coordinate (optional, uses current position if None)
            button: Mouse button to press ("left", "right", or "middle")

        Returns:
            Dictionary containing success status and error message if failed
        """
        try:
            if x is not None and y is not None:
                self.mouse.position = (x, y)
@@ -728,6 +951,16 @@ class MacOSAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def mouse_up(self, x: Optional[int] = None, y: Optional[int] = None, button: str = "left") -> Dict[str, Any]:
        """Release a mouse button at the specified coordinates.

        Args:
            x: X coordinate (optional, uses current position if None)
            y: Y coordinate (optional, uses current position if None)
            button: Mouse button to release ("left", "right", or "middle")

        Returns:
            Dictionary containing success status and error message if failed
        """
        try:
            if x is not None and y is not None:
                self.mouse.position = (x, y)
@@ -737,6 +970,15 @@ class MacOSAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def left_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]:
        """Perform a left mouse click at the specified coordinates.

        Args:
            x: X coordinate (optional, uses current position if None)
            y: Y coordinate (optional, uses current position if None)

        Returns:
            Dictionary containing success status and error message if failed
        """
        try:
            if x is not None and y is not None:
                self.mouse.position = (x, y)
@@ -746,6 +988,15 @@ class MacOSAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def right_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]:
        """Perform a right mouse click at the specified coordinates.

        Args:
            x: X coordinate (optional, uses current position if None)
            y: Y coordinate (optional, uses current position if None)

        Returns:
            Dictionary containing success status and error message if failed
        """
        try:
            if x is not None and y is not None:
                self.mouse.position = (x, y)
@@ -757,6 +1008,15 @@ class MacOSAutomationHandler(BaseAutomationHandler):
    async def double_click(
        self, x: Optional[int] = None, y: Optional[int] = None
    ) -> Dict[str, Any]:
        """Perform a double left mouse click at the specified coordinates.

        Args:
            x: X coordinate (optional, uses current position if None)
            y: Y coordinate (optional, uses current position if None)

        Returns:
            Dictionary containing success status and error message if failed
        """
        try:
            if x is not None and y is not None:
                self.mouse.position = (x, y)
@@ -766,6 +1026,15 @@ class MacOSAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def move_cursor(self, x: int, y: int) -> Dict[str, Any]:
        """Move the mouse cursor to the specified coordinates.

        Args:
            x: Target X coordinate
            y: Target Y coordinate

        Returns:
            Dictionary containing success status and error message if failed
        """
        try:
            self.mouse.position = (x, y)
            return {"success": True}
@@ -775,6 +1044,17 @@ class MacOSAutomationHandler(BaseAutomationHandler):
    async def drag_to(
        self, x: int, y: int, button: str = "left", duration: float = 0.5
    ) -> Dict[str, Any]:
        """Drag from current position to target coordinates.

        Args:
            x: Target X coordinate
            y: Target Y coordinate
            button: Mouse button to use for dragging ("left", "right", or "middle")
            duration: Duration of the drag operation in seconds

        Returns:
            Dictionary containing success status and error message if failed
        """
        try:
            btn = Button.left if button == "left" else Button.right if button == "right" else Button.middle
            # Press
@@ -801,6 +1081,16 @@ class MacOSAutomationHandler(BaseAutomationHandler):
    async def drag(
        self, path: List[Tuple[int, int]], button: str = "left", duration: float = 0.5
    ) -> Dict[str, Any]:
        """Drag the mouse along a specified path of coordinates.

        Args:
            path: List of (x, y) coordinate tuples defining the drag path
            button: Mouse button to use for dragging ("left", "right", or "middle")
            duration: Total duration of the drag operation in seconds

        Returns:
            Dictionary containing success status and error message if failed
        """
        try:
            if not path or len(path) < 2:
                return {"success": False, "error": "Path must contain at least 2 points"}
@@ -823,6 +1113,14 @@ class MacOSAutomationHandler(BaseAutomationHandler):

    # Keyboard Actions
    async def key_down(self, key: str) -> Dict[str, Any]:
        """Press and hold a keyboard key.

        Args:
            key: Key name to press (using pyautogui key names)

        Returns:
            Dictionary containing success status and error message if failed
        """
        try:
            # use pyautogui for their key names
            pyautogui.keyDown(key)
@@ -831,6 +1129,14 @@ class MacOSAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def key_up(self, key: str) -> Dict[str, Any]:
        """Release a keyboard key.

        Args:
            key: Key name to release (using pyautogui key names)

        Returns:
            Dictionary containing success status and error message if failed
        """
        try:
            # use pyautogui for their key names
            pyautogui.keyUp(key)
@@ -839,6 +1145,14 @@ class MacOSAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def type_text(self, text: str) -> Dict[str, Any]:
        """Type text using the keyboard with Unicode support.

        Args:
            text: Text string to type

        Returns:
            Dictionary containing success status and error message if failed
        """
        try:
            # use pynput for Unicode support
            self.keyboard.type(text)
@@ -847,6 +1161,14 @@ class MacOSAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def press_key(self, key: str) -> Dict[str, Any]:
        """Press and release a keyboard key.

        Args:
            key: Key name to press (using pyautogui key names)

        Returns:
            Dictionary containing success status and error message if failed
        """
        try:
            # use pyautogui for their key names
            pyautogui.press(key)
@@ -855,6 +1177,14 @@ class MacOSAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def hotkey(self, keys: List[str]) -> Dict[str, Any]:
        """Press a combination of keys simultaneously.

        Args:
            keys: List of key names to press together (using pyautogui key names)

        Returns:
            Dictionary containing success status and error message if failed
        """
        try:
            # use pyautogui for their key names
            pyautogui.hotkey(*keys)
@@ -864,6 +1194,15 @@ class MacOSAutomationHandler(BaseAutomationHandler):

    # Scrolling Actions
    async def scroll(self, x: int, y: int) -> Dict[str, Any]:
        """Scroll the mouse wheel in the specified direction.

        Args:
            x: Horizontal scroll amount
            y: Vertical scroll amount (positive for up, negative for down)

        Returns:
            Dictionary containing success status and error message if failed
        """
        try:
            self.mouse.scroll(x, y)
            return {"success": True}
@@ -871,6 +1210,14 @@ class MacOSAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def scroll_down(self, clicks: int = 1) -> Dict[str, Any]:
        """Scroll down by the specified number of clicks.

        Args:
            clicks: Number of scroll clicks to perform

        Returns:
            Dictionary containing success status and error message if failed
        """
        try:
            self.mouse.scroll(0, -clicks)
            return {"success": True}
@@ -878,6 +1225,14 @@ class MacOSAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def scroll_up(self, clicks: int = 1) -> Dict[str, Any]:
        """Scroll up by the specified number of clicks.

        Args:
            clicks: Number of scroll clicks to perform

        Returns:
            Dictionary containing success status and error message if failed
        """
        try:
            self.mouse.scroll(0, clicks)
            return {"success": True}
@@ -886,6 +1241,11 @@ class MacOSAutomationHandler(BaseAutomationHandler):

    # Screen Actions
    async def screenshot(self) -> Dict[str, Any]:
        """Capture a screenshot of the current screen.

        Returns:
            Dictionary containing success status and base64-encoded image data or error message
        """
        try:
            from PIL import Image

@@ -902,6 +1262,11 @@ class MacOSAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": f"Screenshot error: {str(e)}"}

    async def get_screen_size(self) -> Dict[str, Any]:
        """Get the dimensions of the current screen.

        Returns:
            Dictionary containing success status and screen size or error message
        """
        try:
            size = pyautogui.size()
            return {"success": True, "size": {"width": size.width, "height": size.height}}
@@ -909,6 +1274,11 @@ class MacOSAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def get_cursor_position(self) -> Dict[str, Any]:
        """Get the current position of the mouse cursor.

        Returns:
            Dictionary containing success status and cursor position or error message
        """
        try:
            x, y = self.mouse.position
            return {"success": True, "position": {"x": x, "y": y}}
@@ -917,6 +1287,11 @@ class MacOSAutomationHandler(BaseAutomationHandler):

    # Clipboard Actions
    async def copy_to_clipboard(self) -> Dict[str, Any]:
        """Get the current content of the system clipboard.

        Returns:
            Dictionary containing success status and clipboard content or error message
        """
        try:
            import pyperclip

@@ -926,6 +1301,14 @@ class MacOSAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def set_clipboard(self, text: str) -> Dict[str, Any]:
        """Set the content of the system clipboard.

        Args:
            text: Text to copy to the clipboard

        Returns:
            Dictionary containing success status and error message if failed
        """
        try:
            import pyperclip

@@ -935,7 +1318,14 @@ class MacOSAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def run_command(self, command: str) -> Dict[str, Any]:
        """Run a shell command and return its output."""
        """Run a shell command and return its output.

        Args:
            command: Shell command to execute

        Returns:
            Dictionary containing success status, stdout, stderr, and return code
        """
        try:
            # Create subprocess
            process = await asyncio.create_subprocess_shell(
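`run_command` builds on asyncio's standard subprocess API, which the diff shows directly. A minimal sketch of the same shape, using only stdlib calls:

```python
import asyncio

async def run(command: str) -> dict:
    # Spawn a shell, capture both streams, and wait for completion.
    process = await asyncio.create_subprocess_shell(
        command,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await process.communicate()
    return {
        "success": process.returncode == 0,
        "stdout": stdout.decode(),
        "stderr": stderr.decode(),
        "return_code": process.returncode,
    }

print(asyncio.run(run("echo hello")))
```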
@@ -11,6 +11,8 @@ import asyncio
import base64
import os
from io import BytesIO
from pynput.mouse import Controller as MouseController
from pynput.keyboard import Controller as KeyboardController

# Configure logger
logger = logging.getLogger(__name__)
@@ -41,7 +43,14 @@ class WindowsAccessibilityHandler(BaseAccessibilityHandler):
    """Windows implementation of accessibility handler."""

    async def get_accessibility_tree(self) -> Dict[str, Any]:
        """Get the accessibility tree of the current window."""
        """Get the accessibility tree of the current window.

        Returns:
            Dict[str, Any]: A dictionary containing the success status and either
                the accessibility tree or an error message.
                Structure: {"success": bool, "tree": dict} or
                {"success": bool, "error": str}
        """
        if not WINDOWS_API_AVAILABLE:
            return {"success": False, "error": "Windows API not available"}

@@ -65,6 +74,15 @@ class WindowsAccessibilityHandler(BaseAccessibilityHandler):

            # Enumerate child windows
            def enum_child_proc(hwnd_child, children_list):
                """Callback function to enumerate child windows and collect their information.

                Args:
                    hwnd_child: Handle to the child window being enumerated.
                    children_list: List to append child window information to.

                Returns:
                    bool: True to continue enumeration, False to stop.
                """
                try:
                    child_text = win32gui.GetWindowText(hwnd_child)
                    child_rect = win32gui.GetWindowRect(hwnd_child)
@@ -93,7 +111,19 @@ class WindowsAccessibilityHandler(BaseAccessibilityHandler):
    async def find_element(self, role: Optional[str] = None,
                           title: Optional[str] = None,
                           value: Optional[str] = None) -> Dict[str, Any]:
        """Find an element in the accessibility tree by criteria."""
        """Find an element in the accessibility tree by criteria.

        Args:
            role (Optional[str]): The role or class name of the element to find.
            title (Optional[str]): The title or text of the element to find.
            value (Optional[str]): The value of the element (not used in Windows implementation).

        Returns:
            Dict[str, Any]: A dictionary containing the success status and either
                the found element or an error message.
                Structure: {"success": bool, "element": dict} or
                {"success": bool, "error": str}
        """
        if not WINDOWS_API_AVAILABLE:
            return {"success": False, "error": "Windows API not available"}

@@ -138,8 +168,20 @@ class WindowsAccessibilityHandler(BaseAccessibilityHandler):
class WindowsAutomationHandler(BaseAutomationHandler):
    """Windows implementation of automation handler using pyautogui and Windows APIs."""

    mouse = MouseController()

    # Mouse Actions
    async def mouse_down(self, x: Optional[int] = None, y: Optional[int] = None, button: str = "left") -> Dict[str, Any]:
        """Press and hold a mouse button at the specified coordinates.

        Args:
            x (Optional[int]): The x-coordinate to move to before pressing. If None, uses current position.
            y (Optional[int]): The y-coordinate to move to before pressing. If None, uses current position.
            button (str): The mouse button to press ("left", "right", or "middle").

        Returns:
            Dict[str, Any]: A dictionary with success status and optional error message.
        """
        if not pyautogui:
            return {"success": False, "error": "pyautogui not available"}

@@ -152,6 +194,16 @@ class WindowsAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def mouse_up(self, x: Optional[int] = None, y: Optional[int] = None, button: str = "left") -> Dict[str, Any]:
        """Release a mouse button at the specified coordinates.

        Args:
            x (Optional[int]): The x-coordinate to move to before releasing. If None, uses current position.
            y (Optional[int]): The y-coordinate to move to before releasing. If None, uses current position.
            button (str): The mouse button to release ("left", "right", or "middle").

        Returns:
            Dict[str, Any]: A dictionary with success status and optional error message.
        """
        if not pyautogui:
            return {"success": False, "error": "pyautogui not available"}

@@ -164,6 +216,15 @@ class WindowsAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def move_cursor(self, x: int, y: int) -> Dict[str, Any]:
        """Move the mouse cursor to the specified coordinates.

        Args:
            x (int): The x-coordinate to move to.
            y (int): The y-coordinate to move to.

        Returns:
            Dict[str, Any]: A dictionary with success status and optional error message.
        """
        if not pyautogui:
            return {"success": False, "error": "pyautogui not available"}

@@ -174,6 +235,15 @@ class WindowsAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def left_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]:
        """Perform a left mouse click at the specified coordinates.

        Args:
            x (Optional[int]): The x-coordinate to click at. If None, clicks at current position.
            y (Optional[int]): The y-coordinate to click at. If None, clicks at current position.

        Returns:
            Dict[str, Any]: A dictionary with success status and optional error message.
        """
        if not pyautogui:
            return {"success": False, "error": "pyautogui not available"}

@@ -186,6 +256,15 @@ class WindowsAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def right_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]:
        """Perform a right mouse click at the specified coordinates.

        Args:
            x (Optional[int]): The x-coordinate to click at. If None, clicks at current position.
            y (Optional[int]): The y-coordinate to click at. If None, clicks at current position.

        Returns:
            Dict[str, Any]: A dictionary with success status and optional error message.
        """
        if not pyautogui:
            return {"success": False, "error": "pyautogui not available"}

@@ -198,6 +277,15 @@ class WindowsAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def double_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]:
        """Perform a double left mouse click at the specified coordinates.

        Args:
            x (Optional[int]): The x-coordinate to double-click at. If None, clicks at current position.
            y (Optional[int]): The y-coordinate to double-click at. If None, clicks at current position.

        Returns:
            Dict[str, Any]: A dictionary with success status and optional error message.
        """
        if not pyautogui:
            return {"success": False, "error": "pyautogui not available"}

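Every Windows action above guards on pyautogui being importable rather than failing at import time, which keeps the handler loadable on headless machines. A minimal sketch of that optional-dependency pattern; `press_key` mirrors the handlers in the diff:

```python
try:
    import pyautogui  # unavailable on headless CI or when the extra isn't installed
except Exception:
    pyautogui = None

def press_key(key: str) -> dict:
    if not pyautogui:
        return {"success": False, "error": "pyautogui not available"}
    try:
        pyautogui.press(key)
        return {"success": True}
    except Exception as e:
        return {"success": False, "error": str(e)}
```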
@@ -210,6 +298,17 @@ class WindowsAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def drag_to(self, x: int, y: int, button: str = "left", duration: float = 0.5) -> Dict[str, Any]:
        """Drag from the current position to the specified coordinates.

        Args:
            x (int): The x-coordinate to drag to.
            y (int): The y-coordinate to drag to.
            button (str): The mouse button to use for dragging ("left", "right", or "middle").
            duration (float): The time in seconds to take for the drag operation.

        Returns:
            Dict[str, Any]: A dictionary with success status and optional error message.
        """
        if not pyautogui:
            return {"success": False, "error": "pyautogui not available"}

@@ -220,6 +319,16 @@ class WindowsAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def drag(self, path: List[Tuple[int, int]], button: str = "left", duration: float = 0.5) -> Dict[str, Any]:
        """Drag the mouse through a series of coordinates.

        Args:
            path (List[Tuple[int, int]]): A list of (x, y) coordinate tuples to drag through.
            button (str): The mouse button to use for dragging ("left", "right", or "middle").
            duration (float): The total time in seconds for the entire drag operation.

        Returns:
            Dict[str, Any]: A dictionary with success status and optional error message.
        """
        if not pyautogui:
            return {"success": False, "error": "pyautogui not available"}

@@ -240,6 +349,14 @@ class WindowsAutomationHandler(BaseAutomationHandler):

    # Keyboard Actions
    async def key_down(self, key: str) -> Dict[str, Any]:
        """Press and hold a keyboard key.

        Args:
            key (str): The key to press down (e.g., 'ctrl', 'shift', 'a').

        Returns:
            Dict[str, Any]: A dictionary with success status and optional error message.
        """
        if not pyautogui:
            return {"success": False, "error": "pyautogui not available"}

@@ -250,6 +367,14 @@ class WindowsAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def key_up(self, key: str) -> Dict[str, Any]:
        """Release a keyboard key.

        Args:
            key (str): The key to release (e.g., 'ctrl', 'shift', 'a').

        Returns:
            Dict[str, Any]: A dictionary with success status and optional error message.
        """
        if not pyautogui:
            return {"success": False, "error": "pyautogui not available"}

@@ -260,6 +385,14 @@ class WindowsAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def type_text(self, text: str) -> Dict[str, Any]:
        """Type the specified text.

        Args:
            text (str): The text to type.

        Returns:
            Dict[str, Any]: A dictionary with success status and optional error message.
        """
        if not pyautogui:
            return {"success": False, "error": "pyautogui not available"}

@@ -270,6 +403,14 @@ class WindowsAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def press_key(self, key: str) -> Dict[str, Any]:
        """Press and release a keyboard key.

        Args:
            key (str): The key to press (e.g., 'enter', 'space', 'tab').

        Returns:
            Dict[str, Any]: A dictionary with success status and optional error message.
        """
        if not pyautogui:
            return {"success": False, "error": "pyautogui not available"}

@@ -279,7 +420,15 @@ class WindowsAutomationHandler(BaseAutomationHandler):
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def hotkey(self, keys: str) -> Dict[str, Any]:
    async def hotkey(self, keys: List[str]) -> Dict[str, Any]:
        """Press a combination of keys simultaneously.

        Args:
            keys (List[str]): The keys to press together (e.g., ['ctrl', 'c'], ['alt', 'tab']).

        Returns:
            Dict[str, Any]: A dictionary with success status and optional error message.
        """
        if not pyautogui:
            return {"success": False, "error": "pyautogui not available"}

@@ -291,17 +440,33 @@ class WindowsAutomationHandler(BaseAutomationHandler):

    # Scrolling Actions
    async def scroll(self, x: int, y: int) -> Dict[str, Any]:
        """Scroll vertically at the current cursor position.

        Args:
            x (int): Horizontal scroll amount (not used in pyautogui implementation).
            y (int): Vertical scroll amount. Positive values scroll up, negative values scroll down.

        Returns:
            Dict[str, Any]: A dictionary with success status and optional error message.
        """
        if not pyautogui:
            return {"success": False, "error": "pyautogui not available"}

        try:
            # pyautogui.scroll() only takes one parameter (vertical scroll)
            pyautogui.scroll(y)
            self.mouse.scroll(x, y)
            return {"success": True}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def scroll_down(self, clicks: int = 1) -> Dict[str, Any]:
        """Scroll down by the specified number of clicks.

        Args:
            clicks (int): The number of scroll clicks to perform downward.

        Returns:
            Dict[str, Any]: A dictionary with success status and optional error message.
        """
        if not pyautogui:
            return {"success": False, "error": "pyautogui not available"}

@@ -312,6 +477,14 @@ class WindowsAutomationHandler(BaseAutomationHandler):
            return {"success": False, "error": str(e)}

    async def scroll_up(self, clicks: int = 1) -> Dict[str, Any]:
        """Scroll up by the specified number of clicks.

        Args:
            clicks (int): The number of scroll clicks to perform upward.

        Returns:
            Dict[str, Any]: A dictionary with success status and optional error message.
|
||||
"""
|
||||
if not pyautogui:
|
||||
return {"success": False, "error": "pyautogui not available"}
|
||||
|
||||
@@ -323,6 +496,14 @@ class WindowsAutomationHandler(BaseAutomationHandler):
|
||||
|
||||
# Screen Actions
|
||||
async def screenshot(self) -> Dict[str, Any]:
|
||||
"""Capture a screenshot of the entire screen.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: A dictionary containing the success status and either
|
||||
base64-encoded image data or an error message.
|
||||
Structure: {"success": bool, "image_data": str} or
|
||||
{"success": bool, "error": str}
|
||||
"""
|
||||
if not pyautogui:
|
||||
return {"success": False, "error": "pyautogui not available"}
|
||||
|
||||
@@ -341,6 +522,14 @@ class WindowsAutomationHandler(BaseAutomationHandler):
|
||||
return {"success": False, "error": f"Screenshot error: {str(e)}"}
|
||||
|
||||
async def get_screen_size(self) -> Dict[str, Any]:
|
||||
"""Get the size of the screen in pixels.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: A dictionary containing the success status and either
|
||||
screen size information or an error message.
|
||||
Structure: {"success": bool, "size": {"width": int, "height": int}} or
|
||||
{"success": bool, "error": str}
|
||||
"""
|
||||
try:
|
||||
if pyautogui:
|
||||
size = pyautogui.size()
|
||||
@@ -356,6 +545,14 @@ class WindowsAutomationHandler(BaseAutomationHandler):
|
||||
return {"success": False, "error": str(e)}
|
||||
|
||||
async def get_cursor_position(self) -> Dict[str, Any]:
|
||||
"""Get the current position of the mouse cursor.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: A dictionary containing the success status and either
|
||||
cursor position or an error message.
|
||||
Structure: {"success": bool, "position": {"x": int, "y": int}} or
|
||||
{"success": bool, "error": str}
|
||||
"""
|
||||
try:
|
||||
if pyautogui:
|
||||
pos = pyautogui.position()
|
||||
@@ -371,6 +568,14 @@ class WindowsAutomationHandler(BaseAutomationHandler):
|
||||
|
||||
# Clipboard Actions
|
||||
async def copy_to_clipboard(self) -> Dict[str, Any]:
|
||||
"""Get the current content of the clipboard.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: A dictionary containing the success status and either
|
||||
clipboard content or an error message.
|
||||
Structure: {"success": bool, "content": str} or
|
||||
{"success": bool, "error": str}
|
||||
"""
|
||||
try:
|
||||
import pyperclip
|
||||
content = pyperclip.paste()
|
||||
@@ -379,6 +584,14 @@ class WindowsAutomationHandler(BaseAutomationHandler):
|
||||
return {"success": False, "error": str(e)}
|
||||
|
||||
async def set_clipboard(self, text: str) -> Dict[str, Any]:
|
||||
"""Set the clipboard content to the specified text.
|
||||
|
||||
Args:
|
||||
text (str): The text to copy to the clipboard.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: A dictionary with success status and optional error message.
|
||||
"""
|
||||
try:
|
||||
import pyperclip
|
||||
pyperclip.copy(text)
|
||||
@@ -388,6 +601,17 @@ class WindowsAutomationHandler(BaseAutomationHandler):
|
||||
|
||||
# Command Execution
|
||||
async def run_command(self, command: str) -> Dict[str, Any]:
|
||||
"""Execute a shell command asynchronously.
|
||||
|
||||
Args:
|
||||
command (str): The shell command to execute.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: A dictionary containing the success status and either
|
||||
command output or an error message.
|
||||
Structure: {"success": bool, "stdout": str, "stderr": str, "return_code": int} or
|
||||
{"success": bool, "error": str}
|
||||
"""
|
||||
try:
|
||||
# Create subprocess
|
||||
process = await asyncio.create_subprocess_shell(
|
||||
|
||||
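Every method in this handler reports results through the same `{"success": bool, ...}` dictionary instead of raising, so callers branch on the flag. A minimal sketch of driving the handler directly, assuming it can be constructed without arguments (the normal construction path via HandlerFactory is not shown in this hunk):

```python
import asyncio
import base64

async def main():
    # Hypothetical direct instantiation; in the server the handler comes
    # from HandlerFactory rather than being built by hand.
    handler = WindowsAutomationHandler()

    result = await handler.screenshot()
    if result["success"]:
        # image_data is base64-encoded per the docstring's result structure
        with open("screen.png", "wb") as f:
            f.write(base64.b64decode(result["image_data"]))
    else:
        print("screenshot failed:", result.get("error"))

asyncio.run(main())
```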
@@ -1,6 +1,6 @@
from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request, HTTPException, Header
from fastapi.responses import StreamingResponse
from typing import List, Dict, Any, Optional
from fastapi.responses import StreamingResponse, JSONResponse
from typing import List, Dict, Any, Optional, Union, Literal, cast
import uvicorn
import logging
import asyncio
@@ -14,6 +14,14 @@ import os
import aiohttp
import hashlib
import time
import platform
from fastapi.middleware.cors import CORSMiddleware

try:
    from agent import ComputerAgent
    HAS_AGENT = True
except ImportError:
    HAS_AGENT = False

# Set up logging with more detail
logger = logging.getLogger(__name__)
@@ -30,6 +38,16 @@ app = FastAPI(
    websocket_max_size=WEBSOCKET_MAX_SIZE,
)

# CORS configuration
origins = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

protocol_version = 1
try:
    from importlib.metadata import version
@@ -197,6 +215,21 @@ class ConnectionManager:
manager = ConnectionManager()
auth_manager = AuthenticationManager()

@app.get("/status")
async def status():
    sys = platform.system().lower()
    # get os type
    if "darwin" in sys or sys == "macos" or sys == "mac":
        os_type = "macos"
    elif "windows" in sys:
        os_type = "windows"
    else:
        os_type = "linux"
    # get computer-server features
    features = []
    if HAS_AGENT:
        features.append("agent")
    return {"status": "ok", "os_type": os_type, "features": features}

@app.websocket("/ws", name="websocket_endpoint")
async def websocket_endpoint(websocket: WebSocket):
@@ -331,7 +364,6 @@ async def websocket_endpoint(websocket: WebSocket):
                pass
        manager.disconnect(websocket)


@app.post("/cmd")
async def cmd_endpoint(
    request: Request,
@@ -420,12 +452,255 @@ async def cmd_endpoint(
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Access-Control-Allow-Origin": "*",
            "Access-Control-Allow-Methods": "POST, OPTIONS",
            "Access-Control-Allow-Headers": "Content-Type, X-Container-Name, X-API-Key"
        }
    )

@app.post("/responses")
async def agent_response_endpoint(
    request: Request,
    api_key: Optional[str] = Header(None, alias="X-API-Key"),
):
    """
    Minimal proxy to run ComputerAgent for up to 2 turns.

    Security:
    - If CONTAINER_NAME is set on the server, require X-API-Key
      and validate using AuthenticationManager unless CUA_ENABLE_PUBLIC_PROXY is true.

    Body JSON:
    {
        "model": "...",               # required
        "input": "... or messages[]", # required
        "agent_kwargs": { ... },      # optional, passed directly to ComputerAgent
        "env": { ... }                # optional env overrides for agent
    }
    """
    if not HAS_AGENT:
        raise HTTPException(status_code=501, detail="ComputerAgent not available")

    # Authenticate via AuthenticationManager if running in cloud (CONTAINER_NAME set)
    container_name = os.environ.get("CONTAINER_NAME")
    if container_name:
        is_public = os.environ.get("CUA_ENABLE_PUBLIC_PROXY", "").lower().strip() in ["1", "true", "yes", "y", "on"]
        if not is_public:
            if not api_key:
                raise HTTPException(status_code=401, detail="Missing AGENT PROXY auth headers")
            ok = await auth_manager.auth(container_name, api_key)
            if not ok:
                raise HTTPException(status_code=401, detail="Unauthorized")

    # Parse request body
    try:
        body = await request.json()
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")

    model = body.get("model")
    input_data = body.get("input")
    if not model or input_data is None:
        raise HTTPException(status_code=400, detail="'model' and 'input' are required")

    agent_kwargs: Dict[str, Any] = body.get("agent_kwargs") or {}
    env_overrides: Dict[str, str] = body.get("env") or {}

    # Simple env override context
    class _EnvOverride:
        def __init__(self, overrides: Dict[str, str]):
            self.overrides = overrides
            self._original: Dict[str, Optional[str]] = {}
        def __enter__(self):
            for k, v in (self.overrides or {}).items():
                self._original[k] = os.environ.get(k)
                os.environ[k] = str(v)
        def __exit__(self, exc_type, exc, tb):
            for k, old in self._original.items():
                if old is None:
                    os.environ.pop(k, None)
                else:
                    os.environ[k] = old

    # Convert input to messages
    def _to_messages(data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
        if isinstance(data, str):
            return [{"role": "user", "content": data}]
        if isinstance(data, list):
            return data

    messages = _to_messages(input_data)

    # Define a direct computer tool that implements the AsyncComputerHandler protocol
    # and delegates to our existing automation/file/accessibility handlers.
    from agent.computers import AsyncComputerHandler  # runtime-checkable Protocol

    class DirectComputer(AsyncComputerHandler):
        def __init__(self):
            # use module-scope handler singletons created by HandlerFactory
            self._auto = automation_handler
            self._file = file_handler
            self._access = accessibility_handler

        async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
            sys = platform.system().lower()
            if "darwin" in sys or sys in ("macos", "mac"):
                return "mac"
            if "windows" in sys:
                return "windows"
            return "linux"

        async def get_dimensions(self) -> tuple[int, int]:
            size = await self._auto.get_screen_size()
            return size["width"], size["height"]

        async def screenshot(self) -> str:
            img_b64 = await self._auto.screenshot()
            return img_b64["image_data"]

        async def click(self, x: int, y: int, button: str = "left") -> None:
            if button == "left":
                await self._auto.left_click(x, y)
            elif button == "right":
                await self._auto.right_click(x, y)
            else:
                await self._auto.left_click(x, y)

        async def double_click(self, x: int, y: int) -> None:
            await self._auto.double_click(x, y)

        async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
            await self._auto.move_cursor(x, y)
            await self._auto.scroll(scroll_x, scroll_y)

        async def type(self, text: str) -> None:
            await self._auto.type_text(text)

        async def wait(self, ms: int = 1000) -> None:
            await asyncio.sleep(ms / 1000.0)

        async def move(self, x: int, y: int) -> None:
            await self._auto.move_cursor(x, y)

        async def keypress(self, keys: Union[List[str], str]) -> None:
            if isinstance(keys, str):
                parts = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys]
            else:
                parts = keys
            if len(parts) == 1:
                await self._auto.press_key(parts[0])
            else:
                await self._auto.hotkey(parts)

        async def drag(self, path: List[Dict[str, int]]) -> None:
            if not path:
                return
            start = path[0]
            await self._auto.mouse_down(start["x"], start["y"])
            for pt in path[1:]:
                await self._auto.move_cursor(pt["x"], pt["y"])
            end = path[-1]
            await self._auto.mouse_up(end["x"], end["y"])

        async def get_current_url(self) -> str:
            # Not available in this server context
            return ""

        async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
            await self._auto.mouse_down(x, y, button="left")

        async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
            await self._auto.mouse_up(x, y, button="left")

    # # Inline image URLs to base64
    # import base64, mimetypes, requests
    # # Use a browser-like User-Agent to avoid 403s from some CDNs (e.g., Wikimedia)
    # HEADERS = {
    #     "User-Agent": (
    #         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    #         "AppleWebKit/537.36 (KHTML, like Gecko) "
    #         "Chrome/124.0.0.0 Safari/537.36"
    #     )
    # }
    # def _to_data_url(content_bytes: bytes, url: str, resp: requests.Response) -> str:
    #     ctype = resp.headers.get("Content-Type") or mimetypes.guess_type(url)[0] or "application/octet-stream"
    #     b64 = base64.b64encode(content_bytes).decode("utf-8")
    #     return f"data:{ctype};base64,{b64}"
    # def inline_image_urls(messages):
    #     # messages: List[{"role": "...","content":[...]}]
    #     out = []
    #     for m in messages:
    #         if not isinstance(m.get("content"), list):
    #             out.append(m)
    #             continue
    #         new_content = []
    #         for part in (m.get("content") or []):
    #             if part.get("type") == "input_image" and (url := part.get("image_url")):
    #                 resp = requests.get(url, headers=HEADERS, timeout=30)
    #                 resp.raise_for_status()
    #                 new_content.append({
    #                     "type": "input_image",
    #                     "image_url": _to_data_url(resp.content, url, resp)
    #                 })
    #             else:
    #                 new_content.append(part)
    #         out.append({**m, "content": new_content})
    #     return out
    # messages = inline_image_urls(messages)

    error = None

    with _EnvOverride(env_overrides):
        # Prepare tools: if caller did not pass tools, inject our DirectComputer
        tools = agent_kwargs.get("tools")
        if not tools:
            tools = [DirectComputer()]
        agent_kwargs = {**agent_kwargs, "tools": tools}
        # Instantiate agent with our tools
        agent = ComputerAgent(model=model, **agent_kwargs)  # type: ignore[arg-type]

        total_output: List[Any] = []
        total_usage: Dict[str, Any] = {}

        pending_computer_call_ids = set()
        try:
            async for result in agent.run(messages):
                total_output += result["output"]
                # Try to collect usage if present
                if isinstance(result, dict) and "usage" in result and isinstance(result["usage"], dict):
                    # Merge usage counters
                    for k, v in result["usage"].items():
                        if isinstance(v, (int, float)):
                            total_usage[k] = total_usage.get(k, 0) + v
                        else:
                            total_usage[k] = v
                for msg in result.get("output", []):
                    if msg.get("type") == "computer_call":
                        pending_computer_call_ids.add(msg["call_id"])
                    elif msg.get("type") == "computer_call_output":
                        pending_computer_call_ids.discard(msg["call_id"])
                # exit if no pending computer calls
                if not pending_computer_call_ids:
                    break
        except Exception as e:
            logger.error(f"Error running agent: {str(e)}")
            logger.error(traceback.format_exc())
            error = str(e)

    # Build response payload
    payload = {
        "model": model,
        "error": error,
        "output": total_output,
        "usage": total_usage,
        "status": "completed" if not error else "failed"
    }

    # CORS: allow any origin
    headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
    }

    return JSONResponse(content=payload, headers=headers)


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
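The `/responses` proxy above accepts exactly the Body JSON described in its docstring and returns the accumulated `output`, merged `usage`, and a `completed`/`failed` status. A hedged request sketch (host, port, and key values are placeholders, and the contents of `agent_kwargs` depend on what ComputerAgent accepts):

```python
import requests

payload = {
    "model": "anthropic/claude-3-5-sonnet-20241022",        # required
    "input": "Open the calculator and take a screenshot",   # required; a string becomes one user message
    "agent_kwargs": {},                                     # optional, forwarded to ComputerAgent
    "env": {"ANTHROPIC_API_KEY": "sk-..."},                 # optional env overrides for this run
}
resp = requests.post(
    "http://localhost:8000/responses",
    json=payload,
    headers={"X-API-Key": "your-container-key"},  # only enforced when CONTAINER_NAME is set
    timeout=600,
)
data = resp.json()
print(data["status"], len(data["output"]), data["usage"])
```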
@@ -65,80 +65,9 @@ Refer to this notebook for a step-by-step guide on how to use the Computer-Use I

- [Computer-Use Interface (CUI)](https://github.com/trycua/cua/blob/main/notebooks/computer_nb.ipynb)

## Using the Gradio Computer UI

The computer module includes a Gradio UI for creating and sharing demonstration data. An upload-to-Huggingface feature makes it easy to build community datasets for better computer-use models.

```bash
# Install with UI support
pip install "cua-computer[ui]"
```

> **Note:** For precise control of the computer, we recommend using VNC or Screen Sharing instead of the Computer Gradio UI.

### Building and Sharing Demonstrations with Huggingface

Follow these steps to contribute your own demonstrations:

#### 1. Set up Huggingface Access

Set your HF_TOKEN in a .env file or in your environment variables:

```bash
# In .env file
HF_TOKEN=your_huggingface_token
```

#### 2. Launch the Computer UI

```python
# launch_ui.py
from computer.ui.gradio.app import create_gradio_ui
from dotenv import load_dotenv
load_dotenv('.env')

app = create_gradio_ui()
app.launch(share=False)
```

For examples, see [Computer UI Examples](https://github.com/trycua/cua/tree/main/examples/computer_ui_examples.py).

#### 3. Record Your Tasks

<details open>
<summary>View demonstration video</summary>
<video src="https://github.com/user-attachments/assets/de3c3477-62fe-413c-998d-4063e48de176" controls width="600"></video>
</details>

Record yourself performing various computer tasks using the UI.

#### 4. Save Your Demonstrations

<details open>
<summary>View demonstration video</summary>
<video src="https://github.com/user-attachments/assets/5ad1df37-026a-457f-8b49-922ae805faef" controls width="600"></video>
</details>

Save each task by picking a descriptive name and adding relevant tags (e.g., "office", "web-browsing", "coding").

#### 5. Record Additional Demonstrations

Repeat steps 3 and 4 until you have a solid set of demonstrations covering different tasks and scenarios.

#### 6. Upload to Huggingface

<details open>
<summary>View demonstration video</summary>
<video src="https://github.com/user-attachments/assets/c586d460-3877-4b5f-a736-3248886d2134" controls width="600"></video>
</details>

Upload your dataset to Huggingface by:
- Naming it as `{your_username}/{dataset_name}`
- Choosing public or private visibility
- Optionally selecting specific tags to upload only tasks with those tags

#### Examples and Resources

- Example Dataset: [ddupont/test-dataset](https://huggingface.co/datasets/ddupont/test-dataset)
- Find Community Datasets: 🔍 [Browse CUA Datasets on Huggingface](https://huggingface.co/datasets?other=cua)

## Docs

- [Computers](https://trycua.com/docs/computer-sdk/computers)
- [Commands](https://trycua.com/docs/computer-sdk/commands)
- [Computer UI](https://trycua.com/docs/computer-sdk/computer-ui)
- [Sandboxed Python](https://trycua.com/docs/computer-sdk/sandboxed-python)
@@ -6,16 +6,35 @@ class DioramaComputer:
    A Computer-compatible proxy for Diorama that sends commands over the ComputerInterface.
    """
    def __init__(self, computer, apps):
        """
        Initialize the DioramaComputer with a computer instance and list of apps.

        Args:
            computer: The computer instance to proxy commands through
            apps: List of applications available in the diorama environment
        """
        self.computer = computer
        self.apps = apps
        self.interface = DioramaComputerInterface(computer, apps)
        self._initialized = False

    async def __aenter__(self):
        """
        Async context manager entry point.

        Returns:
            self: The DioramaComputer instance
        """
        self._initialized = True
        return self

    async def run(self):
        """
        Initialize and run the DioramaComputer if not already initialized.

        Returns:
            self: The DioramaComputer instance
        """
        if not self._initialized:
            await self.__aenter__()
        return self
@@ -25,11 +44,31 @@ class DioramaComputerInterface:
    Diorama Interface proxy that sends diorama_cmds via the Computer's interface.
    """
    def __init__(self, computer, apps):
        """
        Initialize the DioramaComputerInterface.

        Args:
            computer: The computer instance to send commands through
            apps: List of applications available in the diorama environment
        """
        self.computer = computer
        self.apps = apps
        self._scene_size = None

    async def _send_cmd(self, action, arguments=None):
        """
        Send a command to the diorama interface through the computer.

        Args:
            action (str): The action/command to execute
            arguments (dict, optional): Additional arguments for the command

        Returns:
            The result from the diorama command execution

        Raises:
            RuntimeError: If the computer interface is not initialized or the command fails
        """
        arguments = arguments or {}
        arguments = {"app_list": self.apps, **arguments}
        # Use the computer's interface (must be initialized)
@@ -42,6 +81,15 @@ class DioramaComputerInterface:
        return result.get("result")

    async def screenshot(self, as_bytes=True):
        """
        Take a screenshot of the diorama scene.

        Args:
            as_bytes (bool): If True, return the image as bytes; if False, return a PIL Image object

        Returns:
            bytes or PIL.Image: Screenshot data in the requested format
        """
        from PIL import Image
        import base64
        result = await self._send_cmd("screenshot")
@@ -53,41 +101,122 @@ class DioramaComputerInterface:
        return img_bytes if as_bytes else img

    async def get_screen_size(self):
        """
        Get the dimensions of the diorama scene.

        Returns:
            dict: Dictionary containing 'width' and 'height' keys with pixel dimensions
        """
        if not self._scene_size:
            await self.screenshot(as_bytes=False)
        return {"width": self._scene_size[0], "height": self._scene_size[1]}

    async def move_cursor(self, x, y):
        """
        Move the cursor to the specified coordinates.

        Args:
            x (int): X coordinate to move the cursor to
            y (int): Y coordinate to move the cursor to
        """
        await self._send_cmd("move_cursor", {"x": x, "y": y})

    async def left_click(self, x=None, y=None):
        """
        Perform a left mouse click at the specified coordinates or the current cursor position.

        Args:
            x (int, optional): X coordinate to click at. If None, clicks at the current cursor position
            y (int, optional): Y coordinate to click at. If None, clicks at the current cursor position
        """
        await self._send_cmd("left_click", {"x": x, "y": y})

    async def right_click(self, x=None, y=None):
        """
        Perform a right mouse click at the specified coordinates or the current cursor position.

        Args:
            x (int, optional): X coordinate to click at. If None, clicks at the current cursor position
            y (int, optional): Y coordinate to click at. If None, clicks at the current cursor position
        """
        await self._send_cmd("right_click", {"x": x, "y": y})

    async def double_click(self, x=None, y=None):
        """
        Perform a double mouse click at the specified coordinates or the current cursor position.

        Args:
            x (int, optional): X coordinate to double-click at. If None, clicks at the current cursor position
            y (int, optional): Y coordinate to double-click at. If None, clicks at the current cursor position
        """
        await self._send_cmd("double_click", {"x": x, "y": y})

    async def scroll_up(self, clicks=1):
        """
        Scroll up by the specified number of clicks.

        Args:
            clicks (int): Number of scroll clicks to perform upward. Defaults to 1
        """
        await self._send_cmd("scroll_up", {"clicks": clicks})

    async def scroll_down(self, clicks=1):
        """
        Scroll down by the specified number of clicks.

        Args:
            clicks (int): Number of scroll clicks to perform downward. Defaults to 1
        """
        await self._send_cmd("scroll_down", {"clicks": clicks})

    async def drag_to(self, x, y, duration=0.5):
        """
        Drag from the current cursor position to the specified coordinates.

        Args:
            x (int): X coordinate to drag to
            y (int): Y coordinate to drag to
            duration (float): Duration of the drag operation in seconds. Defaults to 0.5
        """
        await self._send_cmd("drag_to", {"x": x, "y": y, "duration": duration})

    async def get_cursor_position(self):
        """
        Get the current cursor position.

        Returns:
            dict: Dictionary containing the current cursor coordinates
        """
        return await self._send_cmd("get_cursor_position")

    async def type_text(self, text):
        """
        Type the specified text at the current cursor position.

        Args:
            text (str): The text to type
        """
        await self._send_cmd("type_text", {"text": text})

    async def press_key(self, key):
        """
        Press a single key.

        Args:
            key: The key to press
        """
        await self._send_cmd("press_key", {"key": key})

    async def hotkey(self, *keys):
        """
        Press multiple keys simultaneously as a hotkey combination.

        Args:
            *keys: Variable number of keys to press together. Can be Key enum instances or strings

        Raises:
            ValueError: If any key is not a Key enum or string type
        """
        actual_keys = []
        for key in keys:
            if isinstance(key, Key):
@@ -101,4 +230,14 @@ class DioramaComputerInterface:
        await self._send_cmd("hotkey", {"keys": actual_keys})

    async def to_screen_coordinates(self, x, y):
        """
        Convert coordinates to screen coordinates.

        Args:
            x (int): X coordinate to convert
            y (int): Y coordinate to convert

        Returns:
            dict: Dictionary containing the converted screen coordinates
        """
        return await self._send_cmd("to_screen_coordinates", {"x": x, "y": y})
@@ -17,60 +17,20 @@

**cua-mcp-server** is an MCP server for the Computer-Use Agent (CUA), allowing you to run CUA through Claude Desktop or other MCP clients.

## LiteLLM Integration

This MCP server features comprehensive liteLLM integration, allowing you to use any supported LLM provider with a simple model string configuration.

- **Unified Configuration**: Use a single `CUA_MODEL_NAME` environment variable with a model string
- **Automatic Provider Detection**: The agent automatically detects the provider and capabilities from the model string
- **Extensive Provider Support**: Works with Anthropic, OpenAI, local models, and any liteLLM-compatible provider

### Model String Examples:
- **Anthropic**: `"anthropic/claude-3-5-sonnet-20241022"`
- **OpenAI**: `"openai/computer-use-preview"`
- **UI-TARS**: `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"`
- **Omni + Any LiteLLM**: `"omniparser+litellm/gpt-4o"`, `"omniparser+litellm/claude-3-haiku"`, `"omniparser+ollama_chat/gemma3"`

### Get started with Agent

## Prerequisites

Before installing the MCP server, you'll need to set up full Computer-Use Agent capabilities. This includes:
Cua MCP Server requires [lume](https://github.com/trycua/cua/blob/main/libs/lume/README.md#install) to be installed.

1. Installing the Lume CLI
2. Pulling the latest macOS CUA image
3. Starting the Lume daemon service
4. Installing the required Python libraries (optional: only needed if you want to verify the agent is working before installing the MCP server)
## Install

Make sure these steps are completed and working before proceeding with the MCP server installation.

## Installation

Install the package from PyPI:

```bash
pip install cua-mcp-server
```

This will install:
- The MCP server
- CUA agent and computer dependencies
- An executable `cua-mcp-server` script in your PATH

## Easy Setup Script

If you want to simplify installation, you can use this one-liner to download and run the installation script:
Download and run the installation script:

```bash
curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/python/mcp-server/scripts/install_mcp_server.sh | bash
```

This script will:
- Create the ~/.cua directory if it doesn't exist
- Generate a startup script at ~/.cua/start_mcp_server.sh
- Make the script executable
- The generated startup script automatically manages Python virtual environments and installs/updates the cua-mcp-server package

You can then use the script in your MCP configuration like this (a programmatic sketch follows this block):

```json
@@ -87,9 +47,9 @@ You can then use the script in your MCP configuration like this:
}
```
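The JSON block above is truncated by the diff hunk, so for orientation here is a hedged sketch of wiring the startup script into a Claude Desktop config programmatically. The `mcpServers` key is the standard MCP client convention; the server name `"cua"` and the config path are illustrative assumptions, not values confirmed by this README:

```python
import json
import os

# Assumption: Claude Desktop config lives where this README later says it
# typically does (~/.config/claude-desktop/); adjust for your platform.
config_path = os.path.expanduser("~/.config/claude-desktop/claude_desktop_config.json")

with open(config_path) as f:
    config = json.load(f)

# "cua" is an illustrative server name; the command runs the generated script.
config.setdefault("mcpServers", {})["cua"] = {
    "command": "/bin/bash",
    "args": [os.path.expanduser("~/.cua/start_mcp_server.sh")],
}

with open(config_path, "w") as f:
    json.dump(config, f, indent=2)
```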

## Development Guide
## Development

If you want to develop with the cua-mcp-server directly without installation, you can use this configuration:
Use this configuration to develop with the cua-mcp-server directly without installation:

```json
{
@@ -112,61 +72,11 @@ This configuration:

Just add this to your MCP client's configuration and it will use your local development version of the server.

### Troubleshooting
## Docs

If you get a `/bin/bash: ~/cua/libs/python/mcp-server/scripts/start_mcp_server.sh: No such file or directory` error, try changing the path to the script to an absolute path instead of a relative one.

To see the logs:
```
tail -n 20 -f ~/Library/Logs/Claude/mcp*.log
```

## Claude Desktop Integration

To use with Claude Desktop, add an entry to your Claude Desktop configuration (`claude_desktop_config.json`, typically found in `~/.config/claude-desktop/`):

For more information on MCP with Claude Desktop, see the [official MCP User Guide](https://modelcontextprotocol.io/quickstart/user).

## Cursor Integration

To use with Cursor, add an MCP configuration file in one of these locations:

- **Project-specific**: Create `.cursor/mcp.json` in your project directory
- **Global**: Create `~/.cursor/mcp.json` in your home directory

After configuration, you can simply tell Cursor's Agent to perform computer tasks by explicitly mentioning the CUA agent, such as "Use the computer control tools to open Safari."

For more information on MCP with Cursor, see the [official Cursor MCP documentation](https://docs.cursor.com/context/model-context-protocol).

### First-time Usage Notes

**API Keys**: Ensure you have valid API keys:
- Add your Anthropic API key (or another model provider's API key) in the Claude Desktop config (as shown above)
- Or set it as an environment variable in your shell profile

## Configuration

The server is configured using environment variables (these can be set in the Claude Desktop config; a small sketch follows the table):

| Variable | Description | Default |
|----------|-------------|---------|
| `CUA_MODEL_NAME` | Model string (e.g., "anthropic/claude-3-5-sonnet-20241022", "openai/computer-use-preview", "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", "omniparser+litellm/gpt-4o", "omniparser+ollama_chat/gemma3") | anthropic/claude-3-5-sonnet-20241022 |
| `CUA_MAX_IMAGES` | Maximum number of images to keep in context | 3 |
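A hedged illustration of the same two settings expressed as process environment variables (the values shown are examples only, not recommendations):

```python
import os

# Equivalent environment-variable form of the table above; the values
# here are illustrative examples taken from the model string list.
os.environ["CUA_MODEL_NAME"] = "omniparser+ollama_chat/gemma3"
os.environ["CUA_MAX_IMAGES"] = "3"  # keep at most 3 screenshots in the agent's context
```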

## Available Tools

The MCP server exposes the following tools to Claude:

1. `run_cua_task` - Run a single Computer-Use Agent task with the given instruction
2. `run_multi_cua_tasks` - Run multiple tasks in sequence

## Usage

Once configured, you can simply ask Claude to perform computer tasks:

- "Open Chrome and go to github.com"
- "Create a folder called 'Projects' on my desktop"
- "Find all PDFs in my Downloads folder"
- "Take a screenshot and highlight the error message"

Claude will automatically use your CUA agent to perform these tasks.

- [Installation](https://trycua.com/docs/libraries/mcp-server/installation)
- [Configuration](https://trycua.com/docs/libraries/mcp-server/configuration)
- [Usage](https://trycua.com/docs/libraries/mcp-server/usage)
- [Tools](https://trycua.com/docs/libraries/mcp-server/tools)
- [Client Integrations](https://trycua.com/docs/libraries/mcp-server/client-integrations)
- [LLM Integrations](https://trycua.com/docs/libraries/mcp-server/llm-integrations)
libs/python/som/LICENSE (new file, 661 lines)
@@ -0,0 +1,661 @@
                    GNU AFFERO GENERAL PUBLIC LICENSE
                       Version 3, 19 November 2007

 Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

                            Preamble

  The GNU Affero General Public License is a free, copyleft license for
software and other kinds of works, specifically designed to ensure
cooperation with the community in the case of network server software.

  The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
our General Public Licenses are intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.

  When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.

  Developers that use our General Public Licenses protect your rights
with two steps: (1) assert copyright on the software, and (2) offer
you this License which gives you legal permission to copy, distribute
and/or modify the software.

  A secondary benefit of defending all users' freedom is that
improvements made in alternate versions of the program, if they
receive widespread use, become available for other developers to
incorporate. Many developers of free software are heartened and
encouraged by the resulting cooperation. However, in the case of
software used on network servers, this result may fail to come about.
The GNU General Public License permits making a modified version and
letting the public access it on a server without ever releasing its
source code to the public.

  The GNU Affero General Public License is designed specifically to
ensure that, in such cases, the modified source code becomes available
to the community. It requires the operator of a network server to
provide the source code of the modified version running there to the
users of that server. Therefore, public use of a modified version, on
a publicly accessible server, gives the public access to the source
code of the modified version.

  An older license, called the Affero General Public License and
published by Affero, was designed to accomplish similar goals. This is
a different license, not a version of the Affero GPL, but Affero has
released a new version of the Affero GPL which permits relicensing under
this license.

  The precise terms and conditions for copying, distribution and
modification follow.

                       TERMS AND CONDITIONS

  0. Definitions.

  "This License" refers to version 3 of the GNU Affero General Public License.

  "Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.

  "The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.

  To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.

  A "covered work" means either the unmodified Program or a work based
on the Program.

  To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.

  To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.

  An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.

  1. Source Code.

  The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.

  A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.

  The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.

  The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.

  The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.

  The Corresponding Source for a work in source code form is that
same work.

  2. Basic Permissions.

  All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.

  You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.

  Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.

  3. Protecting Users' Legal Rights From Anti-Circumvention Law.

  No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.

  When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.

  4. Conveying Verbatim Copies.

  You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.

  You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.

  5. Conveying Modified Source Versions.

  You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:

    a) The work must carry prominent notices stating that you modified
    it, and giving a relevant date.

    b) The work must carry prominent notices stating that it is
    released under this License and any conditions added under section
    7. This requirement modifies the requirement in section 4 to
    "keep intact all notices".

    c) You must license the entire work, as a whole, under this
    License to anyone who comes into possession of a copy. This
    License will therefore apply, along with any applicable section 7
    additional terms, to the whole of the work, and all its parts,
    regardless of how they are packaged. This License gives no
    permission to license the work in any other way, but it does not
    invalidate such permission if you have separately received it.

    d) If the work has interactive user interfaces, each must display
    Appropriate Legal Notices; however, if the Program has interactive
    interfaces that do not display Appropriate Legal Notices, your
    work need not make them do so.

  A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.

  6. Conveying Non-Source Forms.

  You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:

    a) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by the
    Corresponding Source fixed on a durable physical medium
    customarily used for software interchange.

    b) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by a
    written offer, valid for at least three years and valid for as
    long as you offer spare parts or customer support for that product
    model, to give anyone who possesses the object code either (1) a
    copy of the Corresponding Source for all the software in the
    product that is covered by this License, on a durable physical
    medium customarily used for software interchange, for a price no
    more than your reasonable cost of physically performing this
    conveying of source, or (2) access to copy the
    Corresponding Source from a network server at no charge.

    c) Convey individual copies of the object code with a copy of the
    written offer to provide the Corresponding Source. This
    alternative is allowed only occasionally and noncommercially, and
    only if you received the object code with such an offer, in accord
    with subsection 6b.

    d) Convey the object code by offering access from a designated
    place (gratis or for a charge), and offer equivalent access to the
    Corresponding Source in the same way through the same place at no
    further charge. You need not require recipients to copy the
    Corresponding Source along with the object code. If the place to
    copy the object code is a network server, the Corresponding Source
    may be on a different server (operated by you or a third party)
    that supports equivalent copying facilities, provided you maintain
    clear directions next to the object code saying where to find the
    Corresponding Source. Regardless of what server hosts the
    Corresponding Source, you remain obligated to ensure that it is
    available for as long as needed to satisfy these requirements.

    e) Convey the object code using peer-to-peer transmission, provided
    you inform other peers where the object code and Corresponding
    Source of the work are being offered to the general public at no
    charge under subsection 6d.

  A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.

  A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.

  "Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.

  If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).

  The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.

  Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.

  7. Additional Terms.

  "Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.

  When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.

  Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:

    a) Disclaiming warranty or limiting liability differently from the
    terms of sections 15 and 16 of this License; or

    b) Requiring preservation of specified reasonable legal notices or
    author attributions in that material or in the Appropriate Legal
    Notices displayed by works containing it; or

    c) Prohibiting misrepresentation of the origin of that material, or
    requiring that modified versions of such material be marked in
    reasonable ways as different from the original version; or

    d) Limiting the use for publicity purposes of names of licensors or
    authors of the material; or

    e) Declining to grant rights under trademark law for use of some
    trade names, trademarks, or service marks; or

    f) Requiring indemnification of licensors and authors of that
    material by anyone who conveys the material (or modified versions of
    it) with contractual assumptions of liability to the recipient, for
    any liability that these contractual assumptions directly impose on
    those licensors and authors.

  All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.

  If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.

  Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.

  8. Termination.

  You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).

  However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
|
||||
prior to 60 days after the cessation.
|
||||
|
||||
Moreover, your license from a particular copyright holder is
|
||||
reinstated permanently if the copyright holder notifies you of the
|
||||
violation by some reasonable means, this is the first time you have
|
||||
received notice of violation of this License (for any work) from that
|
||||
copyright holder, and you cure the violation prior to 30 days after
|
||||
your receipt of the notice.
|
||||
|
||||
Termination of your rights under this section does not terminate the
|
||||
licenses of parties who have received copies or rights from you under
|
||||
this License. If your rights have been terminated and not permanently
|
||||
reinstated, you do not qualify to receive new licenses for the same
|
||||
material under section 10.
|
||||
|
||||
9. Acceptance Not Required for Having Copies.
|
||||
|
||||
You are not required to accept this License in order to receive or
|
||||
run a copy of the Program. Ancillary propagation of a covered work
|
||||
occurring solely as a consequence of using peer-to-peer transmission
|
||||
to receive a copy likewise does not require acceptance. However,
|
||||
nothing other than this License grants you permission to propagate or
|
||||
modify any covered work. These actions infringe copyright if you do
|
||||
not accept this License. Therefore, by modifying or propagating a
|
||||
covered work, you indicate your acceptance of this License to do so.
|
||||
|
||||
10. Automatic Licensing of Downstream Recipients.
|
||||
|
||||
Each time you convey a covered work, the recipient automatically
|
||||
receives a license from the original licensors, to run, modify and
|
||||
propagate that work, subject to this License. You are not responsible
|
||||
for enforcing compliance by third parties with this License.
|
||||
|
||||
An "entity transaction" is a transaction transferring control of an
|
||||
organization, or substantially all assets of one, or subdividing an
|
||||
organization, or merging organizations. If propagation of a covered
|
||||
work results from an entity transaction, each party to that
|
||||
transaction who receives a copy of the work also receives whatever
|
||||
licenses to the work the party's predecessor in interest had or could
|
||||
give under the previous paragraph, plus a right to possession of the
|
||||
Corresponding Source of the work from the predecessor in interest, if
|
||||
the predecessor has it or can get it with reasonable efforts.
|
||||
|
||||
You may not impose any further restrictions on the exercise of the
|
||||
rights granted or affirmed under this License. For example, you may
|
||||
not impose a license fee, royalty, or other charge for exercise of
|
||||
rights granted under this License, and you may not initiate litigation
|
||||
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
||||
any patent claim is infringed by making, using, selling, offering for
|
||||
sale, or importing the Program or any portion of it.
|
||||
|
||||
11. Patents.
|
||||
|
||||
A "contributor" is a copyright holder who authorizes use under this
|
||||
License of the Program or a work on which the Program is based. The
|
||||
work thus licensed is called the contributor's "contributor version".
|
||||
|
||||
A contributor's "essential patent claims" are all patent claims
|
||||
owned or controlled by the contributor, whether already acquired or
|
||||
hereafter acquired, that would be infringed by some manner, permitted
|
||||
by this License, of making, using, or selling its contributor version,
|
||||
but do not include claims that would be infringed only as a
|
||||
consequence of further modification of the contributor version. For
|
||||
purposes of this definition, "control" includes the right to grant
|
||||
patent sublicenses in a manner consistent with the requirements of
|
||||
this License.
|
||||
|
||||
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
||||
patent license under the contributor's essential patent claims, to
|
||||
make, use, sell, offer for sale, import and otherwise run, modify and
|
||||
propagate the contents of its contributor version.
|
||||
|
||||
In the following three paragraphs, a "patent license" is any express
|
||||
agreement or commitment, however denominated, not to enforce a patent
|
||||
(such as an express permission to practice a patent or covenant not to
|
||||
sue for patent infringement). To "grant" such a patent license to a
|
||||
party means to make such an agreement or commitment not to enforce a
|
||||
patent against the party.
|
||||
|
||||
If you convey a covered work, knowingly relying on a patent license,
|
||||
and the Corresponding Source of the work is not available for anyone
|
||||
to copy, free of charge and under the terms of this License, through a
|
||||
publicly available network server or other readily accessible means,
|
||||
then you must either (1) cause the Corresponding Source to be so
|
||||
available, or (2) arrange to deprive yourself of the benefit of the
|
||||
patent license for this particular work, or (3) arrange, in a manner
|
||||
consistent with the requirements of this License, to extend the patent
|
||||
license to downstream recipients. "Knowingly relying" means you have
|
||||
actual knowledge that, but for the patent license, your conveying the
|
||||
covered work in a country, or your recipient's use of the covered work
|
||||
in a country, would infringe one or more identifiable patents in that
|
||||
country that you have reason to believe are valid.
|
||||
|
||||
If, pursuant to or in connection with a single transaction or
|
||||
arrangement, you convey, or propagate by procuring conveyance of, a
|
||||
covered work, and grant a patent license to some of the parties
|
||||
receiving the covered work authorizing them to use, propagate, modify
|
||||
or convey a specific copy of the covered work, then the patent license
|
||||
you grant is automatically extended to all recipients of the covered
|
||||
work and works based on it.
|
||||
|
||||
A patent license is "discriminatory" if it does not include within
|
||||
the scope of its coverage, prohibits the exercise of, or is
|
||||
conditioned on the non-exercise of one or more of the rights that are
|
||||
specifically granted under this License. You may not convey a covered
|
||||
work if you are a party to an arrangement with a third party that is
|
||||
in the business of distributing software, under which you make payment
|
||||
to the third party based on the extent of your activity of conveying
|
||||
the work, and under which the third party grants, to any of the
|
||||
parties who would receive the covered work from you, a discriminatory
|
||||
patent license (a) in connection with copies of the covered work
|
||||
conveyed by you (or copies made from those copies), or (b) primarily
|
||||
for and in connection with specific products or compilations that
|
||||
contain the covered work, unless you entered into that arrangement,
|
||||
or that patent license was granted, prior to 28 March 2007.
|
||||
|
||||
Nothing in this License shall be construed as excluding or limiting
|
||||
any implied license or other defenses to infringement that may
|
||||
otherwise be available to you under applicable patent law.
|
||||
|
||||
12. No Surrender of Others' Freedom.
|
||||
|
||||
If conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot convey a
|
||||
covered work so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you may
|
||||
not convey it at all. For example, if you agree to terms that obligate you
|
||||
to collect a royalty for further conveying from those to whom you convey
|
||||
the Program, the only way you could satisfy both those terms and this
|
||||
License would be to refrain entirely from conveying the Program.
|
||||
|
||||
13. Remote Network Interaction; Use with the GNU General Public License.
|
||||
|
||||
Notwithstanding any other provision of this License, if you modify the
|
||||
Program, your modified version must prominently offer all users
|
||||
interacting with it remotely through a computer network (if your version
|
||||
supports such interaction) an opportunity to receive the Corresponding
|
||||
Source of your version by providing access to the Corresponding Source
|
||||
from a network server at no charge, through some standard or customary
|
||||
means of facilitating copying of software. This Corresponding Source
|
||||
shall include the Corresponding Source for any work covered by version 3
|
||||
of the GNU General Public License that is incorporated pursuant to the
|
||||
following paragraph.
|
||||
|
||||
Notwithstanding any other provision of this License, you have
|
||||
permission to link or combine any covered work with a work licensed
|
||||
under version 3 of the GNU General Public License into a single
|
||||
combined work, and to convey the resulting work. The terms of this
|
||||
License will continue to apply to the part which is the covered work,
|
||||
but the work with which it is combined will remain governed by version
|
||||
3 of the GNU General Public License.
|
||||
|
||||
14. Revised Versions of this License.
|
||||
|
||||
The Free Software Foundation may publish revised and/or new versions of
|
||||
the GNU Affero General Public License from time to time. Such new versions
|
||||
will be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the
|
||||
Program specifies that a certain numbered version of the GNU Affero General
|
||||
Public License "or any later version" applies to it, you have the
|
||||
option of following the terms and conditions either of that numbered
|
||||
version or of any later version published by the Free Software
|
||||
Foundation. If the Program does not specify a version number of the
|
||||
GNU Affero General Public License, you may choose any version ever published
|
||||
by the Free Software Foundation.
|
||||
|
||||
If the Program specifies that a proxy can decide which future
|
||||
versions of the GNU Affero General Public License can be used, that proxy's
|
||||
public statement of acceptance of a version permanently authorizes you
|
||||
to choose that version for the Program.
|
||||
|
||||
Later license versions may give you additional or different
|
||||
permissions. However, no additional obligations are imposed on any
|
||||
author or copyright holder as a result of your choosing to follow a
|
||||
later version.
|
||||
|
||||
15. Disclaimer of Warranty.
|
||||
|
||||
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
||||
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
||||
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
||||
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
||||
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
||||
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
||||
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. Limitation of Liability.
|
||||
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
||||
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
||||
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
||||
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
||||
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
||||
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
||||
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGES.
|
||||
|
||||
17. Interpretation of Sections 15 and 16.
|
||||
|
||||
If the disclaimer of warranty and limitation of liability provided
|
||||
above cannot be given local legal effect according to their terms,
|
||||
reviewing courts shall apply local law that most closely approximates
|
||||
an absolute waiver of all civil liability in connection with the
|
||||
Program, unless a warranty or assumption of liability accompanies a
|
||||
copy of the Program in return for a fee.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Programs
|
||||
|
||||
If you develop a new program, and you want it to be of the greatest
|
||||
possible use to the public, the best way to achieve this is to make it
|
||||
free software which everyone can redistribute and change under these terms.
|
||||
|
||||
To do so, attach the following notices to the program. It is safest
|
||||
to attach them to the start of each source file to most effectively
|
||||
state the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the program's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
If your software can interact with users remotely through a computer
|
||||
network, you should also make sure that it provides a way for users to
|
||||
get its source. For example, if your program is a web application, its
|
||||
interface could display a "Source" link that leads users to an archive
|
||||
of the code. There are many ways you could offer source, and different
|
||||
solutions will be better for different programs; see section 13 for the
|
||||
specific requirements.
|
||||
|
||||
You should also get your employer (if you work as a programmer) or school,
|
||||
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
||||
For more information on this, and how to apply and follow the GNU AGPL, see
|
||||
<https://www.gnu.org/licenses/>.
|
||||
@@ -75,93 +75,9 @@ for elem in result.elements:
    print(f"Text: '{elem.content}', confidence={elem.confidence:.3f}")
```

## Configuration
## Docs

### Detection Parameters

#### Box Threshold (0.3)
Controls the confidence threshold for accepting detections:
```
High Threshold (0.3):        Low Threshold (0.01):
+----------------+           +----------------+
|                |           |  +--------+    |
|   Confident    |           |  |Unsure? |    |
|   Detection    |           |  +--------+    |
|   (✓ Accept)   |           |   (? Reject)   |
|                |           |                |
+----------------+           +----------------+
   conf = 0.85                  conf = 0.02
```
- Higher values (0.3) yield more precise but fewer detections
- Lower values (0.01) catch more potential icons but increase false positives
- The default of 0.3 gives a good precision/recall balance (a filtering sketch follows this list)

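As a rough illustration of what the box threshold does downstream, here is a minimal sketch that filters raw detections by confidence. The `Detection` dataclass and the example scores are hypothetical stand-ins for a detector's output, not the library's actual types:

```python
from dataclasses import dataclass


@dataclass
class Detection:
    # Hypothetical stand-in for one raw detector output:
    # (x1, y1, x2, y2) box in pixels plus a confidence in [0, 1].
    box: tuple[float, float, float, float]
    confidence: float


def filter_by_confidence(detections: list[Detection], box_threshold: float = 0.3) -> list[Detection]:
    """Keep only detections at or above the box threshold."""
    return [d for d in detections if d.confidence >= box_threshold]


raw = [Detection((10, 10, 50, 50), 0.85), Detection((60, 10, 90, 40), 0.02)]
print(len(filter_by_confidence(raw, box_threshold=0.3)))   # 1 -> precise, fewer boxes
print(len(filter_by_confidence(raw, box_threshold=0.01)))  # 2 -> more recall, more noise
```
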
#### IOU Threshold (0.1)
Controls how overlapping detections are merged:
```
IOU = Intersection Area / Union Area

Low Overlap (Keep Both):     High Overlap (Merge):
+----------+                 +----------+
|  Box1    |                 |  Box1    |
|          |       vs.       | +-----+  |
+----------+                 | |Box2 |  |
+----------+                 | +-----+  |
|  Box2    |                 +----------+
|          |
+----------+
IOU ≈ 0.05 (Keep Both)       IOU ≈ 0.7 (Merge)
```
- Lower values (0.1) more aggressively remove overlapping boxes
- Higher values (0.5) allow more overlapping detections
- The default of 0.1 handles densely packed UI elements (see the IOU sketch after this list)

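To make the IOU formula concrete, here is a minimal sketch of the computation plus a greedy, NMS-style suppression pass. It mirrors the idea described above under stated assumptions; it is not the library's internal implementation:

```python
Box = tuple[float, float, float, float]  # (x1, y1, x2, y2) in pixels


def iou(a: Box, b: Box) -> float:
    """Intersection area divided by union area of two axis-aligned boxes."""
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0


def suppress(boxes: list[Box], scores: list[float], iou_threshold: float = 0.1) -> list[Box]:
    """Greedy suppression: keep the highest-scoring box, drop boxes that overlap it."""
    order = sorted(range(len(boxes)), key=lambda i: scores[i], reverse=True)
    kept: list[Box] = []
    for i in order:
        if all(iou(boxes[i], k) <= iou_threshold for k in kept):
            kept.append(boxes[i])
    return kept
```

With the low default of 0.1, even lightly overlapping boxes are merged, which suits dense UI layouts where the detector tends to fire twice on the same icon.
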
### OCR Configuration

- **Engine**: EasyOCR
  - Primary choice for all platforms
  - Fast initialization and processing
  - Built-in English language support
  - GPU acceleration when available

- **Settings** (see the usage sketch after this list):
  - Timeout: 5 seconds
  - Confidence threshold: 0.5
  - Paragraph mode: Disabled
  - Language: English only

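As an assumption-laden sketch of how those settings map onto a direct EasyOCR call (the package's own wiring may differ, and the 5-second timeout is assumed to be enforced by the caller rather than shown here):

```python
import easyocr

# One-time reader init: English only, GPU used when available.
reader = easyocr.Reader(["en"], gpu=True)


def extract_text(image_path: str, confidence_threshold: float = 0.5) -> list[str]:
    """Run OCR with paragraph mode disabled and drop low-confidence hits."""
    # With paragraph=False, each result is (bounding_box, text, confidence).
    results = reader.readtext(image_path, paragraph=False)
    return [text for _box, text, conf in results if conf >= confidence_threshold]
```
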
## Performance

### Hardware Acceleration

#### MPS (Metal Performance Shaders)
- Multi-scale detection (640px, 1280px, 1920px)
- Test-time augmentation enabled
- Half-precision (FP16)
- Average detection time: ~0.4s
- Best for production use when available

#### CPU
- Single-scale detection (1280px)
- Full-precision (FP32)
- Average detection time: ~1.3s
- Reliable fallback option (a device-selection sketch follows this list)

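A minimal sketch of how that runtime choice might be made, assuming a PyTorch backend; the scale lists and precisions are the values quoted above, and the actual model invocation is omitted:

```python
import torch

if torch.backends.mps.is_available():
    # Apple-silicon path: half precision, multi-scale, ~0.4s per frame.
    device, dtype = "mps", torch.float16
    scales = [640, 1280, 1920]
else:
    # Portable fallback: full precision, single scale, ~1.3s per frame.
    device, dtype = "cpu", torch.float32
    scales = [1280]
```
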
### Example Output Structure

```
examples/output/
├── {timestamp}_no_ocr/
│   ├── annotated_images/
│   │   └── screenshot_analyzed.png
│   ├── screen_details.txt
│   └── summary.json
└── {timestamp}_ocr/
    ├── annotated_images/
    │   └── screenshot_analyzed.png
    ├── screen_details.txt
    └── summary.json
```

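For illustration, a hypothetical helper that lays out one run directory in that shape; the timestamp format and `make_run_dir` name are assumptions, not part of the package:

```python
import json
import time
from pathlib import Path


def make_run_dir(base: Path, use_ocr: bool) -> Path:
    """Create a timestamped run directory matching the layout above."""
    suffix = "ocr" if use_ocr else "no_ocr"
    run = base / f"{time.strftime('%Y%m%d_%H%M%S')}_{suffix}"
    (run / "annotated_images").mkdir(parents=True, exist_ok=True)
    return run


run_dir = make_run_dir(Path("examples/output"), use_ocr=True)
(run_dir / "summary.json").write_text(json.dumps({"elements": []}, indent=2))
```
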
- [Configuration](http://localhost:8090/docs/libraries/som/configuration)

## Development

@@ -26,12 +26,12 @@ dependencies = [
]
requires-python = ">=3.11"
readme = "README.md"
license = {text = "MIT"}
license = {text = "AGPL-3.0-or-later"}
keywords = ["computer-vision", "ocr", "ui-analysis", "icon-detection"]
classifiers = [
    "Development Status :: 4 - Beta",
    "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.11",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",