Mirror of https://github.com/trycua/computer.git (synced 2025-12-31 02:19:58 -06:00)
Merge pull request #371 from trycua/chore/hud-upgrade
[Agent] Upgrade HUD SDK to 0.4.12
@@ -10,37 +10,35 @@ The HUD integration allows you to use ComputerAgent with the [HUD benchmarking f

```bash
pip install "cua-agent[hud]"
# or install hud-python directly
-# pip install hud-python==0.2.10
+# pip install hud-python==0.4.12
```

## Usage

```python
import logging
-from agent.integrations.hud import run_job
-from hud import load_taskset
-from hud.taskset import TaskSet
-
-# Load taskset
-taskset = await load_taskset("OSWorld-Verified")
-taskset = TaskSet(tasks=taskset[:10])  # limit to 10 tasks instead of all 370
-
-# Run benchmark job
-job = await run_job(
-    model="openai/computer-use-preview",
-    # model="anthropic/claude-3-5-sonnet-20241022",
-    # model="huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-5",
-    task_or_taskset=taskset,
-    job_name="test-computeragent-job",
-    max_concurrent_tasks=5,
-    # add any extra ComputerAgent kwargs:
-    verbosity=logging.INFO,  # Enable logging
-    # trajectory_dir=".."  # Save trajectories locally
-)
-
-# Get results OR view them at app.hud.so
-print(await job.get_analytics())
-print(f"View results at: https://app.hud.so/jobs/{job.id}")
+
+# Quick single-task smoke test
+from agent.integrations.hud import run_single_task
+
+await run_single_task(
+    dataset="hud-evals/OSWorld-Verified-XLang",  # or another HUD dataset
+    model="openai/computer-use-preview+openai/gpt-5-nano",  # any supported model string
+    task_id=155,  # e.g., reopen last closed tab
+)
+
+# Run a small split of OSWorld-Verified in parallel
+from agent.integrations.hud import run_full_dataset
+
+results = await run_full_dataset(
+    dataset="hud-evals/OSWorld-Verified-XLang",  # can also pass a Dataset or list[dict]
+    model="openai/computer-use-preview",
+    split="train[:3]",  # try a few tasks to start
+    max_concurrent=20,  # tune to your infra
+    max_steps=50,  # safety cap per task
+)
+
+# Environment variables required:
+# - HUD_API_KEY (HUD access)
+# - OPENAI_API_KEY or ANTHROPIC_API_KEY depending on your chosen model(s)
```

**Available Benchmarks:**
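For reference, a minimal environment setup sketch (key values are placeholders; set these before running the examples above):

```python
import os

os.environ.setdefault("HUD_API_KEY", "hud-...")    # HUD access (placeholder value)
os.environ.setdefault("OPENAI_API_KEY", "sk-...")  # or ANTHROPIC_API_KEY, depending on model
```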
@@ -30,6 +30,7 @@ from .callbacks import (
    TrajectorySaverCallback,
    BudgetManagerCallback,
    TelemetryCallback,
+   OperatorNormalizerCallback
)
from .computers import (
    AsyncComputerHandler,
@@ -202,6 +203,9 @@ class ComputerAgent:

        # == Add built-in callbacks ==
+       # Prepend operator normalizer callback
+       self.callbacks.insert(0, OperatorNormalizerCallback())

        # Add telemetry callback if telemetry_enabled is set
        if self.telemetry_enabled:
            if isinstance(self.telemetry_enabled, bool):
@@ -8,6 +8,7 @@ from .logging import LoggingCallback
from .trajectory_saver import TrajectorySaverCallback
from .budget_manager import BudgetManagerCallback
from .telemetry import TelemetryCallback
+from .operator_validator import OperatorNormalizerCallback

__all__ = [
    "AsyncCallbackHandler",
@@ -16,4 +17,5 @@ __all__ = [
    "TrajectorySaverCallback",
    "BudgetManagerCallback",
    "TelemetryCallback",
+   "OperatorNormalizerCallback",
]
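Because of the prepend above, the normalizer always runs before user-supplied callbacks. A hedged sketch of what a caller observes (class names from this diff; the exact attribute layout is assumed):

```python
from agent import ComputerAgent
from agent.callbacks import OperatorNormalizerCallback

agent = ComputerAgent(model="openai/computer-use-preview")
# The normalizer is inserted automatically at position 0,
# so it sees raw LLM output before any other callback (assumed attribute):
assert isinstance(agent.callbacks[0], OperatorNormalizerCallback)
```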
libs/python/agent/agent/callbacks/operator_validator.py (new file, 138 lines)
@@ -0,0 +1,138 @@
"""
|
||||
OperatorValidatorCallback
|
||||
|
||||
Ensures agent output actions conform to expected schemas by fixing common issues:
|
||||
- click: add default button='left' if missing
|
||||
- keypress: wrap keys string into a list
|
||||
- etc.
|
||||
|
||||
This runs in on_llm_end, which receives the output array (AgentMessage[] as dicts).
|
||||
The purpose is to avoid spending another LLM call to fix broken computer call syntax when possible.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from .base import AsyncCallbackHandler
|
||||
|
||||
|
||||
class OperatorNormalizerCallback(AsyncCallbackHandler):
|
||||
"""Normalizes common computer call hallucinations / errors in computer call syntax."""
|
||||
|
||||
async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
# Mutate in-place as requested, but still return the list for chaining
|
||||
for item in output or []:
|
||||
if item.get("type") != "computer_call":
|
||||
continue
|
||||
action = item.get("action")
|
||||
if not isinstance(action, dict):
|
||||
continue
|
||||
|
||||
# rename mouse click actions to "click"
|
||||
for mouse_btn in ["left", "right", "wheel", "back", "forward"]:
|
||||
if action.get("type", "") == f"{mouse_btn}_click":
|
||||
action["type"] = "click"
|
||||
action["button"] = mouse_btn
|
||||
# rename hotkey actions to "keypress"
|
||||
for alias in ["hotkey", "key", "press", "key_press"]:
|
||||
if action.get("type", "") == alias:
|
||||
action["type"] = "keypress"
|
||||
# assume click actions
|
||||
if "button" in action and "type" not in action:
|
||||
action["type"] = "click"
|
||||
if "click" in action and "type" not in action:
|
||||
action["type"] = "click"
|
||||
if ("scroll_x" in action or "scroll_y" in action) and "type" not in action:
|
||||
action["type"] = "scroll"
|
||||
if "text" in action and "type" not in action:
|
||||
action["type"] = "type"
|
||||
|
||||
action_type = action.get("type")
|
||||
def _keep_keys(action: Dict[str, Any], keys_to_keep: List[str]):
|
||||
"""Keep only the provided keys on action; delete everything else.
|
||||
Always ensures required 'type' is present if listed in keys_to_keep.
|
||||
"""
|
||||
for key in list(action.keys()):
|
||||
if key not in keys_to_keep:
|
||||
del action[key]
|
||||
# rename "coordinate" to "x", "y"
|
||||
if "coordinate" in action:
|
||||
action["x"] = action["coordinate"][0]
|
||||
action["y"] = action["coordinate"][1]
|
||||
del action["coordinate"]
|
||||
if action_type == "click":
|
||||
# convert "click" to "button"
|
||||
if "button" not in action and "click" in action:
|
||||
action["button"] = action["click"]
|
||||
del action["click"]
|
||||
# default button to "left"
|
||||
action["button"] = action.get("button", "left")
|
||||
# add default scroll x, y if missing
|
||||
if action_type == "scroll":
|
||||
action["scroll_x"] = action.get("scroll_x", 0)
|
||||
action["scroll_y"] = action.get("scroll_y", 0)
|
||||
# ensure keys arg is a list (normalize aliases first)
|
||||
if action_type == "keypress":
|
||||
keys = action.get("keys")
|
||||
for keys_alias in ["keypress", "key", "press", "key_press", "text"]:
|
||||
if keys_alias in action:
|
||||
action["keys"] = action[keys_alias]
|
||||
del action[keys_alias]
|
||||
keys = action.get("keys")
|
||||
if isinstance(keys, str):
|
||||
action["keys"] = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys]
|
||||
required_keys_by_type = {
|
||||
# OpenAI actions
|
||||
"click": ["type", "button", "x", "y"],
|
||||
"double_click": ["type", "x", "y"],
|
||||
"drag": ["type", "path"],
|
||||
"keypress": ["type", "keys"],
|
||||
"move": ["type", "x", "y"],
|
||||
"screenshot": ["type"],
|
||||
"scroll": ["type", "scroll_x", "scroll_y", "x", "y"],
|
||||
"type": ["type", "text"],
|
||||
"wait": ["type"],
|
||||
# Anthropic actions
|
||||
"left_mouse_down": ["type", "x", "y"],
|
||||
"left_mouse_up": ["type", "x", "y"],
|
||||
"triple_click": ["type", "button", "x", "y"],
|
||||
}
|
||||
keep = required_keys_by_type.get(action_type or "")
|
||||
if keep:
|
||||
_keep_keys(action, keep)
|
||||
|
||||
|
||||
# Second pass: if an assistant message is immediately followed by a computer_call,
|
||||
# replace the assistant message itself with a reasoning message with summary text.
|
||||
if isinstance(output, list):
|
||||
for i, item in enumerate(output):
|
||||
# AssistantMessage shape: { type: 'message', role: 'assistant', content: OutputContent[] }
|
||||
if item.get("type") == "message" and item.get("role") == "assistant":
|
||||
next_idx = i + 1
|
||||
if next_idx >= len(output):
|
||||
continue
|
||||
next_item = output[next_idx]
|
||||
if not isinstance(next_item, dict):
|
||||
continue
|
||||
if next_item.get("type") != "computer_call":
|
||||
continue
|
||||
contents = item.get("content") or []
|
||||
# Extract text from OutputContent[]
|
||||
text_parts: List[str] = []
|
||||
if isinstance(contents, list):
|
||||
for c in contents:
|
||||
if isinstance(c, dict) and c.get("type") == "output_text" and isinstance(c.get("text"), str):
|
||||
text_parts.append(c["text"])
|
||||
text_content = "\n".join(text_parts).strip()
|
||||
# Replace assistant message with reasoning message
|
||||
output[i] = {
|
||||
"type": "reasoning",
|
||||
"summary": [
|
||||
{
|
||||
"type": "summary_text",
|
||||
"text": text_content,
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
return output
|
||||
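A minimal usage sketch (the input dict is hypothetical; the import path is assumed from this file's location in the diff) showing what the normalizer does to a malformed Anthropic-style click:

```python
import asyncio

from agent.callbacks.operator_validator import OperatorNormalizerCallback

cb = OperatorNormalizerCallback()
malformed = [{
    "type": "computer_call",
    "call_id": "call_1",  # hypothetical id
    "action": {"type": "left_click", "coordinate": [100, 200]},
}]
fixed = asyncio.run(cb.on_llm_end(malformed))
# action is now {"type": "click", "button": "left", "x": 100, "y": 200}
print(fixed[0]["action"])
```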
@@ -94,6 +94,10 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
        # format: turn_000/0000_name.json
        artifact_filename = f"{self.current_artifact:04d}_{name}"
        artifact_path = turn_dir / f"{artifact_filename}.json"
+       # add created_at
+       if isinstance(artifact, dict):
+           artifact = artifact.copy()
+           artifact["created_at"] = str(uuid.uuid1().time)
        with open(artifact_path, "w") as f:
            json.dump(sanitize_image_urls(artifact), f, indent=2)
        self.current_artifact += 1
@@ -171,7 +175,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
            "status": "completed",
            "completed_at": str(uuid.uuid1().time),
            "total_usage": self.total_usage,
-           "new_items": sanitize_image_urls(new_items),
+           "new_items": new_items,
            "total_turns": self.current_turn
        })
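Note that `uuid.uuid1().time` is not a Unix timestamp: it counts 100-nanosecond intervals since the UUID epoch (1582-10-15). A sketch for anyone post-processing saved trajectories and wanting wall-clock times:

```python
import datetime
import uuid

t = uuid.uuid1().time  # 100-ns ticks since 1582-10-15
UUID_UNIX_EPOCH_DELTA = 0x01B21DD213814000  # ticks between 1582-10-15 and 1970-01-01
unix_seconds = (t - UUID_UNIX_EPOCH_DELTA) / 1e7
print(datetime.datetime.fromtimestamp(unix_seconds, tz=datetime.timezone.utc))
```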
@@ -1,77 +1,228 @@
-"""HUD integration for ComputerAgent."""
-
-import logging
-from typing import Any, Optional, Dict
-from hud import run_job as hud_run_job
-
-from .agent import ComputerAgent
-from .adapter import ComputerAgentAdapter
-from .computer_handler import HUDComputerHandler
-
-
-async def run_job(
-    model: str,
-    task_or_taskset: Any,
-    job_name: str,
-    # Job kwargs
-    auto_reply_question: bool = False,
-    adapter_cls: Any = None,
-    adapter_kwargs: Optional[Dict[str, Any]] = None,
-    max_steps_per_task: int = 20,
-    run_parallel: bool = True,
-    job_metadata: Optional[Dict[str, Any]] = None,
-    show_progress: bool = True,
-    max_concurrent_env_creations: Optional[int] = 30,  # Limits gym.make calls
-    max_concurrent_agent_predictions: Optional[int] = None,  # No limit on LLM calls
-    max_concurrent_tasks: Optional[int] = 30,  # Limits overall task concurrency
-    **agent_kwargs: Any
-) -> Any:
-    """
-    Run a job using ComputerAgent with the specified model.
-
-    Args:
-        model: Model string for ComputerAgent (e.g., "anthropic/claude-3-5-sonnet-20241022")
-        task_or_taskset: Task or TaskSet to run
-        job_name: Name for the job
-        auto_reply_question: Whether to auto-reply to questions
-        adapter_cls: Custom adapter class (defaults to ComputerAgentAdapter)
-        adapter_kwargs: Additional kwargs for the adapter
-        max_steps_per_task: Maximum steps per task
-        run_parallel: Whether to run tasks in parallel
-        job_metadata: Additional metadata for the job
-        show_progress: Whether to show progress
-        max_concurrent_env_creations: Max concurrent environment creations
-        max_concurrent_agent_predictions: Max concurrent agent predictions
-        max_concurrent_tasks: Max concurrent tasks
-        **agent_kwargs: Additional kwargs to pass to ComputerAgent
-
-    Returns:
-        Job instance from HUD
-    """
-    # combine verbose and verbosity kwargs
-    if "verbose" in agent_kwargs:
-        agent_kwargs["verbosity"] = logging.INFO
-        del agent_kwargs["verbose"]
-    verbose = True if agent_kwargs.get("verbosity", logging.WARNING) > logging.INFO else False
-
-    # run job
-    return await hud_run_job(
-        agent_cls=ComputerAgent,
-        agent_kwargs={"model": model, **agent_kwargs},
-        task_or_taskset=task_or_taskset,
-        job_name=job_name,
-        auto_reply_question=auto_reply_question,
-        adapter_cls=adapter_cls,
-        adapter_kwargs=adapter_kwargs,
-        max_steps_per_task=max_steps_per_task,
-        run_parallel=run_parallel,
-        job_metadata=job_metadata,
-        show_progress=show_progress,
-        verbose=verbose,
-        max_concurrent_env_creations=max_concurrent_env_creations,
-        max_concurrent_agent_predictions=max_concurrent_agent_predictions,
-        max_concurrent_tasks=max_concurrent_tasks
-    )
-
-
-__all__ = ["ComputerAgent", "ComputerAgentAdapter", "HUDComputerHandler", "run_job"]
+"""HUD integration: Generic HuggingFace dataset evaluation runner (CUA proxy).
+
+This module exposes two helpers to evaluate HUD-compatible datasets using
+HUD's OperatorAgent, while proxying model calls through our ComputerAgent via
+`FakeAsyncOpenAI` (see `agent/integrations/hud/agent.py`).
+
+Exports:
+- run_single_task(dataset_name, *, agent_type="cua-proxy", model=None, allowed_tools=None)
+- run_full_dataset(dataset_name, *, agent_type="cua-proxy", model=None, allowed_tools=None, max_concurrent=30, max_steps=50)
+"""
+import time
+from typing import Any, Optional
+
+from PIL import Image
+from datasets import load_dataset, Dataset
+from hud.agents import OperatorAgent
+from hud.datasets import Task, run_dataset
+from hud.tools.computer.settings import computer_settings
+from hud import trace
+
+from agent.agent import ComputerAgent as BaseComputerAgent
+from .proxy import FakeAsyncOpenAI
+
+
+# ---------------------------------------------------------------------------
+# Proxy OperatorAgent
+# ---------------------------------------------------------------------------
+
+
+class ProxyOperatorAgent(OperatorAgent):
+    """OperatorAgent that proxies model calls through our ComputerAgent.
+
+    Accepts the same config keys we pass via hud.run_dataset `agent_config`:
+    - model: str | None
+    - allowed_tools: list[str] | None
+    Additional kwargs are forwarded to OperatorAgent (if any are supported).
+    """
+
+    def __init__(
+        self,
+        *,
+        model: str | None = None,
+        allowed_tools: list[str] | None = None,
+        trajectory_dir: str | None = None,
+        # === ComputerAgent kwargs ===
+        tools: list[Any] | None = None,
+        custom_loop: Any | None = None,
+        only_n_most_recent_images: int | None = None,
+        callbacks: list[Any] | None = None,
+        verbosity: int | None = None,
+        max_retries: int | None = 3,
+        screenshot_delay: float | int = 0.5,
+        use_prompt_caching: bool | None = False,
+        max_trajectory_budget: float | dict | None = None,
+        telemetry_enabled: bool | None = True,
+        **kwargs: Any,
+    ) -> None:
+        model = model or "computer-use-preview"
+        allowed_tools = allowed_tools or ["openai_computer"]
+
+        computer_shim = {
+            'screenshot': lambda: Image.new('RGB', (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)),
+            'environment': 'linux',
+            'dimensions': (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)
+        }
+        # Build tools ensuring the computer_shim is included
+        agent_tools: list[Any] = [computer_shim]
+        if tools:
+            agent_tools.extend(tools)
+
+        computer_agent = BaseComputerAgent(
+            model=model,
+            tools=agent_tools,
+            custom_loop=custom_loop,
+            only_n_most_recent_images=only_n_most_recent_images,
+            callbacks=callbacks,
+            verbosity=verbosity,
+            trajectory_dir=trajectory_dir,
+            max_retries=max_retries,
+            screenshot_delay=screenshot_delay,
+            use_prompt_caching=use_prompt_caching,
+            max_trajectory_budget=max_trajectory_budget,
+            telemetry_enabled=telemetry_enabled,
+        )
+        model_client = FakeAsyncOpenAI(computer_agent)
+
+        super().__init__(
+            model_client=model_client,  # type: ignore[arg-type]
+            model=model,
+            allowed_tools=allowed_tools,
+            **kwargs,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Single-task runner
+# ---------------------------------------------------------------------------
+
+
+async def run_single_task(
+    dataset: str | Dataset | list[dict[str, Any]],
+    *,
+    task_id: int = 0,
+    model: str | None = None,
+    allowed_tools: list[str] | None = None,
+    # === ComputerAgent kwargs ===
+    tools: list[Any] | None = None,
+    custom_loop: Any | None = None,
+    only_n_most_recent_images: int | None = None,
+    callbacks: list[Any] | None = None,
+    verbosity: int | None = None,
+    trajectory_dir: str | None = None,
+    max_retries: int | None = 3,
+    screenshot_delay: float | int = 0.5,
+    use_prompt_caching: bool | None = False,
+    max_trajectory_budget: float | dict | None = None,
+    telemetry_enabled: bool | None = True,
+) -> None:
+    """Load one task from the dataset and execute it with Operator+CUA proxy."""
+
+    # Load dataset and pick a sample
+    if isinstance(dataset, str):
+        dataset = load_dataset(dataset, split="train")  # type: ignore[arg-type]
+    elif isinstance(dataset, list):
+        dataset = dataset
+    else:
+        dataset = dataset["train"]
+
+    sample_task = dataset[task_id]  # type: ignore[index]
+    task_prompt = sample_task.get("prompt", f"Task {sample_task.get('id', 0)}")  # type: ignore[attr-defined]
+
+    with trace(name=task_prompt):
+        task = Task(**sample_task)  # type: ignore[arg-type]
+
+        agent = ProxyOperatorAgent(
+            model=model,
+            allowed_tools=allowed_tools,
+            # === ComputerAgent kwargs passthrough ===
+            tools=tools,
+            custom_loop=custom_loop,
+            only_n_most_recent_images=only_n_most_recent_images,
+            callbacks=callbacks,
+            verbosity=verbosity,
+            trajectory_dir=trajectory_dir,
+            max_retries=max_retries,
+            screenshot_delay=screenshot_delay,
+            use_prompt_caching=use_prompt_caching,
+            max_trajectory_budget=max_trajectory_budget,
+            telemetry_enabled=telemetry_enabled,
+        )
+        print(f"Running: {task_prompt}")
+        result = await agent.run(task, max_steps=10)
+        print(f"✅ Reward: {getattr(result, 'reward')}")
+
+
+# ---------------------------------------------------------------------------
+# Full-dataset runner
+# ---------------------------------------------------------------------------
+
+
+async def run_full_dataset(
+    dataset: str | Dataset | list[dict[str, Any]],
+    *,
+    job_name: Optional[str] = None,
+    model: str | None = None,
+    allowed_tools: list[str] | None = None,
+    max_concurrent: int = 30,
+    max_steps: int = 50,
+    split: str = "train",
+    trajectory_dir: str | None = None,
+    # === ComputerAgent kwargs ===
+    tools: list[Any] | None = None,
+    custom_loop: Any | None = None,
+    only_n_most_recent_images: int | None = 5,
+    callbacks: list[Any] | None = None,
+    verbosity: int | None = None,
+    max_retries: int | None = 3,
+    screenshot_delay: float | int = 0.5,
+    use_prompt_caching: bool | None = False,
+    max_trajectory_budget: float | dict | None = None,
+    telemetry_enabled: bool | None = True,
+) -> list[Any]:
+    """Run evaluation across the entire dataset using hud.datasets.run_dataset."""
+
+    # We pass OperatorAgent as the class and provide a config that injects our
+    # FakeAsyncOpenAI per agent instantiation.
+
+    if isinstance(dataset, str):
+        dataset_name = dataset.split('/')[-1]
+        job_name = job_name or f"Evaluation {dataset_name}"
+        dataset = load_dataset(dataset, split=split)  # type: ignore[arg-type]
+    else:
+        dataset_name = "custom"
+        job_name = job_name or f"Evaluation {time.strftime('%H:%M %Y-%m-%d')}"
+
+    # Execute evaluation
+    return await run_dataset(
+        name=job_name,
+        dataset=dataset,
+        agent_class=ProxyOperatorAgent,
+        agent_config={
+            "model": model,
+            "allowed_tools": allowed_tools,
+            "trajectory_dir": trajectory_dir,
+            # === ComputerAgent kwargs passthrough ===
+            "tools": tools,
+            "custom_loop": custom_loop,
+            "only_n_most_recent_images": only_n_most_recent_images,
+            "callbacks": callbacks,
+            "verbosity": verbosity,
+            "max_retries": max_retries,
+            "screenshot_delay": screenshot_delay,
+            "use_prompt_caching": use_prompt_caching,
+            "max_trajectory_budget": max_trajectory_budget,
+            "telemetry_enabled": telemetry_enabled,
+        },
+        max_concurrent=max_concurrent,
+        metadata={"dataset": dataset_name},
+        max_steps=max_steps,
+        auto_respond=True,
+    )
+
+
+__all__ = [
+    "run_single_task",
+    "run_full_dataset",
+    "ProxyOperatorAgent",
+]
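For orientation, a minimal sketch of driving `ProxyOperatorAgent` directly, mirroring the body of `run_single_task` above (run inside an async context; `sample_task` is assumed to be a dict row from a HUD dataset):

```python
task = Task(**sample_task)
agent = ProxyOperatorAgent(model="openai/computer-use-preview")
result = await agent.run(task, max_steps=10)
print(getattr(result, "reward", None))
```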
@@ -1,121 +0,0 @@
"""HUD Adapter for ComputerAgent integration."""

from __future__ import annotations

from typing import Any, ClassVar

from hud.adapters.common import CLA, Adapter
from hud.adapters.common.types import (
    CLAButton,
    CLAKey,
    ClickAction,
    CustomAction,
    DragAction,
    MoveAction,
    Point,
    PressAction,
    ResponseAction,
    ScreenshotFetch,
    ScrollAction,
    TypeAction,
    WaitAction,
)


class ComputerAgentAdapter(Adapter):
    """Adapter for ComputerAgent to work with HUD."""

    KEY_MAP: ClassVar[dict[str, CLAKey]] = {
        "return": "enter",
        "arrowup": "up",
        "arrowdown": "down",
        "arrowleft": "left",
        "arrowright": "right",
        "cmd": "ctrl",
        "super": "win",
        "meta": "win",
    }

    BUTTON_MAP: ClassVar[dict[str, CLAButton]] = {
        "wheel": "middle",
        "middle": "middle",
    }

    def __init__(self) -> None:
        super().__init__()
        # ComputerAgent default dimensions (can be overridden)
        self.agent_width = 1024
        self.agent_height = 768

    def _map_key(self, key: str) -> CLAKey:
        """Map a key to its standardized form."""
        return self.KEY_MAP.get(key.lower(), key.lower())  # type: ignore

    def convert(self, data: Any) -> CLA:
        """Convert a ComputerAgent action to a HUD action."""
        try:
            action_type = data.get("type")

            if action_type == "click":
                x, y = data.get("x", 0), data.get("y", 0)
                button = data.get("button", "left")
                button = self.BUTTON_MAP.get(button, button)
                if button is None:
                    button = "left"
                converted_action = ClickAction(point=Point(x=x, y=y), button=button)

            elif action_type == "double_click":
                x, y = data.get("x", 0), data.get("y", 0)
                converted_action = ClickAction(point=Point(x=x, y=y), button="left", pattern=[100])

            elif action_type == "scroll":
                x, y = int(data.get("x", 0)), int(data.get("y", 0))
                scroll_x = int(data.get("scroll_x", 0))
                scroll_y = int(data.get("scroll_y", 0))
                converted_action = ScrollAction(
                    point=Point(x=x, y=y), scroll=Point(x=scroll_x, y=scroll_y)
                )

            elif action_type == "type":
                text = data.get("text", "")
                converted_action = TypeAction(text=text, enter_after=False)

            elif action_type == "wait":
                ms = data.get("ms", 1000)
                converted_action = WaitAction(time=ms)

            elif action_type == "move":
                x, y = data.get("x", 0), data.get("y", 0)
                converted_action = MoveAction(point=Point(x=x, y=y))

            elif action_type == "keypress":
                keys = data.get("keys", [])
                if isinstance(keys, str):
                    keys = [keys]
                converted_action = PressAction(keys=[self._map_key(k) for k in keys])

            elif action_type == "drag":
                path = data.get("path", [])
                points = [Point(x=p.get("x", 0), y=p.get("y", 0)) for p in path]
                converted_action = DragAction(path=points)

            elif action_type == "screenshot":
                converted_action = ScreenshotFetch()

            elif action_type == "response":
                converted_action = ResponseAction(text=data.get("text", ""))

            elif action_type == "custom":
                converted_action = CustomAction(action=data.get("action", ""))

            else:
                raise ValueError(f"Unsupported action type: {action_type}")

            # Add reasoning and logs if available
            converted_action.reasoning = data.get("reasoning", "")
            converted_action.logs = data.get("logs", "")

            return converted_action

        except Exception as e:
            raise ValueError(f"Invalid action: {data}. Error: {e!s}") from e
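A quick sketch of the (now-removed) adapter in use, converting a ComputerAgent-style dict into a HUD CLA action; the input dict is illustrative:

```python
adapter = ComputerAgentAdapter()
cla = adapter.convert({"type": "click", "x": 10, "y": 20, "button": "wheel"})
# BUTTON_MAP translates "wheel" to "middle", so this yields
# ClickAction(point=Point(x=10, y=20), button="middle")
```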
@@ -1,373 +0,0 @@
"""HUD ComputerAgent wrapper for OSWorld benchmarking."""

import logging
from typing import Any, Literal, Optional, Union, List, Dict
import asyncio

from agent import ComputerAgent as BaseComputerAgent
from agent.responses import make_failed_tool_call_items
from hud.adapters import Adapter
from hud.agent.base import Agent
from hud.utils.common import Observation
from hud.adapters.common.types import LogType
from hud.types import Gym

from .adapter import ComputerAgentAdapter
from .computer_handler import HUDComputerHandler

logger = logging.getLogger(__name__)

BASE_SYSTEM_PROMPT = """
You are an autonomous computer-using agent. Follow these guidelines:

1. Be decisive and complete tasks without asking for confirmation unless absolutely necessary.
2. Use the computer tools to complete the task and do not stop until the task is complete.
3. Do NOT ask questions like "Should I proceed?" or "Would you like me to continue?" - just proceed with the task.
4. When you find what you're looking for (e.g., a file to upload), proceed with the action directly.
5. Only stop when the task is fully complete or if you encounter an error that prevents completion.
6. Trust that the user wants you to complete the entire task they've requested.
7. You must say "Task completed" when the task is complete.

Remember: You have been given permission to complete the requested task autonomously.
""".strip()


class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
    """
    A ComputerAgent wrapper for HUD integration.

    This agent wraps the base ComputerAgent to work with HUD environments,
    providing the same interface as OperatorAgent but using ComputerAgent internally.
    """

    transfer_gyms: dict[Gym, Gym] = {"qa": "hud-browser"}

    def __init__(
        self,
        model: str = "anthropic/claude-3-5-sonnet-20241022",
        environment: Literal["windows", "mac", "linux", "browser"] = "linux",
        adapter: Optional[Adapter] = None,
        name: Optional[str] = None,
        **kwargs: Any,
    ):
        """
        Initialize the ComputerAgent for HUD.

        Args:
            model: The model string for ComputerAgent (e.g., "anthropic/claude-3-5-sonnet-20241022")
            environment: The environment type (windows, mac, linux, browser)
            adapter: The adapter to use for preprocessing and postprocessing
            name: The name of the agent
            **kwargs: Additional arguments passed to ComputerAgent
        """
        # Create adapter if not provided
        adapter = adapter or ComputerAgentAdapter()

        if name is None:
            name = f"computeragent-{model.split('/')[-1]}"

        # Initialize the base Agent class without client (we'll create it later)
        super().__init__(client=None, adapter=adapter, name=name)

        self.model = model
        self.environment = environment
        self.kwargs = kwargs

        # Default dimensions
        self.width = 1024
        self.height = 768

        # Update dimensions if adapter is provided
        if self.adapter:
            self.width = self.adapter.agent_width
            self.height = self.adapter.agent_height

        # Create HUD computer handler
        self.hud_computer = HUDComputerHandler(
            environment=environment,
            dimensions=(self.width, self.height)
        )

        # Handle trajectory_dir by adding TrajectorySaverCallback
        trajectory_dir = kwargs.pop("trajectory_dir", None)
        callbacks = kwargs.get("callbacks", [])

        if trajectory_dir:
            from agent.callbacks.trajectory_saver import TrajectorySaverCallback
            trajectory_callback = TrajectorySaverCallback(trajectory_dir, reset_on_run=False)
            callbacks = callbacks + [trajectory_callback]
            kwargs["callbacks"] = callbacks

        # Initialize ComputerAgent with HUD computer handler
        self.computer_agent = BaseComputerAgent(
            model=model,
            tools=[self.hud_computer],
            **kwargs
        )

        # Set the client to the computer_agent for compatibility
        self.client = self.computer_agent

        # State tracking
        self.conversation_history: List[Dict[str, Any]] = []
        self.initial_prompt: Optional[str] = None

        # System prompt for computer use tasks
        self.base_system_prompt = BASE_SYSTEM_PROMPT

    async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
        """
        Fetch a response from ComputerAgent based on the observation.

        Args:
            observation: The preprocessed observation, attributes:
                screenshot: Base64 encoded PNG string of the screen
                text: Text observation, if available

        Returns:
            tuple[list[dict[str, Any]], bool]: A tuple containing the list of raw actions
            and a boolean indicating whether the agent believes the task is complete.
        """
        try:
            # Update the computer handler with the current screenshot
            if observation.screenshot:
                self.hud_computer.update_screenshot(observation.screenshot)

            # Set up action callback to capture actions
            captured_actions = []
            action_done = False

            async def action_callback(action: Dict[str, Any]) -> None:
                """Callback to capture actions from ComputerAgent."""
                nonlocal captured_actions, action_done
                captured_actions.append(action)

            # Set the action callback
            self.hud_computer.set_action_callback(action_callback)

            # Prepare the message for ComputerAgent
            if not self.conversation_history:
                # First interaction - use the observation text as initial prompt
                if observation.text:
                    self.initial_prompt = observation.text
                    message = f"{self.base_system_prompt}\n\nTask: {observation.text}"
                else:
                    message = f"{self.base_system_prompt}\n\nPlease analyze the current screen and determine what action to take."

                input_content = [
                    {"type": "input_text", "text": message}
                ]

                # Add screenshot if present
                if observation.screenshot:
                    input_content.append(
                        {
                            "type": "input_image",
                            "image_url": f"data:image/png;base64,{observation.screenshot}",
                        }
                    )

                self.conversation_history.append({"role": "user", "content": input_content})
            else:
                # Subsequent interactions - check if last action was computer_call
                # If so, add computer_call_output with screenshot instead of user message
                last_computer_calls = []
                for msg in reversed(self.conversation_history):
                    if msg.get("type") == "computer_call":
                        call_id = msg.get("call_id")
                        if call_id:
                            # Check if this call_id already has a computer_call_output
                            has_output = any(
                                m.get("type") == "computer_call_output" and m.get("call_id") == call_id
                                for m in self.conversation_history
                            )
                            if not has_output:
                                last_computer_calls.append(call_id)

                if last_computer_calls:
                    if not observation.screenshot:
                        print("No screenshot found, taking screenshot")
                    screenshot_b64 = await self.hud_computer.screenshot()
                    # Add computer_call_output for each unresponded computer_call
                    for call_id in reversed(last_computer_calls):  # Maintain order
                        self.conversation_history.append({
                            "type": "computer_call_output",
                            "call_id": call_id,
                            "output": {
                                "type": "input_image",
                                "image_url": f"data:image/png;base64,{screenshot_b64}"
                            }
                        })
                else:
                    # No computer_call found, add regular user message
                    message = "Continue with the task based on the current screen state."
                    input_content = [
                        {"type": "input_text", "text": message}
                    ]

                    # Add screenshot if present
                    if observation.screenshot:
                        input_content.append(
                            {
                                "type": "input_image",
                                "image_url": f"data:image/png;base64,{observation.screenshot}",
                            }
                        )

                    self.conversation_history.append({"role": "user", "content": input_content})

            # If the last message is a reasoning message, change it to output_text
            if (self.conversation_history and
                    self.conversation_history[-1].get("type") == "reasoning" and
                    self.conversation_history[-1].get("summary")):

                reasoning_msg = self.conversation_history[-1]
                summary_texts = []

                # Extract all summary_text entries
                for summary_item in reasoning_msg["summary"]:
                    if summary_item.get("type") == "summary_text":
                        summary_texts.append(summary_item.get("text", ""))

                # Convert to message format with output_text
                if summary_texts:
                    converted_message = {
                        "type": "message",
                        "role": "assistant",
                        "content": [
                            {
                                "text": " ".join(summary_texts),
                                "type": "output_text"
                            }
                        ]
                    }

                    # Replace the reasoning message with the converted message
                    self.conversation_history[-1] = converted_message

            # Run ComputerAgent
            try:
                new_items = []

                # ComputerAgent.run returns an async generator
                try:
                    async for result in self.computer_agent.run(self.conversation_history, stream=False):
                        # if the result has computer_call_output, immediately exit
                        if result.get("output", []) and result.get("output", [])[-1].get("type") == "computer_call_output":
                            break
                        # otherwise add agent output to conversation history
                        new_items += result["output"]
                except Exception as e:
                    # if the last message is reasoning, change it to output_text
                    if new_items and new_items[-1].get("type") == "reasoning":
                        new_items[-1] = {
                            "type": "message",
                            "role": "assistant",
                            "content": [
                                {
                                    "text": new_items[-1].get("summary", [{}])[0].get("text", ""),
                                    "type": "output_text"
                                }
                            ]
                        }
                    # Check if there are any computer_call items in new_items
                    computer_calls = [item for item in new_items if item.get("type") == "computer_call"]
                    if computer_calls:
                        # Remove computer_call items from new_items
                        new_items = [item for item in new_items if item.get("type") != "computer_call"]

                        # Add failed tool call items for each computer call
                        for computer_call in computer_calls:
                            tool_input = computer_call.get("action", {})
                            call_id = computer_call.get("call_id")
                            new_items.extend(make_failed_tool_call_items(
                                tool_name="computer",
                                tool_kwargs=tool_input,
                                error_message=repr(e),
                                call_id=call_id
                            ))
                    else:
                        # add error message to conversation history (fallback for non-computer-call errors)
                        new_items.append({
                            "type": "user",
                            "content": [
                                {
                                    "type": "input_text",
                                    "text": f"Error during previous attempted action: {repr(e)}"
                                }
                            ]
                        })

                # Check if we captured any actions
                if captured_actions:
                    # Extract reasoning from the conversation history
                    reasoning = ""
                    # Look for the latest reasoning message
                    for msg in reversed(new_items):
                        if msg.get("type") == "reasoning" and msg.get("summary"):
                            reasoning = " ".join([s.get("text", "") for s in msg["summary"] if s.get("type") == "summary_text"])
                            break
                        elif msg.get("type") == "message" and msg.get("role") == "assistant":
                            content = msg.get("content", [])
                            if isinstance(content, list):
                                reasoning = " ".join([c.get("text", "") for c in content if c.get("type") == "output_text"])
                            break

                    # update conversation history
                    self.conversation_history += new_items

                    # Add reasoning and logs to each action
                    for action in captured_actions:
                        action["reasoning"] = reasoning
                        action["logs"] = {"conversation_length": len(self.conversation_history)}

                    return captured_actions, False

                # Check if the last message is "Task completed"
                response_text = ""
                for msg in reversed(new_items):
                    if msg.get("type") == "message" and msg.get("role") == "assistant":
                        content = msg.get("content", [])
                        for c in content:
                            if c.get("type") == "output_text":
                                response_text = c.get("text", response_text)
                                break
                        break

                done = "task completed" in response_text.lower()

                # update conversation history
                self.conversation_history += new_items

                response_action = {
                    "type": "response",
                    "text": response_text,
                    "reasoning": response_text,
                    "logs": {"conversation_length": len(self.conversation_history)}
                }

                # Check if this indicates task completion or failure
                if "task is infeasible" in response_text.lower():
                    response_action = {"type": "custom", "action": "FAIL"}
                    done = True

                return [response_action], done
            except Exception as e:
                logger.error(f"Error running ComputerAgent: {e}")
                # Return an error response
                error_action = {
                    "type": "response",
                    "text": f"Error occurred: {str(e)}",
                    "reasoning": f"ComputerAgent encountered an error: {str(e)}",
                    "logs": {"error": str(e)}
                }
                return [error_action], True

        except Exception as e:
            logger.error(f"Error in fetch_response: {e}")
            error_action = {
                "type": "response",
                "text": f"Error in agent processing: {str(e)}",
                "reasoning": f"Agent processing error: {str(e)}",
                "logs": {"error": str(e)}
            }
            return [error_action], True
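For readers following the conversation bookkeeping in the removed `fetch_response`: every pending `computer_call` is paired with a `computer_call_output` carrying a screenshot before the next model turn. A sketch of the two message shapes involved (field values illustrative):

```python
computer_call = {
    "type": "computer_call",
    "call_id": "call_abc",  # illustrative id
    "action": {"type": "click", "button": "left", "x": 100, "y": 200},
}
computer_call_output = {
    "type": "computer_call_output",
    "call_id": "call_abc",  # must match the pending call
    "output": {"type": "input_image", "image_url": "data:image/png;base64,..."},
}
```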
@@ -1,187 +0,0 @@
"""HUD Computer Handler for ComputerAgent integration."""

import base64
from io import BytesIO
from typing import Literal, Optional, Any, Dict, Callable
from PIL import Image

from agent.computers import AsyncComputerHandler


class HUDComputerHandler(AsyncComputerHandler):
    """Computer handler that interfaces with HUD environment."""

    def __init__(
        self,
        environment: Literal["windows", "mac", "linux", "browser"] = "linux",
        dimensions: tuple[int, int] = (1024, 768),
        screenshot_callback: Optional[Callable] = None,
        action_callback: Optional[Callable] = None,
    ):
        """
        Initialize HUD computer handler.

        Args:
            environment: The environment type for HUD
            dimensions: Screen dimensions as (width, height)
            screenshot_callback: Optional callback to get screenshots from HUD environment
            action_callback: Optional callback to execute actions in HUD environment
        """
        super().__init__()
        self._environment = environment
        self._dimensions = dimensions
        self._screenshot_callback = screenshot_callback
        self._action_callback = action_callback

        # Store the last screenshot for reuse
        self._last_screenshot: Optional[str] = None

    def set_screenshot_callback(self, callback: Callable) -> None:
        """Set the screenshot callback."""
        self._screenshot_callback = callback

    def set_action_callback(self, callback: Callable) -> None:
        """Set the action callback."""
        self._action_callback = callback

    def update_screenshot(self, screenshot: str) -> None:
        """Update the stored screenshot (base64 string)."""
        self._last_screenshot = screenshot

    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
        """Get the current environment type."""
        return self._environment  # type: ignore

    async def get_dimensions(self) -> tuple[int, int]:
        """Get screen dimensions as (width, height)."""
        return self._dimensions

    async def screenshot(self) -> str:
        """Take a screenshot and return as base64 string."""
        if self._screenshot_callback:
            screenshot = await self._screenshot_callback()
            if isinstance(screenshot, str):
                self._last_screenshot = screenshot
                return screenshot
            elif isinstance(screenshot, Image.Image):
                # Convert PIL Image to base64
                buffer = BytesIO()
                screenshot.save(buffer, format="PNG")
                screenshot_b64 = base64.b64encode(buffer.getvalue()).decode()
                self._last_screenshot = screenshot_b64
                return screenshot_b64
            elif isinstance(screenshot, bytes):
                screenshot_b64 = base64.b64encode(screenshot).decode()
                self._last_screenshot = screenshot_b64
                return screenshot_b64

        # Return last screenshot if available, otherwise create a blank one
        if self._last_screenshot:
            return self._last_screenshot

        # Create a blank screenshot as fallback
        blank_image = Image.new('RGB', self._dimensions, color='white')
        buffer = BytesIO()
        blank_image.save(buffer, format="PNG")
        screenshot_b64 = base64.b64encode(buffer.getvalue()).decode()
        self._last_screenshot = screenshot_b64
        return screenshot_b64

    async def click(self, x: int, y: int, button: str = "left") -> None:
        """Click at coordinates with specified button."""
        if self._action_callback:
            await self._action_callback({
                "type": "click",
                "x": x,
                "y": y,
                "button": button
            })

    async def double_click(self, x: int, y: int) -> None:
        """Double click at coordinates."""
        if self._action_callback:
            await self._action_callback({
                "type": "double_click",
                "x": x,
                "y": y
            })

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Scroll at coordinates with specified scroll amounts."""
        if self._action_callback:
            await self._action_callback({
                "type": "scroll",
                "x": x,
                "y": y,
                "scroll_x": scroll_x,
                "scroll_y": scroll_y
            })

    async def type(self, text: str) -> None:
        """Type text."""
        if self._action_callback:
            await self._action_callback({
                "type": "type",
                "text": text
            })

    async def wait(self, ms: int = 1000) -> None:
        """Wait for specified milliseconds."""
        if self._action_callback:
            await self._action_callback({
                "type": "wait",
                "ms": ms
            })

    async def move(self, x: int, y: int) -> None:
        """Move cursor to coordinates."""
        if self._action_callback:
            await self._action_callback({
                "type": "move",
                "x": x,
                "y": y
            })

    async def keypress(self, keys: list[str] | str) -> None:
        """Press key combination."""
        if isinstance(keys, str):
            keys = [keys]
        if self._action_callback:
            await self._action_callback({
                "type": "keypress",
                "keys": keys
            })

    async def drag(self, path: list[dict[str, int]]) -> None:
        """Drag along a path of points."""
        if self._action_callback:
            await self._action_callback({
                "type": "drag",
                "path": path
            })

    async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse down at coordinates."""
        if self._action_callback:
            await self._action_callback({
                "type": "left_mouse_down",
                "x": x,
                "y": y
            })

    async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse up at coordinates."""
        if self._action_callback:
            await self._action_callback({
                "type": "left_mouse_up",
                "x": x,
                "y": y
            })

    async def get_current_url(self) -> str:
        """Get the current URL."""
        if self._action_callback:
            return await self._action_callback({
                "type": "get_current_url"
            })
        return ""
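A minimal sketch of wiring an action callback into the (now-removed) handler; the callback just logs the dicts the handler emits:

```python
import asyncio

async def log_action(action: dict) -> None:
    # e.g. {"type": "click", "x": 100, "y": 200, "button": "left"}
    print(action)

handler = HUDComputerHandler()
handler.set_action_callback(log_action)
asyncio.run(handler.click(100, 200))
```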
libs/python/agent/agent/integrations/hud/proxy.py (new file, 183 lines)
@@ -0,0 +1,183 @@
"""HUD ComputerAgent wrapper and Fake AsyncOpenAI client.
|
||||
|
||||
Provides FakeAsyncOpenAI that adapts our ComputerAgent to the OpenAI Responses
|
||||
interface needed by HUD's OperatorAgent. It implements only `responses.create`
|
||||
and returns an OpenAI Response object with `id` and `output` fields, where `output` is a list of
|
||||
OpenAI-like response blocks. We intentionally only support a single-step call
|
||||
by consuming the first yielded result from `ComputerAgent.run()`.
|
||||
"""
|
||||
|
||||
import traceback
|
||||
import time
|
||||
import uuid
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from agent.agent import ComputerAgent as BaseComputerAgent
|
||||
|
||||
# OpenAI Responses typed models (required)
|
||||
from openai.types.responses import (
|
||||
Response,
|
||||
ResponseInputParam,
|
||||
ResponseOutputItem,
|
||||
ResponseComputerToolCall,
|
||||
ResponseOutputMessage,
|
||||
ResponseOutputText,
|
||||
ResponseReasoningItem,
|
||||
ResponseUsage,
|
||||
)
|
||||
|
||||
def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> List[ResponseOutputItem]:
|
||||
"""Map our agent output items to OpenAI ResponseOutputItem typed models.
|
||||
|
||||
Only a subset is supported: computer_call, assistant message (text), and reasoning.
|
||||
Unknown types are ignored.
|
||||
"""
|
||||
blocks: List[ResponseOutputItem] = []
|
||||
for item in output_items or []:
|
||||
t = item.get("type")
|
||||
if t == "computer_call":
|
||||
comp = ResponseComputerToolCall.model_validate({
|
||||
"id": item.get("id") or f"cu_{uuid.uuid4().hex}",
|
||||
"type": "computer_call",
|
||||
"call_id": item["call_id"],
|
||||
"action": item["action"],
|
||||
"pending_safety_checks": item.get("pending_safety_checks", []),
|
||||
"status": "completed",
|
||||
})
|
||||
blocks.append(comp)
|
||||
# we will exit early here as the responses api only supports a single step
|
||||
break
|
||||
elif t == "message" and item.get("role") == "assistant":
|
||||
content_blocks: List[ResponseOutputText] = []
|
||||
for c in item.get("content", []) or []:
|
||||
content_blocks.append(
|
||||
ResponseOutputText.model_validate({
|
||||
"type": "output_text",
|
||||
"text": c["text"],
|
||||
"annotations": [],
|
||||
})
|
||||
)
|
||||
if content_blocks:
|
||||
msg = ResponseOutputMessage.model_validate({
|
||||
"id": item.get("id") or f"msg_{uuid.uuid4()}",
|
||||
"type": "message",
|
||||
"role": "assistant",
|
||||
"status": "completed",
|
||||
"content": [ct.model_dump() for ct in content_blocks],
|
||||
})
|
||||
blocks.append(msg)
|
||||
elif t == "reasoning":
|
||||
reasoning = ResponseReasoningItem.model_validate({
|
||||
"id": item.get("id") or f"rsn_{uuid.uuid4()}",
|
||||
"type": "reasoning",
|
||||
"summary": item["summary"],
|
||||
})
|
||||
blocks.append(reasoning)
|
||||
# Unhandled types are ignored
|
||||
return blocks
|
||||
|
||||
def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]:
|
||||
out: List[Dict[str, Any]] = []
|
||||
for it in list(items):
|
||||
if hasattr(it, "model_dump"):
|
||||
out.append(it.model_dump()) # type: ignore[attr-defined]
|
||||
elif isinstance(it, dict):
|
||||
out.append(it)
|
||||
else:
|
||||
# Strict: rely on default __dict__ if present
|
||||
out.append(dict(it)) # may raise if not mapping
|
||||
return out
|
||||
|
||||
class FakeAsyncOpenAI:
|
||||
"""Minimal fake OpenAI client with only `responses.create` implemented.
|
||||
|
||||
It uses a provided `ComputerAgent` instance to produce a single-step
|
||||
response compatible with HUD's OperatorAgent loop.
|
||||
"""
|
||||
|
||||
def __init__(self, computer_agent: BaseComputerAgent) -> None:
|
||||
self._agent = computer_agent
|
||||
self.responses = self._Responses(self)
|
||||
|
||||
class _Responses:
|
||||
def __init__(self, parent: "FakeAsyncOpenAI") -> None:
|
||||
# Caches for cross-call context when using previous_response_id
|
||||
self.blocks_cache: Dict[str, ResponseInputParam | ResponseOutputItem] = {}
|
||||
self.context_cache: Dict[str, List[str]] = {}
|
||||
self.agent = parent._agent
|
||||
|
||||
async def create(
|
||||
self,
|
||||
*,
|
||||
model: str,
|
||||
input: ResponseInputParam,
|
||||
tools: Optional[List[Dict[str, Any]]] = None,
|
||||
instructions: Optional[str] = None,
|
||||
previous_response_id: Optional[str] = None,
|
||||
max_retries: int = 5,
|
||||
**_: Any,
|
||||
) -> Any:
|
||||
for attempt in range(max_retries):
|
||||
# Prepend cached blocks from previous_response_id to input
|
||||
full_input = input
|
||||
if previous_response_id is not None:
|
||||
prev_block_ids = self.context_cache[previous_response_id]
|
||||
prev_blocks = [self.blocks_cache[b_id] for b_id in prev_block_ids]
|
||||
full_input = _to_plain_dict_list(prev_blocks + input)
|
||||
|
||||
# Pre-pend instructions message
|
||||
effective_input = full_input
|
||||
if instructions:
|
||||
effective_input = [{
|
||||
"role": "user",
|
||||
"content": instructions,
|
||||
}] + full_input
|
||||
|
||||
# Run a single iteration of the ComputerAgent
|
||||
agent_result: Optional[Dict[str, Any]] = None
|
||||
async for result in self.agent.run(effective_input): # type: ignore[arg-type]
|
||||
agent_result = result
|
||||
break
|
||||
assert agent_result is not None, "Agent failed to produce result"
|
||||
|
||||
output = _map_agent_output_to_openai_blocks(agent_result["output"])
|
||||
usage = agent_result["usage"]
|
||||
|
||||
# Cache conversation context using the last response id
|
||||
block_ids: List[str] = []
|
||||
blocks_to_cache = full_input + output
|
||||
for b in blocks_to_cache:
|
||||
bid = getattr(b, "id", None) or f"tmp-{hash(repr(b))}"
|
||||
self.blocks_cache[bid] = b # type: ignore[assignment]
|
||||
block_ids.append(bid)
|
||||
response_id = agent_result.get("id") or f"fake-{int(time.time()*1000)}"
|
||||
self.context_cache[response_id] = block_ids
|
||||
|
||||
try:
|
||||
return Response.model_validate({
|
||||
"id": response_id,
|
||||
"created_at": time.time(),
|
||||
"object": "response",
|
||||
"model": model,
|
||||
"output": output,
|
||||
"parallel_tool_calls": False,
|
||||
"tool_choice": "auto",
|
||||
"tools": [],
|
||||
"previous_response_id": previous_response_id,
|
||||
"usage": ResponseUsage.model_validate({
|
||||
"input_tokens": usage.get("input_tokens", 0),
|
||||
"output_tokens": usage.get("output_tokens", 0),
|
||||
"total_tokens": usage.get("total_tokens", 0),
|
||||
"input_tokens_details": usage.get("input_tokens_details", { "cached_tokens": 0 }),
|
||||
"output_tokens_details": usage.get("output_tokens_details", { "reasoning_tokens": 0 }),
|
||||
}),
|
||||
})
|
||||
except Exception as e:
|
||||
print(f"Error while validating agent response (attempt {attempt + 1}/{max_retries}): ", e)
|
||||
if attempt == max_retries - 1:
|
||||
print(traceback.format_exc())
|
||||
raise e
|
||||
|
||||
__all__ = [
|
||||
"FakeAsyncOpenAI",
|
||||
]
|
||||
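A hedged usage sketch, not a tested snippet (the `computer_shim` dict mirrors the one built in `ProxyOperatorAgent.__init__` above; run inside an async context):

```python
from PIL import Image

computer_shim = {
    "screenshot": lambda: Image.new("RGB", (1024, 768)),
    "environment": "linux",
    "dimensions": (1024, 768),
}
agent = BaseComputerAgent(model="openai/computer-use-preview", tools=[computer_shim])
client = FakeAsyncOpenAI(agent)
resp = await client.responses.create(
    model="computer-use-preview",
    input=[{"role": "user", "content": "Take a screenshot"}],
)
print(resp.output)  # single-step list of OpenAI-like response blocks
```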
@@ -1530,7 +1530,18 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
                "content": [
                    {
                        "type": "text",
-                       "text": f"You are a UI grounding expert. Look at the image and {instruction}. Output ONLY a click action on the target element. No explanations, confirmations, or additional text."
+                       "text": f"""You are a UI grounding expert. Follow these guidelines:
+
+1. NEVER ask for confirmation. Complete all tasks autonomously.
+2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
+3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
+4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
+5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
+6. The user has already given you permission by running this agent. No further confirmation is needed.
+7. Be decisive and action-oriented. Complete the requested task fully.
+
+Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
+Task: Click {instruction}. Output ONLY a click action on the target element."""
                    },
                    {
                        "type": "image_url",
@@ -48,11 +48,11 @@ GROUNDED_COMPUTER_TOOL_SCHEMA = {
                "get_dimensions",
                "get_environment"
            ],
-           "description": "The action to perform"
+           "description": "The action to perform (required for all actions)"
        },
        "element_description": {
            "type": "string",
-           "description": "Description of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)"
+           "description": "Description of the element to interact with (required for click, double_click, move, scroll actions)"
        },
        "start_element_description": {
            "type": "string",
@@ -67,20 +67,30 @@ GROUNDED_COMPUTER_TOOL_SCHEMA = {
            "description": "The text to type (required for type action)"
        },
        "keys": {
-           "type": "string",
-           "description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')"
+           "type": "array",
+           "items": {
+               "type": "string"
+           },
+           "description": "Key(s) to press (required for keypress action)"
        },
        "button": {
            "type": "string",
-           "description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
+           "enum": [
+               "left",
+               "right",
+               "wheel",
+               "back",
+               "forward"
+           ],
+           "description": "The mouse button to use for click action (required for click and double_click action)",
        },
        "scroll_x": {
            "type": "integer",
-           "description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
+           "description": "Horizontal scroll amount for scroll action (required for scroll action)",
        },
        "scroll_y": {
            "type": "integer",
-           "description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
+           "description": "Vertical scroll amount for scroll action (required for scroll action)",
        },
    },
    "required": [
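To illustrate the `keys` change, a sketch of the same tool-call arguments under each schema (payloads illustrative):

```python
# Before (hud-python 0.2.10 era): keys was a single string
old_call = {"action": "keypress", "keys": "ctrl+c"}

# After (0.4.12): keys is an array of strings
new_call = {"action": "keypress", "keys": ["ctrl", "c"]}
```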
@@ -266,13 +276,15 @@ class ComposedGroundedConfig:
|
||||
grounding_agent = grounding_agent_conf.agent_class()
|
||||
|
||||
for desc in element_descriptions:
|
||||
coords = await grounding_agent.predict_click(
|
||||
model=grounding_model,
|
||||
image_b64=last_image_b64,
|
||||
instruction=desc
|
||||
)
|
||||
if coords:
|
||||
self.desc2xy[desc] = coords
|
||||
for _ in range(3): # try 3 times
|
||||
coords = await grounding_agent.predict_click(
|
||||
model=grounding_model,
|
||||
image_b64=last_image_b64,
|
||||
instruction=desc
|
||||
)
|
||||
if coords:
|
||||
self.desc2xy[desc] = coords
|
||||
break
|
||||
|
||||
# Step 6: Convert computer calls from descriptions back to xy coordinates
|
||||
final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)
|
||||
|
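The change above wraps the single `predict_click` call in a loop of up to three attempts, keeping the first successful prediction. A standalone sketch of that pattern, assuming `predict_click` returns falsy coordinates on failure:

```python
async def predict_with_retries(grounding_agent, model, image_b64, desc, attempts=3):
    """Retry grounding a description until coordinates are found (illustrative helper)."""
    for _ in range(attempts):
        coords = await grounding_agent.predict_click(
            model=model, image_b64=image_b64, instruction=desc
        )
        if coords:
            return coords  # first hit wins
    return None  # description stays unresolved in desc2xy
```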
@@ -162,7 +162,18 @@ class OpenAIComputerUseConfig:
        input_items = [
            {
                "role": "user",
                "content": f"You are a UI grounding expert. Look at the image and {instruction}. Output ONLY a click action on the target element. No explanations, confirmations, or additional text."
                "content": f"""You are a UI grounding expert. Follow these guidelines:

1. NEVER ask for confirmation. Complete all tasks autonomously.
2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
6. The user has already given you permission by running this agent. No further confirmation is needed.
7. Be decisive and action-oriented. Complete the requested task fully.

Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
Task: Click {instruction}. Output ONLY a click action on the target element."""
            },
            {
                "role": "user",
@@ -200,7 +211,7 @@ class OpenAIComputerUseConfig:
            "stream": False,
            "reasoning": {"summary": "concise"},
            "truncation": "auto",
            "max_tokens": 100 # Keep response short for click prediction
            "max_tokens": 200 # Keep response short for click prediction
        }

        # Use liteLLM responses
@@ -217,11 +228,8 @@ class OpenAIComputerUseConfig:
                isinstance(item.get("action"), dict)):

                action = item["action"]
                if action.get("type") == "click":
                    x = action.get("x")
                    y = action.get("y")
                    if x is not None and y is not None:
                        return (int(x), int(y))
                    if action.get("x") is not None and action.get("y") is not None:
                        return (int(action.get("x")), int(action.get("y")))

        return None

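For context, an invented response item that the simplified parser above would accept; only the `action` dict matters to this code path:

```python
item = {"action": {"type": "click", "x": 412, "y": 230}}
# -> returns (412, 230); a non-click action or missing x/y
#    falls through to the final `return None`.
```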
192 libs/python/agent/agent/proxy/examples.py Normal file
@@ -0,0 +1,192 @@
|
||||
"""
|
||||
Example usage of the proxy server and client requests.
|
||||
"""
|
||||
import dotenv
|
||||
dotenv.load_dotenv()
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import aiohttp
|
||||
from typing import Dict, Any
|
||||
|
||||
|
||||
async def test_http_endpoint():
|
||||
"""Test the HTTP /responses endpoint."""
|
||||
|
||||
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
|
||||
assert isinstance(anthropic_api_key, str), "ANTHROPIC_API_KEY environment variable must be set"
|
||||
|
||||
# Example 1: Simple text request
|
||||
simple_request = {
|
||||
"model": "anthropic/claude-3-5-sonnet-20241022",
|
||||
"input": "Tell me a three sentence bedtime story about a unicorn.",
|
||||
"env": {
|
||||
"ANTHROPIC_API_KEY": anthropic_api_key
|
||||
}
|
||||
}
|
||||
|
||||
# Example 2: Multi-modal request with image
|
||||
multimodal_request = {
|
||||
"model": "anthropic/claude-3-5-sonnet-20241022",
|
||||
"input": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "input_text", "text": "what is in this image?"},
|
||||
{
|
||||
"type": "input_image",
|
||||
"image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"env": {
|
||||
"ANTHROPIC_API_KEY": anthropic_api_key
|
||||
}
|
||||
}
|
||||
|
||||
# Example 3: Request with custom agent and computer kwargs
|
||||
custom_request = {
|
||||
"model": "anthropic/claude-3-5-sonnet-20241022",
|
||||
"input": "Take a screenshot and tell me what you see",
|
||||
"env": {
|
||||
"ANTHROPIC_API_KEY": anthropic_api_key
|
||||
}
|
||||
}
|
||||
|
||||
# Test requests
|
||||
base_url = "https://m-linux-96lcxd2c2k.containers.cloud.trycua.com:8443"
|
||||
# base_url = "http://localhost:8000"
|
||||
api_key = os.getenv("CUA_API_KEY")
|
||||
assert isinstance(api_key, str), "CUA_API_KEY environment variable must be set"
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
for i, request_data in enumerate([
|
||||
simple_request,
|
||||
# multimodal_request,
|
||||
custom_request
|
||||
], 1):
|
||||
print(f"\n--- Test {i} ---")
|
||||
print(f"Request: {json.dumps(request_data, indent=2)}")
|
||||
|
||||
try:
|
||||
print(f"Sending request to {base_url}/responses")
|
||||
async with session.post(
|
||||
f"{base_url}/responses",
|
||||
json=request_data,
|
||||
headers={"Content-Type": "application/json", "X-API-Key": api_key}
|
||||
) as response:
|
||||
result = await response.json()
|
||||
print(f"Status: {response.status}")
|
||||
print(f"Response: {json.dumps(result, indent=2)}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
|
||||
|
||||
def curl_examples():
|
||||
"""Print curl command examples."""
|
||||
|
||||
print("=== CURL Examples ===\n")
|
||||
|
||||
print("1. Simple text request:")
|
||||
print("""curl http://localhost:8000/responses \\
|
||||
-H "Content-Type: application/json" \\
|
||||
-d '{
|
||||
"model": "anthropic/claude-3-5-sonnet-20241022",
|
||||
"input": "Tell me a three sentence bedtime story about a unicorn."
|
||||
}'""")
|
||||
|
||||
print("\n2. Multi-modal request with image:")
|
||||
print("""curl http://localhost:8000/responses \\
|
||||
-H "Content-Type: application/json" \\
|
||||
-d '{
|
||||
"model": "anthropic/claude-3-5-sonnet-20241022",
|
||||
"input": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "input_text", "text": "what is in this image?"},
|
||||
{
|
||||
"type": "input_image",
|
||||
"image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}'""")
|
||||
|
||||
print("\n3. Request with custom configuration:")
|
||||
print("""curl http://localhost:8000/responses \\
|
||||
-H "Content-Type: application/json" \\
|
||||
-d '{
|
||||
"model": "anthropic/claude-3-5-sonnet-20241022",
|
||||
"input": "Take a screenshot and tell me what you see",
|
||||
"agent_kwargs": {
|
||||
"save_trajectory": true,
|
||||
"verbosity": 20
|
||||
},
|
||||
"computer_kwargs": {
|
||||
"os_type": "linux",
|
||||
"provider_type": "cloud"
|
||||
}
|
||||
}'""")
|
||||
|
||||
|
||||
async def test_p2p_client():
|
||||
"""Example P2P client using peerjs-python."""
|
||||
try:
|
||||
from peerjs import Peer, PeerOptions, ConnectionEventType
|
||||
from aiortc import RTCConfiguration, RTCIceServer
|
||||
|
||||
# Set up client peer
|
||||
options = PeerOptions(
|
||||
host="0.peerjs.com",
|
||||
port=443,
|
||||
secure=True,
|
||||
config=RTCConfiguration(
|
||||
iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")]
|
||||
)
|
||||
)
|
||||
|
||||
client_peer = Peer(id="test-client", peer_options=options)
|
||||
await client_peer.start()
|
||||
|
||||
# Connect to proxy server
|
||||
connection = client_peer.connect("computer-agent-proxy")
|
||||
|
||||
@connection.on(ConnectionEventType.Open)
|
||||
async def connection_open():
|
||||
print("Connected to proxy server")
|
||||
|
||||
# Send a test request
|
||||
request = {
|
||||
"model": "anthropic/claude-3-5-sonnet-20241022",
|
||||
"input": "Hello from P2P client!"
|
||||
}
|
||||
await connection.send(json.dumps(request))
|
||||
|
||||
@connection.on(ConnectionEventType.Data)
|
||||
async def connection_data(data):
|
||||
print(f"Received response: {data}")
|
||||
await client_peer.destroy()
|
||||
|
||||
# Wait for connection
|
||||
await asyncio.sleep(10)
|
||||
|
||||
except ImportError:
|
||||
print("P2P dependencies not available. Install peerjs-python for P2P testing.")
|
||||
except Exception as e:
|
||||
print(f"P2P test error: {e}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
if len(sys.argv) > 1 and sys.argv[1] == "curl":
|
||||
curl_examples()
|
||||
elif len(sys.argv) > 1 and sys.argv[1] == "p2p":
|
||||
asyncio.run(test_p2p_client())
|
||||
else:
|
||||
asyncio.run(test_http_endpoint())
|
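The `__main__` dispatch above makes the file directly runnable: with no argument it exercises the HTTP endpoint, `curl` prints the curl examples, and `p2p` runs the peer-to-peer client (assuming the module is executed as a script).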
248 libs/python/agent/agent/proxy/handlers.py Normal file
@@ -0,0 +1,248 @@
"""
|
||||
Request handlers for the proxy endpoints.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from contextlib import contextmanager
|
||||
from typing import Dict, Any, List, Union, Optional
|
||||
|
||||
from ..agent import ComputerAgent
|
||||
from computer import Computer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ResponsesHandler:
|
||||
"""Handler for /responses endpoint that processes agent requests."""
|
||||
|
||||
def __init__(self):
|
||||
self.computer = None
|
||||
self.agent = None
|
||||
# Simple in-memory caches
|
||||
self._computer_cache: Dict[str, Any] = {}
|
||||
self._agent_cache: Dict[str, Any] = {}
|
||||
|
||||
async def setup_computer_agent(
|
||||
self,
|
||||
model: str,
|
||||
agent_kwargs: Optional[Dict[str, Any]] = None,
|
||||
computer_kwargs: Optional[Dict[str, Any]] = None,
|
||||
):
|
||||
"""Set up (and cache) computer and agent instances.
|
||||
|
||||
Caching keys:
|
||||
- Computer cache key: computer_kwargs
|
||||
- Agent cache key: {"model": model, **agent_kwargs}
|
||||
"""
|
||||
agent_kwargs = agent_kwargs or {}
|
||||
computer_kwargs = computer_kwargs or {}
|
||||
|
||||
def _stable_key(obj: Dict[str, Any]) -> str:
|
||||
try:
|
||||
return json.dumps(obj, sort_keys=True, separators=(",", ":"))
|
||||
except Exception:
|
||||
# Fallback: stringify non-serializable values
|
||||
safe_obj = {}
|
||||
for k, v in obj.items():
|
||||
try:
|
||||
json.dumps(v)
|
||||
safe_obj[k] = v
|
||||
except Exception:
|
||||
safe_obj[k] = str(v)
|
||||
return json.dumps(safe_obj, sort_keys=True, separators=(",", ":"))
|
||||
|
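        # Illustrative note (editorial comment, not in the original file): sort_keys
        # makes dict insertion order irrelevant, e.g.
        #   _stable_key({"b": 1, "a": 2}) == _stable_key({"a": 2, "b": 1}) == '{"a":2,"b":1}'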
        # Determine if custom tools are supplied; if so, skip computer setup entirely
        has_custom_tools = bool(agent_kwargs.get("tools"))

        computer = None
        if not has_custom_tools:
            # ---------- Computer setup (with cache) ----------
            comp_key = _stable_key(computer_kwargs)

            computer = self._computer_cache.get(comp_key)
            if computer is None:
                # Default computer configuration
                default_c_config = {
                    "os_type": "linux",
                    "provider_type": "cloud",
                    "name": os.getenv("CUA_CONTAINER_NAME"),
                    "api_key": os.getenv("CUA_API_KEY"),
                }
                default_c_config.update(computer_kwargs)
                computer = Computer(**default_c_config)
                await computer.__aenter__()
                self._computer_cache[comp_key] = computer
                logger.info(f"Computer created and cached with key={comp_key} config={default_c_config}")
            else:
                logger.info(f"Reusing cached computer for key={comp_key}")

        # Bind current computer reference (None if custom tools supplied)
        self.computer = computer

        # ---------- Agent setup (with cache) ----------
        # Build agent cache key from {model} + agent_kwargs (excluding tools unless explicitly passed)
        agent_kwargs_for_key = dict(agent_kwargs)
        agent_key_payload = {"model": model, **agent_kwargs_for_key}
        agent_key = _stable_key(agent_key_payload)

        agent = self._agent_cache.get(agent_key)
        if agent is None:
            # Default agent configuration
            default_a_config: Dict[str, Any] = {"model": model}
            if not has_custom_tools:
                default_a_config["tools"] = [computer]
            # Apply user overrides, but keep tools unless user explicitly sets
            if agent_kwargs:
                if not has_custom_tools:
                    agent_kwargs.setdefault("tools", [computer])
                default_a_config.update(agent_kwargs)
            # JSON-derived kwargs may have loose types; ignore static arg typing here
            agent = ComputerAgent(**default_a_config)  # type: ignore[arg-type]
            self._agent_cache[agent_key] = agent
            logger.info(f"Agent created and cached with key={agent_key} model={model}")
        else:
            # Ensure cached agent uses the current computer tool (in case object differs)
            # Only update if tools not explicitly provided in agent_kwargs
            if not has_custom_tools:
                try:
                    agent.tools = [computer]
                except Exception:
                    pass
            logger.info(f"Reusing cached agent for key={agent_key}")

        # Bind current agent reference
        self.agent = agent

    async def process_request(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a /responses request and return the result.

        Args:
            request_data: Dictionary containing model, input, and optional kwargs

        Returns:
            Dictionary with the agent's response
        """
        try:
            # Extract request parameters
            model = request_data.get("model")
            input_data = request_data.get("input")
            agent_kwargs = request_data.get("agent_kwargs", {})
            computer_kwargs = request_data.get("computer_kwargs", {})
            env_overrides = request_data.get("env", {}) or {}

            if not model:
                raise ValueError("Model is required")
            if not input_data:
                raise ValueError("Input is required")

            # Apply env overrides for the duration of this request
            with self._env_overrides(env_overrides):
                # Set up (and possibly reuse) computer and agent via caches
                await self.setup_computer_agent(model, agent_kwargs, computer_kwargs)

                # Defensive: ensure agent is initialized for type checkers
                agent = self.agent
                if agent is None:
                    raise RuntimeError("Agent failed to initialize")

                # Convert input to messages format
                messages = self._convert_input_to_messages(input_data)

                # Run agent and get first result
                async for result in agent.run(messages):
                    # Return the first result and break
                    return {
                        "success": True,
                        "result": result,
                        "model": model
                    }

            # If no results were yielded
            return {
                "success": False,
                "error": "No results from agent",
                "model": model
            }

        except Exception as e:
            logger.error(f"Error processing request: {e}")
            return {
                "success": False,
                "error": str(e),
                "model": request_data.get("model", "unknown")
            }

    def _convert_input_to_messages(self, input_data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
        """Convert input data to messages format."""
        if isinstance(input_data, str):
            # Simple string input
            return [{"role": "user", "content": input_data}]
        elif isinstance(input_data, list):
            # Already in messages format
            messages = []
            for msg in input_data:
                # Convert content array format if needed
                if isinstance(msg.get("content"), list):
                    content_parts = []
                    for part in msg["content"]:
                        if part.get("type") == "input_text":
                            content_parts.append({"type": "text", "text": part["text"]})
                        elif part.get("type") == "input_image":
                            content_parts.append({
                                "type": "image_url",
                                "image_url": {"url": part["image_url"]}
                            })
                        else:
                            content_parts.append(part)
                    messages.append({
                        "role": msg["role"],
                        "content": content_parts
                    })
                else:
                    messages.append(msg)
            return messages
        else:
            raise ValueError("Input must be string or list of messages")

    async def cleanup(self):
        """Clean up resources."""
        if self.computer:
            try:
                await self.computer.__aexit__(None, None, None)
            except Exception as e:
                logger.error(f"Error cleaning up computer: {e}")
            finally:
                self.computer = None
                self.agent = None

    @staticmethod
    @contextmanager
    def _env_overrides(env: Dict[str, str]):
        """Temporarily apply environment variable overrides for the current process.
        Restores previous values after the context exits.

        Args:
            env: Mapping of env var names to override for this request.
        """
        if not env:
            # No-op context
            yield
            return

        original: Dict[str, Optional[str]] = {}
        try:
            for k, v in env.items():
                original[k] = os.environ.get(k)
                os.environ[k] = str(v)
            yield
        finally:
            for k, old in original.items():
                if old is None:
                    # Was not set before
                    os.environ.pop(k, None)
                else:
                    os.environ[k] = old
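A usage sketch for the `_env_overrides` helper above (the import path and `DEMO_KEY` name are assumptions for illustration): overrides are visible only inside the `with` block, and variables that were previously unset are deleted again on exit.

```python
import os
from agent.proxy.handlers import ResponsesHandler  # assumed import path

assert "DEMO_KEY" not in os.environ
with ResponsesHandler._env_overrides({"DEMO_KEY": "abc"}):
    assert os.environ["DEMO_KEY"] == "abc"  # applied for this request only
assert "DEMO_KEY" not in os.environ  # restored (unset) afterwards
```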
@@ -55,7 +55,7 @@ cli = [
    "yaspin>=3.1.0",
]
hud = [
    "hud-python==0.2.10",
    "hud-python>=0.4.12,<0.5.0",
]
all = [
    # omni requirements
@@ -72,7 +72,7 @@ all = [
    # cli requirements
    "yaspin>=3.1.0",
    # hud requirements
    "hud-python==0.2.10",
    "hud-python>=0.4.12,<0.5.0",
]

[tool.uv]
110361 notebooks/eval_osworld.ipynb
File diff suppressed because it is too large