Merge pull request #371 from trycua/chore/hud-upgrade

[Agent] Upgrade HUD SDK to 0.4.12
This commit is contained in:
ddupont
2025-08-28 11:29:18 -04:00
committed by GitHub
17 changed files with 1496 additions and 110729 deletions

View File

@@ -10,37 +10,35 @@ The HUD integration allows you to use ComputerAgent with the [HUD benchmarking f
```bash
pip install "cua-agent[hud]"
## or install hud-python directly
-# pip install hud-python==0.2.10
+# pip install hud-python==0.4.12
```
## Usage
```python
-from agent.integrations.hud import run_job
-from hud import load_taskset
-from hud.taskset import TaskSet
-import logging
-
-# Load taskset
-taskset = await load_taskset("OSWorld-Verified")
-taskset = TaskSet(tasks=taskset[:10])  # limit to 10 tasks instead of all 370
-
-# Run benchmark job
-job = await run_job(
-    model="openai/computer-use-preview",
-    # model="anthropic/claude-3-5-sonnet-20241022",
-    # model="huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-5",
-    task_or_taskset=taskset,
-    job_name="test-computeragent-job",
-    max_concurrent_tasks=5,
-    # add any extra ComputerAgent kwargs:
-    verbosity=logging.INFO,  # Enable logging
-    # trajectory_dir=".."  # Save trajectories locally
-)
-
-# Get results OR view them at app.hud.so
-print(await job.get_analytics())
-print(f"View results at: https://app.hud.so/jobs/{job.id}")
+# Quick single-task smoke test
+from agent.integrations.hud import run_single_task
+
+await run_single_task(
+    dataset="hud-evals/OSWorld-Verified-XLang",  # or another HUD dataset
+    model="openai/computer-use-preview+openai/gpt-5-nano",  # any supported model string
+    task_id=155,  # e.g., reopen last closed tab
+)
+
+# Run a small split of OSWorld-Verified in parallel
+from agent.integrations.hud import run_full_dataset
+
+results = await run_full_dataset(
+    dataset="hud-evals/OSWorld-Verified-XLang",  # can also pass a Dataset or list[dict]
+    model="openai/computer-use-preview",
+    split="train[:3]",  # try a few tasks to start
+    max_concurrent=20,  # tune to your infra
+    max_steps=50  # safety cap per task
+)
+
+# Environment variables required:
+# - HUD_API_KEY (HUD access)
+# - OPENAI_API_KEY or ANTHROPIC_API_KEY depending on your chosen model(s)
```
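If you are running these helpers from a plain script rather than a notebook, a minimal sketch of an entrypoint (dataset and task id mirror the example above; `HUD_API_KEY` and the model provider key are assumed to already be set in the environment):

```python
# sketch: run the single-task smoke test from a script entrypoint
import asyncio

from agent.integrations.hud import run_single_task

async def main() -> None:
    await run_single_task(
        dataset="hud-evals/OSWorld-Verified-XLang",
        model="openai/computer-use-preview",
        task_id=155,
    )

if __name__ == "__main__":
    asyncio.run(main())
```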
**Available Benchmarks:**

View File

@@ -30,6 +30,7 @@ from .callbacks import (
TrajectorySaverCallback,
BudgetManagerCallback,
TelemetryCallback,
OperatorNormalizerCallback
)
from .computers import (
AsyncComputerHandler,
@@ -202,6 +203,9 @@ class ComputerAgent:
# == Add built-in callbacks ==
# Prepend operator normalizer callback
self.callbacks.insert(0, OperatorNormalizerCallback())
# Add telemetry callback if telemetry_enabled is set
if self.telemetry_enabled:
if isinstance(self.telemetry_enabled, bool):

View File

@@ -8,6 +8,7 @@ from .logging import LoggingCallback
from .trajectory_saver import TrajectorySaverCallback
from .budget_manager import BudgetManagerCallback
from .telemetry import TelemetryCallback
from .operator_validator import OperatorNormalizerCallback
__all__ = [
"AsyncCallbackHandler",
@@ -16,4 +17,5 @@ __all__ = [
"TrajectorySaverCallback",
"BudgetManagerCallback",
"TelemetryCallback",
"OperatorNormalizerCallback",
]

View File

@@ -0,0 +1,138 @@
"""
OperatorNormalizerCallback
Ensures agent output actions conform to expected schemas by fixing common issues:
- click: add default button='left' if missing
- keypress: wrap keys string into a list
- etc.
This runs in on_llm_end, which receives the output array (AgentMessage[] as dicts).
The purpose is to avoid spending another LLM call to fix broken computer call syntax when possible.
"""
from __future__ import annotations
from typing import Any, Dict, List
from .base import AsyncCallbackHandler
class OperatorNormalizerCallback(AsyncCallbackHandler):
"""Normalizes common computer call hallucinations / errors in computer call syntax."""
async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
# Mutate in-place as requested, but still return the list for chaining
for item in output or []:
if item.get("type") != "computer_call":
continue
action = item.get("action")
if not isinstance(action, dict):
continue
# rename mouse click actions to "click"
for mouse_btn in ["left", "right", "wheel", "back", "forward"]:
if action.get("type", "") == f"{mouse_btn}_click":
action["type"] = "click"
action["button"] = mouse_btn
# rename hotkey actions to "keypress"
for alias in ["hotkey", "key", "press", "key_press"]:
if action.get("type", "") == alias:
action["type"] = "keypress"
            # infer the action type when it is missing, based on the fields present
if "button" in action and "type" not in action:
action["type"] = "click"
if "click" in action and "type" not in action:
action["type"] = "click"
if ("scroll_x" in action or "scroll_y" in action) and "type" not in action:
action["type"] = "scroll"
if "text" in action and "type" not in action:
action["type"] = "type"
action_type = action.get("type")
def _keep_keys(action: Dict[str, Any], keys_to_keep: List[str]):
"""Keep only the provided keys on action; delete everything else.
            Keys listed in keys_to_keep (including 'type') are left untouched if present.
"""
for key in list(action.keys()):
if key not in keys_to_keep:
del action[key]
# rename "coordinate" to "x", "y"
if "coordinate" in action:
action["x"] = action["coordinate"][0]
action["y"] = action["coordinate"][1]
del action["coordinate"]
if action_type == "click":
# convert "click" to "button"
if "button" not in action and "click" in action:
action["button"] = action["click"]
del action["click"]
# default button to "left"
action["button"] = action.get("button", "left")
# add default scroll x, y if missing
if action_type == "scroll":
action["scroll_x"] = action.get("scroll_x", 0)
action["scroll_y"] = action.get("scroll_y", 0)
# ensure keys arg is a list (normalize aliases first)
if action_type == "keypress":
keys = action.get("keys")
for keys_alias in ["keypress", "key", "press", "key_press", "text"]:
if keys_alias in action:
action["keys"] = action[keys_alias]
del action[keys_alias]
keys = action.get("keys")
if isinstance(keys, str):
action["keys"] = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys]
required_keys_by_type = {
# OpenAI actions
"click": ["type", "button", "x", "y"],
"double_click": ["type", "x", "y"],
"drag": ["type", "path"],
"keypress": ["type", "keys"],
"move": ["type", "x", "y"],
"screenshot": ["type"],
"scroll": ["type", "scroll_x", "scroll_y", "x", "y"],
"type": ["type", "text"],
"wait": ["type"],
# Anthropic actions
"left_mouse_down": ["type", "x", "y"],
"left_mouse_up": ["type", "x", "y"],
"triple_click": ["type", "button", "x", "y"],
}
keep = required_keys_by_type.get(action_type or "")
if keep:
_keep_keys(action, keep)
# Second pass: if an assistant message is immediately followed by a computer_call,
# replace the assistant message itself with a reasoning message with summary text.
if isinstance(output, list):
for i, item in enumerate(output):
# AssistantMessage shape: { type: 'message', role: 'assistant', content: OutputContent[] }
if item.get("type") == "message" and item.get("role") == "assistant":
next_idx = i + 1
if next_idx >= len(output):
continue
next_item = output[next_idx]
if not isinstance(next_item, dict):
continue
if next_item.get("type") != "computer_call":
continue
contents = item.get("content") or []
# Extract text from OutputContent[]
text_parts: List[str] = []
if isinstance(contents, list):
for c in contents:
if isinstance(c, dict) and c.get("type") == "output_text" and isinstance(c.get("text"), str):
text_parts.append(c["text"])
text_content = "\n".join(text_parts).strip()
# Replace assistant message with reasoning message
output[i] = {
"type": "reasoning",
"summary": [
{
"type": "summary_text",
"text": text_content,
}
],
}
return output
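
To make the normalization above concrete, a small sketch of feeding the callback a couple of malformed actions (the payloads are invented for illustration; the callback itself is the one exported from `agent.callbacks` in this PR):

```python
# sketch: what OperatorNormalizerCallback does to malformed computer_call items
import asyncio

from agent.callbacks import OperatorNormalizerCallback

raw_output = [
    {"type": "computer_call", "call_id": "c1",
     "action": {"type": "left_click", "coordinate": [120, 44]}},
    {"type": "computer_call", "call_id": "c2",
     "action": {"type": "hotkey", "keys": "ctrl+c"}},
]

fixed = asyncio.run(OperatorNormalizerCallback().on_llm_end(raw_output))
# c1 action -> {"type": "click", "button": "left", "x": 120, "y": 44}
# c2 action -> {"type": "keypress", "keys": ["ctrl", "c"]}
```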

View File

@@ -94,6 +94,10 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
# format: turn_000/0000_name.json
artifact_filename = f"{self.current_artifact:04d}_{name}"
artifact_path = turn_dir / f"{artifact_filename}.json"
# add created_at
if isinstance(artifact, dict):
artifact = artifact.copy()
artifact["created_at"] = str(uuid.uuid1().time)
with open(artifact_path, "w") as f:
json.dump(sanitize_image_urls(artifact), f, indent=2)
self.current_artifact += 1
@@ -171,7 +175,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
"status": "completed",
"completed_at": str(uuid.uuid1().time),
"total_usage": self.total_usage,
"new_items": sanitize_image_urls(new_items),
"new_items": new_items,
"total_turns": self.current_turn
})
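
For context on the `created_at`/`completed_at` values above: `uuid.uuid1().time` is the UUID version-1 clock, a count of 100-nanosecond ticks since the UUID epoch (1582-10-15). A small sketch of decoding such a value back to a wall-clock time:

```python
# sketch: decode a uuid1 timestamp (100 ns ticks since 1582-10-15) into a datetime
import uuid
from datetime import datetime, timedelta

ticks = uuid.uuid1().time
created_at = datetime(1582, 10, 15) + timedelta(microseconds=ticks / 10)
print(created_at.isoformat())
```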

View File

@@ -1,77 +1,228 @@
"""HUD integration for ComputerAgent."""
"""HUD integration: Generic HuggingFace dataset evaluation runner (CUA proxy).
import logging
from typing import Any, Optional, Dict
from hud import run_job as hud_run_job
This module exposes two helpers to evaluate HUD-compatible datasets using
HUD's OperatorAgent, while proxying model calls through our ComputerAgent via
`FakeAsyncOpenAI` (see `agent/integrations/hud/agent.py`).
from .agent import ComputerAgent
from .adapter import ComputerAgentAdapter
from .computer_handler import HUDComputerHandler
Exports:
- run_single_task(dataset_name, *, agent_type="cua-proxy", model=None, allowed_tools=None)
- run_full_dataset(dataset_name, *, agent_type="cua-proxy", model=None, allowed_tools=None, max_concurrent=30, max_steps=50)
"""
import time
from typing import Any, Optional
from PIL import Image
from datasets import load_dataset, Dataset
from hud.agents import OperatorAgent
from hud.datasets import Task, run_dataset
from hud.tools.computer.settings import computer_settings
from hud import trace
from agent.agent import ComputerAgent as BaseComputerAgent
from .proxy import FakeAsyncOpenAI
async def run_job(
model: str,
task_or_taskset: Any,
job_name: str,
# Job kwargs
auto_reply_question: bool = False,
adapter_cls: Any = None,
adapter_kwargs: Optional[Dict[str, Any]] = None,
max_steps_per_task: int = 20,
run_parallel: bool = True,
job_metadata: Optional[Dict[str, Any]] = None,
show_progress: bool = True,
max_concurrent_env_creations: Optional[int] = 30, # Limits gym.make calls
max_concurrent_agent_predictions: Optional[int] = None, # No limit on LLM calls
max_concurrent_tasks: Optional[int] = 30, # Limits overall task concurrency
**agent_kwargs: Any
) -> Any:
# ---------------------------------------------------------------------------
# Proxy OperatorAgent
# ---------------------------------------------------------------------------
class ProxyOperatorAgent(OperatorAgent):
"""OperatorAgent that proxies model calls through our ComputerAgent.
Accepts the same config keys we pass via hud.run_dataset `agent_config`:
- model: str | None
- allowed_tools: list[str] | None
Additional kwargs are forwarded to OperatorAgent (if any are supported).
"""
Run a job using ComputerAgent with the specified model.
def __init__(
self,
*,
model: str | None = None,
allowed_tools: list[str] | None = None,
trajectory_dir: str | None = None,
# === ComputerAgent kwargs ===
tools: list[Any] | None = None,
custom_loop: Any | None = None,
only_n_most_recent_images: int | None = None,
callbacks: list[Any] | None = None,
verbosity: int | None = None,
max_retries: int | None = 3,
screenshot_delay: float | int = 0.5,
use_prompt_caching: bool | None = False,
max_trajectory_budget: float | dict | None = None,
telemetry_enabled: bool | None = True,
**kwargs: Any,
) -> None:
model = model or "computer-use-preview"
allowed_tools = allowed_tools or ["openai_computer"]
computer_shim = {
'screenshot': lambda: Image.new('RGB', (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)),
'environment': 'linux',
'dimensions': (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)
}
# Build tools ensuring the computer_shim is included
agent_tools: list[Any] = [computer_shim]
if tools:
agent_tools.extend(tools)
computer_agent = BaseComputerAgent(
model=model,
tools=agent_tools,
custom_loop=custom_loop,
only_n_most_recent_images=only_n_most_recent_images,
callbacks=callbacks,
verbosity=verbosity,
trajectory_dir=trajectory_dir,
max_retries=max_retries,
screenshot_delay=screenshot_delay,
use_prompt_caching=use_prompt_caching,
max_trajectory_budget=max_trajectory_budget,
telemetry_enabled=telemetry_enabled,
)
model_client = FakeAsyncOpenAI(computer_agent)
super().__init__(
model_client=model_client, # type: ignore[arg-type]
model=model,
allowed_tools=allowed_tools,
**kwargs,
)
# ---------------------------------------------------------------------------
# Single-task runner
# ---------------------------------------------------------------------------
async def run_single_task(
dataset: str | Dataset | list[dict[str, Any]],
*,
task_id: int = 0,
model: str | None = None,
allowed_tools: list[str] | None = None,
# === ComputerAgent kwargs ===
tools: list[Any] | None = None,
custom_loop: Any | None = None,
only_n_most_recent_images: int | None = None,
callbacks: list[Any] | None = None,
verbosity: int | None = None,
trajectory_dir: str | None = None,
max_retries: int | None = 3,
screenshot_delay: float | int = 0.5,
use_prompt_caching: bool | None = False,
max_trajectory_budget: float | dict | None = None,
telemetry_enabled: bool | None = True,
) -> None:
"""Load one task from the dataset and execute it with Operator+CUA proxy."""
# Load dataset and pick a sample
if isinstance(dataset, str):
dataset = load_dataset(dataset, split="train") # type: ignore[arg-type]
elif isinstance(dataset, list):
dataset = dataset
else:
dataset = dataset["train"]
Args:
model: Model string for ComputerAgent (e.g., "anthropic/claude-3-5-sonnet-20241022")
task_or_taskset: Task or TaskSet to run
job_name: Name for the job
auto_reply_question: Whether to auto-reply to questions
adapter_cls: Custom adapter class (defaults to ComputerAgentAdapter)
adapter_kwargs: Additional kwargs for the adapter
max_steps_per_task: Maximum steps per task
run_parallel: Whether to run tasks in parallel
job_metadata: Additional metadata for the job
show_progress: Whether to show progress
max_concurrent_env_creations: Max concurrent environment creations
max_concurrent_agent_predictions: Max concurrent agent predictions
max_concurrent_tasks: Max concurrent tasks
**agent_kwargs: Additional kwargs to pass to ComputerAgent
Returns:
Job instance from HUD
"""
# combine verbose and verbosity kwargs
if "verbose" in agent_kwargs:
agent_kwargs["verbosity"] = logging.INFO
del agent_kwargs["verbose"]
verbose = True if agent_kwargs.get("verbosity", logging.WARNING) > logging.INFO else False
# run job
return await hud_run_job(
agent_cls=ComputerAgent,
agent_kwargs={"model": model, **agent_kwargs},
task_or_taskset=task_or_taskset,
job_name=job_name,
auto_reply_question=auto_reply_question,
adapter_cls=adapter_cls,
adapter_kwargs=adapter_kwargs,
max_steps_per_task=max_steps_per_task,
run_parallel=run_parallel,
job_metadata=job_metadata,
show_progress=show_progress,
verbose=verbose,
max_concurrent_env_creations=max_concurrent_env_creations,
max_concurrent_agent_predictions=max_concurrent_agent_predictions,
max_concurrent_tasks=max_concurrent_tasks
sample_task = dataset[task_id] # type: ignore[index]
task_prompt = sample_task.get("prompt", f"Task {sample_task.get('id', 0)}") # type: ignore[attr-defined]
with trace(name=task_prompt):
task = Task(**sample_task) # type: ignore[arg-type]
agent = ProxyOperatorAgent(
model=model,
allowed_tools=allowed_tools,
# === ComputerAgent kwargs passthrough ===
tools=tools,
custom_loop=custom_loop,
only_n_most_recent_images=only_n_most_recent_images,
callbacks=callbacks,
verbosity=verbosity,
trajectory_dir=trajectory_dir,
max_retries=max_retries,
screenshot_delay=screenshot_delay,
use_prompt_caching=use_prompt_caching,
max_trajectory_budget=max_trajectory_budget,
telemetry_enabled=telemetry_enabled,
)
print(f"Running: {task_prompt}")
result = await agent.run(task, max_steps=10)
print(f"✅ Reward: {getattr(result, 'reward')}")
# ---------------------------------------------------------------------------
# Full-dataset runner
# ---------------------------------------------------------------------------
async def run_full_dataset(
dataset: str | Dataset | list[dict[str, Any]],
*,
job_name: Optional[str] = None,
model: str | None = None,
allowed_tools: list[str] | None = None,
max_concurrent: int = 30,
max_steps: int = 50,
split: str = "train",
trajectory_dir: str | None = None,
# === ComputerAgent kwargs ===
tools: list[Any] | None = None,
custom_loop: Any | None = None,
only_n_most_recent_images: int | None = 5,
callbacks: list[Any] | None = None,
verbosity: int | None = None,
max_retries: int | None = 3,
screenshot_delay: float | int = 0.5,
use_prompt_caching: bool | None = False,
max_trajectory_budget: float | dict | None = None,
telemetry_enabled: bool | None = True,
) -> list[Any]:
"""Run evaluation across the entire dataset using hud.datasets.run_dataset."""
    # We pass ProxyOperatorAgent as the agent class; each instantiation builds a
    # ComputerAgent and injects our FakeAsyncOpenAI as the model client.
if isinstance(dataset, str):
dataset_name = dataset.split('/')[-1]
job_name = job_name or f"Evaluation {dataset_name}"
dataset = load_dataset(dataset, split=split) # type: ignore[arg-type]
else:
dataset_name = "custom"
job_name = job_name or f"Evaluation {time.strftime('%H:%M %Y-%m-%d')}"
# Execute evaluation
return await run_dataset(
name=job_name,
dataset=dataset,
agent_class=ProxyOperatorAgent,
agent_config={
"model": model,
"allowed_tools": allowed_tools,
"trajectory_dir": trajectory_dir,
# === ComputerAgent kwargs passthrough ===
"tools": tools,
"custom_loop": custom_loop,
"only_n_most_recent_images": only_n_most_recent_images,
"callbacks": callbacks,
"verbosity": verbosity,
"max_retries": max_retries,
"screenshot_delay": screenshot_delay,
"use_prompt_caching": use_prompt_caching,
"max_trajectory_budget": max_trajectory_budget,
"telemetry_enabled": telemetry_enabled,
},
max_concurrent=max_concurrent,
metadata={"dataset": dataset_name},
max_steps=max_steps,
auto_respond=True,
)
__all__ = ["ComputerAgent", "ComputerAgentAdapter", "HUDComputerHandler", "run_job"]
__all__ = [
"run_single_task",
"run_full_dataset",
"ProxyOperatorAgent",
]
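
As a usage sketch of the kwargs passthrough above (parameter names are taken from the `run_full_dataset` signature in this file; the values are illustrative only):

```python
# sketch: run a small parallel evaluation while saving local trajectories
import asyncio

from agent.integrations.hud import run_full_dataset

results = asyncio.run(run_full_dataset(
    dataset="hud-evals/OSWorld-Verified-XLang",
    model="openai/computer-use-preview",
    split="train[:5]",
    max_concurrent=5,
    max_steps=30,
    trajectory_dir="trajectories",      # forwarded to ComputerAgent
    only_n_most_recent_images=5,        # trim screenshot history per request
))
print(len(results), "task results")
```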

View File

@@ -1,121 +0,0 @@
"""HUD Adapter for ComputerAgent integration."""
from __future__ import annotations
from typing import Any, ClassVar
from hud.adapters.common import CLA, Adapter
from hud.adapters.common.types import (
CLAButton,
CLAKey,
ClickAction,
CustomAction,
DragAction,
MoveAction,
Point,
PressAction,
ResponseAction,
ScreenshotFetch,
ScrollAction,
TypeAction,
WaitAction,
)
class ComputerAgentAdapter(Adapter):
"""Adapter for ComputerAgent to work with HUD."""
KEY_MAP: ClassVar[dict[str, CLAKey]] = {
"return": "enter",
"arrowup": "up",
"arrowdown": "down",
"arrowleft": "left",
"arrowright": "right",
"cmd": "ctrl",
"super": "win",
"meta": "win",
}
BUTTON_MAP: ClassVar[dict[str, CLAButton]] = {
"wheel": "middle",
"middle": "middle",
}
def __init__(self) -> None:
super().__init__()
# ComputerAgent default dimensions (can be overridden)
self.agent_width = 1024
self.agent_height = 768
def _map_key(self, key: str) -> CLAKey:
"""Map a key to its standardized form."""
return self.KEY_MAP.get(key.lower(), key.lower()) # type: ignore
def convert(self, data: Any) -> CLA:
"""Convert a ComputerAgent action to a HUD action."""
try:
action_type = data.get("type")
if action_type == "click":
x, y = data.get("x", 0), data.get("y", 0)
button = data.get("button", "left")
button = self.BUTTON_MAP.get(button, button)
if button is None:
button = "left"
converted_action = ClickAction(point=Point(x=x, y=y), button=button)
elif action_type == "double_click":
x, y = data.get("x", 0), data.get("y", 0)
converted_action = ClickAction(point=Point(x=x, y=y), button="left", pattern=[100])
elif action_type == "scroll":
x, y = int(data.get("x", 0)), int(data.get("y", 0))
scroll_x = int(data.get("scroll_x", 0))
scroll_y = int(data.get("scroll_y", 0))
converted_action = ScrollAction(
point=Point(x=x, y=y), scroll=Point(x=scroll_x, y=scroll_y)
)
elif action_type == "type":
text = data.get("text", "")
converted_action = TypeAction(text=text, enter_after=False)
elif action_type == "wait":
ms = data.get("ms", 1000)
converted_action = WaitAction(time=ms)
elif action_type == "move":
x, y = data.get("x", 0), data.get("y", 0)
converted_action = MoveAction(point=Point(x=x, y=y))
elif action_type == "keypress":
keys = data.get("keys", [])
if isinstance(keys, str):
keys = [keys]
converted_action = PressAction(keys=[self._map_key(k) for k in keys])
elif action_type == "drag":
path = data.get("path", [])
points = [Point(x=p.get("x", 0), y=p.get("y", 0)) for p in path]
converted_action = DragAction(path=points)
elif action_type == "screenshot":
converted_action = ScreenshotFetch()
elif action_type == "response":
converted_action = ResponseAction(text=data.get("text", ""))
elif action_type == "custom":
converted_action = CustomAction(action=data.get("action", ""))
else:
raise ValueError(f"Unsupported action type: {action_type}")
# Add reasoning and logs if available
converted_action.reasoning = data.get("reasoning", "")
converted_action.logs = data.get("logs", "")
return converted_action
except Exception as e:
raise ValueError(f"Invalid action: {data}. Error: {e!s}") from e

View File

@@ -1,373 +0,0 @@
"""HUD ComputerAgent wrapper for OSWorld benchmarking."""
import logging
from typing import Any, Literal, Optional, Union, List, Dict
import asyncio
from agent import ComputerAgent as BaseComputerAgent
from agent.responses import make_failed_tool_call_items
from hud.adapters import Adapter
from hud.agent.base import Agent
from hud.utils.common import Observation
from hud.adapters.common.types import LogType
from hud.types import Gym
from .adapter import ComputerAgentAdapter
from .computer_handler import HUDComputerHandler
logger = logging.getLogger(__name__)
BASE_SYSTEM_PROMPT = """
You are an autonomous computer-using agent. Follow these guidelines:
1. Be decisive and complete tasks without asking for confirmation unless absolutely necessary.
2. Use the computer tools to complete the task and do not stop until the task is complete.
3. Do NOT ask questions like "Should I proceed?" or "Would you like me to continue?" - just proceed with the task.
4. When you find what you're looking for (e.g., a file to upload), proceed with the action directly.
5. Only stop when the task is fully complete or if you encounter an error that prevents completion.
6. Trust that the user wants you to complete the entire task they've requested.
7. You must say "Task completed" when the task is complete.
Remember: You have been given permission to complete the requested task autonomously.
""".strip()
class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
"""
A ComputerAgent wrapper for HUD integration.
This agent wraps the base ComputerAgent to work with HUD environments,
providing the same interface as OperatorAgent but using ComputerAgent internally.
"""
transfer_gyms: dict[Gym, Gym] = {"qa": "hud-browser"}
def __init__(
self,
model: str = "anthropic/claude-3-5-sonnet-20241022",
environment: Literal["windows", "mac", "linux", "browser"] = "linux",
adapter: Optional[Adapter] = None,
name: Optional[str] = None,
**kwargs: Any,
):
"""
Initialize the ComputerAgent for HUD.
Args:
model: The model string for ComputerAgent (e.g., "anthropic/claude-3-5-sonnet-20241022")
environment: The environment type (windows, mac, linux, browser)
adapter: The adapter to use for preprocessing and postprocessing
name: The name of the agent
**kwargs: Additional arguments passed to ComputerAgent
"""
# Create adapter if not provided
adapter = adapter or ComputerAgentAdapter()
if name is None:
name = f"computeragent-{model.split('/')[-1]}"
# Initialize the base Agent class without client (we'll create it later)
super().__init__(client=None, adapter=adapter, name=name)
self.model = model
self.environment = environment
self.kwargs = kwargs
# Default dimensions
self.width = 1024
self.height = 768
# Update dimensions if adapter is provided
if self.adapter:
self.width = self.adapter.agent_width
self.height = self.adapter.agent_height
# Create HUD computer handler
self.hud_computer = HUDComputerHandler(
environment=environment,
dimensions=(self.width, self.height)
)
# Handle trajectory_dir by adding TrajectorySaverCallback
trajectory_dir = kwargs.pop("trajectory_dir", None)
callbacks = kwargs.get("callbacks", [])
if trajectory_dir:
from agent.callbacks.trajectory_saver import TrajectorySaverCallback
trajectory_callback = TrajectorySaverCallback(trajectory_dir, reset_on_run=False)
callbacks = callbacks + [trajectory_callback]
kwargs["callbacks"] = callbacks
# Initialize ComputerAgent with HUD computer handler
self.computer_agent = BaseComputerAgent(
model=model,
tools=[self.hud_computer],
**kwargs
)
# Set the client to the computer_agent for compatibility
self.client = self.computer_agent
# State tracking
self.conversation_history: List[Dict[str, Any]] = []
self.initial_prompt: Optional[str] = None
# System prompt for computer use tasks
self.base_system_prompt = BASE_SYSTEM_PROMPT
async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
"""
Fetch a response from ComputerAgent based on the observation.
Args:
observation: The preprocessed observation, attributes:
screenshot: Base64 encoded PNG string of the screen
text: Text observation, if available
Returns:
tuple[list[dict[str, Any]], bool, list[LogType] | None]: A tuple containing the list of raw actions,
boolean indicating if the agent believes the task is complete.
"""
try:
# Update the computer handler with the current screenshot
if observation.screenshot:
self.hud_computer.update_screenshot(observation.screenshot)
# Set up action callback to capture actions
captured_actions = []
action_done = False
async def action_callback(action: Dict[str, Any]) -> None:
"""Callback to capture actions from ComputerAgent."""
nonlocal captured_actions, action_done
captured_actions.append(action)
# Set the action callback
self.hud_computer.set_action_callback(action_callback)
# Prepare the message for ComputerAgent
if not self.conversation_history:
# First interaction - use the observation text as initial prompt
if observation.text:
self.initial_prompt = observation.text
message = f"{self.base_system_prompt}\n\nTask: {observation.text}"
else:
message = f"{self.base_system_prompt}\n\nPlease analyze the current screen and determine what action to take."
input_content = [
{"type": "input_text", "text": message}
]
# Add screenshot if present
if observation.screenshot:
input_content.append(
{
"type": "input_image",
"image_url": f"data:image/png;base64,{observation.screenshot}",
}
)
self.conversation_history.append({"role": "user", "content": input_content})
else:
# Subsequent interactions - check if last action was computer_call
# If so, add computer_call_output with screenshot instead of user message
last_computer_calls = []
for msg in reversed(self.conversation_history):
if msg.get("type") == "computer_call":
call_id = msg.get("call_id")
if call_id:
# Check if this call_id already has a computer_call_output
has_output = any(
m.get("type") == "computer_call_output" and m.get("call_id") == call_id
for m in self.conversation_history
)
if not has_output:
last_computer_calls.append(call_id)
if last_computer_calls:
if not observation.screenshot:
print("No screenshot found, taking screenshot")
screenshot_b64 = await self.hud_computer.screenshot()
# Add computer_call_output for each unresponded computer_call
for call_id in reversed(last_computer_calls): # Maintain order
self.conversation_history.append({
"type": "computer_call_output",
"call_id": call_id,
"output": {
"type": "input_image",
"image_url": f"data:image/png;base64,{screenshot_b64}"
}
})
else:
# No computer_call found, add regular user message
message = "Continue with the task based on the current screen state."
input_content = [
{"type": "input_text", "text": message}
]
# Add screenshot if present
if observation.screenshot:
input_content.append(
{
"type": "input_image",
"image_url": f"data:image/png;base64,{observation.screenshot}",
}
)
self.conversation_history.append({"role": "user", "content": input_content})
# If the last message is a reasoning message, change it to output_text
if (self.conversation_history and
self.conversation_history[-1].get("type") == "reasoning" and
self.conversation_history[-1].get("summary")):
reasoning_msg = self.conversation_history[-1]
summary_texts = []
# Extract all summary_text entries
for summary_item in reasoning_msg["summary"]:
if summary_item.get("type") == "summary_text":
summary_texts.append(summary_item.get("text", ""))
# Convert to message format with output_text
if summary_texts:
converted_message = {
"type": "message",
"role": "assistant",
"content": [
{
"text": " ".join(summary_texts),
"type": "output_text"
}
]
}
# Replace the reasoning message with the converted message
self.conversation_history[-1] = converted_message
# Run ComputerAgent
try:
new_items = []
# ComputerAgent.run returns an async generator
try:
async for result in self.computer_agent.run(self.conversation_history, stream=False):
# if the result has computer_call_output, immediately exit
if result.get("output", []) and result.get("output", [])[-1].get("type") == "computer_call_output":
break
# otherwise add agent output to conversation history
new_items += result["output"]
except Exception as e:
# if the last message is reasoning, change it to output_text
if new_items and new_items[-1].get("type") == "reasoning":
new_items[-1] = {
"type": "message",
"role": "assistant",
"content": [
{
"text": new_items[-1].get("summary", [{}])[0].get("text", ""),
"type": "output_text"
}
]
}
# Check if there are any computer_call items in new_items
computer_calls = [item for item in new_items if item.get("type") == "computer_call"]
if computer_calls:
# Remove computer_call items from new_items
new_items = [item for item in new_items if item.get("type") != "computer_call"]
# Add failed tool call items for each computer call
for computer_call in computer_calls:
tool_input = computer_call.get("action", {})
call_id = computer_call.get("call_id")
new_items.extend(make_failed_tool_call_items(
tool_name="computer",
tool_kwargs=tool_input,
error_message=repr(e),
call_id=call_id
))
else:
# add error message to conversation history (fallback for non-computer-call errors)
new_items.append({
"type": "user",
"content": [
{
"type": "input_text",
"text": f"Error during previous attempted action: {repr(e)}"
}
]
})
# Check if we captured any actions
if captured_actions:
# Extract reasoning from the conversation history
reasoning = ""
# Look for the latest reasoning message
for msg in reversed(new_items):
if msg.get("type") == "reasoning" and msg.get("summary"):
reasoning = " ".join([s.get("text", "") for s in msg["summary"] if s.get("type") == "summary_text"])
break
elif msg.get("type") == "message" and msg.get("role") == "assistant":
content = msg.get("content", [])
if isinstance(content, list):
reasoning = " ".join([c.get("text", "") for c in content if c.get("type") == "output_text"])
break
# update conversation history
self.conversation_history += new_items
# Add reasoning and logs to each action
for action in captured_actions:
action["reasoning"] = reasoning
action["logs"] = {"conversation_length": len(self.conversation_history)}
return captured_actions, False
# Check if the last message is "Task completed"
response_text = ""
for msg in reversed(new_items):
if msg.get("type") == "message" and msg.get("role") == "assistant":
content = msg.get("content", [])
for c in content:
if c.get("type") == "output_text":
response_text = c.get("text", response_text)
break
break
done = "task completed" in response_text.lower()
# update conversation history
self.conversation_history += new_items
response_action = {
"type": "response",
"text": response_text,
"reasoning": response_text,
"logs": {"conversation_length": len(self.conversation_history)}
}
# Check if this indicates task completion or failure
if "task is infeasible" in response_text.lower():
response_action = {"type": "custom", "action": "FAIL"}
done = True
return [response_action], done
except Exception as e:
logger.error(f"Error running ComputerAgent: {e}")
# Return an error response
error_action = {
"type": "response",
"text": f"Error occurred: {str(e)}",
"reasoning": f"ComputerAgent encountered an error: {str(e)}",
"logs": {"error": str(e)}
}
return [error_action], True
except Exception as e:
logger.error(f"Error in fetch_response: {e}")
error_action = {
"type": "response",
"text": f"Error in agent processing: {str(e)}",
"reasoning": f"Agent processing error: {str(e)}",
"logs": {"error": str(e)}
}
return [error_action], True

View File

@@ -1,187 +0,0 @@
"""HUD Computer Handler for ComputerAgent integration."""
import base64
from io import BytesIO
from typing import Literal, Optional, Any, Dict, Callable
from PIL import Image
from agent.computers import AsyncComputerHandler
class HUDComputerHandler(AsyncComputerHandler):
"""Computer handler that interfaces with HUD environment."""
def __init__(
self,
environment: Literal["windows", "mac", "linux", "browser"] = "linux",
dimensions: tuple[int, int] = (1024, 768),
screenshot_callback: Optional[Callable] = None,
action_callback: Optional[Callable] = None,
):
"""
Initialize HUD computer handler.
Args:
environment: The environment type for HUD
dimensions: Screen dimensions as (width, height)
screenshot_callback: Optional callback to get screenshots from HUD environment
action_callback: Optional callback to execute actions in HUD environment
"""
super().__init__()
self._environment = environment
self._dimensions = dimensions
self._screenshot_callback = screenshot_callback
self._action_callback = action_callback
# Store the last screenshot for reuse
self._last_screenshot: Optional[str] = None
def set_screenshot_callback(self, callback: Callable) -> None:
"""Set the screenshot callback."""
self._screenshot_callback = callback
def set_action_callback(self, callback: Callable) -> None:
"""Set the action callback."""
self._action_callback = callback
def update_screenshot(self, screenshot: str) -> None:
"""Update the stored screenshot (base64 string)."""
self._last_screenshot = screenshot
async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
"""Get the current environment type."""
return self._environment # type: ignore
async def get_dimensions(self) -> tuple[int, int]:
"""Get screen dimensions as (width, height)."""
return self._dimensions
async def screenshot(self) -> str:
"""Take a screenshot and return as base64 string."""
if self._screenshot_callback:
screenshot = await self._screenshot_callback()
if isinstance(screenshot, str):
self._last_screenshot = screenshot
return screenshot
elif isinstance(screenshot, Image.Image):
# Convert PIL Image to base64
buffer = BytesIO()
screenshot.save(buffer, format="PNG")
screenshot_b64 = base64.b64encode(buffer.getvalue()).decode()
self._last_screenshot = screenshot_b64
return screenshot_b64
elif isinstance(screenshot, bytes):
screenshot_b64 = base64.b64encode(screenshot).decode()
self._last_screenshot = screenshot_b64
return screenshot_b64
# Return last screenshot if available, otherwise create a blank one
if self._last_screenshot:
return self._last_screenshot
# Create a blank screenshot as fallback
blank_image = Image.new('RGB', self._dimensions, color='white')
buffer = BytesIO()
blank_image.save(buffer, format="PNG")
screenshot_b64 = base64.b64encode(buffer.getvalue()).decode()
self._last_screenshot = screenshot_b64
return screenshot_b64
async def click(self, x: int, y: int, button: str = "left") -> None:
"""Click at coordinates with specified button."""
if self._action_callback:
await self._action_callback({
"type": "click",
"x": x,
"y": y,
"button": button
})
async def double_click(self, x: int, y: int) -> None:
"""Double click at coordinates."""
if self._action_callback:
await self._action_callback({
"type": "double_click",
"x": x,
"y": y
})
async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
"""Scroll at coordinates with specified scroll amounts."""
if self._action_callback:
await self._action_callback({
"type": "scroll",
"x": x,
"y": y,
"scroll_x": scroll_x,
"scroll_y": scroll_y
})
async def type(self, text: str) -> None:
"""Type text."""
if self._action_callback:
await self._action_callback({
"type": "type",
"text": text
})
async def wait(self, ms: int = 1000) -> None:
"""Wait for specified milliseconds."""
if self._action_callback:
await self._action_callback({
"type": "wait",
"ms": ms
})
async def move(self, x: int, y: int) -> None:
"""Move cursor to coordinates."""
if self._action_callback:
await self._action_callback({
"type": "move",
"x": x,
"y": y
})
async def keypress(self, keys: list[str] | str) -> None:
"""Press key combination."""
if isinstance(keys, str):
keys = [keys]
if self._action_callback:
await self._action_callback({
"type": "keypress",
"keys": keys
})
async def drag(self, path: list[dict[str, int]]) -> None:
"""Drag along a path of points."""
if self._action_callback:
await self._action_callback({
"type": "drag",
"path": path
})
async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Left mouse down at coordinates."""
if self._action_callback:
await self._action_callback({
"type": "left_mouse_down",
"x": x,
"y": y
})
async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Left mouse up at coordinates."""
if self._action_callback:
await self._action_callback({
"type": "left_mouse_up",
"x": x,
"y": y
})
async def get_current_url(self) -> str:
"""Get the current URL."""
if self._action_callback:
return await self._action_callback({
"type": "get_current_url"
})
return ""

View File

@@ -0,0 +1,183 @@
"""HUD ComputerAgent wrapper and Fake AsyncOpenAI client.
Provides FakeAsyncOpenAI that adapts our ComputerAgent to the OpenAI Responses
interface needed by HUD's OperatorAgent. It implements only `responses.create`
and returns an OpenAI Response object with `id` and `output` fields, where `output` is a list of
OpenAI-like response blocks. We intentionally only support a single-step call
by consuming the first yielded result from `ComputerAgent.run()`.
"""
import traceback
import time
import uuid
from typing import Any, Dict, List, Optional
from agent.agent import ComputerAgent as BaseComputerAgent
# OpenAI Responses typed models (required)
from openai.types.responses import (
Response,
ResponseInputParam,
ResponseOutputItem,
ResponseComputerToolCall,
ResponseOutputMessage,
ResponseOutputText,
ResponseReasoningItem,
ResponseUsage,
)
def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> List[ResponseOutputItem]:
"""Map our agent output items to OpenAI ResponseOutputItem typed models.
Only a subset is supported: computer_call, assistant message (text), and reasoning.
Unknown types are ignored.
"""
blocks: List[ResponseOutputItem] = []
for item in output_items or []:
t = item.get("type")
if t == "computer_call":
comp = ResponseComputerToolCall.model_validate({
"id": item.get("id") or f"cu_{uuid.uuid4().hex}",
"type": "computer_call",
"call_id": item["call_id"],
"action": item["action"],
"pending_safety_checks": item.get("pending_safety_checks", []),
"status": "completed",
})
blocks.append(comp)
# we will exit early here as the responses api only supports a single step
break
elif t == "message" and item.get("role") == "assistant":
content_blocks: List[ResponseOutputText] = []
for c in item.get("content", []) or []:
content_blocks.append(
ResponseOutputText.model_validate({
"type": "output_text",
"text": c["text"],
"annotations": [],
})
)
if content_blocks:
msg = ResponseOutputMessage.model_validate({
"id": item.get("id") or f"msg_{uuid.uuid4()}",
"type": "message",
"role": "assistant",
"status": "completed",
"content": [ct.model_dump() for ct in content_blocks],
})
blocks.append(msg)
elif t == "reasoning":
reasoning = ResponseReasoningItem.model_validate({
"id": item.get("id") or f"rsn_{uuid.uuid4()}",
"type": "reasoning",
"summary": item["summary"],
})
blocks.append(reasoning)
# Unhandled types are ignored
return blocks
def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]:
out: List[Dict[str, Any]] = []
for it in list(items):
if hasattr(it, "model_dump"):
out.append(it.model_dump()) # type: ignore[attr-defined]
elif isinstance(it, dict):
out.append(it)
else:
# Strict: rely on default __dict__ if present
out.append(dict(it)) # may raise if not mapping
return out
class FakeAsyncOpenAI:
"""Minimal fake OpenAI client with only `responses.create` implemented.
It uses a provided `ComputerAgent` instance to produce a single-step
response compatible with HUD's OperatorAgent loop.
"""
def __init__(self, computer_agent: BaseComputerAgent) -> None:
self._agent = computer_agent
self.responses = self._Responses(self)
class _Responses:
def __init__(self, parent: "FakeAsyncOpenAI") -> None:
# Caches for cross-call context when using previous_response_id
self.blocks_cache: Dict[str, ResponseInputParam | ResponseOutputItem] = {}
self.context_cache: Dict[str, List[str]] = {}
self.agent = parent._agent
async def create(
self,
*,
model: str,
input: ResponseInputParam,
tools: Optional[List[Dict[str, Any]]] = None,
instructions: Optional[str] = None,
previous_response_id: Optional[str] = None,
max_retries: int = 5,
**_: Any,
) -> Any:
for attempt in range(max_retries):
# Prepend cached blocks from previous_response_id to input
full_input = input
if previous_response_id is not None:
prev_block_ids = self.context_cache[previous_response_id]
prev_blocks = [self.blocks_cache[b_id] for b_id in prev_block_ids]
full_input = _to_plain_dict_list(prev_blocks + input)
            # Prepend instructions message
effective_input = full_input
if instructions:
effective_input = [{
"role": "user",
"content": instructions,
}] + full_input
# Run a single iteration of the ComputerAgent
agent_result: Optional[Dict[str, Any]] = None
async for result in self.agent.run(effective_input): # type: ignore[arg-type]
agent_result = result
break
assert agent_result is not None, "Agent failed to produce result"
output = _map_agent_output_to_openai_blocks(agent_result["output"])
usage = agent_result["usage"]
# Cache conversation context using the last response id
block_ids: List[str] = []
blocks_to_cache = full_input + output
for b in blocks_to_cache:
bid = getattr(b, "id", None) or f"tmp-{hash(repr(b))}"
self.blocks_cache[bid] = b # type: ignore[assignment]
block_ids.append(bid)
response_id = agent_result.get("id") or f"fake-{int(time.time()*1000)}"
self.context_cache[response_id] = block_ids
try:
return Response.model_validate({
"id": response_id,
"created_at": time.time(),
"object": "response",
"model": model,
"output": output,
"parallel_tool_calls": False,
"tool_choice": "auto",
"tools": [],
"previous_response_id": previous_response_id,
"usage": ResponseUsage.model_validate({
"input_tokens": usage.get("input_tokens", 0),
"output_tokens": usage.get("output_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
"input_tokens_details": usage.get("input_tokens_details", { "cached_tokens": 0 }),
"output_tokens_details": usage.get("output_tokens_details", { "reasoning_tokens": 0 }),
}),
})
except Exception as e:
print(f"Error while validating agent response (attempt {attempt + 1}/{max_retries}): ", e)
if attempt == max_retries - 1:
print(traceback.format_exc())
raise e
__all__ = [
"FakeAsyncOpenAI",
]
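
A sketch of exercising the fake client directly, outside HUD's OperatorAgent loop. It reuses the same screenshot shim that `ProxyOperatorAgent` builds above; the message shape passed to `input` is assumed to match what OperatorAgent sends, and an `OPENAI_API_KEY` is assumed to be set:

```python
# sketch: one single-step call through FakeAsyncOpenAI
import asyncio

from PIL import Image
from hud.tools.computer.settings import computer_settings

from agent.agent import ComputerAgent
from agent.integrations.hud.proxy import FakeAsyncOpenAI

async def main() -> None:
    shim = {
        "screenshot": lambda: Image.new("RGB", (computer_settings.OPENAI_COMPUTER_WIDTH,
                                                computer_settings.OPENAI_COMPUTER_HEIGHT)),
        "environment": "linux",
        "dimensions": (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT),
    }
    agent = ComputerAgent(model="openai/computer-use-preview", tools=[shim])
    client = FakeAsyncOpenAI(agent)
    resp = await client.responses.create(
        model="computer-use-preview",
        input=[{"role": "user", "content": "Open the settings app"}],
    )
    print(resp.id, [block.type for block in resp.output])

asyncio.run(main())
```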

View File

@@ -1530,7 +1530,18 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
"content": [
{
"type": "text",
"text": f"You are a UI grounding expert. Look at the image and {instruction}. Output ONLY a click action on the target element. No explanations, confirmations, or additional text."
"text": f"""You are a UI grounding expert. Follow these guidelines:
1. NEVER ask for confirmation. Complete all tasks autonomously.
2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
6. The user has already given you permission by running this agent. No further confirmation is needed.
7. Be decisive and action-oriented. Complete the requested task fully.
Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
Task: Click {instruction}. Output ONLY a click action on the target element."""
},
{
"type": "image_url",

View File

@@ -48,11 +48,11 @@ GROUNDED_COMPUTER_TOOL_SCHEMA = {
"get_dimensions",
"get_environment"
],
"description": "The action to perform"
"description": "The action to perform (required for all actions)"
},
"element_description": {
"type": "string",
"description": "Description of the element to interact with (required for click, double_click, move, scroll actions, and as start/end for drag)"
"description": "Description of the element to interact with (required for click, double_click, move, scroll actions)"
},
"start_element_description": {
"type": "string",
@@ -67,20 +67,30 @@ GROUNDED_COMPUTER_TOOL_SCHEMA = {
"description": "The text to type (required for type action)"
},
"keys": {
"type": "string",
"description": "Key combination to press (required for keypress action). Single key for individual key press, multiple keys for combinations (e.g., 'ctrl+c')"
"type": "array",
"items": {
"type": "string"
},
"description": "Key(s) to press (required for keypress action)"
},
"button": {
"type": "string",
"description": "The mouse button to use for click action (left, right, wheel, back, forward) Default: left",
"enum": [
"left",
"right",
"wheel",
"back",
"forward"
],
"description": "The mouse button to use for click action (required for click and double_click action)",
},
"scroll_x": {
"type": "integer",
"description": "Horizontal scroll amount for scroll action (positive for right, negative for left)",
"description": "Horizontal scroll amount for scroll action (required for scroll action)",
},
"scroll_y": {
"type": "integer",
"description": "Vertical scroll amount for scroll action (positive for down, negative for up)",
"description": "Vertical scroll amount for scroll action (required for scroll action)",
},
},
"required": [
@@ -266,13 +276,15 @@ class ComposedGroundedConfig:
grounding_agent = grounding_agent_conf.agent_class()
for desc in element_descriptions:
coords = await grounding_agent.predict_click(
model=grounding_model,
image_b64=last_image_b64,
instruction=desc
)
if coords:
self.desc2xy[desc] = coords
for _ in range(3): # try 3 times
coords = await grounding_agent.predict_click(
model=grounding_model,
image_b64=last_image_b64,
instruction=desc
)
if coords:
self.desc2xy[desc] = coords
break
# Step 6: Convert computer calls from descriptions back to xy coordinates
final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)

View File

@@ -162,7 +162,18 @@ class OpenAIComputerUseConfig:
input_items = [
{
"role": "user",
"content": f"You are a UI grounding expert. Look at the image and {instruction}. Output ONLY a click action on the target element. No explanations, confirmations, or additional text."
"content": f"""You are a UI grounding expert. Follow these guidelines:
1. NEVER ask for confirmation. Complete all tasks autonomously.
2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
6. The user has already given you permission by running this agent. No further confirmation is needed.
7. Be decisive and action-oriented. Complete the requested task fully.
Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
Task: Click {instruction}. Output ONLY a click action on the target element."""
},
{
"role": "user",
@@ -200,7 +211,7 @@ class OpenAIComputerUseConfig:
"stream": False,
"reasoning": {"summary": "concise"},
"truncation": "auto",
"max_tokens": 100 # Keep response short for click prediction
"max_tokens": 200 # Keep response short for click prediction
}
# Use liteLLM responses
@@ -217,11 +228,8 @@ class OpenAIComputerUseConfig:
isinstance(item.get("action"), dict)):
action = item["action"]
if action.get("type") == "click":
x = action.get("x")
y = action.get("y")
if x is not None and y is not None:
return (int(x), int(y))
if action.get("x") is not None and action.get("y") is not None:
return (int(action.get("x")), int(action.get("y")))
return None

View File

@@ -0,0 +1,192 @@
"""
Example usage of the proxy server and client requests.
"""
import dotenv
dotenv.load_dotenv()
import asyncio
import json
import os
import aiohttp
from typing import Dict, Any
async def test_http_endpoint():
"""Test the HTTP /responses endpoint."""
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
assert isinstance(anthropic_api_key, str), "ANTHROPIC_API_KEY environment variable must be set"
# Example 1: Simple text request
simple_request = {
"model": "anthropic/claude-3-5-sonnet-20241022",
"input": "Tell me a three sentence bedtime story about a unicorn.",
"env": {
"ANTHROPIC_API_KEY": anthropic_api_key
}
}
# Example 2: Multi-modal request with image
multimodal_request = {
"model": "anthropic/claude-3-5-sonnet-20241022",
"input": [
{
"role": "user",
"content": [
{"type": "input_text", "text": "what is in this image?"},
{
"type": "input_image",
"image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
}
]
}
],
"env": {
"ANTHROPIC_API_KEY": anthropic_api_key
}
}
# Example 3: Request with custom agent and computer kwargs
custom_request = {
"model": "anthropic/claude-3-5-sonnet-20241022",
"input": "Take a screenshot and tell me what you see",
"env": {
"ANTHROPIC_API_KEY": anthropic_api_key
}
}
# Test requests
base_url = "https://m-linux-96lcxd2c2k.containers.cloud.trycua.com:8443"
# base_url = "http://localhost:8000"
api_key = os.getenv("CUA_API_KEY")
assert isinstance(api_key, str), "CUA_API_KEY environment variable must be set"
async with aiohttp.ClientSession() as session:
for i, request_data in enumerate([
simple_request,
# multimodal_request,
custom_request
], 1):
print(f"\n--- Test {i} ---")
print(f"Request: {json.dumps(request_data, indent=2)}")
try:
print(f"Sending request to {base_url}/responses")
async with session.post(
f"{base_url}/responses",
json=request_data,
headers={"Content-Type": "application/json", "X-API-Key": api_key}
) as response:
result = await response.json()
print(f"Status: {response.status}")
print(f"Response: {json.dumps(result, indent=2)}")
except Exception as e:
print(f"Error: {e}")
def curl_examples():
"""Print curl command examples."""
print("=== CURL Examples ===\n")
print("1. Simple text request:")
print("""curl http://localhost:8000/responses \\
-H "Content-Type: application/json" \\
-d '{
"model": "anthropic/claude-3-5-sonnet-20241022",
"input": "Tell me a three sentence bedtime story about a unicorn."
}'""")
print("\n2. Multi-modal request with image:")
print("""curl http://localhost:8000/responses \\
-H "Content-Type: application/json" \\
-d '{
"model": "anthropic/claude-3-5-sonnet-20241022",
"input": [
{
"role": "user",
"content": [
{"type": "input_text", "text": "what is in this image?"},
{
"type": "input_image",
"image_url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
}
]
}
]
}'""")
print("\n3. Request with custom configuration:")
print("""curl http://localhost:8000/responses \\
-H "Content-Type: application/json" \\
-d '{
"model": "anthropic/claude-3-5-sonnet-20241022",
"input": "Take a screenshot and tell me what you see",
"agent_kwargs": {
"save_trajectory": true,
"verbosity": 20
},
"computer_kwargs": {
"os_type": "linux",
"provider_type": "cloud"
}
}'""")
async def test_p2p_client():
"""Example P2P client using peerjs-python."""
try:
from peerjs import Peer, PeerOptions, ConnectionEventType
from aiortc import RTCConfiguration, RTCIceServer
# Set up client peer
options = PeerOptions(
host="0.peerjs.com",
port=443,
secure=True,
config=RTCConfiguration(
iceServers=[RTCIceServer(urls="stun:stun.l.google.com:19302")]
)
)
client_peer = Peer(id="test-client", peer_options=options)
await client_peer.start()
# Connect to proxy server
connection = client_peer.connect("computer-agent-proxy")
@connection.on(ConnectionEventType.Open)
async def connection_open():
print("Connected to proxy server")
# Send a test request
request = {
"model": "anthropic/claude-3-5-sonnet-20241022",
"input": "Hello from P2P client!"
}
await connection.send(json.dumps(request))
@connection.on(ConnectionEventType.Data)
async def connection_data(data):
print(f"Received response: {data}")
await client_peer.destroy()
# Wait for connection
await asyncio.sleep(10)
except ImportError:
print("P2P dependencies not available. Install peerjs-python for P2P testing.")
except Exception as e:
print(f"P2P test error: {e}")
if __name__ == "__main__":
import sys
if len(sys.argv) > 1 and sys.argv[1] == "curl":
curl_examples()
elif len(sys.argv) > 1 and sys.argv[1] == "p2p":
asyncio.run(test_p2p_client())
else:
asyncio.run(test_http_endpoint())

View File

@@ -0,0 +1,248 @@
"""
Request handlers for the proxy endpoints.
"""
import asyncio
import json
import logging
import os
from contextlib import contextmanager
from typing import Dict, Any, List, Union, Optional
from ..agent import ComputerAgent
from computer import Computer
logger = logging.getLogger(__name__)
class ResponsesHandler:
"""Handler for /responses endpoint that processes agent requests."""
def __init__(self):
self.computer = None
self.agent = None
# Simple in-memory caches
self._computer_cache: Dict[str, Any] = {}
self._agent_cache: Dict[str, Any] = {}
async def setup_computer_agent(
self,
model: str,
agent_kwargs: Optional[Dict[str, Any]] = None,
computer_kwargs: Optional[Dict[str, Any]] = None,
):
"""Set up (and cache) computer and agent instances.
Caching keys:
- Computer cache key: computer_kwargs
- Agent cache key: {"model": model, **agent_kwargs}
"""
agent_kwargs = agent_kwargs or {}
computer_kwargs = computer_kwargs or {}
def _stable_key(obj: Dict[str, Any]) -> str:
try:
return json.dumps(obj, sort_keys=True, separators=(",", ":"))
except Exception:
# Fallback: stringify non-serializable values
safe_obj = {}
for k, v in obj.items():
try:
json.dumps(v)
safe_obj[k] = v
except Exception:
safe_obj[k] = str(v)
return json.dumps(safe_obj, sort_keys=True, separators=(",", ":"))
# Determine if custom tools are supplied; if so, skip computer setup entirely
has_custom_tools = bool(agent_kwargs.get("tools"))
computer = None
if not has_custom_tools:
# ---------- Computer setup (with cache) ----------
comp_key = _stable_key(computer_kwargs)
computer = self._computer_cache.get(comp_key)
if computer is None:
# Default computer configuration
default_c_config = {
"os_type": "linux",
"provider_type": "cloud",
"name": os.getenv("CUA_CONTAINER_NAME"),
"api_key": os.getenv("CUA_API_KEY"),
}
default_c_config.update(computer_kwargs)
computer = Computer(**default_c_config)
await computer.__aenter__()
self._computer_cache[comp_key] = computer
logger.info(f"Computer created and cached with key={comp_key} config={default_c_config}")
else:
logger.info(f"Reusing cached computer for key={comp_key}")
# Bind current computer reference (None if custom tools supplied)
self.computer = computer
# ---------- Agent setup (with cache) ----------
# Build agent cache key from {model} + agent_kwargs (excluding tools unless explicitly passed)
agent_kwargs_for_key = dict(agent_kwargs)
agent_key_payload = {"model": model, **agent_kwargs_for_key}
agent_key = _stable_key(agent_key_payload)
agent = self._agent_cache.get(agent_key)
if agent is None:
# Default agent configuration
default_a_config: Dict[str, Any] = {"model": model}
if not has_custom_tools:
default_a_config["tools"] = [computer]
# Apply user overrides, but keep tools unless user explicitly sets
if agent_kwargs:
if not has_custom_tools:
agent_kwargs.setdefault("tools", [computer])
default_a_config.update(agent_kwargs)
# JSON-derived kwargs may have loose types; ignore static arg typing here
agent = ComputerAgent(**default_a_config) # type: ignore[arg-type]
self._agent_cache[agent_key] = agent
logger.info(f"Agent created and cached with key={agent_key} model={model}")
else:
# Ensure cached agent uses the current computer tool (in case object differs)
# Only update if tools not explicitly provided in agent_kwargs
if not has_custom_tools:
try:
agent.tools = [computer]
except Exception:
pass
logger.info(f"Reusing cached agent for key={agent_key}")
# Bind current agent reference
self.agent = agent
async def process_request(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
"""
Process a /responses request and return the result.
Args:
request_data: Dictionary containing model, input, and optional kwargs
Returns:
Dictionary with the agent's response
"""
try:
# Extract request parameters
model = request_data.get("model")
input_data = request_data.get("input")
agent_kwargs = request_data.get("agent_kwargs", {})
computer_kwargs = request_data.get("computer_kwargs", {})
env_overrides = request_data.get("env", {}) or {}
if not model:
raise ValueError("Model is required")
if not input_data:
raise ValueError("Input is required")
# Apply env overrides for the duration of this request
with self._env_overrides(env_overrides):
# Set up (and possibly reuse) computer and agent via caches
await self.setup_computer_agent(model, agent_kwargs, computer_kwargs)
# Defensive: ensure agent is initialized for type checkers
agent = self.agent
if agent is None:
raise RuntimeError("Agent failed to initialize")
# Convert input to messages format
messages = self._convert_input_to_messages(input_data)
# Run agent and get first result
async for result in agent.run(messages):
# Return the first result and break
return {
"success": True,
"result": result,
"model": model
}
# If no results were yielded
return {
"success": False,
"error": "No results from agent",
"model": model
}
except Exception as e:
logger.error(f"Error processing request: {e}")
return {
"success": False,
"error": str(e),
"model": request_data.get("model", "unknown")
}
def _convert_input_to_messages(self, input_data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
"""Convert input data to messages format."""
if isinstance(input_data, str):
# Simple string input
return [{"role": "user", "content": input_data}]
elif isinstance(input_data, list):
# Already in messages format
messages = []
for msg in input_data:
# Convert content array format if needed
if isinstance(msg.get("content"), list):
content_parts = []
for part in msg["content"]:
if part.get("type") == "input_text":
content_parts.append({"type": "text", "text": part["text"]})
elif part.get("type") == "input_image":
content_parts.append({
"type": "image_url",
"image_url": {"url": part["image_url"]}
})
else:
content_parts.append(part)
messages.append({
"role": msg["role"],
"content": content_parts
})
else:
messages.append(msg)
return messages
else:
raise ValueError("Input must be string or list of messages")
async def cleanup(self):
"""Clean up resources."""
if self.computer:
try:
await self.computer.__aexit__(None, None, None)
except Exception as e:
logger.error(f"Error cleaning up computer: {e}")
finally:
self.computer = None
self.agent = None
@staticmethod
@contextmanager
def _env_overrides(env: Dict[str, str]):
"""Temporarily apply environment variable overrides for the current process.
Restores previous values after the context exits.
Args:
env: Mapping of env var names to override for this request.
"""
if not env:
# No-op context
yield
return
original: Dict[str, Optional[str]] = {}
try:
for k, v in env.items():
original[k] = os.environ.get(k)
os.environ[k] = str(v)
yield
finally:
for k, old in original.items():
if old is None:
# Was not set before
os.environ.pop(k, None)
else:
os.environ[k] = old
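
A sketch of driving the handler in-process, without the HTTP layer. `ResponsesHandler` is the class defined above; the request shape mirrors the examples file earlier in this PR, and `CUA_API_KEY`/`CUA_CONTAINER_NAME` plus the provider key are assumed to be available in the environment:

```python
# sketch: call ResponsesHandler directly instead of going through the /responses endpoint
import asyncio
import os

async def main() -> None:
    handler = ResponsesHandler()  # as defined in this module
    result = await handler.process_request({
        "model": "anthropic/claude-3-5-sonnet-20241022",
        "input": "Take a screenshot and tell me what you see",
        "env": {"ANTHROPIC_API_KEY": os.environ["ANTHROPIC_API_KEY"]},
    })
    print(result["success"], result.get("error"))
    await handler.cleanup()

asyncio.run(main())
```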

View File

@@ -55,7 +55,7 @@ cli = [
"yaspin>=3.1.0",
]
hud = [
"hud-python==0.2.10",
"hud-python>=0.4.12,<0.5.0",
]
all = [
# omni requirements
@@ -72,7 +72,7 @@ all = [
# cli requirements
"yaspin>=3.1.0",
# hud requirements
"hud-python==0.2.10",
"hud-python>=0.4.12,<0.5.0",
]
[tool.uv]

File diff suppressed because it is too large