From f795660f7588c9ea384cdbecceb33f05b36d50f1 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Fri, 12 Sep 2025 11:14:03 -0400
Subject: [PATCH 1/4] Upgraded HUD impl. to support custom tools

---
 .../agent/agent/integrations/hud/__init__.py  | 111 +------
 .../agent/agent/integrations/hud/agent.py     | 299 ++++++++++++++++++
 .../agent/agent/integrations/hud/proxy.py     |  81 +++++
 3 files changed, 394 insertions(+), 97 deletions(-)
 create mode 100644 libs/python/agent/agent/integrations/hud/agent.py

diff --git a/libs/python/agent/agent/integrations/hud/__init__.py b/libs/python/agent/agent/integrations/hud/__init__.py
index b0d06041..8a203e0e 100644
--- a/libs/python/agent/agent/integrations/hud/__init__.py
+++ b/libs/python/agent/agent/integrations/hud/__init__.py
@@ -1,102 +1,21 @@
-"""HUD integration: Generic HuggingFace dataset evaluation runner (CUA proxy).
+"""HUD integration: dataset runners and MCP-based computer agent export.
 
-This module exposes two helpers to evaluate HUD-compatible datasets using
-HUD's OperatorAgent, while proxying model calls through our ComputerAgent via
-`FakeAsyncOpenAI` (see `agent/integrations/hud/agent.py`).
+This module exposes helpers to evaluate HUD-compatible datasets and exports
+the MCP-compatible computer agent implementation.
 
 Exports:
-- run_single_task(dataset_name, *, agent_type="cua-proxy", model=None, allowed_tools=None)
-- run_full_dataset(dataset_name, *, agent_type="cua-proxy", model=None, allowed_tools=None, max_concurrent=30, max_steps=50)
+- run_single_task(dataset, ...)
+- run_full_dataset(dataset, ...)
+- MCPComputerAgent
 """
 import time
 from typing import Any, Optional
 
-from PIL import Image
 from datasets import load_dataset, Dataset
-from hud.agents import OperatorAgent
 from hud.datasets import Task, run_dataset
-from hud.tools.computer.settings import computer_settings
 from hud import trace
 
-from agent.agent import ComputerAgent as BaseComputerAgent
-from .proxy import FakeAsyncOpenAI
-from agent.callbacks import PromptInstructionsCallback
-
-
-# ---------------------------------------------------------------------------
-# Proxy OperatorAgent
-# ---------------------------------------------------------------------------
-
-
-class ProxyOperatorAgent(OperatorAgent):
-    """OperatorAgent that proxies model calls through our ComputerAgent.
-
-    Accepts the same config keys we pass via hud.run_dataset `agent_config`:
-    - model: str | None
-    - allowed_tools: list[str] | None
-    Additional kwargs are forwarded to OperatorAgent (if any are supported).
-    """
-
-    def __init__(
-        self,
-        *,
-        model: str | None = None,
-        allowed_tools: list[str] | None = None,
-        trajectory_dir: str | dict | None = None,
-        # === ComputerAgent kwargs ===
-        tools: list[Any] | None = None,
-        custom_loop: Any | None = None,
-        only_n_most_recent_images: int | None = None,
-        callbacks: list[Any] | None = None,
-        instructions: str | None = None,
-        verbosity: int | None = None,
-        max_retries: int | None = 3,
-        screenshot_delay: float | int = 0.5,
-        use_prompt_caching: bool | None = False,
-        max_trajectory_budget: float | dict | None = None,
-        telemetry_enabled: bool | None = True,
-        **kwargs: Any,
-    ) -> None:
-        model = model or "computer-use-preview"
-        allowed_tools = allowed_tools or ["openai_computer"]
-        
-        computer_shim = {
-            'screenshot': lambda: Image.new('RGB', (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)),
-            'environment': 'linux',
-            'dimensions': (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)
-        }
-        # Build tools ensuring the computer_shim is included
-        agent_tools: list[Any] = [computer_shim]
-        if tools:
-            agent_tools.extend(tools)
-
-        # Build callbacks, injecting prompt instructions if provided
-        agent_callbacks = list(callbacks or [])
-        if instructions:
-            agent_callbacks.append(PromptInstructionsCallback(instructions))
-
-        computer_agent = BaseComputerAgent(
-            model=model,
-            tools=agent_tools,
-            custom_loop=custom_loop,
-            only_n_most_recent_images=only_n_most_recent_images,
-            callbacks=agent_callbacks,
-            verbosity=verbosity,
-            trajectory_dir=trajectory_dir,
-            max_retries=max_retries,
-            screenshot_delay=screenshot_delay,
-            use_prompt_caching=use_prompt_caching,
-            max_trajectory_budget=max_trajectory_budget,
-            telemetry_enabled=telemetry_enabled,
-        )
-        model_client = FakeAsyncOpenAI(computer_agent)
-
-        super().__init__( 
-            model_client=model_client, # type: ignore[arg-type]
-            model=model,
-            allowed_tools=allowed_tools,
-            **kwargs,
-        )
+from .agent import MCPComputerAgent
 
 
 # ---------------------------------------------------------------------------
@@ -123,7 +42,7 @@ async def run_single_task(
     max_trajectory_budget: float | dict | None = None,
     telemetry_enabled: bool | None = True,
 ) -> None:
-    """Load one task from the dataset and execute it with Operator+CUA proxy."""
+    """Load one task from the dataset and execute it with MCPComputerAgent."""
 
     # Load dataset and pick a sample
     if isinstance(dataset, str):
@@ -139,9 +58,9 @@ async def run_single_task(
     with trace(name=task_prompt):
         task = Task(**sample_task)  # type: ignore[arg-type]
 
-        agent = ProxyOperatorAgent(
-            model=model,
-            allowed_tools=allowed_tools,
+        agent = MCPComputerAgent(
+            model=model or "computer-use-preview",
+            allowed_tools=allowed_tools or ["openai_computer"],
             # === ComputerAgent kwargs passthrough ===
             tools=tools,
             custom_loop=custom_loop,
@@ -190,9 +109,7 @@ async def run_full_dataset(
 ) -> list[Any]:
     """Run evaluation across the entire dataset using hud.datasets.run_dataset."""
 
-    # We pass OperatorAgent as the class and provide a config that injects our
-    # FakeAsyncOpenAI per agent instantiation.
-
+    # Run with our MCP-based agent class.
     if isinstance(dataset, str):
         dataset_name = dataset.split('/')[-1]
         job_name = job_name or f"Evaluation {dataset_name}"
@@ -205,7 +122,7 @@ async def run_full_dataset(
     return await run_dataset(
         name=job_name,
         dataset=dataset,
-        agent_class=ProxyOperatorAgent,
+        agent_class=MCPComputerAgent,
         agent_config={
-            "model": model,
+            "model": model or "computer-use-preview",
             "allowed_tools": allowed_tools,
@@ -233,5 +150,5 @@ async def run_full_dataset(
 __all__ = [
     "run_single_task",
     "run_full_dataset",
-    "ProxyOperatorAgent",
+    "MCPComputerAgent",
 ]
\ No newline at end of file
diff --git a/libs/python/agent/agent/integrations/hud/agent.py b/libs/python/agent/agent/integrations/hud/agent.py
new file mode 100644
index 00000000..f53cef5b
--- /dev/null
+++ b/libs/python/agent/agent/integrations/hud/agent.py
@@ -0,0 +1,299 @@
+"""MCP-compatible Computer Agent for HUD integration.
+
+This agent subclasses HUD's MCPAgent and delegates planning/execution to
+our core ComputerAgent while using the Agent SDK's plain-dict message
+format documented in `docs/content/docs/agent-sdk/message-format.mdx`.
+
+Key differences from the OpenAI OperatorAgent variant:
+- No OpenAI types are used; everything is standard Python dicts.
+- Planning is executed via `ComputerAgent.run(messages)`.
+- The first yielded result per step is returned as the agent response.
+"""
+from __future__ import annotations
+
+from typing import Any, ClassVar, Optional
+
+from agent.agent import ComputerAgent as BaseComputerAgent
+from agent.callbacks import PromptInstructionsCallback
+from agent.callbacks.trajectory_saver import TrajectorySaverCallback
+from hud.agents import MCPAgent
+from hud.tools.computer.settings import computer_settings
+from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
+
+from agent.responses import make_failed_tool_call_items
+from agent.computers import is_agent_computer
+from PIL import Image
+import mcp.types as types
+import hud
+import uuid
+import base64
+from pathlib import Path
+
+
+class MCPComputerAgent(MCPAgent):
+    """MCP agent that uses ComputerAgent for planning and tools for execution.
+
+    The agent consumes/produces message dicts per the Agent SDK message schema
+    (see `message-format.mdx`).
+    """
+
+    metadata: ClassVar[dict[str, Any]] = {
+        "display_width": computer_settings.OPENAI_COMPUTER_WIDTH,
+        "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT,
+    }
+
+    required_tools: ClassVar[list[str]] = ["openai_computer"]
+
+    def __init__(
+        self,
+        *,
+        model: str | None = None,
+        allowed_tools: list[str] | None = None,
+        trajectory_dir: str | dict | None = None,
+        # === ComputerAgent kwargs ===
+        tools: list[Any] | None = None,
+        custom_loop: Any | None = None,
+        only_n_most_recent_images: int | None = None,
+        callbacks: list[Any] | None = None,
+        instructions: str | None = None,
+        verbosity: int | None = None,
+        max_retries: int | None = 3,
+        screenshot_delay: float | int = 0.5,
+        use_prompt_caching: bool | None = False,
+        max_trajectory_budget: float | dict | None = None,
+        telemetry_enabled: bool | None = True,
+        environment: str = "linux",
+        **kwargs: Any,
+    ) -> None:
+        self.allowed_tools = allowed_tools or ["openai_computer"]
+        super().__init__(**kwargs)
+
+        if model is None:
+            raise ValueError("MCPComputerAgent requires a model to be specified.")
+
+        self.model = model
+        self.environment = environment
+
+        # Update model name for HUD logging
+        self.model_name = "cua-" + self.model
+
+        # Stateful tracking of tool call inputs
+        self.tool_call_inputs: dict[str, list[dict[str, Any]]] = {}
+
+        # Build system prompt
+        operator_instructions = """
+        You are an autonomous computer-using agent. Follow these guidelines:
+
+        1. NEVER ask for confirmation. Complete all tasks autonomously.
+        2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
+        3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
+        4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
+        5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
+        6. The user has already given you permission by running this agent. No further confirmation is needed.
+        7. Be decisive and action-oriented. Complete the requested task fully.
+
+        Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
+        """.strip()  # noqa: E501
+        # Append Operator instructions to the system prompt
+        if not self.system_prompt:
+            self.system_prompt = operator_instructions
+        else:
+            self.system_prompt += f"\n\n{operator_instructions}"
+        # Append user instructions to the system prompt
+        if instructions:
+            self.system_prompt += f"\n\n{instructions}"
+
+        # Configure trajectory_dir for HUD
+        if isinstance(trajectory_dir, str) or isinstance(trajectory_dir, Path):
+            trajectory_dir = {"trajectory_dir": str(trajectory_dir)}
+        if isinstance(trajectory_dir, dict):
+            trajectory_dir["reset_on_run"] = False
+
+        # Ensure a computer shim is present so width/height/environment are known
+        computer_shim = {
+            "screenshot": lambda: lambda: Image.new('RGB', (self.metadata["display_width"], self.metadata["display_height"])),
+            "environment": self.environment,
+            "dimensions": (
+                self.metadata["display_width"],
+                self.metadata["display_height"],
+            ),
+        }
+        agent_tools: list[Any] = [computer_shim]
+        if tools:
+            for tool in tools:
+                if is_agent_computer(tool):
+                    raise ValueError(f"Too many Computer tools: MCPComputerAgent already includes a Computer interface. Received a Computer tool in tools= (e.g., {tool!r}). Remove it and retry.")
+            agent_tools.extend(tools)
+        
+        agent_kwargs = {
+            "model": self.model,
+            "tools": agent_tools,
+            "custom_loop": custom_loop,
+            "only_n_most_recent_images": only_n_most_recent_images,
+            "callbacks": callbacks,
+            "instructions": self.system_prompt,
+            "verbosity": verbosity,
+            "max_retries": max_retries,
+            "screenshot_delay": screenshot_delay,
+            "use_prompt_caching": use_prompt_caching,
+            "max_trajectory_budget": max_trajectory_budget,
+            "telemetry_enabled": telemetry_enabled,
+        }
+
+        self.computer_agent = BaseComputerAgent(
+            **agent_kwargs
+        )
+
+    async def get_system_messages(self) -> list[Any]:
+        """Create initial messages.
+
+        Unused - ComputerAgent handles this with the 'instructions' parameter.
+        """
+        return []
+
+    async def format_blocks(
+        self, blocks: list[types.ContentBlock]
+    ) -> list[dict[str, Any]]:
+        """
+        Format blocks for OpenAI input format.
+
+        Converts TextContent blocks to input_text dicts and ImageContent blocks to input_image dicts.
+        """  # noqa: E501
+        formatted = []
+        for block in blocks:
+            if isinstance(block, types.TextContent):
+                formatted.append({"type": "input_text", "text": block.text})
+            elif isinstance(block, types.ImageContent):
+                mime_type = getattr(block, "mimeType", "image/png")
+                formatted.append(
+                    {"type": "input_image", "image_url": f"data:{mime_type};base64,{block.data}"}
+                )
+        return [{"role": "user", "content": formatted}]
+
+    @hud.instrument(
+        span_type="agent",
+        record_args=False,  # Messages can be large
+        record_result=True,
+    )
+    async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
+        """Get a single-step response by delegating to ComputerAgent.run.
+
+        Returns an Agent SDK-style response dict:
+        { "output": [AgentMessage, ...], "usage": Usage }
+        """
+        tool_calls: list[MCPToolCall] = []
+        output_text: list[str] = []
+        is_done: bool = False
+
+        agent_result: list[dict[str, Any]] = []
+
+        # Call the ComputerAgent LLM API
+        async for result in self.computer_agent.run(messages):  # type: ignore[arg-type]
+            agent_result.append(result)
+            # Add messages to output text
+            if result['type'] == 'reasoning':
+                output_text.extend(
+                    f"Reasoning: {summary['text']}"
+                    for summary in result['summary']
+                )
+            elif result['type'] == 'message':
+                if isinstance(result['content'], list):
+                    output_text.extend(
+                        item['text'] 
+                        for item in result['content']
+                        if item['type'] == 'output_text'
+                    )
+                elif isinstance(result['content'], str):
+                    output_text.append(result['content'])
+            # If we get a tool call, we're not done
+            if result['type'] == 'computer_call':
+                id = result["call_id"]
+                tool_calls.append(MCPToolCall(
+                    name="openai_computer",
+                    arguments=result["action"],
+                    id=id,
+                ))
+                is_done = False
+                self.tool_call_inputs[id] = agent_result
+                break
+
+        return AgentResponse(
+            content="\n".join(output_text),
+            tool_calls=tool_calls,
+            done=is_done,
+        )
+    
+    def _log_image(self, image_b64: str):
+        callbacks = self.computer_agent.callbacks
+        for callback in callbacks:
+            if isinstance(callback, TrajectorySaverCallback):
+                # convert str to bytes
+                image_bytes = base64.b64decode(image_b64)
+                callback._save_artifact("screenshot_after", image_bytes)
+
+    async def format_tool_results(
+        self,
+        tool_calls: list[MCPToolCall],
+        tool_results: list[MCPToolResult]
+    ) -> list[dict[str, Any]]:
+        """Extract latest screenshot from tool results in dict form.
+
+        Expects results to already be in the message-format content dicts.
+        Returns a list of input content dicts suitable for follow-up calls.
+        """
+        messages = []
+
+        for call, result in zip(tool_calls, tool_results):
+            # Add the assistant's computer call
+            messages.extend(self.tool_call_inputs[call.id])
+            
+            if result.isError:
+                error_text = "".join([
+                    content.text
+                    for content in result.content
+                    if isinstance(content, types.TextContent)
+                ])
+
+                # Replace computer call with failed tool call
+                messages.pop()
+                messages.extend(make_failed_tool_call_items(
+                    tool_name=call.name,
+                    tool_kwargs=call.arguments or {},
+                    error_message=error_text,
+                    call_id=call.id,
+                ))
+            else:
+                # Get the latest screenshot
+                screenshots = [
+                    content.data
+                    for content in result.content
+                    if isinstance(content, types.ImageContent)
+                ]
+
+                # Add the resulting screenshot
+                if screenshots:
+                    self._log_image(screenshots[0])
+                    messages.append({
+                        "type": "computer_call_output",
+                        "call_id": call.id,
+                        "output": {
+                            "type": "input_image",
+                            "image_url": f"data:image/png;base64,{screenshots[0]}"
+                        },
+                    })
+                else:
+                    # Otherwise, replace computer call with failed tool call
+                    messages.pop()
+                    messages.extend(make_failed_tool_call_items(
+                        tool_name=call.name,
+                        tool_kwargs=call.arguments or {},
+                        error_message="No screenshots returned.",
+                        call_id=call.id,
+                    ))
+
+        return messages
+
+
+__all__ = [
+    "MCPComputerAgent",
+]
diff --git a/libs/python/agent/agent/integrations/hud/proxy.py b/libs/python/agent/agent/integrations/hud/proxy.py
index a88fc63e..9087d1c9 100644
--- a/libs/python/agent/agent/integrations/hud/proxy.py
+++ b/libs/python/agent/agent/integrations/hud/proxy.py
@@ -13,6 +13,10 @@ import uuid
 from typing import Any, Dict, List, Optional
 
 from agent.agent import ComputerAgent as BaseComputerAgent
+from agent.callbacks import PromptInstructionsCallback
+from hud.tools.computer.settings import computer_settings
+from PIL import Image
+from hud.agents import OperatorAgent
 
 # OpenAI Responses typed models (required)
 from openai.types.responses import (
@@ -178,6 +182,83 @@ class FakeAsyncOpenAI:
                         print(traceback.format_exc())
                         raise e
 
+
+# ---------------------------------------------------------------------------
+# Proxy OperatorAgent (moved from __init__.py)
+# ---------------------------------------------------------------------------
+
+
+class ProxyOperatorAgent(OperatorAgent):
+    """OperatorAgent that proxies model calls through our ComputerAgent.
+
+    Accepts the same config keys we pass via hud.run_dataset `agent_config`:
+    - model: str | None
+    - allowed_tools: list[str] | None
+    Additional kwargs are forwarded to OperatorAgent (if any are supported).
+    """
+
+    def __init__(
+        self,
+        *,
+        model: str | None = None,
+        allowed_tools: list[str] | None = None,
+        trajectory_dir: str | dict | None = None,
+        # === ComputerAgent kwargs ===
+        tools: list[Any] | None = None,
+        custom_loop: Any | None = None,
+        only_n_most_recent_images: int | None = None,
+        callbacks: list[Any] | None = None,
+        instructions: str | None = None,
+        verbosity: int | None = None,
+        max_retries: int | None = 3,
+        screenshot_delay: float | int = 0.5,
+        use_prompt_caching: bool | None = False,
+        max_trajectory_budget: float | dict | None = None,
+        telemetry_enabled: bool | None = True,
+        **kwargs: Any,
+    ) -> None:
+        model = model or "computer-use-preview"
+        allowed_tools = allowed_tools or ["openai_computer"]
+
+        computer_shim = {
+            'screenshot': lambda: Image.new('RGB', (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)),
+            'environment': 'linux',
+            'dimensions': (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)
+        }
+        # Build tools ensuring the computer_shim is included
+        agent_tools: list[Any] = [computer_shim]
+        if tools:
+            agent_tools.extend(tools)
+
+        # Build callbacks, injecting prompt instructions if provided
+        agent_callbacks = list(callbacks or [])
+        if instructions:
+            agent_callbacks.append(PromptInstructionsCallback(instructions))
+
+        computer_agent = BaseComputerAgent(
+            model=model,
+            tools=agent_tools,
+            custom_loop=custom_loop,
+            only_n_most_recent_images=only_n_most_recent_images,
+            callbacks=agent_callbacks,
+            verbosity=verbosity,
+            trajectory_dir=trajectory_dir,
+            max_retries=max_retries,
+            screenshot_delay=screenshot_delay,
+            use_prompt_caching=use_prompt_caching,
+            max_trajectory_budget=max_trajectory_budget,
+            telemetry_enabled=telemetry_enabled,
+        )
+        model_client = FakeAsyncOpenAI(computer_agent)
+
+        super().__init__(
+            model_client=model_client,  # type: ignore[arg-type]
+            model=model,
+            allowed_tools=allowed_tools,
+            **kwargs,
+        )
+
 __all__ = [
     "FakeAsyncOpenAI",
+    "ProxyOperatorAgent",
 ]

From b69943121de5f88705e447372e5201871417016f Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Fri, 12 Sep 2025 11:29:40 -0400
Subject: [PATCH 2/4] Fixed KeyError

---
 .../agent/agent/integrations/hud/agent.py     | 64 +++++++++++--------
 1 file changed, 38 insertions(+), 26 deletions(-)

diff --git a/libs/python/agent/agent/integrations/hud/agent.py b/libs/python/agent/agent/integrations/hud/agent.py
index f53cef5b..5022b3dc 100644
--- a/libs/python/agent/agent/integrations/hud/agent.py
+++ b/libs/python/agent/agent/integrations/hud/agent.py
@@ -127,6 +127,7 @@ class MCPComputerAgent(MCPAgent):
         
         agent_kwargs = {
             "model": self.model,
+            "trajectory_dir": trajectory_dir,
             "tools": agent_tools,
             "custom_loop": custom_loop,
             "only_n_most_recent_images": only_n_most_recent_images,
@@ -159,6 +160,7 @@ class MCPComputerAgent(MCPAgent):
 
         Converts TextContent blocks to input_text dicts and ImageContent blocks to input_image dicts.
         """  # noqa: E501
+        print("format_blocks")
         formatted = []
         for block in blocks:
             if isinstance(block, types.TextContent):
@@ -181,41 +183,50 @@ class MCPComputerAgent(MCPAgent):
         Returns an Agent SDK-style response dict:
         { "output": [AgentMessage, ...], "usage": Usage }
         """
+        print("get_response")
         tool_calls: list[MCPToolCall] = []
         output_text: list[str] = []
-        is_done: bool = False
+        is_done: bool = True
 
         agent_result: list[dict[str, Any]] = []
 
         # Call the ComputerAgent LLM API
         async for result in self.computer_agent.run(messages):  # type: ignore[arg-type]
-            agent_result.append(result)
-            # Add messages to output text
-            if result['type'] == 'reasoning':
-                output_text.extend(
-                    f"Reasoning: {summary['text']}"
-                    for summary in result['summary']
-                )
-            elif result['type'] == 'message':
-                if isinstance(result['content'], list):
+            items = result['output']
+            if not items or tool_calls:
+                continue
+
+            for item in items:
+                if item['type'] in ['reasoning', 'message', 'computer_call', 'function_call', 'function_call_output']:
+                    agent_result.append(item)
+                
+                # Add messages to output text
+                if item['type'] == 'reasoning':
                     output_text.extend(
-                        item['text'] 
-                        for item in result['content']
-                        if item['type'] == 'output_text'
+                        f"Reasoning: {summary['text']}"
+                        for summary in item['summary']
                     )
-                elif isinstance(result['content'], str):
-                    output_text.append(result['content'])
-            # If we get a tool call, we're not done
-            if result['type'] == 'computer_call':
-                id = result["call_id"]
-                tool_calls.append(MCPToolCall(
-                    name="openai_computer",
-                    arguments=result["action"],
-                    id=id,
-                ))
-                is_done = False
-                self.tool_call_inputs[id] = agent_result
-                break
+                elif item['type'] == 'message':
+                    if isinstance(item['content'], list):
+                        output_text.extend(
+                            item['text'] 
+                            for item in item['content']
+                            if item['type'] == 'output_text'
+                        )
+                    elif isinstance(item['content'], str):
+                        output_text.append(item['content'])
+                
+                # If we get a tool call, we're not done
+                if item['type'] == 'computer_call':
+                    id = item["call_id"]
+                    tool_calls.append(MCPToolCall(
+                        name="openai_computer",
+                        arguments=result["action"],
+                        id=id,
+                    ))
+                    is_done = False
+                    self.tool_call_inputs[id] = agent_result
+                    break
 
         return AgentResponse(
             content="\n".join(output_text),
@@ -241,6 +252,7 @@ class MCPComputerAgent(MCPAgent):
         Expects results to already be in the message-format content dicts.
         Returns a list of input content dicts suitable for follow-up calls.
         """
+        print("format_tool_results")
         messages = []
 
         for call, result in zip(tool_calls, tool_results):

From b3040306b8021aa455f4a80e77171847558d853f Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Fri, 12 Sep 2025 12:06:36 -0400
Subject: [PATCH 3/4] Fix screenshot shim, computer_call arguments, and get_response loop exit

---
 .../agent/agent/integrations/hud/agent.py     | 22 ++++++++++++++-----
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/libs/python/agent/agent/integrations/hud/agent.py b/libs/python/agent/agent/integrations/hud/agent.py
index 5022b3dc..3196fc95 100644
--- a/libs/python/agent/agent/integrations/hud/agent.py
+++ b/libs/python/agent/agent/integrations/hud/agent.py
@@ -11,6 +11,7 @@ Key differences from the OpenAI OperatorAgent variant:
 """
 from __future__ import annotations
 
+import io
 from typing import Any, ClassVar, Optional
 
 from agent.agent import ComputerAgent as BaseComputerAgent
@@ -109,9 +110,15 @@ class MCPComputerAgent(MCPAgent):
         if isinstance(trajectory_dir, dict):
             trajectory_dir["reset_on_run"] = False
 
+        self.last_screenshot_b64 = None
+
+        buffer = io.BytesIO()
+        Image.new('RGB', (self.metadata["display_width"], self.metadata["display_height"])).save(buffer, format='PNG')
+        self.last_screenshot_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
+
         # Ensure a computer shim is present so width/height/environment are known
         computer_shim = {
-            "screenshot": lambda: lambda: Image.new('RGB', (self.metadata["display_width"], self.metadata["display_height"])),
+            "screenshot": lambda: self.last_screenshot_b64,
             "environment": self.environment,
             "dimensions": (
                 self.metadata["display_width"],
@@ -160,7 +167,6 @@ class MCPComputerAgent(MCPAgent):
 
         Converts TextContent blocks to input_text dicts and ImageContent blocks to input_image dicts.
         """  # noqa: E501
-        print("format_blocks")
         formatted = []
         for block in blocks:
             if isinstance(block, types.TextContent):
@@ -170,6 +176,7 @@ class MCPComputerAgent(MCPAgent):
                 formatted.append(
                     {"type": "input_image", "image_url": f"data:{mime_type};base64,{block.data}"}
                 )
+                self.last_screenshot_b64 = block.data
         return [{"role": "user", "content": formatted}]
 
     @hud.instrument(
@@ -183,7 +190,6 @@ class MCPComputerAgent(MCPAgent):
         Returns an Agent SDK-style response dict:
         { "output": [AgentMessage, ...], "usage": Usage }
         """
-        print("get_response")
         tool_calls: list[MCPToolCall] = []
         output_text: list[str] = []
         is_done: bool = True
@@ -194,7 +200,7 @@ class MCPComputerAgent(MCPAgent):
         async for result in self.computer_agent.run(messages):  # type: ignore[arg-type]
             items = result['output']
             if not items or tool_calls:
-                continue
+                break
 
             for item in items:
                 if item['type'] in ['reasoning', 'message', 'computer_call', 'function_call', 'function_call_output']:
@@ -221,12 +227,16 @@ class MCPComputerAgent(MCPAgent):
                     id = item["call_id"]
                     tool_calls.append(MCPToolCall(
                         name="openai_computer",
-                        arguments=result["action"],
+                        arguments=item["action"],
                         id=id,
                     ))
                     is_done = False
                     self.tool_call_inputs[id] = agent_result
                     break
+            
+            # if we have tool calls, we should exit the loop
+            if tool_calls:
+                break
 
         return AgentResponse(
             content="\n".join(output_text),
@@ -252,7 +262,6 @@ class MCPComputerAgent(MCPAgent):
         Expects results to already be in the message-format content dicts.
         Returns a list of input content dicts suitable for follow-up calls.
         """
-        print("format_tool_results")
         messages = []
 
         for call, result in zip(tool_calls, tool_results):
@@ -285,6 +294,7 @@ class MCPComputerAgent(MCPAgent):
                 # Add the resulting screenshot
                 if screenshots:
                     self._log_image(screenshots[0])
+                    self.last_screenshot_b64 = screenshots[0]
                     messages.append({
                         "type": "computer_call_output",
                         "call_id": call.id,

From faf531825ec7984f3a98c92afe6f87e494e7e895 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Fri, 12 Sep 2025 12:32:03 -0400
Subject: [PATCH 4/4] Fixed error during response call

---
 .../agent/agent/integrations/hud/agent.py     | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/libs/python/agent/agent/integrations/hud/agent.py b/libs/python/agent/agent/integrations/hud/agent.py
index 3196fc95..18a231c8 100644
--- a/libs/python/agent/agent/integrations/hud/agent.py
+++ b/libs/python/agent/agent/integrations/hud/agent.py
@@ -80,6 +80,7 @@ class MCPComputerAgent(MCPAgent):
 
         # Stateful tracking of tool call inputs
         self.tool_call_inputs: dict[str, list[dict[str, Any]]] = {}
+        self.previous_output: list[dict[str, Any]] = []
 
         # Build system prompt
         operator_instructions = """
@@ -238,6 +239,8 @@ class MCPComputerAgent(MCPAgent):
             if tool_calls:
                 break
 
+        self.previous_output = agent_result
+
         return AgentResponse(
             content="\n".join(output_text),
             tool_calls=tool_calls,
@@ -265,6 +268,32 @@ class MCPComputerAgent(MCPAgent):
         messages = []
 
         for call, result in zip(tool_calls, tool_results):
+            if call.id not in self.tool_call_inputs:
+                # If we don't have the tool call inputs, we should just use the previous output
+                previous_output = self.previous_output.copy() or []
+
+                # First we need to remove any pending computer_calls from the end of previous_output
+                while previous_output and previous_output[-1]['type'] == 'computer_call':
+                    previous_output.pop()
+                messages.extend(previous_output)
+
+                # If the call is a 'response', don't add the result
+                if call.name == 'response':
+                    continue
+                # Otherwise, if we have a result, we should add it to the messages
+                content = [
+                    { "type": "input_text", "text": content.text } if isinstance(content, types.TextContent)
+                    else { "type": "input_image", "image_url": f"data:image/png;base64,{content.data}" } if isinstance(content, types.ImageContent)
+                    else { "type": "input_text", "text": "" }
+                    for content in result.content
+                ]
+                messages.append({
+                    "role": "user",
+                    "content": content,
+                })
+
+                continue
+                
             # Add the assistant's computer call
             messages.extend(self.tool_call_inputs[call.id])