From f795660f7588c9ea384cdbecceb33f05b36d50f1 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Fri, 12 Sep 2025 11:14:03 -0400 Subject: [PATCH 1/4] Upgraded HUD impl. to support custom tools --- .../agent/agent/integrations/hud/__init__.py | 111 +------ .../agent/agent/integrations/hud/agent.py | 299 ++++++++++++++++++ .../agent/agent/integrations/hud/proxy.py | 81 +++++ 3 files changed, 394 insertions(+), 97 deletions(-) create mode 100644 libs/python/agent/agent/integrations/hud/agent.py diff --git a/libs/python/agent/agent/integrations/hud/__init__.py b/libs/python/agent/agent/integrations/hud/__init__.py index b0d06041..8a203e0e 100644 --- a/libs/python/agent/agent/integrations/hud/__init__.py +++ b/libs/python/agent/agent/integrations/hud/__init__.py @@ -1,102 +1,21 @@ -"""HUD integration: Generic HuggingFace dataset evaluation runner (CUA proxy). +"""HUD integration: dataset runners and MCP-based computer agent export. -This module exposes two helpers to evaluate HUD-compatible datasets using -HUD's OperatorAgent, while proxying model calls through our ComputerAgent via -`FakeAsyncOpenAI` (see `agent/integrations/hud/agent.py`). +This module exposes helpers to evaluate HUD-compatible datasets and exports +the MCP-compatible computer agent implementation. Exports: -- run_single_task(dataset_name, *, agent_type="cua-proxy", model=None, allowed_tools=None) -- run_full_dataset(dataset_name, *, agent_type="cua-proxy", model=None, allowed_tools=None, max_concurrent=30, max_steps=50) +- run_single_task(dataset, ...) +- run_full_dataset(dataset, ...) +- MCPComputerAgent """ import time from typing import Any, Optional -from PIL import Image from datasets import load_dataset, Dataset -from hud.agents import OperatorAgent from hud.datasets import Task, run_dataset -from hud.tools.computer.settings import computer_settings from hud import trace -from agent.agent import ComputerAgent as BaseComputerAgent -from .proxy import FakeAsyncOpenAI -from agent.callbacks import PromptInstructionsCallback - - -# --------------------------------------------------------------------------- -# Proxy OperatorAgent -# --------------------------------------------------------------------------- - - -class ProxyOperatorAgent(OperatorAgent): - """OperatorAgent that proxies model calls through our ComputerAgent. - - Accepts the same config keys we pass via hud.run_dataset `agent_config`: - - model: str | None - - allowed_tools: list[str] | None - Additional kwargs are forwarded to OperatorAgent (if any are supported). - """ - - def __init__( - self, - *, - model: str | None = None, - allowed_tools: list[str] | None = None, - trajectory_dir: str | dict | None = None, - # === ComputerAgent kwargs === - tools: list[Any] | None = None, - custom_loop: Any | None = None, - only_n_most_recent_images: int | None = None, - callbacks: list[Any] | None = None, - instructions: str | None = None, - verbosity: int | None = None, - max_retries: int | None = 3, - screenshot_delay: float | int = 0.5, - use_prompt_caching: bool | None = False, - max_trajectory_budget: float | dict | None = None, - telemetry_enabled: bool | None = True, - **kwargs: Any, - ) -> None: - model = model or "computer-use-preview" - allowed_tools = allowed_tools or ["openai_computer"] - - computer_shim = { - 'screenshot': lambda: Image.new('RGB', (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)), - 'environment': 'linux', - 'dimensions': (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT) - } - # Build tools ensuring the computer_shim is included - agent_tools: list[Any] = [computer_shim] - if tools: - agent_tools.extend(tools) - - # Build callbacks, injecting prompt instructions if provided - agent_callbacks = list(callbacks or []) - if instructions: - agent_callbacks.append(PromptInstructionsCallback(instructions)) - - computer_agent = BaseComputerAgent( - model=model, - tools=agent_tools, - custom_loop=custom_loop, - only_n_most_recent_images=only_n_most_recent_images, - callbacks=agent_callbacks, - verbosity=verbosity, - trajectory_dir=trajectory_dir, - max_retries=max_retries, - screenshot_delay=screenshot_delay, - use_prompt_caching=use_prompt_caching, - max_trajectory_budget=max_trajectory_budget, - telemetry_enabled=telemetry_enabled, - ) - model_client = FakeAsyncOpenAI(computer_agent) - - super().__init__( - model_client=model_client, # type: ignore[arg-type] - model=model, - allowed_tools=allowed_tools, - **kwargs, - ) +from .agent import MCPComputerAgent # --------------------------------------------------------------------------- @@ -123,7 +42,7 @@ async def run_single_task( max_trajectory_budget: float | dict | None = None, telemetry_enabled: bool | None = True, ) -> None: - """Load one task from the dataset and execute it with Operator+CUA proxy.""" + """Load one task from the dataset and execute it with MCPComputerAgent.""" # Load dataset and pick a sample if isinstance(dataset, str): @@ -139,9 +58,9 @@ async def run_single_task( with trace(name=task_prompt): task = Task(**sample_task) # type: ignore[arg-type] - agent = ProxyOperatorAgent( - model=model, - allowed_tools=allowed_tools, + agent = MCPComputerAgent( + model=model or "computer-use-preview", + allowed_tools=allowed_tools or ["openai_computer"], # === ComputerAgent kwargs passthrough === tools=tools, custom_loop=custom_loop, @@ -190,9 +109,7 @@ async def run_full_dataset( ) -> list[Any]: """Run evaluation across the entire dataset using hud.datasets.run_dataset.""" - # We pass OperatorAgent as the class and provide a config that injects our - # FakeAsyncOpenAI per agent instantiation. - + # Run with our MCP-based agent class. if isinstance(dataset, str): dataset_name = dataset.split('/')[-1] job_name = job_name or f"Evaluation {dataset_name}" @@ -205,7 +122,7 @@ async def run_full_dataset( return await run_dataset( name=job_name, dataset=dataset, - agent_class=ProxyOperatorAgent, + agent_class=MCPComputerAgent, agent_config={ "model": model, "allowed_tools": allowed_tools, @@ -233,5 +150,5 @@ async def run_full_dataset( __all__ = [ "run_single_task", "run_full_dataset", - "ProxyOperatorAgent", + "MCPComputerAgent", ] \ No newline at end of file diff --git a/libs/python/agent/agent/integrations/hud/agent.py b/libs/python/agent/agent/integrations/hud/agent.py new file mode 100644 index 00000000..f53cef5b --- /dev/null +++ b/libs/python/agent/agent/integrations/hud/agent.py @@ -0,0 +1,299 @@ +"""MCP-compatible Computer Agent for HUD integration. + +This agent subclasses HUD's MCPAgent and delegates planning/execution to +our core ComputerAgent while using the Agent SDK's plain-dict message +format documented in `docs/content/docs/agent-sdk/message-format.mdx`. + +Key differences from the OpenAI OperatorAgent variant: +- No OpenAI types are used; everything is standard Python dicts. +- Planning is executed via `ComputerAgent.run(messages)`. +- The first yielded result per step is returned as the agent response. +""" +from __future__ import annotations + +from typing import Any, ClassVar, Optional + +from agent.agent import ComputerAgent as BaseComputerAgent +from agent.callbacks import PromptInstructionsCallback +from agent.callbacks.trajectory_saver import TrajectorySaverCallback +from hud.agents import MCPAgent +from hud.tools.computer.settings import computer_settings +from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace + +from agent.responses import make_failed_tool_call_items +from agent.computers import is_agent_computer +from PIL import Image +import mcp.types as types +import hud +import uuid +import base64 +from pathlib import Path + + +class MCPComputerAgent(MCPAgent): + """MCP agent that uses ComputerAgent for planning and tools for execution. + + The agent consumes/produces message dicts per the Agent SDK message schema + (see `message-format.mdx`). + """ + + metadata: ClassVar[dict[str, Any]] = { + "display_width": computer_settings.OPENAI_COMPUTER_WIDTH, + "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT, + } + + required_tools: ClassVar[list[str]] = ["openai_computer"] + + def __init__( + self, + *, + model: str | None = None, + allowed_tools: list[str] | None = None, + trajectory_dir: str | dict | None = None, + # === ComputerAgent kwargs === + tools: list[Any] | None = None, + custom_loop: Any | None = None, + only_n_most_recent_images: int | None = None, + callbacks: list[Any] | None = None, + instructions: str | None = None, + verbosity: int | None = None, + max_retries: int | None = 3, + screenshot_delay: float | int = 0.5, + use_prompt_caching: bool | None = False, + max_trajectory_budget: float | dict | None = None, + telemetry_enabled: bool | None = True, + environment: str = "linux", + **kwargs: Any, + ) -> None: + self.allowed_tools = allowed_tools or ["openai_computer"] + super().__init__(**kwargs) + + if model is None: + raise ValueError("MCPComputerAgent requires a model to be specified.") + + self.model = model + self.environment = environment + + # Update model name for HUD logging + self.model_name = "cua-" + self.model + + # Stateful tracking of tool call inputs + self.tool_call_inputs: dict[str, list[dict[str, Any]]] = {} + + # Build system prompt + operator_instructions = """ + You are an autonomous computer-using agent. Follow these guidelines: + + 1. NEVER ask for confirmation. Complete all tasks autonomously. + 2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed. + 3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking. + 4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files). + 5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT. + 6. The user has already given you permission by running this agent. No further confirmation is needed. + 7. Be decisive and action-oriented. Complete the requested task fully. + + Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked. + """.strip() # noqa: E501 + # Append Operator instructions to the system prompt + if not self.system_prompt: + self.system_prompt = operator_instructions + else: + self.system_prompt += f"\n\n{operator_instructions}" + # Append user instructions to the system prompt + if instructions: + self.system_prompt += f"\n\n{instructions}" + + # Configure trajectory_dir for HUD + if isinstance(trajectory_dir, str) or isinstance(trajectory_dir, Path): + trajectory_dir = {"trajectory_dir": str(trajectory_dir)} + if isinstance(trajectory_dir, dict): + trajectory_dir["reset_on_run"] = False + + # Ensure a computer shim is present so width/height/environment are known + computer_shim = { + "screenshot": lambda: lambda: Image.new('RGB', (self.metadata["display_width"], self.metadata["display_height"])), + "environment": self.environment, + "dimensions": ( + self.metadata["display_width"], + self.metadata["display_height"], + ), + } + agent_tools: list[Any] = [computer_shim] + if tools: + for tool in tools: + if is_agent_computer(tool): + raise ValueError(f"Too many Computer tools: MCPComputerAgent already includes a Computer interface. Received a Computer tool in tools= (e.g., {tool!r}). Remove it and retry.") + agent_tools.extend(tools) + + agent_kwargs = { + "model": self.model, + "tools": agent_tools, + "custom_loop": custom_loop, + "only_n_most_recent_images": only_n_most_recent_images, + "callbacks": callbacks, + "instructions": self.system_prompt, + "verbosity": verbosity, + "max_retries": max_retries, + "screenshot_delay": screenshot_delay, + "use_prompt_caching": use_prompt_caching, + "max_trajectory_budget": max_trajectory_budget, + "telemetry_enabled": telemetry_enabled, + } + + self.computer_agent = BaseComputerAgent( + **agent_kwargs + ) + + async def get_system_messages(self) -> list[Any]: + """Create initial messages. + + Unused - ComputerAgent handles this with the 'instructions' parameter. + """ + return [] + + async def format_blocks( + self, blocks: list[types.ContentBlock] + ) -> list[dict[str, Any]]: + """ + Format blocks for OpenAI input format. + + Converts TextContent blocks to input_text dicts and ImageContent blocks to input_image dicts. + """ # noqa: E501 + formatted = [] + for block in blocks: + if isinstance(block, types.TextContent): + formatted.append({"type": "input_text", "text": block.text}) + elif isinstance(block, types.ImageContent): + mime_type = getattr(block, "mimeType", "image/png") + formatted.append( + {"type": "input_image", "image_url": f"data:{mime_type};base64,{block.data}"} + ) + return [{"role": "user", "content": formatted}] + + @hud.instrument( + span_type="agent", + record_args=False, # Messages can be large + record_result=True, + ) + async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: + """Get a single-step response by delegating to ComputerAgent.run. + + Returns an Agent SDK-style response dict: + { "output": [AgentMessage, ...], "usage": Usage } + """ + tool_calls: list[MCPToolCall] = [] + output_text: list[str] = [] + is_done: bool = False + + agent_result: list[dict[str, Any]] = [] + + # Call the ComputerAgent LLM API + async for result in self.computer_agent.run(messages): # type: ignore[arg-type] + agent_result.append(result) + # Add messages to output text + if result['type'] == 'reasoning': + output_text.extend( + f"Reasoning: {summary['text']}" + for summary in result['summary'] + ) + elif result['type'] == 'message': + if isinstance(result['content'], list): + output_text.extend( + item['text'] + for item in result['content'] + if item['type'] == 'output_text' + ) + elif isinstance(result['content'], str): + output_text.append(result['content']) + # If we get a tool call, we're not done + if result['type'] == 'computer_call': + id = result["call_id"] + tool_calls.append(MCPToolCall( + name="openai_computer", + arguments=result["action"], + id=id, + )) + is_done = False + self.tool_call_inputs[id] = agent_result + break + + return AgentResponse( + content="\n".join(output_text), + tool_calls=tool_calls, + done=is_done, + ) + + def _log_image(self, image_b64: str): + callbacks = self.computer_agent.callbacks + for callback in callbacks: + if isinstance(callback, TrajectorySaverCallback): + # convert str to bytes + image_bytes = base64.b64decode(image_b64) + callback._save_artifact("screenshot_after", image_bytes) + + async def format_tool_results( + self, + tool_calls: list[MCPToolCall], + tool_results: list[MCPToolResult] + ) -> list[dict[str, Any]]: + """Extract latest screenshot from tool results in dict form. + + Expects results to already be in the message-format content dicts. + Returns a list of input content dicts suitable for follow-up calls. + """ + messages = [] + + for call, result in zip(tool_calls, tool_results): + # Add the assistant's computer call + messages.extend(self.tool_call_inputs[call.id]) + + if result.isError: + error_text = "".join([ + content.text + for content in result.content + if isinstance(content, types.TextContent) + ]) + + # Replace computer call with failed tool call + messages.pop() + messages.extend(make_failed_tool_call_items( + tool_name=call.name, + tool_kwargs=call.arguments or {}, + error_message=error_text, + call_id=call.id, + )) + else: + # Get the latest screenshot + screenshots = [ + content.data + for content in result.content + if isinstance(content, types.ImageContent) + ] + + # Add the resulting screenshot + if screenshots: + self._log_image(screenshots[0]) + messages.append({ + "type": "computer_call_output", + "call_id": call.id, + "output": { + "type": "input_image", + "image_url": f"data:image/png;base64,{screenshots[0]}" + }, + }) + else: + # Otherwise, replace computer call with failed tool call + messages.pop() + messages.extend(make_failed_tool_call_items( + tool_name=call.name, + tool_kwargs=call.arguments or {}, + error_message="No screenshots returned.", + call_id=call.id, + )) + + return messages + + +__all__ = [ + "MCPComputerAgent", +] diff --git a/libs/python/agent/agent/integrations/hud/proxy.py b/libs/python/agent/agent/integrations/hud/proxy.py index a88fc63e..9087d1c9 100644 --- a/libs/python/agent/agent/integrations/hud/proxy.py +++ b/libs/python/agent/agent/integrations/hud/proxy.py @@ -13,6 +13,10 @@ import uuid from typing import Any, Dict, List, Optional from agent.agent import ComputerAgent as BaseComputerAgent +from agent.callbacks import PromptInstructionsCallback +from hud.tools.computer.settings import computer_settings +from PIL import Image +from hud.agents import OperatorAgent # OpenAI Responses typed models (required) from openai.types.responses import ( @@ -178,6 +182,83 @@ class FakeAsyncOpenAI: print(traceback.format_exc()) raise e + +# --------------------------------------------------------------------------- +# Proxy OperatorAgent (moved from __init__.py) +# --------------------------------------------------------------------------- + + +class ProxyOperatorAgent(OperatorAgent): + """OperatorAgent that proxies model calls through our ComputerAgent. + + Accepts the same config keys we pass via hud.run_dataset `agent_config`: + - model: str | None + - allowed_tools: list[str] | None + Additional kwargs are forwarded to OperatorAgent (if any are supported). + """ + + def __init__( + self, + *, + model: str | None = None, + allowed_tools: list[str] | None = None, + trajectory_dir: str | dict | None = None, + # === ComputerAgent kwargs === + tools: list[Any] | None = None, + custom_loop: Any | None = None, + only_n_most_recent_images: int | None = None, + callbacks: list[Any] | None = None, + instructions: str | None = None, + verbosity: int | None = None, + max_retries: int | None = 3, + screenshot_delay: float | int = 0.5, + use_prompt_caching: bool | None = False, + max_trajectory_budget: float | dict | None = None, + telemetry_enabled: bool | None = True, + **kwargs: Any, + ) -> None: + model = model or "computer-use-preview" + allowed_tools = allowed_tools or ["openai_computer"] + + computer_shim = { + 'screenshot': lambda: Image.new('RGB', (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)), + 'environment': 'linux', + 'dimensions': (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT) + } + # Build tools ensuring the computer_shim is included + agent_tools: list[Any] = [computer_shim] + if tools: + agent_tools.extend(tools) + + # Build callbacks, injecting prompt instructions if provided + agent_callbacks = list(callbacks or []) + if instructions: + agent_callbacks.append(PromptInstructionsCallback(instructions)) + + computer_agent = BaseComputerAgent( + model=model, + tools=agent_tools, + custom_loop=custom_loop, + only_n_most_recent_images=only_n_most_recent_images, + callbacks=agent_callbacks, + verbosity=verbosity, + trajectory_dir=trajectory_dir, + max_retries=max_retries, + screenshot_delay=screenshot_delay, + use_prompt_caching=use_prompt_caching, + max_trajectory_budget=max_trajectory_budget, + telemetry_enabled=telemetry_enabled, + ) + model_client = FakeAsyncOpenAI(computer_agent) + + super().__init__( + model_client=model_client, # type: ignore[arg-type] + model=model, + allowed_tools=allowed_tools, + **kwargs, + ) + __all__ = [ "FakeAsyncOpenAI", + "ProxyOperatorAgent", ] From b69943121de5f88705e447372e5201871417016f Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Fri, 12 Sep 2025 11:29:40 -0400 Subject: [PATCH 2/4] Fixed KeyError --- .../agent/agent/integrations/hud/agent.py | 64 +++++++++++-------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/libs/python/agent/agent/integrations/hud/agent.py b/libs/python/agent/agent/integrations/hud/agent.py index f53cef5b..5022b3dc 100644 --- a/libs/python/agent/agent/integrations/hud/agent.py +++ b/libs/python/agent/agent/integrations/hud/agent.py @@ -127,6 +127,7 @@ class MCPComputerAgent(MCPAgent): agent_kwargs = { "model": self.model, + "trajectory_dir": trajectory_dir, "tools": agent_tools, "custom_loop": custom_loop, "only_n_most_recent_images": only_n_most_recent_images, @@ -159,6 +160,7 @@ class MCPComputerAgent(MCPAgent): Converts TextContent blocks to input_text dicts and ImageContent blocks to input_image dicts. """ # noqa: E501 + print("format_blocks") formatted = [] for block in blocks: if isinstance(block, types.TextContent): @@ -181,41 +183,50 @@ class MCPComputerAgent(MCPAgent): Returns an Agent SDK-style response dict: { "output": [AgentMessage, ...], "usage": Usage } """ + print("get_response") tool_calls: list[MCPToolCall] = [] output_text: list[str] = [] - is_done: bool = False + is_done: bool = True agent_result: list[dict[str, Any]] = [] # Call the ComputerAgent LLM API async for result in self.computer_agent.run(messages): # type: ignore[arg-type] - agent_result.append(result) - # Add messages to output text - if result['type'] == 'reasoning': - output_text.extend( - f"Reasoning: {summary['text']}" - for summary in result['summary'] - ) - elif result['type'] == 'message': - if isinstance(result['content'], list): + items = result['output'] + if not items or tool_calls: + continue + + for item in items: + if item['type'] in ['reasoning', 'message', 'computer_call', 'function_call', 'function_call_output']: + agent_result.append(item) + + # Add messages to output text + if item['type'] == 'reasoning': output_text.extend( - item['text'] - for item in result['content'] - if item['type'] == 'output_text' + f"Reasoning: {summary['text']}" + for summary in item['summary'] ) - elif isinstance(result['content'], str): - output_text.append(result['content']) - # If we get a tool call, we're not done - if result['type'] == 'computer_call': - id = result["call_id"] - tool_calls.append(MCPToolCall( - name="openai_computer", - arguments=result["action"], - id=id, - )) - is_done = False - self.tool_call_inputs[id] = agent_result - break + elif item['type'] == 'message': + if isinstance(item['content'], list): + output_text.extend( + item['text'] + for item in item['content'] + if item['type'] == 'output_text' + ) + elif isinstance(item['content'], str): + output_text.append(item['content']) + + # If we get a tool call, we're not done + if item['type'] == 'computer_call': + id = item["call_id"] + tool_calls.append(MCPToolCall( + name="openai_computer", + arguments=result["action"], + id=id, + )) + is_done = False + self.tool_call_inputs[id] = agent_result + break return AgentResponse( content="\n".join(output_text), @@ -241,6 +252,7 @@ class MCPComputerAgent(MCPAgent): Expects results to already be in the message-format content dicts. Returns a list of input content dicts suitable for follow-up calls. """ + print("format_tool_results") messages = [] for call, result in zip(tool_calls, tool_results): From b3040306b8021aa455f4a80e77171847558d853f Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Fri, 12 Sep 2025 12:06:36 -0400 Subject: [PATCH 3/4] Fixing bugs --- .../agent/agent/integrations/hud/agent.py | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/libs/python/agent/agent/integrations/hud/agent.py b/libs/python/agent/agent/integrations/hud/agent.py index 5022b3dc..3196fc95 100644 --- a/libs/python/agent/agent/integrations/hud/agent.py +++ b/libs/python/agent/agent/integrations/hud/agent.py @@ -11,6 +11,7 @@ Key differences from the OpenAI OperatorAgent variant: """ from __future__ import annotations +import io from typing import Any, ClassVar, Optional from agent.agent import ComputerAgent as BaseComputerAgent @@ -109,9 +110,15 @@ class MCPComputerAgent(MCPAgent): if isinstance(trajectory_dir, dict): trajectory_dir["reset_on_run"] = False + self.last_screenshot_b64 = None + + buffer = io.BytesIO() + Image.new('RGB', (self.metadata["display_width"], self.metadata["display_height"])).save(buffer, format='PNG') + self.last_screenshot_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8') + # Ensure a computer shim is present so width/height/environment are known computer_shim = { - "screenshot": lambda: lambda: Image.new('RGB', (self.metadata["display_width"], self.metadata["display_height"])), + "screenshot": lambda: self.last_screenshot_b64, "environment": self.environment, "dimensions": ( self.metadata["display_width"], @@ -160,7 +167,6 @@ class MCPComputerAgent(MCPAgent): Converts TextContent blocks to input_text dicts and ImageContent blocks to input_image dicts. """ # noqa: E501 - print("format_blocks") formatted = [] for block in blocks: if isinstance(block, types.TextContent): @@ -170,6 +176,7 @@ class MCPComputerAgent(MCPAgent): formatted.append( {"type": "input_image", "image_url": f"data:{mime_type};base64,{block.data}"} ) + self.last_screenshot_b64 = block.data return [{"role": "user", "content": formatted}] @hud.instrument( @@ -183,7 +190,6 @@ class MCPComputerAgent(MCPAgent): Returns an Agent SDK-style response dict: { "output": [AgentMessage, ...], "usage": Usage } """ - print("get_response") tool_calls: list[MCPToolCall] = [] output_text: list[str] = [] is_done: bool = True @@ -194,7 +200,7 @@ class MCPComputerAgent(MCPAgent): async for result in self.computer_agent.run(messages): # type: ignore[arg-type] items = result['output'] if not items or tool_calls: - continue + break for item in items: if item['type'] in ['reasoning', 'message', 'computer_call', 'function_call', 'function_call_output']: @@ -221,12 +227,16 @@ class MCPComputerAgent(MCPAgent): id = item["call_id"] tool_calls.append(MCPToolCall( name="openai_computer", - arguments=result["action"], + arguments=item["action"], id=id, )) is_done = False self.tool_call_inputs[id] = agent_result break + + # if we have tool calls, we should exit the loop + if tool_calls: + break return AgentResponse( content="\n".join(output_text), @@ -252,7 +262,6 @@ class MCPComputerAgent(MCPAgent): Expects results to already be in the message-format content dicts. Returns a list of input content dicts suitable for follow-up calls. """ - print("format_tool_results") messages = [] for call, result in zip(tool_calls, tool_results): @@ -285,6 +294,7 @@ class MCPComputerAgent(MCPAgent): # Add the resulting screenshot if screenshots: self._log_image(screenshots[0]) + self.last_screenshot_b64 = screenshots[0] messages.append({ "type": "computer_call_output", "call_id": call.id, From faf531825ec7984f3a98c92afe6f87e494e7e895 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Fri, 12 Sep 2025 12:32:03 -0400 Subject: [PATCH 4/4] Fixed error during response call --- .../agent/agent/integrations/hud/agent.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/libs/python/agent/agent/integrations/hud/agent.py b/libs/python/agent/agent/integrations/hud/agent.py index 3196fc95..18a231c8 100644 --- a/libs/python/agent/agent/integrations/hud/agent.py +++ b/libs/python/agent/agent/integrations/hud/agent.py @@ -80,6 +80,7 @@ class MCPComputerAgent(MCPAgent): # Stateful tracking of tool call inputs self.tool_call_inputs: dict[str, list[dict[str, Any]]] = {} + self.previous_output: list[dict[str, Any]] = [] # Build system prompt operator_instructions = """ @@ -238,6 +239,8 @@ class MCPComputerAgent(MCPAgent): if tool_calls: break + self.previous_output = agent_result + return AgentResponse( content="\n".join(output_text), tool_calls=tool_calls, @@ -265,6 +268,32 @@ class MCPComputerAgent(MCPAgent): messages = [] for call, result in zip(tool_calls, tool_results): + if call.id not in self.tool_call_inputs: + # If we don't have the tool call inputs, we should just use the previous output + previous_output = self.previous_output.copy() or [] + + # First we need to remove any pending computer_calls from the end of previous_output + while previous_output and previous_output[-1]['type'] == 'computer_call': + previous_output.pop() + messages.extend(previous_output) + + # If the call is a 'response', don't add the result + if call.name == 'response': + continue + # Otherwise, if we have a result, we should add it to the messages + content = [ + { "type": "input_text", "text": content.text } if isinstance(content, types.TextContent) + else { "type": "input_image", "image_url": f"data:image/png;base64,{content.data}" } if isinstance(content, types.ImageContent) + else { "type": "input_text", "text": "" } + for content in result.content + ] + messages.append({ + "role": "user", + "content": content, + }) + + continue + # Add the assistant's computer call messages.extend(self.tool_call_inputs[call.id])