From fa888361321efcc9fea2ae493da44ae140ba7449 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Fri, 8 Aug 2025 18:45:53 -0400 Subject: [PATCH] fixed missing screenshots in agent --- .../agent/agent/integrations/hud/agent.py | 30 +++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/libs/python/agent/agent/integrations/hud/agent.py b/libs/python/agent/agent/integrations/hud/agent.py index 9156cf4a..8ade909c 100644 --- a/libs/python/agent/agent/integrations/hud/agent.py +++ b/libs/python/agent/agent/integrations/hud/agent.py @@ -142,7 +142,20 @@ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]): else: message = f"{self.base_system_prompt}\n\nPlease analyze the current screen and determine what action to take." - self.conversation_history.append({"role": "user", "content": message}) + input_content = [ + {"type": "input_text", "text": message} + ] + + # Add screenshot if present + if observation.screenshot: + input_content.append( + { + "type": "input_image", + "image_url": f"data:image/png;base64,{observation.screenshot}", + } + ) + + self.conversation_history.append({"role": "user", "content": input_content}) else: # Subsequent interactions - check if last action was computer_call # If so, add computer_call_output with screenshot instead of user message @@ -176,7 +189,20 @@ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]): else: # No computer_call found, add regular user message message = "Continue with the task based on the current screen state." - self.conversation_history.append({"role": "user", "content": message}) + input_content = [ + {"type": "input_text", "text": message} + ] + + # Add screenshot if present + if observation.screenshot: + input_content.append( + { + "type": "input_image", + "image_url": f"data:image/png;base64,{observation.screenshot}", + } + ) + + self.conversation_history.append({"role": "user", "content": input_content}) # Run ComputerAgent try: