Fix missing screenshots in HUD agent user messages

2026-02-17 11:58:59 -06:00 · 2025-08-08 18:45:53 -04:00
parent f45f6b84e9
commit fa88836132
1 changed file with 28 additions and 2 deletions
--- a/libs/python/agent/agent/integrations/hud/agent.py
+++ b/libs/python/agent/agent/integrations/hud/agent.py
@@ -142,7 +142,20 @@ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
                else:
                    message = f"{self.base_system_prompt}\n\nPlease analyze the current screen and determine what action to take."
                
-                self.conversation_history.append({"role": "user", "content": message})
+                input_content = [
+                    {"type": "input_text", "text": message}
+                ]
+
+                # Add screenshot if present
+                if observation.screenshot:
+                    input_content.append(
+                        {
+                            "type": "input_image",
+                            "image_url": f"data:image/png;base64,{observation.screenshot}",
+                        }
+                    )
+
+                self.conversation_history.append({"role": "user", "content": input_content})                    
            else:
                # Subsequent interactions - check if last action was computer_call
                # If so, add computer_call_output with screenshot instead of user message
@@ -176,7 +189,20 @@ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
                else:
                    # No computer_call found, add regular user message
                    message = "Continue with the task based on the current screen state."
-                    self.conversation_history.append({"role": "user", "content": message})
+                    input_content = [
+                        {"type": "input_text", "text": message}
+                    ]
+
+                    # Add screenshot if present
+                    if observation.screenshot:
+                        input_content.append(
+                            {
+                                "type": "input_image",
+                                "image_url": f"data:image/png;base64,{observation.screenshot}",
+                            }
+                        )
+
+                    self.conversation_history.append({"role": "user", "content": input_content})                  

            # Run ComputerAgent
            try: