Removed pre/post action screenshots in tools. Improved image retention (#289)

Dillon DuPont
2025-06-14 10:05:12 -04:00
parent d21a4ca294
commit 7d267701a4
3 changed files with 57 additions and 214 deletions


@@ -81,16 +81,27 @@ class StandardMessageManager:
         if not self.config.num_images_to_keep:
             return messages

-        # Find user messages with images
+        # Find messages with images (both user messages and tool call outputs)
         image_messages = []
         for msg in messages:
+            has_image = False
+
+            # Check user messages with images
             if msg["role"] == "user" and isinstance(msg["content"], list):
                 has_image = any(
                     item.get("type") == "image_url" or item.get("type") == "image"
                     for item in msg["content"]
                 )
                 if has_image:
                     image_messages.append(msg)
+            # Check assistant messages with tool calls that have images
+            elif msg["role"] == "assistant" and isinstance(msg["content"], list):
+                for item in msg["content"]:
+                    if item.get("type") == "tool_result" and "base64_image" in item:
+                        has_image = True
+                        break
+                if has_image:
+                    image_messages.append(msg)

         # If we don't have more images than the limit, return all messages
         if len(image_messages) <= self.config.num_images_to_keep:
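For the new assistant branch, the membership test walks the content list with an explicit loop; the same check could be written with any(), matching the style of the user branch. A one-line equivalent, shown only as a sketch and not part of the commit:

```python
# Equivalent detection for tool-call outputs, condensed with any() (sketch).
has_image = any(
    item.get("type") == "tool_result" and "base64_image" in item
    for item in msg["content"]
)
```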
@@ -100,13 +111,35 @@ class StandardMessageManager:
         images_to_keep = image_messages[-self.config.num_images_to_keep :]
         images_to_remove = image_messages[: -self.config.num_images_to_keep]

-        # Create a new message list without the older images
+        # Create a new message list, removing images from older messages
         result = []
         for msg in messages:
             if msg in images_to_remove:
-                # Skip this message
-                continue
-            result.append(msg)
+                # Remove images from this message but keep the text content
+                if msg["role"] == "user" and isinstance(msg["content"], list):
+                    # Keep only text content, remove images
+                    new_content = [
+                        item for item in msg["content"]
+                        if item.get("type") not in ["image_url", "image"]
+                    ]
+                    if new_content:  # Only add if there's still content
+                        result.append({"role": msg["role"], "content": new_content})
+                elif msg["role"] == "assistant" and isinstance(msg["content"], list):
+                    # Remove base64_image from tool_result items
+                    new_content = []
+                    for item in msg["content"]:
+                        if item.get("type") == "tool_result" and "base64_image" in item:
+                            # Create a copy without the base64_image
+                            new_item = {k: v for k, v in item.items() if k != "base64_image"}
+                            new_content.append(new_item)
+                        else:
+                            new_content.append(item)
+                    result.append({"role": msg["role"], "content": new_content})
+                else:
+                    # For other message types, keep as is
+                    result.append(msg)
+            else:
+                result.append(msg)

         return result
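The effect of the new retention pass is that older image-bearing messages are rewritten rather than dropped. A small before/after illustration with hypothetical message shapes (only role, content, type, tool_result, and base64_image come from the code above; the other field names are illustrative):

```python
# Hypothetical "older" messages before pruning.
old_user_msg = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Here is the current screen"},
        {"type": "image", "data": "<base64 png>"},
    ],
}
old_tool_msg = {
    "role": "assistant",
    "content": [
        {"type": "tool_result", "output": "Performed left_click at (10, 20)", "base64_image": "<base64 png>"},
    ],
}

# After pruning, the user message keeps only its text part...
pruned_user_msg = {
    "role": "user",
    "content": [{"type": "text", "text": "Here is the current screen"}],
}
# ...and the tool result keeps its textual output but loses the screenshot payload.
pruned_tool_msg = {
    "role": "assistant",
    "content": [{"type": "tool_result", "output": "Performed left_click at (10, 20)"}],
}
```

Only the most recent num_images_to_keep image-bearing messages keep their screenshots; every older message keeps its text, so the conversational context survives the pruning.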


@@ -205,26 +205,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
self.logger.info(f" Coordinates: ({x}, {y})")
try:
# Take pre-action screenshot to get current dimensions
pre_screenshot = await self.computer.interface.screenshot()
pre_img = Image.open(io.BytesIO(pre_screenshot))
# Scale image to match screen dimensions if needed
if pre_img.size != (self.width, self.height):
self.logger.info(
f"Scaling image from {pre_img.size} to {self.width}x{self.height} to match screen dimensions"
)
if not isinstance(self.width, int) or not isinstance(self.height, int):
raise ToolError("Screen dimensions must be integers")
size = (int(self.width), int(self.height))
pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
# Save the scaled image back to bytes
buffer = io.BytesIO()
pre_img.save(buffer, format="PNG")
pre_screenshot = buffer.getvalue()
self.logger.info(f" Current dimensions: {pre_img.width}x{pre_img.height}")
# Perform the click action
if action == "left_click":
self.logger.info(f"Clicking at ({x}, {y})")
@@ -242,45 +222,14 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
# Wait briefly for any UI changes
await asyncio.sleep(0.5)
# Take and save post-action screenshot
post_screenshot = await self.computer.interface.screenshot()
post_img = Image.open(io.BytesIO(post_screenshot))
# Scale post-action image if needed
if post_img.size != (self.width, self.height):
self.logger.info(
f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
)
post_img = post_img.resize(
(self.width, self.height), Image.Resampling.LANCZOS
)
buffer = io.BytesIO()
post_img.save(buffer, format="PNG")
post_screenshot = buffer.getvalue()
return ToolResult(
output=f"Performed {action} at ({x}, {y})",
base64_image=base64.b64encode(post_screenshot).decode(),
)
except Exception as e:
self.logger.error(f"Error during {action} action: {str(e)}")
raise ToolError(f"Failed to perform {action}: {str(e)}")
else:
try:
# Take pre-action screenshot
pre_screenshot = await self.computer.interface.screenshot()
pre_img = Image.open(io.BytesIO(pre_screenshot))
# Scale image if needed
if pre_img.size != (self.width, self.height):
self.logger.info(
f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
)
if not isinstance(self.width, int) or not isinstance(self.height, int):
raise ToolError("Screen dimensions must be integers")
size = (int(self.width), int(self.height))
pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
# Perform the click action
if action == "left_click":
self.logger.info("Performing left click at current position")
@@ -295,25 +244,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
# Wait briefly for any UI changes
await asyncio.sleep(0.5)
# Take post-action screenshot
post_screenshot = await self.computer.interface.screenshot()
post_img = Image.open(io.BytesIO(post_screenshot))
# Scale post-action image if needed
if post_img.size != (self.width, self.height):
self.logger.info(
f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
)
post_img = post_img.resize(
(self.width, self.height), Image.Resampling.LANCZOS
)
buffer = io.BytesIO()
post_img.save(buffer, format="PNG")
post_screenshot = buffer.getvalue()
return ToolResult(
output=f"Performed {action} at current position",
base64_image=base64.b64encode(post_screenshot).decode(),
)
except Exception as e:
self.logger.error(f"Error during {action} action: {str(e)}")
@@ -328,20 +260,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
raise ToolError(f"{text} must be a string")
try:
# Take pre-action screenshot
pre_screenshot = await self.computer.interface.screenshot()
pre_img = Image.open(io.BytesIO(pre_screenshot))
# Scale image if needed
if pre_img.size != (self.width, self.height):
self.logger.info(
f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
)
if not isinstance(self.width, int) or not isinstance(self.height, int):
raise ToolError("Screen dimensions must be integers")
size = (int(self.width), int(self.height))
pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
if action == "key":
# Special handling for page up/down on macOS
if text.lower() in ["pagedown", "page_down", "page down"]:
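The key branch normalizes the loose spellings of Page Up/Down before dispatching, since macOS handles those keys specially. A hedged sketch of that normalization (only the page-down aliases appear in the diff; the canonical names and the page-up spellings are assumptions):

```python
# Alias table: the "pagedown" spellings come from the diff; the rest is assumed.
_KEY_ALIASES = {
    "pagedown": "pagedown",
    "page_down": "pagedown",
    "page down": "pagedown",
    "pageup": "pageup",
    "page_up": "pageup",
    "page up": "pageup",
}


def normalize_key(text: str) -> str:
    """Map a loosely spelled key name to a canonical form before pressing it (sketch)."""
    return _KEY_ALIASES.get(text.lower(), text.lower())
```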
@@ -378,25 +296,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
# Wait briefly for UI changes
await asyncio.sleep(0.5)
# Take post-action screenshot
post_screenshot = await self.computer.interface.screenshot()
post_img = Image.open(io.BytesIO(post_screenshot))
# Scale post-action image if needed
if post_img.size != (self.width, self.height):
self.logger.info(
f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
)
post_img = post_img.resize(
(self.width, self.height), Image.Resampling.LANCZOS
)
buffer = io.BytesIO()
post_img.save(buffer, format="PNG")
post_screenshot = buffer.getvalue()
return ToolResult(
output=f"Pressed key: {output_text}",
base64_image=base64.b64encode(post_screenshot).decode(),
)
elif action == "type":
@@ -406,66 +307,13 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
# Wait briefly for UI changes
await asyncio.sleep(0.5)
# Take post-action screenshot
post_screenshot = await self.computer.interface.screenshot()
post_img = Image.open(io.BytesIO(post_screenshot))
# Scale post-action image if needed
if post_img.size != (self.width, self.height):
self.logger.info(
f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
)
post_img = post_img.resize(
(self.width, self.height), Image.Resampling.LANCZOS
)
buffer = io.BytesIO()
post_img.save(buffer, format="PNG")
post_screenshot = buffer.getvalue()
return ToolResult(
output=f"Typed text: {text}",
base64_image=base64.b64encode(post_screenshot).decode(),
)
except Exception as e:
self.logger.error(f"Error during {action} action: {str(e)}")
raise ToolError(f"Failed to perform {action}: {str(e)}")
elif action in ("screenshot", "cursor_position"):
if text is not None:
raise ToolError(f"text is not accepted for {action}")
if coordinate is not None:
raise ToolError(f"coordinate is not accepted for {action}")
try:
if action == "screenshot":
# Take screenshot
screenshot = await self.computer.interface.screenshot()
img = Image.open(io.BytesIO(screenshot))
# Scale image if needed
if img.size != (self.width, self.height):
self.logger.info(
f"Scaling image from {img.size} to {self.width}x{self.height}"
)
if not isinstance(self.width, int) or not isinstance(self.height, int):
raise ToolError("Screen dimensions must be integers")
size = (int(self.width), int(self.height))
img = img.resize(size, Image.Resampling.LANCZOS)
buffer = io.BytesIO()
img.save(buffer, format="PNG")
screenshot = buffer.getvalue()
return ToolResult(base64_image=base64.b64encode(screenshot).decode())
elif action == "cursor_position":
pos = await self.computer.interface.get_cursor_position()
x, y = pos # Unpack the tuple
return ToolResult(output=f"X={int(x)},Y={int(y)}")
except Exception as e:
self.logger.error(f"Error during {action} action: {str(e)}")
raise ToolError(f"Failed to perform {action}: {str(e)}")
elif action == "scroll":
# Implement scroll action
direction = kwargs.get("direction", "down")
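The scroll branch reads its parameters from kwargs, defaulting the direction to "down". A small sketch of that parsing with basic validation (the amount default and the allowed directions are assumptions; ValueError stands in for ToolError):

```python
def parse_scroll_kwargs(kwargs: dict) -> tuple[str, int]:
    """Extract and validate scroll parameters (sketch; defaults beyond "down" are assumed)."""
    direction = kwargs.get("direction", "down")
    amount = int(kwargs.get("amount", 1))
    if direction not in ("up", "down"):
        raise ValueError(f"Invalid scroll direction: {direction}")
    if amount < 1:
        raise ValueError(f"Invalid scroll amount: {amount}")
    return direction, amount
```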
@@ -487,28 +335,20 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
# Wait briefly for UI changes
await asyncio.sleep(0.5)
# Take post-action screenshot
post_screenshot = await self.computer.interface.screenshot()
post_img = Image.open(io.BytesIO(post_screenshot))
# Scale post-action image if needed
if post_img.size != (self.width, self.height):
self.logger.info(
f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
)
post_img = post_img.resize((self.width, self.height), Image.Resampling.LANCZOS)
buffer = io.BytesIO()
post_img.save(buffer, format="PNG")
post_screenshot = buffer.getvalue()
return ToolResult(
output=f"Scrolled {direction} by {amount} steps",
base64_image=base64.b64encode(post_screenshot).decode(),
)
except Exception as e:
self.logger.error(f"Error during scroll action: {str(e)}")
raise ToolError(f"Failed to perform scroll: {str(e)}")
elif action == "screenshot":
# Take screenshot
return await self.screenshot()
elif action == "cursor_position":
pos = await self.computer.interface.get_cursor_position()
x, y = pos # Unpack the tuple
return ToolResult(output=f"X={int(x)},Y={int(y)}")
raise ToolError(f"Invalid action: {action}")
async def screenshot(self):
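With the pre/post screenshots gone, an action in this tool now performs the interface call, waits briefly, and returns a text-only ToolResult; fresh pixels come only from the explicit screenshot action shown above. A condensed sketch of that flow for a coordinate click (the free-function form is illustrative; the interface call, the 0.5 s wait, and the output string follow the diff):

```python
import asyncio


async def left_click_without_screenshot(computer, x: int, y: int) -> "ToolResult":
    """Sketch of the post-change flow: act, wait briefly, return text only."""
    await computer.interface.left_click(x, y)
    await asyncio.sleep(0.5)  # give the UI a moment to settle
    # No screenshot is attached here; the agent requests one via the dedicated
    # "screenshot" action when it actually needs to see the screen.
    return ToolResult(output=f"Performed left_click at ({x}, {y})")  # ToolResult is the tools' existing result type
```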


@@ -61,9 +61,6 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
computer: Computer # The CUA Computer instance
logger = logging.getLogger(__name__)
_screenshot_delay = 1.0 # macOS is generally faster than X11
_scaling_enabled = True
def __init__(self, computer: Computer):
"""Initialize the computer tool.
@@ -185,26 +182,23 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
raise ToolError(f"Failed to execute {type}: {str(e)}")
async def handle_click(self, button: str, x: int, y: int) -> ToolResult:
"""Handle different click actions."""
"""Handle mouse clicks."""
try:
# Perform requested click action
# Perform the click based on button type
if button == "left":
await self.computer.interface.left_click(x, y)
elif button == "right":
await self.computer.interface.right_click(x, y)
elif button == "double":
await self.computer.interface.double_click(x, y)
else:
raise ToolError(f"Unsupported button type: {button}")
# Wait for UI to update
await asyncio.sleep(0.5)
# Take screenshot after action
screenshot = await self.computer.interface.screenshot()
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
# Wait briefly for UI to update
await asyncio.sleep(0.3)
return ToolResult(
output=f"Performed {button} click at ({x}, {y})",
base64_image=base64_screenshot,
)
except Exception as e:
self.logger.error(f"Error in handle_click: {str(e)}")
@@ -218,11 +212,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
await asyncio.sleep(0.3)
# Take screenshot after typing
screenshot = await self.computer.interface.screenshot()
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
return ToolResult(output=f"Typed: {text}", base64_image=base64_screenshot)
return ToolResult(output=f"Typed: {text}")
except Exception as e:
self.logger.error(f"Error in handle_typing: {str(e)}")
raise ToolError(f"Failed to type '{text}': {str(e)}")
@@ -254,11 +244,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
# Wait briefly
await asyncio.sleep(0.3)
# Take screenshot after action
screenshot = await self.computer.interface.screenshot()
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
return ToolResult(output=f"Pressed key: {key}", base64_image=base64_screenshot)
return ToolResult(output=f"Pressed key: {key}")
except Exception as e:
self.logger.error(f"Error in handle_key: {str(e)}")
raise ToolError(f"Failed to press key '{key}': {str(e)}")
@@ -272,11 +258,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
# Wait briefly
await asyncio.sleep(0.2)
# Take screenshot after action
screenshot = await self.computer.interface.screenshot()
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
return ToolResult(output=f"Moved cursor to ({x}, {y})", base64_image=base64_screenshot)
return ToolResult(output=f"Moved cursor to ({x}, {y})")
except Exception as e:
self.logger.error(f"Error in handle_mouse_move: {str(e)}")
raise ToolError(f"Failed to move cursor to ({x}, {y}): {str(e)}")
@@ -296,14 +278,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
# Wait for UI to update
await asyncio.sleep(0.5)
# Take screenshot after action
screenshot = await self.computer.interface.screenshot()
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
return ToolResult(
output=f"Scrolled at ({x}, {y}) with delta ({scroll_x}, {scroll_y})",
base64_image=base64_screenshot,
)
return ToolResult(output=f"Scrolled at ({x}, {y}) by ({scroll_x}, {scroll_y})")
except Exception as e:
self.logger.error(f"Error in handle_scroll: {str(e)}")
raise ToolError(f"Failed to scroll at ({x}, {y}): {str(e)}")
@@ -331,13 +306,8 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
# Wait for UI to update
await asyncio.sleep(0.5)
# Take screenshot after action
screenshot = await self.computer.interface.screenshot()
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
return ToolResult(
output=f"Dragged from ({path[0]['x']}, {path[0]['y']}) to ({path[-1]['x']}, {path[-1]['y']})",
base64_image=base64_screenshot,
)
except Exception as e:
self.logger.error(f"Error in handle_drag: {str(e)}")