From 7d267701a4bfcd3f09be3e75d861c2cc86c2241e Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Sat, 14 Jun 2025 10:05:12 -0400 Subject: [PATCH] Removed pre/post action screenshots in tools. Improved image retention (#289) --- libs/agent/agent/core/messages.py | 47 ++++- .../providers/anthropic/tools/computer.py | 174 +----------------- .../agent/providers/openai/tools/computer.py | 50 +---- 3 files changed, 57 insertions(+), 214 deletions(-) diff --git a/libs/agent/agent/core/messages.py b/libs/agent/agent/core/messages.py index 2a582a7a..d2c70558 100644 --- a/libs/agent/agent/core/messages.py +++ b/libs/agent/agent/core/messages.py @@ -81,16 +81,27 @@ class StandardMessageManager: if not self.config.num_images_to_keep: return messages - # Find user messages with images + # Find messages with images (both user messages and tool call outputs) image_messages = [] for msg in messages: + has_image = False + + # Check user messages with images if msg["role"] == "user" and isinstance(msg["content"], list): has_image = any( item.get("type") == "image_url" or item.get("type") == "image" for item in msg["content"] ) - if has_image: - image_messages.append(msg) + + # Check assistant messages with tool calls that have images + elif msg["role"] == "assistant" and isinstance(msg["content"], list): + for item in msg["content"]: + if item.get("type") == "tool_result" and "base64_image" in item: + has_image = True + break + + if has_image: + image_messages.append(msg) # If we don't have more images than the limit, return all messages if len(image_messages) <= self.config.num_images_to_keep: @@ -100,13 +111,35 @@ class StandardMessageManager: images_to_keep = image_messages[-self.config.num_images_to_keep :] images_to_remove = image_messages[: -self.config.num_images_to_keep] - # Create a new message list without the older images + # Create a new message list, removing images from older messages result = [] for msg in messages: if msg in images_to_remove: - # Skip this message - continue - result.append(msg) + # Remove images from this message but keep the text content + if msg["role"] == "user" and isinstance(msg["content"], list): + # Keep only text content, remove images + new_content = [ + item for item in msg["content"] + if item.get("type") not in ["image_url", "image"] + ] + if new_content: # Only add if there's still content + result.append({"role": msg["role"], "content": new_content}) + elif msg["role"] == "assistant" and isinstance(msg["content"], list): + # Remove base64_image from tool_result items + new_content = [] + for item in msg["content"]: + if item.get("type") == "tool_result" and "base64_image" in item: + # Create a copy without the base64_image + new_item = {k: v for k, v in item.items() if k != "base64_image"} + new_content.append(new_item) + else: + new_content.append(item) + result.append({"role": msg["role"], "content": new_content}) + else: + # For other message types, keep as is + result.append(msg) + else: + result.append(msg) return result diff --git a/libs/agent/agent/providers/anthropic/tools/computer.py b/libs/agent/agent/providers/anthropic/tools/computer.py index 2bb944ea..dd1dc281 100644 --- a/libs/agent/agent/providers/anthropic/tools/computer.py +++ b/libs/agent/agent/providers/anthropic/tools/computer.py @@ -205,26 +205,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool): self.logger.info(f" Coordinates: ({x}, {y})") try: - # Take pre-action screenshot to get current dimensions - pre_screenshot = await self.computer.interface.screenshot() - pre_img = 
Image.open(io.BytesIO(pre_screenshot)) - - # Scale image to match screen dimensions if needed - if pre_img.size != (self.width, self.height): - self.logger.info( - f"Scaling image from {pre_img.size} to {self.width}x{self.height} to match screen dimensions" - ) - if not isinstance(self.width, int) or not isinstance(self.height, int): - raise ToolError("Screen dimensions must be integers") - size = (int(self.width), int(self.height)) - pre_img = pre_img.resize(size, Image.Resampling.LANCZOS) - # Save the scaled image back to bytes - buffer = io.BytesIO() - pre_img.save(buffer, format="PNG") - pre_screenshot = buffer.getvalue() - - self.logger.info(f" Current dimensions: {pre_img.width}x{pre_img.height}") - # Perform the click action if action == "left_click": self.logger.info(f"Clicking at ({x}, {y})") @@ -242,45 +222,14 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool): # Wait briefly for any UI changes await asyncio.sleep(0.5) - # Take and save post-action screenshot - post_screenshot = await self.computer.interface.screenshot() - post_img = Image.open(io.BytesIO(post_screenshot)) - - # Scale post-action image if needed - if post_img.size != (self.width, self.height): - self.logger.info( - f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}" - ) - post_img = post_img.resize( - (self.width, self.height), Image.Resampling.LANCZOS - ) - buffer = io.BytesIO() - post_img.save(buffer, format="PNG") - post_screenshot = buffer.getvalue() - return ToolResult( output=f"Performed {action} at ({x}, {y})", - base64_image=base64.b64encode(post_screenshot).decode(), ) except Exception as e: self.logger.error(f"Error during {action} action: {str(e)}") raise ToolError(f"Failed to perform {action}: {str(e)}") else: try: - # Take pre-action screenshot - pre_screenshot = await self.computer.interface.screenshot() - pre_img = Image.open(io.BytesIO(pre_screenshot)) - - # Scale image if needed - if pre_img.size != (self.width, self.height): - self.logger.info( - f"Scaling image from {pre_img.size} to {self.width}x{self.height}" - ) - if not isinstance(self.width, int) or not isinstance(self.height, int): - raise ToolError("Screen dimensions must be integers") - size = (int(self.width), int(self.height)) - pre_img = pre_img.resize(size, Image.Resampling.LANCZOS) - # Perform the click action if action == "left_click": self.logger.info("Performing left click at current position") @@ -295,25 +244,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool): # Wait briefly for any UI changes await asyncio.sleep(0.5) - # Take post-action screenshot - post_screenshot = await self.computer.interface.screenshot() - post_img = Image.open(io.BytesIO(post_screenshot)) - - # Scale post-action image if needed - if post_img.size != (self.width, self.height): - self.logger.info( - f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}" - ) - post_img = post_img.resize( - (self.width, self.height), Image.Resampling.LANCZOS - ) - buffer = io.BytesIO() - post_img.save(buffer, format="PNG") - post_screenshot = buffer.getvalue() - return ToolResult( output=f"Performed {action} at current position", - base64_image=base64.b64encode(post_screenshot).decode(), ) except Exception as e: self.logger.error(f"Error during {action} action: {str(e)}") @@ -328,20 +260,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool): raise ToolError(f"{text} must be a string") try: - # Take pre-action screenshot - pre_screenshot = await self.computer.interface.screenshot() - pre_img = 
Image.open(io.BytesIO(pre_screenshot)) - - # Scale image if needed - if pre_img.size != (self.width, self.height): - self.logger.info( - f"Scaling image from {pre_img.size} to {self.width}x{self.height}" - ) - if not isinstance(self.width, int) or not isinstance(self.height, int): - raise ToolError("Screen dimensions must be integers") - size = (int(self.width), int(self.height)) - pre_img = pre_img.resize(size, Image.Resampling.LANCZOS) - if action == "key": # Special handling for page up/down on macOS if text.lower() in ["pagedown", "page_down", "page down"]: @@ -378,25 +296,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool): # Wait briefly for UI changes await asyncio.sleep(0.5) - # Take post-action screenshot - post_screenshot = await self.computer.interface.screenshot() - post_img = Image.open(io.BytesIO(post_screenshot)) - - # Scale post-action image if needed - if post_img.size != (self.width, self.height): - self.logger.info( - f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}" - ) - post_img = post_img.resize( - (self.width, self.height), Image.Resampling.LANCZOS - ) - buffer = io.BytesIO() - post_img.save(buffer, format="PNG") - post_screenshot = buffer.getvalue() - return ToolResult( output=f"Pressed key: {output_text}", - base64_image=base64.b64encode(post_screenshot).decode(), ) elif action == "type": @@ -406,66 +307,13 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool): # Wait briefly for UI changes await asyncio.sleep(0.5) - # Take post-action screenshot - post_screenshot = await self.computer.interface.screenshot() - post_img = Image.open(io.BytesIO(post_screenshot)) - - # Scale post-action image if needed - if post_img.size != (self.width, self.height): - self.logger.info( - f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}" - ) - post_img = post_img.resize( - (self.width, self.height), Image.Resampling.LANCZOS - ) - buffer = io.BytesIO() - post_img.save(buffer, format="PNG") - post_screenshot = buffer.getvalue() - return ToolResult( output=f"Typed text: {text}", - base64_image=base64.b64encode(post_screenshot).decode(), ) except Exception as e: self.logger.error(f"Error during {action} action: {str(e)}") raise ToolError(f"Failed to perform {action}: {str(e)}") - elif action in ("screenshot", "cursor_position"): - if text is not None: - raise ToolError(f"text is not accepted for {action}") - if coordinate is not None: - raise ToolError(f"coordinate is not accepted for {action}") - - try: - if action == "screenshot": - # Take screenshot - screenshot = await self.computer.interface.screenshot() - img = Image.open(io.BytesIO(screenshot)) - - # Scale image if needed - if img.size != (self.width, self.height): - self.logger.info( - f"Scaling image from {img.size} to {self.width}x{self.height}" - ) - if not isinstance(self.width, int) or not isinstance(self.height, int): - raise ToolError("Screen dimensions must be integers") - size = (int(self.width), int(self.height)) - img = img.resize(size, Image.Resampling.LANCZOS) - buffer = io.BytesIO() - img.save(buffer, format="PNG") - screenshot = buffer.getvalue() - - return ToolResult(base64_image=base64.b64encode(screenshot).decode()) - - elif action == "cursor_position": - pos = await self.computer.interface.get_cursor_position() - x, y = pos # Unpack the tuple - return ToolResult(output=f"X={int(x)},Y={int(y)}") - - except Exception as e: - self.logger.error(f"Error during {action} action: {str(e)}") - raise ToolError(f"Failed to perform {action}: 
{str(e)}") - elif action == "scroll": # Implement scroll action direction = kwargs.get("direction", "down") @@ -487,28 +335,20 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool): # Wait briefly for UI changes await asyncio.sleep(0.5) - # Take post-action screenshot - post_screenshot = await self.computer.interface.screenshot() - post_img = Image.open(io.BytesIO(post_screenshot)) - - # Scale post-action image if needed - if post_img.size != (self.width, self.height): - self.logger.info( - f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}" - ) - post_img = post_img.resize((self.width, self.height), Image.Resampling.LANCZOS) - buffer = io.BytesIO() - post_img.save(buffer, format="PNG") - post_screenshot = buffer.getvalue() - return ToolResult( output=f"Scrolled {direction} by {amount} steps", - base64_image=base64.b64encode(post_screenshot).decode(), ) except Exception as e: self.logger.error(f"Error during scroll action: {str(e)}") raise ToolError(f"Failed to perform scroll: {str(e)}") + elif action == "screenshot": + # Take screenshot + return await self.screenshot() + elif action == "cursor_position": + pos = await self.computer.interface.get_cursor_position() + x, y = pos # Unpack the tuple + return ToolResult(output=f"X={int(x)},Y={int(y)}") raise ToolError(f"Invalid action: {action}") async def screenshot(self): diff --git a/libs/agent/agent/providers/openai/tools/computer.py b/libs/agent/agent/providers/openai/tools/computer.py index c5602f4e..5575c792 100644 --- a/libs/agent/agent/providers/openai/tools/computer.py +++ b/libs/agent/agent/providers/openai/tools/computer.py @@ -61,9 +61,6 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool): computer: Computer # The CUA Computer instance logger = logging.getLogger(__name__) - _screenshot_delay = 1.0 # macOS is generally faster than X11 - _scaling_enabled = True - def __init__(self, computer: Computer): """Initialize the computer tool. 
@@ -185,26 +182,23 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool): raise ToolError(f"Failed to execute {type}: {str(e)}") async def handle_click(self, button: str, x: int, y: int) -> ToolResult: - """Handle different click actions.""" + """Handle mouse clicks.""" try: - # Perform requested click action + # Perform the click based on button type if button == "left": await self.computer.interface.left_click(x, y) elif button == "right": await self.computer.interface.right_click(x, y) elif button == "double": await self.computer.interface.double_click(x, y) + else: + raise ToolError(f"Unsupported button type: {button}") - # Wait for UI to update - await asyncio.sleep(0.5) - - # Take screenshot after action - screenshot = await self.computer.interface.screenshot() - base64_screenshot = base64.b64encode(screenshot).decode("utf-8") + # Wait briefly for UI to update + await asyncio.sleep(0.3) return ToolResult( output=f"Performed {button} click at ({x}, {y})", - base64_image=base64_screenshot, ) except Exception as e: self.logger.error(f"Error in handle_click: {str(e)}") @@ -218,11 +212,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool): await asyncio.sleep(0.3) - # Take screenshot after typing - screenshot = await self.computer.interface.screenshot() - base64_screenshot = base64.b64encode(screenshot).decode("utf-8") - - return ToolResult(output=f"Typed: {text}", base64_image=base64_screenshot) + return ToolResult(output=f"Typed: {text}") except Exception as e: self.logger.error(f"Error in handle_typing: {str(e)}") raise ToolError(f"Failed to type '{text}': {str(e)}") @@ -254,11 +244,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool): # Wait briefly await asyncio.sleep(0.3) - # Take screenshot after action - screenshot = await self.computer.interface.screenshot() - base64_screenshot = base64.b64encode(screenshot).decode("utf-8") - - return ToolResult(output=f"Pressed key: {key}", base64_image=base64_screenshot) + return ToolResult(output=f"Pressed key: {key}") except Exception as e: self.logger.error(f"Error in handle_key: {str(e)}") raise ToolError(f"Failed to press key '{key}': {str(e)}") @@ -272,11 +258,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool): # Wait briefly await asyncio.sleep(0.2) - # Take screenshot after action - screenshot = await self.computer.interface.screenshot() - base64_screenshot = base64.b64encode(screenshot).decode("utf-8") - - return ToolResult(output=f"Moved cursor to ({x}, {y})", base64_image=base64_screenshot) + return ToolResult(output=f"Moved cursor to ({x}, {y})") except Exception as e: self.logger.error(f"Error in handle_mouse_move: {str(e)}") raise ToolError(f"Failed to move cursor to ({x}, {y}): {str(e)}") @@ -296,14 +278,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool): # Wait for UI to update await asyncio.sleep(0.5) - # Take screenshot after action - screenshot = await self.computer.interface.screenshot() - base64_screenshot = base64.b64encode(screenshot).decode("utf-8") - - return ToolResult( - output=f"Scrolled at ({x}, {y}) with delta ({scroll_x}, {scroll_y})", - base64_image=base64_screenshot, - ) + return ToolResult(output=f"Scrolled at ({x}, {y}) by ({scroll_x}, {scroll_y})") except Exception as e: self.logger.error(f"Error in handle_scroll: {str(e)}") raise ToolError(f"Failed to scroll at ({x}, {y}): {str(e)}") @@ -331,13 +306,8 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool): # Wait for UI to update await asyncio.sleep(0.5) - # Take screenshot after action - screenshot = await 
self.computer.interface.screenshot() - base64_screenshot = base64.b64encode(screenshot).decode("utf-8") - return ToolResult( output=f"Dragged from ({path[0]['x']}, {path[0]['y']}) to ({path[-1]['x']}, {path[-1]['y']})", - base64_image=base64_screenshot, ) except Exception as e: self.logger.error(f"Error in handle_drag: {str(e)}")
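
Note on the retention change in libs/agent/agent/core/messages.py above: once the image budget is exceeded, the reworked pruning no longer drops whole older messages. It now also counts images carried by assistant tool_result items, and it strips only the image payloads (user "image"/"image_url" items and the "base64_image" key on tool_result items) while keeping the surrounding text. A minimal, self-contained sketch of that behavior follows; the function name filter_images, the num_images_to_keep parameter, and the toy transcript are illustrative stand-ins for this note, not the patched StandardMessageManager API.

from typing import Any, Dict, List


def filter_images(messages: List[Dict[str, Any]], num_images_to_keep: int) -> List[Dict[str, Any]]:
    """Keep only the newest `num_images_to_keep` images in a message history.

    Older messages survive, but their image payloads (user image items and
    the base64_image key on assistant tool_result items) are removed.
    """
    if not num_images_to_keep:
        return messages

    # Collect messages that currently carry an image, oldest first.
    image_messages = []
    for msg in messages:
        content = msg.get("content")
        if not isinstance(content, list):
            continue
        if msg["role"] == "user" and any(
            item.get("type") in ("image", "image_url") for item in content
        ):
            image_messages.append(msg)
        elif msg["role"] == "assistant" and any(
            item.get("type") == "tool_result" and "base64_image" in item
            for item in content
        ):
            image_messages.append(msg)

    # Nothing to prune if we are within budget.
    if len(image_messages) <= num_images_to_keep:
        return messages

    to_strip = image_messages[:-num_images_to_keep]

    result = []
    for msg in messages:
        if msg not in to_strip:
            result.append(msg)
            continue
        if msg["role"] == "user":
            # Drop image items, keep any remaining text items.
            new_content = [
                item for item in msg["content"]
                if item.get("type") not in ("image", "image_url")
            ]
            if new_content:
                result.append({"role": "user", "content": new_content})
        else:
            # Drop only the base64_image payload from tool_result items.
            new_content = [
                {k: v for k, v in item.items() if k != "base64_image"}
                if item.get("type") == "tool_result" and "base64_image" in item
                else item
                for item in msg["content"]
            ]
            result.append({"role": "assistant", "content": new_content})
    return result


# Toy transcript (hypothetical data, for illustration only):
history = [
    {"role": "user", "content": [{"type": "text", "text": "Open the settings page"}]},
    {"role": "assistant", "content": [
        {"type": "tool_result", "text": "Clicked Settings", "base64_image": "<old screenshot>"},
    ]},
    {"role": "assistant", "content": [
        {"type": "tool_result", "text": "Toggled dark mode", "base64_image": "<new screenshot>"},
    ]},
]

pruned = filter_images(history, num_images_to_keep=1)
# The first tool_result loses its base64_image but keeps "Clicked Settings";
# only the most recent tool_result retains its screenshot.

Run against the toy transcript, only the newest tool_result keeps its image; the older one keeps just its text, so the model still sees what actions were taken without retaining stale screenshots. This mirrors the intent of the patch: with the provider tools no longer attaching pre/post action screenshots to every ToolResult, the single screenshot action plus this text-preserving pruning bounds image usage.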