Removed pre/post action screenshots in tools. Improved image retention (#289)

Dillon DuPont
2025-06-14 10:05:12 -04:00
parent d21a4ca294
commit 7d267701a4
3 changed files with 57 additions and 214 deletions


@@ -81,16 +81,27 @@ class StandardMessageManager:
         if not self.config.num_images_to_keep:
             return messages

-        # Find user messages with images
+        # Find messages with images (both user messages and tool call outputs)
         image_messages = []
         for msg in messages:
+            has_image = False
+
+            # Check user messages with images
             if msg["role"] == "user" and isinstance(msg["content"], list):
                 has_image = any(
                     item.get("type") == "image_url" or item.get("type") == "image"
                     for item in msg["content"]
                 )
                 if has_image:
                     image_messages.append(msg)
+            # Check assistant messages with tool calls that have images
+            elif msg["role"] == "assistant" and isinstance(msg["content"], list):
+                for item in msg["content"]:
+                    if item.get("type") == "tool_result" and "base64_image" in item:
+                        has_image = True
+                        break
+                if has_image:
+                    image_messages.append(msg)

         # If we don't have more images than the limit, return all messages
         if len(image_messages) <= self.config.num_images_to_keep:
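For the new assistant branch, the membership test walks the content list with an explicit loop; the same check could be written with any(), matching the style of the user branch. A one-line equivalent, shown only as a sketch and not part of the commit:

```python
# Equivalent detection for tool-call outputs, condensed with any() (sketch).
has_image = any(
    item.get("type") == "tool_result" and "base64_image" in item
    for item in msg["content"]
)
```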
@@ -100,13 +111,35 @@ class StandardMessageManager:
         images_to_keep = image_messages[-self.config.num_images_to_keep :]
         images_to_remove = image_messages[: -self.config.num_images_to_keep]

-        # Create a new message list without the older images
+        # Create a new message list, removing images from older messages
         result = []
         for msg in messages:
             if msg in images_to_remove:
-                # Skip this message
-                continue
-            result.append(msg)
+                # Remove images from this message but keep the text content
+                if msg["role"] == "user" and isinstance(msg["content"], list):
+                    # Keep only text content, remove images
+                    new_content = [
+                        item for item in msg["content"]
+                        if item.get("type") not in ["image_url", "image"]
+                    ]
+                    if new_content:  # Only add if there's still content
+                        result.append({"role": msg["role"], "content": new_content})
+                elif msg["role"] == "assistant" and isinstance(msg["content"], list):
+                    # Remove base64_image from tool_result items
+                    new_content = []
+                    for item in msg["content"]:
+                        if item.get("type") == "tool_result" and "base64_image" in item:
+                            # Create a copy without the base64_image
+                            new_item = {k: v for k, v in item.items() if k != "base64_image"}
+                            new_content.append(new_item)
+                        else:
+                            new_content.append(item)
+                    result.append({"role": msg["role"], "content": new_content})
+                else:
+                    # For other message types, keep as is
+                    result.append(msg)
+            else:
+                result.append(msg)

         return result
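The effect of the new retention pass is that older image-bearing messages are rewritten rather than dropped. A small before/after illustration with hypothetical message shapes (only role, content, type, tool_result, and base64_image come from the code above; the other field names are illustrative):

```python
# Hypothetical "older" messages before pruning.
old_user_msg = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Here is the current screen"},
        {"type": "image", "data": "<base64 png>"},
    ],
}
old_tool_msg = {
    "role": "assistant",
    "content": [
        {"type": "tool_result", "output": "Performed left_click at (10, 20)", "base64_image": "<base64 png>"},
    ],
}

# After pruning, the user message keeps only its text part...
pruned_user_msg = {
    "role": "user",
    "content": [{"type": "text", "text": "Here is the current screen"}],
}
# ...and the tool result keeps its textual output but loses the screenshot payload.
pruned_tool_msg = {
    "role": "assistant",
    "content": [{"type": "tool_result", "output": "Performed left_click at (10, 20)"}],
}
```

Only the most recent num_images_to_keep image-bearing messages keep their screenshots; every older message keeps its text, so the conversational context survives the pruning.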


@@ -205,26 +205,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
self.logger.info(f" Coordinates: ({x}, {y})")
try:
# Take pre-action screenshot to get current dimensions
pre_screenshot = await self.computer.interface.screenshot()
pre_img = Image.open(io.BytesIO(pre_screenshot))
# Scale image to match screen dimensions if needed
if pre_img.size != (self.width, self.height):
self.logger.info(
f"Scaling image from {pre_img.size} to {self.width}x{self.height} to match screen dimensions"
)
if not isinstance(self.width, int) or not isinstance(self.height, int):
raise ToolError("Screen dimensions must be integers")
size = (int(self.width), int(self.height))
pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
# Save the scaled image back to bytes
buffer = io.BytesIO()
pre_img.save(buffer, format="PNG")
pre_screenshot = buffer.getvalue()
self.logger.info(f" Current dimensions: {pre_img.width}x{pre_img.height}")
# Perform the click action
if action == "left_click":
self.logger.info(f"Clicking at ({x}, {y})")
@@ -242,45 +222,14 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
# Wait briefly for any UI changes
await asyncio.sleep(0.5)
# Take and save post-action screenshot
post_screenshot = await self.computer.interface.screenshot()
post_img = Image.open(io.BytesIO(post_screenshot))
# Scale post-action image if needed
if post_img.size != (self.width, self.height):
self.logger.info(
f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
)
post_img = post_img.resize(
(self.width, self.height), Image.Resampling.LANCZOS
)
buffer = io.BytesIO()
post_img.save(buffer, format="PNG")
post_screenshot = buffer.getvalue()
return ToolResult(
output=f"Performed {action} at ({x}, {y})",
base64_image=base64.b64encode(post_screenshot).decode(),
)
except Exception as e:
self.logger.error(f"Error during {action} action: {str(e)}")
raise ToolError(f"Failed to perform {action}: {str(e)}")
else:
try:
# Take pre-action screenshot
pre_screenshot = await self.computer.interface.screenshot()
pre_img = Image.open(io.BytesIO(pre_screenshot))
# Scale image if needed
if pre_img.size != (self.width, self.height):
self.logger.info(
f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
)
if not isinstance(self.width, int) or not isinstance(self.height, int):
raise ToolError("Screen dimensions must be integers")
size = (int(self.width), int(self.height))
pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
# Perform the click action
if action == "left_click":
self.logger.info("Performing left click at current position")
@@ -295,25 +244,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
# Wait briefly for any UI changes
await asyncio.sleep(0.5)
# Take post-action screenshot
post_screenshot = await self.computer.interface.screenshot()
post_img = Image.open(io.BytesIO(post_screenshot))
# Scale post-action image if needed
if post_img.size != (self.width, self.height):
self.logger.info(
f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
)
post_img = post_img.resize(
(self.width, self.height), Image.Resampling.LANCZOS
)
buffer = io.BytesIO()
post_img.save(buffer, format="PNG")
post_screenshot = buffer.getvalue()
return ToolResult(
output=f"Performed {action} at current position",
base64_image=base64.b64encode(post_screenshot).decode(),
)
except Exception as e:
self.logger.error(f"Error during {action} action: {str(e)}")
@@ -328,20 +260,6 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
raise ToolError(f"{text} must be a string")
try:
# Take pre-action screenshot
pre_screenshot = await self.computer.interface.screenshot()
pre_img = Image.open(io.BytesIO(pre_screenshot))
# Scale image if needed
if pre_img.size != (self.width, self.height):
self.logger.info(
f"Scaling image from {pre_img.size} to {self.width}x{self.height}"
)
if not isinstance(self.width, int) or not isinstance(self.height, int):
raise ToolError("Screen dimensions must be integers")
size = (int(self.width), int(self.height))
pre_img = pre_img.resize(size, Image.Resampling.LANCZOS)
if action == "key":
# Special handling for page up/down on macOS
if text.lower() in ["pagedown", "page_down", "page down"]:
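The key branch normalizes the loose spellings of Page Up/Down before dispatching, since macOS handles those keys specially. A hedged sketch of that normalization (only the page-down aliases appear in the diff; the canonical names and the page-up spellings are assumptions):

```python
# Alias table: the "pagedown" spellings come from the diff; the rest is assumed.
_KEY_ALIASES = {
    "pagedown": "pagedown",
    "page_down": "pagedown",
    "page down": "pagedown",
    "pageup": "pageup",
    "page_up": "pageup",
    "page up": "pageup",
}


def normalize_key(text: str) -> str:
    """Map a loosely spelled key name to a canonical form before pressing it (sketch)."""
    return _KEY_ALIASES.get(text.lower(), text.lower())
```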
@@ -378,25 +296,8 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
# Wait briefly for UI changes
await asyncio.sleep(0.5)
# Take post-action screenshot
post_screenshot = await self.computer.interface.screenshot()
post_img = Image.open(io.BytesIO(post_screenshot))
# Scale post-action image if needed
if post_img.size != (self.width, self.height):
self.logger.info(
f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
)
post_img = post_img.resize(
(self.width, self.height), Image.Resampling.LANCZOS
)
buffer = io.BytesIO()
post_img.save(buffer, format="PNG")
post_screenshot = buffer.getvalue()
return ToolResult(
output=f"Pressed key: {output_text}",
base64_image=base64.b64encode(post_screenshot).decode(),
)
elif action == "type":
@@ -406,66 +307,13 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
# Wait briefly for UI changes
await asyncio.sleep(0.5)
# Take post-action screenshot
post_screenshot = await self.computer.interface.screenshot()
post_img = Image.open(io.BytesIO(post_screenshot))
# Scale post-action image if needed
if post_img.size != (self.width, self.height):
self.logger.info(
f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
)
post_img = post_img.resize(
(self.width, self.height), Image.Resampling.LANCZOS
)
buffer = io.BytesIO()
post_img.save(buffer, format="PNG")
post_screenshot = buffer.getvalue()
return ToolResult(
output=f"Typed text: {text}",
base64_image=base64.b64encode(post_screenshot).decode(),
)
except Exception as e:
self.logger.error(f"Error during {action} action: {str(e)}")
raise ToolError(f"Failed to perform {action}: {str(e)}")
elif action in ("screenshot", "cursor_position"):
if text is not None:
raise ToolError(f"text is not accepted for {action}")
if coordinate is not None:
raise ToolError(f"coordinate is not accepted for {action}")
try:
if action == "screenshot":
# Take screenshot
screenshot = await self.computer.interface.screenshot()
img = Image.open(io.BytesIO(screenshot))
# Scale image if needed
if img.size != (self.width, self.height):
self.logger.info(
f"Scaling image from {img.size} to {self.width}x{self.height}"
)
if not isinstance(self.width, int) or not isinstance(self.height, int):
raise ToolError("Screen dimensions must be integers")
size = (int(self.width), int(self.height))
img = img.resize(size, Image.Resampling.LANCZOS)
buffer = io.BytesIO()
img.save(buffer, format="PNG")
screenshot = buffer.getvalue()
return ToolResult(base64_image=base64.b64encode(screenshot).decode())
elif action == "cursor_position":
pos = await self.computer.interface.get_cursor_position()
x, y = pos # Unpack the tuple
return ToolResult(output=f"X={int(x)},Y={int(y)}")
except Exception as e:
self.logger.error(f"Error during {action} action: {str(e)}")
raise ToolError(f"Failed to perform {action}: {str(e)}")
elif action == "scroll":
# Implement scroll action
direction = kwargs.get("direction", "down")
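The scroll branch reads its parameters from kwargs, defaulting the direction to "down". A small sketch of that parsing with basic validation (the amount default and the allowed directions are assumptions; ValueError stands in for ToolError):

```python
def parse_scroll_kwargs(kwargs: dict) -> tuple[str, int]:
    """Extract and validate scroll parameters (sketch; defaults beyond "down" are assumed)."""
    direction = kwargs.get("direction", "down")
    amount = int(kwargs.get("amount", 1))
    if direction not in ("up", "down"):
        raise ValueError(f"Invalid scroll direction: {direction}")
    if amount < 1:
        raise ValueError(f"Invalid scroll amount: {amount}")
    return direction, amount
```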
@@ -487,28 +335,20 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
# Wait briefly for UI changes
await asyncio.sleep(0.5)
# Take post-action screenshot
post_screenshot = await self.computer.interface.screenshot()
post_img = Image.open(io.BytesIO(post_screenshot))
# Scale post-action image if needed
if post_img.size != (self.width, self.height):
self.logger.info(
f"Scaling post-action image from {post_img.size} to {self.width}x{self.height}"
)
post_img = post_img.resize((self.width, self.height), Image.Resampling.LANCZOS)
buffer = io.BytesIO()
post_img.save(buffer, format="PNG")
post_screenshot = buffer.getvalue()
return ToolResult(
output=f"Scrolled {direction} by {amount} steps",
base64_image=base64.b64encode(post_screenshot).decode(),
)
except Exception as e:
self.logger.error(f"Error during scroll action: {str(e)}")
raise ToolError(f"Failed to perform scroll: {str(e)}")
elif action == "screenshot":
# Take screenshot
return await self.screenshot()
elif action == "cursor_position":
pos = await self.computer.interface.get_cursor_position()
x, y = pos # Unpack the tuple
return ToolResult(output=f"X={int(x)},Y={int(y)}")
raise ToolError(f"Invalid action: {action}")
async def screenshot(self):
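With the pre/post screenshots gone, an action in this tool now performs the interface call, waits briefly, and returns a text-only ToolResult; fresh pixels come only from the explicit screenshot action shown above. A condensed sketch of that flow for a coordinate click (the free-function form is illustrative; the interface call, the 0.5 s wait, and the output string follow the diff):

```python
import asyncio


async def left_click_without_screenshot(computer, x: int, y: int) -> "ToolResult":
    """Sketch of the post-change flow: act, wait briefly, return text only."""
    await computer.interface.left_click(x, y)
    await asyncio.sleep(0.5)  # give the UI a moment to settle
    # No screenshot is attached here; the agent requests one via the dedicated
    # "screenshot" action when it actually needs to see the screen.
    return ToolResult(output=f"Performed left_click at ({x}, {y})")  # ToolResult is the tools' existing result type
```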


@@ -61,9 +61,6 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
computer: Computer # The CUA Computer instance
logger = logging.getLogger(__name__)
_screenshot_delay = 1.0 # macOS is generally faster than X11
_scaling_enabled = True
def __init__(self, computer: Computer):
"""Initialize the computer tool.
@@ -185,26 +182,23 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
raise ToolError(f"Failed to execute {type}: {str(e)}")
async def handle_click(self, button: str, x: int, y: int) -> ToolResult:
"""Handle different click actions."""
"""Handle mouse clicks."""
try:
# Perform requested click action
# Perform the click based on button type
if button == "left":
await self.computer.interface.left_click(x, y)
elif button == "right":
await self.computer.interface.right_click(x, y)
elif button == "double":
await self.computer.interface.double_click(x, y)
else:
raise ToolError(f"Unsupported button type: {button}")
# Wait for UI to update
await asyncio.sleep(0.5)
# Take screenshot after action
screenshot = await self.computer.interface.screenshot()
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
# Wait briefly for UI to update
await asyncio.sleep(0.3)
return ToolResult(
output=f"Performed {button} click at ({x}, {y})",
base64_image=base64_screenshot,
)
except Exception as e:
self.logger.error(f"Error in handle_click: {str(e)}")
@@ -218,11 +212,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
await asyncio.sleep(0.3)
# Take screenshot after typing
screenshot = await self.computer.interface.screenshot()
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
return ToolResult(output=f"Typed: {text}", base64_image=base64_screenshot)
return ToolResult(output=f"Typed: {text}")
except Exception as e:
self.logger.error(f"Error in handle_typing: {str(e)}")
raise ToolError(f"Failed to type '{text}': {str(e)}")
@@ -254,11 +244,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
# Wait briefly
await asyncio.sleep(0.3)
# Take screenshot after action
screenshot = await self.computer.interface.screenshot()
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
return ToolResult(output=f"Pressed key: {key}", base64_image=base64_screenshot)
return ToolResult(output=f"Pressed key: {key}")
except Exception as e:
self.logger.error(f"Error in handle_key: {str(e)}")
raise ToolError(f"Failed to press key '{key}': {str(e)}")
@@ -272,11 +258,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
# Wait briefly
await asyncio.sleep(0.2)
# Take screenshot after action
screenshot = await self.computer.interface.screenshot()
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
return ToolResult(output=f"Moved cursor to ({x}, {y})", base64_image=base64_screenshot)
return ToolResult(output=f"Moved cursor to ({x}, {y})")
except Exception as e:
self.logger.error(f"Error in handle_mouse_move: {str(e)}")
raise ToolError(f"Failed to move cursor to ({x}, {y}): {str(e)}")
@@ -296,14 +278,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
# Wait for UI to update
await asyncio.sleep(0.5)
# Take screenshot after action
screenshot = await self.computer.interface.screenshot()
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
return ToolResult(
output=f"Scrolled at ({x}, {y}) with delta ({scroll_x}, {scroll_y})",
base64_image=base64_screenshot,
)
return ToolResult(output=f"Scrolled at ({x}, {y}) by ({scroll_x}, {scroll_y})")
except Exception as e:
self.logger.error(f"Error in handle_scroll: {str(e)}")
raise ToolError(f"Failed to scroll at ({x}, {y}): {str(e)}")
@@ -331,13 +306,8 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
# Wait for UI to update
await asyncio.sleep(0.5)
# Take screenshot after action
screenshot = await self.computer.interface.screenshot()
base64_screenshot = base64.b64encode(screenshot).decode("utf-8")
return ToolResult(
output=f"Dragged from ({path[0]['x']}, {path[0]['y']}) to ({path[-1]['x']}, {path[-1]['y']})",
base64_image=base64_screenshot,
)
except Exception as e:
self.logger.error(f"Error in handle_drag: {str(e)}")