Added anthropic click mode

2026-04-28 00:00:21 -05:00 · 2025-08-05 12:32:42 -04:00
parent dc0e479170
commit e1e6a7d6a7
2 changed files with 112 additions and 33 deletions
@@ -65,21 +65,28 @@ def _get_tool_config_for_model(model: str) -> Dict[str, str]:
        "beta_flag": "computer-use-2024-10-22"
    }

-def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str) -> Dict[str, Any]:
+async def _map_computer_tool_to_anthropic(computer_tool: Any, tool_version: str) -> Dict[str, Any]:
    """Map a computer tool to Anthropic's hosted tool schema."""
+    # Get dimensions from the computer handler
+    try:
+        width, height = await computer_tool.get_dimensions()
+    except Exception:
+        # Fallback to default dimensions if method fails
+        width, height = 1024, 768
+    
    return {
        "type": tool_version,
        "function": {
            "name": "computer",
            "parameters": {
-                "display_height_px": getattr(computer_tool, 'display_height', 768),
-                "display_width_px": getattr(computer_tool, 'display_width', 1024),
-                "display_number": getattr(computer_tool, 'display_number', 1),
+                "display_height_px": height,
+                "display_width_px": width,
+                "display_number": 1,
            },
        },
    }

-def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str) -> Tools:
+async def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str) -> Tools:
    """Prepare tools for Anthropic API format."""
    tool_config = _get_tool_config_for_model(model)
    anthropic_tools = []
@@ -87,7 +94,7 @@ def _prepare_tools_for_anthropic(tool_schemas: List[Dict[str, Any]], model: str)
    for schema in tool_schemas:
        if schema["type"] == "computer":
            # Map computer tool to Anthropic format
-            anthropic_tools.append(_map_computer_tool_to_anthropic(
+            anthropic_tools.append(await _map_computer_tool_to_anthropic(
                schema["computer"], 
                tool_config["tool_version"]
            ))
@@ -1315,7 +1322,7 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
        tool_config = _get_tool_config_for_model(model)
        
        # Prepare tools for Anthropic API
-        anthropic_tools = _prepare_tools_for_anthropic(tools, model)
+        anthropic_tools = await _prepare_tools_for_anthropic(tools, model)
        
        # Convert responses_items messages to completion format
        completion_messages = _convert_responses_items_to_completion_messages(messages)
@@ -1375,10 +1382,102 @@ class AnthropicHostedToolsConfig(AsyncAgentConfig):
        image_b64: str,
        instruction: str,
        **kwargs
-    ) -> Optional[Tuple[float, float]]:
-        """Anthropic hosted tools does not support click prediction."""
+    ) -> Optional[Tuple[int, int]]:
+        """
+        Predict click coordinates based on image and instruction.
+        
+        Uses Anthropic's computer use models with a custom prompt that instructs
+        the agent to only output clicks.
+        
+        Args:
+            model: Model name to use
+            image_b64: Base64 encoded image
+            instruction: Instruction for where to click
+            
+        Returns:
+            Tuple of (x, y) coordinates or None if prediction fails
+        """
+        # Get image dimensions from base64 data
+        try:
+            import base64
+            from PIL import Image
+            from io import BytesIO
+            
+            image_data = base64.b64decode(image_b64)
+            image = Image.open(BytesIO(image_data))
+            display_width, display_height = image.size
+        except Exception:
+            # Fallback to default dimensions if image parsing fails
+            display_width, display_height = 1024, 768
+        
+        # Get tool configuration for this model
+        tool_config = _get_tool_config_for_model(model)
+        
+        # Prepare computer tool for Anthropic format
+        computer_tool = {
+            "type": tool_config["tool_version"],
+            "function": {
+                "name": "computer",
+                "parameters": {
+                    "display_height_px": display_height,
+                    "display_width_px": display_width,
+                    "display_number": 1,
+                },
+            },
+        }
+        
+        # Construct messages in OpenAI chat completion format for liteLLM
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"You are a UI grounding expert. Look at the image and {instruction}. Output ONLY a click action on the target element. No explanations, confirmations, or additional text."
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/png;base64,{image_b64}"
+                        }
+                    }
+                ]
+            }
+        ]
+        
+        # Prepare API call kwargs
+        api_kwargs = {
+            "model": model,
+            "messages": messages,
+            "tools": [computer_tool],
+            "stream": False,
+            "max_tokens": 100,  # Keep response short for click prediction
+            "headers": {
+                "anthropic-beta": tool_config["beta_flag"]
+            }
+        }
+    
+        # Use liteLLM acompletion
+        response = await litellm.acompletion(**api_kwargs)
+        
+        # Convert response to responses_items format to extract click coordinates
+        responses_items = _convert_completion_to_responses_items(response)
+        
+        # Look for computer_call with click action
+        for item in responses_items:
+            if (isinstance(item, dict) and 
+                item.get("type") == "computer_call" and
+                isinstance(item.get("action"), dict)):
+                
+                action = item["action"]
+                if action.get("type") == "click":
+                    x = action.get("x")
+                    y = action.get("y")
+                    if x is not None and y is not None:
+                        return (int(x), int(y))
+        
        return None
    
    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
-        return ["step"]
+        return ["click", "step"]
@@ -158,24 +158,6 @@ class OpenAIComputerUseConfig:
            Tuple of (x, y) coordinates or None if prediction fails
        """
        # TODO: use computer tool to get dimensions + environment
-        # Scale image to half size
-        try:
-            image_data = base64.b64decode(image_b64)
-            image = Image.open(BytesIO(image_data))
-            
-            # Scale to half size
-            new_width = image.width // 2
-            new_height = image.height // 2
-            scaled_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
-            
-            # Convert back to base64
-            buffer = BytesIO()
-            scaled_image.save(buffer, format='PNG')
-            image_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
-        except Exception:
-            # If scaling fails, use original image
-            pass
-        
        # Manually construct input items with image and click instruction
        input_items = [
            {
@@ -207,7 +189,7 @@ class OpenAIComputerUseConfig:
            "type": "computer_use_preview",
            "display_width": display_width,
            "display_height": display_height,
-            "environment": "linux"
+            "environment": "windows"
        }
        
        # Prepare API call kwargs
@@ -226,9 +208,7 @@ class OpenAIComputerUseConfig:
        
        # Extract click coordinates from response output
        output_dict = response.model_dump()
-        output_items = output_dict.get("output", [])
-        
-        # print(output_items)
+        output_items = output_dict.get("output", [])        
        
        # Look for computer_call with click action
        for item in output_items:
@@ -241,7 +221,7 @@ class OpenAIComputerUseConfig:
                    x = action.get("x")
                    y = action.get("y")
                    if x is not None and y is not None:
-                        return (int(x) * 2, int(y) * 2)
+                        return (int(x), int(y))
        
        return None