Add OpenCUA Grounding mode

Dillon DuPont
2025-08-20 10:03:41 -04:00
parent 5225de8d88
commit e065ae59d2


@@ -0,0 +1,133 @@
"""
OpenCUA agent loop implementation for click prediction using litellm.acompletion
Based on the OpenCUA model for GUI grounding tasks.
"""
import asyncio
import json
import re
import base64
from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
from io import BytesIO
import uuid
from PIL import Image
import litellm
import math
from ..decorators import register_agent
from ..types import Messages, AgentResponse, Tools, AgentCapability
from ..loops.base import AsyncAgentConfig


def extract_coordinates_from_pyautogui(text: str) -> Optional[Tuple[int, int]]:
    """Extract coordinates from pyautogui.click(x=..., y=...) format."""
    try:
        # Look for the pyautogui.click(x=1443, y=343) pattern
        pattern = r"pyautogui\.click\(x=(\d+),\s*y=(\d+)\)"
        match = re.search(pattern, text)
        if match:
            x, y = int(match.group(1)), int(match.group(2))
            return (x, y)
        return None
    except Exception:
        return None
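# Illustrative example (not part of the original commit): a completion such as
# "pyautogui.click(x=100, y=250)" yields (100, 250), while text without a
# matching pyautogui.click(...) call yields None.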


@register_agent(models=r"(?i).*OpenCUA.*")
class OpenCUAConfig(AsyncAgentConfig):
    """OpenCUA agent configuration implementing the AsyncAgentConfig protocol for click prediction."""

    def __init__(self):
        self.current_model = None
        self.last_screenshot_b64 = None

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs
    ) -> Dict[str, Any]:
        """predict_step is not implemented for the OpenCUA model; only click prediction is supported."""
        raise NotImplementedError("predict_step is not implemented for the OpenCUA model")

    async def predict_click(
        self,
        model: str,
        image_b64: str,
        instruction: str,
        **kwargs
    ) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates using the OpenCUA model via litellm.acompletion.

        Args:
            model: The OpenCUA model name
            image_b64: Base64-encoded PNG screenshot
            instruction: Instruction describing where to click

        Returns:
            Tuple of (x, y) coordinates, or None if the prediction fails
        """
        # Prepare the system message
        system_prompt = (
            "You are a GUI agent. You are given a task and a screenshot of the screen. "
            "You need to perform a series of pyautogui actions to complete the task."
        )
        system_message = {
            "role": "system",
            "content": system_prompt
        }

        # Prepare the user message with the image and instruction
        user_message = {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": f"data:image/png;base64,{image_b64}"
                },
                {
                    "type": "text",
                    "text": instruction
                }
            ]
        }

        # Prepare API call kwargs
        api_kwargs = {
            "model": model,
            "messages": [system_message, user_message],
            "max_new_tokens": 512,
            "temperature": 0,
            **kwargs
        }

        try:
            # Call the model via litellm's async completion API
            response = await litellm.acompletion(**api_kwargs)

            # Extract the response text
            output_text = response.choices[0].message.content
            if not output_text:
                return None

            # Extract coordinates from the pyautogui.click(...) call in the output
            coordinates = extract_coordinates_from_pyautogui(output_text)
            return coordinates
        except Exception as e:
            print(f"Error in OpenCUA predict_click: {e}")
            return None

    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
        return ["click"]