Add OpenCUA Grounding mode

Dillon DuPont
2025-08-20 10:03:41 -04:00
parent 5225de8d88
commit e065ae59d2


@@ -0,0 +1,133 @@
"""
OpenCUA agent loop implementation for click prediction using litellm.acompletion
Based on the OpenCUA model for GUI grounding tasks.
"""
import asyncio
import json
import re
import base64
from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
from io import BytesIO
import uuid
from PIL import Image
import litellm
import math
from ..decorators import register_agent
from ..types import Messages, AgentResponse, Tools, AgentCapability
from ..loops.base import AsyncAgentConfig


def extract_coordinates_from_pyautogui(text: str) -> Optional[Tuple[int, int]]:
    """Extract coordinates from pyautogui.click(x=..., y=...) format."""
    try:
        # Look for the pyautogui.click(x=1443, y=343) pattern
        pattern = r"pyautogui\.click\(x=(\d+),\s*y=(\d+)\)"
        match = re.search(pattern, text)
        if match:
            x, y = int(match.group(1)), int(match.group(2))
            return (x, y)
        return None
    except Exception:
        return None
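# Illustrative example (not part of the original commit): a completion such as
# "pyautogui.click(x=100, y=250)" yields (100, 250), while text without a
# matching pyautogui.click(...) call yields None.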


@register_agent(models=r"(?i).*OpenCUA.*")
class OpenCUAConfig(AsyncAgentConfig):
    """OpenCUA agent configuration implementing the AsyncAgentConfig protocol for click prediction."""

    def __init__(self):
        self.current_model = None
        self.last_screenshot_b64 = None

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs
    ) -> Dict[str, Any]:
        """predict_step is not implemented for the OpenCUA model; only click prediction is supported."""
        raise NotImplementedError("predict_step is not implemented for the OpenCUA model")

    async def predict_click(
        self,
        model: str,
        image_b64: str,
        instruction: str,
        **kwargs
    ) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates using the OpenCUA model via litellm.acompletion.

        Args:
            model: The OpenCUA model name
            image_b64: Base64-encoded PNG screenshot
            instruction: Instruction describing where to click

        Returns:
            Tuple of (x, y) coordinates, or None if the prediction fails
        """
        # Prepare the system message
        system_prompt = (
            "You are a GUI agent. You are given a task and a screenshot of the screen. "
            "You need to perform a series of pyautogui actions to complete the task."
        )
        system_message = {
            "role": "system",
            "content": system_prompt
        }

        # Prepare the user message with the image and instruction
        user_message = {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": f"data:image/png;base64,{image_b64}"
                },
                {
                    "type": "text",
                    "text": instruction
                }
            ]
        }

        # Prepare API call kwargs
        api_kwargs = {
            "model": model,
            "messages": [system_message, user_message],
            "max_new_tokens": 512,
            "temperature": 0,
            **kwargs
        }

        try:
            # Call the model via litellm's async completion API
            response = await litellm.acompletion(**api_kwargs)

            # Extract the response text
            output_text = response.choices[0].message.content
            if not output_text:
                return None

            # Extract coordinates from the pyautogui.click(...) call in the output
            coordinates = extract_coordinates_from_pyautogui(output_text)
            return coordinates
        except Exception as e:
            print(f"Error in OpenCUA predict_click: {e}")
            return None

    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
        return ["click"]