mirror of
https://github.com/trycua/computer.git
synced 2026-01-02 19:40:18 -06:00
Add OpenCUA Grounding mode
This commit is contained in:
133
libs/python/agent/agent/loops/opencua.py
Normal file
133
libs/python/agent/agent/loops/opencua.py
Normal file
@@ -0,0 +1,133 @@
|
||||
"""
|
||||
OpenCUA agent loop implementation for click prediction using litellm.acompletion
|
||||
Based on OpenCUA model for GUI grounding tasks.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
import base64
|
||||
from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple
|
||||
from io import BytesIO
|
||||
import uuid
|
||||
from PIL import Image
|
||||
import litellm
|
||||
import math
|
||||
|
||||
from ..decorators import register_agent
|
||||
from ..types import Messages, AgentResponse, Tools, AgentCapability
|
||||
from ..loops.base import AsyncAgentConfig
|
||||
|
||||
def extract_coordinates_from_pyautogui(text: str) -> Optional[Tuple[int, int]]:
|
||||
"""Extract coordinates from pyautogui.click(x=..., y=...) format."""
|
||||
try:
|
||||
# Look for pyautogui.click(x=1443, y=343) pattern
|
||||
pattern = r"pyautogui\.click\(x=(\d+),\s*y=(\d+)\)"
|
||||
match = re.search(pattern, text)
|
||||
if match:
|
||||
x, y = int(match.group(1)), int(match.group(2))
|
||||
return (x, y)
|
||||
return None
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
@register_agent(models=r"(?i).*OpenCUA.*")
|
||||
class OpenCUAConfig(AsyncAgentConfig):
|
||||
"""OpenCUA agent configuration implementing AsyncAgentConfig protocol for click prediction."""
|
||||
|
||||
def __init__(self):
|
||||
self.current_model = None
|
||||
self.last_screenshot_b64 = None
|
||||
|
||||
async def predict_step(
|
||||
self,
|
||||
messages: List[Dict[str, Any]],
|
||||
model: str,
|
||||
tools: Optional[List[Dict[str, Any]]] = None,
|
||||
max_retries: Optional[int] = None,
|
||||
stream: bool = False,
|
||||
computer_handler=None,
|
||||
_on_api_start=None,
|
||||
_on_api_end=None,
|
||||
_on_usage=None,
|
||||
_on_screenshot=None,
|
||||
**kwargs
|
||||
) -> Dict[str, Any]:
|
||||
"""Predict step is not implemented for OpenCUA model."""
|
||||
raise NotImplementedError("predict_step is not implemented for OpenCUA model")
|
||||
|
||||
async def predict_click(
|
||||
self,
|
||||
model: str,
|
||||
image_b64: str,
|
||||
instruction: str,
|
||||
**kwargs
|
||||
) -> Optional[Tuple[int, int]]:
|
||||
"""
|
||||
Predict click coordinates using OpenCUA model via litellm.acompletion.
|
||||
|
||||
Args:
|
||||
model: The OpenCUA model name
|
||||
image_b64: Base64 encoded image
|
||||
instruction: Instruction for where to click
|
||||
|
||||
Returns:
|
||||
Tuple of (x, y) coordinates or None if prediction fails
|
||||
"""
|
||||
# Prepare system message
|
||||
system_prompt = (
|
||||
"You are a GUI agent. You are given a task and a screenshot of the screen. "
|
||||
"You need to perform a series of pyautogui actions to complete the task."
|
||||
)
|
||||
|
||||
system_message = {
|
||||
"role": "system",
|
||||
"content": system_prompt
|
||||
}
|
||||
|
||||
# Prepare user message with image and instruction
|
||||
user_message = {
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image",
|
||||
"image": f"data:image/png;base64,{image_b64}"
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": instruction
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
# Prepare API call kwargs
|
||||
api_kwargs = {
|
||||
"model": model,
|
||||
"messages": [system_message, user_message],
|
||||
"max_new_tokens": 512,
|
||||
"temperature": 0,
|
||||
**kwargs
|
||||
}
|
||||
|
||||
try:
|
||||
# Use liteLLM acompletion
|
||||
response = await litellm.acompletion(**api_kwargs)
|
||||
|
||||
# Extract response text
|
||||
output_text = response.choices[0].message.content
|
||||
|
||||
if not output_text:
|
||||
return None
|
||||
|
||||
# Extract coordinates from pyautogui format
|
||||
coordinates = extract_coordinates_from_pyautogui(output_text)
|
||||
|
||||
return coordinates
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error in OpenCUA predict_click: {e}")
|
||||
return None
|
||||
|
||||
def get_capabilities(self) -> List[AgentCapability]:
|
||||
"""Return the capabilities supported by this agent."""
|
||||
return ["click"]
|
||||
Reference in New Issue
Block a user