From e065ae59d28ee6496cee7f75d877a093d2af4815 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 20 Aug 2025 10:03:41 -0400 Subject: [PATCH 01/17] Add OpenCUA Grounding mode --- libs/python/agent/agent/loops/opencua.py | 133 +++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 libs/python/agent/agent/loops/opencua.py diff --git a/libs/python/agent/agent/loops/opencua.py b/libs/python/agent/agent/loops/opencua.py new file mode 100644 index 00000000..d1c6c5fb --- /dev/null +++ b/libs/python/agent/agent/loops/opencua.py @@ -0,0 +1,133 @@ +""" +OpenCUA agent loop implementation for click prediction using litellm.acompletion +Based on OpenCUA model for GUI grounding tasks. +""" + +import asyncio +import json +import re +import base64 +from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple +from io import BytesIO +import uuid +from PIL import Image +import litellm +import math + +from ..decorators import register_agent +from ..types import Messages, AgentResponse, Tools, AgentCapability +from ..loops.base import AsyncAgentConfig + +def extract_coordinates_from_pyautogui(text: str) -> Optional[Tuple[int, int]]: + """Extract coordinates from pyautogui.click(x=..., y=...) format.""" + try: + # Look for pyautogui.click(x=1443, y=343) pattern + pattern = r"pyautogui\.click\(x=(\d+),\s*y=(\d+)\)" + match = re.search(pattern, text) + if match: + x, y = int(match.group(1)), int(match.group(2)) + return (x, y) + return None + except Exception: + return None + +@register_agent(models=r"(?i).*OpenCUA.*") +class OpenCUAConfig(AsyncAgentConfig): + """OpenCUA agent configuration implementing AsyncAgentConfig protocol for click prediction.""" + + def __init__(self): + self.current_model = None + self.last_screenshot_b64 = None + + async def predict_step( + self, + messages: List[Dict[str, Any]], + model: str, + tools: Optional[List[Dict[str, Any]]] = None, + max_retries: Optional[int] = None, + stream: bool = False, + computer_handler=None, + _on_api_start=None, + _on_api_end=None, + _on_usage=None, + _on_screenshot=None, + **kwargs + ) -> Dict[str, Any]: + """Predict step is not implemented for OpenCUA model.""" + raise NotImplementedError("predict_step is not implemented for OpenCUA model") + + async def predict_click( + self, + model: str, + image_b64: str, + instruction: str, + **kwargs + ) -> Optional[Tuple[int, int]]: + """ + Predict click coordinates using OpenCUA model via litellm.acompletion. + + Args: + model: The OpenCUA model name + image_b64: Base64 encoded image + instruction: Instruction for where to click + + Returns: + Tuple of (x, y) coordinates or None if prediction fails + """ + # Prepare system message + system_prompt = ( + "You are a GUI agent. You are given a task and a screenshot of the screen. " + "You need to perform a series of pyautogui actions to complete the task." 
+ ) + + system_message = { + "role": "system", + "content": system_prompt + } + + # Prepare user message with image and instruction + user_message = { + "role": "user", + "content": [ + { + "type": "image", + "image": f"data:image/png;base64,{image_b64}" + }, + { + "type": "text", + "text": instruction + } + ] + } + + # Prepare API call kwargs + api_kwargs = { + "model": model, + "messages": [system_message, user_message], + "max_new_tokens": 512, + "temperature": 0, + **kwargs + } + + try: + # Use liteLLM acompletion + response = await litellm.acompletion(**api_kwargs) + + # Extract response text + output_text = response.choices[0].message.content + + if not output_text: + return None + + # Extract coordinates from pyautogui format + coordinates = extract_coordinates_from_pyautogui(output_text) + + return coordinates + + except Exception as e: + print(f"Error in OpenCUA predict_click: {e}") + return None + + def get_capabilities(self) -> List[AgentCapability]: + """Return the capabilities supported by this agent.""" + return ["click"] From d7e25048be12769020aa46a250dcbdf7e6eda15a Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 20 Aug 2025 10:21:53 -0400 Subject: [PATCH 02/17] Register OpenCUA loop --- libs/python/agent/agent/loops/__init__.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/libs/python/agent/agent/loops/__init__.py b/libs/python/agent/agent/loops/__init__.py index 45f70e20..25227e64 100644 --- a/libs/python/agent/agent/loops/__init__.py +++ b/libs/python/agent/agent/loops/__init__.py @@ -10,5 +10,15 @@ from . import omniparser from . import gta1 from . import composed_grounded from . import glm45v +from . import opencua -__all__ = ["anthropic", "openai", "uitars", "omniparser", "gta1", "composed_grounded", "glm45v"] +__all__ = [ + "anthropic", + "openai", + "uitars", + "omniparser", + "gta1", + "composed_grounded", + "glm45v", + "opencua" +] \ No newline at end of file From dad6634ffd9900750c7374c6c9db0f0da0d5bf75 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 21 Aug 2025 10:51:39 -0400 Subject: [PATCH 03/17] Added local inference routing for different models --- .../adapters/huggingfacelocal_adapter.py | 77 +++------------ .../agent/agent/adapters/models/__init__.py | 28 ++++++ .../agent/agent/adapters/models/generic.py | 72 ++++++++++++++ .../agent/agent/adapters/models/opencua.py | 98 +++++++++++++++++++ libs/python/agent/agent/loops/opencua.py | 36 +++---- libs/python/agent/pyproject.toml | 10 ++ 6 files changed, 236 insertions(+), 85 deletions(-) create mode 100644 libs/python/agent/agent/adapters/models/__init__.py create mode 100644 libs/python/agent/agent/adapters/models/generic.py create mode 100644 libs/python/agent/agent/adapters/models/opencua.py diff --git a/libs/python/agent/agent/adapters/huggingfacelocal_adapter.py b/libs/python/agent/agent/adapters/huggingfacelocal_adapter.py index 46d72db3..6f06734c 100644 --- a/libs/python/agent/agent/adapters/huggingfacelocal_adapter.py +++ b/libs/python/agent/agent/adapters/huggingfacelocal_adapter.py @@ -15,6 +15,7 @@ try: except ImportError: HF_AVAILABLE = False +from .models import load_model as load_model_handler class HuggingFaceLocalAdapter(CustomLLM): """HuggingFace Local Adapter for running vision-language models locally.""" @@ -28,41 +29,15 @@ class HuggingFaceLocalAdapter(CustomLLM): """ super().__init__() self.device = device - self.models = {} # Cache for loaded models - self.processors = {} # Cache for loaded processors + # Cache for model handlers keyed by 
model_name + self._handlers: Dict[str, Any] = {} self._executor = ThreadPoolExecutor(max_workers=1) # Single thread pool - def _load_model_and_processor(self, model_name: str): - """Load model and processor if not already cached. - - Args: - model_name: Name of the model to load - - Returns: - Tuple of (model, processor) - """ - if model_name not in self.models: - # Load model - model = AutoModelForImageTextToText.from_pretrained( - model_name, - torch_dtype=torch.float16, - device_map=self.device, - attn_implementation="sdpa" - ) - - # Load processor - processor = AutoProcessor.from_pretrained( - model_name, - min_pixels=3136, - max_pixels=4096 * 2160, - device_map=self.device - ) - - # Cache them - self.models[model_name] = model - self.processors[model_name] = processor - - return self.models[model_name], self.processors[model_name] + def _get_handler(self, model_name: str): + """Get or create a model handler for the given model name.""" + if model_name not in self._handlers: + self._handlers[model_name] = load_model_handler(model_name=model_name, device=self.device) + return self._handlers[model_name] def _convert_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Convert OpenAI format messages to HuggingFace format. @@ -133,41 +108,13 @@ class HuggingFaceLocalAdapter(CustomLLM): if ignored_kwargs: warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}") - # Load model and processor - model, processor = self._load_model_and_processor(model_name) - # Convert messages to HuggingFace format hf_messages = self._convert_messages(messages) - # Apply chat template and tokenize - inputs = processor.apply_chat_template( - hf_messages, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - return_tensors="pt" - ) - - # Move inputs to the same device as model - inputs = inputs.to(model.device) - - # Generate response - with torch.no_grad(): - generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens) - - # Trim input tokens from output - generated_ids_trimmed = [ - out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) - ] - - # Decode output - output_text = processor.batch_decode( - generated_ids_trimmed, - skip_special_tokens=True, - clean_up_tokenization_spaces=False - ) - - return output_text[0] if output_text else "" + # Delegate to model handler + handler = self._get_handler(model_name) + generated_text = handler.generate(hf_messages, max_new_tokens=max_new_tokens) + return generated_text def completion(self, *args, **kwargs) -> ModelResponse: """Synchronous completion method. diff --git a/libs/python/agent/agent/adapters/models/__init__.py b/libs/python/agent/agent/adapters/models/__init__.py new file mode 100644 index 00000000..6811c142 --- /dev/null +++ b/libs/python/agent/agent/adapters/models/__init__.py @@ -0,0 +1,28 @@ +from typing import Optional + +try: + from transformers import AutoConfig + HF_AVAILABLE = True +except ImportError: + HF_AVAILABLE = False + +from .generic import GenericHFModel +from .opencua import OpenCUAModel + + +def load_model(model_name: str, device: str = "auto"): + """Factory function to load and return the right model handler instance. + + - If the underlying transformers config class matches OpenCUA, return OpenCUAModel + - Otherwise, return GenericHFModel + """ + if not HF_AVAILABLE: + raise ImportError( + "HuggingFace transformers dependencies not found. 
Install with: pip install \"cua-agent[uitars-hf]\"" + ) + cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + cls = cfg.__class__.__name__ + print(f"cls: {cls}") + if "OpenCUA" in cls: + return OpenCUAModel(model_name=model_name, device=device) + return GenericHFModel(model_name=model_name, device=device) diff --git a/libs/python/agent/agent/adapters/models/generic.py b/libs/python/agent/agent/adapters/models/generic.py new file mode 100644 index 00000000..de267239 --- /dev/null +++ b/libs/python/agent/agent/adapters/models/generic.py @@ -0,0 +1,72 @@ +from typing import List, Dict, Any, Optional + +# Hugging Face imports are local to avoid hard dependency at module import +try: + import torch # type: ignore + from transformers import AutoModelForImageTextToText, AutoProcessor # type: ignore + HF_AVAILABLE = True +except Exception: + HF_AVAILABLE = False + + +class GenericHFModel: + """Generic Hugging Face vision-language model handler. + Loads an AutoModelForImageTextToText and AutoProcessor and generates text. + """ + + def __init__(self, model_name: str, device: str = "auto") -> None: + if not HF_AVAILABLE: + raise ImportError( + "HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\"" + ) + self.model_name = model_name + self.device = device + self.model = None + self.processor = None + self._load() + + def _load(self) -> None: + # Load model + self.model = AutoModelForImageTextToText.from_pretrained( + self.model_name, + torch_dtype=torch.float16, + device_map=self.device, + attn_implementation="sdpa", + ) + # Load processor + self.processor = AutoProcessor.from_pretrained( + self.model_name, + min_pixels=3136, + max_pixels=4096 * 2160, + device_map=self.device, + ) + + def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str: + """Generate text for the given HF-format messages. 
+ messages: [{ role, content: [{type:'text'|'image', text|image}] }] + """ + assert self.model is not None and self.processor is not None + # Apply chat template and tokenize + inputs = self.processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + ) + # Move inputs to the same device as model + inputs = inputs.to(self.model.device) + # Generate + with torch.no_grad(): + generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens) + # Trim prompt tokens from output + generated_ids_trimmed = [ + out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) + ] + # Decode + output_text = self.processor.batch_decode( + generated_ids_trimmed, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) + return output_text[0] if output_text else "" diff --git a/libs/python/agent/agent/adapters/models/opencua.py b/libs/python/agent/agent/adapters/models/opencua.py new file mode 100644 index 00000000..f24dfa6b --- /dev/null +++ b/libs/python/agent/agent/adapters/models/opencua.py @@ -0,0 +1,98 @@ +from typing import List, Dict, Any +import re +import base64 +from io import BytesIO + +try: + import torch # type: ignore + from transformers import AutoTokenizer, AutoModel, AutoImageProcessor # type: ignore + from PIL import Image # type: ignore + import blobfile as _ # assert blobfile is installed + OPENCUA_AVAILABLE = True +except Exception: + OPENCUA_AVAILABLE = False + + +class OpenCUAModel: + """OpenCUA model handler using AutoTokenizer, AutoModel and AutoImageProcessor.""" + + def __init__(self, model_name: str, device: str = "auto") -> None: + if not OPENCUA_AVAILABLE: + raise ImportError( + "OpenCUA requirements not found. Install with: pip install \"cua-agent[opencua-hf]\"" + ) + self.model_name = model_name + self.device = device + self.model = None + self.tokenizer = None + self.image_processor = None + self._load() + + def _load(self) -> None: + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_name, trust_remote_code=True + ) + self.model = AutoModel.from_pretrained( + self.model_name, + torch_dtype="auto", + device_map=self.device, + trust_remote_code=True, + ) + self.image_processor = AutoImageProcessor.from_pretrained( + self.model_name, trust_remote_code=True + ) + + @staticmethod + def _extract_last_image_b64(messages: List[Dict[str, Any]]) -> str: + # Expect HF-format messages with content items type: "image" with data URL + for msg in reversed(messages): + for item in reversed(msg.get("content", [])): + if isinstance(item, dict) and item.get("type") == "image": + url = item.get("image", "") + if isinstance(url, str) and url.startswith("data:image/"): + return url.split(",", 1)[1] + return "" + + def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 512) -> str: + assert self.model is not None and self.tokenizer is not None and self.image_processor is not None + + # Tokenize text side using chat template + input_ids = self.tokenizer.apply_chat_template( + messages, tokenize=True, add_generation_prompt=True + ) + input_ids = torch.tensor([input_ids]).to(self.model.device) + + # Prepare image inputs from last data URL image + image_b64 = self._extract_last_image_b64(messages) + pixel_values = None + grid_thws = None + if image_b64: + image = Image.open(BytesIO(base64.b64decode(image_b64))).convert("RGB") + image_info = self.image_processor.preprocess(images=[image]) + pixel_values = torch.tensor(image_info["pixel_values"]).to( + 
dtype=torch.bfloat16, device=self.model.device + ) + grid_thws = torch.tensor(image_info["image_grid_thw"]) if "image_grid_thw" in image_info else None + + gen_kwargs: Dict[str, Any] = { + "max_new_tokens": max_new_tokens, + "temperature": 0, + } + if pixel_values is not None: + gen_kwargs["pixel_values"] = pixel_values + if grid_thws is not None: + gen_kwargs["grid_thws"] = grid_thws + + with torch.no_grad(): + generated_ids = self.model.generate( + input_ids, + **gen_kwargs, + ) + + # Remove prompt tokens + prompt_len = input_ids.shape[1] + generated_ids = generated_ids[:, prompt_len:] + output_text = self.tokenizer.batch_decode( + generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False + )[0] + return output_text diff --git a/libs/python/agent/agent/loops/opencua.py b/libs/python/agent/agent/loops/opencua.py index d1c6c5fb..c13875b2 100644 --- a/libs/python/agent/agent/loops/opencua.py +++ b/libs/python/agent/agent/loops/opencua.py @@ -90,8 +90,10 @@ class OpenCUAConfig(AsyncAgentConfig): "role": "user", "content": [ { - "type": "image", - "image": f"data:image/png;base64,{image_b64}" + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{image_b64}" + } }, { "type": "text", @@ -109,24 +111,18 @@ class OpenCUAConfig(AsyncAgentConfig): **kwargs } - try: - # Use liteLLM acompletion - response = await litellm.acompletion(**api_kwargs) - - # Extract response text - output_text = response.choices[0].message.content - - if not output_text: - return None - - # Extract coordinates from pyautogui format - coordinates = extract_coordinates_from_pyautogui(output_text) - - return coordinates - - except Exception as e: - print(f"Error in OpenCUA predict_click: {e}") - return None + # Use liteLLM acompletion + response = await litellm.acompletion(**api_kwargs) + + # Extract response text + output_text = response.choices[0].message.content + + print(output_text) + + # Extract coordinates from pyautogui format + coordinates = extract_coordinates_from_pyautogui(output_text) + + return coordinates def get_capabilities(self) -> List[AgentCapability]: """Return the capabilities supported by this agent.""" diff --git a/libs/python/agent/pyproject.toml b/libs/python/agent/pyproject.toml index 4dd27062..811c3a9c 100644 --- a/libs/python/agent/pyproject.toml +++ b/libs/python/agent/pyproject.toml @@ -47,6 +47,13 @@ glm45v-hf = [ "torch", "transformers-v4.55.0-GLM-4.5V-preview" ] +opencua-hf = [ + "accelerate", + "torch", + "transformers>=4.54.0", + "tiktoken>=0.11.0", + "blobfile>=3.0.0" +] ui = [ "gradio>=5.23.3", "python-dotenv>=1.0.1", @@ -66,6 +73,9 @@ all = [ "accelerate", "torch", "transformers>=4.54.0", + # opencua requirements + "tiktoken>=0.11.0", + "blobfile>=3.0.0" # ui requirements "gradio>=5.23.3", "python-dotenv>=1.0.1", From b20d2a0a9384674bc255c7876d43dcf1f90fdcbf Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 21 Aug 2025 11:54:13 -0400 Subject: [PATCH 04/17] Pinned transformers version, first working grounding version --- libs/python/agent/agent/adapters/models/__init__.py | 2 +- libs/python/agent/agent/adapters/models/opencua.py | 1 + libs/python/agent/agent/loops/opencua.py | 5 ++--- libs/python/agent/pyproject.toml | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/libs/python/agent/agent/adapters/models/__init__.py b/libs/python/agent/agent/adapters/models/__init__.py index 6811c142..8a5fb00b 100644 --- a/libs/python/agent/agent/adapters/models/__init__.py +++ b/libs/python/agent/agent/adapters/models/__init__.py @@ -22,7 
+22,7 @@ def load_model(model_name: str, device: str = "auto"): ) cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True) cls = cfg.__class__.__name__ - print(f"cls: {cls}") + # print(f"cls: {cls}") if "OpenCUA" in cls: return OpenCUAModel(model_name=model_name, device=device) return GenericHFModel(model_name=model_name, device=device) diff --git a/libs/python/agent/agent/adapters/models/opencua.py b/libs/python/agent/agent/adapters/models/opencua.py index f24dfa6b..f8abf4a6 100644 --- a/libs/python/agent/agent/adapters/models/opencua.py +++ b/libs/python/agent/agent/adapters/models/opencua.py @@ -37,6 +37,7 @@ class OpenCUAModel: torch_dtype="auto", device_map=self.device, trust_remote_code=True, + attn_implementation="sdpa", ) self.image_processor = AutoImageProcessor.from_pretrained( self.model_name, trust_remote_code=True diff --git a/libs/python/agent/agent/loops/opencua.py b/libs/python/agent/agent/loops/opencua.py index c13875b2..a494377b 100644 --- a/libs/python/agent/agent/loops/opencua.py +++ b/libs/python/agent/agent/loops/opencua.py @@ -97,7 +97,7 @@ class OpenCUAConfig(AsyncAgentConfig): }, { "type": "text", - "text": instruction + "text": f"Click on {instruction}" } ] } @@ -116,8 +116,7 @@ class OpenCUAConfig(AsyncAgentConfig): # Extract response text output_text = response.choices[0].message.content - - print(output_text) + # print(output_text) # Extract coordinates from pyautogui format coordinates = extract_coordinates_from_pyautogui(output_text) diff --git a/libs/python/agent/pyproject.toml b/libs/python/agent/pyproject.toml index 811c3a9c..0d382fdf 100644 --- a/libs/python/agent/pyproject.toml +++ b/libs/python/agent/pyproject.toml @@ -50,7 +50,7 @@ glm45v-hf = [ opencua-hf = [ "accelerate", "torch", - "transformers>=4.54.0", + "transformers==4.53.0", "tiktoken>=0.11.0", "blobfile>=3.0.0" ] @@ -75,7 +75,7 @@ all = [ "transformers>=4.54.0", # opencua requirements "tiktoken>=0.11.0", - "blobfile>=3.0.0" + "blobfile>=3.0.0", # ui requirements "gradio>=5.23.3", "python-dotenv>=1.0.1", From 52afcd4c6fcdf74948c42a82eeea35cdfd60a536 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 26 Aug 2025 18:14:22 -0400 Subject: [PATCH 05/17] Increased max tokens for glm 4.5v grounding calls --- libs/python/agent/agent/loops/glm45v.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libs/python/agent/agent/loops/glm45v.py b/libs/python/agent/agent/loops/glm45v.py index adc87026..516a9cb1 100644 --- a/libs/python/agent/agent/loops/glm45v.py +++ b/libs/python/agent/agent/loops/glm45v.py @@ -844,7 +844,7 @@ Where x,y are coordinates normalized to 0-999 range.""" api_kwargs = { "model": model, "messages": litellm_messages, - "max_tokens": 100, + "max_tokens": 2056, "temperature": 0.001, "extra_body": { "skip_special_tokens": False, @@ -856,6 +856,7 @@ Where x,y are coordinates normalized to 0-999 range.""" # Extract response content response_content = response.choices[0].message.content.strip() + print(response) # Parse response for click coordinates # Look for coordinates in the response, handling special tokens @@ -866,7 +867,7 @@ Where x,y are coordinates normalized to 0-999 range.""" # Fallback: look for coordinates without special tokens coord_pattern = r"left_click\(start_box='?\[(\d+),(\d+)\]'?\)" match = re.search(coord_pattern, response_content) - + if match: x, y = int(match.group(1)), int(match.group(2)) From bf3c3256dfab93cd7c9e2c7f58a6719c9b5586fe Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 26 Aug 2025 18:15:24 -0400 Subject: 
[PATCH 06/17] increased max tokens, added trust_remote_code kwarg --- .../agent/agent/adapters/huggingfacelocal_adapter.py | 6 ++++-- libs/python/agent/agent/adapters/models/__init__.py | 8 ++++---- libs/python/agent/agent/adapters/models/generic.py | 9 ++++++--- libs/python/agent/agent/adapters/models/opencua.py | 9 +++++---- libs/python/agent/agent/agent.py | 6 +++++- libs/python/agent/agent/cli.py | 1 + libs/python/agent/agent/loops/gta1.py | 2 +- libs/python/agent/agent/loops/opencua.py | 2 +- libs/python/agent/agent/loops/uitars.py | 2 +- 9 files changed, 28 insertions(+), 17 deletions(-) diff --git a/libs/python/agent/agent/adapters/huggingfacelocal_adapter.py b/libs/python/agent/agent/adapters/huggingfacelocal_adapter.py index 6f06734c..3ecba641 100644 --- a/libs/python/agent/agent/adapters/huggingfacelocal_adapter.py +++ b/libs/python/agent/agent/adapters/huggingfacelocal_adapter.py @@ -20,15 +20,17 @@ from .models import load_model as load_model_handler class HuggingFaceLocalAdapter(CustomLLM): """HuggingFace Local Adapter for running vision-language models locally.""" - def __init__(self, device: str = "auto", **kwargs): + def __init__(self, device: str = "auto", trust_remote_code: bool = False, **kwargs): """Initialize the adapter. Args: device: Device to load model on ("auto", "cuda", "cpu", etc.) + trust_remote_code: Whether to trust remote code **kwargs: Additional arguments """ super().__init__() self.device = device + self.trust_remote_code = trust_remote_code # Cache for model handlers keyed by model_name self._handlers: Dict[str, Any] = {} self._executor = ThreadPoolExecutor(max_workers=1) # Single thread pool @@ -36,7 +38,7 @@ class HuggingFaceLocalAdapter(CustomLLM): def _get_handler(self, model_name: str): """Get or create a model handler for the given model name.""" if model_name not in self._handlers: - self._handlers[model_name] = load_model_handler(model_name=model_name, device=self.device) + self._handlers[model_name] = load_model_handler(model_name=model_name, device=self.device, trust_remote_code=self.trust_remote_code) return self._handlers[model_name] def _convert_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: diff --git a/libs/python/agent/agent/adapters/models/__init__.py b/libs/python/agent/agent/adapters/models/__init__.py index 8a5fb00b..99696a1a 100644 --- a/libs/python/agent/agent/adapters/models/__init__.py +++ b/libs/python/agent/agent/adapters/models/__init__.py @@ -10,7 +10,7 @@ from .generic import GenericHFModel from .opencua import OpenCUAModel -def load_model(model_name: str, device: str = "auto"): +def load_model(model_name: str, device: str = "auto", trust_remote_code: bool = False): """Factory function to load and return the right model handler instance. - If the underlying transformers config class matches OpenCUA, return OpenCUAModel @@ -20,9 +20,9 @@ def load_model(model_name: str, device: str = "auto"): raise ImportError( "HuggingFace transformers dependencies not found. 
Install with: pip install \"cua-agent[uitars-hf]\"" ) - cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code) cls = cfg.__class__.__name__ # print(f"cls: {cls}") if "OpenCUA" in cls: - return OpenCUAModel(model_name=model_name, device=device) - return GenericHFModel(model_name=model_name, device=device) + return OpenCUAModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) + return GenericHFModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) diff --git a/libs/python/agent/agent/adapters/models/generic.py b/libs/python/agent/agent/adapters/models/generic.py index de267239..aefbaa7f 100644 --- a/libs/python/agent/agent/adapters/models/generic.py +++ b/libs/python/agent/agent/adapters/models/generic.py @@ -3,7 +3,7 @@ from typing import List, Dict, Any, Optional # Hugging Face imports are local to avoid hard dependency at module import try: import torch # type: ignore - from transformers import AutoModelForImageTextToText, AutoProcessor # type: ignore + from transformers import AutoModel, AutoProcessor # type: ignore HF_AVAILABLE = True except Exception: HF_AVAILABLE = False @@ -14,7 +14,7 @@ class GenericHFModel: Loads an AutoModelForImageTextToText and AutoProcessor and generates text. """ - def __init__(self, model_name: str, device: str = "auto") -> None: + def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None: if not HF_AVAILABLE: raise ImportError( "HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\"" @@ -23,15 +23,17 @@ class GenericHFModel: self.device = device self.model = None self.processor = None + self.trust_remote_code = trust_remote_code self._load() def _load(self) -> None: # Load model - self.model = AutoModelForImageTextToText.from_pretrained( + self.model = AutoModel.from_pretrained( self.model_name, torch_dtype=torch.float16, device_map=self.device, attn_implementation="sdpa", + trust_remote_code=self.trust_remote_code, ) # Load processor self.processor = AutoProcessor.from_pretrained( @@ -39,6 +41,7 @@ class GenericHFModel: min_pixels=3136, max_pixels=4096 * 2160, device_map=self.device, + trust_remote_code=self.trust_remote_code, ) def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str: diff --git a/libs/python/agent/agent/adapters/models/opencua.py b/libs/python/agent/agent/adapters/models/opencua.py index f8abf4a6..32c73134 100644 --- a/libs/python/agent/agent/adapters/models/opencua.py +++ b/libs/python/agent/agent/adapters/models/opencua.py @@ -16,7 +16,7 @@ except Exception: class OpenCUAModel: """OpenCUA model handler using AutoTokenizer, AutoModel and AutoImageProcessor.""" - def __init__(self, model_name: str, device: str = "auto") -> None: + def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None: if not OPENCUA_AVAILABLE: raise ImportError( "OpenCUA requirements not found. 
Install with: pip install \"cua-agent[opencua-hf]\"" @@ -26,21 +26,22 @@ class OpenCUAModel: self.model = None self.tokenizer = None self.image_processor = None + self.trust_remote_code = trust_remote_code self._load() def _load(self) -> None: self.tokenizer = AutoTokenizer.from_pretrained( - self.model_name, trust_remote_code=True + self.model_name, trust_remote_code=self.trust_remote_code ) self.model = AutoModel.from_pretrained( self.model_name, torch_dtype="auto", device_map=self.device, - trust_remote_code=True, + trust_remote_code=self.trust_remote_code, attn_implementation="sdpa", ) self.image_processor = AutoImageProcessor.from_pretrained( - self.model_name, trust_remote_code=True + self.model_name, trust_remote_code=self.trust_remote_code ) @staticmethod diff --git a/libs/python/agent/agent/agent.py b/libs/python/agent/agent/agent.py index 78fc3f45..bdcf5977 100644 --- a/libs/python/agent/agent/agent.py +++ b/libs/python/agent/agent/agent.py @@ -166,6 +166,7 @@ class ComputerAgent: use_prompt_caching: Optional[bool] = False, max_trajectory_budget: Optional[float | dict] = None, telemetry_enabled: Optional[bool] = True, + trust_remote_code: Optional[bool] = False, **kwargs ): """ @@ -184,6 +185,7 @@ class ComputerAgent: use_prompt_caching: If set, use prompt caching to avoid reprocessing the same prompt. Intended for use with anthropic providers. max_trajectory_budget: If set, adds BudgetManagerCallback to track usage costs and stop when budget is exceeded telemetry_enabled: If set, adds TelemetryCallback to track anonymized usage data. Enabled by default. + trust_remote_code: If set, trust remote code when loading local models. Disabled by default. **kwargs: Additional arguments passed to the agent loop """ self.model = model @@ -198,6 +200,7 @@ class ComputerAgent: self.use_prompt_caching = use_prompt_caching self.telemetry_enabled = telemetry_enabled self.kwargs = kwargs + self.trust_remote_code = trust_remote_code # == Add built-in callbacks == @@ -231,7 +234,8 @@ class ComputerAgent: # Register local model providers hf_adapter = HuggingFaceLocalAdapter( - device="auto" + device="auto", + trust_remote_code=self.trust_remote_code or False ) human_adapter = HumanAdapter() litellm.custom_provider_map = [ diff --git a/libs/python/agent/agent/cli.py b/libs/python/agent/agent/cli.py index de9e3450..b04f11db 100644 --- a/libs/python/agent/agent/cli.py +++ b/libs/python/agent/agent/cli.py @@ -331,6 +331,7 @@ Examples: agent_kwargs = { "model": args.model, "tools": [computer], + "trust_remote_code": True, # needed for some local models (e.g., InternVL, OpenCUA) "verbosity": 20 if args.verbose else 30, # DEBUG vs WARNING "max_retries": args.max_retries } diff --git a/libs/python/agent/agent/loops/gta1.py b/libs/python/agent/agent/loops/gta1.py index 13678b48..400daa29 100644 --- a/libs/python/agent/agent/loops/gta1.py +++ b/libs/python/agent/agent/loops/gta1.py @@ -155,7 +155,7 @@ class GTA1Config(AsyncAgentConfig): api_kwargs = { "model": model, "messages": [system_message, user_message], - "max_tokens": 32, + "max_tokens": 2056, "temperature": 0.0, **kwargs } diff --git a/libs/python/agent/agent/loops/opencua.py b/libs/python/agent/agent/loops/opencua.py index a494377b..1688b587 100644 --- a/libs/python/agent/agent/loops/opencua.py +++ b/libs/python/agent/agent/loops/opencua.py @@ -106,7 +106,7 @@ class OpenCUAConfig(AsyncAgentConfig): api_kwargs = { "model": model, "messages": [system_message, user_message], - "max_new_tokens": 512, + "max_new_tokens": 2056, "temperature": 0, **kwargs 
} diff --git a/libs/python/agent/agent/loops/uitars.py b/libs/python/agent/agent/loops/uitars.py index 10e0e45a..79c5d241 100644 --- a/libs/python/agent/agent/loops/uitars.py +++ b/libs/python/agent/agent/loops/uitars.py @@ -771,7 +771,7 @@ class UITARSConfig: api_kwargs = { "model": model, "messages": litellm_messages, - "max_tokens": 100, + "max_tokens": 2056, "temperature": 0.0, "do_sample": False } From 84e2a27aea3874fa0c3de263d0c194526b989103 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 26 Aug 2025 18:29:39 -0400 Subject: [PATCH 07/17] added notebook --- notebooks/composite_agents_docker_nb.ipynb | 162 +++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 notebooks/composite_agents_docker_nb.ipynb diff --git a/notebooks/composite_agents_docker_nb.ipynb b/notebooks/composite_agents_docker_nb.ipynb new file mode 100644 index 00000000..5b328f3e --- /dev/null +++ b/notebooks/composite_agents_docker_nb.ipynb @@ -0,0 +1,162 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Composite Agents with Docker Container Computer\n", + "\n", + "This notebook walks you through running a composed GUI agent using a Docker-based Computer and OpenRouter for the grounding model, paired with a planning model.\n", + "\n", + "We'll use the model string:\n", + "\n", + "- `\"openrouter/z-ai/glm-4.5v+openai/gpt-5-nano\"` (grounding + planning)\n", + "\n", + "Grounding (left) generates actionable UI coordinates; planning (right) reasons and drives steps." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prerequisites\n", + "\n", + "- Docker Desktop or Engine installed and running\n", + "- An OpenRouter account and API key (https://openrouter.ai/)\n", + "- (Optional) An OpenAI API key if using `openai/gpt-5-nano` for planning\n", + "- Python 3.12 environment with `cua-agent` installed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Install CUA Agent (and extras as needed)\n", + "!pip install -q \"cua-agent[all]\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare a Docker Computer\n", + "\n", + "We'll follow the documented Docker provider flow (see `docs/content/docs/computer-sdk/computers.mdx`).\n", + "\n", + "If you don't have the image yet, either pull or build it locally. 
Run these in a terminal, not inside the notebook:\n", + "\n", + "```bash\n", + "# Option 1: Pull from Docker Hub\n", + "docker pull trycua/cua-ubuntu:latest\n", + "\n", + "# Option 2: Build locally (from repo root)\n", + "cd libs/kasm\n", + "docker build -t cua-ubuntu:latest .\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set environment keys\n", + "\n", + "- Get an OpenRouter API key at https://openrouter.ai/\n", + "- If using OpenAI for planning, set your OpenAI key as well\n", + "- You can input them here to set for this notebook session" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY') or input('Enter your OPENROUTER_API_KEY: ').strip()\n", + "os.environ['OPENROUTER_API_KEY'] = OPENROUTER_API_KEY\n", + "\n", + "# Optional: if planning model uses OpenAI provider\n", + "OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') or input('(Optional) Enter your OPENAI_API_KEY (press Enter to skip): ').strip()\n", + "if OPENAI_API_KEY:\n", + " os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a Docker Computer and a composed agent\n", + "\n", + "This uses the documented Docker provider parameters: `os_type=\"linux\"`, `provider_type=\"docker\"`, plus `image` and `name`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import asyncio\n", + "from computer import Computer\n", + "from agent import ComputerAgent\n", + "\n", + "async def main():\n", + " # Launch & connect to a Docker container running the Computer Server\n", + " async with Computer(\n", + " os_type='linux',\n", + " provider_type='docker',\n", + " image='trycua/cua-ubuntu:latest',\n", + " name='my-cua-container'\n", + " ) as computer:\n", + " agent = ComputerAgent(\n", + " model='openrouter/z-ai/glm-4.5v+openai/gpt-5-nano',\n", + " tools=[computer],\n", + " trajectory_dir='trajectories' # Save agent trajectory (screenshots, api calls)\n", + " )\n", + "\n", + " # Simple task to verify end-to-end\n", + " async for _ in agent.run('Open a browser and go to example.com'):\n", + " pass\n", + "\n", + "asyncio.run(main())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Notes\n", + "\n", + "- Grounding (OpenRouter `z-ai/glm-4.5v`) + Planning (OpenAI `gpt-5-nano`) can be swapped for other providers/models.\n", + "- If you prefer to avoid OpenAI, choose a planning model on OpenRouter and update the model string accordingly.\n", + "- Be sure the planning model supports `vision` input and the `tools` parameter.\n", + "- The agent emits normalized Agent Responses across providers." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 58807378dddcf99221fd54b264eb180984efd88c Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Fri, 12 Sep 2025 13:30:09 -0400 Subject: [PATCH 08/17] Added internVL --- libs/python/agent/agent/loops/__init__.py | 4 +- .../agent/agent/loops/composed_grounded.py | 2 +- libs/python/agent/agent/loops/internvl.py | 179 ++++++++++++++++++ libs/python/agent/agent/loops/opencua.py | 20 +- 4 files changed, 200 insertions(+), 5 deletions(-) create mode 100644 libs/python/agent/agent/loops/internvl.py diff --git a/libs/python/agent/agent/loops/__init__.py b/libs/python/agent/agent/loops/__init__.py index 25227e64..958e484c 100644 --- a/libs/python/agent/agent/loops/__init__.py +++ b/libs/python/agent/agent/loops/__init__.py @@ -11,6 +11,7 @@ from . import gta1 from . import composed_grounded from . import glm45v from . import opencua +from . import internvl __all__ = [ "anthropic", @@ -20,5 +21,6 @@ __all__ = [ "gta1", "composed_grounded", "glm45v", - "opencua" + "opencua", + "internvl" ] \ No newline at end of file diff --git a/libs/python/agent/agent/loops/composed_grounded.py b/libs/python/agent/agent/loops/composed_grounded.py index cf029d13..87ba50e1 100644 --- a/libs/python/agent/agent/loops/composed_grounded.py +++ b/libs/python/agent/agent/loops/composed_grounded.py @@ -116,7 +116,7 @@ def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str @register_agent(r".*\+.*", priority=1) -class ComposedGroundedConfig: +class ComposedGroundedConfig(AsyncAgentConfig): """ Composed-grounded agent configuration that uses both grounding and thinking models. diff --git a/libs/python/agent/agent/loops/internvl.py b/libs/python/agent/agent/loops/internvl.py new file mode 100644 index 00000000..d1b8c3fe --- /dev/null +++ b/libs/python/agent/agent/loops/internvl.py @@ -0,0 +1,179 @@ +""" +InternVL agent loop implementation for click prediction using litellm.acompletion. + +Implements the ScreenSpot InternVL grounding baseline behavior: +- Uses the exact grounding prompt format with and tags +- Expects coordinates in 0-1000 normalized range in formats [[x1,y1,x2,y2]] or [[x,y]] +- Converts to pixel coordinates relative to the original screenshot size + +Note: We do NOT manually load the InternVL model; acompletions (via HuggingFaceLocalAdapter) +will handle loading based on the provided model name. 
+""" + +from __future__ import annotations + +import base64 +import math +import re +from io import BytesIO +from typing import Any, Dict, List, Optional, Tuple + +from PIL import Image +import litellm + +from ..decorators import register_agent +from .composed_grounded import ComposedGroundedConfig +from ..types import AgentCapability + + +# Regex patterns matching ScreenSpot baseline extractors +_POINT_PATTERN = re.compile(r"\[\[(\d+),(\d+)\]\]") +_BBOX_PATTERN = re.compile(r"\[\[(\d+),(\d+),(\d+),(\d+)\]\]") + + +def _extract_first_point(text: str) -> Optional[Tuple[float, float]]: + """Extract the first [[x,y]] as normalized (0-1000) floats.""" + m = _POINT_PATTERN.search(text) + if not m: + return None + try: + x = float(m.group(1)) + y = float(m.group(2)) + return x, y + except Exception: + return None + + +def _extract_last_bbox(text: str) -> Optional[Tuple[float, float, float, float]]: + """Extract the last [[x1,y1,x2,y2]] as normalized (0-1000) floats.""" + matches = list(_BBOX_PATTERN.finditer(text)) + if not matches: + return None + m = matches[-1] + try: + x1 = float(m.group(1)) + y1 = float(m.group(2)) + x2 = float(m.group(3)) + y2 = float(m.group(4)) + return x1, y1, x2, y2 + except Exception: + return None + + +def _scale_norm_to_pixels(x_norm: float, y_norm: float, width: int, height: int) -> Tuple[int, int]: + """Scale 0-1000 normalized coordinates to pixel coordinates for given image size.""" + x_px = int(math.floor((x_norm / 1000.0) * width)) + y_px = int(math.floor((y_norm / 1000.0) * height)) + # Clamp to image bounds just in case + x_px = max(0, min(width - 1, x_px)) + y_px = max(0, min(height - 1, y_px)) + return x_px, y_px + + +@register_agent(models=r"(?i).*InternVL.*") +class InternVLConfig(ComposedGroundedConfig): + """InternVL agent configuration reusing ComposedGroundedConfig for steps and + overriding predict_click to implement ScreenSpot InternVL grounding baseline.""" + + async def predict_step( + self, + messages: List[Dict[str, Any]], + model: str, + tools: Optional[List[Dict[str, Any]]] = None, + max_retries: Optional[int] = None, + stream: bool = False, + computer_handler=None, + _on_api_start=None, + _on_api_end=None, + _on_usage=None, + _on_screenshot=None, + **kwargs + ) -> Dict[str, Any]: + """Fallback to a self-composed model""" + return await super().predict_step( + messages=messages, + model=f"{model}+{model}", + tools=tools, + max_retries=max_retries, + stream=stream, + computer_handler=computer_handler, + _on_api_start=_on_api_start, + _on_api_end=_on_api_end, + _on_usage=_on_usage, + _on_screenshot=_on_screenshot, + **kwargs + ) + + async def predict_click( + self, + model: str, + image_b64: str, + instruction: str, + **kwargs + ) -> Optional[Tuple[int, int]]: + """ + Predict click coordinates using InternVL via litellm.acompletion. + + Behavior mirrors the ScreenSpot InternVL baseline: + - Prompt: "\nPlease provide the bounding box coordinate of the UI element this user instruction describes: {instruction}. 
Answer in the format of [[x1, y1, x2, y2]]" + - Parse either [[x,y]] point or [[x1,y1,x2,y2]] bbox, using bbox center if point missing + - Coordinates are 0-1000 normalized; convert to pixel coordinates for the original screenshot + """ + try: + # Decode image dimensions to scale the normalized outputs + img_bytes = base64.b64decode(image_b64) + image = Image.open(BytesIO(img_bytes)) + width, height = image.size + except Exception: + # If decoding fails, proceed with a safe default size to avoid crash + width, height = 1920, 1080 + + # Build grounding prompt exactly like the baseline + grounding_prompt = ( + f"Please provide the bounding box coordinate of the UI element this user instruction describes: {instruction}. " + f"Answer in the format of [[x1, y1, x2, y2]]" + ) + + # Prepare messages for LiteLLM + messages = [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{image_b64}"}, + }, + {"type": "text", "text": grounding_prompt}, + ], + } + ] + + # Call acompletion; HuggingFaceLocalAdapter/model handler will handle InternVL loading + api_kwargs = { + "model": model, + "messages": messages, + # Conservative generation params akin to baseline (deterministic) + "max_tokens": kwargs.get("max_tokens", 256), + "temperature": kwargs.get("temperature", 0.0), + } + + response = await litellm.acompletion(**api_kwargs) + output_text = (response.choices[0].message.content or "").strip() # type: ignore + + # Try to parse a point first; if absent, parse bbox and take center + point = _extract_first_point(output_text) + if point is None: + bbox = _extract_last_bbox(output_text) + if bbox is None: + return None + x1, y1, x2, y2 = bbox + cx = (x1 + x2) / 2.0 + cy = (y1 + y2) / 2.0 + point = (cx, cy) + + x_norm, y_norm = point + x_px, y_px = _scale_norm_to_pixels(x_norm, y_norm, width, height) + return (x_px, y_px) + + def get_capabilities(self) -> List[AgentCapability]: + return ["click", "step"] diff --git a/libs/python/agent/agent/loops/opencua.py b/libs/python/agent/agent/loops/opencua.py index 1688b587..b06ea126 100644 --- a/libs/python/agent/agent/loops/opencua.py +++ b/libs/python/agent/agent/loops/opencua.py @@ -14,6 +14,7 @@ from PIL import Image import litellm import math +from .composed_grounded import ComposedGroundedConfig from ..decorators import register_agent from ..types import Messages, AgentResponse, Tools, AgentCapability from ..loops.base import AsyncAgentConfig @@ -32,10 +33,11 @@ def extract_coordinates_from_pyautogui(text: str) -> Optional[Tuple[int, int]]: return None @register_agent(models=r"(?i).*OpenCUA.*") -class OpenCUAConfig(AsyncAgentConfig): +class OpenCUAConfig(ComposedGroundedConfig): """OpenCUA agent configuration implementing AsyncAgentConfig protocol for click prediction.""" def __init__(self): + super().__init__() self.current_model = None self.last_screenshot_b64 = None @@ -53,8 +55,20 @@ class OpenCUAConfig(AsyncAgentConfig): _on_screenshot=None, **kwargs ) -> Dict[str, Any]: - """Predict step is not implemented for OpenCUA model.""" - raise NotImplementedError("predict_step is not implemented for OpenCUA model") + """Fallback to a self-composed model""" + return await super().predict_step( + messages=messages, + model=f"{model}+{model}", + tools=tools, + max_retries=max_retries, + stream=stream, + computer_handler=computer_handler, + _on_api_start=_on_api_start, + _on_api_end=_on_api_end, + _on_usage=_on_usage, + _on_screenshot=_on_screenshot, + **kwargs + ) async def predict_click( self, From 
eba94ce9193380f70ad3321eaf1d7ab088203727 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 15 Sep 2025 15:55:33 -0400 Subject: [PATCH 09/17] added internVL, Holo1.5, and OpenCUA to readme --- README.md | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index eea56257..f97adb12 100644 --- a/README.md +++ b/README.md @@ -29,21 +29,25 @@ With the Computer SDK, you can: - create & manage VMs [locally](https://docs.trycua.com/docs/computer-sdk/computers#cua-local-containers) or using [cua cloud](https://www.trycua.com/) With the Agent SDK, you can: -- run computer-use models with a [consistent output](https://docs.trycua.com/docs/agent-sdk/chat-history#message-array-structure) -- run composed agents using UI grounding models and any LLM -- use any liteLLM provider (`openai/`, `openrouter/`, etc.) or our included local providers (`huggingface-local/`, `mlx/`) -- quickly evaluate new UI agent models and UI grounding models - - `anthropic/claude-opus-4-1-20250805` (using [Computer-Use Models](https://docs.trycua.com/docs/agent-sdk/supported-agents/computer-use-agents)) - - `openai/computer-use-preview` - - `openrouter/z-ai/glm-4.5v` - - `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` - - `omniparser+{any LLM}` (using [Composed Agents](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents)) - - `huggingface-local/HelloKKMe/GTA1-7B+{any LLM}` - - `huggingface/HelloKKMe/GTA1-32B+{any LLM}` - - `vllm_hosted/HelloKKMe/GTA1-72B+{any LLM}` - - `human/human` (using [Human-in-the-Loop](https://docs.trycua.com/docs/agent-sdk/supported-agents/human-in-the-loop)) +- run computer-use models with a [consistent schema](https://docs.trycua.com/docs/agent-sdk/message-format) +- combine UI grounding models with any LLM using [composed agents](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) +- use new UI agent models and UI grounding models from the Model Zoo below with just a model string (e.g., `ComputerAgent(model="openai/computer-use-preview")`) +- use any API or local provider by specifying a prefix (e.g., `openai/`, `openrouter/`, `ollama/`, `huggingface-local/`, `mlx/`, [etc.](https://docs.litellm.ai/docs/providers)) - benchmark on OSWorld-Verified, SheetBench-V2, and more [with a single line of code using HUD](https://docs.trycua.com/docs/agent-sdk/integrations/hud) ([Notebook](https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb)) +### CUA Model Zoo 🐨 + +| [All-in-one CUAs](https://docs.trycua.com/docs/agent-sdk/supported-agents/computer-use-agents) | [UI Grounding Models](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) | [UI Planning Models](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) | +|---|---|---| +| `anthropic/claude-opus-4-1-20250805` | `huggingface-local/xlangai/OpenCUA-{7B,32B}` | any all-in-one CUA | +| `openai/computer-use-preview` | `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` | any VLM (using liteLLM) | +| `openrouter/z-ai/glm-4.5v` | `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` | | +| `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` | | +| `omniparser+{ui planning}` | | | +| `{ui grounding}+{ui planning}` | | | + +- `human/human` → [Human-in-the-Loop](https://docs.trycua.com/docs/agent-sdk/supported-agents/human-in-the-loop) + Missing a model? 
[Raise a feature request](https://github.com/trycua/cua/issues/new?assignees=&labels=enhancement&projects=&title=%5BAgent%5D%3A+Add+model+support+for+) or [contribute](https://github.com/trycua/cua/blob/main/CONTRIBUTING.md)!
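For example, the composed `{ui grounding}+{ui planning}` strings in the table above can be passed directly to `ComputerAgent`. A minimal sketch, reusing the Docker setup from the notebook added earlier in this series; the OpenCUA-7B + GPT-5-nano pairing is purely illustrative, and `trust_remote_code=True` is the constructor flag introduced in PATCH 06 that OpenCUA's remote model code needs:

```python
import asyncio
from computer import Computer
from agent import ComputerAgent

async def main():
    # Docker-based Computer, as in the composite-agents notebook
    async with Computer(
        os_type="linux",
        provider_type="docker",
        image="trycua/cua-ubuntu:latest",
        name="my-cua-container",
    ) as computer:
        agent = ComputerAgent(
            # UI grounding model + UI planning model from the Model Zoo table
            model="huggingface-local/xlangai/OpenCUA-7B+openai/gpt-5-nano",
            tools=[computer],
            trust_remote_code=True,  # needed to load OpenCUA's custom model class
        )
        async for _ in agent.run("Open a browser and go to example.com"):
            pass

asyncio.run(main())
```

The same pattern applies to the other grounding entries (GTA1, Holo1.5, UI-TARS); only the left-hand side of the composed model string changes.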
From 3a1244e1c18421c78ccc3afac2ba83f806b6824f Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 15 Sep 2025 15:59:33 -0400 Subject: [PATCH 10/17] updated model zoo --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f97adb12..cd732f57 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ With the Agent SDK, you can: - run computer-use models with a [consistent schema](https://docs.trycua.com/docs/agent-sdk/message-format) - combine UI grounding models with any LLM using [composed agents](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) - use new UI agent models and UI grounding models from the Model Zoo below with just a model string (e.g., `ComputerAgent(model="openai/computer-use-preview")`) -- use any API or local provider by specifying a prefix (e.g., `openai/`, `openrouter/`, `ollama/`, `huggingface-local/`, `mlx/`, [etc.](https://docs.litellm.ai/docs/providers)) +- use API or local inference by specifying a prefix (e.g., `openai/`, `openrouter/`, `ollama/`, `huggingface-local/`, `mlx/`, [etc.](https://docs.litellm.ai/docs/providers)) - benchmark on OSWorld-Verified, SheetBench-V2, and more [with a single line of code using HUD](https://docs.trycua.com/docs/agent-sdk/integrations/hud) ([Notebook](https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb)) ### CUA Model Zoo 🐨 @@ -40,7 +40,7 @@ With the Agent SDK, you can: | [All-in-one CUAs](https://docs.trycua.com/docs/agent-sdk/supported-agents/computer-use-agents) | [UI Grounding Models](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) | [UI Planning Models](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) | |---|---|---| | `anthropic/claude-opus-4-1-20250805` | `huggingface-local/xlangai/OpenCUA-{7B,32B}` | any all-in-one CUA | -| `openai/computer-use-preview` | `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` | any VLM (using liteLLM) | +| `openai/computer-use-preview` | `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` | any VLM (using liteLLM, requires `tools` parameter) | | `openrouter/z-ai/glm-4.5v` | `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` | | | `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` | | | `omniparser+{ui planning}` | | | From c217ed13af6a07608c4f1d2e63e3cee80aa0f12f Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 15 Sep 2025 16:01:31 -0400 Subject: [PATCH 11/17] wording change --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cd732f57..c9e814aa 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ With the Agent SDK, you can: - run computer-use models with a [consistent schema](https://docs.trycua.com/docs/agent-sdk/message-format) - combine UI grounding models with any LLM using [composed agents](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) - use new UI agent models and UI grounding models from the Model Zoo below with just a model string (e.g., `ComputerAgent(model="openai/computer-use-preview")`) -- use API or local inference by specifying a prefix (e.g., `openai/`, `openrouter/`, `ollama/`, `huggingface-local/`, `mlx/`, [etc.](https://docs.litellm.ai/docs/providers)) +- use API or local inference by changing a prefix (e.g., `openai/`, `openrouter/`, `ollama/`, `huggingface-local/`, `mlx/`, [etc.](https://docs.litellm.ai/docs/providers)) - benchmark on OSWorld-Verified, SheetBench-V2, and more 
[with a single line of code using HUD](https://docs.trycua.com/docs/agent-sdk/integrations/hud) ([Notebook](https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb)) ### CUA Model Zoo 🐨 From 7a7de5d50f8c001300da69c3c154579be890ffe9 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 15 Sep 2025 16:10:54 -0400 Subject: [PATCH 12/17] add holo models --- README.md | 5 +- libs/python/agent/agent/loops/__init__.py | 4 +- libs/python/agent/agent/loops/holo.py | 216 ++++++++++++++++++++++ 3 files changed, 222 insertions(+), 3 deletions(-) create mode 100644 libs/python/agent/agent/loops/holo.py diff --git a/README.md b/README.md index c9e814aa..24316555 100644 --- a/README.md +++ b/README.md @@ -40,9 +40,10 @@ With the Agent SDK, you can: | [All-in-one CUAs](https://docs.trycua.com/docs/agent-sdk/supported-agents/computer-use-agents) | [UI Grounding Models](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) | [UI Planning Models](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) | |---|---|---| | `anthropic/claude-opus-4-1-20250805` | `huggingface-local/xlangai/OpenCUA-{7B,32B}` | any all-in-one CUA | -| `openai/computer-use-preview` | `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` | any VLM (using liteLLM, requires `tools` parameter) | +| `openai/computer-use-preview` | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` | any VLM (using liteLLM, requires `tools` parameter) | | `openrouter/z-ai/glm-4.5v` | `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` | | -| `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` | | +| `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` | any all-in-one CUA | | +| `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` | | | `omniparser+{ui planning}` | | | | `{ui grounding}+{ui planning}` | | | diff --git a/libs/python/agent/agent/loops/__init__.py b/libs/python/agent/agent/loops/__init__.py index 958e484c..406f14ca 100644 --- a/libs/python/agent/agent/loops/__init__.py +++ b/libs/python/agent/agent/loops/__init__.py @@ -12,6 +12,7 @@ from . import composed_grounded from . import glm45v from . import opencua from . import internvl +from . import holo __all__ = [ "anthropic", @@ -22,5 +23,6 @@ __all__ = [ "composed_grounded", "glm45v", "opencua", - "internvl" + "internvl", + "holo", ] \ No newline at end of file diff --git a/libs/python/agent/agent/loops/holo.py b/libs/python/agent/agent/loops/holo.py new file mode 100644 index 00000000..b1cbc5a1 --- /dev/null +++ b/libs/python/agent/agent/loops/holo.py @@ -0,0 +1,216 @@ +""" +Holo 1.5 agent loop implementation for click prediction using litellm.acompletion. + +Implements the Holo1.5 grounding behavior: +- Prompt asks for absolute pixel coordinates in JSON: {"action":"click_absolute","x":int,"y":int} +- Optionally resizes the image using Qwen2-VL smart_resize parameters (via transformers AutoProcessor) +- If resized, maps predicted coordinates back to the original screenshot resolution + +Note: We do NOT manually load the model; acompletions (via HuggingFaceLocalAdapter) +will handle loading based on the provided model name. 
+""" + +from __future__ import annotations + +import base64 +import json +from io import BytesIO +from typing import Any, Dict, List, Optional, Tuple + +import litellm +from PIL import Image + +from ..decorators import register_agent +from .base import AsyncAgentConfig +from ..types import AgentCapability + + +def _strip_hf_prefix(model: str) -> str: + """Strip provider prefixes like 'huggingface-local/' from model names for HF processor load.""" + if "/" in model and model.lower().startswith("huggingface-local/"): + return model.split("/", 1)[1] + return model + + +def _maybe_smart_resize(image: Image.Image, model: str) -> Tuple[Image.Image, Tuple[int, int]]: + """ + Try to compute Qwen2-VL smart_resize output size using transformers AutoProcessor. + + Returns (processed_image, (orig_w, orig_h)). If transformers or processor unavailable, + returns the original image and size without resizing. + """ + orig_w, orig_h = image.size + try: + # Import lazily to avoid hard dependency if not installed + from transformers import AutoProcessor # type: ignore + from transformers.models.qwen2_vl.image_processing_qwen2_vl import ( # type: ignore + smart_resize, + ) + + processor_name = _strip_hf_prefix(model) + processor = AutoProcessor.from_pretrained(processor_name) + image_processor = getattr(processor, "image_processor", None) + if image_processor is None: + return image, (orig_w, orig_h) + + factor = getattr(image_processor, "patch_size", 14) * getattr(image_processor, "merge_size", 1) + min_pixels = getattr(image_processor, "min_pixels", 256 * 256) + max_pixels = getattr(image_processor, "max_pixels", 1536 * 1536) + + resized_h, resized_w = smart_resize( + orig_h, + orig_w, + factor=factor, + min_pixels=min_pixels, + max_pixels=max_pixels, + ) + + if (resized_w, resized_h) == (orig_w, orig_h): + return image, (orig_w, orig_h) + + processed = image.resize((resized_w, resized_h), resample=Image.Resampling.LANCZOS) + return processed, (orig_w, orig_h) + except Exception: + # If any failure (no transformers, processor load error), fall back to original + return image, (orig_w, orig_h) + + +def _build_holo_prompt(instruction: str) -> str: + """Construct the Holo1.5 grounding prompt.""" + # Keep it close to the cookbook while avoiding heavy schema generation + schema_hint = '{"action": "click_absolute", "x": , "y": }' + return ( + "Localize an element on the GUI image according to the provided target and output a click position. " + f"You must output a valid JSON following the format: {schema_hint} " + f"Your target is: {instruction}" + ) + + +def _parse_click_json(output_text: str) -> Optional[Tuple[int, int]]: + """ + Parse JSON from model output and extract x, y ints. + Tries to find the first JSON object substring if extra text is present. 
+ """ + try: + # Fast path: direct JSON + data = json.loads(output_text) + except Exception: + # Try to locate a JSON object within the text + start = output_text.find("{") + end = output_text.rfind("}") + if start == -1 or end == -1 or end <= start: + return None + try: + data = json.loads(output_text[start : end + 1]) + except Exception: + return None + + try: + x = int(data.get("x")) + y = int(data.get("y")) + return x, y + except Exception: + return None + + +@register_agent(models=r"(?i).*(Holo1\.5|Hcompany/Holo1\.5).*") +class HoloConfig(AsyncAgentConfig): + """Holo is a family of UI grounding models from H Company""" + + async def predict_step( + self, + messages: List[Dict[str, Any]], + model: str, + tools: Optional[List[Dict[str, Any]]] = None, + max_retries: Optional[int] = None, + stream: bool = False, + computer_handler=None, + _on_api_start=None, + _on_api_end=None, + _on_usage=None, + _on_screenshot=None, + **kwargs, + ) -> Dict[str, Any]: + # Holo models are only trained on UI localization tasks, not all-in-one agent + raise NotImplementedError() + + async def predict_click( + self, + model: str, + image_b64: str, + instruction: str, + **kwargs, + ) -> Optional[Tuple[int, int]]: + """ + Predict click coordinates using Holo1.5 via litellm.acompletion. + + - Optionally smart-resizes the image using Qwen2-VL rules if transformers are available + - Prompts for JSON with absolute pixel coordinates + - Parses x,y and maps back to original screenshot size if resized + """ + try: + img_bytes = base64.b64decode(image_b64) + original_img = Image.open(BytesIO(img_bytes)) + except Exception: + return None + + # Optional preprocessing + processed_img, (orig_w, orig_h) = _maybe_smart_resize(original_img, model) + + # If we resized, send the resized image; otherwise send original + img_to_send = processed_img + buf = BytesIO() + img_to_send.save(buf, format="PNG") + processed_b64 = base64.b64encode(buf.getvalue()).decode("utf-8") + + prompt = _build_holo_prompt(instruction) + + messages = [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{processed_b64}"}, + }, + {"type": "text", "text": prompt}, + ], + } + ] + + api_kwargs = { + "model": model, + "messages": messages, + # Deterministic, small output + "max_tokens": kwargs.get("max_tokens", 256), + "temperature": kwargs.get("temperature", 0.0), + } + + response = await litellm.acompletion(**api_kwargs) + output_text = (response.choices[0].message.content or "").strip() # type: ignore + + coords = _parse_click_json(output_text) + if coords is None: + return None + + x, y = coords + + # Map back to original size if we resized + proc_w, proc_h = img_to_send.size + if (proc_w, proc_h) != (orig_w, orig_h): + try: + sx = orig_w / float(proc_w) + sy = orig_h / float(proc_h) + x = int(round(x * sx)) + y = int(round(y * sy)) + except Exception: + # Fallback: clamp within original bounds + pass + + # Clamp to original image bounds + x = max(0, min(orig_w - 1, x)) + y = max(0, min(orig_h - 1, y)) + return x, y + + def get_capabilities(self) -> List[AgentCapability]: + return ["click"] From c5bbd4611ab11d9eaaba684f916717978c3f0005 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 15 Sep 2025 16:29:26 -0400 Subject: [PATCH 13/17] add qwen2_5_vl.py --- .../agent/agent/adapters/models/__init__.py | 4 +- .../agent/agent/adapters/models/qwen2_5_vl.py | 75 +++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 
libs/python/agent/agent/adapters/models/qwen2_5_vl.py diff --git a/libs/python/agent/agent/adapters/models/__init__.py b/libs/python/agent/agent/adapters/models/__init__.py index 99696a1a..10f896d2 100644 --- a/libs/python/agent/agent/adapters/models/__init__.py +++ b/libs/python/agent/agent/adapters/models/__init__.py @@ -8,7 +8,7 @@ except ImportError: from .generic import GenericHFModel from .opencua import OpenCUAModel - +from .qwen2_5_vl import Qwen2_5_VLModel def load_model(model_name: str, device: str = "auto", trust_remote_code: bool = False): """Factory function to load and return the right model handler instance. @@ -25,4 +25,6 @@ def load_model(model_name: str, device: str = "auto", trust_remote_code: bool = # print(f"cls: {cls}") if "OpenCUA" in cls: return OpenCUAModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) + elif "Qwen2_5_VLConfig" in cls: + return Qwen2_5_VLModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) return GenericHFModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) diff --git a/libs/python/agent/agent/adapters/models/qwen2_5_vl.py b/libs/python/agent/agent/adapters/models/qwen2_5_vl.py new file mode 100644 index 00000000..17b25f8a --- /dev/null +++ b/libs/python/agent/agent/adapters/models/qwen2_5_vl.py @@ -0,0 +1,75 @@ +from typing import List, Dict, Any, Optional + +# Hugging Face imports are local to avoid hard dependency at module import +try: + import torch # type: ignore + from transformers import AutoModelForImageTextToText, AutoProcessor # type: ignore + HF_AVAILABLE = True +except Exception: + HF_AVAILABLE = False + + +class Qwen2_5_VLModel: + """Qwen2.5-VL Hugging Face vision-language model handler. + Loads an AutoModelForImageTextToText and AutoProcessor and generates text. + """ + + def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None: + if not HF_AVAILABLE: + raise ImportError( + "HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\"" + ) + self.model_name = model_name + self.device = device + self.model = None + self.processor = None + self.trust_remote_code = trust_remote_code + self._load() + + def _load(self) -> None: + # Load model + self.model = AutoModelForImageTextToText.from_pretrained( + self.model_name, + torch_dtype=torch.bfloat16, + device_map=self.device, + attn_implementation="sdpa", + trust_remote_code=self.trust_remote_code, + ) + # Load processor + self.processor = AutoProcessor.from_pretrained( + self.model_name, + min_pixels=3136, + max_pixels=4096 * 2160, + device_map=self.device, + trust_remote_code=self.trust_remote_code, + ) + + def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str: + """Generate text for the given HF-format messages. 
+ messages: [{ role, content: [{type:'text'|'image', text|image}] }] + """ + assert self.model is not None and self.processor is not None + # Apply chat template and tokenize + inputs = self.processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + ) + # Move inputs to the same device as model + inputs = inputs.to(self.model.device) + # Generate + with torch.no_grad(): + generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens) + # Trim prompt tokens from output + generated_ids_trimmed = [ + out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) + ] + # Decode + output_text = self.processor.batch_decode( + generated_ids_trimmed, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) + return output_text[0] if output_text else "" From a46c276e70063607e851030fac880e75e9cd21a5 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 15 Sep 2025 16:41:39 -0400 Subject: [PATCH 14/17] updated model docs --- .../supported-agents/composed-agents.mdx | 32 ++++++++------- .../supported-agents/computer-use-agents.mdx | 41 ++++++++++++------- .../supported-agents/grounding-models.mdx | 29 +++++++------ 3 files changed, 61 insertions(+), 41 deletions(-) diff --git a/docs/content/docs/agent-sdk/supported-agents/composed-agents.mdx b/docs/content/docs/agent-sdk/supported-agents/composed-agents.mdx index 8040d2e5..485074e2 100644 --- a/docs/content/docs/agent-sdk/supported-agents/composed-agents.mdx +++ b/docs/content/docs/agent-sdk/supported-agents/composed-agents.mdx @@ -5,32 +5,36 @@ description: Combine grounding models with any LLM for computer-use capabilities Composed agents combine the best of both worlds: specialized grounding models for precise click prediction and powerful LLMs for task planning and reasoning. -Use the format `"grounding_model+thinking_model"` to create a composed agent with any vision-enabled LiteLLM-compatible model. +Use the format `"grounding_model+planning_model"` to create a composed agent with any vision-enabled LiteLLM-compatible model. ## How Composed Agents Work -1. **Planning Phase**: The thinking model (LLM) analyzes the task and decides what actions to take (e.g., `click("find the login button")`, `type("username")`) +1. **Planning Phase**: The planning model (LLM) analyzes the task and decides what actions to take (e.g., `click("find the login button")`, `type("username")`) 2. **Grounding Phase**: The grounding model converts element descriptions to precise coordinates 3. **Execution**: Actions are performed using the predicted coordinates ## Supported Grounding Models -Any model that supports `predict_click()` can be used as the grounding component: +Any model that supports `predict_click()` can be used as the grounding component. See the full list on [Grounding Models](./grounding-models). 
-- `omniparser` (OSS set-of-marks model) -- `huggingface-local/HelloKKMe/GTA1-7B` (OSS grounding model) -- `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` (OSS unified model) -- `claude-3-5-sonnet-20241022` (Anthropic CUA) -- `openai/computer-use-preview` (OpenAI CUA) +- OpenCUA: `huggingface-local/xlangai/OpenCUA-{7B,32B}` +- GTA1 family: `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` +- Holo 1.5 family: `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` +- InternVL 3.5 family: `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` +- UI‑TARS 1.5: `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` (also supports full CU) +- OmniParser (OCR): `omniparser` (requires combination with a LiteLLM vision model) -## Supported Thinking Models +## Supported Planning Models -Any vision-enabled LiteLLM-compatible model can be used as the thinking component: +Any vision-enabled LiteLLM-compatible model can be used as the planning component: -- **Anthropic**: `anthropic/claude-3-5-sonnet-20241022`, `anthropic/claude-3-opus-20240229` -- **OpenAI**: `openai/gpt-5`, `openai/gpt-o3`, `openai/gpt-4o` -- **Google**: `gemini/gemini-1.5-pro`, `vertex_ai/gemini-pro-vision` -- **Local models**: Any Hugging Face vision-language model +- Any All‑in‑one CUA (planning-capable). See [All‑in‑one CUAs](./computer-use-agents). +- Any VLM via LiteLLM providers: `anthropic/*`, `openai/*`, `openrouter/*`, `gemini/*`, `vertex_ai/*`, `huggingface-local/*`, `mlx/*`, etc. +- Examples: + - **Anthropic**: `anthropic/claude-3-5-sonnet-20241022`, `anthropic/claude-opus-4-1-20250805` + - **OpenAI**: `openai/gpt-5`, `openai/gpt-o3`, `openai/gpt-4o` + - **Google**: `gemini/gemini-1.5-pro`, `vertex_ai/gemini-pro-vision` + - **Local models**: Any Hugging Face vision-language model ## Usage Examples diff --git a/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx b/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx index 44ab41d1..b2487a7c 100644 --- a/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx +++ b/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx @@ -1,5 +1,5 @@ --- -title: Computer-Use Models +title: All‑in‑one CUA Models description: Models that support full computer-use agent capabilities with ComputerAgent.run() --- @@ -36,19 +36,6 @@ async for _ in agent.run("Take a screenshot and describe what you see"): pass ``` -## UI-TARS 1.5 - -Unified vision-language model for computer-use: - -- `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` -- `huggingface/ByteDance-Seed/UI-TARS-1.5-7B` (requires TGI endpoint) - -```python -agent = ComputerAgent("huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", tools=[computer]) -async for _ in agent.run("Open the settings menu and change the theme to dark mode"): - pass -``` - ## GLM-4.5V Zhipu AI's GLM-4.5V vision-language model with computer-use capabilities: @@ -62,6 +49,32 @@ async for _ in agent.run("Click on the search bar and type 'hello world'"): pass ``` +## InternVL 3.5 + +InternVL 3.5 family: +- `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` + +```python +agent = ComputerAgent("huggingface-local/OpenGVLab/InternVL3_5-1B", tools=[computer]) +async for _ in agent.run("Open Firefox and navigate to github.com"): + pass +``` + +## UI-TARS 1.5 + +Unified vision-language model for computer-use: + +- `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` +- `huggingface/ByteDance-Seed/UI-TARS-1.5-7B` (requires TGI endpoint) + +```python +agent = 
ComputerAgent("huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", tools=[computer]) +async for _ in agent.run("Open the settings menu and change the theme to dark mode"): + pass +``` + --- +CUAs also support direct click prediction. See [Grounding Models](./grounding-models) for details on `predict_click()`. + For details on agent loop behavior and usage, see [Agent Loops](../agent-loops). diff --git a/docs/content/docs/agent-sdk/supported-agents/grounding-models.mdx b/docs/content/docs/agent-sdk/supported-agents/grounding-models.mdx index 65d254fe..9270f183 100644 --- a/docs/content/docs/agent-sdk/supported-agents/grounding-models.mdx +++ b/docs/content/docs/agent-sdk/supported-agents/grounding-models.mdx @@ -7,9 +7,7 @@ These models specialize in UI element grounding and click prediction. They can i Use `ComputerAgent.predict_click()` to get coordinates for specific UI elements. -## All Computer-Use Agents - -All models that support `ComputerAgent.run()` also support `ComputerAgent.predict_click()`: +All models that support `ComputerAgent.run()` also support `ComputerAgent.predict_click()`. See [All‑in‑one CUAs](./computer-use-agents). ### Anthropic CUAs @@ -21,7 +19,7 @@ All models that support `ComputerAgent.run()` also support `ComputerAgent.predic ### OpenAI CUA Preview - Computer-use-preview: `computer-use-preview` -### UI-TARS 1.5 +### UI-TARS 1.5 (Unified VLM with grounding support) - `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` - `huggingface/ByteDance-Seed/UI-TARS-1.5-7B` (requires TGI endpoint) @@ -29,18 +27,24 @@ All models that support `ComputerAgent.run()` also support `ComputerAgent.predic These models are optimized specifically for click prediction and UI element grounding: -### OmniParser +### OpenCUA +- `huggingface-local/xlangai/OpenCUA-{7B,32B}` + +### GTA1 Family +- `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` + +### Holo 1.5 Family +- `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` + +### InternVL 3.5 Family +- `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` + +### OmniParser (OCR) OCR-focused set-of-marks model that requires an LLM for click prediction: - `omniparser` (requires combination with any LiteLLM vision model) -### GTA1-7B - -State-of-the-art grounding model from the [GUI Agent Grounding Leaderboard](https://gui-agent.github.io/grounding-leaderboard/): - -- `huggingface-local/HelloKKMe/GTA1-7B` - ## Usage Examples ```python @@ -83,7 +87,6 @@ print(f"Click coordinates: {coords}") # (450, 320) # agent.run("Fill out the form and submit it") ``` - --- -For information on combining grounding models with planning capabilities, see [Composed Agents](./composed-agents). +For information on combining grounding models with planning capabilities, see [Composed Agents](./composed-agents) and [All‑in‑one CUAs](./computer-use-agents). 
From 7cf27b1cc373431b3036a2d5d855c9d86f2e807d Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 15 Sep 2025 16:47:34 -0400 Subject: [PATCH 15/17] docs organization --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 24316555..e03d9bb3 100644 --- a/README.md +++ b/README.md @@ -30,10 +30,10 @@ With the Computer SDK, you can: With the Agent SDK, you can: - run computer-use models with a [consistent schema](https://docs.trycua.com/docs/agent-sdk/message-format) +- benchmark on OSWorld-Verified, SheetBench-V2, and more [with a single line of code using HUD](https://docs.trycua.com/docs/agent-sdk/integrations/hud) ([Notebook](https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb)) - combine UI grounding models with any LLM using [composed agents](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) - use new UI agent models and UI grounding models from the Model Zoo below with just a model string (e.g., `ComputerAgent(model="openai/computer-use-preview")`) - use API or local inference by changing a prefix (e.g., `openai/`, `openrouter/`, `ollama/`, `huggingface-local/`, `mlx/`, [etc.](https://docs.litellm.ai/docs/providers)) -- benchmark on OSWorld-Verified, SheetBench-V2, and more [with a single line of code using HUD](https://docs.trycua.com/docs/agent-sdk/integrations/hud) ([Notebook](https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb)) ### CUA Model Zoo 🐨 From 9147e8eeaf419c7116663df66b1d8604175cc800 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 16 Sep 2025 12:02:07 -0400 Subject: [PATCH 16/17] Added "cua-agent[internvl-hf]" dep --- .../agent/agent/adapters/models/__init__.py | 5 +- .../agent/agent/adapters/models/internvl.py | 78 ++++++++++++++++ libs/python/agent/agent/cli.py | 90 ++++++++++++++++++- libs/python/agent/pyproject.toml | 12 ++- 4 files changed, 182 insertions(+), 3 deletions(-) create mode 100644 libs/python/agent/agent/adapters/models/internvl.py diff --git a/libs/python/agent/agent/adapters/models/__init__.py b/libs/python/agent/agent/adapters/models/__init__.py index 10f896d2..b36fda1b 100644 --- a/libs/python/agent/agent/adapters/models/__init__.py +++ b/libs/python/agent/agent/adapters/models/__init__.py @@ -9,6 +9,7 @@ except ImportError: from .generic import GenericHFModel from .opencua import OpenCUAModel from .qwen2_5_vl import Qwen2_5_VLModel +from .internvl import InternVLModel def load_model(model_name: str, device: str = "auto", trust_remote_code: bool = False): """Factory function to load and return the right model handler instance. 
@@ -22,9 +23,11 @@ def load_model(model_name: str, device: str = "auto", trust_remote_code: bool = ) cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code) cls = cfg.__class__.__name__ - # print(f"cls: {cls}") + print(f"cls: {cls}") if "OpenCUA" in cls: return OpenCUAModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) elif "Qwen2_5_VLConfig" in cls: return Qwen2_5_VLModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) + elif "InternVLChatConfig" in cls: + return InternVLModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) return GenericHFModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) diff --git a/libs/python/agent/agent/adapters/models/internvl.py b/libs/python/agent/agent/adapters/models/internvl.py new file mode 100644 index 00000000..0ed32e6b --- /dev/null +++ b/libs/python/agent/agent/adapters/models/internvl.py @@ -0,0 +1,78 @@ +from typing import List, Dict, Any, Optional + +# Hugging Face imports are local to avoid hard dependency at module import +try: + import torch # type: ignore + from transformers import AutoModel, AutoProcessor # type: ignore + # Attempt to import InternVL's model dependencies + import einops as _ # type: ignore + import timm as _ # type: ignore + HF_AVAILABLE = True +except Exception: + HF_AVAILABLE = False + + +class InternVLModel: + """Generic Hugging Face vision-language model handler. + Loads an AutoModelForImageTextToText and AutoProcessor and generates text. + """ + + def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None: + if not HF_AVAILABLE: + raise ImportError( + "InternVL dependencies not found. Install with: pip install \"cua-agent[internvl-hf]\"" + ) + self.model_name = model_name + self.device = device + self.model = None + self.processor = None + self.trust_remote_code = trust_remote_code + self._load() + + def _load(self) -> None: + # Load model + self.model = AutoModel.from_pretrained( + self.model_name, + torch_dtype=torch.float16, + device_map=self.device, + attn_implementation="sdpa", + trust_remote_code=self.trust_remote_code, + ) + # Load processor + self.processor = AutoProcessor.from_pretrained( + self.model_name, + min_pixels=3136, + max_pixels=4096 * 2160, + device_map=self.device, + trust_remote_code=self.trust_remote_code, + ) + + def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str: + """Generate text for the given HF-format messages. 
+ messages: [{ role, content: [{type:'text'|'image', text|image}] }] + """ + assert self.model is not None and self.processor is not None + # Apply chat template and tokenize + inputs = self.processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + ) + # Move inputs to the same device as model + inputs = inputs.to(self.model.device) + # Generate + with torch.no_grad(): + generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens) + # Trim prompt tokens from output + generated_ids_trimmed = [ + out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) + ] + # Decode + output_text = self.processor.batch_decode( + generated_ids_trimmed, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) + return output_text[0] if output_text else "" diff --git a/libs/python/agent/agent/cli.py b/libs/python/agent/agent/cli.py index b04f11db..c0434d02 100644 --- a/libs/python/agent/agent/cli.py +++ b/libs/python/agent/agent/cli.py @@ -18,6 +18,15 @@ try: import json from typing import List, Dict, Any import dotenv + import base64 + import time + import platform + from pathlib import Path + try: + from PIL import Image, ImageDraw + PIL_AVAILABLE = True + except Exception: + PIL_AVAILABLE = False from yaspin import yaspin except ImportError: if __name__ == "__main__": @@ -248,6 +257,13 @@ Examples: help="Initial prompt to send to the agent. Leave blank for interactive mode." ) + parser.add_argument( + "--predict-click", + dest="predict_click", + type=str, + help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it." + ) + parser.add_argument( "-c", "--cache", action="store_true", @@ -354,7 +370,79 @@ Examples: agent = ComputerAgent(**agent_kwargs) - # Start chat loop + # If predict-click mode is requested, run once and exit + if args.predict_click: + if not PIL_AVAILABLE: + print_colored("❌ Pillow (PIL) is required for --predict-click visualization. 
Install with: pip install pillow", Colors.RED, bold=True) + sys.exit(1) + + instruction = args.predict_click + print_colored(f"Predicting click for: '{instruction}'", Colors.CYAN) + + # Take a fresh screenshot FIRST + try: + img_bytes = await computer.interface.screenshot() + except Exception as e: + print_colored(f"❌ Failed to take screenshot: {e}", Colors.RED, bold=True) + sys.exit(1) + + # Encode screenshot to base64 for predict_click + try: + image_b64 = base64.b64encode(img_bytes).decode("utf-8") + except Exception as e: + print_colored(f"❌ Failed to encode screenshot: {e}", Colors.RED, bold=True) + sys.exit(1) + + try: + coords = await agent.predict_click(instruction, image_b64=image_b64) + except Exception as e: + print_colored(f"❌ predict_click failed: {e}", Colors.RED, bold=True) + sys.exit(1) + + if not coords: + print_colored("⚠️ No coordinates returned.", Colors.YELLOW) + sys.exit(2) + + x, y = coords + print_colored(f"✅ Predicted coordinates: ({x}, {y})", Colors.GREEN) + + try: + from io import BytesIO + with Image.open(BytesIO(img_bytes)) as img: + img = img.convert("RGB") + draw = ImageDraw.Draw(img) + # Draw crosshair + size = 12 + color = (255, 0, 0) + draw.line([(x - size, y), (x + size, y)], fill=color, width=3) + draw.line([(x, y - size), (x, y + size)], fill=color, width=3) + # Optional small circle + r = 6 + draw.ellipse([(x - r, y - r), (x + r, y + r)], outline=color, width=2) + + out_path = Path.cwd() / f"predict_click_{int(time.time())}.png" + img.save(out_path) + print_colored(f"🖼️ Saved to {out_path}") + + # Open the image with default viewer + try: + system = platform.system().lower() + if system == "windows": + os.startfile(str(out_path)) # type: ignore[attr-defined] + elif system == "darwin": + os.system(f"open \"{out_path}\"") + else: + os.system(f"xdg-open \"{out_path}\"") + except Exception: + pass + except Exception as e: + print_colored(f"❌ Failed to render/save screenshot: {e}", Colors.RED, bold=True) + sys.exit(1) + + # Done + sys.exit(0) + + # Start chat loop (default interactive mode) await chat_loop(agent, args.model, container_name, args.prompt, args.usage) diff --git a/libs/python/agent/pyproject.toml b/libs/python/agent/pyproject.toml index b78931ea..6fea439c 100644 --- a/libs/python/agent/pyproject.toml +++ b/libs/python/agent/pyproject.toml @@ -53,6 +53,13 @@ opencua-hf = [ "tiktoken>=0.11.0", "blobfile>=3.0.0" ] +internvl-hf = [ + "accelerate", + "torch", + "transformers>=4.55.0", + "einops", + "timm" +] ui = [ "gradio>=5.23.3", "python-dotenv>=1.0.1", @@ -68,7 +75,10 @@ all = [ "mlx-vlm>=0.1.27; sys_platform == 'darwin'", "accelerate", "torch", - "transformers>=4.54.0", + "transformers>=4.55.0", + # internvl requirements, + "einops", + "timm", # opencua requirements "tiktoken>=0.11.0", "blobfile>=3.0.0", From 6ddddf8f880dd1eb86a724b395a0a57ba0bba7e6 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 16 Sep 2025 12:56:07 -0400 Subject: [PATCH 17/17] fix internVL inference --- .../agent/agent/adapters/models/__init__.py | 4 +- .../agent/agent/adapters/models/internvl.py | 247 +++++++++++++++--- libs/python/agent/agent/loops/internvl.py | 12 +- 3 files changed, 222 insertions(+), 41 deletions(-) diff --git a/libs/python/agent/agent/adapters/models/__init__.py b/libs/python/agent/agent/adapters/models/__init__.py index b36fda1b..3ed48404 100644 --- a/libs/python/agent/agent/adapters/models/__init__.py +++ b/libs/python/agent/agent/adapters/models/__init__.py @@ -26,8 +26,8 @@ def load_model(model_name: str, device: str = "auto", 
trust_remote_code: bool = print(f"cls: {cls}") if "OpenCUA" in cls: return OpenCUAModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) - elif "Qwen2_5_VLConfig" in cls: + elif "Qwen2_5_VL" in cls: return Qwen2_5_VLModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) - elif "InternVLChatConfig" in cls: + elif "InternVL" in cls: return InternVLModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) return GenericHFModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) diff --git a/libs/python/agent/agent/adapters/models/internvl.py b/libs/python/agent/agent/adapters/models/internvl.py index 0ed32e6b..bb2de42e 100644 --- a/libs/python/agent/agent/adapters/models/internvl.py +++ b/libs/python/agent/agent/adapters/models/internvl.py @@ -3,10 +3,16 @@ from typing import List, Dict, Any, Optional # Hugging Face imports are local to avoid hard dependency at module import try: import torch # type: ignore - from transformers import AutoModel, AutoProcessor # type: ignore + from transformers import AutoModel, AutoTokenizer # type: ignore # Attempt to import InternVL's model dependencies import einops as _ # type: ignore import timm as _ # type: ignore + from PIL import Image # type: ignore + import torchvision.transforms as T # type: ignore + from torchvision.transforms.functional import InterpolationMode # type: ignore + import base64 # type: ignore + from io import BytesIO # type: ignore + import requests # type: ignore HF_AVAILABLE = True except Exception: HF_AVAILABLE = False @@ -14,7 +20,8 @@ except Exception: class InternVLModel: """Generic Hugging Face vision-language model handler. - Loads an AutoModelForImageTextToText and AutoProcessor and generates text. + Uses InternVL's native `model.chat()` interface with `AutoTokenizer`. + Provides preprocessing to support multi-turn conversations with multiple images. 
""" def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None: @@ -25,7 +32,7 @@ class InternVLModel: self.model_name = model_name self.device = device self.model = None - self.processor = None + self.tokenizer = None self.trust_remote_code = trust_remote_code self._load() @@ -33,46 +40,214 @@ class InternVLModel: # Load model self.model = AutoModel.from_pretrained( self.model_name, - torch_dtype=torch.float16, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + use_flash_attn=True, device_map=self.device, - attn_implementation="sdpa", trust_remote_code=self.trust_remote_code, - ) - # Load processor - self.processor = AutoProcessor.from_pretrained( + ).eval() + # Load tokenizer (InternVL requires trust_remote_code=True and often use_fast=False) + self.tokenizer = AutoTokenizer.from_pretrained( self.model_name, - min_pixels=3136, - max_pixels=4096 * 2160, - device_map=self.device, trust_remote_code=self.trust_remote_code, + use_fast=False, ) + # ---- Image preprocessing utilities adapted from InternVL docs ---- + IMAGENET_MEAN = (0.485, 0.456, 0.406) + IMAGENET_STD = (0.229, 0.224, 0.225) + + def _build_transform(self, input_size: int) -> T.Compose: + MEAN, STD = self.IMAGENET_MEAN, self.IMAGENET_STD + transform = T.Compose([ + T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), + T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=MEAN, std=STD) + ]) + return transform + + def _find_closest_aspect_ratio(self, aspect_ratio: float, target_ratios: List[tuple], width: int, height: int, image_size: int): + best_ratio_diff = float('inf') + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + def _dynamic_preprocess(self, image: Image.Image, min_num: int = 1, max_num: int = 12, image_size: int = 448, use_thumbnail: bool = True) -> List[Image.Image]: + orig_width, orig_height = image.size + aspect_ratio = orig_width / orig_height + + target_ratios = set( + (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if + i * j <= max_num and i * j >= min_num) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + target_aspect_ratio = self._find_closest_aspect_ratio( + aspect_ratio, target_ratios, orig_width, orig_height, image_size) + + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + + resized_img = image.resize((target_width, target_height)) + processed_images: List[Image.Image] = [] + for i in range(blocks): + box = ( + (i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size + ) + split_img = resized_img.crop(box) + processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images + + def _load_image_from_source(self, 
src: str) -> Image.Image: + """Load PIL image from various sources: data URL, http(s), or local path.""" + if src.startswith("data:image/"): + # data URL base64 + header, b64data = src.split(",", 1) + img_bytes = base64.b64decode(b64data) + return Image.open(BytesIO(img_bytes)).convert('RGB') + if src.startswith("http://") or src.startswith("https://"): + resp = requests.get(src, timeout=10) + resp.raise_for_status() + return Image.open(BytesIO(resp.content)).convert('RGB') + # Assume local file path + return Image.open(src).convert('RGB') + + def _images_to_pixel_values(self, images: List[Image.Image], input_size: int = 448, max_num: int = 12): + transform = self._build_transform(input_size=input_size) + pixel_values_list = [] + num_patches_list: List[int] = [] + for img in images: + tiles = self._dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num) + pv = [transform(tile) for tile in tiles] + pv = torch.stack(pv) + num_patches_list.append(pv.shape[0]) + pixel_values_list.append(pv) + if not pixel_values_list: + return None, [] + pixel_values = torch.cat(pixel_values_list) + return pixel_values, num_patches_list + def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str: """Generate text for the given HF-format messages. messages: [{ role, content: [{type:'text'|'image', text|image}] }] + + This implementation constructs InternVL-compatible inputs and uses + `model.chat(tokenizer, pixel_values, question, history=...)` to avoid + relying on AutoProcessor (which fails for some tokenizers). """ - assert self.model is not None and self.processor is not None - # Apply chat template and tokenize - inputs = self.processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - return_tensors="pt", - ) - # Move inputs to the same device as model - inputs = inputs.to(self.model.device) - # Generate - with torch.no_grad(): - generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens) - # Trim prompt tokens from output - generated_ids_trimmed = [ - out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) - ] - # Decode - output_text = self.processor.batch_decode( - generated_ids_trimmed, - skip_special_tokens=True, - clean_up_tokenization_spaces=False, - ) - return output_text[0] if output_text else "" + assert self.model is not None and self.tokenizer is not None + + # Build textual context and collect images and the final question + context_lines: List[str] = [] + all_images: List[Image.Image] = [] + last_user_text_parts: List[str] = [] + + for msg in messages: + role = msg.get("role", "user") + content = msg.get("content", []) + if isinstance(content, str): + content_items = [{"type": "text", "text": content}] + else: + content_items = content + + if role == "user": + # Collect text and images + parts_text: List[str] = [] + for item in content_items: + if item.get("type") == "text": + t = item.get("text", "") + if t: + parts_text.append(t) + elif item.get("type") == "image": + url = item.get("image", "") + if url: + try: + all_images.append(self._load_image_from_source(url)) + except Exception: + # Ignore failed image loads but keep going + pass + text = "\n".join(parts_text).strip() + if text: + context_lines.append(f"User: {text}") + # Track last user text separately for question + last_user_text_parts = parts_text or last_user_text_parts + elif role == "assistant": + # Only keep text content for history + parts_text = [item.get("text", "") for item 
in content_items if item.get("type") == "text"]
+                text = "\n".join(parts_text).strip()
+                if text:
+                    context_lines.append(f"Assistant: {text}")
+
+        # Prepare pixel values for all collected images (across turns)
+        pixel_values = None
+        num_patches_list: List[int] = []
+        if all_images:
+            pixel_values, num_patches_list = self._images_to_pixel_values(all_images, input_size=448, max_num=12)
+            if pixel_values is not None:
+                # Convert dtype/device as in docs
+                pixel_values = pixel_values.to(torch.bfloat16)
+                # Chat API expects tensors on CUDA when model is on CUDA
+                try:
+                    pixel_values = pixel_values.to(self.model.device)
+                except Exception:
+                    pass
+
+        # Build question with any prior context and numbered image placeholders
+        if all_images:
+            # Separate images layout: Image-1: ... then question text
+            prefix_lines = [f"Image-{i+1}: <image>" for i in range(len(all_images))]
+            prefix = "\n".join(prefix_lines) + "\n"
+        else:
+            prefix = ""
+
+        last_user_text = "\n".join(last_user_text_parts).strip()
+        # Combine prior text-only turns as context to emulate multi-turn
+        context_text = "\n".join(context_lines[:-1]) if len(context_lines) > 1 else ""
+        base_question = last_user_text if last_user_text else "Describe the image(s) in detail."
+        if context_text:
+            question = (context_text + "\n" + prefix + base_question).strip()
+        else:
+            question = (prefix + base_question).strip()
+
+        # Generation config
+        generation_config = dict(max_new_tokens=max_new_tokens, do_sample=False)
+
+        # Call InternVL chat
+        try:
+            if pixel_values is None:
+                # Pure-text conversation (embed prior turns in question)
+                response = self.model.chat(self.tokenizer, None, question, generation_config)
+            else:
+                # Multi-image: pass num_patches_list if >1 image
+                if len(num_patches_list) > 1:
+                    response = self.model.chat(
+                        self.tokenizer,
+                        pixel_values,
+                        question,
+                        generation_config,
+                        num_patches_list=num_patches_list,
+                    )
+                else:
+                    response = self.model.chat(self.tokenizer, pixel_values, question, generation_config)
+        except Exception as e:
+            # Fallback: return empty string to avoid crashing the adapter
+            return ""
+
+        return response or ""
diff --git a/libs/python/agent/agent/loops/internvl.py b/libs/python/agent/agent/loops/internvl.py
index d1b8c3fe..a857ffe3 100644
--- a/libs/python/agent/agent/loops/internvl.py
+++ b/libs/python/agent/agent/loops/internvl.py
@@ -26,9 +26,13 @@ from .composed_grounded import ComposedGroundedConfig
 from ..types import AgentCapability
 
 
-# Regex patterns matching ScreenSpot baseline extractors
-_POINT_PATTERN = re.compile(r"\[\[(\d+),(\d+)\]\]")
-_BBOX_PATTERN = re.compile(r"\[\[(\d+),(\d+),(\d+),(\d+)\]\]")
+# Regex patterns for extracting coordinates
+# Accept optional whitespace and optional decimal fractions
+_NUM = r"(\d+(?:\.\d+)?)"
+_POINT_PATTERN = re.compile(r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]")
+_BBOX_PATTERN = re.compile(
+    r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]"
+)
 
 
 def _extract_first_point(text: str) -> Optional[Tuple[float, float]]:
@@ -160,6 +164,8 @@ class InternVLConfig(ComposedGroundedConfig):
         response = await litellm.acompletion(**api_kwargs)
         output_text = (response.choices[0].message.content or "").strip()  # type: ignore
 
+        print(f"InternVL output: {output_text}")
+
         # Try to parse a point first; if absent, parse bbox and take center
         point = _extract_first_point(output_text)
         if point is None:
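Editor's note: as a quick sanity check on the relaxed coordinate extractors in the final patch, the standalone sketch below re-declares the same `_NUM`, `_POINT_PATTERN`, and `_BBOX_PATTERN` definitions from `loops/internvl.py` and runs them over a few made-up model outputs; it is illustrative only and not part of the change.

```python
import re

# Same pattern definitions as the patched loops/internvl.py:
# accept optional whitespace and optional decimal fractions inside [[...]].
_NUM = r"(\d+(?:\.\d+)?)"
_POINT_PATTERN = re.compile(r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]")
_BBOX_PATTERN = re.compile(
    r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]"
)

samples = [
    "[[512,384]]",           # old integer form still matches
    "[[ 512.5 , 384.0 ]]",   # new: whitespace and decimals
    "[[10, 20, 110, 220]]",  # bbox form; callers take the center point
]

for text in samples:
    bbox = _BBOX_PATTERN.search(text)
    point = _POINT_PATTERN.search(text)
    if bbox:
        x1, y1, x2, y2 = map(float, bbox.groups())
        print(text, "-> bbox center", ((x1 + x2) / 2, (y1 + y2) / 2))
    elif point:
        print(text, "-> point", tuple(map(float, point.groups())))
```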