From e065ae59d28ee6496cee7f75d877a093d2af4815 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 20 Aug 2025 10:03:41 -0400 Subject: [PATCH 01/17] Add OpenCUA Grounding mode --- libs/python/agent/agent/loops/opencua.py | 133 +++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 libs/python/agent/agent/loops/opencua.py diff --git a/libs/python/agent/agent/loops/opencua.py b/libs/python/agent/agent/loops/opencua.py new file mode 100644 index 00000000..d1c6c5fb --- /dev/null +++ b/libs/python/agent/agent/loops/opencua.py @@ -0,0 +1,133 @@ +""" +OpenCUA agent loop implementation for click prediction using litellm.acompletion +Based on OpenCUA model for GUI grounding tasks. +""" + +import asyncio +import json +import re +import base64 +from typing import Dict, List, Any, AsyncGenerator, Union, Optional, Tuple +from io import BytesIO +import uuid +from PIL import Image +import litellm +import math + +from ..decorators import register_agent +from ..types import Messages, AgentResponse, Tools, AgentCapability +from ..loops.base import AsyncAgentConfig + +def extract_coordinates_from_pyautogui(text: str) -> Optional[Tuple[int, int]]: + """Extract coordinates from pyautogui.click(x=..., y=...) format.""" + try: + # Look for pyautogui.click(x=1443, y=343) pattern + pattern = r"pyautogui\.click\(x=(\d+),\s*y=(\d+)\)" + match = re.search(pattern, text) + if match: + x, y = int(match.group(1)), int(match.group(2)) + return (x, y) + return None + except Exception: + return None + +@register_agent(models=r"(?i).*OpenCUA.*") +class OpenCUAConfig(AsyncAgentConfig): + """OpenCUA agent configuration implementing AsyncAgentConfig protocol for click prediction.""" + + def __init__(self): + self.current_model = None + self.last_screenshot_b64 = None + + async def predict_step( + self, + messages: List[Dict[str, Any]], + model: str, + tools: Optional[List[Dict[str, Any]]] = None, + max_retries: Optional[int] = None, + stream: bool = False, + computer_handler=None, + _on_api_start=None, + _on_api_end=None, + _on_usage=None, + _on_screenshot=None, + **kwargs + ) -> Dict[str, Any]: + """Predict step is not implemented for OpenCUA model.""" + raise NotImplementedError("predict_step is not implemented for OpenCUA model") + + async def predict_click( + self, + model: str, + image_b64: str, + instruction: str, + **kwargs + ) -> Optional[Tuple[int, int]]: + """ + Predict click coordinates using OpenCUA model via litellm.acompletion. + + Args: + model: The OpenCUA model name + image_b64: Base64 encoded image + instruction: Instruction for where to click + + Returns: + Tuple of (x, y) coordinates or None if prediction fails + """ + # Prepare system message + system_prompt = ( + "You are a GUI agent. You are given a task and a screenshot of the screen. " + "You need to perform a series of pyautogui actions to complete the task." 
+ ) + + system_message = { + "role": "system", + "content": system_prompt + } + + # Prepare user message with image and instruction + user_message = { + "role": "user", + "content": [ + { + "type": "image", + "image": f"data:image/png;base64,{image_b64}" + }, + { + "type": "text", + "text": instruction + } + ] + } + + # Prepare API call kwargs + api_kwargs = { + "model": model, + "messages": [system_message, user_message], + "max_new_tokens": 512, + "temperature": 0, + **kwargs + } + + try: + # Use liteLLM acompletion + response = await litellm.acompletion(**api_kwargs) + + # Extract response text + output_text = response.choices[0].message.content + + if not output_text: + return None + + # Extract coordinates from pyautogui format + coordinates = extract_coordinates_from_pyautogui(output_text) + + return coordinates + + except Exception as e: + print(f"Error in OpenCUA predict_click: {e}") + return None + + def get_capabilities(self) -> List[AgentCapability]: + """Return the capabilities supported by this agent.""" + return ["click"] From d7e25048be12769020aa46a250dcbdf7e6eda15a Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 20 Aug 2025 10:21:53 -0400 Subject: [PATCH 02/17] Register OpenCUA loop --- libs/python/agent/agent/loops/__init__.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/libs/python/agent/agent/loops/__init__.py b/libs/python/agent/agent/loops/__init__.py index 45f70e20..25227e64 100644 --- a/libs/python/agent/agent/loops/__init__.py +++ b/libs/python/agent/agent/loops/__init__.py @@ -10,5 +10,15 @@ from . import omniparser from . import gta1 from . import composed_grounded from . import glm45v +from . import opencua -__all__ = ["anthropic", "openai", "uitars", "omniparser", "gta1", "composed_grounded", "glm45v"] +__all__ = [ + "anthropic", + "openai", + "uitars", + "omniparser", + "gta1", + "composed_grounded", + "glm45v", + "opencua" +] \ No newline at end of file From dad6634ffd9900750c7374c6c9db0f0da0d5bf75 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 21 Aug 2025 10:51:39 -0400 Subject: [PATCH 03/17] Added local inference routing for different models --- .../adapters/huggingfacelocal_adapter.py | 77 +++------------ .../agent/agent/adapters/models/__init__.py | 28 ++++++ .../agent/agent/adapters/models/generic.py | 72 ++++++++++++++ .../agent/agent/adapters/models/opencua.py | 98 +++++++++++++++++++ libs/python/agent/agent/loops/opencua.py | 36 +++---- libs/python/agent/pyproject.toml | 10 ++ 6 files changed, 236 insertions(+), 85 deletions(-) create mode 100644 libs/python/agent/agent/adapters/models/__init__.py create mode 100644 libs/python/agent/agent/adapters/models/generic.py create mode 100644 libs/python/agent/agent/adapters/models/opencua.py diff --git a/libs/python/agent/agent/adapters/huggingfacelocal_adapter.py b/libs/python/agent/agent/adapters/huggingfacelocal_adapter.py index 46d72db3..6f06734c 100644 --- a/libs/python/agent/agent/adapters/huggingfacelocal_adapter.py +++ b/libs/python/agent/agent/adapters/huggingfacelocal_adapter.py @@ -15,6 +15,7 @@ try: except ImportError: HF_AVAILABLE = False +from .models import load_model as load_model_handler class HuggingFaceLocalAdapter(CustomLLM): """HuggingFace Local Adapter for running vision-language models locally.""" @@ -28,41 +29,15 @@ class HuggingFaceLocalAdapter(CustomLLM): """ super().__init__() self.device = device - self.models = {} # Cache for loaded models - self.processors = {} # Cache for loaded processors + # Cache for model handlers keyed by 
model_name + self._handlers: Dict[str, Any] = {} self._executor = ThreadPoolExecutor(max_workers=1) # Single thread pool - def _load_model_and_processor(self, model_name: str): - """Load model and processor if not already cached. - - Args: - model_name: Name of the model to load - - Returns: - Tuple of (model, processor) - """ - if model_name not in self.models: - # Load model - model = AutoModelForImageTextToText.from_pretrained( - model_name, - torch_dtype=torch.float16, - device_map=self.device, - attn_implementation="sdpa" - ) - - # Load processor - processor = AutoProcessor.from_pretrained( - model_name, - min_pixels=3136, - max_pixels=4096 * 2160, - device_map=self.device - ) - - # Cache them - self.models[model_name] = model - self.processors[model_name] = processor - - return self.models[model_name], self.processors[model_name] + def _get_handler(self, model_name: str): + """Get or create a model handler for the given model name.""" + if model_name not in self._handlers: + self._handlers[model_name] = load_model_handler(model_name=model_name, device=self.device) + return self._handlers[model_name] def _convert_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Convert OpenAI format messages to HuggingFace format. @@ -133,41 +108,13 @@ class HuggingFaceLocalAdapter(CustomLLM): if ignored_kwargs: warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}") - # Load model and processor - model, processor = self._load_model_and_processor(model_name) - # Convert messages to HuggingFace format hf_messages = self._convert_messages(messages) - # Apply chat template and tokenize - inputs = processor.apply_chat_template( - hf_messages, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - return_tensors="pt" - ) - - # Move inputs to the same device as model - inputs = inputs.to(model.device) - - # Generate response - with torch.no_grad(): - generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens) - - # Trim input tokens from output - generated_ids_trimmed = [ - out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) - ] - - # Decode output - output_text = processor.batch_decode( - generated_ids_trimmed, - skip_special_tokens=True, - clean_up_tokenization_spaces=False - ) - - return output_text[0] if output_text else "" + # Delegate to model handler + handler = self._get_handler(model_name) + generated_text = handler.generate(hf_messages, max_new_tokens=max_new_tokens) + return generated_text def completion(self, *args, **kwargs) -> ModelResponse: """Synchronous completion method. diff --git a/libs/python/agent/agent/adapters/models/__init__.py b/libs/python/agent/agent/adapters/models/__init__.py new file mode 100644 index 00000000..6811c142 --- /dev/null +++ b/libs/python/agent/agent/adapters/models/__init__.py @@ -0,0 +1,28 @@ +from typing import Optional + +try: + from transformers import AutoConfig + HF_AVAILABLE = True +except ImportError: + HF_AVAILABLE = False + +from .generic import GenericHFModel +from .opencua import OpenCUAModel + + +def load_model(model_name: str, device: str = "auto"): + """Factory function to load and return the right model handler instance. + + - If the underlying transformers config class matches OpenCUA, return OpenCUAModel + - Otherwise, return GenericHFModel + """ + if not HF_AVAILABLE: + raise ImportError( + "HuggingFace transformers dependencies not found. 
Install with: pip install \"cua-agent[uitars-hf]\"" + ) + cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + cls = cfg.__class__.__name__ + print(f"cls: {cls}") + if "OpenCUA" in cls: + return OpenCUAModel(model_name=model_name, device=device) + return GenericHFModel(model_name=model_name, device=device) diff --git a/libs/python/agent/agent/adapters/models/generic.py b/libs/python/agent/agent/adapters/models/generic.py new file mode 100644 index 00000000..de267239 --- /dev/null +++ b/libs/python/agent/agent/adapters/models/generic.py @@ -0,0 +1,72 @@ +from typing import List, Dict, Any, Optional + +# Hugging Face imports are local to avoid hard dependency at module import +try: + import torch # type: ignore + from transformers import AutoModelForImageTextToText, AutoProcessor # type: ignore + HF_AVAILABLE = True +except Exception: + HF_AVAILABLE = False + + +class GenericHFModel: + """Generic Hugging Face vision-language model handler. + Loads an AutoModelForImageTextToText and AutoProcessor and generates text. + """ + + def __init__(self, model_name: str, device: str = "auto") -> None: + if not HF_AVAILABLE: + raise ImportError( + "HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\"" + ) + self.model_name = model_name + self.device = device + self.model = None + self.processor = None + self._load() + + def _load(self) -> None: + # Load model + self.model = AutoModelForImageTextToText.from_pretrained( + self.model_name, + torch_dtype=torch.float16, + device_map=self.device, + attn_implementation="sdpa", + ) + # Load processor + self.processor = AutoProcessor.from_pretrained( + self.model_name, + min_pixels=3136, + max_pixels=4096 * 2160, + device_map=self.device, + ) + + def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str: + """Generate text for the given HF-format messages. 
+ messages: [{ role, content: [{type:'text'|'image', text|image}] }] + """ + assert self.model is not None and self.processor is not None + # Apply chat template and tokenize + inputs = self.processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + ) + # Move inputs to the same device as model + inputs = inputs.to(self.model.device) + # Generate + with torch.no_grad(): + generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens) + # Trim prompt tokens from output + generated_ids_trimmed = [ + out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) + ] + # Decode + output_text = self.processor.batch_decode( + generated_ids_trimmed, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) + return output_text[0] if output_text else "" diff --git a/libs/python/agent/agent/adapters/models/opencua.py b/libs/python/agent/agent/adapters/models/opencua.py new file mode 100644 index 00000000..f24dfa6b --- /dev/null +++ b/libs/python/agent/agent/adapters/models/opencua.py @@ -0,0 +1,98 @@ +from typing import List, Dict, Any +import re +import base64 +from io import BytesIO + +try: + import torch # type: ignore + from transformers import AutoTokenizer, AutoModel, AutoImageProcessor # type: ignore + from PIL import Image # type: ignore + import blobfile as _ # assert blobfile is installed + OPENCUA_AVAILABLE = True +except Exception: + OPENCUA_AVAILABLE = False + + +class OpenCUAModel: + """OpenCUA model handler using AutoTokenizer, AutoModel and AutoImageProcessor.""" + + def __init__(self, model_name: str, device: str = "auto") -> None: + if not OPENCUA_AVAILABLE: + raise ImportError( + "OpenCUA requirements not found. Install with: pip install \"cua-agent[opencua-hf]\"" + ) + self.model_name = model_name + self.device = device + self.model = None + self.tokenizer = None + self.image_processor = None + self._load() + + def _load(self) -> None: + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_name, trust_remote_code=True + ) + self.model = AutoModel.from_pretrained( + self.model_name, + torch_dtype="auto", + device_map=self.device, + trust_remote_code=True, + ) + self.image_processor = AutoImageProcessor.from_pretrained( + self.model_name, trust_remote_code=True + ) + + @staticmethod + def _extract_last_image_b64(messages: List[Dict[str, Any]]) -> str: + # Expect HF-format messages with content items type: "image" with data URL + for msg in reversed(messages): + for item in reversed(msg.get("content", [])): + if isinstance(item, dict) and item.get("type") == "image": + url = item.get("image", "") + if isinstance(url, str) and url.startswith("data:image/"): + return url.split(",", 1)[1] + return "" + + def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 512) -> str: + assert self.model is not None and self.tokenizer is not None and self.image_processor is not None + + # Tokenize text side using chat template + input_ids = self.tokenizer.apply_chat_template( + messages, tokenize=True, add_generation_prompt=True + ) + input_ids = torch.tensor([input_ids]).to(self.model.device) + + # Prepare image inputs from last data URL image + image_b64 = self._extract_last_image_b64(messages) + pixel_values = None + grid_thws = None + if image_b64: + image = Image.open(BytesIO(base64.b64decode(image_b64))).convert("RGB") + image_info = self.image_processor.preprocess(images=[image]) + pixel_values = torch.tensor(image_info["pixel_values"]).to( + 
dtype=torch.bfloat16, device=self.model.device + ) + grid_thws = torch.tensor(image_info["image_grid_thw"]) if "image_grid_thw" in image_info else None + + gen_kwargs: Dict[str, Any] = { + "max_new_tokens": max_new_tokens, + "temperature": 0, + } + if pixel_values is not None: + gen_kwargs["pixel_values"] = pixel_values + if grid_thws is not None: + gen_kwargs["grid_thws"] = grid_thws + + with torch.no_grad(): + generated_ids = self.model.generate( + input_ids, + **gen_kwargs, + ) + + # Remove prompt tokens + prompt_len = input_ids.shape[1] + generated_ids = generated_ids[:, prompt_len:] + output_text = self.tokenizer.batch_decode( + generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False + )[0] + return output_text diff --git a/libs/python/agent/agent/loops/opencua.py b/libs/python/agent/agent/loops/opencua.py index d1c6c5fb..c13875b2 100644 --- a/libs/python/agent/agent/loops/opencua.py +++ b/libs/python/agent/agent/loops/opencua.py @@ -90,8 +90,10 @@ class OpenCUAConfig(AsyncAgentConfig): "role": "user", "content": [ { - "type": "image", - "image": f"data:image/png;base64,{image_b64}" + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{image_b64}" + } }, { "type": "text", @@ -109,24 +111,18 @@ class OpenCUAConfig(AsyncAgentConfig): **kwargs } - try: - # Use liteLLM acompletion - response = await litellm.acompletion(**api_kwargs) - - # Extract response text - output_text = response.choices[0].message.content - - if not output_text: - return None - - # Extract coordinates from pyautogui format - coordinates = extract_coordinates_from_pyautogui(output_text) - - return coordinates - - except Exception as e: - print(f"Error in OpenCUA predict_click: {e}") - return None + # Use liteLLM acompletion + response = await litellm.acompletion(**api_kwargs) + + # Extract response text + output_text = response.choices[0].message.content + + print(output_text) + + # Extract coordinates from pyautogui format + coordinates = extract_coordinates_from_pyautogui(output_text) + + return coordinates def get_capabilities(self) -> List[AgentCapability]: """Return the capabilities supported by this agent.""" diff --git a/libs/python/agent/pyproject.toml b/libs/python/agent/pyproject.toml index 4dd27062..811c3a9c 100644 --- a/libs/python/agent/pyproject.toml +++ b/libs/python/agent/pyproject.toml @@ -47,6 +47,13 @@ glm45v-hf = [ "torch", "transformers-v4.55.0-GLM-4.5V-preview" ] +opencua-hf = [ + "accelerate", + "torch", + "transformers>=4.54.0", + "tiktoken>=0.11.0", + "blobfile>=3.0.0" +] ui = [ "gradio>=5.23.3", "python-dotenv>=1.0.1", @@ -66,6 +73,9 @@ all = [ "accelerate", "torch", "transformers>=4.54.0", + # opencua requirements + "tiktoken>=0.11.0", + "blobfile>=3.0.0" # ui requirements "gradio>=5.23.3", "python-dotenv>=1.0.1", From b20d2a0a9384674bc255c7876d43dcf1f90fdcbf Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 21 Aug 2025 11:54:13 -0400 Subject: [PATCH 04/17] Pinned transformers version, first working grounding version --- libs/python/agent/agent/adapters/models/__init__.py | 2 +- libs/python/agent/agent/adapters/models/opencua.py | 1 + libs/python/agent/agent/loops/opencua.py | 5 ++--- libs/python/agent/pyproject.toml | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/libs/python/agent/agent/adapters/models/__init__.py b/libs/python/agent/agent/adapters/models/__init__.py index 6811c142..8a5fb00b 100644 --- a/libs/python/agent/agent/adapters/models/__init__.py +++ b/libs/python/agent/agent/adapters/models/__init__.py @@ -22,7 
+22,7 @@ def load_model(model_name: str, device: str = "auto"): ) cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True) cls = cfg.__class__.__name__ - print(f"cls: {cls}") + # print(f"cls: {cls}") if "OpenCUA" in cls: return OpenCUAModel(model_name=model_name, device=device) return GenericHFModel(model_name=model_name, device=device) diff --git a/libs/python/agent/agent/adapters/models/opencua.py b/libs/python/agent/agent/adapters/models/opencua.py index f24dfa6b..f8abf4a6 100644 --- a/libs/python/agent/agent/adapters/models/opencua.py +++ b/libs/python/agent/agent/adapters/models/opencua.py @@ -37,6 +37,7 @@ class OpenCUAModel: torch_dtype="auto", device_map=self.device, trust_remote_code=True, + attn_implementation="sdpa", ) self.image_processor = AutoImageProcessor.from_pretrained( self.model_name, trust_remote_code=True diff --git a/libs/python/agent/agent/loops/opencua.py b/libs/python/agent/agent/loops/opencua.py index c13875b2..a494377b 100644 --- a/libs/python/agent/agent/loops/opencua.py +++ b/libs/python/agent/agent/loops/opencua.py @@ -97,7 +97,7 @@ class OpenCUAConfig(AsyncAgentConfig): }, { "type": "text", - "text": instruction + "text": f"Click on {instruction}" } ] } @@ -116,8 +116,7 @@ class OpenCUAConfig(AsyncAgentConfig): # Extract response text output_text = response.choices[0].message.content - - print(output_text) + # print(output_text) # Extract coordinates from pyautogui format coordinates = extract_coordinates_from_pyautogui(output_text) diff --git a/libs/python/agent/pyproject.toml b/libs/python/agent/pyproject.toml index 811c3a9c..0d382fdf 100644 --- a/libs/python/agent/pyproject.toml +++ b/libs/python/agent/pyproject.toml @@ -50,7 +50,7 @@ glm45v-hf = [ opencua-hf = [ "accelerate", "torch", - "transformers>=4.54.0", + "transformers==4.53.0", "tiktoken>=0.11.0", "blobfile>=3.0.0" ] @@ -75,7 +75,7 @@ all = [ "transformers>=4.54.0", # opencua requirements "tiktoken>=0.11.0", - "blobfile>=3.0.0" + "blobfile>=3.0.0", # ui requirements "gradio>=5.23.3", "python-dotenv>=1.0.1", From 52afcd4c6fcdf74948c42a82eeea35cdfd60a536 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 26 Aug 2025 18:14:22 -0400 Subject: [PATCH 05/17] Increased max tokens for glm 4.5v grounding calls --- libs/python/agent/agent/loops/glm45v.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libs/python/agent/agent/loops/glm45v.py b/libs/python/agent/agent/loops/glm45v.py index adc87026..516a9cb1 100644 --- a/libs/python/agent/agent/loops/glm45v.py +++ b/libs/python/agent/agent/loops/glm45v.py @@ -844,7 +844,7 @@ Where x,y are coordinates normalized to 0-999 range.""" api_kwargs = { "model": model, "messages": litellm_messages, - "max_tokens": 100, + "max_tokens": 2056, "temperature": 0.001, "extra_body": { "skip_special_tokens": False, @@ -856,6 +856,7 @@ Where x,y are coordinates normalized to 0-999 range.""" # Extract response content response_content = response.choices[0].message.content.strip() + print(response) # Parse response for click coordinates # Look for coordinates in the response, handling special tokens @@ -866,7 +867,7 @@ Where x,y are coordinates normalized to 0-999 range.""" # Fallback: look for coordinates without special tokens coord_pattern = r"left_click\(start_box='?\[(\d+),(\d+)\]'?\)" match = re.search(coord_pattern, response_content) - + if match: x, y = int(match.group(1)), int(match.group(2)) From bf3c3256dfab93cd7c9e2c7f58a6719c9b5586fe Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 26 Aug 2025 18:15:24 -0400 Subject: 
[PATCH 06/17] increased max tokens, added trust_remote_code kwarg --- .../agent/agent/adapters/huggingfacelocal_adapter.py | 6 ++++-- libs/python/agent/agent/adapters/models/__init__.py | 8 ++++---- libs/python/agent/agent/adapters/models/generic.py | 9 ++++++--- libs/python/agent/agent/adapters/models/opencua.py | 9 +++++---- libs/python/agent/agent/agent.py | 6 +++++- libs/python/agent/agent/cli.py | 1 + libs/python/agent/agent/loops/gta1.py | 2 +- libs/python/agent/agent/loops/opencua.py | 2 +- libs/python/agent/agent/loops/uitars.py | 2 +- 9 files changed, 28 insertions(+), 17 deletions(-) diff --git a/libs/python/agent/agent/adapters/huggingfacelocal_adapter.py b/libs/python/agent/agent/adapters/huggingfacelocal_adapter.py index 6f06734c..3ecba641 100644 --- a/libs/python/agent/agent/adapters/huggingfacelocal_adapter.py +++ b/libs/python/agent/agent/adapters/huggingfacelocal_adapter.py @@ -20,15 +20,17 @@ from .models import load_model as load_model_handler class HuggingFaceLocalAdapter(CustomLLM): """HuggingFace Local Adapter for running vision-language models locally.""" - def __init__(self, device: str = "auto", **kwargs): + def __init__(self, device: str = "auto", trust_remote_code: bool = False, **kwargs): """Initialize the adapter. Args: device: Device to load model on ("auto", "cuda", "cpu", etc.) + trust_remote_code: Whether to trust remote code **kwargs: Additional arguments """ super().__init__() self.device = device + self.trust_remote_code = trust_remote_code # Cache for model handlers keyed by model_name self._handlers: Dict[str, Any] = {} self._executor = ThreadPoolExecutor(max_workers=1) # Single thread pool @@ -36,7 +38,7 @@ class HuggingFaceLocalAdapter(CustomLLM): def _get_handler(self, model_name: str): """Get or create a model handler for the given model name.""" if model_name not in self._handlers: - self._handlers[model_name] = load_model_handler(model_name=model_name, device=self.device) + self._handlers[model_name] = load_model_handler(model_name=model_name, device=self.device, trust_remote_code=self.trust_remote_code) return self._handlers[model_name] def _convert_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: diff --git a/libs/python/agent/agent/adapters/models/__init__.py b/libs/python/agent/agent/adapters/models/__init__.py index 8a5fb00b..99696a1a 100644 --- a/libs/python/agent/agent/adapters/models/__init__.py +++ b/libs/python/agent/agent/adapters/models/__init__.py @@ -10,7 +10,7 @@ from .generic import GenericHFModel from .opencua import OpenCUAModel -def load_model(model_name: str, device: str = "auto"): +def load_model(model_name: str, device: str = "auto", trust_remote_code: bool = False): """Factory function to load and return the right model handler instance. - If the underlying transformers config class matches OpenCUA, return OpenCUAModel @@ -20,9 +20,9 @@ def load_model(model_name: str, device: str = "auto"): raise ImportError( "HuggingFace transformers dependencies not found. 
Install with: pip install \"cua-agent[uitars-hf]\"" ) - cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code) cls = cfg.__class__.__name__ # print(f"cls: {cls}") if "OpenCUA" in cls: - return OpenCUAModel(model_name=model_name, device=device) - return GenericHFModel(model_name=model_name, device=device) + return OpenCUAModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) + return GenericHFModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) diff --git a/libs/python/agent/agent/adapters/models/generic.py b/libs/python/agent/agent/adapters/models/generic.py index de267239..aefbaa7f 100644 --- a/libs/python/agent/agent/adapters/models/generic.py +++ b/libs/python/agent/agent/adapters/models/generic.py @@ -3,7 +3,7 @@ from typing import List, Dict, Any, Optional # Hugging Face imports are local to avoid hard dependency at module import try: import torch # type: ignore - from transformers import AutoModelForImageTextToText, AutoProcessor # type: ignore + from transformers import AutoModel, AutoProcessor # type: ignore HF_AVAILABLE = True except Exception: HF_AVAILABLE = False @@ -14,7 +14,7 @@ class GenericHFModel: Loads an AutoModelForImageTextToText and AutoProcessor and generates text. """ - def __init__(self, model_name: str, device: str = "auto") -> None: + def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None: if not HF_AVAILABLE: raise ImportError( "HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\"" @@ -23,15 +23,17 @@ class GenericHFModel: self.device = device self.model = None self.processor = None + self.trust_remote_code = trust_remote_code self._load() def _load(self) -> None: # Load model - self.model = AutoModelForImageTextToText.from_pretrained( + self.model = AutoModel.from_pretrained( self.model_name, torch_dtype=torch.float16, device_map=self.device, attn_implementation="sdpa", + trust_remote_code=self.trust_remote_code, ) # Load processor self.processor = AutoProcessor.from_pretrained( @@ -39,6 +41,7 @@ class GenericHFModel: min_pixels=3136, max_pixels=4096 * 2160, device_map=self.device, + trust_remote_code=self.trust_remote_code, ) def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str: diff --git a/libs/python/agent/agent/adapters/models/opencua.py b/libs/python/agent/agent/adapters/models/opencua.py index f8abf4a6..32c73134 100644 --- a/libs/python/agent/agent/adapters/models/opencua.py +++ b/libs/python/agent/agent/adapters/models/opencua.py @@ -16,7 +16,7 @@ except Exception: class OpenCUAModel: """OpenCUA model handler using AutoTokenizer, AutoModel and AutoImageProcessor.""" - def __init__(self, model_name: str, device: str = "auto") -> None: + def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None: if not OPENCUA_AVAILABLE: raise ImportError( "OpenCUA requirements not found. 
Install with: pip install \"cua-agent[opencua-hf]\"" @@ -26,21 +26,22 @@ class OpenCUAModel: self.model = None self.tokenizer = None self.image_processor = None + self.trust_remote_code = trust_remote_code self._load() def _load(self) -> None: self.tokenizer = AutoTokenizer.from_pretrained( - self.model_name, trust_remote_code=True + self.model_name, trust_remote_code=self.trust_remote_code ) self.model = AutoModel.from_pretrained( self.model_name, torch_dtype="auto", device_map=self.device, - trust_remote_code=True, + trust_remote_code=self.trust_remote_code, attn_implementation="sdpa", ) self.image_processor = AutoImageProcessor.from_pretrained( - self.model_name, trust_remote_code=True + self.model_name, trust_remote_code=self.trust_remote_code ) @staticmethod diff --git a/libs/python/agent/agent/agent.py b/libs/python/agent/agent/agent.py index 78fc3f45..bdcf5977 100644 --- a/libs/python/agent/agent/agent.py +++ b/libs/python/agent/agent/agent.py @@ -166,6 +166,7 @@ class ComputerAgent: use_prompt_caching: Optional[bool] = False, max_trajectory_budget: Optional[float | dict] = None, telemetry_enabled: Optional[bool] = True, + trust_remote_code: Optional[bool] = False, **kwargs ): """ @@ -184,6 +185,7 @@ class ComputerAgent: use_prompt_caching: If set, use prompt caching to avoid reprocessing the same prompt. Intended for use with anthropic providers. max_trajectory_budget: If set, adds BudgetManagerCallback to track usage costs and stop when budget is exceeded telemetry_enabled: If set, adds TelemetryCallback to track anonymized usage data. Enabled by default. + trust_remote_code: If set, trust remote code when loading local models. Disabled by default. **kwargs: Additional arguments passed to the agent loop """ self.model = model @@ -198,6 +200,7 @@ class ComputerAgent: self.use_prompt_caching = use_prompt_caching self.telemetry_enabled = telemetry_enabled self.kwargs = kwargs + self.trust_remote_code = trust_remote_code # == Add built-in callbacks == @@ -231,7 +234,8 @@ class ComputerAgent: # Register local model providers hf_adapter = HuggingFaceLocalAdapter( - device="auto" + device="auto", + trust_remote_code=self.trust_remote_code or False ) human_adapter = HumanAdapter() litellm.custom_provider_map = [ diff --git a/libs/python/agent/agent/cli.py b/libs/python/agent/agent/cli.py index de9e3450..b04f11db 100644 --- a/libs/python/agent/agent/cli.py +++ b/libs/python/agent/agent/cli.py @@ -331,6 +331,7 @@ Examples: agent_kwargs = { "model": args.model, "tools": [computer], + "trust_remote_code": True, # needed for some local models (e.g., InternVL, OpenCUA) "verbosity": 20 if args.verbose else 30, # DEBUG vs WARNING "max_retries": args.max_retries } diff --git a/libs/python/agent/agent/loops/gta1.py b/libs/python/agent/agent/loops/gta1.py index 13678b48..400daa29 100644 --- a/libs/python/agent/agent/loops/gta1.py +++ b/libs/python/agent/agent/loops/gta1.py @@ -155,7 +155,7 @@ class GTA1Config(AsyncAgentConfig): api_kwargs = { "model": model, "messages": [system_message, user_message], - "max_tokens": 32, + "max_tokens": 2056, "temperature": 0.0, **kwargs } diff --git a/libs/python/agent/agent/loops/opencua.py b/libs/python/agent/agent/loops/opencua.py index a494377b..1688b587 100644 --- a/libs/python/agent/agent/loops/opencua.py +++ b/libs/python/agent/agent/loops/opencua.py @@ -106,7 +106,7 @@ class OpenCUAConfig(AsyncAgentConfig): api_kwargs = { "model": model, "messages": [system_message, user_message], - "max_new_tokens": 512, + "max_new_tokens": 2056, "temperature": 0, **kwargs 
} diff --git a/libs/python/agent/agent/loops/uitars.py b/libs/python/agent/agent/loops/uitars.py index 10e0e45a..79c5d241 100644 --- a/libs/python/agent/agent/loops/uitars.py +++ b/libs/python/agent/agent/loops/uitars.py @@ -771,7 +771,7 @@ class UITARSConfig: api_kwargs = { "model": model, "messages": litellm_messages, - "max_tokens": 100, + "max_tokens": 2056, "temperature": 0.0, "do_sample": False } From 84e2a27aea3874fa0c3de263d0c194526b989103 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 26 Aug 2025 18:29:39 -0400 Subject: [PATCH 07/17] added notebook --- notebooks/composite_agents_docker_nb.ipynb | 162 +++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 notebooks/composite_agents_docker_nb.ipynb diff --git a/notebooks/composite_agents_docker_nb.ipynb b/notebooks/composite_agents_docker_nb.ipynb new file mode 100644 index 00000000..5b328f3e --- /dev/null +++ b/notebooks/composite_agents_docker_nb.ipynb @@ -0,0 +1,162 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Composite Agents with Docker Container Computer\n", + "\n", + "This notebook walks you through running a composed GUI agent using a Docker-based Computer and OpenRouter for the grounding model, paired with a planning model.\n", + "\n", + "We'll use the model string:\n", + "\n", + "- `\"openrouter/z-ai/glm-4.5v+openai/gpt-5-nano\"` (grounding + planning)\n", + "\n", + "Grounding (left) generates actionable UI coordinates; planning (right) reasons and drives steps." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prerequisites\n", + "\n", + "- Docker Desktop or Engine installed and running\n", + "- An OpenRouter account and API key (https://openrouter.ai/)\n", + "- (Optional) An OpenAI API key if using `openai/gpt-5-nano` for planning\n", + "- Python 3.12 environment with `cua-agent` installed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Install CUA Agent (and extras as needed)\n", + "!pip install -q \"cua-agent[all]\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare a Docker Computer\n", + "\n", + "We'll follow the documented Docker provider flow (see `docs/content/docs/computer-sdk/computers.mdx`).\n", + "\n", + "If you don't have the image yet, either pull or build it locally. 
Run these in a terminal, not inside the notebook:\n", + "\n", + "```bash\n", + "# Option 1: Pull from Docker Hub\n", + "docker pull trycua/cua-ubuntu:latest\n", + "\n", + "# Option 2: Build locally (from repo root)\n", + "cd libs/kasm\n", + "docker build -t cua-ubuntu:latest .\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set environment keys\n", + "\n", + "- Get an OpenRouter API key at https://openrouter.ai/\n", + "- If using OpenAI for planning, set your OpenAI key as well\n", + "- You can input them here to set for this notebook session" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY') or input('Enter your OPENROUTER_API_KEY: ').strip()\n", + "os.environ['OPENROUTER_API_KEY'] = OPENROUTER_API_KEY\n", + "\n", + "# Optional: if planning model uses OpenAI provider\n", + "OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') or input('(Optional) Enter your OPENAI_API_KEY (press Enter to skip): ').strip()\n", + "if OPENAI_API_KEY:\n", + " os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a Docker Computer and a composed agent\n", + "\n", + "This uses the documented Docker provider parameters: `os_type=\"linux\"`, `provider_type=\"docker\"`, plus `image` and `name`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import asyncio\n", + "from computer import Computer\n", + "from agent import ComputerAgent\n", + "\n", + "async def main():\n", + " # Launch & connect to a Docker container running the Computer Server\n", + " async with Computer(\n", + " os_type='linux',\n", + " provider_type='docker',\n", + " image='trycua/cua-ubuntu:latest',\n", + " name='my-cua-container'\n", + " ) as computer:\n", + " agent = ComputerAgent(\n", + " model='openrouter/z-ai/glm-4.5v+openai/gpt-5-nano',\n", + " tools=[computer],\n", + " trajectory_dir='trajectories' # Save agent trajectory (screenshots, api calls)\n", + " )\n", + "\n", + " # Simple task to verify end-to-end\n", + " async for _ in agent.run('Open a browser and go to example.com'):\n", + " pass\n", + "\n", + "asyncio.run(main())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Notes\n", + "\n", + "- Grounding (OpenRouter `z-ai/glm-4.5v`) + Planning (OpenAI `gpt-5-nano`) can be swapped for other providers/models.\n", + "- If you prefer to avoid OpenAI, choose a planning model on OpenRouter and update the model string accordingly.\n", + "- Be sure the planning model supports `vision` input and the `tools` parameter.\n", + "- The agent emits normalized Agent Responses across providers." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 58807378dddcf99221fd54b264eb180984efd88c Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Fri, 12 Sep 2025 13:30:09 -0400 Subject: [PATCH 08/17] Added internVL --- libs/python/agent/agent/loops/__init__.py | 4 +- .../agent/agent/loops/composed_grounded.py | 2 +- libs/python/agent/agent/loops/internvl.py | 179 ++++++++++++++++++ libs/python/agent/agent/loops/opencua.py | 20 +- 4 files changed, 200 insertions(+), 5 deletions(-) create mode 100644 libs/python/agent/agent/loops/internvl.py diff --git a/libs/python/agent/agent/loops/__init__.py b/libs/python/agent/agent/loops/__init__.py index 25227e64..958e484c 100644 --- a/libs/python/agent/agent/loops/__init__.py +++ b/libs/python/agent/agent/loops/__init__.py @@ -11,6 +11,7 @@ from . import gta1 from . import composed_grounded from . import glm45v from . import opencua +from . import internvl __all__ = [ "anthropic", @@ -20,5 +21,6 @@ __all__ = [ "gta1", "composed_grounded", "glm45v", - "opencua" + "opencua", + "internvl" ] \ No newline at end of file diff --git a/libs/python/agent/agent/loops/composed_grounded.py b/libs/python/agent/agent/loops/composed_grounded.py index cf029d13..87ba50e1 100644 --- a/libs/python/agent/agent/loops/composed_grounded.py +++ b/libs/python/agent/agent/loops/composed_grounded.py @@ -116,7 +116,7 @@ def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str @register_agent(r".*\+.*", priority=1) -class ComposedGroundedConfig: +class ComposedGroundedConfig(AsyncAgentConfig): """ Composed-grounded agent configuration that uses both grounding and thinking models. diff --git a/libs/python/agent/agent/loops/internvl.py b/libs/python/agent/agent/loops/internvl.py new file mode 100644 index 00000000..d1b8c3fe --- /dev/null +++ b/libs/python/agent/agent/loops/internvl.py @@ -0,0 +1,179 @@ +""" +InternVL agent loop implementation for click prediction using litellm.acompletion. + +Implements the ScreenSpot InternVL grounding baseline behavior: +- Uses the exact grounding prompt format with and tags +- Expects coordinates in 0-1000 normalized range in formats [[x1,y1,x2,y2]] or [[x,y]] +- Converts to pixel coordinates relative to the original screenshot size + +Note: We do NOT manually load the InternVL model; acompletions (via HuggingFaceLocalAdapter) +will handle loading based on the provided model name. 
+""" + +from __future__ import annotations + +import base64 +import math +import re +from io import BytesIO +from typing import Any, Dict, List, Optional, Tuple + +from PIL import Image +import litellm + +from ..decorators import register_agent +from .composed_grounded import ComposedGroundedConfig +from ..types import AgentCapability + + +# Regex patterns matching ScreenSpot baseline extractors +_POINT_PATTERN = re.compile(r"\[\[(\d+),(\d+)\]\]") +_BBOX_PATTERN = re.compile(r"\[\[(\d+),(\d+),(\d+),(\d+)\]\]") + + +def _extract_first_point(text: str) -> Optional[Tuple[float, float]]: + """Extract the first [[x,y]] as normalized (0-1000) floats.""" + m = _POINT_PATTERN.search(text) + if not m: + return None + try: + x = float(m.group(1)) + y = float(m.group(2)) + return x, y + except Exception: + return None + + +def _extract_last_bbox(text: str) -> Optional[Tuple[float, float, float, float]]: + """Extract the last [[x1,y1,x2,y2]] as normalized (0-1000) floats.""" + matches = list(_BBOX_PATTERN.finditer(text)) + if not matches: + return None + m = matches[-1] + try: + x1 = float(m.group(1)) + y1 = float(m.group(2)) + x2 = float(m.group(3)) + y2 = float(m.group(4)) + return x1, y1, x2, y2 + except Exception: + return None + + +def _scale_norm_to_pixels(x_norm: float, y_norm: float, width: int, height: int) -> Tuple[int, int]: + """Scale 0-1000 normalized coordinates to pixel coordinates for given image size.""" + x_px = int(math.floor((x_norm / 1000.0) * width)) + y_px = int(math.floor((y_norm / 1000.0) * height)) + # Clamp to image bounds just in case + x_px = max(0, min(width - 1, x_px)) + y_px = max(0, min(height - 1, y_px)) + return x_px, y_px + + +@register_agent(models=r"(?i).*InternVL.*") +class InternVLConfig(ComposedGroundedConfig): + """InternVL agent configuration reusing ComposedGroundedConfig for steps and + overriding predict_click to implement ScreenSpot InternVL grounding baseline.""" + + async def predict_step( + self, + messages: List[Dict[str, Any]], + model: str, + tools: Optional[List[Dict[str, Any]]] = None, + max_retries: Optional[int] = None, + stream: bool = False, + computer_handler=None, + _on_api_start=None, + _on_api_end=None, + _on_usage=None, + _on_screenshot=None, + **kwargs + ) -> Dict[str, Any]: + """Fallback to a self-composed model""" + return await super().predict_step( + messages=messages, + model=f"{model}+{model}", + tools=tools, + max_retries=max_retries, + stream=stream, + computer_handler=computer_handler, + _on_api_start=_on_api_start, + _on_api_end=_on_api_end, + _on_usage=_on_usage, + _on_screenshot=_on_screenshot, + **kwargs + ) + + async def predict_click( + self, + model: str, + image_b64: str, + instruction: str, + **kwargs + ) -> Optional[Tuple[int, int]]: + """ + Predict click coordinates using InternVL via litellm.acompletion. + + Behavior mirrors the ScreenSpot InternVL baseline: + - Prompt: "\nPlease provide the bounding box coordinate of the UI element this user instruction describes: {instruction}. 
Answer in the format of [[x1, y1, x2, y2]]" + - Parse either [[x,y]] point or [[x1,y1,x2,y2]] bbox, using bbox center if point missing + - Coordinates are 0-1000 normalized; convert to pixel coordinates for the original screenshot + """ + try: + # Decode image dimensions to scale the normalized outputs + img_bytes = base64.b64decode(image_b64) + image = Image.open(BytesIO(img_bytes)) + width, height = image.size + except Exception: + # If decoding fails, proceed with a safe default size to avoid crash + width, height = 1920, 1080 + + # Build grounding prompt exactly like the baseline + grounding_prompt = ( + f"Please provide the bounding box coordinate of the UI element this user instruction describes: {instruction}. " + f"Answer in the format of [[x1, y1, x2, y2]]" + ) + + # Prepare messages for LiteLLM + messages = [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{image_b64}"}, + }, + {"type": "text", "text": grounding_prompt}, + ], + } + ] + + # Call acompletion; HuggingFaceLocalAdapter/model handler will handle InternVL loading + api_kwargs = { + "model": model, + "messages": messages, + # Conservative generation params akin to baseline (deterministic) + "max_tokens": kwargs.get("max_tokens", 256), + "temperature": kwargs.get("temperature", 0.0), + } + + response = await litellm.acompletion(**api_kwargs) + output_text = (response.choices[0].message.content or "").strip() # type: ignore + + # Try to parse a point first; if absent, parse bbox and take center + point = _extract_first_point(output_text) + if point is None: + bbox = _extract_last_bbox(output_text) + if bbox is None: + return None + x1, y1, x2, y2 = bbox + cx = (x1 + x2) / 2.0 + cy = (y1 + y2) / 2.0 + point = (cx, cy) + + x_norm, y_norm = point + x_px, y_px = _scale_norm_to_pixels(x_norm, y_norm, width, height) + return (x_px, y_px) + + def get_capabilities(self) -> List[AgentCapability]: + return ["click", "step"] diff --git a/libs/python/agent/agent/loops/opencua.py b/libs/python/agent/agent/loops/opencua.py index 1688b587..b06ea126 100644 --- a/libs/python/agent/agent/loops/opencua.py +++ b/libs/python/agent/agent/loops/opencua.py @@ -14,6 +14,7 @@ from PIL import Image import litellm import math +from .composed_grounded import ComposedGroundedConfig from ..decorators import register_agent from ..types import Messages, AgentResponse, Tools, AgentCapability from ..loops.base import AsyncAgentConfig @@ -32,10 +33,11 @@ def extract_coordinates_from_pyautogui(text: str) -> Optional[Tuple[int, int]]: return None @register_agent(models=r"(?i).*OpenCUA.*") -class OpenCUAConfig(AsyncAgentConfig): +class OpenCUAConfig(ComposedGroundedConfig): """OpenCUA agent configuration implementing AsyncAgentConfig protocol for click prediction.""" def __init__(self): + super().__init__() self.current_model = None self.last_screenshot_b64 = None @@ -53,8 +55,20 @@ class OpenCUAConfig(AsyncAgentConfig): _on_screenshot=None, **kwargs ) -> Dict[str, Any]: - """Predict step is not implemented for OpenCUA model.""" - raise NotImplementedError("predict_step is not implemented for OpenCUA model") + """Fallback to a self-composed model""" + return await super().predict_step( + messages=messages, + model=f"{model}+{model}", + tools=tools, + max_retries=max_retries, + stream=stream, + computer_handler=computer_handler, + _on_api_start=_on_api_start, + _on_api_end=_on_api_end, + _on_usage=_on_usage, + _on_screenshot=_on_screenshot, + **kwargs + ) async def predict_click( self, From 
eba94ce9193380f70ad3321eaf1d7ab088203727 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 15 Sep 2025 15:55:33 -0400 Subject: [PATCH 09/17] added internVL, Holo1.5, and OpenCUA to readme --- README.md | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index eea56257..f97adb12 100644 --- a/README.md +++ b/README.md @@ -29,21 +29,25 @@ With the Computer SDK, you can: - create & manage VMs [locally](https://docs.trycua.com/docs/computer-sdk/computers#cua-local-containers) or using [cua cloud](https://www.trycua.com/) With the Agent SDK, you can: -- run computer-use models with a [consistent output](https://docs.trycua.com/docs/agent-sdk/chat-history#message-array-structure) -- run composed agents using UI grounding models and any LLM -- use any liteLLM provider (`openai/`, `openrouter/`, etc.) or our included local providers (`huggingface-local/`, `mlx/`) -- quickly evaluate new UI agent models and UI grounding models - - `anthropic/claude-opus-4-1-20250805` (using [Computer-Use Models](https://docs.trycua.com/docs/agent-sdk/supported-agents/computer-use-agents)) - - `openai/computer-use-preview` - - `openrouter/z-ai/glm-4.5v` - - `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` - - `omniparser+{any LLM}` (using [Composed Agents](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents)) - - `huggingface-local/HelloKKMe/GTA1-7B+{any LLM}` - - `huggingface/HelloKKMe/GTA1-32B+{any LLM}` - - `vllm_hosted/HelloKKMe/GTA1-72B+{any LLM}` - - `human/human` (using [Human-in-the-Loop](https://docs.trycua.com/docs/agent-sdk/supported-agents/human-in-the-loop)) +- run computer-use models with a [consistent schema](https://docs.trycua.com/docs/agent-sdk/message-format) +- combine UI grounding models with any LLM using [composed agents](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) +- use new UI agent models and UI grounding models from the Model Zoo below with just a model string (e.g., `ComputerAgent(model="openai/computer-use-preview")`) +- use any API or local provider by specifying a prefix (e.g., `openai/`, `openrouter/`, `ollama/`, `huggingface-local/`, `mlx/`, [etc.](https://docs.litellm.ai/docs/providers)) - benchmark on OSWorld-Verified, SheetBench-V2, and more [with a single line of code using HUD](https://docs.trycua.com/docs/agent-sdk/integrations/hud) ([Notebook](https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb)) +### CUA Model Zoo 🐨 + +| [All-in-one CUAs](https://docs.trycua.com/docs/agent-sdk/supported-agents/computer-use-agents) | [UI Grounding Models](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) | [UI Planning Models](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) | +|---|---|---| +| `anthropic/claude-opus-4-1-20250805` | `huggingface-local/xlangai/OpenCUA-{7B,32B}` | any all-in-one CUA | +| `openai/computer-use-preview` | `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` | any VLM (using liteLLM) | +| `openrouter/z-ai/glm-4.5v` | `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` | | +| `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` | | +| `omniparser+{ui planning}` | | | +| `{ui grounding}+{ui planning}` | | | + +- `human/human` → [Human-in-the-Loop](https://docs.trycua.com/docs/agent-sdk/supported-agents/human-in-the-loop) + Missing a model? 
[Raise a feature request](https://github.com/trycua/cua/issues/new?assignees=&labels=enhancement&projects=&title=%5BAgent%5D%3A+Add+model+support+for+) or [contribute](https://github.com/trycua/cua/blob/main/CONTRIBUTING.md)!
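For example, the composed `{ui grounding}+{ui planning}` strings in the table above can be passed directly to `ComputerAgent`. A minimal sketch, reusing the Docker setup from the notebook added earlier in this series; the OpenCUA-7B + GPT-5-nano pairing is purely illustrative, and `trust_remote_code=True` is the constructor flag introduced in PATCH 06 that OpenCUA's remote model code needs:

```python
import asyncio
from computer import Computer
from agent import ComputerAgent

async def main():
    # Docker-based Computer, as in the composite-agents notebook
    async with Computer(
        os_type="linux",
        provider_type="docker",
        image="trycua/cua-ubuntu:latest",
        name="my-cua-container",
    ) as computer:
        agent = ComputerAgent(
            # UI grounding model + UI planning model from the Model Zoo table
            model="huggingface-local/xlangai/OpenCUA-7B+openai/gpt-5-nano",
            tools=[computer],
            trust_remote_code=True,  # needed to load OpenCUA's custom model class
        )
        async for _ in agent.run("Open a browser and go to example.com"):
            pass

asyncio.run(main())
```

The same pattern applies to the other grounding entries (GTA1, Holo1.5, UI-TARS); only the left-hand side of the composed model string changes.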
From 3a1244e1c18421c78ccc3afac2ba83f806b6824f Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 15 Sep 2025 15:59:33 -0400 Subject: [PATCH 10/17] updated model zoo --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f97adb12..cd732f57 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ With the Agent SDK, you can: - run computer-use models with a [consistent schema](https://docs.trycua.com/docs/agent-sdk/message-format) - combine UI grounding models with any LLM using [composed agents](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) - use new UI agent models and UI grounding models from the Model Zoo below with just a model string (e.g., `ComputerAgent(model="openai/computer-use-preview")`) -- use any API or local provider by specifying a prefix (e.g., `openai/`, `openrouter/`, `ollama/`, `huggingface-local/`, `mlx/`, [etc.](https://docs.litellm.ai/docs/providers)) +- use API or local inference by specifying a prefix (e.g., `openai/`, `openrouter/`, `ollama/`, `huggingface-local/`, `mlx/`, [etc.](https://docs.litellm.ai/docs/providers)) - benchmark on OSWorld-Verified, SheetBench-V2, and more [with a single line of code using HUD](https://docs.trycua.com/docs/agent-sdk/integrations/hud) ([Notebook](https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb)) ### CUA Model Zoo 🐨 @@ -40,7 +40,7 @@ With the Agent SDK, you can: | [All-in-one CUAs](https://docs.trycua.com/docs/agent-sdk/supported-agents/computer-use-agents) | [UI Grounding Models](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) | [UI Planning Models](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) | |---|---|---| | `anthropic/claude-opus-4-1-20250805` | `huggingface-local/xlangai/OpenCUA-{7B,32B}` | any all-in-one CUA | -| `openai/computer-use-preview` | `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` | any VLM (using liteLLM) | +| `openai/computer-use-preview` | `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` | any VLM (using liteLLM, requires `tools` parameter) | | `openrouter/z-ai/glm-4.5v` | `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` | | | `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` | | | `omniparser+{ui planning}` | | | From c217ed13af6a07608c4f1d2e63e3cee80aa0f12f Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 15 Sep 2025 16:01:31 -0400 Subject: [PATCH 11/17] wording change --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cd732f57..c9e814aa 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ With the Agent SDK, you can: - run computer-use models with a [consistent schema](https://docs.trycua.com/docs/agent-sdk/message-format) - combine UI grounding models with any LLM using [composed agents](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) - use new UI agent models and UI grounding models from the Model Zoo below with just a model string (e.g., `ComputerAgent(model="openai/computer-use-preview")`) -- use API or local inference by specifying a prefix (e.g., `openai/`, `openrouter/`, `ollama/`, `huggingface-local/`, `mlx/`, [etc.](https://docs.litellm.ai/docs/providers)) +- use API or local inference by changing a prefix (e.g., `openai/`, `openrouter/`, `ollama/`, `huggingface-local/`, `mlx/`, [etc.](https://docs.litellm.ai/docs/providers)) - benchmark on OSWorld-Verified, SheetBench-V2, and more 
[with a single line of code using HUD](https://docs.trycua.com/docs/agent-sdk/integrations/hud) ([Notebook](https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb)) ### CUA Model Zoo 🐨 From 7a7de5d50f8c001300da69c3c154579be890ffe9 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 15 Sep 2025 16:10:54 -0400 Subject: [PATCH 12/17] add holo models --- README.md | 5 +- libs/python/agent/agent/loops/__init__.py | 4 +- libs/python/agent/agent/loops/holo.py | 216 ++++++++++++++++++++++ 3 files changed, 222 insertions(+), 3 deletions(-) create mode 100644 libs/python/agent/agent/loops/holo.py diff --git a/README.md b/README.md index c9e814aa..24316555 100644 --- a/README.md +++ b/README.md @@ -40,9 +40,10 @@ With the Agent SDK, you can: | [All-in-one CUAs](https://docs.trycua.com/docs/agent-sdk/supported-agents/computer-use-agents) | [UI Grounding Models](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) | [UI Planning Models](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) | |---|---|---| | `anthropic/claude-opus-4-1-20250805` | `huggingface-local/xlangai/OpenCUA-{7B,32B}` | any all-in-one CUA | -| `openai/computer-use-preview` | `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` | any VLM (using liteLLM, requires `tools` parameter) | +| `openai/computer-use-preview` | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` | any VLM (using liteLLM, requires `tools` parameter) | | `openrouter/z-ai/glm-4.5v` | `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` | | -| `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` | | +| `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` | any all-in-one CUA | | +| `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` | | | `omniparser+{ui planning}` | | | | `{ui grounding}+{ui planning}` | | | diff --git a/libs/python/agent/agent/loops/__init__.py b/libs/python/agent/agent/loops/__init__.py index 958e484c..406f14ca 100644 --- a/libs/python/agent/agent/loops/__init__.py +++ b/libs/python/agent/agent/loops/__init__.py @@ -12,6 +12,7 @@ from . import composed_grounded from . import glm45v from . import opencua from . import internvl +from . import holo __all__ = [ "anthropic", @@ -22,5 +23,6 @@ __all__ = [ "composed_grounded", "glm45v", "opencua", - "internvl" + "internvl", + "holo", ] \ No newline at end of file diff --git a/libs/python/agent/agent/loops/holo.py b/libs/python/agent/agent/loops/holo.py new file mode 100644 index 00000000..b1cbc5a1 --- /dev/null +++ b/libs/python/agent/agent/loops/holo.py @@ -0,0 +1,216 @@ +""" +Holo 1.5 agent loop implementation for click prediction using litellm.acompletion. + +Implements the Holo1.5 grounding behavior: +- Prompt asks for absolute pixel coordinates in JSON: {"action":"click_absolute","x":int,"y":int} +- Optionally resizes the image using Qwen2-VL smart_resize parameters (via transformers AutoProcessor) +- If resized, maps predicted coordinates back to the original screenshot resolution + +Note: We do NOT manually load the model; acompletions (via HuggingFaceLocalAdapter) +will handle loading based on the provided model name. 
+""" + +from __future__ import annotations + +import base64 +import json +from io import BytesIO +from typing import Any, Dict, List, Optional, Tuple + +import litellm +from PIL import Image + +from ..decorators import register_agent +from .base import AsyncAgentConfig +from ..types import AgentCapability + + +def _strip_hf_prefix(model: str) -> str: + """Strip provider prefixes like 'huggingface-local/' from model names for HF processor load.""" + if "/" in model and model.lower().startswith("huggingface-local/"): + return model.split("/", 1)[1] + return model + + +def _maybe_smart_resize(image: Image.Image, model: str) -> Tuple[Image.Image, Tuple[int, int]]: + """ + Try to compute Qwen2-VL smart_resize output size using transformers AutoProcessor. + + Returns (processed_image, (orig_w, orig_h)). If transformers or processor unavailable, + returns the original image and size without resizing. + """ + orig_w, orig_h = image.size + try: + # Import lazily to avoid hard dependency if not installed + from transformers import AutoProcessor # type: ignore + from transformers.models.qwen2_vl.image_processing_qwen2_vl import ( # type: ignore + smart_resize, + ) + + processor_name = _strip_hf_prefix(model) + processor = AutoProcessor.from_pretrained(processor_name) + image_processor = getattr(processor, "image_processor", None) + if image_processor is None: + return image, (orig_w, orig_h) + + factor = getattr(image_processor, "patch_size", 14) * getattr(image_processor, "merge_size", 1) + min_pixels = getattr(image_processor, "min_pixels", 256 * 256) + max_pixels = getattr(image_processor, "max_pixels", 1536 * 1536) + + resized_h, resized_w = smart_resize( + orig_h, + orig_w, + factor=factor, + min_pixels=min_pixels, + max_pixels=max_pixels, + ) + + if (resized_w, resized_h) == (orig_w, orig_h): + return image, (orig_w, orig_h) + + processed = image.resize((resized_w, resized_h), resample=Image.Resampling.LANCZOS) + return processed, (orig_w, orig_h) + except Exception: + # If any failure (no transformers, processor load error), fall back to original + return image, (orig_w, orig_h) + + +def _build_holo_prompt(instruction: str) -> str: + """Construct the Holo1.5 grounding prompt.""" + # Keep it close to the cookbook while avoiding heavy schema generation + schema_hint = '{"action": "click_absolute", "x": , "y": }' + return ( + "Localize an element on the GUI image according to the provided target and output a click position. " + f"You must output a valid JSON following the format: {schema_hint} " + f"Your target is: {instruction}" + ) + + +def _parse_click_json(output_text: str) -> Optional[Tuple[int, int]]: + """ + Parse JSON from model output and extract x, y ints. + Tries to find the first JSON object substring if extra text is present. 
+ """ + try: + # Fast path: direct JSON + data = json.loads(output_text) + except Exception: + # Try to locate a JSON object within the text + start = output_text.find("{") + end = output_text.rfind("}") + if start == -1 or end == -1 or end <= start: + return None + try: + data = json.loads(output_text[start : end + 1]) + except Exception: + return None + + try: + x = int(data.get("x")) + y = int(data.get("y")) + return x, y + except Exception: + return None + + +@register_agent(models=r"(?i).*(Holo1\.5|Hcompany/Holo1\.5).*") +class HoloConfig(AsyncAgentConfig): + """Holo is a family of UI grounding models from H Company""" + + async def predict_step( + self, + messages: List[Dict[str, Any]], + model: str, + tools: Optional[List[Dict[str, Any]]] = None, + max_retries: Optional[int] = None, + stream: bool = False, + computer_handler=None, + _on_api_start=None, + _on_api_end=None, + _on_usage=None, + _on_screenshot=None, + **kwargs, + ) -> Dict[str, Any]: + # Holo models are only trained on UI localization tasks, not all-in-one agent + raise NotImplementedError() + + async def predict_click( + self, + model: str, + image_b64: str, + instruction: str, + **kwargs, + ) -> Optional[Tuple[int, int]]: + """ + Predict click coordinates using Holo1.5 via litellm.acompletion. + + - Optionally smart-resizes the image using Qwen2-VL rules if transformers are available + - Prompts for JSON with absolute pixel coordinates + - Parses x,y and maps back to original screenshot size if resized + """ + try: + img_bytes = base64.b64decode(image_b64) + original_img = Image.open(BytesIO(img_bytes)) + except Exception: + return None + + # Optional preprocessing + processed_img, (orig_w, orig_h) = _maybe_smart_resize(original_img, model) + + # If we resized, send the resized image; otherwise send original + img_to_send = processed_img + buf = BytesIO() + img_to_send.save(buf, format="PNG") + processed_b64 = base64.b64encode(buf.getvalue()).decode("utf-8") + + prompt = _build_holo_prompt(instruction) + + messages = [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{processed_b64}"}, + }, + {"type": "text", "text": prompt}, + ], + } + ] + + api_kwargs = { + "model": model, + "messages": messages, + # Deterministic, small output + "max_tokens": kwargs.get("max_tokens", 256), + "temperature": kwargs.get("temperature", 0.0), + } + + response = await litellm.acompletion(**api_kwargs) + output_text = (response.choices[0].message.content or "").strip() # type: ignore + + coords = _parse_click_json(output_text) + if coords is None: + return None + + x, y = coords + + # Map back to original size if we resized + proc_w, proc_h = img_to_send.size + if (proc_w, proc_h) != (orig_w, orig_h): + try: + sx = orig_w / float(proc_w) + sy = orig_h / float(proc_h) + x = int(round(x * sx)) + y = int(round(y * sy)) + except Exception: + # Fallback: clamp within original bounds + pass + + # Clamp to original image bounds + x = max(0, min(orig_w - 1, x)) + y = max(0, min(orig_h - 1, y)) + return x, y + + def get_capabilities(self) -> List[AgentCapability]: + return ["click"] From c5bbd4611ab11d9eaaba684f916717978c3f0005 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 15 Sep 2025 16:29:26 -0400 Subject: [PATCH 13/17] add qwen2_5_vl.py --- .../agent/agent/adapters/models/__init__.py | 4 +- .../agent/agent/adapters/models/qwen2_5_vl.py | 75 +++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 
libs/python/agent/agent/adapters/models/qwen2_5_vl.py diff --git a/libs/python/agent/agent/adapters/models/__init__.py b/libs/python/agent/agent/adapters/models/__init__.py index 99696a1a..10f896d2 100644 --- a/libs/python/agent/agent/adapters/models/__init__.py +++ b/libs/python/agent/agent/adapters/models/__init__.py @@ -8,7 +8,7 @@ except ImportError: from .generic import GenericHFModel from .opencua import OpenCUAModel - +from .qwen2_5_vl import Qwen2_5_VLModel def load_model(model_name: str, device: str = "auto", trust_remote_code: bool = False): """Factory function to load and return the right model handler instance. @@ -25,4 +25,6 @@ def load_model(model_name: str, device: str = "auto", trust_remote_code: bool = # print(f"cls: {cls}") if "OpenCUA" in cls: return OpenCUAModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) + elif "Qwen2_5_VLConfig" in cls: + return Qwen2_5_VLModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) return GenericHFModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) diff --git a/libs/python/agent/agent/adapters/models/qwen2_5_vl.py b/libs/python/agent/agent/adapters/models/qwen2_5_vl.py new file mode 100644 index 00000000..17b25f8a --- /dev/null +++ b/libs/python/agent/agent/adapters/models/qwen2_5_vl.py @@ -0,0 +1,75 @@ +from typing import List, Dict, Any, Optional + +# Hugging Face imports are local to avoid hard dependency at module import +try: + import torch # type: ignore + from transformers import AutoModelForImageTextToText, AutoProcessor # type: ignore + HF_AVAILABLE = True +except Exception: + HF_AVAILABLE = False + + +class Qwen2_5_VLModel: + """Qwen2.5-VL Hugging Face vision-language model handler. + Loads an AutoModelForImageTextToText and AutoProcessor and generates text. + """ + + def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None: + if not HF_AVAILABLE: + raise ImportError( + "HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\"" + ) + self.model_name = model_name + self.device = device + self.model = None + self.processor = None + self.trust_remote_code = trust_remote_code + self._load() + + def _load(self) -> None: + # Load model + self.model = AutoModelForImageTextToText.from_pretrained( + self.model_name, + torch_dtype=torch.bfloat16, + device_map=self.device, + attn_implementation="sdpa", + trust_remote_code=self.trust_remote_code, + ) + # Load processor + self.processor = AutoProcessor.from_pretrained( + self.model_name, + min_pixels=3136, + max_pixels=4096 * 2160, + device_map=self.device, + trust_remote_code=self.trust_remote_code, + ) + + def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str: + """Generate text for the given HF-format messages. 
+ messages: [{ role, content: [{type:'text'|'image', text|image}] }] + """ + assert self.model is not None and self.processor is not None + # Apply chat template and tokenize + inputs = self.processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + ) + # Move inputs to the same device as model + inputs = inputs.to(self.model.device) + # Generate + with torch.no_grad(): + generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens) + # Trim prompt tokens from output + generated_ids_trimmed = [ + out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) + ] + # Decode + output_text = self.processor.batch_decode( + generated_ids_trimmed, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) + return output_text[0] if output_text else "" From a46c276e70063607e851030fac880e75e9cd21a5 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 15 Sep 2025 16:41:39 -0400 Subject: [PATCH 14/17] updated model docs --- .../supported-agents/composed-agents.mdx | 32 ++++++++------- .../supported-agents/computer-use-agents.mdx | 41 ++++++++++++------- .../supported-agents/grounding-models.mdx | 29 +++++++------ 3 files changed, 61 insertions(+), 41 deletions(-) diff --git a/docs/content/docs/agent-sdk/supported-agents/composed-agents.mdx b/docs/content/docs/agent-sdk/supported-agents/composed-agents.mdx index 8040d2e5..485074e2 100644 --- a/docs/content/docs/agent-sdk/supported-agents/composed-agents.mdx +++ b/docs/content/docs/agent-sdk/supported-agents/composed-agents.mdx @@ -5,32 +5,36 @@ description: Combine grounding models with any LLM for computer-use capabilities Composed agents combine the best of both worlds: specialized grounding models for precise click prediction and powerful LLMs for task planning and reasoning. -Use the format `"grounding_model+thinking_model"` to create a composed agent with any vision-enabled LiteLLM-compatible model. +Use the format `"grounding_model+planning_model"` to create a composed agent with any vision-enabled LiteLLM-compatible model. ## How Composed Agents Work -1. **Planning Phase**: The thinking model (LLM) analyzes the task and decides what actions to take (e.g., `click("find the login button")`, `type("username")`) +1. **Planning Phase**: The planning model (LLM) analyzes the task and decides what actions to take (e.g., `click("find the login button")`, `type("username")`) 2. **Grounding Phase**: The grounding model converts element descriptions to precise coordinates 3. **Execution**: Actions are performed using the predicted coordinates ## Supported Grounding Models -Any model that supports `predict_click()` can be used as the grounding component: +Any model that supports `predict_click()` can be used as the grounding component. See the full list on [Grounding Models](./grounding-models). 
-- `omniparser` (OSS set-of-marks model) -- `huggingface-local/HelloKKMe/GTA1-7B` (OSS grounding model) -- `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` (OSS unified model) -- `claude-3-5-sonnet-20241022` (Anthropic CUA) -- `openai/computer-use-preview` (OpenAI CUA) +- OpenCUA: `huggingface-local/xlangai/OpenCUA-{7B,32B}` +- GTA1 family: `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` +- Holo 1.5 family: `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` +- InternVL 3.5 family: `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` +- UI‑TARS 1.5: `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` (also supports full CU) +- OmniParser (OCR): `omniparser` (requires combination with a LiteLLM vision model) -## Supported Thinking Models +## Supported Planning Models -Any vision-enabled LiteLLM-compatible model can be used as the thinking component: +Any vision-enabled LiteLLM-compatible model can be used as the planning component: -- **Anthropic**: `anthropic/claude-3-5-sonnet-20241022`, `anthropic/claude-3-opus-20240229` -- **OpenAI**: `openai/gpt-5`, `openai/gpt-o3`, `openai/gpt-4o` -- **Google**: `gemini/gemini-1.5-pro`, `vertex_ai/gemini-pro-vision` -- **Local models**: Any Hugging Face vision-language model +- Any All‑in‑one CUA (planning-capable). See [All‑in‑one CUAs](./computer-use-agents). +- Any VLM via LiteLLM providers: `anthropic/*`, `openai/*`, `openrouter/*`, `gemini/*`, `vertex_ai/*`, `huggingface-local/*`, `mlx/*`, etc. +- Examples: + - **Anthropic**: `anthropic/claude-3-5-sonnet-20241022`, `anthropic/claude-opus-4-1-20250805` + - **OpenAI**: `openai/gpt-5`, `openai/gpt-o3`, `openai/gpt-4o` + - **Google**: `gemini/gemini-1.5-pro`, `vertex_ai/gemini-pro-vision` + - **Local models**: Any Hugging Face vision-language model ## Usage Examples diff --git a/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx b/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx index 44ab41d1..b2487a7c 100644 --- a/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx +++ b/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx @@ -1,5 +1,5 @@ --- -title: Computer-Use Models +title: All‑in‑one CUA Models description: Models that support full computer-use agent capabilities with ComputerAgent.run() --- @@ -36,19 +36,6 @@ async for _ in agent.run("Take a screenshot and describe what you see"): pass ``` -## UI-TARS 1.5 - -Unified vision-language model for computer-use: - -- `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` -- `huggingface/ByteDance-Seed/UI-TARS-1.5-7B` (requires TGI endpoint) - -```python -agent = ComputerAgent("huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", tools=[computer]) -async for _ in agent.run("Open the settings menu and change the theme to dark mode"): - pass -``` - ## GLM-4.5V Zhipu AI's GLM-4.5V vision-language model with computer-use capabilities: @@ -62,6 +49,32 @@ async for _ in agent.run("Click on the search bar and type 'hello world'"): pass ``` +## InternVL 3.5 + +InternVL 3.5 family: +- `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` + +```python +agent = ComputerAgent("huggingface-local/OpenGVLab/InternVL3_5-1B", tools=[computer]) +async for _ in agent.run("Open Firefox and navigate to github.com"): + pass +``` + +## UI-TARS 1.5 + +Unified vision-language model for computer-use: + +- `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` +- `huggingface/ByteDance-Seed/UI-TARS-1.5-7B` (requires TGI endpoint) + +```python +agent = 
ComputerAgent("huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", tools=[computer]) +async for _ in agent.run("Open the settings menu and change the theme to dark mode"): + pass +``` + --- +CUAs also support direct click prediction. See [Grounding Models](./grounding-models) for details on `predict_click()`. + For details on agent loop behavior and usage, see [Agent Loops](../agent-loops). diff --git a/docs/content/docs/agent-sdk/supported-agents/grounding-models.mdx b/docs/content/docs/agent-sdk/supported-agents/grounding-models.mdx index 65d254fe..9270f183 100644 --- a/docs/content/docs/agent-sdk/supported-agents/grounding-models.mdx +++ b/docs/content/docs/agent-sdk/supported-agents/grounding-models.mdx @@ -7,9 +7,7 @@ These models specialize in UI element grounding and click prediction. They can i Use `ComputerAgent.predict_click()` to get coordinates for specific UI elements. -## All Computer-Use Agents - -All models that support `ComputerAgent.run()` also support `ComputerAgent.predict_click()`: +All models that support `ComputerAgent.run()` also support `ComputerAgent.predict_click()`. See [All‑in‑one CUAs](./computer-use-agents). ### Anthropic CUAs @@ -21,7 +19,7 @@ All models that support `ComputerAgent.run()` also support `ComputerAgent.predic ### OpenAI CUA Preview - Computer-use-preview: `computer-use-preview` -### UI-TARS 1.5 +### UI-TARS 1.5 (Unified VLM with grounding support) - `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` - `huggingface/ByteDance-Seed/UI-TARS-1.5-7B` (requires TGI endpoint) @@ -29,18 +27,24 @@ All models that support `ComputerAgent.run()` also support `ComputerAgent.predic These models are optimized specifically for click prediction and UI element grounding: -### OmniParser +### OpenCUA +- `huggingface-local/xlangai/OpenCUA-{7B,32B}` + +### GTA1 Family +- `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` + +### Holo 1.5 Family +- `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` + +### InternVL 3.5 Family +- `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` + +### OmniParser (OCR) OCR-focused set-of-marks model that requires an LLM for click prediction: - `omniparser` (requires combination with any LiteLLM vision model) -### GTA1-7B - -State-of-the-art grounding model from the [GUI Agent Grounding Leaderboard](https://gui-agent.github.io/grounding-leaderboard/): - -- `huggingface-local/HelloKKMe/GTA1-7B` - ## Usage Examples ```python @@ -83,7 +87,6 @@ print(f"Click coordinates: {coords}") # (450, 320) # agent.run("Fill out the form and submit it") ``` - --- -For information on combining grounding models with planning capabilities, see [Composed Agents](./composed-agents). +For information on combining grounding models with planning capabilities, see [Composed Agents](./composed-agents) and [All‑in‑one CUAs](./computer-use-agents). 
From 7cf27b1cc373431b3036a2d5d855c9d86f2e807d Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 15 Sep 2025 16:47:34 -0400 Subject: [PATCH 15/17] docs organization --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 24316555..e03d9bb3 100644 --- a/README.md +++ b/README.md @@ -30,10 +30,10 @@ With the Computer SDK, you can: With the Agent SDK, you can: - run computer-use models with a [consistent schema](https://docs.trycua.com/docs/agent-sdk/message-format) +- benchmark on OSWorld-Verified, SheetBench-V2, and more [with a single line of code using HUD](https://docs.trycua.com/docs/agent-sdk/integrations/hud) ([Notebook](https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb)) - combine UI grounding models with any LLM using [composed agents](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) - use new UI agent models and UI grounding models from the Model Zoo below with just a model string (e.g., `ComputerAgent(model="openai/computer-use-preview")`) - use API or local inference by changing a prefix (e.g., `openai/`, `openrouter/`, `ollama/`, `huggingface-local/`, `mlx/`, [etc.](https://docs.litellm.ai/docs/providers)) -- benchmark on OSWorld-Verified, SheetBench-V2, and more [with a single line of code using HUD](https://docs.trycua.com/docs/agent-sdk/integrations/hud) ([Notebook](https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb)) ### CUA Model Zoo 🐨 From 9147e8eeaf419c7116663df66b1d8604175cc800 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 16 Sep 2025 12:02:07 -0400 Subject: [PATCH 16/17] Added "cua-agent[internvl-hf]" dep --- .../agent/agent/adapters/models/__init__.py | 5 +- .../agent/agent/adapters/models/internvl.py | 78 ++++++++++++++++ libs/python/agent/agent/cli.py | 90 ++++++++++++++++++- libs/python/agent/pyproject.toml | 12 ++- 4 files changed, 182 insertions(+), 3 deletions(-) create mode 100644 libs/python/agent/agent/adapters/models/internvl.py diff --git a/libs/python/agent/agent/adapters/models/__init__.py b/libs/python/agent/agent/adapters/models/__init__.py index 10f896d2..b36fda1b 100644 --- a/libs/python/agent/agent/adapters/models/__init__.py +++ b/libs/python/agent/agent/adapters/models/__init__.py @@ -9,6 +9,7 @@ except ImportError: from .generic import GenericHFModel from .opencua import OpenCUAModel from .qwen2_5_vl import Qwen2_5_VLModel +from .internvl import InternVLModel def load_model(model_name: str, device: str = "auto", trust_remote_code: bool = False): """Factory function to load and return the right model handler instance. 
@@ -22,9 +23,11 @@ def load_model(model_name: str, device: str = "auto", trust_remote_code: bool = ) cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code) cls = cfg.__class__.__name__ - # print(f"cls: {cls}") + print(f"cls: {cls}") if "OpenCUA" in cls: return OpenCUAModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) elif "Qwen2_5_VLConfig" in cls: return Qwen2_5_VLModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) + elif "InternVLChatConfig" in cls: + return InternVLModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) return GenericHFModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) diff --git a/libs/python/agent/agent/adapters/models/internvl.py b/libs/python/agent/agent/adapters/models/internvl.py new file mode 100644 index 00000000..0ed32e6b --- /dev/null +++ b/libs/python/agent/agent/adapters/models/internvl.py @@ -0,0 +1,78 @@ +from typing import List, Dict, Any, Optional + +# Hugging Face imports are local to avoid hard dependency at module import +try: + import torch # type: ignore + from transformers import AutoModel, AutoProcessor # type: ignore + # Attempt to import InternVL's model dependencies + import einops as _ # type: ignore + import timm as _ # type: ignore + HF_AVAILABLE = True +except Exception: + HF_AVAILABLE = False + + +class InternVLModel: + """Generic Hugging Face vision-language model handler. + Loads an AutoModelForImageTextToText and AutoProcessor and generates text. + """ + + def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None: + if not HF_AVAILABLE: + raise ImportError( + "InternVL dependencies not found. Install with: pip install \"cua-agent[internvl-hf]\"" + ) + self.model_name = model_name + self.device = device + self.model = None + self.processor = None + self.trust_remote_code = trust_remote_code + self._load() + + def _load(self) -> None: + # Load model + self.model = AutoModel.from_pretrained( + self.model_name, + torch_dtype=torch.float16, + device_map=self.device, + attn_implementation="sdpa", + trust_remote_code=self.trust_remote_code, + ) + # Load processor + self.processor = AutoProcessor.from_pretrained( + self.model_name, + min_pixels=3136, + max_pixels=4096 * 2160, + device_map=self.device, + trust_remote_code=self.trust_remote_code, + ) + + def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str: + """Generate text for the given HF-format messages. 
+ messages: [{ role, content: [{type:'text'|'image', text|image}] }] + """ + assert self.model is not None and self.processor is not None + # Apply chat template and tokenize + inputs = self.processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + ) + # Move inputs to the same device as model + inputs = inputs.to(self.model.device) + # Generate + with torch.no_grad(): + generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens) + # Trim prompt tokens from output + generated_ids_trimmed = [ + out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) + ] + # Decode + output_text = self.processor.batch_decode( + generated_ids_trimmed, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) + return output_text[0] if output_text else "" diff --git a/libs/python/agent/agent/cli.py b/libs/python/agent/agent/cli.py index b04f11db..c0434d02 100644 --- a/libs/python/agent/agent/cli.py +++ b/libs/python/agent/agent/cli.py @@ -18,6 +18,15 @@ try: import json from typing import List, Dict, Any import dotenv + import base64 + import time + import platform + from pathlib import Path + try: + from PIL import Image, ImageDraw + PIL_AVAILABLE = True + except Exception: + PIL_AVAILABLE = False from yaspin import yaspin except ImportError: if __name__ == "__main__": @@ -248,6 +257,13 @@ Examples: help="Initial prompt to send to the agent. Leave blank for interactive mode." ) + parser.add_argument( + "--predict-click", + dest="predict_click", + type=str, + help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it." + ) + parser.add_argument( "-c", "--cache", action="store_true", @@ -354,7 +370,79 @@ Examples: agent = ComputerAgent(**agent_kwargs) - # Start chat loop + # If predict-click mode is requested, run once and exit + if args.predict_click: + if not PIL_AVAILABLE: + print_colored("❌ Pillow (PIL) is required for --predict-click visualization. 
Install with: pip install pillow", Colors.RED, bold=True) + sys.exit(1) + + instruction = args.predict_click + print_colored(f"Predicting click for: '{instruction}'", Colors.CYAN) + + # Take a fresh screenshot FIRST + try: + img_bytes = await computer.interface.screenshot() + except Exception as e: + print_colored(f"❌ Failed to take screenshot: {e}", Colors.RED, bold=True) + sys.exit(1) + + # Encode screenshot to base64 for predict_click + try: + image_b64 = base64.b64encode(img_bytes).decode("utf-8") + except Exception as e: + print_colored(f"❌ Failed to encode screenshot: {e}", Colors.RED, bold=True) + sys.exit(1) + + try: + coords = await agent.predict_click(instruction, image_b64=image_b64) + except Exception as e: + print_colored(f"❌ predict_click failed: {e}", Colors.RED, bold=True) + sys.exit(1) + + if not coords: + print_colored("⚠️ No coordinates returned.", Colors.YELLOW) + sys.exit(2) + + x, y = coords + print_colored(f"✅ Predicted coordinates: ({x}, {y})", Colors.GREEN) + + try: + from io import BytesIO + with Image.open(BytesIO(img_bytes)) as img: + img = img.convert("RGB") + draw = ImageDraw.Draw(img) + # Draw crosshair + size = 12 + color = (255, 0, 0) + draw.line([(x - size, y), (x + size, y)], fill=color, width=3) + draw.line([(x, y - size), (x, y + size)], fill=color, width=3) + # Optional small circle + r = 6 + draw.ellipse([(x - r, y - r), (x + r, y + r)], outline=color, width=2) + + out_path = Path.cwd() / f"predict_click_{int(time.time())}.png" + img.save(out_path) + print_colored(f"🖼️ Saved to {out_path}") + + # Open the image with default viewer + try: + system = platform.system().lower() + if system == "windows": + os.startfile(str(out_path)) # type: ignore[attr-defined] + elif system == "darwin": + os.system(f"open \"{out_path}\"") + else: + os.system(f"xdg-open \"{out_path}\"") + except Exception: + pass + except Exception as e: + print_colored(f"❌ Failed to render/save screenshot: {e}", Colors.RED, bold=True) + sys.exit(1) + + # Done + sys.exit(0) + + # Start chat loop (default interactive mode) await chat_loop(agent, args.model, container_name, args.prompt, args.usage) diff --git a/libs/python/agent/pyproject.toml b/libs/python/agent/pyproject.toml index b78931ea..6fea439c 100644 --- a/libs/python/agent/pyproject.toml +++ b/libs/python/agent/pyproject.toml @@ -53,6 +53,13 @@ opencua-hf = [ "tiktoken>=0.11.0", "blobfile>=3.0.0" ] +internvl-hf = [ + "accelerate", + "torch", + "transformers>=4.55.0", + "einops", + "timm" +] ui = [ "gradio>=5.23.3", "python-dotenv>=1.0.1", @@ -68,7 +75,10 @@ all = [ "mlx-vlm>=0.1.27; sys_platform == 'darwin'", "accelerate", "torch", - "transformers>=4.54.0", + "transformers>=4.55.0", + # internvl requirements, + "einops", + "timm", # opencua requirements "tiktoken>=0.11.0", "blobfile>=3.0.0", From 6ddddf8f880dd1eb86a724b395a0a57ba0bba7e6 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 16 Sep 2025 12:56:07 -0400 Subject: [PATCH 17/17] fix internVL inference --- .../agent/agent/adapters/models/__init__.py | 4 +- .../agent/agent/adapters/models/internvl.py | 247 +++++++++++++++--- libs/python/agent/agent/loops/internvl.py | 12 +- 3 files changed, 222 insertions(+), 41 deletions(-) diff --git a/libs/python/agent/agent/adapters/models/__init__.py b/libs/python/agent/agent/adapters/models/__init__.py index b36fda1b..3ed48404 100644 --- a/libs/python/agent/agent/adapters/models/__init__.py +++ b/libs/python/agent/agent/adapters/models/__init__.py @@ -26,8 +26,8 @@ def load_model(model_name: str, device: str = "auto", 
trust_remote_code: bool = print(f"cls: {cls}") if "OpenCUA" in cls: return OpenCUAModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) - elif "Qwen2_5_VLConfig" in cls: + elif "Qwen2_5_VL" in cls: return Qwen2_5_VLModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) - elif "InternVLChatConfig" in cls: + elif "InternVL" in cls: return InternVLModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) return GenericHFModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code) diff --git a/libs/python/agent/agent/adapters/models/internvl.py b/libs/python/agent/agent/adapters/models/internvl.py index 0ed32e6b..bb2de42e 100644 --- a/libs/python/agent/agent/adapters/models/internvl.py +++ b/libs/python/agent/agent/adapters/models/internvl.py @@ -3,10 +3,16 @@ from typing import List, Dict, Any, Optional # Hugging Face imports are local to avoid hard dependency at module import try: import torch # type: ignore - from transformers import AutoModel, AutoProcessor # type: ignore + from transformers import AutoModel, AutoTokenizer # type: ignore # Attempt to import InternVL's model dependencies import einops as _ # type: ignore import timm as _ # type: ignore + from PIL import Image # type: ignore + import torchvision.transforms as T # type: ignore + from torchvision.transforms.functional import InterpolationMode # type: ignore + import base64 # type: ignore + from io import BytesIO # type: ignore + import requests # type: ignore HF_AVAILABLE = True except Exception: HF_AVAILABLE = False @@ -14,7 +20,8 @@ except Exception: class InternVLModel: """Generic Hugging Face vision-language model handler. - Loads an AutoModelForImageTextToText and AutoProcessor and generates text. + Uses InternVL's native `model.chat()` interface with `AutoTokenizer`. + Provides preprocessing to support multi-turn conversations with multiple images. 
""" def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None: @@ -25,7 +32,7 @@ class InternVLModel: self.model_name = model_name self.device = device self.model = None - self.processor = None + self.tokenizer = None self.trust_remote_code = trust_remote_code self._load() @@ -33,46 +40,214 @@ class InternVLModel: # Load model self.model = AutoModel.from_pretrained( self.model_name, - torch_dtype=torch.float16, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + use_flash_attn=True, device_map=self.device, - attn_implementation="sdpa", trust_remote_code=self.trust_remote_code, - ) - # Load processor - self.processor = AutoProcessor.from_pretrained( + ).eval() + # Load tokenizer (InternVL requires trust_remote_code=True and often use_fast=False) + self.tokenizer = AutoTokenizer.from_pretrained( self.model_name, - min_pixels=3136, - max_pixels=4096 * 2160, - device_map=self.device, trust_remote_code=self.trust_remote_code, + use_fast=False, ) + # ---- Image preprocessing utilities adapted from InternVL docs ---- + IMAGENET_MEAN = (0.485, 0.456, 0.406) + IMAGENET_STD = (0.229, 0.224, 0.225) + + def _build_transform(self, input_size: int) -> T.Compose: + MEAN, STD = self.IMAGENET_MEAN, self.IMAGENET_STD + transform = T.Compose([ + T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), + T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=MEAN, std=STD) + ]) + return transform + + def _find_closest_aspect_ratio(self, aspect_ratio: float, target_ratios: List[tuple], width: int, height: int, image_size: int): + best_ratio_diff = float('inf') + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + def _dynamic_preprocess(self, image: Image.Image, min_num: int = 1, max_num: int = 12, image_size: int = 448, use_thumbnail: bool = True) -> List[Image.Image]: + orig_width, orig_height = image.size + aspect_ratio = orig_width / orig_height + + target_ratios = set( + (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if + i * j <= max_num and i * j >= min_num) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + target_aspect_ratio = self._find_closest_aspect_ratio( + aspect_ratio, target_ratios, orig_width, orig_height, image_size) + + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + + resized_img = image.resize((target_width, target_height)) + processed_images: List[Image.Image] = [] + for i in range(blocks): + box = ( + (i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size + ) + split_img = resized_img.crop(box) + processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images + + def _load_image_from_source(self, 
src: str) -> Image.Image: + """Load PIL image from various sources: data URL, http(s), or local path.""" + if src.startswith("data:image/"): + # data URL base64 + header, b64data = src.split(",", 1) + img_bytes = base64.b64decode(b64data) + return Image.open(BytesIO(img_bytes)).convert('RGB') + if src.startswith("http://") or src.startswith("https://"): + resp = requests.get(src, timeout=10) + resp.raise_for_status() + return Image.open(BytesIO(resp.content)).convert('RGB') + # Assume local file path + return Image.open(src).convert('RGB') + + def _images_to_pixel_values(self, images: List[Image.Image], input_size: int = 448, max_num: int = 12): + transform = self._build_transform(input_size=input_size) + pixel_values_list = [] + num_patches_list: List[int] = [] + for img in images: + tiles = self._dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num) + pv = [transform(tile) for tile in tiles] + pv = torch.stack(pv) + num_patches_list.append(pv.shape[0]) + pixel_values_list.append(pv) + if not pixel_values_list: + return None, [] + pixel_values = torch.cat(pixel_values_list) + return pixel_values, num_patches_list + def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 128) -> str: """Generate text for the given HF-format messages. messages: [{ role, content: [{type:'text'|'image', text|image}] }] + + This implementation constructs InternVL-compatible inputs and uses + `model.chat(tokenizer, pixel_values, question, history=...)` to avoid + relying on AutoProcessor (which fails for some tokenizers). """ - assert self.model is not None and self.processor is not None - # Apply chat template and tokenize - inputs = self.processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - return_tensors="pt", - ) - # Move inputs to the same device as model - inputs = inputs.to(self.model.device) - # Generate - with torch.no_grad(): - generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens) - # Trim prompt tokens from output - generated_ids_trimmed = [ - out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) - ] - # Decode - output_text = self.processor.batch_decode( - generated_ids_trimmed, - skip_special_tokens=True, - clean_up_tokenization_spaces=False, - ) - return output_text[0] if output_text else "" + assert self.model is not None and self.tokenizer is not None + + # Build textual context and collect images and the final question + context_lines: List[str] = [] + all_images: List[Image.Image] = [] + last_user_text_parts: List[str] = [] + + for msg in messages: + role = msg.get("role", "user") + content = msg.get("content", []) + if isinstance(content, str): + content_items = [{"type": "text", "text": content}] + else: + content_items = content + + if role == "user": + # Collect text and images + parts_text: List[str] = [] + for item in content_items: + if item.get("type") == "text": + t = item.get("text", "") + if t: + parts_text.append(t) + elif item.get("type") == "image": + url = item.get("image", "") + if url: + try: + all_images.append(self._load_image_from_source(url)) + except Exception: + # Ignore failed image loads but keep going + pass + text = "\n".join(parts_text).strip() + if text: + context_lines.append(f"User: {text}") + # Track last user text separately for question + last_user_text_parts = parts_text or last_user_text_parts + elif role == "assistant": + # Only keep text content for history + parts_text = [item.get("text", "") for item 
in content_items if item.get("type") == "text"]
+                text = "\n".join(parts_text).strip()
+                if text:
+                    context_lines.append(f"Assistant: {text}")
+
+        # Prepare pixel values for all collected images (across turns)
+        pixel_values = None
+        num_patches_list: List[int] = []
+        if all_images:
+            pixel_values, num_patches_list = self._images_to_pixel_values(all_images, input_size=448, max_num=12)
+            if pixel_values is not None:
+                # Convert dtype/device as in docs
+                pixel_values = pixel_values.to(torch.bfloat16)
+                # Chat API expects tensors on CUDA when model is on CUDA
+                try:
+                    pixel_values = pixel_values.to(self.model.device)
+                except Exception:
+                    pass
+
+        # Build question with any prior context and numbered image placeholders
+        if all_images:
+            # Separate images layout: Image-1: ... then question text
+            prefix_lines = [f"Image-{i+1}: <image>" for i in range(len(all_images))]
+            prefix = "\n".join(prefix_lines) + "\n"
+        else:
+            prefix = ""
+
+        last_user_text = "\n".join(last_user_text_parts).strip()
+        # Combine prior text-only turns as context to emulate multi-turn
+        context_text = "\n".join(context_lines[:-1]) if len(context_lines) > 1 else ""
+        base_question = last_user_text if last_user_text else "Describe the image(s) in detail."
+        if context_text:
+            question = (context_text + "\n" + prefix + base_question).strip()
+        else:
+            question = (prefix + base_question).strip()
+
+        # Generation config
+        generation_config = dict(max_new_tokens=max_new_tokens, do_sample=False)
+
+        # Call InternVL chat
+        try:
+            if pixel_values is None:
+                # Pure-text conversation (embed prior turns in question)
+                response = self.model.chat(self.tokenizer, None, question, generation_config)
+            else:
+                # Multi-image: pass num_patches_list if >1 image
+                if len(num_patches_list) > 1:
+                    response = self.model.chat(
+                        self.tokenizer,
+                        pixel_values,
+                        question,
+                        generation_config,
+                        num_patches_list=num_patches_list,
+                    )
+                else:
+                    response = self.model.chat(self.tokenizer, pixel_values, question, generation_config)
+        except Exception as e:
+            # Fallback: return empty string to avoid crashing the adapter
+            return ""
+
+        return response or ""
diff --git a/libs/python/agent/agent/loops/internvl.py b/libs/python/agent/agent/loops/internvl.py
index d1b8c3fe..a857ffe3 100644
--- a/libs/python/agent/agent/loops/internvl.py
+++ b/libs/python/agent/agent/loops/internvl.py
@@ -26,9 +26,13 @@ from .composed_grounded import ComposedGroundedConfig
 from ..types import AgentCapability
 
 
-# Regex patterns matching ScreenSpot baseline extractors
-_POINT_PATTERN = re.compile(r"\[\[(\d+),(\d+)\]\]")
-_BBOX_PATTERN = re.compile(r"\[\[(\d+),(\d+),(\d+),(\d+)\]\]")
+# Regex patterns for extracting coordinates
+# Accept optional whitespace and optional decimal fractions
+_NUM = r"(\d+(?:\.\d+)?)"
+_POINT_PATTERN = re.compile(r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]")
+_BBOX_PATTERN = re.compile(
+    r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]"
+)
 
 
 def _extract_first_point(text: str) -> Optional[Tuple[float, float]]:
@@ -160,6 +164,8 @@ class InternVLConfig(ComposedGroundedConfig):
         response = await litellm.acompletion(**api_kwargs)
         output_text = (response.choices[0].message.content or "").strip()  # type: ignore
 
+        print(f"InternVL output: {output_text}")
+
         # Try to parse a point first; if absent, parse bbox and take center
         point = _extract_first_point(output_text)
         if point is None:
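Editor's note: as a quick sanity check on the relaxed coordinate extractors in the final patch, the standalone sketch below re-declares the same `_NUM`, `_POINT_PATTERN`, and `_BBOX_PATTERN` definitions from `loops/internvl.py` and runs them over a few made-up model outputs; it is illustrative only and not part of the change.

```python
import re

# Same pattern definitions as the patched loops/internvl.py:
# accept optional whitespace and optional decimal fractions inside [[...]].
_NUM = r"(\d+(?:\.\d+)?)"
_POINT_PATTERN = re.compile(r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]")
_BBOX_PATTERN = re.compile(
    r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]"
)

samples = [
    "[[512,384]]",           # old integer form still matches
    "[[ 512.5 , 384.0 ]]",   # new: whitespace and decimals
    "[[10, 20, 110, 220]]",  # bbox form; callers take the center point
]

for text in samples:
    bbox = _BBOX_PATTERN.search(text)
    point = _POINT_PATTERN.search(text)
    if bbox:
        x1, y1, x2, y2 = map(float, bbox.groups())
        print(text, "-> bbox center", ((x1 + x2) / 2, (y1 + y2) / 2))
    elif point:
        print(text, "-> point", tuple(map(float, point.groups())))
```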