From 3c03ea51c9a15a6b26c131ea8be2217d4a673afe Mon Sep 17 00:00:00 2001
From: Tamoghno Kandar <55907205+tamoghnokandar@users.noreply.github.com>
Date: Mon, 10 Nov 2025 11:12:17 -0800
Subject: [PATCH 1/4] Add files via upload

---
 libs/python/agent/agent/loops/gelato.py | 188 ++++++++++++++++++++++++
 1 file changed, 188 insertions(+)
 create mode 100644 libs/python/agent/agent/loops/gelato.py

diff --git a/libs/python/agent/agent/loops/gelato.py b/libs/python/agent/agent/loops/gelato.py
new file mode 100644
index 00000000..c66475f8
--- /dev/null
+++ b/libs/python/agent/agent/loops/gelato.py
@@ -0,0 +1,188 @@
+"""
+Gelato agent loop implementation for click prediction using litellm.acompletion
+Model: https://huggingface.co/mlfoundations/Gelato-30B-A3B
+Code: https://github.com/mlfoundations/Gelato/tree/main
+"""
+
+import asyncio
+import base64
+import json
+import math
+import re
+import uuid
+from io import BytesIO
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
+import litellm
+from PIL import Image
+
+from ..decorators import register_agent
+from ..loops.base import AsyncAgentConfig
+from ..types import AgentCapability, AgentResponse, Messages, Tools
+
+SYSTEM_PROMPT = '''
+You are an expert UI element locator. Given a GUI image and a user's element description, provide the coordinates of the specified element as a single (x,y) point. For elements with area, return the center point.
+
+Output the coordinate pair exactly:
+(x,y)
+'''
+
+
+def extract_coordinates(raw_string):
+    """
+    Extract the coordinates from the raw string.
+    Args:
+        raw_string: str (e.g. "(100, 200)")
+    Returns:
+        x: float (e.g. 100.0)
+        y: float (e.g. 200.0)
+    """
+    try:
+        matches = re.findall(r"\((-?\d*\.?\d+),\s*(-?\d*\.?\d+)\)", raw_string)
+        return [tuple(map(int, match)) for match in matches][0]
+    except:
+        return 0,0
+
+
+
+def smart_resize(
+    height: int, width: int, factor: int = 28, min_pixels: int = 3136, max_pixels: int = 8847360
+) -> Tuple[int, int]:
+    """Smart resize function similar to qwen_vl_utils."""
+    # Calculate the total pixels
+    total_pixels = height * width
+
+    # If already within bounds, return original dimensions
+    if min_pixels <= total_pixels <= max_pixels:
+        # Round to nearest factor
+        new_height = (height // factor) * factor
+        new_width = (width // factor) * factor
+        return new_height, new_width
+
+    # Calculate scaling factor
+    if total_pixels > max_pixels:
+        scale = (max_pixels / total_pixels) ** 0.5
+    else:
+        scale = (min_pixels / total_pixels) ** 0.5
+
+    # Apply scaling
+    new_height = int(height * scale)
+    new_width = int(width * scale)
+
+    # Round to nearest factor
+    new_height = (new_height // factor) * factor
+    new_width = (new_width // factor) * factor
+
+    # Ensure minimum size
+    new_height = max(new_height, factor)
+    new_width = max(new_width, factor)
+
+    return new_height, new_width
+
+
+@register_agent(models=r".*Gelato.*")
+class GelatoConfig(AsyncAgentConfig):
+    """Gelato agent configuration implementing AsyncAgentConfig protocol for click prediction."""
+
+    def __init__(self):
+        self.current_model = None
+        self.last_screenshot_b64 = None
+
+    async def predict_step(
+        self,
+        messages: List[Dict[str, Any]],
+        model: str,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        max_retries: Optional[int] = None,
+        stream: bool = False,
+        computer_handler=None,
+        _on_api_start=None,
+        _on_api_end=None,
+        _on_usage=None,
+        _on_screenshot=None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        raise NotImplementedError()
+
+    async def predict_click(
+        self, model: str, image_b64: str, instruction: str, **kwargs
+    ) -> Optional[Tuple[float, float]]:
+        """
+        Predict click coordinates using UI-Ins model via litellm.acompletion.
+
+        Args:
+            model: The UI-Ins model name
+            image_b64: Base64 encoded image
+            instruction: Instruction for where to click
+
+        Returns:
+            Tuple of (x, y) coordinates or None if prediction fails
+        """
+        # Decode base64 image
+        image_data = base64.b64decode(image_b64)
+        image = Image.open(BytesIO(image_data))
+        width, height = image.width, image.height
+
+        # Smart resize the image (similar to qwen_vl_utils)
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=28,  # Default factor for Qwen models
+            min_pixels=3136,
+            max_pixels=4096 * 2160,
+        )
+        resized_image = image.resize((resized_width, resized_height))
+        scale_x, scale_y = width / resized_width, height / resized_height
+
+        # Convert resized image back to base64
+        buffered = BytesIO()
+        resized_image.save(buffered, format="PNG")
+        resized_image_b64 = base64.b64encode(buffered.getvalue()).decode()
+
+        # Prepare system and user messages
+        system_message = {
+            "role": "system",
+            "content":  [
+            {
+                "type": "text",
+                "text": SYSTEM_PROMPT.strip()
+            }
+            ],
+        }
+
+        user_message = {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{resized_image_b64}"},
+                },
+                {"type": "text", "text": instruction},
+            ],
+        }
+
+        # Prepare API call kwargs
+        api_kwargs = {
+            "model": model,
+            "messages": [system_message, user_message],
+            "max_tokens": 2056,
+            "temperature": 0.0,
+            **kwargs,
+        }
+
+        # Use liteLLM acompletion
+        response = await litellm.acompletion(**api_kwargs)
+
+        # Extract response text
+        output_text = response.choices[0].message.content  # type: ignore
+
+        # Extract and rescale coordinates
+        pred_x, pred_y = extract_coordinates(output_text)  # type: ignore
+        pred_x *= scale_x
+        pred_y *= scale_y
+
+        return (math.floor(pred_x), math.floor(pred_y))
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        """Return the capabilities supported by this agent."""
+        return ["click"]
\ No newline at end of file

From 2f5f887b3d4123ed51a88835c06c837b72fdf63f Mon Sep 17 00:00:00 2001
From: Tamoghno Kandar <55907205+tamoghnokandar@users.noreply.github.com>
Date: Mon, 10 Nov 2025 11:13:05 -0800
Subject: [PATCH 2/4] Update __init__.py

---
 libs/python/agent/agent/loops/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libs/python/agent/agent/loops/__init__.py b/libs/python/agent/agent/loops/__init__.py
index ab23ac27..5006c102 100644
--- a/libs/python/agent/agent/loops/__init__.py
+++ b/libs/python/agent/agent/loops/__init__.py
@@ -17,6 +17,7 @@ from . import (
     opencua,
     qwen,
     uitars,
+    gelato,
 )
 
 __all__ = [
@@ -33,4 +34,5 @@ __all__ = [
     "moondream3",
     "gemini",
     "qwen",
+    "gelato",
 ]

From ef842cf1e6b540e1bbe82e247746a223c72a1f2d Mon Sep 17 00:00:00 2001
From: Tamoghno Kandar <55907205+tamoghnokandar@users.noreply.github.com>
Date: Mon, 10 Nov 2025 12:19:01 -0800
Subject: [PATCH 3/4] Add files via upload

---
 libs/python/agent/agent/loops/gelato.py | 29 ++++++++++---------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/libs/python/agent/agent/loops/gelato.py b/libs/python/agent/agent/loops/gelato.py
index c66475f8..e3032472 100644
--- a/libs/python/agent/agent/loops/gelato.py
+++ b/libs/python/agent/agent/loops/gelato.py
@@ -4,28 +4,25 @@ Model: https://huggingface.co/mlfoundations/Gelato-30B-A3B
 Code: https://github.com/mlfoundations/Gelato/tree/main
 """
 
-import asyncio
 import base64
-import json
 import math
 import re
-import uuid
 from io import BytesIO
-from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple
 
 import litellm
 from PIL import Image
 
 from ..decorators import register_agent
 from ..loops.base import AsyncAgentConfig
-from ..types import AgentCapability, AgentResponse, Messages, Tools
+from ..types import AgentCapability
 
-SYSTEM_PROMPT = '''
+SYSTEM_PROMPT = """
 You are an expert UI element locator. Given a GUI image and a user's element description, provide the coordinates of the specified element as a single (x,y) point. For elements with area, return the center point.
 
 Output the coordinate pair exactly:
 (x,y)
-'''
+"""
 
 
 def extract_coordinates(raw_string):
@@ -41,12 +38,15 @@ def extract_coordinates(raw_string):
         matches = re.findall(r"\((-?\d*\.?\d+),\s*(-?\d*\.?\d+)\)", raw_string)
         return [tuple(map(int, match)) for match in matches][0]
     except:
-        return 0,0
-
+        return 0, 0
 
 
 def smart_resize(
-    height: int, width: int, factor: int = 28, min_pixels: int = 3136, max_pixels: int = 8847360
+    height: int,
+    width: int,
+    factor: int = 28,
+    min_pixels: int = 3136,
+    max_pixels: int = 8847360,
 ) -> Tuple[int, int]:
     """Smart resize function similar to qwen_vl_utils."""
     # Calculate the total pixels
@@ -142,12 +142,7 @@ class GelatoConfig(AsyncAgentConfig):
         # Prepare system and user messages
         system_message = {
             "role": "system",
-            "content":  [
-            {
-                "type": "text",
-                "text": SYSTEM_PROMPT.strip()
-            }
-            ],
+            "content": [{"type": "text", "text": SYSTEM_PROMPT.strip()}],
         }
 
         user_message = {
@@ -185,4 +180,4 @@ class GelatoConfig(AsyncAgentConfig):
 
     def get_capabilities(self) -> List[AgentCapability]:
         """Return the capabilities supported by this agent."""
-        return ["click"]
\ No newline at end of file
+        return ["click"]

From a1c394bcc2fb8981a5d0c08abbec17e69cf4a64e Mon Sep 17 00:00:00 2001
From: Tamoghno Kandar <55907205+tamoghnokandar@users.noreply.github.com>
Date: Mon, 10 Nov 2025 12:20:13 -0800
Subject: [PATCH 4/4] Add files via upload

---
 libs/python/agent/agent/loops/__init__.py | 76 +++++++++++------------
 1 file changed, 38 insertions(+), 38 deletions(-)

diff --git a/libs/python/agent/agent/loops/__init__.py b/libs/python/agent/agent/loops/__init__.py
index 5006c102..88535e89 100644
--- a/libs/python/agent/agent/loops/__init__.py
+++ b/libs/python/agent/agent/loops/__init__.py
@@ -1,38 +1,38 @@
-"""
-Agent loops for agent
-"""
-
-# Import the loops to register them
-from . import (
-    anthropic,
-    composed_grounded,
-    gemini,
-    glm45v,
-    gta1,
-    holo,
-    internvl,
-    moondream3,
-    omniparser,
-    openai,
-    opencua,
-    qwen,
-    uitars,
-    gelato,
-)
-
-__all__ = [
-    "anthropic",
-    "openai",
-    "uitars",
-    "omniparser",
-    "gta1",
-    "composed_grounded",
-    "glm45v",
-    "opencua",
-    "internvl",
-    "holo",
-    "moondream3",
-    "gemini",
-    "qwen",
-    "gelato",
-]
+"""
+Agent loops for agent
+"""
+
+# Import the loops to register them
+from . import (
+    anthropic,
+    composed_grounded,
+    gelato,
+    gemini,
+    glm45v,
+    gta1,
+    holo,
+    internvl,
+    moondream3,
+    omniparser,
+    openai,
+    opencua,
+    qwen,
+    uitars,
+)
+
+__all__ = [
+    "anthropic",
+    "openai",
+    "uitars",
+    "omniparser",
+    "gta1",
+    "composed_grounded",
+    "glm45v",
+    "opencua",
+    "internvl",
+    "holo",
+    "moondream3",
+    "gemini",
+    "qwen",
+    "gelato",
+]