From 58807378dddcf99221fd54b264eb180984efd88c Mon Sep 17 00:00:00 2001
From: Dillon DuPont
Date: Fri, 12 Sep 2025 13:30:09 -0400
Subject: [PATCH] Added internVL

---
 libs/python/agent/agent/loops/__init__.py     |   4 +-
 .../agent/agent/loops/composed_grounded.py    |   2 +-
 libs/python/agent/agent/loops/internvl.py     | 179 ++++++++++++++++++
 libs/python/agent/agent/loops/opencua.py      |  20 +-
 4 files changed, 200 insertions(+), 5 deletions(-)
 create mode 100644 libs/python/agent/agent/loops/internvl.py

diff --git a/libs/python/agent/agent/loops/__init__.py b/libs/python/agent/agent/loops/__init__.py
index 25227e64..958e484c 100644
--- a/libs/python/agent/agent/loops/__init__.py
+++ b/libs/python/agent/agent/loops/__init__.py
@@ -11,6 +11,7 @@ from . import gta1
 from . import composed_grounded
 from . import glm45v
 from . import opencua
+from . import internvl
 
 __all__ = [
     "anthropic",
@@ -20,5 +21,6 @@ __all__ = [
     "gta1",
     "composed_grounded",
     "glm45v",
-    "opencua"
+    "opencua",
+    "internvl"
 ]
\ No newline at end of file
diff --git a/libs/python/agent/agent/loops/composed_grounded.py b/libs/python/agent/agent/loops/composed_grounded.py
index cf029d13..87ba50e1 100644
--- a/libs/python/agent/agent/loops/composed_grounded.py
+++ b/libs/python/agent/agent/loops/composed_grounded.py
@@ -116,7 +116,7 @@ def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str]:
 
 
 @register_agent(r".*\+.*", priority=1)
-class ComposedGroundedConfig:
+class ComposedGroundedConfig(AsyncAgentConfig):
     """
     Composed-grounded agent configuration that uses both grounding and thinking models.
 
diff --git a/libs/python/agent/agent/loops/internvl.py b/libs/python/agent/agent/loops/internvl.py
new file mode 100644
index 00000000..d1b8c3fe
--- /dev/null
+++ b/libs/python/agent/agent/loops/internvl.py
@@ -0,0 +1,179 @@
+"""
+InternVL agent loop implementation for click prediction using litellm.acompletion.
+
+Implements the ScreenSpot InternVL grounding baseline behavior:
+- Uses the exact grounding prompt format with <image> and <box> tags
+- Expects coordinates in the 0-1000 normalized range, in the format [[x1,y1,x2,y2]] or [[x,y]]
+- Converts them to pixel coordinates relative to the original screenshot size
+
+Note: We do NOT manually load the InternVL model; acompletion (via HuggingFaceLocalAdapter)
+will handle loading based on the provided model name.
+"""
+
+from __future__ import annotations
+
+import base64
+import math
+import re
+from io import BytesIO
+from typing import Any, Dict, List, Optional, Tuple
+
+from PIL import Image
+import litellm
+
+from ..decorators import register_agent
+from .composed_grounded import ComposedGroundedConfig
+from ..types import AgentCapability
+
+
+# Regex patterns matching the ScreenSpot baseline extractors; \s* tolerates the
+# spaces invited by the prompt's "[[x1, y1, x2, y2]]" answer format
+_POINT_PATTERN = re.compile(r"\[\[(\d+),\s*(\d+)\]\]")
+_BBOX_PATTERN = re.compile(r"\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]")
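+# Illustrative matches (example strings, not captured model output):
+#   "[[512,340]]"            -> _POINT_PATTERN -> ("512", "340")
+#   "[[100, 200, 300, 400]]" -> _BBOX_PATTERN  -> ("100", "200", "300", "400")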
+""" + +from __future__ import annotations + +import base64 +import math +import re +from io import BytesIO +from typing import Any, Dict, List, Optional, Tuple + +from PIL import Image +import litellm + +from ..decorators import register_agent +from .composed_grounded import ComposedGroundedConfig +from ..types import AgentCapability + + +# Regex patterns matching ScreenSpot baseline extractors +_POINT_PATTERN = re.compile(r"\[\[(\d+),(\d+)\]\]") +_BBOX_PATTERN = re.compile(r"\[\[(\d+),(\d+),(\d+),(\d+)\]\]") + + +def _extract_first_point(text: str) -> Optional[Tuple[float, float]]: + """Extract the first [[x,y]] as normalized (0-1000) floats.""" + m = _POINT_PATTERN.search(text) + if not m: + return None + try: + x = float(m.group(1)) + y = float(m.group(2)) + return x, y + except Exception: + return None + + +def _extract_last_bbox(text: str) -> Optional[Tuple[float, float, float, float]]: + """Extract the last [[x1,y1,x2,y2]] as normalized (0-1000) floats.""" + matches = list(_BBOX_PATTERN.finditer(text)) + if not matches: + return None + m = matches[-1] + try: + x1 = float(m.group(1)) + y1 = float(m.group(2)) + x2 = float(m.group(3)) + y2 = float(m.group(4)) + return x1, y1, x2, y2 + except Exception: + return None + + +def _scale_norm_to_pixels(x_norm: float, y_norm: float, width: int, height: int) -> Tuple[int, int]: + """Scale 0-1000 normalized coordinates to pixel coordinates for given image size.""" + x_px = int(math.floor((x_norm / 1000.0) * width)) + y_px = int(math.floor((y_norm / 1000.0) * height)) + # Clamp to image bounds just in case + x_px = max(0, min(width - 1, x_px)) + y_px = max(0, min(height - 1, y_px)) + return x_px, y_px + + +@register_agent(models=r"(?i).*InternVL.*") +class InternVLConfig(ComposedGroundedConfig): + """InternVL agent configuration reusing ComposedGroundedConfig for steps and + overriding predict_click to implement ScreenSpot InternVL grounding baseline.""" + + async def predict_step( + self, + messages: List[Dict[str, Any]], + model: str, + tools: Optional[List[Dict[str, Any]]] = None, + max_retries: Optional[int] = None, + stream: bool = False, + computer_handler=None, + _on_api_start=None, + _on_api_end=None, + _on_usage=None, + _on_screenshot=None, + **kwargs + ) -> Dict[str, Any]: + """Fallback to a self-composed model""" + return await super().predict_step( + messages=messages, + model=f"{model}+{model}", + tools=tools, + max_retries=max_retries, + stream=stream, + computer_handler=computer_handler, + _on_api_start=_on_api_start, + _on_api_end=_on_api_end, + _on_usage=_on_usage, + _on_screenshot=_on_screenshot, + **kwargs + ) + + async def predict_click( + self, + model: str, + image_b64: str, + instruction: str, + **kwargs + ) -> Optional[Tuple[int, int]]: + """ + Predict click coordinates using InternVL via litellm.acompletion. + + Behavior mirrors the ScreenSpot InternVL baseline: + - Prompt: "\nPlease provide the bounding box coordinate of the UI element this user instruction describes: {instruction}. 
+
+        # Prepare messages for LiteLLM
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                    },
+                    {"type": "text", "text": grounding_prompt},
+                ],
+            }
+        ]
+
+        # Call acompletion; HuggingFaceLocalAdapter/model handler will handle InternVL loading
+        api_kwargs = {
+            "model": model,
+            "messages": messages,
+            # Conservative, deterministic generation params akin to the baseline
+            "max_tokens": kwargs.get("max_tokens", 256),
+            "temperature": kwargs.get("temperature", 0.0),
+        }
+
+        response = await litellm.acompletion(**api_kwargs)
+        output_text = (response.choices[0].message.content or "").strip()  # type: ignore
+
+        # Try to parse a point first; if absent, parse a bbox and take its center
+        point = _extract_first_point(output_text)
+        if point is None:
+            bbox = _extract_last_bbox(output_text)
+            if bbox is None:
+                return None
+            x1, y1, x2, y2 = bbox
+            cx = (x1 + x2) / 2.0
+            cy = (y1 + y2) / 2.0
+            point = (cx, cy)
+
+        x_norm, y_norm = point
+        x_px, y_px = _scale_norm_to_pixels(x_norm, y_norm, width, height)
+        return (x_px, y_px)
+
+    def get_capabilities(self) -> List[AgentCapability]:
+        return ["click", "step"]
diff --git a/libs/python/agent/agent/loops/opencua.py b/libs/python/agent/agent/loops/opencua.py
index 1688b587..b06ea126 100644
--- a/libs/python/agent/agent/loops/opencua.py
+++ b/libs/python/agent/agent/loops/opencua.py
@@ -14,6 +14,7 @@ from PIL import Image
 import litellm
 import math
 
+from .composed_grounded import ComposedGroundedConfig
 from ..decorators import register_agent
 from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..loops.base import AsyncAgentConfig
@@ -32,10 +33,11 @@ def extract_coordinates_from_pyautogui(text: str) -> Optional[Tuple[int, int]]:
     return None
 
 @register_agent(models=r"(?i).*OpenCUA.*")
-class OpenCUAConfig(AsyncAgentConfig):
+class OpenCUAConfig(ComposedGroundedConfig):
     """OpenCUA agent configuration implementing the AsyncAgentConfig protocol for click prediction."""
 
     def __init__(self):
+        super().__init__()
         self.current_model = None
         self.last_screenshot_b64 = None
 
@@ -53,8 +55,20 @@ class OpenCUAConfig(AsyncAgentConfig):
         _on_screenshot=None,
         **kwargs
     ) -> Dict[str, Any]:
-        """Predict step is not implemented for OpenCUA model."""
-        raise NotImplementedError("predict_step is not implemented for OpenCUA model")
+        """Fall back to a self-composed model."""
+        return await super().predict_step(
+            messages=messages,
+            model=f"{model}+{model}",
+            tools=tools,
+            max_retries=max_retries,
+            stream=stream,
+            computer_handler=computer_handler,
+            _on_api_start=_on_api_start,
+            _on_api_end=_on_api_end,
+            _on_usage=_on_usage,
+            _on_screenshot=_on_screenshot,
+            **kwargs
+        )
 
     async def predict_click(
         self,