Added InternVL

Dillon DuPont
2025-09-12 13:30:09 -04:00
parent 84e2a27aea
commit 58807378dd
4 changed files with 200 additions and 5 deletions

View File

@@ -11,6 +11,7 @@ from . import gta1
 from . import composed_grounded
 from . import glm45v
 from . import opencua
+from . import internvl
 
 __all__ = [
     "anthropic",
@@ -20,5 +21,6 @@ __all__ = [
     "gta1",
     "composed_grounded",
     "glm45v",
-    "opencua"
+    "opencua",
+    "internvl"
 ]
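Importing each loop module in this __init__.py is what fires its @register_agent decorator and adds the config class to the model-routing registry, so a model name matching r"(?i).*InternVL.*" can be resolved to InternVLConfig. A minimal sketch of how such a decorator-based registry could work; _AGENT_REGISTRY and find_agent_config are illustrative assumptions, not the project's actual internals:

import re
from typing import Callable, List, Optional, Tuple, Type

# Hypothetical registry of (pattern, priority, config class) entries.
_AGENT_REGISTRY: List[Tuple[re.Pattern, int, Type]] = []

def register_agent(models: str, priority: int = 0) -> Callable[[Type], Type]:
    """Class decorator: record a config class under a model-name regex."""
    def decorator(cls: Type) -> Type:
        _AGENT_REGISTRY.append((re.compile(models), priority, cls))
        return cls
    return decorator

def find_agent_config(model: str) -> Optional[Type]:
    """Pick the highest-priority registered config whose regex matches."""
    matches = [(prio, cls) for pattern, prio, cls in _AGENT_REGISTRY if pattern.search(model)]
    return max(matches, key=lambda m: m[0])[1] if matches else None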

View File

@@ -116,7 +116,7 @@ def get_last_computer_call_image(messages: List[Dict[str, Any]]) -> Optional[str]:
 
 @register_agent(r".*\+.*", priority=1)
-class ComposedGroundedConfig:
+class ComposedGroundedConfig(AsyncAgentConfig):
     """
     Composed-grounded agent configuration that uses both grounding and thinking models.
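ComposedGroundedConfig is registered for any model string containing a "+" (pattern r".*\+.*", priority=1), which is what the predict_step overrides later in this commit rely on when they delegate with f"{model}+{model}". A small sketch of that routing, assuming the composed config splits the string on "+" into a grounding model and a thinking model:

import re

_COMPOSED = re.compile(r".*\+.*")

def describe_route(model: str) -> str:
    # "InternVL2-8B+InternVL2-8B" matches the composed pattern; assumption:
    # the string is split on "+" into grounding/thinking halves.
    if _COMPOSED.match(model):
        grounding, thinking = model.split("+", 1)
        return f"composed: grounding={grounding}, thinking={thinking}"
    return "single-model loop"

print(describe_route("InternVL2-8B+InternVL2-8B"))
# composed: grounding=InternVL2-8B, thinking=InternVL2-8B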

View File

@@ -0,0 +1,179 @@
"""
InternVL agent loop implementation for click prediction using litellm.acompletion.

Implements the ScreenSpot InternVL grounding baseline behavior:
- Uses the exact grounding prompt format with <image> and <ref> tags
- Expects coordinates in the 0-1000 normalized range, in the format [[x1,y1,x2,y2]] or [[x,y]]
- Converts them to pixel coordinates relative to the original screenshot size

Note: We do NOT manually load the InternVL model; acompletion (via HuggingFaceLocalAdapter)
will handle loading based on the provided model name.
"""

from __future__ import annotations

import base64
import math
import re
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple

import litellm
from PIL import Image

from ..decorators import register_agent
from ..types import AgentCapability
from .composed_grounded import ComposedGroundedConfig

# Regex patterns matching the ScreenSpot baseline extractors. \s* tolerates
# optional whitespace after commas, since the prompt asks for the
# "[[x1, y1, x2, y2]]" format with spaces.
_POINT_PATTERN = re.compile(r"\[\[(\d+),\s*(\d+)\]\]")
_BBOX_PATTERN = re.compile(r"\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]")


def _extract_first_point(text: str) -> Optional[Tuple[float, float]]:
    """Extract the first [[x,y]] as normalized (0-1000) floats."""
    m = _POINT_PATTERN.search(text)
    if not m:
        return None
    try:
        x = float(m.group(1))
        y = float(m.group(2))
        return x, y
    except Exception:
        return None


def _extract_last_bbox(text: str) -> Optional[Tuple[float, float, float, float]]:
    """Extract the last [[x1,y1,x2,y2]] as normalized (0-1000) floats."""
    matches = list(_BBOX_PATTERN.finditer(text))
    if not matches:
        return None
    m = matches[-1]
    try:
        x1 = float(m.group(1))
        y1 = float(m.group(2))
        x2 = float(m.group(3))
        y2 = float(m.group(4))
        return x1, y1, x2, y2
    except Exception:
        return None


def _scale_norm_to_pixels(x_norm: float, y_norm: float, width: int, height: int) -> Tuple[int, int]:
    """Scale 0-1000 normalized coordinates to pixel coordinates for the given image size."""
    x_px = int(math.floor((x_norm / 1000.0) * width))
    y_px = int(math.floor((y_norm / 1000.0) * height))
    # Clamp to image bounds just in case
    x_px = max(0, min(width - 1, x_px))
    y_px = max(0, min(height - 1, y_px))
    return x_px, y_px


@register_agent(models=r"(?i).*InternVL.*")
class InternVLConfig(ComposedGroundedConfig):
    """InternVL agent configuration that reuses ComposedGroundedConfig for steps and
    overrides predict_click to implement the ScreenSpot InternVL grounding baseline."""

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs
    ) -> Dict[str, Any]:
        """Fall back to a self-composed model (the same model serves as both halves)."""
        return await super().predict_step(
            messages=messages,
            model=f"{model}+{model}",
            tools=tools,
            max_retries=max_retries,
            stream=stream,
            computer_handler=computer_handler,
            _on_api_start=_on_api_start,
            _on_api_end=_on_api_end,
            _on_usage=_on_usage,
            _on_screenshot=_on_screenshot,
            **kwargs
        )

    async def predict_click(
        self,
        model: str,
        image_b64: str,
        instruction: str,
        **kwargs
    ) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates using InternVL via litellm.acompletion.

        Behavior mirrors the ScreenSpot InternVL baseline:
        - Prompt: "<image>\nPlease provide the bounding box coordinate of the UI element this user instruction describes: <ref>{instruction}</ref>. Answer in the format of [[x1, y1, x2, y2]]"
        - Parse either a [[x,y]] point or a [[x1,y1,x2,y2]] bbox, using the bbox center if no point is present
        - Coordinates are 0-1000 normalized; convert them to pixel coordinates for the original screenshot
        """
        try:
            # Decode image dimensions to scale the normalized outputs
            img_bytes = base64.b64decode(image_b64)
            image = Image.open(BytesIO(img_bytes))
            width, height = image.size
        except Exception:
            # If decoding fails, proceed with a safe default size to avoid crashing
            width, height = 1920, 1080

        # Build the grounding prompt like the baseline; the <image> tag is supplied
        # by the image_url content part below rather than inlined in the text.
        grounding_prompt = (
            f"Please provide the bounding box coordinate of the UI element this user instruction describes: <ref>{instruction}</ref>. "
            f"Answer in the format of [[x1, y1, x2, y2]]"
        )

        # Prepare messages for LiteLLM
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                    },
                    {"type": "text", "text": grounding_prompt},
                ],
            }
        ]

        # Call acompletion; HuggingFaceLocalAdapter / the model handler will handle InternVL loading
        api_kwargs = {
            "model": model,
            "messages": messages,
            # Conservative, deterministic generation params akin to the baseline
            "max_tokens": kwargs.get("max_tokens", 256),
            "temperature": kwargs.get("temperature", 0.0),
        }
        response = await litellm.acompletion(**api_kwargs)
        output_text = (response.choices[0].message.content or "").strip()  # type: ignore

        # Try to parse a point first; if absent, parse a bbox and take its center
        point = _extract_first_point(output_text)
        if point is None:
            bbox = _extract_last_bbox(output_text)
            if bbox is None:
                return None
            x1, y1, x2, y2 = bbox
            cx = (x1 + x2) / 2.0
            cy = (y1 + y2) / 2.0
            point = (cx, cy)

        x_norm, y_norm = point
        x_px, y_px = _scale_norm_to_pixels(x_norm, y_norm, width, height)
        return (x_px, y_px)

    def get_capabilities(self) -> List[AgentCapability]:
        return ["click", "step"]

View File

@@ -14,6 +14,7 @@ from PIL import Image
 import litellm
 import math
+from .composed_grounded import ComposedGroundedConfig
 from ..decorators import register_agent
 from ..types import Messages, AgentResponse, Tools, AgentCapability
 from ..loops.base import AsyncAgentConfig
 
@@ -32,10 +33,11 @@ def extract_coordinates_from_pyautogui(text: str) -> Optional[Tuple[int, int]]:
     return None
 
 
 @register_agent(models=r"(?i).*OpenCUA.*")
-class OpenCUAConfig(AsyncAgentConfig):
+class OpenCUAConfig(ComposedGroundedConfig):
     """OpenCUA agent configuration implementing AsyncAgentConfig protocol for click prediction."""
 
     def __init__(self):
+        super().__init__()
         self.current_model = None
         self.last_screenshot_b64 = None
@@ -53,8 +55,20 @@ class OpenCUAConfig(AsyncAgentConfig):
         _on_screenshot=None,
         **kwargs
     ) -> Dict[str, Any]:
-        """Predict step is not implemented for OpenCUA model."""
-        raise NotImplementedError("predict_step is not implemented for OpenCUA model")
+        """Fall back to a self-composed model"""
+        return await super().predict_step(
+            messages=messages,
+            model=f"{model}+{model}",
+            tools=tools,
+            max_retries=max_retries,
+            stream=stream,
+            computer_handler=computer_handler,
+            _on_api_start=_on_api_start,
+            _on_api_end=_on_api_end,
+            _on_usage=_on_usage,
+            _on_screenshot=_on_screenshot,
+            **kwargs
+        )
 
     async def predict_click(
         self,