Merge branch 'main' into feat/generic-vlm-provider
@@ -51,7 +51,7 @@ async def main():
     # Create agent
     agent = ComputerAgent(
-        model="anthropic/claude-3-5-sonnet-20241022",
+        model="anthropic/claude-sonnet-4-5-20250929",
         tools=[computer],
         only_n_most_recent_images=3,
         trajectory_dir="trajectories",
 
@@ -189,7 +189,7 @@ class ComputerAgent:
         Initialize ComputerAgent.
 
         Args:
-            model: Model name (e.g., "claude-3-5-sonnet-20241022", "computer-use-preview", "omni+vertex_ai/gemini-pro")
+            model: Model name (e.g., "claude-sonnet-4-5-20250929", "computer-use-preview", "omni+vertex_ai/gemini-pro")
            tools: List of tools (computer objects, decorated functions, etc.)
            custom_loop: Custom agent loop function to use instead of auto-selection
            only_n_most_recent_images: If set, only keep the N most recent images in message history. Adds ImageRetentionCallback automatically.
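
For orientation, a minimal usage sketch matching the docstring above. The `Computer` construction is illustrative (its kwargs come from the repo's own examples, not this diff); `agent.run` returning an async generator is confirmed by the tests further down.

    import asyncio

    from agent import ComputerAgent
    from computer import Computer

    async def main():
        computer = Computer()  # lifecycle/constructor kwargs: see the repo's examples
        agent = ComputerAgent(
            model="anthropic/claude-sonnet-4-5-20250929",
            tools=[computer],
            only_n_most_recent_images=3,
            trajectory_dir="trajectories",
        )
        async for result in agent.run([{"role": "user", "content": "Take a screenshot"}]):
            print(result)

    asyncio.run(main())
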
@@ -7,7 +7,7 @@ Usage:
 Examples:
     python -m agent.cli openai/computer-use-preview
     python -m agent.cli anthropic/claude-sonnet-4-5-20250929
-    python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
+    python -m agent.cli omniparser+anthropic/claude-sonnet-4-5-20250929
 """
 
 try:
@@ -233,7 +233,7 @@ async def main():
 Examples:
     python -m agent.cli openai/computer-use-preview
     python -m agent.cli anthropic/claude-sonnet-4-5-20250929
-    python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
+    python -m agent.cli omniparser+anthropic/claude-sonnet-4-5-20250929
     python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
 """,
 )
@@ -671,11 +671,12 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
         # Handle custom function tools (not computer tools)
         if tool_name != "computer":
             from ..responses import make_function_call_item
-            responses_items.append(make_function_call_item(
-                function_name=tool_name,
-                arguments=tool_input,
-                call_id=call_id
-            ))
+
+            responses_items.append(
+                make_function_call_item(
+                    function_name=tool_name, arguments=tool_input, call_id=call_id
+                )
+            )
             continue
 
         # Computer tool - process actions
@@ -883,16 +884,17 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
             # Handle custom function tools
             if tool_name != "computer":
                 from ..responses import make_function_call_item
 
                 # tool_call.function.arguments is a JSON string, need to parse it
                 try:
                     args_dict = json.loads(tool_call.function.arguments)
                 except json.JSONDecodeError:
                     args_dict = {}
-                responses_items.append(make_function_call_item(
-                    function_name=tool_name,
-                    arguments=args_dict,
-                    call_id=tool_call.id
-                ))
+                responses_items.append(
+                    make_function_call_item(
+                        function_name=tool_name, arguments=args_dict, call_id=tool_call.id
+                    )
+                )
                 continue
 
             # Handle computer tool
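
A note on the parse guard kept here: OpenAI-style `tool_call.function.arguments` is a JSON string, and models sometimes emit invalid JSON, so the code falls back to `{}`. The same defensive parse in isolation (a sketch, stdlib only):

    import json

    def parse_tool_arguments(arguments: str) -> dict:
        """Return the decoded arguments dict, or {} if the JSON is malformed."""
        try:
            args = json.loads(arguments)
        except json.JSONDecodeError:
            return {}
        return args if isinstance(args, dict) else {}

    assert parse_tool_arguments('{"x": 1}') == {"x": 1}
    assert parse_tool_arguments("not json") == {}
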
@@ -20,6 +20,7 @@ from ..loops.base import AsyncAgentConfig
 from ..responses import (
     convert_completion_messages_to_responses_items,
     convert_responses_items_to_completion_messages,
+    make_reasoning_item,
 )
 from ..types import AgentCapability
@@ -373,13 +374,23 @@ class GenericVlmConfig(AsyncAgentConfig):
         if _on_usage:
             await _on_usage(usage)
 
-        # Parse tool call from text; then convert to responses items via fake tool_calls
+        # Extract response data
         resp_dict = response.model_dump()  # type: ignore
         choice = (resp_dict.get("choices") or [{}])[0]
-        content_text = ((choice.get("message") or {}).get("content")) or ""
-        tool_call = _parse_tool_call_from_text(content_text)
+        message = choice.get("message") or {}
+        content_text = message.get("content") or ""
+        tool_calls_array = message.get("tool_calls") or []
+        reasoning_text = message.get("reasoning") or ""
 
         output_items: List[Dict[str, Any]] = []
 
+        # Add reasoning if present (Ollama Cloud format)
+        if reasoning_text:
+            output_items.append(make_reasoning_item(reasoning_text))
+
+        # Priority 1: Try to parse tool call from content text (OpenRouter format)
+        tool_call = _parse_tool_call_from_text(content_text)
+
         if tool_call and isinstance(tool_call, dict):
             fn_name = tool_call.get("name") or "computer"
             raw_args = tool_call.get("arguments") or {}
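
The rewritten extraction gives the loop a clear precedence: emit a reasoning item if the message carries a `reasoning` field (Ollama Cloud), prefer a tool call parsed out of the content text (OpenRouter), then the structured `tool_calls` array, and finally fall back to plain text. A condensed sketch of that dispatch; `parse_tool_call_from_text` is a stand-in for the module's `_parse_tool_call_from_text`, whose body this diff does not show:

    import json
    import re
    from typing import Any, Dict, Optional

    def parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
        # Stand-in: pull the first {...} blob and require a "name" key.
        m = re.search(r"\{.*\}", text, re.DOTALL)
        if not m:
            return None
        try:
            obj = json.loads(m.group(0))
        except json.JSONDecodeError:
            return None
        return obj if isinstance(obj, dict) and "name" in obj else None

    message = {
        "content": '{"name": "computer", "arguments": {"action": "screenshot"}}',
        "tool_calls": [],
        "reasoning": "I should look at the screen first.",
    }

    if message.get("reasoning"):              # reasoning field (Ollama Cloud)
        print("reasoning:", message["reasoning"])
    tool_call = parse_tool_call_from_text(message.get("content") or "")
    if tool_call:                             # Priority 1: tool call in content text (OpenRouter)
        print("tool call:", tool_call["name"])
    elif message.get("tool_calls"):           # Priority 2: structured tool_calls
        print("structured tool calls")
    else:                                     # Fallback: plain assistant text
        print("text:", message.get("content"))
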
@@ -405,8 +416,50 @@ class GenericVlmConfig(AsyncAgentConfig):
                 ],
             }
             output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))
+        elif tool_calls_array:
+            # Priority 2: Use tool_calls field if present (Ollama Cloud format)
+            # Process and unnormalize coordinates in tool calls
+            processed_tool_calls = []
+            for tc in tool_calls_array:
+                function = tc.get("function", {})
+                fn_name = function.get("name", "computer")
+                args_str = function.get("arguments", "{}")
+
+                try:
+                    args = json.loads(args_str)
+
+                    # Unnormalize coordinates if present
+                    if "coordinate" in args and last_rw is not None and last_rh is not None:
+                        args = await _unnormalize_coordinate(args, (last_rw, last_rh))
+
+                    # Convert Qwen format to Computer Calls format if this is a computer tool
+                    if fn_name == "computer":
+                        converted_action = convert_qwen_tool_args_to_computer_action(args)
+                        if converted_action:
+                            args = converted_action
+
+                    processed_tool_calls.append(
+                        {
+                            "type": tc.get("type", "function"),
+                            "id": tc.get("id", "call_0"),
+                            "function": {
+                                "name": fn_name,
+                                "arguments": json.dumps(args),
+                            },
+                        }
+                    )
+                except json.JSONDecodeError:
+                    # Keep original if parsing fails
+                    processed_tool_calls.append(tc)
+
+            fake_cm = {
+                "role": "assistant",
+                "content": content_text if content_text else "",
+                "tool_calls": processed_tool_calls,
+            }
+            output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))
         else:
-            # Fallback: just return assistant text
+            # No tool calls found in either format, return text response
             fake_cm = {"role": "assistant", "content": content_text}
             output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))
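
The `_unnormalize_coordinate` call above maps model-space coordinates back to screen pixels using the last rendered width/height; its body is not part of this diff. A heavily hedged sketch of the idea, assuming for illustration a 0-1000 normalized space (the real helper may instead rescale from the resized image dimensions):

    import asyncio
    from typing import Any, Dict, Tuple

    async def unnormalize_coordinate(args: Dict[str, Any], size: Tuple[int, int]) -> Dict[str, Any]:
        """Illustrative stand-in: rescale args["coordinate"] into pixel space."""
        width, height = size
        x, y = args["coordinate"]
        return {**args, "coordinate": [round(x * width / 1000), round(y * height / 1000)]}

    # A click at model-space (500, 500) on a 1920x1080 frame lands at (960, 540).
    print(asyncio.run(unnormalize_coordinate({"coordinate": [500, 500]}, (1920, 1080))))
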
@@ -365,6 +365,22 @@ class OmniparserConfig(AsyncAgentConfig):
             **kwargs,
         }
 
+        # Add Vertex AI specific parameters if using vertex_ai models
+        if llm_model.startswith("vertex_ai/"):
+            import os
+
+            # Pass vertex_project and vertex_location to liteLLM
+            if "vertex_project" not in api_kwargs:
+                api_kwargs["vertex_project"] = os.getenv("GOOGLE_CLOUD_PROJECT")
+            if "vertex_location" not in api_kwargs:
+                api_kwargs["vertex_location"] = "global"
+
+            # Pass through Gemini 3-specific parameters if provided
+            if "thinking_level" in kwargs:
+                api_kwargs["thinking_level"] = kwargs["thinking_level"]
+            if "media_resolution" in kwargs:
+                api_kwargs["media_resolution"] = kwargs["media_resolution"]
+
         # Call API start hook
         if _on_api_start:
             await _on_api_start(api_kwargs)
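
A usage note on these defaults, with hedged assumptions: the model string follows the `omni+vertex_ai/...` form from the ComputerAgent docstring, and the extra kwargs are assumed to flow through to this loop's `**kwargs` (which is where the code above reads them); the project id and knob values are placeholders:

    import os

    from agent import ComputerAgent

    os.environ.setdefault("GOOGLE_CLOUD_PROJECT", "my-gcp-project")  # placeholder id

    agent = ComputerAgent(
        model="omni+vertex_ai/gemini-pro",
        # Gemini 3-specific knobs forwarded per this diff; values are illustrative.
        thinking_level="high",
        media_resolution="high",
    )
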
@@ -5,13 +5,14 @@ UITARS-2 agent loop implementation using LiteLLM.
 - Calls litellm.acompletion
 - Parses <seed:tool_call> ... </seed:tool_call> outputs back into Responses items (computer actions)
 """
 
 from __future__ import annotations
 
-import re
-from typing import Any, Dict, List, Optional, Tuple
 import base64
 import io
 import json
+import re
+from typing import Any, Dict, List, Optional, Tuple
 
 import litellm
 from litellm.responses.litellm_completion_transformation.transformation import (
@@ -20,37 +21,45 @@ from litellm.responses.litellm_completion_transformation.transformation import (
 )
 
 from ..decorators import register_agent
 from .omniparser import get_last_computer_call_output  # type: ignore
 
 try:
     from PIL import Image  # type: ignore
 except Exception:  # pragma: no cover
     Image = None  # type: ignore
 from ..responses import (
+    convert_responses_items_to_completion_messages,
     make_click_item,
     make_double_click_item,
     make_drag_item,
     make_function_call_item,
     make_keypress_item,
-    make_screenshot_item,
     make_move_item,
     make_output_text_item,
     make_reasoning_item,
+    make_screenshot_item,
     make_scroll_item,
     make_type_item,
     make_wait_item,
-    convert_responses_items_to_completion_messages,
 )
 from ..types import AgentCapability
 
 
 TOOL_SCHEMAS: List[Dict[str, Any]] = [
-    {"type": "function", "name": "open_computer", "parameters": {}, "description": "Open computer."},
+    {
+        "type": "function",
+        "name": "open_computer",
+        "parameters": {},
+        "description": "Open computer.",
+    },
     {
         "type": "function",
         "name": "click",
         "parameters": {
             "type": "object",
             "properties": {
-                "point": {"type": "string", "description": "Click coordinates. The format is: <point>x y</point>"}
+                "point": {
+                    "type": "string",
+                    "description": "Click coordinates. The format is: <point>x y</point>",
+                }
             },
             "required": ["point"],
         },
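
All the point-taking tools in TOOL_SCHEMAS share the `<point>x y</point>` string convention. A small parser sketch for that convention (the regex is an assumption, not code from this repo):

    import re
    from typing import Optional, Tuple

    def parse_point(value: str) -> Optional[Tuple[int, int]]:
        """Extract (x, y) from a string like '<point>512 384</point>'."""
        m = re.fullmatch(r"<point>(\d+)\s+(\d+)</point>", value.strip())
        return (int(m.group(1)), int(m.group(2))) if m else None

    assert parse_point("<point>512 384</point>") == (512, 384)
    assert parse_point("no point here") is None
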
@@ -62,7 +71,10 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
         "parameters": {
             "type": "object",
             "properties": {
-                "point": {"type": "string", "description": "Click coordinates. The format is: <point>x y</point>"}
+                "point": {
+                    "type": "string",
+                    "description": "Click coordinates. The format is: <point>x y</point>",
+                }
             },
             "required": ["point"],
         },
@@ -74,7 +86,10 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
         "parameters": {
             "type": "object",
             "properties": {
-                "point": {"type": "string", "description": "Click coordinates. The format is: <point>x y</point>"}
+                "point": {
+                    "type": "string",
+                    "description": "Click coordinates. The format is: <point>x y</point>",
+                }
             },
             "required": ["point"],
         },
@@ -106,7 +121,10 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
         "parameters": {
             "type": "object",
             "properties": {
-                "point": {"type": "string", "description": "Target coordinates. The format is: <point>x y</point>"}
+                "point": {
+                    "type": "string",
+                    "description": "Target coordinates. The format is: <point>x y</point>",
+                }
             },
             "required": ["point"],
         },
@@ -117,7 +135,12 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
         "name": "hotkey",
         "parameters": {
             "type": "object",
-            "properties": {"key": {"type": "string", "description": "Hotkeys you want to press. Split keys with a space and use lowercase."}},
+            "properties": {
+                "key": {
+                    "type": "string",
+                    "description": "Hotkeys you want to press. Split keys with a space and use lowercase.",
+                }
+            },
             "required": ["key"],
         },
         "description": "Press hotkey.",
@@ -227,9 +250,7 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
         "name": "wait",
         "parameters": {
             "type": "object",
-            "properties": {
-                "time": {"type": "integer", "description": "Wait time in seconds."}
-            },
+            "properties": {"time": {"type": "integer", "description": "Wait time in seconds."}},
             "required": [],
         },
         "description": "Wait for a while.",
@@ -268,7 +289,12 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
         },
         "description": "Type content.",
     },
-    {"type": "function", "name": "take_screenshot", "parameters": {}, "description": "Take screenshot."},
+    {
+        "type": "function",
+        "name": "take_screenshot",
+        "parameters": {},
+        "description": "Take screenshot.",
+    },
 ]
 
@@ -319,7 +345,9 @@ _PROMPT_SUFFIX = (
 SYSTEM_PROMPT = _PROMPT_PREFIX + _format_tool_schemas_json_lines(TOOL_SCHEMAS) + _PROMPT_SUFFIX
 
 
-def _extract_function_schemas_from_tools(tools: Optional[List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
+def _extract_function_schemas_from_tools(
+    tools: Optional[List[Dict[str, Any]]],
+) -> List[Dict[str, Any]]:
     schemas: List[Dict[str, Any]] = []
     if not tools:
         return schemas
@@ -330,12 +358,14 @@ def _extract_function_schemas_from_tools(tools: Optional[List[Dict[str, Any]]])
         params = fn.get("parameters", {})
         desc = fn.get("description", "")
         if name:
-            schemas.append({
-                "type": "function",
-                "name": name,
-                "parameters": params if isinstance(params, dict) else {},
-                "description": desc,
-            })
+            schemas.append(
+                {
+                    "type": "function",
+                    "name": name,
+                    "parameters": params if isinstance(params, dict) else {},
+                    "description": desc,
+                }
+            )
     return schemas
 
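
For reference, the shape this helper flattens: the loop binding `fn` sits just above this hunk and is not shown, so the input below assumes the common OpenAI tools layout where the schema nests under a `function` key; `get_weather` is a made-up example.

    openai_style_tool = {
        "type": "function",
        "function": {
            "name": "get_weather",
            "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
            "description": "Look up the weather.",
        },
    }
    # _extract_function_schemas_from_tools([openai_style_tool]) would yield:
    # [{"type": "function", "name": "get_weather",
    #   "parameters": {"type": "object", ...}, "description": "Look up the weather."}]
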
@@ -392,7 +422,9 @@ def _denormalize_xy_from_uitars(nx: float, ny: float, width: int, height: int) -
     return x, y
 
 
-def _map_computer_action_to_function(action: Dict[str, Any], width: int, height: int) -> Optional[Dict[str, Any]]:
+def _map_computer_action_to_function(
+    action: Dict[str, Any], width: int, height: int
+) -> Optional[Dict[str, Any]]:
     """Map a computer action item to a UITARS function + parameters dict of strings.
 
     Returns dict like {"function": name, "parameters": {..}} or None if unknown.
     """
@@ -404,7 +436,10 @@ def _map_computer_action_to_function(action: Dict[str, Any], width: int, height:
             return None
         nx, ny = _normalize_xy_to_uitars(int(x), int(y), width, height)
         if btn == "right":
-            return {"function": "right_single", "parameters": {"point": f"<point>{nx} {ny}</point>"}}
+            return {
+                "function": "right_single",
+                "parameters": {"point": f"<point>{nx} {ny}</point>"},
+            }
         return {"function": "click", "parameters": {"point": f"<point>{nx} {ny}</point>"}}
     if atype == "double_click":
         x, y = action.get("x"), action.get("y")
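
`_normalize_xy_to_uitars` and its inverse `_denormalize_xy_from_uitars` (named in the hunk header above) convert between screen pixels and the model's normalized coordinate space. A sketch of such a pair, assuming the commonly used 0-1000 integer space; the real functions' scaling may differ:

    def normalize_xy(x: int, y: int, width: int, height: int) -> tuple[int, int]:
        """Pixels -> assumed 0-1000 model space."""
        return round(x * 1000 / width), round(y * 1000 / height)

    def denormalize_xy(nx: float, ny: float, width: int, height: int) -> tuple[int, int]:
        """Assumed 0-1000 model space -> pixels; inverse up to rounding."""
        return round(nx * width / 1000), round(ny * height / 1000)

    assert denormalize_xy(*normalize_xy(960, 540, 1920, 1080), 1920, 1080) == (960, 540)
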
@@ -434,8 +469,19 @@ def _map_computer_action_to_function(action: Dict[str, Any], width: int, height:
         nx, ny = _normalize_xy_to_uitars(int(x), int(y), width, height)
         sx, sy = action.get("scroll_x", 0), action.get("scroll_y", 0)
         # Our parser used positive sy for up
-        direction = "up" if sy and sy > 0 else ("down" if sy and sy < 0 else ("right" if sx and sx > 0 else ("left" if sx and sx < 0 else "down")))
-        return {"function": "scroll", "parameters": {"direction": direction, "point": f"<point>{nx} {ny}</point>"}}
+        direction = (
+            "up"
+            if sy and sy > 0
+            else (
+                "down"
+                if sy and sy < 0
+                else ("right" if sx and sx > 0 else ("left" if sx and sx < 0 else "down"))
+            )
+        )
+        return {
+            "function": "scroll",
+            "parameters": {"direction": direction, "point": f"<point>{nx} {ny}</point>"},
+        }
     if atype == "drag":
         path = action.get("path", [])
         if isinstance(path, list) and len(path) >= 2:
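
The reflowed conditional above encodes a fixed precedence: vertical scroll wins over horizontal, positive `scroll_y` means up (per the inline comment), and the default is down. The same logic as a flat decision table:

    def scroll_direction(sx: int, sy: int) -> str:
        # Mirrors the nested conditional: sy takes precedence, default "down".
        if sy > 0:
            return "up"
        if sy < 0:
            return "down"
        if sx > 0:
            return "right"
        if sx < 0:
            return "left"
        return "down"

    assert scroll_direction(0, 3) == "up"
    assert scroll_direction(-2, 0) == "left"
    assert scroll_direction(0, 0) == "down"
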
@@ -461,7 +507,9 @@ def _map_computer_action_to_function(action: Dict[str, Any], width: int, height:
         return None
 
 
-def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int) -> List[Dict[str, Any]]:
+def _to_uitars_messages(
+    messages: List[Dict[str, Any]], width: int, height: int
+) -> List[Dict[str, Any]]:
     """Convert responses items into completion messages tailored for UI-TARS.
 
     - User content is passed through similar to convert_responses_items_to_completion_messages
@@ -505,7 +553,9 @@ def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int)
             completion_content = []
             for item in content:
                 if item.get("type") == "input_image":
-                    completion_content.append({"type": "image_url", "image_url": {"url": item.get("image_url")}})
+                    completion_content.append(
+                        {"type": "image_url", "image_url": {"url": item.get("image_url")}}
+                    )
                 elif item.get("type") in ("input_text", "text"):
                     completion_content.append({"type": "text", "text": item.get("text")})
             uitars_messages.append({"role": "user", "content": completion_content})
@@ -517,7 +567,11 @@ def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int)
         if mtype == "reasoning":
             # Responses reasoning stores summary list
             summary = msg.get("summary", [])
-            texts = [s.get("text", "") for s in summary if isinstance(s, dict) and s.get("type") == "summary_text"]
+            texts = [
+                s.get("text", "")
+                for s in summary
+                if isinstance(s, dict) and s.get("type") == "summary_text"
+            ]
             if texts:
                 pending_think = "\n".join([t for t in texts if t])
             continue
@@ -546,9 +600,15 @@ def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int)
             pending_think, pending_functions = None, []
             content = msg.get("content", [])
             if isinstance(content, list):
-                texts = [c.get("text", "") for c in content if isinstance(c, dict) and c.get("type") in ("output_text", "text")]
+                texts = [
+                    c.get("text", "")
+                    for c in content
+                    if isinstance(c, dict) and c.get("type") in ("output_text", "text")
+                ]
                 if texts:
-                    uitars_messages.append({"role": "assistant", "content": "\n".join([t for t in texts if t])})
+                    uitars_messages.append(
+                        {"role": "assistant", "content": "\n".join([t for t in texts if t])}
+                    )
             elif isinstance(content, str) and content:
                 uitars_messages.append({"role": "assistant", "content": content})
             continue
@@ -581,8 +641,12 @@ def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int)
 
     return uitars_messages
 
 
 def _to_response_items(
-    actions: List[Dict[str, Any]], tool_names: Optional[set[str]] = None, width: Optional[int] = None, height: Optional[int] = None
+    actions: List[Dict[str, Any]],
+    tool_names: Optional[set[str]] = None,
+    width: Optional[int] = None,
+    height: Optional[int] = None,
 ) -> List[Any]:
     """Map parsed actions into Responses items (computer actions + optional reasoning)."""
     items: List[Any] = []
@@ -736,8 +800,12 @@ class UITARS2Config:
 
         # Build dynamic system prompt by concatenating built-in schemas and provided function tools
         provided_fn_schemas = _extract_function_schemas_from_tools(tools)
-        combined_schemas = TOOL_SCHEMAS + provided_fn_schemas if provided_fn_schemas else TOOL_SCHEMAS
-        dynamic_system_prompt = _PROMPT_PREFIX + _format_tool_schemas_json_lines(combined_schemas) + _PROMPT_SUFFIX
+        combined_schemas = (
+            TOOL_SCHEMAS + provided_fn_schemas if provided_fn_schemas else TOOL_SCHEMAS
+        )
+        dynamic_system_prompt = (
+            _PROMPT_PREFIX + _format_tool_schemas_json_lines(combined_schemas) + _PROMPT_SUFFIX
+        )
 
         # Prepend system prompt (based on training prompts + provided tools)
         litellm_messages: List[Dict[str, Any]] = [
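
The assembly above concatenates a fixed prefix, one JSON schema per line, and a fixed suffix into the system prompt. In isolation, with stand-ins for the module's `_PROMPT_PREFIX`/`_PROMPT_SUFFIX` and `_format_tool_schemas_json_lines` (their real contents are not in this diff):

    import json
    from typing import Any, Dict, List

    PROMPT_PREFIX = "You can use the following tools:\n"  # stand-in
    PROMPT_SUFFIX = "\nRespond with a tool call."         # stand-in

    def format_tool_schemas_json_lines(schemas: List[Dict[str, Any]]) -> str:
        # Assumed behavior implied by the name: one compact JSON object per line.
        return "\n".join(json.dumps(s) for s in schemas)

    builtin = [{"type": "function", "name": "click", "parameters": {}, "description": "Click."}]
    provided = [{"type": "function", "name": "get_weather", "parameters": {}, "description": "Weather."}]

    combined = builtin + provided if provided else builtin
    print(PROMPT_PREFIX + format_tool_schemas_json_lines(combined) + PROMPT_SUFFIX)
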
@@ -829,7 +897,10 @@ class UITARS2Config:
                 "role": "user",
                 "content": [
                     {"type": "text", "text": "Please return a single click action."},
-                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                    },
                 ],
             },
         ]
@@ -841,7 +912,9 @@ class UITARS2Config:
             "temperature": kwargs.get("temperature", 0.0),
             "do_sample": kwargs.get("temperature", 0.0) > 0.0,
         }
-        api_kwargs.update({k: v for k, v in (kwargs or {}).items() if k not in ["max_tokens", "temperature"]})
+        api_kwargs.update(
+            {k: v for k, v in (kwargs or {}).items() if k not in ["max_tokens", "temperature"]}
+        )
 
         response = await litellm.acompletion(**api_kwargs)
         # Extract response content
@@ -852,7 +925,11 @@ class UITARS2Config:
         msg = choices[0].get("message", {})
         content_text = msg.get("content", "")
         if isinstance(content_text, list):
-            text_parts = [p.get("text", "") for p in content_text if isinstance(p, dict) and p.get("type") == "text"]
+            text_parts = [
+                p.get("text", "")
+                for p in content_text
+                if isinstance(p, dict) and p.get("type") == "text"
+            ]
             content_text = "\n".join([t for t in text_parts if t])
         if not isinstance(content_text, str):
             return None
@@ -22,14 +22,14 @@ async def test_http_endpoint():
 
     # Example 1: Simple text request
     simple_request = {
-        "model": "anthropic/claude-3-5-sonnet-20241022",
+        "model": "anthropic/claude-sonnet-4-5-20250929",
         "input": "Tell me a three sentence bedtime story about a unicorn.",
         "env": {"ANTHROPIC_API_KEY": anthropic_api_key},
     }
 
     # Example 2: Multi-modal request with image
     multimodal_request = {
-        "model": "anthropic/claude-3-5-sonnet-20241022",
+        "model": "anthropic/claude-sonnet-4-5-20250929",
         "input": [
             {
                 "role": "user",
@@ -47,7 +47,7 @@ async def test_http_endpoint():
 
     # Example 3: Request with custom agent and computer kwargs
    custom_request = {
-        "model": "anthropic/claude-3-5-sonnet-20241022",
+        "model": "anthropic/claude-sonnet-4-5-20250929",
         "input": "Take a screenshot and tell me what you see",
         "env": {"ANTHROPIC_API_KEY": anthropic_api_key},
     }
@@ -95,7 +95,7 @@ def curl_examples():
         """curl http://localhost:8000/responses \\
   -H "Content-Type: application/json" \\
   -d '{
-    "model": "anthropic/claude-3-5-sonnet-20241022",
+    "model": "anthropic/claude-sonnet-4-5-20250929",
     "input": "Tell me a three sentence bedtime story about a unicorn."
   }'"""
     )
@@ -105,7 +105,7 @@ def curl_examples():
         """curl http://localhost:8000/responses \\
   -H "Content-Type: application/json" \\
   -d '{
-    "model": "anthropic/claude-3-5-sonnet-20241022",
+    "model": "anthropic/claude-sonnet-4-5-20250929",
     "input": [
       {
         "role": "user",
@@ -126,7 +126,7 @@ def curl_examples():
         """curl http://localhost:8000/responses \\
   -H "Content-Type: application/json" \\
   -d '{
-    "model": "anthropic/claude-3-5-sonnet-20241022",
+    "model": "anthropic/claude-sonnet-4-5-20250929",
     "input": "Take a screenshot and tell me what you see",
     "agent_kwargs": {
       "save_trajectory": true,
@@ -166,7 +166,7 @@ async def test_p2p_client():
 
     # Send a test request
     request = {
-        "model": "anthropic/claude-3-5-sonnet-20241022",
+        "model": "anthropic/claude-sonnet-4-5-20250929",
         "input": "Hello from P2P client!",
     }
     await connection.send(json.dumps(request))
 
@@ -6,9 +6,9 @@ with an advanced UI for model selection and configuration.
 
 Supported Agent Models:
 - OpenAI: openai/computer-use-preview
-- Anthropic: anthropic/claude-3-5-sonnet-20241022, anthropic/claude-3-7-sonnet-20250219
+- Anthropic: anthropic/claude-sonnet-4-5-20250929, anthropic/claude-3-7-sonnet-20250219
 - UI-TARS: huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
-- Omniparser: omniparser+anthropic/claude-3-5-sonnet-20241022, omniparser+ollama_chat/gemma3
+- Omniparser: omniparser+anthropic/claude-sonnet-4-5-20250929, omniparser+ollama_chat/gemma3
 
 Requirements:
 - Mac with Apple Silicon (M1/M2/M3/M4), Linux, or Windows
@@ -116,14 +116,12 @@ MODEL_MAPPINGS = {
         "Anthropic: Claude 4 Opus (20250514)": "anthropic/claude-opus-4-20250514",
         "Anthropic: Claude 4 Sonnet (20250514)": "anthropic/claude-sonnet-4-20250514",
         "Anthropic: Claude 3.7 Sonnet (20250219)": "anthropic/claude-3-7-sonnet-20250219",
-        "Anthropic: Claude 3.5 Sonnet (20241022)": "anthropic/claude-3-5-sonnet-20241022",
     },
     "omni": {
         "default": "omniparser+openai/gpt-4o",
         "OMNI: OpenAI GPT-4o": "omniparser+openai/gpt-4o",
         "OMNI: OpenAI GPT-4o mini": "omniparser+openai/gpt-4o-mini",
         "OMNI: Claude 3.7 Sonnet (20250219)": "omniparser+anthropic/claude-3-7-sonnet-20250219",
-        "OMNI: Claude 3.5 Sonnet (20241022)": "omniparser+anthropic/claude-3-5-sonnet-20241022",
     },
     "uitars": {
         "default": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B" if is_mac else "ui-tars",
@@ -44,13 +44,11 @@ def create_gradio_ui() -> gr.Blocks:
         "Anthropic: Claude 4 Opus (20250514)",
         "Anthropic: Claude 4 Sonnet (20250514)",
         "Anthropic: Claude 3.7 Sonnet (20250219)",
-        "Anthropic: Claude 3.5 Sonnet (20241022)",
     ]
     omni_models = [
         "OMNI: OpenAI GPT-4o",
         "OMNI: OpenAI GPT-4o mini",
         "OMNI: Claude 3.7 Sonnet (20250219)",
-        "OMNI: Claude 3.5 Sonnet (20241022)",
     ]
 
     # Check if API keys are available
@@ -102,7 +102,7 @@ async def main():
         # model="anthropic/claude-opus-4-20250514",
         # model="anthropic/claude-sonnet-4-20250514",
         # model="anthropic/claude-3-7-sonnet-20250219",
-        # model="anthropic/claude-3-5-sonnet-20241022",
+        # model="anthropic/claude-sonnet-4-5-20250929",
         # == UI-TARS ==
         # model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
         # TODO: add local mlx provider
@@ -24,7 +24,7 @@ def mock_litellm():
             "id": "chatcmpl-test123",
             "object": "chat.completion",
             "created": 1234567890,
-            "model": kwargs.get("model", "anthropic/claude-3-5-sonnet-20241022"),
+            "model": kwargs.get("model", "anthropic/claude-sonnet-4-5-20250929"),
             "choices": [
                 {
                     "index": 0,
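
For context on how a canned completion like this gets wired in: the tests below patch `agent.agent.litellm` and point `acompletion` at the fake payload, so no network call happens. A condensed sketch using the same patch target and AsyncMock pattern as the tests:

    from unittest.mock import AsyncMock, patch

    mock_response = {
        "id": "chatcmpl-test123",
        "object": "chat.completion",
        "created": 1234567890,
        "model": "anthropic/claude-sonnet-4-5-20250929",
        "choices": [{"index": 0, "message": {"role": "assistant", "content": "ok"}}],
    }

    with patch("agent.agent.litellm") as mock_litellm:
        mock_litellm.acompletion = AsyncMock(return_value=mock_response)
        # ComputerAgent(...).run(...) now consumes mock_response instead of
        # calling the Anthropic API.
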
@@ -18,18 +18,18 @@ class TestComputerAgentInitialization:
         """Test that agent can be initialized with a model string."""
         from agent import ComputerAgent
 
-        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
+        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
 
         assert agent is not None
         assert hasattr(agent, "model")
-        assert agent.model == "anthropic/claude-3-5-sonnet-20241022"
+        assert agent.model == "anthropic/claude-sonnet-4-5-20250929"
 
     @patch("agent.agent.litellm")
     def test_agent_initialization_with_tools(self, mock_litellm, disable_telemetry, mock_computer):
         """Test that agent can be initialized with tools."""
         from agent import ComputerAgent
 
-        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022", tools=[mock_computer])
+        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929", tools=[mock_computer])
 
         assert agent is not None
         assert hasattr(agent, "tools")
@@ -41,7 +41,7 @@ class TestComputerAgentInitialization:
 
         budget = 5.0
         agent = ComputerAgent(
-            model="anthropic/claude-3-5-sonnet-20241022", max_trajectory_budget=budget
+            model="anthropic/claude-sonnet-4-5-20250929", max_trajectory_budget=budget
         )
 
         assert agent is not None
@@ -79,7 +79,7 @@ class TestComputerAgentRun:
 
         mock_litellm.acompletion = AsyncMock(return_value=mock_response)
 
-        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
+        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
 
         # Run should return an async generator
         result_generator = agent.run(sample_messages)
@@ -92,7 +92,7 @@ class TestComputerAgentRun:
         """Test that agent has run method available."""
         from agent import ComputerAgent
 
-        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
+        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
 
         # Verify run method exists
         assert hasattr(agent, "run")
@@ -102,7 +102,7 @@ class TestComputerAgentRun:
         """Test that agent has agent_loop initialized."""
         from agent import ComputerAgent
 
-        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
+        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
 
         # Verify agent_loop is initialized
         assert hasattr(agent, "agent_loop")
@@ -132,7 +132,7 @@ class TestComputerAgentIntegration:
         """Test that agent can be initialized with Computer tool."""
         from agent import ComputerAgent
 
-        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022", tools=[mock_computer])
+        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929", tools=[mock_computer])
 
         # Verify agent accepted the tool
         assert agent is not None