Merge branch 'main' into feat/generic-vlm-provider
@@ -51,7 +51,7 @@ async def main():
     # Create agent
     agent = ComputerAgent(
-        model="anthropic/claude-3-5-sonnet-20241022",
+        model="anthropic/claude-sonnet-4-5-20250929",
         tools=[computer],
         only_n_most_recent_images=3,
         trajectory_dir="trajectories",
 
@@ -189,7 +189,7 @@ class ComputerAgent:
         Initialize ComputerAgent.
 
         Args:
-            model: Model name (e.g., "claude-3-5-sonnet-20241022", "computer-use-preview", "omni+vertex_ai/gemini-pro")
+            model: Model name (e.g., "claude-sonnet-4-5-20250929", "computer-use-preview", "omni+vertex_ai/gemini-pro")
            tools: List of tools (computer objects, decorated functions, etc.)
            custom_loop: Custom agent loop function to use instead of auto-selection
            only_n_most_recent_images: If set, only keep the N most recent images in message history. Adds ImageRetentionCallback automatically.
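
For orientation, a minimal usage sketch matching the docstring above. The `Computer` construction is illustrative (its kwargs come from the repo's own examples, not this diff); `agent.run` returning an async generator is confirmed by the tests further down.

    import asyncio

    from agent import ComputerAgent
    from computer import Computer

    async def main():
        computer = Computer()  # lifecycle/constructor kwargs: see the repo's examples
        agent = ComputerAgent(
            model="anthropic/claude-sonnet-4-5-20250929",
            tools=[computer],
            only_n_most_recent_images=3,
            trajectory_dir="trajectories",
        )
        async for result in agent.run([{"role": "user", "content": "Take a screenshot"}]):
            print(result)

    asyncio.run(main())
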
@@ -7,7 +7,7 @@ Usage:
 Examples:
     python -m agent.cli openai/computer-use-preview
     python -m agent.cli anthropic/claude-sonnet-4-5-20250929
-    python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
+    python -m agent.cli omniparser+anthropic/claude-sonnet-4-5-20250929
 """
 
 try:
@@ -233,7 +233,7 @@ async def main():
 Examples:
     python -m agent.cli openai/computer-use-preview
     python -m agent.cli anthropic/claude-sonnet-4-5-20250929
-    python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
+    python -m agent.cli omniparser+anthropic/claude-sonnet-4-5-20250929
     python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
 """,
 )
@@ -671,11 +671,12 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
         # Handle custom function tools (not computer tools)
         if tool_name != "computer":
             from ..responses import make_function_call_item
-            responses_items.append(make_function_call_item(
-                function_name=tool_name,
-                arguments=tool_input,
-                call_id=call_id
-            ))
+
+            responses_items.append(
+                make_function_call_item(
+                    function_name=tool_name, arguments=tool_input, call_id=call_id
+                )
+            )
             continue
 
         # Computer tool - process actions
@@ -883,16 +884,17 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
             # Handle custom function tools
             if tool_name != "computer":
                 from ..responses import make_function_call_item
 
                 # tool_call.function.arguments is a JSON string, need to parse it
                 try:
                     args_dict = json.loads(tool_call.function.arguments)
                 except json.JSONDecodeError:
                     args_dict = {}
-                responses_items.append(make_function_call_item(
-                    function_name=tool_name,
-                    arguments=args_dict,
-                    call_id=tool_call.id
-                ))
+                responses_items.append(
+                    make_function_call_item(
+                        function_name=tool_name, arguments=args_dict, call_id=tool_call.id
+                    )
+                )
                 continue
 
             # Handle computer tool
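
A note on the parse guard kept here: OpenAI-style `tool_call.function.arguments` is a JSON string, and models sometimes emit invalid JSON, so the code falls back to `{}`. The same defensive parse in isolation (a sketch, stdlib only):

    import json

    def parse_tool_arguments(arguments: str) -> dict:
        """Return the decoded arguments dict, or {} if the JSON is malformed."""
        try:
            args = json.loads(arguments)
        except json.JSONDecodeError:
            return {}
        return args if isinstance(args, dict) else {}

    assert parse_tool_arguments('{"x": 1}') == {"x": 1}
    assert parse_tool_arguments("not json") == {}
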
@@ -20,6 +20,7 @@ from ..loops.base import AsyncAgentConfig
 from ..responses import (
     convert_completion_messages_to_responses_items,
     convert_responses_items_to_completion_messages,
+    make_reasoning_item,
 )
 from ..types import AgentCapability
@@ -373,13 +374,23 @@ class GenericVlmConfig(AsyncAgentConfig):
         if _on_usage:
             await _on_usage(usage)
 
-        # Parse tool call from text; then convert to responses items via fake tool_calls
+        # Extract response data
         resp_dict = response.model_dump()  # type: ignore
         choice = (resp_dict.get("choices") or [{}])[0]
-        content_text = ((choice.get("message") or {}).get("content")) or ""
-        tool_call = _parse_tool_call_from_text(content_text)
+        message = choice.get("message") or {}
+        content_text = message.get("content") or ""
+        tool_calls_array = message.get("tool_calls") or []
+        reasoning_text = message.get("reasoning") or ""
 
         output_items: List[Dict[str, Any]] = []
 
+        # Add reasoning if present (Ollama Cloud format)
+        if reasoning_text:
+            output_items.append(make_reasoning_item(reasoning_text))
+
+        # Priority 1: Try to parse tool call from content text (OpenRouter format)
+        tool_call = _parse_tool_call_from_text(content_text)
+
         if tool_call and isinstance(tool_call, dict):
             fn_name = tool_call.get("name") or "computer"
             raw_args = tool_call.get("arguments") or {}
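
The rewritten extraction gives the loop a clear precedence: emit a reasoning item if the message carries a `reasoning` field (Ollama Cloud), prefer a tool call parsed out of the content text (OpenRouter), then the structured `tool_calls` array, and finally fall back to plain text. A condensed sketch of that dispatch; `parse_tool_call_from_text` is a stand-in for the module's `_parse_tool_call_from_text`, whose body this diff does not show:

    import json
    import re
    from typing import Any, Dict, Optional

    def parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
        # Stand-in: pull the first {...} blob and require a "name" key.
        m = re.search(r"\{.*\}", text, re.DOTALL)
        if not m:
            return None
        try:
            obj = json.loads(m.group(0))
        except json.JSONDecodeError:
            return None
        return obj if isinstance(obj, dict) and "name" in obj else None

    message = {
        "content": '{"name": "computer", "arguments": {"action": "screenshot"}}',
        "tool_calls": [],
        "reasoning": "I should look at the screen first.",
    }

    if message.get("reasoning"):              # reasoning field (Ollama Cloud)
        print("reasoning:", message["reasoning"])
    tool_call = parse_tool_call_from_text(message.get("content") or "")
    if tool_call:                             # Priority 1: tool call in content text (OpenRouter)
        print("tool call:", tool_call["name"])
    elif message.get("tool_calls"):           # Priority 2: structured tool_calls
        print("structured tool calls")
    else:                                     # Fallback: plain assistant text
        print("text:", message.get("content"))
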
@@ -405,8 +416,50 @@ class GenericVlmConfig(AsyncAgentConfig):
                 ],
             }
             output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))
+        elif tool_calls_array:
+            # Priority 2: Use tool_calls field if present (Ollama Cloud format)
+            # Process and unnormalize coordinates in tool calls
+            processed_tool_calls = []
+            for tc in tool_calls_array:
+                function = tc.get("function", {})
+                fn_name = function.get("name", "computer")
+                args_str = function.get("arguments", "{}")
+
+                try:
+                    args = json.loads(args_str)
+
+                    # Unnormalize coordinates if present
+                    if "coordinate" in args and last_rw is not None and last_rh is not None:
+                        args = await _unnormalize_coordinate(args, (last_rw, last_rh))
+
+                    # Convert Qwen format to Computer Calls format if this is a computer tool
+                    if fn_name == "computer":
+                        converted_action = convert_qwen_tool_args_to_computer_action(args)
+                        if converted_action:
+                            args = converted_action
+
+                    processed_tool_calls.append(
+                        {
+                            "type": tc.get("type", "function"),
+                            "id": tc.get("id", "call_0"),
+                            "function": {
+                                "name": fn_name,
+                                "arguments": json.dumps(args),
+                            },
+                        }
+                    )
+                except json.JSONDecodeError:
+                    # Keep original if parsing fails
+                    processed_tool_calls.append(tc)
+
+            fake_cm = {
+                "role": "assistant",
+                "content": content_text if content_text else "",
+                "tool_calls": processed_tool_calls,
+            }
+            output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))
         else:
-            # Fallback: just return assistant text
+            # No tool calls found in either format, return text response
             fake_cm = {"role": "assistant", "content": content_text}
             output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))
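
The `_unnormalize_coordinate` call above maps model-space coordinates back to screen pixels using the last rendered width/height; its body is not part of this diff. A heavily hedged sketch of the idea, assuming for illustration a 0-1000 normalized space (the real helper may instead rescale from the resized image dimensions):

    import asyncio
    from typing import Any, Dict, Tuple

    async def unnormalize_coordinate(args: Dict[str, Any], size: Tuple[int, int]) -> Dict[str, Any]:
        """Illustrative stand-in: rescale args["coordinate"] into pixel space."""
        width, height = size
        x, y = args["coordinate"]
        return {**args, "coordinate": [round(x * width / 1000), round(y * height / 1000)]}

    # A click at model-space (500, 500) on a 1920x1080 frame lands at (960, 540).
    print(asyncio.run(unnormalize_coordinate({"coordinate": [500, 500]}, (1920, 1080))))
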
@@ -365,6 +365,22 @@ class OmniparserConfig(AsyncAgentConfig):
             **kwargs,
         }
 
+        # Add Vertex AI specific parameters if using vertex_ai models
+        if llm_model.startswith("vertex_ai/"):
+            import os
+
+            # Pass vertex_project and vertex_location to liteLLM
+            if "vertex_project" not in api_kwargs:
+                api_kwargs["vertex_project"] = os.getenv("GOOGLE_CLOUD_PROJECT")
+            if "vertex_location" not in api_kwargs:
+                api_kwargs["vertex_location"] = "global"
+
+            # Pass through Gemini 3-specific parameters if provided
+            if "thinking_level" in kwargs:
+                api_kwargs["thinking_level"] = kwargs["thinking_level"]
+            if "media_resolution" in kwargs:
+                api_kwargs["media_resolution"] = kwargs["media_resolution"]
+
         # Call API start hook
         if _on_api_start:
             await _on_api_start(api_kwargs)
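
A usage note on these defaults, with hedged assumptions: the model string follows the `omni+vertex_ai/...` form from the ComputerAgent docstring, and the extra kwargs are assumed to flow through to this loop's `**kwargs` (which is where the code above reads them); the project id and knob values are placeholders:

    import os

    from agent import ComputerAgent

    os.environ.setdefault("GOOGLE_CLOUD_PROJECT", "my-gcp-project")  # placeholder id

    agent = ComputerAgent(
        model="omni+vertex_ai/gemini-pro",
        # Gemini 3-specific knobs forwarded per this diff; values are illustrative.
        thinking_level="high",
        media_resolution="high",
    )
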
@@ -5,13 +5,14 @@ UITARS-2 agent loop implementation using LiteLLM.
 - Calls litellm.acompletion
 - Parses <seed:tool_call> ... </seed:tool_call> outputs back into Responses items (computer actions)
 """
 
 from __future__ import annotations
 
-import re
-from typing import Any, Dict, List, Optional, Tuple
 import base64
 import io
 import json
+import re
+from typing import Any, Dict, List, Optional, Tuple
 
 import litellm
 from litellm.responses.litellm_completion_transformation.transformation import (
@@ -20,37 +21,45 @@ from litellm.responses.litellm_completion_transformation.transformation import (
 )
 
 from ..decorators import register_agent
 from .omniparser import get_last_computer_call_output  # type: ignore
 
 try:
     from PIL import Image  # type: ignore
 except Exception:  # pragma: no cover
     Image = None  # type: ignore
 from ..responses import (
+    convert_responses_items_to_completion_messages,
     make_click_item,
     make_double_click_item,
     make_drag_item,
     make_function_call_item,
     make_keypress_item,
-    make_screenshot_item,
     make_move_item,
     make_output_text_item,
     make_reasoning_item,
+    make_screenshot_item,
     make_scroll_item,
     make_type_item,
     make_wait_item,
-    convert_responses_items_to_completion_messages,
 )
 from ..types import AgentCapability
 
 
 TOOL_SCHEMAS: List[Dict[str, Any]] = [
-    {"type": "function", "name": "open_computer", "parameters": {}, "description": "Open computer."},
+    {
+        "type": "function",
+        "name": "open_computer",
+        "parameters": {},
+        "description": "Open computer.",
+    },
     {
         "type": "function",
         "name": "click",
         "parameters": {
             "type": "object",
             "properties": {
-                "point": {"type": "string", "description": "Click coordinates. The format is: <point>x y</point>"}
+                "point": {
+                    "type": "string",
+                    "description": "Click coordinates. The format is: <point>x y</point>",
+                }
             },
             "required": ["point"],
         },
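
All the point-taking tools in TOOL_SCHEMAS share the `<point>x y</point>` string convention. A small parser sketch for that convention (the regex is an assumption, not code from this repo):

    import re
    from typing import Optional, Tuple

    def parse_point(value: str) -> Optional[Tuple[int, int]]:
        """Extract (x, y) from a string like '<point>512 384</point>'."""
        m = re.fullmatch(r"<point>(\d+)\s+(\d+)</point>", value.strip())
        return (int(m.group(1)), int(m.group(2))) if m else None

    assert parse_point("<point>512 384</point>") == (512, 384)
    assert parse_point("no point here") is None
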
@@ -62,7 +71,10 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
         "parameters": {
             "type": "object",
             "properties": {
-                "point": {"type": "string", "description": "Click coordinates. The format is: <point>x y</point>"}
+                "point": {
+                    "type": "string",
+                    "description": "Click coordinates. The format is: <point>x y</point>",
+                }
             },
             "required": ["point"],
         },
@@ -74,7 +86,10 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
         "parameters": {
             "type": "object",
             "properties": {
-                "point": {"type": "string", "description": "Click coordinates. The format is: <point>x y</point>"}
+                "point": {
+                    "type": "string",
+                    "description": "Click coordinates. The format is: <point>x y</point>",
+                }
             },
             "required": ["point"],
         },
@@ -106,7 +121,10 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
         "parameters": {
             "type": "object",
             "properties": {
-                "point": {"type": "string", "description": "Target coordinates. The format is: <point>x y</point>"}
+                "point": {
+                    "type": "string",
+                    "description": "Target coordinates. The format is: <point>x y</point>",
+                }
             },
             "required": ["point"],
         },
@@ -117,7 +135,12 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
         "name": "hotkey",
         "parameters": {
             "type": "object",
-            "properties": {"key": {"type": "string", "description": "Hotkeys you want to press. Split keys with a space and use lowercase."}},
+            "properties": {
+                "key": {
+                    "type": "string",
+                    "description": "Hotkeys you want to press. Split keys with a space and use lowercase.",
+                }
+            },
             "required": ["key"],
         },
         "description": "Press hotkey.",
@@ -227,9 +250,7 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
         "name": "wait",
         "parameters": {
             "type": "object",
-            "properties": {
-                "time": {"type": "integer", "description": "Wait time in seconds."}
-            },
+            "properties": {"time": {"type": "integer", "description": "Wait time in seconds."}},
             "required": [],
         },
         "description": "Wait for a while.",
@@ -268,7 +289,12 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
         },
         "description": "Type content.",
     },
-    {"type": "function", "name": "take_screenshot", "parameters": {}, "description": "Take screenshot."},
+    {
+        "type": "function",
+        "name": "take_screenshot",
+        "parameters": {},
+        "description": "Take screenshot.",
+    },
 ]
 
@@ -319,7 +345,9 @@ _PROMPT_SUFFIX = (
 SYSTEM_PROMPT = _PROMPT_PREFIX + _format_tool_schemas_json_lines(TOOL_SCHEMAS) + _PROMPT_SUFFIX
 
 
-def _extract_function_schemas_from_tools(tools: Optional[List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
+def _extract_function_schemas_from_tools(
+    tools: Optional[List[Dict[str, Any]]],
+) -> List[Dict[str, Any]]:
     schemas: List[Dict[str, Any]] = []
     if not tools:
         return schemas
@@ -330,12 +358,14 @@ def _extract_function_schemas_from_tools(tools: Optional[List[Dict[str, Any]]])
         params = fn.get("parameters", {})
         desc = fn.get("description", "")
         if name:
-            schemas.append({
-                "type": "function",
-                "name": name,
-                "parameters": params if isinstance(params, dict) else {},
-                "description": desc,
-            })
+            schemas.append(
+                {
+                    "type": "function",
+                    "name": name,
+                    "parameters": params if isinstance(params, dict) else {},
+                    "description": desc,
+                }
+            )
     return schemas
 
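
For reference, the shape this helper flattens: the loop binding `fn` sits just above this hunk and is not shown, so the input below assumes the common OpenAI tools layout where the schema nests under a `function` key; `get_weather` is a made-up example.

    openai_style_tool = {
        "type": "function",
        "function": {
            "name": "get_weather",
            "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
            "description": "Look up the weather.",
        },
    }
    # _extract_function_schemas_from_tools([openai_style_tool]) would yield:
    # [{"type": "function", "name": "get_weather",
    #   "parameters": {"type": "object", ...}, "description": "Look up the weather."}]
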
@@ -392,7 +422,9 @@ def _denormalize_xy_from_uitars(nx: float, ny: float, width: int, height: int) -
     return x, y
 
 
-def _map_computer_action_to_function(action: Dict[str, Any], width: int, height: int) -> Optional[Dict[str, Any]]:
+def _map_computer_action_to_function(
+    action: Dict[str, Any], width: int, height: int
+) -> Optional[Dict[str, Any]]:
     """Map a computer action item to a UITARS function + parameters dict of strings.
 
     Returns dict like {"function": name, "parameters": {..}} or None if unknown.
     """
@@ -404,7 +436,10 @@ def _map_computer_action_to_function(action: Dict[str, Any], width: int, height:
             return None
         nx, ny = _normalize_xy_to_uitars(int(x), int(y), width, height)
         if btn == "right":
-            return {"function": "right_single", "parameters": {"point": f"<point>{nx} {ny}</point>"}}
+            return {
+                "function": "right_single",
+                "parameters": {"point": f"<point>{nx} {ny}</point>"},
+            }
         return {"function": "click", "parameters": {"point": f"<point>{nx} {ny}</point>"}}
     if atype == "double_click":
         x, y = action.get("x"), action.get("y")
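
`_normalize_xy_to_uitars` and its inverse `_denormalize_xy_from_uitars` (named in the hunk header above) convert between screen pixels and the model's normalized coordinate space. A sketch of such a pair, assuming the commonly used 0-1000 integer space; the real functions' scaling may differ:

    def normalize_xy(x: int, y: int, width: int, height: int) -> tuple[int, int]:
        """Pixels -> assumed 0-1000 model space."""
        return round(x * 1000 / width), round(y * 1000 / height)

    def denormalize_xy(nx: float, ny: float, width: int, height: int) -> tuple[int, int]:
        """Assumed 0-1000 model space -> pixels; inverse up to rounding."""
        return round(nx * width / 1000), round(ny * height / 1000)

    assert denormalize_xy(*normalize_xy(960, 540, 1920, 1080), 1920, 1080) == (960, 540)
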
@@ -434,8 +469,19 @@ def _map_computer_action_to_function(action: Dict[str, Any], width: int, height:
         nx, ny = _normalize_xy_to_uitars(int(x), int(y), width, height)
         sx, sy = action.get("scroll_x", 0), action.get("scroll_y", 0)
         # Our parser used positive sy for up
-        direction = "up" if sy and sy > 0 else ("down" if sy and sy < 0 else ("right" if sx and sx > 0 else ("left" if sx and sx < 0 else "down")))
-        return {"function": "scroll", "parameters": {"direction": direction, "point": f"<point>{nx} {ny}</point>"}}
+        direction = (
+            "up"
+            if sy and sy > 0
+            else (
+                "down"
+                if sy and sy < 0
+                else ("right" if sx and sx > 0 else ("left" if sx and sx < 0 else "down"))
+            )
+        )
+        return {
+            "function": "scroll",
+            "parameters": {"direction": direction, "point": f"<point>{nx} {ny}</point>"},
+        }
     if atype == "drag":
         path = action.get("path", [])
         if isinstance(path, list) and len(path) >= 2:
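
The reflowed conditional above encodes a fixed precedence: vertical scroll wins over horizontal, positive `scroll_y` means up (per the inline comment), and the default is down. The same logic as a flat decision table:

    def scroll_direction(sx: int, sy: int) -> str:
        # Mirrors the nested conditional: sy takes precedence, default "down".
        if sy > 0:
            return "up"
        if sy < 0:
            return "down"
        if sx > 0:
            return "right"
        if sx < 0:
            return "left"
        return "down"

    assert scroll_direction(0, 3) == "up"
    assert scroll_direction(-2, 0) == "left"
    assert scroll_direction(0, 0) == "down"
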
@@ -461,7 +507,9 @@ def _map_computer_action_to_function(action: Dict[str, Any], width: int, height:
         return None
 
 
-def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int) -> List[Dict[str, Any]]:
+def _to_uitars_messages(
+    messages: List[Dict[str, Any]], width: int, height: int
+) -> List[Dict[str, Any]]:
     """Convert responses items into completion messages tailored for UI-TARS.
 
     - User content is passed through similar to convert_responses_items_to_completion_messages
@@ -505,7 +553,9 @@ def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int)
             completion_content = []
             for item in content:
                 if item.get("type") == "input_image":
-                    completion_content.append({"type": "image_url", "image_url": {"url": item.get("image_url")}})
+                    completion_content.append(
+                        {"type": "image_url", "image_url": {"url": item.get("image_url")}}
+                    )
                 elif item.get("type") in ("input_text", "text"):
                     completion_content.append({"type": "text", "text": item.get("text")})
             uitars_messages.append({"role": "user", "content": completion_content})
@@ -517,7 +567,11 @@ def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int)
         if mtype == "reasoning":
             # Responses reasoning stores summary list
             summary = msg.get("summary", [])
-            texts = [s.get("text", "") for s in summary if isinstance(s, dict) and s.get("type") == "summary_text"]
+            texts = [
+                s.get("text", "")
+                for s in summary
+                if isinstance(s, dict) and s.get("type") == "summary_text"
+            ]
             if texts:
                 pending_think = "\n".join([t for t in texts if t])
             continue
@@ -546,9 +600,15 @@ def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int)
             pending_think, pending_functions = None, []
             content = msg.get("content", [])
             if isinstance(content, list):
-                texts = [c.get("text", "") for c in content if isinstance(c, dict) and c.get("type") in ("output_text", "text")]
+                texts = [
+                    c.get("text", "")
+                    for c in content
+                    if isinstance(c, dict) and c.get("type") in ("output_text", "text")
+                ]
                 if texts:
-                    uitars_messages.append({"role": "assistant", "content": "\n".join([t for t in texts if t])})
+                    uitars_messages.append(
+                        {"role": "assistant", "content": "\n".join([t for t in texts if t])}
+                    )
             elif isinstance(content, str) and content:
                 uitars_messages.append({"role": "assistant", "content": content})
             continue
@@ -581,8 +641,12 @@ def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int)
 
     return uitars_messages
 
 
 def _to_response_items(
-    actions: List[Dict[str, Any]], tool_names: Optional[set[str]] = None, width: Optional[int] = None, height: Optional[int] = None
+    actions: List[Dict[str, Any]],
+    tool_names: Optional[set[str]] = None,
+    width: Optional[int] = None,
+    height: Optional[int] = None,
 ) -> List[Any]:
     """Map parsed actions into Responses items (computer actions + optional reasoning)."""
     items: List[Any] = []
@@ -736,8 +800,12 @@ class UITARS2Config:
 
         # Build dynamic system prompt by concatenating built-in schemas and provided function tools
         provided_fn_schemas = _extract_function_schemas_from_tools(tools)
-        combined_schemas = TOOL_SCHEMAS + provided_fn_schemas if provided_fn_schemas else TOOL_SCHEMAS
-        dynamic_system_prompt = _PROMPT_PREFIX + _format_tool_schemas_json_lines(combined_schemas) + _PROMPT_SUFFIX
+        combined_schemas = (
+            TOOL_SCHEMAS + provided_fn_schemas if provided_fn_schemas else TOOL_SCHEMAS
+        )
+        dynamic_system_prompt = (
+            _PROMPT_PREFIX + _format_tool_schemas_json_lines(combined_schemas) + _PROMPT_SUFFIX
+        )
 
         # Prepend system prompt (based on training prompts + provided tools)
         litellm_messages: List[Dict[str, Any]] = [
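
The assembly above concatenates a fixed prefix, one JSON schema per line, and a fixed suffix into the system prompt. In isolation, with stand-ins for the module's `_PROMPT_PREFIX`/`_PROMPT_SUFFIX` and `_format_tool_schemas_json_lines` (their real contents are not in this diff):

    import json
    from typing import Any, Dict, List

    PROMPT_PREFIX = "You can use the following tools:\n"  # stand-in
    PROMPT_SUFFIX = "\nRespond with a tool call."         # stand-in

    def format_tool_schemas_json_lines(schemas: List[Dict[str, Any]]) -> str:
        # Assumed behavior implied by the name: one compact JSON object per line.
        return "\n".join(json.dumps(s) for s in schemas)

    builtin = [{"type": "function", "name": "click", "parameters": {}, "description": "Click."}]
    provided = [{"type": "function", "name": "get_weather", "parameters": {}, "description": "Weather."}]

    combined = builtin + provided if provided else builtin
    print(PROMPT_PREFIX + format_tool_schemas_json_lines(combined) + PROMPT_SUFFIX)
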
@@ -829,7 +897,10 @@ class UITARS2Config:
                 "role": "user",
                 "content": [
                     {"type": "text", "text": "Please return a single click action."},
-                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                    },
                 ],
             },
         ]
@@ -841,7 +912,9 @@ class UITARS2Config:
             "temperature": kwargs.get("temperature", 0.0),
             "do_sample": kwargs.get("temperature", 0.0) > 0.0,
         }
-        api_kwargs.update({k: v for k, v in (kwargs or {}).items() if k not in ["max_tokens", "temperature"]})
+        api_kwargs.update(
+            {k: v for k, v in (kwargs or {}).items() if k not in ["max_tokens", "temperature"]}
+        )
 
         response = await litellm.acompletion(**api_kwargs)
         # Extract response content
@@ -852,7 +925,11 @@ class UITARS2Config:
         msg = choices[0].get("message", {})
         content_text = msg.get("content", "")
         if isinstance(content_text, list):
-            text_parts = [p.get("text", "") for p in content_text if isinstance(p, dict) and p.get("type") == "text"]
+            text_parts = [
+                p.get("text", "")
+                for p in content_text
+                if isinstance(p, dict) and p.get("type") == "text"
+            ]
             content_text = "\n".join([t for t in text_parts if t])
         if not isinstance(content_text, str):
             return None
@@ -22,14 +22,14 @@ async def test_http_endpoint():
 
     # Example 1: Simple text request
     simple_request = {
-        "model": "anthropic/claude-3-5-sonnet-20241022",
+        "model": "anthropic/claude-sonnet-4-5-20250929",
         "input": "Tell me a three sentence bedtime story about a unicorn.",
         "env": {"ANTHROPIC_API_KEY": anthropic_api_key},
     }
 
     # Example 2: Multi-modal request with image
     multimodal_request = {
-        "model": "anthropic/claude-3-5-sonnet-20241022",
+        "model": "anthropic/claude-sonnet-4-5-20250929",
         "input": [
             {
                 "role": "user",
@@ -47,7 +47,7 @@ async def test_http_endpoint():
 
     # Example 3: Request with custom agent and computer kwargs
    custom_request = {
-        "model": "anthropic/claude-3-5-sonnet-20241022",
+        "model": "anthropic/claude-sonnet-4-5-20250929",
         "input": "Take a screenshot and tell me what you see",
         "env": {"ANTHROPIC_API_KEY": anthropic_api_key},
     }
@@ -95,7 +95,7 @@ def curl_examples():
         """curl http://localhost:8000/responses \\
   -H "Content-Type: application/json" \\
   -d '{
-    "model": "anthropic/claude-3-5-sonnet-20241022",
+    "model": "anthropic/claude-sonnet-4-5-20250929",
     "input": "Tell me a three sentence bedtime story about a unicorn."
   }'"""
     )
@@ -105,7 +105,7 @@ def curl_examples():
         """curl http://localhost:8000/responses \\
   -H "Content-Type: application/json" \\
   -d '{
-    "model": "anthropic/claude-3-5-sonnet-20241022",
+    "model": "anthropic/claude-sonnet-4-5-20250929",
     "input": [
       {
         "role": "user",
@@ -126,7 +126,7 @@ def curl_examples():
         """curl http://localhost:8000/responses \\
   -H "Content-Type: application/json" \\
   -d '{
-    "model": "anthropic/claude-3-5-sonnet-20241022",
+    "model": "anthropic/claude-sonnet-4-5-20250929",
     "input": "Take a screenshot and tell me what you see",
     "agent_kwargs": {
       "save_trajectory": true,
@@ -166,7 +166,7 @@ async def test_p2p_client():
 
     # Send a test request
     request = {
-        "model": "anthropic/claude-3-5-sonnet-20241022",
+        "model": "anthropic/claude-sonnet-4-5-20250929",
         "input": "Hello from P2P client!",
     }
     await connection.send(json.dumps(request))
 
@@ -6,9 +6,9 @@ with an advanced UI for model selection and configuration.
 
 Supported Agent Models:
 - OpenAI: openai/computer-use-preview
-- Anthropic: anthropic/claude-3-5-sonnet-20241022, anthropic/claude-3-7-sonnet-20250219
+- Anthropic: anthropic/claude-sonnet-4-5-20250929, anthropic/claude-3-7-sonnet-20250219
 - UI-TARS: huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
-- Omniparser: omniparser+anthropic/claude-3-5-sonnet-20241022, omniparser+ollama_chat/gemma3
+- Omniparser: omniparser+anthropic/claude-sonnet-4-5-20250929, omniparser+ollama_chat/gemma3
 
 Requirements:
 - Mac with Apple Silicon (M1/M2/M3/M4), Linux, or Windows
@@ -116,14 +116,12 @@ MODEL_MAPPINGS = {
         "Anthropic: Claude 4 Opus (20250514)": "anthropic/claude-opus-4-20250514",
         "Anthropic: Claude 4 Sonnet (20250514)": "anthropic/claude-sonnet-4-20250514",
         "Anthropic: Claude 3.7 Sonnet (20250219)": "anthropic/claude-3-7-sonnet-20250219",
-        "Anthropic: Claude 3.5 Sonnet (20241022)": "anthropic/claude-3-5-sonnet-20241022",
     },
     "omni": {
         "default": "omniparser+openai/gpt-4o",
         "OMNI: OpenAI GPT-4o": "omniparser+openai/gpt-4o",
         "OMNI: OpenAI GPT-4o mini": "omniparser+openai/gpt-4o-mini",
         "OMNI: Claude 3.7 Sonnet (20250219)": "omniparser+anthropic/claude-3-7-sonnet-20250219",
-        "OMNI: Claude 3.5 Sonnet (20241022)": "omniparser+anthropic/claude-3-5-sonnet-20241022",
     },
     "uitars": {
         "default": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B" if is_mac else "ui-tars",
@@ -44,13 +44,11 @@ def create_gradio_ui() -> gr.Blocks:
         "Anthropic: Claude 4 Opus (20250514)",
         "Anthropic: Claude 4 Sonnet (20250514)",
         "Anthropic: Claude 3.7 Sonnet (20250219)",
-        "Anthropic: Claude 3.5 Sonnet (20241022)",
     ]
     omni_models = [
         "OMNI: OpenAI GPT-4o",
         "OMNI: OpenAI GPT-4o mini",
         "OMNI: Claude 3.7 Sonnet (20250219)",
-        "OMNI: Claude 3.5 Sonnet (20241022)",
     ]
 
     # Check if API keys are available
@@ -102,7 +102,7 @@ async def main():
         # model="anthropic/claude-opus-4-20250514",
         # model="anthropic/claude-sonnet-4-20250514",
         # model="anthropic/claude-3-7-sonnet-20250219",
-        # model="anthropic/claude-3-5-sonnet-20241022",
+        # model="anthropic/claude-sonnet-4-5-20250929",
         # == UI-TARS ==
         # model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
         # TODO: add local mlx provider
@@ -24,7 +24,7 @@ def mock_litellm():
             "id": "chatcmpl-test123",
             "object": "chat.completion",
             "created": 1234567890,
-            "model": kwargs.get("model", "anthropic/claude-3-5-sonnet-20241022"),
+            "model": kwargs.get("model", "anthropic/claude-sonnet-4-5-20250929"),
             "choices": [
                 {
                     "index": 0,
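
For context on how a canned completion like this gets wired in: the tests below patch `agent.agent.litellm` and point `acompletion` at the fake payload, so no network call happens. A condensed sketch using the same patch target and AsyncMock pattern as the tests:

    from unittest.mock import AsyncMock, patch

    mock_response = {
        "id": "chatcmpl-test123",
        "object": "chat.completion",
        "created": 1234567890,
        "model": "anthropic/claude-sonnet-4-5-20250929",
        "choices": [{"index": 0, "message": {"role": "assistant", "content": "ok"}}],
    }

    with patch("agent.agent.litellm") as mock_litellm:
        mock_litellm.acompletion = AsyncMock(return_value=mock_response)
        # ComputerAgent(...).run(...) now consumes mock_response instead of
        # calling the Anthropic API.
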
@@ -18,18 +18,18 @@ class TestComputerAgentInitialization:
         """Test that agent can be initialized with a model string."""
         from agent import ComputerAgent
 
-        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
+        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
 
         assert agent is not None
         assert hasattr(agent, "model")
-        assert agent.model == "anthropic/claude-3-5-sonnet-20241022"
+        assert agent.model == "anthropic/claude-sonnet-4-5-20250929"
 
     @patch("agent.agent.litellm")
     def test_agent_initialization_with_tools(self, mock_litellm, disable_telemetry, mock_computer):
         """Test that agent can be initialized with tools."""
         from agent import ComputerAgent
 
-        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022", tools=[mock_computer])
+        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929", tools=[mock_computer])
 
         assert agent is not None
         assert hasattr(agent, "tools")
@@ -41,7 +41,7 @@ class TestComputerAgentInitialization:
 
         budget = 5.0
         agent = ComputerAgent(
-            model="anthropic/claude-3-5-sonnet-20241022", max_trajectory_budget=budget
+            model="anthropic/claude-sonnet-4-5-20250929", max_trajectory_budget=budget
         )
 
         assert agent is not None
@@ -79,7 +79,7 @@ class TestComputerAgentRun:
 
         mock_litellm.acompletion = AsyncMock(return_value=mock_response)
 
-        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
+        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
 
         # Run should return an async generator
         result_generator = agent.run(sample_messages)
@@ -92,7 +92,7 @@ class TestComputerAgentRun:
         """Test that agent has run method available."""
         from agent import ComputerAgent
 
-        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
+        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
 
         # Verify run method exists
         assert hasattr(agent, "run")
@@ -102,7 +102,7 @@ class TestComputerAgentRun:
         """Test that agent has agent_loop initialized."""
         from agent import ComputerAgent
 
-        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
+        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
 
         # Verify agent_loop is initialized
         assert hasattr(agent, "agent_loop")
@@ -132,7 +132,7 @@ class TestComputerAgentIntegration:
         """Test that agent can be initialized with Computer tool."""
         from agent import ComputerAgent
 
-        agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022", tools=[mock_computer])
+        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929", tools=[mock_computer])
 
         # Verify agent accepted the tool
         assert agent is not None