Merge branch 'main' into feat/generic-vlm-provider

Sarina Li
2025-11-26 11:09:11 -05:00
72 changed files with 1341 additions and 294 deletions

View File

@@ -51,7 +51,7 @@ async def main():
# Create agent
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
model="anthropic/claude-sonnet-4-5-20250929",
tools=[computer],
only_n_most_recent_images=3,
trajectory_dir="trajectories",

View File

@@ -189,7 +189,7 @@ class ComputerAgent:
Initialize ComputerAgent.
Args:
model: Model name (e.g., "claude-3-5-sonnet-20241022", "computer-use-preview", "omni+vertex_ai/gemini-pro")
model: Model name (e.g., "claude-sonnet-4-5-20250929", "computer-use-preview", "omni+vertex_ai/gemini-pro")
tools: List of tools (computer objects, decorated functions, etc.)
custom_loop: Custom agent loop function to use instead of auto-selection
only_n_most_recent_images: If set, only keep the N most recent images in message history. Adds ImageRetentionCallback automatically.
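
Putting this docstring together with the first hunk of the commit, a minimal construction under the updated default model looks like the sketch below (`computer` stands in for a computer tool object constructed elsewhere, as in the commit's own example):

from agent import ComputerAgent

# Minimal sketch based on the docstring above; `computer` is assumed to be
# a computer tool object created earlier in the program.
agent = ComputerAgent(
    model="anthropic/claude-sonnet-4-5-20250929",  # updated default in this commit
    tools=[computer],
    only_n_most_recent_images=3,  # adds ImageRetentionCallback automatically
    trajectory_dir="trajectories",
)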

View File

@@ -7,7 +7,7 @@ Usage:
Examples:
python -m agent.cli openai/computer-use-preview
python -m agent.cli anthropic/claude-sonnet-4-5-20250929
python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
python -m agent.cli omniparser+anthropic/claude-sonnet-4-5-20250929
"""
try:
@@ -233,7 +233,7 @@ async def main():
Examples:
python -m agent.cli openai/computer-use-preview
python -m agent.cli anthropic/claude-sonnet-4-5-20250929
python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
python -m agent.cli omniparser+anthropic/claude-sonnet-4-5-20250929
python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
""",
)

View File

@@ -671,11 +671,12 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
# Handle custom function tools (not computer tools)
if tool_name != "computer":
from ..responses import make_function_call_item
responses_items.append(make_function_call_item(
function_name=tool_name,
arguments=tool_input,
call_id=call_id
))
responses_items.append(
make_function_call_item(
function_name=tool_name, arguments=tool_input, call_id=call_id
)
)
continue
# Computer tool - process actions
@@ -883,16 +884,17 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any]
# Handle custom function tools
if tool_name != "computer":
from ..responses import make_function_call_item
# tool_call.function.arguments is a JSON string, need to parse it
try:
args_dict = json.loads(tool_call.function.arguments)
except json.JSONDecodeError:
args_dict = {}
responses_items.append(make_function_call_item(
function_name=tool_name,
arguments=args_dict,
call_id=tool_call.id
))
responses_items.append(
make_function_call_item(
function_name=tool_name, arguments=args_dict, call_id=tool_call.id
)
)
continue
# Handle computer tool
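
Both hunks route non-computer tools through make_function_call_item. Its implementation is not shown in this diff; based on the call sites, a Responses-style "function_call" item is presumably produced, along these lines (the output shape is an assumption, not the repo's code):

import json

def make_function_call_item(function_name, arguments, call_id):
    # Hypothetical sketch of the helper imported from ..responses above.
    # The keyword arguments match the call sites; the returned shape is
    # assumed from the OpenAI Responses "function_call" item format.
    return {
        "type": "function_call",
        "call_id": call_id,
        "name": function_name,
        "arguments": json.dumps(arguments),  # Responses items carry arguments as a JSON string
    }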

View File

@@ -20,6 +20,7 @@ from ..loops.base import AsyncAgentConfig
from ..responses import (
convert_completion_messages_to_responses_items,
convert_responses_items_to_completion_messages,
make_reasoning_item,
)
from ..types import AgentCapability
@@ -373,13 +374,23 @@ class GenericVlmConfig(AsyncAgentConfig):
if _on_usage:
await _on_usage(usage)
# Parse tool call from text; then convert to responses items via fake tool_calls
# Extract response data
resp_dict = response.model_dump() # type: ignore
choice = (resp_dict.get("choices") or [{}])[0]
content_text = ((choice.get("message") or {}).get("content")) or ""
tool_call = _parse_tool_call_from_text(content_text)
message = choice.get("message") or {}
content_text = message.get("content") or ""
tool_calls_array = message.get("tool_calls") or []
reasoning_text = message.get("reasoning") or ""
output_items: List[Dict[str, Any]] = []
# Add reasoning if present (Ollama Cloud format)
if reasoning_text:
output_items.append(make_reasoning_item(reasoning_text))
# Priority 1: Try to parse tool call from content text (OpenRouter format)
tool_call = _parse_tool_call_from_text(content_text)
if tool_call and isinstance(tool_call, dict):
fn_name = tool_call.get("name") or "computer"
raw_args = tool_call.get("arguments") or {}
@@ -405,8 +416,50 @@ class GenericVlmConfig(AsyncAgentConfig):
],
}
output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))
elif tool_calls_array:
# Priority 2: Use tool_calls field if present (Ollama Cloud format)
# Process and unnormalize coordinates in tool calls
processed_tool_calls = []
for tc in tool_calls_array:
function = tc.get("function", {})
fn_name = function.get("name", "computer")
args_str = function.get("arguments", "{}")
try:
args = json.loads(args_str)
# Unnormalize coordinates if present
if "coordinate" in args and last_rw is not None and last_rh is not None:
args = await _unnormalize_coordinate(args, (last_rw, last_rh))
# Convert Qwen format to Computer Calls format if this is a computer tool
if fn_name == "computer":
converted_action = convert_qwen_tool_args_to_computer_action(args)
if converted_action:
args = converted_action
processed_tool_calls.append(
{
"type": tc.get("type", "function"),
"id": tc.get("id", "call_0"),
"function": {
"name": fn_name,
"arguments": json.dumps(args),
},
}
)
except json.JSONDecodeError:
# Keep original if parsing fails
processed_tool_calls.append(tc)
fake_cm = {
"role": "assistant",
"content": content_text if content_text else "",
"tool_calls": processed_tool_calls,
}
output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))
else:
# Fallback: just return assistant text
# No tool calls found in either format, return text response
fake_cm = {"role": "assistant", "content": content_text}
output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))
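
The Priority 1 branch depends on _parse_tool_call_from_text, which is outside this hunk. A rough sketch of what such a helper could do (a guess under stated assumptions, not the repo's implementation) is to pull a JSON object out of the model text and accept it if it looks like a {"name": ..., "arguments": ...} tool call:

import json
import re
from typing import Any, Dict, Optional

def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
    # Hypothetical sketch: grab the outermost JSON object in the model output
    # and treat it as a tool call if it has a "name" key.
    match = re.search(r"\{.*\}", text, re.DOTALL)
    if not match:
        return None
    try:
        candidate = json.loads(match.group(0))
    except json.JSONDecodeError:
        return None
    if isinstance(candidate, dict) and "name" in candidate:
        return candidate
    return None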

View File

@@ -365,6 +365,22 @@ class OmniparserConfig(AsyncAgentConfig):
**kwargs,
}
# Add Vertex AI specific parameters if using vertex_ai models
if llm_model.startswith("vertex_ai/"):
import os
# Pass vertex_project and vertex_location to LiteLLM
if "vertex_project" not in api_kwargs:
api_kwargs["vertex_project"] = os.getenv("GOOGLE_CLOUD_PROJECT")
if "vertex_location" not in api_kwargs:
api_kwargs["vertex_location"] = "global"
# Pass through Gemini 3-specific parameters if provided
if "thinking_level" in kwargs:
api_kwargs["thinking_level"] = kwargs["thinking_level"]
if "media_resolution" in kwargs:
api_kwargs["media_resolution"] = kwargs["media_resolution"]
# Call API start hook
if _on_api_start:
await _on_api_start(api_kwargs)
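
A condensed, runnable restatement of the branch above, showing the end state of api_kwargs for a vertex_ai model when no explicit overrides are passed (the project id is illustrative; vertex_project and vertex_location are real LiteLLM completion parameters):

import os

os.environ.setdefault("GOOGLE_CLOUD_PROJECT", "my-project")  # hypothetical id

api_kwargs = {"model": "vertex_ai/gemini-pro", "messages": []}
if api_kwargs["model"].startswith("vertex_ai/"):
    # Same defaults as the hunk above: env-derived project, "global" location.
    api_kwargs.setdefault("vertex_project", os.getenv("GOOGLE_CLOUD_PROJECT"))
    api_kwargs.setdefault("vertex_location", "global")

print(api_kwargs)
# {'model': 'vertex_ai/gemini-pro', 'messages': [], 'vertex_project': 'my-project', 'vertex_location': 'global'}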

View File

@@ -5,13 +5,14 @@ UITARS-2 agent loop implementation using LiteLLM.
- Calls litellm.acompletion
- Parses <seed:tool_call> ... </seed:tool_call> outputs back into Responses items (computer actions)
"""
from __future__ import annotations
import re
from typing import Any, Dict, List, Optional, Tuple
import base64
import io
import json
import re
from typing import Any, Dict, List, Optional, Tuple
import litellm
from litellm.responses.litellm_completion_transformation.transformation import (
@@ -20,37 +21,45 @@ from litellm.responses.litellm_completion_transformation.transformation import (
from ..decorators import register_agent
from .omniparser import get_last_computer_call_output # type: ignore
try:
from PIL import Image # type: ignore
except Exception: # pragma: no cover
Image = None # type: ignore
from ..responses import (
convert_responses_items_to_completion_messages,
make_click_item,
make_double_click_item,
make_drag_item,
make_function_call_item,
make_keypress_item,
make_screenshot_item,
make_move_item,
make_output_text_item,
make_reasoning_item,
make_screenshot_item,
make_scroll_item,
make_type_item,
make_wait_item,
convert_responses_items_to_completion_messages,
)
from ..types import AgentCapability
TOOL_SCHEMAS: List[Dict[str, Any]] = [
{"type": "function", "name": "open_computer", "parameters": {}, "description": "Open computer."},
{
"type": "function",
"name": "open_computer",
"parameters": {},
"description": "Open computer.",
},
{
"type": "function",
"name": "click",
"parameters": {
"type": "object",
"properties": {
"point": {"type": "string", "description": "Click coordinates. The format is: <point>x y</point>"}
"point": {
"type": "string",
"description": "Click coordinates. The format is: <point>x y</point>",
}
},
"required": ["point"],
},
@@ -62,7 +71,10 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
"parameters": {
"type": "object",
"properties": {
"point": {"type": "string", "description": "Click coordinates. The format is: <point>x y</point>"}
"point": {
"type": "string",
"description": "Click coordinates. The format is: <point>x y</point>",
}
},
"required": ["point"],
},
@@ -74,7 +86,10 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
"parameters": {
"type": "object",
"properties": {
"point": {"type": "string", "description": "Click coordinates. The format is: <point>x y</point>"}
"point": {
"type": "string",
"description": "Click coordinates. The format is: <point>x y</point>",
}
},
"required": ["point"],
},
@@ -106,7 +121,10 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
"parameters": {
"type": "object",
"properties": {
"point": {"type": "string", "description": "Target coordinates. The format is: <point>x y</point>"}
"point": {
"type": "string",
"description": "Target coordinates. The format is: <point>x y</point>",
}
},
"required": ["point"],
},
@@ -117,7 +135,12 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
"name": "hotkey",
"parameters": {
"type": "object",
"properties": {"key": {"type": "string", "description": "Hotkeys you want to press. Split keys with a space and use lowercase."}},
"properties": {
"key": {
"type": "string",
"description": "Hotkeys you want to press. Split keys with a space and use lowercase.",
}
},
"required": ["key"],
},
"description": "Press hotkey.",
@@ -227,9 +250,7 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
"name": "wait",
"parameters": {
"type": "object",
"properties": {
"time": {"type": "integer", "description": "Wait time in seconds."}
},
"properties": {"time": {"type": "integer", "description": "Wait time in seconds."}},
"required": [],
},
"description": "Wait for a while.",
@@ -268,7 +289,12 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [
},
"description": "Type content.",
},
{"type": "function", "name": "take_screenshot", "parameters": {}, "description": "Take screenshot."},
{
"type": "function",
"name": "take_screenshot",
"parameters": {},
"description": "Take screenshot.",
},
]
@@ -319,7 +345,9 @@ _PROMPT_SUFFIX = (
SYSTEM_PROMPT = _PROMPT_PREFIX + _format_tool_schemas_json_lines(TOOL_SCHEMAS) + _PROMPT_SUFFIX
def _extract_function_schemas_from_tools(tools: Optional[List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
def _extract_function_schemas_from_tools(
tools: Optional[List[Dict[str, Any]]],
) -> List[Dict[str, Any]]:
schemas: List[Dict[str, Any]] = []
if not tools:
return schemas
@@ -330,12 +358,14 @@ def _extract_function_schemas_from_tools(tools: Optional[List[Dict[str, Any]]])
params = fn.get("parameters", {})
desc = fn.get("description", "")
if name:
schemas.append({
"type": "function",
"name": name,
"parameters": params if isinstance(params, dict) else {},
"description": desc,
})
schemas.append(
{
"type": "function",
"name": name,
"parameters": params if isinstance(params, dict) else {},
"description": desc,
}
)
return schemas
@@ -392,7 +422,9 @@ def _denormalize_xy_from_uitars(nx: float, ny: float, width: int, height: int) -
return x, y
def _map_computer_action_to_function(action: Dict[str, Any], width: int, height: int) -> Optional[Dict[str, Any]]:
def _map_computer_action_to_function(
action: Dict[str, Any], width: int, height: int
) -> Optional[Dict[str, Any]]:
"""Map a computer action item to a UITARS function + parameters dict of strings.
Returns dict like {"function": name, "parameters": {..}} or None if unknown.
"""
@@ -404,7 +436,10 @@ def _map_computer_action_to_function(action: Dict[str, Any], width: int, height:
return None
nx, ny = _normalize_xy_to_uitars(int(x), int(y), width, height)
if btn == "right":
return {"function": "right_single", "parameters": {"point": f"<point>{nx} {ny}</point>"}}
return {
"function": "right_single",
"parameters": {"point": f"<point>{nx} {ny}</point>"},
}
return {"function": "click", "parameters": {"point": f"<point>{nx} {ny}</point>"}}
if atype == "double_click":
x, y = action.get("x"), action.get("y")
@@ -434,8 +469,19 @@ def _map_computer_action_to_function(action: Dict[str, Any], width: int, height:
nx, ny = _normalize_xy_to_uitars(int(x), int(y), width, height)
sx, sy = action.get("scroll_x", 0), action.get("scroll_y", 0)
# Our parser used positive sy for up
direction = "up" if sy and sy > 0 else ("down" if sy and sy < 0 else ("right" if sx and sx > 0 else ("left" if sx and sx < 0 else "down")))
return {"function": "scroll", "parameters": {"direction": direction, "point": f"<point>{nx} {ny}</point>"}}
direction = (
"up"
if sy and sy > 0
else (
"down"
if sy and sy < 0
else ("right" if sx and sx > 0 else ("left" if sx and sx < 0 else "down"))
)
)
return {
"function": "scroll",
"parameters": {"direction": direction, "point": f"<point>{nx} {ny}</point>"},
}
if atype == "drag":
path = action.get("path", [])
if isinstance(path, list) and len(path) >= 2:
@@ -461,7 +507,9 @@ def _map_computer_action_to_function(action: Dict[str, Any], width: int, height:
return None
def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int) -> List[Dict[str, Any]]:
def _to_uitars_messages(
messages: List[Dict[str, Any]], width: int, height: int
) -> List[Dict[str, Any]]:
"""Convert responses items into completion messages tailored for UI-TARS.
- User content is passed through similar to convert_responses_items_to_completion_messages
@@ -505,7 +553,9 @@ def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int)
completion_content = []
for item in content:
if item.get("type") == "input_image":
completion_content.append({"type": "image_url", "image_url": {"url": item.get("image_url")}})
completion_content.append(
{"type": "image_url", "image_url": {"url": item.get("image_url")}}
)
elif item.get("type") in ("input_text", "text"):
completion_content.append({"type": "text", "text": item.get("text")})
uitars_messages.append({"role": "user", "content": completion_content})
@@ -517,7 +567,11 @@ def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int)
if mtype == "reasoning":
# Responses reasoning stores summary list
summary = msg.get("summary", [])
texts = [s.get("text", "") for s in summary if isinstance(s, dict) and s.get("type") == "summary_text"]
texts = [
s.get("text", "")
for s in summary
if isinstance(s, dict) and s.get("type") == "summary_text"
]
if texts:
pending_think = "\n".join([t for t in texts if t])
continue
@@ -546,9 +600,15 @@ def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int)
pending_think, pending_functions = None, []
content = msg.get("content", [])
if isinstance(content, list):
texts = [c.get("text", "") for c in content if isinstance(c, dict) and c.get("type") in ("output_text", "text")]
texts = [
c.get("text", "")
for c in content
if isinstance(c, dict) and c.get("type") in ("output_text", "text")
]
if texts:
uitars_messages.append({"role": "assistant", "content": "\n".join([t for t in texts if t])})
uitars_messages.append(
{"role": "assistant", "content": "\n".join([t for t in texts if t])}
)
elif isinstance(content, str) and content:
uitars_messages.append({"role": "assistant", "content": content})
continue
@@ -581,8 +641,12 @@ def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int)
return uitars_messages
def _to_response_items(
actions: List[Dict[str, Any]], tool_names: Optional[set[str]] = None, width: Optional[int] = None, height: Optional[int] = None
actions: List[Dict[str, Any]],
tool_names: Optional[set[str]] = None,
width: Optional[int] = None,
height: Optional[int] = None,
) -> List[Any]:
"""Map parsed actions into Responses items (computer actions + optional reasoning)."""
items: List[Any] = []
@@ -736,8 +800,12 @@ class UITARS2Config:
# Build dynamic system prompt by concatenating built-in schemas and provided function tools
provided_fn_schemas = _extract_function_schemas_from_tools(tools)
combined_schemas = TOOL_SCHEMAS + provided_fn_schemas if provided_fn_schemas else TOOL_SCHEMAS
dynamic_system_prompt = _PROMPT_PREFIX + _format_tool_schemas_json_lines(combined_schemas) + _PROMPT_SUFFIX
combined_schemas = (
TOOL_SCHEMAS + provided_fn_schemas if provided_fn_schemas else TOOL_SCHEMAS
)
dynamic_system_prompt = (
_PROMPT_PREFIX + _format_tool_schemas_json_lines(combined_schemas) + _PROMPT_SUFFIX
)
# Prepend system prompt (based on training prompts + provided tools)
litellm_messages: List[Dict[str, Any]] = [
@@ -829,7 +897,10 @@ class UITARS2Config:
"role": "user",
"content": [
{"type": "text", "text": "Please return a single click action."},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{image_b64}"},
},
],
},
]
@@ -841,7 +912,9 @@ class UITARS2Config:
"temperature": kwargs.get("temperature", 0.0),
"do_sample": kwargs.get("temperature", 0.0) > 0.0,
}
api_kwargs.update({k: v for k, v in (kwargs or {}).items() if k not in ["max_tokens", "temperature"]})
api_kwargs.update(
{k: v for k, v in (kwargs or {}).items() if k not in ["max_tokens", "temperature"]}
)
response = await litellm.acompletion(**api_kwargs)
# Extract response content
@@ -852,7 +925,11 @@ class UITARS2Config:
msg = choices[0].get("message", {})
content_text = msg.get("content", "")
if isinstance(content_text, list):
text_parts = [p.get("text", "") for p in content_text if isinstance(p, dict) and p.get("type") == "text"]
text_parts = [
p.get("text", "")
for p in content_text
if isinstance(p, dict) and p.get("type") == "text"
]
content_text = "\n".join([t for t in text_parts if t])
if not isinstance(content_text, str):
return None
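
Throughout these schemas, points are exchanged as <point>x y</point> in a model-native normalized space, and the hunks call _normalize_xy_to_uitars / _denormalize_xy_from_uitars, which are not shown in this diff. A sketch under the common assumption of a 0-1000 integer grid (the real helpers may differ):

from typing import Tuple

def _normalize_xy_to_uitars(x: int, y: int, width: int, height: int) -> Tuple[int, int]:
    # Assumption: UI-TARS points live on a 0-1000 grid independent of the
    # actual screenshot size.
    return round(x * 1000 / width), round(y * 1000 / height)

def _denormalize_xy_from_uitars(nx: float, ny: float, width: int, height: int) -> Tuple[int, int]:
    # Inverse mapping from the normalized grid back to pixel coordinates.
    return round(nx * width / 1000), round(ny * height / 1000)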

View File

@@ -22,14 +22,14 @@ async def test_http_endpoint():
# Example 1: Simple text request
simple_request = {
"model": "anthropic/claude-3-5-sonnet-20241022",
"model": "anthropic/claude-sonnet-4-5-20250929",
"input": "Tell me a three sentence bedtime story about a unicorn.",
"env": {"ANTHROPIC_API_KEY": anthropic_api_key},
}
# Example 2: Multi-modal request with image
multimodal_request = {
"model": "anthropic/claude-3-5-sonnet-20241022",
"model": "anthropic/claude-sonnet-4-5-20250929",
"input": [
{
"role": "user",
@@ -47,7 +47,7 @@ async def test_http_endpoint():
# Example 3: Request with custom agent and computer kwargs
custom_request = {
"model": "anthropic/claude-3-5-sonnet-20241022",
"model": "anthropic/claude-sonnet-4-5-20250929",
"input": "Take a screenshot and tell me what you see",
"env": {"ANTHROPIC_API_KEY": anthropic_api_key},
}
@@ -95,7 +95,7 @@ def curl_examples():
"""curl http://localhost:8000/responses \\
-H "Content-Type: application/json" \\
-d '{
"model": "anthropic/claude-3-5-sonnet-20241022",
"model": "anthropic/claude-sonnet-4-5-20250929",
"input": "Tell me a three sentence bedtime story about a unicorn."
}'"""
)
@@ -105,7 +105,7 @@ def curl_examples():
"""curl http://localhost:8000/responses \\
-H "Content-Type: application/json" \\
-d '{
"model": "anthropic/claude-3-5-sonnet-20241022",
"model": "anthropic/claude-sonnet-4-5-20250929",
"input": [
{
"role": "user",
@@ -126,7 +126,7 @@ def curl_examples():
"""curl http://localhost:8000/responses \\
-H "Content-Type: application/json" \\
-d '{
"model": "anthropic/claude-3-5-sonnet-20241022",
"model": "anthropic/claude-sonnet-4-5-20250929",
"input": "Take a screenshot and tell me what you see",
"agent_kwargs": {
"save_trajectory": true,
@@ -166,7 +166,7 @@ async def test_p2p_client():
# Send a test request
request = {
"model": "anthropic/claude-3-5-sonnet-20241022",
"model": "anthropic/claude-sonnet-4-5-20250929",
"input": "Hello from P2P client!",
}
await connection.send(json.dumps(request))

View File

@@ -6,9 +6,9 @@ with an advanced UI for model selection and configuration.
Supported Agent Models:
- OpenAI: openai/computer-use-preview
- Anthropic: anthropic/claude-3-5-sonnet-20241022, anthropic/claude-3-7-sonnet-20250219
- Anthropic: anthropic/claude-sonnet-4-5-20250929, anthropic/claude-3-7-sonnet-20250219
- UI-TARS: huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
- Omniparser: omniparser+anthropic/claude-3-5-sonnet-20241022, omniparser+ollama_chat/gemma3
- Omniparser: omniparser+anthropic/claude-sonnet-4-5-20250929, omniparser+ollama_chat/gemma3
Requirements:
- Mac with Apple Silicon (M1/M2/M3/M4), Linux, or Windows
@@ -116,14 +116,12 @@ MODEL_MAPPINGS = {
"Anthropic: Claude 4 Opus (20250514)": "anthropic/claude-opus-4-20250514",
"Anthropic: Claude 4 Sonnet (20250514)": "anthropic/claude-sonnet-4-20250514",
"Anthropic: Claude 3.7 Sonnet (20250219)": "anthropic/claude-3-7-sonnet-20250219",
"Anthropic: Claude 3.5 Sonnet (20241022)": "anthropic/claude-3-5-sonnet-20241022",
},
"omni": {
"default": "omniparser+openai/gpt-4o",
"OMNI: OpenAI GPT-4o": "omniparser+openai/gpt-4o",
"OMNI: OpenAI GPT-4o mini": "omniparser+openai/gpt-4o-mini",
"OMNI: Claude 3.7 Sonnet (20250219)": "omniparser+anthropic/claude-3-7-sonnet-20250219",
"OMNI: Claude 3.5 Sonnet (20241022)": "omniparser+anthropic/claude-3-5-sonnet-20241022",
},
"uitars": {
"default": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B" if is_mac else "ui-tars",

View File

@@ -44,13 +44,11 @@ def create_gradio_ui() -> gr.Blocks:
"Anthropic: Claude 4 Opus (20250514)",
"Anthropic: Claude 4 Sonnet (20250514)",
"Anthropic: Claude 3.7 Sonnet (20250219)",
"Anthropic: Claude 3.5 Sonnet (20241022)",
]
omni_models = [
"OMNI: OpenAI GPT-4o",
"OMNI: OpenAI GPT-4o mini",
"OMNI: Claude 3.7 Sonnet (20250219)",
"OMNI: Claude 3.5 Sonnet (20241022)",
]
# Check if API keys are available

View File

@@ -102,7 +102,7 @@ async def main():
# model="anthropic/claude-opus-4-20250514",
# model="anthropic/claude-sonnet-4-20250514",
# model="anthropic/claude-3-7-sonnet-20250219",
# model="anthropic/claude-3-5-sonnet-20241022",
# model="anthropic/claude-sonnet-4-5-20250929",
# == UI-TARS ==
# model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
# TODO: add local mlx provider

View File

@@ -24,7 +24,7 @@ def mock_litellm():
"id": "chatcmpl-test123",
"object": "chat.completion",
"created": 1234567890,
"model": kwargs.get("model", "anthropic/claude-3-5-sonnet-20241022"),
"model": kwargs.get("model", "anthropic/claude-sonnet-4-5-20250929"),
"choices": [
{
"index": 0,

View File

@@ -18,18 +18,18 @@ class TestComputerAgentInitialization:
"""Test that agent can be initialized with a model string."""
from agent import ComputerAgent
agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
assert agent is not None
assert hasattr(agent, "model")
assert agent.model == "anthropic/claude-3-5-sonnet-20241022"
assert agent.model == "anthropic/claude-sonnet-4-5-20250929"
@patch("agent.agent.litellm")
def test_agent_initialization_with_tools(self, mock_litellm, disable_telemetry, mock_computer):
"""Test that agent can be initialized with tools."""
from agent import ComputerAgent
agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022", tools=[mock_computer])
agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929", tools=[mock_computer])
assert agent is not None
assert hasattr(agent, "tools")
@@ -41,7 +41,7 @@ class TestComputerAgentInitialization:
budget = 5.0
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022", max_trajectory_budget=budget
model="anthropic/claude-sonnet-4-5-20250929", max_trajectory_budget=budget
)
assert agent is not None
@@ -79,7 +79,7 @@ class TestComputerAgentRun:
mock_litellm.acompletion = AsyncMock(return_value=mock_response)
agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
# Run should return an async generator
result_generator = agent.run(sample_messages)
@@ -92,7 +92,7 @@ class TestComputerAgentRun:
"""Test that agent has run method available."""
from agent import ComputerAgent
agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
# Verify run method exists
assert hasattr(agent, "run")
@@ -102,7 +102,7 @@ class TestComputerAgentRun:
"""Test that agent has agent_loop initialized."""
from agent import ComputerAgent
agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022")
agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
# Verify agent_loop is initialized
assert hasattr(agent, "agent_loop")
@@ -132,7 +132,7 @@ class TestComputerAgentIntegration:
"""Test that agent can be initialized with Computer tool."""
from agent import ComputerAgent
agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022", tools=[mock_computer])
agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929", tools=[mock_computer])
# Verify agent accepted the tool
assert agent is not None