From 7cf9eee30194607d7dac1953454a0aab09d9fc54 Mon Sep 17 00:00:00 2001 From: Sarina Li Date: Wed, 19 Nov 2025 17:41:01 -0500 Subject: [PATCH] fix formatting again --- libs/python/agent/agent/loops/anthropic.py | 22 +-- libs/python/agent/agent/loops/uitars2.py | 151 ++++++++++++++++----- uv.lock | 6 +- 3 files changed, 128 insertions(+), 51 deletions(-) diff --git a/libs/python/agent/agent/loops/anthropic.py b/libs/python/agent/agent/loops/anthropic.py index 42e33b5d..0fa08b96 100644 --- a/libs/python/agent/agent/loops/anthropic.py +++ b/libs/python/agent/agent/loops/anthropic.py @@ -671,11 +671,12 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any] # Handle custom function tools (not computer tools) if tool_name != "computer": from ..responses import make_function_call_item - responses_items.append(make_function_call_item( - function_name=tool_name, - arguments=tool_input, - call_id=call_id - )) + + responses_items.append( + make_function_call_item( + function_name=tool_name, arguments=tool_input, call_id=call_id + ) + ) continue # Computer tool - process actions @@ -883,16 +884,17 @@ def _convert_completion_to_responses_items(response: Any) -> List[Dict[str, Any] # Handle custom function tools if tool_name != "computer": from ..responses import make_function_call_item + # tool_call.function.arguments is a JSON string, need to parse it try: args_dict = json.loads(tool_call.function.arguments) except json.JSONDecodeError: args_dict = {} - responses_items.append(make_function_call_item( - function_name=tool_name, - arguments=args_dict, - call_id=tool_call.id - )) + responses_items.append( + make_function_call_item( + function_name=tool_name, arguments=args_dict, call_id=tool_call.id + ) + ) continue # Handle computer tool diff --git a/libs/python/agent/agent/loops/uitars2.py b/libs/python/agent/agent/loops/uitars2.py index 5d46aced..4ecb3b04 100644 --- a/libs/python/agent/agent/loops/uitars2.py +++ b/libs/python/agent/agent/loops/uitars2.py @@ -5,13 +5,14 @@ UITARS-2 agent loop implementation using LiteLLM. - Calls litellm.acompletion - Parses ... outputs back into Responses items (computer actions) """ + from __future__ import annotations -import re -from typing import Any, Dict, List, Optional, Tuple import base64 import io import json +import re +from typing import Any, Dict, List, Optional, Tuple import litellm from litellm.responses.litellm_completion_transformation.transformation import ( @@ -20,37 +21,45 @@ from litellm.responses.litellm_completion_transformation.transformation import ( from ..decorators import register_agent from .omniparser import get_last_computer_call_output # type: ignore + try: from PIL import Image # type: ignore except Exception: # pragma: no cover Image = None # type: ignore from ..responses import ( + convert_responses_items_to_completion_messages, make_click_item, make_double_click_item, make_drag_item, make_function_call_item, make_keypress_item, - make_screenshot_item, make_move_item, make_output_text_item, make_reasoning_item, + make_screenshot_item, make_scroll_item, make_type_item, make_wait_item, - convert_responses_items_to_completion_messages, ) from ..types import AgentCapability - TOOL_SCHEMAS: List[Dict[str, Any]] = [ - {"type": "function", "name": "open_computer", "parameters": {}, "description": "Open computer."}, + { + "type": "function", + "name": "open_computer", + "parameters": {}, + "description": "Open computer.", + }, { "type": "function", "name": "click", "parameters": { "type": "object", "properties": { - "point": {"type": "string", "description": "Click coordinates. The format is: x y"} + "point": { + "type": "string", + "description": "Click coordinates. The format is: x y", + } }, "required": ["point"], }, @@ -62,7 +71,10 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [ "parameters": { "type": "object", "properties": { - "point": {"type": "string", "description": "Click coordinates. The format is: x y"} + "point": { + "type": "string", + "description": "Click coordinates. The format is: x y", + } }, "required": ["point"], }, @@ -74,7 +86,10 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [ "parameters": { "type": "object", "properties": { - "point": {"type": "string", "description": "Click coordinates. The format is: x y"} + "point": { + "type": "string", + "description": "Click coordinates. The format is: x y", + } }, "required": ["point"], }, @@ -106,7 +121,10 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [ "parameters": { "type": "object", "properties": { - "point": {"type": "string", "description": "Target coordinates. The format is: x y"} + "point": { + "type": "string", + "description": "Target coordinates. The format is: x y", + } }, "required": ["point"], }, @@ -117,7 +135,12 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [ "name": "hotkey", "parameters": { "type": "object", - "properties": {"key": {"type": "string", "description": "Hotkeys you want to press. Split keys with a space and use lowercase."}}, + "properties": { + "key": { + "type": "string", + "description": "Hotkeys you want to press. Split keys with a space and use lowercase.", + } + }, "required": ["key"], }, "description": "Press hotkey.", @@ -227,9 +250,7 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [ "name": "wait", "parameters": { "type": "object", - "properties": { - "time": {"type": "integer", "description": "Wait time in seconds."} - }, + "properties": {"time": {"type": "integer", "description": "Wait time in seconds."}}, "required": [], }, "description": "Wait for a while.", @@ -268,7 +289,12 @@ TOOL_SCHEMAS: List[Dict[str, Any]] = [ }, "description": "Type content.", }, - {"type": "function", "name": "take_screenshot", "parameters": {}, "description": "Take screenshot."}, + { + "type": "function", + "name": "take_screenshot", + "parameters": {}, + "description": "Take screenshot.", + }, ] @@ -319,7 +345,9 @@ _PROMPT_SUFFIX = ( SYSTEM_PROMPT = _PROMPT_PREFIX + _format_tool_schemas_json_lines(TOOL_SCHEMAS) + _PROMPT_SUFFIX -def _extract_function_schemas_from_tools(tools: Optional[List[Dict[str, Any]]]) -> List[Dict[str, Any]]: +def _extract_function_schemas_from_tools( + tools: Optional[List[Dict[str, Any]]], +) -> List[Dict[str, Any]]: schemas: List[Dict[str, Any]] = [] if not tools: return schemas @@ -330,12 +358,14 @@ def _extract_function_schemas_from_tools(tools: Optional[List[Dict[str, Any]]]) params = fn.get("parameters", {}) desc = fn.get("description", "") if name: - schemas.append({ - "type": "function", - "name": name, - "parameters": params if isinstance(params, dict) else {}, - "description": desc, - }) + schemas.append( + { + "type": "function", + "name": name, + "parameters": params if isinstance(params, dict) else {}, + "description": desc, + } + ) return schemas @@ -392,7 +422,9 @@ def _denormalize_xy_from_uitars(nx: float, ny: float, width: int, height: int) - return x, y -def _map_computer_action_to_function(action: Dict[str, Any], width: int, height: int) -> Optional[Dict[str, Any]]: +def _map_computer_action_to_function( + action: Dict[str, Any], width: int, height: int +) -> Optional[Dict[str, Any]]: """Map a computer action item to a UITARS function + parameters dict of strings. Returns dict like {"function": name, "parameters": {..}} or None if unknown. """ @@ -404,7 +436,10 @@ def _map_computer_action_to_function(action: Dict[str, Any], width: int, height: return None nx, ny = _normalize_xy_to_uitars(int(x), int(y), width, height) if btn == "right": - return {"function": "right_single", "parameters": {"point": f"{nx} {ny}"}} + return { + "function": "right_single", + "parameters": {"point": f"{nx} {ny}"}, + } return {"function": "click", "parameters": {"point": f"{nx} {ny}"}} if atype == "double_click": x, y = action.get("x"), action.get("y") @@ -434,8 +469,19 @@ def _map_computer_action_to_function(action: Dict[str, Any], width: int, height: nx, ny = _normalize_xy_to_uitars(int(x), int(y), width, height) sx, sy = action.get("scroll_x", 0), action.get("scroll_y", 0) # Our parser used positive sy for up - direction = "up" if sy and sy > 0 else ("down" if sy and sy < 0 else ("right" if sx and sx > 0 else ("left" if sx and sx < 0 else "down"))) - return {"function": "scroll", "parameters": {"direction": direction, "point": f"{nx} {ny}"}} + direction = ( + "up" + if sy and sy > 0 + else ( + "down" + if sy and sy < 0 + else ("right" if sx and sx > 0 else ("left" if sx and sx < 0 else "down")) + ) + ) + return { + "function": "scroll", + "parameters": {"direction": direction, "point": f"{nx} {ny}"}, + } if atype == "drag": path = action.get("path", []) if isinstance(path, list) and len(path) >= 2: @@ -461,7 +507,9 @@ def _map_computer_action_to_function(action: Dict[str, Any], width: int, height: return None -def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int) -> List[Dict[str, Any]]: +def _to_uitars_messages( + messages: List[Dict[str, Any]], width: int, height: int +) -> List[Dict[str, Any]]: """Convert responses items into completion messages tailored for UI-TARS. - User content is passed through similar to convert_responses_items_to_completion_messages @@ -505,7 +553,9 @@ def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int) completion_content = [] for item in content: if item.get("type") == "input_image": - completion_content.append({"type": "image_url", "image_url": {"url": item.get("image_url")}}) + completion_content.append( + {"type": "image_url", "image_url": {"url": item.get("image_url")}} + ) elif item.get("type") in ("input_text", "text"): completion_content.append({"type": "text", "text": item.get("text")}) uitars_messages.append({"role": "user", "content": completion_content}) @@ -517,7 +567,11 @@ def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int) if mtype == "reasoning": # Responses reasoning stores summary list summary = msg.get("summary", []) - texts = [s.get("text", "") for s in summary if isinstance(s, dict) and s.get("type") == "summary_text"] + texts = [ + s.get("text", "") + for s in summary + if isinstance(s, dict) and s.get("type") == "summary_text" + ] if texts: pending_think = "\n".join([t for t in texts if t]) continue @@ -546,9 +600,15 @@ def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int) pending_think, pending_functions = None, [] content = msg.get("content", []) if isinstance(content, list): - texts = [c.get("text", "") for c in content if isinstance(c, dict) and c.get("type") in ("output_text", "text")] + texts = [ + c.get("text", "") + for c in content + if isinstance(c, dict) and c.get("type") in ("output_text", "text") + ] if texts: - uitars_messages.append({"role": "assistant", "content": "\n".join([t for t in texts if t])}) + uitars_messages.append( + {"role": "assistant", "content": "\n".join([t for t in texts if t])} + ) elif isinstance(content, str) and content: uitars_messages.append({"role": "assistant", "content": content}) continue @@ -581,8 +641,12 @@ def _to_uitars_messages(messages: List[Dict[str, Any]], width: int, height: int) return uitars_messages + def _to_response_items( - actions: List[Dict[str, Any]], tool_names: Optional[set[str]] = None, width: Optional[int] = None, height: Optional[int] = None + actions: List[Dict[str, Any]], + tool_names: Optional[set[str]] = None, + width: Optional[int] = None, + height: Optional[int] = None, ) -> List[Any]: """Map parsed actions into Responses items (computer actions + optional reasoning).""" items: List[Any] = [] @@ -736,8 +800,12 @@ class UITARS2Config: # Build dynamic system prompt by concatenating built-in schemas and provided function tools provided_fn_schemas = _extract_function_schemas_from_tools(tools) - combined_schemas = TOOL_SCHEMAS + provided_fn_schemas if provided_fn_schemas else TOOL_SCHEMAS - dynamic_system_prompt = _PROMPT_PREFIX + _format_tool_schemas_json_lines(combined_schemas) + _PROMPT_SUFFIX + combined_schemas = ( + TOOL_SCHEMAS + provided_fn_schemas if provided_fn_schemas else TOOL_SCHEMAS + ) + dynamic_system_prompt = ( + _PROMPT_PREFIX + _format_tool_schemas_json_lines(combined_schemas) + _PROMPT_SUFFIX + ) # Prepend system prompt (based on training prompts + provided tools) litellm_messages: List[Dict[str, Any]] = [ @@ -829,7 +897,10 @@ class UITARS2Config: "role": "user", "content": [ {"type": "text", "text": "Please return a single click action."}, - {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}}, + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{image_b64}"}, + }, ], }, ] @@ -841,7 +912,9 @@ class UITARS2Config: "temperature": kwargs.get("temperature", 0.0), "do_sample": kwargs.get("temperature", 0.0) > 0.0, } - api_kwargs.update({k: v for k, v in (kwargs or {}).items() if k not in ["max_tokens", "temperature"]}) + api_kwargs.update( + {k: v for k, v in (kwargs or {}).items() if k not in ["max_tokens", "temperature"]} + ) response = await litellm.acompletion(**api_kwargs) # Extract response content @@ -852,7 +925,11 @@ class UITARS2Config: msg = choices[0].get("message", {}) content_text = msg.get("content", "") if isinstance(content_text, list): - text_parts = [p.get("text", "") for p in content_text if isinstance(p, dict) and p.get("type") == "text"] + text_parts = [ + p.get("text", "") + for p in content_text + if isinstance(p, dict) and p.get("type") == "text" + ] content_text = "\n".join([t for t in text_parts if t]) if not isinstance(content_text, str): return None diff --git a/uv.lock b/uv.lock index 67698779..0e26ddcc 100644 --- a/uv.lock +++ b/uv.lock @@ -861,7 +861,7 @@ wheels = [ [[package]] name = "cua-agent" -version = "0.4.39" +version = "0.4.53" source = { editable = "libs/python/agent" } dependencies = [ { name = "aiohttp" }, @@ -885,7 +885,6 @@ all = [ { name = "einops" }, { name = "google-genai" }, { name = "gradio" }, - { name = "hud-python" }, { name = "mlx-vlm", marker = "sys_platform == 'darwin'" }, { name = "pillow" }, { name = "python-dotenv" }, @@ -975,7 +974,6 @@ requires-dist = [ { name = "gradio", marker = "extra == 'all'", specifier = ">=5.23.3" }, { name = "gradio", marker = "extra == 'ui'", specifier = ">=5.23.3" }, { name = "httpx", specifier = ">=0.27.0" }, - { name = "hud-python", marker = "extra == 'all'", specifier = "==0.4.52" }, { name = "hud-python", marker = "extra == 'hud'", specifier = "==0.4.52" }, { name = "litellm", specifier = ">=1.74.12" }, { name = "mlx-vlm", marker = "sys_platform == 'darwin' and extra == 'all'", specifier = ">=0.1.27" }, @@ -1015,7 +1013,7 @@ provides-extras = ["openai", "anthropic", "qwen", "omni", "uitars", "uitars-mlx" [[package]] name = "cua-computer" -version = "0.4.12" +version = "0.4.17" source = { editable = "libs/python/computer" } dependencies = [ { name = "aiohttp" },