diff --git a/libs/python/agent/agent/loops/__init__.py b/libs/python/agent/agent/loops/__init__.py
index 3ef599a4..ab23ac27 100644
--- a/libs/python/agent/agent/loops/__init__.py
+++ b/libs/python/agent/agent/loops/__init__.py
@@ -15,8 +15,8 @@ from . import (
     omniparser,
     openai,
     opencua,
-    uitars,
     qwen,
+    uitars,
 )
 
 __all__ = [
diff --git a/libs/python/agent/agent/loops/qwen.py b/libs/python/agent/agent/loops/qwen.py
index 151376d4..f21fba2c 100644
--- a/libs/python/agent/agent/loops/qwen.py
+++ b/libs/python/agent/agent/loops/qwen.py
@@ -3,12 +3,13 @@ Qwen3-VL agent loop implementation using litellm with function/tool calling.
 
 - Passes a ComputerUse tool schema to acompletion
 - Converts between Responses items and completion messages using helpers
 """
-from __future__ import annotations
-from typing import Any, Dict, List, Optional, Tuple
+from __future__ import annotations
 
 import json
 import re
+from typing import Any, Dict, List, Optional, Tuple
+
 import litellm
 from litellm.responses.litellm_completion_transformation.transformation import (
     LiteLLMCompletionResponsesConfig,
@@ -16,12 +17,11 @@ from litellm.responses.litellm_completion_transformation.transformation import (
 
 from ..decorators import register_agent
 from ..loops.base import AsyncAgentConfig
-from ..types import AgentCapability
 from ..responses import (
-    convert_responses_items_to_completion_messages,
     convert_completion_messages_to_responses_items,
+    convert_responses_items_to_completion_messages,
 )
-
+from ..types import AgentCapability
 
 # ComputerUse tool schema (OpenAI function tool format)
 QWEN3_COMPUTER_TOOL: Dict[str, Any] = {
@@ -96,18 +96,29 @@ QWEN3_COMPUTER_TOOL: Dict[str, Any] = {
     },
 }
 
+
 def _build_nous_system(functions: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
     """Use qwen-agent NousFnCallPrompt to generate a system message embedding tool schema."""
     try:
         from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
-            NousFnCallPrompt,
-            Message as NousMessage,
             ContentItem as NousContentItem,
         )
+        from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
+            Message as NousMessage,
+        )
+        from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
+            NousFnCallPrompt,
+        )
     except ImportError:
-        raise ImportError("qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`.")
+        raise ImportError(
+            "qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`."
+        )
     msgs = NousFnCallPrompt().preprocess_fncall_messages(
-        messages=[NousMessage(role="system", content=[NousContentItem(text="You are a helpful assistant.")])],
+        messages=[
+            NousMessage(
+                role="system", content=[NousContentItem(text="You are a helpful assistant.")]
+            )
+        ],
         functions=functions,
         lang="en",
     )
@@ -116,6 +127,7 @@ def _build_nous_system(functions: List[Dict[str, Any]]) -> Optional[Dict[str, An
         content = [{"type": "text", "text": c["text"]} for c in sys.get("content", [])]
         return {"role": "system", "content": content}
 
+
 def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
     """Extract JSON object within <tool_call>...</tool_call> from model text."""
     m = re.search(r"<tool_call>\s*(\{[\s\S]*?\})\s*</tool_call>", text)
@@ -126,6 +138,7 @@ def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
     except Exception:
         return None
 
+
 async def _unnormalize_coordinate(args: Dict[str, Any], dims: Tuple[int, int]) -> Dict[str, Any]:
     """Coordinates appear in 0..1000 space, scale to actual screen size using dims if provided."""
     coord = args.get("coordinate")
@@ -262,7 +275,9 @@ class Qwen3VlConfig(AsyncAgentConfig):
         pre_output_items: List[Dict[str, Any]] = []
         if not _has_any_image(completion_messages):
             if computer_handler is None or not hasattr(computer_handler, "screenshot"):
-                raise RuntimeError("No screenshots present and computer_handler.screenshot is not available.")
+                raise RuntimeError(
+                    "No screenshots present and computer_handler.screenshot is not available."
+                )
             screenshot_b64 = await computer_handler.screenshot()
             if not screenshot_b64:
                 raise RuntimeError("Failed to capture screenshot from computer_handler.")
@@ -271,7 +286,10 @@ class Qwen3VlConfig(AsyncAgentConfig):
                 {
                     "role": "user",
                     "content": [
-                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"}},
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"},
+                        },
                         {"type": "text", "text": "Current screen"},
                     ],
                 }
@@ -282,7 +300,10 @@ class Qwen3VlConfig(AsyncAgentConfig):
                     "type": "message",
                     "role": "assistant",
                     "content": [
-                        {"type": "text", "text": "Taking a screenshot to see the current computer screen."}
+                        {
+                            "type": "text",
+                            "text": "Taking a screenshot to see the current computer screen.",
+                        }
                     ],
                 }
             )
@@ -294,11 +315,15 @@ class Qwen3VlConfig(AsyncAgentConfig):
         MIN_PIXELS = 3136
         MAX_PIXELS = 12845056
         try:
-            from qwen_vl_utils import smart_resize  # type: ignore
+            import base64
+            import io
+
             from PIL import Image  # type: ignore
-            import base64, io
+            from qwen_vl_utils import smart_resize  # type: ignore
         except Exception:
-            raise ImportError("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`.")
+            raise ImportError(
+                "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
+            )
 
         for msg in completion_messages:
             content = msg.get("content")
@@ -306,14 +331,16 @@ class Qwen3VlConfig(AsyncAgentConfig):
                 continue
             for part in content:
                 if isinstance(part, dict) and part.get("type") == "image_url":
-                    url = (((part.get("image_url") or {}).get("url")) or "")
+                    url = ((part.get("image_url") or {}).get("url")) or ""
                     # Expect data URL like data:image/png;base64,
                     if url.startswith("data:") and "," in url:
                         b64 = url.split(",", 1)[1]
                         img_bytes = base64.b64decode(b64)
                         im = Image.open(io.BytesIO(img_bytes))
                         h, w = im.height, im.width
-                        rh, rw = smart_resize(h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)
+                        rh, rw = smart_resize(
+                            h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS
+                        )
                         # Attach hints on this image block
                         part["min_pixels"] = MIN_PIXELS
                         part["max_pixels"] = MAX_PIXELS
@@ -349,7 +376,7 @@ class Qwen3VlConfig(AsyncAgentConfig):
         # Parse tool call from text; then convert to responses items via fake tool_calls
         resp_dict = response.model_dump()  # type: ignore
         choice = (resp_dict.get("choices") or [{}])[0]
-        content_text = (((choice.get("message") or {}).get("content")) or "")
+        content_text = ((choice.get("message") or {}).get("content")) or ""
         tool_call = _parse_tool_call_from_text(content_text)
 
         output_items: List[Dict[str, Any]] = []
@@ -358,7 +385,9 @@ class Qwen3VlConfig(AsyncAgentConfig):
             raw_args = tool_call.get("arguments") or {}
             # Unnormalize coordinates to actual screen size using last resized dims
             if last_rw is None or last_rh is None:
-                raise RuntimeError("No screenshots found to derive dimensions for coordinate unnormalization.")
+                raise RuntimeError(
+                    "No screenshots found to derive dimensions for coordinate unnormalization."
+                )
             args = await _unnormalize_coordinate(raw_args, (last_rw, last_rh))
 
             # Build an OpenAI-style tool call so we can reuse the converter
@@ -426,10 +455,12 @@ class Qwen3VlConfig(AsyncAgentConfig):
         max_pixels = 12845056
         try:
             # Lazy import to avoid hard dependency
-            from qwen_vl_utils import smart_resize  # type: ignore
+            import base64
+            import io
+
             # If PIL is available, estimate size from image to derive smart bounds
             from PIL import Image
-            import io, base64
+            from qwen_vl_utils import smart_resize  # type: ignore
 
             img_bytes = base64.b64decode(image_b64)
             im = Image.open(io.BytesIO(img_bytes))
@@ -437,16 +468,16 @@ class Qwen3VlConfig(AsyncAgentConfig):
             h, w = im.height, im.width
             # Qwen notebook suggests factor=32 and a wide min/max range
             rh, rw = smart_resize(h, w, factor=32, min_pixels=min_pixels, max_pixels=max_pixels)
         except Exception:
-            raise ImportError("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`.")
+            raise ImportError(
+                "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
+            )
         messages = []
         if nous_system:
             messages.append(nous_system)
         image_block: Dict[str, Any] = {
-                "type": "image_url",
-                "image_url": {
-                    "url": f"data:image/png;base64,{image_b64}"
-                },
+            "type": "image_url",
+            "image_url": {"url": f"data:image/png;base64,{image_b64}"},
             "min_pixels": min_pixels,
             "max_pixels": max_pixels,
         }
@@ -461,11 +492,15 @@
             }
         )
 
-        api_kwargs: Dict[str, Any] = {"model": model, "messages": messages, **{k: v for k, v in kwargs.items()}}
+        api_kwargs: Dict[str, Any] = {
+            "model": model,
+            "messages": messages,
+            **{k: v for k, v in kwargs.items()},
+        }
         response = await litellm.acompletion(**api_kwargs)
         resp = response.model_dump()  # type: ignore
         choice = (resp.get("choices") or [{}])[0]
-        content_text = (((choice.get("message") or {}).get("content")) or "")
+        content_text = ((choice.get("message") or {}).get("content")) or ""
         tool_call = _parse_tool_call_from_text(content_text) or {}
         args = tool_call.get("arguments") or {}
         args = await _unnormalize_coordinate(args, (rh, rw))
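
Reviewer note (not part of the patch): the two helpers this diff reformats are easiest to sanity-check side by side. The sketch below mirrors the regex in _parse_tool_call_from_text and the 0..1000 coordinate convention that _unnormalize_coordinate assumes. The standalone names parse_tool_call and unnormalize are illustrative only, and the <tool_call> wrapper is the Hermes-style format that qwen-agent's NousFnCallPrompt embeds in the system message.

# Standalone sanity check for the tool-call parsing and coordinate scaling
# used above; helper names are illustrative, not from the patch.
import json
import re
from typing import Any, Dict, List, Optional, Tuple


def parse_tool_call(text: str) -> Optional[Dict[str, Any]]:
    # Same pattern as _parse_tool_call_from_text: first JSON object
    # wrapped in <tool_call>...</tool_call> tags, else None.
    m = re.search(r"<tool_call>\s*(\{[\s\S]*?\})\s*</tool_call>", text)
    if not m:
        return None
    try:
        return json.loads(m.group(1))
    except Exception:
        return None


def unnormalize(coord: List[int], dims: Tuple[int, int]) -> List[int]:
    # Qwen3-VL emits coordinates in a 0..1000 virtual space; scale them to
    # the dimensions of the smart_resize'd screenshot, as the loop does when
    # it calls _unnormalize_coordinate with (last_rw, last_rh).
    w, h = dims
    return [round(coord[0] / 1000 * w), round(coord[1] / 1000 * h)]


sample = (
    "I'll click the OK button.\n"
    '<tool_call>\n{"name": "computer_use", "arguments": '
    '{"action": "left_click", "coordinate": [500, 250]}}\n</tool_call>'
)
call = parse_tool_call(sample)
assert call is not None and call["name"] == "computer_use"
print(unnormalize(call["arguments"]["coordinate"], (1920, 1080)))  # [960, 270]

Running this prints [960, 270] for a 1920x1080 resized frame, which is the scaling the loop applies to model-emitted coordinates before dispatching clicks.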