From a78dcd333109c3b1ba0908530296b7f0970ba16d Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 22 Oct 2025 16:14:43 -0700 Subject: [PATCH] Fix qwen3 hallucinating coords w/o screenshots --- README.md | 1 + .../supported-agents/computer-use-agents.mdx | 11 ++ libs/python/agent/agent/loops/qwen.py | 107 ++++++++++++++---- 3 files changed, 98 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 2f9429cc..99b47eb9 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,7 @@ With the Agent SDK, you can: | `openai/computer-use-preview` | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` | any VLM (using liteLLM, requires `tools` parameter) | | `openrouter/z-ai/glm-4.5v` | `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` | any LLM (using liteLLM, requires `moondream3+` prefix ) | | `gemini-2.5-computer-use-preview-10-2025` | any-all-in-one CUA | | +| `openrouter/qwen/qwen3-vl-235b-a22b-instruct` | | | | `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` | | | | `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` | | | `moondream3+{ui planning}` (supports text-only models) | | diff --git a/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx b/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx index ea3e3a4b..a3384b21 100644 --- a/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx +++ b/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx @@ -73,6 +73,17 @@ async for _ in agent.run("Open Firefox and navigate to github.com"): pass ``` +## Qwen3 VL + +Qwen3 VL family: +- `openrouter/qwen/qwen3-vl-235b-a22b-instruct` + +```python +agent = ComputerAgent("openrouter/qwen/qwen3-vl-235b-a22b-instruct", tools=[computer]) +async for _ in agent.run("Open Firefox and navigate to github.com"): + pass +``` + ## UI-TARS 1.5 Unified vision-language model for computer-use: diff --git a/libs/python/agent/agent/loops/qwen.py b/libs/python/agent/agent/loops/qwen.py index 6438ff81..151376d4 100644 
--- a/libs/python/agent/agent/loops/qwen.py +++ b/libs/python/agent/agent/loops/qwen.py @@ -54,6 +54,7 @@ QWEN3_COMPUTER_TOOL: Dict[str, Any] = { "triple_click", "scroll", "hscroll", + "screenshot", "wait", # "terminate", # "answer", @@ -125,23 +126,16 @@ def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]: except Exception: return None -async def _unnormalize_coordinate(args: Dict[str, Any], computer_handler) -> Dict[str, Any]: - """If coordinate appears in 0..1000 space, scale to actual screen size using computer_handler if provided.""" +async def _unnormalize_coordinate(args: Dict[str, Any], dims: Tuple[int, int]) -> Dict[str, Any]: + """Coordinates appear in 0..1000 space, scale to actual screen size using dims if provided.""" coord = args.get("coordinate") if not coord or not isinstance(coord, (list, tuple)) or len(coord) < 2: return args x, y = float(coord[0]), float(coord[1]) - # Heuristic: treat <= 1000 as normalized - if x <= 1000.0 and y <= 1000.0 and computer_handler is not None and hasattr(computer_handler, "get_dimensions"): - try: - dims = await computer_handler.get_dimensions() - if isinstance(dims, (list, tuple)) and len(dims) >= 2: - width, height = float(dims[0]), float(dims[1]) - x_abs = max(0.0, min(width, (x / 1000.0) * width)) - y_abs = max(0.0, min(height, (y / 1000.0) * height)) - args = {**args, "coordinate": [round(x_abs), round(y_abs)]} - except Exception: - pass + width, height = float(dims[0]), float(dims[1]) + x_abs = max(0.0, min(width, (x / 1000.0) * width)) + y_abs = max(0.0, min(height, (y / 1000.0) * height)) + args = {**args, "coordinate": [round(x_abs), round(y_abs)]} return args @@ -254,6 +248,77 @@ class Qwen3VlConfig(AsyncAgentConfig): nous_system = _build_nous_system([QWEN3_COMPUTER_TOOL["function"]]) completion_messages = ([nous_system] if nous_system else []) + converted_msgs + # If there is no screenshot in the conversation, take one now and inject it. 
+ # Also record a pre_output_items assistant message to reflect action. + def _has_any_image(msgs: List[Dict[str, Any]]) -> bool: + for m in msgs: + content = m.get("content") + if isinstance(content, list): + for p in content: + if isinstance(p, dict) and p.get("type") == "image_url": + return True + return False + + pre_output_items: List[Dict[str, Any]] = [] + if not _has_any_image(completion_messages): + if computer_handler is None or not hasattr(computer_handler, "screenshot"): + raise RuntimeError("No screenshots present and computer_handler.screenshot is not available.") + screenshot_b64 = await computer_handler.screenshot() + if not screenshot_b64: + raise RuntimeError("Failed to capture screenshot from computer_handler.") + # Inject a user message with the screenshot so the model can see current context + completion_messages.append( + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"}}, + {"type": "text", "text": "Current screen"}, + ], + } + ) + # Add assistant message to outputs to reflect the action, similar to composed_grounded.py + pre_output_items.append( + { + "type": "message", + "role": "assistant", + "content": [ + {"type": "text", "text": "Taking a screenshot to see the current computer screen."} + ], + } + ) + + # Smart-resize all screenshots and attach min/max pixel hints. Fail fast if deps missing. + # Also record the last resized width/height to unnormalize coordinates later. + last_rw: Optional[int] = None + last_rh: Optional[int] = None + MIN_PIXELS = 3136 + MAX_PIXELS = 12845056 + try: + from qwen_vl_utils import smart_resize # type: ignore + from PIL import Image # type: ignore + import base64, io + except Exception: + raise ImportError("qwen-vl-utils not installed. 
Please install it with `pip install cua-agent[qwen]`.") + + for msg in completion_messages: + content = msg.get("content") + if not isinstance(content, list): + continue + for part in content: + if isinstance(part, dict) and part.get("type") == "image_url": + url = (((part.get("image_url") or {}).get("url")) or "") + # Expect data URL like data:image/png;base64, + if url.startswith("data:") and "," in url: + b64 = url.split(",", 1)[1] + img_bytes = base64.b64decode(b64) + im = Image.open(io.BytesIO(img_bytes)) + h, w = im.height, im.width + rh, rw = smart_resize(h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS) + # Attach hints on this image block + part["min_pixels"] = MIN_PIXELS + part["max_pixels"] = MAX_PIXELS + last_rw, last_rh = rw, rh + api_kwargs: Dict[str, Any] = { "model": model, "messages": completion_messages, @@ -291,8 +356,10 @@ class Qwen3VlConfig(AsyncAgentConfig): if tool_call and isinstance(tool_call, dict): fn_name = tool_call.get("name") or "computer" raw_args = tool_call.get("arguments") or {} - # Unnormalize coordinates to actual screen size when possible - args = await _unnormalize_coordinate(raw_args, computer_handler) + # Unnormalize coordinates to actual screen size using last resized dims + if last_rw is None or last_rh is None: + raise RuntimeError("No screenshots found to derive dimensions for coordinate unnormalization.") + args = await _unnormalize_coordinate(raw_args, (last_rw, last_rh)) # Build an OpenAI-style tool call so we can reuse the converter fake_cm = { @@ -314,7 +381,8 @@ class Qwen3VlConfig(AsyncAgentConfig): fake_cm = {"role": "assistant", "content": content_text} output_items.extend(convert_completion_messages_to_responses_items([fake_cm])) - return {"output": output_items, "usage": usage} + # Prepend any pre_output_items (e.g., simulated screenshot-taking message) + return {"output": (pre_output_items + output_items), "usage": usage} def get_capabilities(self) -> List[AgentCapability]: return ["step"] @@ 
-353,7 +421,7 @@ class Qwen3VlConfig(AsyncAgentConfig): # Build Nous system (lazy import inside helper already raises clear guidance if missing) nous_system = _build_nous_system([reduced_tool["function"]]) - # Optionally compute min/max pixels via smart_resize if available + # Pre-process using smart_resize min_pixels = 3136 max_pixels = 12845056 try: @@ -368,9 +436,6 @@ class Qwen3VlConfig(AsyncAgentConfig): h, w = im.height, im.width # Qwen notebook suggests factor=32 and a wide min/max range rh, rw = smart_resize(h, w, factor=32, min_pixels=min_pixels, max_pixels=max_pixels) - # Use total pixels as hints - min_pixels = min(3136, rh * rw) - max_pixels = max(12845056, rh * rw) except Exception: raise ImportError("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`.") @@ -403,7 +468,7 @@ class Qwen3VlConfig(AsyncAgentConfig): content_text = (((choice.get("message") or {}).get("content")) or "") tool_call = _parse_tool_call_from_text(content_text) or {} args = tool_call.get("arguments") or {} - args = await _unnormalize_coordinate(args, kwargs.get("computer_handler")) + args = await _unnormalize_coordinate(args, (rw, rh)) coord = args.get("coordinate") if isinstance(coord, (list, tuple)) and len(coord) >= 2: return int(coord[0]), int(coord[1])