From a78dcd333109c3b1ba0908530296b7f0970ba16d Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 22 Oct 2025 16:14:43 -0700
Subject: [PATCH] Fix qwen3 hallucinating coords w/o screenshots

---
 README.md                                     |   1 +
 .../supported-agents/computer-use-agents.mdx  |  11 ++
 libs/python/agent/agent/loops/qwen.py         | 107 ++++++++++++++----
 3 files changed, 98 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 2f9429cc..99b47eb9 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,7 @@ With the Agent SDK, you can:
 | `openai/computer-use-preview`                                                                  | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}`                                                | any VLM (using liteLLM, requires `tools` parameter)                                           |
 | `openrouter/z-ai/glm-4.5v`                                                                     | `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}`                                               | any LLM (using liteLLM, requires `moondream3+` prefix )                                       |
 | `gemini-2.5-computer-use-preview-10-2025`                                                      | any-all-in-one CUA                                                                             |                                                                                               |
+| `openrouter/qwen/qwen3-vl-235b-a22b-instruct`                                                   |                                                                                                |                                                                                               |
 | `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}`                                    |                                                                                                |                                                                                               |
 | `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B`                                              |                                                                                                |
 | `moondream3+{ui planning}` (supports text-only models)                                         |                                                                                                |
diff --git a/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx b/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx
index ea3e3a4b..a3384b21 100644
--- a/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx
+++ b/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx
@@ -73,6 +73,17 @@ async for _ in agent.run("Open Firefox and navigate to github.com"):
     pass
 ```
 
+## Qwen3 VL
+
+Qwen3 VL family:
+- `openrouter/qwen/qwen3-vl-235b-a22b-instruct`
+
+```python
+agent = ComputerAgent("openrouter/qwen/qwen3-vl-235b-a22b-instruct", tools=[computer])
+async for _ in agent.run("Open Firefox and navigate to github.com"):
+    pass
+```
+
 ## UI-TARS 1.5
 
 Unified vision-language model for computer-use:
diff --git a/libs/python/agent/agent/loops/qwen.py b/libs/python/agent/agent/loops/qwen.py
index 6438ff81..151376d4 100644
--- a/libs/python/agent/agent/loops/qwen.py
+++ b/libs/python/agent/agent/loops/qwen.py
@@ -54,6 +54,7 @@ QWEN3_COMPUTER_TOOL: Dict[str, Any] = {
                         "triple_click",
                         "scroll",
                         "hscroll",
+                        "screenshot",
                         "wait",
                         # "terminate",
                         # "answer",
@@ -125,23 +126,16 @@ def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
     except Exception:
         return None
 
-async def _unnormalize_coordinate(args: Dict[str, Any], computer_handler) -> Dict[str, Any]:
-    """If coordinate appears in 0..1000 space, scale to actual screen size using computer_handler if provided."""
+async def _unnormalize_coordinate(args: Dict[str, Any], dims: Tuple[int, int]) -> Dict[str, Any]:
+    """Scale a coordinate from the normalized 0..1000 space to pixels, where dims is (width, height)."""
     coord = args.get("coordinate")
     if not coord or not isinstance(coord, (list, tuple)) or len(coord) < 2:
         return args
     x, y = float(coord[0]), float(coord[1])
-    # Heuristic: treat <= 1000 as normalized
-    if x <= 1000.0 and y <= 1000.0 and computer_handler is not None and hasattr(computer_handler, "get_dimensions"):
-        try:
-            dims = await computer_handler.get_dimensions()
-            if isinstance(dims, (list, tuple)) and len(dims) >= 2:
-                width, height = float(dims[0]), float(dims[1])
-                x_abs = max(0.0, min(width, (x / 1000.0) * width))
-                y_abs = max(0.0, min(height, (y / 1000.0) * height))
-                args = {**args, "coordinate": [round(x_abs), round(y_abs)]}
-        except Exception:
-            pass
+    width, height = float(dims[0]), float(dims[1])
+    x_abs = max(0.0, min(width, (x / 1000.0) * width))
+    y_abs = max(0.0, min(height, (y / 1000.0) * height))
+    args = {**args, "coordinate": [round(x_abs), round(y_abs)]}
     return args
 
 
@@ -254,6 +248,77 @@ class Qwen3VlConfig(AsyncAgentConfig):
         nous_system = _build_nous_system([QWEN3_COMPUTER_TOOL["function"]])
         completion_messages = ([nous_system] if nous_system else []) + converted_msgs
 
+        # If there is no screenshot in the conversation, take one now and inject it.
+        # Also record a pre_output_items assistant message to reflect action.
+        def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
+            for m in msgs:
+                content = m.get("content")
+                if isinstance(content, list):
+                    for p in content:
+                        if isinstance(p, dict) and p.get("type") == "image_url":
+                            return True
+            return False
+
+        pre_output_items: List[Dict[str, Any]] = []
+        if not _has_any_image(completion_messages):
+            if computer_handler is None or not hasattr(computer_handler, "screenshot"):
+                raise RuntimeError("No screenshots present and computer_handler.screenshot is not available.")
+            screenshot_b64 = await computer_handler.screenshot()
+            if not screenshot_b64:
+                raise RuntimeError("Failed to capture screenshot from computer_handler.")
+            # Inject a user message with the screenshot so the model can see current context
+            completion_messages.append(
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"}},
+                        {"type": "text", "text": "Current screen"},
+                    ],
+                }
+            )
+            # Add assistant message to outputs to reflect the action, similar to composed_grounded.py
+            pre_output_items.append(
+                {
+                    "type": "message",
+                    "role": "assistant",
+                    "content": [
+                        {"type": "text", "text": "Taking a screenshot to see the current computer screen."}
+                    ],
+                }
+            )
+
+        # Compute smart_resize dimensions for each screenshot and attach min/max pixel hints (image bytes are left as-is). Fail fast if deps missing.
+        # Also record the last resized width/height to unnormalize coordinates later.
+        last_rw: Optional[int] = None
+        last_rh: Optional[int] = None
+        MIN_PIXELS = 3136
+        MAX_PIXELS = 12845056
+        try:
+            from qwen_vl_utils import smart_resize  # type: ignore
+            from PIL import Image  # type: ignore
+            import base64, io
+        except Exception:
+            raise ImportError("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`.")
+
+        for msg in completion_messages:
+            content = msg.get("content")
+            if not isinstance(content, list):
+                continue
+            for part in content:
+                if isinstance(part, dict) and part.get("type") == "image_url":
+                    url = (((part.get("image_url") or {}).get("url")) or "")
+                    # Expect data URL like data:image/png;base64,<b64>
+                    if url.startswith("data:") and "," in url:
+                        b64 = url.split(",", 1)[1]
+                        img_bytes = base64.b64decode(b64)
+                        im = Image.open(io.BytesIO(img_bytes))
+                        h, w = im.height, im.width
+                        rh, rw = smart_resize(h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)
+                        # Attach hints on this image block
+                        part["min_pixels"] = MIN_PIXELS
+                        part["max_pixels"] = MAX_PIXELS
+                        last_rw, last_rh = rw, rh
+
         api_kwargs: Dict[str, Any] = {
             "model": model,
             "messages": completion_messages,
@@ -291,8 +356,10 @@ class Qwen3VlConfig(AsyncAgentConfig):
         if tool_call and isinstance(tool_call, dict):
             fn_name = tool_call.get("name") or "computer"
             raw_args = tool_call.get("arguments") or {}
-            # Unnormalize coordinates to actual screen size when possible
-            args = await _unnormalize_coordinate(raw_args, computer_handler)
+            # Unnormalize coordinates to actual screen size using last resized dims
+            if last_rw is None or last_rh is None:
+                raise RuntimeError("No screenshots found to derive dimensions for coordinate unnormalization.")
+            args = await _unnormalize_coordinate(raw_args, (last_rw, last_rh))
 
             # Build an OpenAI-style tool call so we can reuse the converter
             fake_cm = {
@@ -314,7 +381,8 @@ class Qwen3VlConfig(AsyncAgentConfig):
             fake_cm = {"role": "assistant", "content": content_text}
             output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))
 
-        return {"output": output_items, "usage": usage}
+        # Prepend any pre_output_items (e.g., simulated screenshot-taking message)
+        return {"output": (pre_output_items + output_items), "usage": usage}
 
     def get_capabilities(self) -> List[AgentCapability]:
         return ["step"]
@@ -353,7 +421,7 @@ class Qwen3VlConfig(AsyncAgentConfig):
         # Build Nous system (lazy import inside helper already raises clear guidance if missing)
         nous_system = _build_nous_system([reduced_tool["function"]])
 
-        # Optionally compute min/max pixels via smart_resize if available
+        # Compute the smart_resize target dimensions (rh, rw) used below to unnormalize the predicted coordinate
         min_pixels = 3136
         max_pixels = 12845056
         try:
@@ -368,9 +436,6 @@ class Qwen3VlConfig(AsyncAgentConfig):
             h, w = im.height, im.width
             # Qwen notebook suggests factor=32 and a wide min/max range
             rh, rw = smart_resize(h, w, factor=32, min_pixels=min_pixels, max_pixels=max_pixels)
-            # Use total pixels as hints
-            min_pixels = min(3136, rh * rw)
-            max_pixels = max(12845056, rh * rw)
         except Exception:
             raise ImportError("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`.")
 
@@ -403,7 +468,7 @@ class Qwen3VlConfig(AsyncAgentConfig):
         content_text = (((choice.get("message") or {}).get("content")) or "")
         tool_call = _parse_tool_call_from_text(content_text) or {}
         args = tool_call.get("arguments") or {}
-        args = await _unnormalize_coordinate(args, kwargs.get("computer_handler"))
+        args = await _unnormalize_coordinate(args, (rw, rh))  # dims are (width, height)
         coord = args.get("coordinate")
         if isinstance(coord, (list, tuple)) and len(coord) >= 2:
             return int(coord[0]), int(coord[1])