From 7631412694e555a2a22f092315129d3acc9a651c Mon Sep 17 00:00:00 2001
From: Dillon DuPont
Date: Wed, 22 Oct 2025 15:51:51 -0700
Subject: [PATCH 1/9] Add qwen3 VL computer-use loop

---
 libs/python/agent/agent/loops/__init__.py |   2 +
 libs/python/agent/agent/loops/qwen.py     | 410 ++++++++++++++++++++++
 libs/python/agent/pyproject.toml          |   5 +
 uv.lock                                   | 104 +++++-
 4 files changed, 518 insertions(+), 3 deletions(-)
 create mode 100644 libs/python/agent/agent/loops/qwen.py

diff --git a/libs/python/agent/agent/loops/__init__.py b/libs/python/agent/agent/loops/__init__.py
index 6fd20cde..3ef599a4 100644
--- a/libs/python/agent/agent/loops/__init__.py
+++ b/libs/python/agent/agent/loops/__init__.py
@@ -16,6 +16,7 @@ from . import (
     openai,
     opencua,
     uitars,
+    qwen,
 )
 
 __all__ = [
@@ -31,4 +32,5 @@ __all__ = [
     "holo",
     "moondream3",
     "gemini",
+    "qwen",
 ]
diff --git a/libs/python/agent/agent/loops/qwen.py b/libs/python/agent/agent/loops/qwen.py
new file mode 100644
index 00000000..6438ff81
--- /dev/null
+++ b/libs/python/agent/agent/loops/qwen.py
@@ -0,0 +1,410 @@
+"""
+Qwen3-VL agent loop implementation using litellm with function/tool calling.
+- Passes a ComputerUse tool schema to acompletion
+- Converts between Responses items and completion messages using helpers
+"""
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional, Tuple
+
+import json
+import re
+import litellm
+from litellm.responses.litellm_completion_transformation.transformation import (
+    LiteLLMCompletionResponsesConfig,
+)
+
+from ..decorators import register_agent
+from ..loops.base import AsyncAgentConfig
+from ..types import AgentCapability
+from ..responses import (
+    convert_responses_items_to_completion_messages,
+    convert_completion_messages_to_responses_items,
+)
+
+
+# ComputerUse tool schema (OpenAI function tool format)
+QWEN3_COMPUTER_TOOL: Dict[str, Any] = {
+    "type": "function",
+    "function": {
+        "name": "computer",
+        "description": (
+            "Use a mouse and keyboard to interact with a computer, and take screenshots.\n"
+            "* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n"
+            "* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try waiting and taking another screenshot.\n"
+            "* The screen's resolution is 1000x1000.\n"
+            "* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n"
+            "* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n"
+            "* Make sure to click any buttons, links, icons, etc. with the cursor tip in the center of the element. Don't click boxes on their edges."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "action": {
+                    "description": "The action to perform.",
+                    "enum": [
+                        "key",
+                        "type",
+                        "mouse_move",
+                        "left_click",
+                        "left_click_drag",
+                        "right_click",
+                        "middle_click",
+                        "double_click",
+                        "triple_click",
+                        "scroll",
+                        "hscroll",
+                        "wait",
+                        # "terminate",
+                        # "answer",
+                    ],
+                    "type": "string",
+                },
+                "keys": {
+                    "description": "Required only by action=key.",
+                    "type": "array",
+                    "items": {"type": "string"},
+                },
+                "text": {
+                    "description": "Required only by action=type and action=answer.",
+                    "type": "string",
+                },
+                "coordinate": {
+                    "description": "(x, y): Pixel coordinates from top-left.",
+                    "type": "array",
+                    "items": {"type": ["number", "integer"]},
+                    "minItems": 2,
+                    "maxItems": 2,
+                },
+                "pixels": {
+                    "description": "Scroll amount. Positive=up, negative=down. For scroll/hscroll.",
+                    "type": "number",
+                },
+                "time": {
+                    "description": "Seconds to wait (action=wait).",
+                    "type": "number",
+                },
+                # "status": {
+                #     "description": "Task status (action=terminate).",
+                #     "type": "string",
+                #     "enum": ["success", "failure"],
+                # },
+            },
+            "required": ["action"],
+        },
+    },
+}
+
+def _build_nous_system(functions: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+    """Use qwen-agent NousFnCallPrompt to generate a system message embedding tool schema."""
+    try:
+        from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
+            NousFnCallPrompt,
+            Message as NousMessage,
+            ContentItem as NousContentItem,
+        )
+    except ImportError:
+        raise ImportError("qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`.")
+    msgs = NousFnCallPrompt().preprocess_fncall_messages(
+        messages=[NousMessage(role="system", content=[NousContentItem(text="You are a helpful assistant.")])],
+        functions=functions,
+        lang="en",
+    )
+    sys = msgs[0].model_dump()
+    # Convert qwen-agent structured content to OpenAI-style content list
+    content = [{"type": "text", "text": c["text"]} for c in sys.get("content", [])]
+    return {"role": "system", "content": content}
+
+def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
+    """Extract JSON object within <tool_call>...</tool_call> from model text."""
+    m = re.search(r"<tool_call>\s*(\{[\s\S]*?\})\s*</tool_call>", text)
+    if not m:
+        return None
+    try:
+        return json.loads(m.group(1))
+    except Exception:
+        return None
+
+async def _unnormalize_coordinate(args: Dict[str, Any], computer_handler) -> Dict[str, Any]:
+    """If coordinate appears in 0..1000 space, scale to actual screen size using computer_handler if provided."""
+    coord = args.get("coordinate")
+    if not coord or not isinstance(coord, (list, tuple)) or len(coord) < 2:
+        return args
+    x, y = float(coord[0]), float(coord[1])
+    # Heuristic: treat <= 1000 as normalized
+    if x <= 1000.0 and y <= 1000.0 and computer_handler is not None and hasattr(computer_handler, "get_dimensions"):
+        try:
+            dims = await computer_handler.get_dimensions()
+            if isinstance(dims, (list, tuple)) and len(dims) >= 2:
+                width, height = float(dims[0]), float(dims[1])
+                x_abs = max(0.0, min(width, (x / 1000.0) * width))
+                y_abs = max(0.0, min(height, (y / 1000.0) * height))
+                args = {**args, "coordinate": [round(x_abs), round(y_abs)]}
+        except Exception:
+            pass
+    return args
+
+
+def convert_qwen_tool_args_to_computer_action(args: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+    """
+    Convert Qwen computer tool arguments to the Computer Calls action schema.
+ + Qwen (example): + {"action": "left_click", "coordinate": [114, 68]} + + Target (example): + {"action": "left_click", "x": 114, "y": 68} + + Other mappings: + - right_click, middle_click, double_click (triple_click -> double_click) + - mouse_move -> { action: "move", x, y } + - key -> { action: "keypress", keys: [...] } + - type -> { action: "type", text } + - scroll/hscroll -> { action: "scroll", scroll_x, scroll_y, x, y } + - wait -> { action: "wait" } + - terminate/answer are not direct UI actions; return None for now + """ + if not isinstance(args, dict): + return None + + action = args.get("action") + if not isinstance(action, str): + return None + + # Coordinates helper + coord = args.get("coordinate") + x = y = None + if isinstance(coord, (list, tuple)) and len(coord) >= 2: + try: + x = int(round(float(coord[0]))) + y = int(round(float(coord[1]))) + except Exception: + x = y = None + + # Map actions + a = action.lower() + if a in {"left_click", "right_click", "middle_click", "double_click"}: + if x is None or y is None: + return None + return {"action": a, "x": x, "y": y} + if a == "triple_click": + # Approximate as double_click + if x is None or y is None: + return None + return {"action": "double_click", "x": x, "y": y} + if a == "mouse_move": + if x is None or y is None: + return None + return {"action": "move", "x": x, "y": y} + if a == "key": + keys = args.get("keys") + if isinstance(keys, list) and all(isinstance(k, str) for k in keys): + return {"action": "keypress", "keys": keys} + return None + if a == "type": + text = args.get("text") + if isinstance(text, str): + return {"action": "type", "text": text} + return None + if a in {"scroll", "hscroll"}: + pixels = args.get("pixels") or 0 + try: + pixels_val = int(round(float(pixels))) + except Exception: + pixels_val = 0 + scroll_x = pixels_val if a == "hscroll" else 0 + scroll_y = pixels_val if a == "scroll" else 0 + # Include cursor position if available (optional) + out: Dict[str, Any] = {"action": "scroll", "scroll_x": scroll_x, "scroll_y": scroll_y} + if x is not None and y is not None: + out.update({"x": x, "y": y}) + return out + if a == "wait": + return {"action": "wait"} + + # Non-UI or terminal actions: terminate/answer -> not mapped here + return None + + +@register_agent(models=r"(?i).*qwen.*", priority=-1) +class Qwen3VlConfig(AsyncAgentConfig): + async def predict_step( + self, + messages: List[Dict[str, Any]], + model: str, + tools: Optional[List[Dict[str, Any]]] = None, + max_retries: Optional[int] = None, + stream: bool = False, + computer_handler=None, + use_prompt_caching: Optional[bool] = False, + _on_api_start=None, + _on_api_end=None, + _on_usage=None, + _on_screenshot=None, + **kwargs, + ) -> Dict[str, Any]: + # Build messages using NousFnCallPrompt system with tool schema in text + # Start with converted conversation (images/text preserved) + converted_msgs = convert_responses_items_to_completion_messages( + messages, + allow_images_in_tool_results=False, + ) + + # Prepend Nous-generated system if available + nous_system = _build_nous_system([QWEN3_COMPUTER_TOOL["function"]]) + completion_messages = ([nous_system] if nous_system else []) + converted_msgs + + api_kwargs: Dict[str, Any] = { + "model": model, + "messages": completion_messages, + "max_retries": max_retries, + "stream": stream, + **{k: v for k, v in kwargs.items()}, + } + if use_prompt_caching: + api_kwargs["use_prompt_caching"] = use_prompt_caching + + if _on_api_start: + await _on_api_start(api_kwargs) + + response = await 
litellm.acompletion(**api_kwargs) + + if _on_api_end: + await _on_api_end(api_kwargs, response) + + usage = { + **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage( # type: ignore + response.usage + ).model_dump(), + "response_cost": response._hidden_params.get("response_cost", 0.0), + } + if _on_usage: + await _on_usage(usage) + + # Parse tool call from text; then convert to responses items via fake tool_calls + resp_dict = response.model_dump() # type: ignore + choice = (resp_dict.get("choices") or [{}])[0] + content_text = (((choice.get("message") or {}).get("content")) or "") + tool_call = _parse_tool_call_from_text(content_text) + + output_items: List[Dict[str, Any]] = [] + if tool_call and isinstance(tool_call, dict): + fn_name = tool_call.get("name") or "computer" + raw_args = tool_call.get("arguments") or {} + # Unnormalize coordinates to actual screen size when possible + args = await _unnormalize_coordinate(raw_args, computer_handler) + + # Build an OpenAI-style tool call so we can reuse the converter + fake_cm = { + "role": "assistant", + "tool_calls": [ + { + "type": "function", + "id": "call_0", + "function": { + "name": fn_name, + "arguments": json.dumps(args), + }, + } + ], + } + output_items.extend(convert_completion_messages_to_responses_items([fake_cm])) + else: + # Fallback: just return assistant text + fake_cm = {"role": "assistant", "content": content_text} + output_items.extend(convert_completion_messages_to_responses_items([fake_cm])) + + return {"output": output_items, "usage": usage} + + def get_capabilities(self) -> List[AgentCapability]: + return ["step"] + + async def predict_click( + self, model: str, image_b64: str, instruction: str, **kwargs + ) -> Optional[Tuple[int, int]]: + """ + Predict click coordinates using Qwen3-VL via litellm.acompletion. + + Only exposes a reduced tool schema with left_click to bias model to output a single click. + Returns (x, y) absolute pixels when screen dimensions can be obtained; otherwise normalized 0..1000 integers. + """ + # Reduced tool + reduced_tool = { + "type": "function", + "function": { + **QWEN3_COMPUTER_TOOL["function"], + "parameters": { + "type": "object", + "properties": { + "action": {"type": "string", "enum": ["left_click"]}, + "coordinate": { + "description": "(x, y) in 0..1000 reference space", + "type": "array", + "items": {"type": ["number", "integer"]}, + "minItems": 2, + "maxItems": 2, + }, + }, + "required": ["action", "coordinate"], + }, + }, + } + + # Build Nous system (lazy import inside helper already raises clear guidance if missing) + nous_system = _build_nous_system([reduced_tool["function"]]) + + # Optionally compute min/max pixels via smart_resize if available + min_pixels = 3136 + max_pixels = 12845056 + try: + # Lazy import to avoid hard dependency + from qwen_vl_utils import smart_resize # type: ignore + # If PIL is available, estimate size from image to derive smart bounds + from PIL import Image + import io, base64 + + img_bytes = base64.b64decode(image_b64) + im = Image.open(io.BytesIO(img_bytes)) + h, w = im.height, im.width + # Qwen notebook suggests factor=32 and a wide min/max range + rh, rw = smart_resize(h, w, factor=32, min_pixels=min_pixels, max_pixels=max_pixels) + # Use total pixels as hints + min_pixels = min(3136, rh * rw) + max_pixels = max(12845056, rh * rw) + except Exception: + raise ImportError("qwen-vl-utils not installed. 
Please install it with `pip install cua-agent[qwen]`.") + + messages = [] + if nous_system: + messages.append(nous_system) + image_block: Dict[str, Any] = { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{image_b64}" + }, + "min_pixels": min_pixels, + "max_pixels": max_pixels, + } + # Single user message with image and instruction, matching OpenAI-style content blocks + messages.append( + { + "role": "user", + "content": [ + image_block, + {"type": "text", "text": instruction}, + ], + } + ) + + api_kwargs: Dict[str, Any] = {"model": model, "messages": messages, **{k: v for k, v in kwargs.items()}} + response = await litellm.acompletion(**api_kwargs) + resp = response.model_dump() # type: ignore + choice = (resp.get("choices") or [{}])[0] + content_text = (((choice.get("message") or {}).get("content")) or "") + tool_call = _parse_tool_call_from_text(content_text) or {} + args = tool_call.get("arguments") or {} + args = await _unnormalize_coordinate(args, kwargs.get("computer_handler")) + coord = args.get("coordinate") + if isinstance(coord, (list, tuple)) and len(coord) >= 2: + return int(coord[0]), int(coord[1]) + return None diff --git a/libs/python/agent/pyproject.toml b/libs/python/agent/pyproject.toml index 0d690240..02f2bfa4 100644 --- a/libs/python/agent/pyproject.toml +++ b/libs/python/agent/pyproject.toml @@ -29,6 +29,11 @@ requires-python = ">=3.12" [project.optional-dependencies] openai = [] anthropic = [] +qwen = [ + "qwen-vl-utils", + "qwen-agent", + "Pillow>=10.0.0", +] omni = [ "cua-som>=0.1.0,<0.2.0", ] diff --git a/uv.lock b/uv.lock index d33f2821..1b7ea1b3 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.12, <3.14" resolution-markers = [ "python_full_version >= '3.13' and sys_platform == 'darwin'", @@ -385,6 +385,35 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f8/aa/5082412d1ee302e9e7d80b6949bc4d2a8fa1149aaab610c5fc24709605d6/authlib-1.6.5-py2.py3-none-any.whl", hash = "sha256:3e0e0507807f842b02175507bdee8957a1d5707fd4afb17c32fb43fee90b6e3a", size = 243608, upload-time = "2025-10-02T13:36:07.637Z" }, ] +[[package]] +name = "av" +version = "16.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/15/c3/fd72a0315bc6c943ced1105aaac6e0ec1be57c70d8a616bd05acaa21ffee/av-16.0.1.tar.gz", hash = "sha256:dd2ce779fa0b5f5889a6d9e00fbbbc39f58e247e52d31044272648fe16ff1dbf", size = 3904030, upload-time = "2025-10-13T12:28:51.082Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/44/78/12a11d7a44fdd8b26a65e2efa1d8a5826733c8887a989a78306ec4785956/av-16.0.1-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:e41a8fef85dfb2c717349f9ff74f92f9560122a9f1a94b1c6c9a8a9c9462ba71", size = 27206375, upload-time = "2025-10-13T12:25:44.423Z" }, + { url = "https://files.pythonhosted.org/packages/27/19/3a4d3882852a0ee136121979ce46f6d2867b974eb217a2c9a070939f55ad/av-16.0.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:6352a64b25c9f985d4f279c2902db9a92424e6f2c972161e67119616f0796cb9", size = 21752603, upload-time = "2025-10-13T12:25:49.122Z" }, + { url = "https://files.pythonhosted.org/packages/cb/6e/f7abefba6e008e2f69bebb9a17ba38ce1df240c79b36a5b5fcacf8c8fcfd/av-16.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:5201f7b4b5ed2128118cb90c2a6d64feedb0586ca7c783176896c78ffb4bbd5c", size = 38931978, upload-time = "2025-10-13T12:25:55.021Z" }, + { url = 
"https://files.pythonhosted.org/packages/b2/7a/1305243ab47f724fdd99ddef7309a594e669af7f0e655e11bdd2c325dfae/av-16.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:daecc2072b82b6a942acbdaa9a2e00c05234c61fef976b22713983c020b07992", size = 40549383, upload-time = "2025-10-13T12:26:00.897Z" }, + { url = "https://files.pythonhosted.org/packages/32/b2/357cc063185043eb757b4a48782bff780826103bcad1eb40c3ddfc050b7e/av-16.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6573da96e8bebc3536860a7def108d7dbe1875c86517072431ced702447e6aea", size = 40241993, upload-time = "2025-10-13T12:26:06.993Z" }, + { url = "https://files.pythonhosted.org/packages/20/bb/ced42a4588ba168bf0ef1e9d016982e3ba09fde6992f1dda586fd20dcf71/av-16.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4bc064e48a8de6c087b97dd27cf4ef8c13073f0793108fbce3ecd721201b2502", size = 41532235, upload-time = "2025-10-13T12:26:12.488Z" }, + { url = "https://files.pythonhosted.org/packages/15/37/c7811eca0f318d5fd3212f7e8c3d8335f75a54907c97a89213dc580b8056/av-16.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0c669b6b6668c8ae74451c15ec6d6d8a36e4c3803dc5d9910f607a174dd18f17", size = 32296912, upload-time = "2025-10-13T12:26:19.187Z" }, + { url = "https://files.pythonhosted.org/packages/86/59/972f199ccc4f8c9e51f59e0f8962a09407396b3f6d11355e2c697ba555f9/av-16.0.1-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:4c61c6c120f5c5d95c711caf54e2c4a9fb2f1e613ac0a9c273d895f6b2602e44", size = 27170433, upload-time = "2025-10-13T12:26:24.673Z" }, + { url = "https://files.pythonhosted.org/packages/53/9d/0514cbc185fb20353ab25da54197fbd169a233e39efcbb26533c36a9dbb9/av-16.0.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:7ecc2e41320c69095f44aff93470a0d32c30892b2dbad0a08040441c81efa379", size = 21717654, upload-time = "2025-10-13T12:26:29.12Z" }, + { url = "https://files.pythonhosted.org/packages/32/8c/881409dd124b4e07d909d2b70568acb21126fc747656390840a2238651c9/av-16.0.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:036f0554d6faef3f4a94acaeb0cedd388e3ab96eb0eb5a14ec27c17369c466c9", size = 38651601, upload-time = "2025-10-13T12:26:33.919Z" }, + { url = "https://files.pythonhosted.org/packages/35/fd/867ba4cc3ab504442dc89b0c117e6a994fc62782eb634c8f31304586f93e/av-16.0.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:876415470a62e4a3550cc38db2fc0094c25e64eea34d7293b7454125d5958190", size = 40278604, upload-time = "2025-10-13T12:26:39.2Z" }, + { url = "https://files.pythonhosted.org/packages/b3/87/63cde866c0af09a1fa9727b4f40b34d71b0535785f5665c27894306f1fbc/av-16.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:56902a06bd0828d13f13352874c370670882048267191ff5829534b611ba3956", size = 39984854, upload-time = "2025-10-13T12:26:44.581Z" }, + { url = "https://files.pythonhosted.org/packages/71/3b/8f40a708bff0e6b0f957836e2ef1f4d4429041cf8d99a415a77ead8ac8a3/av-16.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fe988c2bf0fc2d952858f791f18377ea4ae4e19ba3504793799cd6c2a2562edf", size = 41270352, upload-time = "2025-10-13T12:26:50.817Z" }, + { url = "https://files.pythonhosted.org/packages/1e/b5/c114292cb58a7269405ae13b7ba48c7d7bfeebbb2e4e66c8073c065a4430/av-16.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:708a66c248848029bf518f0482b81c5803846f1b597ef8013b19c014470b620f", size = 32273242, upload-time = "2025-10-13T12:26:55.788Z" }, + { url = "https://files.pythonhosted.org/packages/ff/e9/a5b714bc078fdcca8b46c8a0b38484ae5c24cd81d9c1703d3e8ae2b57259/av-16.0.1-cp313-cp313t-macosx_11_0_x86_64.whl", hash = 
"sha256:79a77ee452537030c21a0b41139bedaf16629636bf764b634e93b99c9d5f4558", size = 27248984, upload-time = "2025-10-13T12:27:00.564Z" }, + { url = "https://files.pythonhosted.org/packages/06/ef/ff777aaf1f88e3f6ce94aca4c5806a0c360e68d48f9d9f0214e42650f740/av-16.0.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:080823a6ff712f81e7089ae9756fb1512ca1742a138556a852ce50f58e457213", size = 21828098, upload-time = "2025-10-13T12:27:05.433Z" }, + { url = "https://files.pythonhosted.org/packages/34/d7/a484358d24a42bedde97f61f5d6ee568a7dd866d9df6e33731378db92d9e/av-16.0.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:04e00124afa8b46a850ed48951ddda61de874407fb8307d6a875bba659d5727e", size = 40051697, upload-time = "2025-10-13T12:27:10.525Z" }, + { url = "https://files.pythonhosted.org/packages/73/87/6772d6080837da5d5c810a98a95bde6977e1f5a6e2e759e8c9292af9ec69/av-16.0.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:bc098c1c6dc4e7080629a7e9560e67bd4b5654951e17e5ddfd2b1515cfcd37db", size = 41352596, upload-time = "2025-10-13T12:27:16.217Z" }, + { url = "https://files.pythonhosted.org/packages/bd/58/fe448c60cf7f85640a0ed8936f16bac874846aa35e1baa521028949c1ea3/av-16.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e6ffd3559a72c46a76aa622630751a821499ba5a780b0047ecc75105d43a6b61", size = 41183156, upload-time = "2025-10-13T12:27:21.574Z" }, + { url = "https://files.pythonhosted.org/packages/85/c6/a039a0979d0c278e1bed6758d5a6186416c3ccb8081970df893fdf9a0d99/av-16.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7a3f1a36b550adadd7513f4f5ee956f9e06b01a88e59f3150ef5fec6879d6f79", size = 42302331, upload-time = "2025-10-13T12:27:26.953Z" }, + { url = "https://files.pythonhosted.org/packages/18/7b/2ca4a9e3609ff155436dac384e360f530919cb1e328491f7df294be0f0dc/av-16.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:c6de794abe52b8c0be55d8bb09ade05905efa74b1a5ab4860b4b9c2bfb6578bf", size = 32462194, upload-time = "2025-10-13T12:27:32.942Z" }, +] + [[package]] name = "babel" version = "2.17.0" @@ -902,6 +931,11 @@ opencua-hf = [ { name = "torch" }, { name = "transformers" }, ] +qwen = [ + { name = "pillow" }, + { name = "qwen-agent" }, + { name = "qwen-vl-utils" }, +] ui = [ { name = "gradio" }, { name = "python-dotenv" }, @@ -944,10 +978,13 @@ requires-dist = [ { name = "litellm", specifier = ">=1.74.12" }, { name = "mlx-vlm", marker = "sys_platform == 'darwin' and extra == 'all'", specifier = ">=0.1.27" }, { name = "mlx-vlm", marker = "sys_platform == 'darwin' and extra == 'uitars-mlx'", specifier = ">=0.1.27" }, + { name = "pillow", marker = "extra == 'qwen'", specifier = ">=10.0.0" }, { name = "pydantic", specifier = ">=2.6.4" }, { name = "python-dotenv", specifier = ">=1.0.1" }, { name = "python-dotenv", marker = "extra == 'all'", specifier = ">=1.0.1" }, { name = "python-dotenv", marker = "extra == 'ui'", specifier = ">=1.0.1" }, + { name = "qwen-agent", marker = "extra == 'qwen'" }, + { name = "qwen-vl-utils", marker = "extra == 'qwen'" }, { name = "rich", specifier = ">=13.7.1" }, { name = "tiktoken", marker = "extra == 'all'", specifier = ">=0.11.0" }, { name = "tiktoken", marker = "extra == 'opencua-hf'", specifier = ">=0.11.0" }, @@ -969,7 +1006,7 @@ requires-dist = [ { name = "yaspin", marker = "extra == 'all'", specifier = ">=3.1.0" }, { name = "yaspin", marker = "extra == 'cli'", specifier = ">=3.1.0" }, ] -provides-extras = ["openai", "anthropic", "omni", "uitars", "uitars-mlx", "uitars-hf", "glm45v-hf", "opencua-hf", "internvl-hf", "moondream3", "ui", "cli", 
"hud", "gemini", "all"] +provides-extras = ["openai", "anthropic", "qwen", "omni", "uitars", "uitars-mlx", "uitars-hf", "glm45v-hf", "opencua-hf", "internvl-hf", "moondream3", "ui", "cli", "hud", "gemini", "all"] [[package]] name = "cua-computer" @@ -1015,7 +1052,7 @@ provides-extras = ["lume", "lumier", "ui", "all"] [[package]] name = "cua-computer-server" -version = "0.1.24" +version = "0.1.25" source = { editable = "libs/python/computer-server" } dependencies = [ { name = "aiohttp" }, @@ -1244,6 +1281,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f0/8b/2c95f0645c6f40211896375e6fa51f504b8ccb29c21f6ae661fe87ab044e/cyclopts-3.24.0-py3-none-any.whl", hash = "sha256:809d04cde9108617106091140c3964ee6fceb33cecdd537f7ffa360bde13ed71", size = 86154, upload-time = "2025-09-08T15:40:56.41Z" }, ] +[[package]] +name = "dashscope" +version = "1.24.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, + { name = "certifi" }, + { name = "cryptography" }, + { name = "requests" }, + { name = "websocket-client" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/c7/ea56bcec1a9e4007b63bc07a060e7076add5eb62be33152c1a702c394225/dashscope-1.24.7-py3-none-any.whl", hash = "sha256:aa6fda8e71a922f5eabedd5c749cc78ad5e1288e8e24c552bddffb7ac61032be", size = 1310416, upload-time = "2025-10-21T10:58:40.974Z" }, +] + [[package]] name = "datasets" version = "4.2.0" @@ -1413,6 +1465,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/de/15/545e2b6cf2e3be84bc1ed85613edd75b8aea69807a71c26f4ca6a9258e82/email_validator-2.3.0-py3-none-any.whl", hash = "sha256:80f13f623413e6b197ae73bb10bf4eb0908faf509ad8362c5edeb0be7fd450b4", size = 35604, upload-time = "2025-08-26T13:09:05.858Z" }, ] +[[package]] +name = "eval-type-backport" +version = "0.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/30/ea/8b0ac4469d4c347c6a385ff09dc3c048c2d021696664e26c7ee6791631b5/eval_type_backport-0.2.2.tar.gz", hash = "sha256:f0576b4cf01ebb5bd358d02314d31846af5e07678387486e2c798af0e7d849c1", size = 9079, upload-time = "2024-12-21T20:09:46.005Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ce/31/55cd413eaccd39125368be33c46de24a1f639f2e12349b0361b4678f3915/eval_type_backport-0.2.2-py3-none-any.whl", hash = "sha256:cb6ad7c393517f476f96d456d0412ea80f0a8cf96f6892834cd9340149111b0a", size = 5830, upload-time = "2024-12-21T20:09:44.175Z" }, +] + [[package]] name = "evdev" version = "1.9.2" @@ -5144,6 +5205,43 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ad/3f/11dd4cd4f39e05128bfd20138faea57bec56f9ffba6185d276e3107ba5b2/questionary-2.1.0-py3-none-any.whl", hash = "sha256:44174d237b68bc828e4878c763a9ad6790ee61990e0ae72927694ead57bab8ec", size = 36747, upload-time = "2024-12-29T11:49:16.734Z" }, ] +[[package]] +name = "qwen-agent" +version = "0.0.31" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dashscope" }, + { name = "dotenv" }, + { name = "eval-type-backport" }, + { name = "json5" }, + { name = "jsonlines" }, + { name = "jsonschema" }, + { name = "openai" }, + { name = "pillow" }, + { name = "pydantic" }, + { name = "requests" }, + { name = "tiktoken" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/69/3d/5bec08ac6474415a484cf0aa008fc28e71b416ce3713e1d514d203374bd6/qwen_agent-0.0.31.tar.gz", hash = "sha256:c608d08f89cbffd7840c7151f59095f7ac08321f10398b0637639dec67294386", size = 7058070, upload-time = 
"2025-09-26T04:00:34.716Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/8b/67ad5290066ac20e5ceddf1d53232e8509b99fa1ebbb60a34da9d5045a25/qwen_agent-0.0.31-py3-none-any.whl", hash = "sha256:3ef803f8450fdf211c0a958b62365b1791c98928cd3f3511d06feb3c97e5c2b3", size = 7134107, upload-time = "2025-09-26T04:00:28.785Z" }, +] + +[[package]] +name = "qwen-vl-utils" +version = "0.0.14" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "av" }, + { name = "packaging" }, + { name = "pillow" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b6/b1/ad4fc2260a3badd278b38d642f3b987412f1f6682f0ef2b31b0572d5caa8/qwen_vl_utils-0.0.14.tar.gz", hash = "sha256:9c7cad5ae803b3a10f8bb7194deb12aeacdd032f92f4224e880c73587a7346ad", size = 8453, upload-time = "2025-09-23T09:38:57.532Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c4/43/80f67e0336cb2fc725f8e06f7fe35c1d0fe946f4d2b8b2175e797e07349e/qwen_vl_utils-0.0.14-py3-none-any.whl", hash = "sha256:5e28657bfd031e56bd447c5901b58ddfc3835285ed100f4c56580e0ade054e96", size = 8120, upload-time = "2025-09-23T09:38:56.297Z" }, +] + [[package]] name = "referencing" version = "0.36.2" From a78dcd333109c3b1ba0908530296b7f0970ba16d Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 22 Oct 2025 16:14:43 -0700 Subject: [PATCH 2/9] Fix qwen3 hallucinating coords w/o screenshots --- README.md | 1 + .../supported-agents/computer-use-agents.mdx | 11 ++ libs/python/agent/agent/loops/qwen.py | 107 ++++++++++++++---- 3 files changed, 98 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 2f9429cc..99b47eb9 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,7 @@ With the Agent SDK, you can: | `openai/computer-use-preview` | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` | any VLM (using liteLLM, requires `tools` parameter) | | `openrouter/z-ai/glm-4.5v` | `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` | any LLM (using liteLLM, requires `moondream3+` prefix ) | | `gemini-2.5-computer-use-preview-10-2025` | any-all-in-one CUA | | +| `openrouter/qwen/qwen3-vl-235b-a22b-instruct` | | | | `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` | | | | `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` | | | `moondream3+{ui planning}` (supports text-only models) | | diff --git a/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx b/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx index ea3e3a4b..a3384b21 100644 --- a/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx +++ b/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx @@ -73,6 +73,17 @@ async for _ in agent.run("Open Firefox and navigate to github.com"): pass ``` +## Qwen3 VL + +Qwen3 VL family: +- `openrouter/qwen/qwen3-vl-235b-a22b-instruct` + +```python +agent = ComputerAgent("openrouter/qwen/qwen3-vl-235b-a22b-instruct", tools=[computer]) +async for _ in agent.run("Open Firefox and navigate to github.com"): + pass +``` + ## UI-TARS 1.5 Unified vision-language model for computer-use: diff --git a/libs/python/agent/agent/loops/qwen.py b/libs/python/agent/agent/loops/qwen.py index 6438ff81..151376d4 100644 --- a/libs/python/agent/agent/loops/qwen.py +++ b/libs/python/agent/agent/loops/qwen.py @@ -54,6 +54,7 @@ QWEN3_COMPUTER_TOOL: Dict[str, Any] = { "triple_click", "scroll", "hscroll", + "screenshot", "wait", # "terminate", # "answer", @@ -125,23 +126,16 @@ def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]: 
except Exception: return None -async def _unnormalize_coordinate(args: Dict[str, Any], computer_handler) -> Dict[str, Any]: - """If coordinate appears in 0..1000 space, scale to actual screen size using computer_handler if provided.""" +async def _unnormalize_coordinate(args: Dict[str, Any], dims: Tuple[int, int]) -> Dict[str, Any]: + """Coordinates appear in 0..1000 space, scale to actual screen size using dims if provided.""" coord = args.get("coordinate") if not coord or not isinstance(coord, (list, tuple)) or len(coord) < 2: return args x, y = float(coord[0]), float(coord[1]) - # Heuristic: treat <= 1000 as normalized - if x <= 1000.0 and y <= 1000.0 and computer_handler is not None and hasattr(computer_handler, "get_dimensions"): - try: - dims = await computer_handler.get_dimensions() - if isinstance(dims, (list, tuple)) and len(dims) >= 2: - width, height = float(dims[0]), float(dims[1]) - x_abs = max(0.0, min(width, (x / 1000.0) * width)) - y_abs = max(0.0, min(height, (y / 1000.0) * height)) - args = {**args, "coordinate": [round(x_abs), round(y_abs)]} - except Exception: - pass + width, height = float(dims[0]), float(dims[1]) + x_abs = max(0.0, min(width, (x / 1000.0) * width)) + y_abs = max(0.0, min(height, (y / 1000.0) * height)) + args = {**args, "coordinate": [round(x_abs), round(y_abs)]} return args @@ -254,6 +248,77 @@ class Qwen3VlConfig(AsyncAgentConfig): nous_system = _build_nous_system([QWEN3_COMPUTER_TOOL["function"]]) completion_messages = ([nous_system] if nous_system else []) + converted_msgs + # If there is no screenshot in the conversation, take one now and inject it. + # Also record a pre_output_items assistant message to reflect action. + def _has_any_image(msgs: List[Dict[str, Any]]) -> bool: + for m in msgs: + content = m.get("content") + if isinstance(content, list): + for p in content: + if isinstance(p, dict) and p.get("type") == "image_url": + return True + return False + + pre_output_items: List[Dict[str, Any]] = [] + if not _has_any_image(completion_messages): + if computer_handler is None or not hasattr(computer_handler, "screenshot"): + raise RuntimeError("No screenshots present and computer_handler.screenshot is not available.") + screenshot_b64 = await computer_handler.screenshot() + if not screenshot_b64: + raise RuntimeError("Failed to capture screenshot from computer_handler.") + # Inject a user message with the screenshot so the model can see current context + completion_messages.append( + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"}}, + {"type": "text", "text": "Current screen"}, + ], + } + ) + # Add assistant message to outputs to reflect the action, similar to composed_grounded.py + pre_output_items.append( + { + "type": "message", + "role": "assistant", + "content": [ + {"type": "text", "text": "Taking a screenshot to see the current computer screen."} + ], + } + ) + + # Smart-resize all screenshots and attach min/max pixel hints. Fail fast if deps missing. + # Also record the last resized width/height to unnormalize coordinates later. + last_rw: Optional[int] = None + last_rh: Optional[int] = None + MIN_PIXELS = 3136 + MAX_PIXELS = 12845056 + try: + from qwen_vl_utils import smart_resize # type: ignore + from PIL import Image # type: ignore + import base64, io + except Exception: + raise ImportError("qwen-vl-utils not installed. 
Please install it with `pip install cua-agent[qwen]`.") + + for msg in completion_messages: + content = msg.get("content") + if not isinstance(content, list): + continue + for part in content: + if isinstance(part, dict) and part.get("type") == "image_url": + url = (((part.get("image_url") or {}).get("url")) or "") + # Expect data URL like data:image/png;base64, + if url.startswith("data:") and "," in url: + b64 = url.split(",", 1)[1] + img_bytes = base64.b64decode(b64) + im = Image.open(io.BytesIO(img_bytes)) + h, w = im.height, im.width + rh, rw = smart_resize(h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS) + # Attach hints on this image block + part["min_pixels"] = MIN_PIXELS + part["max_pixels"] = MAX_PIXELS + last_rw, last_rh = rw, rh + api_kwargs: Dict[str, Any] = { "model": model, "messages": completion_messages, @@ -291,8 +356,10 @@ class Qwen3VlConfig(AsyncAgentConfig): if tool_call and isinstance(tool_call, dict): fn_name = tool_call.get("name") or "computer" raw_args = tool_call.get("arguments") or {} - # Unnormalize coordinates to actual screen size when possible - args = await _unnormalize_coordinate(raw_args, computer_handler) + # Unnormalize coordinates to actual screen size using last resized dims + if last_rw is None or last_rh is None: + raise RuntimeError("No screenshots found to derive dimensions for coordinate unnormalization.") + args = await _unnormalize_coordinate(raw_args, (last_rw, last_rh)) # Build an OpenAI-style tool call so we can reuse the converter fake_cm = { @@ -314,7 +381,8 @@ class Qwen3VlConfig(AsyncAgentConfig): fake_cm = {"role": "assistant", "content": content_text} output_items.extend(convert_completion_messages_to_responses_items([fake_cm])) - return {"output": output_items, "usage": usage} + # Prepend any pre_output_items (e.g., simulated screenshot-taking message) + return {"output": (pre_output_items + output_items), "usage": usage} def get_capabilities(self) -> List[AgentCapability]: return ["step"] @@ -353,7 +421,7 @@ class Qwen3VlConfig(AsyncAgentConfig): # Build Nous system (lazy import inside helper already raises clear guidance if missing) nous_system = _build_nous_system([reduced_tool["function"]]) - # Optionally compute min/max pixels via smart_resize if available + # Pre-process using smart_resize min_pixels = 3136 max_pixels = 12845056 try: @@ -368,9 +436,6 @@ class Qwen3VlConfig(AsyncAgentConfig): h, w = im.height, im.width # Qwen notebook suggests factor=32 and a wide min/max range rh, rw = smart_resize(h, w, factor=32, min_pixels=min_pixels, max_pixels=max_pixels) - # Use total pixels as hints - min_pixels = min(3136, rh * rw) - max_pixels = max(12845056, rh * rw) except Exception: raise ImportError("qwen-vl-utils not installed. 
Please install it with `pip install cua-agent[qwen]`.")
@@ -403,7 +468,7 @@ class Qwen3VlConfig(AsyncAgentConfig):
         content_text = (((choice.get("message") or {}).get("content")) or "")
         tool_call = _parse_tool_call_from_text(content_text) or {}
         args = tool_call.get("arguments") or {}
-        args = await _unnormalize_coordinate(args, kwargs.get("computer_handler"))
+        args = await _unnormalize_coordinate(args, (rw, rh))
         coord = args.get("coordinate")
         if isinstance(coord, (list, tuple)) and len(coord) >= 2:
             return int(coord[0]), int(coord[1])
         return None

From 5bad437f47b6c168d90c88dcaf6a12a1b1bf04ca Mon Sep 17 00:00:00 2001
From: Dillon DuPont
Date: Wed, 22 Oct 2025 16:18:19 -0700
Subject: [PATCH 3/9] add qwen3 vl requirements to pyproj

---
 libs/python/agent/pyproject.toml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/libs/python/agent/pyproject.toml b/libs/python/agent/pyproject.toml
index 02f2bfa4..37913555 100644
--- a/libs/python/agent/pyproject.toml
+++ b/libs/python/agent/pyproject.toml
@@ -104,6 +104,10 @@ all = [
     "hud-python==0.4.52",
     # gemini requirements
     "google-genai>=1.41.0",
+    # qwen requirements
+    "qwen-vl-utils",
+    "qwen-agent",
+    "Pillow>=10.0.0",
 ]
 
 [tool.uv]

From 6d9fddea13efcc070184062061c7db4cb27068b9 Mon Sep 17 00:00:00 2001
From: Dillon DuPont
Date: Wed, 22 Oct 2025 16:22:02 -0700
Subject: [PATCH 4/9] Fix cua-agent version for bump-version

---
 libs/python/agent/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libs/python/agent/pyproject.toml b/libs/python/agent/pyproject.toml
index 37913555..753b933c 100644
--- a/libs/python/agent/pyproject.toml
+++ b/libs/python/agent/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "pdm.backend"
 
 [project]
 name = "cua-agent"
-version = "0.4.34"
+version = "0.4.32"
 description = "CUA (Computer Use) Agent for AI-driven computer interaction"
 readme = "README.md"
 authors = [

From 16072f19a39d7e254a37214fec43b4ef246e294a Mon Sep 17 00:00:00 2001
From: Dillon DuPont
Date: Wed, 22 Oct 2025 16:22:05 -0700
Subject: [PATCH 5/9] Bump cua-agent to v0.4.33

---
 libs/python/agent/.bumpversion.cfg | 2 +-
 libs/python/agent/pyproject.toml   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libs/python/agent/.bumpversion.cfg b/libs/python/agent/.bumpversion.cfg
index 3d046f43..e391cbfe 100644
--- a/libs/python/agent/.bumpversion.cfg
+++ b/libs/python/agent/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.4.32
+current_version = 0.4.33
 commit = True
 tag = True
 tag_name = agent-v{new_version}
diff --git a/libs/python/agent/pyproject.toml b/libs/python/agent/pyproject.toml
index 753b933c..219d51e1 100644
--- a/libs/python/agent/pyproject.toml
+++ b/libs/python/agent/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "pdm.backend"
 
 [project]
 name = "cua-agent"
-version = "0.4.32"
+version = "0.4.33"
 description = "CUA (Computer Use) Agent for AI-driven computer interaction"
 readme = "README.md"
 authors = [

From cb17e2744599a4dc8d86c0726801c6aa218dd9bd Mon Sep 17 00:00:00 2001
From: Dillon DuPont
Date: Wed, 22 Oct 2025 16:22:29 -0700
Subject: [PATCH 6/9] Bump cua-agent to v0.4.34

---
 libs/python/agent/.bumpversion.cfg | 2 +-
 libs/python/agent/pyproject.toml   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libs/python/agent/.bumpversion.cfg b/libs/python/agent/.bumpversion.cfg
index e391cbfe..e159e0b3 100644
--- a/libs/python/agent/.bumpversion.cfg
+++ b/libs/python/agent/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.4.33
+current_version = 0.4.34
 commit = True
 tag = 
True tag_name = agent-v{new_version} diff --git a/libs/python/agent/pyproject.toml b/libs/python/agent/pyproject.toml index 219d51e1..37913555 100644 --- a/libs/python/agent/pyproject.toml +++ b/libs/python/agent/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "pdm.backend" [project] name = "cua-agent" -version = "0.4.33" +version = "0.4.34" description = "CUA (Computer Use) Agent for AI-driven computer interaction" readme = "README.md" authors = [ From bdb5f8918b72a9c727acd4a80e3b675d15902d20 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 22 Oct 2025 16:22:33 -0700 Subject: [PATCH 7/9] Bump cua-agent to v0.4.35 --- libs/python/agent/.bumpversion.cfg | 2 +- libs/python/agent/pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/python/agent/.bumpversion.cfg b/libs/python/agent/.bumpversion.cfg index e159e0b3..b6bb6583 100644 --- a/libs/python/agent/.bumpversion.cfg +++ b/libs/python/agent/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.34 +current_version = 0.4.35 commit = True tag = True tag_name = agent-v{new_version} diff --git a/libs/python/agent/pyproject.toml b/libs/python/agent/pyproject.toml index 37913555..d97b9895 100644 --- a/libs/python/agent/pyproject.toml +++ b/libs/python/agent/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "pdm.backend" [project] name = "cua-agent" -version = "0.4.34" +version = "0.4.35" description = "CUA (Computer Use) Agent for AI-driven computer interaction" readme = "README.md" authors = [ From 682609b7e809b6b10a3b90bc4dfd7279263cc0ed Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 22 Oct 2025 16:36:32 -0700 Subject: [PATCH 8/9] Run uv run pre-commit run --all-files on qwen.py --- libs/python/agent/agent/loops/__init__.py | 2 +- libs/python/agent/agent/loops/qwen.py | 91 ++++++++++++++++------- 2 files changed, 64 insertions(+), 29 deletions(-) diff --git a/libs/python/agent/agent/loops/__init__.py b/libs/python/agent/agent/loops/__init__.py index 3ef599a4..ab23ac27 100644 --- a/libs/python/agent/agent/loops/__init__.py +++ b/libs/python/agent/agent/loops/__init__.py @@ -15,8 +15,8 @@ from . import ( omniparser, openai, opencua, - uitars, qwen, + uitars, ) __all__ = [ diff --git a/libs/python/agent/agent/loops/qwen.py b/libs/python/agent/agent/loops/qwen.py index 151376d4..f21fba2c 100644 --- a/libs/python/agent/agent/loops/qwen.py +++ b/libs/python/agent/agent/loops/qwen.py @@ -3,12 +3,13 @@ Qwen3-VL agent loop implementation using litellm with function/tool calling. 
 - Passes a ComputerUse tool schema to acompletion
 - Converts between Responses items and completion messages using helpers
 """
-from __future__ import annotations
 
-from typing import Any, Dict, List, Optional, Tuple
+from __future__ import annotations
 
 import json
 import re
+from typing import Any, Dict, List, Optional, Tuple
+
 import litellm
 from litellm.responses.litellm_completion_transformation.transformation import (
     LiteLLMCompletionResponsesConfig,
@@ -16,12 +17,11 @@ from litellm.responses.litellm_completion_transformation.transformation import (
 
 from ..decorators import register_agent
 from ..loops.base import AsyncAgentConfig
-from ..types import AgentCapability
 from ..responses import (
-    convert_responses_items_to_completion_messages,
     convert_completion_messages_to_responses_items,
+    convert_responses_items_to_completion_messages,
 )
-
+from ..types import AgentCapability
 
 # ComputerUse tool schema (OpenAI function tool format)
 QWEN3_COMPUTER_TOOL: Dict[str, Any] = {
@@ -96,18 +96,29 @@ QWEN3_COMPUTER_TOOL: Dict[str, Any] = {
     },
 }
 
+
 def _build_nous_system(functions: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
     """Use qwen-agent NousFnCallPrompt to generate a system message embedding tool schema."""
     try:
         from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
-            NousFnCallPrompt,
-            Message as NousMessage,
             ContentItem as NousContentItem,
         )
+        from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
+            Message as NousMessage,
+        )
+        from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
+            NousFnCallPrompt,
+        )
     except ImportError:
-        raise ImportError("qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`.")
+        raise ImportError(
+            "qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`."
+        )
     msgs = NousFnCallPrompt().preprocess_fncall_messages(
-        messages=[NousMessage(role="system", content=[NousContentItem(text="You are a helpful assistant.")])],
+        messages=[
+            NousMessage(
+                role="system", content=[NousContentItem(text="You are a helpful assistant.")]
+            )
+        ],
         functions=functions,
         lang="en",
     )
@@ -116,6 +127,7 @@ def _build_nous_system(functions: List[Dict[str, Any]]) -> Optional[Dict[str, An
     content = [{"type": "text", "text": c["text"]} for c in sys.get("content", [])]
     return {"role": "system", "content": content}
 
+
 def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
     """Extract JSON object within <tool_call>...</tool_call> from model text."""
     m = re.search(r"<tool_call>\s*(\{[\s\S]*?\})\s*</tool_call>", text)
@@ -126,6 +138,7 @@ def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
     except Exception:
         return None
 
+
 async def _unnormalize_coordinate(args: Dict[str, Any], dims: Tuple[int, int]) -> Dict[str, Any]:
     """Coordinates appear in 0..1000 space, scale to actual screen size using dims if provided."""
     coord = args.get("coordinate")
@@ -262,7 +275,9 @@ class Qwen3VlConfig(AsyncAgentConfig):
         pre_output_items: List[Dict[str, Any]] = []
         if not _has_any_image(completion_messages):
            if computer_handler is None or not hasattr(computer_handler, "screenshot"):
-                raise RuntimeError("No screenshots present and computer_handler.screenshot is not available.")
+                raise RuntimeError(
+                    "No screenshots present and computer_handler.screenshot is not available."
+ ) screenshot_b64 = await computer_handler.screenshot() if not screenshot_b64: raise RuntimeError("Failed to capture screenshot from computer_handler.") @@ -271,7 +286,10 @@ class Qwen3VlConfig(AsyncAgentConfig): { "role": "user", "content": [ - {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"}}, + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"}, + }, {"type": "text", "text": "Current screen"}, ], } @@ -282,7 +300,10 @@ class Qwen3VlConfig(AsyncAgentConfig): "type": "message", "role": "assistant", "content": [ - {"type": "text", "text": "Taking a screenshot to see the current computer screen."} + { + "type": "text", + "text": "Taking a screenshot to see the current computer screen.", + } ], } ) @@ -294,11 +315,15 @@ class Qwen3VlConfig(AsyncAgentConfig): MIN_PIXELS = 3136 MAX_PIXELS = 12845056 try: - from qwen_vl_utils import smart_resize # type: ignore + import base64 + import io + from PIL import Image # type: ignore - import base64, io + from qwen_vl_utils import smart_resize # type: ignore except Exception: - raise ImportError("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`.") + raise ImportError( + "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`." + ) for msg in completion_messages: content = msg.get("content") @@ -306,14 +331,16 @@ class Qwen3VlConfig(AsyncAgentConfig): continue for part in content: if isinstance(part, dict) and part.get("type") == "image_url": - url = (((part.get("image_url") or {}).get("url")) or "") + url = ((part.get("image_url") or {}).get("url")) or "" # Expect data URL like data:image/png;base64, if url.startswith("data:") and "," in url: b64 = url.split(",", 1)[1] img_bytes = base64.b64decode(b64) im = Image.open(io.BytesIO(img_bytes)) h, w = im.height, im.width - rh, rw = smart_resize(h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS) + rh, rw = smart_resize( + h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS + ) # Attach hints on this image block part["min_pixels"] = MIN_PIXELS part["max_pixels"] = MAX_PIXELS @@ -349,7 +376,7 @@ class Qwen3VlConfig(AsyncAgentConfig): # Parse tool call from text; then convert to responses items via fake tool_calls resp_dict = response.model_dump() # type: ignore choice = (resp_dict.get("choices") or [{}])[0] - content_text = (((choice.get("message") or {}).get("content")) or "") + content_text = ((choice.get("message") or {}).get("content")) or "" tool_call = _parse_tool_call_from_text(content_text) output_items: List[Dict[str, Any]] = [] @@ -358,7 +385,9 @@ class Qwen3VlConfig(AsyncAgentConfig): raw_args = tool_call.get("arguments") or {} # Unnormalize coordinates to actual screen size using last resized dims if last_rw is None or last_rh is None: - raise RuntimeError("No screenshots found to derive dimensions for coordinate unnormalization.") + raise RuntimeError( + "No screenshots found to derive dimensions for coordinate unnormalization." 
+                )
             args = await _unnormalize_coordinate(raw_args, (last_rw, last_rh))
 
             # Build an OpenAI-style tool call so we can reuse the converter
@@ -426,10 +455,12 @@ class Qwen3VlConfig(AsyncAgentConfig):
         max_pixels = 12845056
         try:
             # Lazy import to avoid hard dependency
-            from qwen_vl_utils import smart_resize  # type: ignore
+            import base64
+            import io
+
             # If PIL is available, estimate size from image to derive smart bounds
             from PIL import Image
-            import io, base64
+            from qwen_vl_utils import smart_resize  # type: ignore
 
             img_bytes = base64.b64decode(image_b64)
             im = Image.open(io.BytesIO(img_bytes))
@@ -437,16 +468,16 @@ class Qwen3VlConfig(AsyncAgentConfig):
             # Qwen notebook suggests factor=32 and a wide min/max range
             rh, rw = smart_resize(h, w, factor=32, min_pixels=min_pixels, max_pixels=max_pixels)
         except Exception:
-            raise ImportError("qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`.")
+            raise ImportError(
+                "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
+            )
 
         messages = []
         if nous_system:
             messages.append(nous_system)
         image_block: Dict[str, Any] = {
-            "type": "image_url",
-            "image_url": {
-                "url": f"data:image/png;base64,{image_b64}"
-            },
+            "type": "image_url",
+            "image_url": {"url": f"data:image/png;base64,{image_b64}"},
             "min_pixels": min_pixels,
             "max_pixels": max_pixels,
         }
@@ -461,11 +492,15 @@ class Qwen3VlConfig(AsyncAgentConfig):
             }
         )
 
-        api_kwargs: Dict[str, Any] = {"model": model, "messages": messages, **{k: v for k, v in kwargs.items()}}
+        api_kwargs: Dict[str, Any] = {
+            "model": model,
+            "messages": messages,
+            **{k: v for k, v in kwargs.items()},
+        }
         response = await litellm.acompletion(**api_kwargs)
         resp = response.model_dump()  # type: ignore
         choice = (resp.get("choices") or [{}])[0]
-        content_text = (((choice.get("message") or {}).get("content")) or "")
+        content_text = ((choice.get("message") or {}).get("content")) or ""
         tool_call = _parse_tool_call_from_text(content_text) or {}
         args = tool_call.get("arguments") or {}
         args = await _unnormalize_coordinate(args, (rw, rh))
         coord = args.get("coordinate")
         if isinstance(coord, (list, tuple)) and len(coord) >= 2:
             return int(coord[0]), int(coord[1])
         return None

From 21427340544c316e53c398fc0f42984626ff6cbf Mon Sep 17 00:00:00 2001
From: Dillon DuPont
Date: Wed, 22 Oct 2025 16:38:57 -0700
Subject: [PATCH 9/9] Fix readme lint

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 99b47eb9..1d09535c 100644
--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@ With the Agent SDK, you can:
 | `openai/computer-use-preview` | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` | any VLM (using liteLLM, requires `tools` parameter) |
 | `openrouter/z-ai/glm-4.5v` | `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` | any LLM (using liteLLM, requires `moondream3+` prefix ) |
 | `gemini-2.5-computer-use-preview-10-2025` | any-all-in-one CUA | |
-| `openrouter/qwen/qwen3-vl-235b-a22b-instruct` | | | 
+| `openrouter/qwen/qwen3-vl-235b-a22b-instruct` | | |
 | `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` | | |
 | `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` | |
 | `moondream3+{ui planning}` (supports text-only models) | |
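
A minimal end-to-end sketch of the tool-call contract this series implements: Qwen3-VL returns a JSON action wrapped in `<tool_call>` tags, with coordinates in a 0..1000 reference space that the loop scales to the smart-resized screen dimensions. The helpers below mirror `_parse_tool_call_from_text` and `_unnormalize_coordinate` from `qwen.py` but are a self-contained illustration, not the shipped implementation:

```python
import json
import re


def parse_tool_call(text: str):
    # The model wraps its JSON action in <tool_call>...</tool_call>.
    m = re.search(r"<tool_call>\s*(\{[\s\S]*?\})\s*</tool_call>", text)
    return json.loads(m.group(1)) if m else None


def unnormalize(coord, width: float, height: float):
    # Scale a 0..1000 reference-space coordinate to pixel space, clamped to bounds.
    x = max(0.0, min(width, coord[0] / 1000.0 * width))
    y = max(0.0, min(height, coord[1] / 1000.0 * height))
    return round(x), round(y)


reply = '<tool_call>{"name": "computer", "arguments": {"action": "left_click", "coordinate": [500, 250]}}</tool_call>'
call = parse_tool_call(reply)
print(unnormalize(call["arguments"]["coordinate"], 1920, 1080))  # (960, 270)
```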