feat(cua-driver): add Claude Code computer-use compatibility mode (#1424)

* feat(cua-driver): add Claude Code computer-use compatibility mode
* docs(cua-driver): avoid hard-wrapped compatibility prose
* docs(cua-driver): clarify Claude Code vision mode needs MCP
* fix(cua-driver): address CodeRabbit review findings
* docs(cua-driver): sync generated reference docs
* fix(docs): escape generated cua-driver mdx
2026-05-12 15:19:17 -05:00 · 2026-05-01 21:08:01 -07:00
parent e4b047a43e
commit e69d1cbc06
20 changed files with 596 additions and 87 deletions
@@ -113,6 +113,9 @@ Cua Driver speaks MCP over stdio. Use `cua-driver mcp-config --client <name>` to
 # Claude Code (add --scope project|global as needed)
 claude mcp add --transport stdio cua-driver -- ~/.local/bin/cua-driver mcp

+# Claude Code computer-use compatibility mode
+claude mcp add --transport stdio cua-computer-use -- ~/.local/bin/cua-driver mcp --claude-code-computer-use-compat
+
 # Codex (OpenAI)
 codex mcp add cua-driver -- ~/.local/bin/cua-driver mcp

@@ -120,6 +123,10 @@ codex mcp add cua-driver -- ~/.local/bin/cua-driver mcp
 cua-driver mcp-config --client openclaw | sh
 ```

+The Claude Code compatibility mode keeps CuaDriver's normal MCP tools, but replaces `screenshot` with a window-only screenshot shim that requires `pid` and `window_id`. Use it when you want Claude Code's vision/computer-use-style flow to ground on CuaDriver window captures.
+
+Use MCP for that Claude Code vision/computer-use-style path. Shelling out to `cua-driver screenshot` can capture a window, but it does not expose the `mcp__cua-computer-use__screenshot` tool name that Claude Code appears to use as the image-grounding cue.
+
 ### Clients configured via a config file

 Cursor, OpenCode, and Hermes all configure MCP servers via files. Use `mcp-config` to print the exact snippet, paste it into the right path:
@@ -14,6 +14,8 @@ import { Callout } from 'fumadocs-ui/components/callout';

 ## Claude Code

+Standard MCP registration:
+
 ```bash
 claude mcp add --transport stdio cua-driver -- cua-driver mcp
 ```
@@ -25,6 +27,22 @@ claude mcp list
 # cua-driver: cua-driver mcp (stdio) - ✓ Connected
 ```

+### Claude Code computer-use compatibility mode
+
+Claude Code vision/computer-use-style flows appear to use the presence of a screenshot tool as a cue for image-grounded operation. If you want that behavior, register the compatibility server instead:
+
+```bash
+claude mcp add --transport stdio cua-computer-use -- cua-driver mcp --claude-code-computer-use-compat
+```
+
+This mode still exposes the normal CuaDriver tools. The only changed tool is `screenshot`: it requires `pid` and `window_id`, captures that window only, and returns an image whose pixel coordinates are local to that window — the coordinate frame that CuaDriver's pixel-based tools then accept. Start with `launch_app` or `list_windows`, then call `screenshot` with the target window.
+
+For this Claude Code vision/computer-use-style path, use MCP rather than shelling out to the CLI. CLI screenshots can still capture windows, but they do not expose the `mcp__cua-computer-use__screenshot` tool name that Claude Code appears to use as the image-grounding cue.
+
+<Callout type="info">
+  This does not call Anthropic APIs or expose Anthropic's native computer-use API tool. It is a CuaDriver MCP compatibility mode for Claude Code.
+</Callout>
+
 ## GitHub Copilot CLI

 Add to `~/.copilot/mcp-config.json`:
@@ -6,8 +6,8 @@ description: Command Line Interface reference for Cua Driver
 {/*
  AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY
  Generated by: npx tsx scripts/docs-generators/cua-driver.ts
-  Source: libs/cua-driver/Sources/**/*.swift
-  Version: 0.1.0
+  Source: recursive Swift sources under libs/cua-driver/Sources
+  Version: 0.1.1
 */}

 import { Callout } from 'fumadocs-ui/components/callout';
@@ -16,7 +16,7 @@ import { VersionHeader } from '@/components/version-selector';
 <VersionHeader
  versions={[{"version":"0.1","href":"/cua-driver/reference/cli-reference","isCurrent":true}]}
  currentVersion="0.1"
-  fullVersion="0.1.0"
+  fullVersion="0.1.1"
  packageName="cua-driver"
  installCommand="curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/cua-driver/scripts/install.sh | bash"
 />
@@ -36,8 +36,8 @@ provided, the tool is called with no arguments.

 Examples:
  cua-driver call list_apps
-  cua-driver call launch_app '{"bundle_id":"com.apple.finder"}'
-  echo '{"pid":844,"window_id":1234}' | cua-driver call get_window_state
+  cua-driver call launch_app '&#123;"bundle_id":"com.apple.finder"&#125;'
+  echo '&#123;"pid":844,"window_id":1234&#125;' | cua-driver call get_window_state

 **Arguments:**

@@ -106,6 +106,12 @@ Print a tool's full description and JSON input schema.

 Run the stdio MCP server.

+**Flags:**
+
+| Name | Description |
+| ---- | ----------- |
+| `--claude-code-computer-use-compat` | Expose normal CuaDriver tools, replacing only `screenshot` with a Claude Code-friendly window-only screenshot that establishes the vision coordinate frame. |
+
 ### cua-driver serve

 Run cua-driver as a long-running daemon on a Unix domain socket.
@@ -158,6 +164,12 @@ Print MCP server config or a client-specific install command.
 | ---- | ---- | ------- | ----------- |
 | `--client` | String | — | Client to print the install command for: claude \| codex \| cursor \| openclaw \| opencode \| hermes \| pi. Omit for the generic JSON snippet. |

+**Flags:**
+
+| Name | Description |
+| ---- | ----------- |
+| `--claude-code-computer-use-compat` | Print config for Claude Code's window-scoped screenshot compatibility mode registered as `cua-computer-use`. |
+
 ## Trajectory recording

 ### cua-driver recording
@@ -6,8 +6,8 @@ description: Reference for every MCP tool cua-driver exposes
 {/*
  AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY
  Generated by: npx tsx scripts/docs-generators/cua-driver.ts
-  Source: libs/cua-driver/Sources/**/*.swift
-  Version: 0.1.0
+  Source: recursive Swift sources under libs/cua-driver/Sources
+  Version: 0.1.1
 */}

 import { Callout } from 'fumadocs-ui/components/callout';
@@ -25,7 +25,7 @@ Tool names are `snake_case`. Responses are MCP `CallTool.Result` envelopes: a te
 Report TCC permission status for Accessibility and Screen Recording.
 By default also raises the system permission dialogs for any missing
 grants — Apple's request APIs are no-ops when the grant is already
-active, so this is safe to call repeatedly. Pass {"prompt": false}
+active, so this is safe to call repeatedly. Pass &#123;"prompt": false&#125;
 for a purely read-only status check.

 **Arguments:**
@@ -208,18 +208,18 @@ at startup. Sibling to `set_config` / `cua-driver config`.

 Current schema:

-  {
+  &#123;
    "schema_version": 1,
    "capture_mode": "vision" | "ax" | "som",
-    "agent_cursor": {
+    "agent_cursor": &#123;
      "enabled": true,
-      "motion": {
+      "motion": &#123;
        "start_handle": 0.3, "end_handle": 0.3,
        "arc_size": 0.25, "arc_flow": 0.0,
        "spring": 0.72
-      }
-    }
-  }
+      &#125;
+    &#125;
+  &#125;

 **Arguments:** none.

@@ -436,10 +436,10 @@ later to resolve a target.
 - `additional_arguments` (array of string, optional): Extra command-line arguments passed to the launched process. Passed directly as argv entries — no shell expansion. Example: ["--user-data-dir=/tmp/cua-session", "--no-first-run"] for an isolated Chrome session.
 - `bundle_id` (string, optional): App bundle identifier, e.g. com.apple.calculator.
 - `creates_new_application_instance` (boolean, optional): Force a brand-new process even if the app is already running. Useful for isolated browser sessions: pass creates_new_application_instance=true together with additional_arguments=["--user-data-dir=/tmp/session-a", "--no-first-run", "--no-default-browser-check"] to launch a sandboxed Chrome that cannot see the user's real profile, cookies, or extensions. Each session gets its own pid and window identity and can be controlled independently.
- `electron_debugging_port` (integer, optional): Launch an Electron app with --remote-debugging-port=<N> so the `page` tool gets full renderer/DOM access. Use 9222 unless running multiple Electron apps. Ignored for non-Electron apps.
+- `electron_debugging_port` (integer, optional): Launch an Electron app with --remote-debugging-port=&lt;N&gt; so the `page` tool gets full renderer/DOM access. Use 9222 unless running multiple Electron apps. Ignored for non-Electron apps.
 - `name` (string, optional): App display name. Used only when bundle_id is absent.
 - `urls` (array of string, optional): Optional file:// or http(s):// URLs (or plain paths with ~ expansion) to hand to the launched app via application(_:open:). For Finder, pass a folder URL or path to open a backgrounded Finder window rooted at that folder — no activation. Apps that don't implement application(_:open:) launch normally and ignore these.
- `webkit_inspector_port` (integer, optional): Launch a Tauri/WKWebView app with WEBKIT_INSPECTOR_SERVER=127.0.0.1:<N> so the `page` tool can reach its WebKit inspector. Use 9226 (reserved WebKit range: 9226–9228, distinct from Electron's 9222–9225). Requires developerExtrasEnabled=true in the WKWebView config (default in Tauri debug builds).
+- `webkit_inspector_port` (integer, optional): Launch a Tauri/WKWebView app with WEBKIT_INSPECTOR_SERVER=127.0.0.1:&lt;N&gt; so the `page` tool can reach its WebKit inspector. Use 9226 (reserved WebKit range: 9226–9228, distinct from Electron's 9222–9225). Requires developerExtrasEnabled=true in the WKWebView config (default in Tauri debug builds).

 ### list_apps

@@ -457,7 +457,7 @@ apps come from scanning /Applications, /Applications/Utilities,
 Use this for "is X installed?" as well as "is X running?". For
 per-window state — on-screen, on-current-Space, minimized,
 window titles — call list_windows instead. For just opening an
-app — running or not — call launch_app({bundle_id: ...}) directly;
+app — running or not — call launch_app(&#123;bundle_id: ...&#125;) directly;
 list_apps is not a prerequisite.

 **Arguments:** none.
@@ -696,10 +696,9 @@ propagate modifier keys).
 ### screenshot

 Capture a screenshot using ScreenCaptureKit. Returns base64-encoded
-image data in the requested format (default png).
+image data for a single window in the requested format (default png).

-Without `window_id`, captures the full main display. With `window_id`,
-captures just that window (get the id from `list_windows`).
+`window_id` is required. Get window ids from `list_windows`.

 Requires the Screen Recording TCC grant — call `check_permissions`
 first if unsure.
@@ -708,7 +707,11 @@ first if unsure.

 - `format` (string, optional): Image format. Default: png.
 - `quality` (integer, optional): JPEG quality 1-95; ignored for png.
- `window_id` (integer, optional): Optional CGWindowID / kCGWindowNumber to capture just that window.
+- `window_id` (integer, required): Required CGWindowID / kCGWindowNumber to capture.
+
+```json
+{"window_id":10725}
+```

 ### scroll

@@ -843,13 +846,13 @@ optional; omitted fields keep their current value.
  string) to revert to the procedural arrow.

 Example — brand-colored arrow:
-  {"gradient_colors": ["#A855F7", "#6366F1"], "bloom_color": "#A855F7"}
+  &#123;"gradient_colors": ["#A855F7", "#6366F1"], "bloom_color": "#A855F7"&#125;

 Example — custom PNG cursor:
-  {"image_path": "~/cursors/my-cursor.png"}
+  &#123;"image_path": "~/cursors/my-cursor.png"&#125;

 Example — revert to default:
-  {"gradient_colors": [], "bloom_color": "", "image_path": ""}
+  &#123;"gradient_colors": [], "bloom_color": "", "image_path": ""&#125;

 **Arguments:**

@@ -931,7 +934,7 @@ Set a value on a UI element. Two modes depending on element role:
 - **AXPopUpButton / select dropdown**: finds the child option whose
  title or value matches `value` (case-insensitive) and AXPresses it
  directly — the native macOS popup menu is never opened, so focus
-  is never stolen. Use this for HTML <select> elements in Safari or
+  is never stolen. Use this for HTML &lt;select&gt; elements in Safari or
  any native NSPopUpButton. Pass the option's display label as `value`
  (e.g. "Blue", not "blue").

@@ -143,4 +143,4 @@ async with Sandbox.ephemeral(Image.linux(), local=True) as sb: ...
 async with Sandbox.ephemeral(Image.macos(), local=True) as sb: ...
 ```

-See [Self-Hosted Sandboxes](/cua/guide/get-started/self-hosted-sandboxes) for local setup instructions.
+See [Set Up a Sandbox](/cua/guide/get-started/set-up-sandbox) for local setup instructions.
@@ -3,3 +3,21 @@
 Background computer-use driver for any agent. Speaks MCP over stdio; drives native macOS apps without stealing focus.

 **[Documentation](https://cua.ai/docs/cua-driver)** - Installation, guides, and API reference.
+
+## Claude Code computer-use compatibility
+
+Standard Claude Code MCP registration:
+
+```bash
+claude mcp add --transport stdio cua-driver -- cua-driver mcp
+```
+
+If you want Claude Code's vision/computer-use-style flow to ground on CuaDriver window screenshots, register the compatibility mode:
+
+```bash
+claude mcp add --transport stdio cua-computer-use -- cua-driver mcp --claude-code-computer-use-compat
+```
+
+This keeps CuaDriver's normal MCP tools and changes only `screenshot`, which requires `pid` and `window_id` and captures that window only.
+
+Use MCP for this Claude Code vision/computer-use-style path. CLI screenshots still work as CuaDriver calls, but they do not expose the `mcp__cua-computer-use__screenshot` tool name that Claude Code appears to use as the image-grounding cue.
@@ -85,6 +85,18 @@ also invoke it explicitly:
 /cua-driver
 ```

+## Claude Code MCP compatibility mode
+
+For normal skill-driven use, prefer the CLI or the standard MCP server. If you want Claude Code's vision/computer-use-style flow to ground on CuaDriver screenshots, register the compatibility server:
+
+```bash
+claude mcp add --transport stdio cua-computer-use -- cua-driver mcp --claude-code-computer-use-compat
+```
+
+This mode exposes the normal CuaDriver tools and changes only `screenshot`. The compatibility screenshot requires `pid` and `window_id`, captures that window only, and establishes a window-local pixel coordinate frame. It does not call Anthropic APIs or expose Anthropic's native computer-use API tool.
+
+Use MCP for this Claude Code vision/computer-use-style path. CLI screenshots still work as CuaDriver calls, but they do not expose the `mcp__cua-computer-use__screenshot` tool name that Claude Code appears to use as the image-grounding cue.
+
 ## Files

 - `SKILL.md` — the main skill body (~500 lines). Loaded on first
@@ -140,6 +140,18 @@ Every reference to `click(...)`, `get_window_state(...)` etc. in this
 skill means `cua-driver click '{...}'` — translate to MCP form only
 when MCP is requested.

+### Claude Code computer-use compatibility mode
+
+For normal Claude Code use, keep the default CLI or `cua-driver` MCP server path above. If the user explicitly wants Claude Code's vision/computer-use-style flow, they can register:
+
+```bash
+claude mcp add --transport stdio cua-computer-use -- cua-driver mcp --claude-code-computer-use-compat
+```
+
+Observation: Claude Code vision flows appear to treat a screenshot MCP tool as the image-grounding anchor. This compatibility mode keeps the normal CuaDriver tools and changes only `screenshot`. The compatibility `screenshot` requires `pid` and `window_id`, captures only that target window, and returns the window-local pixel coordinate frame. Start with `launch_app` or `list_windows`, then call `screenshot({pid, window_id})`; do not assume desktop coordinates or a full-screen capture.
+
+Use MCP for this Claude Code vision/computer-use-style path. Do not shell out to `cua-driver screenshot` as a substitute: CLI screenshots still work as CuaDriver calls, but they do not expose the `mcp__cua-computer-use__screenshot` tool name that Claude Code appears to use as the image-grounding cue.
+
 Intent → tool mapping. If you find yourself reaching for the right
 column, something has gone wrong — re-read "The no-foreground
 contract" above:
@@ -1,3 +1,4 @@
+import AppKit
 import ArgumentParser
 import CuaDriverServer
 import Foundation
@@ -171,6 +172,7 @@ struct CallCommand: AsyncParsableCommand {

        let result: CallTool.Result
        do {
+            await bootstrapAppKitForInProcessCallIfNeeded(toolName: toolName)
            // Route through `registry.call(...)` so the recording hook
            // (and any future cross-cutting wrapper) fires consistently
            // with the MCP and daemon paths. The in-process one-shot
@@ -264,6 +266,35 @@ struct CallCommand: AsyncParsableCommand {
    }
 }

+/// Switches the shared `NSApplication` to the `.accessory` activation policy
+/// when `toolName` is one of the tools listed below; for any other tool name
+/// the function returns without touching AppKit.
+///
+/// - Parameter toolName: Name of the tool that is about to be invoked.
+private func bootstrapAppKitForInProcessCallIfNeeded(toolName: String) async {
+    // Tool names that trigger the AppKit bootstrap.
+    let toolsNeedingAppKit = Set<String>([
+        "check_permissions", "click", "double_click", "drag",
+        "get_accessibility_tree", "get_cursor_position", "get_window_state",
+        "hotkey", "launch_app", "list_apps", "list_windows",
+        "move_cursor", "press_key", "right_click", "screenshot",
+        "scroll", "set_value", "type_text", "type_text_chars",
+        "zoom",
+    ])
+    if !toolsNeedingAppKit.contains(toolName) {
+        return
+    }
+    // The policy change is performed on the main actor; its Bool result is
+    // deliberately discarded.
+    await MainActor.run {
+        _ = NSApplication.shared.setActivationPolicy(.accessory)
+    }
+}
+
 struct ListToolsCommand: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        commandName: "list-tools",
@@ -18,6 +18,8 @@ import MCP
 ///
 /// Keys are dotted snake_case paths:
 ///   - `schema_version`
+///   - `capture_mode`
+///   - `max_image_dimension`
 ///   - `agent_cursor.enabled`
 ///   - `agent_cursor.motion.{start_handle,end_handle,arc_size,arc_flow,spring}`
 struct ConfigCommand: AsyncParsableCommand {
@@ -31,6 +33,8 @@ struct ConfigCommand: AsyncParsableCommand {

            Examples:
              cua-driver config                                    # print full config
+              cua-driver config get capture_mode
+              cua-driver config set capture_mode vision
              cua-driver config get agent_cursor.enabled
              cua-driver config set agent_cursor.enabled false
              cua-driver config set agent_cursor.motion.arc_size 0.4
@@ -147,6 +151,10 @@ struct ConfigGetCommand: AsyncParsableCommand {
        switch key {
        case "schema_version":
            print(config.schemaVersion)
+        case "capture_mode":
+            print(config.captureMode.rawValue)
+        case "max_image_dimension":
+            print(config.maxImageDimension)
        case "agent_cursor.enabled":
            print(config.agentCursor.enabled)
        case "agent_cursor.motion.start_handle":
@@ -43,33 +43,62 @@ struct MCPConfigCommand: ParsableCommand {
            help: "Client to print the install command for: claude | codex | cursor | openclaw | opencode | hermes | pi. Omit for the generic JSON snippet.")
    var client: String?

+    @Flag(
+        name: .long,
+        help: "Print config for Claude Code's window-scoped screenshot compatibility mode registered as `cua-computer-use`."
+    )
+    var claudeCodeComputerUseCompat: Bool = false
+
    func run() throws {
        let binary = resolvedBinaryPath()
+        let shellBinary = shellEscape(binary)
+        // Observed Claude Code behavior: the exact config key "computer-use"
+        // is reserved, so external stdio registrations use a distinct key.
+        let serverName = claudeCodeComputerUseCompat ? "cua-computer-use" : "cua-driver"
+        let args = claudeCodeComputerUseCompat
+            ? "[\"mcp\", \"--claude-code-computer-use-compat\"]"
+            : "[\"mcp\"]"
+        let commandArgs = claudeCodeComputerUseCompat
+            ? "mcp --claude-code-computer-use-compat"
+            : "mcp"
        switch client?.lowercased() {
        case nil, "":
-            print(genericMcpServersSnippet(binary: binary, includeType: false))
+            print(genericMcpServersSnippet(
+                serverName: serverName,
+                binary: binary,
+                args: args,
+                includeType: false
+            ))
        case "claude":
-            print("claude mcp add --transport stdio cua-driver -- \(binary) mcp")
+            print("claude mcp add --transport stdio \(serverName) -- \(shellBinary) \(commandArgs)")
        case "codex":
-            print("codex mcp add cua-driver -- \(binary) mcp")
+            print("codex mcp add \(serverName) -- \(shellBinary) \(commandArgs)")
        case "cursor":
            // Cursor has no CLI — emit JSON the user pastes into
            // ~/.cursor/mcp.json (global) or .cursor/mcp.json (project).
-            print(genericMcpServersSnippet(binary: binary, includeType: true))
+            print(genericMcpServersSnippet(
+                serverName: serverName,
+                binary: binary,
+                args: args,
+                includeType: true
+            ))
        case "openclaw":
            // OpenClaw has a CLI registry — set with a JSON arg.
-            print("openclaw mcp set cua-driver '{\"command\":\"\(binary)\",\"args\":[\"mcp\"]}'")
+            print("openclaw mcp set \(serverName) '{\"command\":\"\(binary)\",\"args\":\(args)}'")
        case "opencode":
            // OpenCode (sst/opencode) uses opencode.json with type:"local"
            // and command as a single merged array.
+            let commandArray = claudeCodeComputerUseCompat
+                ? "[\"\(binary)\", \"mcp\", \"--claude-code-computer-use-compat\"]"
+                : "[\"\(binary)\", \"mcp\"]"
            let snippet = """
            // paste under "mcp" in opencode.json (or opencode.jsonc):
            {
              "$schema": "https://opencode.ai/config.json",
              "mcp": {
-                "cua-driver": {
+                "\(serverName)": {
                  "type": "local",
-                  "command": ["\(binary)", "mcp"],
+                  "command": \(commandArray),
                  "enabled": true
                }
              }
@@ -83,9 +112,9 @@ struct MCPConfigCommand: ParsableCommand {
            # paste under mcp_servers in ~/.hermes/config.yaml,
            # then run /reload-mcp inside Hermes:
            mcp_servers:
-              cua-driver:
+              \(serverName):
                command: "\(binary)"
-                args: ["mcp"]
+                args: \(args)
            """
            print(snippet)
        case "pi":
@@ -117,14 +146,19 @@ struct MCPConfigCommand: ParsableCommand {
        }
    }

-    private func genericMcpServersSnippet(binary: String, includeType: Bool) -> String {
+    private func genericMcpServersSnippet(
+        serverName: String,
+        binary: String,
+        args: String,
+        includeType: Bool
+    ) -> String {
        let typeLine = includeType ? ",\n      \"type\": \"stdio\"" : ""
        return """
        {
          "mcpServers": {
-            "cua-driver": {
+            "\(serverName)": {
              "command": "\(binary)",
-              "args": ["mcp"]\(typeLine)
+              "args": \(args)\(typeLine)
            }
          }
        }
@@ -140,6 +174,10 @@ struct MCPConfigCommand: ParsableCommand {
        }
        return CommandLine.arguments.first ?? "cua-driver"
    }
+
+    /// Returns `value` wrapped in single quotes for use as one shell word.
+    /// Each embedded single quote is rewritten as `'"'"'` (close the quoted
+    /// run, emit a double-quoted `'`, reopen the quoted run).
+    private func shellEscape(_ value: String) -> String {
+        let quote = "'"
+        let escapedQuote = "'\"'\"'"
+        let body = value.replacingOccurrences(of: quote, with: escapedQuote)
+        return quote + body + quote
+    }
 }

 /// Top-level entry point. Before handing to ArgumentParser, rewrite
@@ -307,6 +345,17 @@ struct MCPCommand: ParsableCommand {
        abstract: "Run the stdio MCP server."
    )

+    @Flag(
+        name: .long,
+        help: """
+            Expose normal CuaDriver tools, replacing only `screenshot` with a \
+            Claude Code-friendly window-only screenshot that establishes the \
+            vision coordinate frame. This does not use Anthropic's native \
+            computer_2025 API tool.
+            """
+    )
+    var claudeCodeComputerUseCompat: Bool = false
+
    func run() throws {
        // MCP stdio runs for the lifetime of the host process, so we
        // bootstrap AppKit here — the agent cursor overlay (disabled
@@ -342,7 +391,12 @@ struct MCPCommand: ParsableCommand {
                AgentCursor.shared.apply(config: config.agentCursor)
            }

-            let server = await CuaDriverMCPServer.make()
+            let server = await CuaDriverMCPServer.make(
+                serverName: claudeCodeComputerUseCompat ? "computer-use" : "cua-driver",
+                registry: claudeCodeComputerUseCompat
+                    ? .claudeCodeComputerUseCompat
+                    : .default
+            )
            let transport = StdioTransport()
            try await server.start(transport: transport)
            await server.waitUntilCompleted()
@@ -94,7 +94,9 @@ enum CLIDocExtractor {
            discussion: nil,
            arguments: [],
            options: [],
-            flags: [],
+            flags: [
+                FlagDoc(name: "claude-code-computer-use-compat", shortName: nil, help: "Expose normal CuaDriver tools, replacing only `screenshot` with a Claude Code-friendly window-only screenshot that establishes the vision coordinate frame.", defaultValue: false),
+            ],
            subcommands: []
        )
    }
@@ -412,7 +414,9 @@ enum CLIDocExtractor {
            options: [
                OptionDoc(name: "client", shortName: nil, help: "Client to print the install command for: claude | codex | cursor | openclaw | opencode | hermes | pi. Omit for the generic JSON snippet.", type: "String", defaultValue: nil, isOptional: true),
            ],
-            flags: [],
+            flags: [
+                FlagDoc(name: "claude-code-computer-use-compat", shortName: nil, help: "Print config for Claude Code's window-scoped screenshot compatibility mode registered as `cua-computer-use`.", defaultValue: false),
+            ],
            subcommands: []
        )
    }
@@ -98,15 +98,21 @@ public enum MouseInput {
    /// screen points (top-left origin) and deliver them to `pid`.
    /// `modifiers` accepts the same names as `KeyboardInput`
    /// (`cmd` / `command`, `shift`, `option` / `alt`, `ctrl` /
-    /// `control`, `fn`); unknown names are ignored. Events are
-    /// posted via auth-signed `SLEventPostToPid` AND the public HID
-    /// tap — see the file-level doc for the rationale.
+    /// `control`, `fn`); unknown names are ignored.
+    ///
+    /// When `useFrontmostHIDPath` is true (the default) and `pid` is
+    /// frontmost, events are posted through the public HID tap, which
+    /// can move the global cursor and is required for some viewport
+    /// apps. When false, the function skips that path and uses only
+    /// pid-routed delivery, preserving the system cursor for callers
+    /// that rely on background-style dispatch.
    public static func click(
        at point: CGPoint,
        toPid pid: pid_t,
        button: Button,
        count: Int = 1,
-        modifiers: [String] = []
+        modifiers: [String] = [],
+        useFrontmostHIDPath: Bool = true
    ) throws {
        // When the target is frontmost, route via the public HID tap
        // (`CGEventPost(tap: .cghidEventTap)`) with a preceding
@@ -123,7 +129,7 @@ public enum MouseInput {
        // (Chrome/Slack/etc); they just don't work on viewports.
        let targetIsFrontmost =
            NSRunningApplication(processIdentifier: pid)?.isActive ?? false
-        if targetIsFrontmost {
+        if useFrontmostHIDPath && targetIsFrontmost {
            try clickFrontmostViaHIDTap(
                at: point, button: button, count: count, modifiers: modifiers)
            return
@@ -0,0 +1,161 @@
+import AppKit
+import CuaDriverCore
+import Foundation
+import MCP
+
+/// A target window paired with a scale factor, as tracked by the Claude Code
+/// compatibility `screenshot` tool.
+private struct CompatWindowContext: Sendable {
+    /// The window entry matched for the requested pid / window id.
+    let window: WindowInfo
+    /// Scale factor associated with the window: `defaultScaleFactor()` when
+    /// built by `compatWindowContext`, or the capture's own `scaleFactor`
+    /// when stored after a screenshot.
+    let scaleFactor: Double
+}
+
+/// Actor that remembers the window context most recently recorded by the
+/// compatibility `screenshot` tool. A single shared instance is exposed via
+/// `shared`.
+private actor ClaudeCodeComputerUseCompatSession {
+    /// Shared session instance.
+    static let shared = ClaudeCodeComputerUseCompatSession()
+
+    /// Last recorded window context; `nil` until one is set.
+    private var activeWindow: CompatWindowContext? = nil
+
+    /// Replaces the stored context; passing `nil` clears it.
+    func setActiveWindow(_ context: CompatWindowContext?) {
+        self.activeWindow = context
+    }
+
+    /// Returns the stored context, or `nil` when none has been recorded.
+    func currentActiveWindow() -> CompatWindowContext? {
+        return self.activeWindow
+    }
+}
+
+/// Tool handlers that override same-named default tools in the Claude Code
+/// computer-use compatibility registry. Only `screenshot` is overridden.
+public enum ClaudeCodeComputerUseCompatTools {
+    /// Capturer used by the `screenshot` handler below.
+    private static let capture = WindowCapture()
+
+    /// Window-only `screenshot` tool. Requires integer `pid` and `window_id`,
+    /// captures that window as JPEG (quality 85), records the window in the
+    /// compat session, and returns the image followed by a text summary.
+    /// Every failure path returns an `isError` result rather than throwing.
+    public static let screenshot = ToolHandler(
+        tool: Tool(
+            name: "screenshot",
+            description: """
+                Capture a target window and return a JPEG image. Coordinates accepted
+                by CuaDriver's pixel tools are pixels in this window screenshot's
+                coordinate space.
+
+                This is the compatibility anchor for Claude Code vision flows:
+                CuaDriver remains window-scoped, and all other tools are the
+                normal CuaDriver tools.
+                """,
+            inputSchema: [
+                "type": "object",
+                "required": ["pid", "window_id"],
+                "properties": [
+                    "pid": [
+                        "type": "integer",
+                        "description": "Target process ID from `list_windows` or `launch_app`.",
+                    ],
+                    "window_id": [
+                        "type": "integer",
+                        "description": "Target CGWindowID from `list_windows` or `launch_app`.",
+                    ],
+                ],
+                "additionalProperties": false,
+            ],
+            annotations: .init(
+                readOnlyHint: true,
+                destructiveHint: false,
+                idempotentHint: false,
+                openWorldHint: false
+            )
+        ),
+        invoke: { arguments in
+            do {
+                // Validate `pid`: must be present as an integer and fit in Int32.
+                guard let rawPid = arguments?["pid"]?.intValue else {
+                    return errorResult("Missing required integer field `pid`.")
+                }
+                guard let pid = Int32(exactly: rawPid) else {
+                    return errorResult("pid \(rawPid) is outside the supported Int32 range.")
+                }
+                // Validate `window_id`: must be present as an integer and fit in UInt32.
+                guard let rawWindowID = arguments?["window_id"]?.intValue else {
+                    return errorResult("Missing required integer field `window_id`.")
+                }
+                guard let windowID = UInt32(exactly: rawWindowID) else {
+                    return errorResult(
+                        "window_id \(rawWindowID) is outside the supported UInt32 range.")
+                }
+                // Confirm the pid/window pair names a visible, on-screen,
+                // layer-0 window before attempting the capture.
+                guard let context = compatWindowContext(
+                    forPid: pid,
+                    windowID: windowID
+                ) else {
+                    return errorResult(
+                        "No visible layer-0 window \(rawWindowID) found for pid \(rawPid). Use `list_windows` to choose an on-screen target window."
+                    )
+                }
+                let shot = try await capture.captureWindow(
+                    windowID: windowID,
+                    format: .jpeg,
+                    quality: 85
+                )
+                // Record the captured window, using the scale factor reported
+                // by the capture itself rather than the default from `context`.
+                await ClaudeCodeComputerUseCompatSession.shared.setActiveWindow(
+                    CompatWindowContext(
+                        window: context.window,
+                        scaleFactor: shot.scaleFactor
+                    )
+                )
+                let base64 = shot.imageData.base64EncodedString()
+                // Image first, then a text summary naming the window and its
+                // pixel dimensions.
+                return CallTool.Result(content: [
+                    .image(data: base64, mimeType: "image/jpeg", annotations: nil, _meta: nil),
+                    .text(
+                        text: "Captured window screenshot \(shot.width)x\(shot.height) for \(context.window.owner) [pid: \(context.window.pid), window_id: \(context.window.id)]. Use CuaDriver pixel tools with this window-local coordinate space.",
+                        annotations: nil,
+                        _meta: nil
+                    ),
+                ])
+            } catch CaptureError.permissionDenied {
+                // Dedicated message for the missing Screen Recording grant.
+                return errorResult(
+                    "Screen Recording permission is not granted for CuaDriver.")
+            } catch {
+                // Any other capture failure is reported with the error's description.
+                return errorResult("Screenshot failed: \(error)")
+            }
+        }
+    )
+
+    /// All compatibility shim handlers; `ToolRegistry.claudeCodeComputerUseCompat`
+    /// drops default tools with these names and appends these in their place.
+    public static let all: [ToolHandler] = [
+        screenshot,
+    ]
+}
+
+extension ToolRegistry {
+    /// Registry for the Claude Code compatibility mode: every handler from
+    /// `ToolRegistry.default` whose tool name is not overridden, followed by
+    /// the handlers in `ClaudeCodeComputerUseCompatTools.all`.
+    public static let claudeCodeComputerUseCompat: ToolRegistry = {
+        let shims = ClaudeCodeComputerUseCompatTools.all
+        let overriddenNames = Set(shims.map { $0.tool.name })
+        // Keep default handlers that are not replaced by a shim.
+        var combined: [ToolHandler] = []
+        for handler in ToolRegistry.default.handlers.values
+        where !overriddenNames.contains(handler.tool.name) {
+            combined.append(handler)
+        }
+        combined.append(contentsOf: shims)
+        return ToolRegistry(handlers: combined)
+    }()
+}
+
+/// Looks up the window with id `windowID` owned by `pid` among
+/// `WindowEnumerator.visibleWindows()`.
+///
+/// A window qualifies only if it is on layer 0, is on screen, and has bounds
+/// wider and taller than 1. Returns `nil` when no window qualifies; otherwise
+/// returns the window paired with `defaultScaleFactor()`.
+private func compatWindowContext(
+    forPid pid: Int32,
+    windowID: UInt32
+) -> CompatWindowContext? {
+    let match = WindowEnumerator.visibleWindows().first { candidate in
+        // Identity: same owning process and same window id.
+        guard candidate.pid == pid, UInt32(exactly: candidate.id) == windowID else {
+            return false
+        }
+        // Visibility: normal window layer and currently on screen.
+        guard candidate.layer == 0, candidate.isOnScreen else {
+            return false
+        }
+        // Size: both dimensions must exceed 1.
+        return candidate.bounds.width > 1 && candidate.bounds.height > 1
+    }
+    guard let window = match else { return nil }
+    return CompatWindowContext(window: window, scaleFactor: defaultScaleFactor())
+}
+
+/// Scale factor reported by `ScreenInfo.mainScreenSize()`, or `1.0` when
+/// that lookup yields no value.
+private func defaultScaleFactor() -> Double {
+    let mainScreenScale = ScreenInfo.mainScreenSize()?.scaleFactor
+    return mainScreenScale ?? 1.0
+}
+
+/// Builds a tool result flagged as an error whose only content item is
+/// `text`.
+private func errorResult(_ text: String) -> CallTool.Result {
+    return .init(
+        content: [.text(text: text, annotations: nil, _meta: nil)],
+        isError: true
+    )
+}
@@ -7,11 +7,12 @@ public enum CuaDriverMCPServer {
    /// The caller is responsible for calling ``Server/start(transport:initializeHook:)``
    /// and ``Server/waitUntilCompleted()``.
    public static func make(
+        serverName: String = "cua-driver",
        version: String = CuaDriverCore.version,
        registry: ToolRegistry = .default
    ) async -> Server {
        let server = Server(
-            name: "cua-driver",
+            name: serverName,
            version: version,
            capabilities: Server.Capabilities(tools: .init(listChanged: false))
        )
@@ -11,16 +11,16 @@ public enum ScreenshotTool {
            name: "screenshot",
            description: """
                Capture a screenshot using ScreenCaptureKit. Returns base64-encoded
-                image data in the requested format (default png).
+                image data for a single window in the requested format (default png).

-                Without `window_id`, captures the full main display. With `window_id`,
-                captures just that window (get the id from `list_windows`).
+                `window_id` is required. Get window ids from `list_windows`.

                Requires the Screen Recording TCC grant — call `check_permissions`
                first if unsure.
                """,
            inputSchema: [
                "type": "object",
+                "required": ["window_id"],
                "properties": [
                    "format": [
                        "type": "string",
@@ -36,7 +36,7 @@ public enum ScreenshotTool {
                    "window_id": [
                        "type": "integer",
                        "description":
-                            "Optional CGWindowID / kCGWindowNumber to capture just that window.",
+                            "Required CGWindowID / kCGWindowNumber to capture.",
                    ],
                ],
                "additionalProperties": false,
@@ -52,36 +52,42 @@ public enum ScreenshotTool {
            let format =
                ImageFormat(rawValue: arguments?["format"]?.stringValue ?? "png") ?? .png
            let quality = arguments?["quality"]?.intValue ?? 95
-            let windowID = arguments?["window_id"]?.intValue
+            guard let rawWindowID = arguments?["window_id"]?.intValue else {
+                return CallTool.Result(
+                    content: [
+                        .text(
+                            text: "Missing required `window_id`. Use `list_windows` first, then call `screenshot` for one window.",
+                            annotations: nil,
+                            _meta: nil
+                        )
+                    ],
+                    isError: true
+                )
+            }
+            guard let windowID = UInt32(exactly: rawWindowID) else {
+                return CallTool.Result(
+                    content: [
+                        .text(
+                            text: "Invalid `window_id` \(rawWindowID). Use `list_windows` first, then pass a valid UInt32 window id.",
+                            annotations: nil,
+                            _meta: nil
+                        )
+                    ],
+                    isError: true
+                )
+            }

            do {
-                let shot: Screenshot
-                if let windowID {
-                    shot = try await capture.captureWindow(
-                        windowID: UInt32(windowID),
-                        format: format,
-                        quality: quality
-                    )
-                } else {
-                    shot = try await capture.captureMainDisplay(
-                        format: format,
-                        quality: quality
-                    )
-                }
+                let shot = try await capture.captureWindow(
+                    windowID: windowID,
+                    format: format,
+                    quality: quality
+                )
                let base64 = shot.imageData.base64EncodedString()
                let mime = format == .png ? "image/png" : "image/jpeg"
-                let visibleWindows = WindowEnumerator.visibleWindows().filter { $0.layer == 0 }
                var summaryLines: [String] = [
-                    "✅ Screenshot — \(shot.width)x\(shot.height) \(format.rawValue)"
+                    "✅ Window screenshot — \(shot.width)x\(shot.height) \(format.rawValue) [window_id: \(rawWindowID)]"
                ]
-                if !visibleWindows.isEmpty {
-                    summaryLines.append("\nOn-screen windows:")
-                    for w in visibleWindows {
-                        let title = w.name.isEmpty ? "(no title)" : "\"\(w.name)\""
-                        summaryLines.append("- \(w.owner) (pid \(w.pid)) \(title) [window_id: \(w.id)]")
-                    }
-                    summaryLines.append("→ Call get_window_state(pid, window_id) to inspect a window's UI.")
-                }
                let summary = summaryLines.joined(separator: "\n")
                return CallTool.Result(
                    content: [
@@ -232,6 +232,10 @@ Next steps:
  2. Verify the CLI:  $BIN_LINK --version
  3. Wire into an MCP client:
     $BIN_LINK mcp-config | pbcopy
+     Claude Code compatibility:
+       $BIN_LINK mcp-config --client claude --claude-code-computer-use-compat
+     Use MCP for Claude Code vision/computer-use-style flows; CLI screenshots
+     do not expose the mcp__cua-computer-use__screenshot tool name cue.

 Uninstall:  $CUA_DRIVER_DIR/scripts/uninstall.sh

@@ -300,6 +300,14 @@ Next steps:
        • Claude Code:
            claude mcp add --transport stdio cua-driver -- $BIN_LINK mcp

+          Claude Code computer-use compatibility mode:
+            claude mcp add --transport stdio cua-computer-use -- $BIN_LINK mcp --claude-code-computer-use-compat
+          Use this when you want Claude Code's vision/computer-use-style flow
+          to ground on CuaDriver window screenshots. It keeps the normal
+          CuaDriver tools and changes only the screenshot tool.
+          Use MCP for this path; CLI screenshots do not expose the
+          mcp__cua-computer-use__screenshot tool name cue.
+
        • Codex (OpenAI):
            codex mcp add cua-driver -- $BIN_LINK mcp

@@ -5,6 +5,7 @@
 #   - /Applications/CuaDriver.app bundle
 #   - ~/.cua-driver/ (telemetry id + install marker)
 #   - ~/Library/Application Support/Cua Driver/ (config.json)
+#   - ~/Library/Caches/cua-driver/ (daemon/cache state)
 #
 # Does NOT revoke TCC grants (Accessibility + Screen Recording).
 #
@@ -17,6 +18,7 @@ SYSTEM_BIN_LINK="/usr/local/bin/cua-driver"
 APP_BUNDLE="/Applications/CuaDriver.app"
 USER_DATA="$HOME/.cua-driver"
 CONFIG_DIR="$HOME/Library/Application Support/Cua Driver"
+CACHE_DIR="$HOME/Library/Caches/cua-driver"
 # Legacy — remove if present from older installs.
 LEGACY_UPDATE_SCRIPT="/usr/local/bin/cua-driver-update"
 LEGACY_UPDATER_PLIST="$HOME/Library/LaunchAgents/com.trycua.cua_driver_updater.plist"
@@ -74,6 +76,14 @@ else
    log "no config at $CONFIG_DIR (skipping)"
 fi

+# Cache / daemon state. Delete the whole directory when it exists; otherwise
+# only log that there was nothing to remove.
+if [[ -d "$CACHE_DIR" ]]; then
+    rm -rf "$CACHE_DIR"
+    log "removed $CACHE_DIR"
+else
+    log "no cache at $CACHE_DIR (skipping)"
+fi
+
 # Agent skill symlinks (Claude Code + Codex). Only remove when the link
 # is ours — a dev user pointing the symlink at a working copy of the repo
 # keeps theirs untouched.
@@ -91,6 +101,124 @@ for SKILL_LINK in \
    fi
 done

+# Claude Code MCP registrations. `claude mcp remove` only removes from
+# the current project / user scopes, while ~/.claude.json can also contain
+# stale project entries for other directories. Scrub only registrations
+# that are explicitly named cua-driver or cua-computer-use, or whose
+# command/args mention a cua-driver binary or CuaDriver.app, so unrelated
+# servers named "computer-use" are left alone.
+CLAUDE_JSON="$HOME/.claude.json"
+if [[ -f "$CLAUDE_JSON" ]] && command -v python3 >/dev/null 2>&1; then
+    PY_OUTPUT="$(
+        CLAUDE_JSON="$CLAUDE_JSON" python3 <<'PY'
+import json
+import os
+import shutil
+import sys
+import tempfile
+import time
+
+# Path of the Claude config file, handed over by the calling shell through the
+# CLAUDE_JSON environment variable.
+path = os.environ["CLAUDE_JSON"]
+
+# Parse the config. Any read or parse failure is reported on stderr and the
+# script exits with status 0 without modifying anything (presumably so an
+# unreadable config does not fail the surrounding uninstall — confirm).
+try:
+    with open(path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+except Exception as exc:
+    print(f"could not read Claude config {path}: {exc}", file=sys.stderr)
+    raise SystemExit(0)
+
+# "<scope>:<name>" labels of every registration deleted so far.
+removed = []
+
+def text_parts(value):
+    """Return the string pieces of a config value.
+
+    A str yields a one-element list, a list yields only its str items (in
+    order), and any other type yields an empty list.
+    """
+    if isinstance(value, list):
+        return [piece for piece in value if isinstance(piece, str)]
+    return [value] if isinstance(value, str) else []
+
+def invokes_cua_driver(server):
+    """Tell whether a server entry's command line mentions cua-driver.
+
+    Non-dict entries never match. Otherwise the string parts of "command" and
+    "args" are joined with spaces and searched for "cua-driver" or
+    "CuaDriver.app".
+    """
+    if not isinstance(server, dict):
+        return False
+    command_line = " ".join(
+        text_parts(server.get("command")) + text_parts(server.get("args"))
+    )
+    return any(marker in command_line for marker in ("cua-driver", "CuaDriver.app"))
+
+def should_remove(name, server):
+    """Decide whether the registration `name` -> `server` belongs to cua-driver."""
+    # A well-known registration name is enough on its own.
+    if name in {"cua-driver", "cua-computer-use"}:
+        return True
+    # Otherwise fall back to inspecting the configured command line.
+    return invokes_cua_driver(server)
+
+def scrub_servers(servers, scope):
+    """Delete cua-driver registrations from `servers` in place.
+
+    `servers` is ignored unless it is a dict. Each deleted entry is recorded in
+    the module-level `removed` list as "<scope>:<name>".
+    """
+    if not isinstance(servers, dict):
+        return
+    # Pick the victims first so the dict is not mutated while it is iterated.
+    doomed = [name for name, server in servers.items() if should_remove(name, server)]
+    for name in doomed:
+        del servers[name]
+        removed.append(f"{scope}:{name}")
+
+# Top-level "mcpServers" entries are recorded under the "user" scope label.
+scrub_servers(data.get("mcpServers"), "user")
+
+# Every dict under "projects" may carry its own "mcpServers" mapping; entries
+# removed there are recorded under the "project" scope label.
+projects = data.get("projects")
+if isinstance(projects, dict):
+    for project in projects.values():
+        if isinstance(project, dict):
+            scrub_servers(project.get("mcpServers"), "project")
+
+# Nothing was deleted: exit successfully without printing or rewriting the file.
+if not removed:
+    raise SystemExit(0)
+
+# Keep a timestamped copy of the untouched config next to the original before
+# anything is rewritten.
+backup = f"{path}.bak-cua-driver-uninstall-{int(time.time())}"
+shutil.copy2(path, backup)
+
+# Write the scrubbed config to a temp file in the same directory as the config,
+# then swap it into place with os.replace so the config is never left
+# half-written.
+# NOTE(review): tempfile.mkstemp creates the file readable/writable by the owner
+# only, so the rewritten config may end up with a different mode than the
+# original — confirm that is acceptable.
+directory = os.path.dirname(path) or "."
+fd, tmp_path = tempfile.mkstemp(
+    prefix=".claude.json.",
+    suffix=".tmp",
+    dir=directory,
+    text=True,
+)
+try:
+    with os.fdopen(fd, "w", encoding="utf-8") as f:
+        json.dump(data, f, indent=2, ensure_ascii=False)
+        f.write("\n")
+    os.replace(tmp_path, path)
+except Exception:
+    # Best-effort removal of the temp file; the original error is re-raised.
+    try:
+        os.unlink(tmp_path)
+    except OSError:
+        pass
+    raise
+
+# Summary lines on stdout: what was removed and where the backup lives.
+print(f"removed Claude MCP registration(s): {', '.join(removed)}")
+print(f"backed up Claude config to {backup}")
+PY
+    )"
+    if [[ -n "$PY_OUTPUT" ]]; then
+        while IFS= read -r line; do
+            log "$line"
+        done <<< "$PY_OUTPUT"
+    else
+        log "no Claude MCP registrations for cua-driver found in $CLAUDE_JSON"
+    fi
+else
+    log "no Claude config cleanup via python3 (missing $CLAUDE_JSON or python3)"
+fi
+
+# Best-effort CLI cleanup for the active Claude project. This covers
+# .mcp.json / current-working-directory scopes when present and is harmless
+# when the entries were already removed above.
+if ! command -v claude >/dev/null 2>&1; then
+    log "claude CLI not found (skipping Claude MCP CLI cleanup)"
+else
+    # Try both server names in every scope; a failed removal is silently
+    # ignored and only successful removals are logged.
+    for SERVER in cua-driver cua-computer-use; do
+        for SCOPE in local project user; do
+            if claude mcp remove "$SERVER" -s "$SCOPE" >/dev/null 2>&1; then
+                log "removed Claude MCP server $SERVER from $SCOPE scope"
+            fi
+        done
+    done
+fi
+
 cat << 'FINALUNMSG'

 cua-driver uninstalled.
@@ -299,7 +299,7 @@ export function generateCLIReferenceMDX(docs: CLIDocumentation, releasedVersion:
  lines.push(`{/*
  AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY
  Generated by: npx tsx scripts/docs-generators/cua-driver.ts
-  Source: libs/cua-driver/Sources/**/*.swift
+  Source: recursive Swift sources under libs/cua-driver/Sources
  Version: ${releasedVersion}
 */}`);
  lines.push('');
@@ -320,7 +320,7 @@ export function generateCLIReferenceMDX(docs: CLIDocumentation, releasedVersion:
  lines.push('');

  // Introduction
-  lines.push(`${docs.abstract}`);
+  lines.push(escapeMdxText(docs.abstract));
  lines.push('');

  // Group commands by category
@@ -394,11 +394,11 @@ export function generateCommandDoc(cmd: CommandDoc): string[] {

  lines.push(`### cua-driver ${cmd.name}`);
  lines.push('');
-  lines.push(cmd.abstract);
+  lines.push(escapeMdxText(cmd.abstract));
  lines.push('');

  if (cmd.discussion) {
-    lines.push(cmd.discussion);
+    lines.push(escapeMdxText(cmd.discussion));
    lines.push('');
  }

@@ -451,11 +451,11 @@ export function generateCommandDoc(cmd: CommandDoc): string[] {
    for (const sub of cmd.subcommands) {
      lines.push(`#### cua-driver ${cmd.name} ${sub.name}`);
      lines.push('');
-      lines.push(sub.abstract);
+      lines.push(escapeMdxText(sub.abstract));
      lines.push('');

      if (sub.discussion) {
-        lines.push(sub.discussion);
+        lines.push(escapeMdxText(sub.discussion));
        lines.push('');
      }

@@ -505,11 +505,11 @@ export function generateCommandDoc(cmd: CommandDoc): string[] {
        for (const nested of sub.subcommands) {
          lines.push(`##### cua-driver ${cmd.name} ${sub.name} ${nested.name}`);
          lines.push('');
-          lines.push(nested.abstract);
+          lines.push(escapeMdxText(nested.abstract));
          lines.push('');

          if (nested.discussion) {
-            lines.push(nested.discussion);
+            lines.push(escapeMdxText(nested.discussion));
            lines.push('');
          }

@@ -577,7 +577,7 @@ export function generateMCPToolsMDX(docs: MCPDocumentation, releasedVersion: str
  lines.push(`{/*
  AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY
  Generated by: npx tsx scripts/docs-generators/cua-driver.ts
-  Source: libs/cua-driver/Sources/**/*.swift
+  Source: recursive Swift sources under libs/cua-driver/Sources
  Version: ${releasedVersion}
 */}`);
  lines.push('');
@@ -613,7 +613,7 @@ export function generateMCPToolDoc(tool: MCPToolDoc): string[] {

  lines.push(`### ${tool.name}`);
  lines.push('');
-  lines.push(tool.description);
+  lines.push(escapeMdxText(tool.description));
  lines.push('');

  const properties = tool.input_schema.properties ?? {};
@@ -631,7 +631,7 @@ export function generateMCPToolDoc(tool: MCPToolDoc): string[] {
      const isRequired = required.has(propName);
      const requiredLabel = isRequired ? 'required' : 'optional';
      const typeLabel = formatPropertyType(prop);
-      lines.push(`- \`${propName}\` (${typeLabel}, ${requiredLabel}): ${prop.description ?? ''}`);
+      lines.push(`- \`${propName}\` (${typeLabel}, ${requiredLabel}): ${escapeMdxText(prop.description ?? '')}`);
    }
    lines.push('');
  }
@@ -659,7 +659,23 @@ export function generateMCPToolDoc(tool: MCPToolDoc): string[] {
 }

 function escapeTableCell(value: string): string {
-  return value.replace(/\|/g, '\\|').replace(/\n/g, ' ');
+  return escapeMdxText(value.replace(/\n/g, ' ')).replace(/\|/g, '\\|');
+}
+
+/**
+ * Escapes the characters `{`, `}`, `<` and `>` as HTML entities, leaving
+ * backtick-delimited code spans exactly as written.
+ *
+ * @param value Raw text to escape.
+ * @returns The text with those four characters replaced outside code spans.
+ */
+function escapeMdxText(value: string): string {
+  const entityFor = (ch: string): string => {
+    switch (ch) {
+      case '{':
+        return '&#123;';
+      case '}':
+        return '&#125;';
+      case '<':
+        return '&lt;';
+      default:
+        return '&gt;';
+    }
+  };
+  // One pass: a complete `code span` is matched first and returned verbatim;
+  // otherwise a single special character is swapped for its entity.
+  return value.replace(
+    /(`[^`]*`)|[{}<>]/g,
+    (matched: string, codeSpan: string | undefined) => codeSpan ?? entityFor(matched)
+  );
 }

 function formatPropertyType(prop: MCPPropertyDoc): string {