feat(cua-driver): add Claude Code computer-use compatibility mode (#1424)

* feat(cua-driver): add Claude Code computer-use compatibility mode

* docs(cua-driver): avoid hard-wrapped compatibility prose

* docs(cua-driver): clarify Claude Code vision mode needs MCP

* fix(cua-driver): address CodeRabbit review findings

* docs(cua-driver): sync generated reference docs

* fix(docs): escape generated cua-driver mdx
This commit is contained in:
Francesco Bonacci
2026-05-01 21:08:01 -07:00
committed by GitHub
parent e4b047a43e
commit e69d1cbc06
20 changed files with 596 additions and 87 deletions
@@ -113,6 +113,9 @@ Cua Driver speaks MCP over stdio. Use `cua-driver mcp-config --client <name>` to
# Claude Code (add --scope project|global as needed)
claude mcp add --transport stdio cua-driver -- ~/.local/bin/cua-driver mcp
# Claude Code computer-use compatibility mode
claude mcp add --transport stdio cua-computer-use -- ~/.local/bin/cua-driver mcp --claude-code-computer-use-compat
# Codex (OpenAI)
codex mcp add cua-driver -- ~/.local/bin/cua-driver mcp
@@ -120,6 +123,10 @@ codex mcp add cua-driver -- ~/.local/bin/cua-driver mcp
cua-driver mcp-config --client openclaw | sh
```
The Claude Code compatibility mode keeps CuaDriver's normal MCP tools, but replaces `screenshot` with a window-only screenshot shim that requires `pid` and `window_id`. Use it when you want Claude Code's vision/computer-use-style flow to ground on CuaDriver window captures.
Use MCP for that Claude Code vision/computer-use-style path. Shelling out to `cua-driver screenshot` can capture a window, but it does not expose the `mcp__cua-computer-use__screenshot` tool name that Claude Code appears to use as the image-grounding cue.
### Clients configured via a config file
Cursor, OpenCode, and Hermes all configure MCP servers via files. Use `mcp-config` to print the exact snippet, paste it into the right path:
@@ -14,6 +14,8 @@ import { Callout } from 'fumadocs-ui/components/callout';
## Claude Code
Standard MCP registration:
```bash
claude mcp add --transport stdio cua-driver -- cua-driver mcp
```
@@ -25,6 +27,22 @@ claude mcp list
# cua-driver: cua-driver mcp (stdio) - ✓ Connected
```
### Claude Code computer-use compatibility mode
Claude Code vision/computer-use-style flows appear to use the presence of a screenshot tool as a cue for image-grounded operation. If you want that behavior, register the compatibility server instead:
```bash
claude mcp add --transport stdio cua-computer-use -- cua-driver mcp --claude-code-computer-use-compat
```
This mode still exposes the normal CuaDriver tools. The only changed tool is `screenshot`: it requires `pid` and `window_id`, captures that window only, and returns a window-local image coordinate frame. Start with `launch_app` or `list_windows`, then call `screenshot` with the target window.
For this Claude Code vision/computer-use-style path, use MCP rather than shelling out to the CLI. CLI screenshots can still capture windows, but they do not expose the `mcp__cua-computer-use__screenshot` tool name that Claude Code appears to use as the image-grounding cue.
<Callout type="info">
This does not call Anthropic APIs or expose Anthropic's native computer-use API tool. It is a CuaDriver MCP compatibility mode for Claude Code.
</Callout>
## GitHub Copilot CLI
Add to `~/.copilot/mcp-config.json`:
@@ -6,8 +6,8 @@ description: Command Line Interface reference for Cua Driver
{/*
AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY
Generated by: npx tsx scripts/docs-generators/cua-driver.ts
Source: libs/cua-driver/Sources/**/*.swift
Version: 0.1.0
Source: recursive Swift sources under libs/cua-driver/Sources
Version: 0.1.1
*/}
import { Callout } from 'fumadocs-ui/components/callout';
@@ -16,7 +16,7 @@ import { VersionHeader } from '@/components/version-selector';
<VersionHeader
versions={[{"version":"0.1","href":"/cua-driver/reference/cli-reference","isCurrent":true}]}
currentVersion="0.1"
fullVersion="0.1.0"
fullVersion="0.1.1"
packageName="cua-driver"
installCommand="curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/cua-driver/scripts/install.sh | bash"
/>
@@ -36,8 +36,8 @@ provided, the tool is called with no arguments.
Examples:
cua-driver call list_apps
cua-driver call launch_app '{"bundle_id":"com.apple.finder"}'
echo '{"pid":844,"window_id":1234}' | cua-driver call get_window_state
cua-driver call launch_app '&#123;"bundle_id":"com.apple.finder"&#125;'
echo '&#123;"pid":844,"window_id":1234&#125;' | cua-driver call get_window_state
**Arguments:**
@@ -106,6 +106,12 @@ Print a tool's full description and JSON input schema.
Run the stdio MCP server.
**Flags:**
| Name | Description |
| ---- | ----------- |
| `--claude-code-computer-use-compat` | Expose normal CuaDriver tools, replacing only `screenshot` with a Claude Code-friendly window-only screenshot that establishes the vision coordinate frame. |
### cua-driver serve
Run cua-driver as a long-running daemon on a Unix domain socket.
@@ -158,6 +164,12 @@ Print MCP server config or a client-specific install command.
| ---- | ---- | ------- | ----------- |
| `--client` | String | — | Client to print the install command for: claude \| codex \| cursor \| openclaw \| opencode \| hermes \| pi. Omit for the generic JSON snippet. |
**Flags:**
| Name | Description |
| ---- | ----------- |
| `--claude-code-computer-use-compat` | Print config for Claude Code's window-scoped screenshot compatibility mode registered as `cua-computer-use`. |
## Trajectory recording
### cua-driver recording
@@ -6,8 +6,8 @@ description: Reference for every MCP tool cua-driver exposes
{/*
AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY
Generated by: npx tsx scripts/docs-generators/cua-driver.ts
Source: libs/cua-driver/Sources/**/*.swift
Version: 0.1.0
Source: recursive Swift sources under libs/cua-driver/Sources
Version: 0.1.1
*/}
import { Callout } from 'fumadocs-ui/components/callout';
@@ -25,7 +25,7 @@ Tool names are `snake_case`. Responses are MCP `CallTool.Result` envelopes: a te
Report TCC permission status for Accessibility and Screen Recording.
By default also raises the system permission dialogs for any missing
grants — Apple's request APIs are no-ops when the grant is already
active, so this is safe to call repeatedly. Pass {"prompt": false}
active, so this is safe to call repeatedly. Pass &#123;"prompt": false&#125;
for a purely read-only status check.
**Arguments:**
@@ -208,18 +208,18 @@ at startup. Sibling to `set_config` / `cua-driver config`.
Current schema:
{
&#123;
"schema_version": 1,
"capture_mode": "vision" | "ax" | "som",
"agent_cursor": {
"agent_cursor": &#123;
"enabled": true,
"motion": {
"motion": &#123;
"start_handle": 0.3, "end_handle": 0.3,
"arc_size": 0.25, "arc_flow": 0.0,
"spring": 0.72
}
}
}
&#125;
&#125;
&#125;
**Arguments:** none.
@@ -436,10 +436,10 @@ later to resolve a target.
- `additional_arguments` (array of string, optional): Extra command-line arguments passed to the launched process. Passed directly as argv entries — no shell expansion. Example: ["--user-data-dir=/tmp/cua-session", "--no-first-run"] for an isolated Chrome session.
- `bundle_id` (string, optional): App bundle identifier, e.g. com.apple.calculator.
- `creates_new_application_instance` (boolean, optional): Force a brand-new process even if the app is already running. Useful for isolated browser sessions: pass creates_new_application_instance=true together with additional_arguments=["--user-data-dir=/tmp/session-a", "--no-first-run", "--no-default-browser-check"] to launch a sandboxed Chrome that cannot see the user's real profile, cookies, or extensions. Each session gets its own pid and window identity and can be controlled independently.
- `electron_debugging_port` (integer, optional): Launch an Electron app with --remote-debugging-port=<N> so the `page` tool gets full renderer/DOM access. Use 9222 unless running multiple Electron apps. Ignored for non-Electron apps.
- `electron_debugging_port` (integer, optional): Launch an Electron app with --remote-debugging-port=&lt;N&gt; so the `page` tool gets full renderer/DOM access. Use 9222 unless running multiple Electron apps. Ignored for non-Electron apps.
- `name` (string, optional): App display name. Used only when bundle_id is absent.
- `urls` (array of string, optional): Optional file:// or http(s):// URLs (or plain paths with ~ expansion) to hand to the launched app via application(_:open:). For Finder, pass a folder URL or path to open a backgrounded Finder window rooted at that folder — no activation. Apps that don't implement application(_:open:) launch normally and ignore these.
- `webkit_inspector_port` (integer, optional): Launch a Tauri/WKWebView app with WEBKIT_INSPECTOR_SERVER=127.0.0.1:<N> so the `page` tool can reach its WebKit inspector. Use 9226 (reserved WebKit range: 9226–9228, distinct from Electron's 9222–9225). Requires developerExtrasEnabled=true in the WKWebView config (default in Tauri debug builds).
- `webkit_inspector_port` (integer, optional): Launch a Tauri/WKWebView app with WEBKIT_INSPECTOR_SERVER=127.0.0.1:&lt;N&gt; so the `page` tool can reach its WebKit inspector. Use 9226 (reserved WebKit range: 9226–9228, distinct from Electron's 9222–9225). Requires developerExtrasEnabled=true in the WKWebView config (default in Tauri debug builds).
### list_apps
@@ -457,7 +457,7 @@ apps come from scanning /Applications, /Applications/Utilities,
Use this for "is X installed?" as well as "is X running?". For
per-window state — on-screen, on-current-Space, minimized,
window titles — call list_windows instead. For just opening an
app — running or not — call launch_app({bundle_id: ...}) directly;
app — running or not — call launch_app(&#123;bundle_id: ...&#125;) directly;
list_apps is not a prerequisite.
**Arguments:** none.
@@ -696,10 +696,9 @@ propagate modifier keys).
### screenshot
Capture a screenshot using ScreenCaptureKit. Returns base64-encoded
image data in the requested format (default png).
image data for a single window in the requested format (default png).
Without `window_id`, captures the full main display. With `window_id`,
captures just that window (get the id from `list_windows`).
`window_id` is required. Get window ids from `list_windows`.
Requires the Screen Recording TCC grant — call `check_permissions`
first if unsure.
@@ -708,7 +707,11 @@ first if unsure.
- `format` (string, optional): Image format. Default: png.
- `quality` (integer, optional): JPEG quality 1-95; ignored for png.
- `window_id` (integer, optional): Optional CGWindowID / kCGWindowNumber to capture just that window.
- `window_id` (integer, required): Required CGWindowID / kCGWindowNumber to capture.
```json
{"window_id":10725}
```
### scroll
@@ -843,13 +846,13 @@ optional; omitted fields keep their current value.
string) to revert to the procedural arrow.
Example — brand-colored arrow:
{"gradient_colors": ["#A855F7", "#6366F1"], "bloom_color": "#A855F7"}
&#123;"gradient_colors": ["#A855F7", "#6366F1"], "bloom_color": "#A855F7"&#125;
Example — custom PNG cursor:
{"image_path": "~/cursors/my-cursor.png"}
&#123;"image_path": "~/cursors/my-cursor.png"&#125;
Example — revert to default:
{"gradient_colors": [], "bloom_color": "", "image_path": ""}
&#123;"gradient_colors": [], "bloom_color": "", "image_path": ""&#125;
**Arguments:**
@@ -931,7 +934,7 @@ Set a value on a UI element. Two modes depending on element role:
- **AXPopUpButton / select dropdown**: finds the child option whose
title or value matches `value` (case-insensitive) and AXPresses it
directly — the native macOS popup menu is never opened, so focus
is never stolen. Use this for HTML <select> elements in Safari or
is never stolen. Use this for HTML &lt;select&gt; elements in Safari or
any native NSPopUpButton. Pass the option's display label as `value`
(e.g. "Blue", not "blue").
@@ -143,4 +143,4 @@ async with Sandbox.ephemeral(Image.linux(), local=True) as sb: ...
async with Sandbox.ephemeral(Image.macos(), local=True) as sb: ...
```
See [Self-Hosted Sandboxes](/cua/guide/get-started/self-hosted-sandboxes) for local setup instructions.
See [Set Up a Sandbox](/cua/guide/get-started/set-up-sandbox) for local setup instructions.
+18
View File
@@ -3,3 +3,21 @@
Background computer-use driver for any agent. Speaks MCP over stdio; drives native macOS apps without stealing focus.
**[Documentation](https://cua.ai/docs/cua-driver)** - Installation, guides, and API reference.
## Claude Code computer-use compatibility
Standard Claude Code MCP registration:
```bash
claude mcp add --transport stdio cua-driver -- cua-driver mcp
```
If you want Claude Code's vision/computer-use-style flow to ground on CuaDriver window screenshots, register the compatibility mode:
```bash
claude mcp add --transport stdio cua-computer-use -- cua-driver mcp --claude-code-computer-use-compat
```
This keeps CuaDriver's normal MCP tools and changes only `screenshot`, which requires `pid` and `window_id` and captures that window only.
Use MCP for this Claude Code vision/computer-use-style path. CLI screenshots still work as CuaDriver calls, but they do not expose the `mcp__cua-computer-use__screenshot` tool name that Claude Code appears to use as the image-grounding cue.
@@ -85,6 +85,18 @@ also invoke it explicitly:
/cua-driver
```
## Claude Code MCP compatibility mode
For normal skill-driven use, prefer the CLI or the standard MCP server. If you want Claude Code's vision/computer-use-style flow to ground on CuaDriver screenshots, register the compatibility server:
```bash
claude mcp add --transport stdio cua-computer-use -- cua-driver mcp --claude-code-computer-use-compat
```
This mode exposes the normal CuaDriver tools and changes only `screenshot`. The compatibility screenshot requires `pid` and `window_id`, captures that window only, and establishes a window-local pixel coordinate frame. It does not call Anthropic APIs or expose Anthropic's native computer-use API tool.
Use MCP for this Claude Code vision/computer-use-style path. CLI screenshots still work as CuaDriver calls, but they do not expose the `mcp__cua-computer-use__screenshot` tool name that Claude Code appears to use as the image-grounding cue.
## Files
- `SKILL.md` — the main skill body (~500 lines). Loaded on first
@@ -140,6 +140,18 @@ Every reference to `click(...)`, `get_window_state(...)` etc. in this
skill means `cua-driver click '{...}'` — translate to MCP form only
when MCP is requested.
### Claude Code computer-use compatibility mode
For normal Claude Code use, keep the default CLI or `cua-driver` MCP server path above. If the user explicitly wants Claude Code's vision/computer-use-style flow, they can register:
```bash
claude mcp add --transport stdio cua-computer-use -- cua-driver mcp --claude-code-computer-use-compat
```
Observation: Claude Code vision flows appear to treat a screenshot MCP tool as the image-grounding anchor. This compatibility mode keeps the normal CuaDriver tools and changes only `screenshot`. The compatibility `screenshot` requires `pid` and `window_id`, captures only that target window, and returns the window-local pixel coordinate frame. Start with `launch_app` or `list_windows`, then call `screenshot({pid, window_id})`; do not assume desktop coordinates or a full-screen capture.
Use MCP for this Claude Code vision/computer-use-style path. Do not shell out to `cua-driver screenshot` as a substitute: CLI screenshots still work as CuaDriver calls, but they do not expose the `mcp__cua-computer-use__screenshot` tool name that Claude Code appears to use as the image-grounding cue.
Intent → tool mapping. If you find yourself reaching for the right
column, something has gone wrong — re-read "The no-foreground
contract" above:
@@ -1,3 +1,4 @@
import AppKit
import ArgumentParser
import CuaDriverServer
import Foundation
@@ -171,6 +172,7 @@ struct CallCommand: AsyncParsableCommand {
let result: CallTool.Result
do {
await bootstrapAppKitForInProcessCallIfNeeded(toolName: toolName)
// Route through `registry.call(...)` so the recording hook
// (and any future cross-cutting wrapper) fires consistently
// with the MCP and daemon paths. The in-process one-shot
@@ -264,6 +266,35 @@ struct CallCommand: AsyncParsableCommand {
}
}
/// Switches the process to the `.accessory` activation policy before an
/// in-process tool call, but only for the tools listed below.
///
/// - Parameter toolName: Name of the tool about to be invoked. Any name not
///   in the list returns immediately without touching `NSApplication`.
private func bootstrapAppKitForInProcessCallIfNeeded(toolName: String) async {
    switch toolName {
    case "check_permissions",
         "click",
         "double_click",
         "drag",
         "get_accessibility_tree",
         "get_cursor_position",
         "get_window_state",
         "hotkey",
         "launch_app",
         "list_apps",
         "list_windows",
         "move_cursor",
         "press_key",
         "right_click",
         "screenshot",
         "scroll",
         "set_value",
         "type_text",
         "type_text_chars",
         "zoom":
        break
    default:
        return
    }

    // The policy change is hopped onto the main actor; its Bool result is
    // intentionally discarded.
    await MainActor.run {
        _ = NSApplication.shared.setActivationPolicy(.accessory)
    }
}
struct ListToolsCommand: AsyncParsableCommand {
static let configuration = CommandConfiguration(
commandName: "list-tools",
@@ -18,6 +18,8 @@ import MCP
///
/// Keys are dotted snake_case paths:
/// - `schema_version`
/// - `capture_mode`
/// - `max_image_dimension`
/// - `agent_cursor.enabled`
/// - `agent_cursor.motion.{start_handle,end_handle,arc_size,arc_flow,spring}`
struct ConfigCommand: AsyncParsableCommand {
@@ -31,6 +33,8 @@ struct ConfigCommand: AsyncParsableCommand {
Examples:
cua-driver config # print full config
cua-driver config get capture_mode
cua-driver config set capture_mode vision
cua-driver config get agent_cursor.enabled
cua-driver config set agent_cursor.enabled false
cua-driver config set agent_cursor.motion.arc_size 0.4
@@ -147,6 +151,10 @@ struct ConfigGetCommand: AsyncParsableCommand {
switch key {
case "schema_version":
print(config.schemaVersion)
case "capture_mode":
print(config.captureMode.rawValue)
case "max_image_dimension":
print(config.maxImageDimension)
case "agent_cursor.enabled":
print(config.agentCursor.enabled)
case "agent_cursor.motion.start_handle":
@@ -43,33 +43,62 @@ struct MCPConfigCommand: ParsableCommand {
help: "Client to print the install command for: claude | codex | cursor | openclaw | opencode | hermes | pi. Omit for the generic JSON snippet.")
var client: String?
@Flag(
name: .long,
help: "Print config for Claude Code's window-scoped screenshot compatibility mode registered as `cua-computer-use`."
)
var claudeCodeComputerUseCompat: Bool = false
func run() throws {
let binary = resolvedBinaryPath()
let shellBinary = shellEscape(binary)
// Observed Claude Code behavior: the exact config key "computer-use"
// is reserved, so external stdio registrations use a distinct key.
let serverName = claudeCodeComputerUseCompat ? "cua-computer-use" : "cua-driver"
let args = claudeCodeComputerUseCompat
? "[\"mcp\", \"--claude-code-computer-use-compat\"]"
: "[\"mcp\"]"
let commandArgs = claudeCodeComputerUseCompat
? "mcp --claude-code-computer-use-compat"
: "mcp"
switch client?.lowercased() {
case nil, "":
print(genericMcpServersSnippet(binary: binary, includeType: false))
print(genericMcpServersSnippet(
serverName: serverName,
binary: binary,
args: args,
includeType: false
))
case "claude":
print("claude mcp add --transport stdio cua-driver -- \(binary) mcp")
print("claude mcp add --transport stdio \(serverName) -- \(shellBinary) \(commandArgs)")
case "codex":
print("codex mcp add cua-driver -- \(binary) mcp")
print("codex mcp add \(serverName) -- \(shellBinary) \(commandArgs)")
case "cursor":
// Cursor has no CLI — emit JSON the user pastes into
// ~/.cursor/mcp.json (global) or .cursor/mcp.json (project).
print(genericMcpServersSnippet(binary: binary, includeType: true))
print(genericMcpServersSnippet(
serverName: serverName,
binary: binary,
args: args,
includeType: true
))
case "openclaw":
// OpenClaw has a CLI registry — set with a JSON arg.
print("openclaw mcp set cua-driver '{\"command\":\"\(binary)\",\"args\":[\"mcp\"]}'")
print("openclaw mcp set \(serverName) '{\"command\":\"\(binary)\",\"args\":\(args)}'")
case "opencode":
// OpenCode (sst/opencode) uses opencode.json with type:"local"
// and command as a single merged array.
let commandArray = claudeCodeComputerUseCompat
? "[\"\(binary)\", \"mcp\", \"--claude-code-computer-use-compat\"]"
: "[\"\(binary)\", \"mcp\"]"
let snippet = """
// paste under "mcp" in opencode.json (or opencode.jsonc):
{
"$schema": "https://opencode.ai/config.json",
"mcp": {
"cua-driver": {
"\(serverName)": {
"type": "local",
"command": ["\(binary)", "mcp"],
"command": \(commandArray),
"enabled": true
}
}
@@ -83,9 +112,9 @@ struct MCPConfigCommand: ParsableCommand {
# paste under mcp_servers in ~/.hermes/config.yaml,
# then run /reload-mcp inside Hermes:
mcp_servers:
cua-driver:
\(serverName):
command: "\(binary)"
args: ["mcp"]
args: \(args)
"""
print(snippet)
case "pi":
@@ -117,14 +146,19 @@ struct MCPConfigCommand: ParsableCommand {
}
}
private func genericMcpServersSnippet(binary: String, includeType: Bool) -> String {
private func genericMcpServersSnippet(
serverName: String,
binary: String,
args: String,
includeType: Bool
) -> String {
let typeLine = includeType ? ",\n \"type\": \"stdio\"" : ""
return """
{
"mcpServers": {
"cua-driver": {
"\(serverName)": {
"command": "\(binary)",
"args": ["mcp"]\(typeLine)
"args": \(args)\(typeLine)
}
}
}
@@ -140,6 +174,10 @@ struct MCPConfigCommand: ParsableCommand {
}
return CommandLine.arguments.first ?? "cua-driver"
}
/// Wraps `value` in single quotes so it can be pasted into a POSIX shell
/// command as one word.
///
/// Every embedded single quote is rewritten as `'"'"'` (close the quote,
/// emit a double-quoted `'`, reopen the quote). An empty input yields `''`.
private func shellEscape(_ value: String) -> String {
    let pieces = value.components(separatedBy: "'")
    let body = pieces.joined(separator: "'\"'\"'")
    return "'" + body + "'"
}
}
/// Top-level entry point. Before handing to ArgumentParser, rewrite
@@ -307,6 +345,17 @@ struct MCPCommand: ParsableCommand {
abstract: "Run the stdio MCP server."
)
@Flag(
name: .long,
help: """
Expose normal CuaDriver tools, replacing only `screenshot` with a \
Claude Code-friendly window-only screenshot that establishes the \
vision coordinate frame. This does not use Anthropic's native \
computer_2025 API tool.
"""
)
var claudeCodeComputerUseCompat: Bool = false
func run() throws {
// MCP stdio runs for the lifetime of the host process, so we
// bootstrap AppKit here — the agent cursor overlay (disabled
@@ -342,7 +391,12 @@ struct MCPCommand: ParsableCommand {
AgentCursor.shared.apply(config: config.agentCursor)
}
let server = await CuaDriverMCPServer.make()
let server = await CuaDriverMCPServer.make(
serverName: claudeCodeComputerUseCompat ? "computer-use" : "cua-driver",
registry: claudeCodeComputerUseCompat
? .claudeCodeComputerUseCompat
: .default
)
let transport = StdioTransport()
try await server.start(transport: transport)
await server.waitUntilCompleted()
@@ -94,7 +94,9 @@ enum CLIDocExtractor {
discussion: nil,
arguments: [],
options: [],
flags: [],
flags: [
FlagDoc(name: "claude-code-computer-use-compat", shortName: nil, help: "Expose normal CuaDriver tools, replacing only `screenshot` with a Claude Code-friendly window-only screenshot that establishes the vision coordinate frame.", defaultValue: false),
],
subcommands: []
)
}
@@ -412,7 +414,9 @@ enum CLIDocExtractor {
options: [
OptionDoc(name: "client", shortName: nil, help: "Client to print the install command for: claude | codex | cursor | openclaw | opencode | hermes | pi. Omit for the generic JSON snippet.", type: "String", defaultValue: nil, isOptional: true),
],
flags: [],
flags: [
FlagDoc(name: "claude-code-computer-use-compat", shortName: nil, help: "Print config for Claude Code's window-scoped screenshot compatibility mode registered as `cua-computer-use`.", defaultValue: false),
],
subcommands: []
)
}
@@ -98,15 +98,21 @@ public enum MouseInput {
/// screen points (top-left origin) and deliver them to `pid`.
/// `modifiers` accepts the same names as `KeyboardInput`
/// (`cmd` / `command`, `shift`, `option` / `alt`, `ctrl` /
/// `control`, `fn`); unknown names are ignored. Events are
/// posted via auth-signed `SLEventPostToPid` AND the public HID
/// tap — see the file-level doc for the rationale.
/// `control`, `fn`); unknown names are ignored.
///
/// When `useFrontmostHIDPath` is true (the default) and `pid` is
/// frontmost, events are posted through the public HID tap, which
/// can move the global cursor and is required for some viewport
/// apps. When false, the function skips that path and uses only
/// pid-routed delivery, preserving the system cursor for callers
/// that rely on background-style dispatch.
public static func click(
at point: CGPoint,
toPid pid: pid_t,
button: Button,
count: Int = 1,
modifiers: [String] = []
modifiers: [String] = [],
useFrontmostHIDPath: Bool = true
) throws {
// When the target is frontmost, route via the public HID tap
// (`CGEventPost(tap: .cghidEventTap)`) with a preceding
@@ -123,7 +129,7 @@ public enum MouseInput {
// (Chrome/Slack/etc); they just don't work on viewports.
let targetIsFrontmost =
NSRunningApplication(processIdentifier: pid)?.isActive ?? false
if targetIsFrontmost {
if useFrontmostHIDPath && targetIsFrontmost {
try clickFrontmostViaHIDTap(
at: point, button: button, count: count, modifiers: modifiers)
return
@@ -0,0 +1,161 @@
import AppKit
import CuaDriverCore
import Foundation
import MCP
/// A target window paired with the scale factor recorded for it.
///
/// Built by `compatWindowContext(forPid:windowID:)` with the main-screen
/// default scale, and rebuilt by the compatibility `screenshot` tool with the
/// scale factor reported by the capture itself.
private struct CompatWindowContext: Sendable {
/// The window entry that matched the requested pid and window id.
let window: WindowInfo
/// Scale factor associated with `window`; which source it came from depends
/// on the creator (main-screen default or the capture result).
let scaleFactor: Double
}
/// Actor-isolated holder for the window context most recently stored by the
/// compatibility `screenshot` tool.
private actor ClaudeCodeComputerUseCompatSession {
    /// Single shared session instance.
    static let shared = ClaudeCodeComputerUseCompatSession()

    /// Last stored context; `nil` until one is set, or after it is cleared.
    private var activeWindow: CompatWindowContext?

    /// Stores `context` as the active window. Passing `nil` clears it.
    func setActiveWindow(_ context: CompatWindowContext?) {
        self.activeWindow = context
    }

    /// Returns the stored window context, or `nil` when none is set.
    func currentActiveWindow() -> CompatWindowContext? {
        return self.activeWindow
    }
}
/// Tool handlers that stand in for CuaDriver's defaults in the Claude Code
/// computer-use compatibility registry. Currently only `screenshot`.
public enum ClaudeCodeComputerUseCompatTools {
// Capture backend shared by every invocation of the shim.
private static let capture = WindowCapture()
/// Window-only `screenshot` shim.
///
/// Requires integer `pid` and `window_id` arguments, captures that window as
/// JPEG at quality 85, stores the window as the session's active window, and
/// returns the image followed by a one-line text summary. Every failure path
/// returns an `isError` result instead of throwing.
public static let screenshot = ToolHandler(
tool: Tool(
name: "screenshot",
description: """
Capture a target window and return a JPEG image. Coordinates accepted
by CuaDriver's pixel tools are pixels in this window screenshot's
coordinate space.
This is the compatibility anchor for Claude Code vision flows:
CuaDriver remains window-scoped, and all other tools are the
normal CuaDriver tools.
""",
inputSchema: [
"type": "object",
"required": ["pid", "window_id"],
"properties": [
"pid": [
"type": "integer",
"description": "Target process ID from `list_windows` or `launch_app`.",
],
"window_id": [
"type": "integer",
"description": "Target CGWindowID from `list_windows` or `launch_app`.",
],
],
"additionalProperties": false,
],
annotations: .init(
readOnlyHint: true,
destructiveHint: false,
idempotentHint: false,
openWorldHint: false
)
),
invoke: { arguments in
do {
// Validate both identifiers before capturing: each must be present as an
// integer and fit the fixed-width type used downstream.
guard let rawPid = arguments?["pid"]?.intValue else {
return errorResult("Missing required integer field `pid`.")
}
guard let pid = Int32(exactly: rawPid) else {
return errorResult("pid \(rawPid) is outside the supported Int32 range.")
}
guard let rawWindowID = arguments?["window_id"]?.intValue else {
return errorResult("Missing required integer field `window_id`.")
}
guard let windowID = UInt32(exactly: rawWindowID) else {
return errorResult(
"window_id \(rawWindowID) is outside the supported UInt32 range.")
}
// Confirm the window belongs to `pid` and passes the on-screen, layer-0
// filter applied by `compatWindowContext` before attempting a capture.
guard let context = compatWindowContext(
forPid: pid,
windowID: windowID
) else {
return errorResult(
"No visible layer-0 window \(rawWindowID) found for pid \(rawPid). Use `list_windows` to choose an on-screen target window."
)
}
let shot = try await capture.captureWindow(
windowID: windowID,
format: .jpeg,
quality: 85
)
// Record the window with the capture's own scale factor, replacing the
// main-screen default that `compatWindowContext` filled in.
await ClaudeCodeComputerUseCompatSession.shared.setActiveWindow(
CompatWindowContext(
window: context.window,
scaleFactor: shot.scaleFactor
)
)
let base64 = shot.imageData.base64EncodedString()
// Image content first, then a text summary naming the owner, pid, and
// window id.
return CallTool.Result(content: [
.image(data: base64, mimeType: "image/jpeg", annotations: nil, _meta: nil),
.text(
text: "Captured window screenshot \(shot.width)x\(shot.height) for \(context.window.owner) [pid: \(context.window.pid), window_id: \(context.window.id)]. Use CuaDriver pixel tools with this window-local coordinate space.",
annotations: nil,
_meta: nil
),
])
} catch CaptureError.permissionDenied {
// Dedicated message for the missing Screen Recording grant.
return errorResult(
"Screen Recording permission is not granted for CuaDriver.")
} catch {
// Any other capture failure is surfaced with the underlying error text.
return errorResult("Screenshot failed: \(error)")
}
}
)
/// Every shim handler defined here; `ToolRegistry.claudeCodeComputerUseCompat`
/// uses this list to decide which default tools to replace.
public static let all: [ToolHandler] = [
screenshot,
]
}
extension ToolRegistry {
    /// Registry for the Claude Code compatibility mode: every handler from
    /// `ToolRegistry.default` whose tool name is not shadowed by a shim,
    /// followed by the shims from `ClaudeCodeComputerUseCompatTools.all`.
    public static let claudeCodeComputerUseCompat: ToolRegistry = {
        let shims = ClaudeCodeComputerUseCompatTools.all
        let shadowedNames = Set(shims.map { $0.tool.name })

        // Keep the default handlers that no shim replaces, in the order the
        // default registry yields them.
        var combined: [ToolHandler] = []
        for handler in ToolRegistry.default.handlers.values
        where !shadowedNames.contains(handler.tool.name) {
            combined.append(handler)
        }
        combined.append(contentsOf: shims)

        return ToolRegistry(handlers: combined)
    }()
}
/// Finds the first window reported by `WindowEnumerator.visibleWindows()`
/// that belongs to `pid`, has id `windowID`, sits on layer 0, is on screen,
/// and is larger than 1×1.
///
/// - Returns: A context for the matching window carrying the main-screen
///   default scale factor, or `nil` when no window matches.
private func compatWindowContext(
    forPid pid: Int32,
    windowID: UInt32
) -> CompatWindowContext? {
    for candidate in WindowEnumerator.visibleWindows() {
        guard candidate.pid == pid,
              UInt32(exactly: candidate.id) == windowID,
              candidate.layer == 0,
              candidate.isOnScreen,
              candidate.bounds.width > 1,
              candidate.bounds.height > 1
        else {
            continue
        }
        return CompatWindowContext(
            window: candidate,
            scaleFactor: defaultScaleFactor()
        )
    }
    return nil
}
/// Scale factor reported by `ScreenInfo.mainScreenSize()`, falling back to
/// `1.0` when that lookup yields nothing.
private func defaultScaleFactor() -> Double {
    let reported = ScreenInfo.mainScreenSize()?.scaleFactor
    return reported ?? 1.0
}
/// Builds a tool result flagged as an error whose only content item is `text`.
private func errorResult(_ text: String) -> CallTool.Result {
    return .init(
        content: [
            .text(text: text, annotations: nil, _meta: nil)
        ],
        isError: true
    )
}
@@ -7,11 +7,12 @@ public enum CuaDriverMCPServer {
/// The caller is responsible for calling ``Server/start(transport:initializeHook:)``
/// and ``Server/waitUntilCompleted()``.
public static func make(
serverName: String = "cua-driver",
version: String = CuaDriverCore.version,
registry: ToolRegistry = .default
) async -> Server {
let server = Server(
name: "cua-driver",
name: serverName,
version: version,
capabilities: Server.Capabilities(tools: .init(listChanged: false))
)
@@ -11,16 +11,16 @@ public enum ScreenshotTool {
name: "screenshot",
description: """
Capture a screenshot using ScreenCaptureKit. Returns base64-encoded
image data in the requested format (default png).
image data for a single window in the requested format (default png).
Without `window_id`, captures the full main display. With `window_id`,
captures just that window (get the id from `list_windows`).
`window_id` is required. Get window ids from `list_windows`.
Requires the Screen Recording TCC grant — call `check_permissions`
first if unsure.
""",
inputSchema: [
"type": "object",
"required": ["window_id"],
"properties": [
"format": [
"type": "string",
@@ -36,7 +36,7 @@ public enum ScreenshotTool {
"window_id": [
"type": "integer",
"description":
"Optional CGWindowID / kCGWindowNumber to capture just that window.",
"Required CGWindowID / kCGWindowNumber to capture.",
],
],
"additionalProperties": false,
@@ -52,36 +52,42 @@ public enum ScreenshotTool {
let format =
ImageFormat(rawValue: arguments?["format"]?.stringValue ?? "png") ?? .png
let quality = arguments?["quality"]?.intValue ?? 95
let windowID = arguments?["window_id"]?.intValue
guard let rawWindowID = arguments?["window_id"]?.intValue else {
return CallTool.Result(
content: [
.text(
text: "Missing required `window_id`. Use `list_windows` first, then call `screenshot` for one window.",
annotations: nil,
_meta: nil
)
],
isError: true
)
}
guard let windowID = UInt32(exactly: rawWindowID) else {
return CallTool.Result(
content: [
.text(
text: "Invalid `window_id` \(rawWindowID). Use `list_windows` first, then pass a valid UInt32 window id.",
annotations: nil,
_meta: nil
)
],
isError: true
)
}
do {
let shot: Screenshot
if let windowID {
shot = try await capture.captureWindow(
windowID: UInt32(windowID),
format: format,
quality: quality
)
} else {
shot = try await capture.captureMainDisplay(
format: format,
quality: quality
)
}
let shot = try await capture.captureWindow(
windowID: windowID,
format: format,
quality: quality
)
let base64 = shot.imageData.base64EncodedString()
let mime = format == .png ? "image/png" : "image/jpeg"
let visibleWindows = WindowEnumerator.visibleWindows().filter { $0.layer == 0 }
var summaryLines: [String] = [
"Screenshot — \(shot.width)x\(shot.height) \(format.rawValue)"
"Window screenshot — \(shot.width)x\(shot.height) \(format.rawValue) [window_id: \(rawWindowID)]"
]
if !visibleWindows.isEmpty {
summaryLines.append("\nOn-screen windows:")
for w in visibleWindows {
let title = w.name.isEmpty ? "(no title)" : "\"\(w.name)\""
summaryLines.append("- \(w.owner) (pid \(w.pid)) \(title) [window_id: \(w.id)]")
}
summaryLines.append("→ Call get_window_state(pid, window_id) to inspect a window's UI.")
}
let summary = summaryLines.joined(separator: "\n")
return CallTool.Result(
content: [
+4
View File
@@ -232,6 +232,10 @@ Next steps:
2. Verify the CLI: $BIN_LINK --version
3. Wire into an MCP client:
$BIN_LINK mcp-config | pbcopy
Claude Code compatibility:
$BIN_LINK mcp-config --client claude --claude-code-computer-use-compat
Use MCP for Claude Code vision/computer-use-style flows; CLI screenshots
do not expose the mcp__cua-computer-use__screenshot tool name cue.
Uninstall: $CUA_DRIVER_DIR/scripts/uninstall.sh
+8
View File
@@ -300,6 +300,14 @@ Next steps:
• Claude Code:
claude mcp add --transport stdio cua-driver -- $BIN_LINK mcp
Claude Code computer-use compatibility mode:
claude mcp add --transport stdio cua-computer-use -- $BIN_LINK mcp --claude-code-computer-use-compat
Use this when you want Claude Code's vision/computer-use-style flow
to ground on CuaDriver window screenshots. It keeps the normal
CuaDriver tools and changes only the screenshot tool.
Use MCP for this path; CLI screenshots do not expose the
mcp__cua-computer-use__screenshot tool name cue.
• Codex (OpenAI):
codex mcp add cua-driver -- $BIN_LINK mcp
+128
View File
@@ -5,6 +5,7 @@
# - /Applications/CuaDriver.app bundle
# - ~/.cua-driver/ (telemetry id + install marker)
# - ~/Library/Application Support/Cua Driver/ (config.json)
# - ~/Library/Caches/cua-driver/ (daemon/cache state)
#
# Does NOT revoke TCC grants (Accessibility + Screen Recording).
#
@@ -17,6 +18,7 @@ SYSTEM_BIN_LINK="/usr/local/bin/cua-driver"
APP_BUNDLE="/Applications/CuaDriver.app"
USER_DATA="$HOME/.cua-driver"
CONFIG_DIR="$HOME/Library/Application Support/Cua Driver"
CACHE_DIR="$HOME/Library/Caches/cua-driver"
# Legacy — remove if present from older installs.
LEGACY_UPDATE_SCRIPT="/usr/local/bin/cua-driver-update"
LEGACY_UPDATER_PLIST="$HOME/Library/LaunchAgents/com.trycua.cua_driver_updater.plist"
@@ -74,6 +76,14 @@ else
log "no config at $CONFIG_DIR (skipping)"
fi
# Cache / daemon state: delete the whole directory when it exists,
# otherwise just note that there was nothing to clean up.
if [[ ! -d "$CACHE_DIR" ]]; then
  log "no cache at $CACHE_DIR (skipping)"
else
  rm -rf "$CACHE_DIR"
  log "removed $CACHE_DIR"
fi
# Agent skill symlinks (Claude Code + Codex). Only remove when the link
# is ours — a dev user pointing the symlink at a working copy of the repo
# keeps theirs untouched.
@@ -91,6 +101,124 @@ for SKILL_LINK in \
fi
done
# Claude Code MCP registrations. `claude mcp remove` only removes from
# the current project / user scopes, while ~/.claude.json can also contain
# stale project entries for other directories. Scrub only registrations
# that are named exactly cua-driver or cua-computer-use, or whose command
# or args mention cua-driver / CuaDriver.app, so unrelated servers named
# "computer-use" are left alone.
CLAUDE_JSON="$HOME/.claude.json"
if [[ -f "$CLAUDE_JSON" ]] && command -v python3 >/dev/null 2>&1; then
PY_OUTPUT="$(
CLAUDE_JSON="$CLAUDE_JSON" python3 <<'PY'
import json
import os
import shutil
import sys
import tempfile
import time
# Location of the Claude config to scrub; the calling shell script passes it
# in through the CLAUDE_JSON environment variable.
path = os.environ["CLAUDE_JSON"]

# Labels ("<scope>:<name>") of every registration deleted from the config.
removed = []

try:
    with open(path, "r", encoding="utf-8") as handle:
        data = json.load(handle)
except Exception as err:
    # An unreadable or malformed config is reported on stderr but exits 0,
    # so the failure is not signalled through the exit status.
    print(f"could not read Claude config {path}: {err}", file=sys.stderr)
    sys.exit(0)
def text_parts(value):
    """Normalize a config field into a list of strings.

    A list yields its string items (anything else in it is dropped), a bare
    string yields a one-element list, and every other type yields ``[]``.
    """
    if isinstance(value, list):
        return [entry for entry in value if isinstance(entry, str)]
    return [value] if isinstance(value, str) else []
def invokes_cua_driver(server):
    """Return True when a server entry's command line mentions cua-driver.

    ``server`` is one value from an ``mcpServers`` table. Non-dict entries
    never match. The ``command`` and ``args`` fields are flattened into one
    space-joined string, which is searched for the substrings ``cua-driver``
    and ``CuaDriver.app``.
    """
    if not isinstance(server, dict):
        return False
    command_line = " ".join(
        text_parts(server.get("command")) + text_parts(server.get("args"))
    )
    return any(marker in command_line for marker in ("cua-driver", "CuaDriver.app"))
def should_remove(name, server):
    """Decide whether the registration ``name`` -> ``server`` gets scrubbed.

    A registration matches when it uses one of the two names this project
    registers, or when its command line refers to cua-driver.
    """
    if name in {"cua-driver", "cua-computer-use"}:
        return True
    return invokes_cua_driver(server)
def scrub_servers(servers, scope):
    """Delete matching registrations from ``servers`` in place.

    ``servers`` is an ``mcpServers`` mapping; anything that is not a dict is
    ignored. Each deleted entry is recorded in the module-level ``removed``
    list as ``"<scope>:<name>"``.
    """
    if not isinstance(servers, dict):
        return
    # Collect the matches first so the dict is not mutated while iterating.
    doomed = [name for name, server in servers.items() if should_remove(name, server)]
    for name in doomed:
        del servers[name]
        removed.append(f"{scope}:{name}")
# Scrub the top-level server table first, then the table of each project.
scrub_servers(data.get("mcpServers"), "user")
project_table = data.get("projects")
if isinstance(project_table, dict):
    for entry in project_table.values():
        if isinstance(entry, dict):
            scrub_servers(entry.get("mcpServers"), "project")

# Nothing matched: leave the file untouched and print nothing.
if len(removed) == 0:
    sys.exit(0)

# Keep a timestamped copy of the original config before rewriting it.
backup_path = f"{path}.bak-cua-driver-uninstall-{int(time.time())}"
shutil.copy2(path, backup_path)

# Write the new JSON to a temp file in the same directory, then move it over
# the original with os.replace so the config is swapped in one step.
staging_dir = os.path.dirname(path) or "."
staging_fd, staging_path = tempfile.mkstemp(
    dir=staging_dir,
    prefix=".claude.json.",
    suffix=".tmp",
    text=True,
)
try:
    with os.fdopen(staging_fd, "w", encoding="utf-8") as out:
        json.dump(data, out, indent=2, ensure_ascii=False)
        out.write("\n")
    os.replace(staging_path, path)
except Exception:
    # Best-effort removal of the temp file; the original error is re-raised.
    try:
        os.unlink(staging_path)
    except OSError:
        pass
    raise

print(f"removed Claude MCP registration(s): {', '.join(removed)}")
print(f"backed up Claude config to {backup_path}")
PY
)"
if [[ -n "$PY_OUTPUT" ]]; then
while IFS= read -r line; do
log "$line"
done <<< "$PY_OUTPUT"
else
log "no Claude MCP registrations for cua-driver found in $CLAUDE_JSON"
fi
else
log "no Claude config cleanup via python3 (missing $CLAUDE_JSON or python3)"
fi
# Second, best-effort pass through the `claude` CLI for the active project.
# It covers .mcp.json / current-working-directory scopes when present and is
# harmless when the entries were already removed above.
if ! command -v claude >/dev/null 2>&1; then
  log "claude CLI not found (skipping Claude MCP CLI cleanup)"
else
  for SERVER in cua-driver cua-computer-use; do
    for SCOPE in local project user; do
      # Output and failures are discarded; only a successful removal is logged.
      if claude mcp remove "$SERVER" -s "$SCOPE" >/dev/null 2>&1; then
        log "removed Claude MCP server $SERVER from $SCOPE scope"
      fi
    done
  done
fi
cat << 'FINALUNMSG'
cua-driver uninstalled.
+28 -12
View File
@@ -299,7 +299,7 @@ export function generateCLIReferenceMDX(docs: CLIDocumentation, releasedVersion:
lines.push(`{/*
AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY
Generated by: npx tsx scripts/docs-generators/cua-driver.ts
Source: libs/cua-driver/Sources/**/*.swift
Source: recursive Swift sources under libs/cua-driver/Sources
Version: ${releasedVersion}
*/}`);
lines.push('');
@@ -320,7 +320,7 @@ export function generateCLIReferenceMDX(docs: CLIDocumentation, releasedVersion:
lines.push('');
// Introduction
lines.push(`${docs.abstract}`);
lines.push(escapeMdxText(docs.abstract));
lines.push('');
// Group commands by category
@@ -394,11 +394,11 @@ export function generateCommandDoc(cmd: CommandDoc): string[] {
lines.push(`### cua-driver ${cmd.name}`);
lines.push('');
lines.push(cmd.abstract);
lines.push(escapeMdxText(cmd.abstract));
lines.push('');
if (cmd.discussion) {
lines.push(cmd.discussion);
lines.push(escapeMdxText(cmd.discussion));
lines.push('');
}
@@ -451,11 +451,11 @@ export function generateCommandDoc(cmd: CommandDoc): string[] {
for (const sub of cmd.subcommands) {
lines.push(`#### cua-driver ${cmd.name} ${sub.name}`);
lines.push('');
lines.push(sub.abstract);
lines.push(escapeMdxText(sub.abstract));
lines.push('');
if (sub.discussion) {
lines.push(sub.discussion);
lines.push(escapeMdxText(sub.discussion));
lines.push('');
}
@@ -505,11 +505,11 @@ export function generateCommandDoc(cmd: CommandDoc): string[] {
for (const nested of sub.subcommands) {
lines.push(`##### cua-driver ${cmd.name} ${sub.name} ${nested.name}`);
lines.push('');
lines.push(nested.abstract);
lines.push(escapeMdxText(nested.abstract));
lines.push('');
if (nested.discussion) {
lines.push(nested.discussion);
lines.push(escapeMdxText(nested.discussion));
lines.push('');
}
@@ -577,7 +577,7 @@ export function generateMCPToolsMDX(docs: MCPDocumentation, releasedVersion: str
lines.push(`{/*
AUTO-GENERATED FILE - DO NOT EDIT DIRECTLY
Generated by: npx tsx scripts/docs-generators/cua-driver.ts
Source: libs/cua-driver/Sources/**/*.swift
Source: recursive Swift sources under libs/cua-driver/Sources
Version: ${releasedVersion}
*/}`);
lines.push('');
@@ -613,7 +613,7 @@ export function generateMCPToolDoc(tool: MCPToolDoc): string[] {
lines.push(`### ${tool.name}`);
lines.push('');
lines.push(tool.description);
lines.push(escapeMdxText(tool.description));
lines.push('');
const properties = tool.input_schema.properties ?? {};
@@ -631,7 +631,7 @@ export function generateMCPToolDoc(tool: MCPToolDoc): string[] {
const isRequired = required.has(propName);
const requiredLabel = isRequired ? 'required' : 'optional';
const typeLabel = formatPropertyType(prop);
lines.push(`- \`${propName}\` (${typeLabel}, ${requiredLabel}): ${prop.description ?? ''}`);
lines.push(`- \`${propName}\` (${typeLabel}, ${requiredLabel}): ${escapeMdxText(prop.description ?? '')}`);
}
lines.push('');
}
@@ -659,7 +659,23 @@ export function generateMCPToolDoc(tool: MCPToolDoc): string[] {
}
function escapeTableCell(value: string): string {
return value.replace(/\|/g, '\\|').replace(/\n/g, ' ');
return escapeMdxText(value.replace(/\n/g, ' ')).replace(/\|/g, '\\|');
}
/**
 * Replace `{`, `}`, `<` and `>` with HTML entities everywhere except inside
 * backtick-delimited spans, which are passed through untouched.
 *
 * NOTE(review): presumably this keeps generated prose from being parsed as
 * JSX or expressions by MDX; confirm against the docs build.
 *
 * @param value Raw text taken from the extracted documentation.
 * @returns The text with the four characters entity-encoded outside code spans.
 */
function escapeMdxText(value: string): string {
  const entityFor: Record<string, string> = {
    '{': '&#123;',
    '}': '&#125;',
    '<': '&lt;',
    '>': '&gt;',
  };

  // The capturing group makes split() keep the backtick spans as pieces.
  const pieces = value.split(/(`[^`]*`)/g);

  let escaped = '';
  for (const piece of pieces) {
    const isCodeSpan = piece.startsWith('`') && piece.endsWith('`');
    escaped += isCodeSpan ? piece : piece.replace(/[{}<>]/g, (ch) => entityFor[ch]);
  }
  return escaped;
}
function formatPropertyType(prop: MCPPropertyDoc): string {