mirror of
https://github.com/trycua/lume.git
synced 2026-04-22 11:39:17 -05:00
Add agent-computer/peakaboo style CLI group to cua-cli, add cua do SKILL.md, and cua-auto package (#1107)
* add agent-computer style usage to cua-cli, refactor pyautogui-like handlers from computer-server into its own SDK for reuse by our various SDKs * address CR comments, add auto-focus when zooming to windows on the host * Add cua-auto to pypi workflow * Bump cua-cli requirements * default `cua do ls` to listing all sandboxes * Fix linting error * fix linting
This commit is contained in:
@@ -0,0 +1,74 @@
|
||||
name: "CD: cua-auto (PyPI)"

on:
  push:
    tags:
      - "auto-v*"
  workflow_dispatch:
    inputs:
      version:
        description: "Version to publish (without v prefix)"
        required: true
        default: "0.1.0"
  workflow_call:
    inputs:
      version:
        description: "Version to publish"
        required: true
        type: string

# Adding permissions at workflow level
permissions:
  contents: write

jobs:
  # Resolves the version to publish and exposes it as a job output.
  prepare:
    runs-on: macos-latest
    outputs:
      version: ${{ steps.get-version.outputs.version }}
    steps:
      - uses: actions/checkout@v4

      - name: Determine version
        id: get-version
        # Caller-controlled values are passed through the environment instead
        # of being expanded into the script text, so a crafted version string
        # or tag name cannot inject shell commands.
        env:
          INPUT_VERSION: ${{ inputs.version }}
          DISPATCH_VERSION: ${{ github.event.inputs.version }}
          EVENT_NAME: ${{ github.event_name }}
          GIT_REF: ${{ github.ref }}
        run: |
          # Check inputs.version first (populated for workflow_call and workflow_dispatch)
          if [ -n "$INPUT_VERSION" ]; then
            VERSION="$INPUT_VERSION"
          elif [ "$EVENT_NAME" == "push" ]; then
            # Extract version from tag (for package-specific tags)
            if [[ "$GIT_REF" =~ ^refs/tags/auto-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then
              VERSION="${BASH_REMATCH[1]}"
            else
              echo "Invalid tag format for auto"
              exit 1
            fi
          elif [ "$EVENT_NAME" == "workflow_dispatch" ]; then
            # Use version from workflow dispatch
            VERSION="$DISPATCH_VERSION"
          else
            echo "No version provided"
            exit 1
          fi
          echo "VERSION=$VERSION"
          echo "version=$VERSION" >> "$GITHUB_OUTPUT"

  # Builds and uploads the package via the shared reusable publish workflow.
  publish:
    needs: prepare
    uses: ./.github/workflows/py-reusable-publish.yml
    with:
      package_name: "auto"
      package_dir: "libs/python/cua-auto"
      version: ${{ needs.prepare.outputs.version }}
      base_package_name: "cua-auto"
    secrets:
      PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}

  # Creates the GitHub release once publishing has succeeded.
  create-release:
    needs: [prepare, publish]
    uses: ./.github/workflows/release-github-reusable.yml
    with:
      tag_name: "auto-v${{ needs.prepare.outputs.version }}"
      release_name: "cua-auto v${{ needs.prepare.outputs.version }}"
      module_path: "libs/python/cua-auto"
      body: ""
|
||||
@@ -27,6 +27,7 @@ jobs:
|
||||
# Map paths to publishable services
|
||||
declare -A PATH_TO_SERVICE=(
|
||||
["libs/python/cua-cli/"]="pypi/cli"
|
||||
["libs/python/cua-auto/"]="pypi/auto"
|
||||
["libs/python/agent/"]="pypi/agent"
|
||||
["libs/python/computer/"]="pypi/computer"
|
||||
["libs/python/core/"]="pypi/core"
|
||||
|
||||
@@ -9,6 +9,7 @@ on:
|
||||
type: choice
|
||||
options:
|
||||
- pypi/agent
|
||||
- pypi/auto
|
||||
- pypi/bench
|
||||
- pypi/bench-ui
|
||||
- pypi/cli
|
||||
@@ -54,6 +55,10 @@ jobs:
|
||||
echo "directory=libs/python/agent" >> $GITHUB_OUTPUT
|
||||
echo "type=python" >> $GITHUB_OUTPUT
|
||||
;;
|
||||
"pypi/auto")
|
||||
echo "directory=libs/python/cua-auto" >> $GITHUB_OUTPUT
|
||||
echo "type=python" >> $GITHUB_OUTPUT
|
||||
;;
|
||||
"pypi/bench")
|
||||
echo "directory=libs/cua-bench" >> $GITHUB_OUTPUT
|
||||
echo "type=python" >> $GITHUB_OUTPUT
|
||||
@@ -212,6 +217,15 @@ jobs:
|
||||
echo "Agent version: $VERSION"
|
||||
echo "version=$VERSION" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Capture bumped auto version
|
||||
if: ${{ inputs.service == 'pypi/auto' }}
|
||||
id: auto_version
|
||||
run: |
|
||||
cd libs/python/cua-auto
|
||||
VERSION=$(python -c "import tomllib; from pathlib import Path; data = tomllib.loads(Path('pyproject.toml').read_text()); print(data['project']['version'])")
|
||||
echo "Auto version: $VERSION"
|
||||
echo "version=$VERSION" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Capture bumped bench version
|
||||
if: ${{ inputs.service == 'pypi/bench' }}
|
||||
id: bench_version
|
||||
|
||||
@@ -47,6 +47,7 @@ jobs:
|
||||
# Map paths to publishable services
|
||||
declare -A PATH_TO_SERVICE=(
|
||||
["libs/python/cua-cli/"]="pypi/cli"
|
||||
["libs/python/cua-auto/"]="pypi/auto"
|
||||
["libs/python/agent/"]="pypi/agent"
|
||||
["libs/python/computer/"]="pypi/computer"
|
||||
["libs/python/core/"]="pypi/core"
|
||||
|
||||
@@ -21,6 +21,7 @@ jobs:
|
||||
# Service → tag prefix → directory
|
||||
declare -A SERVICE_TAG_DIR
|
||||
SERVICE_TAG_DIR["pypi/cli"]="cli-v|libs/python/cua-cli/"
|
||||
SERVICE_TAG_DIR["pypi/auto"]="auto-v|libs/python/cua-auto/"
|
||||
SERVICE_TAG_DIR["pypi/agent"]="agent-v|libs/python/agent/"
|
||||
SERVICE_TAG_DIR["pypi/computer"]="computer-v|libs/python/computer/"
|
||||
SERVICE_TAG_DIR["pypi/core"]="core-v|libs/python/core/"
|
||||
|
||||
@@ -305,6 +305,7 @@ All packages are managed through a single consolidated workflow: [Bump Version &
|
||||
**Python (PyPI):**
|
||||
|
||||
- `pypi/agent` - AI agent library
|
||||
- `pypi/auto` - Cross-platform automation library (mouse, keyboard, screen, window, clipboard, shell)
|
||||
- `pypi/bench` - Benchmark toolkit for computer-use RL environments
|
||||
- `pypi/computer` - Computer-use interface library
|
||||
- `pypi/computer-server` - Server component for VM
|
||||
@@ -365,6 +366,7 @@ make show-versions
|
||||
Each package uses its own tag format defined in `.bumpversion.cfg`:
|
||||
|
||||
- **cua-agent**: `agent-v{version}` (e.g., `agent-v0.4.35`)
|
||||
- **cua-auto**: `auto-v{version}` (e.g., `auto-v0.1.0`)
|
||||
- **cua-bench**: `bench-v{version}` (e.g., `bench-v0.1.0`)
|
||||
- **cua-computer**: `computer-v{version}` (e.g., `computer-v0.4.7`)
|
||||
- **cua-computer-server**: `computer-server-v{version}` (e.g., `computer-server-v0.1.27`)
|
||||
|
||||
@@ -234,6 +234,12 @@ class AndroidAutomationHandler(BaseAutomationHandler):
|
||||
else:
|
||||
raise RuntimeError(f"Long press failed: {output}")
|
||||
|
||||
async def middle_click(
    self, x: Optional[int] = None, y: Optional[int] = None
) -> Dict[str, Any]:
    """Report that middle click is unavailable on Android.

    Args:
        x: Ignored; accepted for signature parity with other handlers.
        y: Ignored; accepted for signature parity with other handlers.

    Returns:
        A failure dictionary with ``success`` set to False and an ``error``
        message; no input event is sent.
    """
    unsupported: Dict[str, Any] = {
        "success": False,
        "error": "middle_click is not supported on Android",
    }
    return unsupported
|
||||
|
||||
async def double_click(
|
||||
self, x: Optional[int] = None, y: Optional[int] = None
|
||||
) -> Dict[str, Any]:
|
||||
|
||||
@@ -217,6 +217,13 @@ class BaseAutomationHandler(ABC):
|
||||
"""Perform a right click at the current or specified position."""
|
||||
pass
|
||||
|
||||
@abstractmethod
async def middle_click(
    self, x: Optional[int] = None, y: Optional[int] = None
) -> Dict[str, Any]:
    """Perform a middle click at the current or specified position.

    Abstract hook: this base declaration has no behaviour of its own and
    concrete handlers provide the implementation.
    """
    ...
|
||||
|
||||
@abstractmethod
|
||||
async def double_click(
|
||||
self, x: Optional[int] = None, y: Optional[int] = None
|
||||
|
||||
@@ -221,6 +221,20 @@ class LinuxAutomationHandler(BaseAutomationHandler):
|
||||
except Exception as e:
|
||||
return {"success": False, "error": str(e)}
|
||||
|
||||
async def middle_click(
    self, x: Optional[int] = None, y: Optional[int] = None
) -> Dict[str, Any]:
    """Middle-click via ``self.mouse``, optionally moving the pointer first.

    The pointer is moved only when both ``x`` and ``y`` are given; otherwise
    the click happens wherever the pointer currently is.

    Returns:
        ``{"success": True}`` on success, or ``{"success": False, "error": ...}``
        carrying the stringified exception.
    """
    try:
        # Imported lazily inside the try so an import failure is reported
        # through the error dictionary as well.
        from pynput.mouse import Button as _PynputButton

        if not (x is None or y is None):
            self.mouse.position = (x, y)
        self.mouse.click(_PynputButton.middle, 1)
    except Exception as exc:
        return {"success": False, "error": str(exc)}
    return {"success": True}
|
||||
|
||||
async def double_click(
|
||||
self, x: Optional[int] = None, y: Optional[int] = None
|
||||
) -> Dict[str, Any]:
|
||||
|
||||
@@ -1058,6 +1058,18 @@ class MacOSAutomationHandler(BaseAutomationHandler):
|
||||
except Exception as e:
|
||||
return {"success": False, "error": str(e)}
|
||||
|
||||
async def middle_click(
    self, x: Optional[int] = None, y: Optional[int] = None
) -> Dict[str, Any]:
    """Middle-click via ``self.mouse``, optionally moving the pointer first.

    The pointer is moved only when both ``x`` and ``y`` are given; otherwise
    the click happens at the current pointer position.

    Returns:
        ``{"success": True}`` on success, or ``{"success": False, "error": ...}``
        carrying the stringified exception.
    """
    try:
        if not (x is None or y is None):
            self.mouse.position = (x, y)
        self.mouse.click(Button.middle, 1)
    except Exception as exc:
        return {"success": False, "error": str(exc)}
    return {"success": True}
|
||||
|
||||
async def double_click(
|
||||
self, x: Optional[int] = None, y: Optional[int] = None
|
||||
) -> Dict[str, Any]:
|
||||
|
||||
@@ -374,6 +374,27 @@ class WindowsAutomationHandler(BaseAutomationHandler):
|
||||
except Exception as e:
|
||||
return {"success": False, "error": str(e)}
|
||||
|
||||
@require_unlocked_desktop
async def middle_click(
    self, x: Optional[int] = None, y: Optional[int] = None
) -> Dict[str, Any]:
    """Middle-click via ``self.mouse``, optionally moving the pointer first.

    Args:
        x (Optional[int]): Target x-coordinate. The pointer is moved only when
            both x and y are provided.
        y (Optional[int]): Target y-coordinate. The pointer is moved only when
            both x and y are provided.

    Returns:
        Dict[str, Any]: ``{"success": True}`` on success, otherwise
        ``{"success": False, "error": <message>}``.
    """
    try:
        if not (x is None or y is None):
            self.mouse.position = (x, y)
        self.mouse.click(MouseButton.middle, 1)
    except Exception as exc:
        return {"success": False, "error": str(exc)}
    return {"success": True}
|
||||
|
||||
@require_unlocked_desktop
|
||||
async def double_click(
|
||||
self, x: Optional[int] = None, y: Optional[int] = None
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
[bumpversion]
|
||||
current_version = 0.1.0
|
||||
commit = True
|
||||
tag = True
|
||||
tag_name = auto-v{new_version}
|
||||
message = Bump cua-auto to v{new_version}
|
||||
|
||||
[bumpversion:file:pyproject.toml]
|
||||
search = version = "{current_version}"
|
||||
replace = version = "{new_version}"
|
||||
|
||||
[bumpversion:file:cua_auto/__init__.py]
|
||||
search = __version__ = "{current_version}"
|
||||
replace = __version__ = "{new_version}"
|
||||
@@ -0,0 +1,62 @@
|
||||
# cua-auto
|
||||
|
||||
`cua-auto` is a lightweight, cross-platform automation library providing a synchronous, MIT-licensed pyautogui-style API for mouse, keyboard, screen, window, clipboard, and shell operations. It runs on Windows, macOS, and Linux using [pynput](https://github.com/moses-palmer/pynput) for input control and [pywinctl](https://github.com/Kalmat/PyWinCtl) for window management.
|
||||
|
||||
```python
|
||||
import cua_auto.mouse as mouse
|
||||
import cua_auto.keyboard as keyboard
|
||||
import cua_auto.screen as screen
|
||||
import cua_auto.window as window
|
||||
import cua_auto.clipboard as clipboard
|
||||
import cua_auto.shell as shell
|
||||
|
||||
# Mouse
|
||||
mouse.click(100, 200) # left click
|
||||
mouse.right_click(100, 200)
|
||||
mouse.double_click(100, 200)
|
||||
mouse.move_to(500, 300)
|
||||
mouse.mouse_down(100, 200)
|
||||
mouse.mouse_up(100, 200)
|
||||
mouse.drag(100, 200, 400, 500) # start → end
|
||||
mouse.scroll_up(3)
|
||||
mouse.scroll_down(3)
|
||||
x, y = mouse.position()
|
||||
|
||||
# Keyboard
|
||||
keyboard.press_key("enter")
|
||||
keyboard.type_text("hello world")
|
||||
keyboard.hotkey(["ctrl", "c"])
|
||||
keyboard.key_down("shift")
|
||||
keyboard.key_up("shift")
|
||||
|
||||
# Screen
|
||||
img = screen.screenshot() # returns PIL.Image
|
||||
png = screen.screenshot_bytes() # raw PNG bytes
|
||||
b64 = screen.screenshot_b64() # base64 string
|
||||
w, h = screen.screen_size()
|
||||
x, y = screen.cursor_position()
|
||||
|
||||
# Window
|
||||
title = window.get_active_window_title()
|
||||
handle = window.get_active_window_handle()
|
||||
handles = window.get_windows_with_title("Chrome")
|
||||
name = window.get_window_name(handle)
|
||||
x, y = window.get_window_position(handle)
|
||||
w, h = window.get_window_size(handle)
|
||||
window.activate_window(handle)
|
||||
window.minimize_window(handle)
|
||||
window.maximize_window(handle)
|
||||
window.close_window(handle)
|
||||
window.set_window_size(handle, 1280, 800)
|
||||
window.set_window_position(handle, 0, 0)
|
||||
window.open("https://example.com") # or file path
|
||||
pid = window.launch("notepad.exe")
|
||||
|
||||
# Clipboard
|
||||
text = clipboard.get()
|
||||
clipboard.set("hello")
|
||||
|
||||
# Shell
|
||||
result = shell.run("echo hi") # CommandResult
|
||||
print(result.stdout, result.returncode)
|
||||
```
|
||||
@@ -0,0 +1,25 @@
|
||||
"""cua-auto — cross-platform automation library.
|
||||
|
||||
Provides a synchronous, pyautogui-style API for mouse, keyboard, screen,
|
||||
window, clipboard, and shell operations.
|
||||
|
||||
Usage::
|
||||
|
||||
import cua_auto.mouse as mouse
|
||||
import cua_auto.keyboard as keyboard
|
||||
import cua_auto.screen as screen
|
||||
import cua_auto.window as window
|
||||
import cua_auto.clipboard as clipboard
|
||||
import cua_auto.shell as shell
|
||||
|
||||
mouse.click(100, 200)
|
||||
keyboard.hotkey(["ctrl", "c"])
|
||||
img = screen.screenshot()
|
||||
title = window.get_active_window_title()
|
||||
"""
|
||||
|
||||
__version__ = "0.1.0"
|
||||
|
||||
from cua_auto import clipboard, keyboard, mouse, screen, shell, window
|
||||
|
||||
__all__ = ["mouse", "keyboard", "screen", "window", "clipboard", "shell"]
|
||||
@@ -0,0 +1,11 @@
|
||||
"""Platform detection helpers."""
|
||||
|
||||
import platform as _platform
|
||||
|
||||
_sys = _platform.system().lower()
|
||||
|
||||
IS_WINDOWS: bool = _sys == "windows"
|
||||
IS_MACOS: bool = _sys == "darwin"
|
||||
IS_LINUX: bool = _sys == "linux"
|
||||
|
||||
PLATFORM: str = "windows" if IS_WINDOWS else ("macos" if IS_MACOS else "linux")
|
||||
@@ -0,0 +1,13 @@
|
||||
"""Cross-platform clipboard access via pyperclip."""
|
||||
|
||||
import pyperclip as _pyperclip
|
||||
|
||||
|
||||
def get() -> str:
    """Read the clipboard through pyperclip and return what it yields."""
    contents = _pyperclip.paste()
    return contents
|
||||
|
||||
|
||||
def set(text: str) -> None:
    """Place *text* on the clipboard through pyperclip."""
    # Note: the public name intentionally mirrors ``get`` and shadows the
    # built-in ``set`` inside this module.
    _pyperclip.copy(text)
|
||||
@@ -0,0 +1,153 @@
|
||||
"""Cross-platform keyboard control via pynput."""
|
||||
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from pynput.keyboard import Controller as _KBController
|
||||
from pynput.keyboard import Key as _Key
|
||||
|
||||
_kb = _KBController()
|
||||
|
||||
# Unified key name → pynput Key map (sourced from computer-server/handlers/windows.py)
|
||||
_SPECIAL: dict = {
|
||||
"enter": _Key.enter,
|
||||
"return": _Key.enter,
|
||||
"esc": _Key.esc,
|
||||
"escape": _Key.esc,
|
||||
"space": _Key.space,
|
||||
"tab": _Key.tab,
|
||||
"backspace": _Key.backspace,
|
||||
"delete": _Key.delete,
|
||||
"home": _Key.home,
|
||||
"end": _Key.end,
|
||||
"pageup": _Key.page_up,
|
||||
"page_up": _Key.page_up,
|
||||
"pagedown": _Key.page_down,
|
||||
"page_down": _Key.page_down,
|
||||
"up": _Key.up,
|
||||
"down": _Key.down,
|
||||
"left": _Key.left,
|
||||
"right": _Key.right,
|
||||
"shift": _Key.shift,
|
||||
"shift_l": _Key.shift_l,
|
||||
"shift_r": _Key.shift_r,
|
||||
"ctrl": _Key.ctrl,
|
||||
"ctrl_l": _Key.ctrl_l,
|
||||
"ctrl_r": _Key.ctrl_r,
|
||||
"control": _Key.ctrl,
|
||||
"alt": _Key.alt,
|
||||
"alt_l": _Key.alt_l,
|
||||
"alt_r": _Key.alt_r,
|
||||
"cmd": _Key.cmd,
|
||||
"command": _Key.cmd,
|
||||
"win": _Key.cmd,
|
||||
"super": _Key.cmd,
|
||||
"meta": _Key.cmd,
|
||||
"option": _Key.alt,
|
||||
"option_l": _Key.alt_l,
|
||||
"option_r": _Key.alt_r,
|
||||
"capslock": _Key.caps_lock,
|
||||
"caps_lock": _Key.caps_lock,
|
||||
"insert": _Key.insert,
|
||||
"print_screen": _Key.print_screen,
|
||||
"pause": _Key.pause,
|
||||
"num_lock": _Key.num_lock,
|
||||
"scroll_lock": _Key.scroll_lock,
|
||||
"f1": _Key.f1,
|
||||
"f2": _Key.f2,
|
||||
"f3": _Key.f3,
|
||||
"f4": _Key.f4,
|
||||
"f5": _Key.f5,
|
||||
"f6": _Key.f6,
|
||||
"f7": _Key.f7,
|
||||
"f8": _Key.f8,
|
||||
"f9": _Key.f9,
|
||||
"f10": _Key.f10,
|
||||
"f11": _Key.f11,
|
||||
"f12": _Key.f12,
|
||||
"f13": _Key.f13,
|
||||
"f14": _Key.f14,
|
||||
"f15": _Key.f15,
|
||||
"f16": _Key.f16,
|
||||
"f17": _Key.f17,
|
||||
"f18": _Key.f18,
|
||||
"f19": _Key.f19,
|
||||
"f20": _Key.f20,
|
||||
}
|
||||
|
||||
|
||||
def _resolve(key: str) -> Optional[Union[str, _Key]]:
    """Translate a key name into something the pynput controller accepts.

    Named keys are looked up case-insensitively in ``_SPECIAL``. A
    single-character string is passed through unchanged (original case).
    Empty input and unknown multi-character names yield ``None``.
    """
    if not key:
        return None
    special = _SPECIAL.get(key.lower())
    if special is not None:
        return special
    return key if len(key) == 1 else None
|
||||
|
||||
|
||||
# ── Key press / release ───────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def key_down(key: str) -> None:
    """Press a key and leave it held.

    Raises:
        ValueError: If *key* cannot be resolved.
    """
    resolved = _resolve(key)
    if resolved is not None:
        _kb.press(resolved)
        return
    raise ValueError(f"Unknown key: {key!r}")
|
||||
|
||||
|
||||
def key_up(key: str) -> None:
    """Release a key.

    Raises:
        ValueError: If *key* cannot be resolved.
    """
    resolved = _resolve(key)
    if resolved is not None:
        _kb.release(resolved)
        return
    raise ValueError(f"Unknown key: {key!r}")
|
||||
|
||||
|
||||
def press_key(key: str) -> None:
    """Tap a key: press it, then release it straight away.

    Raises:
        ValueError: If *key* cannot be resolved.
    """
    resolved = _resolve(key)
    if resolved is None:
        raise ValueError(f"Unknown key: {key!r}")
    for action in (_kb.press, _kb.release):
        action(resolved)
|
||||
|
||||
|
||||
# ── Text typing ───────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def type_text(text: str) -> None:
    """Send *text* to the keyboard controller's ``type`` method.

    The string is forwarded as-is; no key-name resolution is applied.
    """
    _kb.type(text)
|
||||
|
||||
|
||||
# ── Hotkeys ───────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def hotkey(keys: List[str]) -> None:
    """Press a combination of keys, e.g. ['ctrl', 'c'].

    Every key but the last is pressed and held, the last key is tapped, and
    the held keys are then released in reverse order.

    Args:
        keys: Key names in press order. An empty list is a no-op.

    Raises:
        ValueError: If any name cannot be resolved; nothing is pressed then.
    """
    resolved = [_resolve(k) for k in keys]
    bad = [name for name, r in zip(keys, resolved) if r is None]
    if bad:
        raise ValueError(f"Unknown keys in hotkey: {bad}")
    if not resolved:
        return

    *modifiers, last = resolved
    held: List[Union[str, _Key]] = []
    try:
        # Track each modifier as it goes down so only those are released.
        for k in modifiers:
            _kb.press(k)
            held.append(k)
        # Tap the action key
        _kb.press(last)
        _kb.release(last)
    finally:
        # Always release held modifiers, even if a press/release above
        # raised, so the session is not left with stuck keys.
        for k in reversed(held):
            _kb.release(k)
|
||||
@@ -0,0 +1,138 @@
|
||||
"""Cross-platform mouse control via pynput."""
|
||||
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from pynput.mouse import Button as _Button
|
||||
from pynput.mouse import Controller as _MouseController
|
||||
|
||||
_mouse = _MouseController()
|
||||
|
||||
|
||||
def _map_button(button: str) -> _Button:
    """Convert a button name to the matching pynput ``Button``.

    Matching is case-insensitive. ``"right"`` and ``"middle"`` map to their
    buttons; every other value, including an empty or ``None`` name, maps
    to the left button.
    """
    name = (button or "left").lower()
    named = {"right": _Button.right, "middle": _Button.middle}
    return named.get(name, _Button.left)
|
||||
|
||||
|
||||
# ── Basic clicks ──────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def click(x: int, y: int, button: str = "left") -> None:
    """Move the pointer to (x, y) and click once with *button*."""
    target = (x, y)
    _mouse.position = target
    # The button is resolved after the move, as in the original ordering.
    which = _map_button(button)
    _mouse.click(which, 1)
|
||||
|
||||
|
||||
def right_click(x: int, y: int) -> None:
    """Click once with the right button at (x, y); delegates to ``click``."""
    click(x, y, button="right")
|
||||
|
||||
|
||||
def double_click(x: int, y: int) -> None:
    """Move the pointer to (x, y) and click the left button twice."""
    target = (x, y)
    _mouse.position = target
    _mouse.click(_Button.left, 2)
|
||||
|
||||
|
||||
# ── Cursor movement ───────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def move_to(x: int, y: int) -> None:
    """Set the pointer position to (x, y); no button is pressed."""
    destination = (x, y)
    _mouse.position = destination
|
||||
|
||||
|
||||
# ── Mouse button hold / release ───────────────────────────────────────────────
|
||||
|
||||
|
||||
def mouse_down(x: Optional[int] = None, y: Optional[int] = None, button: str = "left") -> None:
    """Press *button* and keep it held.

    The pointer is moved to (x, y) first only when both coordinates are
    supplied; a lone coordinate is ignored.
    """
    if not (x is None or y is None):
        _mouse.position = (x, y)
    which = _map_button(button)
    _mouse.press(which)
|
||||
|
||||
|
||||
def mouse_up(x: Optional[int] = None, y: Optional[int] = None, button: str = "left") -> None:
    """Release *button*.

    The pointer is moved to (x, y) first only when both coordinates are
    supplied; a lone coordinate is ignored.
    """
    if not (x is None or y is None):
        _mouse.position = (x, y)
    which = _map_button(button)
    _mouse.release(which)
|
||||
|
||||
|
||||
# ── Drag ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def drag(
    start_x: int,
    start_y: int,
    end_x: int,
    end_y: int,
    button: str = "left",
) -> None:
    """Drag from (start_x, start_y) to (end_x, end_y).

    The pointer jumps to the start, *button* is pressed, the pointer jumps
    to the end, and the button is released. The move is a single jump with
    no intermediate points.
    """
    btn = _map_button(button)
    _mouse.position = (start_x, start_y)
    _mouse.press(btn)
    try:
        _mouse.position = (end_x, end_y)
    finally:
        # Release even if the move raised, so the button is never left held.
        _mouse.release(btn)
|
||||
|
||||
|
||||
def drag_to(x: int, y: int, button: str = "left") -> None:
    """Drag from the current cursor position to (x, y).

    *button* is pressed where the pointer currently is, the pointer jumps to
    (x, y), and the button is released.
    """
    btn = _map_button(button)
    _mouse.press(btn)
    try:
        _mouse.position = (x, y)
    finally:
        # Release even if the move raised, so the button is never left held.
        _mouse.release(btn)
|
||||
|
||||
|
||||
def drag_path(path: List[Tuple[int, int]], button: str = "left") -> None:
    """Drag through a sequence of (x, y) points.

    The pointer jumps to the first point, *button* is pressed, the pointer
    visits each remaining point in order, and the button is released. An
    empty path is a no-op.
    """
    if not path:
        return
    btn = _map_button(button)
    _mouse.position = path[0]
    _mouse.press(btn)
    try:
        for x, y in path[1:]:
            _mouse.position = (x, y)
    finally:
        # Release even if a malformed point or a move raised mid-path.
        _mouse.release(btn)
|
||||
|
||||
|
||||
# ── Scroll ────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def scroll(dx: int, dy: int) -> None:
    """Forward a raw (dx, dy) scroll to the mouse controller.

    Positive ``dy`` scrolls up.
    """
    horizontal, vertical = dx, dy
    _mouse.scroll(horizontal, vertical)
|
||||
|
||||
|
||||
def scroll_up(clicks: int = 3) -> None:
    """Scroll up by ``abs(clicks)`` notches; the sign of *clicks* is ignored."""
    notches = abs(clicks)
    _mouse.scroll(0, notches)
|
||||
|
||||
|
||||
def scroll_down(clicks: int = 3) -> None:
    """Scroll down by ``abs(clicks)`` notches; the sign of *clicks* is ignored."""
    notches = abs(clicks)
    _mouse.scroll(0, -notches)
|
||||
|
||||
|
||||
def scroll_left(clicks: int = 3) -> None:
    """Scroll left by ``abs(clicks)`` notches; the sign of *clicks* is ignored."""
    notches = abs(clicks)
    _mouse.scroll(-notches, 0)
|
||||
|
||||
|
||||
def scroll_right(clicks: int = 3) -> None:
    """Scroll right by ``abs(clicks)`` notches; the sign of *clicks* is ignored."""
    notches = abs(clicks)
    _mouse.scroll(notches, 0)
|
||||
|
||||
|
||||
# ── Position ──────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def position() -> Tuple[int, int]:
    """Read the pointer location from the controller as integer (x, y)."""
    raw_x, raw_y = _mouse.position
    coords = (int(raw_x), int(raw_y))
    return coords
|
||||
@@ -0,0 +1,76 @@
|
||||
"""Cross-platform screen capture and info."""
|
||||
|
||||
import base64
|
||||
import io
|
||||
from typing import Tuple
|
||||
|
||||
from PIL import Image
|
||||
|
||||
|
||||
def screenshot() -> Image.Image:
    """Capture a screenshot of all monitors and return a PIL Image.

    Tries ``PIL.ImageGrab`` first, then falls back to the optional ``mss``
    package if that raises or does not return an image.

    Raises:
        RuntimeError: If the fallback fails as well. The message includes the
            ImageGrab error when there was one, so the primary cause is not
            hidden behind e.g. a missing ``mss`` install.
    """
    grab_error = None

    # Try PIL.ImageGrab (Windows + macOS)
    try:
        from PIL import ImageGrab

        img = ImageGrab.grab(all_screens=True)
        if isinstance(img, Image.Image):
            return img
    except Exception as e:
        # Remember why the primary path failed instead of discarding it.
        grab_error = e

    # Fallback: mss (cross-platform, optional dependency)
    try:
        import mss  # type: ignore[import-untyped]

        with mss.mss() as sct:
            monitor = sct.monitors[0]  # combined virtual desktop
            sct_img = sct.grab(monitor)
            # mss returns BGRA bytes; decode as BGRX to get an RGB image.
            return Image.frombytes("RGB", sct_img.size, sct_img.bgra, "raw", "BGRX")
    except Exception as e:
        detail = str(e)
        if grab_error is not None:
            detail = f"{detail} (PIL.ImageGrab also failed: {grab_error})"
        raise RuntimeError(f"screenshot failed: {detail}") from e
|
||||
|
||||
|
||||
def screenshot_bytes(format: str = "PNG") -> bytes:
    """Take a screenshot and encode it in *format*, returning the raw bytes."""
    captured = screenshot()
    with io.BytesIO() as sink:
        captured.save(sink, format=format)
        return sink.getvalue()
|
||||
|
||||
|
||||
def screenshot_b64(format: str = "PNG") -> str:
    """Take a screenshot, encode it in *format*, and return it as base64 text."""
    raw = screenshot_bytes(format)
    encoded = base64.b64encode(raw)
    return encoded.decode()
|
||||
|
||||
|
||||
def screen_size() -> Tuple[int, int]:
    """Return (width, height) of the captured desktop.

    The size is read from a fresh screenshot, so each call performs a
    full capture.
    """
    return screenshot().size
|
||||
|
||||
|
||||
def cursor_position() -> Tuple[int, int]:
    """Return the current cursor position as integer (x, y).

    ``win32gui.GetCursorPos`` is attempted first. If importing or calling it
    raises (as it will where win32gui is not installed), a fresh pynput
    mouse controller is queried instead.
    """
    try:
        import win32gui  # type: ignore[import-untyped]

        point = win32gui.GetCursorPos()
        return int(point[0]), int(point[1])
    except Exception:
        # Deliberate best-effort: fall through to the pynput path.
        pass

    from pynput.mouse import Controller as _MouseController

    cur_x, cur_y = _MouseController().position
    return int(cur_x), int(cur_y)
|
||||
@@ -0,0 +1,53 @@
|
||||
"""Shell command execution."""
|
||||
|
||||
import subprocess
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class CommandResult:
    """Outcome of a shell command: captured output plus exit status."""

    # Decoded standard output of the command.
    stdout: str
    # Decoded standard error of the command.
    stderr: str
    # Process exit status.
    returncode: int

    @property
    def success(self) -> bool:
        """True exactly when ``returncode`` equals 0."""
        exited_cleanly = self.returncode == 0
        return exited_cleanly
|
||||
|
||||
|
||||
def run(command: str, timeout: int = 30) -> CommandResult:
    """Execute *command* through the system shell and capture its output.

    ``shell=True`` is used so built-ins, pipes and redirections work.

    Args:
        command: Command line handed to the shell.
        timeout: Seconds to wait before giving up.

    Returns:
        A ``CommandResult`` with decoded stdout/stderr and the exit status.
        On timeout the result has ``returncode=-1`` and whatever output the
        ``TimeoutExpired`` exception carries (possibly empty).
    """
    candidate_encodings = ("utf-8", "gbk", "gb2312", "cp936", "latin1")

    def _decode(data: bytes) -> str:
        # Try each encoding in order and return the first clean decode.
        if not data:
            return ""
        for enc in candidate_encodings:
            try:
                return data.decode(enc)
            except (UnicodeDecodeError, LookupError):
                continue
        return data.decode("utf-8", errors="replace")

    try:
        completed = subprocess.run(
            command,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as expired:
        raw_out = expired.stdout or b""
        raw_err = expired.stderr or b""
        status = -1
    else:
        raw_out = completed.stdout
        raw_err = completed.stderr
        status = completed.returncode

    return CommandResult(
        stdout=_decode(raw_out),
        stderr=_decode(raw_err),
        returncode=status,
    )
|
||||
@@ -0,0 +1,188 @@
|
||||
"""Cross-platform window management via pywinctl."""
|
||||
|
||||
import os
|
||||
import platform
|
||||
import subprocess
|
||||
import webbrowser
|
||||
from typing import Any, List, Optional, Tuple
|
||||
|
||||
try:
|
||||
import pywinctl as _pwc # type: ignore[import-untyped]
|
||||
except Exception:
|
||||
_pwc = None # type: ignore[assignment]
|
||||
|
||||
|
||||
def _require_pwc() -> Any:
    """Return the imported pywinctl module.

    Raises:
        RuntimeError: If the module-level import of pywinctl failed.
    """
    if _pwc is not None:
        return _pwc
    raise RuntimeError("pywinctl is not available. Install it with: pip install pywinctl")
|
||||
|
||||
|
||||
def _get_by_handle(handle: Any) -> Optional[Any]:
    """Look up a pywinctl window whose handle matches *handle*.

    Handles are compared by their string form. Returns the first match, or
    ``None`` when nothing matches or enumeration raises.

    Raises:
        RuntimeError: If pywinctl is unavailable.
    """
    pwc = _require_pwc()
    try:
        return next(
            (win for win in pwc.getAllWindows() if str(win.getHandle()) == str(handle)),
            None,
        )
    except Exception:
        # Best-effort lookup: enumeration errors are treated as "not found".
        return None
|
||||
|
||||
|
||||
# ── Active window ─────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def get_active_window() -> Optional[Any]:
    """Return whatever pywinctl's ``getActiveWindow()`` reports.

    Raises:
        RuntimeError: If pywinctl is unavailable.
    """
    return _require_pwc().getActiveWindow()
|
||||
|
||||
|
||||
def get_active_window_title() -> str:
    """Return the active window's title, falling back to ``'Desktop'``.

    The fallback is used when there is no active window, when its title is
    empty, or when the lookup raises.
    """
    fallback = "Desktop"
    try:
        focused = get_active_window()
        title = focused.title if focused else None
    except Exception:
        return fallback
    return title or fallback
|
||||
|
||||
|
||||
def get_active_window_handle() -> Optional[str]:
    """Return the active window's handle as a string.

    ``None`` is returned when there is no active window or the lookup raises.
    """
    try:
        focused = get_active_window()
        return str(focused.getHandle()) if focused else None
    except Exception:
        return None
|
||||
|
||||
|
||||
# ── Window lookup ─────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def get_windows_with_title(title: str) -> List[str]:
    """List handle strings for windows whose title contains *title*.

    The match is requested from pywinctl as case-insensitive "contains".
    Any error during the search yields an empty list.

    Raises:
        RuntimeError: If pywinctl is unavailable.
    """
    pwc = _require_pwc()
    try:
        matches = pwc.getWindowsWithTitle(title, condition=pwc.Re.CONTAINS, flags=pwc.Re.IGNORECASE)
        handles: List[str] = []
        for win in matches:
            handles.append(str(win.getHandle()))
        return handles
    except Exception:
        return []
|
||||
|
||||
|
||||
def get_window_name(handle: Any) -> Optional[str]:
    """Return the title of the window identified by *handle*, or ``None`` if it is not found."""
    win = _get_by_handle(handle)
    if not win:
        return None
    return win.title
|
||||
|
||||
|
||||
def get_window_size(handle: Any) -> Optional[Tuple[int, int]]:
    """Return the window's size as integer (width, height), or ``None`` if it is not found."""
    win = _get_by_handle(handle)
    if win:
        w_px, h_px = win.size
        return int(w_px), int(h_px)
    return None
|
||||
|
||||
|
||||
def get_window_position(handle: Any) -> Optional[Tuple[int, int]]:
    """Return the window's position as integer (x, y), or ``None`` if it is not found."""
    win = _get_by_handle(handle)
    if win:
        left, top = win.position
        return int(left), int(top)
    return None
|
||||
|
||||
|
||||
# ── Window actions ────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def activate_window(handle: Any) -> bool:
    """Call ``activate()`` on the window identified by *handle*.

    Returns the truthiness of that call, or False when the window is not found.
    """
    win = _get_by_handle(handle)
    return bool(win.activate()) if win else False
|
||||
|
||||
|
||||
def minimize_window(handle: Any) -> bool:
    """Call ``minimize()`` on the window identified by *handle*.

    Returns the truthiness of that call, or False when the window is not found.
    """
    win = _get_by_handle(handle)
    return bool(win.minimize()) if win else False
|
||||
|
||||
|
||||
def maximize_window(handle: Any) -> bool:
    """Call ``maximize()`` on the window identified by *handle*.

    Returns the truthiness of that call, or False when the window is not found.
    """
    win = _get_by_handle(handle)
    return bool(win.maximize()) if win else False
|
||||
|
||||
|
||||
def close_window(handle: Any) -> bool:
    """Call ``close()`` on the window identified by *handle*.

    Returns the truthiness of that call, or False when the window is not found.
    """
    win = _get_by_handle(handle)
    return bool(win.close()) if win else False
|
||||
|
||||
|
||||
def set_window_size(handle: Any, width: int, height: int) -> bool:
    """Resize the window identified by *handle* via ``resizeTo``.

    *width* and *height* are coerced to ``int``. Returns the truthiness of
    the call, or False when the window is not found.
    """
    win = _get_by_handle(handle)
    if win:
        return bool(win.resizeTo(int(width), int(height)))
    return False
|
||||
|
||||
|
||||
def set_window_position(handle: Any, x: int, y: int) -> bool:
    """Move the window identified by *handle* via ``moveTo``.

    *x* and *y* are coerced to ``int``. Returns the truthiness of the call,
    or False when the window is not found.
    """
    win = _get_by_handle(handle)
    if win:
        return bool(win.moveTo(int(x), int(y)))
    return False
|
||||
|
||||
|
||||
# ── Open / launch ─────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def open(target: str) -> bool:  # noqa: A001
    """Open a URL or file path with the default application.

    URLs (``http://`` / ``https://``, matched case-insensitively because URI
    schemes are case-insensitive) are opened in the default browser. Anything
    else is opened with the OS default handler (``open`` on macOS,
    ``xdg-open`` on Linux, ``os.startfile`` on Windows).

    Args:
        target: URL or filesystem path to open.

    Returns:
        For URLs, the truthiness of ``webbrowser.open``'s result. For other
        targets, ``True`` once the handler has been started; the handler's
        exit status is not checked.

    Raises:
        RuntimeError: If ``platform.system()`` is not Darwin, Linux or Windows.
    """
    if target.lower().startswith(("http://", "https://")):
        return bool(webbrowser.open(target))

    # Named ``os_name`` (not ``sys``) so the stdlib module name is not shadowed.
    os_name = platform.system().lower()
    if os_name == "darwin":
        subprocess.Popen(["open", target])
    elif os_name == "linux":
        subprocess.Popen(["xdg-open", target])
    elif os_name == "windows":
        os.startfile(target)  # type: ignore[attr-defined]
    else:
        raise RuntimeError(f"Unsupported OS: {os_name}")
    return True
|
||||
|
||||
|
||||
def launch(app: str, args: Optional[List[str]] = None) -> int:
    """Start an application and return the PID of the spawned process.

    With a non-empty *args* list the program is executed directly as
    ``[app, *args]``. With ``None`` or an empty list, *app* is handed to the
    system shell as a single command string, so values such as
    ``"libreoffice --writer"`` work.
    """
    if not args:
        # shell=True lets *app* carry its own arguments in one string.
        return subprocess.Popen(app, shell=True).pid
    return subprocess.Popen([app, *args]).pid
|
||||
@@ -0,0 +1,91 @@
|
||||
[project]
|
||||
name = "cua-auto"
|
||||
version = "0.1.0"
|
||||
description = "Cross-platform automation library — mouse, keyboard, screen, window, clipboard, shell"
|
||||
readme = "README.md"
|
||||
license = "MIT"
|
||||
authors = [
|
||||
{ name = "TryCua", email = "hello@trycua.com" }
|
||||
]
|
||||
keywords = [
|
||||
"automation",
|
||||
"mouse",
|
||||
"keyboard",
|
||||
"screenshot",
|
||||
"window",
|
||||
"computer-use",
|
||||
"pyautogui",
|
||||
]
|
||||
classifiers = [
|
||||
"Development Status :: 3 - Alpha",
|
||||
"Intended Audience :: Developers",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Operating System :: OS Independent",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Programming Language :: Python :: 3.13",
|
||||
"Topic :: Software Development :: Libraries",
|
||||
]
|
||||
requires-python = ">=3.11,<3.14"
|
||||
|
||||
dependencies = [
|
||||
# Mouse + keyboard control (cross-platform)
|
||||
"pynput>=1.7.0",
|
||||
# Screenshot + image processing
|
||||
"pillow>=10.0.0",
|
||||
# Clipboard access (cross-platform)
|
||||
"pyperclip>=1.9.0",
|
||||
# Window management (cross-platform)
|
||||
"pywinctl>=0.4",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
# Faster multi-monitor screenshot backend
|
||||
mss = [
|
||||
"mss>=9.0.0",
|
||||
]
|
||||
# Windows-specific extras (cursor position via win32, etc.)
|
||||
windows = [
|
||||
"pywin32>=306; sys_platform == 'win32'",
|
||||
]
|
||||
all = [
|
||||
"cua-auto[mss,windows]",
|
||||
]
|
||||
dev = [
|
||||
"pytest>=8.0.0",
|
||||
"pytest-asyncio>=0.23.0",
|
||||
"ruff>=0.1.0",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://github.com/trycua/cua"
|
||||
Documentation = "https://docs.trycua.com"
|
||||
Repository = "https://github.com/trycua/cua"
|
||||
Issues = "https://github.com/trycua/cua/issues"
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.build]
|
||||
include = [
|
||||
"cua_auto/**",
|
||||
"README.md",
|
||||
"LICENSE",
|
||||
]
|
||||
exclude = [
|
||||
"cua_auto/**/__pycache__",
|
||||
"**/.DS_Store",
|
||||
]
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["cua_auto"]
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 100
|
||||
target-version = "py311"
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = ["E", "F", "W"]
|
||||
ignore = ["E501"]
|
||||
File diff suppressed because it is too large
Load Diff
@@ -201,7 +201,9 @@ def cmd_local_list(args: argparse.Namespace) -> int:
|
||||
|
||||
style = "green" if "ready" in status else "red"
|
||||
|
||||
console.print(f"{name:<20} {platform:<15} {size:<10} {created:<12} [{style}]{status}[/{style}]")
|
||||
console.print(
|
||||
f"{name:<20} {platform:<15} {size:<10} {created:<12} [{style}]{status}[/{style}]"
|
||||
)
|
||||
|
||||
print("\n" + "=" * 85)
|
||||
print("\nCommands:")
|
||||
|
||||
@@ -706,6 +706,280 @@ async def _register_computer_tools(
|
||||
result = await _send_command(sandbox, "open", {"path": path})
|
||||
return json.dumps(result)
|
||||
|
||||
@server.tool()
async def computer_window_focus(ctx: Context, window_id: str, sandbox: str = "") -> str:
    """Bring a window to the foreground and focus it.

    Args:
        window_id: Window identifier
        sandbox: Sandbox name (optional)
    """
    # Forward as the "activate_window" command; the reply is returned as JSON text.
    params = {"window_id": window_id}
    return json.dumps(await _send_command(sandbox, "activate_window", params))
|
||||
|
||||
@server.tool()
async def computer_window_unfocus(ctx: Context, sandbox: str = "") -> str:
    """Remove focus from the currently focused window.

    Args:
        sandbox: Sandbox name (optional)
    """
    reply = await _send_command(sandbox, "deactivate_window", {})
    if reply.get("success"):
        return json.dumps(reply)
    # Fallback: press Escape, and report that key press's reply instead.
    return json.dumps(await _send_command(sandbox, "press_key", {"key": "escape"}))
|
||||
|
||||
@server.tool()
async def computer_window_minimize(ctx: Context, window_id: str, sandbox: str = "") -> str:
    """Minimize a window.

    Args:
        window_id: Window identifier
        sandbox: Sandbox name (optional)
    """
    # Forward as the "minimize_window" command; the reply is returned as JSON text.
    params = {"window_id": window_id}
    return json.dumps(await _send_command(sandbox, "minimize_window", params))
|
||||
|
||||
@server.tool()
async def computer_window_maximize(ctx: Context, window_id: str, sandbox: str = "") -> str:
    """Maximize a window.

    Args:
        window_id: Window identifier
        sandbox: Sandbox name (optional)
    """
    # Forward as the "maximize_window" command; the reply is returned as JSON text.
    params = {"window_id": window_id}
    return json.dumps(await _send_command(sandbox, "maximize_window", params))
|
||||
|
||||
@server.tool()
async def computer_window_close(ctx: Context, window_id: str, sandbox: str = "") -> str:
    """Close a window.

    Args:
        window_id: Window identifier
        sandbox: Sandbox name (optional)
    """
    # Forward as the "close_window" command; the reply is returned as JSON text.
    params = {"window_id": window_id}
    return json.dumps(await _send_command(sandbox, "close_window", params))
|
||||
|
||||
@server.tool()
async def computer_window_resize(
    ctx: Context,
    window_id: str,
    width: int,
    height: int,
    sandbox: str = "",
) -> str:
    """Resize a window.

    Args:
        window_id: Window identifier
        width: New width in pixels
        height: New height in pixels
        sandbox: Sandbox name (optional)
    """
    # Forward as the "set_window_size" command; the reply is returned as JSON text.
    params = {"window_id": window_id, "width": width, "height": height}
    reply = await _send_command(sandbox, "set_window_size", params)
    return json.dumps(reply)
|
||||
|
||||
@server.tool()
async def computer_window_move(
    ctx: Context,
    window_id: str,
    x: int,
    y: int,
    sandbox: str = "",
) -> str:
    """Move a window to a position on screen.

    Args:
        window_id: Window identifier
        x: X coordinate for the window's top-left corner
        y: Y coordinate for the window's top-left corner
        sandbox: Sandbox name (optional)
    """
    # Forward as the "set_window_position" command; the reply is returned as JSON text.
    params = {"window_id": window_id, "x": x, "y": y}
    reply = await _send_command(sandbox, "set_window_position", params)
    return json.dumps(reply)
|
||||
|
||||
@server.tool()
async def computer_window_get_info(
    ctx: Context,
    window_id: str,
    sandbox: str = "",
) -> str:
    """Get a window's title, size, and position.

    Args:
        window_id: Window identifier
        sandbox: Sandbox name (optional)
    """

    async def _query(command):
        # Each query gets its own fresh params dict, as in three separate calls.
        return await _send_command(sandbox, command, {"window_id": window_id})

    # Queries run one after another: name, then size, then position.
    title_reply = await _query("get_window_name")
    size_reply = await _query("get_window_size")
    position_reply = await _query("get_window_position")

    # Each field prefers its specific key and falls back to the generic "data" key.
    info = {
        "window_id": window_id,
        "title": title_reply.get("name") or title_reply.get("data"),
        "size": size_reply.get("size") or size_reply.get("data"),
        "position": position_reply.get("position") or position_reply.get("data"),
    }
    return json.dumps(info, indent=2)
|
||||
|
||||
@server.tool()
async def computer_launch(
    ctx: Context,
    app: str,
    args: list[str] | None = None,
    sandbox: str = "",
) -> str:
    """Launch an application.

    Args:
        app: Application executable or bundle identifier
        args: Optional list of arguments
        sandbox: Sandbox name (optional)
    """
    # A missing or empty argument list is always sent as [].
    params = {"app": app, "args": args or []}
    return json.dumps(await _send_command(sandbox, "launch", params))
|
||||
|
||||
if Permission.COMPUTER_CLICK in permissions:
|
||||
|
||||
@server.tool()
async def computer_move_cursor(
    ctx: Context,
    x: int,
    y: int,
    sandbox: str = "",
) -> str:
    """Move the mouse cursor without clicking.

    Args:
        x: X coordinate
        y: Y coordinate
        sandbox: Sandbox name (optional)
    """
    # Forward as the "move_cursor" command; the reply is returned as JSON text.
    params = {"x": x, "y": y}
    return json.dumps(await _send_command(sandbox, "move_cursor", params))
|
||||
|
||||
@server.tool()
async def computer_mouse_down(
    ctx: Context,
    x: int,
    y: int,
    button: str = "left",
    sandbox: str = "",
) -> str:
    """Press and hold a mouse button.

    Args:
        x: X coordinate
        y: Y coordinate
        button: Mouse button (left, right, middle)
        sandbox: Sandbox name (optional)
    """
    # Forward as the "mouse_down" command; the reply is returned as JSON text.
    params = {"x": x, "y": y, "button": button}
    return json.dumps(await _send_command(sandbox, "mouse_down", params))
|
||||
|
||||
@server.tool()
async def computer_mouse_up(
    ctx: Context,
    x: int,
    y: int,
    button: str = "left",
    sandbox: str = "",
) -> str:
    """Release a mouse button.

    Args:
        x: X coordinate
        y: Y coordinate
        button: Mouse button (left, right, middle)
        sandbox: Sandbox name (optional)
    """
    # Forward as the "mouse_up" command; the reply is returned as JSON text.
    params = {"x": x, "y": y, "button": button}
    return json.dumps(await _send_command(sandbox, "mouse_up", params))
|
||||
|
||||
if Permission.COMPUTER_KEY in permissions:
|
||||
|
||||
@server.tool()
async def computer_key_down(ctx: Context, key: str, sandbox: str = "") -> str:
    """Press and hold a key.

    Args:
        key: Key to hold (e.g. "shift", "ctrl", "a")
        sandbox: Sandbox name (optional)
    """
    # Forward as the "key_down" command; the reply is returned as JSON text.
    params = {"key": key}
    return json.dumps(await _send_command(sandbox, "key_down", params))
|
||||
|
||||
@server.tool()
async def computer_key_up(ctx: Context, key: str, sandbox: str = "") -> str:
    """Release a previously held key.

    Args:
        key: Key to release
        sandbox: Sandbox name (optional)
    """
    # Forward as the "key_up" command; the reply is returned as JSON text.
    params = {"key": key}
    return json.dumps(await _send_command(sandbox, "key_up", params))
|
||||
|
||||
if Permission.COMPUTER_SCREENSHOT in permissions:
|
||||
|
||||
@server.tool()
async def computer_get_screen_size(ctx: Context, sandbox: str = "") -> str:
    """Get the screen dimensions.

    Args:
        sandbox: Sandbox name (optional)
    """
    # Parameterless query; the reply is returned as JSON text.
    return json.dumps(await _send_command(sandbox, "get_screen_size", {}))
|
||||
|
||||
@server.tool()
async def computer_get_cursor_position(ctx: Context, sandbox: str = "") -> str:
    """Get the current cursor position.

    Args:
        sandbox: Sandbox name (optional)
    """
    # Parameterless query; the reply is returned as JSON text.
    return json.dumps(await _send_command(sandbox, "get_cursor_position", {}))
|
||||
|
||||
@server.tool()
async def computer_get_accessibility_tree(ctx: Context, sandbox: str = "") -> str:
    """Get the accessibility tree of the current screen.

    Args:
        sandbox: Sandbox name (optional)
    """
    # Parameterless query; the reply is returned as JSON text.
    return json.dumps(await _send_command(sandbox, "get_accessibility_tree", {}))
|
||||
|
||||
@server.tool()
async def computer_get_current_window(ctx: Context, sandbox: str = "") -> str:
    """Get the currently focused window ID and title.

    Args:
        sandbox: Sandbox name (optional)
    """
    current = await _send_command(sandbox, "get_current_window_id", {})
    window_id = current.get("window_id") or current.get("data")

    # The title is looked up only when an ID came back.
    title = ""
    if window_id:
        named = await _send_command(sandbox, "get_window_name", {"window_id": window_id})
        title = named.get("name") or named.get("data") or ""

    # An empty title is reported as "Desktop".
    return json.dumps({"window_id": window_id, "title": title or "Desktop"})
|
||||
|
||||
|
||||
async def _register_skills_tools(server: "FastMCP", permissions: set[Permission]) -> None:
|
||||
"""Register skills management tools."""
|
||||
|
||||
@@ -5,7 +5,7 @@ import logging
|
||||
import sys
|
||||
|
||||
from cua_cli import __version__
|
||||
from cua_cli.commands import auth, image, mcp, platform, sandbox, skills
|
||||
from cua_cli.commands import auth, do, image, mcp, platform, sandbox, skills
|
||||
from cua_cli.utils.output import print_error
|
||||
|
||||
|
||||
@@ -27,6 +27,10 @@ Examples:
|
||||
cua image create linux-docker Create a local image
|
||||
cua image shell <name> Interactive shell into image
|
||||
cua platform list Show available platforms
|
||||
cua do switch docker my-ct Select automation target VM
|
||||
cua do screenshot Take a screenshot
|
||||
cua do click 100 200 Click at coordinates
|
||||
cua do type "hello" Type text
|
||||
|
||||
For more information, visit https://docs.trycua.com
|
||||
""",
|
||||
@@ -48,6 +52,8 @@ For more information, visit https://docs.trycua.com
|
||||
platform.register_parser(subparsers)
|
||||
skills.register_parser(subparsers)
|
||||
mcp.register_parser(subparsers)
|
||||
do.register_parser(subparsers)
|
||||
do.register_host_consent_parser(subparsers)
|
||||
|
||||
return parser
|
||||
|
||||
@@ -81,6 +87,10 @@ def main() -> int:
|
||||
return skills.execute(args)
|
||||
elif args.command == "serve-mcp":
|
||||
return mcp.execute(args)
|
||||
elif args.command == "do":
|
||||
return do.execute(args)
|
||||
elif args.command == "do-host-consent":
|
||||
return do.execute_host_consent(args)
|
||||
else:
|
||||
print_error(f"Unknown command: {args.command}")
|
||||
return 1
|
||||
|
||||
@@ -24,12 +24,14 @@ classifiers = [
|
||||
"Programming Language :: Python :: 3.13",
|
||||
"Environment :: Console",
|
||||
]
|
||||
requires-python = ">=3.11,<3.14"
|
||||
requires-python = ">=3.12,<3.14"
|
||||
|
||||
dependencies = [
|
||||
# Core CUA packages
|
||||
"cua-computer>=0.4.0",
|
||||
"cua-computer>=0.5.0",
|
||||
"cua-core>=0.1.0",
|
||||
# Host automation (used by cua do switch host)
|
||||
"cua-auto>=0.1.0",
|
||||
# HTTP client
|
||||
"aiohttp>=3.9.0",
|
||||
# CLI output
|
||||
|
||||
@@ -0,0 +1,127 @@
|
||||
---
|
||||
name: cua-do-cli
|
||||
description: Automates interactions with remote VMs, local Docker containers, cloud sandboxes, and the host PC via cua do. Use when navigating GUIs, taking snapshots/screenshots, clicking, typing, scrolling, running shell commands, or managing windows on a target machine.
|
||||
---
|
||||
|
||||
# VM Automation with cua do
|
||||
|
||||
One command = one action = one ✅/❌ line. Switch targets once, then operate.
|
||||
|
||||
## Start Here
|
||||
|
||||
1. Pick a flow:
|
||||
- API key available: switch → snapshot → interact → snapshot
|
||||
- No API key: switch → screenshot → read image → interact → screenshot
|
||||
- Host control: consent → switch host → operate
|
||||
2. Run the canonical flow below.
|
||||
3. Check the command skeleton only if blocked.
|
||||
|
||||
## Decision Map
|
||||
|
||||
- No target set: `switch` → then operate
|
||||
- **ANTHROPIC_API_KEY set** (normal): `switch` → `snapshot` → `click/type/key` → repeat snapshot after each UI change
|
||||
- **No API key**: `switch` → `screenshot` → read image to determine coords → `click/type/key` → repeat screenshot
|
||||
- Zoom to one window: `zoom "App Name"` → operate → `unzoom`
|
||||
- Run shell: `shell "command"`
|
||||
- Control host PC: `do-host-consent` (once) → `switch host` → operate
|
||||
|
||||
## Canonical Flows
|
||||
|
||||
### 1) Normal (ANTHROPIC_API_KEY set)
|
||||
|
||||
```bash
|
||||
cua do switch docker my-container
|
||||
cua do snapshot # AI summary + interactive elements with coords
|
||||
cua do click 450 300
|
||||
cua do type "hello"
|
||||
cua do key enter
|
||||
cua do snapshot # re-snapshot after UI change
|
||||
```
|
||||
|
||||
### 2) No API Key
|
||||
|
||||
```bash
|
||||
cua do switch docker my-container
|
||||
cua do screenshot # save path printed; read the image to find coords
|
||||
cua do click 450 300
|
||||
cua do type "hello"
|
||||
cua do key enter
|
||||
cua do screenshot # re-screenshot after UI change
|
||||
```
|
||||
|
||||
### 3) Focused Window (Zoom)
|
||||
|
||||
```bash
|
||||
cua do zoom "Chrome"
|
||||
cua do snapshot # cropped to Chrome; coords are window-relative
|
||||
cua do click 120 80
|
||||
cua do unzoom
|
||||
```
|
||||
|
||||
### 4) Host PC Control
|
||||
|
||||
```bash
|
||||
cua do-host-consent # one-time consent, no prompt
|
||||
cua do switch host
|
||||
cua do snapshot
|
||||
cua do click 100 200
|
||||
```
|
||||
|
||||
## Command Skeleton
|
||||
|
||||
### Target
|
||||
|
||||
```bash
|
||||
cua do switch <provider> [name] # cloud, docker, lume, lumier, winsandbox, host
|
||||
cua do status
|
||||
cua do ls [provider]
|
||||
```
|
||||
|
||||
### Snapshot / Screenshot
|
||||
|
||||
```bash
|
||||
cua do snapshot ["extra instructions"] # screenshot + AI summary (needs ANTHROPIC_API_KEY)
|
||||
cua do screenshot [--save path]
|
||||
cua do zoom "Window Name"
|
||||
cua do unzoom
|
||||
```
|
||||
|
||||
### Input
|
||||
|
||||
```bash
|
||||
cua do click <x> <y> [left|right|middle]
|
||||
cua do dclick <x> <y>
|
||||
cua do move <x> <y>
|
||||
cua do type "text"
|
||||
cua do key <key> # enter, escape, tab, space, f1–f12, …
|
||||
cua do hotkey ctrl+c
|
||||
cua do scroll <up|down|left|right> [n]
|
||||
cua do drag <x1> <y1> <x2> <y2>
|
||||
```
|
||||
|
||||
### Shell / Open
|
||||
|
||||
```bash
|
||||
cua do shell "command"
|
||||
cua do open <url|path>
|
||||
```
|
||||
|
||||
### Window
|
||||
|
||||
```bash
|
||||
cua do window ls [app]
|
||||
cua do window focus <id>
|
||||
cua do window unfocus
|
||||
cua do window <minimize|maximize|close> <id>
|
||||
cua do window resize <id> <w> <h>
|
||||
cua do window move <id> <x> <y>
|
||||
cua do window info <id>
|
||||
```
|
||||
|
||||
## Guardrails
|
||||
|
||||
- Re-snapshot after navigation, modals, or list changes — coords go stale.
|
||||
- `snapshot` needs `ANTHROPIC_API_KEY`; use `screenshot` otherwise.
|
||||
- Coords are image-space: zoom + max-length scaling are applied automatically.
|
||||
- `do-host-consent` is permanent until the consent file is deleted (`~/.cua/host_consented`).
|
||||
- Set `PYTHONIOENCODING=utf-8` on Windows for correct emoji output.
|
||||
Reference in New Issue
Block a user