diff --git a/docs/content/docs/computer-sdk/commands.mdx b/docs/content/docs/computer-sdk/commands.mdx index 4d36baa4..349d67d8 100644 --- a/docs/content/docs/computer-sdk/commands.mdx +++ b/docs/content/docs/computer-sdk/commands.mdx @@ -22,6 +22,51 @@ Execute shell commands and get detailed results: +## Window Management + +Control application launching and windows: + + + + ```python + # Launch applications + await computer.interface.launch("xfce4-terminal") + await computer.interface.launch("libreoffice --writer") + await computer.interface.open("https://www.google.com") + + # Window management + windows = await computer.interface.get_application_windows("xfce4-terminal") + window_id = windows[0] + await computer.interface.activate_window(window_id) + + window_id = await computer.interface.get_current_window_id() # get the current active window id + await computer.interface.window_size(window_id) + await computer.interface.get_window_title(window_id) + await computer.interface.close_window(window_id) + ``` + + + + ```typescript + // Launch applications + await computer.interface.launch("xfce4-terminal"); + await computer.interface.launch("libreoffice --writer"); + await computer.interface.open("https://www.google.com"); + + // Window management + const windows = await computer.interface.getApplicationWindows("xfce4-terminal"); + let windowId = windows[0]; + await computer.interface.activateWindow(windowId); + + windowId = await computer.interface.getCurrentWindowId(); // current active window id + await computer.interface.getWindowSize(windowId); + await computer.interface.getWindowName(windowId); + await computer.interface.closeWindow(windowId); + ``` + + + + ## Mouse Actions Precise mouse control and interaction: diff --git a/libs/python/computer-server/computer_server/handlers/base.py b/libs/python/computer-server/computer_server/handlers/base.py index 73250d44..a49f232c 100644 --- a/libs/python/computer-server/computer_server/handlers/base.py +++ 
b/libs/python/computer-server/computer_server/handlers/base.py @@ -105,6 +105,55 @@ class BaseDesktopHandler(ABC): pass +class BaseWindowHandler(ABC): + """Abstract class for OS-specific window management handlers. + + Categories: + - Window Management: Methods for application/window control + """ + + # Window Management + @abstractmethod + async def open(self, target: str) -> Dict[str, Any]: + """Open a file or URL with the default application.""" + pass + + @abstractmethod + async def launch(self, app: str, args: Optional[List[str]] = None) -> Dict[str, Any]: + """Launch an application with optional arguments.""" + pass + + @abstractmethod + async def get_current_window_id(self) -> Dict[str, Any]: + """Get the currently active window ID.""" + pass + + @abstractmethod + async def get_application_windows(self, app: str) -> Dict[str, Any]: + """Get windows belonging to an application (by name or bundle).""" + pass + + @abstractmethod + async def get_window_name(self, window_id: str) -> Dict[str, Any]: + """Get the title/name of a window by ID.""" + pass + + @abstractmethod + async def get_window_size(self, window_id: str) -> Dict[str, Any]: + """Get the size of a window by ID as {width, height}.""" + pass + + @abstractmethod + async def activate_window(self, window_id: str) -> Dict[str, Any]: + """Bring a window to the foreground by ID.""" + pass + + @abstractmethod + async def close_window(self, window_id: str) -> Dict[str, Any]: + """Close a window by ID.""" + pass + + class BaseAutomationHandler(ABC): """Abstract base class for OS-specific automation handlers. 
diff --git a/libs/python/computer-server/computer_server/handlers/factory.py b/libs/python/computer-server/computer_server/handlers/factory.py index 77d88e5f..aadcac15 100644 --- a/libs/python/computer-server/computer_server/handlers/factory.py +++ b/libs/python/computer-server/computer_server/handlers/factory.py @@ -9,6 +9,7 @@ from .base import ( BaseAutomationHandler, BaseDesktopHandler, BaseFileHandler, + BaseWindowHandler, ) # Conditionally import platform-specific handlers @@ -22,7 +23,7 @@ elif system == "linux": elif system == "windows": from .windows import WindowsAccessibilityHandler, WindowsAutomationHandler -from .generic import GenericDesktopHandler, GenericFileHandler +from .generic import GenericDesktopHandler, GenericFileHandler, GenericWindowHandler class HandlerFactory: @@ -61,6 +62,7 @@ class HandlerFactory: BaseDioramaHandler, BaseFileHandler, BaseDesktopHandler, + BaseWindowHandler, ] ): """Create and return appropriate handlers for the current OS. @@ -82,6 +84,7 @@ class HandlerFactory: MacOSDioramaHandler(), GenericFileHandler(), GenericDesktopHandler(), + GenericWindowHandler(), ) elif os_type == "linux": return ( @@ -90,6 +93,7 @@ class HandlerFactory: BaseDioramaHandler(), GenericFileHandler(), GenericDesktopHandler(), + GenericWindowHandler(), ) elif os_type == "windows": return ( @@ -98,6 +102,7 @@ class HandlerFactory: BaseDioramaHandler(), GenericFileHandler(), GenericDesktopHandler(), + GenericWindowHandler(), ) else: raise NotImplementedError(f"OS '{os_type}' is not supported") diff --git a/libs/python/computer-server/computer_server/handlers/generic.py b/libs/python/computer-server/computer_server/handlers/generic.py index c7348312..e39a693c 100644 --- a/libs/python/computer-server/computer_server/handlers/generic.py +++ b/libs/python/computer-server/computer_server/handlers/generic.py @@ -8,11 +8,20 @@ Includes: """ import base64 +import os +import platform +import subprocess +import webbrowser from pathlib import Path from typing 
class GenericWindowHandler(BaseWindowHandler):
    """Cross-platform window management using pywinctl where possible.

    Every public coroutine returns a result dictionary with a boolean
    ``"success"`` key; failures carry an ``"error"`` message instead of
    raising. Window-specific operations require the optional ``pywinctl``
    module (bound to ``pwc``, which is ``None`` when the import failed).
    """

    async def open(self, target: str) -> Dict[str, Any]:
        """Open a URL in the default browser, or a path with the OS default app.

        Args:
            target: An ``http://`` / ``https://`` URL, or a file-system path
                (resolved with ``resolve_path`` before being opened).

        Returns:
            ``{"success": bool}``, plus ``"error"`` on failure or when the
            operating system is not one of darwin / linux / windows.
        """
        try:
            if target.startswith(("http://", "https://")):
                return {"success": bool(webbrowser.open(target))}
            path = str(resolve_path(target))
            # Named os_name rather than sys to avoid shadowing the stdlib module.
            os_name = platform.system().lower()
            if os_name == "darwin":
                subprocess.Popen(["open", path])
            elif os_name == "linux":
                subprocess.Popen(["xdg-open", path])
            elif os_name == "windows":
                os.startfile(path)  # type: ignore[attr-defined]
            else:
                return {"success": False, "error": f"Unsupported OS: {os_name}"}
            return {"success": True}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def launch(self, app: str, args: Optional[list[str]] = None) -> Dict[str, Any]:
        """Start an application without waiting for it to exit.

        Args:
            app: Executable name, or a full command line when ``args`` is empty.
            args: Optional argument list. When given, ``app`` is executed
                directly (no shell) with these arguments.

        Returns:
            ``{"success": True, "pid": <int>}`` or
            ``{"success": False, "error": <str>}``.
        """
        try:
            if args:
                proc = subprocess.Popen([app, *args])
            else:
                # Allow a shell command line such as "libreoffice --writer".
                # NOTE(security): shell=True hands `app` to the shell verbatim,
                # so this path is equivalent to arbitrary command execution.
                proc = subprocess.Popen(app, shell=True)
            return {"success": True, "pid": proc.pid}
        except Exception as e:
            return {"success": False, "error": str(e)}

    def _get_window_by_id(self, window_id: int | str):
        """Return the pywinctl window whose handle matches ``window_id``.

        ``getAllWindowsDict()`` is keyed by application name, not by handle,
        so the live window objects are scanned instead. Handles are compared
        in string form so an integer handle and its string form both match.

        Raises:
            RuntimeError: If pywinctl is not available.

        Returns:
            The matching window object, or ``None`` when there is no match or
            enumeration fails.
        """
        if pwc is None:
            raise RuntimeError("pywinctl not available")
        wanted = str(window_id)
        try:
            for win in pwc.getAllWindows():
                if str(win.getHandle()) == wanted:
                    return win
        except Exception:
            return None
        return None

    async def get_current_window_id(self) -> Dict[str, Any]:
        """Report the handle of the currently active window.

        Returns:
            ``{"success": True, "window_id": <handle>}`` or a failure dict when
            pywinctl is missing or no window is active.
        """
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            win = pwc.getActiveWindow()
            if not win:
                return {"success": False, "error": "No active window"}
            return {"success": True, "window_id": win.getHandle()}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def get_application_windows(self, app: str) -> Dict[str, Any]:
        """List handles of windows that belong to an application.

        A window matches when its title contains ``app`` (case-insensitive)
        or when the name of its owning application does. Title matches come
        first, in the order pywinctl reports them.

        Returns:
            ``{"success": True, "windows": [<handle>, ...]}`` or a failure dict.
        """
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            wins = pwc.getWindowsWithTitle(app, condition=pwc.Re.CONTAINS, flags=pwc.Re.IGNORECASE)
            ids = [w.getHandle() for w in wins]
            # Titles often omit the executable name (e.g. "Terminal - user@host"
            # for xfce4-terminal), so also match on the owning application.
            needle = app.casefold()
            for w in pwc.getAllWindows():
                handle = w.getHandle()
                if handle in ids:
                    continue
                try:
                    owner = w.getAppName() or ""
                except Exception:
                    # The owning process may have exited or be inaccessible.
                    continue
                if needle in owner.casefold():
                    ids.append(handle)
            return {"success": True, "windows": ids}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def get_window_name(self, window_id: int | str) -> Dict[str, Any]:
        """Report the title of the window identified by ``window_id``.

        Returns:
            ``{"success": True, "name": <title>}`` or a failure dict.
        """
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            w = self._get_window_by_id(window_id)
            if not w:
                return {"success": False, "error": "Window not found"}
            return {"success": True, "name": w.title}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def get_window_size(self, window_id: int | str) -> Dict[str, Any]:
        """Report the size of the window identified by ``window_id``.

        Returns:
            ``{"success": True, "width": <int>, "height": <int>}`` or a
            failure dict.
        """
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            w = self._get_window_by_id(window_id)
            if not w:
                return {"success": False, "error": "Window not found"}
            width, height = w.size
            return {"success": True, "width": int(width), "height": int(height)}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def activate_window(self, window_id: int | str) -> Dict[str, Any]:
        """Bring the window identified by ``window_id`` to the foreground.

        Returns:
            ``{"success": <bool result of activate()>}`` or a failure dict.
        """
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            w = self._get_window_by_id(window_id)
            if not w:
                return {"success": False, "error": "Window not found"}
            ok = w.activate()
            return {"success": bool(ok)}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def close_window(self, window_id: int | str) -> Dict[str, Any]:
        """Close the window identified by ``window_id``.

        Returns:
            ``{"success": <bool result of close()>}`` or a failure dict.
        """
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            w = self._get_window_by_id(window_id)
            if not w:
                return {"success": False, "error": "Window not found"}
            ok = w.close()
            return {"success": bool(ok)}
        except Exception as e:
            return {"success": False, "error": str(e)}
automation_handler.mouse_down, "mouse_up": automation_handler.mouse_up, diff --git a/libs/python/computer-server/pyproject.toml b/libs/python/computer-server/pyproject.toml index a8ecfb23..20fff950 100644 --- a/libs/python/computer-server/pyproject.toml +++ b/libs/python/computer-server/pyproject.toml @@ -23,6 +23,7 @@ dependencies = [ "aiohttp>=3.9.1", "pyperclip>=1.9.0", "websockets>=12.0", + "pywinctl>=0.4.1", # OS-specific runtime deps "pyobjc-framework-Cocoa>=10.1; sys_platform == 'darwin'", "pyobjc-framework-Quartz>=10.1; sys_platform == 'darwin'", diff --git a/libs/python/computer/computer/interface/generic.py b/libs/python/computer/computer/interface/generic.py index 7cf47461..80b24199 100644 --- a/libs/python/computer/computer/interface/generic.py +++ b/libs/python/computer/computer/interface/generic.py @@ -499,6 +499,62 @@ class GenericComputerInterface(BaseComputerInterface): if not result.get("success", False): raise RuntimeError(result.get("error", "Failed to set wallpaper")) + # Window management + async def open(self, target: str) -> None: + result = await self._send_command("open", {"target": target}) + if not result.get("success", False): + raise RuntimeError(result.get("error", "Failed to open target")) + + async def launch(self, app: str, args: list[str] | None = None) -> int | None: + payload: dict[str, object] = {"app": app} + if args is not None: + payload["args"] = args + result = await self._send_command("launch", payload) + if not result.get("success", False): + raise RuntimeError(result.get("error", "Failed to launch application")) + return result.get("pid") # type: ignore[return-value] + + async def get_current_window_id(self) -> int | str: + result = await self._send_command("get_current_window_id") + if not result.get("success", False): + raise RuntimeError(result.get("error", "Failed to get current window id")) + return result["window_id"] # type: ignore[return-value] + + async def get_application_windows(self, app: str) -> list[int | 
str]: + result = await self._send_command("get_application_windows", {"app": app}) + if not result.get("success", False): + raise RuntimeError(result.get("error", "Failed to get application windows")) + return list(result.get("windows", [])) # type: ignore[return-value] + + async def get_window_name(self, window_id: int | str) -> str: + result = await self._send_command("get_window_name", {"window_id": window_id}) + if not result.get("success", False): + raise RuntimeError(result.get("error", "Failed to get window name")) + return result.get("name", "") # type: ignore[return-value] + + async def get_window_size(self, window_id: int | str) -> tuple[int, int]: + result = await self._send_command("get_window_size", {"window_id": window_id}) + if not result.get("success", False): + raise RuntimeError(result.get("error", "Failed to get window size")) + return int(result.get("width", 0)), int(result.get("height", 0)) + + async def activate_window(self, window_id: int | str) -> None: + result = await self._send_command("activate_window", {"window_id": window_id}) + if not result.get("success", False): + raise RuntimeError(result.get("error", "Failed to activate window")) + + async def close_window(self, window_id: int | str) -> None: + result = await self._send_command("close_window", {"window_id": window_id}) + if not result.get("success", False): + raise RuntimeError(result.get("error", "Failed to close window")) + + # Convenience aliases + async def get_window_title(self, window_id: int | str) -> str: + return await self.get_window_name(window_id) + + async def window_size(self, window_id: int | str) -> tuple[int, int]: + return await self.get_window_size(window_id) + # Command execution async def run_command(self, command: str) -> CommandResult: result = await self._send_command("run_command", {"command": command}) diff --git a/libs/typescript/computer/src/interface/base.ts b/libs/typescript/computer/src/interface/base.ts index e41c4416..333060fa 100644 --- 
// Window Management — abstract contract (BaseComputerInterface, base.ts)
abstract open(target: string): Promise<void>;
abstract launch(app: string, args?: string[]): Promise<number | undefined>;
abstract getCurrentWindowId(): Promise<number | string>;
abstract getApplicationWindows(app: string): Promise<Array<number | string>>;
abstract getWindowName(windowId: number | string): Promise<string>;
abstract getWindowSize(windowId: number | string): Promise<[number, number]>;
abstract activateWindow(windowId: number | string): Promise<void>;
abstract closeWindow(windowId: number | string): Promise<void>;

// Window Management — implementation (MacOSComputerInterface, macos.ts)
/**
 * Open a file path or URL with the default handler.
 * @throws Error when the server does not report success.
 */
async open(target: string): Promise<void> {
  const response = await this.sendCommand('open', { target });
  if (!response.success) {
    throw new Error((response.error as string) || 'Failed to open target');
  }
}

/**
 * Launch an application (string may include args). Returns pid if available.
 * `args` is only sent when provided.
 * @throws Error when the server does not report success.
 */
async launch(app: string, args?: string[]): Promise<number | undefined> {
  const response = await this.sendCommand('launch', args ? { app, args } : { app });
  if (!response.success) {
    throw new Error((response.error as string) || 'Failed to launch application');
  }
  return (response.pid as number) || undefined;
}

/**
 * Get the current active window id.
 * @throws Error when the server does not report success or omits `window_id`.
 */
async getCurrentWindowId(): Promise<number | string> {
  const response = await this.sendCommand('get_current_window_id');
  if (!response.success || response.window_id === undefined) {
    throw new Error((response.error as string) || 'Failed to get current window id');
  }
  return response.window_id as number | string;
}

/**
 * Get windows belonging to an application (by name).
 * Returns an empty array when the reply carries no `windows`.
 * @throws Error when the server does not report success.
 */
async getApplicationWindows(app: string): Promise<Array<number | string>> {
  const response = await this.sendCommand('get_application_windows', { app });
  if (!response.success) {
    throw new Error((response.error as string) || 'Failed to get application windows');
  }
  return (response.windows as Array<number | string>) || [];
}

/**
 * Get window title/name by id. Returns '' when the reply carries no `name`.
 * @throws Error when the server does not report success.
 */
async getWindowName(windowId: number | string): Promise<string> {
  const response = await this.sendCommand('get_window_name', { window_id: windowId });
  if (!response.success) {
    throw new Error((response.error as string) || 'Failed to get window name');
  }
  return (response.name as string) || '';
}

/**
 * Get window size as [width, height]. Missing or non-numeric values become 0.
 * @throws Error when the server does not report success.
 */
async getWindowSize(windowId: number | string): Promise<[number, number]> {
  const response = await this.sendCommand('get_window_size', { window_id: windowId });
  if (!response.success) {
    throw new Error((response.error as string) || 'Failed to get window size');
  }
  return [Number(response.width) || 0, Number(response.height) || 0];
}

/**
 * Activate a window by id.
 * @throws Error when the server does not report success.
 */
async activateWindow(windowId: number | string): Promise<void> {
  const response = await this.sendCommand('activate_window', { window_id: windowId });
  if (!response.success) {
    throw new Error((response.error as string) || 'Failed to activate window');
  }
}

/**
 * Close a window by id.
 * @throws Error when the server does not report success.
 */
async closeWindow(windowId: number | string): Promise<void> {
  const response = await this.sendCommand('close_window', { window_id: windowId });
  if (!response.success) {
    throw new Error((response.error as string) || 'Failed to close window');
  }
}