Add window management commands

This commit is contained in:
Dillon DuPont
2025-10-24 16:40:29 -07:00
parent fb174d6aa4
commit 83d8d7e82b
9 changed files with 379 additions and 3 deletions

View File

@@ -22,6 +22,51 @@ Execute shell commands and get detailed results:
</Tab>
</Tabs>
## Window Management
Launch applications and manage their windows:
<Tabs items={['Python', 'TypeScript']}>
<Tab value="Python">
```python
# Launch applications
await computer.interface.launch("xfce4-terminal")
await computer.interface.launch("libreoffice --writer")
await computer.interface.open("https://www.google.com")
# Window management
windows = await computer.interface.get_application_windows("xfce4-terminal")
window_id = windows[0]
await computer.interface.activate_window(window_id)
window_id = await computer.interface.get_current_window_id() # get the current active window id
await computer.interface.window_size(window_id)
await computer.interface.get_window_title(window_id)
await computer.interface.close_window(window_id)
```
</Tab>
<Tab value="TypeScript">
```typescript
// Launch applications
await computer.interface.launch("xfce4-terminal");
await computer.interface.launch("libreoffice --writer");
await computer.interface.open("https://www.google.com");
// Window management
const windows = await computer.interface.getApplicationWindows("xfce4-terminal");
let windowId = windows[0];
await computer.interface.activateWindow(windowId);
windowId = await computer.interface.getCurrentWindowId(); // current active window id
await computer.interface.getWindowSize(windowId);
await computer.interface.getWindowName(windowId);
await computer.interface.closeWindow(windowId);
```
</Tab>
</Tabs>
## Mouse Actions
Precise mouse control and interaction:

View File

@@ -105,6 +105,55 @@ class BaseDesktopHandler(ABC):
pass
class BaseWindowHandler(ABC):
"""Abstract class for OS-specific window management handlers.
Categories:
- Window Management: Methods for application/window control
"""
# Window Management
@abstractmethod
async def open(self, target: str) -> Dict[str, Any]:
"""Open a file or URL with the default application."""
pass
@abstractmethod
async def launch(self, app: str, args: Optional[List[str]] = None) -> Dict[str, Any]:
"""Launch an application with optional arguments."""
pass
@abstractmethod
async def get_current_window_id(self) -> Dict[str, Any]:
"""Get the currently active window ID."""
pass
@abstractmethod
async def get_application_windows(self, app: str) -> Dict[str, Any]:
"""Get windows belonging to an application (by name or bundle)."""
pass
@abstractmethod
async def get_window_name(self, window_id: str) -> Dict[str, Any]:
"""Get the title/name of a window by ID."""
pass
@abstractmethod
async def get_window_size(self, window_id: str) -> Dict[str, Any]:
"""Get the size of a window by ID as {width, height}."""
pass
@abstractmethod
async def activate_window(self, window_id: str) -> Dict[str, Any]:
"""Bring a window to the foreground by ID."""
pass
@abstractmethod
async def close_window(self, window_id: str) -> Dict[str, Any]:
"""Close a window by ID."""
pass
class BaseAutomationHandler(ABC):
"""Abstract base class for OS-specific automation handlers.

View File

@@ -9,6 +9,7 @@ from .base import (
BaseAutomationHandler,
BaseDesktopHandler,
BaseFileHandler,
BaseWindowHandler,
)
# Conditionally import platform-specific handlers
@@ -22,7 +23,7 @@ elif system == "linux":
elif system == "windows":
from .windows import WindowsAccessibilityHandler, WindowsAutomationHandler
from .generic import GenericDesktopHandler, GenericFileHandler
from .generic import GenericDesktopHandler, GenericFileHandler, GenericWindowHandler
class HandlerFactory:
@@ -61,6 +62,7 @@ class HandlerFactory:
BaseDioramaHandler,
BaseFileHandler,
BaseDesktopHandler,
BaseWindowHandler,
]
):
"""Create and return appropriate handlers for the current OS.
@@ -82,6 +84,7 @@ class HandlerFactory:
MacOSDioramaHandler(),
GenericFileHandler(),
GenericDesktopHandler(),
GenericWindowHandler(),
)
elif os_type == "linux":
return (
@@ -90,6 +93,7 @@ class HandlerFactory:
BaseDioramaHandler(),
GenericFileHandler(),
GenericDesktopHandler(),
GenericWindowHandler(),
)
elif os_type == "windows":
return (
@@ -98,6 +102,7 @@ class HandlerFactory:
BaseDioramaHandler(),
GenericFileHandler(),
GenericDesktopHandler(),
GenericWindowHandler(),
)
else:
raise NotImplementedError(f"OS '{os_type}' is not supported")

View File

@@ -8,11 +8,20 @@ Includes:
"""
import base64
import os
import platform
import subprocess
import webbrowser
from pathlib import Path
from typing import Any, Dict, Optional
from ..utils import wallpaper
from .base import BaseDesktopHandler, BaseFileHandler
from .base import BaseDesktopHandler, BaseFileHandler, BaseWindowHandler
try:
import pywinctl as pwc
except Exception: # pragma: no cover
pwc = None # type: ignore
def resolve_path(path: str) -> Path:
@@ -27,6 +36,9 @@ def resolve_path(path: str) -> Path:
return Path(path).expanduser().resolve()
# ===== Cross-platform Desktop command handlers =====
class GenericDesktopHandler(BaseDesktopHandler):
"""
Generic desktop handler providing desktop-related operations.
@@ -67,6 +79,125 @@ class GenericDesktopHandler(BaseDesktopHandler):
return {"success": False, "error": str(e)}
# ===== Cross-platform window control command handlers =====
class GenericWindowHandler(BaseWindowHandler):
    """
    Cross-platform window management using pywinctl where possible.

    Every public coroutine returns a dict with a boolean ``"success"`` key.
    Failures are reported through an ``"error"`` string in that dict rather
    than by raising, so exceptions never propagate to the caller.
    """

    async def open(self, target: str) -> Dict[str, Any]:
        """Open a URL in the web browser, or a path with the OS default handler.

        Args:
            target: An ``http://``/``https://`` URL, or a file path that is
                passed through ``resolve_path`` before being opened.

        Returns:
            ``{"success": True}`` on success, otherwise
            ``{"success": False, "error": <message>}``. For URLs, success
            mirrors the return value of ``webbrowser.open``.
        """
        try:
            if target.startswith(("http://", "https://")):
                return {"success": bool(webbrowser.open(target))}

            path = str(resolve_path(target))
            system = platform.system().lower()
            if system == "darwin":
                subprocess.Popen(["open", path])
            elif system == "linux":
                subprocess.Popen(["xdg-open", path])
            elif system == "windows":
                os.startfile(path)  # type: ignore[attr-defined]
            else:
                return {"success": False, "error": f"Unsupported OS: {system}"}
            return {"success": True}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def launch(self, app: str, args: Optional[list[str]] = None) -> Dict[str, Any]:
        """Start an application without waiting for it to exit.

        Args:
            app: Executable name, or a full command line when ``args`` is
                omitted (e.g. ``"libreoffice --writer"``).
            args: Optional argument list; when given, ``app`` is executed
                directly with these arguments and no shell is involved.

        Returns:
            ``{"success": True, "pid": <pid>}`` or
            ``{"success": False, "error": <message>}``.
        """
        try:
            if args:
                proc = subprocess.Popen([app, *args])
            else:
                # allow shell command like "libreoffice --writer"
                # NOTE(review): shell=True is deliberate here, but it means
                # ``app`` is interpreted by the shell; callers must not pass
                # untrusted text through this path.
                proc = subprocess.Popen(app, shell=True)
            return {"success": True, "pid": proc.pid}
        except Exception as e:
            return {"success": False, "error": str(e)}

    def _get_window_by_id(self, window_id: int | str) -> Optional[Any]:
        """Return the pywinctl window whose native handle matches ``window_id``.

        ``pwc.getAllWindowsDict()`` is keyed by application name, not by
        window handle, so it cannot be used for this lookup. The window
        objects are scanned instead, and handles are compared as strings so
        an ID that arrived as either ``int`` or ``str`` still matches.

        Raises:
            RuntimeError: If pywinctl could not be imported.

        Returns:
            The matching window object, or ``None`` when nothing matches or
            enumeration fails.
        """
        if pwc is None:
            raise RuntimeError("pywinctl not available")
        wanted = str(window_id)
        try:
            for window in pwc.getAllWindows():
                if str(window.getHandle()) == wanted:
                    return window
            return None
        except Exception:
            # Best effort: callers report this as "Window not found".
            return None

    async def get_current_window_id(self) -> Dict[str, Any]:
        """Report the handle of the active window as ``"window_id"``."""
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            win = pwc.getActiveWindow()
            if not win:
                return {"success": False, "error": "No active window"}
            return {"success": True, "window_id": win.getHandle()}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def get_application_windows(self, app: str) -> Dict[str, Any]:
        """List handles of windows whose title contains ``app``.

        Matching is done on the window title (case-insensitive substring),
        and the handles are returned under ``"windows"``.
        """
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            wins = pwc.getWindowsWithTitle(app, condition=pwc.Re.CONTAINS, flags=pwc.Re.IGNORECASE)
            ids = [w.getHandle() for w in wins]
            return {"success": True, "windows": ids}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def get_window_name(self, window_id: int | str) -> Dict[str, Any]:
        """Return the title of the window with the given ID under ``"name"``."""
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            w = self._get_window_by_id(window_id)
            if not w:
                return {"success": False, "error": "Window not found"}
            return {"success": True, "name": w.title}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def get_window_size(self, window_id: int | str) -> Dict[str, Any]:
        """Return the window's size as integer ``"width"`` and ``"height"``."""
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            w = self._get_window_by_id(window_id)
            if not w:
                return {"success": False, "error": "Window not found"}
            width, height = w.size
            return {"success": True, "width": int(width), "height": int(height)}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def activate_window(self, window_id: int | str) -> Dict[str, Any]:
        """Activate the window; ``"success"`` mirrors pywinctl's result."""
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            w = self._get_window_by_id(window_id)
            if not w:
                return {"success": False, "error": "Window not found"}
            ok = w.activate()
            return {"success": bool(ok)}
        except Exception as e:
            return {"success": False, "error": str(e)}

    async def close_window(self, window_id: int | str) -> Dict[str, Any]:
        """Close the window; ``"success"`` mirrors pywinctl's result."""
        try:
            if pwc is None:
                return {"success": False, "error": "pywinctl not available"}
            w = self._get_window_by_id(window_id)
            if not w:
                return {"success": False, "error": "Window not found"}
            ok = w.close()
            return {"success": bool(ok)}
        except Exception as e:
            return {"success": False, "error": str(e)}
# ===== Cross-platform file system command handlers =====
class GenericFileHandler(BaseFileHandler):
"""
Generic file handler that provides file system operations for all operating systems.

View File

@@ -75,7 +75,7 @@ except Exception:
except Exception:
package_version = "unknown"
accessibility_handler, automation_handler, diorama_handler, file_handler, desktop_handler = (
accessibility_handler, automation_handler, diorama_handler, file_handler, desktop_handler, window_handler = (
HandlerFactory.create_handlers()
)
handlers = {
@@ -102,6 +102,15 @@ handlers = {
# Desktop commands
"get_desktop_environment": desktop_handler.get_desktop_environment,
"set_wallpaper": desktop_handler.set_wallpaper,
# Window management
"open": window_handler.open,
"launch": window_handler.launch,
"get_current_window_id": window_handler.get_current_window_id,
"get_application_windows": window_handler.get_application_windows,
"get_window_name": window_handler.get_window_name,
"get_window_size": window_handler.get_window_size,
"activate_window": window_handler.activate_window,
"close_window": window_handler.close_window,
# Mouse commands
"mouse_down": automation_handler.mouse_down,
"mouse_up": automation_handler.mouse_up,

View File

@@ -23,6 +23,7 @@ dependencies = [
"aiohttp>=3.9.1",
"pyperclip>=1.9.0",
"websockets>=12.0",
"pywinctl>=0.4.1",
# OS-specific runtime deps
"pyobjc-framework-Cocoa>=10.1; sys_platform == 'darwin'",
"pyobjc-framework-Quartz>=10.1; sys_platform == 'darwin'",

View File

@@ -499,6 +499,62 @@ class GenericComputerInterface(BaseComputerInterface):
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to set wallpaper"))
# Window management
async def open(self, target: str) -> None:
    """Ask the server to open ``target`` (a file path or URL).

    Raises:
        RuntimeError: If the reply is not marked successful; the message is
            the server-supplied error when present.
    """
    reply = await self._send_command("open", {"target": target})
    if reply.get("success", False):
        return
    raise RuntimeError(reply.get("error", "Failed to open target"))
async def launch(self, app: str, args: list[str] | None = None) -> int | None:
payload: dict[str, object] = {"app": app}
if args is not None:
payload["args"] = args
result = await self._send_command("launch", payload)
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to launch application"))
return result.get("pid") # type: ignore[return-value]
async def get_current_window_id(self) -> int | str:
result = await self._send_command("get_current_window_id")
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to get current window id"))
return result["window_id"] # type: ignore[return-value]
async def get_application_windows(self, app: str) -> list[int | str]:
result = await self._send_command("get_application_windows", {"app": app})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to get application windows"))
return list(result.get("windows", [])) # type: ignore[return-value]
async def get_window_name(self, window_id: int | str) -> str:
result = await self._send_command("get_window_name", {"window_id": window_id})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to get window name"))
return result.get("name", "") # type: ignore[return-value]
async def get_window_size(self, window_id: int | str) -> tuple[int, int]:
result = await self._send_command("get_window_size", {"window_id": window_id})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to get window size"))
return int(result.get("width", 0)), int(result.get("height", 0))
async def activate_window(self, window_id: int | str) -> None:
result = await self._send_command("activate_window", {"window_id": window_id})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to activate window"))
async def close_window(self, window_id: int | str) -> None:
result = await self._send_command("close_window", {"window_id": window_id})
if not result.get("success", False):
raise RuntimeError(result.get("error", "Failed to close window"))
# Convenience aliases
async def get_window_title(self, window_id: int | str) -> str:
return await self.get_window_name(window_id)
async def window_size(self, window_id: int | str) -> tuple[int, int]:
return await self.get_window_size(window_id)
# Command execution
async def run_command(self, command: str) -> CommandResult:
result = await self._send_command("run_command", {"command": command})

View File

@@ -314,6 +314,16 @@ export abstract class BaseComputerInterface {
abstract getScreenSize(): Promise<ScreenSize>;
abstract getCursorPosition(): Promise<CursorPosition>;
// Window Management
/** Open a file path or URL with the default handler. */
abstract open(target: string): Promise<void>;
/** Launch an application; resolves to a pid, or undefined when none is reported. */
abstract launch(app: string, args?: string[]): Promise<number | undefined>;
/** Get the current active window id. */
abstract getCurrentWindowId(): Promise<number | string>;
/** Get the ids of the windows belonging to an application (by name). */
abstract getApplicationWindows(app: string): Promise<Array<number | string>>;
/** Get a window's title/name by id. */
abstract getWindowName(windowId: number | string): Promise<string>;
/** Get a window's size by id as a [width, height] pair. */
abstract getWindowSize(windowId: number | string): Promise<[number, number]>;
/** Activate a window by id. */
abstract activateWindow(windowId: number | string): Promise<void>;
/** Close a window by id. */
abstract closeWindow(windowId: number | string): Promise<void>;
// Desktop Actions
abstract getDesktopEnvironment(): Promise<string>;
abstract setWallpaper(path: string): Promise<void>;

View File

@@ -212,6 +212,76 @@ export class MacOSComputerInterface extends BaseComputerInterface {
return response.position as CursorPosition;
}
// Window Management
/**
 * Open a file path or URL with the default handler.
 *
 * @throws Error when the server reply is not marked successful.
 */
async open(target: string): Promise<void> {
  const reply = await this.sendCommand('open', { target });
  if (reply.success) {
    return;
  }
  throw new Error((reply.error as string) || 'Failed to open target');
}
/**
 * Launch an application (string may include args).
 *
 * @returns The pid from the reply if available, otherwise undefined.
 * @throws Error when the server reply is not marked successful.
 */
async launch(app: string, args?: string[]): Promise<number | undefined> {
  // Only send `args` when the caller actually supplied them.
  const params = args ? { app, args } : { app };
  const reply = await this.sendCommand('launch', params);
  if (reply.success) {
    return (reply.pid as number) || undefined;
  }
  throw new Error((reply.error as string) || 'Failed to launch application');
}
/**
 * Get the current active window id.
 *
 * @throws Error when the reply is unsuccessful or carries no `window_id`.
 */
async getCurrentWindowId(): Promise<number | string> {
  const reply = await this.sendCommand('get_current_window_id');
  if (reply.success && reply.window_id !== undefined) {
    return reply.window_id as number | string;
  }
  throw new Error((reply.error as string) || 'Failed to get current window id');
}
/**
 * Get windows belonging to an application (by name).
 *
 * @returns The `windows` array from the reply, or an empty array when absent.
 * @throws Error when the server reply is not marked successful.
 */
async getApplicationWindows(app: string): Promise<Array<number | string>> {
  const reply = await this.sendCommand('get_application_windows', { app });
  if (reply.success) {
    const windows = reply.windows as Array<number | string>;
    return windows || [];
  }
  throw new Error((reply.error as string) || 'Failed to get application windows');
}
/**
 * Get window title/name by id.
 *
 * @returns The `name` from the reply, or an empty string when absent.
 * @throws Error when the server reply is not marked successful.
 */
async getWindowName(windowId: number | string): Promise<string> {
  const reply = await this.sendCommand('get_window_name', { window_id: windowId });
  if (reply.success) {
    const name = reply.name as string;
    return name || '';
  }
  throw new Error((reply.error as string) || 'Failed to get window name');
}
/**
 * Get window size as [width, height].
 *
 * Each dimension falls back to 0 when it is missing or not numeric.
 *
 * @throws Error when the server reply is not marked successful.
 */
async getWindowSize(windowId: number | string): Promise<[number, number]> {
  const reply = await this.sendCommand('get_window_size', { window_id: windowId });
  if (reply.success) {
    const width = Number(reply.width) || 0;
    const height = Number(reply.height) || 0;
    return [width, height];
  }
  throw new Error((reply.error as string) || 'Failed to get window size');
}
/**
 * Activate a window by id.
 *
 * @throws Error when the server reply is not marked successful.
 */
async activateWindow(windowId: number | string): Promise<void> {
  const reply = await this.sendCommand('activate_window', { window_id: windowId });
  if (reply.success) {
    return;
  }
  throw new Error((reply.error as string) || 'Failed to activate window');
}
/**
 * Close a window by id.
 *
 * @throws Error when the server reply is not marked successful.
 */
async closeWindow(windowId: number | string): Promise<void> {
  const reply = await this.sendCommand('close_window', { window_id: windowId });
  if (reply.success) {
    return;
  }
  throw new Error((reply.error as string) || 'Failed to close window');
}
// Desktop Actions
/**
* Get the current desktop environment string (e.g., 'xfce4', 'gnome', 'kde', 'mac', 'windows').