From 37c5be669bf256a05350f8429540f7d483eaaccf Mon Sep 17 00:00:00 2001 From: Adam Date: Tue, 2 Dec 2025 12:53:25 -0500 Subject: [PATCH] dd browser tool with Playwright for visible browser automation Add browser tool with Playwright/Firefox support. Includes BrowserManager, /playwright_exec endpoint, BrowserTool client, and auto-recovery. Fixes Python version in startup script and adds Playwright to Docker build. --- examples/BROWSER_TOOL_README.md | 69 ++ examples/browser_tool_example.py | 96 ++ libs/python/agent/agent/tools/__init__.py | 6 + libs/python/agent/agent/tools/browser_tool.py | 143 +++ .../computer_server/browser.py | 308 +++++++ .../computer-server/computer_server/main.py | 67 ++ libs/python/computer-server/pyproject.toml | 1 + libs/xfce/Dockerfile | 10 + libs/xfce/README_BUILD.md | 32 + libs/xfce/browser.py | 308 +++++++ libs/xfce/main.py | 820 ++++++++++++++++++ .../xfce/src/scripts/start-computer-server.sh | 2 +- 12 files changed, 1861 insertions(+), 1 deletion(-) create mode 100644 examples/BROWSER_TOOL_README.md create mode 100644 examples/browser_tool_example.py create mode 100644 libs/python/agent/agent/tools/__init__.py create mode 100644 libs/python/agent/agent/tools/browser_tool.py create mode 100644 libs/python/computer-server/computer_server/browser.py create mode 100644 libs/xfce/README_BUILD.md create mode 100644 libs/xfce/browser.py create mode 100644 libs/xfce/main.py diff --git a/examples/BROWSER_TOOL_README.md b/examples/BROWSER_TOOL_README.md new file mode 100644 index 00000000..8d12ae85 --- /dev/null +++ b/examples/BROWSER_TOOL_README.md @@ -0,0 +1,69 @@ +# Browser Tool + +Browser automation tool that allows agents to control a Firefox browser programmatically via Playwright while keeping it visible on the XFCE desktop. + +## Quick Start + +### Using Docker (Recommended) + +```bash +# Build and run the container +cd libs/xfce +docker build -t cua-xfce . +docker run -d --name cua-xfce-test \ + -p 8000:8000 -p 5901:5901 -p 6901:6901 \ + -e DISPLAY=:1 \ + cua-xfce + +# View desktop: http://localhost:6901 +# Test the browser tool +python examples/browser_tool_example.py +``` + +### Local Testing + +```bash +# Install dependencies +pip install playwright +playwright install --with-deps firefox + +# Start server +python -m computer_server --port 8000 + +# Run test (in another terminal) +python examples/browser_tool_example.py +``` + +## Features + +- **Visible Browser**: Runs in non-headless mode so visual agents can see it +- **Auto-Recovery**: Automatically reopens browser if closed manually +- **Persistent Context**: Maintains cookies and sessions across commands +- **Fara/Magentic-One Interface**: Compatible with Microsoft agent interfaces + +## API Endpoint + +The browser tool is accessible via the `/playwright_exec` endpoint: + +```bash +curl -X POST http://localhost:8000/playwright_exec \ + -H "Content-Type: application/json" \ + -d '{"command": "visit_url", "params": {"url": "https://www.example.com"}}' +``` + +## Available Commands + +- `visit_url(url)` - Navigate to a URL +- `click(x, y)` - Click at coordinates +- `type(text)` - Type text into focused element +- `scroll(delta_x, delta_y)` - Scroll the page +- `web_search(query)` - Navigate to Google search + +## Troubleshooting + +**Browser closes unexpectedly**: The tool automatically reopens the browser on the next command. + +**Connection errors**: Make sure the server is running (`curl http://localhost:8000/status`). + +**Playwright not found**: Install with `pip install playwright && playwright install --with-deps firefox`. + diff --git a/examples/browser_tool_example.py b/examples/browser_tool_example.py new file mode 100644 index 00000000..9705ca8f --- /dev/null +++ b/examples/browser_tool_example.py @@ -0,0 +1,96 @@ +""" +Browser Tool Example + +Demonstrates how to use the BrowserTool to control a browser programmatically +via the computer server. The browser runs visibly on the XFCE desktop so visual +agents can see it. + +Prerequisites: + - Computer server running (Docker container or local) + - For Docker: Container should be running with browser tool support + - For local: Playwright and Firefox must be installed + +Usage: + python examples/browser_tool_example.py +""" + +import asyncio +import logging +import sys +from pathlib import Path + +# Import BrowserTool directly from the file +browser_tool_path = Path(__file__).parent.parent / "libs" / "python" / "agent" / "agent" / "tools" / "browser_tool.py" +sys.path.insert(0, str(browser_tool_path.parent.parent.parent)) + +# Import the module directly +import importlib.util +spec = importlib.util.spec_from_file_location("browser_tool", browser_tool_path) +if spec is None or spec.loader is None: + raise ImportError(f"Could not load browser_tool from {browser_tool_path}") +browser_tool_module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(browser_tool_module) +BrowserTool = browser_tool_module.BrowserTool + +# Configure logging to see what's happening +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +async def test_browser_tool(): + """Test the BrowserTool with various commands.""" + + # Initialize the browser tool + # For local testing, use http://localhost:8000 + # For cloud, provide base_url, api_key, and container_name + browser = BrowserTool(base_url="http://localhost:8000") + + logger.info("Testing Browser Tool...") + + try: + # Test 1: Visit a URL + logger.info("Test 1: Visiting a URL...") + result = await browser.visit_url("https://www.trycua.com") + logger.info(f"Visit URL result: {result}") + + # Wait a bit for the page to load + await asyncio.sleep(2) + + # Test 2: Web search + logger.info("Test 2: Performing a web search...") + result = await browser.web_search("Python programming") + logger.info(f"Web search result: {result}") + + # Wait a bit + await asyncio.sleep(2) + + # Test 3: Scroll + logger.info("Test 3: Scrolling the page...") + result = await browser.scroll(delta_x=0, delta_y=500) + logger.info(f"Scroll result: {result}") + + # Wait a bit + await asyncio.sleep(1) + + # Test 4: Click (example coordinates - adjust based on your screen) + logger.info("Test 4: Clicking at coordinates...") + result = await browser.click(x=500, y=300) + logger.info(f"Click result: {result}") + + # Wait a bit + await asyncio.sleep(1) + + # Test 5: Type text (if there's a focused input field) + logger.info("Test 5: Typing text...") + result = await browser.type("Hello from BrowserTool!") + logger.info(f"Type result: {result}") + + logger.info("All tests completed!") + + except Exception as e: + logger.error(f"Error during testing: {e}", exc_info=True) + + +if __name__ == "__main__": + asyncio.run(test_browser_tool()) + diff --git a/libs/python/agent/agent/tools/__init__.py b/libs/python/agent/agent/tools/__init__.py new file mode 100644 index 00000000..e663c557 --- /dev/null +++ b/libs/python/agent/agent/tools/__init__.py @@ -0,0 +1,6 @@ +"""Tools for agent interactions.""" + +from .browser_tool import BrowserTool + +__all__ = ["BrowserTool"] + diff --git a/libs/python/agent/agent/tools/browser_tool.py b/libs/python/agent/agent/tools/browser_tool.py new file mode 100644 index 00000000..8f8b1ab9 --- /dev/null +++ b/libs/python/agent/agent/tools/browser_tool.py @@ -0,0 +1,143 @@ +""" +Browser Tool for agent interactions. +Allows agents to control a browser programmatically via Playwright. +""" + +import logging +from typing import Optional + +import aiohttp + +logger = logging.getLogger(__name__) + + +class BrowserTool: + """ + Browser tool that connects to the computer server's Playwright endpoint. + Implements the Fara/Magentic-One agent interface for browser control. + """ + + def __init__( + self, + base_url: str = "http://localhost:8000", + api_key: Optional[str] = None, + container_name: Optional[str] = None, + ): + """ + Initialize the BrowserTool. + + Args: + base_url: Base URL of the computer server (default: http://localhost:8000) + api_key: Optional API key for cloud authentication + container_name: Optional container name for cloud authentication + """ + self.base_url = base_url.rstrip("/") + self.api_key = api_key + self.container_name = container_name + self.logger = logger + + def _get_endpoint_url(self) -> str: + """Get the full URL for the playwright_exec endpoint.""" + return f"{self.base_url}/playwright_exec" + + def _get_headers(self) -> dict: + """Get headers for the HTTP request.""" + headers = {"Content-Type": "application/json"} + if self.api_key: + headers["X-API-Key"] = self.api_key + if self.container_name: + headers["X-Container-Name"] = self.container_name + return headers + + async def _execute_command(self, command: str, params: dict) -> dict: + """ + Execute a browser command via HTTP POST. + + Args: + command: Command name + params: Command parameters + + Returns: + Response dictionary + """ + url = self._get_endpoint_url() + payload = {"command": command, "params": params} + headers = self._get_headers() + + try: + async with aiohttp.ClientSession() as session: + async with session.post(url, json=payload, headers=headers) as response: + if response.status == 200: + return await response.json() + else: + error_text = await response.text() + self.logger.error( + f"Browser command failed with status {response.status}: {error_text}" + ) + return {"success": False, "error": error_text} + except Exception as e: + self.logger.error(f"Error executing browser command: {e}") + return {"success": False, "error": str(e)} + + async def visit_url(self, url: str) -> dict: + """ + Navigate to a URL. + + Args: + url: URL to visit + + Returns: + Response dictionary with success status and current URL + """ + return await self._execute_command("visit_url", {"url": url}) + + async def click(self, x: int, y: int) -> dict: + """ + Click at coordinates. + + Args: + x: X coordinate + y: Y coordinate + + Returns: + Response dictionary with success status + """ + return await self._execute_command("click", {"x": x, "y": y}) + + async def type(self, text: str) -> dict: + """ + Type text into the focused element. + + Args: + text: Text to type + + Returns: + Response dictionary with success status + """ + return await self._execute_command("type", {"text": text}) + + async def scroll(self, delta_x: int, delta_y: int) -> dict: + """ + Scroll the page. + + Args: + delta_x: Horizontal scroll delta + delta_y: Vertical scroll delta + + Returns: + Response dictionary with success status + """ + return await self._execute_command("scroll", {"delta_x": delta_x, "delta_y": delta_y}) + + async def web_search(self, query: str) -> dict: + """ + Navigate to a Google search for the query. + + Args: + query: Search query + + Returns: + Response dictionary with success status and current URL + """ + return await self._execute_command("web_search", {"query": query}) + diff --git a/libs/python/computer-server/computer_server/browser.py b/libs/python/computer-server/computer_server/browser.py new file mode 100644 index 00000000..3d0a4c69 --- /dev/null +++ b/libs/python/computer-server/computer_server/browser.py @@ -0,0 +1,308 @@ +""" +Browser manager using Playwright for programmatic browser control. +This allows agents to control a browser that runs visibly on the XFCE desktop. +""" + +import asyncio +import logging +import os +from typing import Any, Dict, Optional + +try: + from playwright.async_api import async_playwright, Browser, BrowserContext, Page +except ImportError: + async_playwright = None + Browser = None + BrowserContext = None + Page = None + +logger = logging.getLogger(__name__) + + +class BrowserManager: + """ + Manages a Playwright browser instance that runs visibly on the XFCE desktop. + Uses persistent context to maintain cookies and sessions. + """ + + def __init__(self): + """Initialize the BrowserManager.""" + self.playwright = None + self.browser: Optional[Browser] = None + self.context: Optional[BrowserContext] = None + self.page: Optional[Page] = None + self._initialized = False + self._initialization_error: Optional[str] = None + self._lock = asyncio.Lock() + + async def _ensure_initialized(self): + """Ensure the browser is initialized.""" + # Check if browser was closed and needs reinitialization + if self._initialized: + try: + # Check if context is still valid by trying to access it + if self.context: + # Try to get pages - this will raise if context is closed + _ = self.context.pages + # If we get here, context is still alive + return + else: + # Context was closed, need to reinitialize + self._initialized = False + logger.warning("Browser context was closed, will reinitialize...") + except Exception as e: + # Context is dead, need to reinitialize + logger.warning(f"Browser context is dead ({e}), will reinitialize...") + self._initialized = False + self.context = None + self.page = None + # Clean up playwright if it exists + if self.playwright: + try: + await self.playwright.stop() + except Exception: + pass + self.playwright = None + + async with self._lock: + # Double-check after acquiring lock (another thread might have initialized it) + if self._initialized: + try: + if self.context: + _ = self.context.pages + return + except Exception: + self._initialized = False + self.context = None + self.page = None + if self.playwright: + try: + await self.playwright.stop() + except Exception: + pass + self.playwright = None + + if async_playwright is None: + raise RuntimeError( + "playwright is not installed. Please install it with: pip install playwright && playwright install --with-deps firefox" + ) + + try: + # Get display from environment or default to :1 + display = os.environ.get("DISPLAY", ":1") + logger.info(f"Initializing browser with DISPLAY={display}") + + # Start playwright + self.playwright = await async_playwright().start() + + # Launch Firefox with persistent context (keeps cookies/sessions) + # headless=False is CRITICAL so the visual agent can see it + user_data_dir = os.path.join(os.path.expanduser("~"), ".playwright-firefox") + os.makedirs(user_data_dir, exist_ok=True) + + # launch_persistent_context returns a BrowserContext, not a Browser + # Note: Removed --kiosk mode so the desktop remains visible + self.context = await self.playwright.firefox.launch_persistent_context( + user_data_dir=user_data_dir, + headless=False, # CRITICAL: visible for visual agent + viewport={"width": 1024, "height": 768}, + # Removed --kiosk to allow desktop visibility + ) + + # Get the first page or create one + pages = self.context.pages + if pages: + self.page = pages[0] + else: + self.page = await self.context.new_page() + + self._initialized = True + logger.info("Browser initialized successfully") + + except Exception as e: + logger.error(f"Failed to initialize browser: {e}") + import traceback + logger.error(traceback.format_exc()) + # Don't raise - return error in execute_command instead + self._initialization_error = str(e) + raise + + async def execute_command(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]: + """ + Execute a browser command. + + Args: + cmd: Command name (visit_url, click, type, scroll, web_search) + params: Command parameters + + Returns: + Result dictionary with success status and any data + """ + try: + await self._ensure_initialized() + except Exception as e: + error_msg = getattr(self, '_initialization_error', None) or str(e) + logger.error(f"Browser initialization failed: {error_msg}") + return { + "success": False, + "error": f"Browser initialization failed: {error_msg}. " + f"Make sure Playwright and Firefox are installed, and DISPLAY is set correctly." + } + + # Ensure browser is still initialized (in case it was manually closed) + # This will automatically reinitialize if the browser was closed + await self._ensure_initialized() + + # Check if page is still valid + page_valid = False + try: + if self.page is not None: + # Try to access page.url to check if it's still valid + _ = self.page.url + page_valid = True + except Exception as e: + logger.warning(f"Page is invalid: {e}, will get a new page...") + self.page = None + + # Get a valid page if we don't have one + if not page_valid or self.page is None: + try: + pages = self.context.pages + if pages: + # Find first non-closed page + for p in pages: + try: + if not p.is_closed(): + self.page = p + logger.info("Reusing existing open page") + page_valid = True + break + except Exception: + continue + + # If no valid page found, create a new one + if not page_valid: + self.page = await self.context.new_page() + logger.info("Created new page") + except Exception as e: + logger.error(f"Failed to get new page: {e}, browser may be closed") + # Browser was closed - reinitialize it + try: + logger.info("Browser was closed manually, reinitializing...") + self._initialized = False + self.context = None + self.page = None + if self.playwright: + try: + await self.playwright.stop() + except Exception: + pass + self.playwright = None + + # Reinitialize + await self._ensure_initialized() + # Get or create a page + pages = self.context.pages + if pages: + self.page = pages[0] + else: + self.page = await self.context.new_page() + logger.info("Browser reopened successfully after manual closure") + except Exception as reinit_error: + logger.error(f"Failed to reinitialize browser: {reinit_error}") + import traceback + logger.error(traceback.format_exc()) + return {"success": False, "error": f"Browser was closed and cannot be recovered: {reinit_error}"} + + try: + if cmd == "visit_url": + url = params.get("url") + if not url: + return {"success": False, "error": "url parameter is required"} + await self.page.goto(url, wait_until="domcontentloaded", timeout=30000) + return {"success": True, "url": self.page.url} + + elif cmd == "click": + x = params.get("x") + y = params.get("y") + if x is None or y is None: + return {"success": False, "error": "x and y parameters are required"} + await self.page.mouse.click(x, y) + return {"success": True} + + elif cmd == "type": + text = params.get("text") + if text is None: + return {"success": False, "error": "text parameter is required"} + await self.page.keyboard.type(text) + return {"success": True} + + elif cmd == "scroll": + delta_x = params.get("delta_x", 0) + delta_y = params.get("delta_y", 0) + await self.page.mouse.wheel(delta_x, delta_y) + return {"success": True} + + elif cmd == "web_search": + query = params.get("query") + if not query: + return {"success": False, "error": "query parameter is required"} + # Navigate to Google search + search_url = f"https://www.google.com/search?q={query}" + await self.page.goto(search_url, wait_until="domcontentloaded", timeout=30000) + return {"success": True, "url": self.page.url} + + else: + return {"success": False, "error": f"Unknown command: {cmd}"} + + except Exception as e: + logger.error(f"Error executing command {cmd}: {e}") + import traceback + logger.error(traceback.format_exc()) + # If page was closed due to error, try to recover + if "closed" in str(e).lower() and self.context: + try: + pages = self.context.pages + if pages: + self.page = pages[0] + logger.info("Recovered page after error") + else: + self.page = await self.context.new_page() + logger.info("Created new page after error") + except Exception as recover_error: + logger.error(f"Failed to recover page: {recover_error}") + return {"success": False, "error": str(e)} + + async def close(self): + """Close the browser and cleanup resources.""" + async with self._lock: + try: + if self.context: + await self.context.close() + self.context = None + if self.browser: + await self.browser.close() + self.browser = None + + if self.playwright: + await self.playwright.stop() + self.playwright = None + + self.page = None + self._initialized = False + logger.info("Browser closed successfully") + except Exception as e: + logger.error(f"Error closing browser: {e}") + + +# Global instance +_browser_manager: Optional[BrowserManager] = None + + +def get_browser_manager() -> BrowserManager: + """Get or create the global BrowserManager instance.""" + global _browser_manager + if _browser_manager is None: + _browser_manager = BrowserManager() + return _browser_manager + diff --git a/libs/python/computer-server/computer_server/main.py b/libs/python/computer-server/computer_server/main.py index 3ae97ebc..9bad59bf 100644 --- a/libs/python/computer-server/computer_server/main.py +++ b/libs/python/computer-server/computer_server/main.py @@ -25,6 +25,7 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, StreamingResponse from .handlers.factory import HandlerFactory +from .browser import get_browser_manager # Authentication session TTL (in seconds). Override via env var CUA_AUTH_TTL_SECONDS. Default: 60s AUTH_SESSION_TTL_SECONDS: int = int(os.environ.get("CUA_AUTH_TTL_SECONDS", "60")) @@ -749,5 +750,71 @@ async def agent_response_endpoint( return JSONResponse(content=payload, headers=headers) +@app.post("/playwright_exec") +async def playwright_exec_endpoint( + request: Request, + container_name: Optional[str] = Header(None, alias="X-Container-Name"), + api_key: Optional[str] = Header(None, alias="X-API-Key"), +): + """ + Execute Playwright browser commands. + + Headers: + - X-Container-Name: Container name for cloud authentication + - X-API-Key: API key for cloud authentication + + Body: + { + "command": "visit_url|click|type|scroll|web_search", + "params": {...} + } + """ + # Parse request body + try: + body = await request.json() + command = body.get("command") + params = body.get("params", {}) + except Exception as e: + raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}") + + if not command: + raise HTTPException(status_code=400, detail="Command is required") + + # Check if CONTAINER_NAME is set (indicating cloud provider) + server_container_name = os.environ.get("CONTAINER_NAME") + + # If cloud provider, perform authentication + if server_container_name: + logger.info( + f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication..." + ) + + # Validate required headers + if not container_name: + raise HTTPException(status_code=401, detail="Container name required") + + if not api_key: + raise HTTPException(status_code=401, detail="API key required") + + # Validate with AuthenticationManager + is_authenticated = await auth_manager.auth(container_name, api_key) + if not is_authenticated: + raise HTTPException(status_code=401, detail="Authentication failed") + + # Get browser manager and execute command + try: + browser_manager = get_browser_manager() + result = await browser_manager.execute_command(command, params) + + if result.get("success"): + return JSONResponse(content=result) + else: + raise HTTPException(status_code=400, detail=result.get("error", "Command failed")) + except Exception as e: + logger.error(f"Error executing playwright command: {str(e)}") + logger.error(traceback.format_exc()) + raise HTTPException(status_code=500, detail=str(e)) + + if __name__ == "__main__": uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/libs/python/computer-server/pyproject.toml b/libs/python/computer-server/pyproject.toml index 7bae1e06..75ff49f5 100644 --- a/libs/python/computer-server/pyproject.toml +++ b/libs/python/computer-server/pyproject.toml @@ -24,6 +24,7 @@ dependencies = [ "pyperclip>=1.9.0", "websockets>=12.0", "pywinctl>=0.4.1", + "playwright>=1.40.0", # OS-specific runtime deps "pyobjc-framework-Cocoa>=10.1; sys_platform == 'darwin'", "pyobjc-framework-Quartz>=10.1; sys_platform == 'darwin'", diff --git a/libs/xfce/Dockerfile b/libs/xfce/Dockerfile index e83f6bd2..43dab80f 100644 --- a/libs/xfce/Dockerfile +++ b/libs/xfce/Dockerfile @@ -107,6 +107,16 @@ RUN mkdir -p /home/cua/.cache && \ # Install computer-server using Python 3.12 pip RUN python3.12 -m pip install cua-computer-server +# Copy browser.py and updated main.py from local source (to include browser tool) +# These files need to be in the same directory as the Dockerfile when building +COPY browser.py /tmp/browser.py +COPY main.py /tmp/main.py +RUN python3.12 -c "import shutil; import os; cs_dir = '/usr/local/lib/python3.12/dist-packages/computer_server'; shutil.copy('/tmp/browser.py', f'{cs_dir}/browser.py'); shutil.copy('/tmp/main.py', f'{cs_dir}/main.py'); print('Copied browser.py and main.py')" && rm /tmp/browser.py /tmp/main.py + +# Install playwright and Firefox dependencies +RUN python3.12 -m pip install playwright && \ + python3.12 -m playwright install --with-deps firefox + # Fix any cache files created by pip RUN chown -R cua:cua /home/cua/.cache diff --git a/libs/xfce/README_BUILD.md b/libs/xfce/README_BUILD.md new file mode 100644 index 00000000..d6f6a7d4 --- /dev/null +++ b/libs/xfce/README_BUILD.md @@ -0,0 +1,32 @@ +# Building the XFCE Docker Image + +## Required Files for Build + +The Dockerfile requires these files to be present in the `libs/xfce/` directory: + +- `browser.py` - Copy from `libs/python/computer-server/computer_server/browser.py` +- `main.py` - Copy from `libs/python/computer-server/computer_server/main.py` + +These files are copied into the container to include the browser tool functionality +that isn't yet in the published PyPI package. + +## Before Building + +```bash +# Copy the latest browser tool files +cp libs/python/computer-server/computer_server/browser.py libs/xfce/ +cp libs/python/computer-server/computer_server/main.py libs/xfce/ +``` + +## Build Command + +```bash +cd libs/xfce +docker build -t cua-xfce . +``` + +## Note + +Once the browser tool is included in the published `cua-computer-server` package, +these temporary file copies can be removed and the Dockerfile can be simplified. + diff --git a/libs/xfce/browser.py b/libs/xfce/browser.py new file mode 100644 index 00000000..3d0a4c69 --- /dev/null +++ b/libs/xfce/browser.py @@ -0,0 +1,308 @@ +""" +Browser manager using Playwright for programmatic browser control. +This allows agents to control a browser that runs visibly on the XFCE desktop. +""" + +import asyncio +import logging +import os +from typing import Any, Dict, Optional + +try: + from playwright.async_api import async_playwright, Browser, BrowserContext, Page +except ImportError: + async_playwright = None + Browser = None + BrowserContext = None + Page = None + +logger = logging.getLogger(__name__) + + +class BrowserManager: + """ + Manages a Playwright browser instance that runs visibly on the XFCE desktop. + Uses persistent context to maintain cookies and sessions. + """ + + def __init__(self): + """Initialize the BrowserManager.""" + self.playwright = None + self.browser: Optional[Browser] = None + self.context: Optional[BrowserContext] = None + self.page: Optional[Page] = None + self._initialized = False + self._initialization_error: Optional[str] = None + self._lock = asyncio.Lock() + + async def _ensure_initialized(self): + """Ensure the browser is initialized.""" + # Check if browser was closed and needs reinitialization + if self._initialized: + try: + # Check if context is still valid by trying to access it + if self.context: + # Try to get pages - this will raise if context is closed + _ = self.context.pages + # If we get here, context is still alive + return + else: + # Context was closed, need to reinitialize + self._initialized = False + logger.warning("Browser context was closed, will reinitialize...") + except Exception as e: + # Context is dead, need to reinitialize + logger.warning(f"Browser context is dead ({e}), will reinitialize...") + self._initialized = False + self.context = None + self.page = None + # Clean up playwright if it exists + if self.playwright: + try: + await self.playwright.stop() + except Exception: + pass + self.playwright = None + + async with self._lock: + # Double-check after acquiring lock (another thread might have initialized it) + if self._initialized: + try: + if self.context: + _ = self.context.pages + return + except Exception: + self._initialized = False + self.context = None + self.page = None + if self.playwright: + try: + await self.playwright.stop() + except Exception: + pass + self.playwright = None + + if async_playwright is None: + raise RuntimeError( + "playwright is not installed. Please install it with: pip install playwright && playwright install --with-deps firefox" + ) + + try: + # Get display from environment or default to :1 + display = os.environ.get("DISPLAY", ":1") + logger.info(f"Initializing browser with DISPLAY={display}") + + # Start playwright + self.playwright = await async_playwright().start() + + # Launch Firefox with persistent context (keeps cookies/sessions) + # headless=False is CRITICAL so the visual agent can see it + user_data_dir = os.path.join(os.path.expanduser("~"), ".playwright-firefox") + os.makedirs(user_data_dir, exist_ok=True) + + # launch_persistent_context returns a BrowserContext, not a Browser + # Note: Removed --kiosk mode so the desktop remains visible + self.context = await self.playwright.firefox.launch_persistent_context( + user_data_dir=user_data_dir, + headless=False, # CRITICAL: visible for visual agent + viewport={"width": 1024, "height": 768}, + # Removed --kiosk to allow desktop visibility + ) + + # Get the first page or create one + pages = self.context.pages + if pages: + self.page = pages[0] + else: + self.page = await self.context.new_page() + + self._initialized = True + logger.info("Browser initialized successfully") + + except Exception as e: + logger.error(f"Failed to initialize browser: {e}") + import traceback + logger.error(traceback.format_exc()) + # Don't raise - return error in execute_command instead + self._initialization_error = str(e) + raise + + async def execute_command(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]: + """ + Execute a browser command. + + Args: + cmd: Command name (visit_url, click, type, scroll, web_search) + params: Command parameters + + Returns: + Result dictionary with success status and any data + """ + try: + await self._ensure_initialized() + except Exception as e: + error_msg = getattr(self, '_initialization_error', None) or str(e) + logger.error(f"Browser initialization failed: {error_msg}") + return { + "success": False, + "error": f"Browser initialization failed: {error_msg}. " + f"Make sure Playwright and Firefox are installed, and DISPLAY is set correctly." + } + + # Ensure browser is still initialized (in case it was manually closed) + # This will automatically reinitialize if the browser was closed + await self._ensure_initialized() + + # Check if page is still valid + page_valid = False + try: + if self.page is not None: + # Try to access page.url to check if it's still valid + _ = self.page.url + page_valid = True + except Exception as e: + logger.warning(f"Page is invalid: {e}, will get a new page...") + self.page = None + + # Get a valid page if we don't have one + if not page_valid or self.page is None: + try: + pages = self.context.pages + if pages: + # Find first non-closed page + for p in pages: + try: + if not p.is_closed(): + self.page = p + logger.info("Reusing existing open page") + page_valid = True + break + except Exception: + continue + + # If no valid page found, create a new one + if not page_valid: + self.page = await self.context.new_page() + logger.info("Created new page") + except Exception as e: + logger.error(f"Failed to get new page: {e}, browser may be closed") + # Browser was closed - reinitialize it + try: + logger.info("Browser was closed manually, reinitializing...") + self._initialized = False + self.context = None + self.page = None + if self.playwright: + try: + await self.playwright.stop() + except Exception: + pass + self.playwright = None + + # Reinitialize + await self._ensure_initialized() + # Get or create a page + pages = self.context.pages + if pages: + self.page = pages[0] + else: + self.page = await self.context.new_page() + logger.info("Browser reopened successfully after manual closure") + except Exception as reinit_error: + logger.error(f"Failed to reinitialize browser: {reinit_error}") + import traceback + logger.error(traceback.format_exc()) + return {"success": False, "error": f"Browser was closed and cannot be recovered: {reinit_error}"} + + try: + if cmd == "visit_url": + url = params.get("url") + if not url: + return {"success": False, "error": "url parameter is required"} + await self.page.goto(url, wait_until="domcontentloaded", timeout=30000) + return {"success": True, "url": self.page.url} + + elif cmd == "click": + x = params.get("x") + y = params.get("y") + if x is None or y is None: + return {"success": False, "error": "x and y parameters are required"} + await self.page.mouse.click(x, y) + return {"success": True} + + elif cmd == "type": + text = params.get("text") + if text is None: + return {"success": False, "error": "text parameter is required"} + await self.page.keyboard.type(text) + return {"success": True} + + elif cmd == "scroll": + delta_x = params.get("delta_x", 0) + delta_y = params.get("delta_y", 0) + await self.page.mouse.wheel(delta_x, delta_y) + return {"success": True} + + elif cmd == "web_search": + query = params.get("query") + if not query: + return {"success": False, "error": "query parameter is required"} + # Navigate to Google search + search_url = f"https://www.google.com/search?q={query}" + await self.page.goto(search_url, wait_until="domcontentloaded", timeout=30000) + return {"success": True, "url": self.page.url} + + else: + return {"success": False, "error": f"Unknown command: {cmd}"} + + except Exception as e: + logger.error(f"Error executing command {cmd}: {e}") + import traceback + logger.error(traceback.format_exc()) + # If page was closed due to error, try to recover + if "closed" in str(e).lower() and self.context: + try: + pages = self.context.pages + if pages: + self.page = pages[0] + logger.info("Recovered page after error") + else: + self.page = await self.context.new_page() + logger.info("Created new page after error") + except Exception as recover_error: + logger.error(f"Failed to recover page: {recover_error}") + return {"success": False, "error": str(e)} + + async def close(self): + """Close the browser and cleanup resources.""" + async with self._lock: + try: + if self.context: + await self.context.close() + self.context = None + if self.browser: + await self.browser.close() + self.browser = None + + if self.playwright: + await self.playwright.stop() + self.playwright = None + + self.page = None + self._initialized = False + logger.info("Browser closed successfully") + except Exception as e: + logger.error(f"Error closing browser: {e}") + + +# Global instance +_browser_manager: Optional[BrowserManager] = None + + +def get_browser_manager() -> BrowserManager: + """Get or create the global BrowserManager instance.""" + global _browser_manager + if _browser_manager is None: + _browser_manager = BrowserManager() + return _browser_manager + diff --git a/libs/xfce/main.py b/libs/xfce/main.py new file mode 100644 index 00000000..9bad59bf --- /dev/null +++ b/libs/xfce/main.py @@ -0,0 +1,820 @@ +import asyncio +import hashlib +import inspect +import json +import logging +import os +import platform +import time +import traceback +from contextlib import redirect_stderr, redirect_stdout +from io import StringIO +from typing import Any, Dict, List, Literal, Optional, Union, cast + +import aiohttp +import uvicorn +from fastapi import ( + FastAPI, + Header, + HTTPException, + Request, + WebSocket, + WebSocketDisconnect, +) +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import JSONResponse, StreamingResponse + +from .handlers.factory import HandlerFactory +from .browser import get_browser_manager + +# Authentication session TTL (in seconds). Override via env var CUA_AUTH_TTL_SECONDS. Default: 60s +AUTH_SESSION_TTL_SECONDS: int = int(os.environ.get("CUA_AUTH_TTL_SECONDS", "60")) + +try: + from agent import ComputerAgent + + HAS_AGENT = True +except ImportError: + HAS_AGENT = False + +# Set up logging with more detail +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +# Configure WebSocket with larger message size +WEBSOCKET_MAX_SIZE = 1024 * 1024 * 10 # 10MB limit + +# Configure application with WebSocket settings +app = FastAPI( + title="Computer API", + description="API for the Computer project", + version="0.1.0", + websocket_max_size=WEBSOCKET_MAX_SIZE, +) + +# CORS configuration +origins = ["*"] +app.add_middleware( + CORSMiddleware, + allow_origins=origins, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +protocol_version = 1 +try: + from importlib.metadata import version + + package_version = version("cua-computer-server") +except Exception: + # Fallback for cases where package is not installed or importlib.metadata is not available + try: + import pkg_resources + + package_version = pkg_resources.get_distribution("cua-computer-server").version + except Exception: + package_version = "unknown" + +( + accessibility_handler, + automation_handler, + diorama_handler, + file_handler, + desktop_handler, + window_handler, +) = HandlerFactory.create_handlers() +handlers = { + "version": lambda: {"protocol": protocol_version, "package": package_version}, + # App-Use commands + "diorama_cmd": diorama_handler.diorama_cmd, + # Accessibility commands + "get_accessibility_tree": accessibility_handler.get_accessibility_tree, + "find_element": accessibility_handler.find_element, + # Shell commands + "run_command": automation_handler.run_command, + # File system commands + "file_exists": file_handler.file_exists, + "directory_exists": file_handler.directory_exists, + "list_dir": file_handler.list_dir, + "read_text": file_handler.read_text, + "write_text": file_handler.write_text, + "read_bytes": file_handler.read_bytes, + "write_bytes": file_handler.write_bytes, + "get_file_size": file_handler.get_file_size, + "delete_file": file_handler.delete_file, + "create_dir": file_handler.create_dir, + "delete_dir": file_handler.delete_dir, + # Desktop commands + "get_desktop_environment": desktop_handler.get_desktop_environment, + "set_wallpaper": desktop_handler.set_wallpaper, + # Window management + "open": window_handler.open, + "launch": window_handler.launch, + "get_current_window_id": window_handler.get_current_window_id, + "get_application_windows": window_handler.get_application_windows, + "get_window_name": window_handler.get_window_name, + "get_window_size": window_handler.get_window_size, + "get_window_position": window_handler.get_window_position, + "set_window_size": window_handler.set_window_size, + "set_window_position": window_handler.set_window_position, + "maximize_window": window_handler.maximize_window, + "minimize_window": window_handler.minimize_window, + "activate_window": window_handler.activate_window, + "close_window": window_handler.close_window, + # Mouse commands + "mouse_down": automation_handler.mouse_down, + "mouse_up": automation_handler.mouse_up, + "left_click": automation_handler.left_click, + "right_click": automation_handler.right_click, + "double_click": automation_handler.double_click, + "move_cursor": automation_handler.move_cursor, + "drag_to": automation_handler.drag_to, + "drag": automation_handler.drag, + # Keyboard commands + "key_down": automation_handler.key_down, + "key_up": automation_handler.key_up, + "type_text": automation_handler.type_text, + "press_key": automation_handler.press_key, + "hotkey": automation_handler.hotkey, + # Scrolling actions + "scroll": automation_handler.scroll, + "scroll_down": automation_handler.scroll_down, + "scroll_up": automation_handler.scroll_up, + # Screen actions + "screenshot": automation_handler.screenshot, + "get_cursor_position": automation_handler.get_cursor_position, + "get_screen_size": automation_handler.get_screen_size, + # Clipboard actions + "copy_to_clipboard": automation_handler.copy_to_clipboard, + "set_clipboard": automation_handler.set_clipboard, +} + + +class AuthenticationManager: + def __init__(self): + self.sessions: Dict[str, Dict[str, Any]] = {} + self.container_name = os.environ.get("CONTAINER_NAME") + + def _hash_credentials(self, container_name: str, api_key: str) -> str: + """Create a hash of container name and API key for session identification""" + combined = f"{container_name}:{api_key}" + return hashlib.sha256(combined.encode()).hexdigest() + + def _is_session_valid(self, session_data: Dict[str, Any]) -> bool: + """Check if a session is still valid based on expiration time""" + if not session_data.get("valid", False): + return False + + expires_at = session_data.get("expires_at", 0) + return time.time() < expires_at + + async def auth(self, container_name: str, api_key: str) -> bool: + """Authenticate container name and API key, using cached sessions when possible""" + # If no CONTAINER_NAME is set, always allow access (local development) + if not self.container_name: + logger.info( + "No CONTAINER_NAME set in environment. Allowing access (local development mode)" + ) + return True + + # Layer 1: VM Identity Verification + if container_name != self.container_name: + logger.warning( + f"VM name mismatch. Expected: {self.container_name}, Got: {container_name}" + ) + return False + + # Create hash for session lookup + session_hash = self._hash_credentials(container_name, api_key) + + # Check if we have a valid cached session + if session_hash in self.sessions: + session_data = self.sessions[session_hash] + if self._is_session_valid(session_data): + logger.info(f"Using cached authentication for container: {container_name}") + return session_data["valid"] + else: + # Remove expired session + del self.sessions[session_hash] + + # No valid cached session, authenticate with API + logger.info(f"Authenticating with TryCUA API for container: {container_name}") + + try: + async with aiohttp.ClientSession() as session: + headers = {"Authorization": f"Bearer {api_key}"} + + async with session.get( + f"https://www.cua.ai/api/vm/auth?container_name={container_name}", + headers=headers, + ) as resp: + is_valid = resp.status == 200 and bool((await resp.text()).strip()) + + # Cache the result with configurable expiration + self.sessions[session_hash] = { + "valid": is_valid, + "expires_at": time.time() + AUTH_SESSION_TTL_SECONDS, + } + + if is_valid: + logger.info(f"Authentication successful for container: {container_name}") + else: + logger.warning( + f"Authentication failed for container: {container_name}. Status: {resp.status}" + ) + + return is_valid + + except aiohttp.ClientError as e: + logger.error(f"Failed to validate API key with TryCUA API: {str(e)}") + # Cache failed result to avoid repeated requests + self.sessions[session_hash] = { + "valid": False, + "expires_at": time.time() + AUTH_SESSION_TTL_SECONDS, + } + return False + except Exception as e: + logger.error(f"Unexpected error during authentication: {str(e)}") + # Cache failed result to avoid repeated requests + self.sessions[session_hash] = { + "valid": False, + "expires_at": time.time() + AUTH_SESSION_TTL_SECONDS, + } + return False + + +class ConnectionManager: + def __init__(self): + self.active_connections: List[WebSocket] = [] + + async def connect(self, websocket: WebSocket): + await websocket.accept() + self.active_connections.append(websocket) + + def disconnect(self, websocket: WebSocket): + self.active_connections.remove(websocket) + + +manager = ConnectionManager() +auth_manager = AuthenticationManager() + + +@app.get("/status") +async def status(): + sys = platform.system().lower() + # get os type + if "darwin" in sys or sys == "macos" or sys == "mac": + os_type = "macos" + elif "windows" in sys: + os_type = "windows" + else: + os_type = "linux" + # get computer-server features + features = [] + if HAS_AGENT: + features.append("agent") + return {"status": "ok", "os_type": os_type, "features": features} + + +@app.websocket("/ws", name="websocket_endpoint") +async def websocket_endpoint(websocket: WebSocket): + global handlers + + # WebSocket message size is configured at the app or endpoint level, not on the instance + await manager.connect(websocket) + + # Check if CONTAINER_NAME is set (indicating cloud provider) + server_container_name = os.environ.get("CONTAINER_NAME") + + # If cloud provider, perform authentication handshake + if server_container_name: + try: + logger.info( + f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Waiting for authentication..." + ) + + # Wait for authentication message + auth_data = await websocket.receive_json() + + # Validate auth message format + if auth_data.get("command") != "authenticate": + await websocket.send_json( + {"success": False, "error": "First message must be authentication"} + ) + await websocket.close() + manager.disconnect(websocket) + return + + # Extract credentials + client_api_key = auth_data.get("params", {}).get("api_key") + client_container_name = auth_data.get("params", {}).get("container_name") + + # Validate credentials using AuthenticationManager + if not client_api_key: + await websocket.send_json({"success": False, "error": "API key required"}) + await websocket.close() + manager.disconnect(websocket) + return + + if not client_container_name: + await websocket.send_json({"success": False, "error": "Container name required"}) + await websocket.close() + manager.disconnect(websocket) + return + + # Use AuthenticationManager for validation + is_authenticated = await auth_manager.auth(client_container_name, client_api_key) + if not is_authenticated: + await websocket.send_json({"success": False, "error": "Authentication failed"}) + await websocket.close() + manager.disconnect(websocket) + return + + logger.info(f"Authentication successful for VM: {client_container_name}") + await websocket.send_json({"success": True, "message": "Authentication successful"}) + + except Exception as e: + logger.error(f"Error during authentication handshake: {str(e)}") + await websocket.send_json({"success": False, "error": "Authentication failed"}) + await websocket.close() + manager.disconnect(websocket) + return + + try: + while True: + try: + data = await websocket.receive_json() + command = data.get("command") + params = data.get("params", {}) + + if command not in handlers: + await websocket.send_json( + {"success": False, "error": f"Unknown command: {command}"} + ) + continue + + try: + # Filter params to only include those accepted by the handler function + handler_func = handlers[command] + sig = inspect.signature(handler_func) + filtered_params = {k: v for k, v in params.items() if k in sig.parameters} + + # Handle both sync and async functions + if asyncio.iscoroutinefunction(handler_func): + result = await handler_func(**filtered_params) + else: + # Run sync functions in thread pool to avoid blocking event loop + result = await asyncio.to_thread(handler_func, **filtered_params) + await websocket.send_json({"success": True, **result}) + except Exception as cmd_error: + logger.error(f"Error executing command {command}: {str(cmd_error)}") + logger.error(traceback.format_exc()) + await websocket.send_json({"success": False, "error": str(cmd_error)}) + + except WebSocketDisconnect: + raise + except json.JSONDecodeError as json_err: + logger.error(f"JSON decode error: {str(json_err)}") + await websocket.send_json( + {"success": False, "error": f"Invalid JSON: {str(json_err)}"} + ) + except Exception as loop_error: + logger.error(f"Error in message loop: {str(loop_error)}") + logger.error(traceback.format_exc()) + await websocket.send_json({"success": False, "error": str(loop_error)}) + + except WebSocketDisconnect: + logger.info("Client disconnected") + manager.disconnect(websocket) + except Exception as e: + logger.error(f"Fatal error in websocket connection: {str(e)}") + logger.error(traceback.format_exc()) + try: + await websocket.close() + except: + pass + manager.disconnect(websocket) + + +@app.post("/cmd") +async def cmd_endpoint( + request: Request, + container_name: Optional[str] = Header(None, alias="X-Container-Name"), + api_key: Optional[str] = Header(None, alias="X-API-Key"), +): + """ + Backup endpoint for when WebSocket connections fail. + Accepts commands via HTTP POST with streaming response. + + Headers: + - X-Container-Name: Container name for cloud authentication + - X-API-Key: API key for cloud authentication + + Body: + { + "command": "command_name", + "params": {...} + } + """ + global handlers + + # Parse request body + try: + body = await request.json() + command = body.get("command") + params = body.get("params", {}) + except Exception as e: + raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}") + + if not command: + raise HTTPException(status_code=400, detail="Command is required") + + # Check if CONTAINER_NAME is set (indicating cloud provider) + server_container_name = os.environ.get("CONTAINER_NAME") + + # If cloud provider, perform authentication + if server_container_name: + logger.info( + f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication..." + ) + + # Validate required headers + if not container_name: + raise HTTPException(status_code=401, detail="Container name required") + + if not api_key: + raise HTTPException(status_code=401, detail="API key required") + + # Validate with AuthenticationManager + is_authenticated = await auth_manager.auth(container_name, api_key) + if not is_authenticated: + raise HTTPException(status_code=401, detail="Authentication failed") + + if command not in handlers: + raise HTTPException(status_code=400, detail=f"Unknown command: {command}") + + async def generate_response(): + """Generate streaming response for the command execution""" + try: + # Filter params to only include those accepted by the handler function + handler_func = handlers[command] + sig = inspect.signature(handler_func) + filtered_params = {k: v for k, v in params.items() if k in sig.parameters} + + # Handle both sync and async functions + if asyncio.iscoroutinefunction(handler_func): + result = await handler_func(**filtered_params) + else: + # Run sync functions in thread pool to avoid blocking event loop + result = await asyncio.to_thread(handler_func, **filtered_params) + + # Stream the successful result + response_data = {"success": True, **result} + yield f"data: {json.dumps(response_data)}\n\n" + + except Exception as cmd_error: + logger.error(f"Error executing command {command}: {str(cmd_error)}") + logger.error(traceback.format_exc()) + + # Stream the error result + error_data = {"success": False, "error": str(cmd_error)} + yield f"data: {json.dumps(error_data)}\n\n" + + return StreamingResponse( + generate_response(), + media_type="text/plain", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + }, + ) + + +@app.post("/responses") +async def agent_response_endpoint( + request: Request, + api_key: Optional[str] = Header(None, alias="X-API-Key"), +): + """ + Minimal proxy to run ComputerAgent for up to 2 turns. + + Security: + - If CONTAINER_NAME is set on the server, require X-API-Key + and validate using AuthenticationManager unless CUA_ENABLE_PUBLIC_PROXY is true. + + Body JSON: + { + "model": "...", # required + "input": "... or messages[]", # required + "agent_kwargs": { ... }, # optional, passed directly to ComputerAgent + "env": { ... } # optional env overrides for agent + } + """ + if not HAS_AGENT: + raise HTTPException(status_code=501, detail="ComputerAgent not available") + + # Authenticate via AuthenticationManager if running in cloud (CONTAINER_NAME set) + container_name = os.environ.get("CONTAINER_NAME") + if container_name: + is_public = os.environ.get("CUA_ENABLE_PUBLIC_PROXY", "").lower().strip() in [ + "1", + "true", + "yes", + "y", + "on", + ] + if not is_public: + if not api_key: + raise HTTPException(status_code=401, detail="Missing AGENT PROXY auth headers") + ok = await auth_manager.auth(container_name, api_key) + if not ok: + raise HTTPException(status_code=401, detail="Unauthorized") + + # Parse request body + try: + body = await request.json() + except Exception as e: + raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}") + + model = body.get("model") + input_data = body.get("input") + if not model or input_data is None: + raise HTTPException(status_code=400, detail="'model' and 'input' are required") + + agent_kwargs: Dict[str, Any] = body.get("agent_kwargs") or {} + env_overrides: Dict[str, str] = body.get("env") or {} + + # Simple env override context + class _EnvOverride: + def __init__(self, overrides: Dict[str, str]): + self.overrides = overrides + self._original: Dict[str, Optional[str]] = {} + + def __enter__(self): + for k, v in (self.overrides or {}).items(): + self._original[k] = os.environ.get(k) + os.environ[k] = str(v) + + def __exit__(self, exc_type, exc, tb): + for k, old in self._original.items(): + if old is None: + os.environ.pop(k, None) + else: + os.environ[k] = old + + # Convert input to messages + def _to_messages(data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]: + if isinstance(data, str): + return [{"role": "user", "content": data}] + if isinstance(data, list): + return data + + messages = _to_messages(input_data) + + # Define a direct computer tool that implements the AsyncComputerHandler protocol + # and delegates to our existing automation/file/accessibility handlers. + from agent.computers import AsyncComputerHandler # runtime-checkable Protocol + + class DirectComputer(AsyncComputerHandler): + def __init__(self): + # use module-scope handler singletons created by HandlerFactory + self._auto = automation_handler + self._file = file_handler + self._access = accessibility_handler + + async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]: + sys = platform.system().lower() + if "darwin" in sys or sys in ("macos", "mac"): + return "mac" + if "windows" in sys: + return "windows" + return "linux" + + async def get_dimensions(self) -> tuple[int, int]: + size = await self._auto.get_screen_size() + return size["width"], size["height"] + + async def screenshot(self) -> str: + img_b64 = await self._auto.screenshot() + return img_b64["image_data"] + + async def click(self, x: int, y: int, button: str = "left") -> None: + if button == "left": + await self._auto.left_click(x, y) + elif button == "right": + await self._auto.right_click(x, y) + else: + await self._auto.left_click(x, y) + + async def double_click(self, x: int, y: int) -> None: + await self._auto.double_click(x, y) + + async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: + await self._auto.move_cursor(x, y) + await self._auto.scroll(scroll_x, scroll_y) + + async def type(self, text: str) -> None: + await self._auto.type_text(text) + + async def wait(self, ms: int = 1000) -> None: + await asyncio.sleep(ms / 1000.0) + + async def move(self, x: int, y: int) -> None: + await self._auto.move_cursor(x, y) + + async def keypress(self, keys: Union[List[str], str]) -> None: + if isinstance(keys, str): + parts = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys] + else: + parts = keys + if len(parts) == 1: + await self._auto.press_key(parts[0]) + else: + await self._auto.hotkey(parts) + + async def drag(self, path: List[Dict[str, int]]) -> None: + if not path: + return + start = path[0] + await self._auto.mouse_down(start["x"], start["y"]) + for pt in path[1:]: + await self._auto.move_cursor(pt["x"], pt["y"]) + end = path[-1] + await self._auto.mouse_up(end["x"], end["y"]) + + async def get_current_url(self) -> str: + # Not available in this server context + return "" + + async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None: + await self._auto.mouse_down(x, y, button="left") + + async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None: + await self._auto.mouse_up(x, y, button="left") + + # # Inline image URLs to base64 + # import base64, mimetypes, requests + # # Use a browser-like User-Agent to avoid 403s from some CDNs (e.g., Wikimedia) + # HEADERS = { + # "User-Agent": ( + # "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + # "AppleWebKit/537.36 (KHTML, like Gecko) " + # "Chrome/124.0.0.0 Safari/537.36" + # ) + # } + # def _to_data_url(content_bytes: bytes, url: str, resp: requests.Response) -> str: + # ctype = resp.headers.get("Content-Type") or mimetypes.guess_type(url)[0] or "application/octet-stream" + # b64 = base64.b64encode(content_bytes).decode("utf-8") + # return f"data:{ctype};base64,{b64}" + # def inline_image_urls(messages): + # # messages: List[{"role": "...","content":[...]}] + # out = [] + # for m in messages: + # if not isinstance(m.get("content"), list): + # out.append(m) + # continue + # new_content = [] + # for part in (m.get("content") or []): + # if part.get("type") == "input_image" and (url := part.get("image_url")): + # resp = requests.get(url, headers=HEADERS, timeout=30) + # resp.raise_for_status() + # new_content.append({ + # "type": "input_image", + # "image_url": _to_data_url(resp.content, url, resp) + # }) + # else: + # new_content.append(part) + # out.append({**m, "content": new_content}) + # return out + # messages = inline_image_urls(messages) + + error = None + + with _EnvOverride(env_overrides): + # Prepare tools: if caller did not pass tools, inject our DirectComputer + tools = agent_kwargs.get("tools") + if not tools: + tools = [DirectComputer()] + agent_kwargs = {**agent_kwargs, "tools": tools} + # Instantiate agent with our tools + agent = ComputerAgent(model=model, **agent_kwargs) # type: ignore[arg-type] + + total_output: List[Any] = [] + total_usage: Dict[str, Any] = {} + + pending_computer_call_ids = set() + try: + async for result in agent.run(messages): + total_output += result["output"] + # Try to collect usage if present + if ( + isinstance(result, dict) + and "usage" in result + and isinstance(result["usage"], dict) + ): + # Merge usage counters + for k, v in result["usage"].items(): + if isinstance(v, (int, float)): + total_usage[k] = total_usage.get(k, 0) + v + else: + total_usage[k] = v + for msg in result.get("output", []): + if msg.get("type") == "computer_call": + pending_computer_call_ids.add(msg["call_id"]) + elif msg.get("type") == "computer_call_output": + pending_computer_call_ids.discard(msg["call_id"]) + # exit if no pending computer calls + if not pending_computer_call_ids: + break + except Exception as e: + logger.error(f"Error running agent: {str(e)}") + logger.error(traceback.format_exc()) + error = str(e) + + # Build response payload + payload = { + "model": model, + "error": error, + "output": total_output, + "usage": total_usage, + "status": "completed" if not error else "failed", + } + + # CORS: allow any origin + headers = { + "Cache-Control": "no-cache", + "Connection": "keep-alive", + } + + return JSONResponse(content=payload, headers=headers) + + +@app.post("/playwright_exec") +async def playwright_exec_endpoint( + request: Request, + container_name: Optional[str] = Header(None, alias="X-Container-Name"), + api_key: Optional[str] = Header(None, alias="X-API-Key"), +): + """ + Execute Playwright browser commands. + + Headers: + - X-Container-Name: Container name for cloud authentication + - X-API-Key: API key for cloud authentication + + Body: + { + "command": "visit_url|click|type|scroll|web_search", + "params": {...} + } + """ + # Parse request body + try: + body = await request.json() + command = body.get("command") + params = body.get("params", {}) + except Exception as e: + raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}") + + if not command: + raise HTTPException(status_code=400, detail="Command is required") + + # Check if CONTAINER_NAME is set (indicating cloud provider) + server_container_name = os.environ.get("CONTAINER_NAME") + + # If cloud provider, perform authentication + if server_container_name: + logger.info( + f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication..." + ) + + # Validate required headers + if not container_name: + raise HTTPException(status_code=401, detail="Container name required") + + if not api_key: + raise HTTPException(status_code=401, detail="API key required") + + # Validate with AuthenticationManager + is_authenticated = await auth_manager.auth(container_name, api_key) + if not is_authenticated: + raise HTTPException(status_code=401, detail="Authentication failed") + + # Get browser manager and execute command + try: + browser_manager = get_browser_manager() + result = await browser_manager.execute_command(command, params) + + if result.get("success"): + return JSONResponse(content=result) + else: + raise HTTPException(status_code=400, detail=result.get("error", "Command failed")) + except Exception as e: + logger.error(f"Error executing playwright command: {str(e)}") + logger.error(traceback.format_exc()) + raise HTTPException(status_code=500, detail=str(e)) + + +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/libs/xfce/src/scripts/start-computer-server.sh b/libs/xfce/src/scripts/start-computer-server.sh index bc27a3db..1e52e536 100644 --- a/libs/xfce/src/scripts/start-computer-server.sh +++ b/libs/xfce/src/scripts/start-computer-server.sh @@ -10,4 +10,4 @@ echo "X server is ready" # Start computer-server export DISPLAY=:1 -python3 -m computer_server --port ${API_PORT:-8000} +python -m computer_server --port ${API_PORT:-8000}