From 37c5be669bf256a05350f8429540f7d483eaaccf Mon Sep 17 00:00:00 2001 From: Adam Date: Tue, 2 Dec 2025 12:53:25 -0500 Subject: [PATCH 1/8] dd browser tool with Playwright for visible browser automation Add browser tool with Playwright/Firefox support. Includes BrowserManager, /playwright_exec endpoint, BrowserTool client, and auto-recovery. Fixes Python version in startup script and adds Playwright to Docker build. --- examples/BROWSER_TOOL_README.md | 69 ++ examples/browser_tool_example.py | 96 ++ libs/python/agent/agent/tools/__init__.py | 6 + libs/python/agent/agent/tools/browser_tool.py | 143 +++ .../computer_server/browser.py | 308 +++++++ .../computer-server/computer_server/main.py | 67 ++ libs/python/computer-server/pyproject.toml | 1 + libs/xfce/Dockerfile | 10 + libs/xfce/README_BUILD.md | 32 + libs/xfce/browser.py | 308 +++++++ libs/xfce/main.py | 820 ++++++++++++++++++ .../xfce/src/scripts/start-computer-server.sh | 2 +- 12 files changed, 1861 insertions(+), 1 deletion(-) create mode 100644 examples/BROWSER_TOOL_README.md create mode 100644 examples/browser_tool_example.py create mode 100644 libs/python/agent/agent/tools/__init__.py create mode 100644 libs/python/agent/agent/tools/browser_tool.py create mode 100644 libs/python/computer-server/computer_server/browser.py create mode 100644 libs/xfce/README_BUILD.md create mode 100644 libs/xfce/browser.py create mode 100644 libs/xfce/main.py diff --git a/examples/BROWSER_TOOL_README.md b/examples/BROWSER_TOOL_README.md new file mode 100644 index 00000000..8d12ae85 --- /dev/null +++ b/examples/BROWSER_TOOL_README.md @@ -0,0 +1,69 @@ +# Browser Tool + +Browser automation tool that allows agents to control a Firefox browser programmatically via Playwright while keeping it visible on the XFCE desktop. + +## Quick Start + +### Using Docker (Recommended) + +```bash +# Build and run the container +cd libs/xfce +docker build -t cua-xfce . +docker run -d --name cua-xfce-test \ + -p 8000:8000 -p 5901:5901 -p 6901:6901 \ + -e DISPLAY=:1 \ + cua-xfce + +# View desktop: http://localhost:6901 +# Test the browser tool +python examples/browser_tool_example.py +``` + +### Local Testing + +```bash +# Install dependencies +pip install playwright +playwright install --with-deps firefox + +# Start server +python -m computer_server --port 8000 + +# Run test (in another terminal) +python examples/browser_tool_example.py +``` + +## Features + +- **Visible Browser**: Runs in non-headless mode so visual agents can see it +- **Auto-Recovery**: Automatically reopens browser if closed manually +- **Persistent Context**: Maintains cookies and sessions across commands +- **Fara/Magentic-One Interface**: Compatible with Microsoft agent interfaces + +## API Endpoint + +The browser tool is accessible via the `/playwright_exec` endpoint: + +```bash +curl -X POST http://localhost:8000/playwright_exec \ + -H "Content-Type: application/json" \ + -d '{"command": "visit_url", "params": {"url": "https://www.example.com"}}' +``` + +## Available Commands + +- `visit_url(url)` - Navigate to a URL +- `click(x, y)` - Click at coordinates +- `type(text)` - Type text into focused element +- `scroll(delta_x, delta_y)` - Scroll the page +- `web_search(query)` - Navigate to Google search + +## Troubleshooting + +**Browser closes unexpectedly**: The tool automatically reopens the browser on the next command. + +**Connection errors**: Make sure the server is running (`curl http://localhost:8000/status`). + +**Playwright not found**: Install with `pip install playwright && playwright install --with-deps firefox`. + diff --git a/examples/browser_tool_example.py b/examples/browser_tool_example.py new file mode 100644 index 00000000..9705ca8f --- /dev/null +++ b/examples/browser_tool_example.py @@ -0,0 +1,96 @@ +""" +Browser Tool Example + +Demonstrates how to use the BrowserTool to control a browser programmatically +via the computer server. The browser runs visibly on the XFCE desktop so visual +agents can see it. + +Prerequisites: + - Computer server running (Docker container or local) + - For Docker: Container should be running with browser tool support + - For local: Playwright and Firefox must be installed + +Usage: + python examples/browser_tool_example.py +""" + +import asyncio +import logging +import sys +from pathlib import Path + +# Import BrowserTool directly from the file +browser_tool_path = Path(__file__).parent.parent / "libs" / "python" / "agent" / "agent" / "tools" / "browser_tool.py" +sys.path.insert(0, str(browser_tool_path.parent.parent.parent)) + +# Import the module directly +import importlib.util +spec = importlib.util.spec_from_file_location("browser_tool", browser_tool_path) +if spec is None or spec.loader is None: + raise ImportError(f"Could not load browser_tool from {browser_tool_path}") +browser_tool_module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(browser_tool_module) +BrowserTool = browser_tool_module.BrowserTool + +# Configure logging to see what's happening +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +async def test_browser_tool(): + """Test the BrowserTool with various commands.""" + + # Initialize the browser tool + # For local testing, use http://localhost:8000 + # For cloud, provide base_url, api_key, and container_name + browser = BrowserTool(base_url="http://localhost:8000") + + logger.info("Testing Browser Tool...") + + try: + # Test 1: Visit a URL + logger.info("Test 1: Visiting a URL...") + result = await browser.visit_url("https://www.trycua.com") + logger.info(f"Visit URL result: {result}") + + # Wait a bit for the page to load + await asyncio.sleep(2) + + # Test 2: Web search + logger.info("Test 2: Performing a web search...") + result = await browser.web_search("Python programming") + logger.info(f"Web search result: {result}") + + # Wait a bit + await asyncio.sleep(2) + + # Test 3: Scroll + logger.info("Test 3: Scrolling the page...") + result = await browser.scroll(delta_x=0, delta_y=500) + logger.info(f"Scroll result: {result}") + + # Wait a bit + await asyncio.sleep(1) + + # Test 4: Click (example coordinates - adjust based on your screen) + logger.info("Test 4: Clicking at coordinates...") + result = await browser.click(x=500, y=300) + logger.info(f"Click result: {result}") + + # Wait a bit + await asyncio.sleep(1) + + # Test 5: Type text (if there's a focused input field) + logger.info("Test 5: Typing text...") + result = await browser.type("Hello from BrowserTool!") + logger.info(f"Type result: {result}") + + logger.info("All tests completed!") + + except Exception as e: + logger.error(f"Error during testing: {e}", exc_info=True) + + +if __name__ == "__main__": + asyncio.run(test_browser_tool()) + diff --git a/libs/python/agent/agent/tools/__init__.py b/libs/python/agent/agent/tools/__init__.py new file mode 100644 index 00000000..e663c557 --- /dev/null +++ b/libs/python/agent/agent/tools/__init__.py @@ -0,0 +1,6 @@ +"""Tools for agent interactions.""" + +from .browser_tool import BrowserTool + +__all__ = ["BrowserTool"] + diff --git a/libs/python/agent/agent/tools/browser_tool.py b/libs/python/agent/agent/tools/browser_tool.py new file mode 100644 index 00000000..8f8b1ab9 --- /dev/null +++ b/libs/python/agent/agent/tools/browser_tool.py @@ -0,0 +1,143 @@ +""" +Browser Tool for agent interactions. +Allows agents to control a browser programmatically via Playwright. +""" + +import logging +from typing import Optional + +import aiohttp + +logger = logging.getLogger(__name__) + + +class BrowserTool: + """ + Browser tool that connects to the computer server's Playwright endpoint. + Implements the Fara/Magentic-One agent interface for browser control. + """ + + def __init__( + self, + base_url: str = "http://localhost:8000", + api_key: Optional[str] = None, + container_name: Optional[str] = None, + ): + """ + Initialize the BrowserTool. + + Args: + base_url: Base URL of the computer server (default: http://localhost:8000) + api_key: Optional API key for cloud authentication + container_name: Optional container name for cloud authentication + """ + self.base_url = base_url.rstrip("/") + self.api_key = api_key + self.container_name = container_name + self.logger = logger + + def _get_endpoint_url(self) -> str: + """Get the full URL for the playwright_exec endpoint.""" + return f"{self.base_url}/playwright_exec" + + def _get_headers(self) -> dict: + """Get headers for the HTTP request.""" + headers = {"Content-Type": "application/json"} + if self.api_key: + headers["X-API-Key"] = self.api_key + if self.container_name: + headers["X-Container-Name"] = self.container_name + return headers + + async def _execute_command(self, command: str, params: dict) -> dict: + """ + Execute a browser command via HTTP POST. + + Args: + command: Command name + params: Command parameters + + Returns: + Response dictionary + """ + url = self._get_endpoint_url() + payload = {"command": command, "params": params} + headers = self._get_headers() + + try: + async with aiohttp.ClientSession() as session: + async with session.post(url, json=payload, headers=headers) as response: + if response.status == 200: + return await response.json() + else: + error_text = await response.text() + self.logger.error( + f"Browser command failed with status {response.status}: {error_text}" + ) + return {"success": False, "error": error_text} + except Exception as e: + self.logger.error(f"Error executing browser command: {e}") + return {"success": False, "error": str(e)} + + async def visit_url(self, url: str) -> dict: + """ + Navigate to a URL. + + Args: + url: URL to visit + + Returns: + Response dictionary with success status and current URL + """ + return await self._execute_command("visit_url", {"url": url}) + + async def click(self, x: int, y: int) -> dict: + """ + Click at coordinates. + + Args: + x: X coordinate + y: Y coordinate + + Returns: + Response dictionary with success status + """ + return await self._execute_command("click", {"x": x, "y": y}) + + async def type(self, text: str) -> dict: + """ + Type text into the focused element. + + Args: + text: Text to type + + Returns: + Response dictionary with success status + """ + return await self._execute_command("type", {"text": text}) + + async def scroll(self, delta_x: int, delta_y: int) -> dict: + """ + Scroll the page. + + Args: + delta_x: Horizontal scroll delta + delta_y: Vertical scroll delta + + Returns: + Response dictionary with success status + """ + return await self._execute_command("scroll", {"delta_x": delta_x, "delta_y": delta_y}) + + async def web_search(self, query: str) -> dict: + """ + Navigate to a Google search for the query. + + Args: + query: Search query + + Returns: + Response dictionary with success status and current URL + """ + return await self._execute_command("web_search", {"query": query}) + diff --git a/libs/python/computer-server/computer_server/browser.py b/libs/python/computer-server/computer_server/browser.py new file mode 100644 index 00000000..3d0a4c69 --- /dev/null +++ b/libs/python/computer-server/computer_server/browser.py @@ -0,0 +1,308 @@ +""" +Browser manager using Playwright for programmatic browser control. +This allows agents to control a browser that runs visibly on the XFCE desktop. +""" + +import asyncio +import logging +import os +from typing import Any, Dict, Optional + +try: + from playwright.async_api import async_playwright, Browser, BrowserContext, Page +except ImportError: + async_playwright = None + Browser = None + BrowserContext = None + Page = None + +logger = logging.getLogger(__name__) + + +class BrowserManager: + """ + Manages a Playwright browser instance that runs visibly on the XFCE desktop. + Uses persistent context to maintain cookies and sessions. + """ + + def __init__(self): + """Initialize the BrowserManager.""" + self.playwright = None + self.browser: Optional[Browser] = None + self.context: Optional[BrowserContext] = None + self.page: Optional[Page] = None + self._initialized = False + self._initialization_error: Optional[str] = None + self._lock = asyncio.Lock() + + async def _ensure_initialized(self): + """Ensure the browser is initialized.""" + # Check if browser was closed and needs reinitialization + if self._initialized: + try: + # Check if context is still valid by trying to access it + if self.context: + # Try to get pages - this will raise if context is closed + _ = self.context.pages + # If we get here, context is still alive + return + else: + # Context was closed, need to reinitialize + self._initialized = False + logger.warning("Browser context was closed, will reinitialize...") + except Exception as e: + # Context is dead, need to reinitialize + logger.warning(f"Browser context is dead ({e}), will reinitialize...") + self._initialized = False + self.context = None + self.page = None + # Clean up playwright if it exists + if self.playwright: + try: + await self.playwright.stop() + except Exception: + pass + self.playwright = None + + async with self._lock: + # Double-check after acquiring lock (another thread might have initialized it) + if self._initialized: + try: + if self.context: + _ = self.context.pages + return + except Exception: + self._initialized = False + self.context = None + self.page = None + if self.playwright: + try: + await self.playwright.stop() + except Exception: + pass + self.playwright = None + + if async_playwright is None: + raise RuntimeError( + "playwright is not installed. Please install it with: pip install playwright && playwright install --with-deps firefox" + ) + + try: + # Get display from environment or default to :1 + display = os.environ.get("DISPLAY", ":1") + logger.info(f"Initializing browser with DISPLAY={display}") + + # Start playwright + self.playwright = await async_playwright().start() + + # Launch Firefox with persistent context (keeps cookies/sessions) + # headless=False is CRITICAL so the visual agent can see it + user_data_dir = os.path.join(os.path.expanduser("~"), ".playwright-firefox") + os.makedirs(user_data_dir, exist_ok=True) + + # launch_persistent_context returns a BrowserContext, not a Browser + # Note: Removed --kiosk mode so the desktop remains visible + self.context = await self.playwright.firefox.launch_persistent_context( + user_data_dir=user_data_dir, + headless=False, # CRITICAL: visible for visual agent + viewport={"width": 1024, "height": 768}, + # Removed --kiosk to allow desktop visibility + ) + + # Get the first page or create one + pages = self.context.pages + if pages: + self.page = pages[0] + else: + self.page = await self.context.new_page() + + self._initialized = True + logger.info("Browser initialized successfully") + + except Exception as e: + logger.error(f"Failed to initialize browser: {e}") + import traceback + logger.error(traceback.format_exc()) + # Don't raise - return error in execute_command instead + self._initialization_error = str(e) + raise + + async def execute_command(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]: + """ + Execute a browser command. + + Args: + cmd: Command name (visit_url, click, type, scroll, web_search) + params: Command parameters + + Returns: + Result dictionary with success status and any data + """ + try: + await self._ensure_initialized() + except Exception as e: + error_msg = getattr(self, '_initialization_error', None) or str(e) + logger.error(f"Browser initialization failed: {error_msg}") + return { + "success": False, + "error": f"Browser initialization failed: {error_msg}. " + f"Make sure Playwright and Firefox are installed, and DISPLAY is set correctly." + } + + # Ensure browser is still initialized (in case it was manually closed) + # This will automatically reinitialize if the browser was closed + await self._ensure_initialized() + + # Check if page is still valid + page_valid = False + try: + if self.page is not None: + # Try to access page.url to check if it's still valid + _ = self.page.url + page_valid = True + except Exception as e: + logger.warning(f"Page is invalid: {e}, will get a new page...") + self.page = None + + # Get a valid page if we don't have one + if not page_valid or self.page is None: + try: + pages = self.context.pages + if pages: + # Find first non-closed page + for p in pages: + try: + if not p.is_closed(): + self.page = p + logger.info("Reusing existing open page") + page_valid = True + break + except Exception: + continue + + # If no valid page found, create a new one + if not page_valid: + self.page = await self.context.new_page() + logger.info("Created new page") + except Exception as e: + logger.error(f"Failed to get new page: {e}, browser may be closed") + # Browser was closed - reinitialize it + try: + logger.info("Browser was closed manually, reinitializing...") + self._initialized = False + self.context = None + self.page = None + if self.playwright: + try: + await self.playwright.stop() + except Exception: + pass + self.playwright = None + + # Reinitialize + await self._ensure_initialized() + # Get or create a page + pages = self.context.pages + if pages: + self.page = pages[0] + else: + self.page = await self.context.new_page() + logger.info("Browser reopened successfully after manual closure") + except Exception as reinit_error: + logger.error(f"Failed to reinitialize browser: {reinit_error}") + import traceback + logger.error(traceback.format_exc()) + return {"success": False, "error": f"Browser was closed and cannot be recovered: {reinit_error}"} + + try: + if cmd == "visit_url": + url = params.get("url") + if not url: + return {"success": False, "error": "url parameter is required"} + await self.page.goto(url, wait_until="domcontentloaded", timeout=30000) + return {"success": True, "url": self.page.url} + + elif cmd == "click": + x = params.get("x") + y = params.get("y") + if x is None or y is None: + return {"success": False, "error": "x and y parameters are required"} + await self.page.mouse.click(x, y) + return {"success": True} + + elif cmd == "type": + text = params.get("text") + if text is None: + return {"success": False, "error": "text parameter is required"} + await self.page.keyboard.type(text) + return {"success": True} + + elif cmd == "scroll": + delta_x = params.get("delta_x", 0) + delta_y = params.get("delta_y", 0) + await self.page.mouse.wheel(delta_x, delta_y) + return {"success": True} + + elif cmd == "web_search": + query = params.get("query") + if not query: + return {"success": False, "error": "query parameter is required"} + # Navigate to Google search + search_url = f"https://www.google.com/search?q={query}" + await self.page.goto(search_url, wait_until="domcontentloaded", timeout=30000) + return {"success": True, "url": self.page.url} + + else: + return {"success": False, "error": f"Unknown command: {cmd}"} + + except Exception as e: + logger.error(f"Error executing command {cmd}: {e}") + import traceback + logger.error(traceback.format_exc()) + # If page was closed due to error, try to recover + if "closed" in str(e).lower() and self.context: + try: + pages = self.context.pages + if pages: + self.page = pages[0] + logger.info("Recovered page after error") + else: + self.page = await self.context.new_page() + logger.info("Created new page after error") + except Exception as recover_error: + logger.error(f"Failed to recover page: {recover_error}") + return {"success": False, "error": str(e)} + + async def close(self): + """Close the browser and cleanup resources.""" + async with self._lock: + try: + if self.context: + await self.context.close() + self.context = None + if self.browser: + await self.browser.close() + self.browser = None + + if self.playwright: + await self.playwright.stop() + self.playwright = None + + self.page = None + self._initialized = False + logger.info("Browser closed successfully") + except Exception as e: + logger.error(f"Error closing browser: {e}") + + +# Global instance +_browser_manager: Optional[BrowserManager] = None + + +def get_browser_manager() -> BrowserManager: + """Get or create the global BrowserManager instance.""" + global _browser_manager + if _browser_manager is None: + _browser_manager = BrowserManager() + return _browser_manager + diff --git a/libs/python/computer-server/computer_server/main.py b/libs/python/computer-server/computer_server/main.py index 3ae97ebc..9bad59bf 100644 --- a/libs/python/computer-server/computer_server/main.py +++ b/libs/python/computer-server/computer_server/main.py @@ -25,6 +25,7 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, StreamingResponse from .handlers.factory import HandlerFactory +from .browser import get_browser_manager # Authentication session TTL (in seconds). Override via env var CUA_AUTH_TTL_SECONDS. Default: 60s AUTH_SESSION_TTL_SECONDS: int = int(os.environ.get("CUA_AUTH_TTL_SECONDS", "60")) @@ -749,5 +750,71 @@ async def agent_response_endpoint( return JSONResponse(content=payload, headers=headers) +@app.post("/playwright_exec") +async def playwright_exec_endpoint( + request: Request, + container_name: Optional[str] = Header(None, alias="X-Container-Name"), + api_key: Optional[str] = Header(None, alias="X-API-Key"), +): + """ + Execute Playwright browser commands. + + Headers: + - X-Container-Name: Container name for cloud authentication + - X-API-Key: API key for cloud authentication + + Body: + { + "command": "visit_url|click|type|scroll|web_search", + "params": {...} + } + """ + # Parse request body + try: + body = await request.json() + command = body.get("command") + params = body.get("params", {}) + except Exception as e: + raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}") + + if not command: + raise HTTPException(status_code=400, detail="Command is required") + + # Check if CONTAINER_NAME is set (indicating cloud provider) + server_container_name = os.environ.get("CONTAINER_NAME") + + # If cloud provider, perform authentication + if server_container_name: + logger.info( + f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication..." + ) + + # Validate required headers + if not container_name: + raise HTTPException(status_code=401, detail="Container name required") + + if not api_key: + raise HTTPException(status_code=401, detail="API key required") + + # Validate with AuthenticationManager + is_authenticated = await auth_manager.auth(container_name, api_key) + if not is_authenticated: + raise HTTPException(status_code=401, detail="Authentication failed") + + # Get browser manager and execute command + try: + browser_manager = get_browser_manager() + result = await browser_manager.execute_command(command, params) + + if result.get("success"): + return JSONResponse(content=result) + else: + raise HTTPException(status_code=400, detail=result.get("error", "Command failed")) + except Exception as e: + logger.error(f"Error executing playwright command: {str(e)}") + logger.error(traceback.format_exc()) + raise HTTPException(status_code=500, detail=str(e)) + + if __name__ == "__main__": uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/libs/python/computer-server/pyproject.toml b/libs/python/computer-server/pyproject.toml index 7bae1e06..75ff49f5 100644 --- a/libs/python/computer-server/pyproject.toml +++ b/libs/python/computer-server/pyproject.toml @@ -24,6 +24,7 @@ dependencies = [ "pyperclip>=1.9.0", "websockets>=12.0", "pywinctl>=0.4.1", + "playwright>=1.40.0", # OS-specific runtime deps "pyobjc-framework-Cocoa>=10.1; sys_platform == 'darwin'", "pyobjc-framework-Quartz>=10.1; sys_platform == 'darwin'", diff --git a/libs/xfce/Dockerfile b/libs/xfce/Dockerfile index e83f6bd2..43dab80f 100644 --- a/libs/xfce/Dockerfile +++ b/libs/xfce/Dockerfile @@ -107,6 +107,16 @@ RUN mkdir -p /home/cua/.cache && \ # Install computer-server using Python 3.12 pip RUN python3.12 -m pip install cua-computer-server +# Copy browser.py and updated main.py from local source (to include browser tool) +# These files need to be in the same directory as the Dockerfile when building +COPY browser.py /tmp/browser.py +COPY main.py /tmp/main.py +RUN python3.12 -c "import shutil; import os; cs_dir = '/usr/local/lib/python3.12/dist-packages/computer_server'; shutil.copy('/tmp/browser.py', f'{cs_dir}/browser.py'); shutil.copy('/tmp/main.py', f'{cs_dir}/main.py'); print('Copied browser.py and main.py')" && rm /tmp/browser.py /tmp/main.py + +# Install playwright and Firefox dependencies +RUN python3.12 -m pip install playwright && \ + python3.12 -m playwright install --with-deps firefox + # Fix any cache files created by pip RUN chown -R cua:cua /home/cua/.cache diff --git a/libs/xfce/README_BUILD.md b/libs/xfce/README_BUILD.md new file mode 100644 index 00000000..d6f6a7d4 --- /dev/null +++ b/libs/xfce/README_BUILD.md @@ -0,0 +1,32 @@ +# Building the XFCE Docker Image + +## Required Files for Build + +The Dockerfile requires these files to be present in the `libs/xfce/` directory: + +- `browser.py` - Copy from `libs/python/computer-server/computer_server/browser.py` +- `main.py` - Copy from `libs/python/computer-server/computer_server/main.py` + +These files are copied into the container to include the browser tool functionality +that isn't yet in the published PyPI package. + +## Before Building + +```bash +# Copy the latest browser tool files +cp libs/python/computer-server/computer_server/browser.py libs/xfce/ +cp libs/python/computer-server/computer_server/main.py libs/xfce/ +``` + +## Build Command + +```bash +cd libs/xfce +docker build -t cua-xfce . +``` + +## Note + +Once the browser tool is included in the published `cua-computer-server` package, +these temporary file copies can be removed and the Dockerfile can be simplified. + diff --git a/libs/xfce/browser.py b/libs/xfce/browser.py new file mode 100644 index 00000000..3d0a4c69 --- /dev/null +++ b/libs/xfce/browser.py @@ -0,0 +1,308 @@ +""" +Browser manager using Playwright for programmatic browser control. +This allows agents to control a browser that runs visibly on the XFCE desktop. +""" + +import asyncio +import logging +import os +from typing import Any, Dict, Optional + +try: + from playwright.async_api import async_playwright, Browser, BrowserContext, Page +except ImportError: + async_playwright = None + Browser = None + BrowserContext = None + Page = None + +logger = logging.getLogger(__name__) + + +class BrowserManager: + """ + Manages a Playwright browser instance that runs visibly on the XFCE desktop. + Uses persistent context to maintain cookies and sessions. + """ + + def __init__(self): + """Initialize the BrowserManager.""" + self.playwright = None + self.browser: Optional[Browser] = None + self.context: Optional[BrowserContext] = None + self.page: Optional[Page] = None + self._initialized = False + self._initialization_error: Optional[str] = None + self._lock = asyncio.Lock() + + async def _ensure_initialized(self): + """Ensure the browser is initialized.""" + # Check if browser was closed and needs reinitialization + if self._initialized: + try: + # Check if context is still valid by trying to access it + if self.context: + # Try to get pages - this will raise if context is closed + _ = self.context.pages + # If we get here, context is still alive + return + else: + # Context was closed, need to reinitialize + self._initialized = False + logger.warning("Browser context was closed, will reinitialize...") + except Exception as e: + # Context is dead, need to reinitialize + logger.warning(f"Browser context is dead ({e}), will reinitialize...") + self._initialized = False + self.context = None + self.page = None + # Clean up playwright if it exists + if self.playwright: + try: + await self.playwright.stop() + except Exception: + pass + self.playwright = None + + async with self._lock: + # Double-check after acquiring lock (another thread might have initialized it) + if self._initialized: + try: + if self.context: + _ = self.context.pages + return + except Exception: + self._initialized = False + self.context = None + self.page = None + if self.playwright: + try: + await self.playwright.stop() + except Exception: + pass + self.playwright = None + + if async_playwright is None: + raise RuntimeError( + "playwright is not installed. Please install it with: pip install playwright && playwright install --with-deps firefox" + ) + + try: + # Get display from environment or default to :1 + display = os.environ.get("DISPLAY", ":1") + logger.info(f"Initializing browser with DISPLAY={display}") + + # Start playwright + self.playwright = await async_playwright().start() + + # Launch Firefox with persistent context (keeps cookies/sessions) + # headless=False is CRITICAL so the visual agent can see it + user_data_dir = os.path.join(os.path.expanduser("~"), ".playwright-firefox") + os.makedirs(user_data_dir, exist_ok=True) + + # launch_persistent_context returns a BrowserContext, not a Browser + # Note: Removed --kiosk mode so the desktop remains visible + self.context = await self.playwright.firefox.launch_persistent_context( + user_data_dir=user_data_dir, + headless=False, # CRITICAL: visible for visual agent + viewport={"width": 1024, "height": 768}, + # Removed --kiosk to allow desktop visibility + ) + + # Get the first page or create one + pages = self.context.pages + if pages: + self.page = pages[0] + else: + self.page = await self.context.new_page() + + self._initialized = True + logger.info("Browser initialized successfully") + + except Exception as e: + logger.error(f"Failed to initialize browser: {e}") + import traceback + logger.error(traceback.format_exc()) + # Don't raise - return error in execute_command instead + self._initialization_error = str(e) + raise + + async def execute_command(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]: + """ + Execute a browser command. + + Args: + cmd: Command name (visit_url, click, type, scroll, web_search) + params: Command parameters + + Returns: + Result dictionary with success status and any data + """ + try: + await self._ensure_initialized() + except Exception as e: + error_msg = getattr(self, '_initialization_error', None) or str(e) + logger.error(f"Browser initialization failed: {error_msg}") + return { + "success": False, + "error": f"Browser initialization failed: {error_msg}. " + f"Make sure Playwright and Firefox are installed, and DISPLAY is set correctly." + } + + # Ensure browser is still initialized (in case it was manually closed) + # This will automatically reinitialize if the browser was closed + await self._ensure_initialized() + + # Check if page is still valid + page_valid = False + try: + if self.page is not None: + # Try to access page.url to check if it's still valid + _ = self.page.url + page_valid = True + except Exception as e: + logger.warning(f"Page is invalid: {e}, will get a new page...") + self.page = None + + # Get a valid page if we don't have one + if not page_valid or self.page is None: + try: + pages = self.context.pages + if pages: + # Find first non-closed page + for p in pages: + try: + if not p.is_closed(): + self.page = p + logger.info("Reusing existing open page") + page_valid = True + break + except Exception: + continue + + # If no valid page found, create a new one + if not page_valid: + self.page = await self.context.new_page() + logger.info("Created new page") + except Exception as e: + logger.error(f"Failed to get new page: {e}, browser may be closed") + # Browser was closed - reinitialize it + try: + logger.info("Browser was closed manually, reinitializing...") + self._initialized = False + self.context = None + self.page = None + if self.playwright: + try: + await self.playwright.stop() + except Exception: + pass + self.playwright = None + + # Reinitialize + await self._ensure_initialized() + # Get or create a page + pages = self.context.pages + if pages: + self.page = pages[0] + else: + self.page = await self.context.new_page() + logger.info("Browser reopened successfully after manual closure") + except Exception as reinit_error: + logger.error(f"Failed to reinitialize browser: {reinit_error}") + import traceback + logger.error(traceback.format_exc()) + return {"success": False, "error": f"Browser was closed and cannot be recovered: {reinit_error}"} + + try: + if cmd == "visit_url": + url = params.get("url") + if not url: + return {"success": False, "error": "url parameter is required"} + await self.page.goto(url, wait_until="domcontentloaded", timeout=30000) + return {"success": True, "url": self.page.url} + + elif cmd == "click": + x = params.get("x") + y = params.get("y") + if x is None or y is None: + return {"success": False, "error": "x and y parameters are required"} + await self.page.mouse.click(x, y) + return {"success": True} + + elif cmd == "type": + text = params.get("text") + if text is None: + return {"success": False, "error": "text parameter is required"} + await self.page.keyboard.type(text) + return {"success": True} + + elif cmd == "scroll": + delta_x = params.get("delta_x", 0) + delta_y = params.get("delta_y", 0) + await self.page.mouse.wheel(delta_x, delta_y) + return {"success": True} + + elif cmd == "web_search": + query = params.get("query") + if not query: + return {"success": False, "error": "query parameter is required"} + # Navigate to Google search + search_url = f"https://www.google.com/search?q={query}" + await self.page.goto(search_url, wait_until="domcontentloaded", timeout=30000) + return {"success": True, "url": self.page.url} + + else: + return {"success": False, "error": f"Unknown command: {cmd}"} + + except Exception as e: + logger.error(f"Error executing command {cmd}: {e}") + import traceback + logger.error(traceback.format_exc()) + # If page was closed due to error, try to recover + if "closed" in str(e).lower() and self.context: + try: + pages = self.context.pages + if pages: + self.page = pages[0] + logger.info("Recovered page after error") + else: + self.page = await self.context.new_page() + logger.info("Created new page after error") + except Exception as recover_error: + logger.error(f"Failed to recover page: {recover_error}") + return {"success": False, "error": str(e)} + + async def close(self): + """Close the browser and cleanup resources.""" + async with self._lock: + try: + if self.context: + await self.context.close() + self.context = None + if self.browser: + await self.browser.close() + self.browser = None + + if self.playwright: + await self.playwright.stop() + self.playwright = None + + self.page = None + self._initialized = False + logger.info("Browser closed successfully") + except Exception as e: + logger.error(f"Error closing browser: {e}") + + +# Global instance +_browser_manager: Optional[BrowserManager] = None + + +def get_browser_manager() -> BrowserManager: + """Get or create the global BrowserManager instance.""" + global _browser_manager + if _browser_manager is None: + _browser_manager = BrowserManager() + return _browser_manager + diff --git a/libs/xfce/main.py b/libs/xfce/main.py new file mode 100644 index 00000000..9bad59bf --- /dev/null +++ b/libs/xfce/main.py @@ -0,0 +1,820 @@ +import asyncio +import hashlib +import inspect +import json +import logging +import os +import platform +import time +import traceback +from contextlib import redirect_stderr, redirect_stdout +from io import StringIO +from typing import Any, Dict, List, Literal, Optional, Union, cast + +import aiohttp +import uvicorn +from fastapi import ( + FastAPI, + Header, + HTTPException, + Request, + WebSocket, + WebSocketDisconnect, +) +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import JSONResponse, StreamingResponse + +from .handlers.factory import HandlerFactory +from .browser import get_browser_manager + +# Authentication session TTL (in seconds). Override via env var CUA_AUTH_TTL_SECONDS. Default: 60s +AUTH_SESSION_TTL_SECONDS: int = int(os.environ.get("CUA_AUTH_TTL_SECONDS", "60")) + +try: + from agent import ComputerAgent + + HAS_AGENT = True +except ImportError: + HAS_AGENT = False + +# Set up logging with more detail +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +# Configure WebSocket with larger message size +WEBSOCKET_MAX_SIZE = 1024 * 1024 * 10 # 10MB limit + +# Configure application with WebSocket settings +app = FastAPI( + title="Computer API", + description="API for the Computer project", + version="0.1.0", + websocket_max_size=WEBSOCKET_MAX_SIZE, +) + +# CORS configuration +origins = ["*"] +app.add_middleware( + CORSMiddleware, + allow_origins=origins, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +protocol_version = 1 +try: + from importlib.metadata import version + + package_version = version("cua-computer-server") +except Exception: + # Fallback for cases where package is not installed or importlib.metadata is not available + try: + import pkg_resources + + package_version = pkg_resources.get_distribution("cua-computer-server").version + except Exception: + package_version = "unknown" + +( + accessibility_handler, + automation_handler, + diorama_handler, + file_handler, + desktop_handler, + window_handler, +) = HandlerFactory.create_handlers() +handlers = { + "version": lambda: {"protocol": protocol_version, "package": package_version}, + # App-Use commands + "diorama_cmd": diorama_handler.diorama_cmd, + # Accessibility commands + "get_accessibility_tree": accessibility_handler.get_accessibility_tree, + "find_element": accessibility_handler.find_element, + # Shell commands + "run_command": automation_handler.run_command, + # File system commands + "file_exists": file_handler.file_exists, + "directory_exists": file_handler.directory_exists, + "list_dir": file_handler.list_dir, + "read_text": file_handler.read_text, + "write_text": file_handler.write_text, + "read_bytes": file_handler.read_bytes, + "write_bytes": file_handler.write_bytes, + "get_file_size": file_handler.get_file_size, + "delete_file": file_handler.delete_file, + "create_dir": file_handler.create_dir, + "delete_dir": file_handler.delete_dir, + # Desktop commands + "get_desktop_environment": desktop_handler.get_desktop_environment, + "set_wallpaper": desktop_handler.set_wallpaper, + # Window management + "open": window_handler.open, + "launch": window_handler.launch, + "get_current_window_id": window_handler.get_current_window_id, + "get_application_windows": window_handler.get_application_windows, + "get_window_name": window_handler.get_window_name, + "get_window_size": window_handler.get_window_size, + "get_window_position": window_handler.get_window_position, + "set_window_size": window_handler.set_window_size, + "set_window_position": window_handler.set_window_position, + "maximize_window": window_handler.maximize_window, + "minimize_window": window_handler.minimize_window, + "activate_window": window_handler.activate_window, + "close_window": window_handler.close_window, + # Mouse commands + "mouse_down": automation_handler.mouse_down, + "mouse_up": automation_handler.mouse_up, + "left_click": automation_handler.left_click, + "right_click": automation_handler.right_click, + "double_click": automation_handler.double_click, + "move_cursor": automation_handler.move_cursor, + "drag_to": automation_handler.drag_to, + "drag": automation_handler.drag, + # Keyboard commands + "key_down": automation_handler.key_down, + "key_up": automation_handler.key_up, + "type_text": automation_handler.type_text, + "press_key": automation_handler.press_key, + "hotkey": automation_handler.hotkey, + # Scrolling actions + "scroll": automation_handler.scroll, + "scroll_down": automation_handler.scroll_down, + "scroll_up": automation_handler.scroll_up, + # Screen actions + "screenshot": automation_handler.screenshot, + "get_cursor_position": automation_handler.get_cursor_position, + "get_screen_size": automation_handler.get_screen_size, + # Clipboard actions + "copy_to_clipboard": automation_handler.copy_to_clipboard, + "set_clipboard": automation_handler.set_clipboard, +} + + +class AuthenticationManager: + def __init__(self): + self.sessions: Dict[str, Dict[str, Any]] = {} + self.container_name = os.environ.get("CONTAINER_NAME") + + def _hash_credentials(self, container_name: str, api_key: str) -> str: + """Create a hash of container name and API key for session identification""" + combined = f"{container_name}:{api_key}" + return hashlib.sha256(combined.encode()).hexdigest() + + def _is_session_valid(self, session_data: Dict[str, Any]) -> bool: + """Check if a session is still valid based on expiration time""" + if not session_data.get("valid", False): + return False + + expires_at = session_data.get("expires_at", 0) + return time.time() < expires_at + + async def auth(self, container_name: str, api_key: str) -> bool: + """Authenticate container name and API key, using cached sessions when possible""" + # If no CONTAINER_NAME is set, always allow access (local development) + if not self.container_name: + logger.info( + "No CONTAINER_NAME set in environment. Allowing access (local development mode)" + ) + return True + + # Layer 1: VM Identity Verification + if container_name != self.container_name: + logger.warning( + f"VM name mismatch. Expected: {self.container_name}, Got: {container_name}" + ) + return False + + # Create hash for session lookup + session_hash = self._hash_credentials(container_name, api_key) + + # Check if we have a valid cached session + if session_hash in self.sessions: + session_data = self.sessions[session_hash] + if self._is_session_valid(session_data): + logger.info(f"Using cached authentication for container: {container_name}") + return session_data["valid"] + else: + # Remove expired session + del self.sessions[session_hash] + + # No valid cached session, authenticate with API + logger.info(f"Authenticating with TryCUA API for container: {container_name}") + + try: + async with aiohttp.ClientSession() as session: + headers = {"Authorization": f"Bearer {api_key}"} + + async with session.get( + f"https://www.cua.ai/api/vm/auth?container_name={container_name}", + headers=headers, + ) as resp: + is_valid = resp.status == 200 and bool((await resp.text()).strip()) + + # Cache the result with configurable expiration + self.sessions[session_hash] = { + "valid": is_valid, + "expires_at": time.time() + AUTH_SESSION_TTL_SECONDS, + } + + if is_valid: + logger.info(f"Authentication successful for container: {container_name}") + else: + logger.warning( + f"Authentication failed for container: {container_name}. Status: {resp.status}" + ) + + return is_valid + + except aiohttp.ClientError as e: + logger.error(f"Failed to validate API key with TryCUA API: {str(e)}") + # Cache failed result to avoid repeated requests + self.sessions[session_hash] = { + "valid": False, + "expires_at": time.time() + AUTH_SESSION_TTL_SECONDS, + } + return False + except Exception as e: + logger.error(f"Unexpected error during authentication: {str(e)}") + # Cache failed result to avoid repeated requests + self.sessions[session_hash] = { + "valid": False, + "expires_at": time.time() + AUTH_SESSION_TTL_SECONDS, + } + return False + + +class ConnectionManager: + def __init__(self): + self.active_connections: List[WebSocket] = [] + + async def connect(self, websocket: WebSocket): + await websocket.accept() + self.active_connections.append(websocket) + + def disconnect(self, websocket: WebSocket): + self.active_connections.remove(websocket) + + +manager = ConnectionManager() +auth_manager = AuthenticationManager() + + +@app.get("/status") +async def status(): + sys = platform.system().lower() + # get os type + if "darwin" in sys or sys == "macos" or sys == "mac": + os_type = "macos" + elif "windows" in sys: + os_type = "windows" + else: + os_type = "linux" + # get computer-server features + features = [] + if HAS_AGENT: + features.append("agent") + return {"status": "ok", "os_type": os_type, "features": features} + + +@app.websocket("/ws", name="websocket_endpoint") +async def websocket_endpoint(websocket: WebSocket): + global handlers + + # WebSocket message size is configured at the app or endpoint level, not on the instance + await manager.connect(websocket) + + # Check if CONTAINER_NAME is set (indicating cloud provider) + server_container_name = os.environ.get("CONTAINER_NAME") + + # If cloud provider, perform authentication handshake + if server_container_name: + try: + logger.info( + f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Waiting for authentication..." + ) + + # Wait for authentication message + auth_data = await websocket.receive_json() + + # Validate auth message format + if auth_data.get("command") != "authenticate": + await websocket.send_json( + {"success": False, "error": "First message must be authentication"} + ) + await websocket.close() + manager.disconnect(websocket) + return + + # Extract credentials + client_api_key = auth_data.get("params", {}).get("api_key") + client_container_name = auth_data.get("params", {}).get("container_name") + + # Validate credentials using AuthenticationManager + if not client_api_key: + await websocket.send_json({"success": False, "error": "API key required"}) + await websocket.close() + manager.disconnect(websocket) + return + + if not client_container_name: + await websocket.send_json({"success": False, "error": "Container name required"}) + await websocket.close() + manager.disconnect(websocket) + return + + # Use AuthenticationManager for validation + is_authenticated = await auth_manager.auth(client_container_name, client_api_key) + if not is_authenticated: + await websocket.send_json({"success": False, "error": "Authentication failed"}) + await websocket.close() + manager.disconnect(websocket) + return + + logger.info(f"Authentication successful for VM: {client_container_name}") + await websocket.send_json({"success": True, "message": "Authentication successful"}) + + except Exception as e: + logger.error(f"Error during authentication handshake: {str(e)}") + await websocket.send_json({"success": False, "error": "Authentication failed"}) + await websocket.close() + manager.disconnect(websocket) + return + + try: + while True: + try: + data = await websocket.receive_json() + command = data.get("command") + params = data.get("params", {}) + + if command not in handlers: + await websocket.send_json( + {"success": False, "error": f"Unknown command: {command}"} + ) + continue + + try: + # Filter params to only include those accepted by the handler function + handler_func = handlers[command] + sig = inspect.signature(handler_func) + filtered_params = {k: v for k, v in params.items() if k in sig.parameters} + + # Handle both sync and async functions + if asyncio.iscoroutinefunction(handler_func): + result = await handler_func(**filtered_params) + else: + # Run sync functions in thread pool to avoid blocking event loop + result = await asyncio.to_thread(handler_func, **filtered_params) + await websocket.send_json({"success": True, **result}) + except Exception as cmd_error: + logger.error(f"Error executing command {command}: {str(cmd_error)}") + logger.error(traceback.format_exc()) + await websocket.send_json({"success": False, "error": str(cmd_error)}) + + except WebSocketDisconnect: + raise + except json.JSONDecodeError as json_err: + logger.error(f"JSON decode error: {str(json_err)}") + await websocket.send_json( + {"success": False, "error": f"Invalid JSON: {str(json_err)}"} + ) + except Exception as loop_error: + logger.error(f"Error in message loop: {str(loop_error)}") + logger.error(traceback.format_exc()) + await websocket.send_json({"success": False, "error": str(loop_error)}) + + except WebSocketDisconnect: + logger.info("Client disconnected") + manager.disconnect(websocket) + except Exception as e: + logger.error(f"Fatal error in websocket connection: {str(e)}") + logger.error(traceback.format_exc()) + try: + await websocket.close() + except: + pass + manager.disconnect(websocket) + + +@app.post("/cmd") +async def cmd_endpoint( + request: Request, + container_name: Optional[str] = Header(None, alias="X-Container-Name"), + api_key: Optional[str] = Header(None, alias="X-API-Key"), +): + """ + Backup endpoint for when WebSocket connections fail. + Accepts commands via HTTP POST with streaming response. + + Headers: + - X-Container-Name: Container name for cloud authentication + - X-API-Key: API key for cloud authentication + + Body: + { + "command": "command_name", + "params": {...} + } + """ + global handlers + + # Parse request body + try: + body = await request.json() + command = body.get("command") + params = body.get("params", {}) + except Exception as e: + raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}") + + if not command: + raise HTTPException(status_code=400, detail="Command is required") + + # Check if CONTAINER_NAME is set (indicating cloud provider) + server_container_name = os.environ.get("CONTAINER_NAME") + + # If cloud provider, perform authentication + if server_container_name: + logger.info( + f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication..." + ) + + # Validate required headers + if not container_name: + raise HTTPException(status_code=401, detail="Container name required") + + if not api_key: + raise HTTPException(status_code=401, detail="API key required") + + # Validate with AuthenticationManager + is_authenticated = await auth_manager.auth(container_name, api_key) + if not is_authenticated: + raise HTTPException(status_code=401, detail="Authentication failed") + + if command not in handlers: + raise HTTPException(status_code=400, detail=f"Unknown command: {command}") + + async def generate_response(): + """Generate streaming response for the command execution""" + try: + # Filter params to only include those accepted by the handler function + handler_func = handlers[command] + sig = inspect.signature(handler_func) + filtered_params = {k: v for k, v in params.items() if k in sig.parameters} + + # Handle both sync and async functions + if asyncio.iscoroutinefunction(handler_func): + result = await handler_func(**filtered_params) + else: + # Run sync functions in thread pool to avoid blocking event loop + result = await asyncio.to_thread(handler_func, **filtered_params) + + # Stream the successful result + response_data = {"success": True, **result} + yield f"data: {json.dumps(response_data)}\n\n" + + except Exception as cmd_error: + logger.error(f"Error executing command {command}: {str(cmd_error)}") + logger.error(traceback.format_exc()) + + # Stream the error result + error_data = {"success": False, "error": str(cmd_error)} + yield f"data: {json.dumps(error_data)}\n\n" + + return StreamingResponse( + generate_response(), + media_type="text/plain", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + }, + ) + + +@app.post("/responses") +async def agent_response_endpoint( + request: Request, + api_key: Optional[str] = Header(None, alias="X-API-Key"), +): + """ + Minimal proxy to run ComputerAgent for up to 2 turns. + + Security: + - If CONTAINER_NAME is set on the server, require X-API-Key + and validate using AuthenticationManager unless CUA_ENABLE_PUBLIC_PROXY is true. + + Body JSON: + { + "model": "...", # required + "input": "... or messages[]", # required + "agent_kwargs": { ... }, # optional, passed directly to ComputerAgent + "env": { ... } # optional env overrides for agent + } + """ + if not HAS_AGENT: + raise HTTPException(status_code=501, detail="ComputerAgent not available") + + # Authenticate via AuthenticationManager if running in cloud (CONTAINER_NAME set) + container_name = os.environ.get("CONTAINER_NAME") + if container_name: + is_public = os.environ.get("CUA_ENABLE_PUBLIC_PROXY", "").lower().strip() in [ + "1", + "true", + "yes", + "y", + "on", + ] + if not is_public: + if not api_key: + raise HTTPException(status_code=401, detail="Missing AGENT PROXY auth headers") + ok = await auth_manager.auth(container_name, api_key) + if not ok: + raise HTTPException(status_code=401, detail="Unauthorized") + + # Parse request body + try: + body = await request.json() + except Exception as e: + raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}") + + model = body.get("model") + input_data = body.get("input") + if not model or input_data is None: + raise HTTPException(status_code=400, detail="'model' and 'input' are required") + + agent_kwargs: Dict[str, Any] = body.get("agent_kwargs") or {} + env_overrides: Dict[str, str] = body.get("env") or {} + + # Simple env override context + class _EnvOverride: + def __init__(self, overrides: Dict[str, str]): + self.overrides = overrides + self._original: Dict[str, Optional[str]] = {} + + def __enter__(self): + for k, v in (self.overrides or {}).items(): + self._original[k] = os.environ.get(k) + os.environ[k] = str(v) + + def __exit__(self, exc_type, exc, tb): + for k, old in self._original.items(): + if old is None: + os.environ.pop(k, None) + else: + os.environ[k] = old + + # Convert input to messages + def _to_messages(data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]: + if isinstance(data, str): + return [{"role": "user", "content": data}] + if isinstance(data, list): + return data + + messages = _to_messages(input_data) + + # Define a direct computer tool that implements the AsyncComputerHandler protocol + # and delegates to our existing automation/file/accessibility handlers. + from agent.computers import AsyncComputerHandler # runtime-checkable Protocol + + class DirectComputer(AsyncComputerHandler): + def __init__(self): + # use module-scope handler singletons created by HandlerFactory + self._auto = automation_handler + self._file = file_handler + self._access = accessibility_handler + + async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]: + sys = platform.system().lower() + if "darwin" in sys or sys in ("macos", "mac"): + return "mac" + if "windows" in sys: + return "windows" + return "linux" + + async def get_dimensions(self) -> tuple[int, int]: + size = await self._auto.get_screen_size() + return size["width"], size["height"] + + async def screenshot(self) -> str: + img_b64 = await self._auto.screenshot() + return img_b64["image_data"] + + async def click(self, x: int, y: int, button: str = "left") -> None: + if button == "left": + await self._auto.left_click(x, y) + elif button == "right": + await self._auto.right_click(x, y) + else: + await self._auto.left_click(x, y) + + async def double_click(self, x: int, y: int) -> None: + await self._auto.double_click(x, y) + + async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: + await self._auto.move_cursor(x, y) + await self._auto.scroll(scroll_x, scroll_y) + + async def type(self, text: str) -> None: + await self._auto.type_text(text) + + async def wait(self, ms: int = 1000) -> None: + await asyncio.sleep(ms / 1000.0) + + async def move(self, x: int, y: int) -> None: + await self._auto.move_cursor(x, y) + + async def keypress(self, keys: Union[List[str], str]) -> None: + if isinstance(keys, str): + parts = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys] + else: + parts = keys + if len(parts) == 1: + await self._auto.press_key(parts[0]) + else: + await self._auto.hotkey(parts) + + async def drag(self, path: List[Dict[str, int]]) -> None: + if not path: + return + start = path[0] + await self._auto.mouse_down(start["x"], start["y"]) + for pt in path[1:]: + await self._auto.move_cursor(pt["x"], pt["y"]) + end = path[-1] + await self._auto.mouse_up(end["x"], end["y"]) + + async def get_current_url(self) -> str: + # Not available in this server context + return "" + + async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None: + await self._auto.mouse_down(x, y, button="left") + + async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None: + await self._auto.mouse_up(x, y, button="left") + + # # Inline image URLs to base64 + # import base64, mimetypes, requests + # # Use a browser-like User-Agent to avoid 403s from some CDNs (e.g., Wikimedia) + # HEADERS = { + # "User-Agent": ( + # "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + # "AppleWebKit/537.36 (KHTML, like Gecko) " + # "Chrome/124.0.0.0 Safari/537.36" + # ) + # } + # def _to_data_url(content_bytes: bytes, url: str, resp: requests.Response) -> str: + # ctype = resp.headers.get("Content-Type") or mimetypes.guess_type(url)[0] or "application/octet-stream" + # b64 = base64.b64encode(content_bytes).decode("utf-8") + # return f"data:{ctype};base64,{b64}" + # def inline_image_urls(messages): + # # messages: List[{"role": "...","content":[...]}] + # out = [] + # for m in messages: + # if not isinstance(m.get("content"), list): + # out.append(m) + # continue + # new_content = [] + # for part in (m.get("content") or []): + # if part.get("type") == "input_image" and (url := part.get("image_url")): + # resp = requests.get(url, headers=HEADERS, timeout=30) + # resp.raise_for_status() + # new_content.append({ + # "type": "input_image", + # "image_url": _to_data_url(resp.content, url, resp) + # }) + # else: + # new_content.append(part) + # out.append({**m, "content": new_content}) + # return out + # messages = inline_image_urls(messages) + + error = None + + with _EnvOverride(env_overrides): + # Prepare tools: if caller did not pass tools, inject our DirectComputer + tools = agent_kwargs.get("tools") + if not tools: + tools = [DirectComputer()] + agent_kwargs = {**agent_kwargs, "tools": tools} + # Instantiate agent with our tools + agent = ComputerAgent(model=model, **agent_kwargs) # type: ignore[arg-type] + + total_output: List[Any] = [] + total_usage: Dict[str, Any] = {} + + pending_computer_call_ids = set() + try: + async for result in agent.run(messages): + total_output += result["output"] + # Try to collect usage if present + if ( + isinstance(result, dict) + and "usage" in result + and isinstance(result["usage"], dict) + ): + # Merge usage counters + for k, v in result["usage"].items(): + if isinstance(v, (int, float)): + total_usage[k] = total_usage.get(k, 0) + v + else: + total_usage[k] = v + for msg in result.get("output", []): + if msg.get("type") == "computer_call": + pending_computer_call_ids.add(msg["call_id"]) + elif msg.get("type") == "computer_call_output": + pending_computer_call_ids.discard(msg["call_id"]) + # exit if no pending computer calls + if not pending_computer_call_ids: + break + except Exception as e: + logger.error(f"Error running agent: {str(e)}") + logger.error(traceback.format_exc()) + error = str(e) + + # Build response payload + payload = { + "model": model, + "error": error, + "output": total_output, + "usage": total_usage, + "status": "completed" if not error else "failed", + } + + # CORS: allow any origin + headers = { + "Cache-Control": "no-cache", + "Connection": "keep-alive", + } + + return JSONResponse(content=payload, headers=headers) + + +@app.post("/playwright_exec") +async def playwright_exec_endpoint( + request: Request, + container_name: Optional[str] = Header(None, alias="X-Container-Name"), + api_key: Optional[str] = Header(None, alias="X-API-Key"), +): + """ + Execute Playwright browser commands. + + Headers: + - X-Container-Name: Container name for cloud authentication + - X-API-Key: API key for cloud authentication + + Body: + { + "command": "visit_url|click|type|scroll|web_search", + "params": {...} + } + """ + # Parse request body + try: + body = await request.json() + command = body.get("command") + params = body.get("params", {}) + except Exception as e: + raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}") + + if not command: + raise HTTPException(status_code=400, detail="Command is required") + + # Check if CONTAINER_NAME is set (indicating cloud provider) + server_container_name = os.environ.get("CONTAINER_NAME") + + # If cloud provider, perform authentication + if server_container_name: + logger.info( + f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication..." + ) + + # Validate required headers + if not container_name: + raise HTTPException(status_code=401, detail="Container name required") + + if not api_key: + raise HTTPException(status_code=401, detail="API key required") + + # Validate with AuthenticationManager + is_authenticated = await auth_manager.auth(container_name, api_key) + if not is_authenticated: + raise HTTPException(status_code=401, detail="Authentication failed") + + # Get browser manager and execute command + try: + browser_manager = get_browser_manager() + result = await browser_manager.execute_command(command, params) + + if result.get("success"): + return JSONResponse(content=result) + else: + raise HTTPException(status_code=400, detail=result.get("error", "Command failed")) + except Exception as e: + logger.error(f"Error executing playwright command: {str(e)}") + logger.error(traceback.format_exc()) + raise HTTPException(status_code=500, detail=str(e)) + + +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/libs/xfce/src/scripts/start-computer-server.sh b/libs/xfce/src/scripts/start-computer-server.sh index bc27a3db..1e52e536 100644 --- a/libs/xfce/src/scripts/start-computer-server.sh +++ b/libs/xfce/src/scripts/start-computer-server.sh @@ -10,4 +10,4 @@ echo "X server is ready" # Start computer-server export DISPLAY=:1 -python3 -m computer_server --port ${API_PORT:-8000} +python -m computer_server --port ${API_PORT:-8000} From ddfb53e79f4b0bc98d01bc67beeb01ea5860b7d3 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 3 Dec 2025 08:17:52 -0800 Subject: [PATCH 2/8] Migrate browser interface into computer SDK --- .pre-commit-config.yaml | 2 + examples/BROWSER_TOOL_README.md | 24 +++++++- examples/browser_tool_example.py | 56 +++++++++--------- libs/python/agent/agent/tools/browser_tool.py | 57 +++++-------------- .../computer/computer/interface/generic.py | 50 ++++++++++++++++ 5 files changed, 116 insertions(+), 73 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d9475d42..a2e35493 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,6 +15,8 @@ repos: name: TypeScript type check entry: node ./scripts/typescript-typecheck.js language: node + files: \.(ts|tsx)$ + pass_filenames: false - repo: https://github.com/PyCQA/isort rev: 7.0.0 diff --git a/examples/BROWSER_TOOL_README.md b/examples/BROWSER_TOOL_README.md index 8d12ae85..f72971e8 100644 --- a/examples/BROWSER_TOOL_README.md +++ b/examples/BROWSER_TOOL_README.md @@ -40,10 +40,31 @@ python examples/browser_tool_example.py - **Auto-Recovery**: Automatically reopens browser if closed manually - **Persistent Context**: Maintains cookies and sessions across commands - **Fara/Magentic-One Interface**: Compatible with Microsoft agent interfaces +- **Computer SDK Integration**: Uses the Computer SDK's interface for unified control + +## Usage + +The BrowserTool uses the Computer SDK's interface to communicate with the server: + +```python +from computer import Computer +from agent.tools.browser_tool import BrowserTool + +# Initialize computer interface +computer = Computer(ip_address="localhost") + +# Create browser tool with the interface +browser = BrowserTool(interface=computer) + +# Use the browser +await browser.visit_url("https://www.example.com") +await browser.click(x=500, y=300) +await browser.type("Hello, world!") +``` ## API Endpoint -The browser tool is accessible via the `/playwright_exec` endpoint: +The browser tool is also accessible via the `/playwright_exec` endpoint: ```bash curl -X POST http://localhost:8000/playwright_exec \ @@ -66,4 +87,3 @@ curl -X POST http://localhost:8000/playwright_exec \ **Connection errors**: Make sure the server is running (`curl http://localhost:8000/status`). **Playwright not found**: Install with `pip install playwright && playwright install --with-deps firefox`. - diff --git a/examples/browser_tool_example.py b/examples/browser_tool_example.py index 9705ca8f..11a8dead 100644 --- a/examples/browser_tool_example.py +++ b/examples/browser_tool_example.py @@ -19,18 +19,14 @@ import logging import sys from pathlib import Path -# Import BrowserTool directly from the file -browser_tool_path = Path(__file__).parent.parent / "libs" / "python" / "agent" / "agent" / "tools" / "browser_tool.py" -sys.path.insert(0, str(browser_tool_path.parent.parent.parent)) +# Add the libs path to sys.path +libs_path = Path(__file__).parent.parent / "libs" / "python" +sys.path.insert(0, str(libs_path)) -# Import the module directly -import importlib.util -spec = importlib.util.spec_from_file_location("browser_tool", browser_tool_path) -if spec is None or spec.loader is None: - raise ImportError(f"Could not load browser_tool from {browser_tool_path}") -browser_tool_module = importlib.util.module_from_spec(spec) -spec.loader.exec_module(browser_tool_module) -BrowserTool = browser_tool_module.BrowserTool +from agent.tools.browser_tool import BrowserTool + +# Import Computer interface and BrowserTool +from computer import Computer # Configure logging to see what's happening logging.basicConfig(level=logging.INFO) @@ -39,58 +35,60 @@ logger = logging.getLogger(__name__) async def test_browser_tool(): """Test the BrowserTool with various commands.""" - - # Initialize the browser tool - # For local testing, use http://localhost:8000 - # For cloud, provide base_url, api_key, and container_name - browser = BrowserTool(base_url="http://localhost:8000") - + + # Initialize the computer interface + # For local testing, use provider_type="docker" + # For provider_type="cloud", provide name and api_key + computer = Computer(provider_type="docker") + + # Initialize the browser tool with the computer interface + browser = BrowserTool(interface=computer) + logger.info("Testing Browser Tool...") - + try: # Test 1: Visit a URL logger.info("Test 1: Visiting a URL...") result = await browser.visit_url("https://www.trycua.com") logger.info(f"Visit URL result: {result}") - + # Wait a bit for the page to load await asyncio.sleep(2) - + # Test 2: Web search logger.info("Test 2: Performing a web search...") result = await browser.web_search("Python programming") logger.info(f"Web search result: {result}") - + # Wait a bit await asyncio.sleep(2) - + # Test 3: Scroll logger.info("Test 3: Scrolling the page...") result = await browser.scroll(delta_x=0, delta_y=500) logger.info(f"Scroll result: {result}") - + # Wait a bit await asyncio.sleep(1) - + # Test 4: Click (example coordinates - adjust based on your screen) logger.info("Test 4: Clicking at coordinates...") result = await browser.click(x=500, y=300) logger.info(f"Click result: {result}") - + # Wait a bit await asyncio.sleep(1) - + # Test 5: Type text (if there's a focused input field) logger.info("Test 5: Typing text...") result = await browser.type("Hello from BrowserTool!") logger.info(f"Type result: {result}") - + logger.info("All tests completed!") - + except Exception as e: logger.error(f"Error during testing: {e}", exc_info=True) if __name__ == "__main__": asyncio.run(test_browser_tool()) - diff --git a/libs/python/agent/agent/tools/browser_tool.py b/libs/python/agent/agent/tools/browser_tool.py index 8f8b1ab9..85b6ba23 100644 --- a/libs/python/agent/agent/tools/browser_tool.py +++ b/libs/python/agent/agent/tools/browser_tool.py @@ -4,54 +4,36 @@ Allows agents to control a browser programmatically via Playwright. """ import logging -from typing import Optional +from typing import TYPE_CHECKING, Optional -import aiohttp +if TYPE_CHECKING: + from computer.interface import GenericComputerInterface logger = logging.getLogger(__name__) class BrowserTool: """ - Browser tool that connects to the computer server's Playwright endpoint. + Browser tool that uses the computer SDK's interface to control a browser. Implements the Fara/Magentic-One agent interface for browser control. """ def __init__( self, - base_url: str = "http://localhost:8000", - api_key: Optional[str] = None, - container_name: Optional[str] = None, + interface: "GenericComputerInterface", ): """ Initialize the BrowserTool. Args: - base_url: Base URL of the computer server (default: http://localhost:8000) - api_key: Optional API key for cloud authentication - container_name: Optional container name for cloud authentication + interface: A GenericComputerInterface instance that provides playwright_exec """ - self.base_url = base_url.rstrip("/") - self.api_key = api_key - self.container_name = container_name + self.interface = interface self.logger = logger - def _get_endpoint_url(self) -> str: - """Get the full URL for the playwright_exec endpoint.""" - return f"{self.base_url}/playwright_exec" - - def _get_headers(self) -> dict: - """Get headers for the HTTP request.""" - headers = {"Content-Type": "application/json"} - if self.api_key: - headers["X-API-Key"] = self.api_key - if self.container_name: - headers["X-Container-Name"] = self.container_name - return headers - async def _execute_command(self, command: str, params: dict) -> dict: """ - Execute a browser command via HTTP POST. + Execute a browser command via the computer interface. Args: command: Command name @@ -60,23 +42,15 @@ class BrowserTool: Returns: Response dictionary """ - url = self._get_endpoint_url() - payload = {"command": command, "params": params} - headers = self._get_headers() - try: - async with aiohttp.ClientSession() as session: - async with session.post(url, json=payload, headers=headers) as response: - if response.status == 200: - return await response.json() - else: - error_text = await response.text() - self.logger.error( - f"Browser command failed with status {response.status}: {error_text}" - ) - return {"success": False, "error": error_text} + result = await self.interface.playwright_exec(command, params) + if not result.get("success"): + self.logger.error( + f"Browser command '{command}' failed: {result.get('error', 'Unknown error')}" + ) + return result except Exception as e: - self.logger.error(f"Error executing browser command: {e}") + self.logger.error(f"Error executing browser command '{command}': {e}") return {"success": False, "error": str(e)} async def visit_url(self, url: str) -> dict: @@ -140,4 +114,3 @@ class BrowserTool: Response dictionary with success status and current URL """ return await self._execute_command("web_search", {"query": query}) - diff --git a/libs/python/computer/computer/interface/generic.py b/libs/python/computer/computer/interface/generic.py index e58719dd..d5a5dc4b 100644 --- a/libs/python/computer/computer/interface/generic.py +++ b/libs/python/computer/computer/interface/generic.py @@ -661,6 +661,56 @@ class GenericComputerInterface(BaseComputerInterface): return screenshot_x, screenshot_y + # Playwright browser control + async def playwright_exec(self, command: str, params: Optional[Dict] = None) -> Dict[str, Any]: + """ + Execute a Playwright browser command. + + Args: + command: The browser command to execute (visit_url, click, type, scroll, web_search) + params: Command parameters + + Returns: + Dict containing the command result + + Examples: + # Navigate to a URL + await interface.playwright_exec("visit_url", {"url": "https://example.com"}) + + # Click at coordinates + await interface.playwright_exec("click", {"x": 100, "y": 200}) + + # Type text + await interface.playwright_exec("type", {"text": "Hello, world!"}) + + # Scroll + await interface.playwright_exec("scroll", {"delta_x": 0, "delta_y": -100}) + + # Web search + await interface.playwright_exec("web_search", {"query": "computer use agent"}) + """ + protocol = "https" if self.api_key else "http" + port = "8443" if self.api_key else "8000" + url = f"{protocol}://{self.ip_address}:{port}/playwright_exec" + + payload = {"command": command, "params": params or {}} + headers = {"Content-Type": "application/json"} + if self.api_key: + headers["X-API-Key"] = self.api_key + if self.vm_name: + headers["X-Container-Name"] = self.vm_name + + try: + async with aiohttp.ClientSession() as session: + async with session.post(url, json=payload, headers=headers) as response: + if response.status == 200: + return await response.json() + else: + error_text = await response.text() + return {"success": False, "error": error_text} + except Exception as e: + return {"success": False, "error": str(e)} + # Websocket Methods async def _keep_alive(self): """Keep the WebSocket connection alive with automatic reconnection.""" From 6fa66c18cc5e19dd884f1ebecca12449754cf690 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 3 Dec 2025 08:24:21 -0800 Subject: [PATCH 3/8] Unmerged Dockerfile, added Dockerfile.dev that mounts the local computer-server --- libs/xfce/Development.md | 20 + libs/xfce/Dockerfile | 6 - libs/xfce/Dockerfile.dev | 159 ++++++++ libs/xfce/README_BUILD.md | 32 -- libs/xfce/browser.py | 308 -------------- libs/xfce/main.py | 820 -------------------------------------- 6 files changed, 179 insertions(+), 1166 deletions(-) create mode 100644 libs/xfce/Development.md create mode 100644 libs/xfce/Dockerfile.dev delete mode 100644 libs/xfce/README_BUILD.md delete mode 100644 libs/xfce/browser.py delete mode 100644 libs/xfce/main.py diff --git a/libs/xfce/Development.md b/libs/xfce/Development.md new file mode 100644 index 00000000..4bc12c81 --- /dev/null +++ b/libs/xfce/Development.md @@ -0,0 +1,20 @@ +# Development + +## Building the Development Docker Image + +To build the XFCE container with local computer-server changes: + +```bash +cd libs/xfce +docker build -f Dockerfile.dev -t cua-xfce:dev .. +``` + +The build context is set to the parent directory to allow copying the local `computer-server` source. + +## Running the Development Container + +```bash +docker run -p 6901:6901 -p 8000:8000 cua-xfce:dev +``` + +Access noVNC at: http://localhost:6901 diff --git a/libs/xfce/Dockerfile b/libs/xfce/Dockerfile index 43dab80f..f1605181 100644 --- a/libs/xfce/Dockerfile +++ b/libs/xfce/Dockerfile @@ -107,12 +107,6 @@ RUN mkdir -p /home/cua/.cache && \ # Install computer-server using Python 3.12 pip RUN python3.12 -m pip install cua-computer-server -# Copy browser.py and updated main.py from local source (to include browser tool) -# These files need to be in the same directory as the Dockerfile when building -COPY browser.py /tmp/browser.py -COPY main.py /tmp/main.py -RUN python3.12 -c "import shutil; import os; cs_dir = '/usr/local/lib/python3.12/dist-packages/computer_server'; shutil.copy('/tmp/browser.py', f'{cs_dir}/browser.py'); shutil.copy('/tmp/main.py', f'{cs_dir}/main.py'); print('Copied browser.py and main.py')" && rm /tmp/browser.py /tmp/main.py - # Install playwright and Firefox dependencies RUN python3.12 -m pip install playwright && \ python3.12 -m playwright install --with-deps firefox diff --git a/libs/xfce/Dockerfile.dev b/libs/xfce/Dockerfile.dev new file mode 100644 index 00000000..12036a0c --- /dev/null +++ b/libs/xfce/Dockerfile.dev @@ -0,0 +1,159 @@ +# CUA Docker XFCE Container - Development Version +# Vanilla XFCE desktop with noVNC and computer-server (from local source) + +FROM ubuntu:22.04 + +# Avoid prompts from apt +ENV DEBIAN_FRONTEND=noninteractive + +# Set environment variables +ENV HOME=/home/cua +ENV DISPLAY=:1 +ENV VNC_PORT=5901 +ENV NOVNC_PORT=6901 +ENV API_PORT=8000 +ENV VNC_RESOLUTION=1024x768 +ENV VNC_COL_DEPTH=24 + +# Install system dependencies first (including sudo) +RUN apt-get update && apt-get install -y \ + # System utilities + sudo \ + unzip \ + zip \ + xdg-utils \ + # Desktop environment + xfce4 \ + xfce4-terminal \ + dbus-x11 \ + # VNC server + tigervnc-standalone-server \ + tigervnc-common \ + # noVNC dependencies + # python will be installed via deadsnakes as 3.12 \ + git \ + net-tools \ + netcat \ + supervisor \ + # Computer-server dependencies + # python-tk/dev for 3.12 will be installed later \ + gnome-screenshot \ + wmctrl \ + ffmpeg \ + socat \ + xclip \ + # Browser + wget \ + software-properties-common \ + # Build tools + build-essential \ + libncursesw5-dev \ + libssl-dev \ + libsqlite3-dev \ + tk-dev \ + libgdbm-dev \ + libc6-dev \ + libbz2-dev \ + libffi-dev \ + zlib1g-dev \ + && rm -rf /var/lib/apt/lists/* + +# Install Python 3.12 from deadsnakes (keep system python3 for apt) +RUN add-apt-repository -y ppa:deadsnakes/ppa && \ + apt-get update && apt-get install -y \ + python3.12 python3.12-venv python3.12-dev python3.12-tk && \ + python3.12 -m ensurepip --upgrade && \ + python3.12 -m pip install --upgrade pip setuptools wheel && \ + rm -rf /var/lib/apt/lists/* + +# Ensure 'python' points to Python 3.12 +RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 2 + +# Remove screensavers and power manager to avoid popups and lock screens +RUN apt-get remove -y \ + xfce4-power-manager \ + xfce4-power-manager-data \ + xfce4-power-manager-plugins \ + xfce4-screensaver \ + light-locker \ + xscreensaver \ + xscreensaver-data || true + +# Create user after sudo is installed +RUN useradd -m -s /bin/bash -G sudo cua && \ + echo "cua:cua" | chpasswd && \ + echo "cua ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers + +# Install Firefox from Mozilla PPA (snap-free) - inline to avoid script issues +RUN apt-get update && \ + add-apt-repository -y ppa:mozillateam/ppa && \ + echo 'Package: *\nPin: release o=LP-PPA-mozillateam\nPin-Priority: 1001' > /etc/apt/preferences.d/mozilla-firefox && \ + apt-get update && \ + apt-get install -y firefox && \ + echo 'pref("datareporting.policy.firstRunURL", "");\npref("datareporting.policy.dataSubmissionEnabled", false);\npref("datareporting.healthreport.service.enabled", false);\npref("datareporting.healthreport.uploadEnabled", false);\npref("trailhead.firstrun.branches", "nofirstrun-empty");\npref("browser.aboutwelcome.enabled", false);' > /usr/lib/firefox/browser/defaults/preferences/firefox.js && \ + update-alternatives --install /usr/bin/x-www-browser x-www-browser /usr/bin/firefox 100 && \ + update-alternatives --install /usr/bin/gnome-www-browser gnome-www-browser /usr/bin/firefox 100 && \ + rm -rf /var/lib/apt/lists/* + +# Install noVNC +RUN git clone https://github.com/novnc/noVNC.git /opt/noVNC && \ + git clone https://github.com/novnc/websockify /opt/noVNC/utils/websockify && \ + ln -s /opt/noVNC/vnc.html /opt/noVNC/index.html + +# Pre-create cache directory with correct ownership before pip install +RUN mkdir -p /home/cua/.cache && \ + chown -R cua:cua /home/cua/.cache + +# Copy local computer-server source and install it +COPY ../python/computer-server /tmp/computer-server +RUN python3.12 -m pip install /tmp/computer-server && \ + rm -rf /tmp/computer-server + +# Install playwright and Firefox dependencies +RUN python3.12 -m pip install playwright && \ + python3.12 -m playwright install --with-deps firefox + +# Fix any cache files created by pip +RUN chown -R cua:cua /home/cua/.cache + +# Copy startup scripts +COPY src/supervisor/ /etc/supervisor/conf.d/ +COPY src/scripts/ /usr/local/bin/ + +# Make scripts executable +RUN chmod +x /usr/local/bin/*.sh + +# Setup VNC +RUN chown -R cua:cua /home/cua +USER cua +WORKDIR /home/cua + +# Create VNC directory (no password needed with SecurityTypes None) +RUN mkdir -p $HOME/.vnc + +# Configure XFCE for first start +RUN mkdir -p $HOME/.config/xfce4/xfconf/xfce-perchannel-xml $HOME/.config/xfce4 $HOME/.config/autostart + +# Copy XFCE config to disable browser launching and welcome screens +COPY --chown=cua:cua src/xfce-config/helpers.rc $HOME/.config/xfce4/helpers.rc +COPY --chown=cua:cua src/xfce-config/xfce4-session.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-session.xml +COPY --chown=cua:cua src/xfce-config/xfce4-power-manager.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-power-manager.xml + +# Disable autostart for screensaver, lock screen, and power manager +RUN echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-tips-autostart.desktop && \ + echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-screensaver.desktop && \ + echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/light-locker.desktop && \ + echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-power-manager.desktop && \ + chown -R cua:cua $HOME/.config + +# Create storage and shared directories, and Firefox cache directory +RUN mkdir -p $HOME/storage $HOME/shared $HOME/.cache/dconf $HOME/.mozilla/firefox && \ + chown -R cua:cua $HOME/storage $HOME/shared $HOME/.cache $HOME/.mozilla $HOME/.vnc + +USER root + +# Expose ports +EXPOSE $VNC_PORT $NOVNC_PORT $API_PORT + +# Start services via supervisor +CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"] diff --git a/libs/xfce/README_BUILD.md b/libs/xfce/README_BUILD.md deleted file mode 100644 index d6f6a7d4..00000000 --- a/libs/xfce/README_BUILD.md +++ /dev/null @@ -1,32 +0,0 @@ -# Building the XFCE Docker Image - -## Required Files for Build - -The Dockerfile requires these files to be present in the `libs/xfce/` directory: - -- `browser.py` - Copy from `libs/python/computer-server/computer_server/browser.py` -- `main.py` - Copy from `libs/python/computer-server/computer_server/main.py` - -These files are copied into the container to include the browser tool functionality -that isn't yet in the published PyPI package. - -## Before Building - -```bash -# Copy the latest browser tool files -cp libs/python/computer-server/computer_server/browser.py libs/xfce/ -cp libs/python/computer-server/computer_server/main.py libs/xfce/ -``` - -## Build Command - -```bash -cd libs/xfce -docker build -t cua-xfce . -``` - -## Note - -Once the browser tool is included in the published `cua-computer-server` package, -these temporary file copies can be removed and the Dockerfile can be simplified. - diff --git a/libs/xfce/browser.py b/libs/xfce/browser.py deleted file mode 100644 index 3d0a4c69..00000000 --- a/libs/xfce/browser.py +++ /dev/null @@ -1,308 +0,0 @@ -""" -Browser manager using Playwright for programmatic browser control. -This allows agents to control a browser that runs visibly on the XFCE desktop. -""" - -import asyncio -import logging -import os -from typing import Any, Dict, Optional - -try: - from playwright.async_api import async_playwright, Browser, BrowserContext, Page -except ImportError: - async_playwright = None - Browser = None - BrowserContext = None - Page = None - -logger = logging.getLogger(__name__) - - -class BrowserManager: - """ - Manages a Playwright browser instance that runs visibly on the XFCE desktop. - Uses persistent context to maintain cookies and sessions. - """ - - def __init__(self): - """Initialize the BrowserManager.""" - self.playwright = None - self.browser: Optional[Browser] = None - self.context: Optional[BrowserContext] = None - self.page: Optional[Page] = None - self._initialized = False - self._initialization_error: Optional[str] = None - self._lock = asyncio.Lock() - - async def _ensure_initialized(self): - """Ensure the browser is initialized.""" - # Check if browser was closed and needs reinitialization - if self._initialized: - try: - # Check if context is still valid by trying to access it - if self.context: - # Try to get pages - this will raise if context is closed - _ = self.context.pages - # If we get here, context is still alive - return - else: - # Context was closed, need to reinitialize - self._initialized = False - logger.warning("Browser context was closed, will reinitialize...") - except Exception as e: - # Context is dead, need to reinitialize - logger.warning(f"Browser context is dead ({e}), will reinitialize...") - self._initialized = False - self.context = None - self.page = None - # Clean up playwright if it exists - if self.playwright: - try: - await self.playwright.stop() - except Exception: - pass - self.playwright = None - - async with self._lock: - # Double-check after acquiring lock (another thread might have initialized it) - if self._initialized: - try: - if self.context: - _ = self.context.pages - return - except Exception: - self._initialized = False - self.context = None - self.page = None - if self.playwright: - try: - await self.playwright.stop() - except Exception: - pass - self.playwright = None - - if async_playwright is None: - raise RuntimeError( - "playwright is not installed. Please install it with: pip install playwright && playwright install --with-deps firefox" - ) - - try: - # Get display from environment or default to :1 - display = os.environ.get("DISPLAY", ":1") - logger.info(f"Initializing browser with DISPLAY={display}") - - # Start playwright - self.playwright = await async_playwright().start() - - # Launch Firefox with persistent context (keeps cookies/sessions) - # headless=False is CRITICAL so the visual agent can see it - user_data_dir = os.path.join(os.path.expanduser("~"), ".playwright-firefox") - os.makedirs(user_data_dir, exist_ok=True) - - # launch_persistent_context returns a BrowserContext, not a Browser - # Note: Removed --kiosk mode so the desktop remains visible - self.context = await self.playwright.firefox.launch_persistent_context( - user_data_dir=user_data_dir, - headless=False, # CRITICAL: visible for visual agent - viewport={"width": 1024, "height": 768}, - # Removed --kiosk to allow desktop visibility - ) - - # Get the first page or create one - pages = self.context.pages - if pages: - self.page = pages[0] - else: - self.page = await self.context.new_page() - - self._initialized = True - logger.info("Browser initialized successfully") - - except Exception as e: - logger.error(f"Failed to initialize browser: {e}") - import traceback - logger.error(traceback.format_exc()) - # Don't raise - return error in execute_command instead - self._initialization_error = str(e) - raise - - async def execute_command(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]: - """ - Execute a browser command. - - Args: - cmd: Command name (visit_url, click, type, scroll, web_search) - params: Command parameters - - Returns: - Result dictionary with success status and any data - """ - try: - await self._ensure_initialized() - except Exception as e: - error_msg = getattr(self, '_initialization_error', None) or str(e) - logger.error(f"Browser initialization failed: {error_msg}") - return { - "success": False, - "error": f"Browser initialization failed: {error_msg}. " - f"Make sure Playwright and Firefox are installed, and DISPLAY is set correctly." - } - - # Ensure browser is still initialized (in case it was manually closed) - # This will automatically reinitialize if the browser was closed - await self._ensure_initialized() - - # Check if page is still valid - page_valid = False - try: - if self.page is not None: - # Try to access page.url to check if it's still valid - _ = self.page.url - page_valid = True - except Exception as e: - logger.warning(f"Page is invalid: {e}, will get a new page...") - self.page = None - - # Get a valid page if we don't have one - if not page_valid or self.page is None: - try: - pages = self.context.pages - if pages: - # Find first non-closed page - for p in pages: - try: - if not p.is_closed(): - self.page = p - logger.info("Reusing existing open page") - page_valid = True - break - except Exception: - continue - - # If no valid page found, create a new one - if not page_valid: - self.page = await self.context.new_page() - logger.info("Created new page") - except Exception as e: - logger.error(f"Failed to get new page: {e}, browser may be closed") - # Browser was closed - reinitialize it - try: - logger.info("Browser was closed manually, reinitializing...") - self._initialized = False - self.context = None - self.page = None - if self.playwright: - try: - await self.playwright.stop() - except Exception: - pass - self.playwright = None - - # Reinitialize - await self._ensure_initialized() - # Get or create a page - pages = self.context.pages - if pages: - self.page = pages[0] - else: - self.page = await self.context.new_page() - logger.info("Browser reopened successfully after manual closure") - except Exception as reinit_error: - logger.error(f"Failed to reinitialize browser: {reinit_error}") - import traceback - logger.error(traceback.format_exc()) - return {"success": False, "error": f"Browser was closed and cannot be recovered: {reinit_error}"} - - try: - if cmd == "visit_url": - url = params.get("url") - if not url: - return {"success": False, "error": "url parameter is required"} - await self.page.goto(url, wait_until="domcontentloaded", timeout=30000) - return {"success": True, "url": self.page.url} - - elif cmd == "click": - x = params.get("x") - y = params.get("y") - if x is None or y is None: - return {"success": False, "error": "x and y parameters are required"} - await self.page.mouse.click(x, y) - return {"success": True} - - elif cmd == "type": - text = params.get("text") - if text is None: - return {"success": False, "error": "text parameter is required"} - await self.page.keyboard.type(text) - return {"success": True} - - elif cmd == "scroll": - delta_x = params.get("delta_x", 0) - delta_y = params.get("delta_y", 0) - await self.page.mouse.wheel(delta_x, delta_y) - return {"success": True} - - elif cmd == "web_search": - query = params.get("query") - if not query: - return {"success": False, "error": "query parameter is required"} - # Navigate to Google search - search_url = f"https://www.google.com/search?q={query}" - await self.page.goto(search_url, wait_until="domcontentloaded", timeout=30000) - return {"success": True, "url": self.page.url} - - else: - return {"success": False, "error": f"Unknown command: {cmd}"} - - except Exception as e: - logger.error(f"Error executing command {cmd}: {e}") - import traceback - logger.error(traceback.format_exc()) - # If page was closed due to error, try to recover - if "closed" in str(e).lower() and self.context: - try: - pages = self.context.pages - if pages: - self.page = pages[0] - logger.info("Recovered page after error") - else: - self.page = await self.context.new_page() - logger.info("Created new page after error") - except Exception as recover_error: - logger.error(f"Failed to recover page: {recover_error}") - return {"success": False, "error": str(e)} - - async def close(self): - """Close the browser and cleanup resources.""" - async with self._lock: - try: - if self.context: - await self.context.close() - self.context = None - if self.browser: - await self.browser.close() - self.browser = None - - if self.playwright: - await self.playwright.stop() - self.playwright = None - - self.page = None - self._initialized = False - logger.info("Browser closed successfully") - except Exception as e: - logger.error(f"Error closing browser: {e}") - - -# Global instance -_browser_manager: Optional[BrowserManager] = None - - -def get_browser_manager() -> BrowserManager: - """Get or create the global BrowserManager instance.""" - global _browser_manager - if _browser_manager is None: - _browser_manager = BrowserManager() - return _browser_manager - diff --git a/libs/xfce/main.py b/libs/xfce/main.py deleted file mode 100644 index 9bad59bf..00000000 --- a/libs/xfce/main.py +++ /dev/null @@ -1,820 +0,0 @@ -import asyncio -import hashlib -import inspect -import json -import logging -import os -import platform -import time -import traceback -from contextlib import redirect_stderr, redirect_stdout -from io import StringIO -from typing import Any, Dict, List, Literal, Optional, Union, cast - -import aiohttp -import uvicorn -from fastapi import ( - FastAPI, - Header, - HTTPException, - Request, - WebSocket, - WebSocketDisconnect, -) -from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse, StreamingResponse - -from .handlers.factory import HandlerFactory -from .browser import get_browser_manager - -# Authentication session TTL (in seconds). Override via env var CUA_AUTH_TTL_SECONDS. Default: 60s -AUTH_SESSION_TTL_SECONDS: int = int(os.environ.get("CUA_AUTH_TTL_SECONDS", "60")) - -try: - from agent import ComputerAgent - - HAS_AGENT = True -except ImportError: - HAS_AGENT = False - -# Set up logging with more detail -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - -# Configure WebSocket with larger message size -WEBSOCKET_MAX_SIZE = 1024 * 1024 * 10 # 10MB limit - -# Configure application with WebSocket settings -app = FastAPI( - title="Computer API", - description="API for the Computer project", - version="0.1.0", - websocket_max_size=WEBSOCKET_MAX_SIZE, -) - -# CORS configuration -origins = ["*"] -app.add_middleware( - CORSMiddleware, - allow_origins=origins, - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - -protocol_version = 1 -try: - from importlib.metadata import version - - package_version = version("cua-computer-server") -except Exception: - # Fallback for cases where package is not installed or importlib.metadata is not available - try: - import pkg_resources - - package_version = pkg_resources.get_distribution("cua-computer-server").version - except Exception: - package_version = "unknown" - -( - accessibility_handler, - automation_handler, - diorama_handler, - file_handler, - desktop_handler, - window_handler, -) = HandlerFactory.create_handlers() -handlers = { - "version": lambda: {"protocol": protocol_version, "package": package_version}, - # App-Use commands - "diorama_cmd": diorama_handler.diorama_cmd, - # Accessibility commands - "get_accessibility_tree": accessibility_handler.get_accessibility_tree, - "find_element": accessibility_handler.find_element, - # Shell commands - "run_command": automation_handler.run_command, - # File system commands - "file_exists": file_handler.file_exists, - "directory_exists": file_handler.directory_exists, - "list_dir": file_handler.list_dir, - "read_text": file_handler.read_text, - "write_text": file_handler.write_text, - "read_bytes": file_handler.read_bytes, - "write_bytes": file_handler.write_bytes, - "get_file_size": file_handler.get_file_size, - "delete_file": file_handler.delete_file, - "create_dir": file_handler.create_dir, - "delete_dir": file_handler.delete_dir, - # Desktop commands - "get_desktop_environment": desktop_handler.get_desktop_environment, - "set_wallpaper": desktop_handler.set_wallpaper, - # Window management - "open": window_handler.open, - "launch": window_handler.launch, - "get_current_window_id": window_handler.get_current_window_id, - "get_application_windows": window_handler.get_application_windows, - "get_window_name": window_handler.get_window_name, - "get_window_size": window_handler.get_window_size, - "get_window_position": window_handler.get_window_position, - "set_window_size": window_handler.set_window_size, - "set_window_position": window_handler.set_window_position, - "maximize_window": window_handler.maximize_window, - "minimize_window": window_handler.minimize_window, - "activate_window": window_handler.activate_window, - "close_window": window_handler.close_window, - # Mouse commands - "mouse_down": automation_handler.mouse_down, - "mouse_up": automation_handler.mouse_up, - "left_click": automation_handler.left_click, - "right_click": automation_handler.right_click, - "double_click": automation_handler.double_click, - "move_cursor": automation_handler.move_cursor, - "drag_to": automation_handler.drag_to, - "drag": automation_handler.drag, - # Keyboard commands - "key_down": automation_handler.key_down, - "key_up": automation_handler.key_up, - "type_text": automation_handler.type_text, - "press_key": automation_handler.press_key, - "hotkey": automation_handler.hotkey, - # Scrolling actions - "scroll": automation_handler.scroll, - "scroll_down": automation_handler.scroll_down, - "scroll_up": automation_handler.scroll_up, - # Screen actions - "screenshot": automation_handler.screenshot, - "get_cursor_position": automation_handler.get_cursor_position, - "get_screen_size": automation_handler.get_screen_size, - # Clipboard actions - "copy_to_clipboard": automation_handler.copy_to_clipboard, - "set_clipboard": automation_handler.set_clipboard, -} - - -class AuthenticationManager: - def __init__(self): - self.sessions: Dict[str, Dict[str, Any]] = {} - self.container_name = os.environ.get("CONTAINER_NAME") - - def _hash_credentials(self, container_name: str, api_key: str) -> str: - """Create a hash of container name and API key for session identification""" - combined = f"{container_name}:{api_key}" - return hashlib.sha256(combined.encode()).hexdigest() - - def _is_session_valid(self, session_data: Dict[str, Any]) -> bool: - """Check if a session is still valid based on expiration time""" - if not session_data.get("valid", False): - return False - - expires_at = session_data.get("expires_at", 0) - return time.time() < expires_at - - async def auth(self, container_name: str, api_key: str) -> bool: - """Authenticate container name and API key, using cached sessions when possible""" - # If no CONTAINER_NAME is set, always allow access (local development) - if not self.container_name: - logger.info( - "No CONTAINER_NAME set in environment. Allowing access (local development mode)" - ) - return True - - # Layer 1: VM Identity Verification - if container_name != self.container_name: - logger.warning( - f"VM name mismatch. Expected: {self.container_name}, Got: {container_name}" - ) - return False - - # Create hash for session lookup - session_hash = self._hash_credentials(container_name, api_key) - - # Check if we have a valid cached session - if session_hash in self.sessions: - session_data = self.sessions[session_hash] - if self._is_session_valid(session_data): - logger.info(f"Using cached authentication for container: {container_name}") - return session_data["valid"] - else: - # Remove expired session - del self.sessions[session_hash] - - # No valid cached session, authenticate with API - logger.info(f"Authenticating with TryCUA API for container: {container_name}") - - try: - async with aiohttp.ClientSession() as session: - headers = {"Authorization": f"Bearer {api_key}"} - - async with session.get( - f"https://www.cua.ai/api/vm/auth?container_name={container_name}", - headers=headers, - ) as resp: - is_valid = resp.status == 200 and bool((await resp.text()).strip()) - - # Cache the result with configurable expiration - self.sessions[session_hash] = { - "valid": is_valid, - "expires_at": time.time() + AUTH_SESSION_TTL_SECONDS, - } - - if is_valid: - logger.info(f"Authentication successful for container: {container_name}") - else: - logger.warning( - f"Authentication failed for container: {container_name}. Status: {resp.status}" - ) - - return is_valid - - except aiohttp.ClientError as e: - logger.error(f"Failed to validate API key with TryCUA API: {str(e)}") - # Cache failed result to avoid repeated requests - self.sessions[session_hash] = { - "valid": False, - "expires_at": time.time() + AUTH_SESSION_TTL_SECONDS, - } - return False - except Exception as e: - logger.error(f"Unexpected error during authentication: {str(e)}") - # Cache failed result to avoid repeated requests - self.sessions[session_hash] = { - "valid": False, - "expires_at": time.time() + AUTH_SESSION_TTL_SECONDS, - } - return False - - -class ConnectionManager: - def __init__(self): - self.active_connections: List[WebSocket] = [] - - async def connect(self, websocket: WebSocket): - await websocket.accept() - self.active_connections.append(websocket) - - def disconnect(self, websocket: WebSocket): - self.active_connections.remove(websocket) - - -manager = ConnectionManager() -auth_manager = AuthenticationManager() - - -@app.get("/status") -async def status(): - sys = platform.system().lower() - # get os type - if "darwin" in sys or sys == "macos" or sys == "mac": - os_type = "macos" - elif "windows" in sys: - os_type = "windows" - else: - os_type = "linux" - # get computer-server features - features = [] - if HAS_AGENT: - features.append("agent") - return {"status": "ok", "os_type": os_type, "features": features} - - -@app.websocket("/ws", name="websocket_endpoint") -async def websocket_endpoint(websocket: WebSocket): - global handlers - - # WebSocket message size is configured at the app or endpoint level, not on the instance - await manager.connect(websocket) - - # Check if CONTAINER_NAME is set (indicating cloud provider) - server_container_name = os.environ.get("CONTAINER_NAME") - - # If cloud provider, perform authentication handshake - if server_container_name: - try: - logger.info( - f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Waiting for authentication..." - ) - - # Wait for authentication message - auth_data = await websocket.receive_json() - - # Validate auth message format - if auth_data.get("command") != "authenticate": - await websocket.send_json( - {"success": False, "error": "First message must be authentication"} - ) - await websocket.close() - manager.disconnect(websocket) - return - - # Extract credentials - client_api_key = auth_data.get("params", {}).get("api_key") - client_container_name = auth_data.get("params", {}).get("container_name") - - # Validate credentials using AuthenticationManager - if not client_api_key: - await websocket.send_json({"success": False, "error": "API key required"}) - await websocket.close() - manager.disconnect(websocket) - return - - if not client_container_name: - await websocket.send_json({"success": False, "error": "Container name required"}) - await websocket.close() - manager.disconnect(websocket) - return - - # Use AuthenticationManager for validation - is_authenticated = await auth_manager.auth(client_container_name, client_api_key) - if not is_authenticated: - await websocket.send_json({"success": False, "error": "Authentication failed"}) - await websocket.close() - manager.disconnect(websocket) - return - - logger.info(f"Authentication successful for VM: {client_container_name}") - await websocket.send_json({"success": True, "message": "Authentication successful"}) - - except Exception as e: - logger.error(f"Error during authentication handshake: {str(e)}") - await websocket.send_json({"success": False, "error": "Authentication failed"}) - await websocket.close() - manager.disconnect(websocket) - return - - try: - while True: - try: - data = await websocket.receive_json() - command = data.get("command") - params = data.get("params", {}) - - if command not in handlers: - await websocket.send_json( - {"success": False, "error": f"Unknown command: {command}"} - ) - continue - - try: - # Filter params to only include those accepted by the handler function - handler_func = handlers[command] - sig = inspect.signature(handler_func) - filtered_params = {k: v for k, v in params.items() if k in sig.parameters} - - # Handle both sync and async functions - if asyncio.iscoroutinefunction(handler_func): - result = await handler_func(**filtered_params) - else: - # Run sync functions in thread pool to avoid blocking event loop - result = await asyncio.to_thread(handler_func, **filtered_params) - await websocket.send_json({"success": True, **result}) - except Exception as cmd_error: - logger.error(f"Error executing command {command}: {str(cmd_error)}") - logger.error(traceback.format_exc()) - await websocket.send_json({"success": False, "error": str(cmd_error)}) - - except WebSocketDisconnect: - raise - except json.JSONDecodeError as json_err: - logger.error(f"JSON decode error: {str(json_err)}") - await websocket.send_json( - {"success": False, "error": f"Invalid JSON: {str(json_err)}"} - ) - except Exception as loop_error: - logger.error(f"Error in message loop: {str(loop_error)}") - logger.error(traceback.format_exc()) - await websocket.send_json({"success": False, "error": str(loop_error)}) - - except WebSocketDisconnect: - logger.info("Client disconnected") - manager.disconnect(websocket) - except Exception as e: - logger.error(f"Fatal error in websocket connection: {str(e)}") - logger.error(traceback.format_exc()) - try: - await websocket.close() - except: - pass - manager.disconnect(websocket) - - -@app.post("/cmd") -async def cmd_endpoint( - request: Request, - container_name: Optional[str] = Header(None, alias="X-Container-Name"), - api_key: Optional[str] = Header(None, alias="X-API-Key"), -): - """ - Backup endpoint for when WebSocket connections fail. - Accepts commands via HTTP POST with streaming response. - - Headers: - - X-Container-Name: Container name for cloud authentication - - X-API-Key: API key for cloud authentication - - Body: - { - "command": "command_name", - "params": {...} - } - """ - global handlers - - # Parse request body - try: - body = await request.json() - command = body.get("command") - params = body.get("params", {}) - except Exception as e: - raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}") - - if not command: - raise HTTPException(status_code=400, detail="Command is required") - - # Check if CONTAINER_NAME is set (indicating cloud provider) - server_container_name = os.environ.get("CONTAINER_NAME") - - # If cloud provider, perform authentication - if server_container_name: - logger.info( - f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication..." - ) - - # Validate required headers - if not container_name: - raise HTTPException(status_code=401, detail="Container name required") - - if not api_key: - raise HTTPException(status_code=401, detail="API key required") - - # Validate with AuthenticationManager - is_authenticated = await auth_manager.auth(container_name, api_key) - if not is_authenticated: - raise HTTPException(status_code=401, detail="Authentication failed") - - if command not in handlers: - raise HTTPException(status_code=400, detail=f"Unknown command: {command}") - - async def generate_response(): - """Generate streaming response for the command execution""" - try: - # Filter params to only include those accepted by the handler function - handler_func = handlers[command] - sig = inspect.signature(handler_func) - filtered_params = {k: v for k, v in params.items() if k in sig.parameters} - - # Handle both sync and async functions - if asyncio.iscoroutinefunction(handler_func): - result = await handler_func(**filtered_params) - else: - # Run sync functions in thread pool to avoid blocking event loop - result = await asyncio.to_thread(handler_func, **filtered_params) - - # Stream the successful result - response_data = {"success": True, **result} - yield f"data: {json.dumps(response_data)}\n\n" - - except Exception as cmd_error: - logger.error(f"Error executing command {command}: {str(cmd_error)}") - logger.error(traceback.format_exc()) - - # Stream the error result - error_data = {"success": False, "error": str(cmd_error)} - yield f"data: {json.dumps(error_data)}\n\n" - - return StreamingResponse( - generate_response(), - media_type="text/plain", - headers={ - "Cache-Control": "no-cache", - "Connection": "keep-alive", - }, - ) - - -@app.post("/responses") -async def agent_response_endpoint( - request: Request, - api_key: Optional[str] = Header(None, alias="X-API-Key"), -): - """ - Minimal proxy to run ComputerAgent for up to 2 turns. - - Security: - - If CONTAINER_NAME is set on the server, require X-API-Key - and validate using AuthenticationManager unless CUA_ENABLE_PUBLIC_PROXY is true. - - Body JSON: - { - "model": "...", # required - "input": "... or messages[]", # required - "agent_kwargs": { ... }, # optional, passed directly to ComputerAgent - "env": { ... } # optional env overrides for agent - } - """ - if not HAS_AGENT: - raise HTTPException(status_code=501, detail="ComputerAgent not available") - - # Authenticate via AuthenticationManager if running in cloud (CONTAINER_NAME set) - container_name = os.environ.get("CONTAINER_NAME") - if container_name: - is_public = os.environ.get("CUA_ENABLE_PUBLIC_PROXY", "").lower().strip() in [ - "1", - "true", - "yes", - "y", - "on", - ] - if not is_public: - if not api_key: - raise HTTPException(status_code=401, detail="Missing AGENT PROXY auth headers") - ok = await auth_manager.auth(container_name, api_key) - if not ok: - raise HTTPException(status_code=401, detail="Unauthorized") - - # Parse request body - try: - body = await request.json() - except Exception as e: - raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}") - - model = body.get("model") - input_data = body.get("input") - if not model or input_data is None: - raise HTTPException(status_code=400, detail="'model' and 'input' are required") - - agent_kwargs: Dict[str, Any] = body.get("agent_kwargs") or {} - env_overrides: Dict[str, str] = body.get("env") or {} - - # Simple env override context - class _EnvOverride: - def __init__(self, overrides: Dict[str, str]): - self.overrides = overrides - self._original: Dict[str, Optional[str]] = {} - - def __enter__(self): - for k, v in (self.overrides or {}).items(): - self._original[k] = os.environ.get(k) - os.environ[k] = str(v) - - def __exit__(self, exc_type, exc, tb): - for k, old in self._original.items(): - if old is None: - os.environ.pop(k, None) - else: - os.environ[k] = old - - # Convert input to messages - def _to_messages(data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]: - if isinstance(data, str): - return [{"role": "user", "content": data}] - if isinstance(data, list): - return data - - messages = _to_messages(input_data) - - # Define a direct computer tool that implements the AsyncComputerHandler protocol - # and delegates to our existing automation/file/accessibility handlers. - from agent.computers import AsyncComputerHandler # runtime-checkable Protocol - - class DirectComputer(AsyncComputerHandler): - def __init__(self): - # use module-scope handler singletons created by HandlerFactory - self._auto = automation_handler - self._file = file_handler - self._access = accessibility_handler - - async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]: - sys = platform.system().lower() - if "darwin" in sys or sys in ("macos", "mac"): - return "mac" - if "windows" in sys: - return "windows" - return "linux" - - async def get_dimensions(self) -> tuple[int, int]: - size = await self._auto.get_screen_size() - return size["width"], size["height"] - - async def screenshot(self) -> str: - img_b64 = await self._auto.screenshot() - return img_b64["image_data"] - - async def click(self, x: int, y: int, button: str = "left") -> None: - if button == "left": - await self._auto.left_click(x, y) - elif button == "right": - await self._auto.right_click(x, y) - else: - await self._auto.left_click(x, y) - - async def double_click(self, x: int, y: int) -> None: - await self._auto.double_click(x, y) - - async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: - await self._auto.move_cursor(x, y) - await self._auto.scroll(scroll_x, scroll_y) - - async def type(self, text: str) -> None: - await self._auto.type_text(text) - - async def wait(self, ms: int = 1000) -> None: - await asyncio.sleep(ms / 1000.0) - - async def move(self, x: int, y: int) -> None: - await self._auto.move_cursor(x, y) - - async def keypress(self, keys: Union[List[str], str]) -> None: - if isinstance(keys, str): - parts = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys] - else: - parts = keys - if len(parts) == 1: - await self._auto.press_key(parts[0]) - else: - await self._auto.hotkey(parts) - - async def drag(self, path: List[Dict[str, int]]) -> None: - if not path: - return - start = path[0] - await self._auto.mouse_down(start["x"], start["y"]) - for pt in path[1:]: - await self._auto.move_cursor(pt["x"], pt["y"]) - end = path[-1] - await self._auto.mouse_up(end["x"], end["y"]) - - async def get_current_url(self) -> str: - # Not available in this server context - return "" - - async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None: - await self._auto.mouse_down(x, y, button="left") - - async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None: - await self._auto.mouse_up(x, y, button="left") - - # # Inline image URLs to base64 - # import base64, mimetypes, requests - # # Use a browser-like User-Agent to avoid 403s from some CDNs (e.g., Wikimedia) - # HEADERS = { - # "User-Agent": ( - # "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - # "AppleWebKit/537.36 (KHTML, like Gecko) " - # "Chrome/124.0.0.0 Safari/537.36" - # ) - # } - # def _to_data_url(content_bytes: bytes, url: str, resp: requests.Response) -> str: - # ctype = resp.headers.get("Content-Type") or mimetypes.guess_type(url)[0] or "application/octet-stream" - # b64 = base64.b64encode(content_bytes).decode("utf-8") - # return f"data:{ctype};base64,{b64}" - # def inline_image_urls(messages): - # # messages: List[{"role": "...","content":[...]}] - # out = [] - # for m in messages: - # if not isinstance(m.get("content"), list): - # out.append(m) - # continue - # new_content = [] - # for part in (m.get("content") or []): - # if part.get("type") == "input_image" and (url := part.get("image_url")): - # resp = requests.get(url, headers=HEADERS, timeout=30) - # resp.raise_for_status() - # new_content.append({ - # "type": "input_image", - # "image_url": _to_data_url(resp.content, url, resp) - # }) - # else: - # new_content.append(part) - # out.append({**m, "content": new_content}) - # return out - # messages = inline_image_urls(messages) - - error = None - - with _EnvOverride(env_overrides): - # Prepare tools: if caller did not pass tools, inject our DirectComputer - tools = agent_kwargs.get("tools") - if not tools: - tools = [DirectComputer()] - agent_kwargs = {**agent_kwargs, "tools": tools} - # Instantiate agent with our tools - agent = ComputerAgent(model=model, **agent_kwargs) # type: ignore[arg-type] - - total_output: List[Any] = [] - total_usage: Dict[str, Any] = {} - - pending_computer_call_ids = set() - try: - async for result in agent.run(messages): - total_output += result["output"] - # Try to collect usage if present - if ( - isinstance(result, dict) - and "usage" in result - and isinstance(result["usage"], dict) - ): - # Merge usage counters - for k, v in result["usage"].items(): - if isinstance(v, (int, float)): - total_usage[k] = total_usage.get(k, 0) + v - else: - total_usage[k] = v - for msg in result.get("output", []): - if msg.get("type") == "computer_call": - pending_computer_call_ids.add(msg["call_id"]) - elif msg.get("type") == "computer_call_output": - pending_computer_call_ids.discard(msg["call_id"]) - # exit if no pending computer calls - if not pending_computer_call_ids: - break - except Exception as e: - logger.error(f"Error running agent: {str(e)}") - logger.error(traceback.format_exc()) - error = str(e) - - # Build response payload - payload = { - "model": model, - "error": error, - "output": total_output, - "usage": total_usage, - "status": "completed" if not error else "failed", - } - - # CORS: allow any origin - headers = { - "Cache-Control": "no-cache", - "Connection": "keep-alive", - } - - return JSONResponse(content=payload, headers=headers) - - -@app.post("/playwright_exec") -async def playwright_exec_endpoint( - request: Request, - container_name: Optional[str] = Header(None, alias="X-Container-Name"), - api_key: Optional[str] = Header(None, alias="X-API-Key"), -): - """ - Execute Playwright browser commands. - - Headers: - - X-Container-Name: Container name for cloud authentication - - X-API-Key: API key for cloud authentication - - Body: - { - "command": "visit_url|click|type|scroll|web_search", - "params": {...} - } - """ - # Parse request body - try: - body = await request.json() - command = body.get("command") - params = body.get("params", {}) - except Exception as e: - raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}") - - if not command: - raise HTTPException(status_code=400, detail="Command is required") - - # Check if CONTAINER_NAME is set (indicating cloud provider) - server_container_name = os.environ.get("CONTAINER_NAME") - - # If cloud provider, perform authentication - if server_container_name: - logger.info( - f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication..." - ) - - # Validate required headers - if not container_name: - raise HTTPException(status_code=401, detail="Container name required") - - if not api_key: - raise HTTPException(status_code=401, detail="API key required") - - # Validate with AuthenticationManager - is_authenticated = await auth_manager.auth(container_name, api_key) - if not is_authenticated: - raise HTTPException(status_code=401, detail="Authentication failed") - - # Get browser manager and execute command - try: - browser_manager = get_browser_manager() - result = await browser_manager.execute_command(command, params) - - if result.get("success"): - return JSONResponse(content=result) - else: - raise HTTPException(status_code=400, detail=result.get("error", "Command failed")) - except Exception as e: - logger.error(f"Error executing playwright command: {str(e)}") - logger.error(traceback.format_exc()) - raise HTTPException(status_code=500, detail=str(e)) - - -if __name__ == "__main__": - uvicorn.run(app, host="0.0.0.0", port=8000) From 8f297eac3ce0b113f4a8698f37019265e2cd98b8 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 3 Dec 2025 09:00:20 -0800 Subject: [PATCH 4/8] Migrate browser interface into computer SDK --- examples/browser_tool_example.py | 3 ++- libs/python/computer/computer/computer.py | 29 +++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/examples/browser_tool_example.py b/examples/browser_tool_example.py index 11a8dead..70055ff9 100644 --- a/examples/browser_tool_example.py +++ b/examples/browser_tool_example.py @@ -39,7 +39,8 @@ async def test_browser_tool(): # Initialize the computer interface # For local testing, use provider_type="docker" # For provider_type="cloud", provide name and api_key - computer = Computer(provider_type="docker") + computer = Computer(provider_type="docker", os_type="linux", image="cua-xfce:dev") + await computer.run() # Initialize the browser tool with the computer interface browser = BrowserTool(interface=computer) diff --git a/libs/python/computer/computer/computer.py b/libs/python/computer/computer/computer.py index 0b1cd509..710b08a2 100644 --- a/libs/python/computer/computer/computer.py +++ b/libs/python/computer/computer/computer.py @@ -953,6 +953,35 @@ class Computer: """ return await self.interface.to_screenshot_coordinates(x, y) + async def playwright_exec(self, command: str, params: Optional[Dict] = None) -> Dict[str, Any]: + """ + Execute a Playwright browser command. + + Args: + command: The browser command to execute (visit_url, click, type, scroll, web_search) + params: Command parameters + + Returns: + Dict containing the command result + + Examples: + # Navigate to a URL + await computer.playwright_exec("visit_url", {"url": "https://example.com"}) + + # Click at coordinates + await computer.playwright_exec("click", {"x": 100, "y": 200}) + + # Type text + await computer.playwright_exec("type", {"text": "Hello, world!"}) + + # Scroll + await computer.playwright_exec("scroll", {"delta_x": 0, "delta_y": -100}) + + # Web search + await computer.playwright_exec("web_search", {"query": "computer use agent"}) + """ + return await self.interface.playwright_exec(command, params) + # Add virtual environment management functions to computer interface async def venv_install(self, venv_name: str, requirements: list[str]): """Install packages in a virtual environment. From 3f0ed2c203090f6c5613b725073a57a46eb93a9a Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 3 Dec 2025 09:01:07 -0800 Subject: [PATCH 5/8] Fix Dockerfile.dev, make browser manager robust to the browser closing --- .../computer_server/browser.py | 260 ++++++++++-------- libs/xfce/Development.md | 8 + libs/xfce/Dockerfile.dev | 12 +- 3 files changed, 153 insertions(+), 127 deletions(-) diff --git a/libs/python/computer-server/computer_server/browser.py b/libs/python/computer-server/computer_server/browser.py index 3d0a4c69..bd0f3f7e 100644 --- a/libs/python/computer-server/computer_server/browser.py +++ b/libs/python/computer-server/computer_server/browser.py @@ -9,7 +9,7 @@ import os from typing import Any, Dict, Optional try: - from playwright.async_api import async_playwright, Browser, BrowserContext, Page + from playwright.async_api import Browser, BrowserContext, Page, async_playwright except ImportError: async_playwright = None Browser = None @@ -122,14 +122,57 @@ class BrowserManager: except Exception as e: logger.error(f"Failed to initialize browser: {e}") import traceback + logger.error(traceback.format_exc()) # Don't raise - return error in execute_command instead self._initialization_error = str(e) raise + async def _execute_command_impl(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]: + """Internal implementation of command execution.""" + if cmd == "visit_url": + url = params.get("url") + if not url: + return {"success": False, "error": "url parameter is required"} + await self.page.goto(url, wait_until="domcontentloaded", timeout=30000) + return {"success": True, "url": self.page.url} + + elif cmd == "click": + x = params.get("x") + y = params.get("y") + if x is None or y is None: + return {"success": False, "error": "x and y parameters are required"} + await self.page.mouse.click(x, y) + return {"success": True} + + elif cmd == "type": + text = params.get("text") + if text is None: + return {"success": False, "error": "text parameter is required"} + await self.page.keyboard.type(text) + return {"success": True} + + elif cmd == "scroll": + delta_x = params.get("delta_x", 0) + delta_y = params.get("delta_y", 0) + await self.page.mouse.wheel(delta_x, delta_y) + return {"success": True} + + elif cmd == "web_search": + query = params.get("query") + if not query: + return {"success": False, "error": "query parameter is required"} + # Navigate to Google search + search_url = f"https://www.google.com/search?q={query}" + await self.page.goto(search_url, wait_until="domcontentloaded", timeout=30000) + return {"success": True, "url": self.page.url} + + else: + return {"success": False, "error": f"Unknown command: {cmd}"} + async def execute_command(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]: """ - Execute a browser command. + Execute a browser command with automatic recovery. Args: cmd: Command name (visit_url, click, type, scroll, web_search) @@ -138,57 +181,54 @@ class BrowserManager: Returns: Result dictionary with success status and any data """ - try: - await self._ensure_initialized() - except Exception as e: - error_msg = getattr(self, '_initialization_error', None) or str(e) - logger.error(f"Browser initialization failed: {error_msg}") - return { - "success": False, - "error": f"Browser initialization failed: {error_msg}. " - f"Make sure Playwright and Firefox are installed, and DISPLAY is set correctly." - } - - # Ensure browser is still initialized (in case it was manually closed) - # This will automatically reinitialize if the browser was closed - await self._ensure_initialized() - - # Check if page is still valid - page_valid = False - try: - if self.page is not None: - # Try to access page.url to check if it's still valid - _ = self.page.url - page_valid = True - except Exception as e: - logger.warning(f"Page is invalid: {e}, will get a new page...") - self.page = None - - # Get a valid page if we don't have one - if not page_valid or self.page is None: + max_retries = 2 + for attempt in range(max_retries): try: - pages = self.context.pages - if pages: - # Find first non-closed page - for p in pages: - try: - if not p.is_closed(): - self.page = p - logger.info("Reusing existing open page") - page_valid = True - break - except Exception: - continue - - # If no valid page found, create a new one - if not page_valid: - self.page = await self.context.new_page() - logger.info("Created new page") + await self._ensure_initialized() except Exception as e: - logger.error(f"Failed to get new page: {e}, browser may be closed") - # Browser was closed - reinitialize it + error_msg = getattr(self, "_initialization_error", None) or str(e) + logger.error(f"Browser initialization failed: {error_msg}") + return { + "success": False, + "error": f"Browser initialization failed: {error_msg}. " + f"Make sure Playwright and Firefox are installed, and DISPLAY is set correctly.", + } + + # Check if page is still valid and get a new one if needed + page_valid = False + try: + if self.page is not None and not self.page.is_closed(): + # Try to access page.url to check if it's still valid + _ = self.page.url + page_valid = True + except Exception as e: + logger.warning(f"Page is invalid: {e}, will get a new page...") + self.page = None + + # Get a valid page if we don't have one + if not page_valid or self.page is None: try: - logger.info("Browser was closed manually, reinitializing...") + if self.context: + pages = self.context.pages + if pages: + # Find first non-closed page + for p in pages: + try: + if not p.is_closed(): + self.page = p + logger.info("Reusing existing open page") + page_valid = True + break + except Exception: + continue + + # If no valid page found, create a new one + if not page_valid: + self.page = await self.context.new_page() + logger.info("Created new page") + except Exception as e: + logger.error(f"Failed to get new page: {e}, browser may be closed") + # Browser was closed - force reinitialization self._initialized = False self.context = None self.page = None @@ -198,80 +238,59 @@ class BrowserManager: except Exception: pass self.playwright = None - - # Reinitialize - await self._ensure_initialized() - # Get or create a page - pages = self.context.pages - if pages: - self.page = pages[0] + + # If this isn't the last attempt, continue to retry + if attempt < max_retries - 1: + logger.info("Browser was closed, retrying with fresh initialization...") + continue else: - self.page = await self.context.new_page() - logger.info("Browser reopened successfully after manual closure") - except Exception as reinit_error: - logger.error(f"Failed to reinitialize browser: {reinit_error}") + return { + "success": False, + "error": f"Browser was closed and cannot be recovered: {e}", + } + + # Try to execute the command + try: + return await self._execute_command_impl(cmd, params) + except Exception as e: + error_str = str(e) + logger.error(f"Error executing command {cmd}: {e}") + + # Check if this is a "browser/page/context closed" error + if any(keyword in error_str.lower() for keyword in ["closed", "target", "context"]): + logger.warning( + f"Browser/page was closed during command execution (attempt {attempt + 1}/{max_retries})" + ) + + # Force reinitialization + self._initialized = False + self.context = None + self.page = None + if self.playwright: + try: + await self.playwright.stop() + except Exception: + pass + self.playwright = None + + # If this isn't the last attempt, retry + if attempt < max_retries - 1: + logger.info("Retrying command after browser reinitialization...") + continue + else: + return { + "success": False, + "error": f"Command failed after {max_retries} attempts: {error_str}", + } + else: + # Not a browser closed error, return immediately import traceback + logger.error(traceback.format_exc()) - return {"success": False, "error": f"Browser was closed and cannot be recovered: {reinit_error}"} + return {"success": False, "error": error_str} - try: - if cmd == "visit_url": - url = params.get("url") - if not url: - return {"success": False, "error": "url parameter is required"} - await self.page.goto(url, wait_until="domcontentloaded", timeout=30000) - return {"success": True, "url": self.page.url} - - elif cmd == "click": - x = params.get("x") - y = params.get("y") - if x is None or y is None: - return {"success": False, "error": "x and y parameters are required"} - await self.page.mouse.click(x, y) - return {"success": True} - - elif cmd == "type": - text = params.get("text") - if text is None: - return {"success": False, "error": "text parameter is required"} - await self.page.keyboard.type(text) - return {"success": True} - - elif cmd == "scroll": - delta_x = params.get("delta_x", 0) - delta_y = params.get("delta_y", 0) - await self.page.mouse.wheel(delta_x, delta_y) - return {"success": True} - - elif cmd == "web_search": - query = params.get("query") - if not query: - return {"success": False, "error": "query parameter is required"} - # Navigate to Google search - search_url = f"https://www.google.com/search?q={query}" - await self.page.goto(search_url, wait_until="domcontentloaded", timeout=30000) - return {"success": True, "url": self.page.url} - - else: - return {"success": False, "error": f"Unknown command: {cmd}"} - - except Exception as e: - logger.error(f"Error executing command {cmd}: {e}") - import traceback - logger.error(traceback.format_exc()) - # If page was closed due to error, try to recover - if "closed" in str(e).lower() and self.context: - try: - pages = self.context.pages - if pages: - self.page = pages[0] - logger.info("Recovered page after error") - else: - self.page = await self.context.new_page() - logger.info("Created new page after error") - except Exception as recover_error: - logger.error(f"Failed to recover page: {recover_error}") - return {"success": False, "error": str(e)} + # Should never reach here, but just in case + return {"success": False, "error": "Command failed after all retries"} async def close(self): """Close the browser and cleanup resources.""" @@ -305,4 +324,3 @@ def get_browser_manager() -> BrowserManager: if _browser_manager is None: _browser_manager = BrowserManager() return _browser_manager - diff --git a/libs/xfce/Development.md b/libs/xfce/Development.md index 4bc12c81..b67199e8 100644 --- a/libs/xfce/Development.md +++ b/libs/xfce/Development.md @@ -11,6 +11,14 @@ docker build -f Dockerfile.dev -t cua-xfce:dev .. The build context is set to the parent directory to allow copying the local `computer-server` source. +## Tagging the Image + +To tag the dev image as latest: + +```bash +docker tag cua-xfce:dev cua-xfce:latest +``` + ## Running the Development Container ```bash diff --git a/libs/xfce/Dockerfile.dev b/libs/xfce/Dockerfile.dev index 12036a0c..c24efaf9 100644 --- a/libs/xfce/Dockerfile.dev +++ b/libs/xfce/Dockerfile.dev @@ -105,7 +105,7 @@ RUN mkdir -p /home/cua/.cache && \ chown -R cua:cua /home/cua/.cache # Copy local computer-server source and install it -COPY ../python/computer-server /tmp/computer-server +COPY python/computer-server /tmp/computer-server RUN python3.12 -m pip install /tmp/computer-server && \ rm -rf /tmp/computer-server @@ -117,8 +117,8 @@ RUN python3.12 -m pip install playwright && \ RUN chown -R cua:cua /home/cua/.cache # Copy startup scripts -COPY src/supervisor/ /etc/supervisor/conf.d/ -COPY src/scripts/ /usr/local/bin/ +COPY xfce/src/supervisor/ /etc/supervisor/conf.d/ +COPY xfce/src/scripts/ /usr/local/bin/ # Make scripts executable RUN chmod +x /usr/local/bin/*.sh @@ -135,9 +135,9 @@ RUN mkdir -p $HOME/.vnc RUN mkdir -p $HOME/.config/xfce4/xfconf/xfce-perchannel-xml $HOME/.config/xfce4 $HOME/.config/autostart # Copy XFCE config to disable browser launching and welcome screens -COPY --chown=cua:cua src/xfce-config/helpers.rc $HOME/.config/xfce4/helpers.rc -COPY --chown=cua:cua src/xfce-config/xfce4-session.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-session.xml -COPY --chown=cua:cua src/xfce-config/xfce4-power-manager.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-power-manager.xml +COPY --chown=cua:cua xfce/src/xfce-config/helpers.rc $HOME/.config/xfce4/helpers.rc +COPY --chown=cua:cua xfce/src/xfce-config/xfce4-session.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-session.xml +COPY --chown=cua:cua xfce/src/xfce-config/xfce4-power-manager.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-power-manager.xml # Disable autostart for screensaver, lock screen, and power manager RUN echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-tips-autostart.desktop && \ From 907fff475e34bf453887655335d42dbad0b191da Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 3 Dec 2025 09:28:03 -0800 Subject: [PATCH 6/8] Patch anti-bot-detection/stealth-mode into playwright browser --- examples/browser_tool_example.py | 24 +++++++++++++ libs/python/agent/agent/tools/browser_tool.py | 19 ++++++++++ .../computer_server/browser.py | 35 +++++++++++++++++++ libs/xfce/Dockerfile | 2 +- libs/xfce/Dockerfile.dev | 2 +- 5 files changed, 80 insertions(+), 2 deletions(-) diff --git a/examples/browser_tool_example.py b/examples/browser_tool_example.py index 70055ff9..e5767f27 100644 --- a/examples/browser_tool_example.py +++ b/examples/browser_tool_example.py @@ -48,6 +48,14 @@ async def test_browser_tool(): logger.info("Testing Browser Tool...") try: + # Test 0: Take a screenshot (pre-init) + logger.info("Test 0: Taking a screenshot...") + screenshot_bytes = await browser.screenshot() + screenshot_path = Path(__file__).parent / "browser_screenshot_init.png" + with open(screenshot_path, "wb") as f: + f.write(screenshot_bytes) + logger.info(f"Screenshot captured: {len(screenshot_bytes)} bytes") + # Test 1: Visit a URL logger.info("Test 1: Visiting a URL...") result = await browser.visit_url("https://www.trycua.com") @@ -56,6 +64,22 @@ async def test_browser_tool(): # Wait a bit for the page to load await asyncio.sleep(2) + # Test 2: Take a screenshot + logger.info("Test 2: Taking a screenshot...") + screenshot_bytes = await browser.screenshot() + screenshot_path = Path(__file__).parent / "browser_screenshot.png" + with open(screenshot_path, "wb") as f: + f.write(screenshot_bytes) + logger.info(f"Screenshot captured: {len(screenshot_bytes)} bytes") + + # Wait a bit + await asyncio.sleep(1) + + # Test 3: Visit bot detector + logger.info("Test 3: Visiting bot detector...") + result = await browser.visit_url("https://bot-detector.rebrowser.net/") + logger.info(f"Visit URL result: {result}") + # Test 2: Web search logger.info("Test 2: Performing a web search...") result = await browser.web_search("Python programming") diff --git a/libs/python/agent/agent/tools/browser_tool.py b/libs/python/agent/agent/tools/browser_tool.py index 85b6ba23..a1bf2090 100644 --- a/libs/python/agent/agent/tools/browser_tool.py +++ b/libs/python/agent/agent/tools/browser_tool.py @@ -114,3 +114,22 @@ class BrowserTool: Response dictionary with success status and current URL """ return await self._execute_command("web_search", {"query": query}) + + async def screenshot(self) -> bytes: + """ + Take a screenshot of the current browser page. + + Returns: + Screenshot image data as bytes (PNG format) + """ + import base64 + + result = await self._execute_command("screenshot", {}) + if result.get("success") and result.get("screenshot"): + # Decode base64 screenshot to bytes + screenshot_b64 = result["screenshot"] + screenshot_bytes = base64.b64decode(screenshot_b64) + return screenshot_bytes + else: + error = result.get("error", "Unknown error") + raise RuntimeError(f"Failed to take screenshot: {error}") diff --git a/libs/python/computer-server/computer_server/browser.py b/libs/python/computer-server/computer_server/browser.py index bd0f3f7e..9789abf7 100644 --- a/libs/python/computer-server/computer_server/browser.py +++ b/libs/python/computer-server/computer_server/browser.py @@ -109,6 +109,33 @@ class BrowserManager: # Removed --kiosk to allow desktop visibility ) + # Add init script to make the browser less detectable + await self.context.add_init_script( + """const defaultGetter = Object.getOwnPropertyDescriptor( + Navigator.prototype, + "webdriver" + ).get; + defaultGetter.apply(navigator); + defaultGetter.toString(); + Object.defineProperty(Navigator.prototype, "webdriver", { + set: undefined, + enumerable: true, + configurable: true, + get: new Proxy(defaultGetter, { + apply: (target, thisArg, args) => { + Reflect.apply(target, thisArg, args); + return false; + }, + }), + }); + const patchedGetter = Object.getOwnPropertyDescriptor( + Navigator.prototype, + "webdriver" + ).get; + patchedGetter.apply(navigator); + patchedGetter.toString();""" + ) + # Get the first page or create one pages = self.context.pages if pages: @@ -167,6 +194,14 @@ class BrowserManager: await self.page.goto(search_url, wait_until="domcontentloaded", timeout=30000) return {"success": True, "url": self.page.url} + elif cmd == "screenshot": + # Take a screenshot and return as base64 + import base64 + + screenshot_bytes = await self.page.screenshot(type="png") + screenshot_b64 = base64.b64encode(screenshot_bytes).decode("utf-8") + return {"success": True, "screenshot": screenshot_b64} + else: return {"success": False, "error": f"Unknown command: {cmd}"} diff --git a/libs/xfce/Dockerfile b/libs/xfce/Dockerfile index f1605181..2b687940 100644 --- a/libs/xfce/Dockerfile +++ b/libs/xfce/Dockerfile @@ -108,7 +108,7 @@ RUN mkdir -p /home/cua/.cache && \ RUN python3.12 -m pip install cua-computer-server # Install playwright and Firefox dependencies -RUN python3.12 -m pip install playwright && \ +RUN python3.12 -m pip install rebrowser-playwright && \ python3.12 -m playwright install --with-deps firefox # Fix any cache files created by pip diff --git a/libs/xfce/Dockerfile.dev b/libs/xfce/Dockerfile.dev index c24efaf9..2eec430e 100644 --- a/libs/xfce/Dockerfile.dev +++ b/libs/xfce/Dockerfile.dev @@ -110,7 +110,7 @@ RUN python3.12 -m pip install /tmp/computer-server && \ rm -rf /tmp/computer-server # Install playwright and Firefox dependencies -RUN python3.12 -m pip install playwright && \ +RUN python3.12 -m pip install rebrowser-playwright && \ python3.12 -m playwright install --with-deps firefox # Fix any cache files created by pip From 6dcb1a1b9f4b159d88347211cc23bca1d26497c5 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 3 Dec 2025 09:28:59 -0800 Subject: [PATCH 7/8] Cleanup extra files --- examples/BROWSER_TOOL_README.md | 89 --------------------------------- 1 file changed, 89 deletions(-) delete mode 100644 examples/BROWSER_TOOL_README.md diff --git a/examples/BROWSER_TOOL_README.md b/examples/BROWSER_TOOL_README.md deleted file mode 100644 index f72971e8..00000000 --- a/examples/BROWSER_TOOL_README.md +++ /dev/null @@ -1,89 +0,0 @@ -# Browser Tool - -Browser automation tool that allows agents to control a Firefox browser programmatically via Playwright while keeping it visible on the XFCE desktop. - -## Quick Start - -### Using Docker (Recommended) - -```bash -# Build and run the container -cd libs/xfce -docker build -t cua-xfce . -docker run -d --name cua-xfce-test \ - -p 8000:8000 -p 5901:5901 -p 6901:6901 \ - -e DISPLAY=:1 \ - cua-xfce - -# View desktop: http://localhost:6901 -# Test the browser tool -python examples/browser_tool_example.py -``` - -### Local Testing - -```bash -# Install dependencies -pip install playwright -playwright install --with-deps firefox - -# Start server -python -m computer_server --port 8000 - -# Run test (in another terminal) -python examples/browser_tool_example.py -``` - -## Features - -- **Visible Browser**: Runs in non-headless mode so visual agents can see it -- **Auto-Recovery**: Automatically reopens browser if closed manually -- **Persistent Context**: Maintains cookies and sessions across commands -- **Fara/Magentic-One Interface**: Compatible with Microsoft agent interfaces -- **Computer SDK Integration**: Uses the Computer SDK's interface for unified control - -## Usage - -The BrowserTool uses the Computer SDK's interface to communicate with the server: - -```python -from computer import Computer -from agent.tools.browser_tool import BrowserTool - -# Initialize computer interface -computer = Computer(ip_address="localhost") - -# Create browser tool with the interface -browser = BrowserTool(interface=computer) - -# Use the browser -await browser.visit_url("https://www.example.com") -await browser.click(x=500, y=300) -await browser.type("Hello, world!") -``` - -## API Endpoint - -The browser tool is also accessible via the `/playwright_exec` endpoint: - -```bash -curl -X POST http://localhost:8000/playwright_exec \ - -H "Content-Type: application/json" \ - -d '{"command": "visit_url", "params": {"url": "https://www.example.com"}}' -``` - -## Available Commands - -- `visit_url(url)` - Navigate to a URL -- `click(x, y)` - Click at coordinates -- `type(text)` - Type text into focused element -- `scroll(delta_x, delta_y)` - Scroll the page -- `web_search(query)` - Navigate to Google search - -## Troubleshooting - -**Browser closes unexpectedly**: The tool automatically reopens the browser on the next command. - -**Connection errors**: Make sure the server is running (`curl http://localhost:8000/status`). - -**Playwright not found**: Install with `pip install playwright && playwright install --with-deps firefox`. From ec233e2e891ec6822505f327ab01d1cca7ca24bb Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 3 Dec 2025 09:33:42 -0800 Subject: [PATCH 8/8] Remove rebrowser-playwright --- libs/xfce/Dockerfile | 2 +- libs/xfce/Dockerfile.dev | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/xfce/Dockerfile b/libs/xfce/Dockerfile index 2b687940..f1605181 100644 --- a/libs/xfce/Dockerfile +++ b/libs/xfce/Dockerfile @@ -108,7 +108,7 @@ RUN mkdir -p /home/cua/.cache && \ RUN python3.12 -m pip install cua-computer-server # Install playwright and Firefox dependencies -RUN python3.12 -m pip install rebrowser-playwright && \ +RUN python3.12 -m pip install playwright && \ python3.12 -m playwright install --with-deps firefox # Fix any cache files created by pip diff --git a/libs/xfce/Dockerfile.dev b/libs/xfce/Dockerfile.dev index 2eec430e..c24efaf9 100644 --- a/libs/xfce/Dockerfile.dev +++ b/libs/xfce/Dockerfile.dev @@ -110,7 +110,7 @@ RUN python3.12 -m pip install /tmp/computer-server && \ rm -rf /tmp/computer-server # Install playwright and Firefox dependencies -RUN python3.12 -m pip install rebrowser-playwright && \ +RUN python3.12 -m pip install playwright && \ python3.12 -m playwright install --with-deps firefox # Fix any cache files created by pip