dd browser tool with Playwright for visible browser automation

Add browser tool with Playwright/Firefox support. Includes BrowserManager, /playwright_exec endpoint, BrowserTool client, and auto-recovery. Fixes Python version in startup script and adds Playwright to Docker build.
2026-01-01 11:00:31 -06:00 · 2025-12-02 12:53:25 -05:00
parent 5b37015fd2
commit 37c5be669b
12 changed files with 1861 additions and 1 deletions
--- a/examples/BROWSER_TOOL_README.md
+++ b/examples/BROWSER_TOOL_README.md
@@ -0,0 +1,69 @@
+# Browser Tool
+
+Browser automation tool that allows agents to control a Firefox browser programmatically via Playwright while keeping it visible on the XFCE desktop.
+
+## Quick Start
+
+### Using Docker (Recommended)
+
+```bash
+# Build and run the container
+cd libs/xfce
+docker build -t cua-xfce .
+docker run -d --name cua-xfce-test \
+  -p 8000:8000 -p 5901:5901 -p 6901:6901 \
+  -e DISPLAY=:1 \
+  cua-xfce
+
+# View desktop: http://localhost:6901
+# Test the browser tool
+python examples/browser_tool_example.py
+```
+
+### Local Testing
+
+```bash
+# Install dependencies
+pip install playwright
+playwright install --with-deps firefox
+
+# Start server
+python -m computer_server --port 8000
+
+# Run test (in another terminal)
+python examples/browser_tool_example.py
+```
+
+## Features
+
+- **Visible Browser**: Runs in non-headless mode so visual agents can see it
+- **Auto-Recovery**: Automatically reopens browser if closed manually
+- **Persistent Context**: Maintains cookies and sessions across commands
+- **Fara/Magentic-One Interface**: Compatible with Microsoft agent interfaces
+
+## API Endpoint
+
+The browser tool is accessible via the `/playwright_exec` endpoint:
+
+```bash
+curl -X POST http://localhost:8000/playwright_exec \
+  -H "Content-Type: application/json" \
+  -d '{"command": "visit_url", "params": {"url": "https://www.example.com"}}'
+```
+
+## Available Commands
+
+- `visit_url(url)` - Navigate to a URL
+- `click(x, y)` - Click at coordinates
+- `type(text)` - Type text into focused element
+- `scroll(delta_x, delta_y)` - Scroll the page
+- `web_search(query)` - Navigate to Google search
+
+## Troubleshooting
+
+**Browser closes unexpectedly**: The tool automatically reopens the browser on the next command.
+
+**Connection errors**: Make sure the server is running (`curl http://localhost:8000/status`).
+
+**Playwright not found**: Install with `pip install playwright && playwright install --with-deps firefox`.
+
--- a/examples/browser_tool_example.py
+++ b/examples/browser_tool_example.py
@@ -0,0 +1,96 @@
+"""
+Browser Tool Example
+
+Demonstrates how to use the BrowserTool to control a browser programmatically
+via the computer server. The browser runs visibly on the XFCE desktop so visual
+agents can see it.
+
+Prerequisites:
+    - Computer server running (Docker container or local)
+    - For Docker: Container should be running with browser tool support
+    - For local: Playwright and Firefox must be installed
+
+Usage:
+    python examples/browser_tool_example.py
+"""
+
+import asyncio
+import logging
+import sys
+from pathlib import Path
+
+# Import BrowserTool directly from the file
+browser_tool_path = Path(__file__).parent.parent / "libs" / "python" / "agent" / "agent" / "tools" / "browser_tool.py"
+sys.path.insert(0, str(browser_tool_path.parent.parent.parent))
+
+# Import the module directly
+import importlib.util
+spec = importlib.util.spec_from_file_location("browser_tool", browser_tool_path)
+if spec is None or spec.loader is None:
+    raise ImportError(f"Could not load browser_tool from {browser_tool_path}")
+browser_tool_module = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(browser_tool_module)
+BrowserTool = browser_tool_module.BrowserTool
+
+# Configure logging to see what's happening
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+async def test_browser_tool():
+    """Test the BrowserTool with various commands."""
+    
+    # Initialize the browser tool
+    # For local testing, use http://localhost:8000
+    # For cloud, provide base_url, api_key, and container_name
+    browser = BrowserTool(base_url="http://localhost:8000")
+    
+    logger.info("Testing Browser Tool...")
+    
+    try:
+        # Test 1: Visit a URL
+        logger.info("Test 1: Visiting a URL...")
+        result = await browser.visit_url("https://www.trycua.com")
+        logger.info(f"Visit URL result: {result}")
+        
+        # Wait a bit for the page to load
+        await asyncio.sleep(2)
+        
+        # Test 2: Web search
+        logger.info("Test 2: Performing a web search...")
+        result = await browser.web_search("Python programming")
+        logger.info(f"Web search result: {result}")
+        
+        # Wait a bit
+        await asyncio.sleep(2)
+        
+        # Test 3: Scroll
+        logger.info("Test 3: Scrolling the page...")
+        result = await browser.scroll(delta_x=0, delta_y=500)
+        logger.info(f"Scroll result: {result}")
+        
+        # Wait a bit
+        await asyncio.sleep(1)
+        
+        # Test 4: Click (example coordinates - adjust based on your screen)
+        logger.info("Test 4: Clicking at coordinates...")
+        result = await browser.click(x=500, y=300)
+        logger.info(f"Click result: {result}")
+        
+        # Wait a bit
+        await asyncio.sleep(1)
+        
+        # Test 5: Type text (if there's a focused input field)
+        logger.info("Test 5: Typing text...")
+        result = await browser.type("Hello from BrowserTool!")
+        logger.info(f"Type result: {result}")
+        
+        logger.info("All tests completed!")
+        
+    except Exception as e:
+        logger.error(f"Error during testing: {e}", exc_info=True)
+
+
+if __name__ == "__main__":
+    asyncio.run(test_browser_tool())
+
--- a/libs/python/agent/agent/tools/init.py
+++ b/libs/python/agent/agent/tools/init.py
@@ -0,0 +1,6 @@
+"""Tools for agent interactions."""
+
+from .browser_tool import BrowserTool
+
+__all__ = ["BrowserTool"]
+
--- a/libs/python/agent/agent/tools/browser_tool.py
+++ b/libs/python/agent/agent/tools/browser_tool.py
@@ -0,0 +1,143 @@
+"""
+Browser Tool for agent interactions.
+Allows agents to control a browser programmatically via Playwright.
+"""
+
+import logging
+from typing import Optional
+
+import aiohttp
+
+logger = logging.getLogger(__name__)
+
+
+class BrowserTool:
+    """
+    Browser tool that connects to the computer server's Playwright endpoint.
+    Implements the Fara/Magentic-One agent interface for browser control.
+    """
+
+    def __init__(
+        self,
+        base_url: str = "http://localhost:8000",
+        api_key: Optional[str] = None,
+        container_name: Optional[str] = None,
+    ):
+        """
+        Initialize the BrowserTool.
+
+        Args:
+            base_url: Base URL of the computer server (default: http://localhost:8000)
+            api_key: Optional API key for cloud authentication
+            container_name: Optional container name for cloud authentication
+        """
+        self.base_url = base_url.rstrip("/")
+        self.api_key = api_key
+        self.container_name = container_name
+        self.logger = logger
+
+    def _get_endpoint_url(self) -> str:
+        """Get the full URL for the playwright_exec endpoint."""
+        return f"{self.base_url}/playwright_exec"
+
+    def _get_headers(self) -> dict:
+        """Get headers for the HTTP request."""
+        headers = {"Content-Type": "application/json"}
+        if self.api_key:
+            headers["X-API-Key"] = self.api_key
+        if self.container_name:
+            headers["X-Container-Name"] = self.container_name
+        return headers
+
+    async def _execute_command(self, command: str, params: dict) -> dict:
+        """
+        Execute a browser command via HTTP POST.
+
+        Args:
+            command: Command name
+            params: Command parameters
+
+        Returns:
+            Response dictionary
+        """
+        url = self._get_endpoint_url()
+        payload = {"command": command, "params": params}
+        headers = self._get_headers()
+
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.post(url, json=payload, headers=headers) as response:
+                    if response.status == 200:
+                        return await response.json()
+                    else:
+                        error_text = await response.text()
+                        self.logger.error(
+                            f"Browser command failed with status {response.status}: {error_text}"
+                        )
+                        return {"success": False, "error": error_text}
+        except Exception as e:
+            self.logger.error(f"Error executing browser command: {e}")
+            return {"success": False, "error": str(e)}
+
+    async def visit_url(self, url: str) -> dict:
+        """
+        Navigate to a URL.
+
+        Args:
+            url: URL to visit
+
+        Returns:
+            Response dictionary with success status and current URL
+        """
+        return await self._execute_command("visit_url", {"url": url})
+
+    async def click(self, x: int, y: int) -> dict:
+        """
+        Click at coordinates.
+
+        Args:
+            x: X coordinate
+            y: Y coordinate
+
+        Returns:
+            Response dictionary with success status
+        """
+        return await self._execute_command("click", {"x": x, "y": y})
+
+    async def type(self, text: str) -> dict:
+        """
+        Type text into the focused element.
+
+        Args:
+            text: Text to type
+
+        Returns:
+            Response dictionary with success status
+        """
+        return await self._execute_command("type", {"text": text})
+
+    async def scroll(self, delta_x: int, delta_y: int) -> dict:
+        """
+        Scroll the page.
+
+        Args:
+            delta_x: Horizontal scroll delta
+            delta_y: Vertical scroll delta
+
+        Returns:
+            Response dictionary with success status
+        """
+        return await self._execute_command("scroll", {"delta_x": delta_x, "delta_y": delta_y})
+
+    async def web_search(self, query: str) -> dict:
+        """
+        Navigate to a Google search for the query.
+
+        Args:
+            query: Search query
+
+        Returns:
+            Response dictionary with success status and current URL
+        """
+        return await self._execute_command("web_search", {"query": query})
+
--- a/libs/python/computer-server/computer_server/browser.py
+++ b/libs/python/computer-server/computer_server/browser.py
@@ -0,0 +1,308 @@
+"""
+Browser manager using Playwright for programmatic browser control.
+This allows agents to control a browser that runs visibly on the XFCE desktop.
+"""
+
+import asyncio
+import logging
+import os
+from typing import Any, Dict, Optional
+
+try:
+    from playwright.async_api import async_playwright, Browser, BrowserContext, Page
+except ImportError:
+    async_playwright = None
+    Browser = None
+    BrowserContext = None
+    Page = None
+
+logger = logging.getLogger(__name__)
+
+
+class BrowserManager:
+    """
+    Manages a Playwright browser instance that runs visibly on the XFCE desktop.
+    Uses persistent context to maintain cookies and sessions.
+    """
+
+    def __init__(self):
+        """Initialize the BrowserManager."""
+        self.playwright = None
+        self.browser: Optional[Browser] = None
+        self.context: Optional[BrowserContext] = None
+        self.page: Optional[Page] = None
+        self._initialized = False
+        self._initialization_error: Optional[str] = None
+        self._lock = asyncio.Lock()
+
+    async def _ensure_initialized(self):
+        """Ensure the browser is initialized."""
+        # Check if browser was closed and needs reinitialization
+        if self._initialized:
+            try:
+                # Check if context is still valid by trying to access it
+                if self.context:
+                    # Try to get pages - this will raise if context is closed
+                    _ = self.context.pages
+                    # If we get here, context is still alive
+                    return
+                else:
+                    # Context was closed, need to reinitialize
+                    self._initialized = False
+                    logger.warning("Browser context was closed, will reinitialize...")
+            except Exception as e:
+                # Context is dead, need to reinitialize
+                logger.warning(f"Browser context is dead ({e}), will reinitialize...")
+                self._initialized = False
+                self.context = None
+                self.page = None
+                # Clean up playwright if it exists
+                if self.playwright:
+                    try:
+                        await self.playwright.stop()
+                    except Exception:
+                        pass
+                    self.playwright = None
+
+        async with self._lock:
+            # Double-check after acquiring lock (another thread might have initialized it)
+            if self._initialized:
+                try:
+                    if self.context:
+                        _ = self.context.pages
+                        return
+                except Exception:
+                    self._initialized = False
+                    self.context = None
+                    self.page = None
+                    if self.playwright:
+                        try:
+                            await self.playwright.stop()
+                        except Exception:
+                            pass
+                        self.playwright = None
+
+            if async_playwright is None:
+                raise RuntimeError(
+                    "playwright is not installed. Please install it with: pip install playwright && playwright install --with-deps firefox"
+                )
+
+            try:
+                # Get display from environment or default to :1
+                display = os.environ.get("DISPLAY", ":1")
+                logger.info(f"Initializing browser with DISPLAY={display}")
+
+                # Start playwright
+                self.playwright = await async_playwright().start()
+
+                # Launch Firefox with persistent context (keeps cookies/sessions)
+                # headless=False is CRITICAL so the visual agent can see it
+                user_data_dir = os.path.join(os.path.expanduser("~"), ".playwright-firefox")
+                os.makedirs(user_data_dir, exist_ok=True)
+
+                # launch_persistent_context returns a BrowserContext, not a Browser
+                # Note: Removed --kiosk mode so the desktop remains visible
+                self.context = await self.playwright.firefox.launch_persistent_context(
+                    user_data_dir=user_data_dir,
+                    headless=False,  # CRITICAL: visible for visual agent
+                    viewport={"width": 1024, "height": 768},
+                    # Removed --kiosk to allow desktop visibility
+                )
+
+                # Get the first page or create one
+                pages = self.context.pages
+                if pages:
+                    self.page = pages[0]
+                else:
+                    self.page = await self.context.new_page()
+
+                self._initialized = True
+                logger.info("Browser initialized successfully")
+
+            except Exception as e:
+                logger.error(f"Failed to initialize browser: {e}")
+                import traceback
+                logger.error(traceback.format_exc())
+                # Don't raise - return error in execute_command instead
+                self._initialization_error = str(e)
+                raise
+
+    async def execute_command(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Execute a browser command.
+
+        Args:
+            cmd: Command name (visit_url, click, type, scroll, web_search)
+            params: Command parameters
+
+        Returns:
+            Result dictionary with success status and any data
+        """
+        try:
+            await self._ensure_initialized()
+        except Exception as e:
+            error_msg = getattr(self, '_initialization_error', None) or str(e)
+            logger.error(f"Browser initialization failed: {error_msg}")
+            return {
+                "success": False,
+                "error": f"Browser initialization failed: {error_msg}. "
+                         f"Make sure Playwright and Firefox are installed, and DISPLAY is set correctly."
+            }
+
+        # Ensure browser is still initialized (in case it was manually closed)
+        # This will automatically reinitialize if the browser was closed
+        await self._ensure_initialized()
+        
+        # Check if page is still valid
+        page_valid = False
+        try:
+            if self.page is not None:
+                # Try to access page.url to check if it's still valid
+                _ = self.page.url
+                page_valid = True
+        except Exception as e:
+            logger.warning(f"Page is invalid: {e}, will get a new page...")
+            self.page = None
+        
+        # Get a valid page if we don't have one
+        if not page_valid or self.page is None:
+            try:
+                pages = self.context.pages
+                if pages:
+                    # Find first non-closed page
+                    for p in pages:
+                        try:
+                            if not p.is_closed():
+                                self.page = p
+                                logger.info("Reusing existing open page")
+                                page_valid = True
+                                break
+                        except Exception:
+                            continue
+                
+                # If no valid page found, create a new one
+                if not page_valid:
+                    self.page = await self.context.new_page()
+                    logger.info("Created new page")
+            except Exception as e:
+                logger.error(f"Failed to get new page: {e}, browser may be closed")
+                # Browser was closed - reinitialize it
+                try:
+                    logger.info("Browser was closed manually, reinitializing...")
+                    self._initialized = False
+                    self.context = None
+                    self.page = None
+                    if self.playwright:
+                        try:
+                            await self.playwright.stop()
+                        except Exception:
+                            pass
+                        self.playwright = None
+                    
+                    # Reinitialize
+                    await self._ensure_initialized()
+                    # Get or create a page
+                    pages = self.context.pages
+                    if pages:
+                        self.page = pages[0]
+                    else:
+                        self.page = await self.context.new_page()
+                    logger.info("Browser reopened successfully after manual closure")
+                except Exception as reinit_error:
+                    logger.error(f"Failed to reinitialize browser: {reinit_error}")
+                    import traceback
+                    logger.error(traceback.format_exc())
+                    return {"success": False, "error": f"Browser was closed and cannot be recovered: {reinit_error}"}
+
+        try:
+            if cmd == "visit_url":
+                url = params.get("url")
+                if not url:
+                    return {"success": False, "error": "url parameter is required"}
+                await self.page.goto(url, wait_until="domcontentloaded", timeout=30000)
+                return {"success": True, "url": self.page.url}
+
+            elif cmd == "click":
+                x = params.get("x")
+                y = params.get("y")
+                if x is None or y is None:
+                    return {"success": False, "error": "x and y parameters are required"}
+                await self.page.mouse.click(x, y)
+                return {"success": True}
+
+            elif cmd == "type":
+                text = params.get("text")
+                if text is None:
+                    return {"success": False, "error": "text parameter is required"}
+                await self.page.keyboard.type(text)
+                return {"success": True}
+
+            elif cmd == "scroll":
+                delta_x = params.get("delta_x", 0)
+                delta_y = params.get("delta_y", 0)
+                await self.page.mouse.wheel(delta_x, delta_y)
+                return {"success": True}
+
+            elif cmd == "web_search":
+                query = params.get("query")
+                if not query:
+                    return {"success": False, "error": "query parameter is required"}
+                # Navigate to Google search
+                search_url = f"https://www.google.com/search?q={query}"
+                await self.page.goto(search_url, wait_until="domcontentloaded", timeout=30000)
+                return {"success": True, "url": self.page.url}
+
+            else:
+                return {"success": False, "error": f"Unknown command: {cmd}"}
+
+        except Exception as e:
+            logger.error(f"Error executing command {cmd}: {e}")
+            import traceback
+            logger.error(traceback.format_exc())
+            # If page was closed due to error, try to recover
+            if "closed" in str(e).lower() and self.context:
+                try:
+                    pages = self.context.pages
+                    if pages:
+                        self.page = pages[0]
+                        logger.info("Recovered page after error")
+                    else:
+                        self.page = await self.context.new_page()
+                        logger.info("Created new page after error")
+                except Exception as recover_error:
+                    logger.error(f"Failed to recover page: {recover_error}")
+            return {"success": False, "error": str(e)}
+
+    async def close(self):
+        """Close the browser and cleanup resources."""
+        async with self._lock:
+            try:
+                if self.context:
+                    await self.context.close()
+                    self.context = None
+                if self.browser:
+                    await self.browser.close()
+                    self.browser = None
+
+                if self.playwright:
+                    await self.playwright.stop()
+                    self.playwright = None
+
+                self.page = None
+                self._initialized = False
+                logger.info("Browser closed successfully")
+            except Exception as e:
+                logger.error(f"Error closing browser: {e}")
+
+
+# Global instance
+_browser_manager: Optional[BrowserManager] = None
+
+
+def get_browser_manager() -> BrowserManager:
+    """Get or create the global BrowserManager instance."""
+    global _browser_manager
+    if _browser_manager is None:
+        _browser_manager = BrowserManager()
+    return _browser_manager
+
--- a/libs/python/computer-server/computer_server/main.py
+++ b/libs/python/computer-server/computer_server/main.py
@@ -25,6 +25,7 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse

 from .handlers.factory import HandlerFactory
+from .browser import get_browser_manager

 # Authentication session TTL (in seconds). Override via env var CUA_AUTH_TTL_SECONDS. Default: 60s
 AUTH_SESSION_TTL_SECONDS: int = int(os.environ.get("CUA_AUTH_TTL_SECONDS", "60"))
@@ -749,5 +750,71 @@ async def agent_response_endpoint(
    return JSONResponse(content=payload, headers=headers)


+@app.post("/playwright_exec")
+async def playwright_exec_endpoint(
+    request: Request,
+    container_name: Optional[str] = Header(None, alias="X-Container-Name"),
+    api_key: Optional[str] = Header(None, alias="X-API-Key"),
+):
+    """
+    Execute Playwright browser commands.
+
+    Headers:
+    - X-Container-Name: Container name for cloud authentication
+    - X-API-Key: API key for cloud authentication
+
+    Body:
+    {
+        "command": "visit_url|click|type|scroll|web_search",
+        "params": {...}
+    }
+    """
+    # Parse request body
+    try:
+        body = await request.json()
+        command = body.get("command")
+        params = body.get("params", {})
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")
+
+    if not command:
+        raise HTTPException(status_code=400, detail="Command is required")
+
+    # Check if CONTAINER_NAME is set (indicating cloud provider)
+    server_container_name = os.environ.get("CONTAINER_NAME")
+
+    # If cloud provider, perform authentication
+    if server_container_name:
+        logger.info(
+            f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication..."
+        )
+
+        # Validate required headers
+        if not container_name:
+            raise HTTPException(status_code=401, detail="Container name required")
+
+        if not api_key:
+            raise HTTPException(status_code=401, detail="API key required")
+
+        # Validate with AuthenticationManager
+        is_authenticated = await auth_manager.auth(container_name, api_key)
+        if not is_authenticated:
+            raise HTTPException(status_code=401, detail="Authentication failed")
+
+    # Get browser manager and execute command
+    try:
+        browser_manager = get_browser_manager()
+        result = await browser_manager.execute_command(command, params)
+        
+        if result.get("success"):
+            return JSONResponse(content=result)
+        else:
+            raise HTTPException(status_code=400, detail=result.get("error", "Command failed"))
+    except Exception as e:
+        logger.error(f"Error executing playwright command: {str(e)}")
+        logger.error(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=str(e))
+
+
 if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
--- a/libs/python/computer-server/pyproject.toml
+++ b/libs/python/computer-server/pyproject.toml
@@ -24,6 +24,7 @@ dependencies = [
    "pyperclip>=1.9.0",
    "websockets>=12.0",
    "pywinctl>=0.4.1",
+    "playwright>=1.40.0",
    # OS-specific runtime deps
    "pyobjc-framework-Cocoa>=10.1; sys_platform == 'darwin'",
    "pyobjc-framework-Quartz>=10.1; sys_platform == 'darwin'",
--- a/libs/xfce/Dockerfile
+++ b/libs/xfce/Dockerfile
@@ -107,6 +107,16 @@ RUN mkdir -p /home/cua/.cache && \
 # Install computer-server using Python 3.12 pip
 RUN python3.12 -m pip install cua-computer-server

+# Copy browser.py and updated main.py from local source (to include browser tool)
+# These files need to be in the same directory as the Dockerfile when building
+COPY browser.py /tmp/browser.py
+COPY main.py /tmp/main.py
+RUN python3.12 -c "import shutil; import os; cs_dir = '/usr/local/lib/python3.12/dist-packages/computer_server'; shutil.copy('/tmp/browser.py', f'{cs_dir}/browser.py'); shutil.copy('/tmp/main.py', f'{cs_dir}/main.py'); print('Copied browser.py and main.py')" && rm /tmp/browser.py /tmp/main.py
+
+# Install playwright and Firefox dependencies
+RUN python3.12 -m pip install playwright && \
+    python3.12 -m playwright install --with-deps firefox
+
 # Fix any cache files created by pip
 RUN chown -R cua:cua /home/cua/.cache

--- a/libs/xfce/README_BUILD.md
+++ b/libs/xfce/README_BUILD.md
@@ -0,0 +1,32 @@
+# Building the XFCE Docker Image
+
+## Required Files for Build
+
+The Dockerfile requires these files to be present in the `libs/xfce/` directory:
+
+- `browser.py` - Copy from `libs/python/computer-server/computer_server/browser.py`
+- `main.py` - Copy from `libs/python/computer-server/computer_server/main.py`
+
+These files are copied into the container to include the browser tool functionality
+that isn't yet in the published PyPI package.
+
+## Before Building
+
+```bash
+# Copy the latest browser tool files
+cp libs/python/computer-server/computer_server/browser.py libs/xfce/
+cp libs/python/computer-server/computer_server/main.py libs/xfce/
+```
+
+## Build Command
+
+```bash
+cd libs/xfce
+docker build -t cua-xfce .
+```
+
+## Note
+
+Once the browser tool is included in the published `cua-computer-server` package,
+these temporary file copies can be removed and the Dockerfile can be simplified.
+
--- a/libs/xfce/browser.py
+++ b/libs/xfce/browser.py
@@ -0,0 +1,308 @@
+"""
+Browser manager using Playwright for programmatic browser control.
+This allows agents to control a browser that runs visibly on the XFCE desktop.
+"""
+
+import asyncio
+import logging
+import os
+from typing import Any, Dict, Optional
+
+try:
+    from playwright.async_api import async_playwright, Browser, BrowserContext, Page
+except ImportError:
+    async_playwright = None
+    Browser = None
+    BrowserContext = None
+    Page = None
+
+logger = logging.getLogger(__name__)
+
+
+class BrowserManager:
+    """
+    Manages a Playwright browser instance that runs visibly on the XFCE desktop.
+    Uses persistent context to maintain cookies and sessions.
+    """
+
+    def __init__(self):
+        """Initialize the BrowserManager."""
+        self.playwright = None
+        self.browser: Optional[Browser] = None
+        self.context: Optional[BrowserContext] = None
+        self.page: Optional[Page] = None
+        self._initialized = False
+        self._initialization_error: Optional[str] = None
+        self._lock = asyncio.Lock()
+
+    async def _ensure_initialized(self):
+        """Ensure the browser is initialized."""
+        # Check if browser was closed and needs reinitialization
+        if self._initialized:
+            try:
+                # Check if context is still valid by trying to access it
+                if self.context:
+                    # Try to get pages - this will raise if context is closed
+                    _ = self.context.pages
+                    # If we get here, context is still alive
+                    return
+                else:
+                    # Context was closed, need to reinitialize
+                    self._initialized = False
+                    logger.warning("Browser context was closed, will reinitialize...")
+            except Exception as e:
+                # Context is dead, need to reinitialize
+                logger.warning(f"Browser context is dead ({e}), will reinitialize...")
+                self._initialized = False
+                self.context = None
+                self.page = None
+                # Clean up playwright if it exists
+                if self.playwright:
+                    try:
+                        await self.playwright.stop()
+                    except Exception:
+                        pass
+                    self.playwright = None
+
+        async with self._lock:
+            # Double-check after acquiring lock (another thread might have initialized it)
+            if self._initialized:
+                try:
+                    if self.context:
+                        _ = self.context.pages
+                        return
+                except Exception:
+                    self._initialized = False
+                    self.context = None
+                    self.page = None
+                    if self.playwright:
+                        try:
+                            await self.playwright.stop()
+                        except Exception:
+                            pass
+                        self.playwright = None
+
+            if async_playwright is None:
+                raise RuntimeError(
+                    "playwright is not installed. Please install it with: pip install playwright && playwright install --with-deps firefox"
+                )
+
+            try:
+                # Get display from environment or default to :1
+                display = os.environ.get("DISPLAY", ":1")
+                logger.info(f"Initializing browser with DISPLAY={display}")
+
+                # Start playwright
+                self.playwright = await async_playwright().start()
+
+                # Launch Firefox with persistent context (keeps cookies/sessions)
+                # headless=False is CRITICAL so the visual agent can see it
+                user_data_dir = os.path.join(os.path.expanduser("~"), ".playwright-firefox")
+                os.makedirs(user_data_dir, exist_ok=True)
+
+                # launch_persistent_context returns a BrowserContext, not a Browser
+                # Note: Removed --kiosk mode so the desktop remains visible
+                self.context = await self.playwright.firefox.launch_persistent_context(
+                    user_data_dir=user_data_dir,
+                    headless=False,  # CRITICAL: visible for visual agent
+                    viewport={"width": 1024, "height": 768},
+                    # Removed --kiosk to allow desktop visibility
+                )
+
+                # Get the first page or create one
+                pages = self.context.pages
+                if pages:
+                    self.page = pages[0]
+                else:
+                    self.page = await self.context.new_page()
+
+                self._initialized = True
+                logger.info("Browser initialized successfully")
+
+            except Exception as e:
+                logger.error(f"Failed to initialize browser: {e}")
+                import traceback
+                logger.error(traceback.format_exc())
+                # Don't raise - return error in execute_command instead
+                self._initialization_error = str(e)
+                raise
+
+    async def execute_command(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Execute a browser command.
+
+        Args:
+            cmd: Command name (visit_url, click, type, scroll, web_search)
+            params: Command parameters
+
+        Returns:
+            Result dictionary with success status and any data
+        """
+        try:
+            await self._ensure_initialized()
+        except Exception as e:
+            error_msg = getattr(self, '_initialization_error', None) or str(e)
+            logger.error(f"Browser initialization failed: {error_msg}")
+            return {
+                "success": False,
+                "error": f"Browser initialization failed: {error_msg}. "
+                         f"Make sure Playwright and Firefox are installed, and DISPLAY is set correctly."
+            }
+
+        # Ensure browser is still initialized (in case it was manually closed)
+        # This will automatically reinitialize if the browser was closed
+        await self._ensure_initialized()
+        
+        # Check if page is still valid
+        page_valid = False
+        try:
+            if self.page is not None:
+                # Try to access page.url to check if it's still valid
+                _ = self.page.url
+                page_valid = True
+        except Exception as e:
+            logger.warning(f"Page is invalid: {e}, will get a new page...")
+            self.page = None
+        
+        # Get a valid page if we don't have one
+        if not page_valid or self.page is None:
+            try:
+                pages = self.context.pages
+                if pages:
+                    # Find first non-closed page
+                    for p in pages:
+                        try:
+                            if not p.is_closed():
+                                self.page = p
+                                logger.info("Reusing existing open page")
+                                page_valid = True
+                                break
+                        except Exception:
+                            continue
+                
+                # If no valid page found, create a new one
+                if not page_valid:
+                    self.page = await self.context.new_page()
+                    logger.info("Created new page")
+            except Exception as e:
+                logger.error(f"Failed to get new page: {e}, browser may be closed")
+                # Browser was closed - reinitialize it
+                try:
+                    logger.info("Browser was closed manually, reinitializing...")
+                    self._initialized = False
+                    self.context = None
+                    self.page = None
+                    if self.playwright:
+                        try:
+                            await self.playwright.stop()
+                        except Exception:
+                            pass
+                        self.playwright = None
+                    
+                    # Reinitialize
+                    await self._ensure_initialized()
+                    # Get or create a page
+                    pages = self.context.pages
+                    if pages:
+                        self.page = pages[0]
+                    else:
+                        self.page = await self.context.new_page()
+                    logger.info("Browser reopened successfully after manual closure")
+                except Exception as reinit_error:
+                    logger.error(f"Failed to reinitialize browser: {reinit_error}")
+                    import traceback
+                    logger.error(traceback.format_exc())
+                    return {"success": False, "error": f"Browser was closed and cannot be recovered: {reinit_error}"}
+
+        try:
+            if cmd == "visit_url":
+                url = params.get("url")
+                if not url:
+                    return {"success": False, "error": "url parameter is required"}
+                await self.page.goto(url, wait_until="domcontentloaded", timeout=30000)
+                return {"success": True, "url": self.page.url}
+
+            elif cmd == "click":
+                x = params.get("x")
+                y = params.get("y")
+                if x is None or y is None:
+                    return {"success": False, "error": "x and y parameters are required"}
+                await self.page.mouse.click(x, y)
+                return {"success": True}
+
+            elif cmd == "type":
+                text = params.get("text")
+                if text is None:
+                    return {"success": False, "error": "text parameter is required"}
+                await self.page.keyboard.type(text)
+                return {"success": True}
+
+            elif cmd == "scroll":
+                delta_x = params.get("delta_x", 0)
+                delta_y = params.get("delta_y", 0)
+                await self.page.mouse.wheel(delta_x, delta_y)
+                return {"success": True}
+
+            elif cmd == "web_search":
+                query = params.get("query")
+                if not query:
+                    return {"success": False, "error": "query parameter is required"}
+                # Navigate to Google search
+                search_url = f"https://www.google.com/search?q={query}"
+                await self.page.goto(search_url, wait_until="domcontentloaded", timeout=30000)
+                return {"success": True, "url": self.page.url}
+
+            else:
+                return {"success": False, "error": f"Unknown command: {cmd}"}
+
+        except Exception as e:
+            logger.error(f"Error executing command {cmd}: {e}")
+            import traceback
+            logger.error(traceback.format_exc())
+            # If page was closed due to error, try to recover
+            if "closed" in str(e).lower() and self.context:
+                try:
+                    pages = self.context.pages
+                    if pages:
+                        self.page = pages[0]
+                        logger.info("Recovered page after error")
+                    else:
+                        self.page = await self.context.new_page()
+                        logger.info("Created new page after error")
+                except Exception as recover_error:
+                    logger.error(f"Failed to recover page: {recover_error}")
+            return {"success": False, "error": str(e)}
+
+    async def close(self):
+        """Close the browser and cleanup resources."""
+        async with self._lock:
+            try:
+                if self.context:
+                    await self.context.close()
+                    self.context = None
+                if self.browser:
+                    await self.browser.close()
+                    self.browser = None
+
+                if self.playwright:
+                    await self.playwright.stop()
+                    self.playwright = None
+
+                self.page = None
+                self._initialized = False
+                logger.info("Browser closed successfully")
+            except Exception as e:
+                logger.error(f"Error closing browser: {e}")
+
+
+# Global instance
+_browser_manager: Optional[BrowserManager] = None
+
+
+def get_browser_manager() -> BrowserManager:
+    """Get or create the global BrowserManager instance."""
+    global _browser_manager
+    if _browser_manager is None:
+        _browser_manager = BrowserManager()
+    return _browser_manager
+
--- a/libs/xfce/main.py
+++ b/libs/xfce/main.py
@@ -0,0 +1,820 @@
+import asyncio
+import hashlib
+import inspect
+import json
+import logging
+import os
+import platform
+import time
+import traceback
+from contextlib import redirect_stderr, redirect_stdout
+from io import StringIO
+from typing import Any, Dict, List, Literal, Optional, Union, cast
+
+import aiohttp
+import uvicorn
+from fastapi import (
+    FastAPI,
+    Header,
+    HTTPException,
+    Request,
+    WebSocket,
+    WebSocketDisconnect,
+)
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse, StreamingResponse
+
+from .handlers.factory import HandlerFactory
+from .browser import get_browser_manager
+
+# Authentication session TTL (in seconds). Override via env var CUA_AUTH_TTL_SECONDS. Default: 60s
+AUTH_SESSION_TTL_SECONDS: int = int(os.environ.get("CUA_AUTH_TTL_SECONDS", "60"))
+
+try:
+    from agent import ComputerAgent
+
+    HAS_AGENT = True
+except ImportError:
+    HAS_AGENT = False
+
+# Set up logging with more detail
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+# Configure WebSocket with larger message size
+WEBSOCKET_MAX_SIZE = 1024 * 1024 * 10  # 10MB limit
+
+# Configure application with WebSocket settings
+app = FastAPI(
+    title="Computer API",
+    description="API for the Computer project",
+    version="0.1.0",
+    websocket_max_size=WEBSOCKET_MAX_SIZE,
+)
+
+# CORS configuration
+origins = ["*"]
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+protocol_version = 1
+try:
+    from importlib.metadata import version
+
+    package_version = version("cua-computer-server")
+except Exception:
+    # Fallback for cases where package is not installed or importlib.metadata is not available
+    try:
+        import pkg_resources
+
+        package_version = pkg_resources.get_distribution("cua-computer-server").version
+    except Exception:
+        package_version = "unknown"
+
+(
+    accessibility_handler,
+    automation_handler,
+    diorama_handler,
+    file_handler,
+    desktop_handler,
+    window_handler,
+) = HandlerFactory.create_handlers()
+handlers = {
+    "version": lambda: {"protocol": protocol_version, "package": package_version},
+    # App-Use commands
+    "diorama_cmd": diorama_handler.diorama_cmd,
+    # Accessibility commands
+    "get_accessibility_tree": accessibility_handler.get_accessibility_tree,
+    "find_element": accessibility_handler.find_element,
+    # Shell commands
+    "run_command": automation_handler.run_command,
+    # File system commands
+    "file_exists": file_handler.file_exists,
+    "directory_exists": file_handler.directory_exists,
+    "list_dir": file_handler.list_dir,
+    "read_text": file_handler.read_text,
+    "write_text": file_handler.write_text,
+    "read_bytes": file_handler.read_bytes,
+    "write_bytes": file_handler.write_bytes,
+    "get_file_size": file_handler.get_file_size,
+    "delete_file": file_handler.delete_file,
+    "create_dir": file_handler.create_dir,
+    "delete_dir": file_handler.delete_dir,
+    # Desktop commands
+    "get_desktop_environment": desktop_handler.get_desktop_environment,
+    "set_wallpaper": desktop_handler.set_wallpaper,
+    # Window management
+    "open": window_handler.open,
+    "launch": window_handler.launch,
+    "get_current_window_id": window_handler.get_current_window_id,
+    "get_application_windows": window_handler.get_application_windows,
+    "get_window_name": window_handler.get_window_name,
+    "get_window_size": window_handler.get_window_size,
+    "get_window_position": window_handler.get_window_position,
+    "set_window_size": window_handler.set_window_size,
+    "set_window_position": window_handler.set_window_position,
+    "maximize_window": window_handler.maximize_window,
+    "minimize_window": window_handler.minimize_window,
+    "activate_window": window_handler.activate_window,
+    "close_window": window_handler.close_window,
+    # Mouse commands
+    "mouse_down": automation_handler.mouse_down,
+    "mouse_up": automation_handler.mouse_up,
+    "left_click": automation_handler.left_click,
+    "right_click": automation_handler.right_click,
+    "double_click": automation_handler.double_click,
+    "move_cursor": automation_handler.move_cursor,
+    "drag_to": automation_handler.drag_to,
+    "drag": automation_handler.drag,
+    # Keyboard commands
+    "key_down": automation_handler.key_down,
+    "key_up": automation_handler.key_up,
+    "type_text": automation_handler.type_text,
+    "press_key": automation_handler.press_key,
+    "hotkey": automation_handler.hotkey,
+    # Scrolling actions
+    "scroll": automation_handler.scroll,
+    "scroll_down": automation_handler.scroll_down,
+    "scroll_up": automation_handler.scroll_up,
+    # Screen actions
+    "screenshot": automation_handler.screenshot,
+    "get_cursor_position": automation_handler.get_cursor_position,
+    "get_screen_size": automation_handler.get_screen_size,
+    # Clipboard actions
+    "copy_to_clipboard": automation_handler.copy_to_clipboard,
+    "set_clipboard": automation_handler.set_clipboard,
+}
+
+
+class AuthenticationManager:
+    def __init__(self):
+        self.sessions: Dict[str, Dict[str, Any]] = {}
+        self.container_name = os.environ.get("CONTAINER_NAME")
+
+    def _hash_credentials(self, container_name: str, api_key: str) -> str:
+        """Create a hash of container name and API key for session identification"""
+        combined = f"{container_name}:{api_key}"
+        return hashlib.sha256(combined.encode()).hexdigest()
+
+    def _is_session_valid(self, session_data: Dict[str, Any]) -> bool:
+        """Check if a session is still valid based on expiration time"""
+        if not session_data.get("valid", False):
+            return False
+
+        expires_at = session_data.get("expires_at", 0)
+        return time.time() < expires_at
+
+    async def auth(self, container_name: str, api_key: str) -> bool:
+        """Authenticate container name and API key, using cached sessions when possible"""
+        # If no CONTAINER_NAME is set, always allow access (local development)
+        if not self.container_name:
+            logger.info(
+                "No CONTAINER_NAME set in environment. Allowing access (local development mode)"
+            )
+            return True
+
+        # Layer 1: VM Identity Verification
+        if container_name != self.container_name:
+            logger.warning(
+                f"VM name mismatch. Expected: {self.container_name}, Got: {container_name}"
+            )
+            return False
+
+        # Create hash for session lookup
+        session_hash = self._hash_credentials(container_name, api_key)
+
+        # Check if we have a valid cached session
+        if session_hash in self.sessions:
+            session_data = self.sessions[session_hash]
+            if self._is_session_valid(session_data):
+                logger.info(f"Using cached authentication for container: {container_name}")
+                return session_data["valid"]
+            else:
+                # Remove expired session
+                del self.sessions[session_hash]
+
+        # No valid cached session, authenticate with API
+        logger.info(f"Authenticating with TryCUA API for container: {container_name}")
+
+        try:
+            async with aiohttp.ClientSession() as session:
+                headers = {"Authorization": f"Bearer {api_key}"}
+
+                async with session.get(
+                    f"https://www.cua.ai/api/vm/auth?container_name={container_name}",
+                    headers=headers,
+                ) as resp:
+                    is_valid = resp.status == 200 and bool((await resp.text()).strip())
+
+                    # Cache the result with configurable expiration
+                    self.sessions[session_hash] = {
+                        "valid": is_valid,
+                        "expires_at": time.time() + AUTH_SESSION_TTL_SECONDS,
+                    }
+
+                    if is_valid:
+                        logger.info(f"Authentication successful for container: {container_name}")
+                    else:
+                        logger.warning(
+                            f"Authentication failed for container: {container_name}. Status: {resp.status}"
+                        )
+
+                    return is_valid
+
+        except aiohttp.ClientError as e:
+            logger.error(f"Failed to validate API key with TryCUA API: {str(e)}")
+            # Cache failed result to avoid repeated requests
+            self.sessions[session_hash] = {
+                "valid": False,
+                "expires_at": time.time() + AUTH_SESSION_TTL_SECONDS,
+            }
+            return False
+        except Exception as e:
+            logger.error(f"Unexpected error during authentication: {str(e)}")
+            # Cache failed result to avoid repeated requests
+            self.sessions[session_hash] = {
+                "valid": False,
+                "expires_at": time.time() + AUTH_SESSION_TTL_SECONDS,
+            }
+            return False
+
+
+class ConnectionManager:
+    def __init__(self):
+        self.active_connections: List[WebSocket] = []
+
+    async def connect(self, websocket: WebSocket):
+        await websocket.accept()
+        self.active_connections.append(websocket)
+
+    def disconnect(self, websocket: WebSocket):
+        self.active_connections.remove(websocket)
+
+
+manager = ConnectionManager()
+auth_manager = AuthenticationManager()
+
+
+@app.get("/status")
+async def status():
+    sys = platform.system().lower()
+    # get os type
+    if "darwin" in sys or sys == "macos" or sys == "mac":
+        os_type = "macos"
+    elif "windows" in sys:
+        os_type = "windows"
+    else:
+        os_type = "linux"
+    # get computer-server features
+    features = []
+    if HAS_AGENT:
+        features.append("agent")
+    return {"status": "ok", "os_type": os_type, "features": features}
+
+
+@app.websocket("/ws", name="websocket_endpoint")
+async def websocket_endpoint(websocket: WebSocket):
+    global handlers
+
+    # WebSocket message size is configured at the app or endpoint level, not on the instance
+    await manager.connect(websocket)
+
+    # Check if CONTAINER_NAME is set (indicating cloud provider)
+    server_container_name = os.environ.get("CONTAINER_NAME")
+
+    # If cloud provider, perform authentication handshake
+    if server_container_name:
+        try:
+            logger.info(
+                f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Waiting for authentication..."
+            )
+
+            # Wait for authentication message
+            auth_data = await websocket.receive_json()
+
+            # Validate auth message format
+            if auth_data.get("command") != "authenticate":
+                await websocket.send_json(
+                    {"success": False, "error": "First message must be authentication"}
+                )
+                await websocket.close()
+                manager.disconnect(websocket)
+                return
+
+            # Extract credentials
+            client_api_key = auth_data.get("params", {}).get("api_key")
+            client_container_name = auth_data.get("params", {}).get("container_name")
+
+            # Validate credentials using AuthenticationManager
+            if not client_api_key:
+                await websocket.send_json({"success": False, "error": "API key required"})
+                await websocket.close()
+                manager.disconnect(websocket)
+                return
+
+            if not client_container_name:
+                await websocket.send_json({"success": False, "error": "Container name required"})
+                await websocket.close()
+                manager.disconnect(websocket)
+                return
+
+            # Use AuthenticationManager for validation
+            is_authenticated = await auth_manager.auth(client_container_name, client_api_key)
+            if not is_authenticated:
+                await websocket.send_json({"success": False, "error": "Authentication failed"})
+                await websocket.close()
+                manager.disconnect(websocket)
+                return
+
+            logger.info(f"Authentication successful for VM: {client_container_name}")
+            await websocket.send_json({"success": True, "message": "Authentication successful"})
+
+        except Exception as e:
+            logger.error(f"Error during authentication handshake: {str(e)}")
+            await websocket.send_json({"success": False, "error": "Authentication failed"})
+            await websocket.close()
+            manager.disconnect(websocket)
+            return
+
+    try:
+        while True:
+            try:
+                data = await websocket.receive_json()
+                command = data.get("command")
+                params = data.get("params", {})
+
+                if command not in handlers:
+                    await websocket.send_json(
+                        {"success": False, "error": f"Unknown command: {command}"}
+                    )
+                    continue
+
+                try:
+                    # Filter params to only include those accepted by the handler function
+                    handler_func = handlers[command]
+                    sig = inspect.signature(handler_func)
+                    filtered_params = {k: v for k, v in params.items() if k in sig.parameters}
+
+                    # Handle both sync and async functions
+                    if asyncio.iscoroutinefunction(handler_func):
+                        result = await handler_func(**filtered_params)
+                    else:
+                        # Run sync functions in thread pool to avoid blocking event loop
+                        result = await asyncio.to_thread(handler_func, **filtered_params)
+                    await websocket.send_json({"success": True, **result})
+                except Exception as cmd_error:
+                    logger.error(f"Error executing command {command}: {str(cmd_error)}")
+                    logger.error(traceback.format_exc())
+                    await websocket.send_json({"success": False, "error": str(cmd_error)})
+
+            except WebSocketDisconnect:
+                raise
+            except json.JSONDecodeError as json_err:
+                logger.error(f"JSON decode error: {str(json_err)}")
+                await websocket.send_json(
+                    {"success": False, "error": f"Invalid JSON: {str(json_err)}"}
+                )
+            except Exception as loop_error:
+                logger.error(f"Error in message loop: {str(loop_error)}")
+                logger.error(traceback.format_exc())
+                await websocket.send_json({"success": False, "error": str(loop_error)})
+
+    except WebSocketDisconnect:
+        logger.info("Client disconnected")
+        manager.disconnect(websocket)
+    except Exception as e:
+        logger.error(f"Fatal error in websocket connection: {str(e)}")
+        logger.error(traceback.format_exc())
+        try:
+            await websocket.close()
+        except:
+            pass
+        manager.disconnect(websocket)
+
+
+@app.post("/cmd")
+async def cmd_endpoint(
+    request: Request,
+    container_name: Optional[str] = Header(None, alias="X-Container-Name"),
+    api_key: Optional[str] = Header(None, alias="X-API-Key"),
+):
+    """
+    Backup endpoint for when WebSocket connections fail.
+    Accepts commands via HTTP POST with streaming response.
+
+    Headers:
+    - X-Container-Name: Container name for cloud authentication
+    - X-API-Key: API key for cloud authentication
+
+    Body:
+    {
+        "command": "command_name",
+        "params": {...}
+    }
+    """
+    global handlers
+
+    # Parse request body
+    try:
+        body = await request.json()
+        command = body.get("command")
+        params = body.get("params", {})
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")
+
+    if not command:
+        raise HTTPException(status_code=400, detail="Command is required")
+
+    # Check if CONTAINER_NAME is set (indicating cloud provider)
+    server_container_name = os.environ.get("CONTAINER_NAME")
+
+    # If cloud provider, perform authentication
+    if server_container_name:
+        logger.info(
+            f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication..."
+        )
+
+        # Validate required headers
+        if not container_name:
+            raise HTTPException(status_code=401, detail="Container name required")
+
+        if not api_key:
+            raise HTTPException(status_code=401, detail="API key required")
+
+        # Validate with AuthenticationManager
+        is_authenticated = await auth_manager.auth(container_name, api_key)
+        if not is_authenticated:
+            raise HTTPException(status_code=401, detail="Authentication failed")
+
+    if command not in handlers:
+        raise HTTPException(status_code=400, detail=f"Unknown command: {command}")
+
+    async def generate_response():
+        """Generate streaming response for the command execution"""
+        try:
+            # Filter params to only include those accepted by the handler function
+            handler_func = handlers[command]
+            sig = inspect.signature(handler_func)
+            filtered_params = {k: v for k, v in params.items() if k in sig.parameters}
+
+            # Handle both sync and async functions
+            if asyncio.iscoroutinefunction(handler_func):
+                result = await handler_func(**filtered_params)
+            else:
+                # Run sync functions in thread pool to avoid blocking event loop
+                result = await asyncio.to_thread(handler_func, **filtered_params)
+
+            # Stream the successful result
+            response_data = {"success": True, **result}
+            yield f"data: {json.dumps(response_data)}\n\n"
+
+        except Exception as cmd_error:
+            logger.error(f"Error executing command {command}: {str(cmd_error)}")
+            logger.error(traceback.format_exc())
+
+            # Stream the error result
+            error_data = {"success": False, "error": str(cmd_error)}
+            yield f"data: {json.dumps(error_data)}\n\n"
+
+    return StreamingResponse(
+        generate_response(),
+        media_type="text/plain",
+        headers={
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+        },
+    )
+
+
+@app.post("/responses")
+async def agent_response_endpoint(
+    request: Request,
+    api_key: Optional[str] = Header(None, alias="X-API-Key"),
+):
+    """
+    Minimal proxy to run ComputerAgent for up to 2 turns.
+
+    Security:
+    - If CONTAINER_NAME is set on the server, require X-API-Key
+      and validate using AuthenticationManager unless CUA_ENABLE_PUBLIC_PROXY is true.
+
+    Body JSON:
+    {
+      "model": "...",                 # required
+      "input": "... or messages[]",   # required
+      "agent_kwargs": { ... },         # optional, passed directly to ComputerAgent
+      "env": { ... }                   # optional env overrides for agent
+    }
+    """
+    if not HAS_AGENT:
+        raise HTTPException(status_code=501, detail="ComputerAgent not available")
+
+    # Authenticate via AuthenticationManager if running in cloud (CONTAINER_NAME set)
+    container_name = os.environ.get("CONTAINER_NAME")
+    if container_name:
+        is_public = os.environ.get("CUA_ENABLE_PUBLIC_PROXY", "").lower().strip() in [
+            "1",
+            "true",
+            "yes",
+            "y",
+            "on",
+        ]
+        if not is_public:
+            if not api_key:
+                raise HTTPException(status_code=401, detail="Missing AGENT PROXY auth headers")
+            ok = await auth_manager.auth(container_name, api_key)
+            if not ok:
+                raise HTTPException(status_code=401, detail="Unauthorized")
+
+    # Parse request body
+    try:
+        body = await request.json()
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")
+
+    model = body.get("model")
+    input_data = body.get("input")
+    if not model or input_data is None:
+        raise HTTPException(status_code=400, detail="'model' and 'input' are required")
+
+    agent_kwargs: Dict[str, Any] = body.get("agent_kwargs") or {}
+    env_overrides: Dict[str, str] = body.get("env") or {}
+
+    # Simple env override context
+    class _EnvOverride:
+        def __init__(self, overrides: Dict[str, str]):
+            self.overrides = overrides
+            self._original: Dict[str, Optional[str]] = {}
+
+        def __enter__(self):
+            for k, v in (self.overrides or {}).items():
+                self._original[k] = os.environ.get(k)
+                os.environ[k] = str(v)
+
+        def __exit__(self, exc_type, exc, tb):
+            for k, old in self._original.items():
+                if old is None:
+                    os.environ.pop(k, None)
+                else:
+                    os.environ[k] = old
+
+    # Convert input to messages
+    def _to_messages(data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
+        if isinstance(data, str):
+            return [{"role": "user", "content": data}]
+        if isinstance(data, list):
+            return data
+
+    messages = _to_messages(input_data)
+
+    # Define a direct computer tool that implements the AsyncComputerHandler protocol
+    # and delegates to our existing automation/file/accessibility handlers.
+    from agent.computers import AsyncComputerHandler  # runtime-checkable Protocol
+
+    class DirectComputer(AsyncComputerHandler):
+        def __init__(self):
+            # use module-scope handler singletons created by HandlerFactory
+            self._auto = automation_handler
+            self._file = file_handler
+            self._access = accessibility_handler
+
+        async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
+            sys = platform.system().lower()
+            if "darwin" in sys or sys in ("macos", "mac"):
+                return "mac"
+            if "windows" in sys:
+                return "windows"
+            return "linux"
+
+        async def get_dimensions(self) -> tuple[int, int]:
+            size = await self._auto.get_screen_size()
+            return size["width"], size["height"]
+
+        async def screenshot(self) -> str:
+            img_b64 = await self._auto.screenshot()
+            return img_b64["image_data"]
+
+        async def click(self, x: int, y: int, button: str = "left") -> None:
+            if button == "left":
+                await self._auto.left_click(x, y)
+            elif button == "right":
+                await self._auto.right_click(x, y)
+            else:
+                await self._auto.left_click(x, y)
+
+        async def double_click(self, x: int, y: int) -> None:
+            await self._auto.double_click(x, y)
+
+        async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
+            await self._auto.move_cursor(x, y)
+            await self._auto.scroll(scroll_x, scroll_y)
+
+        async def type(self, text: str) -> None:
+            await self._auto.type_text(text)
+
+        async def wait(self, ms: int = 1000) -> None:
+            await asyncio.sleep(ms / 1000.0)
+
+        async def move(self, x: int, y: int) -> None:
+            await self._auto.move_cursor(x, y)
+
+        async def keypress(self, keys: Union[List[str], str]) -> None:
+            if isinstance(keys, str):
+                parts = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys]
+            else:
+                parts = keys
+            if len(parts) == 1:
+                await self._auto.press_key(parts[0])
+            else:
+                await self._auto.hotkey(parts)
+
+        async def drag(self, path: List[Dict[str, int]]) -> None:
+            if not path:
+                return
+            start = path[0]
+            await self._auto.mouse_down(start["x"], start["y"])
+            for pt in path[1:]:
+                await self._auto.move_cursor(pt["x"], pt["y"])
+            end = path[-1]
+            await self._auto.mouse_up(end["x"], end["y"])
+
+        async def get_current_url(self) -> str:
+            # Not available in this server context
+            return ""
+
+        async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
+            await self._auto.mouse_down(x, y, button="left")
+
+        async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
+            await self._auto.mouse_up(x, y, button="left")
+
+    # # Inline image URLs to base64
+    # import base64, mimetypes, requests
+    # # Use a browser-like User-Agent to avoid 403s from some CDNs (e.g., Wikimedia)
+    # HEADERS = {
+    #     "User-Agent": (
+    #         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+    #         "AppleWebKit/537.36 (KHTML, like Gecko) "
+    #         "Chrome/124.0.0.0 Safari/537.36"
+    #     )
+    # }
+    # def _to_data_url(content_bytes: bytes, url: str, resp: requests.Response) -> str:
+    #     ctype = resp.headers.get("Content-Type") or mimetypes.guess_type(url)[0] or "application/octet-stream"
+    #     b64 = base64.b64encode(content_bytes).decode("utf-8")
+    #     return f"data:{ctype};base64,{b64}"
+    # def inline_image_urls(messages):
+    #     # messages: List[{"role": "...","content":[...]}]
+    #     out = []
+    #     for m in messages:
+    #         if not isinstance(m.get("content"), list):
+    #             out.append(m)
+    #             continue
+    #         new_content = []
+    #         for part in (m.get("content") or []):
+    #             if part.get("type") == "input_image" and (url := part.get("image_url")):
+    #                 resp = requests.get(url, headers=HEADERS, timeout=30)
+    #                 resp.raise_for_status()
+    #                 new_content.append({
+    #                     "type": "input_image",
+    #                     "image_url": _to_data_url(resp.content, url, resp)
+    #                 })
+    #             else:
+    #                 new_content.append(part)
+    #         out.append({**m, "content": new_content})
+    #     return out
+    # messages = inline_image_urls(messages)
+
+    error = None
+
+    with _EnvOverride(env_overrides):
+        # Prepare tools: if caller did not pass tools, inject our DirectComputer
+        tools = agent_kwargs.get("tools")
+        if not tools:
+            tools = [DirectComputer()]
+            agent_kwargs = {**agent_kwargs, "tools": tools}
+        # Instantiate agent with our tools
+        agent = ComputerAgent(model=model, **agent_kwargs)  # type: ignore[arg-type]
+
+        total_output: List[Any] = []
+        total_usage: Dict[str, Any] = {}
+
+        pending_computer_call_ids = set()
+        try:
+            async for result in agent.run(messages):
+                total_output += result["output"]
+                # Try to collect usage if present
+                if (
+                    isinstance(result, dict)
+                    and "usage" in result
+                    and isinstance(result["usage"], dict)
+                ):
+                    # Merge usage counters
+                    for k, v in result["usage"].items():
+                        if isinstance(v, (int, float)):
+                            total_usage[k] = total_usage.get(k, 0) + v
+                        else:
+                            total_usage[k] = v
+                for msg in result.get("output", []):
+                    if msg.get("type") == "computer_call":
+                        pending_computer_call_ids.add(msg["call_id"])
+                    elif msg.get("type") == "computer_call_output":
+                        pending_computer_call_ids.discard(msg["call_id"])
+                # exit if no pending computer calls
+                if not pending_computer_call_ids:
+                    break
+        except Exception as e:
+            logger.error(f"Error running agent: {str(e)}")
+            logger.error(traceback.format_exc())
+            error = str(e)
+
+    # Build response payload
+    payload = {
+        "model": model,
+        "error": error,
+        "output": total_output,
+        "usage": total_usage,
+        "status": "completed" if not error else "failed",
+    }
+
+    # CORS: allow any origin
+    headers = {
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+    }
+
+    return JSONResponse(content=payload, headers=headers)
+
+
+@app.post("/playwright_exec")
+async def playwright_exec_endpoint(
+    request: Request,
+    container_name: Optional[str] = Header(None, alias="X-Container-Name"),
+    api_key: Optional[str] = Header(None, alias="X-API-Key"),
+):
+    """
+    Execute Playwright browser commands.
+
+    Headers:
+    - X-Container-Name: Container name for cloud authentication
+    - X-API-Key: API key for cloud authentication
+
+    Body:
+    {
+        "command": "visit_url|click|type|scroll|web_search",
+        "params": {...}
+    }
+    """
+    # Parse request body
+    try:
+        body = await request.json()
+        command = body.get("command")
+        params = body.get("params", {})
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")
+
+    if not command:
+        raise HTTPException(status_code=400, detail="Command is required")
+
+    # Check if CONTAINER_NAME is set (indicating cloud provider)
+    server_container_name = os.environ.get("CONTAINER_NAME")
+
+    # If cloud provider, perform authentication
+    if server_container_name:
+        logger.info(
+            f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication..."
+        )
+
+        # Validate required headers
+        if not container_name:
+            raise HTTPException(status_code=401, detail="Container name required")
+
+        if not api_key:
+            raise HTTPException(status_code=401, detail="API key required")
+
+        # Validate with AuthenticationManager
+        is_authenticated = await auth_manager.auth(container_name, api_key)
+        if not is_authenticated:
+            raise HTTPException(status_code=401, detail="Authentication failed")
+
+    # Get browser manager and execute command
+    try:
+        browser_manager = get_browser_manager()
+        result = await browser_manager.execute_command(command, params)
+        
+        if result.get("success"):
+            return JSONResponse(content=result)
+        else:
+            raise HTTPException(status_code=400, detail=result.get("error", "Command failed"))
+    except Exception as e:
+        logger.error(f"Error executing playwright command: {str(e)}")
+        logger.error(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)
--- a/libs/xfce/src/scripts/start-computer-server.sh
+++ b/libs/xfce/src/scripts/start-computer-server.sh
@@ -10,4 +10,4 @@ echo "X server is ready"

 # Start computer-server
 export DISPLAY=:1
-python3 -m computer_server --port ${API_PORT:-8000}
+python -m computer_server --port ${API_PORT:-8000}