From 37c5be669bf256a05350f8429540f7d483eaaccf Mon Sep 17 00:00:00 2001
From: Adam <wangadam019@gmail.com>
Date: Tue, 2 Dec 2025 12:53:25 -0500
Subject: [PATCH 1/8] dd browser tool with Playwright for visible browser
 automation

Add browser tool with Playwright/Firefox support. Includes BrowserManager,
/playwright_exec endpoint, BrowserTool client, and auto-recovery. Fixes
Python version in startup script and adds Playwright to Docker build.
---
 examples/BROWSER_TOOL_README.md               |  69 ++
 examples/browser_tool_example.py              |  96 ++
 libs/python/agent/agent/tools/__init__.py     |   6 +
 libs/python/agent/agent/tools/browser_tool.py | 143 +++
 .../computer_server/browser.py                | 308 +++++++
 .../computer-server/computer_server/main.py   |  67 ++
 libs/python/computer-server/pyproject.toml    |   1 +
 libs/xfce/Dockerfile                          |  10 +
 libs/xfce/README_BUILD.md                     |  32 +
 libs/xfce/browser.py                          | 308 +++++++
 libs/xfce/main.py                             | 820 ++++++++++++++++++
 .../xfce/src/scripts/start-computer-server.sh |   2 +-
 12 files changed, 1861 insertions(+), 1 deletion(-)
 create mode 100644 examples/BROWSER_TOOL_README.md
 create mode 100644 examples/browser_tool_example.py
 create mode 100644 libs/python/agent/agent/tools/__init__.py
 create mode 100644 libs/python/agent/agent/tools/browser_tool.py
 create mode 100644 libs/python/computer-server/computer_server/browser.py
 create mode 100644 libs/xfce/README_BUILD.md
 create mode 100644 libs/xfce/browser.py
 create mode 100644 libs/xfce/main.py

diff --git a/examples/BROWSER_TOOL_README.md b/examples/BROWSER_TOOL_README.md
new file mode 100644
index 00000000..8d12ae85
--- /dev/null
+++ b/examples/BROWSER_TOOL_README.md
@@ -0,0 +1,69 @@
+# Browser Tool
+
+Browser automation tool that allows agents to control a Firefox browser programmatically via Playwright while keeping it visible on the XFCE desktop.
+
+## Quick Start
+
+### Using Docker (Recommended)
+
+```bash
+# Build and run the container
+cd libs/xfce
+docker build -t cua-xfce .
+docker run -d --name cua-xfce-test \
+  -p 8000:8000 -p 5901:5901 -p 6901:6901 \
+  -e DISPLAY=:1 \
+  cua-xfce
+
+# View desktop: http://localhost:6901
+# Test the browser tool
+python examples/browser_tool_example.py
+```
+
+### Local Testing
+
+```bash
+# Install dependencies
+pip install playwright
+playwright install --with-deps firefox
+
+# Start server
+python -m computer_server --port 8000
+
+# Run test (in another terminal)
+python examples/browser_tool_example.py
+```
+
+## Features
+
+- **Visible Browser**: Runs in non-headless mode so visual agents can see it
+- **Auto-Recovery**: Automatically reopens browser if closed manually
+- **Persistent Context**: Maintains cookies and sessions across commands
+- **Fara/Magentic-One Interface**: Compatible with Microsoft agent interfaces
+
+## API Endpoint
+
+The browser tool is accessible via the `/playwright_exec` endpoint:
+
+```bash
+curl -X POST http://localhost:8000/playwright_exec \
+  -H "Content-Type: application/json" \
+  -d '{"command": "visit_url", "params": {"url": "https://www.example.com"}}'
+```
+
+## Available Commands
+
+- `visit_url(url)` - Navigate to a URL
+- `click(x, y)` - Click at coordinates
+- `type(text)` - Type text into focused element
+- `scroll(delta_x, delta_y)` - Scroll the page
+- `web_search(query)` - Navigate to Google search
+
+## Troubleshooting
+
+**Browser closes unexpectedly**: The tool automatically reopens the browser on the next command.
+
+**Connection errors**: Make sure the server is running (`curl http://localhost:8000/status`).
+
+**Playwright not found**: Install with `pip install playwright && playwright install --with-deps firefox`.
+
diff --git a/examples/browser_tool_example.py b/examples/browser_tool_example.py
new file mode 100644
index 00000000..9705ca8f
--- /dev/null
+++ b/examples/browser_tool_example.py
@@ -0,0 +1,96 @@
+"""
+Browser Tool Example
+
+Demonstrates how to use the BrowserTool to control a browser programmatically
+via the computer server. The browser runs visibly on the XFCE desktop so visual
+agents can see it.
+
+Prerequisites:
+    - Computer server running (Docker container or local)
+    - For Docker: Container should be running with browser tool support
+    - For local: Playwright and Firefox must be installed
+
+Usage:
+    python examples/browser_tool_example.py
+"""
+
+import asyncio
+import logging
+import sys
+from pathlib import Path
+
+# Import BrowserTool directly from the file
+browser_tool_path = Path(__file__).parent.parent / "libs" / "python" / "agent" / "agent" / "tools" / "browser_tool.py"
+sys.path.insert(0, str(browser_tool_path.parent.parent.parent))
+
+# Import the module directly
+import importlib.util
+spec = importlib.util.spec_from_file_location("browser_tool", browser_tool_path)
+if spec is None or spec.loader is None:
+    raise ImportError(f"Could not load browser_tool from {browser_tool_path}")
+browser_tool_module = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(browser_tool_module)
+BrowserTool = browser_tool_module.BrowserTool
+
+# Configure logging to see what's happening
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+async def test_browser_tool():
+    """Run each BrowserTool command once against http://localhost:8000.
+
+    For a cloud container, build BrowserTool with base_url, api_key, and
+    container_name instead. Exceptions are logged with their traceback.
+    """
+    browser = BrowserTool(base_url="http://localhost:8000")
+    # (banner, command factory, result label, seconds to pause afterwards)
+    steps = [
+        (
+            "Test 1: Visiting a URL...",
+            lambda: browser.visit_url("https://www.trycua.com"),
+            "Visit URL result",
+            2,
+        ),
+        (
+            "Test 2: Performing a web search...",
+            lambda: browser.web_search("Python programming"),
+            "Web search result",
+            2,
+        ),
+        (
+            "Test 3: Scrolling the page...",
+            lambda: browser.scroll(delta_x=0, delta_y=500),
+            "Scroll result",
+            1,
+        ),
+        (
+            "Test 4: Clicking at coordinates...",
+            lambda: browser.click(x=500, y=300),
+            "Click result",
+            1,
+        ),
+        (
+            "Test 5: Typing text...",
+            lambda: browser.type("Hello from BrowserTool!"),
+            "Type result",
+            0,
+        ),
+    ]
+
+    logger.info("Testing Browser Tool...")
+    try:
+        for banner, run_step, label, pause_seconds in steps:
+            logger.info(banner)
+            outcome = await run_step()
+            logger.info(f"{label}: {outcome}")
+            if pause_seconds:
+                await asyncio.sleep(pause_seconds)
+        logger.info("All tests completed!")
+    except Exception as e:
+        logger.error(f"Error during testing: {e}", exc_info=True)
+
+
+if __name__ == "__main__":
+    asyncio.run(test_browser_tool())
+
diff --git a/libs/python/agent/agent/tools/__init__.py b/libs/python/agent/agent/tools/__init__.py
new file mode 100644
index 00000000..e663c557
--- /dev/null
+++ b/libs/python/agent/agent/tools/__init__.py
@@ -0,0 +1,6 @@
+"""Tools for agent interactions."""
+
+from .browser_tool import BrowserTool
+
+__all__ = ["BrowserTool"]
+
diff --git a/libs/python/agent/agent/tools/browser_tool.py b/libs/python/agent/agent/tools/browser_tool.py
new file mode 100644
index 00000000..8f8b1ab9
--- /dev/null
+++ b/libs/python/agent/agent/tools/browser_tool.py
@@ -0,0 +1,143 @@
+"""
+Browser Tool for agent interactions.
+Allows agents to control a browser programmatically via Playwright.
+"""
+
+import logging
+from typing import Optional
+
+import aiohttp
+
+logger = logging.getLogger(__name__)
+
+
+class BrowserTool:
+    """
+    HTTP client for the computer server's /playwright_exec endpoint.
+    Exposes the Fara/Magentic-One style browser commands as async methods.
+    """
+
+    def __init__(
+        self,
+        base_url: str = "http://localhost:8000",
+        api_key: Optional[str] = None,
+        container_name: Optional[str] = None,
+    ):
+        """
+        Store the connection settings; no network traffic happens here.
+
+        Args:
+            base_url: Base URL of the computer server (default: http://localhost:8000)
+            api_key: Optional API key, sent as the X-API-Key header
+            container_name: Optional container name, sent as the X-Container-Name header
+        """
+        self.logger = logger
+        self.container_name = container_name
+        self.api_key = api_key
+        self.base_url = base_url.rstrip("/")
+
+    def _get_endpoint_url(self) -> str:
+        """Return the absolute URL of the playwright_exec endpoint."""
+        return self.base_url + "/playwright_exec"
+
+    def _get_headers(self) -> dict:
+        """Build the request headers, adding auth headers only when configured."""
+        optional = {
+            "X-API-Key": self.api_key,
+            "X-Container-Name": self.container_name,
+        }
+        provided = {name: value for name, value in optional.items() if value}
+        return {"Content-Type": "application/json", **provided}
+
+    async def _execute_command(self, command: str, params: dict) -> dict:
+        """
+        POST one command to the server and return its result as a dictionary.
+
+        Args:
+            command: Name of the browser command to run
+            params: Parameters for that command
+
+        Returns:
+            The server's JSON body on HTTP 200; otherwise a dictionary with
+            "success" set to False and "error" holding the failure text.
+        """
+        endpoint = self._get_endpoint_url()
+        body = {"command": command, "params": params}
+        request_headers = self._get_headers()
+
+        try:
+            async with aiohttp.ClientSession() as http:
+                async with http.post(endpoint, json=body, headers=request_headers) as reply:
+                    if reply.status != 200:
+                        failure = await reply.text()
+                        self.logger.error(
+                            f"Browser command failed with status {reply.status}: {failure}"
+                        )
+                        return {"success": False, "error": failure}
+                    return await reply.json()
+        except Exception as exc:
+            self.logger.error(f"Error executing browser command: {exc}")
+            return {"success": False, "error": str(exc)}
+
+    async def visit_url(self, url: str) -> dict:
+        """
+        Send the "visit_url" command to load an address in the browser.
+
+        Args:
+            url: Address to open
+
+        Returns:
+            Result dictionary from _execute_command
+        """
+        return await self._execute_command("visit_url", {"url": url})
+
+    async def click(self, x: int, y: int) -> dict:
+        """
+        Send the "click" command for a point on the page.
+
+        Args:
+            x: Horizontal coordinate of the click
+            y: Vertical coordinate of the click
+
+        Returns:
+            Result dictionary from _execute_command
+        """
+        return await self._execute_command("click", {"x": x, "y": y})
+
+    async def type(self, text: str) -> dict:
+        """
+        Send the "type" command to enter text into the focused element.
+
+        Args:
+            text: Characters to type
+
+        Returns:
+            Result dictionary from _execute_command
+        """
+        return await self._execute_command("type", {"text": text})
+
+    async def scroll(self, delta_x: int, delta_y: int) -> dict:
+        """
+        Send the "scroll" command with the given wheel deltas.
+
+        Args:
+            delta_x: Horizontal scroll amount
+            delta_y: Vertical scroll amount
+
+        Returns:
+            Result dictionary from _execute_command
+        """
+        return await self._execute_command("scroll", {"delta_x": delta_x, "delta_y": delta_y})
+
+    async def web_search(self, query: str) -> dict:
+        """
+        Send the "web_search" command to open a Google search for the query.
+
+        Args:
+            query: Text to search for
+
+        Returns:
+            Result dictionary from _execute_command
+        """
+        return await self._execute_command("web_search", {"query": query})
+
diff --git a/libs/python/computer-server/computer_server/browser.py b/libs/python/computer-server/computer_server/browser.py
new file mode 100644
index 00000000..3d0a4c69
--- /dev/null
+++ b/libs/python/computer-server/computer_server/browser.py
@@ -0,0 +1,308 @@
+"""
+Browser manager using Playwright for programmatic browser control.
+This allows agents to control a browser that runs visibly on the XFCE desktop.
+"""
+
+import asyncio
+import logging
+import os
+from typing import Any, Dict, Optional
+
+try:
+    from playwright.async_api import async_playwright, Browser, BrowserContext, Page
+except ImportError:
+    async_playwright = None
+    Browser = None
+    BrowserContext = None
+    Page = None
+
+logger = logging.getLogger(__name__)
+
+
+class BrowserManager:
+    """
+    Manages a Playwright browser instance that runs visibly on the XFCE desktop.
+    Uses persistent context to maintain cookies and sessions.
+    """
+
+    def __init__(self) -> None:
+        """Create an idle manager; execute_command launches the browser on demand."""
+        self.playwright = None  # Playwright driver handle, set by _ensure_initialized
+        self.browser: Optional[Browser] = None  # only ever reset to None in this class
+        self.context: Optional[BrowserContext] = None  # persistent Firefox context
+        self.page: Optional[Page] = None  # page that commands are executed against
+        self._initialized = False
+        self._initialization_error: Optional[str] = None  # text of a failed launch
+        self._lock = asyncio.Lock()  # held by _ensure_initialized and close
+
+    async def _ensure_initialized(self):
+        """
+        Launch the browser if it is not running, or relaunch it if it died.
+
+        Raises:
+            RuntimeError: If the playwright package could not be imported.
+            Exception: Any launch failure; its text is also kept in
+                self._initialization_error so execute_command can report it.
+        """
+        # Fast path without the lock: nothing to do while the context answers.
+        if self._initialized:
+            reason = self._dead_context_reason()
+            if reason is None:
+                return
+            logger.warning(reason)
+
+        async with self._lock:
+            # Re-check under the lock: another task may have relaunched already.
+            if self._initialized and self._dead_context_reason() is None:
+                return
+            # Teardown happens only under the lock, so concurrent callers cannot
+            # stop the same Playwright driver twice. It also stops a driver left
+            # behind by an earlier failed launch.
+            await self._discard_browser_state()
+            self._initialization_error = None
+
+            if async_playwright is None:
+                raise RuntimeError(
+                    "playwright is not installed. Please install it with: pip install playwright && playwright install --with-deps firefox"
+                )
+
+            try:
+                # DISPLAY is only logged here; nothing below passes it on, so the
+                # ":1" fallback is shown in the log but not applied when unset.
+                display = os.environ.get("DISPLAY", ":1")
+                logger.info(f"Initializing browser with DISPLAY={display}")
+
+                self.playwright = await async_playwright().start()
+
+                # A persistent context keeps cookies/sessions in this profile dir.
+                user_data_dir = os.path.join(os.path.expanduser("~"), ".playwright-firefox")
+                os.makedirs(user_data_dir, exist_ok=True)
+
+                # launch_persistent_context returns a BrowserContext, not a Browser.
+                # No --kiosk argument is passed, so the desktop stays visible.
+                self.context = await self.playwright.firefox.launch_persistent_context(
+                    user_data_dir=user_data_dir,
+                    headless=False,  # CRITICAL: visible for visual agent
+                    viewport={"width": 1024, "height": 768},
+                )
+
+                # Use the first page of the context, or open one if there is none.
+                pages = self.context.pages
+                self.page = pages[0] if pages else await self.context.new_page()
+
+                self._initialized = True
+                logger.info("Browser initialized successfully")
+
+            except Exception as e:
+                logger.error(f"Failed to initialize browser: {e}")
+                import traceback
+                logger.error(traceback.format_exc())
+                # Stop a half-started driver so retries do not leak processes.
+                await self._discard_browser_state()
+                # Keep the message for execute_command, then propagate the failure.
+                self._initialization_error = str(e)
+                raise
+
+    def _dead_context_reason(self) -> Optional[str]:
+        """Return None while the context looks usable, else a warning message."""
+        if not self.context:
+            return "Browser context was closed, will reinitialize..."
+        try:
+            # Liveness probe kept from the original; presumably raises once closed.
+            _ = self.context.pages
+        except Exception as e:
+            return f"Browser context is dead ({e}), will reinitialize..."
+        return None
+
+    async def _discard_browser_state(self):
+        """Forget the context and page, and stop the Playwright driver if any."""
+        self._initialized = False
+        self.context = None
+        self.page = None
+        if self.playwright:
+            try:
+                await self.playwright.stop()
+            except Exception as e:
+                # Best effort: the driver may already have exited.
+                logger.debug(f"Ignoring error while stopping Playwright: {e}")
+            self.playwright = None
+
+    async def execute_command(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Execute one browser command on the managed page, launching the browser if needed.
+
+        Args:
+            cmd: Command name (visit_url, click, type, scroll, web_search)
+            params: Command parameters (None is treated as an empty dict)
+
+        Returns:
+            {"success": True, ...} on success, otherwise {"success": False, "error": message}
+        """
+        try:
+            await self._ensure_initialized()
+        except Exception as e:
+            error_msg = getattr(self, '_initialization_error', None) or str(e)
+            logger.error(f"Browser initialization failed: {error_msg}")
+            return {
+                "success": False,
+                "error": f"Browser initialization failed: {error_msg}. "
+                         f"Make sure Playwright and Firefox are installed, and DISPLAY is set correctly."
+            }
+
+        # The browser was initialized just above; a second _ensure_initialized()
+        # call here was redundant and could raise outside the error handling.
+        params = params or {}  # tolerate "params": null in the request body
+
+        # Check if page is still valid
+        page_valid = False
+        try:
+            if self.page is not None:
+                # page.url may raise on a broken page; is_closed() catches closed tabs
+                _ = self.page.url
+                page_valid = not self.page.is_closed()
+        except Exception as e:
+            logger.warning(f"Page is invalid: {e}, will get a new page...")
+            self.page = None
+
+        # Get a valid page if we don't have one
+        if not page_valid or self.page is None:
+            try:
+                pages = self.context.pages
+                if pages:
+                    # Find first non-closed page
+                    for p in pages:
+                        try:
+                            if not p.is_closed():
+                                self.page = p
+                                logger.info("Reusing existing open page")
+                                page_valid = True
+                                break
+                        except Exception:
+                            continue
+
+                # If no valid page found, create a new one
+                if not page_valid:
+                    self.page = await self.context.new_page()
+                    logger.info("Created new page")
+            except Exception as e:
+                logger.error(f"Failed to get new page: {e}, browser may be closed")
+                # Browser was closed - reinitialize it
+                try:
+                    logger.info("Browser was closed manually, reinitializing...")
+                    self._initialized = False
+                    self.context = None
+                    self.page = None
+                    if self.playwright:
+                        try:
+                            await self.playwright.stop()
+                        except Exception:
+                            pass  # best effort: the driver may already be gone
+                        self.playwright = None
+
+                    # Reinitialize
+                    await self._ensure_initialized()
+                    # Get or create a page
+                    pages = self.context.pages
+                    if pages:
+                        self.page = pages[0]
+                    else:
+                        self.page = await self.context.new_page()
+                    logger.info("Browser reopened successfully after manual closure")
+                except Exception as reinit_error:
+                    logger.error(f"Failed to reinitialize browser: {reinit_error}")
+                    import traceback
+                    logger.error(traceback.format_exc())
+                    return {"success": False, "error": f"Browser was closed and cannot be recovered: {reinit_error}"}
+
+        try:
+            if cmd == "visit_url":
+                url = params.get("url")
+                if not url:
+                    return {"success": False, "error": "url parameter is required"}
+                await self.page.goto(url, wait_until="domcontentloaded", timeout=30000)
+                return {"success": True, "url": self.page.url}
+
+            elif cmd == "click":
+                x = params.get("x")
+                y = params.get("y")
+                if x is None or y is None:
+                    return {"success": False, "error": "x and y parameters are required"}
+                await self.page.mouse.click(x, y)
+                return {"success": True}
+
+            elif cmd == "type":
+                text = params.get("text")
+                if text is None:
+                    return {"success": False, "error": "text parameter is required"}
+                await self.page.keyboard.type(text)
+                return {"success": True}
+
+            elif cmd == "scroll":
+                delta_x = params.get("delta_x", 0)
+                delta_y = params.get("delta_y", 0)
+                await self.page.mouse.wheel(delta_x, delta_y)
+                return {"success": True}
+
+            elif cmd == "web_search":
+                query = params.get("query")
+                if not query:
+                    return {"success": False, "error": "query parameter is required"}
+                from urllib.parse import quote_plus  # encode spaces, "&", "#", "+", ...
+                search_url = f"https://www.google.com/search?q={quote_plus(query)}"
+                await self.page.goto(search_url, wait_until="domcontentloaded", timeout=30000)
+                return {"success": True, "url": self.page.url}
+
+            else:
+                return {"success": False, "error": f"Unknown command: {cmd}"}
+
+        except Exception as e:
+            logger.error(f"Error executing command {cmd}: {e}")
+            import traceback
+            logger.error(traceback.format_exc())
+            # If page was closed due to error, switch to a page that is still open
+            if "closed" in str(e).lower() and self.context:
+                try:
+                    open_pages = [p for p in self.context.pages if not p.is_closed()]
+                    if open_pages:
+                        self.page = open_pages[0]
+                        logger.info("Recovered page after error")
+                    else:
+                        self.page = await self.context.new_page()
+                        logger.info("Created new page after error")
+                except Exception as recover_error:
+                    logger.error(f"Failed to recover page: {recover_error}")
+            return {"success": False, "error": str(e)}
+
+    async def close(self):
+        """Close the browser and cleanup resources."""
+        async with self._lock:
+            # Take the handles and reset state first, so a failure in one step
+            # can neither skip the remaining steps nor leave stale handles behind.
+            handles = (self.context, self.browser, self.playwright)
+            self.context = self.browser = self.playwright = self.page = None
+            self._initialized = False
+
+            clean = True
+            for handle, method in zip(handles, ("close", "close", "stop")):
+                if not handle:
+                    continue
+                try:
+                    await getattr(handle, method)()
+                except Exception as e:
+                    clean = False
+                    logger.error(f"Error closing browser: {e}")
+            if clean:
+                logger.info("Browser closed successfully")
+
+
+# Global instance
+_browser_manager: Optional[BrowserManager] = None
+
+
+def get_browser_manager() -> BrowserManager:
+    """Return the shared BrowserManager, creating it on the first call."""
+    global _browser_manager
+    # Reuse the existing instance when there is one; otherwise build it now.
+    _browser_manager = _browser_manager or BrowserManager()
+    return _browser_manager
+
diff --git a/libs/python/computer-server/computer_server/main.py b/libs/python/computer-server/computer_server/main.py
index 3ae97ebc..9bad59bf 100644
--- a/libs/python/computer-server/computer_server/main.py
+++ b/libs/python/computer-server/computer_server/main.py
@@ -25,6 +25,7 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse
 
 from .handlers.factory import HandlerFactory
+from .browser import get_browser_manager
 
 # Authentication session TTL (in seconds). Override via env var CUA_AUTH_TTL_SECONDS. Default: 60s
 AUTH_SESSION_TTL_SECONDS: int = int(os.environ.get("CUA_AUTH_TTL_SECONDS", "60"))
@@ -749,5 +750,71 @@ async def agent_response_endpoint(
     return JSONResponse(content=payload, headers=headers)
 
 
+@app.post("/playwright_exec")
+async def playwright_exec_endpoint(
+    request: Request,
+    container_name: Optional[str] = Header(None, alias="X-Container-Name"),
+    api_key: Optional[str] = Header(None, alias="X-API-Key"),
+):
+    """
+    Execute a Playwright browser command and return its result as JSON.
+
+    Headers (checked only when the CONTAINER_NAME environment variable is set):
+    - X-Container-Name: Container name for cloud authentication
+    - X-API-Key: API key for cloud authentication
+
+    Body:
+    {"command": "visit_url|click|type|scroll|web_search", "params": {...}}
+
+    Errors: 400 for a bad body or a failed command, 401 for failed
+    authentication, 500 if the browser manager itself raises.
+    """
+    # Parse request body
+    try:
+        body = await request.json()
+        command = body.get("command")
+        params = body.get("params", {})
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")
+
+    if not command:
+        raise HTTPException(status_code=400, detail="Command is required")
+
+    # Check if CONTAINER_NAME is set (indicating cloud provider)
+    server_container_name = os.environ.get("CONTAINER_NAME")
+
+    # If cloud provider, perform authentication
+    if server_container_name:
+        logger.info(
+            f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication..."
+        )
+
+        # Validate required headers
+        if not container_name:
+            raise HTTPException(status_code=401, detail="Container name required")
+
+        if not api_key:
+            raise HTTPException(status_code=401, detail="API key required")
+
+        # Validate with AuthenticationManager
+        is_authenticated = await auth_manager.auth(container_name, api_key)
+        if not is_authenticated:
+            raise HTTPException(status_code=401, detail="Authentication failed")
+
+    # Get browser manager and execute command
+    try:
+        browser_manager = get_browser_manager()
+        result = await browser_manager.execute_command(command, params)
+    except Exception as e:
+        logger.error(f"Error executing playwright command: {str(e)}")
+        logger.error(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=str(e))
+
+    # Raised outside the try block so this 400 is not rewrapped as a 500.
+    if not result.get("success"):
+        raise HTTPException(status_code=400, detail=result.get("error", "Command failed"))
+    return JSONResponse(content=result)
+
+
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/libs/python/computer-server/pyproject.toml b/libs/python/computer-server/pyproject.toml
index 7bae1e06..75ff49f5 100644
--- a/libs/python/computer-server/pyproject.toml
+++ b/libs/python/computer-server/pyproject.toml
@@ -24,6 +24,7 @@ dependencies = [
     "pyperclip>=1.9.0",
     "websockets>=12.0",
     "pywinctl>=0.4.1",
+    "playwright>=1.40.0",
     # OS-specific runtime deps
     "pyobjc-framework-Cocoa>=10.1; sys_platform == 'darwin'",
     "pyobjc-framework-Quartz>=10.1; sys_platform == 'darwin'",
diff --git a/libs/xfce/Dockerfile b/libs/xfce/Dockerfile
index e83f6bd2..43dab80f 100644
--- a/libs/xfce/Dockerfile
+++ b/libs/xfce/Dockerfile
@@ -107,6 +107,16 @@ RUN mkdir -p /home/cua/.cache && \
 # Install computer-server using Python 3.12 pip
 RUN python3.12 -m pip install cua-computer-server
 
+# Copy browser.py and updated main.py from local source (to include browser tool)
+# These files need to be in the same directory as the Dockerfile when building
+COPY browser.py /tmp/browser.py
+COPY main.py /tmp/main.py
+RUN python3.12 -c "import shutil; import os; cs_dir = '/usr/local/lib/python3.12/dist-packages/computer_server'; shutil.copy('/tmp/browser.py', f'{cs_dir}/browser.py'); shutil.copy('/tmp/main.py', f'{cs_dir}/main.py'); print('Copied browser.py and main.py')" && rm /tmp/browser.py /tmp/main.py
+
+# Install playwright and Firefox dependencies
+RUN python3.12 -m pip install playwright && \
+    python3.12 -m playwright install --with-deps firefox
+
 # Fix any cache files created by pip
 RUN chown -R cua:cua /home/cua/.cache
 
diff --git a/libs/xfce/README_BUILD.md b/libs/xfce/README_BUILD.md
new file mode 100644
index 00000000..d6f6a7d4
--- /dev/null
+++ b/libs/xfce/README_BUILD.md
@@ -0,0 +1,32 @@
+# Building the XFCE Docker Image
+
+## Required Files for Build
+
+The Dockerfile requires these files to be present in the `libs/xfce/` directory:
+
+- `browser.py` - Copy from `libs/python/computer-server/computer_server/browser.py`
+- `main.py` - Copy from `libs/python/computer-server/computer_server/main.py`
+
+These files are copied into the container to include the browser tool functionality
+that isn't yet in the published PyPI package.
+
+## Before Building
+
+```bash
+# Copy the latest browser tool files
+cp libs/python/computer-server/computer_server/browser.py libs/xfce/
+cp libs/python/computer-server/computer_server/main.py libs/xfce/
+```
+
+## Build Command
+
+```bash
+cd libs/xfce
+docker build -t cua-xfce .
+```
+
+## Note
+
+Once the browser tool is included in the published `cua-computer-server` package,
+these temporary file copies can be removed and the Dockerfile can be simplified.
+
diff --git a/libs/xfce/browser.py b/libs/xfce/browser.py
new file mode 100644
index 00000000..3d0a4c69
--- /dev/null
+++ b/libs/xfce/browser.py
@@ -0,0 +1,308 @@
+"""
+Browser manager using Playwright for programmatic browser control.
+This allows agents to control a browser that runs visibly on the XFCE desktop.
+"""
+
+import asyncio
+import logging
+import os
+from typing import Any, Dict, Optional
+
+try:
+    from playwright.async_api import async_playwright, Browser, BrowserContext, Page
+except ImportError:
+    async_playwright = None
+    Browser = None
+    BrowserContext = None
+    Page = None
+
+logger = logging.getLogger(__name__)
+
+
+class BrowserManager:
+    """
+    Manages a Playwright browser instance that runs visibly on the XFCE desktop.
+    Uses persistent context to maintain cookies and sessions.
+    """
+
+    def __init__(self):
+        """Create an idle manager; the browser itself is launched on first use."""
+        self.playwright = None  # Playwright driver handle, set by _ensure_initialized
+        self.browser: Optional[Browser] = None  # never assigned in this class; close() closes it only if set
+        self.context: Optional[BrowserContext] = None  # persistent Firefox context
+        self.page: Optional[Page] = None  # page that commands are executed against
+        self._initialized = False  # True once the context has been launched
+        self._initialization_error: Optional[str] = None  # text of the last launch failure
+        self._lock = asyncio.Lock()  # held while launching and while closing
+
+    async def _ensure_initialized(self):
+        """Launch Playwright and the persistent Firefox context unless already running.
+
+        Returns immediately when a usable context exists. Otherwise stale handles are
+        discarded (stopping the old Playwright driver) and the browser is launched
+        again, so a browser that died or was closed is reopened on the next call.
+
+        Raises:
+            RuntimeError: If the ``playwright`` package is not installed.
+            Exception: Any launch failure, re-raised after its message has been
+                stored in ``self._initialization_error``.
+        """
+        # Every check and mutation happens while holding the lock, so concurrent
+        # commands cannot tear down state that another task is still initializing.
+        async with self._lock:
+            # Fast path: nothing to do while the existing context is still alive.
+            if self._initialized:
+                try:
+                    if self.context:
+                        # NOTE(review): assumes reading .pages raises once the
+                        # context is dead - confirm against Playwright behavior.
+                        _ = self.context.pages
+                        return
+                    logger.warning("Browser context was closed, will reinitialize...")
+                except Exception as e:
+                    logger.warning(f"Browser context is dead ({e}), will reinitialize...")
+
+            # Start from a clean slate; stop any previous driver so it is not leaked.
+            self._initialized = False
+            self.context = None
+            self.page = None
+            await self._stop_playwright()
+
+            if async_playwright is None:
+                raise RuntimeError(
+                    "playwright is not installed. Please install it with: pip install playwright && playwright install --with-deps firefox"
+                )
+
+            try:
+                # DISPLAY is only logged for diagnostics; it is not passed to
+                # Playwright explicitly.
+                display = os.environ.get("DISPLAY", ":1")
+                logger.info(f"Initializing browser with DISPLAY={display}")
+
+                self.playwright = await async_playwright().start()
+
+                # A persistent profile directory keeps cookies/sessions across launches.
+                user_data_dir = os.path.join(os.path.expanduser("~"), ".playwright-firefox")
+                os.makedirs(user_data_dir, exist_ok=True)
+
+                # launch_persistent_context returns a BrowserContext, not a Browser.
+                # headless=False is CRITICAL so the visual agent can see the window;
+                # no --kiosk argument is passed so the desktop stays visible too.
+                self.context = await self.playwright.firefox.launch_persistent_context(
+                    user_data_dir=user_data_dir,
+                    headless=False,
+                    viewport={"width": 1024, "height": 768},
+                )
+
+                # Reuse the page the context opened with, or create one.
+                pages = self.context.pages
+                self.page = pages[0] if pages else await self.context.new_page()
+
+                self._initialized = True
+                self._initialization_error = None  # forget any earlier failure
+                logger.info("Browser initialized successfully")
+
+            except Exception as e:
+                logger.error(f"Failed to initialize browser: {e}")
+                import traceback
+
+                logger.error(traceback.format_exc())
+                # Keep the message for execute_command's error response, release the
+                # half-started driver so retries do not pile up processes, then re-raise.
+                self._initialization_error = str(e)
+                self.context = None
+                self.page = None
+                await self._stop_playwright()
+                raise
+
+    async def _stop_playwright(self):
+        """Stop the Playwright driver if one is running; failures are only logged."""
+        if self.playwright is None:
+            return
+        try:
+            await self.playwright.stop()
+        except Exception as e:
+            # Best effort: the driver may already be gone along with the browser.
+            logger.debug(f"Ignoring error while stopping Playwright: {e}")
+        finally:
+            self.playwright = None
+
+    async def execute_command(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Execute a browser command.
+
+        The browser is launched (or relaunched after a crash/manual close) on demand,
+        and a usable page is selected before the command runs. Failures are reported
+        in the returned dictionary rather than raised.
+
+        Args:
+            cmd: Command name (visit_url, click, type, scroll, web_search)
+            params: Command parameters:
+                visit_url: url (required)
+                click: x, y (required)
+                type: text (required)
+                scroll: delta_x, delta_y (both default to 0)
+                web_search: query (required)
+
+        Returns:
+            Result dictionary with success status and any data; ``error`` holds the
+            message when ``success`` is False
+        """
+        try:
+            await self._ensure_initialized()
+        except Exception as e:
+            error_msg = getattr(self, '_initialization_error', None) or str(e)
+            logger.error(f"Browser initialization failed: {error_msg}")
+            return {
+                "success": False,
+                "error": f"Browser initialization failed: {error_msg}. "
+                         f"Make sure Playwright and Firefox are installed, and DISPLAY is set correctly."
+            }
+
+        # Keep the current page only if it is still open.
+        try:
+            page_valid = self.page is not None and not self.page.is_closed()
+        except Exception as e:
+            logger.warning(f"Page is invalid: {e}, will get a new page...")
+            page_valid = False
+
+        if not page_valid:
+            try:
+                # Prefer a page that is already open in the window over a new tab.
+                self.page = self._first_open_page()
+                if self.page is not None:
+                    logger.info("Reusing existing open page")
+                else:
+                    self.page = await self.context.new_page()
+                    logger.info("Created new page")
+            except Exception as e:
+                logger.error(f"Failed to get new page: {e}, browser may be closed")
+                # The whole browser is gone: drop every handle and launch it again.
+                try:
+                    logger.info("Browser was closed manually, reinitializing...")
+                    self._initialized = False
+                    self.context = None
+                    self.page = None
+                    if self.playwright:
+                        try:
+                            await self.playwright.stop()
+                        except Exception as stop_error:
+                            logger.debug(f"Ignoring error while stopping Playwright: {stop_error}")
+                        self.playwright = None
+
+                    # A successful initialization also selects self.page.
+                    await self._ensure_initialized()
+                    logger.info("Browser reopened successfully after manual closure")
+                except Exception as reinit_error:
+                    logger.error(f"Failed to reinitialize browser: {reinit_error}")
+                    import traceback
+                    logger.error(traceback.format_exc())
+                    return {"success": False, "error": f"Browser was closed and cannot be recovered: {reinit_error}"}
+
+        # Dispatch the command; any error raised below becomes an error result.
+        try:
+            if cmd == "visit_url":
+                url = params.get("url")
+                if not url:
+                    return {"success": False, "error": "url parameter is required"}
+                await self.page.goto(url, wait_until="domcontentloaded", timeout=30000)
+                return {"success": True, "url": self.page.url}
+
+            elif cmd == "click":
+                x = params.get("x")
+                y = params.get("y")
+                if x is None or y is None:
+                    return {"success": False, "error": "x and y parameters are required"}
+                await self.page.mouse.click(x, y)
+                return {"success": True}
+
+            elif cmd == "type":
+                text = params.get("text")
+                if text is None:
+                    return {"success": False, "error": "text parameter is required"}
+                await self.page.keyboard.type(text)
+                return {"success": True}
+
+            elif cmd == "scroll":
+                delta_x = params.get("delta_x", 0)
+                delta_y = params.get("delta_y", 0)
+                await self.page.mouse.wheel(delta_x, delta_y)
+                return {"success": True}
+
+            elif cmd == "web_search":
+                query = params.get("query")
+                if not query:
+                    return {"success": False, "error": "query parameter is required"}
+                # Percent-encode the query: a raw "&", "#", "+" or space would
+                # otherwise truncate or alter the search that Google receives.
+                from urllib.parse import quote_plus
+
+                search_url = f"https://www.google.com/search?q={quote_plus(str(query))}"
+                await self.page.goto(search_url, wait_until="domcontentloaded", timeout=30000)
+                return {"success": True, "url": self.page.url}
+
+            else:
+                return {"success": False, "error": f"Unknown command: {cmd}"}
+
+        except Exception as e:
+            logger.error(f"Error executing command {cmd}: {e}")
+            import traceback
+            logger.error(traceback.format_exc())
+            # If the page was closed during the command, line up a usable page so
+            # the next command does not fail for the same reason.
+            if "closed" in str(e).lower() and self.context:
+                try:
+                    self.page = self._first_open_page()
+                    if self.page is not None:
+                        logger.info("Recovered page after error")
+                    else:
+                        self.page = await self.context.new_page()
+                        logger.info("Created new page after error")
+                except Exception as recover_error:
+                    logger.error(f"Failed to recover page: {recover_error}")
+            return {"success": False, "error": str(e)}
+
+    def _first_open_page(self):
+        """Return the first page of the current context that is not closed, or None."""
+        for candidate in self.context.pages:
+            try:
+                if not candidate.is_closed():
+                    return candidate
+            except Exception:
+                # A page object in a bad state is skipped, not fatal.
+                continue
+        return None
+
+    async def close(self):
+        """Close the browser and release Playwright; state is reset even if a step fails."""
+        async with self._lock:
+            # Detach the handles first so a failing step cannot leave stale state that
+            # would make the next command believe the browser is still running.
+            context, browser, playwright = self.context, self.browser, self.playwright
+            self.context = self.browser = self.playwright = self.page = None
+            self._initialized = False
+            failed = False
+            # Every step is attempted independently so one failure cannot leak the rest.
+            for resource, method in ((context, "close"), (browser, "close"), (playwright, "stop")):
+                if resource is None:
+                    continue
+                try:
+                    await getattr(resource, method)()
+                except Exception as e:
+                    failed = True
+                    logger.error(f"Error closing browser: {e}")
+            if not failed:
+                logger.info("Browser closed successfully")
+
+
+# Global instance
+_browser_manager: Optional[BrowserManager] = None
+
+
+def get_browser_manager() -> BrowserManager:
+    """Return the shared BrowserManager, constructing it on the first call."""
+    global _browser_manager
+    # Lazy singleton: keep the existing instance, otherwise build one now.
+    _browser_manager = _browser_manager or BrowserManager()
+    return _browser_manager
+
diff --git a/libs/xfce/main.py b/libs/xfce/main.py
new file mode 100644
index 00000000..9bad59bf
--- /dev/null
+++ b/libs/xfce/main.py
@@ -0,0 +1,820 @@
+import asyncio
+import hashlib
+import inspect
+import json
+import logging
+import os
+import platform
+import time
+import traceback
+from contextlib import redirect_stderr, redirect_stdout
+from io import StringIO
+from typing import Any, Dict, List, Literal, Optional, Union, cast
+
+import aiohttp
+import uvicorn
+from fastapi import (
+    FastAPI,
+    Header,
+    HTTPException,
+    Request,
+    WebSocket,
+    WebSocketDisconnect,
+)
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse, StreamingResponse
+
+from .handlers.factory import HandlerFactory
+from .browser import get_browser_manager
+
+# Authentication session TTL (in seconds). Override via env var CUA_AUTH_TTL_SECONDS. Default: 60s
+AUTH_SESSION_TTL_SECONDS: int = int(os.environ.get("CUA_AUTH_TTL_SECONDS", "60"))
+
+try:
+    from agent import ComputerAgent
+
+    HAS_AGENT = True
+except ImportError:
+    HAS_AGENT = False
+
+# Module-level logger with its level set to INFO
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+# Configure WebSocket with larger message size
+WEBSOCKET_MAX_SIZE = 1024 * 1024 * 10  # 10MB limit
+
+# Configure application with WebSocket settings
+app = FastAPI(
+    title="Computer API",
+    description="API for the Computer project",
+    version="0.1.0",
+    websocket_max_size=WEBSOCKET_MAX_SIZE,
+)
+
+# CORS configuration
+origins = ["*"]
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+protocol_version = 1
+try:
+    from importlib.metadata import version
+
+    package_version = version("cua-computer-server")
+except Exception:
+    # Fallback for cases where package is not installed or importlib.metadata is not available
+    try:
+        import pkg_resources
+
+        package_version = pkg_resources.get_distribution("cua-computer-server").version
+    except Exception:
+        package_version = "unknown"
+
+(
+    accessibility_handler,
+    automation_handler,
+    diorama_handler,
+    file_handler,
+    desktop_handler,
+    window_handler,
+) = HandlerFactory.create_handlers()
+handlers = {
+    "version": lambda: {"protocol": protocol_version, "package": package_version},
+    # App-Use commands
+    "diorama_cmd": diorama_handler.diorama_cmd,
+    # Accessibility commands
+    "get_accessibility_tree": accessibility_handler.get_accessibility_tree,
+    "find_element": accessibility_handler.find_element,
+    # Shell commands
+    "run_command": automation_handler.run_command,
+    # File system commands
+    "file_exists": file_handler.file_exists,
+    "directory_exists": file_handler.directory_exists,
+    "list_dir": file_handler.list_dir,
+    "read_text": file_handler.read_text,
+    "write_text": file_handler.write_text,
+    "read_bytes": file_handler.read_bytes,
+    "write_bytes": file_handler.write_bytes,
+    "get_file_size": file_handler.get_file_size,
+    "delete_file": file_handler.delete_file,
+    "create_dir": file_handler.create_dir,
+    "delete_dir": file_handler.delete_dir,
+    # Desktop commands
+    "get_desktop_environment": desktop_handler.get_desktop_environment,
+    "set_wallpaper": desktop_handler.set_wallpaper,
+    # Window management
+    "open": window_handler.open,
+    "launch": window_handler.launch,
+    "get_current_window_id": window_handler.get_current_window_id,
+    "get_application_windows": window_handler.get_application_windows,
+    "get_window_name": window_handler.get_window_name,
+    "get_window_size": window_handler.get_window_size,
+    "get_window_position": window_handler.get_window_position,
+    "set_window_size": window_handler.set_window_size,
+    "set_window_position": window_handler.set_window_position,
+    "maximize_window": window_handler.maximize_window,
+    "minimize_window": window_handler.minimize_window,
+    "activate_window": window_handler.activate_window,
+    "close_window": window_handler.close_window,
+    # Mouse commands
+    "mouse_down": automation_handler.mouse_down,
+    "mouse_up": automation_handler.mouse_up,
+    "left_click": automation_handler.left_click,
+    "right_click": automation_handler.right_click,
+    "double_click": automation_handler.double_click,
+    "move_cursor": automation_handler.move_cursor,
+    "drag_to": automation_handler.drag_to,
+    "drag": automation_handler.drag,
+    # Keyboard commands
+    "key_down": automation_handler.key_down,
+    "key_up": automation_handler.key_up,
+    "type_text": automation_handler.type_text,
+    "press_key": automation_handler.press_key,
+    "hotkey": automation_handler.hotkey,
+    # Scrolling actions
+    "scroll": automation_handler.scroll,
+    "scroll_down": automation_handler.scroll_down,
+    "scroll_up": automation_handler.scroll_up,
+    # Screen actions
+    "screenshot": automation_handler.screenshot,
+    "get_cursor_position": automation_handler.get_cursor_position,
+    "get_screen_size": automation_handler.get_screen_size,
+    # Clipboard actions
+    "copy_to_clipboard": automation_handler.copy_to_clipboard,
+    "set_clipboard": automation_handler.set_clipboard,
+}
+
+
+class AuthenticationManager:
+    """Validate container/API-key pairs against the remote auth API, caching results."""
+
+    def __init__(self):
+        # credential hash -> {"valid": bool, "expires_at": epoch seconds}
+        self.sessions: Dict[str, Dict[str, Any]] = {}
+        # When unset, auth() allows every caller (local development).
+        self.container_name = os.environ.get("CONTAINER_NAME")
+
+    def _hash_credentials(self, container_name: str, api_key: str) -> str:
+        """Create a hash of container name and API key for session identification"""
+        combined = f"{container_name}:{api_key}"
+        return hashlib.sha256(combined.encode()).hexdigest()
+
+    def _is_session_valid(self, session_data: Dict[str, Any]) -> bool:
+        """Check if a session is still valid based on expiration time"""
+        if not session_data.get("valid", False):
+            return False
+
+        expires_at = session_data.get("expires_at", 0)
+        return time.time() < expires_at
+
+    def _cache_result(self, session_hash: str, is_valid: bool) -> None:
+        """Store an auth result with the configured TTL and drop expired entries."""
+        now = time.time()
+        # Pruning here keeps the cache from growing with every distinct bad key.
+        self.sessions = {
+            key: entry for key, entry in self.sessions.items() if now < entry.get("expires_at", 0)
+        }
+        self.sessions[session_hash] = {
+            "valid": is_valid,
+            "expires_at": now + AUTH_SESSION_TTL_SECONDS,
+        }
+
+    async def auth(self, container_name: str, api_key: str) -> bool:
+        """Authenticate container name and API key, using cached sessions when possible"""
+        # If no CONTAINER_NAME is set, always allow access (local development)
+        if not self.container_name:
+            logger.info(
+                "No CONTAINER_NAME set in environment. Allowing access (local development mode)"
+            )
+            return True
+
+        # Layer 1: VM Identity Verification
+        if container_name != self.container_name:
+            logger.warning(
+                f"VM name mismatch. Expected: {self.container_name}, Got: {container_name}"
+            )
+            return False
+
+        # Create hash for session lookup
+        session_hash = self._hash_credentials(container_name, api_key)
+        # Serve any unexpired cached result; a cached failure counts too, otherwise
+        # every retry with bad credentials would trigger a new remote request.
+        session_data = self.sessions.get(session_hash)
+        if session_data is not None:
+            if time.time() < session_data.get("expires_at", 0):
+                logger.info(f"Using cached authentication for container: {container_name}")
+                return bool(session_data.get("valid", False))
+            # Remove expired session
+            del self.sessions[session_hash]
+
+        # No valid cached session, authenticate with API
+        logger.info(f"Authenticating with TryCUA API for container: {container_name}")
+        status_code = None
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(
+                    "https://www.cua.ai/api/vm/auth",
+                    # Passed as params so that aiohttp URL-encodes the name.
+                    params={"container_name": container_name},
+                    headers={"Authorization": f"Bearer {api_key}"},
+                ) as resp:
+                    is_valid = resp.status == 200 and bool((await resp.text()).strip())
+                    status_code = resp.status
+        except aiohttp.ClientError as e:
+            logger.error(f"Failed to validate API key with TryCUA API: {str(e)}")
+            is_valid = False
+        except Exception as e:
+            logger.error(f"Unexpected error during authentication: {str(e)}")
+            is_valid = False
+
+        # Cache the outcome (failures included) to avoid repeated requests
+        self._cache_result(session_hash, is_valid)
+        if is_valid:
+            logger.info(f"Authentication successful for container: {container_name}")
+        elif status_code is not None:
+            logger.warning(
+                f"Authentication failed for container: {container_name}. Status: {status_code}"
+            )
+        return is_valid
+
+
+class ConnectionManager:  # registry of WebSocket connections that have been accepted
+    def __init__(self):
+        self.active_connections: List[WebSocket] = []  # kept in order of acceptance
+
+    async def connect(self, websocket: WebSocket):  # accept the handshake, then track the socket
+        await websocket.accept()
+        self.active_connections.append(websocket)
+
+    def disconnect(self, websocket: WebSocket):  # list.remove raises ValueError if not tracked
+        self.active_connections.remove(websocket)
+
+
+manager = ConnectionManager()
+auth_manager = AuthenticationManager()
+
+
+@app.get("/status")
+async def status():
+    """Report liveness, the host OS family and the optional features available."""
+    system_name = platform.system().lower()
+    # Collapse the platform name into macos / windows, with linux as the fallback.
+    if "darwin" in system_name or system_name in ("macos", "mac"):
+        os_type = "macos"
+    elif "windows" in system_name:
+        os_type = "windows"
+    else:
+        os_type = "linux"
+    # Optional features: "agent" is advertised only when the ComputerAgent
+    # import succeeded at module load.
+    features = ["agent"] if HAS_AGENT else []
+    return {"status": "ok", "os_type": os_type, "features": features}
+
+
+@app.websocket("/ws", name="websocket_endpoint")
+async def websocket_endpoint(websocket: WebSocket):
+    """Serve handler commands over a WebSocket; cloud mode requires an auth handshake.
+
+    With CONTAINER_NAME set, the first message must be an ``authenticate`` command
+    whose params carry ``api_key`` and ``container_name``.
+    """
+    # WebSocket message size is configured at the app or endpoint level, not on the instance
+    await manager.connect(websocket)
+
+    async def reject(error: str) -> None:
+        """Report a failed handshake, close the socket and unregister it."""
+        try:
+            await websocket.send_json({"success": False, "error": error})
+            await websocket.close()
+        except Exception as close_error:
+            # The peer may already be gone; the connection must still be unregistered.
+            logger.debug(f"Could not deliver handshake rejection: {close_error}")
+        finally:
+            manager.disconnect(websocket)
+
+    # Check if CONTAINER_NAME is set (indicating cloud provider)
+    server_container_name = os.environ.get("CONTAINER_NAME")
+
+    # If cloud provider, perform authentication handshake
+    if server_container_name:
+        try:
+            logger.info(
+                f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Waiting for authentication..."
+            )
+
+            # Wait for authentication message
+            auth_data = await websocket.receive_json()
+
+            # Validate auth message format
+            if auth_data.get("command") != "authenticate":
+                await reject("First message must be authentication")
+                return
+
+            # Extract credentials
+            auth_params = auth_data.get("params", {})
+            client_api_key = auth_params.get("api_key")
+            client_container_name = auth_params.get("container_name")
+
+            if not client_api_key:
+                await reject("API key required")
+                return
+            if not client_container_name:
+                await reject("Container name required")
+                return
+
+            is_authenticated = await auth_manager.auth(client_container_name, client_api_key)
+            if not is_authenticated:
+                await reject("Authentication failed")
+                return
+
+            logger.info(f"Authentication successful for VM: {client_container_name}")
+            await websocket.send_json({"success": True, "message": "Authentication successful"})
+
+        except Exception as e:
+            logger.error(f"Error during authentication handshake: {str(e)}")
+            await reject("Authentication failed")
+            return
+
+    try:
+        while True:
+            try:
+                data = await websocket.receive_json()
+                command = data.get("command")
+                params = data.get("params", {})
+
+                if command not in handlers:
+                    await websocket.send_json(
+                        {"success": False, "error": f"Unknown command: {command}"}
+                    )
+                    continue
+
+                try:
+                    # Filter params to only include those accepted by the handler function
+                    handler_func = handlers[command]
+                    sig = inspect.signature(handler_func)
+                    filtered_params = {k: v for k, v in params.items() if k in sig.parameters}
+
+                    # Handle both sync and async functions
+                    if asyncio.iscoroutinefunction(handler_func):
+                        result = await handler_func(**filtered_params)
+                    else:
+                        # Run sync functions in thread pool to avoid blocking event loop
+                        result = await asyncio.to_thread(handler_func, **filtered_params)
+                    await websocket.send_json({"success": True, **result})
+                except Exception as cmd_error:
+                    logger.error(f"Error executing command {command}: {str(cmd_error)}")
+                    logger.error(traceback.format_exc())
+                    await websocket.send_json({"success": False, "error": str(cmd_error)})
+
+            except WebSocketDisconnect:
+                raise
+            except json.JSONDecodeError as json_err:
+                logger.error(f"JSON decode error: {str(json_err)}")
+                await websocket.send_json(
+                    {"success": False, "error": f"Invalid JSON: {str(json_err)}"}
+                )
+            except Exception as loop_error:
+                logger.error(f"Error in message loop: {str(loop_error)}")
+                logger.error(traceback.format_exc())
+                await websocket.send_json({"success": False, "error": str(loop_error)})
+
+    except WebSocketDisconnect:
+        logger.info("Client disconnected")
+    except Exception as e:
+        logger.error(f"Fatal error in websocket connection: {str(e)}")
+        logger.error(traceback.format_exc())
+        try:
+            await websocket.close()
+        except Exception as close_error:
+            logger.debug(f"Ignoring error while closing websocket: {close_error}")
+    finally:
+        manager.disconnect(websocket)
+
+
+@app.post("/cmd")
+async def cmd_endpoint(
+    request: Request,
+    container_name: Optional[str] = Header(None, alias="X-Container-Name"),
+    api_key: Optional[str] = Header(None, alias="X-API-Key"),
+):
+    """
+    Backup endpoint for when WebSocket connections fail.
+    Runs one handler command per POST and streams the result as a single `data: <json>` frame.
+
+    Headers:
+    - X-Container-Name: Container name for cloud authentication (checked only if CONTAINER_NAME is set)
+    - X-API-Key: API key for cloud authentication (checked only if CONTAINER_NAME is set)
+
+    Body:
+    {
+        "command": "command_name",
+        "params": {...}
+    }
+    """
+    global handlers
+
+    # Parse request body; a JSON value without .get (e.g. a list) also yields this 400
+    try:
+        body = await request.json()
+        command = body.get("command")
+        params = body.get("params", {})
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")
+
+    if not command:
+        raise HTTPException(status_code=400, detail="Command is required")
+
+    # Check if CONTAINER_NAME is set (indicating cloud provider)
+    server_container_name = os.environ.get("CONTAINER_NAME")
+
+    # If cloud provider, perform authentication
+    if server_container_name:
+        logger.info(
+            f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication..."
+        )
+
+        # Validate required headers
+        if not container_name:
+            raise HTTPException(status_code=401, detail="Container name required")
+
+        if not api_key:
+            raise HTTPException(status_code=401, detail="API key required")
+
+        # Validate with AuthenticationManager
+        is_authenticated = await auth_manager.auth(container_name, api_key)
+        if not is_authenticated:
+            raise HTTPException(status_code=401, detail="Authentication failed")
+
+    if command not in handlers:
+        raise HTTPException(status_code=400, detail=f"Unknown command: {command}")
+
+    async def generate_response():
+        """Generate streaming response for the command execution"""
+        try:
+            # Filter params to only include those accepted by the handler function
+            handler_func = handlers[command]
+            sig = inspect.signature(handler_func)
+            filtered_params = {k: v for k, v in params.items() if k in sig.parameters}
+
+            # Handle both sync and async functions
+            if asyncio.iscoroutinefunction(handler_func):
+                result = await handler_func(**filtered_params)
+            else:
+                # Run sync functions in thread pool to avoid blocking event loop
+                result = await asyncio.to_thread(handler_func, **filtered_params)
+
+            # Stream the successful result
+            response_data = {"success": True, **result}
+            yield f"data: {json.dumps(response_data)}\n\n"
+
+        except Exception as cmd_error:
+            logger.error(f"Error executing command {command}: {str(cmd_error)}")
+            logger.error(traceback.format_exc())
+
+            # Stream the error result in-band (it is not turned into an HTTP error status here)
+            error_data = {"success": False, "error": str(cmd_error)}
+            yield f"data: {json.dumps(error_data)}\n\n"
+
+    return StreamingResponse(
+        generate_response(),
+        media_type="text/plain",  # NOTE(review): frames use "data: ..." framing; confirm text/plain is intended
+        headers={
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+        },
+    )
+
+
+@app.post("/responses")
+async def agent_response_endpoint(
+    request: Request,
+    api_key: Optional[str] = Header(None, alias="X-API-Key"),
+):
+    """
+    Minimal proxy that runs a ComputerAgent against this machine.
+
+    The agent is iterated until a step finishes with no computer call left
+    unanswered; everything it produced is returned in one JSON response.
+
+    Security:
+    - If CONTAINER_NAME is set on the server, require X-API-Key
+      and validate using AuthenticationManager unless CUA_ENABLE_PUBLIC_PROXY is true.
+
+    Body JSON:
+    {
+      "model": "...",                 # required
+      "input": "... or messages[]",   # required
+      "agent_kwargs": { ... },         # optional, passed directly to ComputerAgent
+      "env": { ... }                   # optional env overrides for agent
+    }
+
+    Returns:
+        JSONResponse with "model", "error", "output", "usage" and "status"
+        ("completed" or "failed"). Exceptions raised while iterating the agent
+        are reported through "error"/"status" instead of an HTTP error.
+
+    Raises:
+        HTTPException: 501 when ComputerAgent is not available, 401 when
+            authentication is required and fails, 400 when the body is not a
+            JSON object, lacks "model"/"input", or "input" has an unsupported type.
+    """
+    if not HAS_AGENT:
+        raise HTTPException(status_code=501, detail="ComputerAgent not available")
+
+    # Authenticate via AuthenticationManager if running in cloud (CONTAINER_NAME set)
+    container_name = os.environ.get("CONTAINER_NAME")
+    if container_name:
+        is_public = os.environ.get("CUA_ENABLE_PUBLIC_PROXY", "").lower().strip() in [
+            "1",
+            "true",
+            "yes",
+            "y",
+            "on",
+        ]
+        if not is_public:
+            if not api_key:
+                raise HTTPException(status_code=401, detail="Missing AGENT PROXY auth headers")
+            ok = await auth_manager.auth(container_name, api_key)
+            if not ok:
+                raise HTTPException(status_code=401, detail="Unauthorized")
+
+    # Parse request body
+    try:
+        body = await request.json()
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")
+
+    # Valid JSON that is not an object (e.g. a list) has no .get(); reject it
+    # here instead of failing with an AttributeError further down.
+    if not isinstance(body, dict):
+        raise HTTPException(status_code=400, detail="Request body must be a JSON object")
+
+    model = body.get("model")
+    input_data = body.get("input")
+    if not model or input_data is None:
+        raise HTTPException(status_code=400, detail="'model' and 'input' are required")
+
+    agent_kwargs: Dict[str, Any] = body.get("agent_kwargs") or {}
+    env_overrides: Dict[str, str] = body.get("env") or {}
+
+    # Simple env override context
+    class _EnvOverride:
+        """Temporarily set environment variables and restore them on exit."""
+
+        def __init__(self, overrides: Dict[str, str]):
+            self.overrides = overrides
+            # Previous value per overridden key; None means the key was unset.
+            self._original: Dict[str, Optional[str]] = {}
+
+        def __enter__(self):
+            for k, v in (self.overrides or {}).items():
+                self._original[k] = os.environ.get(k)
+                os.environ[k] = str(v)
+            return self
+
+        def __exit__(self, exc_type, exc, tb):
+            for k, old in self._original.items():
+                if old is None:
+                    os.environ.pop(k, None)
+                else:
+                    os.environ[k] = old
+
+    # Convert input to messages
+    def _to_messages(data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
+        """Wrap a plain string as a single user message; pass a message list through."""
+        if isinstance(data, str):
+            return [{"role": "user", "content": data}]
+        if isinstance(data, list):
+            return data
+        # Any other JSON type would otherwise become None and be handed to the agent.
+        raise HTTPException(
+            status_code=400, detail="'input' must be a string or a list of messages"
+        )
+
+    messages = _to_messages(input_data)
+
+    # Define a direct computer tool that implements the AsyncComputerHandler protocol
+    # and delegates to our existing automation/file/accessibility handlers.
+    from agent.computers import AsyncComputerHandler  # runtime-checkable Protocol
+
+    class DirectComputer(AsyncComputerHandler):
+        """Computer handler backed by this server's own module-level handlers."""
+
+        def __init__(self):
+            # use module-scope handler singletons created by HandlerFactory
+            self._auto = automation_handler
+            self._file = file_handler
+            self._access = accessibility_handler
+
+        async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
+            """Map platform.system() to an environment name; unknown systems report "linux"."""
+            sys = platform.system().lower()
+            if "darwin" in sys or sys in ("macos", "mac"):
+                return "mac"
+            if "windows" in sys:
+                return "windows"
+            return "linux"
+
+        async def get_dimensions(self) -> tuple[int, int]:
+            """Return the screen size as (width, height)."""
+            size = await self._auto.get_screen_size()
+            return size["width"], size["height"]
+
+        async def screenshot(self) -> str:
+            """Return the "image_data" field of the automation handler's screenshot."""
+            img_b64 = await self._auto.screenshot()
+            return img_b64["image_data"]
+
+        async def click(self, x: int, y: int, button: str = "left") -> None:
+            """Click at (x, y); any button other than "right" is treated as a left click."""
+            if button == "right":
+                await self._auto.right_click(x, y)
+            else:
+                await self._auto.left_click(x, y)
+
+        async def double_click(self, x: int, y: int) -> None:
+            await self._auto.double_click(x, y)
+
+        async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
+            """Move the cursor to (x, y), then scroll by (scroll_x, scroll_y)."""
+            await self._auto.move_cursor(x, y)
+            await self._auto.scroll(scroll_x, scroll_y)
+
+        async def type(self, text: str) -> None:
+            await self._auto.type_text(text)
+
+        async def wait(self, ms: int = 1000) -> None:
+            """Sleep for ms milliseconds without blocking the event loop."""
+            await asyncio.sleep(ms / 1000.0)
+
+        async def move(self, x: int, y: int) -> None:
+            await self._auto.move_cursor(x, y)
+
+        async def keypress(self, keys: Union[List[str], str]) -> None:
+            """Press one key, or a combination given as a list or a "+"/"-" joined string."""
+            if isinstance(keys, str):
+                # A one-character string is a literal key, so "-" and "+" are not split.
+                parts = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys]
+            else:
+                parts = keys
+            if len(parts) == 1:
+                await self._auto.press_key(parts[0])
+            else:
+                await self._auto.hotkey(parts)
+
+        async def drag(self, path: List[Dict[str, int]]) -> None:
+            """Press at the first point, move through the rest, release at the last point."""
+            if not path:
+                return
+            start = path[0]
+            await self._auto.mouse_down(start["x"], start["y"])
+            for pt in path[1:]:
+                await self._auto.move_cursor(pt["x"], pt["y"])
+            end = path[-1]
+            await self._auto.mouse_up(end["x"], end["y"])
+
+        async def get_current_url(self) -> str:
+            # Not available in this server context
+            return ""
+
+        async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
+            await self._auto.mouse_down(x, y, button="left")
+
+        async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
+            await self._auto.mouse_up(x, y, button="left")
+
+    # NOTE: inlining remote "input_image" URLs as base64 data URLs was prototyped
+    # here and left disabled; messages are forwarded to the agent unchanged.
+
+    error = None
+    total_output: List[Any] = []
+    total_usage: Dict[str, Any] = {}
+
+    # NOTE: os.environ is process-wide, so these overrides are visible to any other
+    # request that runs while this one is awaiting the agent.
+    with _EnvOverride(env_overrides):
+        # Prepare tools: if caller did not pass tools, inject our DirectComputer
+        tools = agent_kwargs.get("tools")
+        if not tools:
+            tools = [DirectComputer()]
+            agent_kwargs = {**agent_kwargs, "tools": tools}
+        # Instantiate agent with our tools
+        agent = ComputerAgent(model=model, **agent_kwargs)  # type: ignore[arg-type]
+
+        pending_computer_call_ids = set()
+        try:
+            async for result in agent.run(messages):
+                step_output = result.get("output", [])
+                total_output += step_output
+                # Try to collect usage if present
+                usage = result.get("usage")
+                if isinstance(usage, dict):
+                    # Merge usage counters: numbers are summed, anything else is overwritten
+                    for k, v in usage.items():
+                        if isinstance(v, (int, float)):
+                            total_usage[k] = total_usage.get(k, 0) + v
+                        else:
+                            total_usage[k] = v
+                for msg in step_output:
+                    if msg.get("type") == "computer_call":
+                        pending_computer_call_ids.add(msg["call_id"])
+                    elif msg.get("type") == "computer_call_output":
+                        pending_computer_call_ids.discard(msg["call_id"])
+                # exit if no pending computer calls
+                if not pending_computer_call_ids:
+                    break
+        except Exception as e:
+            logger.error(f"Error running agent: {str(e)}")
+            logger.error(traceback.format_exc())
+            error = str(e)
+
+    # Build response payload
+    payload = {
+        "model": model,
+        "error": error,
+        "output": total_output,
+        "usage": total_usage,
+        "status": "completed" if not error else "failed",
+    }
+
+    # Caching/connection hints only; no CORS headers are set here.
+    headers = {
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+    }
+
+    return JSONResponse(content=payload, headers=headers)
+
+
+@app.post("/playwright_exec")
+async def playwright_exec_endpoint(
+    request: Request,
+    container_name: Optional[str] = Header(None, alias="X-Container-Name"),
+    api_key: Optional[str] = Header(None, alias="X-API-Key"),
+):
+    """
+    Execute Playwright browser commands.
+
+    Headers:
+    - X-Container-Name: Container name for cloud authentication
+    - X-API-Key: API key for cloud authentication
+
+    Body:
+    {
+        "command": "visit_url|click|type|scroll|web_search",
+        "params": {...}
+    }
+
+    Returns:
+        JSONResponse carrying the browser manager's result when it reports success.
+
+    Raises:
+        HTTPException: 400 for an invalid body, a missing command, or a command the
+            browser manager reports as failed; 401 when CONTAINER_NAME is set and
+            authentication fails; 500 when executing the command raises.
+    """
+    # Parse request body
+    try:
+        body = await request.json()
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")
+
+    # Valid JSON that is not an object (e.g. a list) has no "command" to read.
+    if not isinstance(body, dict):
+        raise HTTPException(status_code=400, detail="Invalid JSON body: expected an object")
+
+    command = body.get("command")
+    # Treat a missing or null "params" as no parameters.
+    params = body.get("params") or {}
+
+    if not command:
+        raise HTTPException(status_code=400, detail="Command is required")
+
+    # Check if CONTAINER_NAME is set (indicating cloud provider)
+    server_container_name = os.environ.get("CONTAINER_NAME")
+
+    # If cloud provider, perform authentication
+    if server_container_name:
+        logger.info(
+            f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication..."
+        )
+
+        # Validate required headers
+        if not container_name:
+            raise HTTPException(status_code=401, detail="Container name required")
+
+        if not api_key:
+            raise HTTPException(status_code=401, detail="API key required")
+
+        # Validate with AuthenticationManager
+        is_authenticated = await auth_manager.auth(container_name, api_key)
+        if not is_authenticated:
+            raise HTTPException(status_code=401, detail="Authentication failed")
+
+    # Get browser manager and execute command
+    try:
+        browser_manager = get_browser_manager()
+        result = await browser_manager.execute_command(command, params)
+    except Exception as e:
+        logger.error(f"Error executing playwright command: {str(e)}")
+        logger.error(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=str(e))
+
+    # Checked outside the try block: raising the 400 inside it would be caught by
+    # the generic handler above and turned into a 500.
+    if not result.get("success"):
+        raise HTTPException(status_code=400, detail=result.get("error", "Command failed"))
+    return JSONResponse(content=result)
+
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/libs/xfce/src/scripts/start-computer-server.sh b/libs/xfce/src/scripts/start-computer-server.sh
index bc27a3db..1e52e536 100644
--- a/libs/xfce/src/scripts/start-computer-server.sh
+++ b/libs/xfce/src/scripts/start-computer-server.sh
@@ -10,4 +10,4 @@ echo "X server is ready"
 
 # Start computer-server
 export DISPLAY=:1
-python3 -m computer_server --port ${API_PORT:-8000}
+python -m computer_server --port ${API_PORT:-8000}

From ddfb53e79f4b0bc98d01bc67beeb01ea5860b7d3 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 3 Dec 2025 08:17:52 -0800
Subject: [PATCH 2/8] Migrate browser interface into computer SDK

---
 .pre-commit-config.yaml                       |  2 +
 examples/BROWSER_TOOL_README.md               | 24 +++++++-
 examples/browser_tool_example.py              | 56 +++++++++---------
 libs/python/agent/agent/tools/browser_tool.py | 57 +++++--------------
 .../computer/computer/interface/generic.py    | 50 ++++++++++++++++
 5 files changed, 116 insertions(+), 73 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d9475d42..a2e35493 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -15,6 +15,8 @@ repos:
         name: TypeScript type check
         entry: node ./scripts/typescript-typecheck.js
         language: node
+        files: \.(ts|tsx)$
+        pass_filenames: false
 
   - repo: https://github.com/PyCQA/isort
     rev: 7.0.0
diff --git a/examples/BROWSER_TOOL_README.md b/examples/BROWSER_TOOL_README.md
index 8d12ae85..f72971e8 100644
--- a/examples/BROWSER_TOOL_README.md
+++ b/examples/BROWSER_TOOL_README.md
@@ -40,10 +40,31 @@ python examples/browser_tool_example.py
 - **Auto-Recovery**: Automatically reopens browser if closed manually
 - **Persistent Context**: Maintains cookies and sessions across commands
 - **Fara/Magentic-One Interface**: Compatible with Microsoft agent interfaces
+- **Computer SDK Integration**: Uses the Computer SDK's interface for unified control
+
+## Usage
+
+The BrowserTool uses the Computer SDK's interface to communicate with the server:
+
+```python
+from computer import Computer
+from agent.tools.browser_tool import BrowserTool
+
+# Initialize computer interface
+computer = Computer(ip_address="localhost")
+
+# Create browser tool with the interface
+browser = BrowserTool(interface=computer)
+
+# Use the browser
+await browser.visit_url("https://www.example.com")
+await browser.click(x=500, y=300)
+await browser.type("Hello, world!")
+```
 
 ## API Endpoint
 
-The browser tool is accessible via the `/playwright_exec` endpoint:
+The browser tool is also accessible via the `/playwright_exec` endpoint:
 
 ```bash
 curl -X POST http://localhost:8000/playwright_exec \
@@ -66,4 +87,3 @@ curl -X POST http://localhost:8000/playwright_exec \
 **Connection errors**: Make sure the server is running (`curl http://localhost:8000/status`).
 
 **Playwright not found**: Install with `pip install playwright && playwright install --with-deps firefox`.
-
diff --git a/examples/browser_tool_example.py b/examples/browser_tool_example.py
index 9705ca8f..11a8dead 100644
--- a/examples/browser_tool_example.py
+++ b/examples/browser_tool_example.py
@@ -19,18 +19,14 @@ import logging
 import sys
 from pathlib import Path
 
-# Import BrowserTool directly from the file
-browser_tool_path = Path(__file__).parent.parent / "libs" / "python" / "agent" / "agent" / "tools" / "browser_tool.py"
-sys.path.insert(0, str(browser_tool_path.parent.parent.parent))
+# Add the libs path to sys.path
+libs_path = Path(__file__).parent.parent / "libs" / "python"
+sys.path.insert(0, str(libs_path))
 
-# Import the module directly
-import importlib.util
-spec = importlib.util.spec_from_file_location("browser_tool", browser_tool_path)
-if spec is None or spec.loader is None:
-    raise ImportError(f"Could not load browser_tool from {browser_tool_path}")
-browser_tool_module = importlib.util.module_from_spec(spec)
-spec.loader.exec_module(browser_tool_module)
-BrowserTool = browser_tool_module.BrowserTool
+from agent.tools.browser_tool import BrowserTool
+
+# Import Computer interface and BrowserTool
+from computer import Computer
 
 # Configure logging to see what's happening
 logging.basicConfig(level=logging.INFO)
@@ -39,58 +35,60 @@ logger = logging.getLogger(__name__)
 
 async def test_browser_tool():
     """Test the BrowserTool with various commands."""
-    
-    # Initialize the browser tool
-    # For local testing, use http://localhost:8000
-    # For cloud, provide base_url, api_key, and container_name
-    browser = BrowserTool(base_url="http://localhost:8000")
-    
+
+    # Initialize the computer interface
+    # For local testing, use provider_type="docker"
+    # For provider_type="cloud", provide name and api_key
+    computer = Computer(provider_type="docker")
+
+    # Initialize the browser tool with the computer interface
+    browser = BrowserTool(interface=computer)
+
     logger.info("Testing Browser Tool...")
-    
+
     try:
         # Test 1: Visit a URL
         logger.info("Test 1: Visiting a URL...")
         result = await browser.visit_url("https://www.trycua.com")
         logger.info(f"Visit URL result: {result}")
-        
+
         # Wait a bit for the page to load
         await asyncio.sleep(2)
-        
+
         # Test 2: Web search
         logger.info("Test 2: Performing a web search...")
         result = await browser.web_search("Python programming")
         logger.info(f"Web search result: {result}")
-        
+
         # Wait a bit
         await asyncio.sleep(2)
-        
+
         # Test 3: Scroll
         logger.info("Test 3: Scrolling the page...")
         result = await browser.scroll(delta_x=0, delta_y=500)
         logger.info(f"Scroll result: {result}")
-        
+
         # Wait a bit
         await asyncio.sleep(1)
-        
+
         # Test 4: Click (example coordinates - adjust based on your screen)
         logger.info("Test 4: Clicking at coordinates...")
         result = await browser.click(x=500, y=300)
         logger.info(f"Click result: {result}")
-        
+
         # Wait a bit
         await asyncio.sleep(1)
-        
+
         # Test 5: Type text (if there's a focused input field)
         logger.info("Test 5: Typing text...")
         result = await browser.type("Hello from BrowserTool!")
         logger.info(f"Type result: {result}")
-        
+
         logger.info("All tests completed!")
-        
+
     except Exception as e:
         logger.error(f"Error during testing: {e}", exc_info=True)
 
 
 if __name__ == "__main__":
     asyncio.run(test_browser_tool())
-
diff --git a/libs/python/agent/agent/tools/browser_tool.py b/libs/python/agent/agent/tools/browser_tool.py
index 8f8b1ab9..85b6ba23 100644
--- a/libs/python/agent/agent/tools/browser_tool.py
+++ b/libs/python/agent/agent/tools/browser_tool.py
@@ -4,54 +4,36 @@ Allows agents to control a browser programmatically via Playwright.
 """
 
 import logging
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
 
-import aiohttp
+if TYPE_CHECKING:
+    from computer.interface import GenericComputerInterface
 
 logger = logging.getLogger(__name__)
 
 
 class BrowserTool:
     """
-    Browser tool that connects to the computer server's Playwright endpoint.
+    Browser tool that uses the computer SDK's interface to control a browser.
     Implements the Fara/Magentic-One agent interface for browser control.
     """
 
     def __init__(
         self,
-        base_url: str = "http://localhost:8000",
-        api_key: Optional[str] = None,
-        container_name: Optional[str] = None,
+        interface: "GenericComputerInterface",
     ):
         """
         Initialize the BrowserTool.
 
         Args:
-            base_url: Base URL of the computer server (default: http://localhost:8000)
-            api_key: Optional API key for cloud authentication
-            container_name: Optional container name for cloud authentication
+            interface: A GenericComputerInterface instance that provides playwright_exec
         """
-        self.base_url = base_url.rstrip("/")
-        self.api_key = api_key
-        self.container_name = container_name
+        self.interface = interface
         self.logger = logger
 
-    def _get_endpoint_url(self) -> str:
-        """Get the full URL for the playwright_exec endpoint."""
-        return f"{self.base_url}/playwright_exec"
-
-    def _get_headers(self) -> dict:
-        """Get headers for the HTTP request."""
-        headers = {"Content-Type": "application/json"}
-        if self.api_key:
-            headers["X-API-Key"] = self.api_key
-        if self.container_name:
-            headers["X-Container-Name"] = self.container_name
-        return headers
-
     async def _execute_command(self, command: str, params: dict) -> dict:
         """
-        Execute a browser command via HTTP POST.
+        Execute a browser command via the computer interface.
 
         Args:
             command: Command name
@@ -60,23 +42,15 @@ class BrowserTool:
         Returns:
             Response dictionary
         """
-        url = self._get_endpoint_url()
-        payload = {"command": command, "params": params}
-        headers = self._get_headers()
-
         try:
-            async with aiohttp.ClientSession() as session:
-                async with session.post(url, json=payload, headers=headers) as response:
-                    if response.status == 200:
-                        return await response.json()
-                    else:
-                        error_text = await response.text()
-                        self.logger.error(
-                            f"Browser command failed with status {response.status}: {error_text}"
-                        )
-                        return {"success": False, "error": error_text}
+            result = await self.interface.playwright_exec(command, params)
+            if not result.get("success"):
+                self.logger.error(
+                    f"Browser command '{command}' failed: {result.get('error', 'Unknown error')}"
+                )
+            return result
         except Exception as e:
-            self.logger.error(f"Error executing browser command: {e}")
+            self.logger.error(f"Error executing browser command '{command}': {e}")
             return {"success": False, "error": str(e)}
 
     async def visit_url(self, url: str) -> dict:
@@ -140,4 +114,3 @@ class BrowserTool:
             Response dictionary with success status and current URL
         """
         return await self._execute_command("web_search", {"query": query})
-
diff --git a/libs/python/computer/computer/interface/generic.py b/libs/python/computer/computer/interface/generic.py
index e58719dd..d5a5dc4b 100644
--- a/libs/python/computer/computer/interface/generic.py
+++ b/libs/python/computer/computer/interface/generic.py
@@ -661,6 +661,56 @@ class GenericComputerInterface(BaseComputerInterface):
 
         return screenshot_x, screenshot_y
 
+    # Playwright browser control
+    async def playwright_exec(self, command: str, params: Optional[Dict] = None) -> Dict[str, Any]:
+        """
+        Send one browser command to the server's /playwright_exec endpoint.
+
+        With an API key set, the request goes over https on port 8443 and carries
+        X-API-Key; otherwise it goes over http on port 8000. X-Container-Name is
+        sent whenever vm_name is set. This method never raises: transport errors
+        and non-200 replies come back as {"success": False, "error": ...}.
+
+        Args:
+            command: The browser command to execute (visit_url, click, type, scroll, web_search)
+            params: Command parameters; None is sent as an empty object
+
+        Returns:
+            The decoded JSON reply on HTTP 200, otherwise a failure dict whose
+            "error" is the response text or the exception message.
+
+        Examples:
+            await interface.playwright_exec("visit_url", {"url": "https://example.com"})
+            await interface.playwright_exec("click", {"x": 100, "y": 200})
+            await interface.playwright_exec("type", {"text": "Hello, world!"})
+            await interface.playwright_exec("scroll", {"delta_x": 0, "delta_y": -100})
+            await interface.playwright_exec("web_search", {"query": "computer use agent"})
+        """
+        # The presence of an API key selects the TLS endpoint.
+        scheme, port = ("https", "8443") if self.api_key else ("http", "8000")
+        endpoint = f"{scheme}://{self.ip_address}:{port}/playwright_exec"
+
+        request_headers = {"Content-Type": "application/json"}
+        if self.api_key:
+            request_headers["X-API-Key"] = self.api_key
+        if self.vm_name:
+            request_headers["X-Container-Name"] = self.vm_name
+
+        request_body = {"command": command, "params": params or {}}
+
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.post(
+                    endpoint, json=request_body, headers=request_headers
+                ) as response:
+                    if response.status != 200:
+                        failure_text = await response.text()
+                        return {"success": False, "error": failure_text}
+                    return await response.json()
+        except Exception as exc:
+            return {"success": False, "error": str(exc)}
+
     # Websocket Methods
     async def _keep_alive(self):
         """Keep the WebSocket connection alive with automatic reconnection."""

From 6fa66c18cc5e19dd884f1ebecca12449754cf690 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 3 Dec 2025 08:24:21 -0800
Subject: [PATCH 3/8] Unmerged Dockerfile, added Dockerfile.dev that mounts the
 local computer-server

---
 libs/xfce/Development.md  |  20 +
 libs/xfce/Dockerfile      |   6 -
 libs/xfce/Dockerfile.dev  | 159 ++++++++
 libs/xfce/README_BUILD.md |  32 --
 libs/xfce/browser.py      | 308 --------------
 libs/xfce/main.py         | 820 --------------------------------------
 6 files changed, 179 insertions(+), 1166 deletions(-)
 create mode 100644 libs/xfce/Development.md
 create mode 100644 libs/xfce/Dockerfile.dev
 delete mode 100644 libs/xfce/README_BUILD.md
 delete mode 100644 libs/xfce/browser.py
 delete mode 100644 libs/xfce/main.py

diff --git a/libs/xfce/Development.md b/libs/xfce/Development.md
new file mode 100644
index 00000000..4bc12c81
--- /dev/null
+++ b/libs/xfce/Development.md
@@ -0,0 +1,20 @@
+# Development
+
+## Building the Development Docker Image
+
+To build the XFCE container with local computer-server changes:
+
+```bash
+cd libs/xfce
+docker build -f Dockerfile.dev -t cua-xfce:dev ..
+```
+
+The build context is set to the parent directory to allow copying the local `computer-server` source.
+
+## Running the Development Container
+
+```bash
+docker run -p 6901:6901 -p 8000:8000 cua-xfce:dev
+```
+
+Access noVNC at: http://localhost:6901
diff --git a/libs/xfce/Dockerfile b/libs/xfce/Dockerfile
index 43dab80f..f1605181 100644
--- a/libs/xfce/Dockerfile
+++ b/libs/xfce/Dockerfile
@@ -107,12 +107,6 @@ RUN mkdir -p /home/cua/.cache && \
 # Install computer-server using Python 3.12 pip
 RUN python3.12 -m pip install cua-computer-server
 
-# Copy browser.py and updated main.py from local source (to include browser tool)
-# These files need to be in the same directory as the Dockerfile when building
-COPY browser.py /tmp/browser.py
-COPY main.py /tmp/main.py
-RUN python3.12 -c "import shutil; import os; cs_dir = '/usr/local/lib/python3.12/dist-packages/computer_server'; shutil.copy('/tmp/browser.py', f'{cs_dir}/browser.py'); shutil.copy('/tmp/main.py', f'{cs_dir}/main.py'); print('Copied browser.py and main.py')" && rm /tmp/browser.py /tmp/main.py
-
 # Install playwright and Firefox dependencies
 RUN python3.12 -m pip install playwright && \
     python3.12 -m playwright install --with-deps firefox
diff --git a/libs/xfce/Dockerfile.dev b/libs/xfce/Dockerfile.dev
new file mode 100644
index 00000000..12036a0c
--- /dev/null
+++ b/libs/xfce/Dockerfile.dev
@@ -0,0 +1,159 @@
+# CUA Docker XFCE Container - Development Version
+# Vanilla XFCE desktop with noVNC and computer-server (from local source); build with context libs/
+
+FROM ubuntu:22.04
+
+# Avoid prompts from apt
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Set environment variables
+ENV HOME=/home/cua
+ENV DISPLAY=:1
+ENV VNC_PORT=5901
+ENV NOVNC_PORT=6901
+ENV API_PORT=8000
+ENV VNC_RESOLUTION=1024x768
+ENV VNC_COL_DEPTH=24
+
+# Install system dependencies first (including sudo)
+RUN apt-get update && apt-get install -y \
+    # System utilities
+    sudo \
+    unzip \
+    zip \
+    xdg-utils \
+    # Desktop environment
+    xfce4 \
+    xfce4-terminal \
+    dbus-x11 \
+    # VNC server
+    tigervnc-standalone-server \
+    tigervnc-common \
+    # noVNC dependencies
+    # python will be installed via deadsnakes as 3.12 \
+    git \
+    net-tools \
+    netcat \
+    supervisor \
+    # Computer-server dependencies
+    # python-tk/dev for 3.12 will be installed later \
+    gnome-screenshot \
+    wmctrl \
+    ffmpeg \
+    socat \
+    xclip \
+    # Browser
+    wget \
+    software-properties-common \
+    # Build tools
+    build-essential \
+    libncursesw5-dev \
+    libssl-dev \
+    libsqlite3-dev \
+    tk-dev \
+    libgdbm-dev \
+    libc6-dev \
+    libbz2-dev \
+    libffi-dev \
+    zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python 3.12 from deadsnakes (keep system python3 for apt)
+RUN add-apt-repository -y ppa:deadsnakes/ppa && \
+    apt-get update && apt-get install -y \
+    python3.12 python3.12-venv python3.12-dev python3.12-tk && \
+    python3.12 -m ensurepip --upgrade && \
+    python3.12 -m pip install --upgrade pip setuptools wheel && \
+    rm -rf /var/lib/apt/lists/*
+
+# Ensure 'python' points to Python 3.12
+RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 2
+
+# Remove screensavers and power manager to avoid popups and lock screens
+RUN apt-get remove -y \
+    xfce4-power-manager \
+    xfce4-power-manager-data \
+    xfce4-power-manager-plugins \
+    xfce4-screensaver \
+    light-locker \
+    xscreensaver \
+    xscreensaver-data || true
+
+# Create user after sudo is installed
+RUN useradd -m -s /bin/bash -G sudo cua && \
+    echo "cua:cua" | chpasswd && \
+    echo "cua ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers
+
+# Install Firefox from Mozilla PPA (snap-free) - inline to avoid script issues
+RUN apt-get update && \
+    add-apt-repository -y ppa:mozillateam/ppa && \
+    echo 'Package: *\nPin: release o=LP-PPA-mozillateam\nPin-Priority: 1001' > /etc/apt/preferences.d/mozilla-firefox && \
+    apt-get update && \
+    apt-get install -y firefox && \
+    echo 'pref("datareporting.policy.firstRunURL", "");\npref("datareporting.policy.dataSubmissionEnabled", false);\npref("datareporting.healthreport.service.enabled", false);\npref("datareporting.healthreport.uploadEnabled", false);\npref("trailhead.firstrun.branches", "nofirstrun-empty");\npref("browser.aboutwelcome.enabled", false);' > /usr/lib/firefox/browser/defaults/preferences/firefox.js && \
+    update-alternatives --install /usr/bin/x-www-browser x-www-browser /usr/bin/firefox 100 && \
+    update-alternatives --install /usr/bin/gnome-www-browser gnome-www-browser /usr/bin/firefox 100 && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install noVNC
+RUN git clone https://github.com/novnc/noVNC.git /opt/noVNC && \
+    git clone https://github.com/novnc/websockify /opt/noVNC/utils/websockify && \
+    ln -s /opt/noVNC/vnc.html /opt/noVNC/index.html
+
+# Pre-create cache directory with correct ownership before pip install
+RUN mkdir -p /home/cua/.cache && \
+    chown -R cua:cua /home/cua/.cache
+
+# Copy local computer-server source (path relative to the libs/ build context) and install it
+COPY python/computer-server /tmp/computer-server
+RUN python3.12 -m pip install /tmp/computer-server && \
+    rm -rf /tmp/computer-server
+
+# Install playwright and Firefox dependencies
+RUN python3.12 -m pip install playwright && \
+    python3.12 -m playwright install --with-deps firefox
+
+# Fix any cache files created by pip
+RUN chown -R cua:cua /home/cua/.cache
+
+# Copy startup scripts (paths relative to the libs/ build context)
+COPY xfce/src/supervisor/ /etc/supervisor/conf.d/
+COPY xfce/src/scripts/ /usr/local/bin/
+
+# Make scripts executable
+RUN chmod +x /usr/local/bin/*.sh
+
+# Setup VNC
+RUN chown -R cua:cua /home/cua
+USER cua
+WORKDIR /home/cua
+
+# Create VNC directory (no password needed with SecurityTypes None)
+RUN mkdir -p $HOME/.vnc
+
+# Configure XFCE for first start
+RUN mkdir -p $HOME/.config/xfce4/xfconf/xfce-perchannel-xml $HOME/.config/xfce4 $HOME/.config/autostart
+
+# Copy XFCE config to disable browser launching and welcome screens
+COPY --chown=cua:cua xfce/src/xfce-config/helpers.rc $HOME/.config/xfce4/helpers.rc
+COPY --chown=cua:cua xfce/src/xfce-config/xfce4-session.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-session.xml
+COPY --chown=cua:cua xfce/src/xfce-config/xfce4-power-manager.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-power-manager.xml
+
+# Disable autostart for screensaver, lock screen, and power manager
+RUN echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-tips-autostart.desktop && \
+    echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-screensaver.desktop && \
+    echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/light-locker.desktop && \
+    echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-power-manager.desktop && \
+    chown -R cua:cua $HOME/.config
+
+# Create storage and shared directories, and Firefox cache directory
+RUN mkdir -p $HOME/storage $HOME/shared $HOME/.cache/dconf $HOME/.mozilla/firefox && \
+    chown -R cua:cua $HOME/storage $HOME/shared $HOME/.cache $HOME/.mozilla $HOME/.vnc
+
+USER root
+
+# Expose ports
+EXPOSE $VNC_PORT $NOVNC_PORT $API_PORT
+
+# Start services via supervisor
+CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
diff --git a/libs/xfce/README_BUILD.md b/libs/xfce/README_BUILD.md
deleted file mode 100644
index d6f6a7d4..00000000
--- a/libs/xfce/README_BUILD.md
+++ /dev/null
@@ -1,32 +0,0 @@
-# Building the XFCE Docker Image
-
-## Required Files for Build
-
-The Dockerfile requires these files to be present in the `libs/xfce/` directory:
-
-- `browser.py` - Copy from `libs/python/computer-server/computer_server/browser.py`
-- `main.py` - Copy from `libs/python/computer-server/computer_server/main.py`
-
-These files are copied into the container to include the browser tool functionality
-that isn't yet in the published PyPI package.
-
-## Before Building
-
-```bash
-# Copy the latest browser tool files
-cp libs/python/computer-server/computer_server/browser.py libs/xfce/
-cp libs/python/computer-server/computer_server/main.py libs/xfce/
-```
-
-## Build Command
-
-```bash
-cd libs/xfce
-docker build -t cua-xfce .
-```
-
-## Note
-
-Once the browser tool is included in the published `cua-computer-server` package,
-these temporary file copies can be removed and the Dockerfile can be simplified.
-
diff --git a/libs/xfce/browser.py b/libs/xfce/browser.py
deleted file mode 100644
index 3d0a4c69..00000000
--- a/libs/xfce/browser.py
+++ /dev/null
@@ -1,308 +0,0 @@
-"""
-Browser manager using Playwright for programmatic browser control.
-This allows agents to control a browser that runs visibly on the XFCE desktop.
-"""
-
-import asyncio
-import logging
-import os
-from typing import Any, Dict, Optional
-
-try:
-    from playwright.async_api import async_playwright, Browser, BrowserContext, Page
-except ImportError:
-    async_playwright = None
-    Browser = None
-    BrowserContext = None
-    Page = None
-
-logger = logging.getLogger(__name__)
-
-
-class BrowserManager:
-    """
-    Manages a Playwright browser instance that runs visibly on the XFCE desktop.
-    Uses persistent context to maintain cookies and sessions.
-    """
-
-    def __init__(self):
-        """Initialize the BrowserManager."""
-        self.playwright = None
-        self.browser: Optional[Browser] = None
-        self.context: Optional[BrowserContext] = None
-        self.page: Optional[Page] = None
-        self._initialized = False
-        self._initialization_error: Optional[str] = None
-        self._lock = asyncio.Lock()
-
-    async def _ensure_initialized(self):
-        """Ensure the browser is initialized."""
-        # Check if browser was closed and needs reinitialization
-        if self._initialized:
-            try:
-                # Check if context is still valid by trying to access it
-                if self.context:
-                    # Try to get pages - this will raise if context is closed
-                    _ = self.context.pages
-                    # If we get here, context is still alive
-                    return
-                else:
-                    # Context was closed, need to reinitialize
-                    self._initialized = False
-                    logger.warning("Browser context was closed, will reinitialize...")
-            except Exception as e:
-                # Context is dead, need to reinitialize
-                logger.warning(f"Browser context is dead ({e}), will reinitialize...")
-                self._initialized = False
-                self.context = None
-                self.page = None
-                # Clean up playwright if it exists
-                if self.playwright:
-                    try:
-                        await self.playwright.stop()
-                    except Exception:
-                        pass
-                    self.playwright = None
-
-        async with self._lock:
-            # Double-check after acquiring lock (another thread might have initialized it)
-            if self._initialized:
-                try:
-                    if self.context:
-                        _ = self.context.pages
-                        return
-                except Exception:
-                    self._initialized = False
-                    self.context = None
-                    self.page = None
-                    if self.playwright:
-                        try:
-                            await self.playwright.stop()
-                        except Exception:
-                            pass
-                        self.playwright = None
-
-            if async_playwright is None:
-                raise RuntimeError(
-                    "playwright is not installed. Please install it with: pip install playwright && playwright install --with-deps firefox"
-                )
-
-            try:
-                # Get display from environment or default to :1
-                display = os.environ.get("DISPLAY", ":1")
-                logger.info(f"Initializing browser with DISPLAY={display}")
-
-                # Start playwright
-                self.playwright = await async_playwright().start()
-
-                # Launch Firefox with persistent context (keeps cookies/sessions)
-                # headless=False is CRITICAL so the visual agent can see it
-                user_data_dir = os.path.join(os.path.expanduser("~"), ".playwright-firefox")
-                os.makedirs(user_data_dir, exist_ok=True)
-
-                # launch_persistent_context returns a BrowserContext, not a Browser
-                # Note: Removed --kiosk mode so the desktop remains visible
-                self.context = await self.playwright.firefox.launch_persistent_context(
-                    user_data_dir=user_data_dir,
-                    headless=False,  # CRITICAL: visible for visual agent
-                    viewport={"width": 1024, "height": 768},
-                    # Removed --kiosk to allow desktop visibility
-                )
-
-                # Get the first page or create one
-                pages = self.context.pages
-                if pages:
-                    self.page = pages[0]
-                else:
-                    self.page = await self.context.new_page()
-
-                self._initialized = True
-                logger.info("Browser initialized successfully")
-
-            except Exception as e:
-                logger.error(f"Failed to initialize browser: {e}")
-                import traceback
-                logger.error(traceback.format_exc())
-                # Don't raise - return error in execute_command instead
-                self._initialization_error = str(e)
-                raise
-
-    async def execute_command(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Execute a browser command.
-
-        Args:
-            cmd: Command name (visit_url, click, type, scroll, web_search)
-            params: Command parameters
-
-        Returns:
-            Result dictionary with success status and any data
-        """
-        try:
-            await self._ensure_initialized()
-        except Exception as e:
-            error_msg = getattr(self, '_initialization_error', None) or str(e)
-            logger.error(f"Browser initialization failed: {error_msg}")
-            return {
-                "success": False,
-                "error": f"Browser initialization failed: {error_msg}. "
-                         f"Make sure Playwright and Firefox are installed, and DISPLAY is set correctly."
-            }
-
-        # Ensure browser is still initialized (in case it was manually closed)
-        # This will automatically reinitialize if the browser was closed
-        await self._ensure_initialized()
-        
-        # Check if page is still valid
-        page_valid = False
-        try:
-            if self.page is not None:
-                # Try to access page.url to check if it's still valid
-                _ = self.page.url
-                page_valid = True
-        except Exception as e:
-            logger.warning(f"Page is invalid: {e}, will get a new page...")
-            self.page = None
-        
-        # Get a valid page if we don't have one
-        if not page_valid or self.page is None:
-            try:
-                pages = self.context.pages
-                if pages:
-                    # Find first non-closed page
-                    for p in pages:
-                        try:
-                            if not p.is_closed():
-                                self.page = p
-                                logger.info("Reusing existing open page")
-                                page_valid = True
-                                break
-                        except Exception:
-                            continue
-                
-                # If no valid page found, create a new one
-                if not page_valid:
-                    self.page = await self.context.new_page()
-                    logger.info("Created new page")
-            except Exception as e:
-                logger.error(f"Failed to get new page: {e}, browser may be closed")
-                # Browser was closed - reinitialize it
-                try:
-                    logger.info("Browser was closed manually, reinitializing...")
-                    self._initialized = False
-                    self.context = None
-                    self.page = None
-                    if self.playwright:
-                        try:
-                            await self.playwright.stop()
-                        except Exception:
-                            pass
-                        self.playwright = None
-                    
-                    # Reinitialize
-                    await self._ensure_initialized()
-                    # Get or create a page
-                    pages = self.context.pages
-                    if pages:
-                        self.page = pages[0]
-                    else:
-                        self.page = await self.context.new_page()
-                    logger.info("Browser reopened successfully after manual closure")
-                except Exception as reinit_error:
-                    logger.error(f"Failed to reinitialize browser: {reinit_error}")
-                    import traceback
-                    logger.error(traceback.format_exc())
-                    return {"success": False, "error": f"Browser was closed and cannot be recovered: {reinit_error}"}
-
-        try:
-            if cmd == "visit_url":
-                url = params.get("url")
-                if not url:
-                    return {"success": False, "error": "url parameter is required"}
-                await self.page.goto(url, wait_until="domcontentloaded", timeout=30000)
-                return {"success": True, "url": self.page.url}
-
-            elif cmd == "click":
-                x = params.get("x")
-                y = params.get("y")
-                if x is None or y is None:
-                    return {"success": False, "error": "x and y parameters are required"}
-                await self.page.mouse.click(x, y)
-                return {"success": True}
-
-            elif cmd == "type":
-                text = params.get("text")
-                if text is None:
-                    return {"success": False, "error": "text parameter is required"}
-                await self.page.keyboard.type(text)
-                return {"success": True}
-
-            elif cmd == "scroll":
-                delta_x = params.get("delta_x", 0)
-                delta_y = params.get("delta_y", 0)
-                await self.page.mouse.wheel(delta_x, delta_y)
-                return {"success": True}
-
-            elif cmd == "web_search":
-                query = params.get("query")
-                if not query:
-                    return {"success": False, "error": "query parameter is required"}
-                # Navigate to Google search
-                search_url = f"https://www.google.com/search?q={query}"
-                await self.page.goto(search_url, wait_until="domcontentloaded", timeout=30000)
-                return {"success": True, "url": self.page.url}
-
-            else:
-                return {"success": False, "error": f"Unknown command: {cmd}"}
-
-        except Exception as e:
-            logger.error(f"Error executing command {cmd}: {e}")
-            import traceback
-            logger.error(traceback.format_exc())
-            # If page was closed due to error, try to recover
-            if "closed" in str(e).lower() and self.context:
-                try:
-                    pages = self.context.pages
-                    if pages:
-                        self.page = pages[0]
-                        logger.info("Recovered page after error")
-                    else:
-                        self.page = await self.context.new_page()
-                        logger.info("Created new page after error")
-                except Exception as recover_error:
-                    logger.error(f"Failed to recover page: {recover_error}")
-            return {"success": False, "error": str(e)}
-
-    async def close(self):
-        """Close the browser and cleanup resources."""
-        async with self._lock:
-            try:
-                if self.context:
-                    await self.context.close()
-                    self.context = None
-                if self.browser:
-                    await self.browser.close()
-                    self.browser = None
-
-                if self.playwright:
-                    await self.playwright.stop()
-                    self.playwright = None
-
-                self.page = None
-                self._initialized = False
-                logger.info("Browser closed successfully")
-            except Exception as e:
-                logger.error(f"Error closing browser: {e}")
-
-
-# Global instance
-_browser_manager: Optional[BrowserManager] = None
-
-
-def get_browser_manager() -> BrowserManager:
-    """Get or create the global BrowserManager instance."""
-    global _browser_manager
-    if _browser_manager is None:
-        _browser_manager = BrowserManager()
-    return _browser_manager
-
diff --git a/libs/xfce/main.py b/libs/xfce/main.py
deleted file mode 100644
index 9bad59bf..00000000
--- a/libs/xfce/main.py
+++ /dev/null
@@ -1,820 +0,0 @@
-import asyncio
-import hashlib
-import inspect
-import json
-import logging
-import os
-import platform
-import time
-import traceback
-from contextlib import redirect_stderr, redirect_stdout
-from io import StringIO
-from typing import Any, Dict, List, Literal, Optional, Union, cast
-
-import aiohttp
-import uvicorn
-from fastapi import (
-    FastAPI,
-    Header,
-    HTTPException,
-    Request,
-    WebSocket,
-    WebSocketDisconnect,
-)
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, StreamingResponse
-
-from .handlers.factory import HandlerFactory
-from .browser import get_browser_manager
-
-# Authentication session TTL (in seconds). Override via env var CUA_AUTH_TTL_SECONDS. Default: 60s
-AUTH_SESSION_TTL_SECONDS: int = int(os.environ.get("CUA_AUTH_TTL_SECONDS", "60"))
-
-try:
-    from agent import ComputerAgent
-
-    HAS_AGENT = True
-except ImportError:
-    HAS_AGENT = False
-
-# Set up logging with more detail
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
-
-# Configure WebSocket with larger message size
-WEBSOCKET_MAX_SIZE = 1024 * 1024 * 10  # 10MB limit
-
-# Configure application with WebSocket settings
-app = FastAPI(
-    title="Computer API",
-    description="API for the Computer project",
-    version="0.1.0",
-    websocket_max_size=WEBSOCKET_MAX_SIZE,
-)
-
-# CORS configuration
-origins = ["*"]
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=origins,
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-protocol_version = 1
-try:
-    from importlib.metadata import version
-
-    package_version = version("cua-computer-server")
-except Exception:
-    # Fallback for cases where package is not installed or importlib.metadata is not available
-    try:
-        import pkg_resources
-
-        package_version = pkg_resources.get_distribution("cua-computer-server").version
-    except Exception:
-        package_version = "unknown"
-
-(
-    accessibility_handler,
-    automation_handler,
-    diorama_handler,
-    file_handler,
-    desktop_handler,
-    window_handler,
-) = HandlerFactory.create_handlers()
-handlers = {
-    "version": lambda: {"protocol": protocol_version, "package": package_version},
-    # App-Use commands
-    "diorama_cmd": diorama_handler.diorama_cmd,
-    # Accessibility commands
-    "get_accessibility_tree": accessibility_handler.get_accessibility_tree,
-    "find_element": accessibility_handler.find_element,
-    # Shell commands
-    "run_command": automation_handler.run_command,
-    # File system commands
-    "file_exists": file_handler.file_exists,
-    "directory_exists": file_handler.directory_exists,
-    "list_dir": file_handler.list_dir,
-    "read_text": file_handler.read_text,
-    "write_text": file_handler.write_text,
-    "read_bytes": file_handler.read_bytes,
-    "write_bytes": file_handler.write_bytes,
-    "get_file_size": file_handler.get_file_size,
-    "delete_file": file_handler.delete_file,
-    "create_dir": file_handler.create_dir,
-    "delete_dir": file_handler.delete_dir,
-    # Desktop commands
-    "get_desktop_environment": desktop_handler.get_desktop_environment,
-    "set_wallpaper": desktop_handler.set_wallpaper,
-    # Window management
-    "open": window_handler.open,
-    "launch": window_handler.launch,
-    "get_current_window_id": window_handler.get_current_window_id,
-    "get_application_windows": window_handler.get_application_windows,
-    "get_window_name": window_handler.get_window_name,
-    "get_window_size": window_handler.get_window_size,
-    "get_window_position": window_handler.get_window_position,
-    "set_window_size": window_handler.set_window_size,
-    "set_window_position": window_handler.set_window_position,
-    "maximize_window": window_handler.maximize_window,
-    "minimize_window": window_handler.minimize_window,
-    "activate_window": window_handler.activate_window,
-    "close_window": window_handler.close_window,
-    # Mouse commands
-    "mouse_down": automation_handler.mouse_down,
-    "mouse_up": automation_handler.mouse_up,
-    "left_click": automation_handler.left_click,
-    "right_click": automation_handler.right_click,
-    "double_click": automation_handler.double_click,
-    "move_cursor": automation_handler.move_cursor,
-    "drag_to": automation_handler.drag_to,
-    "drag": automation_handler.drag,
-    # Keyboard commands
-    "key_down": automation_handler.key_down,
-    "key_up": automation_handler.key_up,
-    "type_text": automation_handler.type_text,
-    "press_key": automation_handler.press_key,
-    "hotkey": automation_handler.hotkey,
-    # Scrolling actions
-    "scroll": automation_handler.scroll,
-    "scroll_down": automation_handler.scroll_down,
-    "scroll_up": automation_handler.scroll_up,
-    # Screen actions
-    "screenshot": automation_handler.screenshot,
-    "get_cursor_position": automation_handler.get_cursor_position,
-    "get_screen_size": automation_handler.get_screen_size,
-    # Clipboard actions
-    "copy_to_clipboard": automation_handler.copy_to_clipboard,
-    "set_clipboard": automation_handler.set_clipboard,
-}
-
-
-class AuthenticationManager:
-    def __init__(self):
-        self.sessions: Dict[str, Dict[str, Any]] = {}
-        self.container_name = os.environ.get("CONTAINER_NAME")
-
-    def _hash_credentials(self, container_name: str, api_key: str) -> str:
-        """Create a hash of container name and API key for session identification"""
-        combined = f"{container_name}:{api_key}"
-        return hashlib.sha256(combined.encode()).hexdigest()
-
-    def _is_session_valid(self, session_data: Dict[str, Any]) -> bool:
-        """Check if a session is still valid based on expiration time"""
-        if not session_data.get("valid", False):
-            return False
-
-        expires_at = session_data.get("expires_at", 0)
-        return time.time() < expires_at
-
-    async def auth(self, container_name: str, api_key: str) -> bool:
-        """Authenticate container name and API key, using cached sessions when possible"""
-        # If no CONTAINER_NAME is set, always allow access (local development)
-        if not self.container_name:
-            logger.info(
-                "No CONTAINER_NAME set in environment. Allowing access (local development mode)"
-            )
-            return True
-
-        # Layer 1: VM Identity Verification
-        if container_name != self.container_name:
-            logger.warning(
-                f"VM name mismatch. Expected: {self.container_name}, Got: {container_name}"
-            )
-            return False
-
-        # Create hash for session lookup
-        session_hash = self._hash_credentials(container_name, api_key)
-
-        # Check if we have a valid cached session
-        if session_hash in self.sessions:
-            session_data = self.sessions[session_hash]
-            if self._is_session_valid(session_data):
-                logger.info(f"Using cached authentication for container: {container_name}")
-                return session_data["valid"]
-            else:
-                # Remove expired session
-                del self.sessions[session_hash]
-
-        # No valid cached session, authenticate with API
-        logger.info(f"Authenticating with TryCUA API for container: {container_name}")
-
-        try:
-            async with aiohttp.ClientSession() as session:
-                headers = {"Authorization": f"Bearer {api_key}"}
-
-                async with session.get(
-                    f"https://www.cua.ai/api/vm/auth?container_name={container_name}",
-                    headers=headers,
-                ) as resp:
-                    is_valid = resp.status == 200 and bool((await resp.text()).strip())
-
-                    # Cache the result with configurable expiration
-                    self.sessions[session_hash] = {
-                        "valid": is_valid,
-                        "expires_at": time.time() + AUTH_SESSION_TTL_SECONDS,
-                    }
-
-                    if is_valid:
-                        logger.info(f"Authentication successful for container: {container_name}")
-                    else:
-                        logger.warning(
-                            f"Authentication failed for container: {container_name}. Status: {resp.status}"
-                        )
-
-                    return is_valid
-
-        except aiohttp.ClientError as e:
-            logger.error(f"Failed to validate API key with TryCUA API: {str(e)}")
-            # Cache failed result to avoid repeated requests
-            self.sessions[session_hash] = {
-                "valid": False,
-                "expires_at": time.time() + AUTH_SESSION_TTL_SECONDS,
-            }
-            return False
-        except Exception as e:
-            logger.error(f"Unexpected error during authentication: {str(e)}")
-            # Cache failed result to avoid repeated requests
-            self.sessions[session_hash] = {
-                "valid": False,
-                "expires_at": time.time() + AUTH_SESSION_TTL_SECONDS,
-            }
-            return False
-
-
-class ConnectionManager:
-    def __init__(self):
-        self.active_connections: List[WebSocket] = []
-
-    async def connect(self, websocket: WebSocket):
-        await websocket.accept()
-        self.active_connections.append(websocket)
-
-    def disconnect(self, websocket: WebSocket):
-        self.active_connections.remove(websocket)
-
-
-manager = ConnectionManager()
-auth_manager = AuthenticationManager()
-
-
-@app.get("/status")
-async def status():
-    sys = platform.system().lower()
-    # get os type
-    if "darwin" in sys or sys == "macos" or sys == "mac":
-        os_type = "macos"
-    elif "windows" in sys:
-        os_type = "windows"
-    else:
-        os_type = "linux"
-    # get computer-server features
-    features = []
-    if HAS_AGENT:
-        features.append("agent")
-    return {"status": "ok", "os_type": os_type, "features": features}
-
-
-@app.websocket("/ws", name="websocket_endpoint")
-async def websocket_endpoint(websocket: WebSocket):
-    global handlers
-
-    # WebSocket message size is configured at the app or endpoint level, not on the instance
-    await manager.connect(websocket)
-
-    # Check if CONTAINER_NAME is set (indicating cloud provider)
-    server_container_name = os.environ.get("CONTAINER_NAME")
-
-    # If cloud provider, perform authentication handshake
-    if server_container_name:
-        try:
-            logger.info(
-                f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Waiting for authentication..."
-            )
-
-            # Wait for authentication message
-            auth_data = await websocket.receive_json()
-
-            # Validate auth message format
-            if auth_data.get("command") != "authenticate":
-                await websocket.send_json(
-                    {"success": False, "error": "First message must be authentication"}
-                )
-                await websocket.close()
-                manager.disconnect(websocket)
-                return
-
-            # Extract credentials
-            client_api_key = auth_data.get("params", {}).get("api_key")
-            client_container_name = auth_data.get("params", {}).get("container_name")
-
-            # Validate credentials using AuthenticationManager
-            if not client_api_key:
-                await websocket.send_json({"success": False, "error": "API key required"})
-                await websocket.close()
-                manager.disconnect(websocket)
-                return
-
-            if not client_container_name:
-                await websocket.send_json({"success": False, "error": "Container name required"})
-                await websocket.close()
-                manager.disconnect(websocket)
-                return
-
-            # Use AuthenticationManager for validation
-            is_authenticated = await auth_manager.auth(client_container_name, client_api_key)
-            if not is_authenticated:
-                await websocket.send_json({"success": False, "error": "Authentication failed"})
-                await websocket.close()
-                manager.disconnect(websocket)
-                return
-
-            logger.info(f"Authentication successful for VM: {client_container_name}")
-            await websocket.send_json({"success": True, "message": "Authentication successful"})
-
-        except Exception as e:
-            logger.error(f"Error during authentication handshake: {str(e)}")
-            await websocket.send_json({"success": False, "error": "Authentication failed"})
-            await websocket.close()
-            manager.disconnect(websocket)
-            return
-
-    try:
-        while True:
-            try:
-                data = await websocket.receive_json()
-                command = data.get("command")
-                params = data.get("params", {})
-
-                if command not in handlers:
-                    await websocket.send_json(
-                        {"success": False, "error": f"Unknown command: {command}"}
-                    )
-                    continue
-
-                try:
-                    # Filter params to only include those accepted by the handler function
-                    handler_func = handlers[command]
-                    sig = inspect.signature(handler_func)
-                    filtered_params = {k: v for k, v in params.items() if k in sig.parameters}
-
-                    # Handle both sync and async functions
-                    if asyncio.iscoroutinefunction(handler_func):
-                        result = await handler_func(**filtered_params)
-                    else:
-                        # Run sync functions in thread pool to avoid blocking event loop
-                        result = await asyncio.to_thread(handler_func, **filtered_params)
-                    await websocket.send_json({"success": True, **result})
-                except Exception as cmd_error:
-                    logger.error(f"Error executing command {command}: {str(cmd_error)}")
-                    logger.error(traceback.format_exc())
-                    await websocket.send_json({"success": False, "error": str(cmd_error)})
-
-            except WebSocketDisconnect:
-                raise
-            except json.JSONDecodeError as json_err:
-                logger.error(f"JSON decode error: {str(json_err)}")
-                await websocket.send_json(
-                    {"success": False, "error": f"Invalid JSON: {str(json_err)}"}
-                )
-            except Exception as loop_error:
-                logger.error(f"Error in message loop: {str(loop_error)}")
-                logger.error(traceback.format_exc())
-                await websocket.send_json({"success": False, "error": str(loop_error)})
-
-    except WebSocketDisconnect:
-        logger.info("Client disconnected")
-        manager.disconnect(websocket)
-    except Exception as e:
-        logger.error(f"Fatal error in websocket connection: {str(e)}")
-        logger.error(traceback.format_exc())
-        try:
-            await websocket.close()
-        except:
-            pass
-        manager.disconnect(websocket)
-
-
-@app.post("/cmd")
-async def cmd_endpoint(
-    request: Request,
-    container_name: Optional[str] = Header(None, alias="X-Container-Name"),
-    api_key: Optional[str] = Header(None, alias="X-API-Key"),
-):
-    """
-    Backup endpoint for when WebSocket connections fail.
-    Accepts commands via HTTP POST with streaming response.
-
-    Headers:
-    - X-Container-Name: Container name for cloud authentication
-    - X-API-Key: API key for cloud authentication
-
-    Body:
-    {
-        "command": "command_name",
-        "params": {...}
-    }
-    """
-    global handlers
-
-    # Parse request body
-    try:
-        body = await request.json()
-        command = body.get("command")
-        params = body.get("params", {})
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")
-
-    if not command:
-        raise HTTPException(status_code=400, detail="Command is required")
-
-    # Check if CONTAINER_NAME is set (indicating cloud provider)
-    server_container_name = os.environ.get("CONTAINER_NAME")
-
-    # If cloud provider, perform authentication
-    if server_container_name:
-        logger.info(
-            f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication..."
-        )
-
-        # Validate required headers
-        if not container_name:
-            raise HTTPException(status_code=401, detail="Container name required")
-
-        if not api_key:
-            raise HTTPException(status_code=401, detail="API key required")
-
-        # Validate with AuthenticationManager
-        is_authenticated = await auth_manager.auth(container_name, api_key)
-        if not is_authenticated:
-            raise HTTPException(status_code=401, detail="Authentication failed")
-
-    if command not in handlers:
-        raise HTTPException(status_code=400, detail=f"Unknown command: {command}")
-
-    async def generate_response():
-        """Generate streaming response for the command execution"""
-        try:
-            # Filter params to only include those accepted by the handler function
-            handler_func = handlers[command]
-            sig = inspect.signature(handler_func)
-            filtered_params = {k: v for k, v in params.items() if k in sig.parameters}
-
-            # Handle both sync and async functions
-            if asyncio.iscoroutinefunction(handler_func):
-                result = await handler_func(**filtered_params)
-            else:
-                # Run sync functions in thread pool to avoid blocking event loop
-                result = await asyncio.to_thread(handler_func, **filtered_params)
-
-            # Stream the successful result
-            response_data = {"success": True, **result}
-            yield f"data: {json.dumps(response_data)}\n\n"
-
-        except Exception as cmd_error:
-            logger.error(f"Error executing command {command}: {str(cmd_error)}")
-            logger.error(traceback.format_exc())
-
-            # Stream the error result
-            error_data = {"success": False, "error": str(cmd_error)}
-            yield f"data: {json.dumps(error_data)}\n\n"
-
-    return StreamingResponse(
-        generate_response(),
-        media_type="text/plain",
-        headers={
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-        },
-    )
-
-
-@app.post("/responses")
-async def agent_response_endpoint(
-    request: Request,
-    api_key: Optional[str] = Header(None, alias="X-API-Key"),
-):
-    """
-    Minimal proxy to run ComputerAgent for up to 2 turns.
-
-    Security:
-    - If CONTAINER_NAME is set on the server, require X-API-Key
-      and validate using AuthenticationManager unless CUA_ENABLE_PUBLIC_PROXY is true.
-
-    Body JSON:
-    {
-      "model": "...",                 # required
-      "input": "... or messages[]",   # required
-      "agent_kwargs": { ... },         # optional, passed directly to ComputerAgent
-      "env": { ... }                   # optional env overrides for agent
-    }
-    """
-    if not HAS_AGENT:
-        raise HTTPException(status_code=501, detail="ComputerAgent not available")
-
-    # Authenticate via AuthenticationManager if running in cloud (CONTAINER_NAME set)
-    container_name = os.environ.get("CONTAINER_NAME")
-    if container_name:
-        is_public = os.environ.get("CUA_ENABLE_PUBLIC_PROXY", "").lower().strip() in [
-            "1",
-            "true",
-            "yes",
-            "y",
-            "on",
-        ]
-        if not is_public:
-            if not api_key:
-                raise HTTPException(status_code=401, detail="Missing AGENT PROXY auth headers")
-            ok = await auth_manager.auth(container_name, api_key)
-            if not ok:
-                raise HTTPException(status_code=401, detail="Unauthorized")
-
-    # Parse request body
-    try:
-        body = await request.json()
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")
-
-    model = body.get("model")
-    input_data = body.get("input")
-    if not model or input_data is None:
-        raise HTTPException(status_code=400, detail="'model' and 'input' are required")
-
-    agent_kwargs: Dict[str, Any] = body.get("agent_kwargs") or {}
-    env_overrides: Dict[str, str] = body.get("env") or {}
-
-    # Simple env override context
-    class _EnvOverride:
-        def __init__(self, overrides: Dict[str, str]):
-            self.overrides = overrides
-            self._original: Dict[str, Optional[str]] = {}
-
-        def __enter__(self):
-            for k, v in (self.overrides or {}).items():
-                self._original[k] = os.environ.get(k)
-                os.environ[k] = str(v)
-
-        def __exit__(self, exc_type, exc, tb):
-            for k, old in self._original.items():
-                if old is None:
-                    os.environ.pop(k, None)
-                else:
-                    os.environ[k] = old
-
-    # Convert input to messages
-    def _to_messages(data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
-        if isinstance(data, str):
-            return [{"role": "user", "content": data}]
-        if isinstance(data, list):
-            return data
-
-    messages = _to_messages(input_data)
-
-    # Define a direct computer tool that implements the AsyncComputerHandler protocol
-    # and delegates to our existing automation/file/accessibility handlers.
-    from agent.computers import AsyncComputerHandler  # runtime-checkable Protocol
-
-    class DirectComputer(AsyncComputerHandler):
-        def __init__(self):
-            # use module-scope handler singletons created by HandlerFactory
-            self._auto = automation_handler
-            self._file = file_handler
-            self._access = accessibility_handler
-
-        async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
-            sys = platform.system().lower()
-            if "darwin" in sys or sys in ("macos", "mac"):
-                return "mac"
-            if "windows" in sys:
-                return "windows"
-            return "linux"
-
-        async def get_dimensions(self) -> tuple[int, int]:
-            size = await self._auto.get_screen_size()
-            return size["width"], size["height"]
-
-        async def screenshot(self) -> str:
-            img_b64 = await self._auto.screenshot()
-            return img_b64["image_data"]
-
-        async def click(self, x: int, y: int, button: str = "left") -> None:
-            if button == "left":
-                await self._auto.left_click(x, y)
-            elif button == "right":
-                await self._auto.right_click(x, y)
-            else:
-                await self._auto.left_click(x, y)
-
-        async def double_click(self, x: int, y: int) -> None:
-            await self._auto.double_click(x, y)
-
-        async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
-            await self._auto.move_cursor(x, y)
-            await self._auto.scroll(scroll_x, scroll_y)
-
-        async def type(self, text: str) -> None:
-            await self._auto.type_text(text)
-
-        async def wait(self, ms: int = 1000) -> None:
-            await asyncio.sleep(ms / 1000.0)
-
-        async def move(self, x: int, y: int) -> None:
-            await self._auto.move_cursor(x, y)
-
-        async def keypress(self, keys: Union[List[str], str]) -> None:
-            if isinstance(keys, str):
-                parts = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys]
-            else:
-                parts = keys
-            if len(parts) == 1:
-                await self._auto.press_key(parts[0])
-            else:
-                await self._auto.hotkey(parts)
-
-        async def drag(self, path: List[Dict[str, int]]) -> None:
-            if not path:
-                return
-            start = path[0]
-            await self._auto.mouse_down(start["x"], start["y"])
-            for pt in path[1:]:
-                await self._auto.move_cursor(pt["x"], pt["y"])
-            end = path[-1]
-            await self._auto.mouse_up(end["x"], end["y"])
-
-        async def get_current_url(self) -> str:
-            # Not available in this server context
-            return ""
-
-        async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
-            await self._auto.mouse_down(x, y, button="left")
-
-        async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
-            await self._auto.mouse_up(x, y, button="left")
-
-    # # Inline image URLs to base64
-    # import base64, mimetypes, requests
-    # # Use a browser-like User-Agent to avoid 403s from some CDNs (e.g., Wikimedia)
-    # HEADERS = {
-    #     "User-Agent": (
-    #         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-    #         "AppleWebKit/537.36 (KHTML, like Gecko) "
-    #         "Chrome/124.0.0.0 Safari/537.36"
-    #     )
-    # }
-    # def _to_data_url(content_bytes: bytes, url: str, resp: requests.Response) -> str:
-    #     ctype = resp.headers.get("Content-Type") or mimetypes.guess_type(url)[0] or "application/octet-stream"
-    #     b64 = base64.b64encode(content_bytes).decode("utf-8")
-    #     return f"data:{ctype};base64,{b64}"
-    # def inline_image_urls(messages):
-    #     # messages: List[{"role": "...","content":[...]}]
-    #     out = []
-    #     for m in messages:
-    #         if not isinstance(m.get("content"), list):
-    #             out.append(m)
-    #             continue
-    #         new_content = []
-    #         for part in (m.get("content") or []):
-    #             if part.get("type") == "input_image" and (url := part.get("image_url")):
-    #                 resp = requests.get(url, headers=HEADERS, timeout=30)
-    #                 resp.raise_for_status()
-    #                 new_content.append({
-    #                     "type": "input_image",
-    #                     "image_url": _to_data_url(resp.content, url, resp)
-    #                 })
-    #             else:
-    #                 new_content.append(part)
-    #         out.append({**m, "content": new_content})
-    #     return out
-    # messages = inline_image_urls(messages)
-
-    error = None
-
-    with _EnvOverride(env_overrides):
-        # Prepare tools: if caller did not pass tools, inject our DirectComputer
-        tools = agent_kwargs.get("tools")
-        if not tools:
-            tools = [DirectComputer()]
-            agent_kwargs = {**agent_kwargs, "tools": tools}
-        # Instantiate agent with our tools
-        agent = ComputerAgent(model=model, **agent_kwargs)  # type: ignore[arg-type]
-
-        total_output: List[Any] = []
-        total_usage: Dict[str, Any] = {}
-
-        pending_computer_call_ids = set()
-        try:
-            async for result in agent.run(messages):
-                total_output += result["output"]
-                # Try to collect usage if present
-                if (
-                    isinstance(result, dict)
-                    and "usage" in result
-                    and isinstance(result["usage"], dict)
-                ):
-                    # Merge usage counters
-                    for k, v in result["usage"].items():
-                        if isinstance(v, (int, float)):
-                            total_usage[k] = total_usage.get(k, 0) + v
-                        else:
-                            total_usage[k] = v
-                for msg in result.get("output", []):
-                    if msg.get("type") == "computer_call":
-                        pending_computer_call_ids.add(msg["call_id"])
-                    elif msg.get("type") == "computer_call_output":
-                        pending_computer_call_ids.discard(msg["call_id"])
-                # exit if no pending computer calls
-                if not pending_computer_call_ids:
-                    break
-        except Exception as e:
-            logger.error(f"Error running agent: {str(e)}")
-            logger.error(traceback.format_exc())
-            error = str(e)
-
-    # Build response payload
-    payload = {
-        "model": model,
-        "error": error,
-        "output": total_output,
-        "usage": total_usage,
-        "status": "completed" if not error else "failed",
-    }
-
-    # CORS: allow any origin
-    headers = {
-        "Cache-Control": "no-cache",
-        "Connection": "keep-alive",
-    }
-
-    return JSONResponse(content=payload, headers=headers)
-
-
-@app.post("/playwright_exec")
-async def playwright_exec_endpoint(
-    request: Request,
-    container_name: Optional[str] = Header(None, alias="X-Container-Name"),
-    api_key: Optional[str] = Header(None, alias="X-API-Key"),
-):
-    """
-    Execute Playwright browser commands.
-
-    Headers:
-    - X-Container-Name: Container name for cloud authentication
-    - X-API-Key: API key for cloud authentication
-
-    Body:
-    {
-        "command": "visit_url|click|type|scroll|web_search",
-        "params": {...}
-    }
-    """
-    # Parse request body
-    try:
-        body = await request.json()
-        command = body.get("command")
-        params = body.get("params", {})
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")
-
-    if not command:
-        raise HTTPException(status_code=400, detail="Command is required")
-
-    # Check if CONTAINER_NAME is set (indicating cloud provider)
-    server_container_name = os.environ.get("CONTAINER_NAME")
-
-    # If cloud provider, perform authentication
-    if server_container_name:
-        logger.info(
-            f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication..."
-        )
-
-        # Validate required headers
-        if not container_name:
-            raise HTTPException(status_code=401, detail="Container name required")
-
-        if not api_key:
-            raise HTTPException(status_code=401, detail="API key required")
-
-        # Validate with AuthenticationManager
-        is_authenticated = await auth_manager.auth(container_name, api_key)
-        if not is_authenticated:
-            raise HTTPException(status_code=401, detail="Authentication failed")
-
-    # Get browser manager and execute command
-    try:
-        browser_manager = get_browser_manager()
-        result = await browser_manager.execute_command(command, params)
-        
-        if result.get("success"):
-            return JSONResponse(content=result)
-        else:
-            raise HTTPException(status_code=400, detail=result.get("error", "Command failed"))
-    except Exception as e:
-        logger.error(f"Error executing playwright command: {str(e)}")
-        logger.error(traceback.format_exc())
-        raise HTTPException(status_code=500, detail=str(e))
-
-
-if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=8000)

From 8f297eac3ce0b113f4a8698f37019265e2cd98b8 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 3 Dec 2025 09:00:20 -0800
Subject: [PATCH 4/8] Migrate browser interface into computer SDK

---
 examples/browser_tool_example.py          |  3 ++-
 libs/python/computer/computer/computer.py | 29 +++++++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/examples/browser_tool_example.py b/examples/browser_tool_example.py
index 11a8dead..70055ff9 100644
--- a/examples/browser_tool_example.py
+++ b/examples/browser_tool_example.py
@@ -39,7 +39,8 @@ async def test_browser_tool():
     # Initialize the computer interface
     # For local testing, use provider_type="docker"
     # For provider_type="cloud", provide name and api_key
-    computer = Computer(provider_type="docker")
+    computer = Computer(provider_type="docker", os_type="linux", image="cua-xfce:dev")
+    await computer.run()
 
     # Initialize the browser tool with the computer interface
     browser = BrowserTool(interface=computer)
diff --git a/libs/python/computer/computer/computer.py b/libs/python/computer/computer/computer.py
index 0b1cd509..710b08a2 100644
--- a/libs/python/computer/computer/computer.py
+++ b/libs/python/computer/computer/computer.py
@@ -953,6 +953,35 @@ class Computer:
         """
         return await self.interface.to_screenshot_coordinates(x, y)
 
+    async def playwright_exec(self, command: str, params: Optional[Dict] = None) -> Dict[str, Any]:
+        """
+        Execute a Playwright browser command by delegating to ``self.interface.playwright_exec``.
+
+        Args:
+            command: The browser command to execute (visit_url, click, type, scroll, web_search)
+            params: Command parameters (see Examples); forwarded unchanged, ``None`` when omitted
+
+        Returns:
+            Dict containing the command result, exactly as returned by the interface call
+
+        Examples:
+            # Navigate to a URL
+            await computer.playwright_exec("visit_url", {"url": "https://example.com"})
+
+            # Click at coordinates
+            await computer.playwright_exec("click", {"x": 100, "y": 200})
+
+            # Type text
+            await computer.playwright_exec("type", {"text": "Hello, world!"})
+
+            # Scroll
+            await computer.playwright_exec("scroll", {"delta_x": 0, "delta_y": -100})
+
+            # Web search
+            await computer.playwright_exec("web_search", {"query": "computer use agent"})
+        """
+        return await self.interface.playwright_exec(command, params)
+
     # Add virtual environment management functions to computer interface
     async def venv_install(self, venv_name: str, requirements: list[str]):
         """Install packages in a virtual environment.

From 3f0ed2c203090f6c5613b725073a57a46eb93a9a Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 3 Dec 2025 09:01:07 -0800
Subject: [PATCH 5/8] Fix Dockerfile.dev, make browser manager robust to the
 browser closing

---
 .../computer_server/browser.py                | 260 ++++++++++--------
 libs/xfce/Development.md                      |   8 +
 libs/xfce/Dockerfile.dev                      |  12 +-
 3 files changed, 153 insertions(+), 127 deletions(-)

diff --git a/libs/python/computer-server/computer_server/browser.py b/libs/python/computer-server/computer_server/browser.py
index 3d0a4c69..bd0f3f7e 100644
--- a/libs/python/computer-server/computer_server/browser.py
+++ b/libs/python/computer-server/computer_server/browser.py
@@ -9,7 +9,7 @@ import os
 from typing import Any, Dict, Optional
 
 try:
-    from playwright.async_api import async_playwright, Browser, BrowserContext, Page
+    from playwright.async_api import Browser, BrowserContext, Page, async_playwright
 except ImportError:
     async_playwright = None
     Browser = None
@@ -122,14 +122,57 @@ class BrowserManager:
             except Exception as e:
                 logger.error(f"Failed to initialize browser: {e}")
                 import traceback
+
                 logger.error(traceback.format_exc())
                 # Don't raise - return error in execute_command instead
                 self._initialization_error = str(e)
                 raise
 
+    async def _execute_command_impl(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]:
+        """Run one browser command on ``self.page`` and return its result dict.
+
+        Bad params / unknown commands return ``success: False``; page errors propagate.
+        """
+        if cmd == "visit_url":
+            url = params.get("url")
+            if not url:
+                return {"success": False, "error": "url parameter is required"}
+            await self.page.goto(url, wait_until="domcontentloaded", timeout=30000)
+            return {"success": True, "url": self.page.url}
+
+        elif cmd == "click":
+            x, y = params.get("x"), params.get("y")
+            if x is None or y is None:
+                return {"success": False, "error": "x and y parameters are required"}
+            await self.page.mouse.click(x, y)
+            return {"success": True}
+
+        elif cmd == "type":
+            text = params.get("text")
+            if text is None:
+                return {"success": False, "error": "text parameter is required"}
+            await self.page.keyboard.type(text)
+            return {"success": True}
+
+        elif cmd == "scroll":
+            await self.page.mouse.wheel(params.get("delta_x", 0), params.get("delta_y", 0))
+            return {"success": True}
+
+        elif cmd == "web_search":
+            query = params.get("query")
+            if not query:
+                return {"success": False, "error": "query parameter is required"}
+            from urllib.parse import quote_plus  # encode '&', '#', spaces so they stay inside q=
+            search_url = f"https://www.google.com/search?q={quote_plus(str(query))}"
+            await self.page.goto(search_url, wait_until="domcontentloaded", timeout=30000)
+            return {"success": True, "url": self.page.url}
+
+        else:
+            return {"success": False, "error": f"Unknown command: {cmd}"}
+
     async def execute_command(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]:
         """
-        Execute a browser command.
+        Execute a browser command with automatic recovery.
 
         Args:
             cmd: Command name (visit_url, click, type, scroll, web_search)
@@ -138,57 +181,54 @@ class BrowserManager:
         Returns:
             Result dictionary with success status and any data
         """
-        try:
-            await self._ensure_initialized()
-        except Exception as e:
-            error_msg = getattr(self, '_initialization_error', None) or str(e)
-            logger.error(f"Browser initialization failed: {error_msg}")
-            return {
-                "success": False,
-                "error": f"Browser initialization failed: {error_msg}. "
-                         f"Make sure Playwright and Firefox are installed, and DISPLAY is set correctly."
-            }
-
-        # Ensure browser is still initialized (in case it was manually closed)
-        # This will automatically reinitialize if the browser was closed
-        await self._ensure_initialized()
-        
-        # Check if page is still valid
-        page_valid = False
-        try:
-            if self.page is not None:
-                # Try to access page.url to check if it's still valid
-                _ = self.page.url
-                page_valid = True
-        except Exception as e:
-            logger.warning(f"Page is invalid: {e}, will get a new page...")
-            self.page = None
-        
-        # Get a valid page if we don't have one
-        if not page_valid or self.page is None:
+        max_retries = 2
+        for attempt in range(max_retries):
             try:
-                pages = self.context.pages
-                if pages:
-                    # Find first non-closed page
-                    for p in pages:
-                        try:
-                            if not p.is_closed():
-                                self.page = p
-                                logger.info("Reusing existing open page")
-                                page_valid = True
-                                break
-                        except Exception:
-                            continue
-                
-                # If no valid page found, create a new one
-                if not page_valid:
-                    self.page = await self.context.new_page()
-                    logger.info("Created new page")
+                await self._ensure_initialized()
             except Exception as e:
-                logger.error(f"Failed to get new page: {e}, browser may be closed")
-                # Browser was closed - reinitialize it
+                error_msg = getattr(self, "_initialization_error", None) or str(e)
+                logger.error(f"Browser initialization failed: {error_msg}")
+                return {
+                    "success": False,
+                    "error": f"Browser initialization failed: {error_msg}. "
+                    f"Make sure Playwright and Firefox are installed, and DISPLAY is set correctly.",
+                }
+
+            # Check if page is still valid and get a new one if needed
+            page_valid = False
+            try:
+                if self.page is not None and not self.page.is_closed():
+                    # Try to access page.url to check if it's still valid
+                    _ = self.page.url
+                    page_valid = True
+            except Exception as e:
+                logger.warning(f"Page is invalid: {e}, will get a new page...")
+                self.page = None
+
+            # Get a valid page if we don't have one
+            if not page_valid or self.page is None:
                 try:
-                    logger.info("Browser was closed manually, reinitializing...")
+                    if self.context:
+                        pages = self.context.pages
+                        if pages:
+                            # Find first non-closed page
+                            for p in pages:
+                                try:
+                                    if not p.is_closed():
+                                        self.page = p
+                                        logger.info("Reusing existing open page")
+                                        page_valid = True
+                                        break
+                                except Exception:
+                                    continue
+
+                        # If no valid page found, create a new one
+                        if not page_valid:
+                            self.page = await self.context.new_page()
+                            logger.info("Created new page")
+                except Exception as e:
+                    logger.error(f"Failed to get new page: {e}, browser may be closed")
+                    # Browser was closed - force reinitialization
                     self._initialized = False
                     self.context = None
                     self.page = None
@@ -198,80 +238,59 @@ class BrowserManager:
                         except Exception:
                             pass
                         self.playwright = None
-                    
-                    # Reinitialize
-                    await self._ensure_initialized()
-                    # Get or create a page
-                    pages = self.context.pages
-                    if pages:
-                        self.page = pages[0]
+
+                    # If this isn't the last attempt, continue to retry
+                    if attempt < max_retries - 1:
+                        logger.info("Browser was closed, retrying with fresh initialization...")
+                        continue
                     else:
-                        self.page = await self.context.new_page()
-                    logger.info("Browser reopened successfully after manual closure")
-                except Exception as reinit_error:
-                    logger.error(f"Failed to reinitialize browser: {reinit_error}")
+                        return {
+                            "success": False,
+                            "error": f"Browser was closed and cannot be recovered: {e}",
+                        }
+
+            # Try to execute the command
+            try:
+                return await self._execute_command_impl(cmd, params)
+            except Exception as e:
+                error_str = str(e)
+                logger.error(f"Error executing command {cmd}: {e}")
+
+                # Check if this is a "browser/page/context closed" error
+                if any(keyword in error_str.lower() for keyword in ["closed", "target", "context"]):
+                    logger.warning(
+                        f"Browser/page was closed during command execution (attempt {attempt + 1}/{max_retries})"
+                    )
+
+                    # Force reinitialization
+                    self._initialized = False
+                    self.context = None
+                    self.page = None
+                    if self.playwright:
+                        try:
+                            await self.playwright.stop()
+                        except Exception:
+                            pass
+                        self.playwright = None
+
+                    # If this isn't the last attempt, retry
+                    if attempt < max_retries - 1:
+                        logger.info("Retrying command after browser reinitialization...")
+                        continue
+                    else:
+                        return {
+                            "success": False,
+                            "error": f"Command failed after {max_retries} attempts: {error_str}",
+                        }
+                else:
+                    # Not a browser closed error, return immediately
                     import traceback
+
                     logger.error(traceback.format_exc())
-                    return {"success": False, "error": f"Browser was closed and cannot be recovered: {reinit_error}"}
+                    return {"success": False, "error": error_str}
 
-        try:
-            if cmd == "visit_url":
-                url = params.get("url")
-                if not url:
-                    return {"success": False, "error": "url parameter is required"}
-                await self.page.goto(url, wait_until="domcontentloaded", timeout=30000)
-                return {"success": True, "url": self.page.url}
-
-            elif cmd == "click":
-                x = params.get("x")
-                y = params.get("y")
-                if x is None or y is None:
-                    return {"success": False, "error": "x and y parameters are required"}
-                await self.page.mouse.click(x, y)
-                return {"success": True}
-
-            elif cmd == "type":
-                text = params.get("text")
-                if text is None:
-                    return {"success": False, "error": "text parameter is required"}
-                await self.page.keyboard.type(text)
-                return {"success": True}
-
-            elif cmd == "scroll":
-                delta_x = params.get("delta_x", 0)
-                delta_y = params.get("delta_y", 0)
-                await self.page.mouse.wheel(delta_x, delta_y)
-                return {"success": True}
-
-            elif cmd == "web_search":
-                query = params.get("query")
-                if not query:
-                    return {"success": False, "error": "query parameter is required"}
-                # Navigate to Google search
-                search_url = f"https://www.google.com/search?q={query}"
-                await self.page.goto(search_url, wait_until="domcontentloaded", timeout=30000)
-                return {"success": True, "url": self.page.url}
-
-            else:
-                return {"success": False, "error": f"Unknown command: {cmd}"}
-
-        except Exception as e:
-            logger.error(f"Error executing command {cmd}: {e}")
-            import traceback
-            logger.error(traceback.format_exc())
-            # If page was closed due to error, try to recover
-            if "closed" in str(e).lower() and self.context:
-                try:
-                    pages = self.context.pages
-                    if pages:
-                        self.page = pages[0]
-                        logger.info("Recovered page after error")
-                    else:
-                        self.page = await self.context.new_page()
-                        logger.info("Created new page after error")
-                except Exception as recover_error:
-                    logger.error(f"Failed to recover page: {recover_error}")
-            return {"success": False, "error": str(e)}
+        # Should never reach here, but just in case
+        return {"success": False, "error": "Command failed after all retries"}
 
     async def close(self):
         """Close the browser and cleanup resources."""
@@ -305,4 +324,3 @@ def get_browser_manager() -> BrowserManager:
     if _browser_manager is None:
         _browser_manager = BrowserManager()
     return _browser_manager
-
diff --git a/libs/xfce/Development.md b/libs/xfce/Development.md
index 4bc12c81..b67199e8 100644
--- a/libs/xfce/Development.md
+++ b/libs/xfce/Development.md
@@ -11,6 +11,14 @@ docker build -f Dockerfile.dev -t cua-xfce:dev ..
 
 The build context is set to the parent directory to allow copying the local `computer-server` source.
 
+## Tagging the Image
+
+To tag the dev image as latest:
+
+```bash
+docker tag cua-xfce:dev cua-xfce:latest
+```
+
 ## Running the Development Container
 
 ```bash
diff --git a/libs/xfce/Dockerfile.dev b/libs/xfce/Dockerfile.dev
index 12036a0c..c24efaf9 100644
--- a/libs/xfce/Dockerfile.dev
+++ b/libs/xfce/Dockerfile.dev
@@ -105,7 +105,7 @@ RUN mkdir -p /home/cua/.cache && \
     chown -R cua:cua /home/cua/.cache
 
 # Copy local computer-server source and install it
-COPY ../python/computer-server /tmp/computer-server
+COPY python/computer-server /tmp/computer-server
 RUN python3.12 -m pip install /tmp/computer-server && \
     rm -rf /tmp/computer-server
 
@@ -117,8 +117,8 @@ RUN python3.12 -m pip install playwright && \
 RUN chown -R cua:cua /home/cua/.cache
 
 # Copy startup scripts
-COPY src/supervisor/ /etc/supervisor/conf.d/
-COPY src/scripts/ /usr/local/bin/
+COPY xfce/src/supervisor/ /etc/supervisor/conf.d/
+COPY xfce/src/scripts/ /usr/local/bin/
 
 # Make scripts executable
 RUN chmod +x /usr/local/bin/*.sh
@@ -135,9 +135,9 @@ RUN mkdir -p $HOME/.vnc
 RUN mkdir -p $HOME/.config/xfce4/xfconf/xfce-perchannel-xml $HOME/.config/xfce4 $HOME/.config/autostart
 
 # Copy XFCE config to disable browser launching and welcome screens
-COPY --chown=cua:cua src/xfce-config/helpers.rc $HOME/.config/xfce4/helpers.rc
-COPY --chown=cua:cua src/xfce-config/xfce4-session.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-session.xml
-COPY --chown=cua:cua src/xfce-config/xfce4-power-manager.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-power-manager.xml
+COPY --chown=cua:cua xfce/src/xfce-config/helpers.rc $HOME/.config/xfce4/helpers.rc
+COPY --chown=cua:cua xfce/src/xfce-config/xfce4-session.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-session.xml
+COPY --chown=cua:cua xfce/src/xfce-config/xfce4-power-manager.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-power-manager.xml
 
 # Disable autostart for screensaver, lock screen, and power manager
 RUN echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-tips-autostart.desktop && \

From 907fff475e34bf453887655335d42dbad0b191da Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 3 Dec 2025 09:28:03 -0800
Subject: [PATCH 6/8] Patch anti-bot-detection/stealth-mode into playwright
 browser

---
 examples/browser_tool_example.py              | 24 +++++++++++++
 libs/python/agent/agent/tools/browser_tool.py | 19 ++++++++++
 .../computer_server/browser.py                | 35 +++++++++++++++++++
 libs/xfce/Dockerfile                          |  2 +-
 libs/xfce/Dockerfile.dev                      |  2 +-
 5 files changed, 80 insertions(+), 2 deletions(-)

diff --git a/examples/browser_tool_example.py b/examples/browser_tool_example.py
index 70055ff9..e5767f27 100644
--- a/examples/browser_tool_example.py
+++ b/examples/browser_tool_example.py
@@ -48,6 +48,14 @@ async def test_browser_tool():
     logger.info("Testing Browser Tool...")
 
     try:
+        # Test 0: Take a screenshot (pre-init)
+        logger.info("Test 0: Taking a screenshot...")
+        screenshot_bytes = await browser.screenshot()
+        screenshot_path = Path(__file__).parent / "browser_screenshot_init.png"
+        with open(screenshot_path, "wb") as f:
+            f.write(screenshot_bytes)
+        logger.info(f"Screenshot captured: {len(screenshot_bytes)} bytes")
+
         # Test 1: Visit a URL
         logger.info("Test 1: Visiting a URL...")
         result = await browser.visit_url("https://www.trycua.com")
@@ -56,6 +64,22 @@ async def test_browser_tool():
         # Wait a bit for the page to load
         await asyncio.sleep(2)
 
+        # Test 2: Take a screenshot
+        logger.info("Test 2: Taking a screenshot...")
+        screenshot_bytes = await browser.screenshot()
+        screenshot_path = Path(__file__).parent / "browser_screenshot.png"
+        with open(screenshot_path, "wb") as f:
+            f.write(screenshot_bytes)
+        logger.info(f"Screenshot captured: {len(screenshot_bytes)} bytes")
+
+        # Wait a bit
+        await asyncio.sleep(1)
+
+        # Test 3: Visit bot detector
+        logger.info("Test 3: Visiting bot detector...")
+        result = await browser.visit_url("https://bot-detector.rebrowser.net/")
+        logger.info(f"Visit URL result: {result}")
+
         # Test 2: Web search
         logger.info("Test 2: Performing a web search...")
         result = await browser.web_search("Python programming")
diff --git a/libs/python/agent/agent/tools/browser_tool.py b/libs/python/agent/agent/tools/browser_tool.py
index 85b6ba23..a1bf2090 100644
--- a/libs/python/agent/agent/tools/browser_tool.py
+++ b/libs/python/agent/agent/tools/browser_tool.py
@@ -114,3 +114,22 @@ class BrowserTool:
             Response dictionary with success status and current URL
         """
         return await self._execute_command("web_search", {"query": query})
+
+    async def screenshot(self) -> bytes:
+        """
+        Capture the page currently shown in the browser.
+
+        Returns:
+            The screenshot as raw PNG bytes.
+
+        Raises:
+            RuntimeError: If the command fails or no image data comes back.
+        """
+        import base64
+
+        response = await self._execute_command("screenshot", {})
+        encoded = response.get("screenshot") if response.get("success") else None
+        if not encoded:
+            reason = response.get("error", "Unknown error")
+            raise RuntimeError(f"Failed to take screenshot: {reason}")
+        return base64.b64decode(encoded)  # payload arrives base64-encoded
diff --git a/libs/python/computer-server/computer_server/browser.py b/libs/python/computer-server/computer_server/browser.py
index bd0f3f7e..9789abf7 100644
--- a/libs/python/computer-server/computer_server/browser.py
+++ b/libs/python/computer-server/computer_server/browser.py
@@ -109,6 +109,33 @@ class BrowserManager:
                     # Removed --kiosk to allow desktop visibility
                 )
 
+                # Stealth: make navigator.webdriver read false via a proxied getter
+                await self.context.add_init_script(
+                    """const defaultGetter = Object.getOwnPropertyDescriptor(
+      Navigator.prototype,
+      "webdriver"
+    ).get;
+    defaultGetter.apply(navigator);
+    defaultGetter.toString();
+    Object.defineProperty(Navigator.prototype, "webdriver", {
+      set: undefined,
+      enumerable: true,
+      configurable: true,
+      get: new Proxy(defaultGetter, {
+        apply: (target, thisArg, args) => {
+          Reflect.apply(target, thisArg, args);
+          return false;
+        },
+      }),
+    });
+    const patchedGetter = Object.getOwnPropertyDescriptor(
+      Navigator.prototype,
+      "webdriver"
+    ).get;
+    patchedGetter.apply(navigator);
+    patchedGetter.toString();"""
+                )
+
                 # Get the first page or create one
                 pages = self.context.pages
                 if pages:
@@ -167,6 +194,14 @@ class BrowserManager:
             await self.page.goto(search_url, wait_until="domcontentloaded", timeout=30000)
             return {"success": True, "url": self.page.url}
 
+        elif cmd == "screenshot":
+            # Take a screenshot and return as base64
+            import base64
+
+            screenshot_bytes = await self.page.screenshot(type="png")
+            screenshot_b64 = base64.b64encode(screenshot_bytes).decode("utf-8")
+            return {"success": True, "screenshot": screenshot_b64}
+
         else:
             return {"success": False, "error": f"Unknown command: {cmd}"}
 
diff --git a/libs/xfce/Dockerfile b/libs/xfce/Dockerfile
index f1605181..2b687940 100644
--- a/libs/xfce/Dockerfile
+++ b/libs/xfce/Dockerfile
@@ -108,7 +108,7 @@ RUN mkdir -p /home/cua/.cache && \
 RUN python3.12 -m pip install cua-computer-server
 
 # Install playwright and Firefox dependencies
-RUN python3.12 -m pip install playwright && \
+RUN python3.12 -m pip install rebrowser-playwright && \
     python3.12 -m playwright install --with-deps firefox
 
 # Fix any cache files created by pip
diff --git a/libs/xfce/Dockerfile.dev b/libs/xfce/Dockerfile.dev
index c24efaf9..2eec430e 100644
--- a/libs/xfce/Dockerfile.dev
+++ b/libs/xfce/Dockerfile.dev
@@ -110,7 +110,7 @@ RUN python3.12 -m pip install /tmp/computer-server && \
     rm -rf /tmp/computer-server
 
 # Install playwright and Firefox dependencies
-RUN python3.12 -m pip install playwright && \
+RUN python3.12 -m pip install rebrowser-playwright && \
     python3.12 -m playwright install --with-deps firefox
 
 # Fix any cache files created by pip

From 6dcb1a1b9f4b159d88347211cc23bca1d26497c5 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 3 Dec 2025 09:28:59 -0800
Subject: [PATCH 7/8] Clean up extra files

---
 examples/BROWSER_TOOL_README.md | 89 ---------------------------------
 1 file changed, 89 deletions(-)
 delete mode 100644 examples/BROWSER_TOOL_README.md

diff --git a/examples/BROWSER_TOOL_README.md b/examples/BROWSER_TOOL_README.md
deleted file mode 100644
index f72971e8..00000000
--- a/examples/BROWSER_TOOL_README.md
+++ /dev/null
@@ -1,89 +0,0 @@
-# Browser Tool
-
-Browser automation tool that allows agents to control a Firefox browser programmatically via Playwright while keeping it visible on the XFCE desktop.
-
-## Quick Start
-
-### Using Docker (Recommended)
-
-```bash
-# Build and run the container
-cd libs/xfce
-docker build -t cua-xfce .
-docker run -d --name cua-xfce-test \
-  -p 8000:8000 -p 5901:5901 -p 6901:6901 \
-  -e DISPLAY=:1 \
-  cua-xfce
-
-# View desktop: http://localhost:6901
-# Test the browser tool
-python examples/browser_tool_example.py
-```
-
-### Local Testing
-
-```bash
-# Install dependencies
-pip install playwright
-playwright install --with-deps firefox
-
-# Start server
-python -m computer_server --port 8000
-
-# Run test (in another terminal)
-python examples/browser_tool_example.py
-```
-
-## Features
-
-- **Visible Browser**: Runs in non-headless mode so visual agents can see it
-- **Auto-Recovery**: Automatically reopens browser if closed manually
-- **Persistent Context**: Maintains cookies and sessions across commands
-- **Fara/Magentic-One Interface**: Compatible with Microsoft agent interfaces
-- **Computer SDK Integration**: Uses the Computer SDK's interface for unified control
-
-## Usage
-
-The BrowserTool uses the Computer SDK's interface to communicate with the server:
-
-```python
-from computer import Computer
-from agent.tools.browser_tool import BrowserTool
-
-# Initialize computer interface
-computer = Computer(ip_address="localhost")
-
-# Create browser tool with the interface
-browser = BrowserTool(interface=computer)
-
-# Use the browser
-await browser.visit_url("https://www.example.com")
-await browser.click(x=500, y=300)
-await browser.type("Hello, world!")
-```
-
-## API Endpoint
-
-The browser tool is also accessible via the `/playwright_exec` endpoint:
-
-```bash
-curl -X POST http://localhost:8000/playwright_exec \
-  -H "Content-Type: application/json" \
-  -d '{"command": "visit_url", "params": {"url": "https://www.example.com"}}'
-```
-
-## Available Commands
-
-- `visit_url(url)` - Navigate to a URL
-- `click(x, y)` - Click at coordinates
-- `type(text)` - Type text into focused element
-- `scroll(delta_x, delta_y)` - Scroll the page
-- `web_search(query)` - Navigate to Google search
-
-## Troubleshooting
-
-**Browser closes unexpectedly**: The tool automatically reopens the browser on the next command.
-
-**Connection errors**: Make sure the server is running (`curl http://localhost:8000/status`).
-
-**Playwright not found**: Install with `pip install playwright && playwright install --with-deps firefox`.

From ec233e2e891ec6822505f327ab01d1cca7ca24bb Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 3 Dec 2025 09:33:42 -0800
Subject: [PATCH 8/8] Remove rebrowser-playwright

---
 libs/xfce/Dockerfile     | 2 +-
 libs/xfce/Dockerfile.dev | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libs/xfce/Dockerfile b/libs/xfce/Dockerfile
index 2b687940..f1605181 100644
--- a/libs/xfce/Dockerfile
+++ b/libs/xfce/Dockerfile
@@ -108,7 +108,7 @@ RUN mkdir -p /home/cua/.cache && \
 RUN python3.12 -m pip install cua-computer-server
 
 # Install playwright and Firefox dependencies
-RUN python3.12 -m pip install rebrowser-playwright && \
+RUN python3.12 -m pip install playwright && \
     python3.12 -m playwright install --with-deps firefox
 
 # Fix any cache files created by pip
diff --git a/libs/xfce/Dockerfile.dev b/libs/xfce/Dockerfile.dev
index 2eec430e..c24efaf9 100644
--- a/libs/xfce/Dockerfile.dev
+++ b/libs/xfce/Dockerfile.dev
@@ -110,7 +110,7 @@ RUN python3.12 -m pip install /tmp/computer-server && \
     rm -rf /tmp/computer-server
 
 # Install playwright and Firefox dependencies
-RUN python3.12 -m pip install rebrowser-playwright && \
+RUN python3.12 -m pip install playwright && \
     python3.12 -m playwright install --with-deps firefox
 
 # Fix any cache files created by pip