Merge pull request #633 from trycua/feat/browser-tool

Add browser tool with Playwright for visible browser automation
2026-01-08 14:30:25 -06:00 · 2025-12-03 12:36:09 -05:00
parent 7d3d826672 ec233e2e89
commit 1d2317b005
13 changed files with 962 additions and 1 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -15,6 +15,8 @@ repos:
        name: TypeScript type check
        entry: node ./scripts/typescript-typecheck.js
        language: node
+        files: \.(ts|tsx)$
+        pass_filenames: false

  - repo: https://github.com/PyCQA/isort
    rev: 7.0.0
--- a/examples/browser_tool_example.py
+++ b/examples/browser_tool_example.py
@@ -0,0 +1,119 @@
+"""
+Browser Tool Example
+
+Demonstrates how to use the BrowserTool to control a browser programmatically
+via the computer server. The browser runs visibly on the XFCE desktop so visual
+agents can see it.
+
+Prerequisites:
+    - Computer server running (Docker container or local)
+    - For Docker: Container should be running with browser tool support
+    - For local: Playwright and Firefox must be installed
+
+Usage:
+    python examples/browser_tool_example.py
+"""
+
+import asyncio
+import logging
+import sys
+from pathlib import Path
+
+# Add the libs path to sys.path
+libs_path = Path(__file__).parent.parent / "libs" / "python"
+sys.path.insert(0, str(libs_path))
+
+from agent.tools.browser_tool import BrowserTool
+
+# Import Computer interface and BrowserTool
+from computer import Computer
+
+# Configure logging to see what's happening
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+async def test_browser_tool():
+    """Run BrowserTool tests 0-7 in order against a Docker-backed computer, logging results."""
+
+    # Initialize the computer interface
+    # For local testing, use provider_type="docker"
+    # For provider_type="cloud", provide name and api_key
+    computer = Computer(provider_type="docker", os_type="linux", image="cua-xfce:dev")
+    await computer.run()
+
+    # Initialize the browser tool with the computer interface
+    browser = BrowserTool(interface=computer)
+
+    logger.info("Testing Browser Tool...")
+
+    try:
+        # Test 0: Take a screenshot (pre-init)
+        logger.info("Test 0: Taking a screenshot...")
+        screenshot_bytes = await browser.screenshot()
+        screenshot_path = Path(__file__).parent / "browser_screenshot_init.png"
+        with open(screenshot_path, "wb") as f:
+            f.write(screenshot_bytes)
+        logger.info(f"Screenshot captured: {len(screenshot_bytes)} bytes")
+
+        # Test 1: Visit a URL
+        logger.info("Test 1: Visiting a URL...")
+        result = await browser.visit_url("https://www.trycua.com")
+        logger.info(f"Visit URL result: {result}")
+
+        # Wait a bit for the page to load
+        await asyncio.sleep(2)
+
+        # Test 2: Take a screenshot
+        logger.info("Test 2: Taking a screenshot...")
+        screenshot_bytes = await browser.screenshot()
+        screenshot_path = Path(__file__).parent / "browser_screenshot.png"
+        with open(screenshot_path, "wb") as f:
+            f.write(screenshot_bytes)
+        logger.info(f"Screenshot captured: {len(screenshot_bytes)} bytes")
+
+        # Wait a bit
+        await asyncio.sleep(1)
+
+        # Test 3: Visit bot detector
+        logger.info("Test 3: Visiting bot detector...")
+        result = await browser.visit_url("https://bot-detector.rebrowser.net/")
+        logger.info(f"Visit URL result: {result}")
+
+        # Test 4: Web search
+        logger.info("Test 4: Performing a web search...")
+        result = await browser.web_search("Python programming")
+        logger.info(f"Web search result: {result}")
+
+        # Wait a bit
+        await asyncio.sleep(2)
+
+        # Test 5: Scroll
+        logger.info("Test 5: Scrolling the page...")
+        result = await browser.scroll(delta_x=0, delta_y=500)
+        logger.info(f"Scroll result: {result}")
+
+        # Wait a bit
+        await asyncio.sleep(1)
+
+        # Test 6: Click (example coordinates - adjust based on your screen)
+        logger.info("Test 6: Clicking at coordinates...")
+        result = await browser.click(x=500, y=300)
+        logger.info(f"Click result: {result}")
+
+        # Wait a bit
+        await asyncio.sleep(1)
+
+        # Test 7: Type text (if there's a focused input field)
+        logger.info("Test 7: Typing text...")
+        result = await browser.type("Hello from BrowserTool!")
+        logger.info(f"Type result: {result}")
+
+        logger.info("All tests completed!")
+
+    except Exception as e:
+        logger.error(f"Error during testing: {e}", exc_info=True)
+
+
+if __name__ == "__main__":
+    asyncio.run(test_browser_tool())
--- a/libs/python/agent/agent/tools/__init__.py
+++ b/libs/python/agent/agent/tools/__init__.py
@@ -0,0 +1,6 @@
+"""Tools for agent interactions."""
+
+from .browser_tool import BrowserTool
+
+__all__ = ["BrowserTool"]
+
--- a/libs/python/agent/agent/tools/browser_tool.py
+++ b/libs/python/agent/agent/tools/browser_tool.py
@@ -0,0 +1,135 @@
+"""
+Browser Tool for agent interactions.
+Allows agents to control a browser programmatically via Playwright.
+"""
+
+import logging
+from typing import TYPE_CHECKING, Optional
+
+if TYPE_CHECKING:
+    from computer.interface import GenericComputerInterface
+
+logger = logging.getLogger(__name__)
+
+
+class BrowserTool:
+    """
+    Async client for the browser commands exposed through ``playwright_exec``.
+    Implements the Fara/Magentic-One agent interface for browser control.
+    """
+
+    def __init__(
+        self,
+        interface: "GenericComputerInterface",
+    ):
+        """
+        Remember the interface that every command is sent through.
+
+        Args:
+            interface: A GenericComputerInterface instance that provides playwright_exec
+        """
+        self.logger = logger
+        self.interface = interface
+
+    async def _execute_command(self, command: str, params: dict) -> dict:
+        """
+        Send one command through ``interface.playwright_exec`` and log any failure.
+
+        Args:
+            command: Command name
+            params: Command parameters
+
+        Returns:
+            The response dict; ``{"success": False, "error": ...}`` if the call raised
+        """
+        try:
+            response = await self.interface.playwright_exec(command, params)
+            if response.get("success"):
+                return response
+            reason = response.get("error", "Unknown error")
+            self.logger.error(f"Browser command '{command}' failed: {reason}")
+            return response
+        except Exception as exc:
+            self.logger.error(f"Error executing browser command '{command}': {exc}")
+            return {"success": False, "error": str(exc)}
+
+    async def visit_url(self, url: str) -> dict:
+        """
+        Open a URL in the browser.
+
+        Args:
+            url: URL to visit
+
+        Returns:
+            Response dictionary with success status and current URL
+        """
+        return await self._execute_command("visit_url", dict(url=url))
+
+    async def click(self, x: int, y: int) -> dict:
+        """
+        Click the mouse at a position.
+
+        Args:
+            x: X coordinate
+            y: Y coordinate
+
+        Returns:
+            Response dictionary with success status
+        """
+        return await self._execute_command("click", dict(x=x, y=y))
+
+    async def type(self, text: str) -> dict:
+        """
+        Send keystrokes to whichever element has focus.
+
+        Args:
+            text: Text to type
+
+        Returns:
+            Response dictionary with success status
+        """
+        return await self._execute_command("type", dict(text=text))
+
+    async def scroll(self, delta_x: int, delta_y: int) -> dict:
+        """
+        Scroll the page by the given wheel deltas.
+
+        Args:
+            delta_x: Horizontal scroll delta
+            delta_y: Vertical scroll delta
+
+        Returns:
+            Response dictionary with success status
+        """
+        return await self._execute_command("scroll", dict(delta_x=delta_x, delta_y=delta_y))
+
+    async def web_search(self, query: str) -> dict:
+        """
+        Open a Google search results page for the query.
+
+        Args:
+            query: Search query
+
+        Returns:
+            Response dictionary with success status and current URL
+        """
+        return await self._execute_command("web_search", dict(query=query))
+
+    async def screenshot(self) -> bytes:
+        """
+        Capture the current browser page.
+
+        Returns:
+            Screenshot image data as bytes (PNG format)
+
+        Raises:
+            RuntimeError: If the command failed or returned no screenshot data
+        """
+        import base64
+
+        response = await self._execute_command("screenshot", {})
+        encoded = response.get("screenshot") if response.get("success") else None
+        if not encoded:
+            reason = response.get("error", "Unknown error")
+            raise RuntimeError(f"Failed to take screenshot: {reason}")
+        return base64.b64decode(encoded)
--- a/libs/python/computer-server/computer_server/browser.py
+++ b/libs/python/computer-server/computer_server/browser.py
@@ -0,0 +1,361 @@
+"""
+Browser manager using Playwright for programmatic browser control.
+This allows agents to control a browser that runs visibly on the XFCE desktop.
+"""
+
+import asyncio
+import logging
+import os
+from typing import Any, Dict, Optional
+
+try:
+    from playwright.async_api import Browser, BrowserContext, Page, async_playwright
+except ImportError:
+    async_playwright = None
+    Browser = None
+    BrowserContext = None
+    Page = None
+
+logger = logging.getLogger(__name__)
+
+
+class BrowserManager:
+    """
+    Manages a Playwright browser instance that runs visibly on the XFCE desktop.
+    Uses persistent context to maintain cookies and sessions.
+    """
+
+    def __init__(self):
+        """Set up an idle manager; nothing is launched until a command needs the browser."""
+        self._lock = asyncio.Lock()
+        self._initialized = False
+        self._initialization_error: Optional[str] = None
+        self.playwright = None
+        self.browser: Optional[Browser] = None
+        self.context: Optional[BrowserContext] = None
+        self.page: Optional[Page] = None
+
+    async def _ensure_initialized(self):
+        """Launch Firefox via Playwright unless a context is already held; re-raises on failure."""
+        # Check if browser was closed and needs reinitialization
+        if self._initialized:
+            try:
+                # Check if context is still valid by trying to access it
+                if self.context:
+                    # Try to get pages - NOTE(review): assumed to raise if the context is closed; confirm
+                    _ = self.context.pages
+                    # If we get here, context is still alive
+                    return
+                else:
+                    # Context was closed, need to reinitialize
+                    self._initialized = False
+                    logger.warning("Browser context was closed, will reinitialize...")
+            except Exception as e:
+                # Context is dead, need to reinitialize
+                logger.warning(f"Browser context is dead ({e}), will reinitialize...")
+                self._initialized = False
+                self.context = None
+                self.page = None
+                # Clean up playwright if it exists
+                if self.playwright:
+                    try:
+                        await self.playwright.stop()
+                    except Exception:
+                        pass
+                    self.playwright = None
+
+        async with self._lock:
+            # Double-check after acquiring the lock (another task might have initialized it)
+            if self._initialized:
+                try:
+                    if self.context:
+                        _ = self.context.pages
+                        return
+                except Exception:
+                    self._initialized = False
+                    self.context = None
+                    self.page = None
+                    if self.playwright:
+                        try:
+                            await self.playwright.stop()
+                        except Exception:
+                            pass
+                        self.playwright = None
+
+            if async_playwright is None:
+                raise RuntimeError(
+                    "playwright is not installed. Please install it with: pip install playwright && playwright install --with-deps firefox"
+                )
+
+            try:
+                # Get display from environment or default to :1
+                display = os.environ.get("DISPLAY", ":1")
+                logger.info(f"Initializing browser with DISPLAY={display}")
+
+                # Start playwright
+                self.playwright = await async_playwright().start()
+
+                # Launch Firefox with persistent context (keeps cookies/sessions)
+                # headless=False is CRITICAL so the visual agent can see it
+                user_data_dir = os.path.join(os.path.expanduser("~"), ".playwright-firefox")
+                os.makedirs(user_data_dir, exist_ok=True)
+
+                # launch_persistent_context returns a BrowserContext, not a Browser
+                # Note: Removed --kiosk mode so the desktop remains visible
+                self.context = await self.playwright.firefox.launch_persistent_context(
+                    user_data_dir=user_data_dir,
+                    headless=False,  # CRITICAL: visible for visual agent
+                    viewport={"width": 1024, "height": 768},
+                    # Removed --kiosk to allow desktop visibility
+                )
+
+                # Add init script to make the browser less detectable
+                await self.context.add_init_script(
+                    """const defaultGetter = Object.getOwnPropertyDescriptor(
      Navigator.prototype,
      "webdriver"
    ).get;
    defaultGetter.apply(navigator);
    defaultGetter.toString();
    Object.defineProperty(Navigator.prototype, "webdriver", {
      set: undefined,
      enumerable: true,
      configurable: true,
      get: new Proxy(defaultGetter, {
        apply: (target, thisArg, args) => {
          Reflect.apply(target, thisArg, args);
          return false;
        },
      }),
    });
    const patchedGetter = Object.getOwnPropertyDescriptor(
      Navigator.prototype,
      "webdriver"
    ).get;
    patchedGetter.apply(navigator);
    patchedGetter.toString();"""
+                )
+
+                # Get the first page or create one
+                pages = self.context.pages
+                if pages:
+                    self.page = pages[0]
+                else:
+                    self.page = await self.context.new_page()
+
+                self._initialized = True
+                logger.info("Browser initialized successfully")
+
+            except Exception as e:
+                logger.error(f"Failed to initialize browser: {e}")
+                import traceback
+
+                logger.error(traceback.format_exc())
+                # Record the message so execute_command can report it, then re-raise
+                self._initialization_error = str(e)
+                raise
+
+    async def _execute_command_impl(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]:
+        """Run one command on self.page and return its result dict; Playwright errors propagate."""
+        if cmd == "visit_url":
+            url = params.get("url")
+            if not url:
+                return {"success": False, "error": "url parameter is required"}
+            await self.page.goto(url, wait_until="domcontentloaded", timeout=30000)
+            return {"success": True, "url": self.page.url}
+
+        elif cmd == "click":
+            x = params.get("x")
+            y = params.get("y")
+            if x is None or y is None:
+                return {"success": False, "error": "x and y parameters are required"}
+            await self.page.mouse.click(x, y)
+            return {"success": True}
+
+        elif cmd == "type":
+            text = params.get("text")
+            if text is None:
+                return {"success": False, "error": "text parameter is required"}
+            await self.page.keyboard.type(text)
+            return {"success": True}
+
+        elif cmd == "scroll":
+            delta_x = params.get("delta_x", 0)
+            delta_y = params.get("delta_y", 0)
+            await self.page.mouse.wheel(delta_x, delta_y)
+            return {"success": True}
+
+        elif cmd == "web_search":
+            from urllib.parse import quote_plus
+
+            query = params.get("query")
+            if not query:
+                return {"success": False, "error": "query parameter is required"}
+            # Percent-encode so "&", "#", "+" and spaces cannot break out of the q parameter
+            search_url = f"https://www.google.com/search?q={quote_plus(str(query))}"
+            await self.page.goto(search_url, wait_until="domcontentloaded", timeout=30000)
+            return {"success": True, "url": self.page.url}
+
+        elif cmd == "screenshot":
+            import base64
+
+            png_bytes = await self.page.screenshot(type="png")  # returned base64-encoded for JSON
+            return {"success": True, "screenshot": base64.b64encode(png_bytes).decode("utf-8")}
+
+        else:
+            return {"success": False, "error": f"Unknown command: {cmd}"}
+
+    async def execute_command(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Execute a browser command with automatic recovery.
+
+        Args:
+            cmd: Command name (visit_url, click, type, scroll, web_search, screenshot)
+            params: Command parameters
+
+        Returns:
+            Result dictionary with success status and any data
+        """
+        max_retries = 2
+        for attempt in range(max_retries):
+            try:
+                await self._ensure_initialized()
+            except Exception as e:
+                error_msg = getattr(self, "_initialization_error", None) or str(e)
+                logger.error(f"Browser initialization failed: {error_msg}")
+                return {
+                    "success": False,
+                    "error": f"Browser initialization failed: {error_msg}. "
+                    f"Make sure Playwright and Firefox are installed, and DISPLAY is set correctly.",
+                }
+
+            # Check if page is still valid and get a new one if needed
+            page_valid = False
+            try:
+                if self.page is not None and not self.page.is_closed():
+                    # Try to access page.url to check if it's still valid
+                    _ = self.page.url
+                    page_valid = True
+            except Exception as e:
+                logger.warning(f"Page is invalid: {e}, will get a new page...")
+                self.page = None
+
+            # Get a valid page if we don't have one
+            if not page_valid or self.page is None:
+                try:
+                    if self.context:
+                        pages = self.context.pages
+                        if pages:
+                            # Find first non-closed page
+                            for p in pages:
+                                try:
+                                    if not p.is_closed():
+                                        self.page = p
+                                        logger.info("Reusing existing open page")
+                                        page_valid = True
+                                        break
+                                except Exception:
+                                    continue
+
+                        # If no valid page found, create a new one
+                        if not page_valid:
+                            self.page = await self.context.new_page()
+                            logger.info("Created new page")
+                except Exception as e:
+                    logger.error(f"Failed to get new page: {e}, browser may be closed")
+                    # Browser was closed - force reinitialization
+                    self._initialized = False
+                    self.context = None
+                    self.page = None
+                    if self.playwright:
+                        try:
+                            await self.playwright.stop()
+                        except Exception:
+                            pass
+                        self.playwright = None
+
+                    # If this isn't the last attempt, continue to retry
+                    if attempt < max_retries - 1:
+                        logger.info("Browser was closed, retrying with fresh initialization...")
+                        continue
+                    else:
+                        return {
+                            "success": False,
+                            "error": f"Browser was closed and cannot be recovered: {e}",
+                        }
+
+            # Try to execute the command
+            try:
+                return await self._execute_command_impl(cmd, params)
+            except Exception as e:
+                error_str = str(e)
+                logger.error(f"Error executing command {cmd}: {e}")
+
+                # Check if this is a "browser/page/context closed" error
+                if any(keyword in error_str.lower() for keyword in ["closed", "target", "context"]):
+                    logger.warning(
+                        f"Browser/page was closed during command execution (attempt {attempt + 1}/{max_retries})"
+                    )
+
+                    # Force reinitialization
+                    self._initialized = False
+                    self.context = None
+                    self.page = None
+                    if self.playwright:
+                        try:
+                            await self.playwright.stop()
+                        except Exception:
+                            pass
+                        self.playwright = None
+
+                    # If this isn't the last attempt, retry
+                    if attempt < max_retries - 1:
+                        logger.info("Retrying command after browser reinitialization...")
+                        continue
+                    else:
+                        return {
+                            "success": False,
+                            "error": f"Command failed after {max_retries} attempts: {error_str}",
+                        }
+                else:
+                    # Not a browser closed error, return immediately
+                    import traceback
+
+                    logger.error(traceback.format_exc())
+                    return {"success": False, "error": error_str}
+
+        # Should never reach here, but just in case
+        return {"success": False, "error": "Command failed after all retries"}
+
+    async def close(self):
+        """Close the browser and cleanup resources, resetting state even if a step fails."""
+        async with self._lock:
+            # Detach the handles first so a failing close cannot leave stale state behind
+            steps = ((self.context, "close"), (self.browser, "close"), (self.playwright, "stop"))
+            self.context = self.browser = self.playwright = self.page = None
+            self._initialized = False
+
+            failures = 0
+            for handle, method_name in steps:
+                if not handle:
+                    continue
+                # Each resource is released independently: one error must not skip the rest
+                try:
+                    await getattr(handle, method_name)()
+                except Exception as e:
+                    failures += 1
+                    logger.error(f"Error closing browser: {e}")
+            if not failures:
+                logger.info("Browser closed successfully")
+
+
+# Global instance
+_browser_manager: Optional[BrowserManager] = None
+
+
+def get_browser_manager() -> BrowserManager:
+    """Return the module-wide BrowserManager, constructing it on the first call."""
+    global _browser_manager
+    manager = BrowserManager() if _browser_manager is None else _browser_manager
+    _browser_manager = manager
+    return manager
--- a/libs/python/computer-server/computer_server/main.py
+++ b/libs/python/computer-server/computer_server/main.py
@@ -25,6 +25,7 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse

 from .handlers.factory import HandlerFactory
+from .browser import get_browser_manager

 # Authentication session TTL (in seconds). Override via env var CUA_AUTH_TTL_SECONDS. Default: 60s
 AUTH_SESSION_TTL_SECONDS: int = int(os.environ.get("CUA_AUTH_TTL_SECONDS", "60"))
@@ -749,5 +750,71 @@ async def agent_response_endpoint(
    return JSONResponse(content=payload, headers=headers)


+@app.post("/playwright_exec")
+async def playwright_exec_endpoint(
+    request: Request,
+    container_name: Optional[str] = Header(None, alias="X-Container-Name"),
+    api_key: Optional[str] = Header(None, alias="X-API-Key"),
+):
+    """
+    Execute a Playwright browser command (400 if it fails, 500 on unexpected errors).
+
+    Headers:
+    - X-Container-Name: Container name for cloud authentication
+    - X-API-Key: API key for cloud authentication
+
+    Body:
+    {
+        "command": "visit_url|click|type|scroll|web_search|screenshot",
+        "params": {...}
+    }
+    """
+    # Parse request body; a missing or null "params" is treated as no parameters
+    try:
+        body = await request.json()
+        command = body.get("command")
+        params = body.get("params") or {}
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")
+
+    if not command:
+        raise HTTPException(status_code=400, detail="Command is required")
+
+    # Check if CONTAINER_NAME is set (indicating cloud provider)
+    server_container_name = os.environ.get("CONTAINER_NAME")
+
+    # If cloud provider, perform authentication
+    if server_container_name:
+        logger.info(
+            f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication..."
+        )
+
+        # Validate required headers
+        if not container_name:
+            raise HTTPException(status_code=401, detail="Container name required")
+
+        if not api_key:
+            raise HTTPException(status_code=401, detail="API key required")
+
+        # Validate with AuthenticationManager
+        is_authenticated = await auth_manager.auth(container_name, api_key)
+        if not is_authenticated:
+            raise HTTPException(status_code=401, detail="Authentication failed")
+
+    # Get browser manager and execute command
+    try:
+        browser_manager = get_browser_manager()
+        result = await browser_manager.execute_command(command, params)
+    except Exception as e:
+        logger.error(f"Error executing playwright command: {str(e)}")
+        logger.error(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=str(e))
+
+    # Raised outside the try so the 400 is not caught above and re-reported as a 500
+    if result.get("success"):
+        return JSONResponse(content=result)
+    raise HTTPException(status_code=400, detail=result.get("error", "Command failed"))
+
+
 if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
--- a/libs/python/computer-server/pyproject.toml
+++ b/libs/python/computer-server/pyproject.toml
@@ -24,6 +24,7 @@ dependencies = [
    "pyperclip>=1.9.0",
    "websockets>=12.0",
    "pywinctl>=0.4.1",
+    "playwright>=1.40.0",
    # OS-specific runtime deps
    "pyobjc-framework-Cocoa>=10.1; sys_platform == 'darwin'",
    "pyobjc-framework-Quartz>=10.1; sys_platform == 'darwin'",
--- a/libs/python/computer/computer/computer.py
+++ b/libs/python/computer/computer/computer.py
@@ -953,6 +953,35 @@ class Computer:
        """
        return await self.interface.to_screenshot_coordinates(x, y)

+    async def playwright_exec(self, command: str, params: Optional[Dict] = None) -> Dict[str, Any]:
+        """Run a Playwright browser command by delegating to ``self.interface``.
+
+        Args:
+            command: Browser command (visit_url, click, type, scroll, web_search, screenshot)
+            params: Command parameters, forwarded unchanged
+
+        Returns:
+            Dict containing the command result
+
+        Examples:
+            # Navigate to a URL
+            await computer.playwright_exec("visit_url", {"url": "https://example.com"})
+
+            # Click at coordinates
+            await computer.playwright_exec("click", {"x": 100, "y": 200})
+
+            # Type text
+            await computer.playwright_exec("type", {"text": "Hello, world!"})
+
+            # Scroll
+            await computer.playwright_exec("scroll", {"delta_x": 0, "delta_y": -100})
+
+            # Web search
+            await computer.playwright_exec("web_search", {"query": "computer use agent"})
+        """
+        outcome = await self.interface.playwright_exec(command, params)
+        return outcome
+
    # Add virtual environment management functions to computer interface
    async def venv_install(self, venv_name: str, requirements: list[str]):
        """Install packages in a virtual environment.
--- a/libs/python/computer/computer/interface/generic.py
+++ b/libs/python/computer/computer/interface/generic.py
@@ -661,6 +661,56 @@ class GenericComputerInterface(BaseComputerInterface):

        return screenshot_x, screenshot_y

+    # Playwright browser control
+    async def playwright_exec(self, command: str, params: Optional[Dict] = None) -> Dict[str, Any]:
+        """
+        Send a browser command to the computer server's ``/playwright_exec`` endpoint.
+
+        Args:
+            command: Browser command (visit_url, click, type, scroll, web_search, screenshot)
+            params: Command parameters (``None`` is sent as an empty dict)
+
+        Returns:
+            The JSON body on HTTP 200, otherwise ``{"success": False, "error": ...}``
+
+        Examples:
+            # Navigate to a URL
+            await interface.playwright_exec("visit_url", {"url": "https://example.com"})
+
+            # Click at coordinates
+            await interface.playwright_exec("click", {"x": 100, "y": 200})
+
+            # Type text
+            await interface.playwright_exec("type", {"text": "Hello, world!"})
+
+            # Scroll
+            await interface.playwright_exec("scroll", {"delta_x": 0, "delta_y": -100})
+
+            # Web search
+            await interface.playwright_exec("web_search", {"query": "computer use agent"})
+        """
+        # An API key selects HTTPS on port 8443; without one, plain HTTP on port 8000 is used
+        secure = bool(self.api_key)
+        scheme, port = ("https", "8443") if secure else ("http", "8000")
+        endpoint = f"{scheme}://{self.ip_address}:{port}/playwright_exec"
+
+        request_headers = {"Content-Type": "application/json"}
+        if self.api_key:
+            request_headers["X-API-Key"] = self.api_key
+        if self.vm_name:
+            request_headers["X-Container-Name"] = self.vm_name
+        body = {"command": command, "params": params or {}}
+
+        # Transport errors and non-200 replies are reported in-band rather than raised
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.post(endpoint, json=body, headers=request_headers) as response:
+                    if response.status != 200:
+                        return {"success": False, "error": await response.text()}
+                    return await response.json()
+        except Exception as exc:
+            return {"success": False, "error": str(exc)}
+
    # Websocket Methods
    async def _keep_alive(self):
        """Keep the WebSocket connection alive with automatic reconnection."""
--- a/libs/xfce/Development.md
+++ b/libs/xfce/Development.md
@@ -0,0 +1,28 @@
+# Development
+
+## Building the Development Docker Image
+
+To build the XFCE container with local computer-server changes:
+
+```bash
+cd libs/xfce
+docker build -f Dockerfile.dev -t cua-xfce:dev ..
+```
+
+The build context is set to the parent directory to allow copying the local `computer-server` source.
+
+## Tagging the Image
+
+To tag the dev image as latest:
+
+```bash
+docker tag cua-xfce:dev cua-xfce:latest
+```
+
+## Running the Development Container
+
+```bash
+docker run -p 6901:6901 -p 8000:8000 cua-xfce:dev
+```
+
+Access noVNC at: http://localhost:6901
--- a/libs/xfce/Dockerfile
+++ b/libs/xfce/Dockerfile
@@ -107,6 +107,10 @@ RUN mkdir -p /home/cua/.cache && \
 # Install computer-server using Python 3.12 pip
 RUN python3.12 -m pip install cua-computer-server

+# Install Playwright, its Firefox browser build, and the system dependencies Firefox needs
+RUN python3.12 -m pip install playwright && \
+    python3.12 -m playwright install --with-deps firefox
+
 # Fix any cache files created by pip
 RUN chown -R cua:cua /home/cua/.cache

--- a/libs/xfce/Dockerfile.dev
+++ b/libs/xfce/Dockerfile.dev
@@ -0,0 +1,159 @@
+# CUA Docker XFCE Container - Development Version
+# Vanilla XFCE desktop with noVNC and computer-server (from local source)
+
+FROM ubuntu:22.04
+
+# Avoid prompts from apt
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Set environment variables
+ENV HOME=/home/cua
+ENV DISPLAY=:1
+ENV VNC_PORT=5901
+ENV NOVNC_PORT=6901
+ENV API_PORT=8000
+ENV VNC_RESOLUTION=1024x768
+ENV VNC_COL_DEPTH=24
+
+# Install system dependencies first (including sudo)
+RUN apt-get update && apt-get install -y \
+    # System utilities
+    sudo \
+    unzip \
+    zip \
+    xdg-utils \
+    # Desktop environment
+    xfce4 \
+    xfce4-terminal \
+    dbus-x11 \
+    # VNC server
+    tigervnc-standalone-server \
+    tigervnc-common \
+    # noVNC dependencies
+    # python will be installed via deadsnakes as 3.12 \
+    git \
+    net-tools \
+    netcat \
+    supervisor \
+    # Computer-server dependencies
+    # python-tk/dev for 3.12 will be installed later \
+    gnome-screenshot \
+    wmctrl \
+    ffmpeg \
+    socat \
+    xclip \
+    # Download tool and PPA support (add-apt-repository), used for the Python and Firefox installs below
+    wget \
+    software-properties-common \
+    # Build tools
+    build-essential \
+    libncursesw5-dev \
+    libssl-dev \
+    libsqlite3-dev \
+    tk-dev \
+    libgdbm-dev \
+    libc6-dev \
+    libbz2-dev \
+    libffi-dev \
+    zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python 3.12 from deadsnakes (keep system python3 for apt)
+RUN add-apt-repository -y ppa:deadsnakes/ppa && \
+    apt-get update && apt-get install -y \
+    python3.12 python3.12-venv python3.12-dev python3.12-tk && \
+    python3.12 -m ensurepip --upgrade && \
+    python3.12 -m pip install --upgrade pip setuptools wheel && \
+    rm -rf /var/lib/apt/lists/*
+
+# Ensure 'python' points to Python 3.12
+RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 2
+
+# Remove screensavers and power manager to avoid popups and lock screens
+RUN apt-get remove -y \
+    xfce4-power-manager \
+    xfce4-power-manager-data \
+    xfce4-power-manager-plugins \
+    xfce4-screensaver \
+    light-locker \
+    xscreensaver \
+    xscreensaver-data || true
+
+# Create user after sudo is installed
+RUN useradd -m -s /bin/bash -G sudo cua && \
+    echo "cua:cua" | chpasswd && \
+    echo "cua ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers
+
+# Install Firefox from Mozilla PPA (snap-free) - inline to avoid script issues
+RUN apt-get update && \
+    add-apt-repository -y ppa:mozillateam/ppa && \
+    echo 'Package: *\nPin: release o=LP-PPA-mozillateam\nPin-Priority: 1001' > /etc/apt/preferences.d/mozilla-firefox && \
+    apt-get update && \
+    apt-get install -y firefox && \
+    echo 'pref("datareporting.policy.firstRunURL", "");\npref("datareporting.policy.dataSubmissionEnabled", false);\npref("datareporting.healthreport.service.enabled", false);\npref("datareporting.healthreport.uploadEnabled", false);\npref("trailhead.firstrun.branches", "nofirstrun-empty");\npref("browser.aboutwelcome.enabled", false);' > /usr/lib/firefox/browser/defaults/preferences/firefox.js && \
+    update-alternatives --install /usr/bin/x-www-browser x-www-browser /usr/bin/firefox 100 && \
+    update-alternatives --install /usr/bin/gnome-www-browser gnome-www-browser /usr/bin/firefox 100 && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install noVNC
+RUN git clone https://github.com/novnc/noVNC.git /opt/noVNC && \
+    git clone https://github.com/novnc/websockify /opt/noVNC/utils/websockify && \
+    ln -s /opt/noVNC/vnc.html /opt/noVNC/index.html
+
+# Pre-create cache directory with correct ownership before pip install
+RUN mkdir -p /home/cua/.cache && \
+    chown -R cua:cua /home/cua/.cache
+
+# Copy local computer-server source and install it
+COPY python/computer-server /tmp/computer-server
+RUN python3.12 -m pip install /tmp/computer-server && \
+    rm -rf /tmp/computer-server
+
+# Install Playwright, its Firefox browser build, and the system dependencies Firefox needs
+RUN python3.12 -m pip install playwright && \
+    python3.12 -m playwright install --with-deps firefox
+
+# Fix any cache files created by pip
+RUN chown -R cua:cua /home/cua/.cache
+
+# Copy supervisor configuration and startup scripts
+COPY xfce/src/supervisor/ /etc/supervisor/conf.d/
+COPY xfce/src/scripts/ /usr/local/bin/
+
+# Make scripts executable
+RUN chmod +x /usr/local/bin/*.sh
+
+# Give the cua user ownership of its home directory, then switch to it for per-user setup
+RUN chown -R cua:cua /home/cua
+USER cua
+WORKDIR /home/cua
+
+# Create VNC directory (no password needed with SecurityTypes None)
+RUN mkdir -p $HOME/.vnc
+
+# Configure XFCE for first start
+RUN mkdir -p $HOME/.config/xfce4/xfconf/xfce-perchannel-xml $HOME/.config/xfce4 $HOME/.config/autostart
+
+# Copy XFCE config to disable browser launching and welcome screens
+COPY --chown=cua:cua xfce/src/xfce-config/helpers.rc $HOME/.config/xfce4/helpers.rc
+COPY --chown=cua:cua xfce/src/xfce-config/xfce4-session.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-session.xml
+COPY --chown=cua:cua xfce/src/xfce-config/xfce4-power-manager.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-power-manager.xml
+
+# Disable autostart for tips, screensaver, lock screen, and power manager
+RUN echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-tips-autostart.desktop && \
+    echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-screensaver.desktop && \
+    echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/light-locker.desktop && \
+    echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-power-manager.desktop && \
+    chown -R cua:cua $HOME/.config
+
+# Create storage and shared directories, plus the dconf cache and ~/.mozilla/firefox directories
+RUN mkdir -p $HOME/storage $HOME/shared $HOME/.cache/dconf $HOME/.mozilla/firefox && \
+    chown -R cua:cua $HOME/storage $HOME/shared $HOME/.cache $HOME/.mozilla $HOME/.vnc
+
+USER root
+
+# Expose ports
+EXPOSE $VNC_PORT $NOVNC_PORT $API_PORT
+
+# Start services via supervisor
+CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
--- a/libs/xfce/src/scripts/start-computer-server.sh
+++ b/libs/xfce/src/scripts/start-computer-server.sh
@@ -10,4 +10,4 @@ echo "X server is ready"

 # Start computer-server
 export DISPLAY=:1
-python3 -m computer_server --port ${API_PORT:-8000}
+python -m computer_server --port ${API_PORT:-8000}