diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d9475d42..a2e35493 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,6 +15,8 @@ repos: name: TypeScript type check entry: node ./scripts/typescript-typecheck.js language: node + files: \.(ts|tsx)$ + pass_filenames: false - repo: https://github.com/PyCQA/isort rev: 7.0.0 diff --git a/examples/browser_tool_example.py b/examples/browser_tool_example.py new file mode 100644 index 00000000..e5767f27 --- /dev/null +++ b/examples/browser_tool_example.py @@ -0,0 +1,119 @@ +""" +Browser Tool Example + +Demonstrates how to use the BrowserTool to control a browser programmatically +via the computer server. The browser runs visibly on the XFCE desktop so visual +agents can see it. + +Prerequisites: + - Computer server running (Docker container or local) + - For Docker: Container should be running with browser tool support + - For local: Playwright and Firefox must be installed + +Usage: + python examples/browser_tool_example.py +""" + +import asyncio +import logging +import sys +from pathlib import Path + +# Add the libs path to sys.path +libs_path = Path(__file__).parent.parent / "libs" / "python" +sys.path.insert(0, str(libs_path)) + +from agent.tools.browser_tool import BrowserTool + +# Import Computer interface and BrowserTool +from computer import Computer + +# Configure logging to see what's happening +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +async def test_browser_tool(): + """Test the BrowserTool with various commands.""" + + # Initialize the computer interface + # For local testing, use provider_type="docker" + # For provider_type="cloud", provide name and api_key + computer = Computer(provider_type="docker", os_type="linux", image="cua-xfce:dev") + await computer.run() + + # Initialize the browser tool with the computer interface + browser = BrowserTool(interface=computer) + + logger.info("Testing Browser Tool...") + + try: + # Test 
0: Take a screenshot (pre-init) + logger.info("Test 0: Taking a screenshot...") + screenshot_bytes = await browser.screenshot() + screenshot_path = Path(__file__).parent / "browser_screenshot_init.png" + with open(screenshot_path, "wb") as f: + f.write(screenshot_bytes) + logger.info(f"Screenshot captured: {len(screenshot_bytes)} bytes") + + # Test 1: Visit a URL + logger.info("Test 1: Visiting a URL...") + result = await browser.visit_url("https://www.trycua.com") + logger.info(f"Visit URL result: {result}") + + # Wait a bit for the page to load + await asyncio.sleep(2) + + # Test 2: Take a screenshot + logger.info("Test 2: Taking a screenshot...") + screenshot_bytes = await browser.screenshot() + screenshot_path = Path(__file__).parent / "browser_screenshot.png" + with open(screenshot_path, "wb") as f: + f.write(screenshot_bytes) + logger.info(f"Screenshot captured: {len(screenshot_bytes)} bytes") + + # Wait a bit + await asyncio.sleep(1) + + # Test 3: Visit bot detector + logger.info("Test 3: Visiting bot detector...") + result = await browser.visit_url("https://bot-detector.rebrowser.net/") + logger.info(f"Visit URL result: {result}") + + # Test 4: Web search + logger.info("Test 4: Performing a web search...") + result = await browser.web_search("Python programming") + logger.info(f"Web search result: {result}") + + # Wait a bit + await asyncio.sleep(2) + + # Test 5: Scroll + logger.info("Test 5: Scrolling the page...") + result = await browser.scroll(delta_x=0, delta_y=500) + logger.info(f"Scroll result: {result}") + + # Wait a bit + await asyncio.sleep(1) + + # Test 6: Click (example coordinates - adjust based on your screen) + logger.info("Test 6: Clicking at coordinates...") + result = await browser.click(x=500, y=300) + logger.info(f"Click result: {result}") + + # Wait a bit + await asyncio.sleep(1) + + # Test 7: Type text (if there's a focused input field) + logger.info("Test 7: Typing text...") + result = await browser.type("Hello from BrowserTool!") +
logger.info(f"Type result: {result}") + + logger.info("All tests completed!") + + except Exception as e: + logger.error(f"Error during testing: {e}", exc_info=True) + + +if __name__ == "__main__": + asyncio.run(test_browser_tool()) diff --git a/libs/python/agent/agent/tools/__init__.py b/libs/python/agent/agent/tools/__init__.py new file mode 100644 index 00000000..e663c557 --- /dev/null +++ b/libs/python/agent/agent/tools/__init__.py @@ -0,0 +1,6 @@ +"""Tools for agent interactions.""" + +from .browser_tool import BrowserTool + +__all__ = ["BrowserTool"] + diff --git a/libs/python/agent/agent/tools/browser_tool.py b/libs/python/agent/agent/tools/browser_tool.py new file mode 100644 index 00000000..a1bf2090 --- /dev/null +++ b/libs/python/agent/agent/tools/browser_tool.py @@ -0,0 +1,135 @@ +""" +Browser Tool for agent interactions. +Allows agents to control a browser programmatically via Playwright. +""" + +import logging +from typing import TYPE_CHECKING, Optional + +if TYPE_CHECKING: + from computer.interface import GenericComputerInterface + +logger = logging.getLogger(__name__) + + +class BrowserTool: + """ + Browser tool that uses the computer SDK's interface to control a browser. + Implements the Fara/Magentic-One agent interface for browser control. + """ + + def __init__( + self, + interface: "GenericComputerInterface", + ): + """ + Initialize the BrowserTool. + + Args: + interface: A GenericComputerInterface instance that provides playwright_exec + """ + self.interface = interface + self.logger = logger + + async def _execute_command(self, command: str, params: dict) -> dict: + """ + Execute a browser command via the computer interface. 
+ + Args: + command: Command name + params: Command parameters + + Returns: + Response dictionary + """ + try: + result = await self.interface.playwright_exec(command, params) + if not result.get("success"): + self.logger.error( + f"Browser command '{command}' failed: {result.get('error', 'Unknown error')}" + ) + return result + except Exception as e: + self.logger.error(f"Error executing browser command '{command}': {e}") + return {"success": False, "error": str(e)} + + async def visit_url(self, url: str) -> dict: + """ + Navigate to a URL. + + Args: + url: URL to visit + + Returns: + Response dictionary with success status and current URL + """ + return await self._execute_command("visit_url", {"url": url}) + + async def click(self, x: int, y: int) -> dict: + """ + Click at coordinates. + + Args: + x: X coordinate + y: Y coordinate + + Returns: + Response dictionary with success status + """ + return await self._execute_command("click", {"x": x, "y": y}) + + async def type(self, text: str) -> dict: + """ + Type text into the focused element. + + Args: + text: Text to type + + Returns: + Response dictionary with success status + """ + return await self._execute_command("type", {"text": text}) + + async def scroll(self, delta_x: int, delta_y: int) -> dict: + """ + Scroll the page. + + Args: + delta_x: Horizontal scroll delta + delta_y: Vertical scroll delta + + Returns: + Response dictionary with success status + """ + return await self._execute_command("scroll", {"delta_x": delta_x, "delta_y": delta_y}) + + async def web_search(self, query: str) -> dict: + """ + Navigate to a Google search for the query. + + Args: + query: Search query + + Returns: + Response dictionary with success status and current URL + """ + return await self._execute_command("web_search", {"query": query}) + + async def screenshot(self) -> bytes: + """ + Take a screenshot of the current browser page. 
+ + Returns: + Screenshot image data as bytes (PNG format) + """ + import base64 + + result = await self._execute_command("screenshot", {}) + if result.get("success") and result.get("screenshot"): + # Decode base64 screenshot to bytes + screenshot_b64 = result["screenshot"] + screenshot_bytes = base64.b64decode(screenshot_b64) + return screenshot_bytes + else: + error = result.get("error", "Unknown error") + raise RuntimeError(f"Failed to take screenshot: {error}") diff --git a/libs/python/computer-server/computer_server/browser.py b/libs/python/computer-server/computer_server/browser.py new file mode 100644 index 00000000..9789abf7 --- /dev/null +++ b/libs/python/computer-server/computer_server/browser.py @@ -0,0 +1,361 @@ +""" +Browser manager using Playwright for programmatic browser control. +This allows agents to control a browser that runs visibly on the XFCE desktop. +""" + +import asyncio +import logging +import os +from typing import Any, Dict, Optional + +try: + from playwright.async_api import Browser, BrowserContext, Page, async_playwright +except ImportError: + async_playwright = None + Browser = None + BrowserContext = None + Page = None + +logger = logging.getLogger(__name__) + + +class BrowserManager: + """ + Manages a Playwright browser instance that runs visibly on the XFCE desktop. + Uses persistent context to maintain cookies and sessions. 
+ """ + + def __init__(self): + """Initialize the BrowserManager.""" + self.playwright = None + self.browser: Optional[Browser] = None + self.context: Optional[BrowserContext] = None + self.page: Optional[Page] = None + self._initialized = False + self._initialization_error: Optional[str] = None + self._lock = asyncio.Lock() + + async def _ensure_initialized(self): + """Ensure the browser is initialized.""" + # Check if browser was closed and needs reinitialization + if self._initialized: + try: + # Check if context is still valid by trying to access it + if self.context: + # Try to get pages - this will raise if context is closed + _ = self.context.pages + # If we get here, context is still alive + return + else: + # Context was closed, need to reinitialize + self._initialized = False + logger.warning("Browser context was closed, will reinitialize...") + except Exception as e: + # Context is dead, need to reinitialize + logger.warning(f"Browser context is dead ({e}), will reinitialize...") + self._initialized = False + self.context = None + self.page = None + # Clean up playwright if it exists + if self.playwright: + try: + await self.playwright.stop() + except Exception: + pass + self.playwright = None + + async with self._lock: + # Double-check after acquiring lock (another thread might have initialized it) + if self._initialized: + try: + if self.context: + _ = self.context.pages + return + except Exception: + self._initialized = False + self.context = None + self.page = None + if self.playwright: + try: + await self.playwright.stop() + except Exception: + pass + self.playwright = None + + if async_playwright is None: + raise RuntimeError( + "playwright is not installed. 
Please install it with: pip install playwright && playwright install --with-deps firefox" + ) + + try: + # Get display from environment or default to :1 + display = os.environ.get("DISPLAY", ":1") + logger.info(f"Initializing browser with DISPLAY={display}") + + # Start playwright + self.playwright = await async_playwright().start() + + # Launch Firefox with persistent context (keeps cookies/sessions) + # headless=False is CRITICAL so the visual agent can see it + user_data_dir = os.path.join(os.path.expanduser("~"), ".playwright-firefox") + os.makedirs(user_data_dir, exist_ok=True) + + # launch_persistent_context returns a BrowserContext, not a Browser + # Note: Removed --kiosk mode so the desktop remains visible + self.context = await self.playwright.firefox.launch_persistent_context( + user_data_dir=user_data_dir, + headless=False, # CRITICAL: visible for visual agent + viewport={"width": 1024, "height": 768}, + # Removed --kiosk to allow desktop visibility + ) + + # Add init script to make the browser less detectable + await self.context.add_init_script( + """const defaultGetter = Object.getOwnPropertyDescriptor( + Navigator.prototype, + "webdriver" + ).get; + defaultGetter.apply(navigator); + defaultGetter.toString(); + Object.defineProperty(Navigator.prototype, "webdriver", { + set: undefined, + enumerable: true, + configurable: true, + get: new Proxy(defaultGetter, { + apply: (target, thisArg, args) => { + Reflect.apply(target, thisArg, args); + return false; + }, + }), + }); + const patchedGetter = Object.getOwnPropertyDescriptor( + Navigator.prototype, + "webdriver" + ).get; + patchedGetter.apply(navigator); + patchedGetter.toString();""" + ) + + # Get the first page or create one + pages = self.context.pages + if pages: + self.page = pages[0] + else: + self.page = await self.context.new_page() + + self._initialized = True + logger.info("Browser initialized successfully") + + except Exception as e: + logger.error(f"Failed to initialize browser: {e}") + 
import traceback + + logger.error(traceback.format_exc()) + # Record the error so execute_command can report it, then re-raise + self._initialization_error = str(e) + raise + + async def _execute_command_impl(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]: + """Run one command (visit_url, click, type, scroll, web_search, screenshot) on self.page and return a dict with a "success" flag.""" + if cmd == "visit_url": + url = params.get("url") + if not url: + return {"success": False, "error": "url parameter is required"} + await self.page.goto(url, wait_until="domcontentloaded", timeout=30000) + return {"success": True, "url": self.page.url} + + elif cmd == "click": + x = params.get("x") + y = params.get("y") + if x is None or y is None: + return {"success": False, "error": "x and y parameters are required"} + await self.page.mouse.click(x, y) + return {"success": True} + + elif cmd == "type": + text = params.get("text") + if text is None: + return {"success": False, "error": "text parameter is required"} + await self.page.keyboard.type(text) + return {"success": True} + + elif cmd == "scroll": + delta_x = params.get("delta_x", 0) + delta_y = params.get("delta_y", 0) + await self.page.mouse.wheel(delta_x, delta_y) + return {"success": True} + + elif cmd == "web_search": + query = params.get("query") + if not query: + return {"success": False, "error": "query parameter is required"} + from urllib.parse import quote_plus # percent-encode the query so characters like & and # cannot break out of the q= value + search_url = f"https://www.google.com/search?q={quote_plus(str(query))}" + await self.page.goto(search_url, wait_until="domcontentloaded", timeout=30000) + return {"success": True, "url": self.page.url} + + elif cmd == "screenshot": + # Take a screenshot and return as base64 + import base64 + + screenshot_bytes = await self.page.screenshot(type="png") + screenshot_b64 = base64.b64encode(screenshot_bytes).decode("utf-8") + return {"success": True, "screenshot": screenshot_b64} + + else: + return {"success": False, "error": f"Unknown command: {cmd}"} + + async def execute_command(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]: + """ + Execute a
browser command with automatic recovery. + + Args: + cmd: Command name (visit_url, click, type, scroll, web_search) + params: Command parameters + + Returns: + Result dictionary with success status and any data + """ + max_retries = 2 + for attempt in range(max_retries): + try: + await self._ensure_initialized() + except Exception as e: + error_msg = getattr(self, "_initialization_error", None) or str(e) + logger.error(f"Browser initialization failed: {error_msg}") + return { + "success": False, + "error": f"Browser initialization failed: {error_msg}. " + f"Make sure Playwright and Firefox are installed, and DISPLAY is set correctly.", + } + + # Check if page is still valid and get a new one if needed + page_valid = False + try: + if self.page is not None and not self.page.is_closed(): + # Try to access page.url to check if it's still valid + _ = self.page.url + page_valid = True + except Exception as e: + logger.warning(f"Page is invalid: {e}, will get a new page...") + self.page = None + + # Get a valid page if we don't have one + if not page_valid or self.page is None: + try: + if self.context: + pages = self.context.pages + if pages: + # Find first non-closed page + for p in pages: + try: + if not p.is_closed(): + self.page = p + logger.info("Reusing existing open page") + page_valid = True + break + except Exception: + continue + + # If no valid page found, create a new one + if not page_valid: + self.page = await self.context.new_page() + logger.info("Created new page") + except Exception as e: + logger.error(f"Failed to get new page: {e}, browser may be closed") + # Browser was closed - force reinitialization + self._initialized = False + self.context = None + self.page = None + if self.playwright: + try: + await self.playwright.stop() + except Exception: + pass + self.playwright = None + + # If this isn't the last attempt, continue to retry + if attempt < max_retries - 1: + logger.info("Browser was closed, retrying with fresh initialization...") + continue 
+ else: + return { + "success": False, + "error": f"Browser was closed and cannot be recovered: {e}", + } + + # Try to execute the command + try: + return await self._execute_command_impl(cmd, params) + except Exception as e: + error_str = str(e) + logger.error(f"Error executing command {cmd}: {e}") + + # Check if this is a "browser/page/context closed" error + if any(keyword in error_str.lower() for keyword in ["closed", "target", "context"]): + logger.warning( + f"Browser/page was closed during command execution (attempt {attempt + 1}/{max_retries})" + ) + + # Force reinitialization + self._initialized = False + self.context = None + self.page = None + if self.playwright: + try: + await self.playwright.stop() + except Exception: + pass + self.playwright = None + + # If this isn't the last attempt, retry + if attempt < max_retries - 1: + logger.info("Retrying command after browser reinitialization...") + continue + else: + return { + "success": False, + "error": f"Command failed after {max_retries} attempts: {error_str}", + } + else: + # Not a browser closed error, return immediately + import traceback + + logger.error(traceback.format_exc()) + return {"success": False, "error": error_str} + + # Should never reach here, but just in case + return {"success": False, "error": "Command failed after all retries"} + + async def close(self): + """Close the browser and cleanup resources.""" + async with self._lock: + try: + if self.context: + await self.context.close() + self.context = None + if self.browser: + await self.browser.close() + self.browser = None + + if self.playwright: + await self.playwright.stop() + self.playwright = None + + self.page = None + self._initialized = False + logger.info("Browser closed successfully") + except Exception as e: + logger.error(f"Error closing browser: {e}") + + +# Global instance +_browser_manager: Optional[BrowserManager] = None + + +def get_browser_manager() -> BrowserManager: + """Get or create the global BrowserManager 
instance.""" + global _browser_manager + if _browser_manager is None: + _browser_manager = BrowserManager() + return _browser_manager diff --git a/libs/python/computer-server/computer_server/main.py b/libs/python/computer-server/computer_server/main.py index 3ae97ebc..9bad59bf 100644 --- a/libs/python/computer-server/computer_server/main.py +++ b/libs/python/computer-server/computer_server/main.py @@ -25,6 +25,7 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, StreamingResponse from .handlers.factory import HandlerFactory +from .browser import get_browser_manager # Authentication session TTL (in seconds). Override via env var CUA_AUTH_TTL_SECONDS. Default: 60s AUTH_SESSION_TTL_SECONDS: int = int(os.environ.get("CUA_AUTH_TTL_SECONDS", "60")) @@ -749,5 +750,77 @@ async def agent_response_endpoint( return JSONResponse(content=payload, headers=headers) +@app.post("/playwright_exec") +async def playwright_exec_endpoint( + request: Request, + container_name: Optional[str] = Header(None, alias="X-Container-Name"), + api_key: Optional[str] = Header(None, alias="X-API-Key"), +): + """ + Execute Playwright browser commands. + + Headers: + - X-Container-Name: Container name for cloud authentication + - X-API-Key: API key for cloud authentication + + Body: + { + "command": "visit_url|click|type|scroll|web_search|screenshot", + "params": {...} + } + + Returns: + JSONResponse with the command result when the command succeeds + + Raises: + HTTPException: 400 for an invalid body, a missing command, or a failed command; 401 when cloud authentication fails; 500 for an unexpected error while executing + """ + # Parse request body + try: + body = await request.json() + command = body.get("command") + params = body.get("params", {}) + except Exception as e: + raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}") + + if not command: + raise HTTPException(status_code=400, detail="Command is required") + + # Check if CONTAINER_NAME is set (indicating cloud provider) + server_container_name = os.environ.get("CONTAINER_NAME") + + # If cloud provider, perform authentication + if server_container_name: + logger.info( + f"Cloud provider detected.
CONTAINER_NAME: {server_container_name}. Performing authentication..." + ) + + # Validate required headers + if not container_name: + raise HTTPException(status_code=401, detail="Container name required") + + if not api_key: + raise HTTPException(status_code=401, detail="API key required") + + # Validate with AuthenticationManager + is_authenticated = await auth_manager.auth(container_name, api_key) + if not is_authenticated: + raise HTTPException(status_code=401, detail="Authentication failed") + + # Get browser manager and execute command + try: + browser_manager = get_browser_manager() + result = await browser_manager.execute_command(command, params) + except Exception as e: + logger.error(f"Error executing playwright command: {str(e)}") + logger.error(traceback.format_exc()) + raise HTTPException(status_code=500, detail=str(e)) + + # Report command failures as 400 outside the try so they are not re-wrapped as 500 + if not result.get("success"): + raise HTTPException(status_code=400, detail=result.get("error", "Command failed")) + return JSONResponse(content=result) + + if __name__ == "__main__": uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/libs/python/computer-server/pyproject.toml b/libs/python/computer-server/pyproject.toml index 07dc51df..e1a3445b 100644 --- a/libs/python/computer-server/pyproject.toml +++ b/libs/python/computer-server/pyproject.toml @@ -24,6 +24,7 @@ dependencies = [ "pyperclip>=1.9.0", "websockets>=12.0", "pywinctl>=0.4.1", + "playwright>=1.40.0", # OS-specific runtime deps "pyobjc-framework-Cocoa>=10.1; sys_platform == 'darwin'", "pyobjc-framework-Quartz>=10.1; sys_platform == 'darwin'", diff --git a/libs/python/computer/computer/computer.py b/libs/python/computer/computer/computer.py index 0b1cd509..710b08a2 100644 --- a/libs/python/computer/computer/computer.py +++ b/libs/python/computer/computer/computer.py @@ -953,6 +953,35 @@ class Computer: """ return await self.interface.to_screenshot_coordinates(x, y) + async def playwright_exec(self, command: str, params: Optional[Dict] = None) -> Dict[str, Any]:
+ """ + Execute a Playwright browser command. + + Args: + command: The browser command to execute (visit_url, click, type, scroll, web_search, screenshot) + params: Command parameters + + Returns: + Dict containing the command result + + Examples: + # Navigate to a URL + await computer.playwright_exec("visit_url", {"url": "https://example.com"}) + + # Click at coordinates + await computer.playwright_exec("click", {"x": 100, "y": 200}) + + # Type text + await computer.playwright_exec("type", {"text": "Hello, world!"}) + + # Scroll + await computer.playwright_exec("scroll", {"delta_x": 0, "delta_y": -100}) + + # Web search + await computer.playwright_exec("web_search", {"query": "computer use agent"}) + """ + return await self.interface.playwright_exec(command, params) + # Add virtual environment management functions to computer interface async def venv_install(self, venv_name: str, requirements: list[str]): """Install packages in a virtual environment. diff --git a/libs/python/computer/computer/interface/generic.py b/libs/python/computer/computer/interface/generic.py index e58719dd..d5a5dc4b 100644 --- a/libs/python/computer/computer/interface/generic.py +++ b/libs/python/computer/computer/interface/generic.py @@ -661,6 +661,56 @@ class GenericComputerInterface(BaseComputerInterface): return screenshot_x, screenshot_y + # Playwright browser control + async def playwright_exec(self, command: str, params: Optional[Dict] = None) -> Dict[str, Any]: + """ + Execute a Playwright browser command.
+ + Args: + command: The browser command to execute (visit_url, click, type, scroll, web_search, screenshot) + params: Command parameters + + Returns: + The server's JSON result on HTTP 200; otherwise {"success": False, "error": <message>} for a non-200 response or a request exception + + Examples: + # Navigate to a URL + await interface.playwright_exec("visit_url", {"url": "https://example.com"}) + + # Click at coordinates + await interface.playwright_exec("click", {"x": 100, "y": 200}) + + # Type text + await interface.playwright_exec("type", {"text": "Hello, world!"}) + + # Scroll + await interface.playwright_exec("scroll", {"delta_x": 0, "delta_y": -100}) + + # Web search + await interface.playwright_exec("web_search", {"query": "computer use agent"}) + """ + protocol = "https" if self.api_key else "http" + port = "8443" if self.api_key else "8000" + url = f"{protocol}://{self.ip_address}:{port}/playwright_exec" + + payload = {"command": command, "params": params or {}} + headers = {"Content-Type": "application/json"} + if self.api_key: + headers["X-API-Key"] = self.api_key + if self.vm_name: + headers["X-Container-Name"] = self.vm_name + + try: + async with aiohttp.ClientSession() as session: + async with session.post(url, json=payload, headers=headers) as response: + if response.status == 200: + return await response.json() + else: + error_text = await response.text() + return {"success": False, "error": error_text} + except Exception as e: + return {"success": False, "error": str(e)} + # Websocket Methods async def _keep_alive(self): """Keep the WebSocket connection alive with automatic reconnection.""" diff --git a/libs/xfce/Development.md b/libs/xfce/Development.md new file mode 100644 index 00000000..b67199e8 --- /dev/null +++ b/libs/xfce/Development.md @@ -0,0 +1,28 @@ +# Development + +## Building the Development Docker Image + +To build the XFCE container with local computer-server changes: + +```bash +cd libs/xfce +docker build -f Dockerfile.dev -t cua-xfce:dev ..
+``` + +The build context is set to the parent directory to allow copying the local `computer-server` source. + +## Tagging the Image + +To tag the dev image as latest: + +```bash +docker tag cua-xfce:dev cua-xfce:latest +``` + +## Running the Development Container + +```bash +docker run -p 6901:6901 -p 8000:8000 cua-xfce:dev +``` + +Access noVNC at: http://localhost:6901 diff --git a/libs/xfce/Dockerfile b/libs/xfce/Dockerfile index e83f6bd2..f1605181 100644 --- a/libs/xfce/Dockerfile +++ b/libs/xfce/Dockerfile @@ -107,6 +107,10 @@ RUN mkdir -p /home/cua/.cache && \ # Install computer-server using Python 3.12 pip RUN python3.12 -m pip install cua-computer-server +# Install playwright and Firefox dependencies +RUN python3.12 -m pip install playwright && \ + python3.12 -m playwright install --with-deps firefox + # Fix any cache files created by pip RUN chown -R cua:cua /home/cua/.cache diff --git a/libs/xfce/Dockerfile.dev b/libs/xfce/Dockerfile.dev new file mode 100644 index 00000000..c24efaf9 --- /dev/null +++ b/libs/xfce/Dockerfile.dev @@ -0,0 +1,159 @@ +# CUA Docker XFCE Container - Development Version +# Vanilla XFCE desktop with noVNC and computer-server (from local source) + +FROM ubuntu:22.04 + +# Avoid prompts from apt +ENV DEBIAN_FRONTEND=noninteractive + +# Set environment variables +ENV HOME=/home/cua +ENV DISPLAY=:1 +ENV VNC_PORT=5901 +ENV NOVNC_PORT=6901 +ENV API_PORT=8000 +ENV VNC_RESOLUTION=1024x768 +ENV VNC_COL_DEPTH=24 + +# Install system dependencies first (including sudo) +RUN apt-get update && apt-get install -y \ + # System utilities + sudo \ + unzip \ + zip \ + xdg-utils \ + # Desktop environment + xfce4 \ + xfce4-terminal \ + dbus-x11 \ + # VNC server + tigervnc-standalone-server \ + tigervnc-common \ + # noVNC dependencies + # python will be installed via deadsnakes as 3.12 \ + git \ + net-tools \ + netcat \ + supervisor \ + # Computer-server dependencies + # python-tk/dev for 3.12 will be installed later \ + gnome-screenshot \ + wmctrl \ + 
ffmpeg \ + socat \ + xclip \ + # Browser + wget \ + software-properties-common \ + # Build tools + build-essential \ + libncursesw5-dev \ + libssl-dev \ + libsqlite3-dev \ + tk-dev \ + libgdbm-dev \ + libc6-dev \ + libbz2-dev \ + libffi-dev \ + zlib1g-dev \ + && rm -rf /var/lib/apt/lists/* + +# Install Python 3.12 from deadsnakes (keep system python3 for apt) +RUN add-apt-repository -y ppa:deadsnakes/ppa && \ + apt-get update && apt-get install -y \ + python3.12 python3.12-venv python3.12-dev python3.12-tk && \ + python3.12 -m ensurepip --upgrade && \ + python3.12 -m pip install --upgrade pip setuptools wheel && \ + rm -rf /var/lib/apt/lists/* + +# Ensure 'python' points to Python 3.12 +RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 2 + +# Remove screensavers and power manager to avoid popups and lock screens +RUN apt-get remove -y \ + xfce4-power-manager \ + xfce4-power-manager-data \ + xfce4-power-manager-plugins \ + xfce4-screensaver \ + light-locker \ + xscreensaver \ + xscreensaver-data || true + +# Create user after sudo is installed +RUN useradd -m -s /bin/bash -G sudo cua && \ + echo "cua:cua" | chpasswd && \ + echo "cua ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers + +# Install Firefox from Mozilla PPA (snap-free) - inline to avoid script issues +RUN apt-get update && \ + add-apt-repository -y ppa:mozillateam/ppa && \ + echo 'Package: *\nPin: release o=LP-PPA-mozillateam\nPin-Priority: 1001' > /etc/apt/preferences.d/mozilla-firefox && \ + apt-get update && \ + apt-get install -y firefox && \ + echo 'pref("datareporting.policy.firstRunURL", "");\npref("datareporting.policy.dataSubmissionEnabled", false);\npref("datareporting.healthreport.service.enabled", false);\npref("datareporting.healthreport.uploadEnabled", false);\npref("trailhead.firstrun.branches", "nofirstrun-empty");\npref("browser.aboutwelcome.enabled", false);' > /usr/lib/firefox/browser/defaults/preferences/firefox.js && \ + update-alternatives --install 
/usr/bin/x-www-browser x-www-browser /usr/bin/firefox 100 && \ + update-alternatives --install /usr/bin/gnome-www-browser gnome-www-browser /usr/bin/firefox 100 && \ + rm -rf /var/lib/apt/lists/* + +# Install noVNC +RUN git clone https://github.com/novnc/noVNC.git /opt/noVNC && \ + git clone https://github.com/novnc/websockify /opt/noVNC/utils/websockify && \ + ln -s /opt/noVNC/vnc.html /opt/noVNC/index.html + +# Pre-create cache directory with correct ownership before pip install +RUN mkdir -p /home/cua/.cache && \ + chown -R cua:cua /home/cua/.cache + +# Copy local computer-server source and install it +COPY python/computer-server /tmp/computer-server +RUN python3.12 -m pip install /tmp/computer-server && \ + rm -rf /tmp/computer-server + +# Install playwright and Firefox dependencies +RUN python3.12 -m pip install playwright && \ + python3.12 -m playwright install --with-deps firefox + +# Fix any cache files created by pip +RUN chown -R cua:cua /home/cua/.cache + +# Copy startup scripts +COPY xfce/src/supervisor/ /etc/supervisor/conf.d/ +COPY xfce/src/scripts/ /usr/local/bin/ + +# Make scripts executable +RUN chmod +x /usr/local/bin/*.sh + +# Setup VNC +RUN chown -R cua:cua /home/cua +USER cua +WORKDIR /home/cua + +# Create VNC directory (no password needed with SecurityTypes None) +RUN mkdir -p $HOME/.vnc + +# Configure XFCE for first start +RUN mkdir -p $HOME/.config/xfce4/xfconf/xfce-perchannel-xml $HOME/.config/xfce4 $HOME/.config/autostart + +# Copy XFCE config to disable browser launching and welcome screens +COPY --chown=cua:cua xfce/src/xfce-config/helpers.rc $HOME/.config/xfce4/helpers.rc +COPY --chown=cua:cua xfce/src/xfce-config/xfce4-session.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-session.xml +COPY --chown=cua:cua xfce/src/xfce-config/xfce4-power-manager.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-power-manager.xml + +# Disable autostart for screensaver, lock screen, and power manager +RUN echo "[Desktop 
Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-tips-autostart.desktop && \ + echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-screensaver.desktop && \ + echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/light-locker.desktop && \ + echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-power-manager.desktop && \ + chown -R cua:cua $HOME/.config + +# Create storage and shared directories, and Firefox cache directory +RUN mkdir -p $HOME/storage $HOME/shared $HOME/.cache/dconf $HOME/.mozilla/firefox && \ + chown -R cua:cua $HOME/storage $HOME/shared $HOME/.cache $HOME/.mozilla $HOME/.vnc + +USER root + +# Expose ports +EXPOSE $VNC_PORT $NOVNC_PORT $API_PORT + +# Start services via supervisor +CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"] diff --git a/libs/xfce/src/scripts/start-computer-server.sh b/libs/xfce/src/scripts/start-computer-server.sh index bc27a3db..1e52e536 100644 --- a/libs/xfce/src/scripts/start-computer-server.sh +++ b/libs/xfce/src/scripts/start-computer-server.sh @@ -10,4 +10,4 @@ echo "X server is ready" # Start computer-server export DISPLAY=:1 -python3 -m computer_server --port ${API_PORT:-8000} +python -m computer_server --port ${API_PORT:-8000}