Unmerged Dockerfile, added Dockerfile.dev that mounts the local computer-server

2026-04-24 15:38:59 -05:00 · 2025-12-03 08:24:21 -08:00
parent ddfb53e79f
commit 6fa66c18cc
6 changed files with 179 additions and 1166 deletions
@@ -0,0 +1,20 @@
+# Development
+
+## Building the Development Docker Image
+
+To build the XFCE container with local computer-server changes:
+
+```bash
+cd libs/xfce
+docker build -f Dockerfile.dev -t cua-xfce:dev ..
+```
+
+The build context is set to the parent directory (`libs/`) so that `Dockerfile.dev` can copy the local `computer-server` source from `libs/python/computer-server`.
+
+## Running the Development Container
+
+```bash
+docker run -p 6901:6901 -p 8000:8000 cua-xfce:dev
+```
+
+Access noVNC at: http://localhost:6901
@@ -107,12 +107,6 @@ RUN mkdir -p /home/cua/.cache && \
 # Install computer-server using Python 3.12 pip
 RUN python3.12 -m pip install cua-computer-server

-# Copy browser.py and updated main.py from local source (to include browser tool)
-# These files need to be in the same directory as the Dockerfile when building
-COPY browser.py /tmp/browser.py
-COPY main.py /tmp/main.py
-RUN python3.12 -c "import shutil; import os; cs_dir = '/usr/local/lib/python3.12/dist-packages/computer_server'; shutil.copy('/tmp/browser.py', f'{cs_dir}/browser.py'); shutil.copy('/tmp/main.py', f'{cs_dir}/main.py'); print('Copied browser.py and main.py')" && rm /tmp/browser.py /tmp/main.py
-
 # Install playwright and Firefox dependencies
 RUN python3.12 -m pip install playwright && \
    python3.12 -m playwright install --with-deps firefox
@@ -0,0 +1,159 @@
+# CUA Docker XFCE Container - Development Version
+# Vanilla XFCE desktop with noVNC and computer-server (from local source)
+
+FROM ubuntu:22.04
+
+# Avoid prompts from apt
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Set environment variables
+ENV HOME=/home/cua
+ENV DISPLAY=:1
+ENV VNC_PORT=5901
+ENV NOVNC_PORT=6901
+ENV API_PORT=8000
+ENV VNC_RESOLUTION=1024x768
+ENV VNC_COL_DEPTH=24
+
+# Install system dependencies first (including sudo)
+RUN apt-get update && apt-get install -y \
+    # System utilities
+    sudo \
+    unzip \
+    zip \
+    xdg-utils \
+    # Desktop environment
+    xfce4 \
+    xfce4-terminal \
+    dbus-x11 \
+    # VNC server
+    tigervnc-standalone-server \
+    tigervnc-common \
+    # noVNC dependencies
+    # python will be installed via deadsnakes as 3.12 \
+    git \
+    net-tools \
+    netcat \
+    supervisor \
+    # Computer-server dependencies
+    # python-tk/dev for 3.12 will be installed later \
+    gnome-screenshot \
+    wmctrl \
+    ffmpeg \
+    socat \
+    xclip \
+    # Browser
+    wget \
+    software-properties-common \
+    # Build tools
+    build-essential \
+    libncursesw5-dev \
+    libssl-dev \
+    libsqlite3-dev \
+    tk-dev \
+    libgdbm-dev \
+    libc6-dev \
+    libbz2-dev \
+    libffi-dev \
+    zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python 3.12 from deadsnakes (keep system python3 for apt)
+RUN add-apt-repository -y ppa:deadsnakes/ppa && \
+    apt-get update && apt-get install -y \
+    python3.12 python3.12-venv python3.12-dev python3.12-tk && \
+    python3.12 -m ensurepip --upgrade && \
+    python3.12 -m pip install --upgrade pip setuptools wheel && \
+    rm -rf /var/lib/apt/lists/*
+
+# Ensure 'python' points to Python 3.12
+RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 2
+
+# Remove screensavers and power manager to avoid popups and lock screens
+RUN apt-get remove -y \
+    xfce4-power-manager \
+    xfce4-power-manager-data \
+    xfce4-power-manager-plugins \
+    xfce4-screensaver \
+    light-locker \
+    xscreensaver \
+    xscreensaver-data || true
+
+# Create user after sudo is installed
+RUN useradd -m -s /bin/bash -G sudo cua && \
+    echo "cua:cua" | chpasswd && \
+    echo "cua ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers
+
+# Install Firefox from Mozilla PPA (snap-free) - inline to avoid script issues
+RUN apt-get update && \
+    add-apt-repository -y ppa:mozillateam/ppa && \
+    echo 'Package: *\nPin: release o=LP-PPA-mozillateam\nPin-Priority: 1001' > /etc/apt/preferences.d/mozilla-firefox && \
+    apt-get update && \
+    apt-get install -y firefox && \
+    echo 'pref("datareporting.policy.firstRunURL", "");\npref("datareporting.policy.dataSubmissionEnabled", false);\npref("datareporting.healthreport.service.enabled", false);\npref("datareporting.healthreport.uploadEnabled", false);\npref("trailhead.firstrun.branches", "nofirstrun-empty");\npref("browser.aboutwelcome.enabled", false);' > /usr/lib/firefox/browser/defaults/preferences/firefox.js && \
+    update-alternatives --install /usr/bin/x-www-browser x-www-browser /usr/bin/firefox 100 && \
+    update-alternatives --install /usr/bin/gnome-www-browser gnome-www-browser /usr/bin/firefox 100 && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install noVNC
+RUN git clone https://github.com/novnc/noVNC.git /opt/noVNC && \
+    git clone https://github.com/novnc/websockify /opt/noVNC/utils/websockify && \
+    ln -s /opt/noVNC/vnc.html /opt/noVNC/index.html
+
+# Pre-create cache directory with correct ownership before pip install
+RUN mkdir -p /home/cua/.cache && \
+    chown -R cua:cua /home/cua/.cache
+
+# Copy local computer-server source and install it (COPY sources are relative to the libs/ build context and cannot use ../)
+COPY python/computer-server /tmp/computer-server
+RUN python3.12 -m pip install /tmp/computer-server && \
+    rm -rf /tmp/computer-server
+
+# Install playwright and Firefox dependencies
+RUN python3.12 -m pip install playwright && \
+    python3.12 -m playwright install --with-deps firefox
+
+# Fix any cache files created by pip
+RUN chown -R cua:cua /home/cua/.cache
+
+# Copy startup scripts (sources are relative to the libs/ build context, hence the xfce/ prefix)
+COPY xfce/src/supervisor/ /etc/supervisor/conf.d/
+COPY xfce/src/scripts/ /usr/local/bin/
+
+# Make scripts executable
+RUN chmod +x /usr/local/bin/*.sh
+
+# Setup VNC
+RUN chown -R cua:cua /home/cua
+USER cua
+WORKDIR /home/cua
+
+# Create VNC directory (no password needed with SecurityTypes None)
+RUN mkdir -p $HOME/.vnc
+
+# Configure XFCE for first start
+RUN mkdir -p $HOME/.config/xfce4/xfconf/xfce-perchannel-xml $HOME/.config/xfce4 $HOME/.config/autostart
+
+# Copy XFCE config to disable browser launching and welcome screens (sources relative to the libs/ build context)
+COPY --chown=cua:cua xfce/src/xfce-config/helpers.rc $HOME/.config/xfce4/helpers.rc
+COPY --chown=cua:cua xfce/src/xfce-config/xfce4-session.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-session.xml
+COPY --chown=cua:cua xfce/src/xfce-config/xfce4-power-manager.xml $HOME/.config/xfce4/xfconf/xfce-perchannel-xml/xfce4-power-manager.xml
+
+# Disable autostart for screensaver, lock screen, and power manager
+RUN echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-tips-autostart.desktop && \
+    echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-screensaver.desktop && \
+    echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/light-locker.desktop && \
+    echo "[Desktop Entry]\nHidden=true" > $HOME/.config/autostart/xfce4-power-manager.desktop && \
+    chown -R cua:cua $HOME/.config
+
+# Create storage and shared directories, and Firefox cache directory
+RUN mkdir -p $HOME/storage $HOME/shared $HOME/.cache/dconf $HOME/.mozilla/firefox && \
+    chown -R cua:cua $HOME/storage $HOME/shared $HOME/.cache $HOME/.mozilla $HOME/.vnc
+
+USER root
+
+# Expose ports
+EXPOSE $VNC_PORT $NOVNC_PORT $API_PORT
+
+# Start services via supervisor
+CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
@@ -1,32 +0,0 @@
-# Building the XFCE Docker Image
-
-## Required Files for Build
-
-The Dockerfile requires these files to be present in the `libs/xfce/` directory:
-
- `browser.py` - Copy from `libs/python/computer-server/computer_server/browser.py`
- `main.py` - Copy from `libs/python/computer-server/computer_server/main.py`
-
-These files are copied into the container to include the browser tool functionality
-that isn't yet in the published PyPI package.
-
-## Before Building
-
-```bash
-# Copy the latest browser tool files
-cp libs/python/computer-server/computer_server/browser.py libs/xfce/
-cp libs/python/computer-server/computer_server/main.py libs/xfce/
-```
-
-## Build Command
-
-```bash
-cd libs/xfce
-docker build -t cua-xfce .
-```
-
-## Note
-
-Once the browser tool is included in the published `cua-computer-server` package,
-these temporary file copies can be removed and the Dockerfile can be simplified.
-
@@ -1,308 +0,0 @@
-"""
-Browser manager using Playwright for programmatic browser control.
-This allows agents to control a browser that runs visibly on the XFCE desktop.
-"""
-
-import asyncio
-import logging
-import os
-from typing import Any, Dict, Optional
-
-try:
-    from playwright.async_api import async_playwright, Browser, BrowserContext, Page
-except ImportError:
-    async_playwright = None
-    Browser = None
-    BrowserContext = None
-    Page = None
-
-logger = logging.getLogger(__name__)
-
-
-class BrowserManager:
-    """
-    Manages a Playwright browser instance that runs visibly on the XFCE desktop.
-    Uses persistent context to maintain cookies and sessions.
-    """
-
-    def __init__(self):
-        """Initialize the BrowserManager."""
-        self.playwright = None
-        self.browser: Optional[Browser] = None
-        self.context: Optional[BrowserContext] = None
-        self.page: Optional[Page] = None
-        self._initialized = False
-        self._initialization_error: Optional[str] = None
-        self._lock = asyncio.Lock()
-
-    async def _ensure_initialized(self):
-        """Ensure the browser is initialized."""
-        # Check if browser was closed and needs reinitialization
-        if self._initialized:
-            try:
-                # Check if context is still valid by trying to access it
-                if self.context:
-                    # Try to get pages - this will raise if context is closed
-                    _ = self.context.pages
-                    # If we get here, context is still alive
-                    return
-                else:
-                    # Context was closed, need to reinitialize
-                    self._initialized = False
-                    logger.warning("Browser context was closed, will reinitialize...")
-            except Exception as e:
-                # Context is dead, need to reinitialize
-                logger.warning(f"Browser context is dead ({e}), will reinitialize...")
-                self._initialized = False
-                self.context = None
-                self.page = None
-                # Clean up playwright if it exists
-                if self.playwright:
-                    try:
-                        await self.playwright.stop()
-                    except Exception:
-                        pass
-                    self.playwright = None
-
-        async with self._lock:
-            # Double-check after acquiring lock (another thread might have initialized it)
-            if self._initialized:
-                try:
-                    if self.context:
-                        _ = self.context.pages
-                        return
-                except Exception:
-                    self._initialized = False
-                    self.context = None
-                    self.page = None
-                    if self.playwright:
-                        try:
-                            await self.playwright.stop()
-                        except Exception:
-                            pass
-                        self.playwright = None
-
-            if async_playwright is None:
-                raise RuntimeError(
-                    "playwright is not installed. Please install it with: pip install playwright && playwright install --with-deps firefox"
-                )
-
-            try:
-                # Get display from environment or default to :1
-                display = os.environ.get("DISPLAY", ":1")
-                logger.info(f"Initializing browser with DISPLAY={display}")
-
-                # Start playwright
-                self.playwright = await async_playwright().start()
-
-                # Launch Firefox with persistent context (keeps cookies/sessions)
-                # headless=False is CRITICAL so the visual agent can see it
-                user_data_dir = os.path.join(os.path.expanduser("~"), ".playwright-firefox")
-                os.makedirs(user_data_dir, exist_ok=True)
-
-                # launch_persistent_context returns a BrowserContext, not a Browser
-                # Note: Removed --kiosk mode so the desktop remains visible
-                self.context = await self.playwright.firefox.launch_persistent_context(
-                    user_data_dir=user_data_dir,
-                    headless=False,  # CRITICAL: visible for visual agent
-                    viewport={"width": 1024, "height": 768},
-                    # Removed --kiosk to allow desktop visibility
-                )
-
-                # Get the first page or create one
-                pages = self.context.pages
-                if pages:
-                    self.page = pages[0]
-                else:
-                    self.page = await self.context.new_page()
-
-                self._initialized = True
-                logger.info("Browser initialized successfully")
-
-            except Exception as e:
-                logger.error(f"Failed to initialize browser: {e}")
-                import traceback
-                logger.error(traceback.format_exc())
-                # Don't raise - return error in execute_command instead
-                self._initialization_error = str(e)
-                raise
-
-    async def execute_command(self, cmd: str, params: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Execute a browser command.
-
-        Args:
-            cmd: Command name (visit_url, click, type, scroll, web_search)
-            params: Command parameters
-
-        Returns:
-            Result dictionary with success status and any data
-        """
-        try:
-            await self._ensure_initialized()
-        except Exception as e:
-            error_msg = getattr(self, '_initialization_error', None) or str(e)
-            logger.error(f"Browser initialization failed: {error_msg}")
-            return {
-                "success": False,
-                "error": f"Browser initialization failed: {error_msg}. "
-                         f"Make sure Playwright and Firefox are installed, and DISPLAY is set correctly."
-            }
-
-        # Ensure browser is still initialized (in case it was manually closed)
-        # This will automatically reinitialize if the browser was closed
-        await self._ensure_initialized()
-        
-        # Check if page is still valid
-        page_valid = False
-        try:
-            if self.page is not None:
-                # Try to access page.url to check if it's still valid
-                _ = self.page.url
-                page_valid = True
-        except Exception as e:
-            logger.warning(f"Page is invalid: {e}, will get a new page...")
-            self.page = None
-        
-        # Get a valid page if we don't have one
-        if not page_valid or self.page is None:
-            try:
-                pages = self.context.pages
-                if pages:
-                    # Find first non-closed page
-                    for p in pages:
-                        try:
-                            if not p.is_closed():
-                                self.page = p
-                                logger.info("Reusing existing open page")
-                                page_valid = True
-                                break
-                        except Exception:
-                            continue
-                
-                # If no valid page found, create a new one
-                if not page_valid:
-                    self.page = await self.context.new_page()
-                    logger.info("Created new page")
-            except Exception as e:
-                logger.error(f"Failed to get new page: {e}, browser may be closed")
-                # Browser was closed - reinitialize it
-                try:
-                    logger.info("Browser was closed manually, reinitializing...")
-                    self._initialized = False
-                    self.context = None
-                    self.page = None
-                    if self.playwright:
-                        try:
-                            await self.playwright.stop()
-                        except Exception:
-                            pass
-                        self.playwright = None
-                    
-                    # Reinitialize
-                    await self._ensure_initialized()
-                    # Get or create a page
-                    pages = self.context.pages
-                    if pages:
-                        self.page = pages[0]
-                    else:
-                        self.page = await self.context.new_page()
-                    logger.info("Browser reopened successfully after manual closure")
-                except Exception as reinit_error:
-                    logger.error(f"Failed to reinitialize browser: {reinit_error}")
-                    import traceback
-                    logger.error(traceback.format_exc())
-                    return {"success": False, "error": f"Browser was closed and cannot be recovered: {reinit_error}"}
-
-        try:
-            if cmd == "visit_url":
-                url = params.get("url")
-                if not url:
-                    return {"success": False, "error": "url parameter is required"}
-                await self.page.goto(url, wait_until="domcontentloaded", timeout=30000)
-                return {"success": True, "url": self.page.url}
-
-            elif cmd == "click":
-                x = params.get("x")
-                y = params.get("y")
-                if x is None or y is None:
-                    return {"success": False, "error": "x and y parameters are required"}
-                await self.page.mouse.click(x, y)
-                return {"success": True}
-
-            elif cmd == "type":
-                text = params.get("text")
-                if text is None:
-                    return {"success": False, "error": "text parameter is required"}
-                await self.page.keyboard.type(text)
-                return {"success": True}
-
-            elif cmd == "scroll":
-                delta_x = params.get("delta_x", 0)
-                delta_y = params.get("delta_y", 0)
-                await self.page.mouse.wheel(delta_x, delta_y)
-                return {"success": True}
-
-            elif cmd == "web_search":
-                query = params.get("query")
-                if not query:
-                    return {"success": False, "error": "query parameter is required"}
-                # Navigate to Google search
-                search_url = f"https://www.google.com/search?q={query}"
-                await self.page.goto(search_url, wait_until="domcontentloaded", timeout=30000)
-                return {"success": True, "url": self.page.url}
-
-            else:
-                return {"success": False, "error": f"Unknown command: {cmd}"}
-
-        except Exception as e:
-            logger.error(f"Error executing command {cmd}: {e}")
-            import traceback
-            logger.error(traceback.format_exc())
-            # If page was closed due to error, try to recover
-            if "closed" in str(e).lower() and self.context:
-                try:
-                    pages = self.context.pages
-                    if pages:
-                        self.page = pages[0]
-                        logger.info("Recovered page after error")
-                    else:
-                        self.page = await self.context.new_page()
-                        logger.info("Created new page after error")
-                except Exception as recover_error:
-                    logger.error(f"Failed to recover page: {recover_error}")
-            return {"success": False, "error": str(e)}
-
-    async def close(self):
-        """Close the browser and cleanup resources."""
-        async with self._lock:
-            try:
-                if self.context:
-                    await self.context.close()
-                    self.context = None
-                if self.browser:
-                    await self.browser.close()
-                    self.browser = None
-
-                if self.playwright:
-                    await self.playwright.stop()
-                    self.playwright = None
-
-                self.page = None
-                self._initialized = False
-                logger.info("Browser closed successfully")
-            except Exception as e:
-                logger.error(f"Error closing browser: {e}")
-
-
-# Global instance
-_browser_manager: Optional[BrowserManager] = None
-
-
-def get_browser_manager() -> BrowserManager:
-    """Get or create the global BrowserManager instance."""
-    global _browser_manager
-    if _browser_manager is None:
-        _browser_manager = BrowserManager()
-    return _browser_manager
-
@@ -1,820 +0,0 @@
-import asyncio
-import hashlib
-import inspect
-import json
-import logging
-import os
-import platform
-import time
-import traceback
-from contextlib import redirect_stderr, redirect_stdout
-from io import StringIO
-from typing import Any, Dict, List, Literal, Optional, Union, cast
-
-import aiohttp
-import uvicorn
-from fastapi import (
-    FastAPI,
-    Header,
-    HTTPException,
-    Request,
-    WebSocket,
-    WebSocketDisconnect,
-)
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, StreamingResponse
-
-from .handlers.factory import HandlerFactory
-from .browser import get_browser_manager
-
-# Authentication session TTL (in seconds). Override via env var CUA_AUTH_TTL_SECONDS. Default: 60s
-AUTH_SESSION_TTL_SECONDS: int = int(os.environ.get("CUA_AUTH_TTL_SECONDS", "60"))
-
-try:
-    from agent import ComputerAgent
-
-    HAS_AGENT = True
-except ImportError:
-    HAS_AGENT = False
-
-# Set up logging with more detail
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
-
-# Configure WebSocket with larger message size
-WEBSOCKET_MAX_SIZE = 1024 * 1024 * 10  # 10MB limit
-
-# Configure application with WebSocket settings
-app = FastAPI(
-    title="Computer API",
-    description="API for the Computer project",
-    version="0.1.0",
-    websocket_max_size=WEBSOCKET_MAX_SIZE,
-)
-
-# CORS configuration
-origins = ["*"]
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=origins,
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-protocol_version = 1
-try:
-    from importlib.metadata import version
-
-    package_version = version("cua-computer-server")
-except Exception:
-    # Fallback for cases where package is not installed or importlib.metadata is not available
-    try:
-        import pkg_resources
-
-        package_version = pkg_resources.get_distribution("cua-computer-server").version
-    except Exception:
-        package_version = "unknown"
-
-(
-    accessibility_handler,
-    automation_handler,
-    diorama_handler,
-    file_handler,
-    desktop_handler,
-    window_handler,
-) = HandlerFactory.create_handlers()
-handlers = {
-    "version": lambda: {"protocol": protocol_version, "package": package_version},
-    # App-Use commands
-    "diorama_cmd": diorama_handler.diorama_cmd,
-    # Accessibility commands
-    "get_accessibility_tree": accessibility_handler.get_accessibility_tree,
-    "find_element": accessibility_handler.find_element,
-    # Shell commands
-    "run_command": automation_handler.run_command,
-    # File system commands
-    "file_exists": file_handler.file_exists,
-    "directory_exists": file_handler.directory_exists,
-    "list_dir": file_handler.list_dir,
-    "read_text": file_handler.read_text,
-    "write_text": file_handler.write_text,
-    "read_bytes": file_handler.read_bytes,
-    "write_bytes": file_handler.write_bytes,
-    "get_file_size": file_handler.get_file_size,
-    "delete_file": file_handler.delete_file,
-    "create_dir": file_handler.create_dir,
-    "delete_dir": file_handler.delete_dir,
-    # Desktop commands
-    "get_desktop_environment": desktop_handler.get_desktop_environment,
-    "set_wallpaper": desktop_handler.set_wallpaper,
-    # Window management
-    "open": window_handler.open,
-    "launch": window_handler.launch,
-    "get_current_window_id": window_handler.get_current_window_id,
-    "get_application_windows": window_handler.get_application_windows,
-    "get_window_name": window_handler.get_window_name,
-    "get_window_size": window_handler.get_window_size,
-    "get_window_position": window_handler.get_window_position,
-    "set_window_size": window_handler.set_window_size,
-    "set_window_position": window_handler.set_window_position,
-    "maximize_window": window_handler.maximize_window,
-    "minimize_window": window_handler.minimize_window,
-    "activate_window": window_handler.activate_window,
-    "close_window": window_handler.close_window,
-    # Mouse commands
-    "mouse_down": automation_handler.mouse_down,
-    "mouse_up": automation_handler.mouse_up,
-    "left_click": automation_handler.left_click,
-    "right_click": automation_handler.right_click,
-    "double_click": automation_handler.double_click,
-    "move_cursor": automation_handler.move_cursor,
-    "drag_to": automation_handler.drag_to,
-    "drag": automation_handler.drag,
-    # Keyboard commands
-    "key_down": automation_handler.key_down,
-    "key_up": automation_handler.key_up,
-    "type_text": automation_handler.type_text,
-    "press_key": automation_handler.press_key,
-    "hotkey": automation_handler.hotkey,
-    # Scrolling actions
-    "scroll": automation_handler.scroll,
-    "scroll_down": automation_handler.scroll_down,
-    "scroll_up": automation_handler.scroll_up,
-    # Screen actions
-    "screenshot": automation_handler.screenshot,
-    "get_cursor_position": automation_handler.get_cursor_position,
-    "get_screen_size": automation_handler.get_screen_size,
-    # Clipboard actions
-    "copy_to_clipboard": automation_handler.copy_to_clipboard,
-    "set_clipboard": automation_handler.set_clipboard,
-}
-
-
-class AuthenticationManager:
-    def __init__(self):
-        self.sessions: Dict[str, Dict[str, Any]] = {}
-        self.container_name = os.environ.get("CONTAINER_NAME")
-
-    def _hash_credentials(self, container_name: str, api_key: str) -> str:
-        """Create a hash of container name and API key for session identification"""
-        combined = f"{container_name}:{api_key}"
-        return hashlib.sha256(combined.encode()).hexdigest()
-
-    def _is_session_valid(self, session_data: Dict[str, Any]) -> bool:
-        """Check if a session is still valid based on expiration time"""
-        if not session_data.get("valid", False):
-            return False
-
-        expires_at = session_data.get("expires_at", 0)
-        return time.time() < expires_at
-
-    async def auth(self, container_name: str, api_key: str) -> bool:
-        """Authenticate container name and API key, using cached sessions when possible"""
-        # If no CONTAINER_NAME is set, always allow access (local development)
-        if not self.container_name:
-            logger.info(
-                "No CONTAINER_NAME set in environment. Allowing access (local development mode)"
-            )
-            return True
-
-        # Layer 1: VM Identity Verification
-        if container_name != self.container_name:
-            logger.warning(
-                f"VM name mismatch. Expected: {self.container_name}, Got: {container_name}"
-            )
-            return False
-
-        # Create hash for session lookup
-        session_hash = self._hash_credentials(container_name, api_key)
-
-        # Check if we have a valid cached session
-        if session_hash in self.sessions:
-            session_data = self.sessions[session_hash]
-            if self._is_session_valid(session_data):
-                logger.info(f"Using cached authentication for container: {container_name}")
-                return session_data["valid"]
-            else:
-                # Remove expired session
-                del self.sessions[session_hash]
-
-        # No valid cached session, authenticate with API
-        logger.info(f"Authenticating with TryCUA API for container: {container_name}")
-
-        try:
-            async with aiohttp.ClientSession() as session:
-                headers = {"Authorization": f"Bearer {api_key}"}
-
-                async with session.get(
-                    f"https://www.cua.ai/api/vm/auth?container_name={container_name}",
-                    headers=headers,
-                ) as resp:
-                    is_valid = resp.status == 200 and bool((await resp.text()).strip())
-
-                    # Cache the result with configurable expiration
-                    self.sessions[session_hash] = {
-                        "valid": is_valid,
-                        "expires_at": time.time() + AUTH_SESSION_TTL_SECONDS,
-                    }
-
-                    if is_valid:
-                        logger.info(f"Authentication successful for container: {container_name}")
-                    else:
-                        logger.warning(
-                            f"Authentication failed for container: {container_name}. Status: {resp.status}"
-                        )
-
-                    return is_valid
-
-        except aiohttp.ClientError as e:
-            logger.error(f"Failed to validate API key with TryCUA API: {str(e)}")
-            # Cache failed result to avoid repeated requests
-            self.sessions[session_hash] = {
-                "valid": False,
-                "expires_at": time.time() + AUTH_SESSION_TTL_SECONDS,
-            }
-            return False
-        except Exception as e:
-            logger.error(f"Unexpected error during authentication: {str(e)}")
-            # Cache failed result to avoid repeated requests
-            self.sessions[session_hash] = {
-                "valid": False,
-                "expires_at": time.time() + AUTH_SESSION_TTL_SECONDS,
-            }
-            return False
-
-
-class ConnectionManager:
-    def __init__(self):
-        self.active_connections: List[WebSocket] = []
-
-    async def connect(self, websocket: WebSocket):
-        await websocket.accept()
-        self.active_connections.append(websocket)
-
-    def disconnect(self, websocket: WebSocket):
-        self.active_connections.remove(websocket)
-
-
-manager = ConnectionManager()
-auth_manager = AuthenticationManager()
-
-
-@app.get("/status")
-async def status():
-    sys = platform.system().lower()
-    # get os type
-    if "darwin" in sys or sys == "macos" or sys == "mac":
-        os_type = "macos"
-    elif "windows" in sys:
-        os_type = "windows"
-    else:
-        os_type = "linux"
-    # get computer-server features
-    features = []
-    if HAS_AGENT:
-        features.append("agent")
-    return {"status": "ok", "os_type": os_type, "features": features}
-
-
-async def _authenticate_websocket(
-    websocket: WebSocket, server_container_name: str
-) -> Optional[str]:
-    """Run the cloud authentication handshake on an accepted WebSocket.
-
-    Returns ``None`` when the client authenticated successfully (the success
-    reply has already been sent), otherwise the error message to report to the
-    client before the connection is closed.
-    """
-    try:
-        logger.info(
-            f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Waiting for authentication..."
-        )
-
-        # The first message on the socket must be the authentication command.
-        auth_data = await websocket.receive_json()
-        if auth_data.get("command") != "authenticate":
-            return "First message must be authentication"
-
-        # Extract credentials
-        credentials = auth_data.get("params", {})
-        client_api_key = credentials.get("api_key")
-        client_container_name = credentials.get("container_name")
-
-        if not client_api_key:
-            return "API key required"
-        if not client_container_name:
-            return "Container name required"
-
-        # Use AuthenticationManager for validation
-        if not await auth_manager.auth(client_container_name, client_api_key):
-            return "Authentication failed"
-
-        logger.info(f"Authentication successful for VM: {client_container_name}")
-        await websocket.send_json({"success": True, "message": "Authentication successful"})
-        return None
-    except Exception as e:
-        # Malformed handshake payloads and transport errors are all reported
-        # to the client as a generic authentication failure.
-        logger.error(f"Error during authentication handshake: {str(e)}")
-        return "Authentication failed"
-
-
-async def _reject_websocket(websocket: WebSocket, error: str) -> None:
-    """Report ``error`` to the client, close the socket and stop tracking it."""
-    try:
-        await websocket.send_json({"success": False, "error": error})
-        await websocket.close()
-    except Exception as e:
-        # The peer may already be gone; delivering the rejection is best-effort.
-        logger.warning(f"Could not deliver rejection to client: {str(e)}")
-    finally:
-        # Always unregister, even when the rejection could not be delivered.
-        manager.disconnect(websocket)
-
-
-@app.websocket("/ws", name="websocket_endpoint")
-async def websocket_endpoint(websocket: WebSocket):
-    """Serve handler commands over a WebSocket connection.
-
-    When the CONTAINER_NAME environment variable is set, the client must first
-    send an ``authenticate`` command carrying ``api_key`` and ``container_name``.
-    Afterwards every message is a JSON object ``{"command": ..., "params": {...}}``
-    that is dispatched to the matching entry of ``handlers``; the reply is the
-    handler result merged with ``{"success": True}``, or
-    ``{"success": False, "error": ...}`` on failure.
-    """
-    # WebSocket message size is configured at the app or endpoint level, not on the instance
-    await manager.connect(websocket)
-
-    # Check if CONTAINER_NAME is set (indicating cloud provider)
-    server_container_name = os.environ.get("CONTAINER_NAME")
-
-    # If cloud provider, perform authentication handshake
-    if server_container_name:
-        auth_error = await _authenticate_websocket(websocket, server_container_name)
-        if auth_error is not None:
-            await _reject_websocket(websocket, auth_error)
-            return
-
-    try:
-        while True:
-            try:
-                data = await websocket.receive_json()
-                command = data.get("command")
-                # Treat a missing or explicit null "params" as "no parameters".
-                params = data.get("params") or {}
-
-                if command not in handlers:
-                    await websocket.send_json(
-                        {"success": False, "error": f"Unknown command: {command}"}
-                    )
-                    continue
-
-                try:
-                    # Filter params to only include those accepted by the handler function
-                    handler_func = handlers[command]
-                    sig = inspect.signature(handler_func)
-                    filtered_params = {k: v for k, v in params.items() if k in sig.parameters}
-
-                    # Handle both sync and async functions
-                    if asyncio.iscoroutinefunction(handler_func):
-                        result = await handler_func(**filtered_params)
-                    else:
-                        # Run sync functions in thread pool to avoid blocking event loop
-                        result = await asyncio.to_thread(handler_func, **filtered_params)
-                    await websocket.send_json({"success": True, **result})
-                except Exception as cmd_error:
-                    logger.error(f"Error executing command {command}: {str(cmd_error)}")
-                    logger.error(traceback.format_exc())
-                    await websocket.send_json({"success": False, "error": str(cmd_error)})
-
-            except WebSocketDisconnect:
-                # Leave the message loop; handled by the outer handler below.
-                raise
-            except json.JSONDecodeError as json_err:
-                logger.error(f"JSON decode error: {str(json_err)}")
-                await websocket.send_json(
-                    {"success": False, "error": f"Invalid JSON: {str(json_err)}"}
-                )
-            except Exception as loop_error:
-                logger.error(f"Error in message loop: {str(loop_error)}")
-                logger.error(traceback.format_exc())
-                await websocket.send_json({"success": False, "error": str(loop_error)})
-
-    except WebSocketDisconnect:
-        logger.info("Client disconnected")
-    except Exception as e:
-        logger.error(f"Fatal error in websocket connection: {str(e)}")
-        logger.error(traceback.format_exc())
-        try:
-            await websocket.close()
-        except Exception as close_error:
-            # Best-effort close: the connection may already be torn down.
-            logger.debug(f"Ignoring error while closing websocket: {str(close_error)}")
-    finally:
-        # Runs exactly once for every way out of the message loop (including
-        # task cancellation), so the connection never stays registered.
-        manager.disconnect(websocket)
-
-
-@app.post("/cmd")
-async def cmd_endpoint(
-    request: Request,
-    container_name: Optional[str] = Header(None, alias="X-Container-Name"),
-    api_key: Optional[str] = Header(None, alias="X-API-Key"),
-):
-    """
-    Backup endpoint for when WebSocket connections fail.
-    Accepts commands via HTTP POST with streaming response.
-
-    Headers:
-    - X-Container-Name: Container name for cloud authentication
-    - X-API-Key: API key for cloud authentication
-
-    Body:
-    {
-        "command": "command_name",
-        "params": {...}
-    }
-
-    Response:
-    A single ``data: <json>`` chunk holding either the handler result merged
-    with ``{"success": true}`` or ``{"success": false, "error": "..."}``.
-
-    Raises:
-    - HTTPException 400: invalid body, missing command or unknown command
-    - HTTPException 401: missing or rejected credentials (only when the
-      CONTAINER_NAME environment variable is set)
-    """
-    # Parse request body
-    try:
-        body = await request.json()
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")
-
-    # Valid JSON that is not an object (e.g. a list) cannot carry a command.
-    if not isinstance(body, dict):
-        raise HTTPException(status_code=400, detail="Invalid JSON body: expected a JSON object")
-
-    command = body.get("command")
-    # Treat a missing or explicit null "params" as "no parameters".
-    params = body.get("params") or {}
-
-    if not command:
-        raise HTTPException(status_code=400, detail="Command is required")
-
-    # Check if CONTAINER_NAME is set (indicating cloud provider)
-    server_container_name = os.environ.get("CONTAINER_NAME")
-
-    # If cloud provider, perform authentication
-    if server_container_name:
-        logger.info(
-            f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication..."
-        )
-
-        # Validate required headers
-        if not container_name:
-            raise HTTPException(status_code=401, detail="Container name required")
-
-        if not api_key:
-            raise HTTPException(status_code=401, detail="API key required")
-
-        # Validate with AuthenticationManager
-        is_authenticated = await auth_manager.auth(container_name, api_key)
-        if not is_authenticated:
-            raise HTTPException(status_code=401, detail="Authentication failed")
-
-    if command not in handlers:
-        raise HTTPException(status_code=400, detail=f"Unknown command: {command}")
-
-    async def generate_response():
-        """Generate streaming response for the command execution"""
-        try:
-            # Filter params to only include those accepted by the handler function
-            handler_func = handlers[command]
-            sig = inspect.signature(handler_func)
-            filtered_params = {k: v for k, v in params.items() if k in sig.parameters}
-
-            # Handle both sync and async functions
-            if asyncio.iscoroutinefunction(handler_func):
-                result = await handler_func(**filtered_params)
-            else:
-                # Run sync functions in thread pool to avoid blocking event loop
-                result = await asyncio.to_thread(handler_func, **filtered_params)
-
-            # Stream the successful result
-            response_data = {"success": True, **result}
-            yield f"data: {json.dumps(response_data)}\n\n"
-
-        except Exception as cmd_error:
-            # Handler failures are reported in-band: the HTTP status has
-            # already been committed once streaming starts.
-            logger.error(f"Error executing command {command}: {str(cmd_error)}")
-            logger.error(traceback.format_exc())
-
-            # Stream the error result
-            error_data = {"success": False, "error": str(cmd_error)}
-            yield f"data: {json.dumps(error_data)}\n\n"
-
-    return StreamingResponse(
-        generate_response(),
-        media_type="text/plain",
-        headers={
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-        },
-    )
-
-
-@app.post("/responses")
-async def agent_response_endpoint(
-    request: Request,
-    api_key: Optional[str] = Header(None, alias="X-API-Key"),
-):
-    """
-    Minimal proxy that runs ComputerAgent and returns its aggregated output.
-
-    The agent loop is consumed until a result leaves no computer call without
-    a matching computer_call_output.
-
-    Security:
-    - If CONTAINER_NAME is set on the server, require X-API-Key
-      and validate using AuthenticationManager unless CUA_ENABLE_PUBLIC_PROXY is true.
-
-    Body JSON:
-    {
-      "model": "...",                 # required
-      "input": "... or messages[]",   # required
-      "agent_kwargs": { ... },         # optional, passed directly to ComputerAgent
-      "env": { ... }                   # optional env overrides for agent
-    }
-
-    Response JSON:
-    {"model", "error", "output", "usage", "status"} where status is
-    "completed", or "failed" when the agent run raised.
-
-    Raises:
-    - HTTPException 501: ComputerAgent is not available
-    - HTTPException 401: missing or rejected API key
-    - HTTPException 400: invalid body, or 'input' is neither a string nor a list
-    """
-    if not HAS_AGENT:
-        raise HTTPException(status_code=501, detail="ComputerAgent not available")
-
-    # Authenticate via AuthenticationManager if running in cloud (CONTAINER_NAME set)
-    container_name = os.environ.get("CONTAINER_NAME")
-    if container_name:
-        is_public = os.environ.get("CUA_ENABLE_PUBLIC_PROXY", "").lower().strip() in [
-            "1",
-            "true",
-            "yes",
-            "y",
-            "on",
-        ]
-        if not is_public:
-            if not api_key:
-                raise HTTPException(status_code=401, detail="Missing AGENT PROXY auth headers")
-            ok = await auth_manager.auth(container_name, api_key)
-            if not ok:
-                raise HTTPException(status_code=401, detail="Unauthorized")
-
-    # Parse request body
-    try:
-        body = await request.json()
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")
-
-    model = body.get("model")
-    input_data = body.get("input")
-    if not model or input_data is None:
-        raise HTTPException(status_code=400, detail="'model' and 'input' are required")
-
-    agent_kwargs: Dict[str, Any] = body.get("agent_kwargs") or {}
-    env_overrides: Dict[str, str] = body.get("env") or {}
-
-    # Simple env override context
-    class _EnvOverride:
-        """Temporarily apply environment variable overrides, restoring them on exit."""
-
-        # NOTE(review): os.environ is process-wide, so overlapping requests that
-        # pass "env" can observe each other's overrides - confirm this is acceptable.
-
-        def __init__(self, overrides: Dict[str, str]):
-            self.overrides = overrides
-            # Previous value of every overridden key (None = was not set).
-            self._original: Dict[str, Optional[str]] = {}
-
-        def __enter__(self):
-            for k, v in (self.overrides or {}).items():
-                self._original[k] = os.environ.get(k)
-                os.environ[k] = str(v)
-
-        def __exit__(self, exc_type, exc, tb):
-            for k, old in self._original.items():
-                if old is None:
-                    os.environ.pop(k, None)
-                else:
-                    os.environ[k] = old
-
-    # Convert input to messages
-    def _to_messages(data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
-        """Wrap a plain string as a single user message; pass a message list through."""
-        if isinstance(data, str):
-            return [{"role": "user", "content": data}]
-        if isinstance(data, list):
-            return data
-        # Reject anything else explicitly instead of handing None to the agent.
-        raise HTTPException(
-            status_code=400, detail="'input' must be a string or a list of messages"
-        )
-
-    messages = _to_messages(input_data)
-
-    # Define a direct computer tool that implements the AsyncComputerHandler protocol
-    # and delegates to our existing automation/file/accessibility handlers.
-    from agent.computers import AsyncComputerHandler  # runtime-checkable Protocol
-
-    class DirectComputer(AsyncComputerHandler):
-        """Computer tool backed by this server's own handler singletons."""
-
-        def __init__(self):
-            # use module-scope handler singletons created by HandlerFactory
-            self._auto = automation_handler
-            self._file = file_handler
-            self._access = accessibility_handler
-
-        async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
-            """Return the host OS family; anything unrecognised is reported as linux."""
-            sys = platform.system().lower()
-            if "darwin" in sys or sys in ("macos", "mac"):
-                return "mac"
-            if "windows" in sys:
-                return "windows"
-            return "linux"
-
-        async def get_dimensions(self) -> tuple[int, int]:
-            """Return the screen size as (width, height)."""
-            size = await self._auto.get_screen_size()
-            return size["width"], size["height"]
-
-        async def screenshot(self) -> str:
-            """Return the "image_data" field of the automation handler's screenshot."""
-            img_b64 = await self._auto.screenshot()
-            return img_b64["image_data"]
-
-        async def click(self, x: int, y: int, button: str = "left") -> None:
-            """Click at (x, y); any button other than "right" is a left click."""
-            if button == "left":
-                await self._auto.left_click(x, y)
-            elif button == "right":
-                await self._auto.right_click(x, y)
-            else:
-                await self._auto.left_click(x, y)
-
-        async def double_click(self, x: int, y: int) -> None:
-            await self._auto.double_click(x, y)
-
-        async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
-            """Move the cursor to (x, y), then scroll by (scroll_x, scroll_y)."""
-            await self._auto.move_cursor(x, y)
-            await self._auto.scroll(scroll_x, scroll_y)
-
-        async def type(self, text: str) -> None:
-            await self._auto.type_text(text)
-
-        async def wait(self, ms: int = 1000) -> None:
-            """Sleep for ``ms`` milliseconds without blocking the event loop."""
-            await asyncio.sleep(ms / 1000.0)
-
-        async def move(self, x: int, y: int) -> None:
-            await self._auto.move_cursor(x, y)
-
-        async def keypress(self, keys: Union[List[str], str]) -> None:
-            """Press one key, or a combination given as a list or a "+"/"-" joined string."""
-            if isinstance(keys, str):
-                # A single character (including "+" or "-") is one key, not a separator.
-                parts = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys]
-            else:
-                parts = keys
-            if len(parts) == 1:
-                await self._auto.press_key(parts[0])
-            else:
-                await self._auto.hotkey(parts)
-
-        async def drag(self, path: List[Dict[str, int]]) -> None:
-            """Press at the first point, move through the rest, release at the last."""
-            if not path:
-                return
-            start = path[0]
-            await self._auto.mouse_down(start["x"], start["y"])
-            for pt in path[1:]:
-                await self._auto.move_cursor(pt["x"], pt["y"])
-            end = path[-1]
-            await self._auto.mouse_up(end["x"], end["y"])
-
-        async def get_current_url(self) -> str:
-            # Not available in this server context
-            return ""
-
-        async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
-            await self._auto.mouse_down(x, y, button="left")
-
-        async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
-            await self._auto.mouse_up(x, y, button="left")
-
-    # # Inline image URLs to base64
-    # import base64, mimetypes, requests
-    # # Use a browser-like User-Agent to avoid 403s from some CDNs (e.g., Wikimedia)
-    # HEADERS = {
-    #     "User-Agent": (
-    #         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-    #         "AppleWebKit/537.36 (KHTML, like Gecko) "
-    #         "Chrome/124.0.0.0 Safari/537.36"
-    #     )
-    # }
-    # def _to_data_url(content_bytes: bytes, url: str, resp: requests.Response) -> str:
-    #     ctype = resp.headers.get("Content-Type") or mimetypes.guess_type(url)[0] or "application/octet-stream"
-    #     b64 = base64.b64encode(content_bytes).decode("utf-8")
-    #     return f"data:{ctype};base64,{b64}"
-    # def inline_image_urls(messages):
-    #     # messages: List[{"role": "...","content":[...]}]
-    #     out = []
-    #     for m in messages:
-    #         if not isinstance(m.get("content"), list):
-    #             out.append(m)
-    #             continue
-    #         new_content = []
-    #         for part in (m.get("content") or []):
-    #             if part.get("type") == "input_image" and (url := part.get("image_url")):
-    #                 resp = requests.get(url, headers=HEADERS, timeout=30)
-    #                 resp.raise_for_status()
-    #                 new_content.append({
-    #                     "type": "input_image",
-    #                     "image_url": _to_data_url(resp.content, url, resp)
-    #                 })
-    #             else:
-    #                 new_content.append(part)
-    #         out.append({**m, "content": new_content})
-    #     return out
-    # messages = inline_image_urls(messages)
-
-    error = None
-
-    with _EnvOverride(env_overrides):
-        # Prepare tools: if caller did not pass tools, inject our DirectComputer
-        tools = agent_kwargs.get("tools")
-        if not tools:
-            tools = [DirectComputer()]
-            agent_kwargs = {**agent_kwargs, "tools": tools}
-        # Instantiate agent with our tools
-        agent = ComputerAgent(model=model, **agent_kwargs)  # type: ignore[arg-type]
-
-        total_output: List[Any] = []
-        total_usage: Dict[str, Any] = {}
-
-        # call_ids of computer calls that have not produced an output yet
-        pending_computer_call_ids = set()
-        try:
-            async for result in agent.run(messages):
-                # A result without "output" contributes nothing instead of raising.
-                outputs = result.get("output", [])
-                total_output += outputs
-                # Try to collect usage if present
-                if (
-                    isinstance(result, dict)
-                    and "usage" in result
-                    and isinstance(result["usage"], dict)
-                ):
-                    # Merge usage counters: numbers are summed, other values overwritten
-                    for k, v in result["usage"].items():
-                        if isinstance(v, (int, float)):
-                            total_usage[k] = total_usage.get(k, 0) + v
-                        else:
-                            total_usage[k] = v
-                for msg in outputs:
-                    if msg.get("type") == "computer_call":
-                        pending_computer_call_ids.add(msg["call_id"])
-                    elif msg.get("type") == "computer_call_output":
-                        pending_computer_call_ids.discard(msg["call_id"])
-                # exit if no pending computer calls
-                if not pending_computer_call_ids:
-                    break
-        except Exception as e:
-            # Agent failures are reported in the payload rather than as an HTTP
-            # error, together with whatever output was collected so far.
-            logger.error(f"Error running agent: {str(e)}")
-            logger.error(traceback.format_exc())
-            error = str(e)
-
-    # Build response payload
-    payload = {
-        "model": model,
-        "error": error,
-        "output": total_output,
-        "usage": total_usage,
-        "status": "completed" if not error else "failed",
-    }
-
-    # Response headers: disable caching and keep the connection alive.
-    # (No CORS header is set here.)
-    headers = {
-        "Cache-Control": "no-cache",
-        "Connection": "keep-alive",
-    }
-
-    return JSONResponse(content=payload, headers=headers)
-
-
-@app.post("/playwright_exec")
-async def playwright_exec_endpoint(
-    request: Request,
-    container_name: Optional[str] = Header(None, alias="X-Container-Name"),
-    api_key: Optional[str] = Header(None, alias="X-API-Key"),
-):
-    """
-    Execute Playwright browser commands.
-
-    Headers:
-    - X-Container-Name: Container name for cloud authentication
-    - X-API-Key: API key for cloud authentication
-
-    Body:
-    {
-        "command": "visit_url|click|type|scroll|web_search",
-        "params": {...}
-    }
-
-    Raises:
-    - HTTPException 400: invalid body, missing command, or the browser
-      manager reported the command as unsuccessful
-    - HTTPException 401: missing or rejected credentials (only when the
-      CONTAINER_NAME environment variable is set)
-    - HTTPException 500: unexpected error while executing the command
-    """
-    # Parse request body
-    try:
-        body = await request.json()
-        command = body.get("command")
-        params = body.get("params", {})
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")
-
-    if not command:
-        raise HTTPException(status_code=400, detail="Command is required")
-
-    # Check if CONTAINER_NAME is set (indicating cloud provider)
-    server_container_name = os.environ.get("CONTAINER_NAME")
-
-    # If cloud provider, perform authentication
-    if server_container_name:
-        logger.info(
-            f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication..."
-        )
-
-        # Validate required headers
-        if not container_name:
-            raise HTTPException(status_code=401, detail="Container name required")
-
-        if not api_key:
-            raise HTTPException(status_code=401, detail="API key required")
-
-        # Validate with AuthenticationManager
-        is_authenticated = await auth_manager.auth(container_name, api_key)
-        if not is_authenticated:
-            raise HTTPException(status_code=401, detail="Authentication failed")
-
-    # Get browser manager and execute command
-    try:
-        browser_manager = get_browser_manager()
-        result = await browser_manager.execute_command(command, params)
-
-        if result.get("success"):
-            return JSONResponse(content=result)
-        else:
-            raise HTTPException(status_code=400, detail=result.get("error", "Command failed"))
-    except HTTPException:
-        # Let the deliberate 400 above through unchanged; without this clause the
-        # generic handler below would turn it into a 500.
-        raise
-    except Exception as e:
-        logger.error(f"Error executing playwright command: {str(e)}")
-        logger.error(traceback.format_exc())
-        raise HTTPException(status_code=500, detail=str(e))
-
-
-# When this module is executed directly, serve the app with uvicorn,
-# listening on all network interfaces (0.0.0.0) on port 8000.
-if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=8000)