From ddfb53e79f4b0bc98d01bc67beeb01ea5860b7d3 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 3 Dec 2025 08:17:52 -0800
Subject: [PATCH] Migrate browser interface into computer SDK

---
 .pre-commit-config.yaml                       |  2 +
 examples/BROWSER_TOOL_README.md               | 24 +++++++-
 examples/browser_tool_example.py              | 56 +++++++++---------
 libs/python/agent/agent/tools/browser_tool.py | 57 +++++--------------
 .../computer/computer/interface/generic.py    | 50 ++++++++++++++++
 5 files changed, 116 insertions(+), 73 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d9475d42..a2e35493 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -15,6 +15,8 @@ repos:
         name: TypeScript type check
         entry: node ./scripts/typescript-typecheck.js
         language: node
+        files: \.(ts|tsx)$
+        pass_filenames: false
 
   - repo: https://github.com/PyCQA/isort
     rev: 7.0.0
diff --git a/examples/BROWSER_TOOL_README.md b/examples/BROWSER_TOOL_README.md
index 8d12ae85..f72971e8 100644
--- a/examples/BROWSER_TOOL_README.md
+++ b/examples/BROWSER_TOOL_README.md
@@ -40,10 +40,31 @@ python examples/browser_tool_example.py
 - **Auto-Recovery**: Automatically reopens browser if closed manually
 - **Persistent Context**: Maintains cookies and sessions across commands
 - **Fara/Magentic-One Interface**: Compatible with Microsoft agent interfaces
+- **Computer SDK Integration**: Uses the Computer SDK's interface for unified control
+
+## Usage
+
+The BrowserTool uses the Computer SDK's interface to communicate with the server:
+
+```python
+from computer import Computer
+from agent.tools.browser_tool import BrowserTool
+
+# Initialize computer interface
+computer = Computer(ip_address="localhost")
+
+# Create browser tool with the interface
+browser = BrowserTool(interface=computer)
+
+# Use the browser
+await browser.visit_url("https://www.example.com")
+await browser.click(x=500, y=300)
+await browser.type("Hello, world!")
+```
 
 ## API Endpoint
 
-The browser tool is accessible via the `/playwright_exec` endpoint:
+The browser tool is also accessible via the `/playwright_exec` endpoint:
 
 ```bash
 curl -X POST http://localhost:8000/playwright_exec \
@@ -66,4 +87,3 @@ curl -X POST http://localhost:8000/playwright_exec \
 **Connection errors**: Make sure the server is running (`curl http://localhost:8000/status`).
 
 **Playwright not found**: Install with `pip install playwright && playwright install --with-deps firefox`.
-
diff --git a/examples/browser_tool_example.py b/examples/browser_tool_example.py
index 9705ca8f..11a8dead 100644
--- a/examples/browser_tool_example.py
+++ b/examples/browser_tool_example.py
@@ -19,18 +19,14 @@ import logging
 import sys
 from pathlib import Path
 
-# Import BrowserTool directly from the file
-browser_tool_path = Path(__file__).parent.parent / "libs" / "python" / "agent" / "agent" / "tools" / "browser_tool.py"
-sys.path.insert(0, str(browser_tool_path.parent.parent.parent))
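+# Add libs/python to sys.path. NOTE(review): packages appear to live in libs/python/<pkg>/<pkg> — confirm these imports resolve without an install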
+libs_path = Path(__file__).parent.parent / "libs" / "python"
+sys.path.insert(0, str(libs_path))
 
-# Import the module directly
-import importlib.util
-spec = importlib.util.spec_from_file_location("browser_tool", browser_tool_path)
-if spec is None or spec.loader is None:
-    raise ImportError(f"Could not load browser_tool from {browser_tool_path}")
-browser_tool_module = importlib.util.module_from_spec(spec)
-spec.loader.exec_module(browser_tool_module)
-BrowserTool = browser_tool_module.BrowserTool
+from agent.tools.browser_tool import BrowserTool
+
+# Import the Computer class from the Computer SDK
+from computer import Computer
 
 # Configure logging to see what's happening
 logging.basicConfig(level=logging.INFO)
@@ -39,58 +35,60 @@ logger = logging.getLogger(__name__)
 
 async def test_browser_tool():
     """Test the BrowserTool with various commands."""
-    
-    # Initialize the browser tool
-    # For local testing, use http://localhost:8000
-    # For cloud, provide base_url, api_key, and container_name
-    browser = BrowserTool(base_url="http://localhost:8000")
-    
+
+    # Initialize the computer interface
+    # For local testing, use provider_type="docker"
+    # For provider_type="cloud", provide name and api_key
+    computer = Computer(provider_type="docker")
+
+    # Initialize the browser tool with the computer interface
+    browser = BrowserTool(interface=computer)
+
     logger.info("Testing Browser Tool...")
-    
+
     try:
         # Test 1: Visit a URL
         logger.info("Test 1: Visiting a URL...")
         result = await browser.visit_url("https://www.trycua.com")
         logger.info(f"Visit URL result: {result}")
-        
+
         # Wait a bit for the page to load
         await asyncio.sleep(2)
-        
+
         # Test 2: Web search
         logger.info("Test 2: Performing a web search...")
         result = await browser.web_search("Python programming")
         logger.info(f"Web search result: {result}")
-        
+
         # Wait a bit
         await asyncio.sleep(2)
-        
+
         # Test 3: Scroll
         logger.info("Test 3: Scrolling the page...")
         result = await browser.scroll(delta_x=0, delta_y=500)
         logger.info(f"Scroll result: {result}")
-        
+
         # Wait a bit
         await asyncio.sleep(1)
-        
+
         # Test 4: Click (example coordinates - adjust based on your screen)
         logger.info("Test 4: Clicking at coordinates...")
         result = await browser.click(x=500, y=300)
         logger.info(f"Click result: {result}")
-        
+
         # Wait a bit
         await asyncio.sleep(1)
-        
+
         # Test 5: Type text (if there's a focused input field)
         logger.info("Test 5: Typing text...")
         result = await browser.type("Hello from BrowserTool!")
         logger.info(f"Type result: {result}")
-        
+
         logger.info("All tests completed!")
-        
+
     except Exception as e:
         logger.error(f"Error during testing: {e}", exc_info=True)
 
 
 if __name__ == "__main__":
     asyncio.run(test_browser_tool())
-
diff --git a/libs/python/agent/agent/tools/browser_tool.py b/libs/python/agent/agent/tools/browser_tool.py
index 8f8b1ab9..85b6ba23 100644
--- a/libs/python/agent/agent/tools/browser_tool.py
+++ b/libs/python/agent/agent/tools/browser_tool.py
@@ -4,54 +4,36 @@ Allows agents to control a browser programmatically via Playwright.
 """
 
 import logging
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
 
-import aiohttp
+if TYPE_CHECKING:
+    from computer.interface import GenericComputerInterface
 
 logger = logging.getLogger(__name__)
 
 
 class BrowserTool:
     """
-    Browser tool that connects to the computer server's Playwright endpoint.
+    Browser tool that uses the computer SDK's interface to control a browser.
     Implements the Fara/Magentic-One agent interface for browser control.
     """
 
     def __init__(
         self,
-        base_url: str = "http://localhost:8000",
-        api_key: Optional[str] = None,
-        container_name: Optional[str] = None,
+        interface: "GenericComputerInterface",
     ):
         """
         Initialize the BrowserTool.
 
         Args:
-            base_url: Base URL of the computer server (default: http://localhost:8000)
-            api_key: Optional API key for cloud authentication
-            container_name: Optional container name for cloud authentication
+            interface: A GenericComputerInterface instance that provides playwright_exec
         """
-        self.base_url = base_url.rstrip("/")
-        self.api_key = api_key
-        self.container_name = container_name
+        self.interface = interface
         self.logger = logger
 
-    def _get_endpoint_url(self) -> str:
-        """Get the full URL for the playwright_exec endpoint."""
-        return f"{self.base_url}/playwright_exec"
-
-    def _get_headers(self) -> dict:
-        """Get headers for the HTTP request."""
-        headers = {"Content-Type": "application/json"}
-        if self.api_key:
-            headers["X-API-Key"] = self.api_key
-        if self.container_name:
-            headers["X-Container-Name"] = self.container_name
-        return headers
-
     async def _execute_command(self, command: str, params: dict) -> dict:
         """
-        Execute a browser command via HTTP POST.
+        Execute a browser command via the computer interface.
 
         Args:
             command: Command name
@@ -60,23 +42,15 @@ class BrowserTool:
         Returns:
             Response dictionary
         """
-        url = self._get_endpoint_url()
-        payload = {"command": command, "params": params}
-        headers = self._get_headers()
-
         try:
-            async with aiohttp.ClientSession() as session:
-                async with session.post(url, json=payload, headers=headers) as response:
-                    if response.status == 200:
-                        return await response.json()
-                    else:
-                        error_text = await response.text()
-                        self.logger.error(
-                            f"Browser command failed with status {response.status}: {error_text}"
-                        )
-                        return {"success": False, "error": error_text}
+            result = await self.interface.playwright_exec(command, params)
+            if not result.get("success"):
+                self.logger.error(
+                    f"Browser command '{command}' failed: {result.get('error', 'Unknown error')}"
+                )
+            return result
         except Exception as e:
-            self.logger.error(f"Error executing browser command: {e}")
+            self.logger.error(f"Error executing browser command '{command}': {e}")
             return {"success": False, "error": str(e)}
 
     async def visit_url(self, url: str) -> dict:
@@ -140,4 +114,3 @@ class BrowserTool:
             Response dictionary with success status and current URL
         """
         return await self._execute_command("web_search", {"query": query})
-
diff --git a/libs/python/computer/computer/interface/generic.py b/libs/python/computer/computer/interface/generic.py
index e58719dd..d5a5dc4b 100644
--- a/libs/python/computer/computer/interface/generic.py
+++ b/libs/python/computer/computer/interface/generic.py
@@ -661,6 +661,56 @@ class GenericComputerInterface(BaseComputerInterface):
 
         return screenshot_x, screenshot_y
 
+    # Playwright browser control
+    async def playwright_exec(self, command: str, params: Optional[Dict] = None) -> Dict[str, Any]:
+        """
+        Execute a Playwright browser command.
+
+        Args:
+            command: The browser command to execute (visit_url, click, type, scroll, web_search)
+            params: Command parameters; None is sent to the server as an empty dict
+
+        Returns:
+            Dict containing the command result; on a non-200 response or an exception, {"success": False, "error": <message>}
+
+        Examples:
+            # Navigate to a URL
+            await interface.playwright_exec("visit_url", {"url": "https://example.com"})
+
+            # Click at coordinates
+            await interface.playwright_exec("click", {"x": 100, "y": 200})
+
+            # Type text
+            await interface.playwright_exec("type", {"text": "Hello, world!"})
+
+            # Scroll
+            await interface.playwright_exec("scroll", {"delta_x": 0, "delta_y": -100})
+
+            # Web search
+            await interface.playwright_exec("web_search", {"query": "computer use agent"})
+        """
+        protocol = "https" if self.api_key else "http"
+        port = "8443" if self.api_key else "8000"
+        url = f"{protocol}://{self.ip_address}:{port}/playwright_exec"
+
+        payload = {"command": command, "params": params or {}}
+        headers = {"Content-Type": "application/json"}
+        if self.api_key:
+            headers["X-API-Key"] = self.api_key
+        if self.vm_name:
+            headers["X-Container-Name"] = self.vm_name
+
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.post(url, json=payload, headers=headers) as response:
+                    if response.status == 200:
+                        return await response.json()
+                    else:
+                        error_text = await response.text()
+                        return {"success": False, "error": error_text}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+
     # Websocket Methods
     async def _keep_alive(self):
         """Keep the WebSocket connection alive with automatic reconnection."""