From ddfb53e79f4b0bc98d01bc67beeb01ea5860b7d3 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 3 Dec 2025 08:17:52 -0800 Subject: [PATCH] Migrate browser interface into computer SDK --- .pre-commit-config.yaml | 2 + examples/BROWSER_TOOL_README.md | 24 +++++++- examples/browser_tool_example.py | 56 +++++++++--------- libs/python/agent/agent/tools/browser_tool.py | 57 +++++-------------- .../computer/computer/interface/generic.py | 50 ++++++++++++++++ 5 files changed, 116 insertions(+), 73 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d9475d42..a2e35493 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,6 +15,8 @@ repos: name: TypeScript type check entry: node ./scripts/typescript-typecheck.js language: node + files: \.(ts|tsx)$ + pass_filenames: false - repo: https://github.com/PyCQA/isort rev: 7.0.0 diff --git a/examples/BROWSER_TOOL_README.md b/examples/BROWSER_TOOL_README.md index 8d12ae85..f72971e8 100644 --- a/examples/BROWSER_TOOL_README.md +++ b/examples/BROWSER_TOOL_README.md @@ -40,10 +40,31 @@ python examples/browser_tool_example.py - **Auto-Recovery**: Automatically reopens browser if closed manually - **Persistent Context**: Maintains cookies and sessions across commands - **Fara/Magentic-One Interface**: Compatible with Microsoft agent interfaces +- **Computer SDK Integration**: Uses the Computer SDK's interface for unified control + +## Usage + +The BrowserTool uses the Computer SDK's interface to communicate with the server: + +```python +from computer import Computer +from agent.tools.browser_tool import BrowserTool + +# Initialize computer interface +computer = Computer(ip_address="localhost") + +# Create browser tool with the interface +browser = BrowserTool(interface=computer) + +# Use the browser +await browser.visit_url("https://www.example.com") +await browser.click(x=500, y=300) +await browser.type("Hello, world!") +``` ## API Endpoint -The browser tool is accessible via the `/playwright_exec` endpoint: +The browser tool is also accessible via the `/playwright_exec` endpoint: ```bash curl -X POST http://localhost:8000/playwright_exec \ @@ -66,4 +87,3 @@ curl -X POST http://localhost:8000/playwright_exec \ **Connection errors**: Make sure the server is running (`curl http://localhost:8000/status`). **Playwright not found**: Install with `pip install playwright && playwright install --with-deps firefox`. - diff --git a/examples/browser_tool_example.py b/examples/browser_tool_example.py index 9705ca8f..11a8dead 100644 --- a/examples/browser_tool_example.py +++ b/examples/browser_tool_example.py @@ -19,18 +19,14 @@ import logging import sys from pathlib import Path -# Import BrowserTool directly from the file -browser_tool_path = Path(__file__).parent.parent / "libs" / "python" / "agent" / "agent" / "tools" / "browser_tool.py" -sys.path.insert(0, str(browser_tool_path.parent.parent.parent)) +# Add the libs path to sys.path +libs_path = Path(__file__).parent.parent / "libs" / "python" +sys.path.insert(0, str(libs_path)) -# Import the module directly -import importlib.util -spec = importlib.util.spec_from_file_location("browser_tool", browser_tool_path) -if spec is None or spec.loader is None: - raise ImportError(f"Could not load browser_tool from {browser_tool_path}") -browser_tool_module = importlib.util.module_from_spec(spec) -spec.loader.exec_module(browser_tool_module) -BrowserTool = browser_tool_module.BrowserTool +from agent.tools.browser_tool import BrowserTool + +# Import Computer interface and BrowserTool +from computer import Computer # Configure logging to see what's happening logging.basicConfig(level=logging.INFO) @@ -39,58 +35,60 @@ logger = logging.getLogger(__name__) async def test_browser_tool(): """Test the BrowserTool with various commands.""" - - # Initialize the browser tool - # For local testing, use http://localhost:8000 - # For cloud, provide base_url, api_key, and container_name - browser = BrowserTool(base_url="http://localhost:8000") - + + # Initialize the computer interface + # For local testing, use provider_type="docker" + # For provider_type="cloud", provide name and api_key + computer = Computer(provider_type="docker") + + # Initialize the browser tool with the computer interface + browser = BrowserTool(interface=computer) + logger.info("Testing Browser Tool...") - + try: # Test 1: Visit a URL logger.info("Test 1: Visiting a URL...") result = await browser.visit_url("https://www.trycua.com") logger.info(f"Visit URL result: {result}") - + # Wait a bit for the page to load await asyncio.sleep(2) - + # Test 2: Web search logger.info("Test 2: Performing a web search...") result = await browser.web_search("Python programming") logger.info(f"Web search result: {result}") - + # Wait a bit await asyncio.sleep(2) - + # Test 3: Scroll logger.info("Test 3: Scrolling the page...") result = await browser.scroll(delta_x=0, delta_y=500) logger.info(f"Scroll result: {result}") - + # Wait a bit await asyncio.sleep(1) - + # Test 4: Click (example coordinates - adjust based on your screen) logger.info("Test 4: Clicking at coordinates...") result = await browser.click(x=500, y=300) logger.info(f"Click result: {result}") - + # Wait a bit await asyncio.sleep(1) - + # Test 5: Type text (if there's a focused input field) logger.info("Test 5: Typing text...") result = await browser.type("Hello from BrowserTool!") logger.info(f"Type result: {result}") - + logger.info("All tests completed!") - + except Exception as e: logger.error(f"Error during testing: {e}", exc_info=True) if __name__ == "__main__": asyncio.run(test_browser_tool()) - diff --git a/libs/python/agent/agent/tools/browser_tool.py b/libs/python/agent/agent/tools/browser_tool.py index 8f8b1ab9..85b6ba23 100644 --- a/libs/python/agent/agent/tools/browser_tool.py +++ b/libs/python/agent/agent/tools/browser_tool.py @@ -4,54 +4,36 @@ Allows agents to control a browser programmatically via Playwright. """ import logging -from typing import Optional +from typing import TYPE_CHECKING, Optional -import aiohttp +if TYPE_CHECKING: + from computer.interface import GenericComputerInterface logger = logging.getLogger(__name__) class BrowserTool: """ - Browser tool that connects to the computer server's Playwright endpoint. + Browser tool that uses the computer SDK's interface to control a browser. Implements the Fara/Magentic-One agent interface for browser control. """ def __init__( self, - base_url: str = "http://localhost:8000", - api_key: Optional[str] = None, - container_name: Optional[str] = None, + interface: "GenericComputerInterface", ): """ Initialize the BrowserTool. Args: - base_url: Base URL of the computer server (default: http://localhost:8000) - api_key: Optional API key for cloud authentication - container_name: Optional container name for cloud authentication + interface: A GenericComputerInterface instance that provides playwright_exec """ - self.base_url = base_url.rstrip("/") - self.api_key = api_key - self.container_name = container_name + self.interface = interface self.logger = logger - def _get_endpoint_url(self) -> str: - """Get the full URL for the playwright_exec endpoint.""" - return f"{self.base_url}/playwright_exec" - - def _get_headers(self) -> dict: - """Get headers for the HTTP request.""" - headers = {"Content-Type": "application/json"} - if self.api_key: - headers["X-API-Key"] = self.api_key - if self.container_name: - headers["X-Container-Name"] = self.container_name - return headers - async def _execute_command(self, command: str, params: dict) -> dict: """ - Execute a browser command via HTTP POST. + Execute a browser command via the computer interface. Args: command: Command name @@ -60,23 +42,15 @@ class BrowserTool: Returns: Response dictionary """ - url = self._get_endpoint_url() - payload = {"command": command, "params": params} - headers = self._get_headers() - try: - async with aiohttp.ClientSession() as session: - async with session.post(url, json=payload, headers=headers) as response: - if response.status == 200: - return await response.json() - else: - error_text = await response.text() - self.logger.error( - f"Browser command failed with status {response.status}: {error_text}" - ) - return {"success": False, "error": error_text} + result = await self.interface.playwright_exec(command, params) + if not result.get("success"): + self.logger.error( + f"Browser command '{command}' failed: {result.get('error', 'Unknown error')}" + ) + return result except Exception as e: - self.logger.error(f"Error executing browser command: {e}") + self.logger.error(f"Error executing browser command '{command}': {e}") return {"success": False, "error": str(e)} async def visit_url(self, url: str) -> dict: @@ -140,4 +114,3 @@ class BrowserTool: Response dictionary with success status and current URL """ return await self._execute_command("web_search", {"query": query}) - diff --git a/libs/python/computer/computer/interface/generic.py b/libs/python/computer/computer/interface/generic.py index e58719dd..d5a5dc4b 100644 --- a/libs/python/computer/computer/interface/generic.py +++ b/libs/python/computer/computer/interface/generic.py @@ -661,6 +661,56 @@ class GenericComputerInterface(BaseComputerInterface): return screenshot_x, screenshot_y + # Playwright browser control + async def playwright_exec(self, command: str, params: Optional[Dict] = None) -> Dict[str, Any]: + """ + Execute a Playwright browser command. + + Args: + command: The browser command to execute (visit_url, click, type, scroll, web_search) + params: Command parameters + + Returns: + Dict containing the command result + + Examples: + # Navigate to a URL + await interface.playwright_exec("visit_url", {"url": "https://example.com"}) + + # Click at coordinates + await interface.playwright_exec("click", {"x": 100, "y": 200}) + + # Type text + await interface.playwright_exec("type", {"text": "Hello, world!"}) + + # Scroll + await interface.playwright_exec("scroll", {"delta_x": 0, "delta_y": -100}) + + # Web search + await interface.playwright_exec("web_search", {"query": "computer use agent"}) + """ + protocol = "https" if self.api_key else "http" + port = "8443" if self.api_key else "8000" + url = f"{protocol}://{self.ip_address}:{port}/playwright_exec" + + payload = {"command": command, "params": params or {}} + headers = {"Content-Type": "application/json"} + if self.api_key: + headers["X-API-Key"] = self.api_key + if self.vm_name: + headers["X-Container-Name"] = self.vm_name + + try: + async with aiohttp.ClientSession() as session: + async with session.post(url, json=payload, headers=headers) as response: + if response.status == 200: + return await response.json() + else: + error_text = await response.text() + return {"success": False, "error": error_text} + except Exception as e: + return {"success": False, "error": str(e)} + # Websocket Methods async def _keep_alive(self): """Keep the WebSocket connection alive with automatic reconnection."""