Migrate browser interface into computer SDK

This commit is contained in:
Dillon DuPont
2025-12-03 08:17:52 -08:00
parent 4000cabfdd
commit ddfb53e79f
5 changed files with 116 additions and 73 deletions

View File

@@ -15,6 +15,8 @@ repos:
name: TypeScript type check
entry: node ./scripts/typescript-typecheck.js
language: node
files: \.(ts|tsx)$
pass_filenames: false
- repo: https://github.com/PyCQA/isort
rev: 7.0.0

View File

@@ -40,10 +40,31 @@ python examples/browser_tool_example.py
- **Auto-Recovery**: Automatically reopens browser if closed manually
- **Persistent Context**: Maintains cookies and sessions across commands
- **Fara/Magentic-One Interface**: Compatible with Microsoft agent interfaces
- **Computer SDK Integration**: Uses the Computer SDK's interface for unified control
## Usage
The BrowserTool uses the Computer SDK's interface to communicate with the server:
```python
from computer import Computer
from agent.tools.browser_tool import BrowserTool
# Initialize computer interface
computer = Computer(ip_address="localhost")
# Create browser tool with the interface
browser = BrowserTool(interface=computer)
# Use the browser
await browser.visit_url("https://www.example.com")
await browser.click(x=500, y=300)
await browser.type("Hello, world!")
```
## API Endpoint
The browser tool is accessible via the `/playwright_exec` endpoint:
The browser tool is also accessible via the `/playwright_exec` endpoint:
```bash
curl -X POST http://localhost:8000/playwright_exec \
@@ -66,4 +87,3 @@ curl -X POST http://localhost:8000/playwright_exec \
**Connection errors**: Make sure the server is running (`curl http://localhost:8000/status`).
**Playwright not found**: Install with `pip install playwright && playwright install --with-deps firefox`.

View File

@@ -19,18 +19,14 @@ import logging
import sys
from pathlib import Path
# Import BrowserTool directly from the file
browser_tool_path = Path(__file__).parent.parent / "libs" / "python" / "agent" / "agent" / "tools" / "browser_tool.py"
sys.path.insert(0, str(browser_tool_path.parent.parent.parent))
# Add the libs path to sys.path
libs_path = Path(__file__).parent.parent / "libs" / "python"
sys.path.insert(0, str(libs_path))
# Import the module directly
import importlib.util
spec = importlib.util.spec_from_file_location("browser_tool", browser_tool_path)
if spec is None or spec.loader is None:
raise ImportError(f"Could not load browser_tool from {browser_tool_path}")
browser_tool_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(browser_tool_module)
BrowserTool = browser_tool_module.BrowserTool
from agent.tools.browser_tool import BrowserTool
# Import Computer interface and BrowserTool
from computer import Computer
# Configure logging to see what's happening
logging.basicConfig(level=logging.INFO)
@@ -39,58 +35,60 @@ logger = logging.getLogger(__name__)
async def test_browser_tool():
"""Test the BrowserTool with various commands."""
# Initialize the browser tool
# For local testing, use http://localhost:8000
# For cloud, provide base_url, api_key, and container_name
browser = BrowserTool(base_url="http://localhost:8000")
# Initialize the computer interface
# For local testing, use provider_type="docker"
# For provider_type="cloud", provide name and api_key
computer = Computer(provider_type="docker")
# Initialize the browser tool with the computer interface
browser = BrowserTool(interface=computer)
logger.info("Testing Browser Tool...")
try:
# Test 1: Visit a URL
logger.info("Test 1: Visiting a URL...")
result = await browser.visit_url("https://www.trycua.com")
logger.info(f"Visit URL result: {result}")
# Wait a bit for the page to load
await asyncio.sleep(2)
# Test 2: Web search
logger.info("Test 2: Performing a web search...")
result = await browser.web_search("Python programming")
logger.info(f"Web search result: {result}")
# Wait a bit
await asyncio.sleep(2)
# Test 3: Scroll
logger.info("Test 3: Scrolling the page...")
result = await browser.scroll(delta_x=0, delta_y=500)
logger.info(f"Scroll result: {result}")
# Wait a bit
await asyncio.sleep(1)
# Test 4: Click (example coordinates - adjust based on your screen)
logger.info("Test 4: Clicking at coordinates...")
result = await browser.click(x=500, y=300)
logger.info(f"Click result: {result}")
# Wait a bit
await asyncio.sleep(1)
# Test 5: Type text (if there's a focused input field)
logger.info("Test 5: Typing text...")
result = await browser.type("Hello from BrowserTool!")
logger.info(f"Type result: {result}")
logger.info("All tests completed!")
except Exception as e:
logger.error(f"Error during testing: {e}", exc_info=True)
if __name__ == "__main__":
asyncio.run(test_browser_tool())

View File

@@ -4,54 +4,36 @@ Allows agents to control a browser programmatically via Playwright.
"""
import logging
from typing import Optional
from typing import TYPE_CHECKING, Optional
import aiohttp
if TYPE_CHECKING:
from computer.interface import GenericComputerInterface
logger = logging.getLogger(__name__)
class BrowserTool:
"""
Browser tool that connects to the computer server's Playwright endpoint.
Browser tool that uses the computer SDK's interface to control a browser.
Implements the Fara/Magentic-One agent interface for browser control.
"""
def __init__(
self,
base_url: str = "http://localhost:8000",
api_key: Optional[str] = None,
container_name: Optional[str] = None,
interface: "GenericComputerInterface",
):
"""
Initialize the BrowserTool.
Args:
base_url: Base URL of the computer server (default: http://localhost:8000)
api_key: Optional API key for cloud authentication
container_name: Optional container name for cloud authentication
interface: A GenericComputerInterface instance that provides playwright_exec
"""
self.base_url = base_url.rstrip("/")
self.api_key = api_key
self.container_name = container_name
self.interface = interface
self.logger = logger
def _get_endpoint_url(self) -> str:
"""Get the full URL for the playwright_exec endpoint."""
return f"{self.base_url}/playwright_exec"
def _get_headers(self) -> dict:
"""Get headers for the HTTP request."""
headers = {"Content-Type": "application/json"}
if self.api_key:
headers["X-API-Key"] = self.api_key
if self.container_name:
headers["X-Container-Name"] = self.container_name
return headers
async def _execute_command(self, command: str, params: dict) -> dict:
"""
Execute a browser command via HTTP POST.
Execute a browser command via the computer interface.
Args:
command: Command name
@@ -60,23 +42,15 @@ class BrowserTool:
Returns:
Response dictionary
"""
url = self._get_endpoint_url()
payload = {"command": command, "params": params}
headers = self._get_headers()
try:
async with aiohttp.ClientSession() as session:
async with session.post(url, json=payload, headers=headers) as response:
if response.status == 200:
return await response.json()
else:
error_text = await response.text()
self.logger.error(
f"Browser command failed with status {response.status}: {error_text}"
)
return {"success": False, "error": error_text}
result = await self.interface.playwright_exec(command, params)
if not result.get("success"):
self.logger.error(
f"Browser command '{command}' failed: {result.get('error', 'Unknown error')}"
)
return result
except Exception as e:
self.logger.error(f"Error executing browser command: {e}")
self.logger.error(f"Error executing browser command '{command}': {e}")
return {"success": False, "error": str(e)}
async def visit_url(self, url: str) -> dict:
@@ -140,4 +114,3 @@ class BrowserTool:
Response dictionary with success status and current URL
"""
return await self._execute_command("web_search", {"query": query})

View File

@@ -661,6 +661,56 @@ class GenericComputerInterface(BaseComputerInterface):
return screenshot_x, screenshot_y
# Playwright browser control
async def playwright_exec(self, command: str, params: Optional[Dict] = None) -> Dict[str, Any]:
    """
    Execute a Playwright browser command on the computer server.

    The command is POSTed as JSON to the server's ``/playwright_exec``
    endpoint. When ``self.api_key`` is set the request goes over HTTPS on
    port 8443 and carries the ``X-API-Key`` header (plus
    ``X-Container-Name`` when ``self.vm_name`` is set); otherwise it goes
    over plain HTTP on port 8000.

    This method does not raise on failure: HTTP errors and exceptions are
    converted to ``{"success": False, "error": <message>}``.

    Args:
        command: The browser command to execute (visit_url, click, type, scroll, web_search)
        params: Command parameters; ``None`` is sent as an empty dict

    Returns:
        Dict containing the command result. On HTTP 200 this is the decoded
        JSON body returned by the server; otherwise a failure dict whose
        ``error`` entry is never empty.

    Examples:
        # Navigate to a URL
        await interface.playwright_exec("visit_url", {"url": "https://example.com"})

        # Click at coordinates
        await interface.playwright_exec("click", {"x": 100, "y": 200})

        # Type text
        await interface.playwright_exec("type", {"text": "Hello, world!"})

        # Scroll
        await interface.playwright_exec("scroll", {"delta_x": 0, "delta_y": -100})

        # Web search
        await interface.playwright_exec("web_search", {"query": "computer use agent"})
    """
    # Presence of an API key selects the authenticated HTTPS endpoint.
    protocol = "https" if self.api_key else "http"
    port = "8443" if self.api_key else "8000"
    url = f"{protocol}://{self.ip_address}:{port}/playwright_exec"

    payload = {"command": command, "params": params or {}}
    headers = {"Content-Type": "application/json"}
    if self.api_key:
        headers["X-API-Key"] = self.api_key
    if self.vm_name:
        headers["X-Container-Name"] = self.vm_name

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(url, json=payload, headers=headers) as response:
                if response.status == 200:
                    return await response.json()

                error_text = await response.text()
                # An empty error body would otherwise yield error == "",
                # leaving callers with nothing to log; fall back to the status.
                return {
                    "success": False,
                    "error": error_text or f"HTTP {response.status}",
                }
    except Exception as e:
        # Some exceptions (e.g. asyncio.TimeoutError) stringify to "";
        # report the exception class name so the failure stays diagnosable.
        return {"success": False, "error": str(e) or type(e).__name__}
# Websocket Methods
async def _keep_alive(self):
"""Keep the WebSocket connection alive with automatic reconnection."""