mirror of
https://github.com/trycua/lume.git
synced 2025-12-30 17:09:58 -06:00
Migrate browser interface into computer SDK
This commit is contained in:
@@ -15,6 +15,8 @@ repos:
|
||||
name: TypeScript type check
|
||||
entry: node ./scripts/typescript-typecheck.js
|
||||
language: node
|
||||
files: \.(ts|tsx)$
|
||||
pass_filenames: false
|
||||
|
||||
- repo: https://github.com/PyCQA/isort
|
||||
rev: 7.0.0
|
||||
|
||||
@@ -40,10 +40,31 @@ python examples/browser_tool_example.py
|
||||
- **Auto-Recovery**: Automatically reopens browser if closed manually
|
||||
- **Persistent Context**: Maintains cookies and sessions across commands
|
||||
- **Fara/Magentic-One Interface**: Compatible with Microsoft agent interfaces
|
||||
- **Computer SDK Integration**: Uses the Computer SDK's interface for unified control
|
||||
|
||||
## Usage
|
||||
|
||||
The BrowserTool uses the Computer SDK's interface to communicate with the server:
|
||||
|
||||
```python
|
||||
from computer import Computer
|
||||
from agent.tools.browser_tool import BrowserTool
|
||||
|
||||
# Initialize computer interface
|
||||
computer = Computer(ip_address="localhost")
|
||||
|
||||
# Create browser tool with the interface
|
||||
browser = BrowserTool(interface=computer)
|
||||
|
||||
# Use the browser
|
||||
await browser.visit_url("https://www.example.com")
|
||||
await browser.click(x=500, y=300)
|
||||
await browser.type("Hello, world!")
|
||||
```
|
||||
|
||||
## API Endpoint
|
||||
|
||||
The browser tool is also accessible via the `/playwright_exec` endpoint:
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8000/playwright_exec \
|
||||
@@ -66,4 +87,3 @@ curl -X POST http://localhost:8000/playwright_exec \
|
||||
**Connection errors**: Make sure the server is running (`curl http://localhost:8000/status`).
|
||||
|
||||
**Playwright not found**: Install with `pip install playwright && playwright install --with-deps firefox`.
|
||||
|
||||
|
||||
@@ -19,18 +19,14 @@ import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Import BrowserTool directly from the file
|
||||
browser_tool_path = Path(__file__).parent.parent / "libs" / "python" / "agent" / "agent" / "tools" / "browser_tool.py"
|
||||
sys.path.insert(0, str(browser_tool_path.parent.parent.parent))
|
||||
# Add the libs path to sys.path
|
||||
libs_path = Path(__file__).parent.parent / "libs" / "python"
|
||||
sys.path.insert(0, str(libs_path))
|
||||
|
||||
# Import the module directly
|
||||
import importlib.util
|
||||
spec = importlib.util.spec_from_file_location("browser_tool", browser_tool_path)
|
||||
if spec is None or spec.loader is None:
|
||||
raise ImportError(f"Could not load browser_tool from {browser_tool_path}")
|
||||
browser_tool_module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(browser_tool_module)
|
||||
BrowserTool = browser_tool_module.BrowserTool
|
||||
from agent.tools.browser_tool import BrowserTool
|
||||
|
||||
# Import Computer interface and BrowserTool
|
||||
from computer import Computer
|
||||
|
||||
# Configure logging to see what's happening
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
@@ -39,58 +35,60 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
async def test_browser_tool():
    """Exercise the BrowserTool end to end through the Computer SDK interface.

    Builds a ``Computer`` and hands it to ``BrowserTool`` as its interface,
    then runs five browser commands in order: visit a URL, perform a web
    search, scroll, click, and type. Each result is logged at INFO level.

    Any exception raised while the commands run is logged with its traceback
    and is not re-raised.
    """
    # Initialize the computer interface
    # For local testing, use provider_type="docker"
    # For provider_type="cloud", provide name and api_key
    computer = Computer(provider_type="docker")

    # Initialize the browser tool with the computer interface
    browser = BrowserTool(interface=computer)

    logger.info("Testing Browser Tool...")

    try:
        # Test 1: Visit a URL
        logger.info("Test 1: Visiting a URL...")
        result = await browser.visit_url("https://www.trycua.com")
        logger.info(f"Visit URL result: {result}")

        # Wait a bit for the page to load
        await asyncio.sleep(2)

        # Test 2: Web search
        logger.info("Test 2: Performing a web search...")
        result = await browser.web_search("Python programming")
        logger.info(f"Web search result: {result}")

        # Wait a bit
        await asyncio.sleep(2)

        # Test 3: Scroll
        logger.info("Test 3: Scrolling the page...")
        result = await browser.scroll(delta_x=0, delta_y=500)
        logger.info(f"Scroll result: {result}")

        # Wait a bit
        await asyncio.sleep(1)

        # Test 4: Click (example coordinates - adjust based on your screen)
        logger.info("Test 4: Clicking at coordinates...")
        result = await browser.click(x=500, y=300)
        logger.info(f"Click result: {result}")

        # Wait a bit
        await asyncio.sleep(1)

        # Test 5: Type text (if there's a focused input field)
        logger.info("Test 5: Typing text...")
        result = await browser.type("Hello from BrowserTool!")
        logger.info(f"Type result: {result}")

        logger.info("All tests completed!")

    except Exception as e:
        # Top-level boundary of the example script: report the failure with
        # its traceback instead of crashing the run.
        logger.error(f"Error during testing: {e}", exc_info=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_browser_tool())
|
||||
|
||||
|
||||
@@ -4,54 +4,36 @@ Allows agents to control a browser programmatically via Playwright.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
import aiohttp
|
||||
if TYPE_CHECKING:
|
||||
from computer.interface import GenericComputerInterface
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BrowserTool:
|
||||
"""
|
||||
Browser tool that connects to the computer server's Playwright endpoint.
|
||||
Browser tool that uses the computer SDK's interface to control a browser.
|
||||
Implements the Fara/Magentic-One agent interface for browser control.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
base_url: str = "http://localhost:8000",
|
||||
api_key: Optional[str] = None,
|
||||
container_name: Optional[str] = None,
|
||||
interface: "GenericComputerInterface",
|
||||
):
|
||||
"""
|
||||
Initialize the BrowserTool.
|
||||
|
||||
Args:
|
||||
base_url: Base URL of the computer server (default: http://localhost:8000)
|
||||
api_key: Optional API key for cloud authentication
|
||||
container_name: Optional container name for cloud authentication
|
||||
interface: A GenericComputerInterface instance that provides playwright_exec
|
||||
"""
|
||||
self.base_url = base_url.rstrip("/")
|
||||
self.api_key = api_key
|
||||
self.container_name = container_name
|
||||
self.interface = interface
|
||||
self.logger = logger
|
||||
|
||||
def _get_endpoint_url(self) -> str:
|
||||
"""Get the full URL for the playwright_exec endpoint."""
|
||||
return f"{self.base_url}/playwright_exec"
|
||||
|
||||
def _get_headers(self) -> dict:
|
||||
"""Get headers for the HTTP request."""
|
||||
headers = {"Content-Type": "application/json"}
|
||||
if self.api_key:
|
||||
headers["X-API-Key"] = self.api_key
|
||||
if self.container_name:
|
||||
headers["X-Container-Name"] = self.container_name
|
||||
return headers
|
||||
|
||||
async def _execute_command(self, command: str, params: dict) -> dict:
|
||||
"""
|
||||
Execute a browser command via HTTP POST.
|
||||
Execute a browser command via the computer interface.
|
||||
|
||||
Args:
|
||||
command: Command name
|
||||
@@ -60,23 +42,15 @@ class BrowserTool:
|
||||
Returns:
|
||||
Response dictionary
|
||||
"""
|
||||
url = self._get_endpoint_url()
|
||||
payload = {"command": command, "params": params}
|
||||
headers = self._get_headers()
|
||||
|
||||
try:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(url, json=payload, headers=headers) as response:
|
||||
if response.status == 200:
|
||||
return await response.json()
|
||||
else:
|
||||
error_text = await response.text()
|
||||
self.logger.error(
|
||||
f"Browser command failed with status {response.status}: {error_text}"
|
||||
)
|
||||
return {"success": False, "error": error_text}
|
||||
result = await self.interface.playwright_exec(command, params)
|
||||
if not result.get("success"):
|
||||
self.logger.error(
|
||||
f"Browser command '{command}' failed: {result.get('error', 'Unknown error')}"
|
||||
)
|
||||
return result
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error executing browser command: {e}")
|
||||
self.logger.error(f"Error executing browser command '{command}': {e}")
|
||||
return {"success": False, "error": str(e)}
|
||||
|
||||
async def visit_url(self, url: str) -> dict:
|
||||
@@ -140,4 +114,3 @@ class BrowserTool:
|
||||
Response dictionary with success status and current URL
|
||||
"""
|
||||
return await self._execute_command("web_search", {"query": query})
|
||||
|
||||
|
||||
@@ -661,6 +661,56 @@ class GenericComputerInterface(BaseComputerInterface):
|
||||
|
||||
return screenshot_x, screenshot_y
|
||||
|
||||
# Playwright browser control
|
||||
async def playwright_exec(self, command: str, params: Optional[Dict] = None) -> Dict[str, Any]:
    """
    Send a Playwright browser command to the server's /playwright_exec endpoint.

    The request is a JSON POST of ``{"command": ..., "params": ...}``. When
    ``self.api_key`` is set the call goes over https on port 8443 and carries
    an ``X-API-Key`` header; otherwise it goes over http on port 8000. An
    ``X-Container-Name`` header is added whenever ``self.vm_name`` is set.

    Args:
        command: The browser command to execute (visit_url, click, type, scroll, web_search)
        params: Command parameters; an empty dict is sent when omitted

    Returns:
        The decoded JSON body when the server answers with status 200.
        Otherwise ``{"success": False, "error": <text>}``, where the text is
        the response body for a non-200 status or the exception message when
        the request itself fails.

    Examples:
        # Navigate to a URL
        await interface.playwright_exec("visit_url", {"url": "https://example.com"})

        # Click at coordinates
        await interface.playwright_exec("click", {"x": 100, "y": 200})

        # Type text
        await interface.playwright_exec("type", {"text": "Hello, world!"})

        # Scroll
        await interface.playwright_exec("scroll", {"delta_x": 0, "delta_y": -100})

        # Web search
        await interface.playwright_exec("web_search", {"query": "computer use agent"})
    """
    # An API key switches the whole request to the https/8443 variant.
    authenticated = bool(self.api_key)
    scheme, port = ("https", "8443") if authenticated else ("http", "8000")
    endpoint = f"{scheme}://{self.ip_address}:{port}/playwright_exec"

    body = {"command": command, "params": params if params else {}}

    request_headers = {"Content-Type": "application/json"}
    if authenticated:
        request_headers["X-API-Key"] = self.api_key
    if self.vm_name:
        request_headers["X-Container-Name"] = self.vm_name

    try:
        async with aiohttp.ClientSession() as session, session.post(
            endpoint, json=body, headers=request_headers
        ) as response:
            if response.status != 200:
                # Surface the server's own message as the error text.
                return {"success": False, "error": await response.text()}
            return await response.json()
    except Exception as exc:
        # Failures are reported in the result dict rather than raised.
        return {"success": False, "error": str(exc)}
|
||||
|
||||
# Websocket Methods
|
||||
async def _keep_alive(self):
|
||||
"""Keep the WebSocket connection alive with automatic reconnection."""
|
||||
|
||||
Reference in New Issue
Block a user