Merge pull request #340 from trycua/feat/agent-proxy

[Agent] Add Agent Proxy Server
This commit is contained in:
ddupont
2025-09-03 10:35:31 -04:00
committed by GitHub
17 changed files with 2159 additions and 10 deletions

View File

@@ -167,7 +167,7 @@ class BaseAutomationHandler(ABC):
pass
@abstractmethod
async def hotkey(self, *keys: str) -> Dict[str, Any]:
async def hotkey(self, keys: List[str]) -> Dict[str, Any]:
"""Press a combination of keys together."""
pass

View File

@@ -416,11 +416,11 @@ class WindowsAutomationHandler(BaseAutomationHandler):
except Exception as e:
return {"success": False, "error": str(e)}
async def hotkey(self, keys: str) -> Dict[str, Any]:
async def hotkey(self, keys: List[str]) -> Dict[str, Any]:
"""Press a combination of keys simultaneously.
Args:
keys (str): The keys to press together (e.g., 'ctrl+c', 'alt+tab').
keys (List[str]): The keys to press together (e.g., ['ctrl', 'c'], ['alt', 'tab']).
Returns:
Dict[str, Any]: A dictionary with success status and optional error message.

View File

@@ -1,6 +1,6 @@
from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request, HTTPException, Header
from fastapi.responses import StreamingResponse
from typing import List, Dict, Any, Optional
from fastapi.responses import StreamingResponse, JSONResponse
from typing import List, Dict, Any, Optional, Union, Literal, cast
import uvicorn
import logging
import asyncio
@@ -14,6 +14,14 @@ import os
import aiohttp
import hashlib
import time
import platform
from fastapi.middleware.cors import CORSMiddleware
try:
from agent import ComputerAgent
HAS_AGENT = True
except ImportError:
HAS_AGENT = False
# Set up logging with more detail
logger = logging.getLogger(__name__)
@@ -30,6 +38,16 @@ app = FastAPI(
websocket_max_size=WEBSOCKET_MAX_SIZE,
)
# CORS configuration
# Every origin, method and header is allowed, and credentialed requests are enabled.
# NOTE(review): a wildcard origin list combined with allow_credentials=True is very
# permissive — confirm this is intended for every deployment of this server.
origins = ["*"]
app.add_middleware(
CORSMiddleware,
allow_origins=origins,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
protocol_version = 1
try:
from importlib.metadata import version
@@ -197,6 +215,21 @@ class ConnectionManager:
manager = ConnectionManager()
auth_manager = AuthenticationManager()
@app.get("/status")
async def status():
    """Report that the server is up, which OS family it runs on, and optional features."""
    system_name = platform.system().lower()

    # Map the platform name onto one of the three OS labels; anything unknown is "linux"
    if "darwin" in system_name or system_name in ("macos", "mac"):
        detected_os = "macos"
    elif "windows" in system_name:
        detected_os = "windows"
    else:
        detected_os = "linux"

    # "agent" is advertised only when the ComputerAgent import succeeded at startup
    available = ["agent"] if HAS_AGENT else []
    return {"status": "ok", "os_type": detected_os, "features": available}
@app.websocket("/ws", name="websocket_endpoint")
async def websocket_endpoint(websocket: WebSocket):
@@ -331,7 +364,6 @@ async def websocket_endpoint(websocket: WebSocket):
pass
manager.disconnect(websocket)
@app.post("/cmd")
async def cmd_endpoint(
request: Request,
@@ -420,12 +452,255 @@ async def cmd_endpoint(
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Access-Control-Allow-Origin": "*",
"Access-Control-Allow-Methods": "POST, OPTIONS",
"Access-Control-Allow-Headers": "Content-Type, X-Container-Name, X-API-Key"
}
)
@app.post("/responses")
async def agent_response_endpoint(
    request: Request,
    api_key: Optional[str] = Header(None, alias="X-API-Key"),
):
    """
    Minimal proxy that runs a ComputerAgent against this machine.

    The agent loop is consumed until a result arrives that leaves no
    ``computer_call`` without a matching ``computer_call_output``; everything
    produced up to that point is returned in a single JSON response.

    Security:
    - If CONTAINER_NAME is set on the server, require X-API-Key
      and validate using AuthenticationManager unless CUA_ENABLE_PUBLIC_PROXY is true.

    Body JSON:
    {
        "model": "...",                # required
        "input": "... or messages[]",  # required
        "agent_kwargs": { ... },       # optional, passed directly to ComputerAgent
        "env": { ... }                 # optional env overrides for agent
    }

    Returns:
        JSONResponse with the keys "model", "error", "output", "usage" and
        "status" ("completed", or "failed" when the agent loop raised).

    Raises:
        HTTPException: 501 when ComputerAgent is not installed, 401 when
            authentication is required and fails, 400 when the body is not
            valid JSON or has the wrong shape.
    """
    if not HAS_AGENT:
        raise HTTPException(status_code=501, detail="ComputerAgent not available")

    # Authenticate via AuthenticationManager if running in cloud (CONTAINER_NAME set)
    container_name = os.environ.get("CONTAINER_NAME")
    if container_name:
        public_flag = os.environ.get("CUA_ENABLE_PUBLIC_PROXY", "").lower().strip()
        is_public = public_flag in ("1", "true", "yes", "y", "on")
        if not is_public:
            if not api_key:
                raise HTTPException(status_code=401, detail="Missing AGENT PROXY auth headers")
            if not await auth_manager.auth(container_name, api_key):
                raise HTTPException(status_code=401, detail="Unauthorized")

    # Parse request body
    try:
        body = await request.json()
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}") from e
    # Valid JSON that is not an object (list, string, number) has no .get()
    if not isinstance(body, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    model = body.get("model")
    input_data = body.get("input")
    if not model or input_data is None:
        raise HTTPException(status_code=400, detail="'model' and 'input' are required")
    if not isinstance(input_data, (str, list)):
        raise HTTPException(
            status_code=400,
            detail="'input' must be a string or a list of messages",
        )

    agent_kwargs: Dict[str, Any] = body.get("agent_kwargs") or {}
    env_overrides: Dict[str, str] = body.get("env") or {}
    if not isinstance(agent_kwargs, dict) or not isinstance(env_overrides, dict):
        raise HTTPException(
            status_code=400,
            detail="'agent_kwargs' and 'env' must be JSON objects",
        )

    # Simple env override context.
    # NOTE(review): os.environ is process-wide, so overrides are visible to any
    # other request running concurrently while this one awaits the agent.
    # NOTE(review): callers can set arbitrary variables here — confirm this is
    # acceptable for deployments where the endpoint is public.
    class _EnvOverride:
        def __init__(self, overrides: Dict[str, str]):
            self.overrides = overrides
            self._original: Dict[str, Optional[str]] = {}

        def __enter__(self):
            for k, v in (self.overrides or {}).items():
                self._original[k] = os.environ.get(k)
                os.environ[k] = str(v)

        def __exit__(self, exc_type, exc, tb):
            # Restore previous values; variables that did not exist are removed
            for k, old in self._original.items():
                if old is None:
                    os.environ.pop(k, None)
                else:
                    os.environ[k] = old

    # Convert input to messages: a bare string becomes a single user message
    messages: List[Dict[str, Any]] = (
        [{"role": "user", "content": input_data}]
        if isinstance(input_data, str)
        else input_data
    )

    # Define a direct computer tool that implements the AsyncComputerHandler protocol
    # and delegates to our existing automation/file/accessibility handlers.
    from agent.computers import AsyncComputerHandler  # runtime-checkable Protocol

    class DirectComputer(AsyncComputerHandler):
        def __init__(self):
            # use module-scope handler singletons created by HandlerFactory
            self._auto = automation_handler
            self._file = file_handler
            self._access = accessibility_handler

        async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
            system_name = platform.system().lower()
            if "darwin" in system_name or system_name in ("macos", "mac"):
                return "mac"
            if "windows" in system_name:
                return "windows"
            return "linux"

        async def get_dimensions(self) -> tuple[int, int]:
            size = await self._auto.get_screen_size()
            return size["width"], size["height"]

        async def screenshot(self) -> str:
            shot = await self._auto.screenshot()
            return shot["image_data"]

        async def click(self, x: int, y: int, button: str = "left") -> None:
            # Any button other than "right" is treated as a left click
            if button == "right":
                await self._auto.right_click(x, y)
            else:
                await self._auto.left_click(x, y)

        async def double_click(self, x: int, y: int) -> None:
            await self._auto.double_click(x, y)

        async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
            await self._auto.move_cursor(x, y)
            await self._auto.scroll(scroll_x, scroll_y)

        async def type(self, text: str) -> None:
            await self._auto.type_text(text)

        async def wait(self, ms: int = 1000) -> None:
            await asyncio.sleep(ms / 1000.0)

        async def move(self, x: int, y: int) -> None:
            await self._auto.move_cursor(x, y)

        async def keypress(self, keys: Union[List[str], str]) -> None:
            # A string such as "ctrl+c" or "ctrl-c" is split into its key names;
            # a single character (including "+" or "-") is kept as-is.
            if isinstance(keys, str):
                parts = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys]
            else:
                parts = list(keys)
            # Drop empty names so they are never forwarded to the handler
            parts = [p for p in parts if p]
            if not parts:
                return
            if len(parts) == 1:
                await self._auto.press_key(parts[0])
            else:
                await self._auto.hotkey(parts)

        async def drag(self, path: List[Dict[str, int]]) -> None:
            if not path:
                return
            start, end = path[0], path[-1]
            await self._auto.mouse_down(start["x"], start["y"])
            try:
                for pt in path[1:]:
                    await self._auto.move_cursor(pt["x"], pt["y"])
            finally:
                # Always release the button, even if a move fails midway
                await self._auto.mouse_up(end["x"], end["y"])

        async def get_current_url(self) -> str:
            # Not available in this server context
            return ""

        async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
            await self._auto.mouse_down(x, y, button="left")

        async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
            await self._auto.mouse_up(x, y, button="left")

    # NOTE: remote image URLs inside messages are forwarded unchanged; they are
    # not fetched or inlined as base64 by this endpoint.

    error = None
    with _EnvOverride(env_overrides):
        # Prepare tools: if caller did not pass tools, inject our DirectComputer
        tools = agent_kwargs.get("tools")
        if not tools:
            tools = [DirectComputer()]
        agent_kwargs = {**agent_kwargs, "tools": tools}

        # Instantiate agent with our tools
        agent = ComputerAgent(model=model, **agent_kwargs)  # type: ignore[arg-type]

        total_output: List[Any] = []
        total_usage: Dict[str, Any] = {}
        pending_computer_call_ids = set()

        try:
            async for result in agent.run(messages):
                output_items = result.get("output") or []
                total_output += output_items

                # Merge usage counters if present: numbers add up, other values overwrite
                usage = result.get("usage")
                if isinstance(usage, dict):
                    for k, v in usage.items():
                        if isinstance(v, (int, float)):
                            total_usage[k] = total_usage.get(k, 0) + v
                        else:
                            total_usage[k] = v

                # Track computer calls that have not yet received their output
                for msg in output_items:
                    msg_type = msg.get("type")
                    if msg_type == "computer_call":
                        pending_computer_call_ids.add(msg["call_id"])
                    elif msg_type == "computer_call_output":
                        pending_computer_call_ids.discard(msg["call_id"])

                # exit if no pending computer calls
                if not pending_computer_call_ids:
                    break
        except Exception as e:
            # logger.exception records the traceback along with the message
            logger.exception("Error running agent: %s", e)
            error = str(e)

    # Build response payload
    payload = {
        "model": model,
        "error": error,
        "output": total_output,
        "usage": total_usage,
        "status": "completed" if not error else "failed",
    }

    # Response headers (CORS headers are not set here)
    headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
    }
    return JSONResponse(content=payload, headers=headers)
# Script entry point: when run directly, serve `app` with uvicorn on all interfaces (0.0.0.0), port 8000.
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8000)

View File

@@ -0,0 +1,217 @@
# @trycua/agent
TypeScript SDK for CUA agent interaction. Connect to CUA agent proxies via HTTP/HTTPS or peer-to-peer (WebRTC) connections.
## Installation
```bash
npm install @trycua/agent
# or
pnpm add @trycua/agent
# or
yarn add @trycua/agent
```
## Usage
### Basic Usage
```typescript
import AgentClient from "@trycua/agent";
// Connect to local HTTP server
const client = new AgentClient("https://localhost:8000");
// Connect to a cloud container (port 8443 over HTTPS)
const cloud = new AgentClient(
"https://m-linux-96lcxd2c2k.containers.cloud.trycua.com:8443",
{ apiKey: process.env.NEXT_PUBLIC_CUA_API_KEY || "" }
);
// Connect to peer
const peerClient = new AgentClient("peer://my-agent-proxy");
// Send a simple text request
const response = await client.responses.create({
model: "anthropic/claude-3-5-sonnet-20241022",
input: "Write a one-sentence bedtime story about a unicorn.",
// Optional per-request env overrides
env: {
OPENAI_API_KEY: "sk-..."
}
});
console.log(response.output);
```
### Multi-modal Requests
```typescript
const response = await client.responses.create({
model: "anthropic/claude-3-5-sonnet-20241022",
input: [
{
role: "user",
content: [
{ type: "input_text", text: "What is in this image?" },
{
type: "input_image",
image_url: "https://example.com/image.jpg"
}
]
}
],
env: { OPENROUTER_API_KEY: "sk-..." }
});
```
### Advanced Configuration
```typescript
const client = new AgentClient("https://localhost:8000", {
timeout: 60000, // 60 second timeout
retries: 5, // 5 retry attempts
apiKey: "cua_...", // sent as X-API-Key header when using HTTP/HTTPS
});
const response = await client.responses.create({
model: "anthropic/claude-3-5-sonnet-20241022",
input: "Hello, world!",
agent_kwargs: {
save_trajectory: true,
verbosity: 20
},
computer_kwargs: {
os_type: "linux",
provider_type: "cloud"
},
// Per-request env overrides
env: {
ANTHROPIC_API_KEY: "sk-...",
OPENROUTER_API_KEY: "sk-..."
}
});
```
### Health Check
```typescript
const health = await client.health();
console.log(health.status); // 'healthy', 'unhealthy', 'unreachable', 'connected', 'disconnected'
```
### Cleanup
```typescript
// Clean up peer connections when done
await client.disconnect();
```
## API Reference
### AgentClient
#### Constructor
```typescript
new AgentClient(url: string, options?: AgentClientOptions)
```
- `url`: Connection URL. Supports `http://`, `https://`, or `peer://` protocols
- `options`: Optional configuration object
#### Methods
##### responses.create(request: AgentRequest): Promise<AgentResponse>
Send a request to the agent and get a response.
##### health(): Promise<{status: string}>
Check the health/connection status of the agent.
##### disconnect(): Promise<void>
Clean up resources and close connections.
### Types
#### AgentRequest
```typescript
interface AgentRequest {
model: string;
input: string | AgentMessage[];
agent_kwargs?: {
save_trajectory?: boolean;
verbosity?: number;
[key: string]: any;
};
computer_kwargs?: {
os_type?: string;
provider_type?: string;
[key: string]: any;
};
// Optional per-request environment overrides
env?: Record<string, string>;
}
```
#### AgentResponse
```typescript
interface AgentResponse {
output: AgentMessage[];
usage: Usage;
status: 'completed' | 'failed';
error?: string;
}
interface Usage {
prompt_tokens: number;
completion_tokens: number;
total_tokens: number;
response_cost: number;
}
```
The `output` array contains the conversation history including:
- User messages
- Agent reasoning/thinking
- Computer actions and their results
- Final agent responses
The `usage` object provides token counts and cost information for the request.
## Connection Types
### HTTP/HTTPS
Connect to a CUA agent proxy server:
```typescript
// Self-hosted server
const client = new AgentClient("https://my-agent-server.com:8000", { apiKey: "cua_..." });
// Cloud container (port 8443)
const cloud = new AgentClient(
"https://m-linux-96lcxd2c2k.containers.cloud.trycua.com:8443",
{ apiKey: "cua_..." }
);
```
Notes:
- The client sends the API key as `X-API-Key` for HTTP/HTTPS connections.
- Cloud containers listen on `:8443` with HTTPS.
### Peer-to-Peer (WebRTC)
Connect directly to another peer using WebRTC:
```typescript
const client = new AgentClient("peer://agent-proxy-peer-id");
```
The client uses PeerJS with default configuration for peer connections.
## License
MIT

View File

@@ -0,0 +1,70 @@
# CUA Agent Client Examples
This directory contains examples demonstrating how to use the `@trycua/agent` client library.
## Browser Example
### `playground-example.html`
A simple HTML page that demonstrates using the CUA Agent Client in a browser environment.
**Features:**
- Connect to HTTP/HTTPS or P2P (peer://) agent proxies
- Send text messages to any supported model
- View responses in real-time
- Health check functionality
- Clear, simple interface; the only external dependency is PeerJS, loaded from a CDN
**Usage:**
1. **Build the library first:**
```bash
cd ../
pnpm build
```
2. **Start a local web server** (required for ES modules):
```bash
# Option 1: Using Python
python -m http.server 8080
# Option 2: Using Node.js (if you have http-server installed)
npx http-server -p 8080
# Option 3: Using any other local server
```
3. **Open in browser:**
Navigate to `http://localhost:8080/examples/playground-example.html`
4. **Configure and test:**
- Enter an agent URL (e.g., `https://localhost:8000` or `peer://some-peer-id`)
- Enter a model name (e.g., `anthropic/claude-3-5-sonnet-20241022`)
- Type a message and click "Send Message" or press Enter
- View the response in the output textarea
**Supported URLs:**
- **HTTP/HTTPS**: `https://localhost:8000`, `http://my-agent-server.com:8080`
- **Peer-to-Peer**: `peer://computer-agent-proxy`, `peer://any-peer-id`
**Example Models:**
- `anthropic/claude-3-5-sonnet-20241022`
- `openai/gpt-4`
- `huggingface-local/microsoft/UI-TARS-7B`
**Note:** Make sure you have a CUA agent proxy server running at the specified URL before testing.
## Running Agent Proxy Server
To test the examples, you'll need a CUA agent proxy server running:
```bash
# HTTP server (default port 8000)
python -m agent.proxy.cli
# P2P server
python -m agent.proxy.cli --mode p2p
# Both HTTP and P2P
python -m agent.proxy.cli --mode both
```

View File

@@ -0,0 +1,146 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>CUA Agent Playground Example</title>
</head>
<body>
<h1>CUA Agent Playground Example</h1>
<div>
<h2>Configuration</h2>
<label for="url">Agent URL:</label><br>
<input type="text" id="url" placeholder="https://localhost:8000 or peer://peer-id" value="https://localhost:8000" style="width: 400px;"><br><br>
<label for="model">Model:</label><br>
<input type="text" id="model" placeholder="anthropic/claude-opus-4-1-20250805" value="anthropic/claude-opus-4-1-20250805" style="width: 400px;"><br><br>
</div>
<div>
<h2>Chat</h2>
<label for="message">Message:</label><br>
<input type="text" id="message" placeholder="Enter your message here..." style="width: 400px;"><br><br>
<button onclick="sendMessage()">Send Message</button>
<!-- <button onclick="checkHealth()">Check Health</button> -->
<button onclick="clearOutput()">Clear Output</button><br><br>
<label for="output">Output:</label><br>
<textarea id="output" rows="20" cols="80" readonly></textarea>
</div>
<script src="https://unpkg.com/peerjs@1.5.5/dist/peerjs.min.js"></script>
<script type="module">
// Import the AgentClient from the built library
import AgentClient from '/dist/index.js';
let client = null;
// Make functions available globally
window.sendMessage = sendMessage;
window.checkHealth = checkHealth;
window.clearOutput = clearOutput;
// Append a timestamped line to the output textarea and keep it scrolled to the newest entry.
function log(message) {
  const outputArea = document.getElementById('output');
  const stamp = new Date().toLocaleTimeString();
  outputArea.value = outputArea.value + `[${stamp}] ${message}\n`;
  outputArea.scrollTop = outputArea.scrollHeight;
}
// Return an AgentClient for the URL currently entered in the form.
// The client is cached and reused while the URL is unchanged; when the URL changes,
// the previous client is disconnected so its peer connection is not left open.
// Returns null (after logging the reason) when the URL is empty or creation fails.
function getClient() {
  const url = document.getElementById('url').value.trim();
  if (!url) {
    log('ERROR: Please enter a URL');
    return null;
  }

  // Reuse the cached client while the URL is unchanged
  if (client && client.url === url) {
    return client;
  }

  // Release the previous client's resources before replacing it
  if (client) {
    const previous = client;
    client = null;
    Promise.resolve()
      .then(() => previous.disconnect())
      .catch((error) => {
        log(`ERROR disconnecting previous client: ${error.message}`);
      });
  }

  try {
    client = new AgentClient(url);
    client.url = url; // Store URL for comparison
    log(`Created new client for: ${url}`);
  } catch (error) {
    log(`ERROR creating client: ${error.message}`);
    return null;
  }
  return client;
}
// Validate the form, send the message to the agent, and log the JSON response.
// The message field is cleared only after a successful round trip.
async function sendMessage() {
  const messageField = document.getElementById('message');
  const modelField = document.getElementById('model');
  const text = messageField.value.trim();
  const modelName = modelField.value.trim();

  if (!text) {
    log('ERROR: Please enter a message');
    return;
  }
  if (!modelName) {
    log('ERROR: Please enter a model');
    return;
  }

  const agentClient = getClient();
  if (!agentClient) {
    return;
  }

  try {
    log(`Sending message: "${text}"`);
    log(`Using model: ${modelName}`);
    const payload = { model: modelName, input: text };
    log('Sending request...');
    const reply = await agentClient.responses.create(payload);
    log('Response received:');
    log(JSON.stringify(reply, null, 2));
    // Clear the message input
    messageField.value = '';
  } catch (error) {
    log(`ERROR: ${error.message}`);
  }
}
// Ask the current client for its health status and log the result.
async function checkHealth() {
  const agentClient = getClient();
  if (!agentClient) {
    return;
  }

  try {
    log('Checking health...');
    const { status } = await agentClient.health();
    log(`Health status: ${status}`);
  } catch (error) {
    log(`ERROR checking health: ${error.message}`);
  }
}
// Empty the output textarea.
function clearOutput() {
  const outputArea = document.getElementById('output');
  outputArea.value = '';
}
// Allow sending message with Enter key.
// Uses `keydown` because the `keypress` event is deprecated; Enter presses that
// belong to an IME composition or come from key auto-repeat are ignored so a
// message is not sent prematurely or more than once.
document.getElementById('message').addEventListener('keydown', function (e) {
  if (e.key === 'Enter' && !e.isComposing && !e.repeat) {
    sendMessage();
  }
});
// Log initial message
log('CUA Agent Client Browser Example loaded');
log('Enter a URL (HTTP/HTTPS or peer://) and model, then send a message');
</script>
</body>
</html>

View File

@@ -0,0 +1,54 @@
{
"name": "@trycua/agent",
"version": "0.1.0",
"packageManager": "pnpm@10.11.0",
"description": "TypeScript SDK for CUA agent interaction",
"type": "module",
"license": "MIT",
"homepage": "https://github.com/trycua/cua/tree/main/libs/typescript/agent",
"bugs": {
"url": "https://github.com/trycua/cua/issues"
},
"repository": {
"type": "git",
"url": "git+https://github.com/trycua/cua.git"
},
"author": "cua",
"files": [
"dist"
],
"main": "./dist/index.js",
"module": "./dist/index.js",
"types": "./dist/index.d.ts",
"exports": {
".": "./dist/index.js",
"./package.json": "./package.json"
},
"publishConfig": {
"access": "public"
},
"scripts": {
"lint": "biome lint .",
"lint:fix": "biome lint --fix .",
"build": "tsdown",
"dev": "tsdown --watch",
"test": "vitest",
"typecheck": "tsc --noEmit",
"release": "bumpp && pnpm publish",
"prepublishOnly": "pnpm run build"
},
"dependencies": {
"@trycua/core": "^0.1.2",
"peerjs": "^1.5.4",
"pino": "^9.7.0"
},
"devDependencies": {
"@biomejs/biome": "^1.9.4",
"@types/node": "^22.15.17",
"bumpp": "^10.1.0",
"happy-dom": "^17.4.7",
"tsdown": "^0.14.1",
"typescript": "^5.7.2",
"vitest": "^2.1.8"
}
}

View File

@@ -0,0 +1,197 @@
import {Peer} from "peerjs";
import type {
AgentRequest,
AgentResponse,
ConnectionType,
AgentClientOptions,
} from "./types";
/**
 * Client for a CUA agent proxy, reachable over HTTP/HTTPS or a PeerJS connection.
 *
 * The transport is chosen from the URL scheme passed to the constructor
 * (`http://`, `https://` or `peer://`).
 */
export class AgentClient {
  private url: string;
  private connectionType: ConnectionType;
  private options: AgentClientOptions;
  private peer?: Peer;
  private connection?: any;
  /** In-flight peer connection attempt, shared by requests issued while it is pending. */
  private connecting?: Promise<any>;

  /**
   * @param url Connection URL starting with `http://`, `https://` or `peer://`.
   * @param options Optional settings; `timeout` defaults to 30000 and `retries` to 3.
   * @throws Error when the URL uses any other scheme.
   */
  constructor(url: string, options: AgentClientOptions = {}) {
    this.url = url;
    this.options = {
      timeout: 30000,
      retries: 3,
      ...options,
    };

    // Determine connection type from URL
    if (url.startsWith("http://") || url.startsWith("https://")) {
      this.connectionType = url.startsWith("https://") ? "https" : "http";
    } else if (url.startsWith("peer://")) {
      this.connectionType = "peer";
    } else {
      throw new Error(
        "Invalid URL format. Must start with http://, https://, or peer://"
      );
    }
  }

  // Main responses API matching the desired usage pattern
  public responses = {
    create: async (request: AgentRequest): Promise<AgentResponse> => {
      return this.sendRequest(request);
    },
  };

  /** Join the base URL and a path, tolerating trailing slashes on the base URL. */
  private endpoint(path: string): string {
    return `${this.url.replace(/\/+$/, "")}/${path}`;
  }

  /** Dispatch the request over the transport selected in the constructor. */
  private async sendRequest(request: AgentRequest): Promise<AgentResponse> {
    switch (this.connectionType) {
      case "http":
      case "https":
        return this.sendHttpRequest(request);
      case "peer":
        return this.sendPeerRequest(request);
      default:
        throw new Error(`Unsupported connection type: ${this.connectionType}`);
    }
  }

  /**
   * POST the request as JSON to `<url>/responses`.
   * The request is aborted when `options.timeout` elapses; the timer now also
   * covers reading the response body and is always cleared afterwards.
   *
   * @throws Error prefixed with "Failed to send HTTP request:" on network
   *   failure, abort, a non-2xx status or an unparsable body.
   */
  private async sendHttpRequest(request: AgentRequest): Promise<AgentResponse> {
    const controller = new AbortController();
    const timeoutId = setTimeout(
      () => controller.abort(),
      this.options.timeout
    );

    try {
      const headers: Record<string, string> = {
        "Content-Type": "application/json",
      };
      if (this.options.apiKey) {
        headers["X-API-Key"] = this.options.apiKey;
      }

      const response = await fetch(this.endpoint("responses"), {
        method: "POST",
        headers,
        body: JSON.stringify(request),
        signal: controller.signal,
      });

      if (!response.ok) {
        throw new Error(`HTTP error! status: ${response.status}`);
      }

      return (await response.json()) as AgentResponse;
    } catch (error) {
      if (error instanceof Error) {
        throw new Error(`Failed to send HTTP request: ${error.message}`);
      }
      throw error;
    } finally {
      clearTimeout(timeoutId);
    }
  }

  /** Close the data connection and destroy the peer, if present. */
  private resetPeer(): void {
    if (this.connection) {
      this.connection.close();
      this.connection = undefined;
    }
    if (this.peer) {
      this.peer.destroy();
      this.peer = undefined;
    }
  }

  /**
   * Return an open data connection to the target peer, creating one if needed.
   *
   * A failed or timed-out attempt tears down the half-initialised peer so the
   * next request starts from scratch instead of failing forever.
   */
  private ensurePeerConnection(): Promise<any> {
    if (this.connection?.open) {
      return Promise.resolve(this.connection);
    }
    if (this.connecting) {
      return this.connecting;
    }

    // Drop any stale peer/connection left over from a closed session
    this.resetPeer();

    // Extract peer ID from peer:// URL
    const peerId = this.url.slice("peer://".length);
    // Initialize peer connection with default options
    const peer = new Peer();
    this.peer = peer;

    const attempt = new Promise<any>((resolve, reject) => {
      let settled = false;

      const fail = (error: Error) => {
        if (settled) return;
        settled = true;
        clearTimeout(timeout);
        // Only tear down the peer this attempt created
        if (this.peer === peer) {
          this.resetPeer();
        }
        reject(error);
      };

      const timeout = setTimeout(
        () => fail(new Error("Peer connection timeout")),
        this.options.timeout
      );

      peer.on("error", (error: any) => {
        fail(new Error(`Peer error: ${error}`));
      });

      peer.on("open", () => {
        if (settled) return;
        // Connect to the target peer
        const connection = peer.connect(peerId);
        this.connection = connection;

        connection.on("error", (error: any) => {
          fail(new Error(`Peer connection error: ${error}`));
        });
        connection.on("open", () => {
          if (settled) return;
          settled = true;
          clearTimeout(timeout);
          resolve(connection);
        });
      });
    });

    this.connecting = attempt;
    const clearPending = () => {
      if (this.connecting === attempt) {
        this.connecting = undefined;
      }
    };
    attempt.then(clearPending, clearPending);
    return attempt;
  }

  /**
   * Send the request over the peer data connection and resolve with the first
   * message received afterwards.
   *
   * Responses carry no correlation id, so overlapping requests on the same
   * client cannot be told apart; issue them one at a time.
   *
   * @throws Error on connection failure, timeout, connection error/close, or
   *   an unparsable response.
   */
  private async sendPeerRequest(request: AgentRequest): Promise<AgentResponse> {
    const connection = await this.ensurePeerConnection();

    return new Promise<AgentResponse>((resolve, reject) => {
      // Remove the timer and every listener this request installed
      const cleanup = () => {
        clearTimeout(timeout);
        connection.off("data", handleData);
        connection.off("error", handleError);
        connection.off("close", handleClose);
      };

      const handleData = (data: any) => {
        cleanup();
        try {
          const response = typeof data === "string" ? JSON.parse(data) : data;
          resolve(response as AgentResponse);
        } catch {
          reject(new Error("Failed to parse peer response"));
        }
      };
      const handleError = (error: any) => {
        cleanup();
        reject(new Error(`Peer connection error: ${error}`));
      };
      const handleClose = () => {
        cleanup();
        reject(new Error("Peer connection not available"));
      };

      const timeout = setTimeout(() => {
        cleanup();
        reject(new Error("Peer request timeout"));
      }, this.options.timeout);

      connection.on("data", handleData);
      connection.on("error", handleError);
      connection.on("close", handleClose);

      // Send the request
      connection.send(JSON.stringify(request));
    });
  }

  /**
   * Health check.
   *
   * For peer URLs this reports whether the local peer is open ("connected" or
   * "disconnected"). For HTTP(S) it issues a GET to `<url>/health` and reports
   * "healthy", "unhealthy" (non-2xx) or "unreachable" (request failed).
   */
  async health(): Promise<{ status: string }> {
    if (this.connectionType === "peer") {
      return { status: this.peer?.open ? "connected" : "disconnected" };
    }

    try {
      const response = await fetch(this.endpoint("health"));
      if (response.ok) {
        return { status: "healthy" };
      }
      return { status: "unhealthy" };
    } catch {
      return { status: "unreachable" };
    }
  }

  /** Clean up resources: close the data connection and destroy the peer. */
  async disconnect(): Promise<void> {
    this.resetPeer();
  }
}

View File

@@ -0,0 +1,29 @@
// Export the main AgentClient class as default
export { AgentClient as default } from './client.js';

// Also export as named export for flexibility
export { AgentClient } from './client.js';

// Export types for TypeScript users.
// Every type exported by ./types is re-exported so that each member of the
// AgentMessage and ComputerAction unions can be imported by name.
export type {
  AgentRequest,
  AgentResponse,
  AgentMessage,
  UserMessage,
  AssistantMessage,
  ReasoningMessage,
  ComputerCallMessage,
  ComputerCallOutputMessage,
  FunctionCallMessage,
  FunctionCallOutputMessage,
  OutputContent,
  SummaryContent,
  InputContent,
  ComputerResultContent,
  ComputerAction,
  ComputerActionOpenAI,
  ComputerActionAnthropic,
  ClickAction,
  DoubleClickAction,
  DragAction,
  TypeAction,
  KeyPressAction,
  MoveAction,
  ScreenshotAction,
  ScrollAction,
  WaitAction,
  LeftMouseDownAction,
  LeftMouseUpAction,
  Usage,
  ConnectionType,
  AgentClientOptions,
} from './types';

View File

@@ -0,0 +1,203 @@
// #region Request
export type ConnectionType = 'http' | 'https' | 'peer';
/** Optional settings accepted by the AgentClient constructor. */
export interface AgentClientOptions {
/** Request timeout in milliseconds (AgentClient defaults this to 30000). */
timeout?: number;
/**
 * Retry count (AgentClient defaults this to 3).
 * NOTE(review): no retry loop is visible in client.ts — confirm where this is consumed.
 */
retries?: number;
/** Optional CUA API key to send as X-API-Key header for HTTP requests */
apiKey?: string;
}
// Request types matching the Python proxy API
/** Body sent to the agent proxy by `responses.create`. */
export interface AgentRequest {
/** Model identifier, e.g. "anthropic/claude-3-5-sonnet-20241022". */
model: string;
/** Either a plain text prompt or a list of messages. */
input: string | AgentMessage[];
/** Extra keyword arguments for the agent; unknown keys are allowed. */
agent_kwargs?: {
save_trajectory?: boolean;
verbosity?: number;
[key: string]: any;
};
/** Extra keyword arguments describing the computer; unknown keys are allowed. */
computer_kwargs?: {
os_type?: string;
provider_type?: string;
[key: string]: any;
};
/**
* Optional per-request environment variable overrides.
* Keys and values are strings and will be forwarded to the backend proxy.
*/
env?: Record<string, string>;
}
// #endregion
// #region Response
// Response types
/** Result returned by the agent proxy for one request. */
export interface AgentResponse {
/** Messages produced while handling the request. */
output: AgentMessage[];
/** Token and cost accounting for the request. */
usage: Usage;
/** 'failed' is accompanied by `error`. */
status: 'completed' | 'failed';
/** Error description when the run failed. */
error?: string;
}
// Usage information
/** Token counts and cost for a request. */
export interface Usage {
prompt_tokens: number;
completion_tokens: number;
total_tokens: number;
/** Cost of the request — NOTE(review): currency/unit is not stated here; confirm. */
response_cost: number;
}
// #endregion
// #region Messages
// Agent message types - can be one of several different message types
// Except for UserMessage (whose `type` is optional), members are discriminated by `type`.
export type AgentMessage =
| UserMessage
| AssistantMessage
| ReasoningMessage
| ComputerCallMessage
| ComputerCallOutputMessage
| FunctionCallMessage
| FunctionCallOutputMessage;
// Input message
export interface UserMessage {
type?: 'message';
role: 'user' | 'system' | 'developer';
// Plain text, or a list of text/image parts
content: string | InputContent[];
}
// Output message
export interface AssistantMessage {
type: 'message';
role: 'assistant';
content: OutputContent[];
}
// Output reasoning/thinking message
export interface ReasoningMessage {
type: 'reasoning';
summary: SummaryContent[];
}
// Output computer action call
export interface ComputerCallMessage {
type: 'computer_call';
// Shared with the ComputerCallOutputMessage that answers this call
call_id: string;
status: 'completed' | 'failed' | 'pending';
action: ComputerAction;
}
// Output computer action result (always a screenshot)
export interface ComputerCallOutputMessage {
type: 'computer_call_output';
// Matches the call_id of the originating ComputerCallMessage
call_id: string;
output: ComputerResultContent;
}
// Output function call
export interface FunctionCallMessage {
type: 'function_call';
call_id: string;
status: 'completed' | 'failed' | 'pending';
name: string;
arguments: string; // JSON dict of kwargs
}
// Output function call result (always text)
export interface FunctionCallOutputMessage {
type: 'function_call_output';
call_id: string;
output: string;
}
// #endregion
// #region Message Content
// One part of a user message: text or an image reference.
// NOTE(review): presumably `text` accompanies 'input_text' and `image_url` accompanies 'input_image' — the type does not enforce this.
export interface InputContent {
type: 'input_image' | 'input_text';
text?: string;
image_url?: string;
}
// Text part of an assistant message
export interface OutputContent {
type: 'output_text';
text: string;
}
// Text part of a reasoning summary
export interface SummaryContent {
type: 'summary_text';
text: string;
}
// Payload of a computer call result
export interface ComputerResultContent {
type: 'computer_screenshot' | 'input_image';
image_url: string;
}
// #endregion
// #region Actions
// Any action a computer call can carry; members are discriminated by `type`.
export type ComputerAction =
| ComputerActionOpenAI
| ComputerActionAnthropic;
// OpenAI Computer Actions
export type ComputerActionOpenAI =
| ClickAction
| DoubleClickAction
| DragAction
| KeyPressAction
| MoveAction
| ScreenshotAction
| ScrollAction
| TypeAction
| WaitAction;
// Single click with the given button at (x, y)
export interface ClickAction {
type: 'click';
button: 'left' | 'right' | 'wheel' | 'back' | 'forward';
x: number;
y: number;
}
// Double click at (x, y); the button is optional
export interface DoubleClickAction {
type: 'double_click';
button?: 'left' | 'right' | 'wheel' | 'back' | 'forward';
x: number;
y: number;
}
// Drag along a path of points
// NOTE(review): points are typed as [x, y] tuples here, while the Python
// DirectComputer.drag handler added in the same change reads them as {x, y}
// objects — confirm which shape is actually sent over the wire.
export interface DragAction {
type: 'drag';
button?: 'left' | 'right' | 'wheel' | 'back' | 'forward';
path: Array<[number, number]>;
}
// Press one or more keys
export interface KeyPressAction {
type: 'keypress';
keys: string[];
}
// Move the cursor to (x, y)
export interface MoveAction {
type: 'move';
x: number;
y: number;
}
// Request a screenshot; carries no parameters
export interface ScreenshotAction {
type: 'screenshot';
}
// Scroll by (scroll_x, scroll_y) at position (x, y)
export interface ScrollAction {
type: 'scroll';
scroll_x: number;
scroll_y: number;
x: number;
y: number;
}
// Type the given text
export interface TypeAction {
type: 'type';
text: string;
}
// Wait; carries no parameters
export interface WaitAction {
type: 'wait';
}
// Anthropic Computer Actions
export type ComputerActionAnthropic =
| LeftMouseDownAction
| LeftMouseUpAction;
// Press the left mouse button at (x, y)
export interface LeftMouseDownAction {
type: 'left_mouse_down';
x: number;
y: number;
}
// Release the left mouse button at (x, y)
export interface LeftMouseUpAction {
type: 'left_mouse_up';
x: number;
y: number;
}
// #endregion

View File

@@ -0,0 +1,34 @@
import { describe, it, expect } from 'vitest';
import AgentClient from '../src/index.js';
// Smoke tests for AgentClient: they only construct clients and inspect their
// shape, so no request is sent and no agent proxy needs to be running.
describe('AgentClient', () => {
// An http(s):// URL is accepted and exposes responses.create
it('should create client with HTTP URL', () => {
const client = new AgentClient('https://localhost:8000');
expect(client).toBeDefined();
expect(client.responses).toBeDefined();
expect(typeof client.responses.create).toBe('function');
});
// A peer:// URL is accepted and exposes responses.create
it('should create client with peer URL', () => {
const client = new AgentClient('peer://test-peer-id');
expect(client).toBeDefined();
expect(client.responses).toBeDefined();
expect(typeof client.responses.create).toBe('function');
});
// Any other scheme is rejected by the constructor
it('should throw error for invalid URL', () => {
expect(() => {
new AgentClient('invalid://url');
}).toThrow('Invalid URL format');
});
// Only checks that the method exists; health() is not called
it('should have health method', async () => {
const client = new AgentClient('https://localhost:8000');
expect(typeof client.health).toBe('function');
});
// Only checks that the method exists; disconnect() is not called
it('should have disconnect method', async () => {
const client = new AgentClient('https://localhost:8000');
expect(typeof client.disconnect).toBe('function');
});
});

View File

@@ -0,0 +1,28 @@
{
"compilerOptions": {
"target": "esnext",
"lib": [
"es2023"
],
"moduleDetection": "force",
"module": "preserve",
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"resolveJsonModule": true,
"types": [
"node"
],
"allowSyntheticDefaultImports": true,
"strict": true,
"noUnusedLocals": true,
"declaration": true,
"emitDeclarationOnly": true,
"esModuleInterop": true,
"isolatedModules": true,
"verbatimModuleSyntax": true,
"skipLibCheck": true
},
"include": [
"src"
]
}

View File

@@ -0,0 +1,12 @@
import { defineConfig } from "tsdown";
// Build configuration: src/index.ts is the single entry point, built in the
// "module" format for the browser platform with `dts` and `clean` enabled.
export default defineConfig({
entry: ["src/index.ts"],
format: ["module"],
platform: "browser",
dts: true,
clean: true,
// Remove if we don't need to support including the library via '<script/>' tags.
// noExternal bundles this list of libraries within the final 'dist'
noExternal: ['peerjs']
});

View File

@@ -0,0 +1,7 @@
import { defineConfig } from 'vitest/config'
// Test configuration: run the suite in the happy-dom environment.
export default defineConfig({
test: {
environment: 'happy-dom',
},
})

View File

@@ -23,7 +23,10 @@
"pnpm": {
"onlyBuiltDependencies": [
"@biomejs/biome",
"esbuild"
"esbuild",
"protobufjs",
"sharp",
"unrs-resolver"
]
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,3 +1,4 @@
packages:
- "computer"
- "core"
- "agent"