diff --git a/libs/python/computer-server/computer_server/handlers/base.py b/libs/python/computer-server/computer_server/handlers/base.py index 012a296c..ac629832 100644 --- a/libs/python/computer-server/computer_server/handlers/base.py +++ b/libs/python/computer-server/computer_server/handlers/base.py @@ -167,7 +167,7 @@ class BaseAutomationHandler(ABC): pass @abstractmethod - async def hotkey(self, *keys: str) -> Dict[str, Any]: + async def hotkey(self, keys: List[str]) -> Dict[str, Any]: """Press a combination of keys together.""" pass diff --git a/libs/python/computer-server/computer_server/handlers/windows.py b/libs/python/computer-server/computer_server/handlers/windows.py index eeb0d6bf..9572cd85 100644 --- a/libs/python/computer-server/computer_server/handlers/windows.py +++ b/libs/python/computer-server/computer_server/handlers/windows.py @@ -416,11 +416,11 @@ class WindowsAutomationHandler(BaseAutomationHandler): except Exception as e: return {"success": False, "error": str(e)} - async def hotkey(self, keys: str) -> Dict[str, Any]: + async def hotkey(self, keys: List[str]) -> Dict[str, Any]: """Press a combination of keys simultaneously. Args: - keys (str): The keys to press together (e.g., 'ctrl+c', 'alt+tab'). + keys (List[str]): The keys to press together (e.g., ['ctrl', 'c'], ['alt', 'tab']). Returns: Dict[str, Any]: A dictionary with success status and optional error message. 
diff --git a/libs/python/computer-server/computer_server/main.py b/libs/python/computer-server/computer_server/main.py index ab1a9f30..ad0b0ede 100644 --- a/libs/python/computer-server/computer_server/main.py +++ b/libs/python/computer-server/computer_server/main.py @@ -1,6 +1,6 @@ from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request, HTTPException, Header -from fastapi.responses import StreamingResponse -from typing import List, Dict, Any, Optional +from fastapi.responses import StreamingResponse, JSONResponse +from typing import List, Dict, Any, Optional, Union, Literal, cast import uvicorn import logging import asyncio @@ -14,6 +14,14 @@ import os import aiohttp import hashlib import time +import platform +from fastapi.middleware.cors import CORSMiddleware + +try: + from agent import ComputerAgent + HAS_AGENT = True +except ImportError: + HAS_AGENT = False # Set up logging with more detail logger = logging.getLogger(__name__) @@ -30,6 +38,16 @@ app = FastAPI( websocket_max_size=WEBSOCKET_MAX_SIZE, ) +# CORS configuration +origins = ["*"] +app.add_middleware( + CORSMiddleware, + allow_origins=origins, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + protocol_version = 1 try: from importlib.metadata import version @@ -197,6 +215,21 @@ class ConnectionManager: manager = ConnectionManager() auth_manager = AuthenticationManager() +@app.get("/status") +async def status(): + sys = platform.system().lower() + # get os type + if "darwin" in sys or sys == "macos" or sys == "mac": + os_type = "macos" + elif "windows" in sys: + os_type = "windows" + else: + os_type = "linux" + # get computer-server features + features = [] + if HAS_AGENT: + features.append("agent") + return {"status": "ok", "os_type": os_type, "features": features} @app.websocket("/ws", name="websocket_endpoint") async def websocket_endpoint(websocket: WebSocket): @@ -331,7 +364,6 @@ async def websocket_endpoint(websocket: WebSocket): pass 
manager.disconnect(websocket) - @app.post("/cmd") async def cmd_endpoint( request: Request, @@ -420,12 +452,255 @@ async def cmd_endpoint( headers={ "Cache-Control": "no-cache", "Connection": "keep-alive", - "Access-Control-Allow-Origin": "*", - "Access-Control-Allow-Methods": "POST, OPTIONS", - "Access-Control-Allow-Headers": "Content-Type, X-Container-Name, X-API-Key" } ) +@app.post("/responses") +async def agent_response_endpoint( + request: Request, + api_key: Optional[str] = Header(None, alias="X-API-Key"), +): + """ + Minimal proxy to run ComputerAgent for up to 2 turns. + + Security: + - If CONTAINER_NAME is set on the server, require X-API-Key + and validate using AuthenticationManager unless CUA_ENABLE_PUBLIC_PROXY is true. + + Body JSON: + { + "model": "...", # required + "input": "... or messages[]", # required + "agent_kwargs": { ... }, # optional, passed directly to ComputerAgent + "env": { ... } # optional env overrides for agent + } + """ + if not HAS_AGENT: + raise HTTPException(status_code=501, detail="ComputerAgent not available") + + # Authenticate via AuthenticationManager if running in cloud (CONTAINER_NAME set) + container_name = os.environ.get("CONTAINER_NAME") + if container_name: + is_public = os.environ.get("CUA_ENABLE_PUBLIC_PROXY", "").lower().strip() in ["1", "true", "yes", "y", "on"] + if not is_public: + if not api_key: + raise HTTPException(status_code=401, detail="Missing AGENT PROXY auth headers") + ok = await auth_manager.auth(container_name, api_key) + if not ok: + raise HTTPException(status_code=401, detail="Unauthorized") + + # Parse request body + try: + body = await request.json() + except Exception as e: + raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}") + + model = body.get("model") + input_data = body.get("input") + if not model or input_data is None: + raise HTTPException(status_code=400, detail="'model' and 'input' are required") + + agent_kwargs: Dict[str, Any] = body.get("agent_kwargs") or {} 
+ env_overrides: Dict[str, str] = body.get("env") or {} + + # Simple env override context + class _EnvOverride: + def __init__(self, overrides: Dict[str, str]): + self.overrides = overrides + self._original: Dict[str, Optional[str]] = {} + def __enter__(self): + for k, v in (self.overrides or {}).items(): + self._original[k] = os.environ.get(k) + os.environ[k] = str(v) + def __exit__(self, exc_type, exc, tb): + for k, old in self._original.items(): + if old is None: + os.environ.pop(k, None) + else: + os.environ[k] = old + + # Convert input to messages + def _to_messages(data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]: + if isinstance(data, str): + return [{"role": "user", "content": data}] + if isinstance(data, list): + return data + + messages = _to_messages(input_data) + + # Define a direct computer tool that implements the AsyncComputerHandler protocol + # and delegates to our existing automation/file/accessibility handlers. + from agent.computers import AsyncComputerHandler # runtime-checkable Protocol + + class DirectComputer(AsyncComputerHandler): + def __init__(self): + # use module-scope handler singletons created by HandlerFactory + self._auto = automation_handler + self._file = file_handler + self._access = accessibility_handler + + async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]: + sys = platform.system().lower() + if "darwin" in sys or sys in ("macos", "mac"): + return "mac" + if "windows" in sys: + return "windows" + return "linux" + + async def get_dimensions(self) -> tuple[int, int]: + size = await self._auto.get_screen_size() + return size["width"], size["height"] + + async def screenshot(self) -> str: + img_b64 = await self._auto.screenshot() + return img_b64["image_data"] + + async def click(self, x: int, y: int, button: str = "left") -> None: + if button == "left": + await self._auto.left_click(x, y) + elif button == "right": + await self._auto.right_click(x, y) + else: + await 
self._auto.left_click(x, y) + + async def double_click(self, x: int, y: int) -> None: + await self._auto.double_click(x, y) + + async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: + await self._auto.move_cursor(x, y) + await self._auto.scroll(scroll_x, scroll_y) + + async def type(self, text: str) -> None: + await self._auto.type_text(text) + + async def wait(self, ms: int = 1000) -> None: + await asyncio.sleep(ms / 1000.0) + + async def move(self, x: int, y: int) -> None: + await self._auto.move_cursor(x, y) + + async def keypress(self, keys: Union[List[str], str]) -> None: + if isinstance(keys, str): + parts = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys] + else: + parts = keys + if len(parts) == 1: + await self._auto.press_key(parts[0]) + else: + await self._auto.hotkey(parts) + + async def drag(self, path: List[Dict[str, int]]) -> None: + if not path: + return + start = path[0] + await self._auto.mouse_down(start["x"], start["y"]) + for pt in path[1:]: + await self._auto.move_cursor(pt["x"], pt["y"]) + end = path[-1] + await self._auto.mouse_up(end["x"], end["y"]) + + async def get_current_url(self) -> str: + # Not available in this server context + return "" + + async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None: + await self._auto.mouse_down(x, y, button="left") + + async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None: + await self._auto.mouse_up(x, y, button="left") + + # # Inline image URLs to base64 + # import base64, mimetypes, requests + # # Use a browser-like User-Agent to avoid 403s from some CDNs (e.g., Wikimedia) + # HEADERS = { + # "User-Agent": ( + # "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + # "AppleWebKit/537.36 (KHTML, like Gecko) " + # "Chrome/124.0.0.0 Safari/537.36" + # ) + # } + # def _to_data_url(content_bytes: bytes, url: str, resp: requests.Response) -> str: + # ctype = resp.headers.get("Content-Type") or 
mimetypes.guess_type(url)[0] or "application/octet-stream" + # b64 = base64.b64encode(content_bytes).decode("utf-8") + # return f"data:{ctype};base64,{b64}" + # def inline_image_urls(messages): + # # messages: List[{"role": "...","content":[...]}] + # out = [] + # for m in messages: + # if not isinstance(m.get("content"), list): + # out.append(m) + # continue + # new_content = [] + # for part in (m.get("content") or []): + # if part.get("type") == "input_image" and (url := part.get("image_url")): + # resp = requests.get(url, headers=HEADERS, timeout=30) + # resp.raise_for_status() + # new_content.append({ + # "type": "input_image", + # "image_url": _to_data_url(resp.content, url, resp) + # }) + # else: + # new_content.append(part) + # out.append({**m, "content": new_content}) + # return out + # messages = inline_image_urls(messages) + + error = None + + with _EnvOverride(env_overrides): + # Prepare tools: if caller did not pass tools, inject our DirectComputer + tools = agent_kwargs.get("tools") + if not tools: + tools = [DirectComputer()] + agent_kwargs = {**agent_kwargs, "tools": tools} + # Instantiate agent with our tools + agent = ComputerAgent(model=model, **agent_kwargs) # type: ignore[arg-type] + + total_output: List[Any] = [] + total_usage: Dict[str, Any] = {} + + pending_computer_call_ids = set() + try: + async for result in agent.run(messages): + total_output += result["output"] + # Try to collect usage if present + if isinstance(result, dict) and "usage" in result and isinstance(result["usage"], dict): + # Merge usage counters + for k, v in result["usage"].items(): + if isinstance(v, (int, float)): + total_usage[k] = total_usage.get(k, 0) + v + else: + total_usage[k] = v + for msg in result.get("output", []): + if msg.get("type") == "computer_call": + pending_computer_call_ids.add(msg["call_id"]) + elif msg.get("type") == "computer_call_output": + pending_computer_call_ids.discard(msg["call_id"]) + # exit if no pending computer calls + if not 
pending_computer_call_ids: + break + except Exception as e: + logger.error(f"Error running agent: {str(e)}") + logger.error(traceback.format_exc()) + error = str(e) + + # Build response payload + payload = { + "model": model, + "error": error, + "output": total_output, + "usage": total_usage, + "status": "completed" if not error else "failed" + } + + # CORS: allow any origin + headers = { + "Cache-Control": "no-cache", + "Connection": "keep-alive", + } + + return JSONResponse(content=payload, headers=headers) + if __name__ == "__main__": uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/libs/typescript/agent/README.md b/libs/typescript/agent/README.md new file mode 100644 index 00000000..76db192e --- /dev/null +++ b/libs/typescript/agent/README.md @@ -0,0 +1,217 @@ +# @trycua/agent + +TypeScript SDK for CUA agent interaction. Connect to CUA agent proxies via HTTP/HTTPS or peer-to-peer (WebRTC) connections. + +## Installation + +```bash +npm install @trycua/agent +# or +pnpm add @trycua/agent +# or +yarn add @trycua/agent +``` + +## Usage + +### Basic Usage + +```typescript +import AgentClient from "@trycua/agent"; + +// Connect to local HTTP server +const client = new AgentClient("https://localhost:8000"); + +// Connect to a cloud container (port 8443 over HTTPS) +const cloud = new AgentClient( + "https://m-linux-96lcxd2c2k.containers.cloud.trycua.com:8443", + { apiKey: process.env.NEXT_PUBLIC_CUA_API_KEY || "" } +); + +// Connect to peer +const peerClient = new AgentClient("peer://my-agent-proxy"); + +// Send a simple text request +const response = await client.responses.create({ + model: "anthropic/claude-3-5-sonnet-20241022", + input: "Write a one-sentence bedtime story about a unicorn.", + // Optional per-request env overrides + env: { + OPENAI_API_KEY: "sk-..." 
+ } +}); + +console.log(response.output); +``` + +### Multi-modal Requests + +```typescript +const response = await client.responses.create({ + model: "anthropic/claude-3-5-sonnet-20241022", + input: [ + { + role: "user", + content: [ + { type: "input_text", text: "What is in this image?" }, + { + type: "input_image", + image_url: "https://example.com/image.jpg" + } + ] + } + ], + env: { OPENROUTER_API_KEY: "sk-..." } +}); +``` + +### Advanced Configuration + +```typescript +const client = new AgentClient("https://localhost:8000", { + timeout: 60000, // 60 second timeout + retries: 5, // 5 retry attempts + apiKey: "cua_...", // sent as X-API-Key header when using HTTP/HTTPS +}); + +const response = await client.responses.create({ + model: "anthropic/claude-3-5-sonnet-20241022", + input: "Hello, world!", + agent_kwargs: { + save_trajectory: true, + verbosity: 20 + }, + computer_kwargs: { + os_type: "linux", + provider_type: "cloud" + }, + // Per-request env overrides + env: { + ANTHROPIC_API_KEY: "sk-...", + OPENROUTER_API_KEY: "sk-..." + } +}); +``` + +### Health Check + +```typescript +const health = await client.health(); +console.log(health.status); // 'healthy', 'unhealthy', 'unreachable', 'connected', 'disconnected' +``` + +### Cleanup + +```typescript +// Clean up peer connections when done +await client.disconnect(); +``` + +## API Reference + +### AgentClient + +#### Constructor + +```typescript +new AgentClient(url: string, options?: AgentClientOptions) +``` + +- `url`: Connection URL. Supports `http://`, `https://`, or `peer://` protocols +- `options`: Optional configuration object + +#### Methods + +##### responses.create(request: AgentRequest): Promise<AgentResponse> + +Send a request to the agent and get a response. + +##### health(): Promise<{status: string}> + +Check the health/connection status of the agent. + +##### disconnect(): Promise<void> + +Clean up resources and close connections. 
+ +### Types + +#### AgentRequest + +```typescript +interface AgentRequest { + model: string; + input: string | AgentMessage[]; + agent_kwargs?: { + save_trajectory?: boolean; + verbosity?: number; + [key: string]: any; + }; + computer_kwargs?: { + os_type?: string; + provider_type?: string; + [key: string]: any; + }; + // Optional per-request environment overrides + env?: Record<string, string>; +} +``` + +#### AgentResponse + +```typescript +interface AgentResponse { + output: AgentMessage[]; + usage: Usage; +} + +interface Usage { + prompt_tokens: number; + completion_tokens: number; + total_tokens: number; + response_cost: number; +} +``` + +The `output` array contains the conversation history including: +- User messages +- Agent reasoning/thinking +- Computer actions and their results +- Final agent responses + +The `usage` object provides token counts and cost information for the request. + +## Connection Types + +### HTTP/HTTPS + +Connect to a CUA agent proxy server: + +```typescript +// Self-hosted server +const client = new AgentClient("https://my-agent-server.com:8000", { apiKey: "cua_..." }); + +// Cloud container (port 8443) +const cloud = new AgentClient( + "https://m-linux-96lcxd2c2k.containers.cloud.trycua.com:8443", + { apiKey: "cua_..." } +); +``` + +Notes: +- The client sends the API key as `X-API-Key` for HTTP/HTTPS connections. +- Cloud containers listen on `:8443` with HTTPS. + +### Peer-to-Peer (WebRTC) + +Connect directly to another peer using WebRTC: + +```typescript +const client = new AgentClient("peer://agent-proxy-peer-id"); +``` + +The client uses PeerJS with default configuration for peer connections. + +## License + +MIT diff --git a/libs/typescript/agent/examples/README.md b/libs/typescript/agent/examples/README.md new file mode 100644 index 00000000..98939466 --- /dev/null +++ b/libs/typescript/agent/examples/README.md @@ -0,0 +1,70 @@ +# CUA Agent Client Examples + +This directory contains examples demonstrating how to use the `@trycua/agent` client library. 
+ +## Browser Example + +### `playground-example.html` + +A simple HTML page that demonstrates using the CUA Agent Client in a browser environment. + +**Features:** +- Connect to HTTP/HTTPS or P2P (peer://) agent proxies +- Send text messages to any supported model +- View responses in real-time +- Health check functionality +- Clear, simple interface with no external dependencies + +**Usage:** + +1. **Build the library first:** + ```bash + cd ../ + pnpm build + ``` + +2. **Start a local web server** (required for ES modules): + ```bash + # Option 1: Using Python + python -m http.server 8080 + + # Option 2: Using Node.js (if you have http-server installed) + npx http-server -p 8080 + + # Option 3: Using any other local server + ``` + +3. **Open in browser:** + Navigate to `http://localhost:8080/examples/playground-example.html` + +4. **Configure and test:** + - Enter an agent URL (e.g., `https://localhost:8000` or `peer://some-peer-id`) + - Enter a model name (e.g., `anthropic/claude-3-5-sonnet-20241022`) + - Type a message and click "Send Message" or press Enter + - View the response in the output textarea + +**Supported URLs:** +- **HTTP/HTTPS**: `https://localhost:8000`, `http://my-agent-server.com:8080` +- **Peer-to-Peer**: `peer://computer-agent-proxy`, `peer://any-peer-id` + +**Example Models:** +- `anthropic/claude-3-5-sonnet-20241022` +- `openai/gpt-4` +- `huggingface-local/microsoft/UI-TARS-7B` + +**Note:** Make sure you have a CUA agent proxy server running at the specified URL before testing. 
+ +## Running Agent Proxy Server + +To test the examples, you'll need a CUA agent proxy server running: + +```bash +# HTTP server (default port 8000) +python -m agent.proxy.cli + +# P2P server +python -m agent.proxy.cli --mode p2p + +# Both HTTP and P2P +python -m agent.proxy.cli --mode both +``` diff --git a/libs/typescript/agent/examples/playground-example.html b/libs/typescript/agent/examples/playground-example.html new file mode 100644 index 00000000..8fd35a05 --- /dev/null +++ b/libs/typescript/agent/examples/playground-example.html @@ -0,0 +1,146 @@ + + + + + + CUA Agent Playground Example + + +

CUA Agent Playground Example

+ +
+

Configuration

+
+

+ +
+

+
+ +
+

Chat

+
+

+ + + +

+ +
+ +
+ + + + + diff --git a/libs/typescript/agent/package.json b/libs/typescript/agent/package.json new file mode 100644 index 00000000..f77206df --- /dev/null +++ b/libs/typescript/agent/package.json @@ -0,0 +1,54 @@ +{ + "name": "@trycua/agent", + "version": "0.1.0", + "packageManager": "pnpm@10.11.0", + "description": "TypeScript SDK for CUA agent interaction", + "type": "module", + "license": "MIT", + "homepage": "https://github.com/trycua/cua/tree/main/libs/typescript/agent", + "bugs": { + "url": "https://github.com/trycua/cua/issues" + }, + "repository": { + "type": "git", + "url": "git+https://github.com/trycua/cua.git" + }, + "author": "cua", + "files": [ + "dist" + ], + "main": "./dist/index.js", + "module": "./dist/index.js", + "types": "./dist/index.d.ts", + "exports": { + ".": "./dist/index.js", + "./package.json": "./package.json" + }, + "publishConfig": { + "access": "public" + }, + "scripts": { + "lint": "biome lint .", + "lint:fix": "biome lint --fix .", + "build": "tsdown", + "dev": "tsdown --watch", + "test": "vitest", + "typecheck": "tsc --noEmit", + "release": "bumpp && pnpm publish", + "prepublishOnly": "pnpm run build" + }, + "dependencies": { + "@trycua/core": "^0.1.2", + "peerjs": "^1.5.4", + "pino": "^9.7.0" + }, + "devDependencies": { + "@biomejs/biome": "^1.9.4", + "@types/node": "^22.15.17", + "bumpp": "^10.1.0", + "happy-dom": "^17.4.7", + "tsdown": "^0.14.1", + "typescript": "^5.7.2", + "vitest": "^2.1.8" + } +} diff --git a/libs/typescript/agent/src/client.ts b/libs/typescript/agent/src/client.ts new file mode 100644 index 00000000..d25e698b --- /dev/null +++ b/libs/typescript/agent/src/client.ts @@ -0,0 +1,197 @@ +import {Peer} from "peerjs"; +import type { + AgentRequest, + AgentResponse, + ConnectionType, + AgentClientOptions, +} from "./types"; + +export class AgentClient { + private url: string; + private connectionType: ConnectionType; + private options: AgentClientOptions; + private peer?: Peer; + private connection?: any; + + 
constructor(url: string, options: AgentClientOptions = {}) { + this.url = url; + this.options = { + timeout: 30000, + retries: 3, + ...options, + }; + + // Determine connection type from URL + if (url.startsWith("http://") || url.startsWith("https://")) { + this.connectionType = url.startsWith("https://") ? "https" : "http"; + } else if (url.startsWith("peer://")) { + this.connectionType = "peer"; + } else { + throw new Error( + "Invalid URL format. Must start with http://, https://, or peer://" + ); + } + } + + // Main responses API matching the desired usage pattern + public responses = { + create: async (request: AgentRequest): Promise<AgentResponse> => { + return this.sendRequest(request); + }, + }; + + private async sendRequest(request: AgentRequest): Promise<AgentResponse> { + switch (this.connectionType) { + case "http": + case "https": + return this.sendHttpRequest(request); + case "peer": + return this.sendPeerRequest(request); + default: + throw new Error(`Unsupported connection type: ${this.connectionType}`); + } + } + + private async sendHttpRequest(request: AgentRequest): Promise<AgentResponse> { + const controller = new AbortController(); + const timeoutId = setTimeout( + () => controller.abort(), + this.options.timeout + ); + + try { + const headers: Record<string, string> = { + "Content-Type": "application/json", + }; + if (this.options.apiKey) { + headers["X-API-Key"] = this.options.apiKey; + } + + const response = await fetch(`${this.url}/responses`, { + method: "POST", + headers, + body: JSON.stringify(request), + signal: controller.signal, + }); + + clearTimeout(timeoutId); + + if (!response.ok) { + throw new Error(`HTTP error! 
status: ${response.status}`); + } + + const data = await response.json(); + return data as AgentResponse; + } catch (error) { + clearTimeout(timeoutId); + if (error instanceof Error) { + throw new Error(`Failed to send HTTP request: ${error.message}`); + } + throw error; + } + } + + private async sendPeerRequest(request: AgentRequest): Promise<AgentResponse> { + // Extract peer ID from peer:// URL + const peerId = this.url.replace("peer://", ""); + + if (!this.peer) { + // Initialize peer connection with default options as requested + this.peer = new Peer(); + + return new Promise((resolve, reject) => { + const timeout = setTimeout(() => { + reject(new Error("Peer connection timeout")); + }, this.options.timeout); + + this.peer!.on("open", () => { + // Connect to the target peer + this.connection = this.peer!.connect(peerId); + + this.connection.on("open", () => { + // Send the request + this.connection!.send(JSON.stringify(request)); + }); + + this.connection.on("data", (data: any) => { + clearTimeout(timeout); + try { + const response = + typeof data === "string" ? JSON.parse(data) : data; + resolve(response as AgentResponse); + } catch (error) { + reject(new Error("Failed to parse peer response")); + } + }); + + this.connection.on("error", (error: any) => { + clearTimeout(timeout); + reject(new Error(`Peer connection error: ${error}`)); + }); + }); + + this.peer!.on("error", (error: any) => { + clearTimeout(timeout); + reject(new Error(`Peer error: ${error}`)); + }); + }); + } else { + // Reuse existing connection + return new Promise((resolve, reject) => { + const timeout = setTimeout(() => { + reject(new Error("Peer request timeout")); + }, this.options.timeout); + + if (this.connection && this.connection.open) { + this.connection.send(JSON.stringify(request)); + + const handleData = (data: any) => { + clearTimeout(timeout); + this.connection!.off("data", handleData); + try { + const response = + typeof data === "string" ? 
JSON.parse(data) : data; + resolve(response as AgentResponse); + } catch (error) { + reject(new Error("Failed to parse peer response")); + } + }; + + this.connection.on("data", handleData); + } else { + clearTimeout(timeout); + reject(new Error("Peer connection not available")); + } + }); + } + } + + // Health check: peer mode reports the PeerJS open state; HTTP(S) mode probes GET {url}/health + async health(): Promise<{ status: string }> { + if (this.connectionType === "peer") { + return { status: this.peer?.open ? "connected" : "disconnected" }; + } + + try { + const response = await fetch(`${this.url}/health`); + if (response.ok) { + return { status: "healthy" }; + } + return { status: "unhealthy" }; + } catch { + return { status: "unreachable" }; + } + } + + // Clean up resources: close the data connection first, then destroy the peer + async disconnect(): Promise<void> { + if (this.connection) { + this.connection.close(); + this.connection = undefined; + } + if (this.peer) { + this.peer.destroy(); + this.peer = undefined; + } + } +} diff --git a/libs/typescript/agent/src/index.ts b/libs/typescript/agent/src/index.ts new file mode 100644 index 00000000..78257ee5 --- /dev/null +++ b/libs/typescript/agent/src/index.ts @@ -0,0 +1,29 @@ +// Export the main AgentClient class as default +export { AgentClient as default } from './client.js'; + +// Also export as named export for flexibility +export { AgentClient } from './client.js'; + +// Export types for TypeScript users +export type { + AgentRequest, + AgentResponse, + AgentMessage, + UserMessage, + AssistantMessage, + ReasoningMessage, + ComputerCallMessage, + ComputerCallOutputMessage, + OutputContent, + SummaryContent, + InputContent, + ComputerAction, + ClickAction, + TypeAction, + KeyPressAction, + ScrollAction, + WaitAction, + Usage, + ConnectionType, + AgentClientOptions, +} from './types'; diff --git a/libs/typescript/agent/src/types.ts b/libs/typescript/agent/src/types.ts new file mode 100644 index 00000000..30e7340a --- /dev/null +++ b/libs/typescript/agent/src/types.ts @@ -0,0 +1,203 @@ +// #region Request +export type ConnectionType = 
'http' | 'https' | 'peer'; +export interface AgentClientOptions { + timeout?: number; + retries?: number; + /** Optional CUA API key to send as X-API-Key header for HTTP requests */ + apiKey?: string; +} +// Request types matching the Python proxy API +export interface AgentRequest { + model: string; + input: string | AgentMessage[]; + agent_kwargs?: { + save_trajectory?: boolean; + verbosity?: number; + [key: string]: any; + }; + computer_kwargs?: { + os_type?: string; + provider_type?: string; + [key: string]: any; + }; + /** + * Optional per-request environment variable overrides. + * Keys and values are strings and will be forwarded to the backend proxy. + */ + env?: Record<string, string>; +} +// #endregion + + +// #region Response +// Response types ("error" is null when the run completed without an exception) +export interface AgentResponse { + output: AgentMessage[]; + usage: Usage; + status: 'completed' | 'failed'; + error?: string | null; +} +// Usage information +export interface Usage { + prompt_tokens: number; + completion_tokens: number; + total_tokens: number; + response_cost: number; +} +// #endregion + + + +// #region Messages +// Agent message types - can be one of several different message types +export type AgentMessage = + | UserMessage + | AssistantMessage + | ReasoningMessage + | ComputerCallMessage + | ComputerCallOutputMessage + | FunctionCallMessage + | FunctionCallOutputMessage; +// Input message +export interface UserMessage { + type?: 'message'; + role: 'user' | 'system' | 'developer'; + content: string | InputContent[]; +} +// Output message +export interface AssistantMessage { + type: 'message'; + role: 'assistant'; + content: OutputContent[]; +} +// Output reasoning/thinking message +export interface ReasoningMessage { + type: 'reasoning'; + summary: SummaryContent[]; +} +// Output computer action call +export interface ComputerCallMessage { + type: 'computer_call'; + call_id: string; + status: 'completed' | 'failed' | 'pending'; + action: ComputerAction; +} +// Output computer action result (always a screenshot) 
+export interface ComputerCallOutputMessage { + type: 'computer_call_output'; + call_id: string; + output: ComputerResultContent; +} +// Output function call +export interface FunctionCallMessage { + type: 'function_call'; + call_id: string; + status: 'completed' | 'failed' | 'pending'; + name: string; + arguments: string; // JSON dict of kwargs +} +// Output function call result (always text) +export interface FunctionCallOutputMessage { + type: 'function_call_output'; + call_id: string; + output: string; +} +// #endregion + + + +// #region Message Content +export interface InputContent { + type: 'input_image' | 'input_text'; + text?: string; + image_url?: string; +} +export interface OutputContent { + type: 'output_text'; + text: string; +} +export interface SummaryContent { + type: 'summary_text'; + text: string; +} +export interface ComputerResultContent { + type: 'computer_screenshot' | 'input_image'; + image_url: string; +} +// #endregion + + + +// #region Actions +export type ComputerAction = + | ComputerActionOpenAI + | ComputerActionAnthropic; +// OpenAI Computer Actions +export type ComputerActionOpenAI = + | ClickAction + | DoubleClickAction + | DragAction + | KeyPressAction + | MoveAction + | ScreenshotAction + | ScrollAction + | TypeAction + | WaitAction; +export interface ClickAction { + type: 'click'; + button: 'left' | 'right' | 'wheel' | 'back' | 'forward'; + x: number; + y: number; +} +export interface DoubleClickAction { + type: 'double_click'; + button?: 'left' | 'right' | 'wheel' | 'back' | 'forward'; + x: number; + y: number; +} +export interface DragAction { + type: 'drag'; + button?: 'left' | 'right' | 'wheel' | 'back' | 'forward'; + path: Array<[number, number]>; +} +export interface KeyPressAction { + type: 'keypress'; + keys: string[]; +} +export interface MoveAction { + type: 'move'; + x: number; + y: number; +} +export interface ScreenshotAction { + type: 'screenshot'; +} +export interface ScrollAction { + type: 'scroll'; + scroll_x: 
number; + scroll_y: number; + x: number; + y: number; +} +export interface TypeAction { + type: 'type'; + text: string; +} +export interface WaitAction { + type: 'wait'; +} +// Anthropic Computer Actions +export type ComputerActionAnthropic = + | LeftMouseDownAction + | LeftMouseUpAction; +export interface LeftMouseDownAction { + type: 'left_mouse_down'; + x: number; + y: number; +} +export interface LeftMouseUpAction { + type: 'left_mouse_up'; + x: number; + y: number; +} +// #endregion \ No newline at end of file diff --git a/libs/typescript/agent/tests/client.test.ts b/libs/typescript/agent/tests/client.test.ts new file mode 100644 index 00000000..b267b122 --- /dev/null +++ b/libs/typescript/agent/tests/client.test.ts @@ -0,0 +1,34 @@ +import { describe, it, expect } from 'vitest'; +import AgentClient from '../src/index.js'; + +describe('AgentClient', () => { + it('should create client with HTTP URL', () => { + const client = new AgentClient('https://localhost:8000'); + expect(client).toBeDefined(); + expect(client.responses).toBeDefined(); + expect(typeof client.responses.create).toBe('function'); + }); + + it('should create client with peer URL', () => { + const client = new AgentClient('peer://test-peer-id'); + expect(client).toBeDefined(); + expect(client.responses).toBeDefined(); + expect(typeof client.responses.create).toBe('function'); + }); + + it('should throw error for invalid URL', () => { + expect(() => { + new AgentClient('invalid://url'); + }).toThrow('Invalid URL format'); + }); + + it('should have health method', async () => { + const client = new AgentClient('https://localhost:8000'); + expect(typeof client.health).toBe('function'); + }); + + it('should have disconnect method', async () => { + const client = new AgentClient('https://localhost:8000'); + expect(typeof client.disconnect).toBe('function'); + }); +}); diff --git a/libs/typescript/agent/tsconfig.json b/libs/typescript/agent/tsconfig.json new file mode 100644 index 00000000..8d56b691 
--- /dev/null +++ b/libs/typescript/agent/tsconfig.json @@ -0,0 +1,28 @@ +{ + "compilerOptions": { + "target": "esnext", + "lib": [ + "es2023" + ], + "moduleDetection": "force", + "module": "preserve", + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "resolveJsonModule": true, + "types": [ + "node" + ], + "allowSyntheticDefaultImports": true, + "strict": true, + "noUnusedLocals": true, + "declaration": true, + "emitDeclarationOnly": true, + "esModuleInterop": true, + "isolatedModules": true, + "verbatimModuleSyntax": true, + "skipLibCheck": true + }, + "include": [ + "src" + ] +} \ No newline at end of file diff --git a/libs/typescript/agent/tsdown.config.ts b/libs/typescript/agent/tsdown.config.ts new file mode 100644 index 00000000..b837b6ee --- /dev/null +++ b/libs/typescript/agent/tsdown.config.ts @@ -0,0 +1,12 @@ +import { defineConfig } from "tsdown"; + +export default defineConfig({ + entry: ["src/index.ts"], + format: ["module"], + platform: "browser", + dts: true, + clean: true, + // Remove if we don't need to support including the library via '