From cdc2b58d9464dd7072bd8653446aa9253ba58a1c Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 26 Aug 2025 10:24:47 -0400 Subject: [PATCH] Changed /responses endpoint to be standalone --- .../computer-server/computer_server/main.py | 202 +++++++++++++----- 1 file changed, 143 insertions(+), 59 deletions(-) diff --git a/libs/python/computer-server/computer_server/main.py b/libs/python/computer-server/computer_server/main.py index 5d028766..106f906f 100644 --- a/libs/python/computer-server/computer_server/main.py +++ b/libs/python/computer-server/computer_server/main.py @@ -440,24 +440,21 @@ async def agent_response_endpoint( api_key: Optional[str] = Header(None, alias="X-API-Key"), ): """ - Run a ComputerAgent step using server-side handlers as a tool. + Minimal proxy to run ComputerAgent for up to 2 turns. Security: - If CONTAINER_NAME is set on the server, require X-API-Key - and validate using AuthenticationManager. - - If CUA_ENABLE_PUBLIC_PROXY is set, allow public access. + and validate using AuthenticationManager unless CUA_ENABLE_PUBLIC_PROXY is true. Body JSON: { "model": "...", # required "input": "... or messages[]", # required - "agent_kwargs": { ... }, # optional, will be merged; tools will be overridden - "computer_kwargs": { ... }, # optional, ignored for this endpoint + "agent_kwargs": { ... }, # optional, passed directly to ComputerAgent "env": { ... } # optional env overrides for agent } """ - from agent.proxy.handlers import ResponsesHandler - from agent.computers import AsyncComputerHandler + from agent.agent import ComputerAgent # Authenticate via AuthenticationManager if running in cloud (CONTAINER_NAME set) container_name = os.environ.get("CONTAINER_NAME") @@ -481,98 +478,185 @@ async def agent_response_endpoint( if not model or input_data is None: raise HTTPException(status_code=400, detail="'model' and 'input' are required") - agent_kwargs = body.get("agent_kwargs") or {} - env_overrides = body.get("env") or {} + agent_kwargs: Dict[str, Any] = body.get("agent_kwargs") or {} + env_overrides: Dict[str, str] = body.get("env") or {} + + # Simple env override context + class _EnvOverride: + def __init__(self, overrides: Dict[str, str]): + self.overrides = overrides + self._original: Dict[str, Optional[str]] = {} + def __enter__(self): + for k, v in (self.overrides or {}).items(): + self._original[k] = os.environ.get(k) + os.environ[k] = str(v) + def __exit__(self, exc_type, exc, tb): + for k, old in self._original.items(): + if old is None: + os.environ.pop(k, None) + else: + os.environ[k] = old + + # Convert input to messages + def _to_messages(data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]: + if isinstance(data, str): + return [{"role": "user", "content": data}] + if isinstance(data, list): + return data + + messages = _to_messages(input_data) + + # Define a direct computer tool that implements the AsyncComputerHandler protocol + # and delegates to our existing automation/file/accessibility handlers. + from agent.computers import AsyncComputerHandler # runtime-checkable Protocol + + class DirectComputer(AsyncComputerHandler): + def __init__(self): + # use module-scope handler singletons created by HandlerFactory + self._auto = automation_handler + self._file = file_handler + self._access = accessibility_handler - # Local AsyncComputerHandler implementation backed by automation_handler - class DirectComputerHandler(AsyncComputerHandler): async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]: sys = platform.system().lower() - if "darwin" in sys or sys == "macos" or sys == "mac": + if "darwin" in sys or sys in ("macos", "mac"): return "mac" if "windows" in sys: return "windows" return "linux" async def get_dimensions(self) -> tuple[int, int]: - try: - res = await automation_handler.get_screen_size() - size = res.get("size") or {} - return int(size.get("width", 0)), int(size.get("height", 0)) - except Exception: - return (0, 0) + size = await self._auto.get_screen_size() + return size["width"], size["height"] async def screenshot(self) -> str: - res = await automation_handler.screenshot() - if not res.get("success"): - raise RuntimeError(res.get("error", "screenshot failed")) - return res.get("image_data", "") + img_b64 = await self._auto.screenshot() + return img_b64["image_data"] async def click(self, x: int, y: int, button: str = "left") -> None: - if button == "right": - await automation_handler.right_click(x, y) + if button == "left": + await self._auto.left_click(x, y) + elif button == "right": + await self._auto.right_click(x, y) else: - await automation_handler.left_click(x, y) + await self._auto.left_click(x, y) async def double_click(self, x: int, y: int) -> None: - await automation_handler.double_click(x, y) + await self._auto.double_click(x, y) async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: - try: - if scroll_y: - await automation_handler.scroll(scroll_y, 0) - if scroll_x: - await automation_handler.scroll(0, scroll_x) - except Exception: - await automation_handler.scroll(scroll_y or 0, scroll_x or 0) + await self._auto.move_cursor(x, y) + await self._auto.scroll(scroll_x, scroll_y) async def type(self, text: str) -> None: - await automation_handler.type_text(text) + await self._auto.type_text(text) async def wait(self, ms: int = 1000) -> None: await asyncio.sleep(ms / 1000.0) async def move(self, x: int, y: int) -> None: - await automation_handler.move_cursor(x, y) + await self._auto.move_cursor(x, y) async def keypress(self, keys: Union[List[str], str]) -> None: - if isinstance(keys, list): - if len(keys) <= 1: - key = keys[0] if keys else "" - if key: - await automation_handler.press_key(key) - else: - await cast(Any, automation_handler).hotkey([str(k) for k in keys]) - elif isinstance(keys, str): - await automation_handler.press_key(keys) + if isinstance(keys, str): + parts = keys.replace("-", "+").split("+") + else: + parts = keys + if len(parts) == 1: + await self._auto.press_key(parts[0]) + else: + await self._auto.hotkey(*parts) async def drag(self, path: List[Dict[str, int]]) -> None: - await automation_handler.drag([(p["x"], p["y"]) for p in path]) - + if not path: + return + start = path[0] + await self._auto.mouse_down(start["x"], start["y"]) + for pt in path[1:]: + await self._auto.move_cursor(pt["x"], pt["y"]) + end = path[-1] + await self._auto.mouse_up(end["x"], end["y"]) + async def get_current_url(self) -> str: + # Not available in this server context return "" async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None: - await automation_handler.mouse_down(x, y, button="left") + await self._auto.mouse_down(x, y, button="left") async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None: - await automation_handler.mouse_up(x, y, button="left") + await self._auto.mouse_up(x, y, button="left") - custom_handler = DirectComputerHandler() + # # Inline image URLs to base64 + # import base64, mimetypes, requests + # # Use a browser-like User-Agent to avoid 403s from some CDNs (e.g., Wikimedia) + # HEADERS = { + # "User-Agent": ( + # "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + # "AppleWebKit/537.36 (KHTML, like Gecko) " + # "Chrome/124.0.0.0 Safari/537.36" + # ) + # } + # def _to_data_url(content_bytes: bytes, url: str, resp: requests.Response) -> str: + # ctype = resp.headers.get("Content-Type") or mimetypes.guess_type(url)[0] or "application/octet-stream" + # b64 = base64.b64encode(content_bytes).decode("utf-8") + # return f"data:{ctype};base64,{b64}" + # def inline_image_urls(messages): + # # messages: List[{"role": "...","content":[...]}] + # out = [] + # for m in messages: + # if not isinstance(m.get("content"), list): + # out.append(m) + # continue + # new_content = [] + # for part in (m.get("content") or []): + # if part.get("type") == "input_image" and (url := part.get("image_url")): + # resp = requests.get(url, headers=HEADERS, timeout=30) + # resp.raise_for_status() + # new_content.append({ + # "type": "input_image", + # "image_url": _to_data_url(resp.content, url, resp) + # }) + # else: + # new_content.append(part) + # out.append({**m, "content": new_content}) + # return out + # messages = inline_image_urls(messages) - # Prepare request for ResponsesHandler and force our tool - rh = ResponsesHandler() - request_payload: Dict[str, Any] = { + with _EnvOverride(env_overrides): + # Prepare tools: if caller did not pass tools, inject our DirectComputer + tools = agent_kwargs.get("tools") + if not tools: + tools = [DirectComputer()] + agent_kwargs = {**agent_kwargs, "tools": tools} + # Instantiate agent with our tools + agent = ComputerAgent(model=model, **agent_kwargs) # type: ignore[arg-type] + + total_output: List[Any] = [] + total_usage: Dict[str, Any] = {} + + turns = 0 + async for result in agent.run(messages): + total_output += result["output"] + # Try to collect usage if present + if isinstance(result, dict) and "usage" in result and isinstance(result["usage"], dict): + # Merge usage counters + for k, v in result["usage"].items(): + if isinstance(v, (int, float)): + total_usage[k] = total_usage.get(k, 0) + v + else: + total_usage[k] = v + turns += 1 + if turns > 2: + break + + return { + "success": True, "model": model, - "input": input_data, - "agent_kwargs": {**agent_kwargs, "tools": [custom_handler]}, - # Don't need computer_kwargs; agent will use our tool instead - "env": env_overrides, + "output": total_output, + "usage": total_usage, } - result = await rh.process_request(request_payload) - return result - if __name__ == "__main__": uvicorn.run(app, host="0.0.0.0", port=8000)