Files
computer/libs/python/mcp-server/mcp_server/server.py
2025-09-25 16:07:53 -04:00

303 lines
10 KiB
Python

import asyncio
import base64
import inspect
import logging
import os
import sys
import traceback
# NOTE(review): `verbose` is never referenced anywhere in this module — this
# looks like an accidental IDE auto-import; confirm with callers, then remove.
from tabnanny import verbose
from typing import Any, Dict, List, Optional, Union, Tuple

# Configure logging to output to stderr (stdout is reserved for the MCP
# protocol stream) so debug output stays visible without corrupting traffic.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    stream=sys.stderr,
)
logger = logging.getLogger("mcp-server")

# Emit an early marker so a hung startup is easy to diagnose.
logger.debug("MCP Server module loading...")
# The MCP framework is a hard dependency: log the failure and exit with a
# non-zero status so launcher tooling notices, instead of raising later.
try:
    from mcp.server.fastmcp import Context, FastMCP, Image
    logger.debug("Successfully imported FastMCP")
except ImportError as e:
    logger.error(f"Failed to import FastMCP: {e}")
    traceback.print_exc(file=sys.stderr)
    sys.exit(1)

# Same fail-fast treatment for the CUA computer/agent packages.
try:
    from computer import Computer
    from agent import ComputerAgent
    logger.debug("Successfully imported Computer and Agent modules")
except ImportError as e:
    logger.error(f"Failed to import Computer/Agent modules: {e}")
    traceback.print_exc(file=sys.stderr)
    sys.exit(1)
# Shared Computer instance, created lazily by the tools below and reused
# across tool invocations.
global_computer = None


def get_env_bool(key: str, default: bool = False) -> bool:
    """Get boolean value from environment variable.

    Accepts "true", "1", or "yes" (case-insensitive) as truthy; any other
    value — including an unset variable whose stringified default is not one
    of those — yields False.
    """
    raw = os.getenv(key, str(default))
    return raw.lower() in ("true", "1", "yes")
async def _maybe_call_ctx_method(ctx: Context, method_name: str, *args, **kwargs) -> None:
    """Invoke an optional context helper by name, awaiting it if needed.

    Silently does nothing when the context lacks the method (or the
    attribute is not callable), which lets the server work across MCP
    context implementations with differing capabilities.
    """
    candidate = getattr(ctx, method_name, None)
    if callable(candidate):
        outcome = candidate(*args, **kwargs)
        # Helpers may be sync or async; only await real awaitables.
        if inspect.isawaitable(outcome):
            await outcome
def _normalise_message_content(content: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
"""Normalise message content to a list of structured parts."""
if isinstance(content, list):
return content
if content is None:
return []
return [{"type": "output_text", "text": str(content)}]
def _extract_text_from_content(content: Union[str, List[Dict[str, Any]]]) -> str:
"""Extract textual content for inclusion in the aggregated result string."""
if isinstance(content, str):
return content
texts: List[str] = []
for part in content or []:
if not isinstance(part, dict):
continue
if part.get("type") in {"output_text", "text"} and part.get("text"):
texts.append(str(part["text"]))
return "\n".join(texts)
def _serialise_tool_content(content: Any) -> str:
"""Convert tool outputs into a string for aggregation."""
if isinstance(content, str):
return content
if isinstance(content, list):
texts: List[str] = []
for part in content:
if isinstance(part, dict) and part.get("type") in {"output_text", "text"} and part.get("text"):
texts.append(str(part["text"]))
if texts:
return "\n".join(texts)
if content is None:
return ""
return str(content)
def serve() -> FastMCP:
    """Create and configure the MCP server.

    Registers three tools on a FastMCP instance: a screenshot tool, a
    single-task CUA runner, and a sequential multi-task runner.

    Returns:
        The configured FastMCP server instance.
    """
    server = FastMCP("cua-agent")

    @server.tool()
    async def screenshot_cua(ctx: Context) -> Image:
        """
        Take a screenshot of the current MacOS VM screen and return the image. Use this before running a CUA task to get a snapshot of the current state.

        Args:
            ctx: The MCP context

        Returns:
            An image resource containing the screenshot
        """
        global global_computer
        # Lazily start the shared Computer on first use; reused by all tools.
        if global_computer is None:
            global_computer = Computer(verbosity=logging.INFO)
            await global_computer.run()
        screenshot = await global_computer.interface.screenshot()
        return Image(
            format="png",
            data=screenshot
        )

    @server.tool()
    async def run_cua_task(ctx: Context, task: str) -> Tuple[str, Image]:
        """
        Run a Computer-Use Agent (CUA) task in a MacOS VM and return the results.

        Args:
            ctx: The MCP context
            task: The instruction or task for the agent to perform

        Returns:
            A tuple containing the agent's response and the final screenshot
        """
        global global_computer
        try:
            logger.info("Starting CUA task: %s", task)
            # Initialize computer if needed
            if global_computer is None:
                global_computer = Computer(verbosity=logging.INFO)
                await global_computer.run()
            # Get model name - this now determines the loop and provider
            model_name = os.getenv("CUA_MODEL_NAME", "anthropic/claude-3-5-sonnet-20241022")
            logger.info("Using model: %s", model_name)
            # Create agent with the new v0.4.x API
            agent = ComputerAgent(
                model=model_name,
                only_n_most_recent_images=int(os.getenv("CUA_MAX_IMAGES", "3")),
                verbosity=logging.INFO,
                tools=[global_computer],
            )
            # Create messages in the new v0.4.x format
            messages = [{"role": "user", "content": task}]
            # Collect all textual output for the aggregated result string.
            aggregated_messages: List[str] = []
            async for result in agent.run(messages):
                logger.info("Agent processing step")
                # FIX: FastMCP context methods (info/error/report_progress)
                # are coroutines; the originals called them without awaiting,
                # leaving the coroutine unexecuted. Route every ctx call
                # through _maybe_call_ctx_method, which awaits when needed.
                await _maybe_call_ctx_method(ctx, "info", "Agent processing step")
                # Process each output item from this agent step, if any.
                for output in result.get("output", []):
                    output_type = output.get("type")
                    if output_type == "message":
                        logger.debug("Streaming assistant message: %s", output)
                        content = _normalise_message_content(output.get("content"))
                        aggregated_text = _extract_text_from_content(content)
                        if aggregated_text:
                            aggregated_messages.append(aggregated_text)
                        await _maybe_call_ctx_method(
                            ctx,
                            "yield_message",
                            role=output.get("role", "assistant"),
                            content=content,
                        )
                    elif output_type in {"tool_use", "computer_call", "function_call"}:
                        logger.debug("Streaming tool call: %s", output)
                        # Field names differ across providers; probe each alias.
                        call_id = output.get("id") or output.get("call_id")
                        tool_name = output.get("name") or output.get("action", {}).get("type")
                        tool_input = output.get("input") or output.get("arguments") or output.get("action")
                        if call_id:
                            await _maybe_call_ctx_method(
                                ctx,
                                "yield_tool_call",
                                name=tool_name,
                                call_id=call_id,
                                input=tool_input,
                            )
                    elif output_type in {"tool_result", "computer_call_output", "function_call_output"}:
                        logger.debug("Streaming tool output: %s", output)
                        call_id = output.get("call_id") or output.get("id")
                        content = output.get("content") or output.get("output")
                        aggregated_text = _serialise_tool_content(content)
                        if aggregated_text:
                            aggregated_messages.append(aggregated_text)
                        if call_id:
                            await _maybe_call_ctx_method(
                                ctx,
                                "yield_tool_output",
                                call_id=call_id,
                                output=content,
                                is_error=output.get("status") == "failed" or output.get("is_error", False),
                            )
            logger.info("CUA task completed successfully")
            await _maybe_call_ctx_method(ctx, "info", "CUA task completed successfully")
            screenshot_image = Image(
                format="png",
                data=await global_computer.interface.screenshot(),
            )
            return (
                "\n".join(aggregated_messages).strip() or "Task completed with no text output.",
                screenshot_image,
            )
        except Exception as e:
            error_msg = f"Error running CUA task: {str(e)}\n{traceback.format_exc()}"
            logger.error(error_msg)
            await _maybe_call_ctx_method(ctx, "error", error_msg)
            # Return tuple with error message and a screenshot if possible
            try:
                if global_computer is not None:
                    screenshot = await global_computer.interface.screenshot()
                    return (
                        f"Error during task execution: {str(e)}",
                        Image(format="png", data=screenshot),
                    )
            except Exception:
                # FIX: was a bare `except:` that also swallowed CancelledError;
                # keep the best-effort intent but only for ordinary failures.
                logger.debug("Could not capture error screenshot", exc_info=True)
            # If we can't get a screenshot, return a placeholder
            return (
                f"Error during task execution: {str(e)}",
                Image(format="png", data=b""),
            )

    @server.tool()
    async def run_multi_cua_tasks(ctx: Context, tasks: List[str]) -> List[Tuple[str, Image]]:
        """
        Run multiple CUA tasks in a MacOS VM in sequence and return the combined results.

        Args:
            ctx: The MCP context
            tasks: List of tasks to run in sequence

        Returns:
            Combined results from all tasks
        """
        total_tasks = len(tasks)
        if total_tasks == 0:
            # Nothing to do: report completion immediately.
            await _maybe_call_ctx_method(ctx, "report_progress", 1.0)
            return []
        results: List[Tuple[str, Image]] = []
        for i, task in enumerate(tasks):
            logger.info("Running task %d/%d: %s", i + 1, total_tasks, task)
            await _maybe_call_ctx_method(ctx, "info", f"Running task {i+1}/{total_tasks}: {task}")
            await _maybe_call_ctx_method(ctx, "report_progress", i / total_tasks)
            # Delegate to the single-task tool; it handles its own errors and
            # always returns a (text, Image) tuple.
            task_result = await run_cua_task(ctx, task)
            results.append(task_result)
            await _maybe_call_ctx_method(ctx, "report_progress", (i + 1) / total_tasks)
        return results

    return server
# Module-level instance so MCP launchers can import `server` directly.
server = serve()


def main():
    """Entry point: run the MCP server, exiting with status 1 on failure."""
    try:
        logger.debug("Starting MCP server...")
        server.run()
    except Exception as e:
        # Surface both the summary and the full traceback on stderr before
        # signalling failure to the launcher.
        logger.error(f"Error starting server: {e}")
        traceback.print_exc(file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()