From 96e4b7586ab4e257adebd7f5d075056c29f40276 Mon Sep 17 00:00:00 2001
From: Dillon DuPont
Date: Tue, 5 Aug 2025 10:51:21 -0400
Subject: [PATCH] update mcp server to cua-agent==0.4.x

---
 libs/python/mcp-server/README.md            | 31 ++++---
 libs/python/mcp-server/mcp_server/server.py | 97 ++++++++++-----------
 libs/python/mcp-server/pyproject.toml       |  4 +-
 3 files changed, 64 insertions(+), 68 deletions(-)

diff --git a/libs/python/mcp-server/README.md b/libs/python/mcp-server/README.md
index 3f3c8bbb..a94da8a7 100644
--- a/libs/python/mcp-server/README.md
+++ b/libs/python/mcp-server/README.md
@@ -16,6 +16,21 @@
 
 **cua-mcp-server** is a MCP server for the Computer-Use Agent (CUA), allowing you to run CUA through Claude Desktop or other MCP clients.
 
+
+## LiteLLM Integration
+
+This MCP server features comprehensive liteLLM integration, allowing you to use any supported LLM provider with a simple model string configuration.
+
+- **Unified Configuration**: Use a single `CUA_MODEL_NAME` environment variable with a model string
+- **Automatic Provider Detection**: The agent automatically detects the provider and capabilities from the model string
+- **Extensive Provider Support**: Works with Anthropic, OpenAI, local models, and any liteLLM-compatible provider
+
+### Model String Examples:
+- **Anthropic**: `"anthropic/claude-3-5-sonnet-20241022"`
+- **OpenAI**: `"openai/computer-use-preview"`
+- **UI-TARS**: `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"`
+- **Omni + Any LiteLLM**: `"omniparser+litellm/gpt-4o"`, `"omniparser+litellm/claude-3-haiku"`, `"omniparser+ollama_chat/gemma3"`
+
 ### Get started with Agent
 
 ## Prerequisites
@@ -65,10 +80,7 @@ You can then use the script in your MCP configuration like this:
       "command": "/bin/bash",
       "args": ["~/.cua/start_mcp_server.sh"],
       "env": {
-        "CUA_AGENT_LOOP": "OMNI",
-        "CUA_MODEL_PROVIDER": "ANTHROPIC",
-        "CUA_MODEL_NAME": "claude-3-7-sonnet-20250219",
-        "CUA_PROVIDER_API_KEY": "your-api-key"
+        "CUA_MODEL_NAME": "anthropic/claude-3-5-sonnet-20241022"
       }
     }
   }
@@ -86,11 +98,7 @@ If you want to develop with the cua-mcp-server directly without installation, yo
       "command": "/bin/bash",
      "args": ["~/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"],
       "env": {
-        "CUA_AGENT_LOOP": "UITARS",
-        "CUA_MODEL_PROVIDER": "OAICOMPAT",
-        "CUA_MODEL_NAME": "ByteDance-Seed/UI-TARS-1.5-7B",
-        "CUA_PROVIDER_BASE_URL": "https://****************.us-east-1.aws.endpoints.huggingface.cloud/v1",
-        "CUA_PROVIDER_API_KEY": "your-api-key"
+        "CUA_MODEL_NAME": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"
       }
     }
   }
@@ -142,10 +150,7 @@ The server is configured using environment variables (can be set in the Claude D
 
 | Variable | Description | Default |
 |----------|-------------|---------|
-| `CUA_AGENT_LOOP` | Agent loop to use (OPENAI, ANTHROPIC, UITARS, OMNI) | OMNI |
-| `CUA_MODEL_PROVIDER` | Model provider (ANTHROPIC, OPENAI, OLLAMA, OAICOMPAT) | ANTHROPIC |
-| `CUA_MODEL_NAME` | Model name to use | None (provider default) |
-| `CUA_PROVIDER_BASE_URL` | Base URL for provider API | None |
+| `CUA_MODEL_NAME` | Model string (e.g., "anthropic/claude-3-5-sonnet-20241022", "openai/computer-use-preview", "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", "omniparser+litellm/gpt-4o", "omniparser+ollama_chat/gemma3") | anthropic/claude-3-5-sonnet-20241022 |
 | `CUA_MAX_IMAGES` | Maximum number of images to keep in context | 3 |
 
 ## Available Tools
diff --git a/libs/python/mcp-server/mcp_server/server.py b/libs/python/mcp-server/mcp_server/server.py
index 03971cb6..73996d5e 100644
--- a/libs/python/mcp-server/mcp_server/server.py
+++ b/libs/python/mcp-server/mcp_server/server.py
@@ -3,6 +3,7 @@
 import base64
 import logging
 import os
 import sys
+from tabnanny import verbose
 import traceback
 from typing import Any, Dict, List, Optional, Union, Tuple
@@ -28,7 +29,7 @@ except ImportError as e:
 
 try:
     from computer import Computer
-    from agent import ComputerAgent, LLMProvider, LLM, AgentLoop
+    from agent import ComputerAgent
 
     logger.debug("Successfully imported Computer and Agent modules")
 except ImportError as e:
@@ -92,49 +93,27 @@ def serve() -> FastMCP:
                 global_computer = Computer(verbosity=logging.INFO)
                 await global_computer.run()
 
-            # Determine which loop to use
-            loop_str = os.getenv("CUA_AGENT_LOOP", "OMNI")
-            loop = getattr(AgentLoop, loop_str)
+            # Get model name - this now determines the loop and provider
+            model_name = os.getenv("CUA_MODEL_NAME", "anthropic/claude-3-5-sonnet-20241022")
+
+            logger.info(f"Using model: {model_name}")
 
-            # Determine provider
-            provider_str = os.getenv("CUA_MODEL_PROVIDER", "ANTHROPIC")
-            provider = getattr(LLMProvider, provider_str)
-
-            # Get model name (if specified)
-            model_name = os.getenv("CUA_MODEL_NAME", None)
-
-            # Get base URL for provider (if needed)
-            provider_base_url = os.getenv("CUA_PROVIDER_BASE_URL", None)
-
-            # Get api key for provider (if needed)
-            api_key = os.getenv("CUA_PROVIDER_API_KEY", None)
-
-            # Create agent with the specified configuration
+            # Create agent with the new v0.4.x API
             agent = ComputerAgent(
-                computer=global_computer,
-                loop=loop,
-                model=LLM(
-                    provider=provider,
-                    name=model_name,
-                    provider_base_url=provider_base_url,
-                ),
-                api_key=api_key,
-                save_trajectory=False,
+                model=model_name,
                 only_n_most_recent_images=int(os.getenv("CUA_MAX_IMAGES", "3")),
                 verbosity=logging.INFO,
+                tools=[global_computer]
             )
 
+            # Create messages in the new v0.4.x format
+            messages = [{"role": "user", "content": task}]
+
             # Collect all results
             full_result = ""
-            async for result in agent.run(task):
-                logger.info(f"Agent step complete: {result.get('id', 'unknown')}")
-                ctx.info(f"Agent step complete: {result.get('id', 'unknown')}")
-
-                # Add response ID to output
-                full_result += f"\n[Response ID: {result.get('id', 'unknown')}]\n"
-
-                if "content" in result:
-                    full_result += f"Response: {result.get('content', '')}\n"
+            async for result in agent.run(messages):
+                logger.info(f"Agent processing step")
+                ctx.info(f"Agent processing step")
 
                 # Process output if available
                 outputs = result.get("output", [])
@@ -145,25 +124,23 @@
                         content = output.get("content", [])
                         for content_part in content:
                             if content_part.get("text"):
-                                full_result += f"\nMessage: {content_part.get('text', '')}\n"
-                    elif output_type == "reasoning":
-                        logger.debug(f"Reasoning: {output}")
-
-                        summary_content = output.get("summary", [])
-                        if summary_content:
-                            for summary_part in summary_content:
-                                if summary_part.get("text"):
-                                    full_result += f"\nReasoning: {summary_part.get('text', '')}\n"
+                                full_result += f"Message: {content_part.get('text', '')}\n"
+                    elif output_type == "tool_use":
+                        logger.debug(f"Tool use: {output}")
+                        tool_name = output.get("name", "")
+                        full_result += f"Tool: {tool_name}\n"
+                    elif output_type == "tool_result":
+                        logger.debug(f"Tool result: {output}")
+                        result_content = output.get("content", "")
+                        if isinstance(result_content, list):
+                            for item in result_content:
+                                if item.get("type") == "text":
+                                    full_result += f"Result: {item.get('text', '')}\n"
                         else:
-                            full_result += f"\nReasoning: {output.get('text', output.get('content', ''))}\n"
-                    elif output_type == "computer_call":
-                        logger.debug(f"Computer call: {output}")
-                        action = output.get("action", "")
-                        result_value = output.get("result", "")
-                        full_result += f"\nComputer Action: {action}\nResult: {result_value}\n"
+                            full_result += f"Result: {result_content}\n"
 
                 # Add separator between steps
-                full_result += "\n" + "-" * 40 + "\n"
+                full_result += "\n" + "-" * 20 + "\n"
 
             logger.info(f"CUA task completed successfully")
             ctx.info(f"CUA task completed successfully")
@@ -179,7 +156,21 @@
             error_msg = f"Error running CUA task: {str(e)}\n{traceback.format_exc()}"
             logger.error(error_msg)
             ctx.error(error_msg)
-            return f"Error during task execution: {str(e)}"
+            # Return tuple with error message and a screenshot if possible
+            try:
+                if global_computer is not None:
+                    screenshot = await global_computer.interface.screenshot()
+                    return (
+                        f"Error during task execution: {str(e)}",
+                        Image(format="png", data=screenshot)
+                    )
+            except:
+                pass
+            # If we can't get a screenshot, return a placeholder
+            return (
+                f"Error during task execution: {str(e)}",
+                Image(format="png", data=b"")
+            )
 
     @server.tool()
     async def run_multi_cua_tasks(ctx: Context, tasks: List[str]) -> List:
diff --git a/libs/python/mcp-server/pyproject.toml b/libs/python/mcp-server/pyproject.toml
index ed2ad435..f80a1b6b 100644
--- a/libs/python/mcp-server/pyproject.toml
+++ b/libs/python/mcp-server/pyproject.toml
@@ -13,8 +13,8 @@ authors = [
 ]
 dependencies = [
     "mcp>=1.6.0,<2.0.0",
-    "cua-agent[all]>=0.3.0,<0.4.0",
-    "cua-computer>=0.3.0,<0.4.0",
+    "cua-agent[all]>=0.4.0,<0.5.0",
+    "cua-computer>=0.4.0,<0.5.0",
 ]
 
 [project.scripts]
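
Reviewer note: below is a minimal, self-contained sketch of the cua-agent 0.4.x calling pattern this patch adopts in `server.py` — a single model string from `CUA_MODEL_NAME`, the computer passed via `tools=[...]`, a messages list instead of a bare task string, and per-step results whose `output` items are typed (`message`, `tool_use`, `tool_result`). It only restates what the diff above shows; the example task text and the printing logic at the end are illustrative assumptions, not part of this change.

```python
# Illustrative sketch of the cua-agent 0.4.x API as used in this patch.
# Assumes cua-agent[all]>=0.4.0 and cua-computer>=0.4.0 are installed;
# constructor arguments and result shapes mirror the diff above.
import asyncio
import logging
import os

from agent import ComputerAgent
from computer import Computer


async def main() -> None:
    # A single model string replaces the old loop/provider/base-URL settings.
    model_name = os.getenv("CUA_MODEL_NAME", "anthropic/claude-3-5-sonnet-20241022")

    computer = Computer(verbosity=logging.INFO)
    await computer.run()

    agent = ComputerAgent(
        model=model_name,
        only_n_most_recent_images=int(os.getenv("CUA_MAX_IMAGES", "3")),
        verbosity=logging.INFO,
        tools=[computer],  # the computer is now passed as a tool
    )

    # v0.4.x takes a messages list instead of a bare task string.
    messages = [{"role": "user", "content": "Open a browser and go to example.com"}]

    async for result in agent.run(messages):
        # Each step yields a dict whose "output" items carry
        # message / tool_use / tool_result entries (see server.py above).
        for output in result.get("output", []):
            if output.get("type") == "message":
                for part in output.get("content", []):
                    if part.get("text"):
                        print(part["text"])


if __name__ == "__main__":
    asyncio.run(main())
```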