Merge pull request #95 from trycua/feature/agent/mcp-server

[Agent] Add MCP server
2026-02-23 14:59:15 -06:00 · 2025-04-06 17:29:57 -07:00
parent f9723f522a b4bfd7bd89
commit 32269c01a5
12 changed files with 1639 additions and 7 deletions
--- a/.github/workflows/publish-mcp-server.yml
+++ b/.github/workflows/publish-mcp-server.yml
@@ -0,0 +1,68 @@
+name: Publish MCP Server Package
+
+# Three ways to start a publish:
+#   * pushing a package-specific tag (mcp-server-v<version>)
+#   * a manual run with an explicit version
+#   * a call from another workflow, which gets the resolved version back
+on:
+  push:
+    tags:
+      - "mcp-server-v*"
+  workflow_dispatch:
+    inputs:
+      version:
+        description: "Version to publish (without v prefix)"
+        required: true
+        default: "0.1.0"
+  workflow_call:
+    inputs:
+      version:
+        description: "Version to publish"
+        required: true
+        type: string
+    outputs:
+      version:
+        description: "The version that was published"
+        value: ${{ jobs.determine-version.outputs.version }}
+
+# Token permissions declared once at workflow level for all jobs.
+permissions:
+  contents: write
+
+jobs:
+  determine-version:
+    runs-on: macos-latest
+    outputs:
+      version: ${{ steps.get-version.outputs.version }}
+    steps:
+    - uses: actions/checkout@v4
+
+    # Resolve the version to publish and expose it as the `version` step output.
+    #
+    # Precedence:
+    #   1. An explicit `version` input (workflow_dispatch or workflow_call).
+    #   2. The X.Y.Z embedded in a pushed `mcp-server-vX.Y.Z` tag.
+    #
+    # The input is checked first instead of switching on github.event_name:
+    # inside a reusable (workflow_call) run the github context belongs to the
+    # caller, so event_name can be "push" even though a version was passed in.
+    - name: Determine version
+      id: get-version
+      # Workflow values are handed over as environment variables rather than
+      # being interpolated into the script text, so a crafted input or ref
+      # cannot inject shell commands.
+      env:
+        INPUT_VERSION: ${{ inputs.version }}
+        GIT_REF: ${{ github.ref }}
+      run: |
+        if [ -n "$INPUT_VERSION" ]; then
+          # Version supplied by workflow_dispatch or workflow_call
+          VERSION="$INPUT_VERSION"
+        elif [[ "$GIT_REF" =~ ^refs/tags/mcp-server-v([0-9]+\.[0-9]+\.[0-9]+) ]]; then
+          # Extract version from tag (anything after X.Y.Z is ignored)
+          VERSION="${BASH_REMATCH[1]}"
+        else
+          echo "Invalid tag format for mcp-server"
+          exit 1
+        fi
+        echo "VERSION=$VERSION"
+        echo "version=$VERSION" >> "$GITHUB_OUTPUT"
+
+  # Hands the resolved version to the shared reusable publishing workflow.
+  publish:
+    needs: determine-version
+    uses: ./.github/workflows/reusable-publish.yml
+    secrets:
+      PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
+    with:
+      version: ${{ needs.determine-version.outputs.version }}
+      # Must match the "mcp-server" branch of the release-notes script.
+      package_name: "mcp-server"
+      package_dir: "libs/mcp-server"
+      base_package_name: "cua-mcp-server"
+      is_lume_package: false
+      make_latest: false
--- a/.github/workflows/reusable-publish.yml
+++ b/.github/workflows/reusable-publish.yml
@@ -220,6 +220,38 @@ jobs:
          echo "# Run the server" >> release_notes.md
          echo "cua-computer-server" >> release_notes.md
          echo '```' >> release_notes.md
+        elif [ "${{ inputs.package_name }}" = "mcp-server" ]; then
+          # Release notes for the MCP server package. The section is written
+          # through a single command group so release_notes.md is opened once
+          # and no individual line can forget its redirect.
+          {
+            echo "## MCP Server for the Computer-Use Agent (CUA)"
+            echo ""
+            echo "This package provides MCP (Model Context Protocol) integration for CUA agents, allowing them to be used with Claude Desktop, Cursor, and other MCP clients."
+            echo ""
+            echo "## Dependencies"
+            # Fall back to "latest" when the dependency versions were not resolved.
+            echo "* cua-computer: ${COMPUTER_VERSION:-latest}"
+            echo "* cua-agent: ${AGENT_VERSION:-latest}"
+            echo ""
+            echo "## Usage"
+            echo '```bash'
+            echo "# Run the MCP server directly"
+            echo "cua-mcp-server"
+            echo '```'
+            echo ""
+            echo "## Claude Desktop Integration"
+            # Path corrected: Claude Desktop on macOS reads its config from
+            # ~/Library/Application Support/Claude, not ~/.config/claude-desktop.
+            echo "Add to your Claude Desktop configuration (on macOS: ~/Library/Application Support/Claude/claude_desktop_config.json; see the MCP user guide for other platforms):"
+            echo '```json'
+            # Outer braces included so the snippet is a valid JSON document.
+            echo '{'
+            echo '  "mcpServers": {'
+            echo '    "cua-agent": {'
+            echo '      "command": "cua-mcp-server",'
+            echo '      "args": [],'
+            echo '      "env": {'
+            echo '        "CUA_AGENT_LOOP": "OMNI",'
+            echo '        "CUA_MODEL_PROVIDER": "ANTHROPIC",'
+            echo '        "CUA_MODEL_NAME": "claude-3-opus-20240229",'
+            echo '        "ANTHROPIC_API_KEY": "your-api-key",'
+            echo '        "PYTHONIOENCODING": "utf-8"'
+            echo '      }'
+            echo '    }'
+            echo '  }'
+            echo '}'
+            echo '```'
+          } >> release_notes.md
        fi
        
        # Add installation section if not agent (which has its own installation section)
--- a/examples/agent_examples.py
+++ b/examples/agent_examples.py
@@ -33,13 +33,13 @@ async def run_agent_example():
                loop=AgentLoop.OMNI,
                # model=LLM(provider=LLMProvider.OPENAI),  # No model name for Operator CUA
                # model=LLM(provider=LLMProvider.OPENAI, name="gpt-4o"),
-                model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"),
+                # model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"),
                # model=LLM(provider=LLMProvider.OLLAMA, name="gemma3:4b-it-q4_K_M"),
-                # model=LLM(
-                #     provider=LLMProvider.OAICOMPAT,
-                #     name="qwen2.5-vl-7b-instruct",
-                #     provider_base_url="http://localhost:1234/v1",  # LM Studio local endpoint
-                # ),
+                model=LLM(
+                    provider=LLMProvider.OAICOMPAT,
+                    name="gemma-3-12b-it",
+                    provider_base_url="http://localhost:1234/v1",  # LM Studio local endpoint
+                ),
                save_trajectory=True,
                only_n_most_recent_images=3,
                verbosity=logging.DEBUG,
--- a/libs/mcp-server/README.md
+++ b/libs/mcp-server/README.md
@@ -0,0 +1,121 @@
+<div align="center">
+<h1>
+  <div class="image-wrapper" style="display: inline-block;">
+    <picture>
+      <source media="(prefers-color-scheme: dark)" alt="logo" height="150" srcset="../../img/logo_white.png" style="display: block; margin: auto;">
+      <source media="(prefers-color-scheme: light)" alt="logo" height="150" srcset="../../img/logo_black.png" style="display: block; margin: auto;">
+      <img alt="Shows my svg">
+    </picture>
+  </div>
+
+  [![Python](https://img.shields.io/badge/Python-333333?logo=python&logoColor=white&labelColor=333333)](#)
+  [![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
+  [![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
+  [![PyPI](https://img.shields.io/pypi/v/cua-mcp-server?color=333333)](https://pypi.org/project/cua-mcp-server/)
+</h1>
+</div>
+
+**cua-mcp-server** is an MCP server for the Computer-Use Agent (CUA), allowing you to run CUA through Claude Desktop or other MCP clients.
+### Get started with Agent
+
+## Installation
+
+Install the package from PyPI:
+
+```bash
+pip install cua-mcp-server
+```
+
+This will install:
+- The MCP server
+- CUA agent and computer dependencies 
+- An executable `cua-mcp-server` script in your PATH
+
+## Claude Desktop Integration
+
+To use with Claude Desktop, add an entry to your Claude Desktop configuration file, `claude_desktop_config.json` (on macOS: `~/Library/Application Support/Claude/claude_desktop_config.json`; on Windows: `%APPDATA%\Claude\claude_desktop_config.json`):
+
+```json
+{
+  "mcpServers": {
+    "cua-agent": {
+      "command": "cua-mcp-server",
+      "args": [],
+      "env": {
+        "CUA_AGENT_LOOP": "OMNI",
+        "CUA_MODEL_PROVIDER": "ANTHROPIC",
+        "CUA_MODEL_NAME": "claude-3-opus-20240229",
+        "ANTHROPIC_API_KEY": "your-api-key",
+        "PYTHONIOENCODING": "utf-8"
+      }
+    }
+  }
+}
+```
+
+For more information on MCP with Claude Desktop, see the [official MCP User Guide](https://modelcontextprotocol.io/quickstart/user).
+
+## Cursor Integration
+
+To use with Cursor, add an MCP configuration file in one of these locations:
+
+- **Project-specific**: Create `.cursor/mcp.json` in your project directory
+- **Global**: Create `~/.cursor/mcp.json` in your home directory
+
+The configuration format is similar to Claude Desktop's:
+
+```json
+{
+  "mcpServers": {
+    "cua-agent": {
+      "command": "cua-mcp-server",
+      "args": [],
+      "env": {
+        "CUA_AGENT_LOOP": "OMNI",
+        "CUA_MODEL_PROVIDER": "ANTHROPIC",
+        "CUA_MODEL_NAME": "claude-3-7-sonnet-20250219",
+        "ANTHROPIC_API_KEY": "your-api-key",
+        "PYTHONPATH": "/path/to/your/cua/installation"
+      }
+    }
+  }
+}
+```
+
+After configuration, you can simply tell Cursor's Agent to perform computer tasks by explicitly mentioning the CUA agent, such as "Use the computer control tools to open Safari."
+
+For more information on MCP with Cursor, see the [official Cursor MCP documentation](https://docs.cursor.com/context/model-context-protocol).
+
+### First-time Usage Notes
+
+**API Keys**: Ensure you have valid API keys:
+   - Add your Anthropic API key (or another model provider's API key) to the `env` section of the MCP configuration, as shown above
+   - Or set it as an environment variable in your shell profile
+
+## Configuration
+
+The server is configured using environment variables (can be set in the Claude Desktop config):
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `CUA_AGENT_LOOP` | Agent loop to use (OPENAI, ANTHROPIC, OMNI) | OMNI |
+| `CUA_MODEL_PROVIDER` | Model provider (ANTHROPIC, OPENAI, OLLAMA, OAICOMPAT) | ANTHROPIC |
+| `CUA_MODEL_NAME` | Model name to use | None (provider default) |
+| `CUA_PROVIDER_BASE_URL` | Base URL for provider API | None |
+| `CUA_MAX_IMAGES` | Maximum number of images to keep in context | 3 |
+
+## Available Tools
+
+The MCP server exposes the following tools to Claude:
+
+1. `run_cua_task` - Run a single Computer-Use Agent task with the given instruction
+2. `run_multi_cua_tasks` - Run multiple tasks in sequence
+
+## Usage
+
+Once configured, you can simply ask Claude to perform computer tasks:
+
+- "Open Chrome and go to github.com"
+- "Create a folder called 'Projects' on my desktop"
+- "Find all PDFs in my Downloads folder"
+- "Take a screenshot and highlight the error message"
+
+Claude will automatically use your CUA agent to perform these tasks.
--- a/libs/mcp-server/mcp_server/init.py
+++ b/libs/mcp-server/mcp_server/init.py
@@ -0,0 +1,19 @@
+"""MCP Server for Computer-Use Agent (CUA)."""
+
+import os
+import sys
+
+# Substrings that mark an environment variable as likely holding a credential.
+# Values of matching variables are never written to the debug log.
+_SENSITIVE_MARKERS = ("KEY", "TOKEN", "SECRET", "PASSWORD", "PASSWD", "CREDENTIAL")
+
+
+def _write_import_debug_log(path: str = "/tmp/mcp_server_debug.log") -> None:
+    """Dump interpreter and environment details to ``path`` for troubleshooting.
+
+    The file records the Python executable, version, working directory,
+    ``sys.path`` and the process environment. Values of variables whose name
+    contains one of ``_SENSITIVE_MARKERS`` (for example ``ANTHROPIC_API_KEY``)
+    are replaced with ``<redacted>`` so API keys are not left in a temp file.
+
+    The dump is best-effort: if the file cannot be written (no ``/tmp``,
+    read-only filesystem, deleted working directory) a note is printed to
+    stderr and the package import continues.
+
+    Args:
+        path: Destination file; overwritten on every import.
+    """
+    try:
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(f"Python executable: {sys.executable}\n")
+            f.write(f"Python version: {sys.version}\n")
+            f.write(f"Working directory: {os.getcwd()}\n")
+            f.write("Python path:\n" + "\n".join(sys.path) + "\n")
+            f.write("Environment variables:\n")
+            for key, value in os.environ.items():
+                if any(marker in key.upper() for marker in _SENSITIVE_MARKERS):
+                    value = "<redacted>"
+                f.write(f"{key}={value}\n")
+    except OSError as exc:
+        # stderr only: stdout must stay untouched for the host process.
+        print(f"mcp_server: could not write debug log {path}: {exc}", file=sys.stderr)
+
+
+# Add detailed debugging at import time
+_write_import_debug_log()
+
+from .server import server, main
+
+__version__ = "0.1.0"
+__all__ = ["server", "main"]
--- a/libs/mcp-server/mcp_server/main.py
+++ b/libs/mcp-server/mcp_server/main.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+"""Entry point for the MCP server module."""
+
+# Relative import: this file only works when executed as part of the package,
+# not as a standalone script.
+from .server import main
+
+# Start the server only when this module is the program entry point.
+if __name__ == "__main__":
+    main()
--- a/libs/mcp-server/mcp_server/server.py
+++ b/libs/mcp-server/mcp_server/server.py
@@ -0,0 +1,193 @@
+import asyncio
+import logging
+import os
+import sys
+import traceback
+from typing import Any, Dict, List, Optional, Union
+
+# Configure root logging for the whole process. Everything goes to stderr
+# (presumably so stdout stays free for the MCP transport - confirm).
+logging.basicConfig(
+    level=logging.DEBUG,  # most verbose level; applies to every logger in the process
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    stream=sys.stderr,
+)
+logger = logging.getLogger("mcp-server")
+
+# Emitted as soon as the module starts importing, before any optional imports.
+logger.debug("MCP Server module loading...")
+
+# The MCP SDK is mandatory: log the failure with a traceback and terminate the
+# process with exit status 1 if it is missing.
+try:
+    from mcp.server.fastmcp import Context, FastMCP
+
+    logger.debug("Successfully imported FastMCP")
+except ImportError as e:
+    logger.error(f"Failed to import FastMCP: {e}")
+    traceback.print_exc(file=sys.stderr)
+    sys.exit(1)
+
+# Same policy for the CUA computer/agent packages the tools are built on.
+try:
+    from computer import Computer
+    from agent import ComputerAgent, LLMProvider, LLM, AgentLoop
+
+    logger.debug("Successfully imported Computer and Agent modules")
+except ImportError as e:
+    logger.error(f"Failed to import Computer/Agent modules: {e}")
+    traceback.print_exc(file=sys.stderr)
+    sys.exit(1)
+
+# Computer instance shared by all tool calls; created lazily by the first task.
+global_computer = None
+
+
+def get_env_bool(key: str, default: bool = False) -> bool:
+    """Interpret the environment variable ``key`` as a boolean flag.
+
+    Args:
+        key: Name of the environment variable.
+        default: Value used when the variable is not set at all.
+
+    Returns:
+        True when the (case-insensitive) value is "true", "1" or "yes";
+        False for anything else, including an empty string.
+    """
+    raw = os.getenv(key)
+    if raw is None:
+        # Unset: fall back to the textual form of the default ("True"/"False").
+        raw = str(default)
+    return raw.lower() in {"true", "1", "yes"}
+
+
+def _build_agent(computer: Computer) -> ComputerAgent:
+    """Build a ComputerAgent for ``computer`` from the ``CUA_*`` environment variables.
+
+    Variables read: CUA_AGENT_LOOP (default "OMNI"), CUA_MODEL_PROVIDER (default
+    "ANTHROPIC"), CUA_MODEL_NAME, CUA_PROVIDER_BASE_URL and CUA_MAX_IMAGES
+    (default "3"). Loop and provider names are matched case-insensitively;
+    empty model name / base URL values are treated as unset.
+
+    Raises:
+        ValueError: If CUA_MODEL_PROVIDER does not name an LLMProvider attribute
+            or CUA_MAX_IMAGES is not an integer.
+    """
+    # Unknown or unset loop names fall back to OMNI.
+    loop_str = os.getenv("CUA_AGENT_LOOP", "OMNI").strip().upper()
+    if loop_str == "OPENAI":
+        loop = AgentLoop.OPENAI
+    elif loop_str == "ANTHROPIC":
+        loop = AgentLoop.ANTHROPIC
+    else:
+        loop = AgentLoop.OMNI
+
+    provider_str = os.getenv("CUA_MODEL_PROVIDER", "ANTHROPIC").strip().upper()
+    provider = getattr(LLMProvider, provider_str, None)
+    if provider is None:
+        raise ValueError(f"Unsupported CUA_MODEL_PROVIDER: {provider_str!r}")
+
+    max_images_str = os.getenv("CUA_MAX_IMAGES", "3")
+    try:
+        max_images = int(max_images_str)
+    except ValueError:
+        raise ValueError(
+            f"CUA_MAX_IMAGES must be an integer, got {max_images_str!r}"
+        ) from None
+
+    return ComputerAgent(
+        computer=computer,
+        loop=loop,
+        model=LLM(
+            provider=provider,
+            name=os.getenv("CUA_MODEL_NAME") or None,
+            provider_base_url=os.getenv("CUA_PROVIDER_BASE_URL") or None,
+        ),
+        save_trajectory=False,
+        only_n_most_recent_images=max_images,
+        verbosity=logging.INFO,
+    )
+
+
+def _format_step(result: Dict[str, Any]) -> str:
+    """Render one item yielded by ``agent.run`` as a text block.
+
+    ``result`` is used as a mapping (``in`` / ``.get``); the keys consulted are
+    "id", "text", "tools" and "output". A missing or ``None`` "output" is
+    treated as an empty list.
+    """
+    parts = [f"\n[Response ID: {result.get('id', 'unknown')}]\n"]
+
+    if "text" in result:
+        # Text may be a string or a structured value; str() covers both.
+        text_response = result.get("text", "")
+        parts.append(f"Response: {str(text_response)}\n")
+
+    if "tools" in result:
+        tools_info = result.get("tools")
+        logger.debug(f"Tools used: {tools_info}")
+        parts.append(f"\nTools used: {tools_info}\n")
+
+    for output in result.get("output") or []:
+        output_type = output.get("type")
+        if output_type == "reasoning":
+            logger.debug(f"Reasoning: {output}")
+            parts.append(f"\nReasoning: {output.get('content', '')}\n")
+        elif output_type == "computer_call":
+            logger.debug(f"Computer call: {output}")
+            action = output.get("action", "")
+            result_value = output.get("result", "")
+            parts.append(f"\nComputer Action: {action}\nResult: {result_value}\n")
+
+    # Separator between steps
+    parts.append("\n" + "-" * 40 + "\n")
+    return "".join(parts)
+
+
+def serve() -> FastMCP:
+    """Create and configure the MCP server.
+
+    Returns:
+        A FastMCP instance named "cua-agent" with two registered tools:
+        ``run_cua_task`` and ``run_multi_cua_tasks``.
+
+    Note:
+        The tool docstrings below are left untouched on purpose; they are
+        presumably published to MCP clients as the tool descriptions.
+    """
+    server = FastMCP("cua-agent")
+
+    # Serializes first-time computer start-up so overlapping tool calls neither
+    # start two computers nor pick up one that is still starting.
+    computer_init_lock = asyncio.Lock()
+
+    @server.tool()
+    async def run_cua_task(ctx: Context, task: str) -> str:
+        """
+        Run a Computer-Use Agent (CUA) task and return the results.
+
+        Args:
+            ctx: The MCP context
+            task: The instruction or task for the agent to perform
+
+        Returns:
+            A string containing the agent's response
+        """
+        global global_computer
+
+        try:
+            logger.info(f"Starting CUA task: {task}")
+
+            # Initialize computer if needed. The global is published only after
+            # run() succeeds, so a failed start is retried by the next task
+            # instead of leaving a never-started instance behind.
+            async with computer_init_lock:
+                if global_computer is None:
+                    computer = Computer(verbosity=logging.INFO)
+                    await computer.run()
+                    global_computer = computer
+
+            # Create agent with the configuration taken from the environment
+            agent = _build_agent(global_computer)
+
+            # Collect all results
+            step_texts = []
+            async for result in agent.run(task):
+                logger.info(f"Agent step complete: {result.get('id', 'unknown')}")
+                step_texts.append(_format_step(result))
+
+            logger.info("CUA task completed successfully")
+            return "".join(step_texts) or "Task completed with no text output."
+
+        except Exception as e:
+            # Full traceback goes to the log; the client only gets the message.
+            error_msg = f"Error running CUA task: {str(e)}\n{traceback.format_exc()}"
+            logger.error(error_msg)
+            return f"Error during task execution: {str(e)}"
+
+    @server.tool()
+    async def run_multi_cua_tasks(ctx: Context, tasks: List[str]) -> str:
+        """
+        Run multiple CUA tasks in sequence and return the combined results.
+
+        Args:
+            ctx: The MCP context
+            tasks: List of tasks to run in sequence
+
+        Returns:
+            Combined results from all tasks
+        """
+        results = []
+
+        # Tasks run strictly one after another; a failing task yields its error
+        # text as the result and does not stop the remaining tasks.
+        for i, task in enumerate(tasks):
+            logger.info(f"Running task {i+1}/{len(tasks)}: {task}")
+            result = await run_cua_task(ctx, task)
+            results.append(f"Task {i+1}: {task}\nResult: {result}\n")
+
+        return "\n".join(results)
+
+    return server
+
+
+server = serve()
+
+
+def main():
+    """Start the module-level MCP server.
+
+    Any exception raised while the server runs is logged, its traceback is
+    printed to stderr, and the process exits with status 1.
+    """
+    try:
+        logger.debug("Starting MCP server...")
+        server.run()
+    except Exception as exc:
+        logger.error(f"Error starting server: {exc}")
+        traceback.print_exc(file=sys.stderr)
+        raise SystemExit(1)
+
+
+if __name__ == "__main__":
+    main()
--- a/libs/mcp-server/pdm.lock
+++ b/libs/mcp-server/pdm.lock
--- a/libs/mcp-server/pyproject.toml
+++ b/libs/mcp-server/pyproject.toml
@@ -0,0 +1,40 @@
+# Build with the PDM backend.
+[build-system]
+requires = ["pdm-backend"]
+build-backend = "pdm.backend"
+
+# Distribution metadata for the cua-mcp-server package.
+[project]
+name = "cua-mcp-server"
+description = "MCP Server for Computer-Use Agent (CUA)"
+readme = "README.md"
+requires-python = ">=3.10,<3.13"
+# NOTE(review): the same version string is hard-coded as __version__ in the
+# package's init module - keep the two in sync.
+version = "0.1.0"
+authors = [
+    {name = "TryCua", email = "gh@trycua.com"}
+]
+# Runtime requirements: the MCP SDK plus the CUA agent and computer packages
+# that mcp_server/server.py imports.
+dependencies = [
+    "mcp>=1.6.0,<2.0.0",
+    "cua-agent>=0.1.0,<0.2.0",
+    "cua-computer>=0.1.0,<0.2.0",
+]
+
+# Console command `cua-mcp-server` -> main() in mcp_server/server.py.
+[project.scripts]
+cua-mcp-server = "mcp_server.server:main"
+
+[tool.pdm]
+distribution = true
+
+# Formatter and linter used during development only.
+[tool.pdm.dev-dependencies]
+dev = [
+    "black>=23.9.1",
+    "ruff>=0.0.292",
+]
+
+[tool.black]
+line-length = 100
+target-version = ["py310"]
+
+# Same line length and target as black above.
+[tool.ruff]
+line-length = 100
+target-version = "py310"
+select = ["E", "F", "B", "I"]
+fix = true
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,6 +56,7 @@ cua-omniparser = { path = "libs/omniparser" }
 cua-agent = { path = "libs/agent" }
 pylume = { path = "libs/pylume" }
 cua-computer-server = { path = "libs/computer-server" }
+cua-mcp-server = { path = "libs/mcp-server" }

 [tool.black]
 line-length = 100
--- a/scripts/build.sh
+++ b/scripts/build.sh
@@ -104,13 +104,16 @@ install_package "libs/agent" "agent" "all"
 # Install computer-server
 install_package "libs/computer-server" "computer-server"

+# Install mcp-server
+install_package "libs/mcp-server" "mcp-server"
+
 # Install development tools from root project
 print_step "Installing development dependencies..."
 pip install -e ".[dev,test,docs]"

 # Create a .env file for VS Code to use the virtual environment
 print_step "Creating .env file for VS Code..."
-echo "PYTHONPATH=${PROJECT_ROOT}/libs/core:${PROJECT_ROOT}/libs/computer:${PROJECT_ROOT}/libs/agent:${PROJECT_ROOT}/libs/som:${PROJECT_ROOT}/libs/pylume:${PROJECT_ROOT}/libs/computer-server" > .env
+echo "PYTHONPATH=${PROJECT_ROOT}/libs/core:${PROJECT_ROOT}/libs/computer:${PROJECT_ROOT}/libs/agent:${PROJECT_ROOT}/libs/som:${PROJECT_ROOT}/libs/pylume:${PROJECT_ROOT}/libs/computer-server:${PROJECT_ROOT}/libs/mcp-server" > .env

 print_success "All packages installed successfully!"
 print_step "Your virtual environment is ready. To activate it:"
--- a/scripts/cleanup.sh
+++ b/scripts/cleanup.sh
@@ -64,6 +64,11 @@ find . -type d -name ".pdm-build" -exec rm -rf {} +
 find . -name ".pdm-python" -delete  # .pdm-python is a file, not a directory
 print_success "PDM-related files removed"

+# Remove MCP-related files
+print_step "Removing MCP-related files..."
+find . -name "mcp_server.log" -delete
+print_success "MCP-related files removed"
+
 # Remove .env file
 print_step "Removing .env file..."
 rm -f .env