Merge branch 'main' into feature/agent/demo-video-maker

This commit is contained in:
Dillon DuPont
2025-05-01 20:31:19 -07:00
6 changed files with 140 additions and 43 deletions
+1 -1
View File
@@ -82,7 +82,7 @@ If you want to use AI agents with virtualized environments:
async with Computer(verbosity=logging.DEBUG) as macos_computer:
agent = ComputerAgent(
computer=macos_computer,
loop=AgentLoop.OPENAI, # or AgentLoop.ANTHROPIC, or AgentLoop.OMNI
loop=AgentLoop.OPENAI, # or AgentLoop.ANTHROPIC, or AgentLoop.UITARS, or AgentLoop.OMNI
model=LLM(provider=LLMProvider.OPENAI) # or LLM(provider=LLMProvider.ANTHROPIC)
)
+2 -2
View File
@@ -50,10 +50,10 @@ async with Computer() as macos_computer:
# model=LLM(provider=LLMProvider.ANTHROPIC)
# or
# loop=AgentLoop.OMNI,
# model=LLM(provider=LLMProvider.OLLAMA, model="gemma3")
# model=LLM(provider=LLMProvider.OLLAMA, name="gemma3")
# or
# loop=AgentLoop.UITARS,
# model=LLM(provider=LLMProvider.OAICOMPAT, model="tgi", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
# model=LLM(provider=LLMProvider.OAICOMPAT, name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
)
tasks = [
+15 -2
View File
@@ -179,8 +179,21 @@ final class Server {
return HTTPResponse(statusCode: .badRequest, body: "Missing VM name")
}
// Extract storage from query params if present
let storage = self.extractQueryParam(request: request, name: "storage")
Logger.info("Processing stop VM request", metadata: ["method": request.method, "path": request.path])
// Extract storage from the request body
var storage: String? = nil
if let bodyData = request.body, !bodyData.isEmpty {
do {
if let json = try JSONSerialization.jsonObject(with: bodyData) as? [String: Any],
let bodyStorage = json["storage"] as? String {
storage = bodyStorage
Logger.info("Extracted storage from request body", metadata: ["storage": bodyStorage])
}
} catch {
Logger.error("Failed to parse request body JSON", metadata: ["error": error.localizedDescription])
}
}
return try await self.handleStopVM(name: name, storage: storage)
}),
+39 -1
View File
@@ -68,13 +68,51 @@ You can then use the script in your MCP configuration like this:
"CUA_AGENT_LOOP": "OMNI",
"CUA_MODEL_PROVIDER": "ANTHROPIC",
"CUA_MODEL_NAME": "claude-3-7-sonnet-20250219",
"ANTHROPIC_API_KEY": "your-api-key"
"CUA_PROVIDER_API_KEY": "your-api-key"
}
}
}
}
```
## Development Guide
If you want to develop with the cua-mcp-server directly without installation, you can use this configuration:
```json
{
"mcpServers": {
"cua-agent": {
"command": "/bin/bash",
"args": ["~/cua/libs/mcp-server/scripts/start_mcp_server.sh"],
"env": {
"CUA_AGENT_LOOP": "UITARS",
"CUA_MODEL_PROVIDER": "OAICOMPAT",
"CUA_MODEL_NAME": "ByteDance-Seed/UI-TARS-1.5-7B",
"CUA_PROVIDER_BASE_URL": "https://****************.us-east-1.aws.endpoints.huggingface.cloud/v1",
"CUA_PROVIDER_API_KEY": "your-api-key"
}
}
}
}
```
This configuration:
- Uses the start_mcp_server.sh script which automatically sets up the Python path and runs the server module
- Works with Claude Desktop, Cursor, or any other MCP client
- Automatically uses your development code without requiring installation
Just add this to your MCP client's configuration and it will use your local development version of the server.
### Troubleshooting
If you get a `/bin/bash: ~/cua/libs/mcp-server/scripts/start_mcp_server.sh: No such file or directory` error, try changing the path to the script to be absolute instead of relative.
To see the logs:
```
tail -n 20 -f ~/Library/Logs/Claude/mcp*.log
```
## Claude Desktop Integration
To use with Claude Desktop, add an entry to your Claude Desktop configuration (`claude_desktop_config.json`, typically found in `~/.config/claude-desktop/`):
+69 -37
View File
@@ -1,9 +1,10 @@
import asyncio
import base64
import logging
import os
import sys
import traceback
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Union, Tuple
# Configure logging to output to stderr for debug visibility
logging.basicConfig(
@@ -17,7 +18,7 @@ logger = logging.getLogger("mcp-server")
logger.debug("MCP Server module loading...")
try:
from mcp.server.fastmcp import Context, FastMCP
from mcp.server.fastmcp import Context, FastMCP, Image
logger.debug("Successfully imported FastMCP")
except ImportError as e:
@@ -49,16 +50,37 @@ def serve() -> FastMCP:
server = FastMCP("cua-agent")
@server.tool()
async def run_cua_task(ctx: Context, task: str) -> str:
async def screenshot_cua(ctx: Context) -> Image:
"""
Run a Computer-Use Agent (CUA) task and return the results.
Take a screenshot of the current MacOS VM screen and return the image. Use this before running a CUA task to get a snapshot of the current state.
Args:
ctx: The MCP context
Returns:
An image resource containing the screenshot
"""
global global_computer
if global_computer is None:
global_computer = Computer(verbosity=logging.INFO)
await global_computer.run()
screenshot = await global_computer.interface.screenshot()
return Image(
format="png",
data=screenshot
)
@server.tool()
async def run_cua_task(ctx: Context, task: str) -> Tuple[str, Image]:
"""
Run a Computer-Use Agent (CUA) task in a MacOS VM and return the results.
Args:
ctx: The MCP context
task: The instruction or task for the agent to perform
Returns:
A string containing the agent's response
A tuple containing the agent's response and the final screenshot
"""
global global_computer
@@ -72,12 +94,7 @@ def serve() -> FastMCP:
# Determine which loop to use
loop_str = os.getenv("CUA_AGENT_LOOP", "OMNI")
if loop_str == "OPENAI":
loop = AgentLoop.OPENAI
elif loop_str == "ANTHROPIC":
loop = AgentLoop.ANTHROPIC
else:
loop = AgentLoop.OMNI
loop = getattr(AgentLoop, loop_str)
# Determine provider
provider_str = os.getenv("CUA_MODEL_PROVIDER", "ANTHROPIC")
@@ -89,6 +106,9 @@ def serve() -> FastMCP:
# Get base URL for provider (if needed)
provider_base_url = os.getenv("CUA_PROVIDER_BASE_URL", None)
# Get api key for provider (if needed)
api_key = os.getenv("CUA_PROVIDER_API_KEY", None)
# Create agent with the specified configuration
agent = ComputerAgent(
computer=global_computer,
@@ -98,6 +118,7 @@ def serve() -> FastMCP:
name=model_name,
provider_base_url=provider_base_url,
),
api_key=api_key,
save_trajectory=False,
only_n_most_recent_images=int(os.getenv("CUA_MAX_IMAGES", "3")),
verbosity=logging.INFO,
@@ -107,33 +128,34 @@ def serve() -> FastMCP:
full_result = ""
async for result in agent.run(task):
logger.info(f"Agent step complete: {result.get('id', 'unknown')}")
ctx.info(f"Agent step complete: {result.get('id', 'unknown')}")
# Add response ID to output
full_result += f"\n[Response ID: {result.get('id', 'unknown')}]\n"
# Extract and concatenate text responses
if "text" in result:
# Handle both string and dict responses
text_response = result.get("text", "")
if isinstance(text_response, str):
full_result += f"Response: {text_response}\n"
else:
# If it's a dict or other structure, convert to string representation
full_result += f"Response: {str(text_response)}\n"
# Log detailed information
if "tools" in result:
tools_info = result.get("tools")
logger.debug(f"Tools used: {tools_info}")
full_result += f"\nTools used: {tools_info}\n"
if "content" in result:
full_result += f"Response: {result.get('content', '')}\n"
# Process output if available
outputs = result.get("output", [])
for output in outputs:
output_type = output.get("type")
if output_type == "reasoning":
if output_type == "message":
logger.debug(f"Message: {output}")
content = output.get("content", [])
for content_part in content:
if content_part.get("text"):
full_result += f"\nMessage: {content_part.get('text', '')}\n"
elif output_type == "reasoning":
logger.debug(f"Reasoning: {output}")
full_result += f"\nReasoning: {output.get('content', '')}\n"
summary_content = output.get("summary", [])
if summary_content:
for summary_part in summary_content:
if summary_part.get("text"):
full_result += f"\nReasoning: {summary_part.get('text', '')}\n"
else:
full_result += f"\nReasoning: {output.get('text', output.get('content', ''))}\n"
elif output_type == "computer_call":
logger.debug(f"Computer call: {output}")
action = output.get("action", "")
@@ -144,17 +166,25 @@ def serve() -> FastMCP:
full_result += "\n" + "-" * 40 + "\n"
logger.info(f"CUA task completed successfully")
return full_result or "Task completed with no text output."
ctx.info(f"CUA task completed successfully")
return (
full_result or "Task completed with no text output.",
Image(
format="png",
data=await global_computer.interface.screenshot()
)
)
except Exception as e:
error_msg = f"Error running CUA task: {str(e)}\n{traceback.format_exc()}"
logger.error(error_msg)
ctx.error(error_msg)
return f"Error during task execution: {str(e)}"
@server.tool()
async def run_multi_cua_tasks(ctx: Context, tasks: List[str]) -> str:
async def run_multi_cua_tasks(ctx: Context, tasks: List[str]) -> List:
"""
Run multiple CUA tasks in sequence and return the combined results.
Run multiple CUA tasks in a MacOS VM in sequence and return the combined results.
Args:
ctx: The MCP context
@@ -164,13 +194,15 @@ def serve() -> FastMCP:
Combined results from all tasks
"""
results = []
for i, task in enumerate(tasks):
logger.info(f"Running task {i+1}/{len(tasks)}: {task}")
result = await run_cua_task(ctx, task)
results.append(f"Task {i+1}: {task}\nResult: {result}\n")
return "\n".join(results)
ctx.info(f"Running task {i+1}/{len(tasks)}: {task}")
ctx.report_progress(i / len(tasks))
results.extend(await run_cua_task(ctx, task))
ctx.report_progress((i + 1) / len(tasks))
return results
return server
+14
View File
@@ -0,0 +1,14 @@
#!/bin/bash
# Launch the cua MCP server from a source checkout without installation.
# Resolves all paths relative to this script so it works from any CWD.
set -e

# Set the CUA repository path based on script location
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
CUA_REPO_DIR="$( cd "$SCRIPT_DIR/../../.." &> /dev/null && pwd )"
PYTHON_PATH="${CUA_REPO_DIR}/.venv/bin/python"

# Fail early with a clear message if the venv interpreter is missing;
# otherwise bash reports a confusing "No such file or directory" later.
if [ ! -x "$PYTHON_PATH" ]; then
    echo "Error: Python interpreter not found at $PYTHON_PATH — create the repo's .venv first." >&2
    exit 1
fi

# Set Python path to include all necessary libraries
export PYTHONPATH="${CUA_REPO_DIR}/libs/mcp-server:${CUA_REPO_DIR}/libs/agent:${CUA_REPO_DIR}/libs/computer:${CUA_REPO_DIR}/libs/core:${CUA_REPO_DIR}/libs/pylume"

# Run the MCP server directly as a module.
# exec replaces this wrapper shell so signals reach the server directly;
# quoting keeps paths with spaces working.
exec "$PYTHON_PATH" -m mcp_server.server