Merge branch 'main' into feature/agent/demo-video-maker

This commit is contained in:
Dillon DuPont
2025-05-01 20:31:19 -07:00
6 changed files with 140 additions and 43 deletions
+1 -1
View File
@@ -82,7 +82,7 @@ If you want to use AI agents with virtualized environments:
async with Computer(verbosity=logging.DEBUG) as macos_computer:
agent = ComputerAgent(
computer=macos_computer,
loop=AgentLoop.OPENAI, # or AgentLoop.ANTHROPIC, or AgentLoop.OMNI
loop=AgentLoop.OPENAI, # or AgentLoop.ANTHROPIC, or AgentLoop.UITARS, or AgentLoop.OMNI
model=LLM(provider=LLMProvider.OPENAI) # or LLM(provider=LLMProvider.ANTHROPIC)
)
+2 -2
View File
@@ -50,10 +50,10 @@ async with Computer() as macos_computer:
# model=LLM(provider=LLMProvider.ANTHROPIC)
# or
# loop=AgentLoop.OMNI,
# model=LLM(provider=LLMProvider.OLLAMA, model="gemma3")
# model=LLM(provider=LLMProvider.OLLAMA, name="gemma3")
# or
# loop=AgentLoop.UITARS,
# model=LLM(provider=LLMProvider.OAICOMPAT, model="tgi", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
# model=LLM(provider=LLMProvider.OAICOMPAT, name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
)
tasks = [
+15 -2
View File
@@ -179,8 +179,21 @@ final class Server {
return HTTPResponse(statusCode: .badRequest, body: "Missing VM name")
}
// Extract storage from query params if present
let storage = self.extractQueryParam(request: request, name: "storage")
Logger.info("Processing stop VM request", metadata: ["method": request.method, "path": request.path])
// Extract storage from the request body
var storage: String? = nil
if let bodyData = request.body, !bodyData.isEmpty {
do {
if let json = try JSONSerialization.jsonObject(with: bodyData) as? [String: Any],
let bodyStorage = json["storage"] as? String {
storage = bodyStorage
Logger.info("Extracted storage from request body", metadata: ["storage": bodyStorage])
}
} catch {
Logger.error("Failed to parse request body JSON", metadata: ["error": error.localizedDescription])
}
}
return try await self.handleStopVM(name: name, storage: storage)
}),
+39 -1
View File
@@ -68,13 +68,51 @@ You can then use the script in your MCP configuration like this:
"CUA_AGENT_LOOP": "OMNI",
"CUA_MODEL_PROVIDER": "ANTHROPIC",
"CUA_MODEL_NAME": "claude-3-7-sonnet-20250219",
"ANTHROPIC_API_KEY": "your-api-key"
"CUA_PROVIDER_API_KEY": "your-api-key"
}
}
}
}
```
## Development Guide
If you want to develop with the cua-mcp-server directly without installation, you can use this configuration:
```json
{
"mcpServers": {
"cua-agent": {
"command": "/bin/bash",
"args": ["~/cua/libs/mcp-server/scripts/start_mcp_server.sh"],
"env": {
"CUA_AGENT_LOOP": "UITARS",
"CUA_MODEL_PROVIDER": "OAICOMPAT",
"CUA_MODEL_NAME": "ByteDance-Seed/UI-TARS-1.5-7B",
"CUA_PROVIDER_BASE_URL": "https://****************.us-east-1.aws.endpoints.huggingface.cloud/v1",
"CUA_PROVIDER_API_KEY": "your-api-key"
}
}
}
}
```
This configuration:
- Uses the start_mcp_server.sh script which automatically sets up the Python path and runs the server module
- Works with Claude Desktop, Cursor, or any other MCP client
- Automatically uses your development code without requiring installation
Just add this to your MCP client's configuration and it will use your local development version of the server.
### Troubleshooting
If you get a `/bin/bash: ~/cua/libs/mcp-server/scripts/start_mcp_server.sh: No such file or directory` error, try changing the path to the script to be absolute instead of relative.
To see the logs:
```
tail -n 20 -f ~/Library/Logs/Claude/mcp*.log
```
## Claude Desktop Integration
To use with Claude Desktop, add an entry to your Claude Desktop configuration (`claude_desktop_config.json`, typically found in `~/.config/claude-desktop/`):
+69 -37
View File
@@ -1,9 +1,10 @@
import asyncio
import base64
import logging
import os
import sys
import traceback
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Union, Tuple
# Configure logging to output to stderr for debug visibility
logging.basicConfig(
@@ -17,7 +18,7 @@ logger = logging.getLogger("mcp-server")
logger.debug("MCP Server module loading...")
try:
from mcp.server.fastmcp import Context, FastMCP
from mcp.server.fastmcp import Context, FastMCP, Image
logger.debug("Successfully imported FastMCP")
except ImportError as e:
@@ -49,16 +50,37 @@ def serve() -> FastMCP:
server = FastMCP("cua-agent")
@server.tool()
async def run_cua_task(ctx: Context, task: str) -> str:
async def screenshot_cua(ctx: Context) -> Image:
"""
Run a Computer-Use Agent (CUA) task and return the results.
Take a screenshot of the current MacOS VM screen and return the image. Use this before running a CUA task to get a snapshot of the current state.
Args:
ctx: The MCP context
Returns:
An image resource containing the screenshot
"""
global global_computer
if global_computer is None:
global_computer = Computer(verbosity=logging.INFO)
await global_computer.run()
screenshot = await global_computer.interface.screenshot()
return Image(
format="png",
data=screenshot
)
@server.tool()
async def run_cua_task(ctx: Context, task: str) -> Tuple[str, Image]:
"""
Run a Computer-Use Agent (CUA) task in a MacOS VM and return the results.
Args:
ctx: The MCP context
task: The instruction or task for the agent to perform
Returns:
A string containing the agent's response
A tuple containing the agent's response and the final screenshot
"""
global global_computer
@@ -72,12 +94,7 @@ def serve() -> FastMCP:
# Determine which loop to use
loop_str = os.getenv("CUA_AGENT_LOOP", "OMNI")
if loop_str == "OPENAI":
loop = AgentLoop.OPENAI
elif loop_str == "ANTHROPIC":
loop = AgentLoop.ANTHROPIC
else:
loop = AgentLoop.OMNI
loop = getattr(AgentLoop, loop_str)
# Determine provider
provider_str = os.getenv("CUA_MODEL_PROVIDER", "ANTHROPIC")
@@ -89,6 +106,9 @@ def serve() -> FastMCP:
# Get base URL for provider (if needed)
provider_base_url = os.getenv("CUA_PROVIDER_BASE_URL", None)
# Get api key for provider (if needed)
api_key = os.getenv("CUA_PROVIDER_API_KEY", None)
# Create agent with the specified configuration
agent = ComputerAgent(
computer=global_computer,
@@ -98,6 +118,7 @@ def serve() -> FastMCP:
name=model_name,
provider_base_url=provider_base_url,
),
api_key=api_key,
save_trajectory=False,
only_n_most_recent_images=int(os.getenv("CUA_MAX_IMAGES", "3")),
verbosity=logging.INFO,
@@ -107,33 +128,34 @@ def serve() -> FastMCP:
full_result = ""
async for result in agent.run(task):
logger.info(f"Agent step complete: {result.get('id', 'unknown')}")
ctx.info(f"Agent step complete: {result.get('id', 'unknown')}")
# Add response ID to output
full_result += f"\n[Response ID: {result.get('id', 'unknown')}]\n"
# Extract and concatenate text responses
if "text" in result:
# Handle both string and dict responses
text_response = result.get("text", "")
if isinstance(text_response, str):
full_result += f"Response: {text_response}\n"
else:
# If it's a dict or other structure, convert to string representation
full_result += f"Response: {str(text_response)}\n"
# Log detailed information
if "tools" in result:
tools_info = result.get("tools")
logger.debug(f"Tools used: {tools_info}")
full_result += f"\nTools used: {tools_info}\n"
if "content" in result:
full_result += f"Response: {result.get('content', '')}\n"
# Process output if available
outputs = result.get("output", [])
for output in outputs:
output_type = output.get("type")
if output_type == "reasoning":
if output_type == "message":
logger.debug(f"Message: {output}")
content = output.get("content", [])
for content_part in content:
if content_part.get("text"):
full_result += f"\nMessage: {content_part.get('text', '')}\n"
elif output_type == "reasoning":
logger.debug(f"Reasoning: {output}")
full_result += f"\nReasoning: {output.get('content', '')}\n"
summary_content = output.get("summary", [])
if summary_content:
for summary_part in summary_content:
if summary_part.get("text"):
full_result += f"\nReasoning: {summary_part.get('text', '')}\n"
else:
full_result += f"\nReasoning: {output.get('text', output.get('content', ''))}\n"
elif output_type == "computer_call":
logger.debug(f"Computer call: {output}")
action = output.get("action", "")
@@ -144,17 +166,25 @@ def serve() -> FastMCP:
full_result += "\n" + "-" * 40 + "\n"
logger.info(f"CUA task completed successfully")
return full_result or "Task completed with no text output."
ctx.info(f"CUA task completed successfully")
return (
full_result or "Task completed with no text output.",
Image(
format="png",
data=await global_computer.interface.screenshot()
)
)
except Exception as e:
error_msg = f"Error running CUA task: {str(e)}\n{traceback.format_exc()}"
logger.error(error_msg)
ctx.error(error_msg)
return f"Error during task execution: {str(e)}"
@server.tool()
async def run_multi_cua_tasks(ctx: Context, tasks: List[str]) -> str:
async def run_multi_cua_tasks(ctx: Context, tasks: List[str]) -> List:
"""
Run multiple CUA tasks in sequence and return the combined results.
Run multiple CUA tasks in a MacOS VM in sequence and return the combined results.
Args:
ctx: The MCP context
@@ -164,13 +194,15 @@ def serve() -> FastMCP:
Combined results from all tasks
"""
results = []
for i, task in enumerate(tasks):
logger.info(f"Running task {i+1}/{len(tasks)}: {task}")
result = await run_cua_task(ctx, task)
results.append(f"Task {i+1}: {task}\nResult: {result}\n")
return "\n".join(results)
ctx.info(f"Running task {i+1}/{len(tasks)}: {task}")
ctx.report_progress(i / len(tasks))
results.extend(await run_cua_task(ctx, task))
ctx.report_progress((i + 1) / len(tasks))
return results
return server
+14
View File
@@ -0,0 +1,14 @@
#!/bin/bash
# Launch the cua MCP server from a source checkout without installation.
# Resolves all paths relative to this script so it works from any CWD.
set -e

# Set the CUA repository path based on script location
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
CUA_REPO_DIR="$( cd "$SCRIPT_DIR/../../.." &> /dev/null && pwd )"
PYTHON_PATH="${CUA_REPO_DIR}/.venv/bin/python"

# Fail early with a clear message if the venv interpreter is missing;
# otherwise bash reports a confusing "No such file or directory" later.
if [ ! -x "$PYTHON_PATH" ]; then
    echo "Error: Python interpreter not found at $PYTHON_PATH — create the repo's .venv first." >&2
    exit 1
fi

# Set Python path to include all necessary libraries
export PYTHONPATH="${CUA_REPO_DIR}/libs/mcp-server:${CUA_REPO_DIR}/libs/agent:${CUA_REPO_DIR}/libs/computer:${CUA_REPO_DIR}/libs/core:${CUA_REPO_DIR}/libs/pylume"

# Run the MCP server directly as a module.
# exec replaces this wrapper shell so signals reach the server directly;
# quoting keeps paths with spaces working.
exec "$PYTHON_PATH" -m mcp_server.server