Merge branch 'main' into feature/agent/demo-video-maker
@@ -82,7 +82,7 @@ If you want to use AI agents with virtualized environments:
 async with Computer(verbosity=logging.DEBUG) as macos_computer:
     agent = ComputerAgent(
         computer=macos_computer,
-        loop=AgentLoop.OPENAI, # or AgentLoop.ANTHROPIC, or AgentLoop.OMNI
+        loop=AgentLoop.OPENAI, # or AgentLoop.ANTHROPIC, or AgentLoop.UITARS, or AgentLoop.OMNI
         model=LLM(provider=LLMProvider.OPENAI) # or LLM(provider=LLMProvider.ANTHROPIC)
     )
@@ -50,10 +50,10 @@ async with Computer() as macos_computer:
         # model=LLM(provider=LLMProvider.ANTHROPIC)
         # or
         # loop=AgentLoop.OMNI,
-        # model=LLM(provider=LLMProvider.OLLAMA, model="gemma3")
+        # model=LLM(provider=LLMProvider.OLLAMA, name="gemma3")
         # or
         # loop=AgentLoop.UITARS,
-        # model=LLM(provider=LLMProvider.OAICOMPAT, model="tgi", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
+        # model=LLM(provider=LLMProvider.OAICOMPAT, name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1")
     )

     tasks = [
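Both hunks above reflect the same API change: the `LLM` constructor now takes the model identifier as `name=` rather than `model=`, and `AgentLoop.UITARS` joins the available loops. A minimal sketch of the updated call, assuming the import paths below (the diff itself does not show them):

```python
import asyncio
import logging

# Assumed import paths; adjust to the actual package layout.
from computer import Computer
from agent import AgentLoop, ComputerAgent, LLM, LLMProvider

async def main() -> None:
    async with Computer(verbosity=logging.DEBUG) as macos_computer:
        agent = ComputerAgent(
            computer=macos_computer,
            loop=AgentLoop.UITARS,
            # The model identifier is now passed as `name=`, not `model=`.
            model=LLM(provider=LLMProvider.OLLAMA, name="gemma3"),
        )
        async for result in agent.run("Open Safari and go to github.com"):
            print(result.get("id", "unknown"))

asyncio.run(main())
```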
@@ -179,8 +179,21 @@ final class Server {
             return HTTPResponse(statusCode: .badRequest, body: "Missing VM name")
         }
 
-        // Extract storage from query params if present
-        let storage = self.extractQueryParam(request: request, name: "storage")
+        Logger.info("Processing stop VM request", metadata: ["method": request.method, "path": request.path])
+
+        // Extract storage from the request body
+        var storage: String? = nil
+        if let bodyData = request.body, !bodyData.isEmpty {
+            do {
+                if let json = try JSONSerialization.jsonObject(with: bodyData) as? [String: Any],
+                   let bodyStorage = json["storage"] as? String {
+                    storage = bodyStorage
+                    Logger.info("Extracted storage from request body", metadata: ["storage": bodyStorage])
+                }
+            } catch {
+                Logger.error("Failed to parse request body JSON", metadata: ["error": error.localizedDescription])
+            }
+        }
 
         return try await self.handleStopVM(name: name, storage: storage)
     }),
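The Swift hunk moves the `storage` argument of the stop-VM endpoint from a query parameter into the JSON request body. A hedged client-side sketch using only the Python standard library; the host, port, and route are placeholders, since the diff does not show them:

```python
import json
import urllib.request

# Placeholder endpoint; substitute the server's actual host, port, and route.
url = "http://localhost:3000/vms/my-vm/stop"
payload = json.dumps({"storage": "/Volumes/ExternalSSD/vms"}).encode()

req = urllib.request.Request(
    url,
    data=payload,
    method="POST",
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(resp.status)
```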
@@ -68,13 +68,51 @@ You can then use the script in your MCP configuration like this:
       "CUA_AGENT_LOOP": "OMNI",
       "CUA_MODEL_PROVIDER": "ANTHROPIC",
       "CUA_MODEL_NAME": "claude-3-7-sonnet-20250219",
-      "ANTHROPIC_API_KEY": "your-api-key"
+      "CUA_PROVIDER_API_KEY": "your-api-key"
       }
     }
   }
 }
 ```
 
+## Development Guide
+
+If you want to develop with the cua-mcp-server directly without installation, you can use this configuration:
+
+```json
+{
+  "mcpServers": {
+    "cua-agent": {
+      "command": "/bin/bash",
+      "args": ["~/cua/libs/mcp-server/scripts/start_mcp_server.sh"],
+      "env": {
+        "CUA_AGENT_LOOP": "UITARS",
+        "CUA_MODEL_PROVIDER": "OAICOMPAT",
+        "CUA_MODEL_NAME": "ByteDance-Seed/UI-TARS-1.5-7B",
+        "CUA_PROVIDER_BASE_URL": "https://****************.us-east-1.aws.endpoints.huggingface.cloud/v1",
+        "CUA_PROVIDER_API_KEY": "your-api-key"
+      }
+    }
+  }
+}
+```
+
+This configuration:
+- Uses the start_mcp_server.sh script, which automatically sets up the Python path and runs the server module
+- Works with Claude Desktop, Cursor, or any other MCP client
+- Automatically uses your development code without requiring installation
+
+Just add this to your MCP client's configuration and it will use your local development version of the server.
+
+### Troubleshooting
+
+If you get a `/bin/bash: ~/cua/libs/mcp-server/scripts/start_mcp_server.sh: No such file or directory` error, change the path to the script to be absolute instead of relative.
+
+To see the logs:
+```
+tail -n 20 -f ~/Library/Logs/Claude/mcp*.log
+```
+
 ## Claude Desktop Integration
 
 To use with Claude Desktop, add an entry to your Claude Desktop configuration (`claude_desktop_config.json`, typically found in `~/.config/claude-desktop/`):
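The `CUA_*` variables in these configurations are read by the server process at startup. A condensed sketch of the mapping, mirroring the `os.getenv` calls in the server.py hunks later in this commit (the `CUA_MODEL_NAME` default is not shown in the diff and is assumed here):

```python
import os

loop_str = os.getenv("CUA_AGENT_LOOP", "OMNI")
provider_str = os.getenv("CUA_MODEL_PROVIDER", "ANTHROPIC")
model_name = os.getenv("CUA_MODEL_NAME")  # default assumed
provider_base_url = os.getenv("CUA_PROVIDER_BASE_URL", None)
api_key = os.getenv("CUA_PROVIDER_API_KEY", None)
```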
@@ -1,9 +1,10 @@
 import asyncio
 import base64
 import logging
 import os
 import sys
 import traceback
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union, Tuple
 
 # Configure logging to output to stderr for debug visibility
 logging.basicConfig(
@@ -17,7 +18,7 @@ logger = logging.getLogger("mcp-server")
 logger.debug("MCP Server module loading...")
 
 try:
-    from mcp.server.fastmcp import Context, FastMCP
+    from mcp.server.fastmcp import Context, FastMCP, Image
 
     logger.debug("Successfully imported FastMCP")
 except ImportError as e:
@@ -49,16 +50,37 @@ def serve() -> FastMCP:
     server = FastMCP("cua-agent")
 
     @server.tool()
-    async def run_cua_task(ctx: Context, task: str) -> str:
+    async def screenshot_cua(ctx: Context) -> Image:
         """
-        Run a Computer-Use Agent (CUA) task and return the results.
+        Take a screenshot of the current MacOS VM screen and return the image. Use this before running a CUA task to get a snapshot of the current state.
 
         Args:
             ctx: The MCP context
+
+        Returns:
+            An image resource containing the screenshot
+        """
+        global global_computer
+        if global_computer is None:
+            global_computer = Computer(verbosity=logging.INFO)
+            await global_computer.run()
+        screenshot = await global_computer.interface.screenshot()
+        return Image(
+            format="png",
+            data=screenshot
+        )
+
+    @server.tool()
+    async def run_cua_task(ctx: Context, task: str) -> Tuple[str, Image]:
+        """
+        Run a Computer-Use Agent (CUA) task in a MacOS VM and return the results.
+
+        Args:
+            ctx: The MCP context
             task: The instruction or task for the agent to perform
 
         Returns:
-            A string containing the agent's response
+            A tuple containing the agent's response and the final screenshot
         """
         global global_computer
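A hedged sketch of exercising the new `screenshot_cua` tool from the Python MCP SDK's stdio client; the script path is a placeholder (and, per the troubleshooting note above, should be absolute):

```python
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

async def main() -> None:
    params = StdioServerParameters(
        command="/bin/bash",
        # Placeholder absolute path to the launcher script added in this commit.
        args=["/Users/you/cua/libs/mcp-server/scripts/start_mcp_server.sh"],
    )
    async with stdio_client(params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            # screenshot_cua takes no arguments and returns an image resource.
            result = await session.call_tool("screenshot_cua", {})
            print(result.content[0].type)  # expected: "image"

asyncio.run(main())
```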
@@ -72,12 +94,7 @@ def serve() -> FastMCP:
 
             # Determine which loop to use
             loop_str = os.getenv("CUA_AGENT_LOOP", "OMNI")
-            if loop_str == "OPENAI":
-                loop = AgentLoop.OPENAI
-            elif loop_str == "ANTHROPIC":
-                loop = AgentLoop.ANTHROPIC
-            else:
-                loop = AgentLoop.OMNI
+            loop = getattr(AgentLoop, loop_str)
 
             # Determine provider
             provider_str = os.getenv("CUA_MODEL_PROVIDER", "ANTHROPIC")
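One behavioral note on the simplification above: `getattr(AgentLoop, loop_str)` raises `AttributeError` for a value that is not an `AgentLoop` member, whereas the old if/elif chain silently fell back to `AgentLoop.OMNI`. If that fallback is wanted, the three-argument form of `getattr` preserves it; a minimal sketch:

```python
import os

# AgentLoop is in scope in the surrounding file, as in the hunk above.
loop_str = os.getenv("CUA_AGENT_LOOP", "OMNI")
# Three-argument getattr restores the old fallback-to-OMNI behavior
# while still resolving any AgentLoop member by name.
loop = getattr(AgentLoop, loop_str, AgentLoop.OMNI)
```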
@@ -89,6 +106,9 @@ def serve() -> FastMCP:
             # Get base URL for provider (if needed)
             provider_base_url = os.getenv("CUA_PROVIDER_BASE_URL", None)
 
+            # Get api key for provider (if needed)
+            api_key = os.getenv("CUA_PROVIDER_API_KEY", None)
+
             # Create agent with the specified configuration
             agent = ComputerAgent(
                 computer=global_computer,
@@ -98,6 +118,7 @@ def serve() -> FastMCP:
                     name=model_name,
                     provider_base_url=provider_base_url,
                 ),
+                api_key=api_key,
                 save_trajectory=False,
                 only_n_most_recent_images=int(os.getenv("CUA_MAX_IMAGES", "3")),
                 verbosity=logging.INFO,
@@ -107,33 +128,34 @@ def serve() -> FastMCP:
             full_result = ""
             async for result in agent.run(task):
-                logger.info(f"Agent step complete: {result.get('id', 'unknown')}")
+                ctx.info(f"Agent step complete: {result.get('id', 'unknown')}")
 
                 # Add response ID to output
                 full_result += f"\n[Response ID: {result.get('id', 'unknown')}]\n"
 
-                # Extract and concatenate text responses
-                if "text" in result:
-                    # Handle both string and dict responses
-                    text_response = result.get("text", "")
-                    if isinstance(text_response, str):
-                        full_result += f"Response: {text_response}\n"
-                    else:
-                        # If it's a dict or other structure, convert to string representation
-                        full_result += f"Response: {str(text_response)}\n"
-
-                # Log detailed information
-                if "tools" in result:
-                    tools_info = result.get("tools")
-                    logger.debug(f"Tools used: {tools_info}")
-                    full_result += f"\nTools used: {tools_info}\n"
-
-                if "content" in result:
-                    full_result += f"Response: {result.get('content', '')}\n"
-
                 # Process output if available
                 outputs = result.get("output", [])
                 for output in outputs:
                     output_type = output.get("type")
-                    if output_type == "reasoning":
+                    if output_type == "message":
+                        logger.debug(f"Message: {output}")
+                        content = output.get("content", [])
+                        for content_part in content:
+                            if content_part.get("text"):
+                                full_result += f"\nMessage: {content_part.get('text', '')}\n"
+                    elif output_type == "reasoning":
                         logger.debug(f"Reasoning: {output}")
-                        full_result += f"\nReasoning: {output.get('content', '')}\n"
+
+                        summary_content = output.get("summary", [])
+                        if summary_content:
+                            for summary_part in summary_content:
+                                if summary_part.get("text"):
+                                    full_result += f"\nReasoning: {summary_part.get('text', '')}\n"
+                        else:
+                            full_result += f"\nReasoning: {output.get('text', output.get('content', ''))}\n"
                     elif output_type == "computer_call":
                         logger.debug(f"Computer call: {output}")
                         action = output.get("action", "")
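The rewritten loop dispatches on `output["type"]`. For orientation, a sketch of the three item shapes it handles; the field names come from the parsing code above, while the sample values are invented:

```python
# Invented samples; only the field names are taken from the parser above.
outputs = [
    {"type": "message", "content": [{"text": "Opened Safari."}]},
    {"type": "reasoning", "summary": [{"text": "The Dock holds the browser icon."}]},
    {"type": "computer_call", "action": {"type": "click", "x": 512, "y": 384}},
]

for output in outputs:
    kind = output.get("type")
    if kind == "message":
        print("Message:", output["content"][0]["text"])
    elif kind == "reasoning":
        print("Reasoning:", output["summary"][0]["text"])
    elif kind == "computer_call":
        print("Computer call:", output["action"])
```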
@@ -144,17 +166,25 @@ def serve() -> FastMCP:
                     full_result += "\n" + "-" * 40 + "\n"
 
-            logger.info(f"CUA task completed successfully")
-            return full_result or "Task completed with no text output."
+            ctx.info(f"CUA task completed successfully")
+            return (
+                full_result or "Task completed with no text output.",
+                Image(
+                    format="png",
+                    data=await global_computer.interface.screenshot()
+                )
+            )
 
         except Exception as e:
             error_msg = f"Error running CUA task: {str(e)}\n{traceback.format_exc()}"
             logger.error(error_msg)
+            ctx.error(error_msg)
             return f"Error during task execution: {str(e)}"
 
     @server.tool()
-    async def run_multi_cua_tasks(ctx: Context, tasks: List[str]) -> str:
+    async def run_multi_cua_tasks(ctx: Context, tasks: List[str]) -> List:
         """
-        Run multiple CUA tasks in sequence and return the combined results.
+        Run multiple CUA tasks in a MacOS VM in sequence and return the combined results.
 
         Args:
             ctx: The MCP context
@@ -164,13 +194,15 @@ def serve() -> FastMCP:
             Combined results from all tasks
         """
         results = []
 
         for i, task in enumerate(tasks):
-            logger.info(f"Running task {i+1}/{len(tasks)}: {task}")
-            result = await run_cua_task(ctx, task)
-            results.append(f"Task {i+1}: {task}\nResult: {result}\n")
-
-        return "\n".join(results)
+            ctx.info(f"Running task {i+1}/{len(tasks)}: {task}")
+
+            ctx.report_progress(i / len(tasks))
+            results.extend(await run_cua_task(ctx, task))
+            ctx.report_progress((i + 1) / len(tasks))
+
+        return results
 
     return server
New executable script (+14 lines):
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+set -e
+
+# Set the CUA repository path based on script location
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+CUA_REPO_DIR="$( cd "$SCRIPT_DIR/../../.." &> /dev/null && pwd )"
+PYTHON_PATH="${CUA_REPO_DIR}/.venv/bin/python"
+
+# Set Python path to include all necessary libraries
+export PYTHONPATH="${CUA_REPO_DIR}/libs/mcp-server:${CUA_REPO_DIR}/libs/agent:${CUA_REPO_DIR}/libs/computer:${CUA_REPO_DIR}/libs/core:${CUA_REPO_DIR}/libs/pylume"
+
+# Run the MCP server directly as a module
+$PYTHON_PATH -m mcp_server.server