Merge branch 'main' into feat/extra-models
@@ -8,7 +8,7 @@ from litellm import completion, acompletion
 # Try to import HuggingFace dependencies
 try:
     import torch
-    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+    from transformers import AutoModelForImageTextToText, AutoProcessor
     HF_AVAILABLE = True
 except ImportError:
     HF_AVAILABLE = False
@@ -40,7 +40,7 @@ class HuggingFaceLocalAdapter(CustomLLM):
         """
         if model_name not in self.models:
             # Load model
-            model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model = AutoModelForImageTextToText.from_pretrained(
                 model_name,
                 torch_dtype=torch.float16,
                 device_map=self.device,
@@ -145,8 +145,7 @@ class HuggingFaceLocalAdapter(CustomLLM):
         )

         # Move inputs to the same device as model
-        if torch.cuda.is_available() and self.device != "cpu":
-            inputs = inputs.to("cuda")
+        inputs = inputs.to(model.device)

         # Generate response
         with torch.no_grad():
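Taken together, these two adapter hunks swap the pinned `Qwen2_5_VLForConditionalGeneration` class for the generic `AutoModelForImageTextToText` auto-class, and replace the CUDA-only branch with `inputs.to(model.device)`, which also covers CPU and MPS placements. A minimal standalone sketch of the same loading pattern — the checkpoint name and prompt are illustrative, not taken from this commit:

```python
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor

# Any image-text-to-text checkpoint should work with the auto-class;
# this particular name is just an example choice.
model_name = "Qwen/Qwen2.5-VL-7B-Instruct"

model = AutoModelForImageTextToText.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",  # the adapter passes self.device here instead
)
processor = AutoProcessor.from_pretrained(model_name)

inputs = processor(text="Describe the screenshot.", return_tensors="pt")
# Moving inputs to model.device works on CPU, CUDA, and MPS alike,
# which is why it can replace the CUDA-only branch above.
inputs = inputs.to(model.device)

with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64)
print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])
```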
@@ -422,6 +422,9 @@ class ComputerAgent:
                 # Perform computer actions
                 action = item.get("action")
                 action_type = action.get("type")
+                if action_type is None:
+                    print(f"Action type cannot be `None`: action={action}, action_type={action_type}")
+                    return []

                 # Extract action arguments (all fields except 'type')
                 action_args = {k: v for k, v in action.items() if k != "type"}
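The guard added here protects the action dispatcher from malformed items; everything except the `type` key is then forwarded as keyword arguments. A toy illustration of that dict-splitting pattern (the `click` handler is hypothetical, not from this codebase):

```python
def handle_click(x: int, y: int) -> str:
    return f"clicked at ({x}, {y})"

handlers = {"click": handle_click}

action = {"type": "click", "x": 100, "y": 200}
action_type = action.get("type")
if action_type is None:
    raise ValueError(f"Action type cannot be `None`: action={action}")

# Everything except 'type' becomes keyword arguments for the handler
action_args = {k: v for k, v in action.items() if k != "type"}
print(handlers[action_type](**action_args))  # clicked at (100, 200)
```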
@@ -93,4 +93,4 @@ class PIIAnonymizationCallback(AsyncCallbackHandler):

     async def _deanonymize_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
-        # TODO: Implement _deanonymize_item
-        return item
+        return item
@@ -178,13 +178,20 @@ def create_computer_instance(
     """Create or get the global Computer instance."""
     global global_computer
     if global_computer is None:
-        global_computer = Computer(
-            verbosity=verbosity,
-            os_type=os_type,
-            provider_type=provider_type,
-            name=name if name else "",
-            api_key=api_key
-        )
+        if provider_type == "localhost":
+            global_computer = Computer(
+                verbosity=verbosity,
+                os_type=os_type,
+                use_host_computer_server=True
+            )
+        else:
+            global_computer = Computer(
+                verbosity=verbosity,
+                os_type=os_type,
+                provider_type=provider_type,
+                name=name if name else "",
+                api_key=api_key
+            )
     return global_computer

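With this change, a `localhost` provider short-circuits to a computer server already running on the host instead of a managed container, so no container name or API key is needed on that path. A hedged usage sketch, assuming the `create_computer_instance` signature shown above (the `os_type` value is illustrative):

```python
import logging

# Localhost path: talks to the host computer server directly
computer = create_computer_instance(
    verbosity=logging.INFO,
    os_type="linux",
    provider_type="localhost",
)

# Cloud path (for comparison): container name and API key are required
# computer = create_computer_instance(
#     verbosity=logging.INFO,
#     os_type="linux",
#     provider_type="cloud",
#     name="my-container",
#     api_key="sk-...",
# )
```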
@@ -211,7 +211,7 @@ if __name__ == "__main__":
     is_windows = platform.system().lower() == "windows"
     is_mac = platform.system().lower() == "darwin"

-    providers = ["cloud"]
+    providers = ["cloud", "localhost"]
     if is_mac:
         providers += ["lume"]
     if is_windows:
@@ -403,6 +403,23 @@ if __name__ == "__main__":
                 type="password",
             )

+            # Provider visibility update function
+            def update_provider_visibility(provider):
+                """Update visibility of container name and API key based on selected provider."""
+                is_localhost = provider == "localhost"
+                return [
+                    gr.update(visible=not is_localhost),  # container_name
+                    gr.update(visible=not is_localhost and not has_cua_key)  # cua_cloud_api_key
+                ]
+
+            # Connect provider change event
+            computer_provider.change(
+                fn=update_provider_visibility,
+                inputs=[computer_provider],
+                outputs=[container_name, cua_cloud_api_key],
+                queue=False
+            )
+
             # Connect UI update events
             for dropdown in [agent_loop, omni_model_choice, uitars_model_choice, openai_model_choice, anthropic_model_choice]:
                 dropdown.change(
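The same show/hide wiring can be exercised in isolation. A minimal Gradio sketch under the assumption of a two-option provider dropdown (component names here are illustrative, not the app's real ones):

```python
import gradio as gr

with gr.Blocks() as demo:
    provider = gr.Dropdown(["cloud", "localhost"], value="cloud", label="Provider")
    container_name = gr.Textbox(label="Container name")
    api_key = gr.Textbox(label="CUA API key", type="password")

    def toggle(p):
        # Localhost needs neither a container name nor an API key
        is_localhost = p == "localhost"
        return [gr.update(visible=not is_localhost)] * 2

    provider.change(fn=toggle, inputs=[provider],
                    outputs=[container_name, api_key], queue=False)

demo.launch()
```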
@@ -19,7 +19,7 @@ dependencies = [
     "pydantic>=2.6.4",
     "rich>=13.7.1",
     "python-dotenv>=1.0.1",
-    "cua-computer>=0.3.0,<0.5.0",
+    "cua-computer>=0.4.0,<0.5.0",
     "cua-core>=0.1.8,<0.2.0",
     "certifi>=2024.2.2",
     "litellm>=1.74.12"
@@ -302,7 +302,7 @@ def upload_to_huggingface(dataset_name, visibility, filter_tags=None):
     )
     card = DatasetCard.from_template(
         card_data=card_data,
-        template_str="---\n{{ card_data }}\n---\n\n# Uploaded computer interface trajectories\n\nThese trajectories were generated and uploaded using [c/ua](https://github.com/trycua/cua)"
+        template_str="---\n{{ card_data }}\n---\n\n# Uploaded computer interface trajectories\n\nThese trajectories were generated and uploaded using [cua](https://github.com/trycua/cua)"
     )
     card.push_to_hub(
         dataset_name,
@@ -4,7 +4,7 @@ build-backend = "pdm.backend"

 [project]
 name = "cua-computer"
-version = "0.3.0"
+version = "0.4.0"
 description = "Computer-Use Interface (CUI) framework powering Cua"
 readme = "README.md"
 authors = [
@@ -16,6 +16,21 @@
 </div>

 **cua-mcp-server** is an MCP server for the Computer-Use Agent (CUA), allowing you to run CUA through Claude Desktop or other MCP clients.
+
+## LiteLLM Integration
+
+This MCP server features comprehensive liteLLM integration, allowing you to use any supported LLM provider with a simple model string configuration.
+
+- **Unified Configuration**: Use a single `CUA_MODEL_NAME` environment variable with a model string
+- **Automatic Provider Detection**: The agent automatically detects the provider and capabilities from the model string
+- **Extensive Provider Support**: Works with Anthropic, OpenAI, local models, and any liteLLM-compatible provider
+
+### Model String Examples:
+- **Anthropic**: `"anthropic/claude-3-5-sonnet-20241022"`
+- **OpenAI**: `"openai/computer-use-preview"`
+- **UI-TARS**: `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"`
+- **Omni + Any LiteLLM**: `"omniparser+litellm/gpt-4o"`, `"omniparser+litellm/claude-3-haiku"`, `"omniparser+ollama_chat/gemma3"`
+
 ### Get started with Agent

 ## Prerequisites
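Since the rewritten README leans entirely on model strings, here is a hedged sketch of how such a string reaches the agent, mirroring the v0.4.x `ComputerAgent` call that appears later in this diff (the task text is illustrative):

```python
import asyncio
import logging
from computer import Computer
from agent import ComputerAgent

async def main():
    computer = Computer(verbosity=logging.INFO)
    await computer.run()

    # One liteLLM-style model string selects provider, loop, and model
    agent = ComputerAgent(
        model="anthropic/claude-3-5-sonnet-20241022",
        tools=[computer],
        only_n_most_recent_images=3,
        verbosity=logging.INFO,
    )

    messages = [{"role": "user", "content": "Open the browser and search for weather."}]
    async for result in agent.run(messages):
        for output in result.get("output", []):
            print(output.get("type"))

asyncio.run(main())
```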
@@ -65,10 +80,7 @@ You can then use the script in your MCP configuration like this:
       "command": "/bin/bash",
       "args": ["~/.cua/start_mcp_server.sh"],
       "env": {
-        "CUA_AGENT_LOOP": "OMNI",
-        "CUA_MODEL_PROVIDER": "ANTHROPIC",
-        "CUA_MODEL_NAME": "claude-3-7-sonnet-20250219",
-        "CUA_PROVIDER_API_KEY": "your-api-key"
+        "CUA_MODEL_NAME": "anthropic/claude-3-5-sonnet-20241022"
       }
     }
   }
@@ -86,11 +98,7 @@ If you want to develop with the cua-mcp-server directly without installation, yo
       "command": "/bin/bash",
       "args": ["~/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"],
       "env": {
-        "CUA_AGENT_LOOP": "UITARS",
-        "CUA_MODEL_PROVIDER": "OAICOMPAT",
-        "CUA_MODEL_NAME": "ByteDance-Seed/UI-TARS-1.5-7B",
-        "CUA_PROVIDER_BASE_URL": "https://****************.us-east-1.aws.endpoints.huggingface.cloud/v1",
-        "CUA_PROVIDER_API_KEY": "your-api-key"
+        "CUA_MODEL_NAME": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"
       }
     }
   }
@@ -142,10 +150,7 @@ The server is configured using environment variables (can be set in the Claude D

 | Variable | Description | Default |
 |----------|-------------|---------|
-| `CUA_AGENT_LOOP` | Agent loop to use (OPENAI, ANTHROPIC, UITARS, OMNI) | OMNI |
-| `CUA_MODEL_PROVIDER` | Model provider (ANTHROPIC, OPENAI, OLLAMA, OAICOMPAT) | ANTHROPIC |
-| `CUA_MODEL_NAME` | Model name to use | None (provider default) |
-| `CUA_PROVIDER_BASE_URL` | Base URL for provider API | None |
+| `CUA_MODEL_NAME` | Model string (e.g., "anthropic/claude-3-5-sonnet-20241022", "openai/computer-use-preview", "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", "omniparser+litellm/gpt-4o", "omniparser+ollama_chat/gemma3") | anthropic/claude-3-5-sonnet-20241022 |
 | `CUA_MAX_IMAGES` | Maximum number of images to keep in context | 3 |

 ## Available Tools
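The trimmed table leaves just two knobs. A small sketch of how the server side resolves them, mirroring the `os.getenv` defaults visible in the server hunk below:

```python
import os

# Defaults match the table: one model string plus an image-context cap
model_name = os.getenv("CUA_MODEL_NAME", "anthropic/claude-3-5-sonnet-20241022")
max_images = int(os.getenv("CUA_MAX_IMAGES", "3"))

print(f"model={model_name}, max_images={max_images}")
```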
@@ -3,6 +3,7 @@ import base64
 import logging
 import os
 import sys
+from tabnanny import verbose
 import traceback
 from typing import Any, Dict, List, Optional, Union, Tuple
@@ -28,7 +29,7 @@ except ImportError as e:

 try:
     from computer import Computer
-    from agent import ComputerAgent, LLMProvider, LLM, AgentLoop
+    from agent import ComputerAgent

     logger.debug("Successfully imported Computer and Agent modules")
 except ImportError as e:
@@ -92,49 +93,27 @@ def serve() -> FastMCP:
             global_computer = Computer(verbosity=logging.INFO)
             await global_computer.run()

-        # Determine which loop to use
-        loop_str = os.getenv("CUA_AGENT_LOOP", "OMNI")
-        loop = getattr(AgentLoop, loop_str)
-
-        # Determine provider
-        provider_str = os.getenv("CUA_MODEL_PROVIDER", "ANTHROPIC")
-        provider = getattr(LLMProvider, provider_str)
-
-        # Get model name (if specified)
-        model_name = os.getenv("CUA_MODEL_NAME", None)
-
-        # Get base URL for provider (if needed)
-        provider_base_url = os.getenv("CUA_PROVIDER_BASE_URL", None)
-
-        # Get api key for provider (if needed)
-        api_key = os.getenv("CUA_PROVIDER_API_KEY", None)
-
-        # Create agent with the specified configuration
+        # Get model name - this now determines the loop and provider
+        model_name = os.getenv("CUA_MODEL_NAME", "anthropic/claude-3-5-sonnet-20241022")
+
+        logger.info(f"Using model: {model_name}")
+
+        # Create agent with the new v0.4.x API
         agent = ComputerAgent(
-            computer=global_computer,
-            loop=loop,
-            model=LLM(
-                provider=provider,
-                name=model_name,
-                provider_base_url=provider_base_url,
-            ),
-            api_key=api_key,
-            save_trajectory=False,
+            model=model_name,
             only_n_most_recent_images=int(os.getenv("CUA_MAX_IMAGES", "3")),
             verbosity=logging.INFO,
+            tools=[global_computer]
         )

+        # Create messages in the new v0.4.x format
+        messages = [{"role": "user", "content": task}]
+
         # Collect all results
         full_result = ""
-        async for result in agent.run(task):
-            logger.info(f"Agent step complete: {result.get('id', 'unknown')}")
-            ctx.info(f"Agent step complete: {result.get('id', 'unknown')}")
-
-            # Add response ID to output
-            full_result += f"\n[Response ID: {result.get('id', 'unknown')}]\n"
-
-            if "content" in result:
-                full_result += f"Response: {result.get('content', '')}\n"
+        async for result in agent.run(messages):
+            logger.info(f"Agent processing step")
+            ctx.info(f"Agent processing step")
+
+            # Process output if available
+            outputs = result.get("output", [])
@@ -145,25 +124,23 @@ def serve() -> FastMCP:
                     content = output.get("content", [])
                     for content_part in content:
                         if content_part.get("text"):
-                            full_result += f"\nMessage: {content_part.get('text', '')}\n"
-                elif output_type == "reasoning":
-                    logger.debug(f"Reasoning: {output}")
-
-                    summary_content = output.get("summary", [])
-                    if summary_content:
-                        for summary_part in summary_content:
-                            if summary_part.get("text"):
-                                full_result += f"\nReasoning: {summary_part.get('text', '')}\n"
-                    else:
-                        full_result += f"\nReasoning: {output.get('text', output.get('content', ''))}\n"
-                elif output_type == "computer_call":
-                    logger.debug(f"Computer call: {output}")
-                    action = output.get("action", "")
-                    result_value = output.get("result", "")
-                    full_result += f"\nComputer Action: {action}\nResult: {result_value}\n"
+                            full_result += f"Message: {content_part.get('text', '')}\n"
+                elif output_type == "tool_use":
+                    logger.debug(f"Tool use: {output}")
+                    tool_name = output.get("name", "")
+                    full_result += f"Tool: {tool_name}\n"
+                elif output_type == "tool_result":
+                    logger.debug(f"Tool result: {output}")
+                    result_content = output.get("content", "")
+                    if isinstance(result_content, list):
+                        for item in result_content:
+                            if item.get("type") == "text":
+                                full_result += f"Result: {item.get('text', '')}\n"
+                    else:
+                        full_result += f"Result: {result_content}\n"

                 # Add separator between steps
-                full_result += "\n" + "-" * 40 + "\n"
+                full_result += "\n" + "-" * 20 + "\n"

             logger.info(f"CUA task completed successfully")
             ctx.info(f"CUA task completed successfully")
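The rewritten loop switches the per-step summary from reasoning/computer_call entries to tool_use/tool_result entries. A self-contained sketch of that extraction logic against a hand-built result dict — the field names follow the `.get(...)` accesses in the hunk above, not a documented schema:

```python
# Hand-built example in the shape the loop reads; values are illustrative
result = {
    "output": [
        {"type": "message", "content": [{"text": "Opening the browser."}]},
        {"type": "tool_use", "name": "computer"},
        {"type": "tool_result", "content": [{"type": "text", "text": "done"}]},
    ]
}

full_result = ""
for output in result.get("output", []):
    output_type = output.get("type")
    if output_type == "message":
        for content_part in output.get("content", []):
            if content_part.get("text"):
                full_result += f"Message: {content_part.get('text', '')}\n"
    elif output_type == "tool_use":
        full_result += f"Tool: {output.get('name', '')}\n"
    elif output_type == "tool_result":
        result_content = output.get("content", "")
        if isinstance(result_content, list):
            for item in result_content:
                if item.get("type") == "text":
                    full_result += f"Result: {item.get('text', '')}\n"
        else:
            full_result += f"Result: {result_content}\n"

print(full_result)
# Message: Opening the browser.
# Tool: computer
# Result: done
```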
@@ -179,7 +156,21 @@ def serve() -> FastMCP:
             error_msg = f"Error running CUA task: {str(e)}\n{traceback.format_exc()}"
             logger.error(error_msg)
             ctx.error(error_msg)
-            return f"Error during task execution: {str(e)}"
+            # Return tuple with error message and a screenshot if possible
+            try:
+                if global_computer is not None:
+                    screenshot = await global_computer.interface.screenshot()
+                    return (
+                        f"Error during task execution: {str(e)}",
+                        Image(format="png", data=screenshot)
+                    )
+            except:
+                pass
+            # If we can't get a screenshot, return a placeholder
+            return (
+                f"Error during task execution: {str(e)}",
+                Image(format="png", data=b"")
+            )

     @server.tool()
     async def run_multi_cua_tasks(ctx: Context, tasks: List[str]) -> List:
@@ -13,8 +13,8 @@ authors = [
 ]
 dependencies = [
     "mcp>=1.6.0,<2.0.0",
-    "cua-agent[all]>=0.3.0,<0.4.0",
-    "cua-computer>=0.3.0,<0.4.0",
+    "cua-agent[all]>=0.4.0,<0.5.0",
+    "cua-computer>=0.4.0,<0.5.0",
 ]

 [project.scripts]