Merge branch 'main' into feat/extra-models

Dillon DuPont
2025-08-05 12:46:26 -04:00
110 changed files with 9572 additions and 760 deletions

View File

@@ -8,7 +8,7 @@ from litellm import completion, acompletion
 # Try to import HuggingFace dependencies
 try:
     import torch
-    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+    from transformers import AutoModelForImageTextToText, AutoProcessor
     HF_AVAILABLE = True
 except ImportError:
     HF_AVAILABLE = False
@@ -40,7 +40,7 @@ class HuggingFaceLocalAdapter(CustomLLM):
         """
         if model_name not in self.models:
             # Load model
-            model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model = AutoModelForImageTextToText.from_pretrained(
                 model_name,
                 torch_dtype=torch.float16,
                 device_map=self.device,
@@ -145,8 +145,7 @@ class HuggingFaceLocalAdapter(CustomLLM):
             )

             # Move inputs to the same device as model
-            if torch.cuda.is_available() and self.device != "cpu":
-                inputs = inputs.to("cuda")
+            inputs = inputs.to(model.device)

             # Generate response
             with torch.no_grad():
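
Taken together, these hunks generalize the adapter: the `AutoModelForImageTextToText` auto-class loads any image-text-to-text checkpoint instead of being hard-wired to Qwen2.5-VL, and `inputs.to(model.device)` follows wherever `device_map` actually placed the weights instead of assuming `"cuda"`. A minimal sketch of the resulting flow (the checkpoint name and prompt are illustrative; the real adapter caches models and builds full chat inputs):

```python
# Minimal sketch of the generalized load-and-generate flow
# (checkpoint and prompt are illustrative stand-ins).
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor

model_name = "ByteDance-Seed/UI-TARS-1.5-7B"  # any image-text-to-text checkpoint
model = AutoModelForImageTextToText.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_name)

inputs = processor(text=["Describe the screen."], return_tensors="pt")
# Follow the model's actual placement instead of assuming "cuda":
inputs = inputs.to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64)
print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])
```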

View File

@@ -422,6 +422,9 @@ class ComputerAgent:
         # Perform computer actions
         action = item.get("action")
         action_type = action.get("type")
+        if action_type is None:
+            print(f"Action type cannot be `None`: action={action}, action_type={action_type}")
+            return []

         # Extract action arguments (all fields except 'type')
         action_args = {k: v for k, v in action.items() if k != "type"}
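
The new guard matters because everything after it is keyed off `type`: the remaining fields of the action dict become keyword arguments for whichever handler the type selects, so a `None` type would fail deeper in the dispatch. A hypothetical sketch of that dispatch shape (the handler lookup is illustrative, not the agent's actual internals):

```python
# Hypothetical sketch of the dispatch the None-check protects (illustrative only).
async def perform_action(computer, action: dict) -> list:
    action_type = action.get("type")
    if action_type is None:
        # Malformed model output: skip rather than crash on the lookup below.
        print(f"Action type cannot be `None`: action={action}")
        return []
    # All fields except 'type' become keyword arguments for the handler.
    action_args = {k: v for k, v in action.items() if k != "type"}
    handler = getattr(computer, action_type)  # e.g. computer.click
    return [await handler(**action_args)]
```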

View File

@@ -93,4 +93,4 @@ class PIIAnonymizationCallback(AsyncCallbackHandler):
     async def _deanonymize_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
         # TODO: Implement _deanonymize_item
-        return item
+        return item

View File

@@ -178,13 +178,20 @@ def create_computer_instance(
     """Create or get the global Computer instance."""
     global global_computer

    if global_computer is None:
-        global_computer = Computer(
-            verbosity=verbosity,
-            os_type=os_type,
-            provider_type=provider_type,
-            name=name if name else "",
-            api_key=api_key
-        )
+        if provider_type == "localhost":
+            global_computer = Computer(
+                verbosity=verbosity,
+                os_type=os_type,
+                use_host_computer_server=True
+            )
+        else:
+            global_computer = Computer(
+                verbosity=verbosity,
+                os_type=os_type,
+                provider_type=provider_type,
+                name=name if name else "",
+                api_key=api_key
+            )

     return global_computer
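
The `use_host_computer_server=True` path points the `Computer` at a server already running on the host instead of provisioning a managed container, which is why the container `name` and `api_key` arguments are dropped for localhost. Usage then looks roughly like this (argument values are illustrative):

```python
import logging

# Localhost: talk to a computer server already running on this machine.
computer = create_computer_instance(
    verbosity=logging.INFO,
    os_type="linux",
    provider_type="localhost",
)
```

Because the instance is cached in `global_computer`, only the first call's provider choice takes effect; later calls return the same object.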

View File

@@ -211,7 +211,7 @@ if __name__ == "__main__":
     is_windows = platform.system().lower() == "windows"
     is_mac = platform.system().lower() == "darwin"

-    providers = ["cloud"]
+    providers = ["cloud", "localhost"]
     if is_mac:
         providers += ["lume"]
     if is_windows:
@@ -403,6 +403,23 @@
                     type="password",
                 )

+                # Provider visibility update function
+                def update_provider_visibility(provider):
+                    """Update visibility of container name and API key based on selected provider."""
+                    is_localhost = provider == "localhost"
+                    return [
+                        gr.update(visible=not is_localhost),  # container_name
+                        gr.update(visible=not is_localhost and not has_cua_key)  # cua_cloud_api_key
+                    ]
+
+                # Connect provider change event
+                computer_provider.change(
+                    fn=update_provider_visibility,
+                    inputs=[computer_provider],
+                    outputs=[container_name, cua_cloud_api_key],
+                    queue=False
+                )
+
                 # Connect UI update events
                 for dropdown in [agent_loop, omni_model_choice, uitars_model_choice, openai_model_choice, anthropic_model_choice]:
                     dropdown.change(
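
The visibility wiring is the standard Gradio pattern: a `change` handler returns one `gr.update(...)` per output component, in the order the outputs are listed. A self-contained toy version of the same idea (component labels are illustrative):

```python
import gradio as gr

with gr.Blocks() as demo:
    provider = gr.Dropdown(["cloud", "localhost"], value="cloud", label="Provider")
    container_name = gr.Textbox(label="Container name")
    api_key = gr.Textbox(label="API key", type="password")

    def on_provider_change(p):
        hide = p == "localhost"
        # One update per output component, in the same order as `outputs`.
        return [gr.update(visible=not hide), gr.update(visible=not hide)]

    provider.change(on_provider_change, inputs=[provider],
                    outputs=[container_name, api_key], queue=False)

demo.launch()
```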

View File

@@ -19,7 +19,7 @@ dependencies = [
     "pydantic>=2.6.4",
     "rich>=13.7.1",
     "python-dotenv>=1.0.1",
-    "cua-computer>=0.3.0,<0.5.0",
+    "cua-computer>=0.4.0,<0.5.0",
     "cua-core>=0.1.8,<0.2.0",
     "certifi>=2024.2.2",
     "litellm>=1.74.12"

View File

@@ -302,7 +302,7 @@ def upload_to_huggingface(dataset_name, visibility, filter_tags=None):
     )
     card = DatasetCard.from_template(
         card_data=card_data,
-        template_str="---\n{{ card_data }}\n---\n\n# Uploaded computer interface trajectories\n\nThese trajectories were generated and uploaded using [c/ua](https://github.com/trycua/cua)"
+        template_str="---\n{{ card_data }}\n---\n\n# Uploaded computer interface trajectories\n\nThese trajectories were generated and uploaded using [cua](https://github.com/trycua/cua)"
     )
     card.push_to_hub(
         dataset_name,

View File

@@ -4,7 +4,7 @@ build-backend = "pdm.backend"
 [project]
 name = "cua-computer"
-version = "0.3.0"
+version = "0.4.0"
 description = "Computer-Use Interface (CUI) framework powering Cua"
 readme = "README.md"
 authors = [

View File

@@ -16,6 +16,21 @@
 </div>

 **cua-mcp-server** is a MCP server for the Computer-Use Agent (CUA), allowing you to run CUA through Claude Desktop or other MCP clients.

+## LiteLLM Integration
+
+This MCP server features comprehensive liteLLM integration, allowing you to use any supported LLM provider with a simple model string configuration.
+
+- **Unified Configuration**: Use a single `CUA_MODEL_NAME` environment variable with a model string
+- **Automatic Provider Detection**: The agent automatically detects the provider and capabilities from the model string
+- **Extensive Provider Support**: Works with Anthropic, OpenAI, local models, and any liteLLM-compatible provider
+
+### Model String Examples:
+
+- **Anthropic**: `"anthropic/claude-3-5-sonnet-20241022"`
+- **OpenAI**: `"openai/computer-use-preview"`
+- **UI-TARS**: `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"`
+- **Omni + Any LiteLLM**: `"omniparser+litellm/gpt-4o"`, `"omniparser+litellm/claude-3-haiku"`, `"omniparser+ollama_chat/gemma3"`
+
 ### Get started with Agent

 ## Prerequisites
@@ -65,10 +80,7 @@ You can then use the script in your MCP configuration like this:
       "command": "/bin/bash",
       "args": ["~/.cua/start_mcp_server.sh"],
       "env": {
-        "CUA_AGENT_LOOP": "OMNI",
-        "CUA_MODEL_PROVIDER": "ANTHROPIC",
-        "CUA_MODEL_NAME": "claude-3-7-sonnet-20250219",
-        "CUA_PROVIDER_API_KEY": "your-api-key"
+        "CUA_MODEL_NAME": "anthropic/claude-3-5-sonnet-20241022"
       }
     }
   }
@@ -86,11 +98,7 @@ If you want to develop with the cua-mcp-server directly without installation, yo
       "command": "/bin/bash",
       "args": ["~/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"],
       "env": {
-        "CUA_AGENT_LOOP": "UITARS",
-        "CUA_MODEL_PROVIDER": "OAICOMPAT",
-        "CUA_MODEL_NAME": "ByteDance-Seed/UI-TARS-1.5-7B",
-        "CUA_PROVIDER_BASE_URL": "https://****************.us-east-1.aws.endpoints.huggingface.cloud/v1",
-        "CUA_PROVIDER_API_KEY": "your-api-key"
+        "CUA_MODEL_NAME": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"
       }
     }
   }
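
Both of the configurations above collapse into the same v0.4 call path inside the server: the single model string is passed straight to `ComputerAgent`. A minimal sketch of that path, mirroring the server changes later in this commit (the task text is illustrative):

```python
import asyncio
import logging
from computer import Computer
from agent import ComputerAgent

async def main():
    computer = Computer(verbosity=logging.INFO)
    await computer.run()
    agent = ComputerAgent(
        model="anthropic/claude-3-5-sonnet-20241022",  # any supported model string
        only_n_most_recent_images=3,
        tools=[computer],
    )
    messages = [{"role": "user", "content": "Open the browser"}]
    async for result in agent.run(messages):
        for output in result.get("output", []):
            print(output.get("type"))

asyncio.run(main())
```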
@@ -142,10 +150,7 @@ The server is configured using environment variables (can be set in the Claude D

 | Variable | Description | Default |
 |----------|-------------|---------|
-| `CUA_AGENT_LOOP` | Agent loop to use (OPENAI, ANTHROPIC, UITARS, OMNI) | OMNI |
-| `CUA_MODEL_PROVIDER` | Model provider (ANTHROPIC, OPENAI, OLLAMA, OAICOMPAT) | ANTHROPIC |
-| `CUA_MODEL_NAME` | Model name to use | None (provider default) |
-| `CUA_PROVIDER_BASE_URL` | Base URL for provider API | None |
+| `CUA_MODEL_NAME` | Model string (e.g., "anthropic/claude-3-5-sonnet-20241022", "openai/computer-use-preview", "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", "omniparser+litellm/gpt-4o", "omniparser+ollama_chat/gemma3") | anthropic/claude-3-5-sonnet-20241022 |
 | `CUA_MAX_IMAGES` | Maximum number of images to keep in context | 3 |

 ## Available Tools

View File

@@ -3,6 +3,7 @@ import base64
 import logging
 import os
 import sys
+from tabnanny import verbose
 import traceback
 from typing import Any, Dict, List, Optional, Union, Tuple
@@ -28,7 +29,7 @@
 try:
     from computer import Computer
-    from agent import ComputerAgent, LLMProvider, LLM, AgentLoop
+    from agent import ComputerAgent

     logger.debug("Successfully imported Computer and Agent modules")
 except ImportError as e:
@@ -92,49 +93,27 @@ def serve() -> FastMCP:
                 global_computer = Computer(verbosity=logging.INFO)
                 await global_computer.run()

-            # Determine which loop to use
-            loop_str = os.getenv("CUA_AGENT_LOOP", "OMNI")
-            loop = getattr(AgentLoop, loop_str)
+            # Get model name - this now determines the loop and provider
+            model_name = os.getenv("CUA_MODEL_NAME", "anthropic/claude-3-5-sonnet-20241022")
+            logger.info(f"Using model: {model_name}")

-            # Determine provider
-            provider_str = os.getenv("CUA_MODEL_PROVIDER", "ANTHROPIC")
-            provider = getattr(LLMProvider, provider_str)
-
-            # Get model name (if specified)
-            model_name = os.getenv("CUA_MODEL_NAME", None)
-
-            # Get base URL for provider (if needed)
-            provider_base_url = os.getenv("CUA_PROVIDER_BASE_URL", None)
-
-            # Get api key for provider (if needed)
-            api_key = os.getenv("CUA_PROVIDER_API_KEY", None)
-
-            # Create agent with the specified configuration
+            # Create agent with the new v0.4.x API
             agent = ComputerAgent(
-                computer=global_computer,
-                loop=loop,
-                model=LLM(
-                    provider=provider,
-                    name=model_name,
-                    provider_base_url=provider_base_url,
-                ),
-                api_key=api_key,
-                save_trajectory=False,
+                model=model_name,
                 only_n_most_recent_images=int(os.getenv("CUA_MAX_IMAGES", "3")),
                 verbosity=logging.INFO,
+                tools=[global_computer]
             )

+            # Create messages in the new v0.4.x format
+            messages = [{"role": "user", "content": task}]
+
             # Collect all results
             full_result = ""
-            async for result in agent.run(task):
-                logger.info(f"Agent step complete: {result.get('id', 'unknown')}")
-                ctx.info(f"Agent step complete: {result.get('id', 'unknown')}")
-
-                # Add response ID to output
-                full_result += f"\n[Response ID: {result.get('id', 'unknown')}]\n"
-                if "content" in result:
-                    full_result += f"Response: {result.get('content', '')}\n"
+            async for result in agent.run(messages):
+                logger.info(f"Agent processing step")
+                ctx.info(f"Agent processing step")

                 # Process output if available
                 outputs = result.get("output", [])
@@ -145,25 +124,23 @@
                         content = output.get("content", [])
                         for content_part in content:
                             if content_part.get("text"):
-                                full_result += f"\nMessage: {content_part.get('text', '')}\n"
-                    elif output_type == "reasoning":
-                        logger.debug(f"Reasoning: {output}")
-                        summary_content = output.get("summary", [])
-                        if summary_content:
-                            for summary_part in summary_content:
-                                if summary_part.get("text"):
-                                    full_result += f"\nReasoning: {summary_part.get('text', '')}\n"
+                                full_result += f"Message: {content_part.get('text', '')}\n"
+                    elif output_type == "tool_use":
+                        logger.debug(f"Tool use: {output}")
+                        tool_name = output.get("name", "")
+                        full_result += f"Tool: {tool_name}\n"
+                    elif output_type == "tool_result":
+                        logger.debug(f"Tool result: {output}")
+                        result_content = output.get("content", "")
+                        if isinstance(result_content, list):
+                            for item in result_content:
+                                if item.get("type") == "text":
+                                    full_result += f"Result: {item.get('text', '')}\n"
                         else:
-                            full_result += f"\nReasoning: {output.get('text', output.get('content', ''))}\n"
-                    elif output_type == "computer_call":
-                        logger.debug(f"Computer call: {output}")
-                        action = output.get("action", "")
-                        result_value = output.get("result", "")
-                        full_result += f"\nComputer Action: {action}\nResult: {result_value}\n"
-
+                            full_result += f"Result: {result_content}\n"

                 # Add separator between steps
-                full_result += "\n" + "-" * 40 + "\n"
+                full_result += "\n" + "-" * 20 + "\n"

             logger.info(f"CUA task completed successfully")
             ctx.info(f"CUA task completed successfully")
@@ -179,7 +156,21 @@
             error_msg = f"Error running CUA task: {str(e)}\n{traceback.format_exc()}"
             logger.error(error_msg)
             ctx.error(error_msg)
-            return f"Error during task execution: {str(e)}"
+            # Return tuple with error message and a screenshot if possible
+            try:
+                if global_computer is not None:
+                    screenshot = await global_computer.interface.screenshot()
+                    return (
+                        f"Error during task execution: {str(e)}",
+                        Image(format="png", data=screenshot)
+                    )
+            except:
+                pass
+            # If we can't get a screenshot, return a placeholder
+            return (
+                f"Error during task execution: {str(e)}",
+                Image(format="png", data=b"")
+            )

     @server.tool()
     async def run_multi_cua_tasks(ctx: Context, tasks: List[str]) -> List:

View File

@@ -13,8 +13,8 @@ authors = [
 ]
 dependencies = [
     "mcp>=1.6.0,<2.0.0",
-    "cua-agent[all]>=0.3.0,<0.4.0",
-    "cua-computer>=0.3.0,<0.4.0",
+    "cua-agent[all]>=0.4.0,<0.5.0",
+    "cua-computer>=0.4.0,<0.5.0",
 ]

 [project.scripts]