From 39c4915682323f7789578d253d2e32249dd8ac9e Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 24 Apr 2025 19:24:12 -0400 Subject: [PATCH 01/20] consistency with other loops --- libs/agent/agent/providers/uitars/loop.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libs/agent/agent/providers/uitars/loop.py b/libs/agent/agent/providers/uitars/loop.py index 99132365..0d3bc9f7 100644 --- a/libs/agent/agent/providers/uitars/loop.py +++ b/libs/agent/agent/providers/uitars/loop.py @@ -232,8 +232,11 @@ class UITARSLoop(BaseLoop): if self.client is None: raise RuntimeError("Failed to initialize client") - # Convert messages to UI-TARS format + # Get messages in standard format from the message manager + self.message_manager.messages = messages.copy() prepared_messages = self.message_manager.get_messages() + + # Convert messages to UI-TARS format uitars_messages = self.to_uitars_format(prepared_messages) # Log request From 4d21f9e2ea951f335816c73d5e206512b69676fa Mon Sep 17 00:00:00 2001 From: Morgan Dean Date: Sat, 26 Apr 2025 14:52:46 -0700 Subject: [PATCH 02/20] create mlxvlm provider --- libs/agent/agent/core/types.py | 1 + .../agent/providers/uitars/clients/mlxvlm.py | 158 ++++++++++++++++++ libs/agent/pyproject.toml | 1 + 3 files changed, 160 insertions(+) create mode 100644 libs/agent/agent/providers/uitars/clients/mlxvlm.py diff --git a/libs/agent/agent/core/types.py b/libs/agent/agent/core/types.py index ef50d09e..fd337062 100644 --- a/libs/agent/agent/core/types.py +++ b/libs/agent/agent/core/types.py @@ -23,6 +23,7 @@ class LLMProvider(StrEnum): OPENAI = "openai" OLLAMA = "ollama" OAICOMPAT = "oaicompat" + MLXVLM= "mlxvlm" @dataclass diff --git a/libs/agent/agent/providers/uitars/clients/mlxvlm.py b/libs/agent/agent/providers/uitars/clients/mlxvlm.py new file mode 100644 index 00000000..f644ce6d --- /dev/null +++ b/libs/agent/agent/providers/uitars/clients/mlxvlm.py @@ -0,0 +1,158 @@ +"""MLX LVM client implementation.""" + +import logging +import base64 +import tempfile +import os +from typing import Dict, List, Optional, Any, cast + +from .base import BaseUITarsClient +import mlx.core as mx +from mlx_vlm import load, generate +from mlx_vlm.prompt_utils import apply_chat_template +from mlx_vlm.utils import load_config +from transformers.tokenization_utils import PreTrainedTokenizer + +logger = logging.getLogger(__name__) + + +class MLXLMVUITarsClient(BaseUITarsClient): + """MLX LVM client implementation class.""" + + def __init__(self, api_key: Optional[str] = None, model: str = "mlx-community/UI-TARS-1.5-7B-4bit"): + """Initialize MLX LVM client. + + Args: + api_key: Optional API key + model: Model name or path (defaults to mlx-community/UI-TARS-1.5-7B-4bit) + """ + self.api_key = api_key + self.model = model + + async def run_interleaved( + self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None + ) -> Dict[str, Any]: + """Run interleaved chat completion. 
+ + Args: + messages: List of message dicts + system: System prompt + max_tokens: Optional max tokens override + + Returns: + Response dict + """ + # Extract text and images from messages + prompt_parts = [] + images = [] + + # Add system message first + prompt_parts.append(system) + + for msg in messages: + role = msg.get("role", "user") + content = msg.get("content", []) + + # Handle different content formats + if isinstance(content, str): + # If content is a string, just add it as text + prompt_parts.append(f"{role}: {content}") + elif isinstance(content, list): + # If content is a list, process each item + text_parts = [] + + for item in content: + if item.get("type") == "text": + text_parts.append(item.get("text", "")) + elif item.get("type") == "image_url": + # Extract image URL and add to images list + image_url = item.get("image_url", {}).get("url", "") + if image_url.startswith("data:image/"): + # Extract base64 data and convert to URL or save as temp file + # For now, we'll just store the URL directly + images.append(image_url) + + # Add text parts to prompt + if text_parts: + prompt_parts.append(f"{role}: {''.join(text_parts)}") + + # Combine all text parts into a single prompt + combined_prompt = "\n".join(prompt_parts) + + try: + # Load model and processor + model_obj, processor = load(self.model) + config = load_config(self.model) + + # Process images to ensure they're in the right format + processed_images = [] + for img in images: + if img.startswith('data:image/'): + # Extract base64 data + img_format = img.split(';')[0].split('/')[1] + base64_data = img.split(',')[1] + + # Create a temporary file to store the image + with tempfile.NamedTemporaryFile(suffix=f'.{img_format}', delete=False) as temp_file: + temp_file.write(base64.b64decode(base64_data)) + processed_images.append(temp_file.name) + else: + # Assume it's already a valid URL or path + processed_images.append(img) + + # Format prompt according to model requirements + formatted_prompt = apply_chat_template( + processor, config, str(combined_prompt), num_images=len(processed_images) + ) + + # Cast processor to PreTrainedTokenizer to satisfy type checker + tokenizer = cast(PreTrainedTokenizer, processor) + + # Generate response + output = generate( + model_obj, + tokenizer, + str(formatted_prompt), + processed_images, + verbose=False, + max_tokens=max_tokens + ) + + # Clean up temporary files + for img_path in processed_images: + if img_path.startswith(tempfile.gettempdir()) and os.path.exists(img_path): + try: + os.unlink(img_path) + except Exception as e: + logger.warning(f"Failed to delete temporary file {img_path}: {e}") + except Exception as e: + logger.error(f"Error generating response: {str(e)}") + return { + "choices": [ + { + "message": { + "role": "assistant", + "content": f"Error generating response: {str(e)}" + }, + "finish_reason": "error" + } + ], + "model": self.model, + "error": str(e) + } + + # Format response to match OpenAI format + response = { + "choices": [ + { + "message": { + "role": "assistant", + "content": output + }, + "finish_reason": "stop" + } + ], + "model": self.model + } + + return response diff --git a/libs/agent/pyproject.toml b/libs/agent/pyproject.toml index 1572465b..d3b97112 100644 --- a/libs/agent/pyproject.toml +++ b/libs/agent/pyproject.toml @@ -36,6 +36,7 @@ openai = [ ] uitars = [ "httpx>=0.27.0,<0.29.0", + "mlx-vlm>=0.1.25" ] ui = [ "gradio>=5.23.3,<6.0.0", From 9c870fcdddb7f4db2b33c1415c50ff75c51abf35 Mon Sep 17 00:00:00 2001 From: Morgan Dean Date: Sat, 26 Apr 2025 
15:15:36 -0700 Subject: [PATCH 03/20] Fix bugs with uitars loop config for multiple providers, add MLXVLM provider to agent core --- libs/agent/agent/core/factory.py | 1 + libs/agent/agent/core/provider_config.py | 2 ++ libs/agent/agent/providers/uitars/loop.py | 38 ++++++++++++++++------- 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/libs/agent/agent/core/factory.py b/libs/agent/agent/core/factory.py index 461b5cbc..f0c6046e 100644 --- a/libs/agent/agent/core/factory.py +++ b/libs/agent/agent/core/factory.py @@ -116,6 +116,7 @@ class LoopFactory: base_dir=trajectory_dir, only_n_most_recent_images=only_n_most_recent_images, provider_base_url=provider_base_url, + provider=provider, ) else: raise ValueError(f"Unsupported loop type: {loop_type}") diff --git a/libs/agent/agent/core/provider_config.py b/libs/agent/agent/core/provider_config.py index 21a5d283..f6cd1feb 100644 --- a/libs/agent/agent/core/provider_config.py +++ b/libs/agent/agent/core/provider_config.py @@ -8,6 +8,7 @@ DEFAULT_MODELS = { LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219", LLMProvider.OLLAMA: "gemma3:4b-it-q4_K_M", LLMProvider.OAICOMPAT: "Qwen2.5-VL-7B-Instruct", + LLMProvider.MLXVLM: "mlx-community/UI-TARS-1.5-7B-4bit", } # Map providers to their environment variable names @@ -16,4 +17,5 @@ ENV_VARS = { LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY", LLMProvider.OLLAMA: "none", LLMProvider.OAICOMPAT: "none", # OpenAI-compatible API typically doesn't require an API key + LLMProvider.MLXVLM: "none", # MLX VLM typically doesn't require an API key } diff --git a/libs/agent/agent/providers/uitars/loop.py b/libs/agent/agent/providers/uitars/loop.py index 0d3bc9f7..26f6913f 100644 --- a/libs/agent/agent/providers/uitars/loop.py +++ b/libs/agent/agent/providers/uitars/loop.py @@ -23,6 +23,7 @@ from .tools.computer import ToolResult from .prompts import COMPUTER_USE, SYSTEM_PROMPT from .clients.oaicompat import OAICompatClient +from .clients.mlxvlm import MLXLMVUITarsClient logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -44,6 +45,7 @@ class UITARSLoop(BaseLoop): computer: Computer, api_key: str, model: str, + provider: Optional[LLMProvider] = None, provider_base_url: Optional[str] = "http://localhost:8000/v1", only_n_most_recent_images: Optional[int] = 2, base_dir: Optional[str] = "trajectories", @@ -64,9 +66,10 @@ class UITARSLoop(BaseLoop): max_retries: Maximum number of retries for API calls retry_delay: Delay between retries in seconds save_trajectory: Whether to save trajectory data + provider: The LLM provider to use (defaults to OAICOMPAT if not specified) """ # Set provider before initializing base class - self.provider = LLMProvider.OAICOMPAT + self.provider = provider or LLMProvider.OAICOMPAT self.provider_base_url = provider_base_url # Initialize message manager with image retention config @@ -113,7 +116,7 @@ class UITARSLoop(BaseLoop): logger.error(f"Error initializing tool manager: {str(e)}") logger.warning("Will attempt to initialize tools on first use.") - # Initialize client for the OAICompat provider + # Initialize client for the selected provider try: await self.initialize_client() except Exception as e: @@ -128,18 +131,29 @@ class UITARSLoop(BaseLoop): """Initialize the appropriate client. Implements abstract method from BaseLoop to set up the specific - provider client (OAICompat for UI-TARS). + provider client based on the configured provider. 
""" try: - logger.info(f"Initializing OAICompat client for UI-TARS with model {self.model}...") - - self.client = OAICompatClient( - api_key=self.api_key or "EMPTY", # Local endpoints typically don't require an API key - model=self.model, - provider_base_url=self.provider_base_url, - ) - - logger.info(f"Initialized OAICompat client with model {self.model}") + if self.provider == LLMProvider.MLXVLM: + logger.info(f"Initializing MLX VLM client for UI-TARS with model {self.model}...") + + self.client = MLXLMVUITarsClient( + api_key=self.api_key, + model=self.model, + ) + + logger.info(f"Initialized MLX VLM client with model {self.model}") + else: + # Default to OAICompat client for other providers + logger.info(f"Initializing OAICompat client for UI-TARS with model {self.model}...") + + self.client = OAICompatClient( + api_key=self.api_key or "EMPTY", # Local endpoints typically don't require an API key + model=self.model, + provider_base_url=self.provider_base_url, + ) + + logger.info(f"Initialized OAICompat client with model {self.model}") except Exception as e: logger.error(f"Error initializing client: {str(e)}") self.client = None From 36887a82012ffa035f9f2f3e47e94dd12ca1b96b Mon Sep 17 00:00:00 2001 From: Morgan Dean Date: Sat, 26 Apr 2025 15:29:15 -0700 Subject: [PATCH 04/20] Add MLXVLM provider to agent_examples.py --- examples/agent_examples.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/agent_examples.py b/examples/agent_examples.py index 01d7483e..189ecddd 100644 --- a/examples/agent_examples.py +++ b/examples/agent_examples.py @@ -36,6 +36,7 @@ async def run_agent_example(): # model=LLM(provider=LLMProvider.OPENAI, name="gpt-4o"), # model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"), # model=LLM(provider=LLMProvider.OLLAMA, name="gemma3:4b-it-q4_K_M"), + # model=LLM(provider=LLMProvider.MLXVLM, name="mlx-community/UI-TARS-1.5-7B-4bit"), model=LLM( provider=LLMProvider.OAICOMPAT, name="gemma-3-12b-it", From 0a222c0fdf373e1c4e03985198a3334899af6046 Mon Sep 17 00:00:00 2001 From: Morgan Dean Date: Sat, 26 Apr 2025 18:44:45 -0700 Subject: [PATCH 05/20] Fix mispelling, update prompt to use PIL instead of temp files --- .../agent/providers/uitars/clients/mlxvlm.py | 63 +++++++++---------- libs/agent/agent/providers/uitars/loop.py | 5 +- 2 files changed, 30 insertions(+), 38 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/mlxvlm.py b/libs/agent/agent/providers/uitars/clients/mlxvlm.py index f644ce6d..77d83146 100644 --- a/libs/agent/agent/providers/uitars/clients/mlxvlm.py +++ b/libs/agent/agent/providers/uitars/clients/mlxvlm.py @@ -1,10 +1,12 @@ """MLX LVM client implementation.""" +import io import logging import base64 import tempfile import os from typing import Dict, List, Optional, Any, cast +from PIL import Image from .base import BaseUITarsClient import mlx.core as mx @@ -16,18 +18,21 @@ from transformers.tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) -class MLXLMVUITarsClient(BaseUITarsClient): +class MLXVLMUITarsClient(BaseUITarsClient): """MLX LVM client implementation class.""" - def __init__(self, api_key: Optional[str] = None, model: str = "mlx-community/UI-TARS-1.5-7B-4bit"): + def __init__(self, model: str = "mlx-community/UI-TARS-1.5-7B-4bit"): """Initialize MLX LVM client. 
Args: - api_key: Optional API key model: Model name or path (defaults to mlx-community/UI-TARS-1.5-7B-4bit) """ - self.api_key = api_key - self.model = model + # Load model and processor + model_obj, processor = load(model) + self.config = load_config(model) + self.model = model_obj + self.processor = processor + async def run_interleaved( self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None @@ -78,39 +83,34 @@ class MLXLMVUITarsClient(BaseUITarsClient): # Combine all text parts into a single prompt combined_prompt = "\n".join(prompt_parts) + processed_images = [] + for img in images: + if img.startswith('data:image/'): + # Extract base64 data + base64_data = img.split(',')[1] + + # Convert base64 to PIL Image directly + image_data = base64.b64decode(base64_data) + pil_image = Image.open(io.BytesIO(image_data)) + processed_images.append(pil_image) + else: + # Assume it's already a valid URL or path + # For file paths or URLs, we'll load them with PIL + pil_image = Image.open(img) + processed_images.append(pil_image) try: - # Load model and processor - model_obj, processor = load(self.model) - config = load_config(self.model) - - # Process images to ensure they're in the right format - processed_images = [] - for img in images: - if img.startswith('data:image/'): - # Extract base64 data - img_format = img.split(';')[0].split('/')[1] - base64_data = img.split(',')[1] - - # Create a temporary file to store the image - with tempfile.NamedTemporaryFile(suffix=f'.{img_format}', delete=False) as temp_file: - temp_file.write(base64.b64decode(base64_data)) - processed_images.append(temp_file.name) - else: - # Assume it's already a valid URL or path - processed_images.append(img) - # Format prompt according to model requirements formatted_prompt = apply_chat_template( - processor, config, str(combined_prompt), num_images=len(processed_images) + self.processor, self.config, str(combined_prompt), num_images=len(processed_images) ) # Cast processor to PreTrainedTokenizer to satisfy type checker - tokenizer = cast(PreTrainedTokenizer, processor) + tokenizer = cast(PreTrainedTokenizer, self.processor) # Generate response output = generate( - model_obj, + self.model, tokenizer, str(formatted_prompt), processed_images, @@ -118,13 +118,6 @@ class MLXLMVUITarsClient(BaseUITarsClient): max_tokens=max_tokens ) - # Clean up temporary files - for img_path in processed_images: - if img_path.startswith(tempfile.gettempdir()) and os.path.exists(img_path): - try: - os.unlink(img_path) - except Exception as e: - logger.warning(f"Failed to delete temporary file {img_path}: {e}") except Exception as e: logger.error(f"Error generating response: {str(e)}") return { diff --git a/libs/agent/agent/providers/uitars/loop.py b/libs/agent/agent/providers/uitars/loop.py index 26f6913f..c0ea6c73 100644 --- a/libs/agent/agent/providers/uitars/loop.py +++ b/libs/agent/agent/providers/uitars/loop.py @@ -23,7 +23,7 @@ from .tools.computer import ToolResult from .prompts import COMPUTER_USE, SYSTEM_PROMPT from .clients.oaicompat import OAICompatClient -from .clients.mlxvlm import MLXLMVUITarsClient +from .clients.mlxvlm import MLXVLMUITarsClient logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -137,8 +137,7 @@ class UITARSLoop(BaseLoop): if self.provider == LLMProvider.MLXVLM: logger.info(f"Initializing MLX VLM client for UI-TARS with model {self.model}...") - self.client = MLXLMVUITarsClient( - api_key=self.api_key, + self.client = MLXVLMUITarsClient( model=self.model, ) 
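For context, the core of patch 05's change is decoding the screenshot in memory instead of round-tripping through a temporary file on disk. A minimal sketch of that pattern, with Pillow as the only dependency (the helper name `data_url_to_pil` is ours, not part of the diff):

```python
import base64
import io

from PIL import Image


def data_url_to_pil(image_url: str) -> Image.Image:
    """Decode a base64 data URL (or open a plain path) as a PIL image."""
    if image_url.startswith("data:image/"):
        # Drop the "data:image/png;base64," prefix and decode entirely in memory.
        base64_data = image_url.split(",")[1]
        return Image.open(io.BytesIO(base64.b64decode(base64_data)))
    # Fall back to treating the value as a file path, as the client does.
    return Image.open(image_url)
```

This is the same decode path the revised client applies to each `image_url` item before handing the PIL images to `mlx_vlm.generate`.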
From 8da80d5ebf928b1bef465f2a85c440517eead93c Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 10:32:50 -0400 Subject: [PATCH 06/20] added mlx vlm to cua-agent[all] --- libs/agent/pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libs/agent/pyproject.toml b/libs/agent/pyproject.toml index d3b97112..8772575c 100644 --- a/libs/agent/pyproject.toml +++ b/libs/agent/pyproject.toml @@ -85,7 +85,8 @@ all = [ "requests>=2.31.0,<3.0.0", "ollama>=0.4.7,<0.5.0", "gradio>=5.23.3,<6.0.0", - "python-dotenv>=1.0.1,<2.0.0" + "python-dotenv>=1.0.1,<2.0.0", + "mlx-vlm>=0.1.25" ] [tool.pdm] From 0b61dea8a4f5678722e7a8be76a3a1025e7a594d Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 10:42:04 -0400 Subject: [PATCH 07/20] use model chat template --- .../agent/providers/uitars/clients/mlxvlm.py | 81 +++++++------------ 1 file changed, 27 insertions(+), 54 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/mlxvlm.py b/libs/agent/agent/providers/uitars/clients/mlxvlm.py index 77d83146..0eca5292 100644 --- a/libs/agent/agent/providers/uitars/clients/mlxvlm.py +++ b/libs/agent/agent/providers/uitars/clients/mlxvlm.py @@ -47,73 +47,46 @@ class MLXVLMUITarsClient(BaseUITarsClient): Returns: Response dict """ - # Extract text and images from messages - prompt_parts = [] - images = [] - - # Add system message first - prompt_parts.append(system) - - for msg in messages: - role = msg.get("role", "user") - content = msg.get("content", []) + # Ensure the system message is included + if not any(msg.get("role") == "system" for msg in messages): + messages = [{"role": "system", "content": system}] + messages - # Handle different content formats - if isinstance(content, str): - # If content is a string, just add it as text - prompt_parts.append(f"{role}: {content}") - elif isinstance(content, list): - # If content is a list, process each item - text_parts = [] - + # Extract any images from the messages + images = [] + for msg in messages: + content = msg.get("content", []) + if isinstance(content, list): for item in content: - if item.get("type") == "text": - text_parts.append(item.get("text", "")) - elif item.get("type") == "image_url": - # Extract image URL and add to images list + if item.get("type") == "image_url": image_url = item.get("image_url", {}).get("url", "") if image_url.startswith("data:image/"): - # Extract base64 data and convert to URL or save as temp file - # For now, we'll just store the URL directly - images.append(image_url) - - # Add text parts to prompt - if text_parts: - prompt_parts.append(f"{role}: {''.join(text_parts)}") - - # Combine all text parts into a single prompt - combined_prompt = "\n".join(prompt_parts) - processed_images = [] - for img in images: - if img.startswith('data:image/'): - # Extract base64 data - base64_data = img.split(',')[1] - - # Convert base64 to PIL Image directly - image_data = base64.b64decode(base64_data) - pil_image = Image.open(io.BytesIO(image_data)) - processed_images.append(pil_image) - else: - # Assume it's already a valid URL or path - # For file paths or URLs, we'll load them with PIL - pil_image = Image.open(img) - processed_images.append(pil_image) + # Extract base64 data + base64_data = image_url.split(',')[1] + + # Convert base64 to PIL Image + image_data = base64.b64decode(base64_data) + pil_image = Image.open(io.BytesIO(image_data)) + images.append(pil_image) + else: + # Handle file path or URL + pil_image = Image.open(image_url) + images.append(pil_image) try: - # 
Format prompt according to model requirements - formatted_prompt = apply_chat_template( - self.processor, self.config, str(combined_prompt), num_images=len(processed_images) + # Format prompt according to model requirements using the processor directly + prompt = self.processor.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True ) - - # Cast processor to PreTrainedTokenizer to satisfy type checker tokenizer = cast(PreTrainedTokenizer, self.processor) # Generate response output = generate( self.model, tokenizer, - str(formatted_prompt), - processed_images, + str(prompt), + images, verbose=False, max_tokens=max_tokens ) From 184db1037ff3bffc2bb0fe4f1cb864c7f4dbaaa3 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 14:28:17 -0400 Subject: [PATCH 08/20] add to gradio ui --- libs/agent/agent/ui/gradio/app.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/libs/agent/agent/ui/gradio/app.py b/libs/agent/agent/ui/gradio/app.py index c6ac57ea..16c5f7e6 100644 --- a/libs/agent/agent/ui/gradio/app.py +++ b/libs/agent/agent/ui/gradio/app.py @@ -163,8 +163,10 @@ MODEL_MAPPINGS = { "claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219", }, "uitars": { - # UI-TARS models default to custom endpoint - "default": "ByteDance-Seed/UI-TARS-1.5-7B", + # UI-TARS models using MLXVLM provider + "default": "mlx-community/UI-TARS-1.5-7B-4bit", + "UI-TARS-1.5-7B-4bit": "mlx-community/UI-TARS-1.5-7B-4bit", + "UI-TARS-1.5-7B-6bit": "mlx-community/UI-TARS-1.5-7B-6bit" }, "ollama": { # For Ollama models, we keep the original name @@ -287,8 +289,16 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple: model_name_to_use = cleaned_model_name # agent_loop remains AgentLoop.OMNI elif agent_loop == AgentLoop.UITARS: - provider = LLMProvider.OAICOMPAT - model_name_to_use = MODEL_MAPPINGS["uitars"]["default"] # Default + # For UITARS, use MLXVLM provider for the MLX models, OAICOMPAT for custom + if model_name == "Custom model...": + provider = LLMProvider.OAICOMPAT + model_name_to_use = "tgi" + else: + provider = LLMProvider.MLXVLM + # Get the model name from the mappings or use as-is if not found + model_name_to_use = MODEL_MAPPINGS["uitars"].get( + model_name, model_name if model_name else MODEL_MAPPINGS["uitars"]["default"] + ) else: # Default to OpenAI if unrecognized loop provider = LLMProvider.OPENAI @@ -558,7 +568,11 @@ def create_gradio_ui( "OPENAI": openai_models, "ANTHROPIC": anthropic_models, "OMNI": omni_models + ["Custom model..."], # Add custom model option - "UITARS": ["Custom model..."], # UI-TARS options + "UITARS": [ + "mlx-community/UI-TARS-1.5-7B-4bit", + "mlx-community/UI-TARS-1.5-7B-6bit", + "Custom model..." 
+ ], # UI-TARS options with MLX models } # --- Apply Saved Settings (override defaults if available) --- From a87efe86da5754a013fcc01d0d1f7cf1126a7dec Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 14:34:59 -0400 Subject: [PATCH 09/20] correct mappings --- libs/agent/agent/ui/gradio/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/agent/agent/ui/gradio/app.py b/libs/agent/agent/ui/gradio/app.py index 16c5f7e6..354580d7 100644 --- a/libs/agent/agent/ui/gradio/app.py +++ b/libs/agent/agent/ui/gradio/app.py @@ -165,8 +165,8 @@ MODEL_MAPPINGS = { "uitars": { # UI-TARS models using MLXVLM provider "default": "mlx-community/UI-TARS-1.5-7B-4bit", - "UI-TARS-1.5-7B-4bit": "mlx-community/UI-TARS-1.5-7B-4bit", - "UI-TARS-1.5-7B-6bit": "mlx-community/UI-TARS-1.5-7B-6bit" + "mlx-community/UI-TARS-1.5-7B-4bit": "mlx-community/UI-TARS-1.5-7B-4bit", + "mlx-community/UI-TARS-1.5-7B-6bit": "mlx-community/UI-TARS-1.5-7B-6bit" }, "ollama": { # For Ollama models, we keep the original name From f2501ee6b0f6ee3dc65478a3bb5aabc789a2fc32 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 14:41:13 -0400 Subject: [PATCH 10/20] add to readme --- README.md | 4 ++-- libs/agent/README.md | 29 +++++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 53102fcb..b0630760 100644 --- a/README.md +++ b/README.md @@ -80,8 +80,8 @@ If you want to use AI agents with virtualized environments: async with Computer(verbosity=logging.DEBUG) as macos_computer: agent = ComputerAgent( computer=macos_computer, - loop=AgentLoop.OPENAI, # or AgentLoop.ANTHROPIC, or AgentLoop.OMNI - model=LLM(provider=LLMProvider.OPENAI) # or LLM(provider=LLMProvider.ANTHROPIC) + loop=AgentLoop.OPENAI, # or AgentLoop.UITARS, AgentLoop.OMNI, or AgentLoop.ANTHROPIC + model=LLM(provider=LLMProvider.OPENAI) # or LLM(provider=LLMProvider.MLXVLM, name="mlx-community/UI-TARS-1.5-7B-4bit") ) tasks = [ diff --git a/libs/agent/README.md b/libs/agent/README.md index e5dad869..bc4bce32 100644 --- a/libs/agent/README.md +++ b/libs/agent/README.md @@ -136,7 +136,32 @@ The Gradio UI provides: ### Using UI-TARS -You can use UI-TARS by first following the [deployment guide](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md). This will give you a provider URL like this: `https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1` which you can use in the gradio UI. +The UI-TARS models are available in two forms: + +1. **MLX UI-TARS models** (Default): These models run locally using MLXVLM provider + - `mlx-community/UI-TARS-1.5-7B-4bit` (default) - 4-bit quantized version + - `mlx-community/UI-TARS-1.5-7B-6bit` - 6-bit quantized version for higher quality + + ```python + agent = ComputerAgent( + computer=macos_computer, + loop=AgentLoop.UITARS, + model=LLM(provider=LLMProvider.MLXVLM, name="mlx-community/UI-TARS-1.5-7B-4bit") + ) + ``` + +2. 
**OpenAI-compatible UI-TARS**: For using the original ByteDance model + - If you want to use the original ByteDance UI-TARS model via an OpenAI-compatible API, follow the [deployment guide](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md) + - This will give you a provider URL like `https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1` which you can use in the code or Gradio UI: + + ```python + agent = ComputerAgent( + computer=macos_computer, + loop=AgentLoop.UITARS, + model=LLM(provider=LLMProvider.OAICOMPAT, name="tgi", + provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1") + ) + ``` ## Agent Loops @@ -146,7 +171,7 @@ The `cua-agent` package provides three agent loops variations, based on differen |:-----------|:-----------------|:------------|:-------------| | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required | | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`
• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required | -| `AgentLoop.UITARS` | • `ByteDance-Seed/UI-TARS-1.5-7B` | Uses ByteDance's UI-TARS 1.5 model | Not Required | +| `AgentLoop.UITARS` | • `mlx-community/UI-TARS-1.5-7B-4bit` (default)
• `mlx-community/UI-TARS-1.5-7B-6bit`
• `ByteDance-Seed/UI-TARS-1.5-7B` (via OpenAI-compatible endpoint) | Uses UI-TARS models with MLXVLM (default) or OAICOMPAT providers | Not Required |
| `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`
• `claude-3-7-sonnet-20250219`
• `gpt-4.5-preview`
• `gpt-4o`
• `gpt-4`
• `phi4`
• `phi4-mini`
• `gemma3`
• `...`
• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser | ## AgentResponse From 8cc28612c860876ab8ce1be81b8695fdc350b132 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 14:57:16 -0400 Subject: [PATCH 11/20] log model name instead of model --- libs/agent/agent/providers/uitars/clients/mlxvlm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/mlxvlm.py b/libs/agent/agent/providers/uitars/clients/mlxvlm.py index 0eca5292..d1e5dfff 100644 --- a/libs/agent/agent/providers/uitars/clients/mlxvlm.py +++ b/libs/agent/agent/providers/uitars/clients/mlxvlm.py @@ -32,6 +32,7 @@ class MLXVLMUITarsClient(BaseUITarsClient): self.config = load_config(model) self.model = model_obj self.processor = processor + self.model_name = model async def run_interleaved( @@ -103,7 +104,7 @@ class MLXVLMUITarsClient(BaseUITarsClient): "finish_reason": "error" } ], - "model": self.model, + "model": self.model_name, "error": str(e) } @@ -118,7 +119,7 @@ class MLXVLMUITarsClient(BaseUITarsClient): "finish_reason": "stop" } ], - "model": self.model + "model": self.model_name } return response From 00eb09209c27129cacf55920d34f5051de414619 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 15:23:50 -0400 Subject: [PATCH 12/20] added forced resolution --- .../agent/providers/uitars/clients/mlxvlm.py | 133 +++++++++++++++--- 1 file changed, 111 insertions(+), 22 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/mlxvlm.py b/libs/agent/agent/providers/uitars/clients/mlxvlm.py index d1e5dfff..c0c9b459 100644 --- a/libs/agent/agent/providers/uitars/clients/mlxvlm.py +++ b/libs/agent/agent/providers/uitars/clients/mlxvlm.py @@ -5,7 +5,8 @@ import logging import base64 import tempfile import os -from typing import Dict, List, Optional, Any, cast +import re +from typing import Dict, List, Optional, Any, cast, Tuple from PIL import Image from .base import BaseUITarsClient @@ -21,11 +22,17 @@ logger = logging.getLogger(__name__) class MLXVLMUITarsClient(BaseUITarsClient): """MLX LVM client implementation class.""" - def __init__(self, model: str = "mlx-community/UI-TARS-1.5-7B-4bit"): + def __init__( + self, + model: str = "mlx-community/UI-TARS-1.5-7B-4bit", + force_resolution: Optional[Tuple[int, int]] = (1512, 982) + ): """Initialize MLX LVM client. Args: model: Model name or path (defaults to mlx-community/UI-TARS-1.5-7B-4bit) + force_resolution: Optional target resolution to resize images to (width, height). + If None, images will not be resized. """ # Load model and processor model_obj, processor = load(model) @@ -33,8 +40,32 @@ class MLXVLMUITarsClient(BaseUITarsClient): self.model = model_obj self.processor = processor self.model_name = model + self.force_resolution = force_resolution + def _remap_coordinates(self, text: str, original_size: Tuple[int, int], target_size: Tuple[int, int]) -> str: + """Remap coordinates in box tokens based on image resizing. 
+ + Args: + text: Text containing box tokens + original_size: Original image size (width, height) + target_size: Target image size (width, height) + + Returns: + Text with remapped coordinates + """ + # Find all box tokens + box_pattern = r"<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>" + + def remap_coords(match): + x, y = int(match.group(1)), int(match.group(2)) + # Scale coordinates to new dimensions + new_x = int(x * target_size[0] / original_size[0]) + new_y = int(y * target_size[1] / original_size[1]) + return f"<|box_start|>({new_x},{new_y})<|box_end|>" + + return re.sub(box_pattern, remap_coords, text) + async def run_interleaved( self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None ) -> Dict[str, Any]: @@ -51,32 +82,79 @@ class MLXVLMUITarsClient(BaseUITarsClient): # Ensure the system message is included if not any(msg.get("role") == "system" for msg in messages): messages = [{"role": "system", "content": system}] + messages - - # Extract any images from the messages + + # Create a deep copy of messages to avoid modifying the original + processed_messages = messages.copy() + + # Extract images and process messages if force_resolution is set images = [] - for msg in messages: + original_sizes = {} # Track original sizes of images for coordinate remapping + image_index = 0 + + for msg_idx, msg in enumerate(messages): content = msg.get("content", []) - if isinstance(content, list): - for item in content: - if item.get("type") == "image_url": - image_url = item.get("image_url", {}).get("url", "") - if image_url.startswith("data:image/"): - # Extract base64 data - base64_data = image_url.split(',')[1] - - # Convert base64 to PIL Image - image_data = base64.b64decode(base64_data) - pil_image = Image.open(io.BytesIO(image_data)) - images.append(pil_image) - else: - # Handle file path or URL - pil_image = Image.open(image_url) - images.append(pil_image) + if not isinstance(content, list): + continue + + # Create a copy of the content list to modify + processed_content = [] + + for item_idx, item in enumerate(content): + if item.get("type") == "image_url": + image_url = item.get("image_url", {}).get("url", "") + pil_image = None + + if image_url.startswith("data:image/"): + # Extract base64 data + base64_data = image_url.split(',')[1] + # Convert base64 to PIL Image + image_data = base64.b64decode(base64_data) + pil_image = Image.open(io.BytesIO(image_data)) + else: + # Handle file path or URL + pil_image = Image.open(image_url) + + # Store original image size for coordinate mapping + original_sizes[image_index] = pil_image.size + + # Resize image if force_resolution is set + if self.force_resolution: + pil_image = pil_image.resize(self.force_resolution) + + images.append(pil_image) + image_index += 1 + + # Copy items to processed content list + processed_content.append(item.copy()) + + # Update the processed message content + processed_messages[msg_idx] = msg.copy() + processed_messages[msg_idx]["content"] = processed_content + + # Remap coordinates in messages with box tokens if force_resolution is set + if self.force_resolution and original_sizes: + for msg_idx, msg in enumerate(processed_messages): + content = msg.get("content", []) + if not isinstance(content, list): + continue + + for item_idx, item in enumerate(content): + if item.get("type") == "text": + text_content = item.get("text", "") + + # Check if there are any box tokens to remap + if "<|box_start|>" in text_content: + # Use the first image's dimensions as reference (most common case) + if 0 
in original_sizes: + orig_size = original_sizes[0] + processed_messages[msg_idx]["content"][item_idx]["text"] = self._remap_coordinates( + text_content, orig_size, self.force_resolution + ) try: # Format prompt according to model requirements using the processor directly prompt = self.processor.apply_chat_template( - messages, + processed_messages, # Use processed messages instead of original tokenize=False, add_generation_prompt=True ) @@ -108,6 +186,17 @@ class MLXVLMUITarsClient(BaseUITarsClient): "error": str(e) } + # Remap coordinates in the response back to original image space if needed + if self.force_resolution and original_sizes and 0 in original_sizes: + # Get original image size (using the first image) + orig_size = original_sizes[0] + + # Check if output contains box tokens that need remapping + if "<|box_start|>" in output: + # Remap coordinates from model space back to original image space + # We just swap the arguments - from force_resolution back to original size + output = self._remap_coordinates(output, self.force_resolution, orig_size) + # Format response to match OpenAI format response = { "choices": [ From 8e8200dc17595f692aa283d27ba92888ac8d7d7f Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 16:32:48 -0400 Subject: [PATCH 13/20] extra coordinate processing --- .../agent/providers/uitars/clients/mlxvlm.py | 133 +++++++++++------- 1 file changed, 81 insertions(+), 52 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/mlxvlm.py b/libs/agent/agent/providers/uitars/clients/mlxvlm.py index c0c9b459..6a88a8a3 100644 --- a/libs/agent/agent/providers/uitars/clients/mlxvlm.py +++ b/libs/agent/agent/providers/uitars/clients/mlxvlm.py @@ -6,6 +6,7 @@ import base64 import tempfile import os import re +import math from typing import Dict, List, Optional, Any, cast, Tuple from PIL import Image @@ -18,53 +19,95 @@ from transformers.tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) +# Constants for smart_resize +IMAGE_FACTOR = 28 +MIN_PIXELS = 100 * 28 * 28 +MAX_PIXELS = 16384 * 28 * 28 +MAX_RATIO = 200 + +def round_by_factor(number: float, factor: int) -> int: + """Returns the closest integer to 'number' that is divisible by 'factor'.""" + return round(number / factor) * factor + +def ceil_by_factor(number: float, factor: int) -> int: + """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'.""" + return math.ceil(number / factor) * factor + +def floor_by_factor(number: float, factor: int) -> int: + """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'.""" + return math.floor(number / factor) * factor + +def smart_resize( + height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS +) -> tuple[int, int]: + """ + Rescales the image so that the following conditions are met: + + 1. Both dimensions (height and width) are divisible by 'factor'. + 2. The total number of pixels is within the range ['min_pixels', 'max_pixels']. + 3. The aspect ratio of the image is maintained as closely as possible. 
+ """ + if max(height, width) / min(height, width) > MAX_RATIO: + raise ValueError( + f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}" + ) + h_bar = max(factor, round_by_factor(height, factor)) + w_bar = max(factor, round_by_factor(width, factor)) + if h_bar * w_bar > max_pixels: + beta = math.sqrt((height * width) / max_pixels) + h_bar = floor_by_factor(height / beta, factor) + w_bar = floor_by_factor(width / beta, factor) + elif h_bar * w_bar < min_pixels: + beta = math.sqrt(min_pixels / (height * width)) + h_bar = ceil_by_factor(height * beta, factor) + w_bar = ceil_by_factor(width * beta, factor) + return h_bar, w_bar class MLXVLMUITarsClient(BaseUITarsClient): """MLX LVM client implementation class.""" def __init__( self, - model: str = "mlx-community/UI-TARS-1.5-7B-4bit", - force_resolution: Optional[Tuple[int, int]] = (1512, 982) + model: str = "mlx-community/UI-TARS-1.5-7B-4bit" ): """Initialize MLX LVM client. Args: model: Model name or path (defaults to mlx-community/UI-TARS-1.5-7B-4bit) - force_resolution: Optional target resolution to resize images to (width, height). - If None, images will not be resized. """ # Load model and processor - model_obj, processor = load(model) + model_obj, processor = load( + model, + processor_kwargs={"min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS} + ) self.config = load_config(model) self.model = model_obj self.processor = processor self.model_name = model - self.force_resolution = force_resolution - - def _remap_coordinates(self, text: str, original_size: Tuple[int, int], target_size: Tuple[int, int]) -> str: - """Remap coordinates in box tokens based on image resizing. + def _process_coordinates(self, text: str, original_size: Tuple[int, int], model_size: Tuple[int, int]) -> str: + """Process coordinates in box tokens based on image resizing using smart_resize approach. 
Args: text: Text containing box tokens original_size: Original image size (width, height) - target_size: Target image size (width, height) + model_size: Model processed image size (width, height) Returns: - Text with remapped coordinates + Text with processed coordinates """ # Find all box tokens box_pattern = r"<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>" - def remap_coords(match): - x, y = int(match.group(1)), int(match.group(2)) - # Scale coordinates to new dimensions - new_x = int(x * target_size[0] / original_size[0]) - new_y = int(y * target_size[1] / original_size[1]) + def process_coords(match): + model_x, model_y = int(match.group(1)), int(match.group(2)) + # Scale coordinates from model space to original image space + # Note that model_size is (height, width) while original_size is (width, height) + new_x = int(model_x * original_size[0] / model_size[1]) # Width + new_y = int(model_y * original_size[1] / model_size[0]) # Height return f"<|box_start|>({new_x},{new_y})<|box_end|>" - return re.sub(box_pattern, remap_coords, text) + return re.sub(box_pattern, process_coords, text) async def run_interleaved( self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None @@ -86,9 +129,10 @@ class MLXVLMUITarsClient(BaseUITarsClient): # Create a deep copy of messages to avoid modifying the original processed_messages = messages.copy() - # Extract images and process messages if force_resolution is set + # Extract images and process messages images = [] - original_sizes = {} # Track original sizes of images for coordinate remapping + original_sizes = {} # Track original sizes of images for coordinate mapping + model_sizes = {} # Track model processed sizes image_index = 0 for msg_idx, msg in enumerate(messages): @@ -115,13 +159,18 @@ class MLXVLMUITarsClient(BaseUITarsClient): pil_image = Image.open(image_url) # Store original image size for coordinate mapping - original_sizes[image_index] = pil_image.size + original_size = pil_image.size + original_sizes[image_index] = original_size - # Resize image if force_resolution is set - if self.force_resolution: - pil_image = pil_image.resize(self.force_resolution) + # Use smart_resize to determine model size + # Note: smart_resize expects (height, width) but PIL gives (width, height) + height, width = original_size[1], original_size[0] + new_height, new_width = smart_resize(height, width) + model_sizes[image_index] = (new_height, new_width) - images.append(pil_image) + # Resize the image using the calculated dimensions from smart_resize + resized_image = pil_image.resize((new_width, new_height)) + images.append(resized_image) image_index += 1 # Copy items to processed content list @@ -131,30 +180,10 @@ class MLXVLMUITarsClient(BaseUITarsClient): processed_messages[msg_idx] = msg.copy() processed_messages[msg_idx]["content"] = processed_content - # Remap coordinates in messages with box tokens if force_resolution is set - if self.force_resolution and original_sizes: - for msg_idx, msg in enumerate(processed_messages): - content = msg.get("content", []) - if not isinstance(content, list): - continue - - for item_idx, item in enumerate(content): - if item.get("type") == "text": - text_content = item.get("text", "") - - # Check if there are any box tokens to remap - if "<|box_start|>" in text_content: - # Use the first image's dimensions as reference (most common case) - if 0 in original_sizes: - orig_size = original_sizes[0] - processed_messages[msg_idx]["content"][item_idx]["text"] = self._remap_coordinates( - text_content, 
orig_size, self.force_resolution - ) - try: # Format prompt according to model requirements using the processor directly prompt = self.processor.apply_chat_template( - processed_messages, # Use processed messages instead of original + processed_messages, tokenize=False, add_generation_prompt=True ) @@ -186,16 +215,16 @@ class MLXVLMUITarsClient(BaseUITarsClient): "error": str(e) } - # Remap coordinates in the response back to original image space if needed - if self.force_resolution and original_sizes and 0 in original_sizes: - # Get original image size (using the first image) + # Process coordinates in the response back to original image space + if original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes: + # Get original image size and model size (using the first image) orig_size = original_sizes[0] + model_size = model_sizes[0] - # Check if output contains box tokens that need remapping + # Check if output contains box tokens that need processing if "<|box_start|>" in output: - # Remap coordinates from model space back to original image space - # We just swap the arguments - from force_resolution back to original size - output = self._remap_coordinates(output, self.force_resolution, orig_size) + # Process coordinates from model space back to original image space + output = self._process_coordinates(output, orig_size, model_size) # Format response to match OpenAI format response = { From 0abd72ff99a08f30d3872cb72bc615a9ec38c375 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 17:04:46 -0400 Subject: [PATCH 14/20] less confusing coordinate spaces --- .../agent/providers/uitars/clients/mlxvlm.py | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/mlxvlm.py b/libs/agent/agent/providers/uitars/clients/mlxvlm.py index 6a88a8a3..24f41f34 100644 --- a/libs/agent/agent/providers/uitars/clients/mlxvlm.py +++ b/libs/agent/agent/providers/uitars/clients/mlxvlm.py @@ -102,9 +102,9 @@ class MLXVLMUITarsClient(BaseUITarsClient): def process_coords(match): model_x, model_y = int(match.group(1)), int(match.group(2)) # Scale coordinates from model space to original image space - # Note that model_size is (height, width) while original_size is (width, height) - new_x = int(model_x * original_size[0] / model_size[1]) # Width - new_y = int(model_y * original_size[1] / model_size[0]) # Height + # Both original_size and model_size are in (width, height) format + new_x = int(model_x * original_size[0] / model_size[0]) # Width + new_y = int(model_y * original_size[1] / model_size[1]) # Height return f"<|box_start|>({new_x},{new_y})<|box_end|>" return re.sub(box_pattern, process_coords, text) @@ -166,7 +166,8 @@ class MLXVLMUITarsClient(BaseUITarsClient): # Note: smart_resize expects (height, width) but PIL gives (width, height) height, width = original_size[1], original_size[0] new_height, new_width = smart_resize(height, width) - model_sizes[image_index] = (new_height, new_width) + # Store model size in (width, height) format for consistent coordinate processing + model_sizes[image_index] = (new_width, new_height) # Resize the image using the calculated dimensions from smart_resize resized_image = pil_image.resize((new_width, new_height)) @@ -180,6 +181,18 @@ class MLXVLMUITarsClient(BaseUITarsClient): processed_messages[msg_idx] = msg.copy() processed_messages[msg_idx]["content"] = processed_content + logger.info(f"resized {len(images)} from {original_sizes[0]} to {model_sizes[0]}") + + # Process 
user text input with box coordinates after image processing + # Swap original_size and model_size arguments for inverse transformation + for msg_idx, msg in enumerate(processed_messages): + if msg.get("role") == "user" and isinstance(msg.get("content"), str): + if "<|box_start|>" in msg.get("content") and original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes: + orig_size = original_sizes[0] + model_size = model_sizes[0] + # Swap arguments to perform inverse transformation for user input + processed_messages[msg_idx]["content"] = self._process_coordinates(msg["content"], model_size, orig_size) + try: # Format prompt according to model requirements using the processor directly prompt = self.processor.apply_chat_template( From 0304c45de5e43e2fd74d733a2d10493943bf880c Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 19:12:32 -0400 Subject: [PATCH 15/20] fix endpoint not liking string message content --- .../agent/providers/omni/clients/oaicompat.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/libs/agent/agent/providers/omni/clients/oaicompat.py b/libs/agent/agent/providers/omni/clients/oaicompat.py index 6a95896a..b15515fd 100644 --- a/libs/agent/agent/providers/omni/clients/oaicompat.py +++ b/libs/agent/agent/providers/omni/clients/oaicompat.py @@ -93,7 +93,14 @@ class OAICompatClient(BaseOmniClient): """ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"} - final_messages = [{"role": "system", "content": system}] + final_messages = [ + { + "role": "system", + "content": [ + { "type": "text", "text": system } + ] + } + ] # Process messages for item in messages: @@ -117,7 +124,10 @@ class OAICompatClient(BaseOmniClient): else: message = { "role": item["role"], - "content": [{"type": "text", "text": item["content"]}], + "content": [{ + "type": "text", + "text": item["content"] + }], } final_messages.append(message) else: From 6a6fe48dbca0bd8f17652c538e08183ba289eefe Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 5 May 2025 10:31:15 -0400 Subject: [PATCH 16/20] use prncvrm's mlx-vlm patch for testing --- .../agent/providers/uitars/clients/mlxvlm.py | 17 ++++++++++++----- libs/agent/agent/providers/uitars/utils.py | 2 +- libs/agent/pyproject.toml | 4 ++-- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/mlxvlm.py b/libs/agent/agent/providers/uitars/clients/mlxvlm.py index 24f41f34..197b08cb 100644 --- a/libs/agent/agent/providers/uitars/clients/mlxvlm.py +++ b/libs/agent/agent/providers/uitars/clients/mlxvlm.py @@ -202,8 +202,10 @@ class MLXVLMUITarsClient(BaseUITarsClient): ) tokenizer = cast(PreTrainedTokenizer, self.processor) + print("generating response...") + # Generate response - output = generate( + text_content, usage = generate( self.model, tokenizer, str(prompt), @@ -212,6 +214,10 @@ class MLXVLMUITarsClient(BaseUITarsClient): max_tokens=max_tokens ) + from pprint import pprint + print("DEBUG - AGENT GENERATION --------") + pprint(text_content) + print("DEBUG - AGENT GENERATION --------") except Exception as e: logger.error(f"Error generating response: {str(e)}") return { @@ -235,9 +241,9 @@ class MLXVLMUITarsClient(BaseUITarsClient): model_size = model_sizes[0] # Check if output contains box tokens that need processing - if "<|box_start|>" in output: + if "<|box_start|>" in text_content: # Process coordinates from model space back to original image space - output = self._process_coordinates(output, 
orig_size, model_size) + text_content = self._process_coordinates(text_content, orig_size, model_size) # Format response to match OpenAI format response = { @@ -245,12 +251,13 @@ class MLXVLMUITarsClient(BaseUITarsClient): { "message": { "role": "assistant", - "content": output + "content": text_content }, "finish_reason": "stop" } ], - "model": self.model_name + "model": self.model_name, + "usage": usage } return response diff --git a/libs/agent/agent/providers/uitars/utils.py b/libs/agent/agent/providers/uitars/utils.py index cc904115..bdfd58cd 100644 --- a/libs/agent/agent/providers/uitars/utils.py +++ b/libs/agent/agent/providers/uitars/utils.py @@ -105,7 +105,7 @@ async def to_agent_response_format( } ], truncation="auto", - usage=response["usage"], + usage=response.get("usage", {}), user=None, metadata={}, response=response diff --git a/libs/agent/pyproject.toml b/libs/agent/pyproject.toml index 8772575c..1289adca 100644 --- a/libs/agent/pyproject.toml +++ b/libs/agent/pyproject.toml @@ -36,7 +36,7 @@ openai = [ ] uitars = [ "httpx>=0.27.0,<0.29.0", - "mlx-vlm>=0.1.25" + "mlx-vlm @ git+https://github.com/prncvrm/mlx-vlm.git@fix/qwen2-position-id" ] ui = [ "gradio>=5.23.3,<6.0.0", @@ -86,7 +86,7 @@ all = [ "ollama>=0.4.7,<0.5.0", "gradio>=5.23.3,<6.0.0", "python-dotenv>=1.0.1,<2.0.0", - "mlx-vlm>=0.1.25" + "mlx-vlm @ git+https://github.com/prncvrm/mlx-vlm.git@fix/qwen2-position-id" ] [tool.pdm] From 44ef3e3bbe302c2e829d231ba6683bdc77c665f2 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 6 May 2025 15:45:03 -0400 Subject: [PATCH 17/20] use my own mlx-vlm patch --- libs/agent/pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/agent/pyproject.toml b/libs/agent/pyproject.toml index 1289adca..94d97889 100644 --- a/libs/agent/pyproject.toml +++ b/libs/agent/pyproject.toml @@ -36,7 +36,7 @@ openai = [ ] uitars = [ "httpx>=0.27.0,<0.29.0", - "mlx-vlm @ git+https://github.com/prncvrm/mlx-vlm.git@fix/qwen2-position-id" + "mlx-vlm @ git+https://github.com/ddupont808/mlx-vlm.git@fix/qwen2-position-id" ] ui = [ "gradio>=5.23.3,<6.0.0", @@ -86,7 +86,7 @@ all = [ "ollama>=0.4.7,<0.5.0", "gradio>=5.23.3,<6.0.0", "python-dotenv>=1.0.1,<2.0.0", - "mlx-vlm @ git+https://github.com/prncvrm/mlx-vlm.git@fix/qwen2-position-id" + "mlx-vlm @ git+https://github.com/ddupont808/mlx-vlm.git@fix/qwen2-position-id" ] [tool.pdm] From ac2717f663019ede11790e2eb8ef34d405b4945e Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Sat, 10 May 2025 17:13:00 -0400 Subject: [PATCH 18/20] readme correction --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 673590f4..dafef93d 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ async def main(): agent = ComputerAgent( computer=computer, loop="UITARS", - model=LLM(provider="MLX", name="mlx-community/UI-TARS-1.5-7B-6bit") + model=LLM(provider="MLXVLM", name="mlx-community/UI-TARS-1.5-7B-6bit") ) await agent.run("Find the trycua/cua repository on GitHub and follow the quick start guide") @@ -193,7 +193,7 @@ For complete examples, see [agent_examples.py](./examples/agent_examples.py) or from agent import ComputerAgent, LLM, AgentLoop, LLMProvider # UI-TARS-1.5 agent for local execution with MLX -ComputerAgent(loop=AgentLoop.UITARS, model=LLM(provider=LLMProvider.MLX, name="mlx-community/UI-TARS-1.5-7B-6bit")) +ComputerAgent(loop=AgentLoop.UITARS, model=LLM(provider=LLMProvider.MLXVLM, name="mlx-community/UI-TARS-1.5-7B-6bit")) # OpenAI Computer-Use agent using 
OPENAI_API_KEY ComputerAgent(loop=AgentLoop.OPENAI, model=LLM(provider=LLMProvider.OPENAI, name="computer-use-preview")) # Anthropic Claude agent using ANTHROPIC_API_KEY From 28295fd72bbf19881f08da25145b59e5a10d57ee Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Sat, 10 May 2025 17:26:42 -0400 Subject: [PATCH 19/20] branch change --- libs/agent/pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/agent/pyproject.toml b/libs/agent/pyproject.toml index 94d97889..5f23c2a1 100644 --- a/libs/agent/pyproject.toml +++ b/libs/agent/pyproject.toml @@ -36,7 +36,7 @@ openai = [ ] uitars = [ "httpx>=0.27.0,<0.29.0", - "mlx-vlm @ git+https://github.com/ddupont808/mlx-vlm.git@fix/qwen2-position-id" + "mlx-vlm @ git+https://github.com/ddupont808/mlx-vlm.git@stable/fix/qwen2-position-id" ] ui = [ "gradio>=5.23.3,<6.0.0", @@ -86,7 +86,7 @@ all = [ "ollama>=0.4.7,<0.5.0", "gradio>=5.23.3,<6.0.0", "python-dotenv>=1.0.1,<2.0.0", - "mlx-vlm @ git+https://github.com/ddupont808/mlx-vlm.git@fix/qwen2-position-id" + "mlx-vlm @ git+https://github.com/ddupont808/mlx-vlm.git@stable/fix/qwen2-position-id" ] [tool.pdm] From db823ac5612ae8f9b6cad8f22309835ca89ebfc8 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Sat, 10 May 2025 17:49:32 -0400 Subject: [PATCH 20/20] moved mlx into optional dep --- libs/agent/README.md | 1 + libs/agent/pyproject.toml | 2 ++ 2 files changed, 3 insertions(+) diff --git a/libs/agent/README.md b/libs/agent/README.md index 07f3d3fd..3a255c71 100644 --- a/libs/agent/README.md +++ b/libs/agent/README.md @@ -32,6 +32,7 @@ pip install "cua-agent[all]" pip install "cua-agent[openai]" # OpenAI Cua Loop pip install "cua-agent[anthropic]" # Anthropic Cua Loop pip install "cua-agent[uitars]" # UI-Tars support +pip install "cua-agent[uitars-mlx]" # local UI-Tars support with MLXVLM pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models) pip install "cua-agent[ui]" # Gradio UI for the agent ``` diff --git a/libs/agent/pyproject.toml b/libs/agent/pyproject.toml index 5f23c2a1..89d14883 100644 --- a/libs/agent/pyproject.toml +++ b/libs/agent/pyproject.toml @@ -36,6 +36,8 @@ openai = [ ] uitars = [ "httpx>=0.27.0,<0.29.0", +] +uitars-mlx = [ "mlx-vlm @ git+https://github.com/ddupont808/mlx-vlm.git@stable/fix/qwen2-position-id" ] ui = [
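To make the coordinate handling in patches 12-14 concrete: the client resizes each screenshot with `smart_resize` before inference, then maps any `<|box_start|>(x,y)<|box_end|>` tokens in the model output back into the original screenshot's pixel space. A self-contained sketch of that final rescaling step, using the (width, height) convention the series settles on (the function name is illustrative, not from the diff):

```python
import re
from typing import Tuple

BOX_PATTERN = r"<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>"


def rescale_boxes(text: str, original_size: Tuple[int, int], model_size: Tuple[int, int]) -> str:
    """Map box coordinates from the resized (model) image back to the original image.

    Both sizes are (width, height); model_size is what smart_resize produced.
    """
    def _scale(match: re.Match) -> str:
        x, y = int(match.group(1)), int(match.group(2))
        new_x = int(x * original_size[0] / model_size[0])
        new_y = int(y * original_size[1] / model_size[1])
        return f"<|box_start|>({new_x},{new_y})<|box_end|>"

    return re.sub(BOX_PATTERN, _scale, text)


# Example: a point predicted at (720, 450) on a 1440x900 resized image maps back
# to (1440, 900) on the original 2880x1800 screenshot.
print(rescale_boxes("<|box_start|>(720,450)<|box_end|>", (2880, 1800), (1440, 900)))
```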
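Finally, putting the series together from the user's side: install the new `uitars-mlx` extra and point the UI-TARS loop at the MLXVLM provider, as the updated READMEs describe. A sketch assembled from those README snippets; the `computer` import path is an assumption on our part, and `agent.run(...)` is awaited exactly as the README example does:

```python
# pip install "cua-agent[uitars-mlx]"   # pulls in the patched mlx-vlm fork
import asyncio
import logging

from computer import Computer  # assumed import path for the Computer class used in the README
from agent import ComputerAgent, LLM, AgentLoop, LLMProvider


async def main() -> None:
    async with Computer(verbosity=logging.DEBUG) as macos_computer:
        agent = ComputerAgent(
            computer=macos_computer,
            loop=AgentLoop.UITARS,
            # Local UI-TARS 1.5 through the new MLXVLM provider; no API key or endpoint needed.
            model=LLM(provider=LLMProvider.MLXVLM, name="mlx-community/UI-TARS-1.5-7B-4bit"),
        )
        await agent.run("Find the trycua/cua repository on GitHub and follow the quick start guide")


if __name__ == "__main__":
    asyncio.run(main())
```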