From 39c4915682323f7789578d253d2e32249dd8ac9e Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 24 Apr 2025 19:24:12 -0400 Subject: [PATCH 01/20] consistency with other loops --- libs/agent/agent/providers/uitars/loop.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libs/agent/agent/providers/uitars/loop.py b/libs/agent/agent/providers/uitars/loop.py index 99132365..0d3bc9f7 100644 --- a/libs/agent/agent/providers/uitars/loop.py +++ b/libs/agent/agent/providers/uitars/loop.py @@ -232,8 +232,11 @@ class UITARSLoop(BaseLoop): if self.client is None: raise RuntimeError("Failed to initialize client") - # Convert messages to UI-TARS format + # Get messages in standard format from the message manager + self.message_manager.messages = messages.copy() prepared_messages = self.message_manager.get_messages() + + # Convert messages to UI-TARS format uitars_messages = self.to_uitars_format(prepared_messages) # Log request From 4d21f9e2ea951f335816c73d5e206512b69676fa Mon Sep 17 00:00:00 2001 From: Morgan Dean Date: Sat, 26 Apr 2025 14:52:46 -0700 Subject: [PATCH 02/20] create mlxvlm provider --- libs/agent/agent/core/types.py | 1 + .../agent/providers/uitars/clients/mlxvlm.py | 158 ++++++++++++++++++ libs/agent/pyproject.toml | 1 + 3 files changed, 160 insertions(+) create mode 100644 libs/agent/agent/providers/uitars/clients/mlxvlm.py diff --git a/libs/agent/agent/core/types.py b/libs/agent/agent/core/types.py index ef50d09e..fd337062 100644 --- a/libs/agent/agent/core/types.py +++ b/libs/agent/agent/core/types.py @@ -23,6 +23,7 @@ class LLMProvider(StrEnum): OPENAI = "openai" OLLAMA = "ollama" OAICOMPAT = "oaicompat" + MLXVLM= "mlxvlm" @dataclass diff --git a/libs/agent/agent/providers/uitars/clients/mlxvlm.py b/libs/agent/agent/providers/uitars/clients/mlxvlm.py new file mode 100644 index 00000000..f644ce6d --- /dev/null +++ b/libs/agent/agent/providers/uitars/clients/mlxvlm.py @@ -0,0 +1,158 @@ +"""MLX LVM client implementation.""" + +import logging +import base64 +import tempfile +import os +from typing import Dict, List, Optional, Any, cast + +from .base import BaseUITarsClient +import mlx.core as mx +from mlx_vlm import load, generate +from mlx_vlm.prompt_utils import apply_chat_template +from mlx_vlm.utils import load_config +from transformers.tokenization_utils import PreTrainedTokenizer + +logger = logging.getLogger(__name__) + + +class MLXLMVUITarsClient(BaseUITarsClient): + """MLX LVM client implementation class.""" + + def __init__(self, api_key: Optional[str] = None, model: str = "mlx-community/UI-TARS-1.5-7B-4bit"): + """Initialize MLX LVM client. + + Args: + api_key: Optional API key + model: Model name or path (defaults to mlx-community/UI-TARS-1.5-7B-4bit) + """ + self.api_key = api_key + self.model = model + + async def run_interleaved( + self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None + ) -> Dict[str, Any]: + """Run interleaved chat completion. 
+ + Args: + messages: List of message dicts + system: System prompt + max_tokens: Optional max tokens override + + Returns: + Response dict + """ + # Extract text and images from messages + prompt_parts = [] + images = [] + + # Add system message first + prompt_parts.append(system) + + for msg in messages: + role = msg.get("role", "user") + content = msg.get("content", []) + + # Handle different content formats + if isinstance(content, str): + # If content is a string, just add it as text + prompt_parts.append(f"{role}: {content}") + elif isinstance(content, list): + # If content is a list, process each item + text_parts = [] + + for item in content: + if item.get("type") == "text": + text_parts.append(item.get("text", "")) + elif item.get("type") == "image_url": + # Extract image URL and add to images list + image_url = item.get("image_url", {}).get("url", "") + if image_url.startswith("data:image/"): + # Extract base64 data and convert to URL or save as temp file + # For now, we'll just store the URL directly + images.append(image_url) + + # Add text parts to prompt + if text_parts: + prompt_parts.append(f"{role}: {''.join(text_parts)}") + + # Combine all text parts into a single prompt + combined_prompt = "\n".join(prompt_parts) + + try: + # Load model and processor + model_obj, processor = load(self.model) + config = load_config(self.model) + + # Process images to ensure they're in the right format + processed_images = [] + for img in images: + if img.startswith('data:image/'): + # Extract base64 data + img_format = img.split(';')[0].split('/')[1] + base64_data = img.split(',')[1] + + # Create a temporary file to store the image + with tempfile.NamedTemporaryFile(suffix=f'.{img_format}', delete=False) as temp_file: + temp_file.write(base64.b64decode(base64_data)) + processed_images.append(temp_file.name) + else: + # Assume it's already a valid URL or path + processed_images.append(img) + + # Format prompt according to model requirements + formatted_prompt = apply_chat_template( + processor, config, str(combined_prompt), num_images=len(processed_images) + ) + + # Cast processor to PreTrainedTokenizer to satisfy type checker + tokenizer = cast(PreTrainedTokenizer, processor) + + # Generate response + output = generate( + model_obj, + tokenizer, + str(formatted_prompt), + processed_images, + verbose=False, + max_tokens=max_tokens + ) + + # Clean up temporary files + for img_path in processed_images: + if img_path.startswith(tempfile.gettempdir()) and os.path.exists(img_path): + try: + os.unlink(img_path) + except Exception as e: + logger.warning(f"Failed to delete temporary file {img_path}: {e}") + except Exception as e: + logger.error(f"Error generating response: {str(e)}") + return { + "choices": [ + { + "message": { + "role": "assistant", + "content": f"Error generating response: {str(e)}" + }, + "finish_reason": "error" + } + ], + "model": self.model, + "error": str(e) + } + + # Format response to match OpenAI format + response = { + "choices": [ + { + "message": { + "role": "assistant", + "content": output + }, + "finish_reason": "stop" + } + ], + "model": self.model + } + + return response diff --git a/libs/agent/pyproject.toml b/libs/agent/pyproject.toml index 1572465b..d3b97112 100644 --- a/libs/agent/pyproject.toml +++ b/libs/agent/pyproject.toml @@ -36,6 +36,7 @@ openai = [ ] uitars = [ "httpx>=0.27.0,<0.29.0", + "mlx-vlm>=0.1.25" ] ui = [ "gradio>=5.23.3,<6.0.0", From 9c870fcdddb7f4db2b33c1415c50ff75c51abf35 Mon Sep 17 00:00:00 2001 From: Morgan Dean Date: Sat, 26 Apr 2025 
15:15:36 -0700 Subject: [PATCH 03/20] Fix bugs with uitars loop config for multiple providers, add MLXVLM provider to agent core --- libs/agent/agent/core/factory.py | 1 + libs/agent/agent/core/provider_config.py | 2 ++ libs/agent/agent/providers/uitars/loop.py | 38 ++++++++++++++++------- 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/libs/agent/agent/core/factory.py b/libs/agent/agent/core/factory.py index 461b5cbc..f0c6046e 100644 --- a/libs/agent/agent/core/factory.py +++ b/libs/agent/agent/core/factory.py @@ -116,6 +116,7 @@ class LoopFactory: base_dir=trajectory_dir, only_n_most_recent_images=only_n_most_recent_images, provider_base_url=provider_base_url, + provider=provider, ) else: raise ValueError(f"Unsupported loop type: {loop_type}") diff --git a/libs/agent/agent/core/provider_config.py b/libs/agent/agent/core/provider_config.py index 21a5d283..f6cd1feb 100644 --- a/libs/agent/agent/core/provider_config.py +++ b/libs/agent/agent/core/provider_config.py @@ -8,6 +8,7 @@ DEFAULT_MODELS = { LLMProvider.ANTHROPIC: "claude-3-7-sonnet-20250219", LLMProvider.OLLAMA: "gemma3:4b-it-q4_K_M", LLMProvider.OAICOMPAT: "Qwen2.5-VL-7B-Instruct", + LLMProvider.MLXVLM: "mlx-community/UI-TARS-1.5-7B-4bit", } # Map providers to their environment variable names @@ -16,4 +17,5 @@ ENV_VARS = { LLMProvider.ANTHROPIC: "ANTHROPIC_API_KEY", LLMProvider.OLLAMA: "none", LLMProvider.OAICOMPAT: "none", # OpenAI-compatible API typically doesn't require an API key + LLMProvider.MLXVLM: "none", # MLX VLM typically doesn't require an API key } diff --git a/libs/agent/agent/providers/uitars/loop.py b/libs/agent/agent/providers/uitars/loop.py index 0d3bc9f7..26f6913f 100644 --- a/libs/agent/agent/providers/uitars/loop.py +++ b/libs/agent/agent/providers/uitars/loop.py @@ -23,6 +23,7 @@ from .tools.computer import ToolResult from .prompts import COMPUTER_USE, SYSTEM_PROMPT from .clients.oaicompat import OAICompatClient +from .clients.mlxvlm import MLXLMVUITarsClient logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -44,6 +45,7 @@ class UITARSLoop(BaseLoop): computer: Computer, api_key: str, model: str, + provider: Optional[LLMProvider] = None, provider_base_url: Optional[str] = "http://localhost:8000/v1", only_n_most_recent_images: Optional[int] = 2, base_dir: Optional[str] = "trajectories", @@ -64,9 +66,10 @@ class UITARSLoop(BaseLoop): max_retries: Maximum number of retries for API calls retry_delay: Delay between retries in seconds save_trajectory: Whether to save trajectory data + provider: The LLM provider to use (defaults to OAICOMPAT if not specified) """ # Set provider before initializing base class - self.provider = LLMProvider.OAICOMPAT + self.provider = provider or LLMProvider.OAICOMPAT self.provider_base_url = provider_base_url # Initialize message manager with image retention config @@ -113,7 +116,7 @@ class UITARSLoop(BaseLoop): logger.error(f"Error initializing tool manager: {str(e)}") logger.warning("Will attempt to initialize tools on first use.") - # Initialize client for the OAICompat provider + # Initialize client for the selected provider try: await self.initialize_client() except Exception as e: @@ -128,18 +131,29 @@ class UITARSLoop(BaseLoop): """Initialize the appropriate client. Implements abstract method from BaseLoop to set up the specific - provider client (OAICompat for UI-TARS). + provider client based on the configured provider. 
""" try: - logger.info(f"Initializing OAICompat client for UI-TARS with model {self.model}...") - - self.client = OAICompatClient( - api_key=self.api_key or "EMPTY", # Local endpoints typically don't require an API key - model=self.model, - provider_base_url=self.provider_base_url, - ) - - logger.info(f"Initialized OAICompat client with model {self.model}") + if self.provider == LLMProvider.MLXVLM: + logger.info(f"Initializing MLX VLM client for UI-TARS with model {self.model}...") + + self.client = MLXLMVUITarsClient( + api_key=self.api_key, + model=self.model, + ) + + logger.info(f"Initialized MLX VLM client with model {self.model}") + else: + # Default to OAICompat client for other providers + logger.info(f"Initializing OAICompat client for UI-TARS with model {self.model}...") + + self.client = OAICompatClient( + api_key=self.api_key or "EMPTY", # Local endpoints typically don't require an API key + model=self.model, + provider_base_url=self.provider_base_url, + ) + + logger.info(f"Initialized OAICompat client with model {self.model}") except Exception as e: logger.error(f"Error initializing client: {str(e)}") self.client = None From 36887a82012ffa035f9f2f3e47e94dd12ca1b96b Mon Sep 17 00:00:00 2001 From: Morgan Dean Date: Sat, 26 Apr 2025 15:29:15 -0700 Subject: [PATCH 04/20] Add MLXVLM provider to agent_examples.py --- examples/agent_examples.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/agent_examples.py b/examples/agent_examples.py index 01d7483e..189ecddd 100644 --- a/examples/agent_examples.py +++ b/examples/agent_examples.py @@ -36,6 +36,7 @@ async def run_agent_example(): # model=LLM(provider=LLMProvider.OPENAI, name="gpt-4o"), # model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"), # model=LLM(provider=LLMProvider.OLLAMA, name="gemma3:4b-it-q4_K_M"), + # model=LLM(provider=LLMProvider.MLXVLM, name="mlx-community/UI-TARS-1.5-7B-4bit"), model=LLM( provider=LLMProvider.OAICOMPAT, name="gemma-3-12b-it", From 0a222c0fdf373e1c4e03985198a3334899af6046 Mon Sep 17 00:00:00 2001 From: Morgan Dean Date: Sat, 26 Apr 2025 18:44:45 -0700 Subject: [PATCH 05/20] Fix mispelling, update prompt to use PIL instead of temp files --- .../agent/providers/uitars/clients/mlxvlm.py | 63 +++++++++---------- libs/agent/agent/providers/uitars/loop.py | 5 +- 2 files changed, 30 insertions(+), 38 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/mlxvlm.py b/libs/agent/agent/providers/uitars/clients/mlxvlm.py index f644ce6d..77d83146 100644 --- a/libs/agent/agent/providers/uitars/clients/mlxvlm.py +++ b/libs/agent/agent/providers/uitars/clients/mlxvlm.py @@ -1,10 +1,12 @@ """MLX LVM client implementation.""" +import io import logging import base64 import tempfile import os from typing import Dict, List, Optional, Any, cast +from PIL import Image from .base import BaseUITarsClient import mlx.core as mx @@ -16,18 +18,21 @@ from transformers.tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) -class MLXLMVUITarsClient(BaseUITarsClient): +class MLXVLMUITarsClient(BaseUITarsClient): """MLX LVM client implementation class.""" - def __init__(self, api_key: Optional[str] = None, model: str = "mlx-community/UI-TARS-1.5-7B-4bit"): + def __init__(self, model: str = "mlx-community/UI-TARS-1.5-7B-4bit"): """Initialize MLX LVM client. 
Args: - api_key: Optional API key model: Model name or path (defaults to mlx-community/UI-TARS-1.5-7B-4bit) """ - self.api_key = api_key - self.model = model + # Load model and processor + model_obj, processor = load(model) + self.config = load_config(model) + self.model = model_obj + self.processor = processor + async def run_interleaved( self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None @@ -78,39 +83,34 @@ class MLXLMVUITarsClient(BaseUITarsClient): # Combine all text parts into a single prompt combined_prompt = "\n".join(prompt_parts) + processed_images = [] + for img in images: + if img.startswith('data:image/'): + # Extract base64 data + base64_data = img.split(',')[1] + + # Convert base64 to PIL Image directly + image_data = base64.b64decode(base64_data) + pil_image = Image.open(io.BytesIO(image_data)) + processed_images.append(pil_image) + else: + # Assume it's already a valid URL or path + # For file paths or URLs, we'll load them with PIL + pil_image = Image.open(img) + processed_images.append(pil_image) try: - # Load model and processor - model_obj, processor = load(self.model) - config = load_config(self.model) - - # Process images to ensure they're in the right format - processed_images = [] - for img in images: - if img.startswith('data:image/'): - # Extract base64 data - img_format = img.split(';')[0].split('/')[1] - base64_data = img.split(',')[1] - - # Create a temporary file to store the image - with tempfile.NamedTemporaryFile(suffix=f'.{img_format}', delete=False) as temp_file: - temp_file.write(base64.b64decode(base64_data)) - processed_images.append(temp_file.name) - else: - # Assume it's already a valid URL or path - processed_images.append(img) - # Format prompt according to model requirements formatted_prompt = apply_chat_template( - processor, config, str(combined_prompt), num_images=len(processed_images) + self.processor, self.config, str(combined_prompt), num_images=len(processed_images) ) # Cast processor to PreTrainedTokenizer to satisfy type checker - tokenizer = cast(PreTrainedTokenizer, processor) + tokenizer = cast(PreTrainedTokenizer, self.processor) # Generate response output = generate( - model_obj, + self.model, tokenizer, str(formatted_prompt), processed_images, @@ -118,13 +118,6 @@ class MLXLMVUITarsClient(BaseUITarsClient): max_tokens=max_tokens ) - # Clean up temporary files - for img_path in processed_images: - if img_path.startswith(tempfile.gettempdir()) and os.path.exists(img_path): - try: - os.unlink(img_path) - except Exception as e: - logger.warning(f"Failed to delete temporary file {img_path}: {e}") except Exception as e: logger.error(f"Error generating response: {str(e)}") return { diff --git a/libs/agent/agent/providers/uitars/loop.py b/libs/agent/agent/providers/uitars/loop.py index 26f6913f..c0ea6c73 100644 --- a/libs/agent/agent/providers/uitars/loop.py +++ b/libs/agent/agent/providers/uitars/loop.py @@ -23,7 +23,7 @@ from .tools.computer import ToolResult from .prompts import COMPUTER_USE, SYSTEM_PROMPT from .clients.oaicompat import OAICompatClient -from .clients.mlxvlm import MLXLMVUITarsClient +from .clients.mlxvlm import MLXVLMUITarsClient logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -137,8 +137,7 @@ class UITARSLoop(BaseLoop): if self.provider == LLMProvider.MLXVLM: logger.info(f"Initializing MLX VLM client for UI-TARS with model {self.model}...") - self.client = MLXLMVUITarsClient( - api_key=self.api_key, + self.client = MLXVLMUITarsClient( model=self.model, ) 
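For context, the core of patch 05's change is decoding the screenshot in memory instead of round-tripping through a temporary file on disk. A minimal sketch of that pattern, with Pillow as the only dependency (the helper name `data_url_to_pil` is ours, not part of the diff):

```python
import base64
import io

from PIL import Image


def data_url_to_pil(image_url: str) -> Image.Image:
    """Decode a base64 data URL (or open a plain path) as a PIL image."""
    if image_url.startswith("data:image/"):
        # Drop the "data:image/png;base64," prefix and decode entirely in memory.
        base64_data = image_url.split(",")[1]
        return Image.open(io.BytesIO(base64.b64decode(base64_data)))
    # Fall back to treating the value as a file path, as the client does.
    return Image.open(image_url)
```

This is the same decode path the revised client applies to each `image_url` item before handing the PIL images to `mlx_vlm.generate`.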
From 8da80d5ebf928b1bef465f2a85c440517eead93c Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 10:32:50 -0400 Subject: [PATCH 06/20] added mlx vlm to cua-agent[all] --- libs/agent/pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libs/agent/pyproject.toml b/libs/agent/pyproject.toml index d3b97112..8772575c 100644 --- a/libs/agent/pyproject.toml +++ b/libs/agent/pyproject.toml @@ -85,7 +85,8 @@ all = [ "requests>=2.31.0,<3.0.0", "ollama>=0.4.7,<0.5.0", "gradio>=5.23.3,<6.0.0", - "python-dotenv>=1.0.1,<2.0.0" + "python-dotenv>=1.0.1,<2.0.0", + "mlx-vlm>=0.1.25" ] [tool.pdm] From 0b61dea8a4f5678722e7a8be76a3a1025e7a594d Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 10:42:04 -0400 Subject: [PATCH 07/20] use model chat template --- .../agent/providers/uitars/clients/mlxvlm.py | 81 +++++++------------ 1 file changed, 27 insertions(+), 54 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/mlxvlm.py b/libs/agent/agent/providers/uitars/clients/mlxvlm.py index 77d83146..0eca5292 100644 --- a/libs/agent/agent/providers/uitars/clients/mlxvlm.py +++ b/libs/agent/agent/providers/uitars/clients/mlxvlm.py @@ -47,73 +47,46 @@ class MLXVLMUITarsClient(BaseUITarsClient): Returns: Response dict """ - # Extract text and images from messages - prompt_parts = [] - images = [] - - # Add system message first - prompt_parts.append(system) - - for msg in messages: - role = msg.get("role", "user") - content = msg.get("content", []) + # Ensure the system message is included + if not any(msg.get("role") == "system" for msg in messages): + messages = [{"role": "system", "content": system}] + messages - # Handle different content formats - if isinstance(content, str): - # If content is a string, just add it as text - prompt_parts.append(f"{role}: {content}") - elif isinstance(content, list): - # If content is a list, process each item - text_parts = [] - + # Extract any images from the messages + images = [] + for msg in messages: + content = msg.get("content", []) + if isinstance(content, list): for item in content: - if item.get("type") == "text": - text_parts.append(item.get("text", "")) - elif item.get("type") == "image_url": - # Extract image URL and add to images list + if item.get("type") == "image_url": image_url = item.get("image_url", {}).get("url", "") if image_url.startswith("data:image/"): - # Extract base64 data and convert to URL or save as temp file - # For now, we'll just store the URL directly - images.append(image_url) - - # Add text parts to prompt - if text_parts: - prompt_parts.append(f"{role}: {''.join(text_parts)}") - - # Combine all text parts into a single prompt - combined_prompt = "\n".join(prompt_parts) - processed_images = [] - for img in images: - if img.startswith('data:image/'): - # Extract base64 data - base64_data = img.split(',')[1] - - # Convert base64 to PIL Image directly - image_data = base64.b64decode(base64_data) - pil_image = Image.open(io.BytesIO(image_data)) - processed_images.append(pil_image) - else: - # Assume it's already a valid URL or path - # For file paths or URLs, we'll load them with PIL - pil_image = Image.open(img) - processed_images.append(pil_image) + # Extract base64 data + base64_data = image_url.split(',')[1] + + # Convert base64 to PIL Image + image_data = base64.b64decode(base64_data) + pil_image = Image.open(io.BytesIO(image_data)) + images.append(pil_image) + else: + # Handle file path or URL + pil_image = Image.open(image_url) + images.append(pil_image) try: - # 
Format prompt according to model requirements - formatted_prompt = apply_chat_template( - self.processor, self.config, str(combined_prompt), num_images=len(processed_images) + # Format prompt according to model requirements using the processor directly + prompt = self.processor.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True ) - - # Cast processor to PreTrainedTokenizer to satisfy type checker tokenizer = cast(PreTrainedTokenizer, self.processor) # Generate response output = generate( self.model, tokenizer, - str(formatted_prompt), - processed_images, + str(prompt), + images, verbose=False, max_tokens=max_tokens ) From 184db1037ff3bffc2bb0fe4f1cb864c7f4dbaaa3 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 14:28:17 -0400 Subject: [PATCH 08/20] add to gradio ui --- libs/agent/agent/ui/gradio/app.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/libs/agent/agent/ui/gradio/app.py b/libs/agent/agent/ui/gradio/app.py index c6ac57ea..16c5f7e6 100644 --- a/libs/agent/agent/ui/gradio/app.py +++ b/libs/agent/agent/ui/gradio/app.py @@ -163,8 +163,10 @@ MODEL_MAPPINGS = { "claude-3-7-sonnet-20250219": "claude-3-7-sonnet-20250219", }, "uitars": { - # UI-TARS models default to custom endpoint - "default": "ByteDance-Seed/UI-TARS-1.5-7B", + # UI-TARS models using MLXVLM provider + "default": "mlx-community/UI-TARS-1.5-7B-4bit", + "UI-TARS-1.5-7B-4bit": "mlx-community/UI-TARS-1.5-7B-4bit", + "UI-TARS-1.5-7B-6bit": "mlx-community/UI-TARS-1.5-7B-6bit" }, "ollama": { # For Ollama models, we keep the original name @@ -287,8 +289,16 @@ def get_provider_and_model(model_name: str, loop_provider: str) -> tuple: model_name_to_use = cleaned_model_name # agent_loop remains AgentLoop.OMNI elif agent_loop == AgentLoop.UITARS: - provider = LLMProvider.OAICOMPAT - model_name_to_use = MODEL_MAPPINGS["uitars"]["default"] # Default + # For UITARS, use MLXVLM provider for the MLX models, OAICOMPAT for custom + if model_name == "Custom model...": + provider = LLMProvider.OAICOMPAT + model_name_to_use = "tgi" + else: + provider = LLMProvider.MLXVLM + # Get the model name from the mappings or use as-is if not found + model_name_to_use = MODEL_MAPPINGS["uitars"].get( + model_name, model_name if model_name else MODEL_MAPPINGS["uitars"]["default"] + ) else: # Default to OpenAI if unrecognized loop provider = LLMProvider.OPENAI @@ -558,7 +568,11 @@ def create_gradio_ui( "OPENAI": openai_models, "ANTHROPIC": anthropic_models, "OMNI": omni_models + ["Custom model..."], # Add custom model option - "UITARS": ["Custom model..."], # UI-TARS options + "UITARS": [ + "mlx-community/UI-TARS-1.5-7B-4bit", + "mlx-community/UI-TARS-1.5-7B-6bit", + "Custom model..." 
+ ], # UI-TARS options with MLX models } # --- Apply Saved Settings (override defaults if available) --- From a87efe86da5754a013fcc01d0d1f7cf1126a7dec Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 14:34:59 -0400 Subject: [PATCH 09/20] correct mappings --- libs/agent/agent/ui/gradio/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/agent/agent/ui/gradio/app.py b/libs/agent/agent/ui/gradio/app.py index 16c5f7e6..354580d7 100644 --- a/libs/agent/agent/ui/gradio/app.py +++ b/libs/agent/agent/ui/gradio/app.py @@ -165,8 +165,8 @@ MODEL_MAPPINGS = { "uitars": { # UI-TARS models using MLXVLM provider "default": "mlx-community/UI-TARS-1.5-7B-4bit", - "UI-TARS-1.5-7B-4bit": "mlx-community/UI-TARS-1.5-7B-4bit", - "UI-TARS-1.5-7B-6bit": "mlx-community/UI-TARS-1.5-7B-6bit" + "mlx-community/UI-TARS-1.5-7B-4bit": "mlx-community/UI-TARS-1.5-7B-4bit", + "mlx-community/UI-TARS-1.5-7B-6bit": "mlx-community/UI-TARS-1.5-7B-6bit" }, "ollama": { # For Ollama models, we keep the original name From f2501ee6b0f6ee3dc65478a3bb5aabc789a2fc32 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 14:41:13 -0400 Subject: [PATCH 10/20] add to readme --- README.md | 4 ++-- libs/agent/README.md | 29 +++++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 53102fcb..b0630760 100644 --- a/README.md +++ b/README.md @@ -80,8 +80,8 @@ If you want to use AI agents with virtualized environments: async with Computer(verbosity=logging.DEBUG) as macos_computer: agent = ComputerAgent( computer=macos_computer, - loop=AgentLoop.OPENAI, # or AgentLoop.ANTHROPIC, or AgentLoop.OMNI - model=LLM(provider=LLMProvider.OPENAI) # or LLM(provider=LLMProvider.ANTHROPIC) + loop=AgentLoop.OPENAI, # or AgentLoop.UITARS, AgentLoop.OMNI, or AgentLoop.ANTHROPIC + model=LLM(provider=LLMProvider.OPENAI) # or LLM(provider=LLMProvider.MLXVLM, name="mlx-community/UI-TARS-1.5-7B-4bit") ) tasks = [ diff --git a/libs/agent/README.md b/libs/agent/README.md index e5dad869..bc4bce32 100644 --- a/libs/agent/README.md +++ b/libs/agent/README.md @@ -136,7 +136,32 @@ The Gradio UI provides: ### Using UI-TARS -You can use UI-TARS by first following the [deployment guide](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md). This will give you a provider URL like this: `https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1` which you can use in the gradio UI. +The UI-TARS models are available in two forms: + +1. **MLX UI-TARS models** (Default): These models run locally using MLXVLM provider + - `mlx-community/UI-TARS-1.5-7B-4bit` (default) - 4-bit quantized version + - `mlx-community/UI-TARS-1.5-7B-6bit` - 6-bit quantized version for higher quality + + ```python + agent = ComputerAgent( + computer=macos_computer, + loop=AgentLoop.UITARS, + model=LLM(provider=LLMProvider.MLXVLM, name="mlx-community/UI-TARS-1.5-7B-4bit") + ) + ``` + +2. 
**OpenAI-compatible UI-TARS**: For using the original ByteDance model + - If you want to use the original ByteDance UI-TARS model via an OpenAI-compatible API, follow the [deployment guide](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md) + - This will give you a provider URL like `https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1` which you can use in the code or Gradio UI: + + ```python + agent = ComputerAgent( + computer=macos_computer, + loop=AgentLoop.UITARS, + model=LLM(provider=LLMProvider.OAICOMPAT, name="tgi", + provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1") + ) + ``` ## Agent Loops @@ -146,7 +171,7 @@ The `cua-agent` package provides three agent loops variations, based on differen |:-----------|:-----------------|:------------|:-------------| | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required | | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`
• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required | -| `AgentLoop.UITARS` | • `ByteDance-Seed/UI-TARS-1.5-7B` | Uses ByteDance's UI-TARS 1.5 model | Not Required | +| `AgentLoop.UITARS` | • `mlx-community/UI-TARS-1.5-7B-4bit` (default)
• `mlx-community/UI-TARS-1.5-7B-6bit`
• `ByteDance-Seed/UI-TARS-1.5-7B` (via OpenAI-compatible endpoint) | Uses UI-TARS models with MLXVLM (default) or OAICOMPAT providers | Not Required |
| `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`
• `claude-3-7-sonnet-20250219`
• `gpt-4.5-preview`
• `gpt-4o`
• `gpt-4`
• `phi4`
• `phi4-mini`
• `gemma3`
• `...`
• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser | ## AgentResponse From 8cc28612c860876ab8ce1be81b8695fdc350b132 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 14:57:16 -0400 Subject: [PATCH 11/20] log model name instead of model --- libs/agent/agent/providers/uitars/clients/mlxvlm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/mlxvlm.py b/libs/agent/agent/providers/uitars/clients/mlxvlm.py index 0eca5292..d1e5dfff 100644 --- a/libs/agent/agent/providers/uitars/clients/mlxvlm.py +++ b/libs/agent/agent/providers/uitars/clients/mlxvlm.py @@ -32,6 +32,7 @@ class MLXVLMUITarsClient(BaseUITarsClient): self.config = load_config(model) self.model = model_obj self.processor = processor + self.model_name = model async def run_interleaved( @@ -103,7 +104,7 @@ class MLXVLMUITarsClient(BaseUITarsClient): "finish_reason": "error" } ], - "model": self.model, + "model": self.model_name, "error": str(e) } @@ -118,7 +119,7 @@ class MLXVLMUITarsClient(BaseUITarsClient): "finish_reason": "stop" } ], - "model": self.model + "model": self.model_name } return response From 00eb09209c27129cacf55920d34f5051de414619 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 15:23:50 -0400 Subject: [PATCH 12/20] added forced resolution --- .../agent/providers/uitars/clients/mlxvlm.py | 133 +++++++++++++++--- 1 file changed, 111 insertions(+), 22 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/mlxvlm.py b/libs/agent/agent/providers/uitars/clients/mlxvlm.py index d1e5dfff..c0c9b459 100644 --- a/libs/agent/agent/providers/uitars/clients/mlxvlm.py +++ b/libs/agent/agent/providers/uitars/clients/mlxvlm.py @@ -5,7 +5,8 @@ import logging import base64 import tempfile import os -from typing import Dict, List, Optional, Any, cast +import re +from typing import Dict, List, Optional, Any, cast, Tuple from PIL import Image from .base import BaseUITarsClient @@ -21,11 +22,17 @@ logger = logging.getLogger(__name__) class MLXVLMUITarsClient(BaseUITarsClient): """MLX LVM client implementation class.""" - def __init__(self, model: str = "mlx-community/UI-TARS-1.5-7B-4bit"): + def __init__( + self, + model: str = "mlx-community/UI-TARS-1.5-7B-4bit", + force_resolution: Optional[Tuple[int, int]] = (1512, 982) + ): """Initialize MLX LVM client. Args: model: Model name or path (defaults to mlx-community/UI-TARS-1.5-7B-4bit) + force_resolution: Optional target resolution to resize images to (width, height). + If None, images will not be resized. """ # Load model and processor model_obj, processor = load(model) @@ -33,8 +40,32 @@ class MLXVLMUITarsClient(BaseUITarsClient): self.model = model_obj self.processor = processor self.model_name = model + self.force_resolution = force_resolution + def _remap_coordinates(self, text: str, original_size: Tuple[int, int], target_size: Tuple[int, int]) -> str: + """Remap coordinates in box tokens based on image resizing. 
+ + Args: + text: Text containing box tokens + original_size: Original image size (width, height) + target_size: Target image size (width, height) + + Returns: + Text with remapped coordinates + """ + # Find all box tokens + box_pattern = r"<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>" + + def remap_coords(match): + x, y = int(match.group(1)), int(match.group(2)) + # Scale coordinates to new dimensions + new_x = int(x * target_size[0] / original_size[0]) + new_y = int(y * target_size[1] / original_size[1]) + return f"<|box_start|>({new_x},{new_y})<|box_end|>" + + return re.sub(box_pattern, remap_coords, text) + async def run_interleaved( self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None ) -> Dict[str, Any]: @@ -51,32 +82,79 @@ class MLXVLMUITarsClient(BaseUITarsClient): # Ensure the system message is included if not any(msg.get("role") == "system" for msg in messages): messages = [{"role": "system", "content": system}] + messages - - # Extract any images from the messages + + # Create a deep copy of messages to avoid modifying the original + processed_messages = messages.copy() + + # Extract images and process messages if force_resolution is set images = [] - for msg in messages: + original_sizes = {} # Track original sizes of images for coordinate remapping + image_index = 0 + + for msg_idx, msg in enumerate(messages): content = msg.get("content", []) - if isinstance(content, list): - for item in content: - if item.get("type") == "image_url": - image_url = item.get("image_url", {}).get("url", "") - if image_url.startswith("data:image/"): - # Extract base64 data - base64_data = image_url.split(',')[1] - - # Convert base64 to PIL Image - image_data = base64.b64decode(base64_data) - pil_image = Image.open(io.BytesIO(image_data)) - images.append(pil_image) - else: - # Handle file path or URL - pil_image = Image.open(image_url) - images.append(pil_image) + if not isinstance(content, list): + continue + + # Create a copy of the content list to modify + processed_content = [] + + for item_idx, item in enumerate(content): + if item.get("type") == "image_url": + image_url = item.get("image_url", {}).get("url", "") + pil_image = None + + if image_url.startswith("data:image/"): + # Extract base64 data + base64_data = image_url.split(',')[1] + # Convert base64 to PIL Image + image_data = base64.b64decode(base64_data) + pil_image = Image.open(io.BytesIO(image_data)) + else: + # Handle file path or URL + pil_image = Image.open(image_url) + + # Store original image size for coordinate mapping + original_sizes[image_index] = pil_image.size + + # Resize image if force_resolution is set + if self.force_resolution: + pil_image = pil_image.resize(self.force_resolution) + + images.append(pil_image) + image_index += 1 + + # Copy items to processed content list + processed_content.append(item.copy()) + + # Update the processed message content + processed_messages[msg_idx] = msg.copy() + processed_messages[msg_idx]["content"] = processed_content + + # Remap coordinates in messages with box tokens if force_resolution is set + if self.force_resolution and original_sizes: + for msg_idx, msg in enumerate(processed_messages): + content = msg.get("content", []) + if not isinstance(content, list): + continue + + for item_idx, item in enumerate(content): + if item.get("type") == "text": + text_content = item.get("text", "") + + # Check if there are any box tokens to remap + if "<|box_start|>" in text_content: + # Use the first image's dimensions as reference (most common case) + if 0 
in original_sizes: + orig_size = original_sizes[0] + processed_messages[msg_idx]["content"][item_idx]["text"] = self._remap_coordinates( + text_content, orig_size, self.force_resolution + ) try: # Format prompt according to model requirements using the processor directly prompt = self.processor.apply_chat_template( - messages, + processed_messages, # Use processed messages instead of original tokenize=False, add_generation_prompt=True ) @@ -108,6 +186,17 @@ class MLXVLMUITarsClient(BaseUITarsClient): "error": str(e) } + # Remap coordinates in the response back to original image space if needed + if self.force_resolution and original_sizes and 0 in original_sizes: + # Get original image size (using the first image) + orig_size = original_sizes[0] + + # Check if output contains box tokens that need remapping + if "<|box_start|>" in output: + # Remap coordinates from model space back to original image space + # We just swap the arguments - from force_resolution back to original size + output = self._remap_coordinates(output, self.force_resolution, orig_size) + # Format response to match OpenAI format response = { "choices": [ From 8e8200dc17595f692aa283d27ba92888ac8d7d7f Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 16:32:48 -0400 Subject: [PATCH 13/20] extra coordinate processing --- .../agent/providers/uitars/clients/mlxvlm.py | 133 +++++++++++------- 1 file changed, 81 insertions(+), 52 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/mlxvlm.py b/libs/agent/agent/providers/uitars/clients/mlxvlm.py index c0c9b459..6a88a8a3 100644 --- a/libs/agent/agent/providers/uitars/clients/mlxvlm.py +++ b/libs/agent/agent/providers/uitars/clients/mlxvlm.py @@ -6,6 +6,7 @@ import base64 import tempfile import os import re +import math from typing import Dict, List, Optional, Any, cast, Tuple from PIL import Image @@ -18,53 +19,95 @@ from transformers.tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) +# Constants for smart_resize +IMAGE_FACTOR = 28 +MIN_PIXELS = 100 * 28 * 28 +MAX_PIXELS = 16384 * 28 * 28 +MAX_RATIO = 200 + +def round_by_factor(number: float, factor: int) -> int: + """Returns the closest integer to 'number' that is divisible by 'factor'.""" + return round(number / factor) * factor + +def ceil_by_factor(number: float, factor: int) -> int: + """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'.""" + return math.ceil(number / factor) * factor + +def floor_by_factor(number: float, factor: int) -> int: + """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'.""" + return math.floor(number / factor) * factor + +def smart_resize( + height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS +) -> tuple[int, int]: + """ + Rescales the image so that the following conditions are met: + + 1. Both dimensions (height and width) are divisible by 'factor'. + 2. The total number of pixels is within the range ['min_pixels', 'max_pixels']. + 3. The aspect ratio of the image is maintained as closely as possible. 
+ """ + if max(height, width) / min(height, width) > MAX_RATIO: + raise ValueError( + f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}" + ) + h_bar = max(factor, round_by_factor(height, factor)) + w_bar = max(factor, round_by_factor(width, factor)) + if h_bar * w_bar > max_pixels: + beta = math.sqrt((height * width) / max_pixels) + h_bar = floor_by_factor(height / beta, factor) + w_bar = floor_by_factor(width / beta, factor) + elif h_bar * w_bar < min_pixels: + beta = math.sqrt(min_pixels / (height * width)) + h_bar = ceil_by_factor(height * beta, factor) + w_bar = ceil_by_factor(width * beta, factor) + return h_bar, w_bar class MLXVLMUITarsClient(BaseUITarsClient): """MLX LVM client implementation class.""" def __init__( self, - model: str = "mlx-community/UI-TARS-1.5-7B-4bit", - force_resolution: Optional[Tuple[int, int]] = (1512, 982) + model: str = "mlx-community/UI-TARS-1.5-7B-4bit" ): """Initialize MLX LVM client. Args: model: Model name or path (defaults to mlx-community/UI-TARS-1.5-7B-4bit) - force_resolution: Optional target resolution to resize images to (width, height). - If None, images will not be resized. """ # Load model and processor - model_obj, processor = load(model) + model_obj, processor = load( + model, + processor_kwargs={"min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS} + ) self.config = load_config(model) self.model = model_obj self.processor = processor self.model_name = model - self.force_resolution = force_resolution - - def _remap_coordinates(self, text: str, original_size: Tuple[int, int], target_size: Tuple[int, int]) -> str: - """Remap coordinates in box tokens based on image resizing. + def _process_coordinates(self, text: str, original_size: Tuple[int, int], model_size: Tuple[int, int]) -> str: + """Process coordinates in box tokens based on image resizing using smart_resize approach. 
Args: text: Text containing box tokens original_size: Original image size (width, height) - target_size: Target image size (width, height) + model_size: Model processed image size (width, height) Returns: - Text with remapped coordinates + Text with processed coordinates """ # Find all box tokens box_pattern = r"<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>" - def remap_coords(match): - x, y = int(match.group(1)), int(match.group(2)) - # Scale coordinates to new dimensions - new_x = int(x * target_size[0] / original_size[0]) - new_y = int(y * target_size[1] / original_size[1]) + def process_coords(match): + model_x, model_y = int(match.group(1)), int(match.group(2)) + # Scale coordinates from model space to original image space + # Note that model_size is (height, width) while original_size is (width, height) + new_x = int(model_x * original_size[0] / model_size[1]) # Width + new_y = int(model_y * original_size[1] / model_size[0]) # Height return f"<|box_start|>({new_x},{new_y})<|box_end|>" - return re.sub(box_pattern, remap_coords, text) + return re.sub(box_pattern, process_coords, text) async def run_interleaved( self, messages: List[Dict[str, Any]], system: str, max_tokens: Optional[int] = None @@ -86,9 +129,10 @@ class MLXVLMUITarsClient(BaseUITarsClient): # Create a deep copy of messages to avoid modifying the original processed_messages = messages.copy() - # Extract images and process messages if force_resolution is set + # Extract images and process messages images = [] - original_sizes = {} # Track original sizes of images for coordinate remapping + original_sizes = {} # Track original sizes of images for coordinate mapping + model_sizes = {} # Track model processed sizes image_index = 0 for msg_idx, msg in enumerate(messages): @@ -115,13 +159,18 @@ class MLXVLMUITarsClient(BaseUITarsClient): pil_image = Image.open(image_url) # Store original image size for coordinate mapping - original_sizes[image_index] = pil_image.size + original_size = pil_image.size + original_sizes[image_index] = original_size - # Resize image if force_resolution is set - if self.force_resolution: - pil_image = pil_image.resize(self.force_resolution) + # Use smart_resize to determine model size + # Note: smart_resize expects (height, width) but PIL gives (width, height) + height, width = original_size[1], original_size[0] + new_height, new_width = smart_resize(height, width) + model_sizes[image_index] = (new_height, new_width) - images.append(pil_image) + # Resize the image using the calculated dimensions from smart_resize + resized_image = pil_image.resize((new_width, new_height)) + images.append(resized_image) image_index += 1 # Copy items to processed content list @@ -131,30 +180,10 @@ class MLXVLMUITarsClient(BaseUITarsClient): processed_messages[msg_idx] = msg.copy() processed_messages[msg_idx]["content"] = processed_content - # Remap coordinates in messages with box tokens if force_resolution is set - if self.force_resolution and original_sizes: - for msg_idx, msg in enumerate(processed_messages): - content = msg.get("content", []) - if not isinstance(content, list): - continue - - for item_idx, item in enumerate(content): - if item.get("type") == "text": - text_content = item.get("text", "") - - # Check if there are any box tokens to remap - if "<|box_start|>" in text_content: - # Use the first image's dimensions as reference (most common case) - if 0 in original_sizes: - orig_size = original_sizes[0] - processed_messages[msg_idx]["content"][item_idx]["text"] = self._remap_coordinates( - text_content, 
orig_size, self.force_resolution - ) - try: # Format prompt according to model requirements using the processor directly prompt = self.processor.apply_chat_template( - processed_messages, # Use processed messages instead of original + processed_messages, tokenize=False, add_generation_prompt=True ) @@ -186,16 +215,16 @@ class MLXVLMUITarsClient(BaseUITarsClient): "error": str(e) } - # Remap coordinates in the response back to original image space if needed - if self.force_resolution and original_sizes and 0 in original_sizes: - # Get original image size (using the first image) + # Process coordinates in the response back to original image space + if original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes: + # Get original image size and model size (using the first image) orig_size = original_sizes[0] + model_size = model_sizes[0] - # Check if output contains box tokens that need remapping + # Check if output contains box tokens that need processing if "<|box_start|>" in output: - # Remap coordinates from model space back to original image space - # We just swap the arguments - from force_resolution back to original size - output = self._remap_coordinates(output, self.force_resolution, orig_size) + # Process coordinates from model space back to original image space + output = self._process_coordinates(output, orig_size, model_size) # Format response to match OpenAI format response = { From 0abd72ff99a08f30d3872cb72bc615a9ec38c375 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 17:04:46 -0400 Subject: [PATCH 14/20] less confusing coordinate spaces --- .../agent/providers/uitars/clients/mlxvlm.py | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/mlxvlm.py b/libs/agent/agent/providers/uitars/clients/mlxvlm.py index 6a88a8a3..24f41f34 100644 --- a/libs/agent/agent/providers/uitars/clients/mlxvlm.py +++ b/libs/agent/agent/providers/uitars/clients/mlxvlm.py @@ -102,9 +102,9 @@ class MLXVLMUITarsClient(BaseUITarsClient): def process_coords(match): model_x, model_y = int(match.group(1)), int(match.group(2)) # Scale coordinates from model space to original image space - # Note that model_size is (height, width) while original_size is (width, height) - new_x = int(model_x * original_size[0] / model_size[1]) # Width - new_y = int(model_y * original_size[1] / model_size[0]) # Height + # Both original_size and model_size are in (width, height) format + new_x = int(model_x * original_size[0] / model_size[0]) # Width + new_y = int(model_y * original_size[1] / model_size[1]) # Height return f"<|box_start|>({new_x},{new_y})<|box_end|>" return re.sub(box_pattern, process_coords, text) @@ -166,7 +166,8 @@ class MLXVLMUITarsClient(BaseUITarsClient): # Note: smart_resize expects (height, width) but PIL gives (width, height) height, width = original_size[1], original_size[0] new_height, new_width = smart_resize(height, width) - model_sizes[image_index] = (new_height, new_width) + # Store model size in (width, height) format for consistent coordinate processing + model_sizes[image_index] = (new_width, new_height) # Resize the image using the calculated dimensions from smart_resize resized_image = pil_image.resize((new_width, new_height)) @@ -180,6 +181,18 @@ class MLXVLMUITarsClient(BaseUITarsClient): processed_messages[msg_idx] = msg.copy() processed_messages[msg_idx]["content"] = processed_content + logger.info(f"resized {len(images)} from {original_sizes[0]} to {model_sizes[0]}") + + # Process 
user text input with box coordinates after image processing + # Swap original_size and model_size arguments for inverse transformation + for msg_idx, msg in enumerate(processed_messages): + if msg.get("role") == "user" and isinstance(msg.get("content"), str): + if "<|box_start|>" in msg.get("content") and original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes: + orig_size = original_sizes[0] + model_size = model_sizes[0] + # Swap arguments to perform inverse transformation for user input + processed_messages[msg_idx]["content"] = self._process_coordinates(msg["content"], model_size, orig_size) + try: # Format prompt according to model requirements using the processor directly prompt = self.processor.apply_chat_template( From 0304c45de5e43e2fd74d733a2d10493943bf880c Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 19:12:32 -0400 Subject: [PATCH 15/20] fix endpoint not liking string message content --- .../agent/providers/omni/clients/oaicompat.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/libs/agent/agent/providers/omni/clients/oaicompat.py b/libs/agent/agent/providers/omni/clients/oaicompat.py index 6a95896a..b15515fd 100644 --- a/libs/agent/agent/providers/omni/clients/oaicompat.py +++ b/libs/agent/agent/providers/omni/clients/oaicompat.py @@ -93,7 +93,14 @@ class OAICompatClient(BaseOmniClient): """ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"} - final_messages = [{"role": "system", "content": system}] + final_messages = [ + { + "role": "system", + "content": [ + { "type": "text", "text": system } + ] + } + ] # Process messages for item in messages: @@ -117,7 +124,10 @@ class OAICompatClient(BaseOmniClient): else: message = { "role": item["role"], - "content": [{"type": "text", "text": item["content"]}], + "content": [{ + "type": "text", + "text": item["content"] + }], } final_messages.append(message) else: From 6a6fe48dbca0bd8f17652c538e08183ba289eefe Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 5 May 2025 10:31:15 -0400 Subject: [PATCH 16/20] use prncvrm's mlx-vlm patch for testing --- .../agent/providers/uitars/clients/mlxvlm.py | 17 ++++++++++++----- libs/agent/agent/providers/uitars/utils.py | 2 +- libs/agent/pyproject.toml | 4 ++-- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/mlxvlm.py b/libs/agent/agent/providers/uitars/clients/mlxvlm.py index 24f41f34..197b08cb 100644 --- a/libs/agent/agent/providers/uitars/clients/mlxvlm.py +++ b/libs/agent/agent/providers/uitars/clients/mlxvlm.py @@ -202,8 +202,10 @@ class MLXVLMUITarsClient(BaseUITarsClient): ) tokenizer = cast(PreTrainedTokenizer, self.processor) + print("generating response...") + # Generate response - output = generate( + text_content, usage = generate( self.model, tokenizer, str(prompt), @@ -212,6 +214,10 @@ class MLXVLMUITarsClient(BaseUITarsClient): max_tokens=max_tokens ) + from pprint import pprint + print("DEBUG - AGENT GENERATION --------") + pprint(text_content) + print("DEBUG - AGENT GENERATION --------") except Exception as e: logger.error(f"Error generating response: {str(e)}") return { @@ -235,9 +241,9 @@ class MLXVLMUITarsClient(BaseUITarsClient): model_size = model_sizes[0] # Check if output contains box tokens that need processing - if "<|box_start|>" in output: + if "<|box_start|>" in text_content: # Process coordinates from model space back to original image space - output = self._process_coordinates(output, 
orig_size, model_size) + text_content = self._process_coordinates(text_content, orig_size, model_size) # Format response to match OpenAI format response = { @@ -245,12 +251,13 @@ class MLXVLMUITarsClient(BaseUITarsClient): { "message": { "role": "assistant", - "content": output + "content": text_content }, "finish_reason": "stop" } ], - "model": self.model_name + "model": self.model_name, + "usage": usage } return response diff --git a/libs/agent/agent/providers/uitars/utils.py b/libs/agent/agent/providers/uitars/utils.py index cc904115..bdfd58cd 100644 --- a/libs/agent/agent/providers/uitars/utils.py +++ b/libs/agent/agent/providers/uitars/utils.py @@ -105,7 +105,7 @@ async def to_agent_response_format( } ], truncation="auto", - usage=response["usage"], + usage=response.get("usage", {}), user=None, metadata={}, response=response diff --git a/libs/agent/pyproject.toml b/libs/agent/pyproject.toml index 8772575c..1289adca 100644 --- a/libs/agent/pyproject.toml +++ b/libs/agent/pyproject.toml @@ -36,7 +36,7 @@ openai = [ ] uitars = [ "httpx>=0.27.0,<0.29.0", - "mlx-vlm>=0.1.25" + "mlx-vlm @ git+https://github.com/prncvrm/mlx-vlm.git@fix/qwen2-position-id" ] ui = [ "gradio>=5.23.3,<6.0.0", @@ -86,7 +86,7 @@ all = [ "ollama>=0.4.7,<0.5.0", "gradio>=5.23.3,<6.0.0", "python-dotenv>=1.0.1,<2.0.0", - "mlx-vlm>=0.1.25" + "mlx-vlm @ git+https://github.com/prncvrm/mlx-vlm.git@fix/qwen2-position-id" ] [tool.pdm] From 44ef3e3bbe302c2e829d231ba6683bdc77c665f2 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 6 May 2025 15:45:03 -0400 Subject: [PATCH 17/20] use my own mlx-vlm patch --- libs/agent/pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/agent/pyproject.toml b/libs/agent/pyproject.toml index 1289adca..94d97889 100644 --- a/libs/agent/pyproject.toml +++ b/libs/agent/pyproject.toml @@ -36,7 +36,7 @@ openai = [ ] uitars = [ "httpx>=0.27.0,<0.29.0", - "mlx-vlm @ git+https://github.com/prncvrm/mlx-vlm.git@fix/qwen2-position-id" + "mlx-vlm @ git+https://github.com/ddupont808/mlx-vlm.git@fix/qwen2-position-id" ] ui = [ "gradio>=5.23.3,<6.0.0", @@ -86,7 +86,7 @@ all = [ "ollama>=0.4.7,<0.5.0", "gradio>=5.23.3,<6.0.0", "python-dotenv>=1.0.1,<2.0.0", - "mlx-vlm @ git+https://github.com/prncvrm/mlx-vlm.git@fix/qwen2-position-id" + "mlx-vlm @ git+https://github.com/ddupont808/mlx-vlm.git@fix/qwen2-position-id" ] [tool.pdm] From ac2717f663019ede11790e2eb8ef34d405b4945e Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Sat, 10 May 2025 17:13:00 -0400 Subject: [PATCH 18/20] readme correction --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 673590f4..dafef93d 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ async def main(): agent = ComputerAgent( computer=computer, loop="UITARS", - model=LLM(provider="MLX", name="mlx-community/UI-TARS-1.5-7B-6bit") + model=LLM(provider="MLXVLM", name="mlx-community/UI-TARS-1.5-7B-6bit") ) await agent.run("Find the trycua/cua repository on GitHub and follow the quick start guide") @@ -193,7 +193,7 @@ For complete examples, see [agent_examples.py](./examples/agent_examples.py) or from agent import ComputerAgent, LLM, AgentLoop, LLMProvider # UI-TARS-1.5 agent for local execution with MLX -ComputerAgent(loop=AgentLoop.UITARS, model=LLM(provider=LLMProvider.MLX, name="mlx-community/UI-TARS-1.5-7B-6bit")) +ComputerAgent(loop=AgentLoop.UITARS, model=LLM(provider=LLMProvider.MLXVLM, name="mlx-community/UI-TARS-1.5-7B-6bit")) # OpenAI Computer-Use agent using 
OPENAI_API_KEY ComputerAgent(loop=AgentLoop.OPENAI, model=LLM(provider=LLMProvider.OPENAI, name="computer-use-preview")) # Anthropic Claude agent using ANTHROPIC_API_KEY From 28295fd72bbf19881f08da25145b59e5a10d57ee Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Sat, 10 May 2025 17:26:42 -0400 Subject: [PATCH 19/20] branch change --- libs/agent/pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/agent/pyproject.toml b/libs/agent/pyproject.toml index 94d97889..5f23c2a1 100644 --- a/libs/agent/pyproject.toml +++ b/libs/agent/pyproject.toml @@ -36,7 +36,7 @@ openai = [ ] uitars = [ "httpx>=0.27.0,<0.29.0", - "mlx-vlm @ git+https://github.com/ddupont808/mlx-vlm.git@fix/qwen2-position-id" + "mlx-vlm @ git+https://github.com/ddupont808/mlx-vlm.git@stable/fix/qwen2-position-id" ] ui = [ "gradio>=5.23.3,<6.0.0", @@ -86,7 +86,7 @@ all = [ "ollama>=0.4.7,<0.5.0", "gradio>=5.23.3,<6.0.0", "python-dotenv>=1.0.1,<2.0.0", - "mlx-vlm @ git+https://github.com/ddupont808/mlx-vlm.git@fix/qwen2-position-id" + "mlx-vlm @ git+https://github.com/ddupont808/mlx-vlm.git@stable/fix/qwen2-position-id" ] [tool.pdm] From db823ac5612ae8f9b6cad8f22309835ca89ebfc8 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Sat, 10 May 2025 17:49:32 -0400 Subject: [PATCH 20/20] moved mlx into optional dep --- libs/agent/README.md | 1 + libs/agent/pyproject.toml | 2 ++ 2 files changed, 3 insertions(+) diff --git a/libs/agent/README.md b/libs/agent/README.md index 07f3d3fd..3a255c71 100644 --- a/libs/agent/README.md +++ b/libs/agent/README.md @@ -32,6 +32,7 @@ pip install "cua-agent[all]" pip install "cua-agent[openai]" # OpenAI Cua Loop pip install "cua-agent[anthropic]" # Anthropic Cua Loop pip install "cua-agent[uitars]" # UI-Tars support +pip install "cua-agent[uitars-mlx]" # local UI-Tars support with MLXVLM pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models) pip install "cua-agent[ui]" # Gradio UI for the agent ``` diff --git a/libs/agent/pyproject.toml b/libs/agent/pyproject.toml index 5f23c2a1..89d14883 100644 --- a/libs/agent/pyproject.toml +++ b/libs/agent/pyproject.toml @@ -36,6 +36,8 @@ openai = [ ] uitars = [ "httpx>=0.27.0,<0.29.0", +] +uitars-mlx = [ "mlx-vlm @ git+https://github.com/ddupont808/mlx-vlm.git@stable/fix/qwen2-position-id" ] ui = [
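To make the coordinate handling in patches 12-14 concrete: the client resizes each screenshot with `smart_resize` before inference, then maps any `<|box_start|>(x,y)<|box_end|>` tokens in the model output back into the original screenshot's pixel space. A self-contained sketch of that final rescaling step, using the (width, height) convention the series settles on (the function name is illustrative, not from the diff):

```python
import re
from typing import Tuple

BOX_PATTERN = r"<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>"


def rescale_boxes(text: str, original_size: Tuple[int, int], model_size: Tuple[int, int]) -> str:
    """Map box coordinates from the resized (model) image back to the original image.

    Both sizes are (width, height); model_size is what smart_resize produced.
    """
    def _scale(match: re.Match) -> str:
        x, y = int(match.group(1)), int(match.group(2))
        new_x = int(x * original_size[0] / model_size[0])
        new_y = int(y * original_size[1] / model_size[1])
        return f"<|box_start|>({new_x},{new_y})<|box_end|>"

    return re.sub(BOX_PATTERN, _scale, text)


# Example: a point predicted at (720, 450) on a 1440x900 resized image maps back
# to (1440, 900) on the original 2880x1800 screenshot.
print(rescale_boxes("<|box_start|>(720,450)<|box_end|>", (2880, 1800), (1440, 900)))
```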
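Finally, putting the series together from the user's side: install the new `uitars-mlx` extra and point the UI-TARS loop at the MLXVLM provider, as the updated READMEs describe. A sketch assembled from those README snippets; the `computer` import path is an assumption on our part, and `agent.run(...)` is awaited exactly as the README example does:

```python
# pip install "cua-agent[uitars-mlx]"   # pulls in the patched mlx-vlm fork
import asyncio
import logging

from computer import Computer  # assumed import path for the Computer class used in the README
from agent import ComputerAgent, LLM, AgentLoop, LLMProvider


async def main() -> None:
    async with Computer(verbosity=logging.DEBUG) as macos_computer:
        agent = ComputerAgent(
            computer=macos_computer,
            loop=AgentLoop.UITARS,
            # Local UI-TARS 1.5 through the new MLXVLM provider; no API key or endpoint needed.
            model=LLM(provider=LLMProvider.MLXVLM, name="mlx-community/UI-TARS-1.5-7B-4bit"),
        )
        await agent.run("Find the trycua/cua repository on GitHub and follow the quick start guide")


if __name__ == "__main__":
    asyncio.run(main())
```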