feat(mlx): add mlx backend (#6049)

* chore: allow to install with pip Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * WIP Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Make the backend to build and actually work Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * List models from system only Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Add script to build darwin python backends Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Run protogen in libbackend Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Detect if mps is available across python backends Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * CI: try to build backend Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Debug CI Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Fixups Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Fixups Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Index mlx-vlm Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Remove mlx-vlm Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Drop CI test Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-02-17 15:39:26 -06:00 · 2025-08-22 08:42:29 +02:00
parent 6dccfb09f8
commit 1d830ce7dd
38 changed files with 926 additions and 211 deletions
--- a/backend/python/bark/Makefile
+++ b/backend/python/bark/Makefile
@@ -1,29 +1,23 @@
 .PHONY: ttsbark
-ttsbark: protogen
+ttsbark:
 	bash install.sh

 .PHONY: run
-run: protogen
+run: ttsbark
 	@echo "Running bark..."
 	bash run.sh
 	@echo "bark run."

 .PHONY: test
-test: protogen
+test: ttsbark
 	@echo "Testing bark..."
 	bash test.sh
 	@echo "bark tested."

-.PHONY: protogen
-protogen: backend_pb2_grpc.py backend_pb2.py
-
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

-backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto
-
 .PHONY: clean
 clean: protogen-clean
 	rm -rf venv __pycache__
--- a/backend/python/chatterbox/Makefile
+++ b/backend/python/chatterbox/Makefile
@@ -1,29 +1,23 @@
-.PHONY: coqui
-coqui: protogen
+.PHONY: chatterbox
+chatterbox:
 	bash install.sh

 .PHONY: run
-run: protogen
+run: chatterbox
 	@echo "Running coqui..."
 	bash run.sh
 	@echo "coqui run."

 .PHONY: test
-test: protogen
+test: chatterbox
 	@echo "Testing coqui..."
 	bash test.sh
 	@echo "coqui tested."

-.PHONY: protogen
-protogen: backend_pb2_grpc.py backend_pb2.py
-
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

-backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto
-
 .PHONY: clean
 clean: protogen-clean
 	rm -rf venv __pycache__
--- a/backend/python/chatterbox/backend.py
+++ b/backend/python/chatterbox/backend.py
@@ -41,7 +41,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        else:
            print("CUDA is not available", file=sys.stderr)
            device = "cpu"
-
+        mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+        if mps_available:
+            device = "mps"
        if not torch.cuda.is_available() and request.CUDA:
            return backend_pb2.Result(success=False, message="CUDA is not available")

--- a/backend/python/common/libbackend.sh
+++ b/backend/python/common/libbackend.sh
@@ -17,8 +17,16 @@
 # LIMIT_TARGETS="cublas12"
 # source $(dirname $0)/../common/libbackend.sh
 #
+# You can switch between uv (conda-like) and pip installation methods by setting USE_PIP:
+# USE_PIP=true source $(dirname $0)/../common/libbackend.sh
+#

-PYTHON_VERSION="3.10"
+PYTHON_VERSION="${PYTHON_VERSION:-3.10}"
+
+# Default to uv if USE_PIP is not set
+if [ "x${USE_PIP}" == "x" ]; then
+    USE_PIP=false
+fi

 function init() {
    # Name of the backend (directory name)
@@ -57,11 +65,6 @@ function init() {
 # - hipblas
 # - intel
 function getBuildProfile() {
-    if [ "x${BUILD_TYPE}" == "xl4t" ]; then
-        echo "l4t"
-        return 0
-    fi
-
    # First check if we are a cublas build, and if so report the correct build profile
    if [ x"${BUILD_TYPE}" == "xcublas" ]; then
        if [ ! -z ${CUDA_MAJOR_VERSION} ]; then
@@ -81,7 +84,7 @@ function getBuildProfile() {
    fi

    # If for any other values of BUILD_TYPE, we don't need any special handling/discovery
-    if [ ! -z ${BUILD_TYPE} ]; then
+    if [ -n "${BUILD_TYPE}" ]; then # must be quoted: an empty unquoted value leaves `[ -n ]`, which is always true
        echo ${BUILD_TYPE}
        return 0
    fi
@@ -95,18 +98,48 @@ function getBuildProfile() {
 # This function is idempotent, so you can call it as many times as you want and it will
 # always result in an activated virtual environment
 function ensureVenv() {
-    if [ ! -d "${EDIR}/venv" ]; then
-        uv venv --python ${PYTHON_VERSION} ${EDIR}/venv
-        echo "virtualenv created"
-    fi
+     if [ ! -d "${EDIR}/venv" ]; then
+        if [ "x${USE_PIP}" == "xtrue" ]; then
+                echo "Using pip and Python virtual environments"

+                # Use Python virtual environment with pip
+                interpreter="python3"
+                # if there is no python , call python${PYTHON_VERSION}
+                
+                if command -v python${PYTHON_VERSION} &> /dev/null; then
+                    interpreter="python${PYTHON_VERSION}"
+                fi
+                echo "Using interpreter: ${interpreter}"
+                ${interpreter} -m venv ${EDIR}/venv
+                source ${EDIR}/venv/bin/activate
+                ${interpreter} -m pip install --upgrade pip
+                echo "Python virtual environment created"
+        else
+                echo "Using uv package manager"
+                uv venv --python ${PYTHON_VERSION} ${EDIR}/venv
+                echo "uv virtual environment created"
+        fi
+    fi
    # Source if we are not already in a Virtual env
    if [ "x${VIRTUAL_ENV}" != "x${EDIR}/venv" ]; then
        source ${EDIR}/venv/bin/activate
-        echo "virtualenv activated"
+        echo "Python virtual environment activated"
    fi

-    echo "activated virtualenv has been ensured"
+    echo "activated virtual environment has been ensured"
+}
+
+function runProtogen() {
+    ensureVenv
+
+    if [ "x${USE_PIP}" == "xtrue" ]; then
+        pip install grpcio-tools
+    else
+        uv pip install grpcio-tools
+    fi
+    pushd ${EDIR}
+        python3 -m grpc_tools.protoc -I../../ -I./ --python_out=. --grpc_python_out=. backend.proto
+    popd
 }

 # installRequirements looks for several requirements files and if they exist runs the install for them in order
@@ -116,7 +149,7 @@ function ensureVenv() {
 #  - requirements-${BUILD_TYPE}.txt
 #  - requirements-${BUILD_PROFILE}.txt
 #
-# BUILD_PROFILE is a pore specific version of BUILD_TYPE, ex: cuda-11 or cuda-12
+# BUILD_PROFILE is a more specific version of BUILD_TYPE, ex: cuda-11 or cuda-12
 # it can also include some options that we do not have BUILD_TYPES for, ex: intel
 #
 # NOTE: for BUILD_PROFILE==intel, this function does NOT automatically use the Intel python package index.
@@ -158,10 +191,18 @@ function installRequirements() {
    for reqFile in ${requirementFiles[@]}; do
        if [ -f ${reqFile} ]; then
            echo "starting requirements install for ${reqFile}"
-            uv pip install ${EXTRA_PIP_INSTALL_FLAGS} --requirement ${reqFile}
+            if [ "x${USE_PIP}" == "xtrue" ]; then
+                # Use pip for installation
+                pip install ${EXTRA_PIP_INSTALL_FLAGS} --requirement ${reqFile}
+            else
+                # Use uv for installation
+                uv pip install ${EXTRA_PIP_INSTALL_FLAGS} --requirement ${reqFile}
+            fi
            echo "finished requirements install for ${reqFile}"
        fi
    done
+
+    runProtogen
 }

 # startBackend discovers and runs the backend GRPC server
--- a/backend/python/common/template/Makefile
+++ b/backend/python/common/template/Makefile
@@ -3,18 +3,11 @@
 .PHONY: install
 install:
 	bash install.sh
-	$(MAKE) protogen
-
-.PHONY: protogen
-protogen: backend_pb2_grpc.py backend_pb2.py

 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

-backend_pb2_grpc.py backend_pb2.py:
-	bash protogen.sh
-
 .PHONY: clean
 clean: protogen-clean
 	rm -rf venv __pycache__
--- a/backend/python/common/template/protogen.sh
+++ b/backend/python/common/template/protogen.sh
@@ -8,6 +8,4 @@ else
    source $backend_dir/../common/libbackend.sh
 fi

-ensureVenv
-
-python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto
+runProtogen
--- a/backend/python/coqui/Makefile
+++ b/backend/python/coqui/Makefile
@@ -1,29 +1,23 @@
 .PHONY: coqui
-coqui: protogen
+coqui:
 	bash install.sh

 .PHONY: run
-run: protogen
+run: coqui
 	@echo "Running coqui..."
 	bash run.sh
 	@echo "coqui run."

 .PHONY: test
-test: protogen
+test: coqui
 	@echo "Testing coqui..."
 	bash test.sh
 	@echo "coqui tested."

-.PHONY: protogen
-protogen: backend_pb2_grpc.py backend_pb2.py
-
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

-backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto
-
 .PHONY: clean
 clean: protogen-clean
 	rm -rf venv __pycache__
--- a/backend/python/coqui/backend.py
+++ b/backend/python/coqui/backend.py
@@ -40,7 +40,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        else:
            print("CUDA is not available", file=sys.stderr)
            device = "cpu"
-
+        mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+        if mps_available:
+            device = "mps"
        if not torch.cuda.is_available() and request.CUDA:
            return backend_pb2.Result(success=False, message="CUDA is not available")

--- a/backend/python/diffusers/Makefile
+++ b/backend/python/diffusers/Makefile
@@ -12,28 +12,22 @@ export SKIP_CONDA=1
 endif

 .PHONY: diffusers
-diffusers: protogen
+diffusers:
 	bash install.sh

 .PHONY: run
-run: protogen
+run: diffusers
 	@echo "Running diffusers..."
 	bash run.sh
 	@echo "Diffusers run."

-test: protogen
+test: diffusers
 	bash test.sh

-.PHONY: protogen
-protogen: backend_pb2_grpc.py backend_pb2.py
-
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

-backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto
-
 .PHONY: clean
 clean: protogen-clean
 	rm -rf venv __pycache__
--- a/backend/python/diffusers/backend.py
+++ b/backend/python/diffusers/backend.py
@@ -368,6 +368,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            device = "cpu" if not request.CUDA else "cuda"
            if XPU:
                device = "xpu"
+            mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+            if mps_available:
+                device = "mps"
            self.device = device
            if request.LoraAdapter:
                # Check if its a local file and not a directory ( we load lora differently for a safetensor file )
--- a/backend/python/exllama2/Makefile
+++ b/backend/python/exllama2/Makefile
@@ -1,23 +1,17 @@
 .PHONY: exllama2
-exllama2: protogen
+exllama2:
 	bash install.sh

 .PHONY: run
-run: protogen
+run: exllama2
 	@echo "Running exllama2..."
 	bash run.sh
 	@echo "exllama2 run."

-.PHONY: protogen
-protogen: backend_pb2_grpc.py backend_pb2.py
-
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

-backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto
-
 .PHONY: clean
 clean: protogen-clean
 	$(RM) -r venv source __pycache__
--- a/backend/python/faster-whisper/Makefile
+++ b/backend/python/faster-whisper/Makefile
@@ -3,18 +3,11 @@
 .PHONY: install
 install:
 	bash install.sh
-	$(MAKE) protogen
-
-.PHONY: protogen
-protogen: backend_pb2_grpc.py backend_pb2.py

 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

-backend_pb2_grpc.py backend_pb2.py:
-	bash protogen.sh
-
 .PHONY: clean
 clean: protogen-clean
 	rm -rf venv __pycache__
--- a/backend/python/faster-whisper/backend.py
+++ b/backend/python/faster-whisper/backend.py
@@ -10,7 +10,7 @@ import sys
 import os
 import backend_pb2
 import backend_pb2_grpc
-
+import torch
 from faster_whisper import WhisperModel

 import grpc
@@ -35,7 +35,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        # device = "cuda" if request.CUDA else "cpu"
        if request.CUDA:
            device = "cuda"
-
+        mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+        if mps_available:
+            device = "mps"
        try:
            print("Preparing models, please wait", file=sys.stderr)
            self.model = WhisperModel(request.Model, device=device, compute_type="float16")
--- a/backend/python/kitten-tts/Makefile
+++ b/backend/python/kitten-tts/Makefile
@@ -1,29 +1,23 @@
 .PHONY: kitten-tts
-kitten-tts: protogen
+kitten-tts:
 	bash install.sh

 .PHONY: run
-run: protogen
+run: kitten-tts
 	@echo "Running kitten-tts..."
 	bash run.sh
 	@echo "kitten-tts run."

 .PHONY: test
-test: protogen
+test: kitten-tts
 	@echo "Testing kitten-tts..."
 	bash test.sh
 	@echo "kitten-tts tested."

-.PHONY: protogen
-protogen: backend_pb2_grpc.py backend_pb2.py
-
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

-backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto
-
 .PHONY: clean
 clean: protogen-clean
 	rm -rf venv __pycache__
--- a/backend/python/kitten-tts/backend.py
+++ b/backend/python/kitten-tts/backend.py
@@ -33,18 +33,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
    def LoadModel(self, request, context):

-        # Get device
-        # device = "cuda" if request.CUDA else "cpu"
-        if torch.cuda.is_available():
-            print("CUDA is available", file=sys.stderr)
-            device = "cuda"
-        else:
-            print("CUDA is not available", file=sys.stderr)
-            device = "cpu"
-
-        if not torch.cuda.is_available() and request.CUDA:
-            return backend_pb2.Result(success=False, message="CUDA is not available")
-
        self.AudioPath = None
        # List available KittenTTS models
        print("Available KittenTTS voices: expr-voice-2-m, expr-voice-2-f, expr-voice-3-m, expr-voice-3-f, expr-voice-4-m, expr-voice-4-f, expr-voice-5-m, expr-voice-5-f")
--- a/backend/python/kokoro/Makefile
+++ b/backend/python/kokoro/Makefile
@@ -1,29 +1,23 @@
 .PHONY: kokoro
-kokoro: protogen
+kokoro:
 	bash install.sh

 .PHONY: run
-run: protogen
+run: kokoro
 	@echo "Running kokoro..."
 	bash run.sh
 	@echo "kokoro run."

 .PHONY: test
-test: protogen
+test: kokoro
 	@echo "Testing kokoro..."
 	bash test.sh
 	@echo "kokoro tested."

-.PHONY: protogen
-protogen: backend_pb2_grpc.py backend_pb2.py
-
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

-backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto
-
 .PHONY: clean
 clean: protogen-clean
 	rm -rf venv __pycache__
--- a/backend/python/kokoro/backend.py
+++ b/backend/python/kokoro/backend.py
@@ -33,17 +33,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
    
    def LoadModel(self, request, context):
-        # Get device
-        if torch.cuda.is_available():
-            print("CUDA is available", file=sys.stderr)
-            device = "cuda"
-        else:
-            print("CUDA is not available", file=sys.stderr)
-            device = "cpu"
-
-        if not torch.cuda.is_available() and request.CUDA:
-            return backend_pb2.Result(success=False, message="CUDA is not available")
-
        try:
            print("Preparing Kokoro TTS pipeline, please wait", file=sys.stderr)
            # empty dict
--- a/backend/python/mlx/Makefile
+++ b/backend/python/mlx/Makefile
@@ -0,0 +1,23 @@
+.PHONY: mlx
+mlx: ## Install the backend by running install.sh
+	bash install.sh
+
+.PHONY: run
+run: mlx ## Install first, like the sibling backend Makefiles (e.g. bark's `run: ttsbark`)
+	@echo "Running mlx..."
+	bash run.sh
+	@echo "mlx run."
+
+.PHONY: test
+test: mlx ## Install first, like the sibling backend Makefiles (e.g. bark's `test: ttsbark`)
+	@echo "Testing mlx..."
+	bash test.sh
+	@echo "mlx tested."
+
+.PHONY: protogen-clean
+protogen-clean: ## Remove the generated gRPC Python stubs
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: clean
+clean: protogen-clean ## Remove the virtualenv and Python bytecode cache as well
+	rm -rf venv __pycache__
--- a/backend/python/mlx/backend.py
+++ b/backend/python/mlx/backend.py
@@ -0,0 +1,376 @@
+#!/usr/bin/env python3
+import asyncio
+from concurrent import futures
+import argparse
+import signal
+import sys
+import os
+from typing import List
+import time
+
+import backend_pb2
+import backend_pb2_grpc
+
+import grpc
+from mlx_lm import load, generate, stream_generate
+from mlx_lm.sample_utils import make_sampler
+from mlx_lm.models.cache import make_prompt_cache
+import mlx.core as mx
+import base64
+import io
+
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+
+# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
+MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
+
+# Implement the BackendServicer class with the service methods
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+    """
+    A gRPC servicer that implements the Backend service defined in backend.proto.
+    """
+
+    def _is_float(self, s):
+        """Check if a string can be converted to float."""
+        try:
+            float(s)
+            return True
+        except ValueError:
+            return False
+
+    def _is_int(self, s):
+        """Check if a string can be converted to int."""
+        try:
+            int(s)
+            return True
+        except ValueError:
+            return False
+
+    def Health(self, request, context):
+        """
+        Returns a health check message.
+
+        Args:
+            request: The health check request.
+            context: The gRPC context.
+
+        Returns:
+            backend_pb2.Reply: The health check reply.
+        """
+        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+
+    async def LoadModel(self, request, context):
+        """
+        Loads a language model using MLX.
+
+        Each entry of request.Options has the form "optname:optvalue". Values are
+        coerced to int, float or bool when they parse as such and are stored in
+        self.options.
+
+        Args:
+            request: The load model request.
+            context: The gRPC context.
+
+        Returns:
+            backend_pb2.Result: The load model result.
+        """
+        try:
+            print(f"Loading MLX model: {request.Model}", file=sys.stderr)
+            print(f"Request: {request}", file=sys.stderr)
+
+            # Parse options like in the diffusers backend
+            options = request.Options
+            self.options = {}
+
+            # The options are a list of strings in this form optname:optvalue
+            # We store all the options in a dict for later use
+            for opt in options:
+                if ":" not in opt:
+                    continue
+                key, value = opt.split(":", 1)  # Split only on first colon to handle values with colons
+                # Try int before float: every integer literal also parses as a
+                # float, so the reverse order would never produce an int.
+                if self._is_int(value):
+                    value = int(value)
+                elif self._is_float(value):
+                    value = float(value)
+                elif value.lower() in ["true", "false"]:
+                    value = value.lower() == "true"
+
+                self.options[key] = value
+
+            print(f"Options: {self.options}", file=sys.stderr)
+
+            # Build tokenizer config for MLX using options
+            tokenizer_config = {}
+
+            # Handle trust_remote_code from request or options
+            if request.TrustRemoteCode or self.options.get("trust_remote_code", False):
+                tokenizer_config["trust_remote_code"] = True
+
+            # Copy any special-token overrides (EOS first, then the rest) from options
+            for key in ["eos_token", "pad_token", "bos_token", "unk_token", "sep_token", "cls_token", "mask_token"]:
+                if key in self.options:
+                    tokenizer_config[key] = self.options[key]
+
+            # Load model and tokenizer using MLX
+            if tokenizer_config:
+                print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr)
+                self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config)
+            else:
+                self.model, self.tokenizer = load(request.Model)
+
+            # Initialize prompt cache for efficient generation
+            max_kv_size = self.options.get("max_kv_size", None)
+            self.prompt_cache = make_prompt_cache(self.model, max_kv_size)
+
+        except Exception as err:
+            print(f"Error loading MLX model {err=}, {type(err)=}", file=sys.stderr)
+            return backend_pb2.Result(success=False, message=f"Error loading MLX model: {err}")
+
+        print("MLX model loaded successfully", file=sys.stderr)
+        return backend_pb2.Result(message="MLX model loaded successfully", success=True)
+
+    async def Predict(self, request, context):
+        """
+        Generates text based on the given prompt and sampling parameters using MLX.
+
+        Args:
+            request: The predict request.
+            context: The gRPC context.
+
+        Returns:
+            backend_pb2.Reply: The predict result.
+        """
+        try:
+            # Prepare the prompt
+            prompt = self._prepare_prompt(request)
+            
+            # Build generation parameters using request attributes and options
+            max_tokens, sampler_params = self._build_generation_params(request)
+            
+            print(f"Generating text with MLX - max_tokens: {max_tokens}, sampler_params: {sampler_params}", file=sys.stderr)
+            
+            # Create sampler with parameters
+            sampler = make_sampler(**sampler_params)
+            
+            # Generate text using MLX with proper parameters
+            response = generate(
+                self.model,
+                self.tokenizer,
+                prompt=prompt,
+                max_tokens=max_tokens,
+                sampler=sampler,
+                prompt_cache=self.prompt_cache,
+                verbose=False
+            )
+            
+            return backend_pb2.Reply(message=bytes(response, encoding='utf-8'))
+            
+        except Exception as e:
+            print(f"Error in MLX Predict: {e}", file=sys.stderr)
+            context.set_code(grpc.StatusCode.INTERNAL)
+            context.set_details(f"Generation failed: {str(e)}")
+            return backend_pb2.Reply(message=bytes("", encoding='utf-8'))
+
+    def Embedding(self, request, context):
+        """
+        A gRPC method that calculates embeddings for a given sentence.
+        
+        Note: MLX-LM doesn't support embeddings directly. This method returns an error.
+
+        Args:
+            request: An EmbeddingRequest object that contains the request parameters.
+            context: A grpc.ServicerContext object that provides information about the RPC.
+
+        Returns:
+            An EmbeddingResult object that contains the calculated embeddings.
+        """
+        print("Embeddings not supported in MLX backend", file=sys.stderr)
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details("Embeddings are not supported in the MLX backend.")
+        return backend_pb2.EmbeddingResult()
+
+    async def PredictStream(self, request, context):
+        """
+        Generates text based on the given prompt and sampling parameters, and streams the results using MLX.
+
+        Args:
+            request: The predict stream request.
+            context: The gRPC context.
+
+        Yields:
+            backend_pb2.Reply: Streaming predict results.
+        """
+        try:
+            # Prepare the prompt
+            prompt = self._prepare_prompt(request)
+            
+            # Build generation parameters using request attributes and options
+            max_tokens, sampler_params = self._build_generation_params(request, default_max_tokens=512)
+            
+            print(f"Streaming text with MLX - max_tokens: {max_tokens}, sampler_params: {sampler_params}", file=sys.stderr)
+            
+            # Create sampler with parameters
+            sampler = make_sampler(**sampler_params)
+            
+            # Stream text generation using MLX with proper parameters
+            for response in stream_generate(
+                self.model,
+                self.tokenizer,
+                prompt=prompt,
+                max_tokens=max_tokens,
+                sampler=sampler,
+                prompt_cache=self.prompt_cache,
+            ):
+                yield backend_pb2.Reply(message=bytes(response.text, encoding='utf-8'))
+                
+        except Exception as e:
+            print(f"Error in MLX PredictStream: {e}", file=sys.stderr)
+            context.set_code(grpc.StatusCode.INTERNAL)
+            context.set_details(f"Streaming generation failed: {str(e)}")
+            yield backend_pb2.Reply(message=bytes("", encoding='utf-8'))
+
+    def _prepare_prompt(self, request):
+        """
+        Prepare the prompt for MLX generation, handling chat templates if needed.
+
+        Args:
+            request: The gRPC request containing prompt and message information.
+
+        Returns:
+            str: The prepared prompt.
+        """
+        # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
+        if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
+            # Convert gRPC messages to the format expected by apply_chat_template
+            messages = []
+            for msg in request.Messages:
+                messages.append({"role": msg.role, "content": msg.content})
+            
+            prompt = self.tokenizer.apply_chat_template(
+                messages, 
+                tokenize=False, 
+                add_generation_prompt=True
+            )
+            return prompt
+        else:
+            return request.Prompt
+
+
+
+
+
+    def _build_generation_params(self, request, default_max_tokens=200):
+        """
+        Build generation parameters from request attributes and options.
+
+        Args:
+            request: The gRPC request.
+            default_max_tokens: Default max_tokens if not specified.
+
+        Returns:
+            tuple: (max_tokens, sampler_params dict)
+        """
+        # Extract max_tokens; a value of 0 means "not set" and falls back to the default
+        max_tokens = getattr(request, 'Tokens', default_max_tokens)
+        if max_tokens == 0:
+            max_tokens = default_max_tokens
+
+        # Extract sampler parameters from request attributes (0.0 means "not set")
+        temp = getattr(request, 'Temperature', 0.0)
+        if temp == 0.0:
+            temp = 0.6  # Default temperature
+
+        top_p = getattr(request, 'TopP', 0.0)
+        if top_p == 0.0:
+            top_p = 1.0  # Default top_p
+
+        # Initialize sampler parameters
+        sampler_params = {
+            'temp': temp,
+            'top_p': top_p,
+            'xtc_threshold': 0.0,
+            'xtc_probability': 0.0,
+        }
+
+        # Seed the MLX random generator if the request specifies a seed
+        seed = getattr(request, 'Seed', 0)
+        if seed != 0:
+            mx.random.seed(seed)
+
+        # Override with options if available
+        if hasattr(self, 'options'):
+            # Max tokens from options; the stored value may be a float, so cast to int
+            if 'max_tokens' in self.options:
+                max_tokens = int(self.options['max_tokens'])
+
+            # Sampler parameters from options
+            sampler_option_mapping = {
+                'temp': 'temp',
+                'temperature': 'temp',  # alias
+                'top_p': 'top_p',
+                'xtc_threshold': 'xtc_threshold',
+                'xtc_probability': 'xtc_probability',
+            }
+
+            for option_key, param_key in sampler_option_mapping.items():
+                if option_key in self.options:
+                    sampler_params[param_key] = self.options[option_key]
+
+            # Seed from options; cast to int because the stored value may be a float
+            if 'seed' in self.options:
+                mx.random.seed(int(self.options['seed']))
+
+        # Special tokens for XTC sampling (if tokenizer has eos_token_ids)
+        xtc_special_tokens = []
+        if hasattr(self.tokenizer, 'eos_token_ids') and self.tokenizer.eos_token_ids:
+            xtc_special_tokens = list(self.tokenizer.eos_token_ids)
+        elif hasattr(self.tokenizer, 'eos_token_id') and self.tokenizer.eos_token_id is not None:
+            xtc_special_tokens = [self.tokenizer.eos_token_id]
+
+        # Add newline token if available (best effort)
+        try:
+            newline_tokens = self.tokenizer.encode("\n")
+            xtc_special_tokens.extend(newline_tokens)
+        except Exception:
+            pass  # Skip if encoding fails; narrowed so KeyboardInterrupt/SystemExit still propagate
+
+        sampler_params['xtc_special_tokens'] = xtc_special_tokens
+
+        return max_tokens, sampler_params
+
+async def serve(address):
+    # Start asyncio gRPC server
+    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+        options=[
+            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+        ])
+    # Add the servicer to the server
+    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+    # Bind the server to the address
+    server.add_insecure_port(address)
+
+    # Gracefully shutdown the server on SIGTERM or SIGINT
+    loop = asyncio.get_event_loop()
+    for sig in (signal.SIGINT, signal.SIGTERM):
+        loop.add_signal_handler(
+            sig, lambda: asyncio.ensure_future(server.stop(5))
+        )
+
+    # Start the server
+    await server.start()
+    print("Server started. Listening on: " + address, file=sys.stderr)
+    # Wait for the server to be terminated
+    await server.wait_for_termination()
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run the gRPC server.")
+    parser.add_argument(
+        "--addr", default="localhost:50051", help="The address to bind the server to."
+    )
+    args = parser.parse_args()
+
+    asyncio.run(serve(args.addr))
--- a/backend/python/rfdetr/protogen.sh
+++ b/backend/python/rfdetr/protogen.sh
@@ -1,13 +1,15 @@
 #!/bin/bash
 set -e

+USE_PIP=true
+PYTHON_VERSION=""
+
 backend_dir=$(dirname $0)
+
 if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
 else
    source $backend_dir/../common/libbackend.sh
 fi

-ensureVenv
-
-python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto
+installRequirements
--- a/backend/python/mlx/requirements-mps.txt
+++ b/backend/python/mlx/requirements-mps.txt
@@ -0,0 +1 @@
+mlx-lm
--- a/backend/python/mlx/requirements.txt
+++ b/backend/python/mlx/requirements.txt
@@ -0,0 +1,4 @@
+grpcio==1.71.0
+protobuf
+certifi
+setuptools
--- a/backend/python/mlx/run.sh
+++ b/backend/python/mlx/run.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Entry point for the mlx backend: load libbackend.sh, then forward all CLI arguments.
+backend_dir=$(dirname "$0")
+# Use a common/ directory next to this script when present, else the sibling ../common.
+if [ -d "$backend_dir/common" ]; then
+    source "$backend_dir/common/libbackend.sh"
+else
+    source "$backend_dir/../common/libbackend.sh"
+fi
+# "$@" is quoted so arguments containing whitespace are passed through unsplit.
+startBackend "$@"
--- a/backend/python/mlx/test.py
+++ b/backend/python/mlx/test.py
@@ -0,0 +1,146 @@
+import unittest
+import subprocess
+import time
+import backend_pb2
+import backend_pb2_grpc
+
+import grpc
+
+import unittest
+import subprocess
+import time
+import grpc
+import backend_pb2_grpc
+import backend_pb2
+
+class TestBackendServicer(unittest.TestCase):
+    """
+    TestBackendServicer is the class that tests the gRPC service.
+
+    This class contains methods to test the startup and shutdown of the gRPC service.
+
+    unittest runs setUp() before and tearDown() after every test method, so each test
+    gets its own backend process. The tests must not call them again themselves: a
+    second setUp() would overwrite self.service and leak the first process.
+    """
+    def setUp(self):
+        """Start backend.py as a subprocess listening on localhost:50051."""
+        self.service = subprocess.Popen(["python", "backend.py", "--addr", "localhost:50051"])
+        # Fixed delay to let the server come up; no readiness check is performed.
+        time.sleep(10)
+
+    def tearDown(self) -> None:
+        """Stop the backend subprocess started by setUp()."""
+        self.service.terminate()
+        # wait() blocks until the child has actually exited before the next test starts.
+        self.service.wait()
+
+    def test_server_startup(self):
+        """
+        This method tests if the server is up and answers the Health RPC
+        """
+        try:
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.Health(backend_pb2.HealthMessage())
+                self.assertEqual(response.message, b'OK')
+        except Exception as err:
+            print(err)
+            self.fail("Server failed to start")
+
+    def test_load_model(self):
+        """
+        This method tests if the model is loaded successfully
+        """
+        try:
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
+                self.assertTrue(response.success)
+                self.assertEqual(response.message, "Model loaded successfully")
+        except Exception as err:
+            print(err)
+            self.fail("LoadModel service failed")
+
+    def test_text(self):
+        """
+        This method tests if text is generated successfully for a prompt
+        """
+        try:
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
+                self.assertTrue(response.success)
+                req = backend_pb2.PredictOptions(Prompt="The capital of France is")
+                resp = stub.Predict(req)
+                # Only the presence of a reply is checked, not the generated text itself.
+                self.assertIsNotNone(resp.message)
+        except Exception as err:
+            print(err)
+            self.fail("text service failed")
+
+    def test_sampling_params(self):
+        """
+        This method tests if all sampling parameters are correctly processed
+        NOTE: this does NOT test for correctness, just that we received a compatible response
+        """
+        try:
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
+                self.assertTrue(response.success)
+
+                req = backend_pb2.PredictOptions(
+                    Prompt="The capital of France is",
+                    TopP=0.8,
+                    Tokens=50,
+                    Temperature=0.7,
+                    TopK=40,
+                    PresencePenalty=0.1,
+                    FrequencyPenalty=0.2,
+                    RepetitionPenalty=1.1,
+                    MinP=0.05,
+                    Seed=42,
+                    StopPrompts=["\n"],
+                    StopTokenIds=[50256],
+                    BadWords=["badword"],
+                    IncludeStopStrInOutput=True,
+                    IgnoreEOS=True,
+                    MinTokens=5,
+                    Logprobs=5,
+                    PromptLogprobs=5,
+                    SkipSpecialTokens=True,
+                    SpacesBetweenSpecialTokens=True,
+                    TruncatePromptTokens=10,
+                    GuidedDecoding=True,
+                    N=2,
+                )
+                resp = stub.Predict(req)
+                self.assertIsNotNone(resp.message)
+                self.assertIsNotNone(resp.logprobs)
+        except Exception as err:
+            print(err)
+            self.fail("sampling params service failed")
+
+
+    def test_embedding(self):
+        """
+        This method tests if the embeddings are generated successfully
+        """
+        try:
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="intfloat/e5-mistral-7b-instruct"))
+                self.assertTrue(response.success)
+                embedding_request = backend_pb2.PredictOptions(Embeddings="This is a test sentence.")
+                embedding_response = stub.Embedding(embedding_request)
+                self.assertIsNotNone(embedding_response.embeddings)
+                # A protobuf repeated field is a sequence-like container, not a list (assumes
+                # `embeddings` is a repeated float field - confirm against backend.proto).
+                embeddings = list(embedding_response.embeddings)
+                self.assertTrue(all(isinstance(value, float) for value in embeddings))
+                # assert that the list is not empty
+                self.assertTrue(len(embeddings) > 0)
+        except Exception as err:
+            print(err)
+            self.fail("Embedding service failed")
--- a/backend/python/mlx/test.sh
+++ b/backend/python/mlx/test.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -e
+# Test entry point for the mlx backend: load libbackend.sh, then call runUnittests.
+backend_dir=$(dirname "$0")
+# Paths are quoted so a checkout directory containing whitespace still resolves.
+if [ -d "$backend_dir/common" ]; then
+    source "$backend_dir/common/libbackend.sh"
+else
+    source "$backend_dir/../common/libbackend.sh"
+fi
+# runUnittests is not defined in this script; presumably it comes from libbackend.sh.
+runUnittests
--- a/backend/python/rerankers/Makefile
+++ b/backend/python/rerankers/Makefile
@@ -1,30 +1,24 @@
 .PHONY: rerankers
-rerankers: protogen
+rerankers:
 	bash install.sh

 .PHONY: run
-run: protogen
+run: rerankers
 	@echo "Running rerankers..."
 	bash run.sh
 	@echo "rerankers run."

 # This does not work well from the command line. It only works from an IDE such as VSCode.
 .PHONY: test
-test: protogen
+test: rerankers
 	@echo "Testing rerankers..."
 	bash test.sh
 	@echo "rerankers tested."

-.PHONY: protogen
-protogen: backend_pb2_grpc.py backend_pb2.py
-
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

-backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto
-
 .PHONY: clean
 clean: protogen-clean
 	rm -rf venv __pycache__
--- a/backend/python/rfdetr/Makefile
+++ b/backend/python/rfdetr/Makefile
@@ -3,18 +3,11 @@
 .PHONY: install
 install:
 	bash install.sh
-	$(MAKE) protogen
-
-.PHONY: protogen
-protogen: backend_pb2_grpc.py backend_pb2.py

 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

-backend_pb2_grpc.py backend_pb2.py:
-	bash protogen.sh
-
 .PHONY: clean
 clean: protogen-clean
 	rm -rf venv __pycache__
--- a/backend/python/transformers/Makefile
+++ b/backend/python/transformers/Makefile
@@ -1,30 +1,24 @@
 .PHONY: transformers
-transformers: protogen
+transformers:
 	bash install.sh

 .PHONY: run
-run: protogen
+run: transformers
 	@echo "Running transformers..."
 	bash run.sh
 	@echo "transformers run."

 # This does not work well from the command line. It only works from an IDE such as VSCode.
 .PHONY: test
-test: protogen
+test: transformers
 	@echo "Testing transformers..."
 	bash test.sh
 	@echo "transformers tested."

-.PHONY: protogen
-protogen: backend_pb2_grpc.py backend_pb2.py
-
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

-backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto
-
 .PHONY: clean
 clean: protogen-clean
 	rm -rf venv __pycache__
--- a/backend/python/transformers/backend.py
+++ b/backend/python/transformers/backend.py
@@ -94,7 +94,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        self.SentenceTransformer = False

        device_map="cpu"
-
+        mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+        if mps_available:
+            device_map = "mps"
        quantization = None
        autoTokenizer = True

--- a/backend/python/vllm/Makefile
+++ b/backend/python/vllm/Makefile
@@ -1,29 +1,23 @@
 .PHONY: vllm
-vllm: protogen
+vllm:
 	bash install.sh

 .PHONY: run
-run: protogen
+run: vllm
 	@echo "Running vllm..."
 	bash run.sh
 	@echo "vllm run."

 .PHONY: test
-test: protogen
+test: vllm
 	@echo "Testing vllm..."
 	bash test.sh
 	@echo "vllm tested."

-.PHONY: protogen
-protogen: backend_pb2_grpc.py backend_pb2.py
-
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

-backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto
-
 .PHONY: clean
 clean: protogen-clean
 	rm -rf venv __pycache__