Mirror of https://github.com/mudler/LocalAI.git (synced 2025-12-30 22:20:20 -06:00)
feat(mlx): add mlx backend (#6049)
* chore: allow to install with pip
* WIP
* Make the backend to build and actually work
* List models from system only
* Add script to build darwin python backends
* Run protogen in libbackend
* Detect if mps is available across python backends
* CI: try to build backend
* Debug CI
* Fixups
* Fixups
* Index mlx-vlm
* Remove mlx-vlm
* Drop CI test

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Commit 1d830ce7dd (parent 6dccfb09f8), committed via GitHub.
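The diff below adds a new MLX text-generation backend under backend/python/mlx and reworks the shared Python backend scaffolding (common/libbackend.sh and the per-backend Makefiles) so backends can be installed with either uv or pip. For orientation, the new backend is a thin gRPC wrapper around the mlx-lm API; a minimal sketch of the calls it builds on (the model repository name is illustrative and not part of this commit):

# Minimal sketch of the mlx-lm calls wrapped by backend/python/mlx/backend.py below.
# The model name is an assumption for illustration; any MLX-converted model works.
from mlx_lm import load, generate
from mlx_lm.sample_utils import make_sampler

model, tokenizer = load("mlx-community/Mistral-7B-Instruct-v0.3-4bit")
sampler = make_sampler(temp=0.6, top_p=1.0)
text = generate(model, tokenizer, prompt="The capital of France is",
                max_tokens=64, sampler=sampler, verbose=False)
print(text)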
@@ -1,29 +1,23 @@
.PHONY: ttsbark
ttsbark: protogen
ttsbark:
    bash install.sh

.PHONY: run
run: protogen
run: ttsbark
    @echo "Running bark..."
    bash run.sh
    @echo "bark run."

.PHONY: test
test: protogen
test: ttsbark
    @echo "Testing bark..."
    bash test.sh
    @echo "bark tested."

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
    $(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
    python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto

.PHONY: clean
clean: protogen-clean
    rm -rf venv __pycache__
@@ -1,29 +1,23 @@
.PHONY: coqui
coqui: protogen
.PHONY: chatterbox
chatterbox:
    bash install.sh

.PHONY: run
run: protogen
run: chatterbox
    @echo "Running coqui..."
    bash run.sh
    @echo "coqui run."

.PHONY: test
test: protogen
test: chatterbox
    @echo "Testing coqui..."
    bash test.sh
    @echo "coqui tested."

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
    $(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
    python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto

.PHONY: clean
clean: protogen-clean
    rm -rf venv __pycache__
@@ -41,7 +41,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        else:
            print("CUDA is not available", file=sys.stderr)
            device = "cpu"

        mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
        if mps_available:
            device = "mps"
        if not torch.cuda.is_available() and request.CUDA:
            return backend_pb2.Result(success=False, message="CUDA is not available")
@@ -17,8 +17,16 @@
# LIMIT_TARGETS="cublas12"
# source $(dirname $0)/../common/libbackend.sh
#
# You can switch between uv (conda-like) and pip installation methods by setting USE_PIP:
# USE_PIP=true source $(dirname $0)/../common/libbackend.sh
#

PYTHON_VERSION="3.10"
PYTHON_VERSION="${PYTHON_VERSION:-3.10}"

# Default to uv if USE_PIP is not set
if [ "x${USE_PIP}" == "x" ]; then
    USE_PIP=false
fi

function init() {
    # Name of the backend (directory name)
@@ -57,11 +65,6 @@ function init() {
# - hipblas
# - intel
function getBuildProfile() {
    if [ "x${BUILD_TYPE}" == "xl4t" ]; then
        echo "l4t"
        return 0
    fi

    # First check if we are a cublas build, and if so report the correct build profile
    if [ x"${BUILD_TYPE}" == "xcublas" ]; then
        if [ ! -z ${CUDA_MAJOR_VERSION} ]; then
@@ -81,7 +84,7 @@ function getBuildProfile() {
    fi

    # If for any other values of BUILD_TYPE, we don't need any special handling/discovery
    if [ ! -z ${BUILD_TYPE} ]; then
    if [ -n ${BUILD_TYPE} ]; then
        echo ${BUILD_TYPE}
        return 0
    fi
@@ -95,18 +98,48 @@
# This function is idempotent, so you can call it as many times as you want and it will
# always result in an activated virtual environment
function ensureVenv() {
    if [ ! -d "${EDIR}/venv" ]; then
        uv venv --python ${PYTHON_VERSION} ${EDIR}/venv
        echo "virtualenv created"
    fi
    if [ ! -d "${EDIR}/venv" ]; then
        if [ "x${USE_PIP}" == "xtrue" ]; then
            echo "Using pip and Python virtual environments"

            # Use Python virtual environment with pip
            interpreter="python3"
            # if there is no python , call python${PYTHON_VERSION}

            if command -v python${PYTHON_VERSION} &> /dev/null; then
                interpreter="python${PYTHON_VERSION}"
            fi
            echo "Using interpreter: ${interpreter}"
            ${interpreter} -m venv ${EDIR}/venv
            source ${EDIR}/venv/bin/activate
            ${interpreter} -m pip install --upgrade pip
            echo "Python virtual environment created"
        else
            echo "Using uv package manager"
            uv venv --python ${PYTHON_VERSION} ${EDIR}/venv
            echo "uv virtual environment created"
        fi
    fi
    # Source if we are not already in a Virtual env
    if [ "x${VIRTUAL_ENV}" != "x${EDIR}/venv" ]; then
        source ${EDIR}/venv/bin/activate
        echo "virtualenv activated"
        echo "Python virtual environment activated"
    fi

    echo "activated virtualenv has been ensured"
    echo "activated virtual environment has been ensured"
}

function runProtogen() {
    ensureVenv

    if [ "x${USE_PIP}" == "xtrue" ]; then
        pip install grpcio-tools
    else
        uv pip install grpcio-tools
    fi
    pushd ${EDIR}
    python3 -m grpc_tools.protoc -I../../ -I./ --python_out=. --grpc_python_out=. backend.proto
    popd
}

# installRequirements looks for several requirements files and if they exist runs the install for them in order
@@ -116,7 +149,7 @@ function ensureVenv() {
# - requirements-${BUILD_TYPE}.txt
# - requirements-${BUILD_PROFILE}.txt
#
# BUILD_PROFILE is a pore specific version of BUILD_TYPE, ex: cuda-11 or cuda-12
# BUILD_PROFILE is a more specific version of BUILD_TYPE, ex: cuda-11 or cuda-12
# it can also include some options that we do not have BUILD_TYPES for, ex: intel
#
# NOTE: for BUILD_PROFILE==intel, this function does NOT automatically use the Intel python package index.
@@ -158,10 +191,18 @@ function installRequirements() {
    for reqFile in ${requirementFiles[@]}; do
        if [ -f ${reqFile} ]; then
            echo "starting requirements install for ${reqFile}"
            uv pip install ${EXTRA_PIP_INSTALL_FLAGS} --requirement ${reqFile}
            if [ "x${USE_PIP}" == "xtrue" ]; then
                # Use pip for installation
                pip install ${EXTRA_PIP_INSTALL_FLAGS} --requirement ${reqFile}
            else
                # Use uv for installation
                uv pip install ${EXTRA_PIP_INSTALL_FLAGS} --requirement ${reqFile}
            fi
            echo "finished requirements install for ${reqFile}"
        fi
    done

    runProtogen
}

# startBackend discovers and runs the backend GRPC server
@@ -3,18 +3,11 @@
.PHONY: install
install:
    bash install.sh
    $(MAKE) protogen

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
    $(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
    bash protogen.sh

.PHONY: clean
clean: protogen-clean
    rm -rf venv __pycache__
@@ -8,6 +8,4 @@ else
    source $backend_dir/../common/libbackend.sh
fi

ensureVenv

python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto
runProtogen
@@ -1,29 +1,23 @@
.PHONY: coqui
coqui: protogen
coqui:
    bash install.sh

.PHONY: run
run: protogen
run: coqui
    @echo "Running coqui..."
    bash run.sh
    @echo "coqui run."

.PHONY: test
test: protogen
test: coqui
    @echo "Testing coqui..."
    bash test.sh
    @echo "coqui tested."

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
    $(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
    python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto

.PHONY: clean
clean: protogen-clean
    rm -rf venv __pycache__
@@ -40,7 +40,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        else:
            print("CUDA is not available", file=sys.stderr)
            device = "cpu"

        mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
        if mps_available:
            device = "mps"
        if not torch.cuda.is_available() and request.CUDA:
            return backend_pb2.Result(success=False, message="CUDA is not available")
@@ -12,28 +12,22 @@ export SKIP_CONDA=1
endif

.PHONY: diffusers
diffusers: protogen
diffusers:
    bash install.sh

.PHONY: run
run: protogen
run: diffusers
    @echo "Running diffusers..."
    bash run.sh
    @echo "Diffusers run."

test: protogen
test: diffusers
    bash test.sh

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
    $(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
    python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto

.PHONY: clean
clean: protogen-clean
    rm -rf venv __pycache__
@@ -368,6 +368,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        device = "cpu" if not request.CUDA else "cuda"
        if XPU:
            device = "xpu"
        mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
        if mps_available:
            device = "mps"
        self.device = device
        if request.LoraAdapter:
            # Check if its a local file and not a directory ( we load lora differently for a safetensor file )
@@ -1,23 +1,17 @@
.PHONY: exllama2
exllama2: protogen
exllama2:
    bash install.sh

.PHONY: run
run: protogen
run: exllama2
    @echo "Running exllama2..."
    bash run.sh
    @echo "exllama2 run."

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
    $(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
    python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto

.PHONY: clean
clean: protogen-clean
    $(RM) -r venv source __pycache__
@@ -3,18 +3,11 @@
.PHONY: install
install:
    bash install.sh
    $(MAKE) protogen

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
    $(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
    bash protogen.sh

.PHONY: clean
clean: protogen-clean
    rm -rf venv __pycache__
@@ -10,7 +10,7 @@ import sys
import os
import backend_pb2
import backend_pb2_grpc

import torch
from faster_whisper import WhisperModel

import grpc
@@ -35,7 +35,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        # device = "cuda" if request.CUDA else "cpu"
        if request.CUDA:
            device = "cuda"

        mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
        if mps_available:
            device = "mps"
        try:
            print("Preparing models, please wait", file=sys.stderr)
            self.model = WhisperModel(request.Model, device=device, compute_type="float16")
@@ -1,29 +1,23 @@
.PHONY: kitten-tts
kitten-tts: protogen
kitten-tts:
    bash install.sh

.PHONY: run
run: protogen
run: kitten-tts
    @echo "Running kitten-tts..."
    bash run.sh
    @echo "kitten-tts run."

.PHONY: test
test: protogen
test: kitten-tts
    @echo "Testing kitten-tts..."
    bash test.sh
    @echo "kitten-tts tested."

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
    $(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
    python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto

.PHONY: clean
clean: protogen-clean
    rm -rf venv __pycache__
@@ -33,18 +33,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
    def LoadModel(self, request, context):

        # Get device
        # device = "cuda" if request.CUDA else "cpu"
        if torch.cuda.is_available():
            print("CUDA is available", file=sys.stderr)
            device = "cuda"
        else:
            print("CUDA is not available", file=sys.stderr)
            device = "cpu"

        if not torch.cuda.is_available() and request.CUDA:
            return backend_pb2.Result(success=False, message="CUDA is not available")

        self.AudioPath = None
        # List available KittenTTS models
        print("Available KittenTTS voices: expr-voice-2-m, expr-voice-2-f, expr-voice-3-m, expr-voice-3-f, expr-voice-4-m, expr-voice-4-f, expr-voice-5-m, expr-voice-5-f")
@@ -1,29 +1,23 @@
.PHONY: kokoro
kokoro: protogen
kokoro:
    bash install.sh

.PHONY: run
run: protogen
run: kokoro
    @echo "Running kokoro..."
    bash run.sh
    @echo "kokoro run."

.PHONY: test
test: protogen
test: kokoro
    @echo "Testing kokoro..."
    bash test.sh
    @echo "kokoro tested."

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
    $(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
    python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto

.PHONY: clean
clean: protogen-clean
    rm -rf venv __pycache__
@@ -33,17 +33,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

    def LoadModel(self, request, context):
        # Get device
        if torch.cuda.is_available():
            print("CUDA is available", file=sys.stderr)
            device = "cuda"
        else:
            print("CUDA is not available", file=sys.stderr)
            device = "cpu"

        if not torch.cuda.is_available() and request.CUDA:
            return backend_pb2.Result(success=False, message="CUDA is not available")

        try:
            print("Preparing Kokoro TTS pipeline, please wait", file=sys.stderr)
            # empty dict
backend/python/mlx/Makefile (new file, 23 lines)
@@ -0,0 +1,23 @@
.PHONY: mlx
mlx:
    bash install.sh

.PHONY: run
run:
    @echo "Running mlx..."
    bash run.sh
    @echo "mlx run."

.PHONY: test
test:
    @echo "Testing mlx..."
    bash test.sh
    @echo "mlx tested."

.PHONY: protogen-clean
protogen-clean:
    $(RM) backend_pb2_grpc.py backend_pb2.py

.PHONY: clean
clean: protogen-clean
    rm -rf venv __pycache__
backend/python/mlx/backend.py (new file, 376 lines)
@@ -0,0 +1,376 @@
#!/usr/bin/env python3
import asyncio
from concurrent import futures
import argparse
import signal
import sys
import os
from typing import List
import time

import backend_pb2
import backend_pb2_grpc

import grpc
from mlx_lm import load, generate, stream_generate
from mlx_lm.sample_utils import make_sampler
from mlx_lm.models.cache import make_prompt_cache
import mlx.core as mx
import base64
import io

_ONE_DAY_IN_SECONDS = 60 * 60 * 24

# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))

# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    A gRPC servicer that implements the Backend service defined in backend.proto.
    """

    def _is_float(self, s):
        """Check if a string can be converted to float."""
        try:
            float(s)
            return True
        except ValueError:
            return False

    def _is_int(self, s):
        """Check if a string can be converted to int."""
        try:
            int(s)
            return True
        except ValueError:
            return False

    def Health(self, request, context):
        """
        Returns a health check message.

        Args:
            request: The health check request.
            context: The gRPC context.

        Returns:
            backend_pb2.Reply: The health check reply.
        """
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

    async def LoadModel(self, request, context):
        """
        Loads a language model using MLX.

        Args:
            request: The load model request.
            context: The gRPC context.

        Returns:
            backend_pb2.Result: The load model result.
        """
        try:
            print(f"Loading MLX model: {request.Model}", file=sys.stderr)
            print(f"Request: {request}", file=sys.stderr)

            # Parse options like in the diffusers backend
            options = request.Options
            self.options = {}

            # The options are a list of strings in this form optname:optvalue
            # We store all the options in a dict for later use
            for opt in options:
                if ":" not in opt:
                    continue
                key, value = opt.split(":", 1)  # Split only on first colon to handle values with colons

                # Convert numeric values to appropriate types
                if self._is_float(value):
                    value = float(value)
                elif self._is_int(value):
                    value = int(value)
                elif value.lower() in ["true", "false"]:
                    value = value.lower() == "true"

                self.options[key] = value

            print(f"Options: {self.options}", file=sys.stderr)

            # Build tokenizer config for MLX using options
            tokenizer_config = {}

            # Handle trust_remote_code from request or options
            if request.TrustRemoteCode or self.options.get("trust_remote_code", False):
                tokenizer_config["trust_remote_code"] = True

            # Handle EOS token from options
            if "eos_token" in self.options:
                tokenizer_config["eos_token"] = self.options["eos_token"]

            # Handle other tokenizer config options
            for key in ["pad_token", "bos_token", "unk_token", "sep_token", "cls_token", "mask_token"]:
                if key in self.options:
                    tokenizer_config[key] = self.options[key]

            # Load model and tokenizer using MLX
            if tokenizer_config:
                print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr)
                self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config)
            else:
                self.model, self.tokenizer = load(request.Model)

            # Initialize prompt cache for efficient generation
            max_kv_size = self.options.get("max_kv_size", None)
            self.prompt_cache = make_prompt_cache(self.model, max_kv_size)

        except Exception as err:
            print(f"Error loading MLX model {err=}, {type(err)=}", file=sys.stderr)
            return backend_pb2.Result(success=False, message=f"Error loading MLX model: {err}")

        print("MLX model loaded successfully", file=sys.stderr)
        return backend_pb2.Result(message="MLX model loaded successfully", success=True)

    async def Predict(self, request, context):
        """
        Generates text based on the given prompt and sampling parameters using MLX.

        Args:
            request: The predict request.
            context: The gRPC context.

        Returns:
            backend_pb2.Reply: The predict result.
        """
        try:
            # Prepare the prompt
            prompt = self._prepare_prompt(request)

            # Build generation parameters using request attributes and options
            max_tokens, sampler_params = self._build_generation_params(request)

            print(f"Generating text with MLX - max_tokens: {max_tokens}, sampler_params: {sampler_params}", file=sys.stderr)

            # Create sampler with parameters
            sampler = make_sampler(**sampler_params)

            # Generate text using MLX with proper parameters
            response = generate(
                self.model,
                self.tokenizer,
                prompt=prompt,
                max_tokens=max_tokens,
                sampler=sampler,
                prompt_cache=self.prompt_cache,
                verbose=False
            )

            return backend_pb2.Reply(message=bytes(response, encoding='utf-8'))

        except Exception as e:
            print(f"Error in MLX Predict: {e}", file=sys.stderr)
            context.set_code(grpc.StatusCode.INTERNAL)
            context.set_details(f"Generation failed: {str(e)}")
            return backend_pb2.Reply(message=bytes("", encoding='utf-8'))

    def Embedding(self, request, context):
        """
        A gRPC method that calculates embeddings for a given sentence.

        Note: MLX-LM doesn't support embeddings directly. This method returns an error.

        Args:
            request: An EmbeddingRequest object that contains the request parameters.
            context: A grpc.ServicerContext object that provides information about the RPC.

        Returns:
            An EmbeddingResult object that contains the calculated embeddings.
        """
        print("Embeddings not supported in MLX backend", file=sys.stderr)
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details("Embeddings are not supported in the MLX backend.")
        return backend_pb2.EmbeddingResult()

    async def PredictStream(self, request, context):
        """
        Generates text based on the given prompt and sampling parameters, and streams the results using MLX.

        Args:
            request: The predict stream request.
            context: The gRPC context.

        Yields:
            backend_pb2.Reply: Streaming predict results.
        """
        try:
            # Prepare the prompt
            prompt = self._prepare_prompt(request)

            # Build generation parameters using request attributes and options
            max_tokens, sampler_params = self._build_generation_params(request, default_max_tokens=512)

            print(f"Streaming text with MLX - max_tokens: {max_tokens}, sampler_params: {sampler_params}", file=sys.stderr)

            # Create sampler with parameters
            sampler = make_sampler(**sampler_params)

            # Stream text generation using MLX with proper parameters
            for response in stream_generate(
                self.model,
                self.tokenizer,
                prompt=prompt,
                max_tokens=max_tokens,
                sampler=sampler,
                prompt_cache=self.prompt_cache,
            ):
                yield backend_pb2.Reply(message=bytes(response.text, encoding='utf-8'))

        except Exception as e:
            print(f"Error in MLX PredictStream: {e}", file=sys.stderr)
            context.set_code(grpc.StatusCode.INTERNAL)
            context.set_details(f"Streaming generation failed: {str(e)}")
            yield backend_pb2.Reply(message=bytes("", encoding='utf-8'))

    def _prepare_prompt(self, request):
        """
        Prepare the prompt for MLX generation, handling chat templates if needed.

        Args:
            request: The gRPC request containing prompt and message information.

        Returns:
            str: The prepared prompt.
        """
        # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
        if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
            # Convert gRPC messages to the format expected by apply_chat_template
            messages = []
            for msg in request.Messages:
                messages.append({"role": msg.role, "content": msg.content})

            prompt = self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
            return prompt
        else:
            return request.Prompt

    def _build_generation_params(self, request, default_max_tokens=200):
        """
        Build generation parameters from request attributes and options.

        Args:
            request: The gRPC request.
            default_max_tokens: Default max_tokens if not specified.

        Returns:
            tuple: (max_tokens, sampler_params dict)
        """
        # Extract max_tokens
        max_tokens = getattr(request, 'Tokens', default_max_tokens)
        if max_tokens == 0:
            max_tokens = default_max_tokens

        # Extract sampler parameters from request attributes
        temp = getattr(request, 'Temperature', 0.0)
        if temp == 0.0:
            temp = 0.6  # Default temperature

        top_p = getattr(request, 'TopP', 0.0)
        if top_p == 0.0:
            top_p = 1.0  # Default top_p

        # Initialize sampler parameters
        sampler_params = {
            'temp': temp,
            'top_p': top_p,
            'xtc_threshold': 0.0,
            'xtc_probability': 0.0,
        }

        # Add seed if specified
        seed = getattr(request, 'Seed', 0)
        if seed != 0:
            mx.random.seed(seed)

        # Override with options if available
        if hasattr(self, 'options'):
            # Max tokens from options
            if 'max_tokens' in self.options:
                max_tokens = self.options['max_tokens']

            # Sampler parameters from options
            sampler_option_mapping = {
                'temp': 'temp',
                'temperature': 'temp',  # alias
                'top_p': 'top_p',
                'xtc_threshold': 'xtc_threshold',
                'xtc_probability': 'xtc_probability',
            }

            for option_key, param_key in sampler_option_mapping.items():
                if option_key in self.options:
                    sampler_params[param_key] = self.options[option_key]

            # Handle seed from options
            if 'seed' in self.options:
                mx.random.seed(self.options['seed'])

        # Special tokens for XTC sampling (if tokenizer has eos_token_ids)
        xtc_special_tokens = []
        if hasattr(self.tokenizer, 'eos_token_ids') and self.tokenizer.eos_token_ids:
            xtc_special_tokens = list(self.tokenizer.eos_token_ids)
        elif hasattr(self.tokenizer, 'eos_token_id') and self.tokenizer.eos_token_id is not None:
            xtc_special_tokens = [self.tokenizer.eos_token_id]

        # Add newline token if available
        try:
            newline_tokens = self.tokenizer.encode("\n")
            xtc_special_tokens.extend(newline_tokens)
        except:
            pass  # Skip if encoding fails

        sampler_params['xtc_special_tokens'] = xtc_special_tokens

        return max_tokens, sampler_params

async def serve(address):
    # Start asyncio gRPC server
    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
                             options=[
                                 ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
                                 ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
                                 ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
                             ])
    # Add the servicer to the server
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    # Bind the server to the address
    server.add_insecure_port(address)

    # Gracefully shutdown the server on SIGTERM or SIGINT
    loop = asyncio.get_event_loop()
    for sig in (signal.SIGINT, signal.SIGTERM):
        loop.add_signal_handler(
            sig, lambda: asyncio.ensure_future(server.stop(5))
        )

    # Start the server
    await server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)
    # Wait for the server to be terminated
    await server.wait_for_termination()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the gRPC server.")
    parser.add_argument(
        "--addr", default="localhost:50051", help="The address to bind the server to."
    )
    args = parser.parse_args()

    asyncio.run(serve(args.addr))
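For reference, the servicer above can be exercised with the generated gRPC stubs, much as test.py below does; a minimal client sketch, assuming backend.py is already running and using an illustrative address and model name (Options follows the optname:optvalue convention parsed in LoadModel):

# Illustrative client for the MLX backend; address and model name are assumptions.
import grpc
import backend_pb2
import backend_pb2_grpc

with grpc.insecure_channel("localhost:50051") as channel:
    stub = backend_pb2_grpc.BackendStub(channel)
    # Options entries are "optname:optvalue" strings, parsed into self.options by LoadModel.
    res = stub.LoadModel(backend_pb2.ModelOptions(
        Model="mlx-community/Mistral-7B-Instruct-v0.3-4bit",  # illustrative MLX model
        Options=["temp:0.7", "top_p:0.9"],
    ))
    assert res.success, res.message
    reply = stub.Predict(backend_pb2.PredictOptions(Prompt="The capital of France is", Tokens=64))
    print(reply.message.decode("utf-8"))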
backend/python/rfdetr/protogen.sh → backend/python/mlx/install.sh (Normal file → Executable file, 8 lines changed)
@@ -1,13 +1,15 @@
#!/bin/bash
set -e

USE_PIP=true
PYTHON_VERSION=""

backend_dir=$(dirname $0)

if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

ensureVenv

python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto
installRequirements
backend/python/mlx/requirements-mps.txt (new file, 1 line)
@@ -0,0 +1 @@
mlx-lm
backend/python/mlx/requirements.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
grpcio==1.71.0
protobuf
certifi
setuptools
backend/python/mlx/run.sh (new executable file, 11 lines)
@@ -0,0 +1,11 @@
#!/bin/bash

backend_dir=$(dirname $0)

if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

startBackend $@
backend/python/mlx/test.py (new file, 146 lines)
@@ -0,0 +1,146 @@
import unittest
import subprocess
import time
import backend_pb2
import backend_pb2_grpc

import grpc

import unittest
import subprocess
import time
import grpc
import backend_pb2_grpc
import backend_pb2

class TestBackendServicer(unittest.TestCase):
    """
    TestBackendServicer is the class that tests the gRPC service.

    This class contains methods to test the startup and shutdown of the gRPC service.
    """
    def setUp(self):
        self.service = subprocess.Popen(["python", "backend.py", "--addr", "localhost:50051"])
        time.sleep(10)

    def tearDown(self) -> None:
        self.service.terminate()
        self.service.wait()

    def test_server_startup(self):
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.Health(backend_pb2.HealthMessage())
                self.assertEqual(response.message, b'OK')
        except Exception as err:
            print(err)
            self.fail("Server failed to start")
        finally:
            self.tearDown()

    def test_load_model(self):
        """
        This method tests if the model is loaded successfully
        """
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
                self.assertTrue(response.success)
                self.assertEqual(response.message, "Model loaded successfully")
        except Exception as err:
            print(err)
            self.fail("LoadModel service failed")
        finally:
            self.tearDown()

    def test_text(self):
        """
        This method tests if the embeddings are generated successfully
        """
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
                self.assertTrue(response.success)
                req = backend_pb2.PredictOptions(Prompt="The capital of France is")
                resp = stub.Predict(req)
                self.assertIsNotNone(resp.message)
        except Exception as err:
            print(err)
            self.fail("text service failed")
        finally:
            self.tearDown()

    def test_sampling_params(self):
        """
        This method tests if all sampling parameters are correctly processed
        NOTE: this does NOT test for correctness, just that we received a compatible response
        """
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
                self.assertTrue(response.success)

                req = backend_pb2.PredictOptions(
                    Prompt="The capital of France is",
                    TopP=0.8,
                    Tokens=50,
                    Temperature=0.7,
                    TopK=40,
                    PresencePenalty=0.1,
                    FrequencyPenalty=0.2,
                    RepetitionPenalty=1.1,
                    MinP=0.05,
                    Seed=42,
                    StopPrompts=["\n"],
                    StopTokenIds=[50256],
                    BadWords=["badword"],
                    IncludeStopStrInOutput=True,
                    IgnoreEOS=True,
                    MinTokens=5,
                    Logprobs=5,
                    PromptLogprobs=5,
                    SkipSpecialTokens=True,
                    SpacesBetweenSpecialTokens=True,
                    TruncatePromptTokens=10,
                    GuidedDecoding=True,
                    N=2,
                )
                resp = stub.Predict(req)
                self.assertIsNotNone(resp.message)
                self.assertIsNotNone(resp.logprobs)
        except Exception as err:
            print(err)
            self.fail("sampling params service failed")
        finally:
            self.tearDown()

    def test_embedding(self):
        """
        This method tests if the embeddings are generated successfully
        """
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="intfloat/e5-mistral-7b-instruct"))
                self.assertTrue(response.success)
                embedding_request = backend_pb2.PredictOptions(Embeddings="This is a test sentence.")
                embedding_response = stub.Embedding(embedding_request)
                self.assertIsNotNone(embedding_response.embeddings)
                # assert that is a list of floats
                self.assertIsInstance(embedding_response.embeddings, list)
                # assert that the list is not empty
                self.assertTrue(len(embedding_response.embeddings) > 0)
        except Exception as err:
            print(err)
            self.fail("Embedding service failed")
        finally:
            self.tearDown()
backend/python/mlx/test.sh (new executable file, 12 lines)
@@ -0,0 +1,12 @@
#!/bin/bash
set -e

backend_dir=$(dirname $0)

if [ -d $backend_dir/common ]; then
    source $backend_dir/common/libbackend.sh
else
    source $backend_dir/../common/libbackend.sh
fi

runUnittests
@@ -1,30 +1,24 @@
.PHONY: rerankers
rerankers: protogen
rerankers:
    bash install.sh

.PHONY: run
run: protogen
run: rerankers
    @echo "Running rerankers..."
    bash run.sh
    @echo "rerankers run."

# It is not working well by using command line. It only6 works with IDE like VSCode.
.PHONY: test
test: protogen
test: rerankers
    @echo "Testing rerankers..."
    bash test.sh
    @echo "rerankers tested."

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
    $(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
    python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto

.PHONY: clean
clean: protogen-clean
    rm -rf venv __pycache__
@@ -3,18 +3,11 @@
.PHONY: install
install:
    bash install.sh
    $(MAKE) protogen

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
    $(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
    bash protogen.sh

.PHONY: clean
clean: protogen-clean
    rm -rf venv __pycache__
@@ -1,30 +1,24 @@
.PHONY: transformers
transformers: protogen
transformers:
    bash install.sh

.PHONY: run
run: protogen
run: transformers
    @echo "Running transformers..."
    bash run.sh
    @echo "transformers run."

# It is not working well by using command line. It only6 works with IDE like VSCode.
.PHONY: test
test: protogen
test: transformers
    @echo "Testing transformers..."
    bash test.sh
    @echo "transformers tested."

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
    $(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
    python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto

.PHONY: clean
clean: protogen-clean
    rm -rf venv __pycache__
@@ -94,7 +94,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        self.SentenceTransformer = False

        device_map="cpu"

        mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
        if mps_available:
            device_map = "mps"
        quantization = None
        autoTokenizer = True
@@ -1,29 +1,23 @@
.PHONY: vllm
vllm: protogen
vllm:
    bash install.sh

.PHONY: run
run: protogen
run: vllm
    @echo "Running vllm..."
    bash run.sh
    @echo "vllm run."

.PHONY: test
test: protogen
test: vllm
    @echo "Testing vllm..."
    bash test.sh
    @echo "vllm tested."

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
    $(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
    python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto

.PHONY: clean
clean: protogen-clean
    rm -rf venv __pycache__