From c85d5599192c55fcfbd425e4a1b69c5aac77dc25 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 24 Sep 2025 18:37:37 +0200 Subject: [PATCH] feat(chatterbox): support multilingual (#6240) * feat(chatterbox): support multilingual Signed-off-by: Ettore Di Giacinto * Add l4t support Signed-off-by: Ettore Di Giacinto * Fixups Signed-off-by: Ettore Di Giacinto * fix: switch to fork Until https://github.com/resemble-ai/chatterbox/pull/295 is merged Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto --- .github/workflows/backend.yml | 12 ++++ Makefile | 3 + backend/index.yaml | 12 ++++ backend/python/chatterbox/backend.py | 62 ++++++++++++++++--- backend/python/chatterbox/install.sh | 1 + .../python/chatterbox/requirements-cpu.txt | 10 +-- .../chatterbox/requirements-cublas11.txt | 3 +- .../chatterbox/requirements-cublas12.txt | 9 +-- .../chatterbox/requirements-hipblas.txt | 5 +- .../python/chatterbox/requirements-intel.txt | 5 +- .../python/chatterbox/requirements-l4t.txt | 6 ++ 11 files changed, 107 insertions(+), 21 deletions(-) create mode 100644 backend/python/chatterbox/requirements-l4t.txt diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index b303de1f4..048e9a47b 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -955,6 +955,18 @@ jobs: backend: "exllama2" dockerfile: "./backend/Dockerfile.python" context: "./backend" + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "0" + platforms: 'linux/arm64' + skip-drivers: 'true' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-arm64-chatterbox' + base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0" + runs-on: 'ubuntu-24.04-arm' + backend: "chatterbox" + dockerfile: "./backend/Dockerfile.python" + context: "./backend" # runs out of space on the runner # - build-type: 'hipblas' # cuda-major-version: "" diff --git a/Makefile b/Makefile index 24502b57b..5d32926ae 100644 --- a/Makefile +++ b/Makefile @@ -429,6 
+429,9 @@ docker-build-kitten-tts: docker-save-kitten-tts: backend-images docker save local-ai-backend:kitten-tts -o backend-images/kitten-tts.tar +docker-save-chatterbox: backend-images + docker save local-ai-backend:chatterbox -o backend-images/chatterbox.tar + docker-build-kokoro: docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:kokoro -f backend/Dockerfile.python --build-arg BACKEND=kokoro ./backend diff --git a/backend/index.yaml b/backend/index.yaml index e078391d6..c55df2636 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -353,6 +353,7 @@ nvidia: "cuda12-chatterbox" metal: "metal-chatterbox" default: "cpu-chatterbox" + nvidia-l4t: "nvidia-l4t-arm64-chatterbox" - &piper name: "piper" uri: "quay.io/go-skynet/local-ai-backends:latest-piper" @@ -1239,6 +1240,7 @@ nvidia: "cuda12-chatterbox-development" metal: "metal-chatterbox-development" default: "cpu-chatterbox-development" + nvidia-l4t: "nvidia-l4t-arm64-chatterbox" - !!merge <<: *chatterbox name: "cpu-chatterbox" uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-chatterbox" @@ -1249,6 +1251,16 @@ uri: "quay.io/go-skynet/local-ai-backends:master-cpu-chatterbox" mirrors: - localai/localai-backends:master-cpu-chatterbox +- !!merge <<: *chatterbox + name: "nvidia-l4t-arm64-chatterbox" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-l4t-arm64-chatterbox" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-l4t-arm64-chatterbox +- !!merge <<: *chatterbox + name: "nvidia-l4t-arm64-chatterbox-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-l4t-arm64-chatterbox" + mirrors: + - localai/localai-backends:master-gpu-nvidia-l4t-arm64-chatterbox - !!merge <<: *chatterbox name: "metal-chatterbox" uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-chatterbox" diff --git a/backend/python/chatterbox/backend.py b/backend/python/chatterbox/backend.py index 0944202b9..4cc45b7c7 100644 --- 
a/backend/python/chatterbox/backend.py +++ b/backend/python/chatterbox/backend.py @@ -14,9 +14,23 @@ import backend_pb2_grpc import torch import torchaudio as ta from chatterbox.tts import ChatterboxTTS - +from chatterbox.mtl_tts import ChatterboxMultilingualTTS import grpc +def is_float(s): + """Check if a string can be converted to float.""" + try: + float(s) + return True + except ValueError: + return False +def is_int(s): + """Check if a string can be converted to int.""" + try: + int(s) + return True + except ValueError: + return False _ONE_DAY_IN_SECONDS = 60 * 60 * 24 @@ -47,6 +61,28 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): if not torch.cuda.is_available() and request.CUDA: return backend_pb2.Result(success=False, message="CUDA is not available") + + options = request.Options + + # empty dict + self.options = {} + + # The options are a list of strings in this form optname:optvalue + # We are storing all the options in a dict so we can use it later when + # generating the audio + for opt in options: + if ":" not in opt: + continue + key, value = opt.split(":", 1) + # if value is a number, convert it to the appropriate type + if is_int(value): + value = int(value) + elif is_float(value): + value = float(value) + elif value.lower() in ["true", "false"]: + value = value.lower() == "true" + self.options[key] = value + self.AudioPath = None if os.path.isabs(request.AudioPath): @@ -56,10 +92,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): modelFileBase = os.path.dirname(request.ModelFile) # modify LoraAdapter to be relative to modelFileBase self.AudioPath = os.path.join(modelFileBase, request.AudioPath) - try: print("Preparing models, please wait", file=sys.stderr) - self.model = ChatterboxTTS.from_pretrained(device=device) + if "multilingual" in self.options: + # remove key from options + del self.options["multilingual"] + self.model = ChatterboxMultilingualTTS.from_pretrained(device=device) + else: + self.model = 
ChatterboxTTS.from_pretrained(device=device) except Exception as err: return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") # Implement your logic here for the LoadModel service @@ -68,12 +108,18 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): def TTS(self, request, context): try: - # Generate audio using ChatterboxTTS + kwargs = {} + + if "language" in self.options: + kwargs["language_id"] = self.options["language"] if self.AudioPath is not None: - wav = self.model.generate(request.text, audio_prompt_path=self.AudioPath) - else: - wav = self.model.generate(request.text) - + kwargs["audio_prompt_path"] = self.AudioPath + + # add remaining options to kwargs, skipping "language" (already mapped to language_id) + kwargs.update({k: v for k, v in self.options.items() if k != "language"}) + + # Generate audio using ChatterboxTTS + wav = self.model.generate(request.text, **kwargs) # Save the generated audio ta.save(request.dst, wav, self.model.sr) diff --git a/backend/python/chatterbox/install.sh b/backend/python/chatterbox/install.sh index 32befa8e6..8f607485b 100755 --- a/backend/python/chatterbox/install.sh +++ b/backend/python/chatterbox/install.sh @@ -15,5 +15,6 @@ fi if [ "x${BUILD_PROFILE}" == "xintel" ]; then EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match" fi +EXTRA_PIP_INSTALL_FLAGS+=" --no-build-isolation" installRequirements diff --git a/backend/python/chatterbox/requirements-cpu.txt b/backend/python/chatterbox/requirements-cpu.txt index 4d9cf55cb..625d5a509 100644 --- a/backend/python/chatterbox/requirements-cpu.txt +++ b/backend/python/chatterbox/requirements-cpu.txt @@ -1,6 +1,8 @@ --extra-index-url https://download.pytorch.org/whl/cpu accelerate -torch==2.6.0 -torchaudio==2.6.0 -transformers==4.46.3 -chatterbox-tts==0.1.2 \ No newline at end of file +torch +torchaudio +transformers +# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289 +chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster +#chatterbox-tts==0.1.4 \ No newline at end of file diff --git 
a/backend/python/chatterbox/requirements-cublas11.txt b/backend/python/chatterbox/requirements-cublas11.txt index 1d5f08e2d..6dbeb19ec 100644 --- a/backend/python/chatterbox/requirements-cublas11.txt +++ b/backend/python/chatterbox/requirements-cublas11.txt @@ -2,5 +2,6 @@ torch==2.6.0+cu118 torchaudio==2.6.0+cu118 transformers==4.46.3 -chatterbox-tts==0.1.2 +# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289 +chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster accelerate \ No newline at end of file diff --git a/backend/python/chatterbox/requirements-cublas12.txt b/backend/python/chatterbox/requirements-cublas12.txt index 3e97fda28..84b9b6f80 100644 --- a/backend/python/chatterbox/requirements-cublas12.txt +++ b/backend/python/chatterbox/requirements-cublas12.txt @@ -1,5 +1,6 @@ -torch==2.6.0 -torchaudio==2.6.0 -transformers==4.46.3 -chatterbox-tts==0.1.2 +torch +torchaudio +transformers +# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289 +chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster accelerate diff --git a/backend/python/chatterbox/requirements-hipblas.txt b/backend/python/chatterbox/requirements-hipblas.txt index 9086928d7..458ad44f4 100644 --- a/backend/python/chatterbox/requirements-hipblas.txt +++ b/backend/python/chatterbox/requirements-hipblas.txt @@ -1,6 +1,7 @@ --extra-index-url https://download.pytorch.org/whl/rocm6.0 torch==2.6.0+rocm6.1 torchaudio==2.6.0+rocm6.1 -transformers==4.46.3 -chatterbox-tts==0.1.2 +transformers +# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289 +chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster accelerate diff --git a/backend/python/chatterbox/requirements-intel.txt b/backend/python/chatterbox/requirements-intel.txt index d4cb49848..b011a20c3 100644 --- a/backend/python/chatterbox/requirements-intel.txt +++ b/backend/python/chatterbox/requirements-intel.txt @@ -2,8 +2,9 @@ 
intel-extension-for-pytorch==2.3.110+xpu torch==2.3.1+cxx11.abi torchaudio==2.3.1+cxx11.abi -transformers==4.46.3 -chatterbox-tts==0.1.2 +transformers +# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289 +chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster accelerate oneccl_bind_pt==2.3.100+xpu optimum[openvino] diff --git a/backend/python/chatterbox/requirements-l4t.txt b/backend/python/chatterbox/requirements-l4t.txt new file mode 100644 index 000000000..6f90be031 --- /dev/null +++ b/backend/python/chatterbox/requirements-l4t.txt @@ -0,0 +1,6 @@ +--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu126/ +torch +torchaudio +transformers +chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster +accelerate