feat(chatterbox): support multilingual (#6240)

* feat(chatterbox): support multilingual

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Add l4t support

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: switch to fork

Until https://github.com/resemble-ai/chatterbox/pull/295 is merged

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-01-06 18:49:55 -06:00 · 2025-09-24 18:37:37 +02:00
parent b5efc4f89e
commit c85d559919
11 changed files with 107 additions and 21 deletions
--- a/backend/python/chatterbox/backend.py
+++ b/backend/python/chatterbox/backend.py
@@ -14,9 +14,23 @@ import backend_pb2_grpc
 import torch
 import torchaudio as ta
 from chatterbox.tts import ChatterboxTTS
-
+from chatterbox.mtl_tts import ChatterboxMultilingualTTS
 import grpc

+def is_float(s):
+    """Check if a string can be converted to float."""
+    try:
+        float(s)
+        return True
+    except ValueError:
+        return False
+def is_int(s):
+    """Check if a string can be converted to int."""
+    try:
+        int(s)
+        return True
+    except ValueError:
+        return False

 _ONE_DAY_IN_SECONDS = 60 * 60 * 24

@@ -47,6 +61,28 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        if not torch.cuda.is_available() and request.CUDA:
            return backend_pb2.Result(success=False, message="CUDA is not available")

+
+        options = request.Options
+
+        # empty dict
+        self.options = {}
+
+        # The options are a list of strings in this form optname:optvalue
+        # We are storing all the options in a dict so we can use it later when
+        # generating the audio
+        for opt in options:
+            if ":" not in opt:
+                continue
+            key, value = opt.split(":")
+            # if value is a number, convert it to the appropriate type
+            if is_float(value):
+                value = float(value)
+            elif is_int(value):
+                value = int(value)
+            elif value.lower() in ["true", "false"]:
+                value = value.lower() == "true"
+            self.options[key] = value
+
        self.AudioPath = None

        if os.path.isabs(request.AudioPath):
@@ -56,10 +92,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            modelFileBase = os.path.dirname(request.ModelFile)
            # modify LoraAdapter to be relative to modelFileBase
            self.AudioPath = os.path.join(modelFileBase, request.AudioPath)
-
        try:
            print("Preparing models, please wait", file=sys.stderr)
-            self.model = ChatterboxTTS.from_pretrained(device=device)
+            if "multilingual" in self.options:
+                # remove key from options
+                del self.options["multilingual"]
+                self.model = ChatterboxMultilingualTTS.from_pretrained(device=device)
+            else:
+                self.model = ChatterboxTTS.from_pretrained(device=device)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        # Implement your logic here for the LoadModel service
@@ -68,12 +108,18 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

    def TTS(self, request, context):
        try:
-            # Generate audio using ChatterboxTTS
+            kwargs = {}
+
+            if "language" in self.options:
+                kwargs["language_id"] = self.options["language"]
            if self.AudioPath is not None:
-                wav = self.model.generate(request.text, audio_prompt_path=self.AudioPath)
-            else:
-                wav = self.model.generate(request.text)
-            
+                kwargs["audio_prompt_path"] = self.AudioPath
+
+            # add options to kwargs
+            kwargs.update(self.options)
+
+            # Generate audio using ChatterboxTTS
+            wav = self.model.generate(request.text, **kwargs)
            # Save the generated audio
            ta.save(request.dst, wav, self.model.sr)
            
--- a/backend/python/chatterbox/install.sh
+++ b/backend/python/chatterbox/install.sh
@@ -15,5 +15,6 @@ fi
 if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi
+EXTRA_PIP_INSTALL_FLAGS+=" --no-build-isolation"

 installRequirements
--- a/backend/python/chatterbox/requirements-cpu.txt
+++ b/backend/python/chatterbox/requirements-cpu.txt
@@ -1,6 +1,8 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 accelerate
-torch==2.6.0
-torchaudio==2.6.0
-transformers==4.46.3
-chatterbox-tts==0.1.2
+torch
+torchaudio
+transformers
+# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
+chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
+#chatterbox-tts==0.1.4
--- a/backend/python/chatterbox/requirements-cublas11.txt
+++ b/backend/python/chatterbox/requirements-cublas11.txt
@@ -2,5 +2,6 @@
 torch==2.6.0+cu118
 torchaudio==2.6.0+cu118
 transformers==4.46.3
-chatterbox-tts==0.1.2
+# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
+chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
 accelerate
--- a/backend/python/chatterbox/requirements-cublas12.txt
+++ b/backend/python/chatterbox/requirements-cublas12.txt
@@ -1,5 +1,6 @@
-torch==2.6.0
-torchaudio==2.6.0
-transformers==4.46.3
-chatterbox-tts==0.1.2
+torch
+torchaudio
+transformers
+# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
+chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
 accelerate
--- a/backend/python/chatterbox/requirements-hipblas.txt
+++ b/backend/python/chatterbox/requirements-hipblas.txt
@@ -1,6 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 torch==2.6.0+rocm6.1
 torchaudio==2.6.0+rocm6.1
-transformers==4.46.3
-chatterbox-tts==0.1.2
+transformers
+# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
+chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
 accelerate
--- a/backend/python/chatterbox/requirements-intel.txt
+++ b/backend/python/chatterbox/requirements-intel.txt
@@ -2,8 +2,9 @@
 intel-extension-for-pytorch==2.3.110+xpu
 torch==2.3.1+cxx11.abi
 torchaudio==2.3.1+cxx11.abi
-transformers==4.46.3
-chatterbox-tts==0.1.2
+transformers
+# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
+chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
 accelerate
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
--- a/backend/python/chatterbox/requirements-l4t.txt
+++ b/backend/python/chatterbox/requirements-l4t.txt
@@ -0,0 +1,6 @@
+--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu126/
+torch
+torchaudio
+transformers
+chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
+accelerate