From 20a70e1244dc43b38ce7cb24fb808e7b878dbb6f Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 6 Aug 2025 12:38:45 +0200 Subject: [PATCH] feat(backends): add KittenTTS (#5977) Signed-off-by: Ettore Di Giacinto --- .github/workflows/backend.yml | 13 +++ Makefile | 9 ++ backend/index.yaml | 18 +++ backend/python/kitten-tts/Makefile | 29 +++++ backend/python/kitten-tts/backend.py | 121 +++++++++++++++++++++ backend/python/kitten-tts/install.sh | 19 ++++ backend/python/kitten-tts/requirements.txt | 5 + backend/python/kitten-tts/run.sh | 9 ++ backend/python/kitten-tts/test.py | 82 ++++++++++++++ backend/python/kitten-tts/test.sh | 11 ++ gallery/index.yaml | 22 ++++ 11 files changed, 338 insertions(+) create mode 100644 backend/python/kitten-tts/Makefile create mode 100644 backend/python/kitten-tts/backend.py create mode 100755 backend/python/kitten-tts/install.sh create mode 100644 backend/python/kitten-tts/requirements.txt create mode 100755 backend/python/kitten-tts/run.sh create mode 100644 backend/python/kitten-tts/test.py create mode 100755 backend/python/kitten-tts/test.sh diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index fb05611e0..839d09c47 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -920,6 +920,19 @@ jobs: # backend: "rfdetr" # dockerfile: "./backend/Dockerfile.python" # context: "./backend" + # kitten-tts + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64,linux/arm64' + tag-latest: 'auto' + tag-suffix: '-kitten-tts' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:22.04" + skip-drivers: 'false' + backend: "kitten-tts" + dockerfile: "./backend/Dockerfile.python" + context: "./backend" llama-cpp-darwin: runs-on: macOS-14 strategy: diff --git a/Makefile b/Makefile index ef4f56725..16703f5e8 100644 --- a/Makefile +++ b/Makefile @@ -156,6 +156,9 @@ backends/huggingface: docker-build-huggingface docker-save-huggingface build backends/rfdetr: 
docker-build-rfdetr docker-save-rfdetr build ./local-ai backends install "ocifile://$(abspath ./backend-images/rfdetr.tar)" +backends/kitten-tts: docker-build-kitten-tts docker-save-kitten-tts build + ./local-ai backends install "ocifile://$(abspath ./backend-images/kitten-tts.tar)" + ######################################################## ## AIO tests ######################################################## @@ -369,6 +372,12 @@ docker-build-huggingface: docker-build-rfdetr: docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:rfdetr -f backend/Dockerfile.python --build-arg BACKEND=rfdetr ./backend +docker-build-kitten-tts: + docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:kitten-tts -f backend/Dockerfile.python --build-arg BACKEND=kitten-tts ./backend + +docker-save-kitten-tts: backend-images + docker save local-ai-backend:kitten-tts -o backend-images/kitten-tts.tar + docker-save-rfdetr: backend-images docker save local-ai-backend:rfdetr -o backend-images/rfdetr.tar diff --git a/backend/index.yaml b/backend/index.yaml index 17061106c..9167bbd2b 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -356,6 +356,24 @@ - LLM - huggingface license: MIT +- &kitten-tts + name: "kitten-tts" + uri: "quay.io/go-skynet/local-ai-backends:latest-kitten-tts" + mirrors: + - localai/localai-backends:latest-kitten-tts + urls: + - https://github.com/KittenML/KittenTTS + description: | + Kitten TTS is a text-to-speech model that can generate speech from text. 
+ tags: + - text-to-speech + - TTS + license: apache-2.0 +- !!merge <<: *kitten-tts + name: "kitten-tts-development" + uri: "quay.io/go-skynet/local-ai-backends:master-kitten-tts" + mirrors: + - localai/localai-backends:master-kitten-tts - !!merge <<: *huggingface name: "huggingface-development" uri: "quay.io/go-skynet/local-ai-backends:master-huggingface" diff --git a/backend/python/kitten-tts/Makefile b/backend/python/kitten-tts/Makefile new file mode 100644 index 000000000..f05fc1916 --- /dev/null +++ b/backend/python/kitten-tts/Makefile @@ -0,0 +1,29 @@ +.PHONY: kitten-tts +kitten-tts: protogen + bash install.sh + +.PHONY: run +run: protogen + @echo "Running kitten-tts..." + bash run.sh + @echo "kitten-tts run." + +.PHONY: test +test: protogen + @echo "Testing kitten-tts..." + bash test.sh + @echo "kitten-tts tested." + +.PHONY: protogen +protogen: backend_pb2_grpc.py backend_pb2.py + +.PHONY: protogen-clean +protogen-clean: + $(RM) backend_pb2_grpc.py backend_pb2.py + +backend_pb2_grpc.py backend_pb2.py: + python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. 
#!/usr/bin/env python3
"""
LocalAI gRPC backend for KittenTTS text-to-speech.

Implements the LocalAI backend protocol (Health / LoadModel / TTS) defined
in backend.proto.  LoadModel instantiates a KittenTTS model; TTS synthesizes
speech for the request text and writes a WAV file to the requested path.
"""
from concurrent import futures
import argparse
import os
import signal
import sys
import time

import backend_pb2
import backend_pb2_grpc

import torch
from kittentts import KittenTTS
import soundfile as sf

import grpc


_ONE_DAY_IN_SECONDS = 60 * 60 * 24

# Default HuggingFace model id used when the request does not name a model.
DEFAULT_MODEL = "KittenML/kitten-tts-nano-0.1"
# Default speaker voice used when the request does not name one.
DEFAULT_VOICE = "expr-voice-2-f"
# KittenTTS generates 24 kHz mono audio.
SAMPLE_RATE = 24000

# If MAX_WORKERS is specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
# Kept for parity with the other TTS backends; KittenTTS currently ignores
# language selection, so this value is read but unused.
KITTEN_LANGUAGE = os.environ.get('KITTEN_LANGUAGE', None)


class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    BackendServicer implements the LocalAI backend gRPC service.
    """

    def Health(self, request, context):
        """Liveness probe: always reply OK."""
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

    def LoadModel(self, request, context):
        """
        Load a KittenTTS model.

        Uses request.Model as the model id (default: DEFAULT_MODEL).  If
        request.CUDA is set but CUDA is unavailable, fail early with an
        explicit error instead of silently falling back to CPU.
        """
        if torch.cuda.is_available():
            print("CUDA is available", file=sys.stderr)
            device = "cuda"
        else:
            print("CUDA is not available", file=sys.stderr)
            device = "cpu"

        if not torch.cuda.is_available() and request.CUDA:
            return backend_pb2.Result(success=False, message="CUDA is not available")

        self.AudioPath = None
        # List available KittenTTS voices for the operator's convenience.
        print("Available KittenTTS voices: expr-voice-2-m, expr-voice-2-f, expr-voice-3-m, expr-voice-3-f, expr-voice-4-m, expr-voice-4-f, expr-voice-5-m, expr-voice-5-f")
        if os.path.isabs(request.AudioPath):
            self.AudioPath = request.AudioPath
        elif request.AudioPath and request.ModelFile != "" and not os.path.isabs(request.AudioPath):
            # Resolve a relative AudioPath against the model file's directory.
            modelFileBase = os.path.dirname(request.ModelFile)
            self.AudioPath = os.path.join(modelFileBase, request.AudioPath)

        try:
            print("Preparing KittenTTS model, please wait", file=sys.stderr)
            model_name = request.Model if request.Model else DEFAULT_MODEL
            self.tts = KittenTTS(model_name)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)

    def TTS(self, request, context):
        """
        Synthesize request.text with the loaded model and write a WAV file
        to request.dst.  request.voice selects the speaker (default:
        DEFAULT_VOICE).  Fails cleanly if no model has been loaded yet.
        """
        # Guard: calling TTS before a successful LoadModel used to raise
        # AttributeError; return a clear error result instead.
        if getattr(self, "tts", None) is None:
            return backend_pb2.Result(success=False, message="No model loaded: call LoadModel first")
        try:
            # KittenTTS has no language parameter; only the voice is honored.
            voice = request.voice if request.voice else DEFAULT_VOICE

            audio = self.tts.generate(request.text, voice=voice)

            # Persist the generated waveform at KittenTTS's native rate.
            sf.write(request.dst, audio, SAMPLE_RATE)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(success=True)


def serve(address):
    """Start the gRPC server on `address` and block until terminated."""
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
                         options=[
                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
                         ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)

    def signal_handler(sig, frame):
        # Log to stderr like every other diagnostic message in this module.
        print("Received termination signal. Shutting down...", file=sys.stderr)
        server.stop(0)
        sys.exit(0)

    # Shut down gracefully on SIGINT and SIGTERM.
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the gRPC server.")
    parser.add_argument(
        "--addr", default="localhost:50051", help="The address to bind the server to."
    )
    args = parser.parse_args()

    serve(args.addr)
"""
A test script to test the gRPC service exposed by the KittenTTS backend.
"""
import unittest
import subprocess
import time
import backend_pb2
import backend_pb2_grpc

import grpc

# Model id the KittenTTS backend actually supports (its default model).
TEST_MODEL = "KittenML/kitten-tts-nano-0.1"


class TestBackendServicer(unittest.TestCase):
    """
    TestBackendServicer is the class that tests the gRPC service
    """

    def setUp(self):
        """
        This method sets up the gRPC service by starting the server
        """
        self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
        # Give the backend time to download/load the model before connecting.
        time.sleep(30)

    def tearDown(self) -> None:
        """
        This method tears down the gRPC service by terminating the server
        """
        self.service.terminate()
        self.service.wait()

    def test_server_startup(self):
        """
        This method tests if the server starts up successfully
        """
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.Health(backend_pb2.HealthMessage())
                self.assertEqual(response.message, b'OK')
        except Exception as err:
            print(err)
            self.fail("Server failed to start")
        finally:
            self.tearDown()

    def test_load_model(self):
        """
        This method tests if the model is loaded successfully
        """
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                # Use a KittenTTS model id, not a Coqui-TTS one: the backend
                # cannot load "tts_models/en/vctk/vits".
                response = stub.LoadModel(backend_pb2.ModelOptions(Model=TEST_MODEL))
                print(response)
                self.assertTrue(response.success)
                self.assertEqual(response.message, "Model loaded successfully")
        except Exception as err:
            print(err)
            self.fail("LoadModel service failed")
        finally:
            self.tearDown()

    def test_tts(self):
        """
        This method tests if speech audio is generated successfully
        """
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.LoadModel(backend_pb2.ModelOptions(Model=TEST_MODEL))
                self.assertTrue(response.success)
                tts_request = backend_pb2.TTSRequest(text="80s TV news production music hit for tonight's biggest story")
                tts_response = stub.TTS(tts_request)
                self.assertIsNotNone(tts_response)
        except Exception as err:
            print(err)
            self.fail("TTS service failed")
        finally:
            self.tearDown()
$backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +runUnittests diff --git a/gallery/index.yaml b/gallery/index.yaml index 936f8ff52..9231cf152 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1,4 +1,26 @@ --- +- name: "kitten-tts" + url: "github:mudler/LocalAI/gallery/virtual.yaml@master" + urls: + - https://github.com/KittenML/KittenTTS + license: apache-2.0 + tags: + - tts + - kitten-tts + - gpu + - cpu + - text-to-speech + description: | + Kitten TTS is an open-source realistic text-to-speech model with just 15 million parameters, designed for lightweight deployment and high-quality voice synthesis. + overrides: + backend: "kitten-tts" + name: "kitten-tts" + description: "Kitten TTS is a text-to-speech model that can generate speech from text." + parameters: + model: "KittenML/kitten-tts-nano-0.1" + voice: "expr-voice-5-f" + known_usecases: + - tts - &qwenimage name: "qwen-image" url: "github:mudler/LocalAI/gallery/qwen-image.yaml@master"