From e6ebfd3ba19d1bc749fa046ada2055341602a915 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Thu, 28 Aug 2025 16:25:18 +0100 Subject: [PATCH] feat(whisper-cpp): Convert to Purego and add VAD (#6087) * fix(ci): Avoid matching wrong backend with the same prefix Signed-off-by: Richard Palethorpe * chore(whisper): Use Purego and enable VAD This replaces the Whisper CGO bindings with our own Purego based module to make compilation easier. In addition this allows VAD models to be loaded by Whisper. There is not much benefit now except that the same backend can be used for VAD and transcription. Depending on upstream we may also be able to use GPU for VAD in the future, but presently it is disabled. Signed-off-by: Richard Palethorpe --------- Signed-off-by: Richard Palethorpe Co-authored-by: Ettore Di Giacinto --- .dockerignore | 4 + backend/go/whisper/.gitignore | 7 ++ backend/go/whisper/CMakeLists.txt | 12 ++ backend/go/whisper/Makefile | 112 +++++------------- backend/go/whisper/gowhisper.cpp | 146 +++++++++++++++++++++++ backend/go/whisper/gowhisper.go | 156 +++++++++++++++++++++++++ backend/go/whisper/gowhisper.h | 16 +++ backend/go/whisper/main.go | 28 ++++- backend/go/whisper/package.sh | 6 +- backend/go/whisper/whisper.go | 105 ----------------- core/http/endpoints/openai/realtime.go | 5 +- gallery/index.yaml | 19 ++- scripts/changed-backends.js | 6 +- 13 files changed, 424 insertions(+), 198 deletions(-) create mode 100644 backend/go/whisper/.gitignore create mode 100644 backend/go/whisper/CMakeLists.txt create mode 100644 backend/go/whisper/gowhisper.cpp create mode 100644 backend/go/whisper/gowhisper.go create mode 100644 backend/go/whisper/gowhisper.h delete mode 100644 backend/go/whisper/whisper.go diff --git a/.dockerignore b/.dockerignore index 854ef3db3..5b62e5f31 100644 --- a/.dockerignore +++ b/.dockerignore @@ -6,6 +6,10 @@ models backends examples/chatbot-ui/models backend/go/image/stablediffusion-ggml/build/ +backend/go/*/build 
+backend/go/*/.cache +backend/go/*/sources +backend/go/*/package examples/rwkv/models examples/**/models Dockerfile* diff --git a/backend/go/whisper/.gitignore b/backend/go/whisper/.gitignore new file mode 100644 index 000000000..017e34a10 --- /dev/null +++ b/backend/go/whisper/.gitignore @@ -0,0 +1,7 @@ +.cache/ +sources/ +build/ +package/ +whisper +libgowhisper.so + diff --git a/backend/go/whisper/CMakeLists.txt b/backend/go/whisper/CMakeLists.txt new file mode 100644 index 000000000..7a1a773e3 --- /dev/null +++ b/backend/go/whisper/CMakeLists.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 3.12) +project(gowhisper LANGUAGES C CXX) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +add_subdirectory(./sources/whisper.cpp) + +add_library(gowhisper MODULE gowhisper.cpp) +target_link_libraries(gowhisper PRIVATE whisper ggml stdc++fs) + +set_property(TARGET gowhisper PROPERTY CXX_STANDARD 17) +set_target_properties(gowhisper PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) diff --git a/backend/go/whisper/Makefile b/backend/go/whisper/Makefile index 36cc120d6..a976b3ae8 100644 --- a/backend/go/whisper/Makefile +++ b/backend/go/whisper/Makefile @@ -1,110 +1,53 @@ -GOCMD=go +CMAKE_ARGS?= +BUILD_TYPE?= NATIVE?=false -BUILD_TYPE?= -CMAKE_ARGS?= +GOCMD?=go +GO_TAGS?= +JOBS?=$(shell nproc --ignore=1) # whisper.cpp version WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp WHISPER_CPP_VERSION?=7745fcf32846006128f16de429cfe1677c963b30 -export WHISPER_CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF -export WHISPER_DIR=$(abspath ./sources/whisper.cpp) -export WHISPER_INCLUDE_PATH=$(WHISPER_DIR)/include:$(WHISPER_DIR)/ggml/include -export WHISPER_LIBRARY_PATH=$(WHISPER_DIR)/build/src/:$(WHISPER_DIR)/build/ggml/src +CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -CGO_LDFLAGS_WHISPER?= -CGO_LDFLAGS_WHISPER+=-lggml -CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF -CUDA_LIBPATH?=/usr/local/cuda/lib64/ - -ONEAPI_VERSION?=2025.2 - -# IF native is false, we 
add -DGGML_NATIVE=OFF to CMAKE_ARGS -ifeq ($(NATIVE),false) - CMAKE_ARGS+=-DGGML_NATIVE=OFF - WHISPER_CMAKE_ARGS+=-DGGML_NATIVE=OFF -endif -CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) ifeq ($(NATIVE),false) CMAKE_ARGS+=-DGGML_NATIVE=OFF endif -# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically + ifeq ($(BUILD_TYPE),cublas) - CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH) -L$(CUDA_LIBPATH)/stubs/ -lcuda CMAKE_ARGS+=-DGGML_CUDA=ON - CGO_LDFLAGS_WHISPER+=-lcufft -lggml-cuda - export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-cuda/ -# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -# to CMAKE_ARGS automatically else ifeq ($(BUILD_TYPE),openblas) CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path else ifeq ($(BUILD_TYPE),clblas) CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path -# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ else ifeq ($(BUILD_TYPE),hipblas) - ROCM_HOME ?= /opt/rocm - ROCM_PATH ?= /opt/rocm - LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib - export STABLE_BUILD_TYPE= - export CXX=$(ROCM_HOME)/llvm/bin/clang++ - export CC=$(ROCM_HOME)/llvm/bin/clang -# GPU_TARGETS ?= gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102 -# AMDGPU_TARGETS ?= "$(GPU_TARGETS)" - CMAKE_ARGS+=-DGGML_HIP=ON - CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib -L$(CURRENT_MAKEFILE_DIR)/sources/whisper.cpp/build/ggml/src/ggml-hip/ -lggml-hip -# CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)" + CMAKE_ARGS+=-DGGML_HIP=ON else ifeq ($(BUILD_TYPE),vulkan) - CMAKE_ARGS+=-DGGML_VULKAN=1 - CGO_LDFLAGS_WHISPER+=-lggml-vulkan -lvulkan - export 
WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-vulkan/ + CMAKE_ARGS+=-DGGML_VULKAN=ON else ifeq ($(OS),Darwin) - ifeq ($(BUILD_TYPE),) - BUILD_TYPE=metal - endif ifneq ($(BUILD_TYPE),metal) CMAKE_ARGS+=-DGGML_METAL=OFF - CGO_LDFLAGS_WHISPER+=-lggml-blas - export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas else CMAKE_ARGS+=-DGGML_METAL=ON CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON - CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON - CMAKE_ARGS+=-DGGML_OPENMP=OFF - CMAKE_ARGS+=-DWHISPER_BUILD_EXAMPLES=OFF - CMAKE_ARGS+=-DWHISPER_BUILD_TESTS=OFF - CMAKE_ARGS+=-DWHISPER_BUILD_SERVER=OFF - CGO_LDFLAGS += -framework Accelerate - CGO_LDFLAGS_WHISPER+=-lggml-metal -lggml-blas - export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-metal/:$(WHISPER_DIR)/build/ggml/src/ggml-blas endif - TARGET+=--target ggml-metal -endif - -ifneq (,$(findstring sycl,$(BUILD_TYPE))) - export CC=icx - export CXX=icpx - CGO_LDFLAGS_WHISPER += -fsycl -L${DNNLROOT}/lib -rpath ${ONEAPI_ROOT}/${ONEAPI_VERSION}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL -lggml-sycl - CGO_LDFLAGS_WHISPER += $(shell pkg-config --libs mkl-static-lp64-gomp) - CGO_CXXFLAGS_WHISPER += -fiopenmp -fopenmp-targets=spir64 - CGO_CXXFLAGS_WHISPER += $(shell pkg-config --cflags mkl-static-lp64-gomp ) - export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-sycl/ - CMAKE_ARGS+=-DGGML_SYCL=ON \ - -DCMAKE_C_COMPILER=icx \ - -DCMAKE_CXX_COMPILER=icpx \ - -DCMAKE_CXX_FLAGS="-fsycl" endif ifeq ($(BUILD_TYPE),sycl_f16) - CMAKE_ARGS+=-DGGML_SYCL_F16=ON + CMAKE_ARGS+=-DGGML_SYCL=ON \ + -DCMAKE_C_COMPILER=icx \ + -DCMAKE_CXX_COMPILER=icpx \ + -DGGML_SYCL_F16=ON endif -ifneq ($(OS),Darwin) - CGO_LDFLAGS_WHISPER+=-lgomp +ifeq ($(BUILD_TYPE),sycl_f32) + CMAKE_ARGS+=-DGGML_SYCL=ON \ + -DCMAKE_C_COMPILER=icx \ + -DCMAKE_CXX_COMPILER=icpx endif -## whisper 
sources/whisper.cpp: mkdir -p sources/whisper.cpp cd sources/whisper.cpp && \ @@ -114,18 +57,21 @@ sources/whisper.cpp: git checkout $(WHISPER_CPP_VERSION) && \ git submodule update --init --recursive --depth 1 --single-branch -sources/whisper.cpp/build/src/libwhisper.a: sources/whisper.cpp - cd sources/whisper.cpp && cmake $(CMAKE_ARGS) $(WHISPER_CMAKE_ARGS) . -B ./build - cd sources/whisper.cpp/build && cmake --build . --config Release +libgowhisper.so: sources/whisper.cpp CMakeLists.txt gowhisper.cpp gowhisper.h + mkdir -p build && \ + cd build && \ + cmake .. $(CMAKE_ARGS) && \ + cmake --build . --config Release -j$(JOBS) && \ + cd .. && \ + mv build/libgowhisper.so ./ -whisper: sources/whisper.cpp sources/whisper.cpp/build/src/libwhisper.a - $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp - $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go - CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="${WHISPER_INCLUDE_PATH}" LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" LD_LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" \ - CGO_CXXFLAGS="$(CGO_CXXFLAGS_WHISPER)" \ - $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o whisper ./ +whisper: main.go gowhisper.go libgowhisper.so + CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o whisper ./ package: bash package.sh build: whisper package + +clean: + rm -rf libgowhisper.so build whisper diff --git a/backend/go/whisper/gowhisper.cpp b/backend/go/whisper/gowhisper.cpp new file mode 100644 index 000000000..cd2d8d313 --- /dev/null +++ b/backend/go/whisper/gowhisper.cpp @@ -0,0 +1,146 @@ +#include "gowhisper.h" +#include "ggml-backend.h" +#include "whisper.h" +#include <vector> + +static struct whisper_vad_context *vctx; +static struct whisper_context *ctx; +static std::vector<float> flat_segs; + +static void ggml_log_cb(enum ggml_log_level level, const char* log, void* data) { + const char* level_str; + + if (!log) { + return; + } 
+ + switch (level) { + case GGML_LOG_LEVEL_DEBUG: + level_str = "DEBUG"; + break; + case GGML_LOG_LEVEL_INFO: + level_str = "INFO"; + break; + case GGML_LOG_LEVEL_WARN: + level_str = "WARN"; + break; + case GGML_LOG_LEVEL_ERROR: + level_str = "ERROR"; + break; + default: /* Potential future-proofing */ + level_str = "?????"; + break; + } + + fprintf(stderr, "[%-5s] ", level_str); + fputs(log, stderr); + fflush(stderr); +} + +int load_model(const char *const model_path) { + whisper_log_set(ggml_log_cb, nullptr); + ggml_backend_load_all(); + + struct whisper_context_params cparams = whisper_context_default_params(); + + ctx = whisper_init_from_file_with_params(model_path, cparams); + if (ctx == nullptr) { + fprintf(stderr, "error: Failed to init model as transcriber\n"); + return 1; + } + + return 0; +} + +int load_model_vad(const char *const model_path) { + whisper_log_set(ggml_log_cb, nullptr); + ggml_backend_load_all(); + + struct whisper_vad_context_params vcparams = + whisper_vad_default_context_params(); + + // XXX: Overridden to false in upstream due to performance? 
+ // vcparams.use_gpu = true; + + vctx = whisper_vad_init_from_file_with_params(model_path, vcparams); + if (vctx == nullptr) { + fprintf(stderr, "error: Failed to init model as VAD\n"); + return 1; + } + + return 0; +} + +int vad(float pcmf32[], size_t pcmf32_len, float **segs_out, + size_t *segs_out_len) { + if (vctx == nullptr || !whisper_vad_detect_speech(vctx, pcmf32, pcmf32_len)) { + fprintf(stderr, "error: failed to detect speech\n"); + return 1; + } + + struct whisper_vad_params params = whisper_vad_default_params(); + struct whisper_vad_segments *segs = + whisper_vad_segments_from_probs(vctx, params); + size_t segn = whisper_vad_segments_n_segments(segs); + + // fprintf(stderr, "Got segments %zd\n", segn); + + flat_segs.clear(); + + for (int i = 0; i < segn; i++) { + flat_segs.push_back(whisper_vad_segments_get_segment_t0(segs, i)); + flat_segs.push_back(whisper_vad_segments_get_segment_t1(segs, i)); + } + + // fprintf(stderr, "setting out variables: %p=%p -> %p, %p=%zx -> %zx\n", + // segs_out, *segs_out, flat_segs.data(), segs_out_len, *segs_out_len, + // flat_segs.size()); + *segs_out = flat_segs.data(); + *segs_out_len = flat_segs.size(); + + // fprintf(stderr, "freeing segs\n"); + whisper_vad_free_segments(segs); + + // fprintf(stderr, "returning\n"); + return 0; +} + +int transcribe(uint32_t threads, char *lang, bool translate, float pcmf32[], + size_t pcmf32_len, size_t *segs_out_len) { + whisper_full_params wparams = + whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + + wparams.n_threads = threads; + if (lang != nullptr && *lang != '\0') + wparams.language = lang; + else { + wparams.language = nullptr; + } + + wparams.translate = translate; + wparams.debug_mode = true; + wparams.print_progress = true; + + if (ctx == nullptr || whisper_full(ctx, wparams, pcmf32, pcmf32_len)) { + fprintf(stderr, "error: transcription failed\n"); + return 1; + } + + *segs_out_len = whisper_full_n_segments(ctx); + + return 0; +} + +const char *get_segment_text(int i) { + return whisper_full_get_segment_text(ctx, i); 
+} + +int64_t get_segment_t0(int i) { return whisper_full_get_segment_t0(ctx, i); } + +int64_t get_segment_t1(int i) { return whisper_full_get_segment_t1(ctx, i); } + +int n_tokens(int i) { return whisper_full_n_tokens(ctx, i); } + +int32_t get_token_id(int i, int j) { + return whisper_full_get_token_id(ctx, i, j); +} diff --git a/backend/go/whisper/gowhisper.go b/backend/go/whisper/gowhisper.go new file mode 100644 index 000000000..6b4ab4d9f --- /dev/null +++ b/backend/go/whisper/gowhisper.go @@ -0,0 +1,156 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "unsafe" + + "github.com/go-audio/wav" + "github.com/mudler/LocalAI/pkg/grpc/base" + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + "github.com/mudler/LocalAI/pkg/utils" +) + +var ( + CppLoadModel func(modelPath string) int + CppLoadModelVAD func(modelPath string) int + CppVAD func(pcmf32 []float32, pcmf32Size uintptr, segsOut unsafe.Pointer, segsOutLen unsafe.Pointer) int + CppTranscribe func(threads uint32, lang string, translate bool, pcmf32 []float32, pcmf32Len uintptr, segsOutLen unsafe.Pointer) int + CppGetSegmentText func(i int) string + CppGetSegmentStart func(i int) int64 + CppGetSegmentEnd func(i int) int64 + CppNTokens func(i int) int + CppGetTokenID func(i int, j int) int +) + +type Whisper struct { + base.SingleThread +} + +func (w *Whisper) Load(opts *pb.ModelOptions) error { + vadOnly := false + + for _, oo := range opts.Options { + if oo == "vad_only" { + vadOnly = true + } else { + fmt.Fprintf(os.Stderr, "Unrecognized option: %v\n", oo) + } + } + + if vadOnly { + if ret := CppLoadModelVAD(opts.ModelFile); ret != 0 { + return fmt.Errorf("Failed to load Whisper VAD model") + } + + return nil + } + + if ret := CppLoadModel(opts.ModelFile); ret != 0 { + return fmt.Errorf("Failed to load Whisper transcription model") + } + + return nil +} + +func (w *Whisper) VAD(req *pb.VADRequest) (pb.VADResponse, error) { + audio := req.Audio + // We expect 0xdeadbeef to be overwritten 
and if we see it in a stack trace we know it wasn't + segsPtr, segsLen := uintptr(0xdeadbeef), uintptr(0xdeadbeef) + segsPtrPtr, segsLenPtr := unsafe.Pointer(&segsPtr), unsafe.Pointer(&segsLen) + + if ret := CppVAD(audio, uintptr(len(audio)), segsPtrPtr, segsLenPtr); ret != 0 { + return pb.VADResponse{}, fmt.Errorf("Failed VAD") + } + + // Happens when CPP vector has not had any elements pushed to it + if segsPtr == 0 { + return pb.VADResponse{ + Segments: []*pb.VADSegment{}, + }, nil + } + + // unsafeptr warning is caused by segsPtr being on the stack and therefore being subject to stack copying AFAICT + // however the stack shouldn't have grown between setting segsPtr and now, also the memory pointed to is allocated by C++ + segs := unsafe.Slice((*float32)(unsafe.Pointer(segsPtr)), segsLen) + + vadSegments := []*pb.VADSegment{} + for i := range len(segs) >> 1 { + s := segs[2*i] / 100 + t := segs[2*i+1] / 100 + vadSegments = append(vadSegments, &pb.VADSegment{ + Start: s, + End: t, + }) + } + + return pb.VADResponse{ + Segments: vadSegments, + }, nil +} + +func (w *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptResult, error) { + dir, err := os.MkdirTemp("", "whisper") + if err != nil { + return pb.TranscriptResult{}, err + } + defer os.RemoveAll(dir) + + convertedPath := filepath.Join(dir, "converted.wav") + + if err := utils.AudioToWav(opts.Dst, convertedPath); err != nil { + return pb.TranscriptResult{}, err + } + + // Open samples + fh, err := os.Open(convertedPath) + if err != nil { + return pb.TranscriptResult{}, err + } + defer fh.Close() + + // Read samples + d := wav.NewDecoder(fh) + buf, err := d.FullPCMBuffer() + if err != nil { + return pb.TranscriptResult{}, err + } + + data := buf.AsFloat32Buffer().Data + segsLen := uintptr(0xdeadbeef) + segsLenPtr := unsafe.Pointer(&segsLen) + + if ret := CppTranscribe(opts.Threads, opts.Language, opts.Translate, data, uintptr(len(data)), segsLenPtr); ret != 0 { + return pb.TranscriptResult{}, 
fmt.Errorf("Failed Transcribe") + } + + segments := []*pb.TranscriptSegment{} + text := "" + for i := range int(segsLen) { + s := CppGetSegmentStart(i) + t := CppGetSegmentEnd(i) + txt := strings.Clone(CppGetSegmentText(i)) + tokens := make([]int32, CppNTokens(i)) + + for j := range tokens { + tokens[j] = int32(CppGetTokenID(i, j)) + } + segment := &pb.TranscriptSegment{ + Id: int32(i), + Text: txt, + Start: s, End: t, + Tokens: tokens, + } + + segments = append(segments, segment) + + text += " " + strings.TrimSpace(txt) + } + + return pb.TranscriptResult{ + Segments: segments, + Text: strings.TrimSpace(text), + }, nil +} diff --git a/backend/go/whisper/gowhisper.h b/backend/go/whisper/gowhisper.h new file mode 100644 index 000000000..2972a3203 --- /dev/null +++ b/backend/go/whisper/gowhisper.h @@ -0,0 +1,16 @@ +#include +#include + +extern "C" { +int load_model(const char *const model_path); +int load_model_vad(const char *const model_path); +int vad(float pcmf32[], size_t pcmf32_size, float **segs_out, + size_t *segs_out_len); +int transcribe(uint32_t threads, char *lang, bool translate, float pcmf32[], + size_t pcmf32_len, size_t *segs_out_len); +const char *get_segment_text(int i); +int64_t get_segment_t0(int i); +int64_t get_segment_t1(int i); +int n_tokens(int i); +int32_t get_token_id(int i, int j); +} diff --git a/backend/go/whisper/main.go b/backend/go/whisper/main.go index 6c66f517c..d466d6eb9 100644 --- a/backend/go/whisper/main.go +++ b/backend/go/whisper/main.go @@ -1,10 +1,10 @@ package main // Note: this is started internally by LocalAI and a server is allocated for each model - import ( "flag" + "github.com/ebitengine/purego" grpc "github.com/mudler/LocalAI/pkg/grpc" ) @@ -12,7 +12,33 @@ var ( addr = flag.String("addr", "localhost:50051", "the address to connect to") ) +type LibFuncs struct { + FuncPtr any + Name string +} + func main() { + gosd, err := purego.Dlopen("./libgowhisper.so", purego.RTLD_NOW|purego.RTLD_GLOBAL) + if err != nil { + 
panic(err) + } + + libFuncs := []LibFuncs{ + {&CppLoadModel, "load_model"}, + {&CppLoadModelVAD, "load_model_vad"}, + {&CppVAD, "vad"}, + {&CppTranscribe, "transcribe"}, + {&CppGetSegmentText, "get_segment_text"}, + {&CppGetSegmentStart, "get_segment_t0"}, + {&CppGetSegmentEnd, "get_segment_t1"}, + {&CppNTokens, "n_tokens"}, + {&CppGetTokenID, "get_token_id"}, + } + + for _, lf := range libFuncs { + purego.RegisterLibFunc(lf.FuncPtr, gosd, lf.Name) + } + flag.Parse() if err := grpc.StartServer(*addr, &Whisper{}); err != nil { diff --git a/backend/go/whisper/package.sh b/backend/go/whisper/package.sh index 3bda9695c..48e03e8e5 100755 --- a/backend/go/whisper/package.sh +++ b/backend/go/whisper/package.sh @@ -10,7 +10,7 @@ CURDIR=$(dirname "$(realpath $0)") # Create lib directory mkdir -p $CURDIR/package/lib -cp -avrf $CURDIR/whisper $CURDIR/package/ +cp -avrf $CURDIR/whisper $CURDIR/libgowhisper.so $CURDIR/package/ cp -rfv $CURDIR/run.sh $CURDIR/package/ # Detect architecture and copy appropriate libraries @@ -47,6 +47,6 @@ else exit 1 fi -echo "Packaging completed successfully" +echo "Packaging completed successfully" ls -liah $CURDIR/package/ -ls -liah $CURDIR/package/lib/ \ No newline at end of file +ls -liah $CURDIR/package/lib/ diff --git a/backend/go/whisper/whisper.go b/backend/go/whisper/whisper.go deleted file mode 100644 index 5c7ec0cbe..000000000 --- a/backend/go/whisper/whisper.go +++ /dev/null @@ -1,105 +0,0 @@ -package main - -// This is a wrapper to statisfy the GRPC service interface -// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc) -import ( - "os" - "path/filepath" - - "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper" - "github.com/go-audio/wav" - "github.com/mudler/LocalAI/pkg/grpc/base" - pb "github.com/mudler/LocalAI/pkg/grpc/proto" - "github.com/mudler/LocalAI/pkg/utils" -) - -type Whisper struct { - base.SingleThread - whisper whisper.Model -} - -func (sd *Whisper) 
Load(opts *pb.ModelOptions) error { - // Note: the Model here is a path to a directory containing the model files - w, err := whisper.New(opts.ModelFile) - sd.whisper = w - return err -} - -func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptResult, error) { - - dir, err := os.MkdirTemp("", "whisper") - if err != nil { - return pb.TranscriptResult{}, err - } - defer os.RemoveAll(dir) - - convertedPath := filepath.Join(dir, "converted.wav") - - if err := utils.AudioToWav(opts.Dst, convertedPath); err != nil { - return pb.TranscriptResult{}, err - } - - // Open samples - fh, err := os.Open(convertedPath) - if err != nil { - return pb.TranscriptResult{}, err - } - defer fh.Close() - - // Read samples - d := wav.NewDecoder(fh) - buf, err := d.FullPCMBuffer() - if err != nil { - return pb.TranscriptResult{}, err - } - - data := buf.AsFloat32Buffer().Data - - // Process samples - context, err := sd.whisper.NewContext() - if err != nil { - return pb.TranscriptResult{}, err - - } - - context.SetThreads(uint(opts.Threads)) - - if opts.Language != "" { - context.SetLanguage(opts.Language) - } else { - context.SetLanguage("auto") - } - - if opts.Translate { - context.SetTranslate(true) - } - - if err := context.Process(data, nil, nil, nil); err != nil { - return pb.TranscriptResult{}, err - } - - segments := []*pb.TranscriptSegment{} - text := "" - for { - s, err := context.NextSegment() - if err != nil { - break - } - - var tokens []int32 - for _, t := range s.Tokens { - tokens = append(tokens, int32(t.Id)) - } - - segment := &pb.TranscriptSegment{Id: int32(s.Num), Text: s.Text, Start: int64(s.Start), End: int64(s.End), Tokens: tokens} - segments = append(segments, segment) - - text += s.Text - } - - return pb.TranscriptResult{ - Segments: segments, - Text: text, - }, nil - -} diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go index 2e692b52a..bd55b720d 100644 --- a/core/http/endpoints/openai/realtime.go 
+++ b/core/http/endpoints/openai/realtime.go @@ -31,6 +31,7 @@ import ( const ( localSampleRate = 16000 remoteSampleRate = 24000 + vadModel = "silero-vad-ggml" ) // A model can be "emulated" that is: transcribe audio to text -> feed text to the LLM -> generate audio as result @@ -233,7 +234,7 @@ func registerRealtime(application *application.Application) func(c *websocket.Co // TODO: The API has no way to configure the VAD model or other models that make up a pipeline to fake any-to-any // So possibly we could have a way to configure a composite model that can be used in situations where any-to-any is expected pipeline := config.Pipeline{ - VAD: "silero-vad", + VAD: vadModel, Transcription: session.InputAudioTranscription.Model, } @@ -568,7 +569,7 @@ func updateTransSession(session *Session, update *types.ClientSession, cl *confi if trUpd != nil && trUpd.Model != "" && trUpd.Model != trCur.Model { pipeline := config.Pipeline{ - VAD: "silero-vad", + VAD: vadModel, Transcription: trUpd.Model, } diff --git a/gallery/index.yaml b/gallery/index.yaml index 712bbf278..414438bc2 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -20730,7 +20730,8 @@ - filename: nomic-embed-text-v1.5.f16.gguf uri: https://huggingface.co/mradermacher/nomic-embed-text-v1.5-GGUF/resolve/main/nomic-embed-text-v1.5.f16.gguf sha256: af8cb9e4ca0bf19eb54d08c612fdf325059264abbbd2c619527e5d2dda8de655 -- name: "silero-vad" +- &silero + name: "silero-vad" icon: https://github.com/snakers4/silero-models/raw/master/files/silero_logo.jpg url: github:mudler/LocalAI/gallery/virtual.yaml@master urls: @@ -20750,6 +20751,22 @@ - filename: silero-vad.onnx uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808 +- !!merge <<: *silero + name: "silero-vad-ggml" + urls: + - https://github.com/snakers4/silero-vad + - https://github.com/ggml-org/whisper.cpp + - https://huggingface.co/ggml-org/whisper-vad + 
overrides: + backend: whisper-vad + parameters: + model: ggml-silero-v5.1.2.bin + options: + - "vad_only" + files: + - filename: ggml-silero-v5.1.2.bin + uri: https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v5.1.2.bin + sha256: 29940d98d42b91fbd05ce489f3ecf7c72f0a42f027e4875919a28fb4c04ea2cf - &bark name: "bark-cpp" icon: https://avatars.githubusercontent.com/u/99442120 diff --git a/scripts/changed-backends.js b/scripts/changed-backends.js index b8e832d3e..717d713dc 100644 --- a/scripts/changed-backends.js +++ b/scripts/changed-backends.js @@ -47,13 +47,13 @@ async function getChangedFiles() { // Infer backend path function inferBackendPath(item) { if (item.dockerfile.endsWith("python")) { - return `backend/python/${item.backend}`; + return `backend/python/${item.backend}/`; } if (item.dockerfile.endsWith("golang")) { - return `backend/go/${item.backend}`; + return `backend/go/${item.backend}/`; } if (item.dockerfile.endsWith("llama-cpp")) { - return `backend/cpp/llama-cpp`; + return `backend/cpp/llama-cpp/`; } return null; }