diff --git a/.dockerignore b/.dockerignore index 854ef3db3..5b62e5f31 100644 --- a/.dockerignore +++ b/.dockerignore @@ -6,6 +6,10 @@ models backends examples/chatbot-ui/models backend/go/image/stablediffusion-ggml/build/ +backend/go/*/build +backend/go/*/.cache +backend/go/*/sources +backend/go/*/package examples/rwkv/models examples/**/models Dockerfile* diff --git a/backend/go/whisper/.gitignore b/backend/go/whisper/.gitignore new file mode 100644 index 000000000..017e34a10 --- /dev/null +++ b/backend/go/whisper/.gitignore @@ -0,0 +1,7 @@ +.cache/ +sources/ +build/ +package/ +whisper +libgowhisper.so + diff --git a/backend/go/whisper/CMakeLists.txt b/backend/go/whisper/CMakeLists.txt new file mode 100644 index 000000000..7a1a773e3 --- /dev/null +++ b/backend/go/whisper/CMakeLists.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 3.12) +project(gowhisper LANGUAGES C CXX) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +add_subdirectory(./sources/whisper.cpp) + +add_library(gowhisper MODULE gowhisper.cpp) +target_link_libraries(gowhisper PRIVATE whisper ggml stdc++fs) + +set_property(TARGET gowhisper PROPERTY CXX_STANDARD 17) +set_target_properties(gowhisper PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) diff --git a/backend/go/whisper/Makefile b/backend/go/whisper/Makefile index 36cc120d6..a976b3ae8 100644 --- a/backend/go/whisper/Makefile +++ b/backend/go/whisper/Makefile @@ -1,110 +1,53 @@ -GOCMD=go +CMAKE_ARGS?= +BUILD_TYPE?= NATIVE?=false -BUILD_TYPE?= -CMAKE_ARGS?= +GOCMD?=go +GO_TAGS?= +JOBS?=$(shell nproc --ignore=1) # whisper.cpp version WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp WHISPER_CPP_VERSION?=7745fcf32846006128f16de429cfe1677c963b30 -export WHISPER_CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF -export WHISPER_DIR=$(abspath ./sources/whisper.cpp) -export WHISPER_INCLUDE_PATH=$(WHISPER_DIR)/include:$(WHISPER_DIR)/ggml/include -export 
WHISPER_LIBRARY_PATH=$(WHISPER_DIR)/build/src/:$(WHISPER_DIR)/build/ggml/src +CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -CGO_LDFLAGS_WHISPER?= -CGO_LDFLAGS_WHISPER+=-lggml -CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF -CUDA_LIBPATH?=/usr/local/cuda/lib64/ - -ONEAPI_VERSION?=2025.2 - -# IF native is false, we add -DGGML_NATIVE=OFF to CMAKE_ARGS -ifeq ($(NATIVE),false) - CMAKE_ARGS+=-DGGML_NATIVE=OFF - WHISPER_CMAKE_ARGS+=-DGGML_NATIVE=OFF -endif -CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) ifeq ($(NATIVE),false) CMAKE_ARGS+=-DGGML_NATIVE=OFF endif -# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically + ifeq ($(BUILD_TYPE),cublas) - CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH) -L$(CUDA_LIBPATH)/stubs/ -lcuda CMAKE_ARGS+=-DGGML_CUDA=ON - CGO_LDFLAGS_WHISPER+=-lcufft -lggml-cuda - export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-cuda/ -# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -# to CMAKE_ARGS automatically else ifeq ($(BUILD_TYPE),openblas) CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path else ifeq ($(BUILD_TYPE),clblas) CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path -# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ else ifeq ($(BUILD_TYPE),hipblas) - ROCM_HOME ?= /opt/rocm - ROCM_PATH ?= /opt/rocm - LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib - export STABLE_BUILD_TYPE= - export CXX=$(ROCM_HOME)/llvm/bin/clang++ - export CC=$(ROCM_HOME)/llvm/bin/clang -# GPU_TARGETS ?= gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102 -# AMDGPU_TARGETS ?= "$(GPU_TARGETS)" - CMAKE_ARGS+=-DGGML_HIP=ON - CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib 
-L$(CURRENT_MAKEFILE_DIR)/sources/whisper.cpp/build/ggml/src/ggml-hip/ -lggml-hip -# CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)" + CMAKE_ARGS+=-DGGML_HIPBLAS=ON else ifeq ($(BUILD_TYPE),vulkan) - CMAKE_ARGS+=-DGGML_VULKAN=1 - CGO_LDFLAGS_WHISPER+=-lggml-vulkan -lvulkan - export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-vulkan/ + CMAKE_ARGS+=-DGGML_VULKAN=ON else ifeq ($(OS),Darwin) - ifeq ($(BUILD_TYPE),) - BUILD_TYPE=metal - endif ifneq ($(BUILD_TYPE),metal) CMAKE_ARGS+=-DGGML_METAL=OFF - CGO_LDFLAGS_WHISPER+=-lggml-blas - export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas else CMAKE_ARGS+=-DGGML_METAL=ON CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON - CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON - CMAKE_ARGS+=-DGGML_OPENMP=OFF - CMAKE_ARGS+=-DWHISPER_BUILD_EXAMPLES=OFF - CMAKE_ARGS+=-DWHISPER_BUILD_TESTS=OFF - CMAKE_ARGS+=-DWHISPER_BUILD_SERVER=OFF - CGO_LDFLAGS += -framework Accelerate - CGO_LDFLAGS_WHISPER+=-lggml-metal -lggml-blas - export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-metal/:$(WHISPER_DIR)/build/ggml/src/ggml-blas endif - TARGET+=--target ggml-metal -endif - -ifneq (,$(findstring sycl,$(BUILD_TYPE))) - export CC=icx - export CXX=icpx - CGO_LDFLAGS_WHISPER += -fsycl -L${DNNLROOT}/lib -rpath ${ONEAPI_ROOT}/${ONEAPI_VERSION}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL -lggml-sycl - CGO_LDFLAGS_WHISPER += $(shell pkg-config --libs mkl-static-lp64-gomp) - CGO_CXXFLAGS_WHISPER += -fiopenmp -fopenmp-targets=spir64 - CGO_CXXFLAGS_WHISPER += $(shell pkg-config --cflags mkl-static-lp64-gomp ) - export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-sycl/ - CMAKE_ARGS+=-DGGML_SYCL=ON \ - -DCMAKE_C_COMPILER=icx \ - -DCMAKE_CXX_COMPILER=icpx \ - -DCMAKE_CXX_FLAGS="-fsycl" endif ifeq ($(BUILD_TYPE),sycl_f16) - 
CMAKE_ARGS+=-DGGML_SYCL_F16=ON + CMAKE_ARGS+=-DGGML_SYCL=ON \ + -DCMAKE_C_COMPILER=icx \ + -DCMAKE_CXX_COMPILER=icpx \ + -DGGML_SYCL_F16=ON endif -ifneq ($(OS),Darwin) - CGO_LDFLAGS_WHISPER+=-lgomp +ifeq ($(BUILD_TYPE),sycl_f32) + CMAKE_ARGS+=-DGGML_SYCL=ON \ + -DCMAKE_C_COMPILER=icx \ + -DCMAKE_CXX_COMPILER=icpx endif -## whisper sources/whisper.cpp: mkdir -p sources/whisper.cpp cd sources/whisper.cpp && \ @@ -114,18 +57,21 @@ sources/whisper.cpp: git checkout $(WHISPER_CPP_VERSION) && \ git submodule update --init --recursive --depth 1 --single-branch -sources/whisper.cpp/build/src/libwhisper.a: sources/whisper.cpp - cd sources/whisper.cpp && cmake $(CMAKE_ARGS) $(WHISPER_CMAKE_ARGS) . -B ./build - cd sources/whisper.cpp/build && cmake --build . --config Release +libgowhisper.so: sources/whisper.cpp CMakeLists.txt gowhisper.cpp gowhisper.h + mkdir -p build && \ + cd build && \ + cmake .. $(CMAKE_ARGS) && \ + cmake --build . --config Release -j$(JOBS) && \ + cd .. && \ + mv build/libgowhisper.so ./ -whisper: sources/whisper.cpp sources/whisper.cpp/build/src/libwhisper.a - $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp - $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go - CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="${WHISPER_INCLUDE_PATH}" LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" LD_LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" \ - CGO_CXXFLAGS="$(CGO_CXXFLAGS_WHISPER)" \ - $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o whisper ./ +whisper: main.go gowhisper.go libgowhisper.so + CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o whisper ./ package: bash package.sh build: whisper package + +clean: + rm -rf libgowhisper.so build whisper diff --git a/backend/go/whisper/gowhisper.cpp b/backend/go/whisper/gowhisper.cpp new file mode 100644 index 000000000..cd2d8d313 --- /dev/null +++ b/backend/go/whisper/gowhisper.cpp @@ -0,0 +1,146 @@
+#include "gowhisper.h" +#include "ggml-backend.h" +#include "whisper.h" +#include <vector> + +static struct whisper_vad_context *vctx; +static struct whisper_context *ctx; +static std::vector<float> flat_segs; + +static void ggml_log_cb(enum ggml_log_level level, const char* log, void* data) { + const char* level_str; + + if (!log) { + return; + } + + switch (level) { + case GGML_LOG_LEVEL_DEBUG: + level_str = "DEBUG"; + break; + case GGML_LOG_LEVEL_INFO: + level_str = "INFO"; + break; + case GGML_LOG_LEVEL_WARN: + level_str = "WARN"; + break; + case GGML_LOG_LEVEL_ERROR: + level_str = "ERROR"; + break; + default: /* Potential future-proofing */ + level_str = "?????"; + break; + } + + fprintf(stderr, "[%-5s] ", level_str); + fputs(log, stderr); + fflush(stderr); +} + +int load_model(const char *const model_path) { + whisper_log_set(ggml_log_cb, nullptr); + ggml_backend_load_all(); + + struct whisper_context_params cparams = whisper_context_default_params(); + + ctx = whisper_init_from_file_with_params(model_path, cparams); + if (ctx == nullptr) { + fprintf(stderr, "error: Also failed to init model as transcriber\n"); + return 1; + } + + return 0; +} + +int load_model_vad(const char *const model_path) { + whisper_log_set(ggml_log_cb, nullptr); + ggml_backend_load_all(); + + struct whisper_vad_context_params vcparams = + whisper_vad_default_context_params(); + + // XXX: Overridden to false in upstream due to performance?
+ // vcparams.use_gpu = true; + + vctx = whisper_vad_init_from_file_with_params(model_path, vcparams); + if (vctx == nullptr) { + fprintf(stderr, "error: Failed to init model as VAD\n"); + return 1; + } + + return 0; +} + +int vad(float pcmf32[], size_t pcmf32_len, float **segs_out, + size_t *segs_out_len) { + if (!whisper_vad_detect_speech(vctx, pcmf32, pcmf32_len)) { + fprintf(stderr, "error: failed to detect speech\n"); + return 1; + } + + struct whisper_vad_params params = whisper_vad_default_params(); + struct whisper_vad_segments *segs = + whisper_vad_segments_from_probs(vctx, params); + size_t segn = whisper_vad_segments_n_segments(segs); + + // fprintf(stderr, "Got segments %zd\n", segn); + + flat_segs.clear(); + + for (int i = 0; i < segn; i++) { + flat_segs.push_back(whisper_vad_segments_get_segment_t0(segs, i)); + flat_segs.push_back(whisper_vad_segments_get_segment_t1(segs, i)); + } + + // fprintf(stderr, "setting out variables: %p=%p -> %p, %p=%zx -> %zx\n", + // segs_out, *segs_out, flat_segs.data(), segs_out_len, *segs_out_len, + // flat_segs.size()); + *segs_out = flat_segs.data(); + *segs_out_len = flat_segs.size(); + + // fprintf(stderr, "freeing segs\n"); + whisper_vad_free_segments(segs); + + // fprintf(stderr, "returning\n"); + return 0; +} + +int transcribe(uint32_t threads, char *lang, bool translate, float pcmf32[], + size_t pcmf32_len, size_t *segs_out_len) { + whisper_full_params wparams = + whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + + wparams.n_threads = threads; + if (*lang != '\0') + wparams.language = lang; + else { + wparams.language = nullptr; + } + + wparams.translate = translate; + wparams.debug_mode = true; + wparams.print_progress = true; + + if (whisper_full(ctx, wparams, pcmf32, pcmf32_len)) { + fprintf(stderr, "error: transcription failed\n"); + return 1; + } + + *segs_out_len = whisper_full_n_segments(ctx); + + return 0; +} + +const char *get_segment_text(int i) { + return whisper_full_get_segment_text(ctx, i); 
+} + +int64_t get_segment_t0(int i) { return whisper_full_get_segment_t0(ctx, i); } + +int64_t get_segment_t1(int i) { return whisper_full_get_segment_t1(ctx, i); } + +int n_tokens(int i) { return whisper_full_n_tokens(ctx, i); } + +int32_t get_token_id(int i, int j) { + return whisper_full_get_token_id(ctx, i, j); +} diff --git a/backend/go/whisper/gowhisper.go b/backend/go/whisper/gowhisper.go new file mode 100644 index 000000000..6b4ab4d9f --- /dev/null +++ b/backend/go/whisper/gowhisper.go @@ -0,0 +1,156 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "unsafe" + + "github.com/go-audio/wav" + "github.com/mudler/LocalAI/pkg/grpc/base" + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + "github.com/mudler/LocalAI/pkg/utils" +) + +var ( + CppLoadModel func(modelPath string) int + CppLoadModelVAD func(modelPath string) int + CppVAD func(pcmf32 []float32, pcmf32Size uintptr, segsOut unsafe.Pointer, segsOutLen unsafe.Pointer) int + CppTranscribe func(threads uint32, lang string, translate bool, pcmf32 []float32, pcmf32Len uintptr, segsOutLen unsafe.Pointer) int + CppGetSegmentText func(i int) string + CppGetSegmentStart func(i int) int64 + CppGetSegmentEnd func(i int) int64 + CppNTokens func(i int) int + CppGetTokenID func(i int, j int) int +) + +type Whisper struct { + base.SingleThread +} + +func (w *Whisper) Load(opts *pb.ModelOptions) error { + vadOnly := false + + for _, oo := range opts.Options { + if oo == "vad_only" { + vadOnly = true + } else { + fmt.Fprintf(os.Stderr, "Unrecognized option: %v\n", oo) + } + } + + if vadOnly { + if ret := CppLoadModelVAD(opts.ModelFile); ret != 0 { + return fmt.Errorf("Failed to load Whisper VAD model") + } + + return nil + } + + if ret := CppLoadModel(opts.ModelFile); ret != 0 { + return fmt.Errorf("Failed to load Whisper transcription model") + } + + return nil +} + +func (w *Whisper) VAD(req *pb.VADRequest) (pb.VADResponse, error) { + audio := req.Audio + // We expect 0xdeadbeef to be overwritten 
and if we see it in a stack trace we know it wasn't + segsPtr, segsLen := uintptr(0xdeadbeef), uintptr(0xdeadbeef) + segsPtrPtr, segsLenPtr := unsafe.Pointer(&segsPtr), unsafe.Pointer(&segsLen) + + if ret := CppVAD(audio, uintptr(len(audio)), segsPtrPtr, segsLenPtr); ret != 0 { + return pb.VADResponse{}, fmt.Errorf("Failed VAD") + } + + // Happens when CPP vector has not had any elements pushed to it + if segsPtr == 0 { + return pb.VADResponse{ + Segments: []*pb.VADSegment{}, + }, nil + } + + // unsafeptr warning is caused by segsPtr being on the stack and therefor being subject to stack copying AFAICT + // however the stack shouldn't have grown between setting segsPtr and now, also the memory pointed to is allocated by C++ + segs := unsafe.Slice((*float32)(unsafe.Pointer(segsPtr)), segsLen) + + vadSegments := []*pb.VADSegment{} + for i := range len(segs) >> 1 { + s := segs[2*i] / 100 + t := segs[2*i+1] / 100 + vadSegments = append(vadSegments, &pb.VADSegment{ + Start: s, + End: t, + }) + } + + return pb.VADResponse{ + Segments: vadSegments, + }, nil +} + +func (w *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptResult, error) { + dir, err := os.MkdirTemp("", "whisper") + if err != nil { + return pb.TranscriptResult{}, err + } + defer os.RemoveAll(dir) + + convertedPath := filepath.Join(dir, "converted.wav") + + if err := utils.AudioToWav(opts.Dst, convertedPath); err != nil { + return pb.TranscriptResult{}, err + } + + // Open samples + fh, err := os.Open(convertedPath) + if err != nil { + return pb.TranscriptResult{}, err + } + defer fh.Close() + + // Read samples + d := wav.NewDecoder(fh) + buf, err := d.FullPCMBuffer() + if err != nil { + return pb.TranscriptResult{}, err + } + + data := buf.AsFloat32Buffer().Data + segsLen := uintptr(0xdeadbeef) + segsLenPtr := unsafe.Pointer(&segsLen) + + if ret := CppTranscribe(opts.Threads, opts.Language, opts.Translate, data, uintptr(len(data)), segsLenPtr); ret != 0 { + return pb.TranscriptResult{}, 
fmt.Errorf("Failed Transcribe") + } + + segments := []*pb.TranscriptSegment{} + text := "" + for i := range int(segsLen) { + s := CppGetSegmentStart(i) + t := CppGetSegmentEnd(i) + txt := strings.Clone(CppGetSegmentText(i)) + tokens := make([]int32, CppNTokens(i)) + + for j := range tokens { + tokens[j] = int32(CppGetTokenID(i, j)) + } + segment := &pb.TranscriptSegment{ + Id: int32(i), + Text: txt, + Start: s, End: t, + Tokens: tokens, + } + + segments = append(segments, segment) + + text += " " + strings.TrimSpace(txt) + } + + return pb.TranscriptResult{ + Segments: segments, + Text: strings.TrimSpace(text), + }, nil +} diff --git a/backend/go/whisper/gowhisper.h b/backend/go/whisper/gowhisper.h new file mode 100644 index 000000000..2972a3203 --- /dev/null +++ b/backend/go/whisper/gowhisper.h @@ -0,0 +1,16 @@ +#include <stdint.h> +#include <stddef.h> + +extern "C" { +int load_model(const char *const model_path); +int load_model_vad(const char *const model_path); +int vad(float pcmf32[], size_t pcmf32_size, float **segs_out, + size_t *segs_out_len); +int transcribe(uint32_t threads, char *lang, bool translate, float pcmf32[], + size_t pcmf32_len, size_t *segs_out_len); +const char *get_segment_text(int i); +int64_t get_segment_t0(int i); +int64_t get_segment_t1(int i); +int n_tokens(int i); +int32_t get_token_id(int i, int j); +} diff --git a/backend/go/whisper/main.go b/backend/go/whisper/main.go index 6c66f517c..d466d6eb9 100644 --- a/backend/go/whisper/main.go +++ b/backend/go/whisper/main.go @@ -1,10 +1,10 @@ package main // Note: this is started internally by LocalAI and a server is allocated for each model - import ( "flag" + "github.com/ebitengine/purego" grpc "github.com/mudler/LocalAI/pkg/grpc" ) @@ -12,7 +12,33 @@ var ( addr = flag.String("addr", "localhost:50051", "the address to connect to") ) +type LibFuncs struct { + FuncPtr any + Name string +} + func main() { + gosd, err := purego.Dlopen("./libgowhisper.so", purego.RTLD_NOW|purego.RTLD_GLOBAL) + if err != nil { +
panic(err) + } + + libFuncs := []LibFuncs{ + {&CppLoadModel, "load_model"}, + {&CppLoadModelVAD, "load_model_vad"}, + {&CppVAD, "vad"}, + {&CppTranscribe, "transcribe"}, + {&CppGetSegmentText, "get_segment_text"}, + {&CppGetSegmentStart, "get_segment_t0"}, + {&CppGetSegmentEnd, "get_segment_t1"}, + {&CppNTokens, "n_tokens"}, + {&CppGetTokenID, "get_token_id"}, + } + + for _, lf := range libFuncs { + purego.RegisterLibFunc(lf.FuncPtr, gosd, lf.Name) + } + flag.Parse() if err := grpc.StartServer(*addr, &Whisper{}); err != nil { diff --git a/backend/go/whisper/package.sh b/backend/go/whisper/package.sh index 3bda9695c..48e03e8e5 100755 --- a/backend/go/whisper/package.sh +++ b/backend/go/whisper/package.sh @@ -10,7 +10,7 @@ CURDIR=$(dirname "$(realpath $0)") # Create lib directory mkdir -p $CURDIR/package/lib -cp -avrf $CURDIR/whisper $CURDIR/package/ +cp -avrf $CURDIR/whisper $CURDIR/libgowhisper.so $CURDIR/package/ cp -rfv $CURDIR/run.sh $CURDIR/package/ # Detect architecture and copy appropriate libraries @@ -47,6 +47,6 @@ else exit 1 fi -echo "Packaging completed successfully" +echo "Packaging completed successfully" ls -liah $CURDIR/package/ -ls -liah $CURDIR/package/lib/ \ No newline at end of file +ls -liah $CURDIR/package/lib/ diff --git a/backend/go/whisper/whisper.go b/backend/go/whisper/whisper.go deleted file mode 100644 index 5c7ec0cbe..000000000 --- a/backend/go/whisper/whisper.go +++ /dev/null @@ -1,105 +0,0 @@ -package main - -// This is a wrapper to statisfy the GRPC service interface -// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc) -import ( - "os" - "path/filepath" - - "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper" - "github.com/go-audio/wav" - "github.com/mudler/LocalAI/pkg/grpc/base" - pb "github.com/mudler/LocalAI/pkg/grpc/proto" - "github.com/mudler/LocalAI/pkg/utils" -) - -type Whisper struct { - base.SingleThread - whisper whisper.Model -} - -func (sd *Whisper) 
Load(opts *pb.ModelOptions) error { - // Note: the Model here is a path to a directory containing the model files - w, err := whisper.New(opts.ModelFile) - sd.whisper = w - return err -} - -func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptResult, error) { - - dir, err := os.MkdirTemp("", "whisper") - if err != nil { - return pb.TranscriptResult{}, err - } - defer os.RemoveAll(dir) - - convertedPath := filepath.Join(dir, "converted.wav") - - if err := utils.AudioToWav(opts.Dst, convertedPath); err != nil { - return pb.TranscriptResult{}, err - } - - // Open samples - fh, err := os.Open(convertedPath) - if err != nil { - return pb.TranscriptResult{}, err - } - defer fh.Close() - - // Read samples - d := wav.NewDecoder(fh) - buf, err := d.FullPCMBuffer() - if err != nil { - return pb.TranscriptResult{}, err - } - - data := buf.AsFloat32Buffer().Data - - // Process samples - context, err := sd.whisper.NewContext() - if err != nil { - return pb.TranscriptResult{}, err - - } - - context.SetThreads(uint(opts.Threads)) - - if opts.Language != "" { - context.SetLanguage(opts.Language) - } else { - context.SetLanguage("auto") - } - - if opts.Translate { - context.SetTranslate(true) - } - - if err := context.Process(data, nil, nil, nil); err != nil { - return pb.TranscriptResult{}, err - } - - segments := []*pb.TranscriptSegment{} - text := "" - for { - s, err := context.NextSegment() - if err != nil { - break - } - - var tokens []int32 - for _, t := range s.Tokens { - tokens = append(tokens, int32(t.Id)) - } - - segment := &pb.TranscriptSegment{Id: int32(s.Num), Text: s.Text, Start: int64(s.Start), End: int64(s.End), Tokens: tokens} - segments = append(segments, segment) - - text += s.Text - } - - return pb.TranscriptResult{ - Segments: segments, - Text: text, - }, nil - -} diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go index 2e692b52a..bd55b720d 100644 --- a/core/http/endpoints/openai/realtime.go 
+++ b/core/http/endpoints/openai/realtime.go @@ -31,6 +31,7 @@ import ( const ( localSampleRate = 16000 remoteSampleRate = 24000 + vadModel = "silero-vad-ggml" ) // A model can be "emulated" that is: transcribe audio to text -> feed text to the LLM -> generate audio as result @@ -233,7 +234,7 @@ func registerRealtime(application *application.Application) func(c *websocket.Co // TODO: The API has no way to configure the VAD model or other models that make up a pipeline to fake any-to-any // So possibly we could have a way to configure a composite model that can be used in situations where any-to-any is expected pipeline := config.Pipeline{ - VAD: "silero-vad", + VAD: vadModel, Transcription: session.InputAudioTranscription.Model, } @@ -568,7 +569,7 @@ func updateTransSession(session *Session, update *types.ClientSession, cl *confi if trUpd != nil && trUpd.Model != "" && trUpd.Model != trCur.Model { pipeline := config.Pipeline{ - VAD: "silero-vad", + VAD: vadModel, Transcription: trUpd.Model, } diff --git a/gallery/index.yaml b/gallery/index.yaml index 712bbf278..414438bc2 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -20730,7 +20730,8 @@ - filename: nomic-embed-text-v1.5.f16.gguf uri: https://huggingface.co/mradermacher/nomic-embed-text-v1.5-GGUF/resolve/main/nomic-embed-text-v1.5.f16.gguf sha256: af8cb9e4ca0bf19eb54d08c612fdf325059264abbbd2c619527e5d2dda8de655 -- name: "silero-vad" +- &silero + name: "silero-vad" icon: https://github.com/snakers4/silero-models/raw/master/files/silero_logo.jpg url: github:mudler/LocalAI/gallery/virtual.yaml@master urls: @@ -20750,6 +20751,22 @@ - filename: silero-vad.onnx uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808 +- !!merge <<: *silero + name: "silero-vad-ggml" + urls: + - https://github.com/snakers4/silero-vad + - https://github.com/ggml-org/whisper.cpp + - https://huggingface.co/ggml-org/whisper-vad + 
overrides: + backend: whisper-vad + parameters: + model: ggml-silero-v5.1.2.bin + options: + - "vad_only" + files: + - filename: ggml-silero-v5.1.2.bin + uri: https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v5.1.2.bin + sha256: 29940d98d42b91fbd05ce489f3ecf7c72f0a42f027e4875919a28fb4c04ea2cf - &bark name: "bark-cpp" icon: https://avatars.githubusercontent.com/u/99442120 diff --git a/scripts/changed-backends.js b/scripts/changed-backends.js index b8e832d3e..717d713dc 100644 --- a/scripts/changed-backends.js +++ b/scripts/changed-backends.js @@ -47,13 +47,13 @@ async function getChangedFiles() { // Infer backend path function inferBackendPath(item) { if (item.dockerfile.endsWith("python")) { - return `backend/python/${item.backend}`; + return `backend/python/${item.backend}/`; } if (item.dockerfile.endsWith("golang")) { - return `backend/go/${item.backend}`; + return `backend/go/${item.backend}/`; } if (item.dockerfile.endsWith("llama-cpp")) { - return `backend/cpp/llama-cpp`; + return `backend/cpp/llama-cpp/`; } return null; }