feat(whisper-cpp): Convert to Purego and add VAD (#6087)

* fix(ci): Avoid matching wrong backend with the same prefix

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* chore(whisper): Use Purego and enable VAD

This replaces the Whisper CGO bindings with our own Purego-based module
to make compilation easier.

In addition, this allows VAD models to be loaded by Whisper. There is not
much benefit now except that the same backend can be used for VAD and
transcription. Depending on upstream we may also be able to use GPU for
VAD in the future, but presently it is disabled.

Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Richard Palethorpe <io@richiejp.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
This commit is contained in:
Richard Palethorpe
2025-08-28 16:25:18 +01:00
committed by GitHub
parent ead00a28b9
commit e6ebfd3ba1
13 changed files with 424 additions and 198 deletions

View File

@@ -31,6 +31,7 @@ import (
// Constants shared by the realtime endpoint code in this file.
const (
// localSampleRate is one of two audio sample rates used here.
// NOTE(review): presumably in Hz and for locally processed audio — confirm against usage.
localSampleRate = 16000
// remoteSampleRate is the other audio sample rate.
// NOTE(review): presumably in Hz and for audio exchanged with the client — confirm against usage.
remoteSampleRate = 24000
// vadModel is the model name placed in the VAD field of config.Pipeline
// when a transcription pipeline is built.
vadModel = "silero-vad-ggml"
)
// A model can be "emulated", that is: transcribe audio to text -> feed text to the LLM -> generate audio as result
@@ -233,7 +234,7 @@ func registerRealtime(application *application.Application) func(c *websocket.Co
// TODO: The API has no way to configure the VAD model or other models that make up a pipeline to fake any-to-any
// So possibly we could have a way to configure a composite model that can be used in situations where any-to-any is expected
pipeline := config.Pipeline{
VAD: "silero-vad",
VAD: vadModel,
Transcription: session.InputAudioTranscription.Model,
}
@@ -568,7 +569,7 @@ func updateTransSession(session *Session, update *types.ClientSession, cl *confi
if trUpd != nil && trUpd.Model != "" && trUpd.Model != trCur.Model {
pipeline := config.Pipeline{
VAD: "silero-vad",
VAD: vadModel,
Transcription: trUpd.Model,
}