feat: Realtime API support reboot (#5392)

* feat(realtime): Initial Realtime API implementation

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore: go mod tidy

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* feat: Implement transcription only mode for realtime API

Reduce the scope of the real-time API for the initial release and make
transcription-only mode functional.

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* chore(build): Build backends on a separate layer to speed up core only changes

Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: Richard Palethorpe <io@richiejp.com>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Richard Palethorpe
2025-05-25 21:25:05 +01:00
committed by GitHub
parent 4a91950848
commit bf6426aef2
18 changed files with 2953 additions and 70 deletions

55
pkg/audio/audio.go Normal file
View File

@@ -0,0 +1,55 @@
package audio
// Copied from VoxInput
import (
"encoding/binary"
"io"
)
// WAVHeader represents the WAV file header (44 bytes for PCM).
//
// The fields are declared in the order and with the fixed widths in which
// they appear on disk (12 + 24 + 8 = 44 bytes). Write serializes the struct
// as-is with encoding/binary, so reordering or resizing fields changes the
// bytes that are written.
type WAVHeader struct {
// RIFF chunk descriptor (12 bytes).
// ChunkID is the 4-byte chunk tag; NewWAVHeader sets it to "RIFF".
ChunkID [4]byte
// ChunkSize is set by NewWAVHeader to 36 + Subchunk2Size.
ChunkSize uint32
// Format is the 4-byte format tag; NewWAVHeader sets it to "WAVE".
Format [4]byte
// "fmt " subchunk (8-byte subchunk header followed by a 16-byte PCM body).
Subchunk1ID [4]byte
// Subchunk1Size is the size of the fmt body; NewWAVHeader sets it to 16.
Subchunk1Size uint32
// AudioFormat is the encoding tag; NewWAVHeader sets it to 1 (PCM).
AudioFormat uint16
NumChannels uint16
SampleRate uint32
// ByteRate is set by NewWAVHeader to SampleRate * BlockAlign.
ByteRate uint32
// BlockAlign is the number of bytes per sample frame (2 for 16-bit mono).
BlockAlign uint16
BitsPerSample uint16
// "data" subchunk header (8 bytes); the PCM payload itself is not part of
// this struct.
Subchunk2ID [4]byte
// Subchunk2Size is the length of the PCM payload in bytes.
Subchunk2Size uint32
}
// NewWAVHeader returns the header for a WAV file whose data chunk holds
// pcmLen bytes of 16 kHz, mono, 16-bit PCM audio.
func NewWAVHeader(pcmLen uint32) WAVHeader {
	const (
		sampleRate    = 16000
		channels      = 1 // mono
		bitsPerSample = 16
		// Bytes per sample frame: one channel of 16-bit samples.
		blockAlign = 2
		// Number of header bytes that follow the ChunkSize field.
		headerTail = 36
	)

	return WAVHeader{
		// RIFF chunk descriptor.
		ChunkID:   [4]byte{'R', 'I', 'F', 'F'},
		ChunkSize: headerTail + pcmLen,
		Format:    [4]byte{'W', 'A', 'V', 'E'},

		// "fmt " subchunk.
		Subchunk1ID:   [4]byte{'f', 'm', 't', ' '},
		Subchunk1Size: 16, // size of the PCM fmt body
		AudioFormat:   1,  // PCM
		NumChannels:   channels,
		SampleRate:    sampleRate,
		ByteRate:      sampleRate * blockAlign,
		BlockAlign:    blockAlign,
		BitsPerSample: bitsPerSample,

		// "data" subchunk header.
		Subchunk2ID:   [4]byte{'d', 'a', 't', 'a'},
		Subchunk2Size: pcmLen,
	}
}
// Write serializes the header to writer, encoding every field in
// little-endian byte order. It returns any error reported by the
// underlying encoding/binary write.
func (h *WAVHeader) Write(writer io.Writer) error {
	if err := binary.Write(writer, binary.LittleEndian, h); err != nil {
		return err
	}
	return nil
}

View File

@@ -35,9 +35,9 @@ type Backend interface {
IsBusy() bool
HealthCheck(ctx context.Context) (bool, error)
Embeddings(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.EmbeddingResult, error)
Predict(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.Reply, error)
LoadModel(ctx context.Context, in *pb.ModelOptions, opts ...grpc.CallOption) (*pb.Result, error)
PredictStream(ctx context.Context, in *pb.PredictOptions, f func(reply *pb.Reply), opts ...grpc.CallOption) error
Predict(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.Reply, error)
GenerateImage(ctx context.Context, in *pb.GenerateImageRequest, opts ...grpc.CallOption) (*pb.Result, error)
GenerateVideo(ctx context.Context, in *pb.GenerateVideoRequest, opts ...grpc.CallOption) (*pb.Result, error)
TTS(ctx context.Context, in *pb.TTSRequest, opts ...grpc.CallOption) (*pb.Result, error)

12
pkg/sound/float32.go Normal file
View File

@@ -0,0 +1,12 @@
package sound
import (
"encoding/binary"
"math"
)
// BytesFloat32 decodes the first four bytes of bytes as a little-endian
// IEEE 754 single-precision value. Any bytes after the fourth are ignored;
// a slice shorter than four bytes causes a panic in the underlying decode.
func BytesFloat32(bytes []byte) float32 {
	return math.Float32frombits(binary.LittleEndian.Uint32(bytes))
}

90
pkg/sound/int16.go Normal file
View File

@@ -0,0 +1,90 @@
package sound
import (
"encoding/binary"
"math"
)
/*
MIT License
Copyright (c) 2024 Xbozon
*/
// CalculateRMS16 returns the root mean square of the int16 samples in
// buffer.
//
// An empty (or nil) buffer has no samples to average; it yields 0 instead of
// the NaN that dividing by a zero length would produce.
func CalculateRMS16(buffer []int16) float64 {
	if len(buffer) == 0 {
		return 0
	}

	var sumSquares float64
	for _, sample := range buffer {
		// Widen before squaring so the product cannot overflow int16.
		val := float64(sample)
		sumSquares += val * val
	}

	return math.Sqrt(sumSquares / float64(len(buffer)))
}
// ResampleInt16 converts input from inputRate to outputRate (both in Hz)
// using linear interpolation between neighbouring samples.
//
// The result holds int(len(input) * outputRate / inputRate) samples, and its
// final sample is always the final input sample. An empty slice is returned
// when there is nothing to produce: the input is empty, either rate is not
// positive, or the input is too short to yield a single output sample.
func ResampleInt16(input []int16, inputRate, outputRate int) []int16 {
	// Without samples or with a non-positive rate the ratio below is
	// meaningless (NaN, infinite or negative length), so bail out early.
	if len(input) == 0 || inputRate <= 0 || outputRate <= 0 {
		return []int16{}
	}

	// Number of input samples consumed per output sample.
	ratio := float64(inputRate) / float64(outputRate)

	outputLength := int(float64(len(input)) / ratio)
	output := make([]int16, outputLength)
	if outputLength == 0 {
		// Heavy downsampling of a very short input: there is no last slot
		// to fill, so indexing output[outputLength-1] would panic.
		return output
	}

	lastInput := len(input) - 1
	for i := 0; i < outputLength-1; i++ {
		// Position of this output sample on the input time axis.
		pos := float64(i) * ratio

		indexBefore := int(pos)
		indexAfter := indexBefore + 1
		if indexAfter > lastInput {
			indexAfter = lastInput
		}

		// Weight of the later sample; the earlier one gets the remainder.
		frac := pos - float64(indexBefore)
		output[i] = int16((1-frac)*float64(input[indexBefore]) + frac*float64(input[indexAfter]))
	}

	// Pin the final output sample to the final input sample instead of
	// interpolating past the end of the input.
	output[outputLength-1] = input[lastInput]

	return output
}
// ConvertInt16ToInt returns a new slice of the same length as input in which
// every sample has been widened from int16 to int. The input is not modified.
func ConvertInt16ToInt(input []int16) []int {
	widened := make([]int, len(input))
	for idx := range input {
		widened[idx] = int(input[idx])
	}
	return widened
}
// BytesToInt16sLE decodes bytes as a sequence of little-endian 16-bit signed
// samples: each consecutive pair of bytes (low byte first) becomes one int16.
//
// It panics if bytes has an odd length, since the trailing byte could not
// form a complete sample.
func BytesToInt16sLE(bytes []byte) []int16 {
	if len(bytes)&1 == 1 {
		panic("bytesToInt16sLE: input bytes slice has odd length, must be even")
	}

	samples := make([]int16, len(bytes)/2)
	for n := range samples {
		lo := uint16(bytes[2*n])
		hi := uint16(bytes[2*n+1])
		// Assemble unsigned, then reinterpret the 16 bits as signed.
		samples[n] = int16(hi<<8 | lo)
	}
	return samples
}
// Int16toBytesLE encodes arr as little-endian bytes: every sample becomes
// two bytes, low byte first, so the result is 2*len(arr) bytes long.
func Int16toBytesLE(arr []int16) []byte {
	encoded := make([]byte, 2*len(arr))
	for idx, sample := range arr {
		binary.LittleEndian.PutUint16(encoded[2*idx:], uint16(sample))
	}
	return encoded
}

View File

@@ -5,6 +5,8 @@ import (
"os"
"os/exec"
"strings"
"github.com/go-audio/wav"
)
func ffmpegCommand(args []string) (string, error) {
@@ -17,6 +19,21 @@ func ffmpegCommand(args []string) (string, error) {
// AudioToWav converts audio to wav for transcribe.
// TODO: use https://github.com/mccoyst/ogg?
func AudioToWav(src, dst string) error {
if strings.HasSuffix(src, ".wav") {
f, err := os.Open(src)
if err != nil {
return fmt.Errorf("open: %w", err)
}
dec := wav.NewDecoder(f)
dec.ReadInfo()
f.Close()
if dec.BitDepth == 16 && dec.NumChans == 1 && dec.SampleRate == 16000 {
os.Rename(src, dst)
return nil
}
}
commandArgs := []string{"-i", src, "-format", "s16le", "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", dst}
out, err := ffmpegCommand(commandArgs)
if err != nil {