mirror of
https://github.com/mudler/LocalAI.git
synced 2026-01-06 02:29:54 -06:00
feat: Realtime API support reboot (#5392)
* feat(realtime): Initial Realtime API implementation Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * chore: go mod tidy Signed-off-by: Richard Palethorpe <io@richiejp.com> * feat: Implement transcription only mode for realtime API Reduce the scope of the real time API for the initial realease and make transcription only mode functional. Signed-off-by: Richard Palethorpe <io@richiejp.com> * chore(build): Build backends on a separate layer to speed up core only changes Signed-off-by: Richard Palethorpe <io@richiejp.com> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Signed-off-by: Richard Palethorpe <io@richiejp.com> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
committed by
GitHub
parent
4a91950848
commit
bf6426aef2
1265
core/http/endpoints/openai/realtime.go
Normal file
1265
core/http/endpoints/openai/realtime.go
Normal file
File diff suppressed because it is too large
Load Diff
259
core/http/endpoints/openai/realtime_model.go
Normal file
259
core/http/endpoints/openai/realtime_model.go
Normal file
@@ -0,0 +1,259 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"github.com/mudler/LocalAI/core/backend"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
grpcClient "github.com/mudler/LocalAI/pkg/grpc"
|
||||
"github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
model "github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/rs/zerolog/log"
|
||||
"google.golang.org/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
_ Model = new(wrappedModel)
|
||||
_ Model = new(anyToAnyModel)
|
||||
)
|
||||
|
||||
// wrappedModel represent a model which does not support Any-to-Any operations
|
||||
// This means that we will fake an Any-to-Any model by overriding some of the gRPC client methods
|
||||
// which are for Any-To-Any models, but instead we will call a pipeline (for e.g STT->LLM->TTS)
|
||||
type wrappedModel struct {
|
||||
TTSConfig *config.BackendConfig
|
||||
TranscriptionConfig *config.BackendConfig
|
||||
LLMConfig *config.BackendConfig
|
||||
TTSClient grpcClient.Backend
|
||||
TranscriptionClient grpcClient.Backend
|
||||
LLMClient grpcClient.Backend
|
||||
|
||||
VADConfig *config.BackendConfig
|
||||
VADClient grpcClient.Backend
|
||||
}
|
||||
|
||||
// anyToAnyModel represent a model which supports Any-to-Any operations
|
||||
// We have to wrap this out as well because we want to load two models one for VAD and one for the actual model.
|
||||
// In the future there could be models that accept continous audio input only so this design will be useful for that
|
||||
type anyToAnyModel struct {
|
||||
LLMConfig *config.BackendConfig
|
||||
LLMClient grpcClient.Backend
|
||||
|
||||
VADConfig *config.BackendConfig
|
||||
VADClient grpcClient.Backend
|
||||
}
|
||||
|
||||
type transcriptOnlyModel struct {
|
||||
TranscriptionConfig *config.BackendConfig
|
||||
TranscriptionClient grpcClient.Backend
|
||||
VADConfig *config.BackendConfig
|
||||
VADClient grpcClient.Backend
|
||||
}
|
||||
|
||||
func (m *transcriptOnlyModel) VAD(ctx context.Context, in *proto.VADRequest, opts ...grpc.CallOption) (*proto.VADResponse, error) {
|
||||
return m.VADClient.VAD(ctx, in)
|
||||
}
|
||||
|
||||
func (m *transcriptOnlyModel) Transcribe(ctx context.Context, in *proto.TranscriptRequest, opts ...grpc.CallOption) (*proto.TranscriptResult, error) {
|
||||
return m.TranscriptionClient.AudioTranscription(ctx, in, opts...)
|
||||
}
|
||||
|
||||
func (m *transcriptOnlyModel) Predict(ctx context.Context, in *proto.PredictOptions, opts ...grpc.CallOption) (*proto.Reply, error) {
|
||||
return nil, fmt.Errorf("predict operation not supported in transcript-only mode")
|
||||
}
|
||||
|
||||
func (m *transcriptOnlyModel) PredictStream(ctx context.Context, in *proto.PredictOptions, f func(reply *proto.Reply), opts ...grpc.CallOption) error {
|
||||
return fmt.Errorf("predict stream operation not supported in transcript-only mode")
|
||||
}
|
||||
|
||||
func (m *wrappedModel) VAD(ctx context.Context, in *proto.VADRequest, opts ...grpc.CallOption) (*proto.VADResponse, error) {
|
||||
return m.VADClient.VAD(ctx, in)
|
||||
}
|
||||
|
||||
func (m *anyToAnyModel) VAD(ctx context.Context, in *proto.VADRequest, opts ...grpc.CallOption) (*proto.VADResponse, error) {
|
||||
return m.VADClient.VAD(ctx, in)
|
||||
}
|
||||
|
||||
func (m *wrappedModel) Transcribe(ctx context.Context, in *proto.TranscriptRequest, opts ...grpc.CallOption) (*proto.TranscriptResult, error) {
|
||||
return m.TranscriptionClient.AudioTranscription(ctx, in, opts...)
|
||||
}
|
||||
|
||||
func (m *anyToAnyModel) Transcribe(ctx context.Context, in *proto.TranscriptRequest, opts ...grpc.CallOption) (*proto.TranscriptResult, error) {
|
||||
// TODO: Can any-to-any models transcribe?
|
||||
return m.LLMClient.AudioTranscription(ctx, in, opts...)
|
||||
}
|
||||
|
||||
func (m *wrappedModel) Predict(ctx context.Context, in *proto.PredictOptions, opts ...grpc.CallOption) (*proto.Reply, error) {
|
||||
// TODO: Convert with pipeline (audio to text, text to llm, result to tts, and return it)
|
||||
// sound.BufferAsWAV(audioData, "audio.wav")
|
||||
|
||||
return m.LLMClient.Predict(ctx, in)
|
||||
}
|
||||
|
||||
func (m *wrappedModel) PredictStream(ctx context.Context, in *proto.PredictOptions, f func(reply *proto.Reply), opts ...grpc.CallOption) error {
|
||||
// TODO: Convert with pipeline (audio to text, text to llm, result to tts, and return it)
|
||||
|
||||
return m.LLMClient.PredictStream(ctx, in, f)
|
||||
}
|
||||
|
||||
func (m *anyToAnyModel) Predict(ctx context.Context, in *proto.PredictOptions, opts ...grpc.CallOption) (*proto.Reply, error) {
|
||||
return m.LLMClient.Predict(ctx, in)
|
||||
}
|
||||
|
||||
func (m *anyToAnyModel) PredictStream(ctx context.Context, in *proto.PredictOptions, f func(reply *proto.Reply), opts ...grpc.CallOption) error {
|
||||
return m.LLMClient.PredictStream(ctx, in, f)
|
||||
}
|
||||
|
||||
func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, *config.BackendConfig, error) {
|
||||
cfgVAD, err := cl.LoadBackendConfigFileByName(pipeline.VAD, ml.ModelPath)
|
||||
if err != nil {
|
||||
|
||||
return nil, nil, fmt.Errorf("failed to load backend config: %w", err)
|
||||
}
|
||||
|
||||
if !cfgVAD.Validate() {
|
||||
return nil, nil, fmt.Errorf("failed to validate config: %w", err)
|
||||
}
|
||||
|
||||
opts := backend.ModelOptions(*cfgVAD, appConfig)
|
||||
VADClient, err := ml.Load(opts...)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("failed to load tts model: %w", err)
|
||||
}
|
||||
|
||||
cfgSST, err := cl.LoadBackendConfigFileByName(pipeline.Transcription, ml.ModelPath)
|
||||
if err != nil {
|
||||
|
||||
return nil, nil, fmt.Errorf("failed to load backend config: %w", err)
|
||||
}
|
||||
|
||||
if !cfgSST.Validate() {
|
||||
return nil, nil, fmt.Errorf("failed to validate config: %w", err)
|
||||
}
|
||||
|
||||
opts = backend.ModelOptions(*cfgSST, appConfig)
|
||||
transcriptionClient, err := ml.Load(opts...)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("failed to load SST model: %w", err)
|
||||
}
|
||||
|
||||
return &transcriptOnlyModel{
|
||||
VADConfig: cfgVAD,
|
||||
VADClient: VADClient,
|
||||
TranscriptionConfig: cfgSST,
|
||||
TranscriptionClient: transcriptionClient,
|
||||
}, cfgSST, nil
|
||||
}
|
||||
|
||||
// returns and loads either a wrapped model or a model that support audio-to-audio
|
||||
func newModel(pipeline *config.Pipeline, cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, error) {
|
||||
|
||||
cfgVAD, err := cl.LoadBackendConfigFileByName(pipeline.VAD, ml.ModelPath)
|
||||
if err != nil {
|
||||
|
||||
return nil, fmt.Errorf("failed to load backend config: %w", err)
|
||||
}
|
||||
|
||||
if !cfgVAD.Validate() {
|
||||
return nil, fmt.Errorf("failed to validate config: %w", err)
|
||||
}
|
||||
|
||||
opts := backend.ModelOptions(*cfgVAD, appConfig)
|
||||
VADClient, err := ml.Load(opts...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to load tts model: %w", err)
|
||||
}
|
||||
|
||||
// TODO: Do we always need a transcription model? It can be disabled. Note that any-to-any instruction following models don't transcribe as such, so if transcription is required it is a separate process
|
||||
cfgSST, err := cl.LoadBackendConfigFileByName(pipeline.Transcription, ml.ModelPath)
|
||||
if err != nil {
|
||||
|
||||
return nil, fmt.Errorf("failed to load backend config: %w", err)
|
||||
}
|
||||
|
||||
if !cfgSST.Validate() {
|
||||
return nil, fmt.Errorf("failed to validate config: %w", err)
|
||||
}
|
||||
|
||||
opts = backend.ModelOptions(*cfgSST, appConfig)
|
||||
transcriptionClient, err := ml.Load(opts...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to load SST model: %w", err)
|
||||
}
|
||||
|
||||
// TODO: Decide when we have a real any-to-any model
|
||||
if false {
|
||||
|
||||
cfgAnyToAny, err := cl.LoadBackendConfigFileByName(pipeline.LLM, ml.ModelPath)
|
||||
if err != nil {
|
||||
|
||||
return nil, fmt.Errorf("failed to load backend config: %w", err)
|
||||
}
|
||||
|
||||
if !cfgAnyToAny.Validate() {
|
||||
return nil, fmt.Errorf("failed to validate config: %w", err)
|
||||
}
|
||||
|
||||
opts := backend.ModelOptions(*cfgAnyToAny, appConfig)
|
||||
anyToAnyClient, err := ml.Load(opts...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to load tts model: %w", err)
|
||||
}
|
||||
|
||||
return &anyToAnyModel{
|
||||
LLMConfig: cfgAnyToAny,
|
||||
LLMClient: anyToAnyClient,
|
||||
VADConfig: cfgVAD,
|
||||
VADClient: VADClient,
|
||||
}, nil
|
||||
}
|
||||
|
||||
log.Debug().Msg("Loading a wrapped model")
|
||||
|
||||
// Otherwise we want to return a wrapped model, which is a "virtual" model that re-uses other models to perform operations
|
||||
cfgLLM, err := cl.LoadBackendConfigFileByName(pipeline.LLM, ml.ModelPath)
|
||||
if err != nil {
|
||||
|
||||
return nil, fmt.Errorf("failed to load backend config: %w", err)
|
||||
}
|
||||
|
||||
if !cfgLLM.Validate() {
|
||||
return nil, fmt.Errorf("failed to validate config: %w", err)
|
||||
}
|
||||
|
||||
cfgTTS, err := cl.LoadBackendConfigFileByName(pipeline.TTS, ml.ModelPath)
|
||||
if err != nil {
|
||||
|
||||
return nil, fmt.Errorf("failed to load backend config: %w", err)
|
||||
}
|
||||
|
||||
if !cfgTTS.Validate() {
|
||||
return nil, fmt.Errorf("failed to validate config: %w", err)
|
||||
}
|
||||
|
||||
|
||||
opts = backend.ModelOptions(*cfgTTS, appConfig)
|
||||
ttsClient, err := ml.Load(opts...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to load tts model: %w", err)
|
||||
}
|
||||
|
||||
opts = backend.ModelOptions(*cfgLLM, appConfig)
|
||||
llmClient, err := ml.Load(opts...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to load LLM model: %w", err)
|
||||
}
|
||||
|
||||
return &wrappedModel{
|
||||
TTSConfig: cfgTTS,
|
||||
TranscriptionConfig: cfgSST,
|
||||
LLMConfig: cfgLLM,
|
||||
TTSClient: ttsClient,
|
||||
TranscriptionClient: transcriptionClient,
|
||||
LLMClient: llmClient,
|
||||
|
||||
VADConfig: cfgVAD,
|
||||
VADClient: VADClient,
|
||||
}, nil
|
||||
}
|
||||
1178
core/http/endpoints/openai/types/realtime.go
Normal file
1178
core/http/endpoints/openai/types/realtime.go
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user