package openai

import (
	"context"
	"crypto/rand"
	"encoding/base64"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"os"
	"strings"
	"sync"
	"time"

	"github.com/go-audio/audio"
	"github.com/gofiber/fiber/v2"
	"github.com/gofiber/websocket/v2"
	"github.com/mudler/LocalAI/core/application"
	"github.com/mudler/LocalAI/core/config"
	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
	"github.com/mudler/LocalAI/core/templates"
	laudio "github.com/mudler/LocalAI/pkg/audio"
	"github.com/mudler/LocalAI/pkg/functions"
	"github.com/mudler/LocalAI/pkg/grpc/proto"
	model "github.com/mudler/LocalAI/pkg/model"
	"github.com/mudler/LocalAI/pkg/sound"

	"google.golang.org/grpc"

	"github.com/rs/zerolog/log"
)

const (
	localSampleRate  = 16000
	remoteSampleRate = 24000
	vadModel         = "silero-vad-ggml"
)
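
// Hedged helper sketch (not used elsewhere in this file): convert a duration in
// milliseconds into a sample count at a given rate, e.g. to size buffers of
// 24kHz client audio before resampling down to the 16kHz local models expect.
func samplesForMs(ms, sampleRate int) int {
	return ms * sampleRate / 1000
}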

// A model can be "emulated", that is: transcribe the audio to text, feed the
// text to the LLM, and generate audio from the result.
// If the model instead supports audio-to-audio natively, we use the specific
// gRPC calls for that.

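// Hedged sketch of the emulated path described above, using only the Model
// interface defined below: transcribe a WAV file, then prompt the LLM with the
// transcript. TTS is omitted because the interface does not expose it yet;
// this helper is illustrative only and is not called by the handler.
func emulatedTurn(ctx context.Context, m Model, wavPath string, threads uint32) (string, *proto.Reply, error) {
	tr, err := m.Transcribe(ctx, &proto.TranscriptRequest{Dst: wavPath, Threads: threads})
	if err != nil {
		return "", nil, err
	}
	reply, err := m.Predict(ctx, &proto.PredictOptions{Prompt: tr.GetText()})
	if err != nil {
		return "", nil, err
	}
	return tr.GetText(), reply, nil
}
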
// Session represents a single WebSocket connection and its state
type Session struct {
	ID                      string
	TranscriptionOnly       bool
	Model                   string
	Voice                   string
	TurnDetection           *types.ServerTurnDetection `json:"turn_detection"` // "server_vad" or "none"
	InputAudioTranscription *types.InputAudioTranscription
	Functions               functions.Functions
	Conversations           map[string]*Conversation
	InputAudioBuffer        []byte
	AudioBufferLock         sync.Mutex
	Instructions            string
	DefaultConversationID   string
	ModelInterface          Model
}

// FromClient applies settings from a client session update.
// TODO: currently a stub; it applies nothing.
func (s *Session) FromClient(session *types.ClientSession) {
}

func (s *Session) ToServer() types.ServerSession {
	return types.ServerSession{
		ID: s.ID,
		Object: func() string {
			if s.TranscriptionOnly {
				return "realtime.transcription_session"
			}
			return "realtime.session"
		}(),
		Model:                   s.Model,
		Modalities:              []types.Modality{types.ModalityText, types.ModalityAudio},
		Instructions:            s.Instructions,
		Voice:                   s.Voice,
		InputAudioFormat:        types.AudioFormatPcm16,
		OutputAudioFormat:       types.AudioFormatPcm16,
		TurnDetection:           s.TurnDetection,
		InputAudioTranscription: s.InputAudioTranscription,
		// TODO: Should be constructed from Functions?
		Tools: []types.Tool{},
		// TODO: ToolChoice
		// TODO: Temperature
		// TODO: MaxOutputTokens
		// TODO: InputAudioNoiseReduction
	}
}

// TODO: Update to tools?
// FunctionCall represents a function call initiated by the model
type FunctionCall struct {
	Name      string                 `json:"name"`
	Arguments map[string]interface{} `json:"arguments"`
}

// Conversation represents a conversation with a list of items
type Conversation struct {
	ID    string
	Items []*types.MessageItem
	Lock  sync.Mutex
}

func (c *Conversation) ToServer() types.Conversation {
	return types.Conversation{
		ID:     c.ID,
		Object: "realtime.conversation",
	}
}

// Item represents a message, function_call, or function_call_output
type Item struct {
	ID           string                `json:"id"`
	Object       string                `json:"object"`
	Type         string                `json:"type"` // "message", "function_call", "function_call_output"
	Status       string                `json:"status"`
	Role         string                `json:"role"`
	Content      []ConversationContent `json:"content,omitempty"`
	FunctionCall *FunctionCall         `json:"function_call,omitempty"`
}

// ConversationContent represents the content of an item
type ConversationContent struct {
	Type  string `json:"type"` // "input_text", "input_audio", "text", "audio", etc.
	Audio string `json:"audio,omitempty"`
	Text  string `json:"text,omitempty"`
	// Additional fields as needed
}

// Define the structures for incoming messages
type IncomingMessage struct {
	Type     types.ClientEventType `json:"type"`
	Session  json.RawMessage       `json:"session,omitempty"`
	Item     json.RawMessage       `json:"item,omitempty"`
	Audio    string                `json:"audio,omitempty"`
	Response json.RawMessage       `json:"response,omitempty"`
	Error    *ErrorMessage         `json:"error,omitempty"`
	// Other fields as needed
}
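
// Example (hedged) of a minimal 'input_audio_buffer.append' client message as
// it unmarshals into IncomingMessage above; "audio" carries base64-encoded
// 24kHz PCM16 samples:
//
//	{"type": "input_audio_buffer.append", "audio": "<base64 pcm16>"}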

// ErrorMessage represents an error message sent to the client
type ErrorMessage struct {
	Type    string `json:"type"`
	Code    string `json:"code"`
	Message string `json:"message"`
	Param   string `json:"param,omitempty"`
	EventID string `json:"event_id,omitempty"`
}

// Define a structure for outgoing messages
type OutgoingMessage struct {
	Type         string        `json:"type"`
	Session      *Session      `json:"session,omitempty"`
	Conversation *Conversation `json:"conversation,omitempty"`
	Item         *Item         `json:"item,omitempty"`
	Content      string        `json:"content,omitempty"`
	Audio        string        `json:"audio,omitempty"`
	Error        *ErrorMessage `json:"error,omitempty"`
}

// Map to store sessions (in-memory)
var sessions = make(map[string]*Session)
var sessionLock sync.Mutex
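
// Hedged convenience sketch (not used by the handler below): a lock-guarded
// session lookup, so concurrent connections never read the map unsynchronized.
func lookupSession(id string) (*Session, bool) {
	sessionLock.Lock()
	defer sessionLock.Unlock()
	s, ok := sessions[id]
	return s, ok
}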

// TODO: implement interface as we start to define usages
type Model interface {
	VAD(ctx context.Context, in *proto.VADRequest, opts ...grpc.CallOption) (*proto.VADResponse, error)
	Transcribe(ctx context.Context, in *proto.TranscriptRequest, opts ...grpc.CallOption) (*proto.TranscriptResult, error)
	Predict(ctx context.Context, in *proto.PredictOptions, opts ...grpc.CallOption) (*proto.Reply, error)
	PredictStream(ctx context.Context, in *proto.PredictOptions, f func(*proto.Reply), opts ...grpc.CallOption) error
}
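
// Hedged test-support sketch: a no-op stub satisfying Model, handy for
// exercising the session plumbing without a real gRPC backend. Illustrative
// only; nothing in this file instantiates it.
type stubModel struct{}

func (stubModel) VAD(ctx context.Context, in *proto.VADRequest, opts ...grpc.CallOption) (*proto.VADResponse, error) {
	return &proto.VADResponse{}, nil
}

func (stubModel) Transcribe(ctx context.Context, in *proto.TranscriptRequest, opts ...grpc.CallOption) (*proto.TranscriptResult, error) {
	return &proto.TranscriptResult{}, nil
}

func (stubModel) Predict(ctx context.Context, in *proto.PredictOptions, opts ...grpc.CallOption) (*proto.Reply, error) {
	return &proto.Reply{}, nil
}

func (stubModel) PredictStream(ctx context.Context, in *proto.PredictOptions, f func(*proto.Reply), opts ...grpc.CallOption) error {
	return nil
}

// Compile-time check that the stub really satisfies the interface.
var _ Model = stubModel{}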

// TODO: Implement ephemeral keys to allow these endpoints to be used
func RealtimeSessions(application *application.Application) fiber.Handler {
	return func(ctx *fiber.Ctx) error {
		return ctx.SendStatus(fiber.StatusNotImplemented)
	}
}

func RealtimeTranscriptionSession(application *application.Application) fiber.Handler {
	return func(ctx *fiber.Ctx) error {
		return ctx.SendStatus(fiber.StatusNotImplemented)
	}
}

func Realtime(application *application.Application) fiber.Handler {
	return websocket.New(registerRealtime(application))
}

func registerRealtime(application *application.Application) func(c *websocket.Conn) {
	return func(c *websocket.Conn) {

		evaluator := application.TemplatesEvaluator()
		log.Debug().Msgf("WebSocket connection established with '%s'", c.RemoteAddr().String())

		model := c.Query("model", "gpt-4o")

		intent := c.Query("intent")
		if intent != "transcription" {
			sendNotImplemented(c, "Only transcription mode is supported, which requires the intent=transcription parameter")
			// Bail out: the rest of this handler assumes a transcription session.
			return
		}

log.Debug().Msgf("Realtime params: model=%s, intent=%s", model, intent)
|
|
|
|
sessionID := generateSessionID()
|
|
session := &Session{
|
|
ID: sessionID,
|
|
TranscriptionOnly: true,
|
|
Model: model, // default model
|
|
Voice: "alloy", // default voice
|
|
TurnDetection: &types.ServerTurnDetection{
|
|
Type: types.ServerTurnDetectionTypeServerVad,
|
|
TurnDetectionParams: types.TurnDetectionParams{
|
|
// TODO: Need some way to pass this to the backend
|
|
Threshold: 0.5,
|
|
// TODO: This is ignored and the amount of padding is random at present
|
|
PrefixPaddingMs: 30,
|
|
SilenceDurationMs: 500,
|
|
CreateResponse: func() *bool { t := true; return &t }(),
|
|
},
|
|
},
|
|
InputAudioTranscription: &types.InputAudioTranscription{
|
|
Model: "whisper-1",
|
|
},
|
|
Conversations: make(map[string]*Conversation),
|
|
}
|
|
|
|
// Create a default conversation
|
|
conversationID := generateConversationID()
|
|
conversation := &Conversation{
|
|
ID: conversationID,
|
|
Items: []*types.MessageItem{},
|
|
}
|
|
session.Conversations[conversationID] = conversation
|
|
session.DefaultConversationID = conversationID
|
|
|
|
// TODO: The API has no way to configure the VAD model or other models that make up a pipeline to fake any-to-any
|
|
// So possibly we could have a way to configure a composite model that can be used in situations where any-to-any is expected
|
|
pipeline := config.Pipeline{
|
|
VAD: vadModel,
|
|
Transcription: session.InputAudioTranscription.Model,
|
|
}
|
|
|
|
m, cfg, err := newTranscriptionOnlyModel(
|
|
&pipeline,
|
|
application.ModelConfigLoader(),
|
|
application.ModelLoader(),
|
|
application.ApplicationConfig(),
|
|
)
|
|
if err != nil {
|
|
log.Error().Msgf("failed to load model: %s", err.Error())
|
|
sendError(c, "model_load_error", "Failed to load model", "", "")
|
|
return
|
|
}
|
|
session.ModelInterface = m
|
|
|
|
// Store the session
|
|
sessionLock.Lock()
|
|
sessions[sessionID] = session
|
|
sessionLock.Unlock()
|
|
|
|
// Send session.created and conversation.created events to the client
|
|
sendEvent(c, types.SessionCreatedEvent{
|
|
ServerEventBase: types.ServerEventBase{
|
|
EventID: "event_TODO",
|
|
Type: types.ServerEventTypeSessionCreated,
|
|
},
|
|
Session: session.ToServer(),
|
|
})
|
|
sendEvent(c, types.ConversationCreatedEvent{
|
|
ServerEventBase: types.ServerEventBase{
|
|
EventID: "event_TODO",
|
|
Type: types.ServerEventTypeConversationCreated,
|
|
},
|
|
Conversation: conversation.ToServer(),
|
|
})
|
|
|
|
var (
|
|
// mt int
|
|
msg []byte
|
|
wg sync.WaitGroup
|
|
done = make(chan struct{})
|
|
)
|
|
|
|
vadServerStarted := true
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
conversation := session.Conversations[session.DefaultConversationID]
|
|
handleVAD(cfg, evaluator, session, conversation, c, done)
|
|
}()
|
|
|
|
for {
|
|
if _, msg, err = c.ReadMessage(); err != nil {
|
|
log.Error().Msgf("read: %s", err.Error())
|
|
break
|
|
}
|
|
|
|
// Parse the incoming message
|
|
var incomingMsg IncomingMessage
|
|
if err := json.Unmarshal(msg, &incomingMsg); err != nil {
|
|
log.Error().Msgf("invalid json: %s", err.Error())
|
|
sendError(c, "invalid_json", "Invalid JSON format", "", "")
|
|
continue
|
|
}
|
|
|
|
var sessionUpdate types.ClientSession
|
|
switch incomingMsg.Type {
|
|
case types.ClientEventTypeTranscriptionSessionUpdate:
|
|
log.Debug().Msgf("recv: %s", msg)
|
|
|
|
if err := json.Unmarshal(incomingMsg.Session, &sessionUpdate); err != nil {
|
|
log.Error().Msgf("failed to unmarshal 'transcription_session.update': %s", err.Error())
|
|
sendError(c, "invalid_session_update", "Invalid session update format", "", "")
|
|
continue
|
|
}
|
|
if err := updateTransSession(
|
|
session,
|
|
&sessionUpdate,
|
|
application.ModelConfigLoader(),
|
|
application.ModelLoader(),
|
|
application.ApplicationConfig(),
|
|
); err != nil {
|
|
log.Error().Msgf("failed to update session: %s", err.Error())
|
|
sendError(c, "session_update_error", "Failed to update session", "", "")
|
|
continue
|
|
}
|
|
|
|
sendEvent(c, types.SessionUpdatedEvent{
|
|
ServerEventBase: types.ServerEventBase{
|
|
EventID: "event_TODO",
|
|
Type: types.ServerEventTypeTranscriptionSessionUpdated,
|
|
},
|
|
Session: session.ToServer(),
|
|
})
|
|
|
|
case types.ClientEventTypeSessionUpdate:
|
|
log.Debug().Msgf("recv: %s", msg)
|
|
|
|
// Update session configurations
|
|
if err := json.Unmarshal(incomingMsg.Session, &sessionUpdate); err != nil {
|
|
log.Error().Msgf("failed to unmarshal 'session.update': %s", err.Error())
|
|
sendError(c, "invalid_session_update", "Invalid session update format", "", "")
|
|
continue
|
|
}
|
|
if err := updateSession(
|
|
session,
|
|
&sessionUpdate,
|
|
application.ModelConfigLoader(),
|
|
application.ModelLoader(),
|
|
application.ApplicationConfig(),
|
|
); err != nil {
|
|
log.Error().Msgf("failed to update session: %s", err.Error())
|
|
sendError(c, "session_update_error", "Failed to update session", "", "")
|
|
continue
|
|
}
|
|
|
|
sendEvent(c, types.SessionUpdatedEvent{
|
|
ServerEventBase: types.ServerEventBase{
|
|
EventID: "event_TODO",
|
|
Type: types.ServerEventTypeSessionUpdated,
|
|
},
|
|
Session: session.ToServer(),
|
|
})
|
|
|
|
				if session.TurnDetection.Type == types.ServerTurnDetectionTypeServerVad && !vadServerStarted {
					log.Debug().Msg("Starting VAD goroutine...")
					wg.Add(1)
					go func() {
						defer wg.Done()
						conversation := session.Conversations[session.DefaultConversationID]
						handleVAD(cfg, evaluator, session, conversation, c, done)
					}()
					vadServerStarted = true
				} else if session.TurnDetection.Type != types.ServerTurnDetectionTypeServerVad && vadServerStarted {
					log.Debug().Msg("Stopping VAD goroutine...")

					// The VAD goroutine calls wg.Done() itself on exit, so the
					// counter must not also be decremented here (a wg.Add(-1)
					// would drive it negative and panic).
					go func() {
						done <- struct{}{}
					}()
					vadServerStarted = false
				}
			case types.ClientEventTypeInputAudioBufferAppend:
				// Handle 'input_audio_buffer.append'
				if incomingMsg.Audio == "" {
					log.Error().Msg("Audio data is missing in 'input_audio_buffer.append'")
					sendError(c, "missing_audio_data", "Audio data is missing", "", "")
					continue
				}

				// Decode base64 audio data
				decodedAudio, err := base64.StdEncoding.DecodeString(incomingMsg.Audio)
				if err != nil {
					log.Error().Msgf("failed to decode audio data: %s", err.Error())
					sendError(c, "invalid_audio_data", "Failed to decode audio data", "", "")
					continue
				}

				// Append to InputAudioBuffer
				session.AudioBufferLock.Lock()
				session.InputAudioBuffer = append(session.InputAudioBuffer, decodedAudio...)
				session.AudioBufferLock.Unlock()

			case types.ClientEventTypeInputAudioBufferCommit:
				log.Debug().Msgf("recv: %s", msg)

				// TODO: Trigger transcription.
				// TODO: Ignore this if VAD is enabled, or interrupt VAD?

				if session.TranscriptionOnly {
					continue
				}

				// Commit the audio buffer to the conversation as a new item.
				// Encode and reset under the lock: the VAD goroutine may clear
				// the buffer concurrently.
				session.AudioBufferLock.Lock()
				encodedAudio := base64.StdEncoding.EncodeToString(session.InputAudioBuffer)
				session.InputAudioBuffer = nil
				session.AudioBufferLock.Unlock()

				item := &types.MessageItem{
					ID:     generateItemID(),
					Type:   "message",
					Status: "completed",
					Role:   "user",
					Content: []types.MessageContentPart{
						{
							Type:  "input_audio",
							Audio: encodedAudio,
						},
					},
				}

				// Add item to conversation
				conversation.Lock.Lock()
				conversation.Items = append(conversation.Items, item)
				conversation.Lock.Unlock()

				// Send item.created event
				sendEvent(c, types.ConversationItemCreatedEvent{
					ServerEventBase: types.ServerEventBase{
						EventID: "event_TODO",
						Type:    "conversation.item.created",
					},
					Item: types.ResponseMessageItem{
						Object:      "realtime.item",
						MessageItem: *item,
					},
				})

			case types.ClientEventTypeConversationItemCreate:
				log.Debug().Msgf("recv: %s", msg)

				// Handle creating new conversation items
				var item types.ConversationItemCreateEvent
				if err := json.Unmarshal(incomingMsg.Item, &item); err != nil {
					log.Error().Msgf("failed to unmarshal 'conversation.item.create': %s", err.Error())
					sendError(c, "invalid_item", "Invalid item format", "", "")
					continue
				}

				sendNotImplemented(c, "conversation.item.create")

				// Generate item ID and set status
				// item.ID = generateItemID()
				// item.Object = "realtime.item"
				// item.Status = "completed"
				//
				// // Add item to conversation
				// conversation.Lock.Lock()
				// conversation.Items = append(conversation.Items, &item)
				// conversation.Lock.Unlock()
				//
				// // Send item.created event
				// sendEvent(c, OutgoingMessage{
				// 	Type: "conversation.item.created",
				// 	Item: &item,
				// })

			case types.ClientEventTypeConversationItemDelete:
				sendError(c, "not_implemented", "Deleting items not implemented", "", "event_TODO")

			case types.ClientEventTypeResponseCreate:
				// Handle generating a response
				var responseCreate types.ResponseCreateEvent
				if len(incomingMsg.Response) > 0 {
					if err := json.Unmarshal(incomingMsg.Response, &responseCreate); err != nil {
						log.Error().Msgf("failed to unmarshal 'response.create' response object: %s", err.Error())
						sendError(c, "invalid_response_create", "Invalid response create format", "", "")
						continue
					}
				}

				// Update session functions if provided
				if len(responseCreate.Response.Tools) > 0 {
					// TODO: Tools -> Functions
				}

				sendNotImplemented(c, "response.create")

				// TODO: Generate a response based on the conversation history
				// wg.Add(1)
				// go func() {
				// 	defer wg.Done()
				// 	generateResponse(cfg, evaluator, session, conversation, responseCreate, c, mt)
				// }()

			case types.ClientEventTypeResponseCancel:
				log.Debug().Msgf("recv: %s", msg)

				// Handle cancellation of ongoing responses
				// Implement cancellation logic as needed
				sendNotImplemented(c, "response.cancel")

			default:
				log.Error().Msgf("unknown message type: %s", incomingMsg.Type)
				sendError(c, "unknown_message_type", fmt.Sprintf("Unknown message type: %s", incomingMsg.Type), "", "")
			}
		}

		// Close the done channel to signal goroutines to exit
		close(done)
		wg.Wait()

		// Remove the session from the sessions map
		sessionLock.Lock()
		delete(sessions, sessionID)
		sessionLock.Unlock()
	}
}

// Helper function to send events to the client.
// Note: this is called both from the read loop and from the VAD goroutine; the
// underlying WebSocket connection does not support concurrent writers, so
// serializing writes here (e.g. with a mutex) may be needed.
func sendEvent(c *websocket.Conn, event types.ServerEvent) {
	eventBytes, err := json.Marshal(event)
	if err != nil {
		log.Error().Msgf("failed to marshal event: %s", err.Error())
		return
	}
	if err = c.WriteMessage(websocket.TextMessage, eventBytes); err != nil {
		log.Error().Msgf("write: %s", err.Error())
	}
}

// Helper function to send errors to the client.
// TODO: param is currently accepted but not forwarded.
func sendError(c *websocket.Conn, code, message, param, eventID string) {
	errorEvent := types.ErrorEvent{
		ServerEventBase: types.ServerEventBase{
			Type:    types.ServerEventTypeError,
			EventID: eventID,
		},
		Error: types.Error{
			Type:    "invalid_request_error",
			Code:    code,
			Message: message,
			EventID: eventID,
		},
	}

	sendEvent(c, errorEvent)
}

func sendNotImplemented(c *websocket.Conn, message string) {
	sendError(c, "not_implemented", message, "", "event_TODO")
}

func updateTransSession(session *Session, update *types.ClientSession, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) error {
	sessionLock.Lock()
	defer sessionLock.Unlock()

	trUpd := update.InputAudioTranscription
	trCur := session.InputAudioTranscription

	if trUpd != nil && trUpd.Model != "" && trUpd.Model != trCur.Model {
		pipeline := config.Pipeline{
			VAD:           vadModel,
			Transcription: trUpd.Model,
		}

		m, _, err := newTranscriptionOnlyModel(&pipeline, cl, ml, appConfig)
		if err != nil {
			return err
		}

		session.ModelInterface = m
	}

	if update.TurnDetection != nil && update.TurnDetection.Type != "" {
		session.TurnDetection.Type = types.ServerTurnDetectionType(update.TurnDetection.Type)
		session.TurnDetection.TurnDetectionParams = update.TurnDetection.TurnDetectionParams
	}

	return nil
}

// Function to update session configurations
func updateSession(session *Session, update *types.ClientSession, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) error {
	sessionLock.Lock()
	defer sessionLock.Unlock()

	if update.Model != "" {
		pipeline := config.Pipeline{
			LLM: update.Model,
			// TODO: Set up the pipeline by configuring STT and TTS models
		}
		m, err := newModel(&pipeline, cl, ml, appConfig)
		if err != nil {
			return err
		}
		session.ModelInterface = m
		session.Model = update.Model
	}

	if update.Voice != "" {
		session.Voice = update.Voice
	}
	if update.TurnDetection != nil && update.TurnDetection.Type != "" {
		session.TurnDetection.Type = types.ServerTurnDetectionType(update.TurnDetection.Type)
		session.TurnDetection.TurnDetectionParams = update.TurnDetection.TurnDetectionParams
	}
	// TODO: We should actually check if the field was present in the JSON; an empty string means clear the settings
	if update.Instructions != "" {
		session.Instructions = update.Instructions
	}
	if update.Tools != nil {
		return fmt.Errorf("tools are not implemented yet")
	}

	// Guarded so that a 'session.update' without this field does not clear the
	// transcription settings (mirrors the presence checks above).
	if update.InputAudioTranscription != nil {
		session.InputAudioTranscription = update.InputAudioTranscription
	}

	return nil
}

// handleVAD is a goroutine that listens for audio data from the client,
// runs VAD on the audio data, and commits utterances to the conversation
func handleVAD(cfg *config.ModelConfig, evaluator *templates.Evaluator, session *Session, conv *Conversation, c *websocket.Conn, done chan struct{}) {
	vadContext, cancel := context.WithCancel(context.Background())
	go func() {
		<-done
		cancel()
	}()

	silenceThreshold := float64(session.TurnDetection.SilenceDurationMs) / 1000
	speechStarted := false
	startTime := time.Now()

	ticker := time.NewTicker(300 * time.Millisecond)
	defer ticker.Stop()

	for {
		select {
		case <-done:
			return
		case <-ticker.C:
			session.AudioBufferLock.Lock()
			allAudio := make([]byte, len(session.InputAudioBuffer))
			copy(allAudio, session.InputAudioBuffer)
			session.AudioBufferLock.Unlock()

			aints := sound.BytesToInt16sLE(allAudio)
			// Skip until at least a silence window's worth of audio has
			// accumulated. The threshold is fractional seconds, so multiply
			// before truncating to an int (int(0.5)*rate would always be 0).
			if len(aints) == 0 || len(aints) < int(silenceThreshold*remoteSampleRate) {
				continue
			}

			// Resample from 24kHz to 16kHz
			aints = sound.ResampleInt16(aints, remoteSampleRate, localSampleRate)

			segments, err := runVAD(vadContext, session, aints)
			if err != nil {
				if err.Error() == "unexpected speech end" {
					log.Debug().Msg("VAD cancelled")
					continue
				}
				log.Error().Msgf("failed to process audio: %s", err.Error())
				sendError(c, "processing_error", "Failed to process audio: "+err.Error(), "", "")
				continue
			}

			audioLength := float64(len(aints)) / localSampleRate

			// TODO: When resetting the buffer we should retain a small postfix
			// TODO: The OpenAI documentation seems to suggest that only the client decides when to clear the buffer
			if len(segments) == 0 && audioLength > silenceThreshold {
				session.AudioBufferLock.Lock()
				session.InputAudioBuffer = nil
				session.AudioBufferLock.Unlock()
				log.Debug().Msg("Detected silence for a while, clearing audio buffer")

				sendEvent(c, types.InputAudioBufferClearedEvent{
					ServerEventBase: types.ServerEventBase{
						EventID: "event_TODO",
						Type:    types.ServerEventTypeInputAudioBufferCleared,
					},
				})

				continue
			} else if len(segments) == 0 {
				continue
			}

			if !speechStarted {
				sendEvent(c, types.InputAudioBufferSpeechStartedEvent{
					ServerEventBase: types.ServerEventBase{
						EventID: "event_TODO",
						Type:    types.ServerEventTypeInputAudioBufferSpeechStarted,
					},
					AudioStartMs: time.Since(startTime).Milliseconds(),
				})
				speechStarted = true
			}

			// Segment still in progress when the audio ended
			segEndTime := segments[len(segments)-1].GetEnd()
			if segEndTime == 0 {
				continue
			}

			if float32(audioLength)-segEndTime > float32(silenceThreshold) {
				log.Debug().Msg("Detected end of speech segment")
				session.AudioBufferLock.Lock()
				session.InputAudioBuffer = nil
				session.AudioBufferLock.Unlock()

				sendEvent(c, types.InputAudioBufferSpeechStoppedEvent{
					ServerEventBase: types.ServerEventBase{
						EventID: "event_TODO",
						Type:    types.ServerEventTypeInputAudioBufferSpeechStopped,
					},
					AudioEndMs: time.Since(startTime).Milliseconds(),
				})
				speechStarted = false

				sendEvent(c, types.InputAudioBufferCommittedEvent{
					ServerEventBase: types.ServerEventBase{
						EventID: "event_TODO",
						Type:    types.ServerEventTypeInputAudioBufferCommitted,
					},
					ItemID:         generateItemID(),
					PreviousItemID: "TODO",
				})

				abytes := sound.Int16toBytesLE(aints)
				// TODO: Remove prefix silence that is over TurnDetectionParams.PrefixPaddingMs
				go commitUtterance(vadContext, abytes, cfg, evaluator, session, conv, c)
			}
		}
	}
}
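
// For reference, one detected utterance produces this event sequence on the
// socket (as implemented above): input_audio_buffer.speech_started ->
// input_audio_buffer.speech_stopped -> input_audio_buffer.committed ->
// response.audio_transcript.done (the last one from commitUtterance, when
// transcription is configured).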

func commitUtterance(ctx context.Context, utt []byte, cfg *config.ModelConfig, evaluator *templates.Evaluator, session *Session, conv *Conversation, c *websocket.Conn) {
	if len(utt) == 0 {
		return
	}

	// TODO: If we have a real any-to-any model then transcription is optional

	f, err := os.CreateTemp("", "realtime-audio-chunk-*.wav")
	if err != nil {
		log.Error().Msgf("failed to create temp file: %s", err.Error())
		return
	}
	defer f.Close()
	defer os.Remove(f.Name())
	log.Debug().Msgf("Writing to %s", f.Name())

	hdr := laudio.NewWAVHeader(uint32(len(utt)))
	if err := hdr.Write(f); err != nil {
		log.Error().Msgf("Failed to write WAV header: %s", err.Error())
		return
	}

	if _, err := f.Write(utt); err != nil {
		log.Error().Msgf("Failed to write audio data: %s", err.Error())
		return
	}

	f.Sync()

	if session.InputAudioTranscription != nil {
		tr, err := session.ModelInterface.Transcribe(ctx, &proto.TranscriptRequest{
			Dst:       f.Name(),
			Language:  session.InputAudioTranscription.Language,
			Translate: false,
			Threads:   uint32(*cfg.Threads),
		})
		if err != nil {
			sendError(c, "transcription_failed", err.Error(), "", "event_TODO")
			// Do not emit a transcript-done event for a failed transcription.
			return
		}

		sendEvent(c, types.ResponseAudioTranscriptDoneEvent{
			ServerEventBase: types.ServerEventBase{
				Type:    types.ServerEventTypeResponseAudioTranscriptDone,
				EventID: "event_TODO",
			},

			ItemID:       generateItemID(),
			ResponseID:   "resp_TODO",
			OutputIndex:  0,
			ContentIndex: 0,
			Transcript:   tr.GetText(),
		})
		// TODO: Update the prompt with the transcription result?
	}

	if !session.TranscriptionOnly {
		sendNotImplemented(c, "Committing items to the conversation not implemented")
	}

	// TODO: Commit the audio and/or transcribed text to the conversation
	// Commit logic: create item, broadcast item.created, etc.
	// item := &Item{
	// 	ID:     generateItemID(),
	// 	Object: "realtime.item",
	// 	Type:   "message",
	// 	Status: "completed",
	// 	Role:   "user",
	// 	Content: []ConversationContent{
	// 		{
	// 			Type:  "input_audio",
	// 			Audio: base64.StdEncoding.EncodeToString(utt),
	// 		},
	// 	},
	// }
	// conv.Lock.Lock()
	// conv.Items = append(conv.Items, item)
	// conv.Lock.Unlock()
	//
	// sendEvent(c, OutgoingMessage{
	// 	Type: "conversation.item.created",
	// 	Item: item,
	// })
	//
	// // trigger the response generation
	// generateResponse(cfg, evaluator, session, conv, ResponseCreate{}, c, websocket.TextMessage)
}

func runVAD(ctx context.Context, session *Session, adata []int16) ([]*proto.VADSegment, error) {
	soundIntBuffer := &audio.IntBuffer{
		Format:         &audio.Format{SampleRate: localSampleRate, NumChannels: 1},
		SourceBitDepth: 16,
		Data:           sound.ConvertInt16ToInt(adata),
	}

	float32Data := soundIntBuffer.AsFloat32Buffer().Data

	resp, err := session.ModelInterface.VAD(ctx, &proto.VADRequest{
		Audio: float32Data,
	})
	if err != nil {
		return nil, err
	}

	// If resp.Segments is empty => no speech
	return resp.Segments, nil
}

// TODO: The below is needed for normal mode, as opposed to transcription-only mode.
// Function to generate a response based on the conversation
// func generateResponse(config *config.ModelConfig, evaluator *templates.Evaluator, session *Session, conversation *Conversation, responseCreate ResponseCreate, c *websocket.Conn, mt int) {
//
// 	log.Debug().Msg("Generating realtime response...")
//
// 	// Compile the conversation history
// 	conversation.Lock.Lock()
// 	var conversationHistory []schema.Message
// 	var latestUserAudio string
// 	for _, item := range conversation.Items {
// 		for _, content := range item.Content {
// 			switch content.Type {
// 			case "input_text", "text":
// 				conversationHistory = append(conversationHistory, schema.Message{
// 					Role:          string(item.Role),
// 					StringContent: content.Text,
// 					Content:       content.Text,
// 				})
// 			case "input_audio":
// 				// We do not turn the audio into text here.
// 				// When generating from the LLM later on,
// 				// we will also generate text, return it, and store it in the conversation.
// 				// Here we just want to get the user audio, if there is any, as a new input for the conversation.
// 				if item.Role == "user" {
// 					latestUserAudio = content.Audio
// 				}
// 			}
// 		}
// 	}
//
// 	conversation.Lock.Unlock()
//
// 	var generatedText string
// 	var generatedAudio []byte
// 	var functionCall *FunctionCall
// 	var err error
//
// 	if latestUserAudio != "" {
// 		// Process the latest user audio input
// 		decodedAudio, err := base64.StdEncoding.DecodeString(latestUserAudio)
// 		if err != nil {
// 			log.Error().Msgf("failed to decode latest user audio: %s", err.Error())
// 			sendError(c, "invalid_audio_data", "Failed to decode audio data", "", "")
// 			return
// 		}
//
// 		// Process the audio input and generate a response
// 		generatedText, generatedAudio, functionCall, err = processAudioResponse(session, decodedAudio)
// 		if err != nil {
// 			log.Error().Msgf("failed to process audio response: %s", err.Error())
// 			sendError(c, "processing_error", "Failed to generate audio response", "", "")
// 			return
// 		}
// 	} else {
//
// 		if session.Instructions != "" {
// 			conversationHistory = append([]schema.Message{{
// 				Role:          "system",
// 				StringContent: session.Instructions,
// 				Content:       session.Instructions,
// 			}}, conversationHistory...)
// 		}
//
// 		funcs := session.Functions
// 		shouldUseFn := len(funcs) > 0 && config.ShouldUseFunctions()
//
// 		// Allow the user to set custom actions via config file
// 		// to be "embedded" in each model
// 		noActionName := "answer"
// 		noActionDescription := "use this action to answer without performing any action"
//
// 		if config.FunctionsConfig.NoActionFunctionName != "" {
// 			noActionName = config.FunctionsConfig.NoActionFunctionName
// 		}
// 		if config.FunctionsConfig.NoActionDescriptionName != "" {
// 			noActionDescription = config.FunctionsConfig.NoActionDescriptionName
// 		}
//
// 		if (!config.FunctionsConfig.GrammarConfig.NoGrammar) && shouldUseFn {
// 			noActionGrammar := functions.Function{
// 				Name:        noActionName,
// 				Description: noActionDescription,
// 				Parameters: map[string]interface{}{
// 					"properties": map[string]interface{}{
// 						"message": map[string]interface{}{
// 							"type":        "string",
// 							"description": "The message to reply the user with",
// 						}},
// 				},
// 			}
//
// 			// Append the no action function
// 			if !config.FunctionsConfig.DisableNoAction {
// 				funcs = append(funcs, noActionGrammar)
// 			}
//
// 			// Update input grammar
// 			jsStruct := funcs.ToJSONStructure(config.FunctionsConfig.FunctionNameKey, config.FunctionsConfig.FunctionNameKey)
// 			g, err := jsStruct.Grammar(config.FunctionsConfig.GrammarOptions()...)
// 			if err == nil {
// 				config.Grammar = g
// 			}
// 		}
//
// 		// Generate a response based on text conversation history
// 		prompt := evaluator.TemplateMessages(conversationHistory, config, funcs, shouldUseFn)
//
// 		generatedText, functionCall, err = processTextResponse(config, session, prompt)
// 		if err != nil {
// 			log.Error().Msgf("failed to process text response: %s", err.Error())
// 			sendError(c, "processing_error", "Failed to generate text response", "", "")
// 			return
// 		}
// 		log.Debug().Any("text", generatedText).Msg("Generated text response")
// 	}
//
// 	if functionCall != nil {
// 		// The model wants to call a function
// 		// Create a function_call item and send it to the client
// 		item := &Item{
// 			ID:           generateItemID(),
// 			Object:       "realtime.item",
// 			Type:         "function_call",
// 			Status:       "completed",
// 			Role:         "assistant",
// 			FunctionCall: functionCall,
// 		}
//
// 		// Add item to conversation
// 		conversation.Lock.Lock()
// 		conversation.Items = append(conversation.Items, item)
// 		conversation.Lock.Unlock()
//
// 		// Send item.created event
// 		sendEvent(c, OutgoingMessage{
// 			Type: "conversation.item.created",
// 			Item: item,
// 		})
//
// 		// Optionally, you can generate a message to the user indicating the function call
// 		// For now, we'll assume the client handles the function call and may trigger another response
//
// 	} else {
// 		// Send response.stream messages
// 		if generatedAudio != nil {
// 			// If generatedAudio is available, send it as audio
// 			encodedAudio := base64.StdEncoding.EncodeToString(generatedAudio)
// 			outgoingMsg := OutgoingMessage{
// 				Type:  "response.stream",
// 				Audio: encodedAudio,
// 			}
// 			sendEvent(c, outgoingMsg)
// 		} else {
// 			// Send text response (could be streamed in chunks)
// 			chunks := splitResponseIntoChunks(generatedText)
// 			for _, chunk := range chunks {
// 				outgoingMsg := OutgoingMessage{
// 					Type:    "response.stream",
// 					Content: chunk,
// 				}
// 				sendEvent(c, outgoingMsg)
// 			}
// 		}
//
// 		// Send response.done message
// 		sendEvent(c, OutgoingMessage{
// 			Type: "response.done",
// 		})
//
// 		// Add the assistant's response to the conversation
// 		content := []ConversationContent{}
// 		if generatedAudio != nil {
// 			content = append(content, ConversationContent{
// 				Type:  "audio",
// 				Audio: base64.StdEncoding.EncodeToString(generatedAudio),
// 			})
// 			// Optionally include a text transcript
// 			if generatedText != "" {
// 				content = append(content, ConversationContent{
// 					Type: "text",
// 					Text: generatedText,
// 				})
// 			}
// 		} else {
// 			content = append(content, ConversationContent{
// 				Type: "text",
// 				Text: generatedText,
// 			})
// 		}
//
// 		item := &Item{
// 			ID:      generateItemID(),
// 			Object:  "realtime.item",
// 			Type:    "message",
// 			Status:  "completed",
// 			Role:    "assistant",
// 			Content: content,
// 		}
//
// 		// Add item to conversation
// 		conversation.Lock.Lock()
// 		conversation.Items = append(conversation.Items, item)
// 		conversation.Lock.Unlock()
//
// 		// Send item.created event
// 		sendEvent(c, OutgoingMessage{
// 			Type: "conversation.item.created",
// 			Item: item,
// 		})
//
// 		log.Debug().Any("item", item).Msg("Realtime response sent")
// 	}
// }

// Function to process text response and detect function calls
func processTextResponse(config *config.ModelConfig, session *Session, prompt string) (string, *FunctionCall, error) {

	// Placeholder implementation
	// Replace this with actual model inference logic using session.Model and prompt
	// For example, the model might return a special token or JSON indicating a function call

	/*
		predFunc, err := backend.ModelInference(context.Background(), prompt, input.Messages, images, videos, audios, ml, *config, o, nil)

		result, tokenUsage, err := ComputeChoices(input, prompt, config, startupOptions, ml, func(s string, c *[]schema.Choice) {
			if !shouldUseFn {
				// no function is called, just reply and use stop as finish reason
				*c = append(*c, schema.Choice{FinishReason: "stop", Index: 0, Message: &schema.Message{Role: "assistant", Content: &s}})
				return
			}

			textContentToReturn = functions.ParseTextContent(s, config.FunctionsConfig)
			s = functions.CleanupLLMResult(s, config.FunctionsConfig)
			results := functions.ParseFunctionCall(s, config.FunctionsConfig)
			log.Debug().Msgf("Text content to return: %s", textContentToReturn)
			noActionsToRun := len(results) > 0 && results[0].Name == noActionName || len(results) == 0

			switch {
			case noActionsToRun:
				result, err := handleQuestion(config, input, ml, startupOptions, results, s, predInput)
				if err != nil {
					log.Error().Err(err).Msg("error handling question")
					return
				}
				*c = append(*c, schema.Choice{
					Message: &schema.Message{Role: "assistant", Content: &result}})
			default:
				toolChoice := schema.Choice{
					Message: &schema.Message{
						Role: "assistant",
					},
				}

				if len(input.Tools) > 0 {
					toolChoice.FinishReason = "tool_calls"
				}

				for _, ss := range results {
					name, args := ss.Name, ss.Arguments
					if len(input.Tools) > 0 {
						// If we are using tools, we condense the function calls into
						// a single response choice with all the tools
						toolChoice.Message.Content = textContentToReturn
						toolChoice.Message.ToolCalls = append(toolChoice.Message.ToolCalls,
							schema.ToolCall{
								ID:   id,
								Type: "function",
								FunctionCall: schema.FunctionCall{
									Name:      name,
									Arguments: args,
								},
							},
						)
					} else {
						// otherwise we return more choices directly
						*c = append(*c, schema.Choice{
							FinishReason: "function_call",
							Message: &schema.Message{
								Role:    "assistant",
								Content: &textContentToReturn,
								FunctionCall: map[string]interface{}{
									"name":      name,
									"arguments": args,
								},
							},
						})
					}
				}

				if len(input.Tools) > 0 {
					// we need to append our result if we are using tools
					*c = append(*c, toolChoice)
				}
			}

		}, nil)
		if err != nil {
			return err
		}

		resp := &schema.OpenAIResponse{
			ID:      id,
			Created: created,
			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
			Choices: result,
			Object:  "chat.completion",
			Usage: schema.OpenAIUsage{
				PromptTokens:     tokenUsage.Prompt,
				CompletionTokens: tokenUsage.Completion,
				TotalTokens:      tokenUsage.Prompt + tokenUsage.Completion,
			},
		}
		respData, _ := json.Marshal(resp)
		log.Debug().Msgf("Response: %s", respData)

		// Return the prediction in the response body
		return c.JSON(resp)
	*/

	// TODO: use session.ModelInterface...
	// Simulate a function call
	if strings.Contains(prompt, "weather") {
		functionCall := &FunctionCall{
			Name: "get_weather",
			Arguments: map[string]interface{}{
				"location": "New York",
				"scale":    "celsius",
			},
		}
		return "", functionCall, nil
	}

	// Otherwise, return a normal text response
	return "This is a generated response based on the conversation.", nil, nil
}

// Function to process audio response and detect function calls
func processAudioResponse(session *Session, audioData []byte) (string, []byte, *FunctionCall, error) {
	// TODO: Do the below, or use an any-to-any model like Qwen Omni.
	// Implement the actual model inference logic using session.Model and audioData.
	// For example:
	// 1. Transcribe the audio to text
	// 2. Generate a response based on the transcribed text
	// 3. Check if the model wants to call a function
	// 4. Convert the response text to speech (audio)
	//
	// Placeholder implementation:

	// TODO: template eventual messages, like chat.go
	reply, err := session.ModelInterface.Predict(context.Background(), &proto.PredictOptions{
		Prompt: "What's the weather in New York?",
	})

	if err != nil {
		return "", nil, nil, err
	}

	generatedAudio := reply.Audio

	transcribedText := "What's the weather in New York?"
	var functionCall *FunctionCall

	// Simulate a function call
	if strings.Contains(transcribedText, "weather") {
		functionCall = &FunctionCall{
			Name: "get_weather",
			Arguments: map[string]interface{}{
				"location": "New York",
				"scale":    "celsius",
			},
		}
		return "", nil, functionCall, nil
	}

	// Generate a response
	generatedText := "This is a response to your speech input."

	return generatedText, generatedAudio, nil, nil
}

// Function to split the response into chunks (for streaming)
func splitResponseIntoChunks(response string) []string {
	// Split the response into chunks of fixed size
	chunkSize := 50 // characters per chunk
	var chunks []string
	for len(response) > 0 {
		if len(response) > chunkSize {
			chunks = append(chunks, response[:chunkSize])
			response = response[chunkSize:]
		} else {
			chunks = append(chunks, response)
			break
		}
	}
	return chunks
}
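
// Hedged alternative sketch: the byte-based splitter above can cut a
// multi-byte UTF-8 rune in half at a chunk boundary; this variant splits on
// rune boundaries instead. Illustrative only; nothing in this file calls it.
func splitResponseIntoRuneChunks(response string) []string {
	const chunkSize = 50 // runes per chunk
	runes := []rune(response)
	var chunks []string
	for len(runes) > 0 {
		n := chunkSize
		if len(runes) < n {
			n = len(runes)
		}
		chunks = append(chunks, string(runes[:n]))
		runes = runes[n:]
	}
	return chunks
}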

// Helper functions to generate unique IDs
func generateSessionID() string {
	return "sess_" + generateUniqueID()
}

func generateConversationID() string {
	return "conv_" + generateUniqueID()
}

func generateItemID() string {
	return "item_" + generateUniqueID()
}

func generateUniqueID() string {
	// The original placeholder returned a constant, so every session,
	// conversation, and item would share one ID; use random hex so IDs are
	// actually unique.
	b := make([]byte, 12)
	if _, err := rand.Read(b); err != nil {
		// Fall back to a timestamp if the entropy source fails.
		return fmt.Sprintf("%d", time.Now().UnixNano())
	}
	return hex.EncodeToString(b)
}

// Structures for 'response.create' messages
type ResponseCreate struct {
	Modalities   []string            `json:"modalities,omitempty"`
	Instructions string              `json:"instructions,omitempty"`
	Functions    functions.Functions `json:"functions,omitempty"`
	// Other fields as needed
}
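
// Example (hedged) of the JSON a client might send for 'response.create',
// matching the field tags above:
//
//	{
//	  "modalities": ["text", "audio"],
//	  "instructions": "Reply briefly.",
//	  "functions": []
//	}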