From eae4ca08dae7c181c45c3924990fced4eaf39be2 Mon Sep 17 00:00:00 2001 From: Max Goltzsche Date: Mon, 21 Jul 2025 09:15:55 +0200 Subject: [PATCH] feat(openai): support input_audio chat api field (#5870) Improve the chat completion endpoint's OpenAI API compatibility by supporting message content parts of type `input_audio`, e.g.: ``` { ... "messages": [ { "role": "user", "content": [{ "type": "input_audio", "input_audio": { "data": "", "format": "wav" } }] } ] } ``` Closes #5869 Signed-off-by: Max Goltzsche --- core/http/middleware/request.go | 7 ++++++- core/schema/openai.go | 18 +++++++++++++----- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/core/http/middleware/request.go b/core/http/middleware/request.go index 5d0093cfb..3a9177f51 100644 --- a/core/http/middleware/request.go +++ b/core/http/middleware/request.go @@ -308,7 +308,7 @@ func mergeOpenAIRequestAndBackendConfig(config *config.BackendConfig, input *sch input.Messages[i].StringVideos = append(input.Messages[i].StringVideos, base64) // TODO: make sure that we only return base64 stuff vidIndex++ nrOfVideosInMessage++ - case "audio_url", "audio", "input_audio": + case "audio_url", "audio": // Decode content as base64 either if it's an URL or base64 text base64, err := utils.GetContentURIAsBase64(pp.AudioURL.URL) if err != nil { @@ -318,6 +318,11 @@ func mergeOpenAIRequestAndBackendConfig(config *config.BackendConfig, input *sch input.Messages[i].StringAudios = append(input.Messages[i].StringAudios, base64) // TODO: make sure that we only return base64 stuff audioIndex++ nrOfAudiosInMessage++ + case "input_audio": + // TODO: make sure that we only return base64 stuff + input.Messages[i].StringAudios = append(input.Messages[i].StringAudios, pp.InputAudio.Data) + audioIndex++ + nrOfAudiosInMessage++ case "image_url", "image": // Decode content as base64 either if it's an URL or base64 text base64, err := utils.GetContentURIAsBase64(pp.ImageURL.URL) diff --git a/core/schema/openai.go 
b/core/schema/openai.go index 8eb203646..c8947b99a 100644 --- a/core/schema/openai.go +++ b/core/schema/openai.go @@ -58,17 +58,25 @@ type Choice struct { } type Content struct { - Type string `json:"type" yaml:"type"` - Text string `json:"text" yaml:"text"` - ImageURL ContentURL `json:"image_url" yaml:"image_url"` - AudioURL ContentURL `json:"audio_url" yaml:"audio_url"` - VideoURL ContentURL `json:"video_url" yaml:"video_url"` + Type string `json:"type" yaml:"type"` + Text string `json:"text" yaml:"text"` + ImageURL ContentURL `json:"image_url" yaml:"image_url"` + AudioURL ContentURL `json:"audio_url" yaml:"audio_url"` + VideoURL ContentURL `json:"video_url" yaml:"video_url"` + InputAudio InputAudio `json:"input_audio" yaml:"input_audio"` } type ContentURL struct { URL string `json:"url" yaml:"url"` } +type InputAudio struct { + // Format identifies the audio format, e.g. 'wav'. + Format string `json:"format" yaml:"format"` + // Data holds the base64-encoded audio data. + Data string `json:"data" yaml:"data"` +} + type Message struct { // The message role Role string `json:"role,omitempty" yaml:"role"`