feat(openai): support input_audio chat api field (#5870)

Improving the chat completion endpoint OpenAI API compatibility by supporting messages of type `input_audio`, e.g.:
```
{
  ...
  "messages": [
    {
      "role": "user",
      "content": [{
        "type": "input_audio",
        "input_audio": {
          "data": "<base64-encoded audio data>",
          "format": "wav"
        }
      }]
    }
  ]
}
```

Closes #5869

Signed-off-by: Max Goltzsche <max.goltzsche@gmail.com>
This commit is contained in:
Max Goltzsche
2025-07-21 09:15:55 +02:00
committed by GitHub
parent fa284f7445
commit eae4ca08da
2 changed files with 19 additions and 6 deletions

View File

@@ -308,7 +308,7 @@ func mergeOpenAIRequestAndBackendConfig(config *config.BackendConfig, input *sch
input.Messages[i].StringVideos = append(input.Messages[i].StringVideos, base64) // TODO: make sure that we only return base64 stuff
vidIndex++
nrOfVideosInMessage++
case "audio_url", "audio", "input_audio":
case "audio_url", "audio":
// Decode content as base64 either if it's an URL or base64 text
base64, err := utils.GetContentURIAsBase64(pp.AudioURL.URL)
if err != nil {
@@ -318,6 +318,11 @@ func mergeOpenAIRequestAndBackendConfig(config *config.BackendConfig, input *sch
input.Messages[i].StringAudios = append(input.Messages[i].StringAudios, base64) // TODO: make sure that we only return base64 stuff
audioIndex++
nrOfAudiosInMessage++
case "input_audio":
// TODO: make sure that we only return base64 stuff
input.Messages[i].StringAudios = append(input.Messages[i].StringAudios, pp.InputAudio.Data)
audioIndex++
nrOfAudiosInMessage++
case "image_url", "image":
// Decode content as base64 either if it's an URL or base64 text
base64, err := utils.GetContentURIAsBase64(pp.ImageURL.URL)

View File

@@ -58,17 +58,25 @@ type Choice struct {
}
type Content struct {
Type string `json:"type" yaml:"type"`
Text string `json:"text" yaml:"text"`
ImageURL ContentURL `json:"image_url" yaml:"image_url"`
AudioURL ContentURL `json:"audio_url" yaml:"audio_url"`
VideoURL ContentURL `json:"video_url" yaml:"video_url"`
Type string `json:"type" yaml:"type"`
Text string `json:"text" yaml:"text"`
ImageURL ContentURL `json:"image_url" yaml:"image_url"`
AudioURL ContentURL `json:"audio_url" yaml:"audio_url"`
VideoURL ContentURL `json:"video_url" yaml:"video_url"`
InputAudio InputAudio `json:"input_audio" yaml:"input_audio"`
}
type ContentURL struct {
URL string `json:"url" yaml:"url"`
}
type InputAudio struct {
// Format identifies the audio format, e.g. 'wav'.
Format string `json:"format" yaml:"format"`
// Data holds the base64-encoded audio data.
Data string `json:"data" yaml:"data"`
}
type Message struct {
// The message role
Role string `json:"role,omitempty" yaml:"role"`