From 106f4b777efb8911adc3966497f79a9dfd5dc6b6 Mon Sep 17 00:00:00 2001 From: Haruko386 Date: Thu, 14 May 2026 18:58:00 +0800 Subject: [PATCH] Go: implement TTS for fishaudio, openrouter and asr for fishaudio (#14926) ### What problem does this PR solve? This PR implements TTS for the FishAudio and OpenRouter providers and ASR for FishAudio; it also forwards the requested output format in MiniMax TTS requests. **The following functionalities are now supported:** **FishAudio:** - [x] Text To Speech - [x] Stream Text To Speech - [x] Audio To Text **OpenRouter:** - [x] Text To Speech **Verified examples from the CLI:** ```plaintext **FishAudio** RAGFlow(user)> tts with 's1@test@fishaudio' text 'He who desires but acts not, breeds pestilence.' play format 'wav' save './internal' param '{"reference_id": "90e65eaaf50e4470b8e6d43ee6afd7d5", "temperature": 0.7, "top_p": 0.7, "prosody": {"speed": 1, "volume": 0, "normalize_loudness": true}, "chunk_length": 300, "normalize": true, "sample_rate": 44100, "mp3_bitrate": 128, "latency": "normal", "max_new_tokens": 1024, "repetition_penalty": 1.2, "min_chunk_length": 50, "condition_on_previous_chunks": true, "early_stop_threshold": 1}' Saved to directory: /home/infiniflow/Documents/development/ragflow/internal/s1_output.wav SUCCESS RAGFlow(user)> stream tts with 's1@test@fishaudio' text 'He who desires but acts not, breeds pestilence.' 
play format 'wav' save './internal' param '{"reference_id": "90e65eaaf50e4470b8e6d43ee6afd7d5", "temperature": 0.7, "top_p": 0.7, "prosody": {"speed": 1, "volume": 0, "normalize_loudness": true}, "chunk_length": 300, "normalize": true, "sample_rate": 44100, "mp3_bitrate": 128, "latency": "normal", "max_new_tokens": 1024, "repetition_penalty": 1.2, "min_chunk_length": 50, "condition_on_previous_chunks": true, "early_stop_threshold": 1}' Saved to directory: /home/infiniflow/Documents/development/ragflow/internal/s1_output.wav SUCCESS RAGFlow(user)> asr with 'transcribe-1@test@fishaudio' audio './internal/test.wav' param '{"language": "en", "ignore_timestamps": true}' +----------------------------------------------------------------------------------------------------------------------+ | text | +----------------------------------------------------------------------------------------------------------------------+ | The examination and testimony of the experts enabled the commission to conclude that five shots may have been fired. 
| +----------------------------------------------------------------------------------------------------------------------+ ``` ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature (non-breaking change which adds functionality) - [x] Refactoring --- conf/models/fishaudio.json | 26 ++- conf/models/openrouter.json | 10 +- internal/cli/user_command.go | 38 +++- internal/cli/user_parser.go | 23 ++- internal/entity/models/fishaudio.go | 272 ++++++++++++++++++++++++++- internal/entity/models/minimax.go | 17 +- internal/entity/models/openrouter.go | 59 +++++- internal/entity/models/types.go | 6 +- internal/handler/providers.go | 18 +- 9 files changed, 436 insertions(+), 33 deletions(-) diff --git a/conf/models/fishaudio.json b/conf/models/fishaudio.json index 585aab3369..aa6beda964 100644 --- a/conf/models/fishaudio.json +++ b/conf/models/fishaudio.json @@ -5,10 +5,32 @@ }, "url_suffix": { "models": "model", - "balance": "self/package" + "balance": "self/package", + "tts": "v1/tts", + "asr": "v1/asr" }, "class": "fishaudio", "models": [ - + { + "name": "s2-pro", + "max_tokens": 8192, + "model_types": [ + "tts" + ] + }, + { + "name": "s1", + "max_tokens": 8192, + "model_types": [ + "tts" + ] + }, + { + "name": "transcribe-1", + "max_tokens": 8192, + "model_types": [ + "asr" + ] + } ] } \ No newline at end of file diff --git a/conf/models/openrouter.json b/conf/models/openrouter.json index 6af1e2d15d..33d0bdadbd 100644 --- a/conf/models/openrouter.json +++ b/conf/models/openrouter.json @@ -8,7 +8,8 @@ "models": "models", "embedding": "embeddings", "rerank": "rerank", - "balance": "credits" + "balance": "credits", + "tts": "audio/speech" }, "class": "openrouter", "models": [ @@ -44,6 +45,13 @@ "default_value": true, "clear_thinking": true } + }, + { + "name": "openai/gpt-audio-mini", + "max_tokens": 131072, + "model_types": [ + "tts" + ] } ] } \ No newline at end of file diff --git a/internal/cli/user_command.go 
b/internal/cli/user_command.go index 7a2b275967..f631a25275 100644 --- a/internal/cli/user_command.go +++ b/internal/cli/user_command.go @@ -2013,7 +2013,7 @@ func (c *RAGFlowClient) TTSUserCommand(cmd *Command) (ResponseIf, error) { if explicitFormat != "" { ttsConfigPayload["format"] = explicitFormat } else { - explicitFormat = "mp3" + ttsConfigPayload["format"] = "mp3" } if len(ttsConfigPayload) > 0 { @@ -2056,7 +2056,6 @@ func (c *RAGFlowClient) TTSUserCommand(cmd *Command) (ResponseIf, error) { shouldSave, _ := cmd.Params["save"].(bool) saveDir, _ := cmd.Params["save_path"].(string) - fileName := fmt.Sprintf("%s_output.%s", modelName, explicitFormat) cwd, err := os.Getwd() @@ -2149,14 +2148,27 @@ func (c *RAGFlowClient) ASRUserCommand(cmd *Command) (ResponseIf, error) { audioFile, ok := cmd.Params["audio_file"].(string) if !ok { - return nil, fmt.Errorf("text not provided") + return nil, fmt.Errorf("audio file not provided") } payload := map[string]interface{}{ "provider_name": providerName, "instance_name": instanceName, "model_name": modelName, - "audio_file": audioFile, + "file": audioFile, + } + + asrConfigPayload := make(map[string]interface{}) + if paramStr, ok := cmd.Params["param_str"].(string); ok && paramStr != "" { + var dynamicParams map[string]interface{} + if err := json.Unmarshal([]byte(paramStr), &dynamicParams); err != nil { + return nil, fmt.Errorf("param string must be valid JSON. 
Error: %w", err) + } + asrConfigPayload["params"] = dynamicParams + } + + if len(asrConfigPayload) > 0 { + payload["asr_config"] = asrConfigPayload } url := "/audio/transcriptions" @@ -2168,13 +2180,23 @@ func (c *RAGFlowClient) ASRUserCommand(cmd *Command) (ResponseIf, error) { if resp.StatusCode != 200 { return nil, fmt.Errorf("failed to ASR document: HTTP %d, body: %s", resp.StatusCode, string(resp.Body)) } - var result CommonResponse - if err = json.Unmarshal(resp.Body, &result); err != nil { + var rawResult struct { + Code int `json:"code"` + Message string `json:"message"` + Data map[string]interface{} `json:"data"` + } + + if err = json.Unmarshal(resp.Body, &rawResult); err != nil { return nil, fmt.Errorf("ASR document failed: invalid JSON (%w)", err) } - if result.Code != 0 { - return nil, fmt.Errorf("%s", result.Message) + + if rawResult.Code != 0 { + return nil, fmt.Errorf("%s", rawResult.Message) } + + var result CommonResponse + result.Code = rawResult.Code + result.Message = rawResult.Data["text"].(string) // TODO result.Duration = resp.Duration return &result, nil diff --git a/internal/cli/user_parser.go b/internal/cli/user_parser.go index 04ebc7e87e..28f3207114 100644 --- a/internal/cli/user_parser.go +++ b/internal/cli/user_parser.go @@ -2753,7 +2753,7 @@ func (p *Parser) parseASRCommand() (*Command, error) { if p.curToken.Type != TokenAudio { return nil, fmt.Errorf("expected AUDIO to ASR") } - p.nextToken() // consume FILE + p.nextToken() // consume AUDIO audioFile, err := p.parseQuotedString() if err != nil { @@ -2761,14 +2761,29 @@ func (p *Parser) parseASRCommand() (*Command, error) { } p.nextToken() + cmd := NewCommand("asr_user_command") + cmd.Params["composite_model_name"] = compositeModelName + cmd.Params["audio_file"] = audioFile + + for p.curToken.Type != TokenEOF && p.curToken.Type != TokenSemicolon { + switch p.curToken.Type { + case TokenParam: + p.nextToken() + if p.curToken.Type != TokenQuotedString { + return nil, fmt.Errorf("expect 
quoted string after 'param'") + } + cmd.Params["param_str"] = strings.Trim(p.curToken.Value, "\"'") + p.nextToken() + default: + return nil, fmt.Errorf("unexpected token in asr command: %s", p.curToken.Value) + } + } + // Semicolon is optional for UNSET TOKEN if p.curToken.Type == TokenSemicolon { p.nextToken() } - cmd := NewCommand("asr_user_command") - cmd.Params["composite_model_name"] = compositeModelName - cmd.Params["audio_file"] = audioFile return cmd, nil } diff --git a/internal/entity/models/fishaudio.go b/internal/entity/models/fishaudio.go index 1e9f0aa9d5..0eec1b1653 100644 --- a/internal/entity/models/fishaudio.go +++ b/internal/entity/models/fishaudio.go @@ -1,10 +1,17 @@ package models import ( + "bufio" + "bytes" + "encoding/base64" "encoding/json" "fmt" "io" + "mime/multipart" "net/http" + "os" + "path/filepath" + "strconv" "strings" "time" ) @@ -64,20 +71,273 @@ func (f *FishAudioModel) Rerank(modelName *string, query string, documents []str // TranscribeAudio transcribe audio func (f *FishAudioModel) TranscribeAudio(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig) (*ASRResponse, error) { - return nil, fmt.Errorf("%s, no such method", f.Name()) + + if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" { + return nil, fmt.Errorf("FishAudio API key is missing") + } + + if file == nil || *file == "" { + return nil, fmt.Errorf("file is missing") + } + + region := "default" + if apiConfig.Region != nil && *apiConfig.Region != "" { + region = *apiConfig.Region + } + + url := fmt.Sprintf("%s/%s", f.BaseURL[region], f.URLSuffix.ASR) + + var body bytes.Buffer + writer := multipart.NewWriter(&body) + + // audio file + audioFile, err := os.Open(*file) + if err != nil { + return nil, fmt.Errorf("failed to open audio file: %w", err) + } + defer audioFile.Close() + + part, err := writer.CreateFormFile("audio", filepath.Base(*file)) + if err != nil { + return nil, fmt.Errorf("failed to create multipart file: %w", 
err) + } + + if _, err = io.Copy(part, audioFile); err != nil { + return nil, fmt.Errorf("failed to copy audio data: %w", err) + } + + // extra params + if asrConfig != nil && asrConfig.Params != nil { + for key, value := range asrConfig.Params { + + var val string + + switch v := value.(type) { + case string: + val = v + case bool: + val = strconv.FormatBool(v) + case int: + val = strconv.Itoa(v) + case float64: + val = strconv.FormatFloat(v, 'f', -1, 64) + default: + val = fmt.Sprintf("%v", v) + } + + if err := writer.WriteField(key, val); err != nil { + return nil, fmt.Errorf("failed to write field %s: %w", key, err) + } + } + } + + if err := writer.Close(); err != nil { + return nil, fmt.Errorf("failed to close multipart writer: %w", err) + } + + // request + req, err := http.NewRequest("POST", url, &body) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", *apiConfig.ApiKey)) + req.Header.Set("Content-Type", writer.FormDataContentType()) + + resp, err := f.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to send request: %w", err) + } + defer resp.Body.Close() + + respBody, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read response body: %w", err) + } + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf( + "FishAudio ASR error: %s - %s", + resp.Status, + string(respBody), + ) + } + + // result + var result struct { + Text string `json:"text"` + } + + if err := json.Unmarshal(respBody, &result); err != nil { + return nil, fmt.Errorf("failed to unmarshal response: %w", err) + } + + return &ASRResponse{ + Text: result.Text, + }, nil } -func (z *FishAudioModel) TranscribeAudioWithSender(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig, sender func(*string, *string) error) error { - return fmt.Errorf("%s, no such method", z.Name()) +func (f *FishAudioModel) 
TranscribeAudioWithSender(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig, sender func(*string, *string) error) error { + return fmt.Errorf("%s, no such method", f.Name()) } // AudioSpeech convert audio to text func (f *FishAudioModel) AudioSpeech(modelName *string, audioContent *string, apiConfig *APIConfig, asrConfig *TTSConfig) (*TTSResponse, error) { - return nil, fmt.Errorf("%s, no such method", f.Name()) + if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" { + return nil, fmt.Errorf("FishAudio API key is missing") + } + + if audioContent == nil || *audioContent == "" { + return nil, fmt.Errorf("text content is missing") + } + + var region = "default" + if apiConfig.Region != nil && *apiConfig.Region != "" { + region = *apiConfig.Region + } + + url := fmt.Sprintf("%s/%s", f.BaseURL[region], f.URLSuffix.TTS) + + reqBody := map[string]interface{}{ + "text": *audioContent, + } + + if asrConfig != nil && asrConfig.Params != nil { + for key, value := range asrConfig.Params { + reqBody[key] = value + } + } + if asrConfig != nil && asrConfig.Format != "" { + reqBody["format"] = asrConfig.Format + } + + jsonData, err := json.Marshal(reqBody) + if err != nil { + return nil, fmt.Errorf("failed to marshal request: %w", err) + } + + req, err := http.NewRequest("POST", url, bytes.NewBuffer(jsonData)) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", *apiConfig.ApiKey)) + req.Header.Set("model", *modelName) + + resp, err := f.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to send request: %w", err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read response body: %w", err) + } + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("%s - %s", resp.Status, string(body)) 
+ } + + return &TTSResponse{Audio: body}, nil } -func (z *FishAudioModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error { - return fmt.Errorf("%s, no such method", z.Name()) +func (f *FishAudioModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, asrConfig *TTSConfig, sender func(*string, *string) error) error { + if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" { + return fmt.Errorf("FishAudio API key is missing") + } + + if audioContent == nil || *audioContent == "" { + return fmt.Errorf("text content is missing") + } + + var region = "default" + if apiConfig.Region != nil && *apiConfig.Region != "" { + region = *apiConfig.Region + } + + url := fmt.Sprintf("%s/%s/%s", f.BaseURL[region], f.URLSuffix.TTS, "stream/with-timestamp") + + reqBody := map[string]interface{}{ + "text": *audioContent, + } + + if asrConfig != nil && asrConfig.Params != nil { + for key, value := range asrConfig.Params { + reqBody[key] = value + } + } + if asrConfig != nil && asrConfig.Format != "" { + reqBody["format"] = asrConfig.Format + } + + jsonData, err := json.Marshal(reqBody) + if err != nil { + return fmt.Errorf("failed to marshal request: %w", err) + } + + // Build Request + req, err := http.NewRequest("POST", url, bytes.NewBuffer(jsonData)) + if err != nil { + return fmt.Errorf("failed to create request: %w", err) + } + + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", *apiConfig.ApiKey)) + req.Header.Set("model", *modelName) + + resp, err := f.httpClient.Do(req) + if err != nil { + return fmt.Errorf("failed to send request: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + buf := make([]byte, 1024) + n, _ := resp.Body.Read(buf) + return fmt.Errorf("FishAudio stream API error: %d - %s", resp.StatusCode, string(buf[:n])) + } 
+ + scanner := bufio.NewScanner(resp.Body) + scanner.Buffer(make([]byte, 64*1024), 8*1024*1024) + + for scanner.Scan() { + line := scanner.Text() + + if !strings.HasPrefix(line, "data: ") { + continue + } + + dataStr := strings.TrimSpace(line[6:]) + if dataStr == "" { + continue + } + + var event struct { + AudioBase64 string `json:"audio_base64"` + } + + if err := json.Unmarshal([]byte(dataStr), &event); err != nil { + continue + } + + if event.AudioBase64 != "" { + audioBytes, err := base64.StdEncoding.DecodeString(event.AudioBase64) + if err == nil && len(audioBytes) > 0 { + chunk := string(audioBytes) + if errSend := sender(&chunk, nil); errSend != nil { + return errSend + } + } + } + } + + if err := scanner.Err(); err != nil { + return fmt.Errorf("error reading FishAudio stream: %w", err) + } + + return nil } // OCRFile OCR file diff --git a/internal/entity/models/minimax.go b/internal/entity/models/minimax.go index 1f8afe3b94..683a8dc454 100644 --- a/internal/entity/models/minimax.go +++ b/internal/entity/models/minimax.go @@ -478,7 +478,7 @@ func (z *MinimaxModel) AudioSpeech(modelName *string, audioContent *string, apiC } url := fmt.Sprintf("%s/%s", z.BaseURL[region], z.URLSuffix.TTS) - + reqBody := map[string]interface{}{ "model": modelName, "text": audioContent, @@ -488,6 +488,11 @@ func (z *MinimaxModel) AudioSpeech(modelName *string, audioContent *string, apiC reqBody[key] = value } } + if asrConfig != nil && asrConfig.Format != "" { + reqBody["audio_setting"] = map[string]interface{}{ + "format": asrConfig.Format, + } + } reqBody["stream"] = false jsonData, err := json.Marshal(reqBody) @@ -547,7 +552,6 @@ func (z *MinimaxModel) AudioSpeech(modelName *string, audioContent *string, apiC }, nil } -// tts with 'speech-2.8-hd@test@minimax' text 'If that day, out position was switched, would our fate, be different?' 
voice 'English_expressive_narrator' param '{"voice_setting": {"voice_id": "English_expressive_narrator", "speed": 1, "vol": 1, "pitch": 0}, "audio_setting": {"sample_rate": 32000, "bitrate": 128000, "format": "wav", "channel": 1}, "output_format": "hex"}' func (z *MinimaxModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error { if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" { return fmt.Errorf("MiniMax API key is missing") @@ -581,6 +585,13 @@ func (z *MinimaxModel) AudioSpeechWithSender(modelName *string, audioContent *st } } reqBody["stream"] = false + + if ttsConfig != nil && ttsConfig.Format != "" { + reqBody["audio_setting"] = map[string]interface{}{ + "format": ttsConfig.Format, + } + } + reqBody["stream"] = true jsonData, err := json.Marshal(reqBody) @@ -658,4 +669,4 @@ func (z *MinimaxModel) AudioSpeechWithSender(modelName *string, audioContent *st // OCRFile OCR file func (m *MinimaxModel) OCRFile(modelName *string, content []byte, url *string, apiConfig *APIConfig, ocrConfig *OCRConfig) (*OCRResponse, error) { return nil, fmt.Errorf("%s, no such method", m.Name()) -} +} \ No newline at end of file diff --git a/internal/entity/models/openrouter.go b/internal/entity/models/openrouter.go index d401ba02bb..461a1fe4c3 100644 --- a/internal/entity/models/openrouter.go +++ b/internal/entity/models/openrouter.go @@ -545,7 +545,64 @@ func (z *OpenRouterModel) TranscribeAudioWithSender(modelName *string, file *str // AudioSpeech convert audio to text func (o *OpenRouterModel) AudioSpeech(modelName *string, audioContent *string, apiConfig *APIConfig, asrConfig *TTSConfig) (*TTSResponse, error) { - return nil, fmt.Errorf("%s, no such method", o.Name()) + if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" { + return nil, fmt.Errorf("OpenRouter API key is missing") + } + if audioContent == nil || *audioContent == "" { + 
return nil, fmt.Errorf("text content is empty") + } + + var region = "default" + if apiConfig.Region != nil && *apiConfig.Region != "" { + region = *apiConfig.Region + } + + url := fmt.Sprintf("%s/%s", o.BaseURL[region], o.URLSuffix.TTS) + + // OpenRouter:response Audio bytes stream + reqBody := map[string]interface{}{ + "model": modelName, + "input": audioContent, + } + + if asrConfig != nil && asrConfig.Params != nil { + for key, value := range asrConfig.Params { + reqBody[key] = value + } + } + if asrConfig != nil && asrConfig.Format != "" { + reqBody["response_format"] = asrConfig.Format + } + + jsonData, err := json.Marshal(reqBody) + if err != nil { + return nil, fmt.Errorf("failed to marshal request: %w", err) + } + + req, err := http.NewRequest("POST", url, bytes.NewBuffer(jsonData)) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", *apiConfig.ApiKey)) + + resp, err := o.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to send request: %w", err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read response body: %w", err) + } + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("OpenRouter API error: %s, body: %s", resp.Status, string(body)) + } + + return &TTSResponse{Audio: body}, nil } func (z *OpenRouterModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error { diff --git a/internal/entity/models/types.go b/internal/entity/models/types.go index 3de7ac5158..b0bc9b46b0 100644 --- a/internal/entity/models/types.go +++ b/internal/entity/models/types.go @@ -62,6 +62,7 @@ type RerankResponse struct { } type ASRResponse struct { + Text string `json:"text"` } type TTSResponse struct { @@ -85,6 +86,7 @@ type 
URLSuffix struct { Files string `json:"files"` Status string `json:"status"` TTS string `json:"tts"` + ASR string `json:"asr"` } type ChatConfig struct { @@ -115,10 +117,12 @@ type RerankConfig struct { } type ASRConfig struct { + Params map[string]interface{} `json:"params"` } type TTSConfig struct { - Params map[string]interface{} + Format string `json:"format"` + Params map[string]interface{} `json:"params"` } type OCRConfig struct { diff --git a/internal/handler/providers.go b/internal/handler/providers.go index 3b060a5097..c6d61a2e65 100644 --- a/internal/handler/providers.go +++ b/internal/handler/providers.go @@ -1049,13 +1049,14 @@ func (h *ProviderHandler) RerankDocument(c *gin.Context) { } type TranscribeAudioRequest struct { - ProviderName *string `json:"provider_name"` - InstanceName *string `json:"instance_name"` - ModelName *string `json:"model_name"` - File *string `json:"file"` - Language []string `json:"language"` - Prompt int `json:"prompt"` - Stream bool `json:"stream"` + ProviderName *string `json:"provider_name"` + InstanceName *string `json:"instance_name"` + ModelName *string `json:"model_name"` + File *string `json:"file"` + Language []string `json:"language"` + Prompt int `json:"prompt"` + Stream bool `json:"stream"` + ASRConfig *models.ASRConfig `json:"asr_config"` } func (h *ProviderHandler) TranscribeAudio(c *gin.Context) { @@ -1101,6 +1102,9 @@ func (h *ProviderHandler) TranscribeAudio(c *gin.Context) { } asrConfig := models.ASRConfig{} + if req.ASRConfig != nil { + asrConfig = *req.ASRConfig + } // Check if it's a stream request if req.Stream {