Go: implement TTS for fishaudio, openrouter and asr for fishaudio (#14926)

### What problem does this PR solve?

This PR implements TTS for the FishAudio and OpenRouter providers and ASR
for FishAudio. It also passes the requested audio format through to the
MiniMax TTS requests.

**The following functionalities are now supported:**

**FishAudio:**
- [x] Text To Speech
- [x] Stream Text To Speech
- [x] Audio To Text

**OpenRouter:**

- [x] Text To Speech

**Verified examples from the CLI:**
**FishAudio**

```plaintext

RAGFlow(user)> tts with 's1@test@fishaudio' text 'He who desires but acts not, breeds pestilence.' play format 'wav' save './internal' param '{"reference_id": "90e65eaaf50e4470b8e6d43ee6afd7d5", "temperature": 0.7, "top_p": 0.7, "prosody": {"speed": 1, "volume": 0, "normalize_loudness": true}, "chunk_length": 300, "normalize": true, "sample_rate": 44100, "mp3_bitrate": 128, "latency": "normal", "max_new_tokens": 1024, "repetition_penalty": 1.2, "min_chunk_length": 50, "condition_on_previous_chunks": true, "early_stop_threshold": 1}'
Saved to directory: /home/infiniflow/Documents/development/ragflow/internal/s1_output.wav
SUCCESS

RAGFlow(user)> stream tts with 's1@test@fishaudio' text 'He who desires but acts not, breeds pestilence.' play format 'wav' save './internal' param '{"reference_id": "90e65eaaf50e4470b8e6d43ee6afd7d5", "temperature": 0.7, "top_p": 0.7, "prosody": {"speed": 1, "volume": 0, "normalize_loudness": true}, "chunk_length": 300, "normalize": true, "sample_rate": 44100, "mp3_bitrate": 128, "latency": "normal", "max_new_tokens": 1024, "repetition_penalty": 1.2, "min_chunk_length": 50, "condition_on_previous_chunks": true, "early_stop_threshold": 1}'
Saved to directory: /home/infiniflow/Documents/development/ragflow/internal/s1_output.wav
SUCCESS

RAGFlow(user)> asr with 'transcribe-1@test@fishaudio' audio './internal/test.wav' param '{"language": "en", "ignore_timestamps": true}'
+----------------------------------------------------------------------------------------------------------------------+
| text                                                                                                                 |
+----------------------------------------------------------------------------------------------------------------------+
| The examination and testimony of the experts enabled the commission to conclude that five shots may have been fired. |
+----------------------------------------------------------------------------------------------------------------------+

```

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
This commit is contained in:
Haruko386
2026-05-14 18:58:00 +08:00
committed by GitHub
parent a98994ff91
commit 106f4b777e
9 changed files with 436 additions and 33 deletions

View File

@@ -5,10 +5,32 @@
}, },
"url_suffix": { "url_suffix": {
"models": "model", "models": "model",
"balance": "self/package" "balance": "self/package",
"tts": "v1/tts",
"asr": "v1/asr"
}, },
"class": "fishaudio", "class": "fishaudio",
"models": [ "models": [
{
"name": "s2-pro",
"max_tokens": 8192,
"model_types": [
"tts"
]
},
{
"name": "s1",
"max_tokens": 8192,
"model_types": [
"tts"
]
},
{
"name": "transcribe-1",
"max_tokens": 8192,
"model_types": [
"asr"
]
}
] ]
} }

View File

@@ -8,7 +8,8 @@
"models": "models", "models": "models",
"embedding": "embeddings", "embedding": "embeddings",
"rerank": "rerank", "rerank": "rerank",
"balance": "credits" "balance": "credits",
"tts": "audio/speech"
}, },
"class": "openrouter", "class": "openrouter",
"models": [ "models": [
@@ -44,6 +45,13 @@
"default_value": true, "default_value": true,
"clear_thinking": true "clear_thinking": true
} }
},
{
"name": "openai/gpt-audio-mini",
"max_tokens": 131072,
"model_types": [
"tts"
]
} }
] ]
} }

View File

@@ -2013,7 +2013,7 @@ func (c *RAGFlowClient) TTSUserCommand(cmd *Command) (ResponseIf, error) {
if explicitFormat != "" { if explicitFormat != "" {
ttsConfigPayload["format"] = explicitFormat ttsConfigPayload["format"] = explicitFormat
} else { } else {
explicitFormat = "mp3" ttsConfigPayload["format"] = "mp3"
} }
if len(ttsConfigPayload) > 0 { if len(ttsConfigPayload) > 0 {
@@ -2056,7 +2056,6 @@ func (c *RAGFlowClient) TTSUserCommand(cmd *Command) (ResponseIf, error) {
shouldSave, _ := cmd.Params["save"].(bool) shouldSave, _ := cmd.Params["save"].(bool)
saveDir, _ := cmd.Params["save_path"].(string) saveDir, _ := cmd.Params["save_path"].(string)
fileName := fmt.Sprintf("%s_output.%s", modelName, explicitFormat) fileName := fmt.Sprintf("%s_output.%s", modelName, explicitFormat)
cwd, err := os.Getwd() cwd, err := os.Getwd()
@@ -2149,14 +2148,27 @@ func (c *RAGFlowClient) ASRUserCommand(cmd *Command) (ResponseIf, error) {
audioFile, ok := cmd.Params["audio_file"].(string) audioFile, ok := cmd.Params["audio_file"].(string)
if !ok { if !ok {
return nil, fmt.Errorf("text not provided") return nil, fmt.Errorf("audio file not provided")
} }
payload := map[string]interface{}{ payload := map[string]interface{}{
"provider_name": providerName, "provider_name": providerName,
"instance_name": instanceName, "instance_name": instanceName,
"model_name": modelName, "model_name": modelName,
"audio_file": audioFile, "file": audioFile,
}
asrConfigPayload := make(map[string]interface{})
if paramStr, ok := cmd.Params["param_str"].(string); ok && paramStr != "" {
var dynamicParams map[string]interface{}
if err := json.Unmarshal([]byte(paramStr), &dynamicParams); err != nil {
return nil, fmt.Errorf("param string must be valid JSON. Error: %w", err)
}
asrConfigPayload["params"] = dynamicParams
}
if len(asrConfigPayload) > 0 {
payload["asr_config"] = asrConfigPayload
} }
url := "/audio/transcriptions" url := "/audio/transcriptions"
@@ -2168,13 +2180,23 @@ func (c *RAGFlowClient) ASRUserCommand(cmd *Command) (ResponseIf, error) {
if resp.StatusCode != 200 { if resp.StatusCode != 200 {
return nil, fmt.Errorf("failed to ASR document: HTTP %d, body: %s", resp.StatusCode, string(resp.Body)) return nil, fmt.Errorf("failed to ASR document: HTTP %d, body: %s", resp.StatusCode, string(resp.Body))
} }
var result CommonResponse var rawResult struct {
if err = json.Unmarshal(resp.Body, &result); err != nil { Code int `json:"code"`
Message string `json:"message"`
Data map[string]interface{} `json:"data"`
}
if err = json.Unmarshal(resp.Body, &rawResult); err != nil {
return nil, fmt.Errorf("ASR document failed: invalid JSON (%w)", err) return nil, fmt.Errorf("ASR document failed: invalid JSON (%w)", err)
} }
if result.Code != 0 {
return nil, fmt.Errorf("%s", result.Message) if rawResult.Code != 0 {
return nil, fmt.Errorf("%s", rawResult.Message)
} }
var result CommonResponse
result.Code = rawResult.Code
result.Message = rawResult.Data["text"].(string) // TODO
result.Duration = resp.Duration result.Duration = resp.Duration
return &result, nil return &result, nil

View File

@@ -2753,7 +2753,7 @@ func (p *Parser) parseASRCommand() (*Command, error) {
if p.curToken.Type != TokenAudio { if p.curToken.Type != TokenAudio {
return nil, fmt.Errorf("expected AUDIO to ASR") return nil, fmt.Errorf("expected AUDIO to ASR")
} }
p.nextToken() // consume FILE p.nextToken() // consume AUDIO
audioFile, err := p.parseQuotedString() audioFile, err := p.parseQuotedString()
if err != nil { if err != nil {
@@ -2761,14 +2761,29 @@ func (p *Parser) parseASRCommand() (*Command, error) {
} }
p.nextToken() p.nextToken()
cmd := NewCommand("asr_user_command")
cmd.Params["composite_model_name"] = compositeModelName
cmd.Params["audio_file"] = audioFile
for p.curToken.Type != TokenEOF && p.curToken.Type != TokenSemicolon {
switch p.curToken.Type {
case TokenParam:
p.nextToken()
if p.curToken.Type != TokenQuotedString {
return nil, fmt.Errorf("expect quoted string after 'param'")
}
cmd.Params["param_str"] = strings.Trim(p.curToken.Value, "\"'")
p.nextToken()
default:
return nil, fmt.Errorf("unexpected token in asr command: %s", p.curToken.Value)
}
}
// Semicolon is optional for UNSET TOKEN // Semicolon is optional for UNSET TOKEN
if p.curToken.Type == TokenSemicolon { if p.curToken.Type == TokenSemicolon {
p.nextToken() p.nextToken()
} }
cmd := NewCommand("asr_user_command")
cmd.Params["composite_model_name"] = compositeModelName
cmd.Params["audio_file"] = audioFile
return cmd, nil return cmd, nil
} }

View File

@@ -1,10 +1,17 @@
package models package models
import ( import (
"bufio"
"bytes"
"encoding/base64"
"encoding/json" "encoding/json"
"fmt" "fmt"
"io" "io"
"mime/multipart"
"net/http" "net/http"
"os"
"path/filepath"
"strconv"
"strings" "strings"
"time" "time"
) )
@@ -64,20 +71,273 @@ func (f *FishAudioModel) Rerank(modelName *string, query string, documents []str
// TranscribeAudio transcribe audio // TranscribeAudio transcribe audio
func (f *FishAudioModel) TranscribeAudio(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig) (*ASRResponse, error) { func (f *FishAudioModel) TranscribeAudio(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig) (*ASRResponse, error) {
return nil, fmt.Errorf("%s, no such method", f.Name())
if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" {
return nil, fmt.Errorf("FishAudio API key is missing")
}
if file == nil || *file == "" {
return nil, fmt.Errorf("file is missing")
}
region := "default"
if apiConfig.Region != nil && *apiConfig.Region != "" {
region = *apiConfig.Region
}
url := fmt.Sprintf("%s/%s", f.BaseURL[region], f.URLSuffix.ASR)
var body bytes.Buffer
writer := multipart.NewWriter(&body)
// audio file
audioFile, err := os.Open(*file)
if err != nil {
return nil, fmt.Errorf("failed to open audio file: %w", err)
}
defer audioFile.Close()
part, err := writer.CreateFormFile("audio", filepath.Base(*file))
if err != nil {
return nil, fmt.Errorf("failed to create multipart file: %w", err)
}
if _, err = io.Copy(part, audioFile); err != nil {
return nil, fmt.Errorf("failed to copy audio data: %w", err)
}
// extra params
if asrConfig != nil && asrConfig.Params != nil {
for key, value := range asrConfig.Params {
var val string
switch v := value.(type) {
case string:
val = v
case bool:
val = strconv.FormatBool(v)
case int:
val = strconv.Itoa(v)
case float64:
val = strconv.FormatFloat(v, 'f', -1, 64)
default:
val = fmt.Sprintf("%v", v)
}
if err := writer.WriteField(key, val); err != nil {
return nil, fmt.Errorf("failed to write field %s: %w", key, err)
}
}
}
if err := writer.Close(); err != nil {
return nil, fmt.Errorf("failed to close multipart writer: %w", err)
}
// request
req, err := http.NewRequest("POST", url, &body)
if err != nil {
return nil, fmt.Errorf("failed to create request: %w", err)
}
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", *apiConfig.ApiKey))
req.Header.Set("Content-Type", writer.FormDataContentType())
resp, err := f.httpClient.Do(req)
if err != nil {
return nil, fmt.Errorf("failed to send request: %w", err)
}
defer resp.Body.Close()
respBody, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("failed to read response body: %w", err)
}
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf(
"FishAudio ASR error: %s - %s",
resp.Status,
string(respBody),
)
}
// result
var result struct {
Text string `json:"text"`
}
if err := json.Unmarshal(respBody, &result); err != nil {
return nil, fmt.Errorf("failed to unmarshal response: %w", err)
}
return &ASRResponse{
Text: result.Text,
}, nil
} }
func (z *FishAudioModel) TranscribeAudioWithSender(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig, sender func(*string, *string) error) error { func (f *FishAudioModel) TranscribeAudioWithSender(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig, sender func(*string, *string) error) error {
return fmt.Errorf("%s, no such method", z.Name()) return fmt.Errorf("%s, no such method", f.Name())
} }
// AudioSpeech convert audio to text // AudioSpeech convert audio to text
func (f *FishAudioModel) AudioSpeech(modelName *string, audioContent *string, apiConfig *APIConfig, asrConfig *TTSConfig) (*TTSResponse, error) { func (f *FishAudioModel) AudioSpeech(modelName *string, audioContent *string, apiConfig *APIConfig, asrConfig *TTSConfig) (*TTSResponse, error) {
return nil, fmt.Errorf("%s, no such method", f.Name()) if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" {
return nil, fmt.Errorf("FishAudio API key is missing")
}
if audioContent == nil || *audioContent == "" {
return nil, fmt.Errorf("text content is missing")
}
var region = "default"
if apiConfig.Region != nil && *apiConfig.Region != "" {
region = *apiConfig.Region
}
url := fmt.Sprintf("%s/%s", f.BaseURL[region], f.URLSuffix.TTS)
reqBody := map[string]interface{}{
"text": *audioContent,
}
if asrConfig != nil && asrConfig.Params != nil {
for key, value := range asrConfig.Params {
reqBody[key] = value
}
}
if asrConfig != nil && asrConfig.Format != "" {
reqBody["format"] = asrConfig.Format
}
jsonData, err := json.Marshal(reqBody)
if err != nil {
return nil, fmt.Errorf("failed to marshal request: %w", err)
}
req, err := http.NewRequest("POST", url, bytes.NewBuffer(jsonData))
if err != nil {
return nil, fmt.Errorf("failed to create request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", *apiConfig.ApiKey))
req.Header.Set("model", *modelName)
resp, err := f.httpClient.Do(req)
if err != nil {
return nil, fmt.Errorf("failed to send request: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("failed to read response body: %w", err)
}
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("%s - %s", resp.Status, string(body))
}
return &TTSResponse{Audio: body}, nil
} }
func (z *FishAudioModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error { func (f *FishAudioModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, asrConfig *TTSConfig, sender func(*string, *string) error) error {
return fmt.Errorf("%s, no such method", z.Name()) if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" {
return fmt.Errorf("FishAudio API key is missing")
}
if audioContent == nil || *audioContent == "" {
return fmt.Errorf("text content is missing")
}
var region = "default"
if apiConfig.Region != nil && *apiConfig.Region != "" {
region = *apiConfig.Region
}
url := fmt.Sprintf("%s/%s/%s", f.BaseURL[region], f.URLSuffix.TTS, "stream/with-timestamp")
reqBody := map[string]interface{}{
"text": *audioContent,
}
if asrConfig != nil && asrConfig.Params != nil {
for key, value := range asrConfig.Params {
reqBody[key] = value
}
}
if asrConfig != nil && asrConfig.Format != "" {
reqBody["format"] = asrConfig.Format
}
jsonData, err := json.Marshal(reqBody)
if err != nil {
return fmt.Errorf("failed to marshal request: %w", err)
}
// Build Request
req, err := http.NewRequest("POST", url, bytes.NewBuffer(jsonData))
if err != nil {
return fmt.Errorf("failed to create request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", *apiConfig.ApiKey))
req.Header.Set("model", *modelName)
resp, err := f.httpClient.Do(req)
if err != nil {
return fmt.Errorf("failed to send request: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
buf := make([]byte, 1024)
n, _ := resp.Body.Read(buf)
return fmt.Errorf("FishAudio stream API error: %d - %s", resp.StatusCode, string(buf[:n]))
}
scanner := bufio.NewScanner(resp.Body)
scanner.Buffer(make([]byte, 64*1024), 8*1024*1024)
for scanner.Scan() {
line := scanner.Text()
if !strings.HasPrefix(line, "data: ") {
continue
}
dataStr := strings.TrimSpace(line[6:])
if dataStr == "" {
continue
}
var event struct {
AudioBase64 string `json:"audio_base64"`
}
if err := json.Unmarshal([]byte(dataStr), &event); err != nil {
continue
}
if event.AudioBase64 != "" {
audioBytes, err := base64.StdEncoding.DecodeString(event.AudioBase64)
if err == nil && len(audioBytes) > 0 {
chunk := string(audioBytes)
if errSend := sender(&chunk, nil); errSend != nil {
return errSend
}
}
}
}
if err := scanner.Err(); err != nil {
return fmt.Errorf("error reading FishAudio stream: %w", err)
}
return nil
} }
// OCRFile OCR file // OCRFile OCR file

View File

@@ -478,7 +478,7 @@ func (z *MinimaxModel) AudioSpeech(modelName *string, audioContent *string, apiC
} }
url := fmt.Sprintf("%s/%s", z.BaseURL[region], z.URLSuffix.TTS) url := fmt.Sprintf("%s/%s", z.BaseURL[region], z.URLSuffix.TTS)
reqBody := map[string]interface{}{ reqBody := map[string]interface{}{
"model": modelName, "model": modelName,
"text": audioContent, "text": audioContent,
@@ -488,6 +488,11 @@ func (z *MinimaxModel) AudioSpeech(modelName *string, audioContent *string, apiC
reqBody[key] = value reqBody[key] = value
} }
} }
if asrConfig != nil && asrConfig.Format != "" {
reqBody["audio_setting"] = map[string]interface{}{
"format": asrConfig.Format,
}
}
reqBody["stream"] = false reqBody["stream"] = false
jsonData, err := json.Marshal(reqBody) jsonData, err := json.Marshal(reqBody)
@@ -547,7 +552,6 @@ func (z *MinimaxModel) AudioSpeech(modelName *string, audioContent *string, apiC
}, nil }, nil
} }
// tts with 'speech-2.8-hd@test@minimax' text 'If that day, out position was switched, would our fate, be different?' voice 'English_expressive_narrator' param '{"voice_setting": {"voice_id": "English_expressive_narrator", "speed": 1, "vol": 1, "pitch": 0}, "audio_setting": {"sample_rate": 32000, "bitrate": 128000, "format": "wav", "channel": 1}, "output_format": "hex"}'
func (z *MinimaxModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error { func (z *MinimaxModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error {
if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" { if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" {
return fmt.Errorf("MiniMax API key is missing") return fmt.Errorf("MiniMax API key is missing")
@@ -581,6 +585,13 @@ func (z *MinimaxModel) AudioSpeechWithSender(modelName *string, audioContent *st
} }
} }
reqBody["stream"] = false reqBody["stream"] = false
if ttsConfig != nil && ttsConfig.Format != "" {
reqBody["audio_setting"] = map[string]interface{}{
"format": ttsConfig.Format,
}
}
reqBody["stream"] = true reqBody["stream"] = true
jsonData, err := json.Marshal(reqBody) jsonData, err := json.Marshal(reqBody)
@@ -658,4 +669,4 @@ func (z *MinimaxModel) AudioSpeechWithSender(modelName *string, audioContent *st
// OCRFile OCR file // OCRFile OCR file
func (m *MinimaxModel) OCRFile(modelName *string, content []byte, url *string, apiConfig *APIConfig, ocrConfig *OCRConfig) (*OCRResponse, error) { func (m *MinimaxModel) OCRFile(modelName *string, content []byte, url *string, apiConfig *APIConfig, ocrConfig *OCRConfig) (*OCRResponse, error) {
return nil, fmt.Errorf("%s, no such method", m.Name()) return nil, fmt.Errorf("%s, no such method", m.Name())
} }

View File

@@ -545,7 +545,64 @@ func (z *OpenRouterModel) TranscribeAudioWithSender(modelName *string, file *str
// AudioSpeech convert audio to text // AudioSpeech convert audio to text
func (o *OpenRouterModel) AudioSpeech(modelName *string, audioContent *string, apiConfig *APIConfig, asrConfig *TTSConfig) (*TTSResponse, error) { func (o *OpenRouterModel) AudioSpeech(modelName *string, audioContent *string, apiConfig *APIConfig, asrConfig *TTSConfig) (*TTSResponse, error) {
return nil, fmt.Errorf("%s, no such method", o.Name()) if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" {
return nil, fmt.Errorf("OpenRouter API key is missing")
}
if audioContent == nil || *audioContent == "" {
return nil, fmt.Errorf("text content is empty")
}
var region = "default"
if apiConfig.Region != nil && *apiConfig.Region != "" {
region = *apiConfig.Region
}
url := fmt.Sprintf("%s/%s", o.BaseURL[region], o.URLSuffix.TTS)
// OpenRouter:response Audio bytes stream
reqBody := map[string]interface{}{
"model": modelName,
"input": audioContent,
}
if asrConfig != nil && asrConfig.Params != nil {
for key, value := range asrConfig.Params {
reqBody[key] = value
}
}
if asrConfig != nil && asrConfig.Format != "" {
reqBody["response_format"] = asrConfig.Format
}
jsonData, err := json.Marshal(reqBody)
if err != nil {
return nil, fmt.Errorf("failed to marshal request: %w", err)
}
req, err := http.NewRequest("POST", url, bytes.NewBuffer(jsonData))
if err != nil {
return nil, fmt.Errorf("failed to create request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", *apiConfig.ApiKey))
resp, err := o.httpClient.Do(req)
if err != nil {
return nil, fmt.Errorf("failed to send request: %w", err)
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("failed to read response body: %w", err)
}
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("OpenRouter API error: %s, body: %s", resp.Status, string(body))
}
return &TTSResponse{Audio: body}, nil
} }
func (z *OpenRouterModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error { func (z *OpenRouterModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error {

View File

@@ -62,6 +62,7 @@ type RerankResponse struct {
} }
type ASRResponse struct { type ASRResponse struct {
Text string `json:"text"`
} }
type TTSResponse struct { type TTSResponse struct {
@@ -85,6 +86,7 @@ type URLSuffix struct {
Files string `json:"files"` Files string `json:"files"`
Status string `json:"status"` Status string `json:"status"`
TTS string `json:"tts"` TTS string `json:"tts"`
ASR string `json:"asr"`
} }
type ChatConfig struct { type ChatConfig struct {
@@ -115,10 +117,12 @@ type RerankConfig struct {
} }
type ASRConfig struct { type ASRConfig struct {
Params map[string]interface{} `json:"params"`
} }
type TTSConfig struct { type TTSConfig struct {
Params map[string]interface{} Format string `json:"format"`
Params map[string]interface{} `json:"params"`
} }
type OCRConfig struct { type OCRConfig struct {

View File

@@ -1049,13 +1049,14 @@ func (h *ProviderHandler) RerankDocument(c *gin.Context) {
} }
type TranscribeAudioRequest struct { type TranscribeAudioRequest struct {
ProviderName *string `json:"provider_name"` ProviderName *string `json:"provider_name"`
InstanceName *string `json:"instance_name"` InstanceName *string `json:"instance_name"`
ModelName *string `json:"model_name"` ModelName *string `json:"model_name"`
File *string `json:"file"` File *string `json:"file"`
Language []string `json:"language"` Language []string `json:"language"`
Prompt int `json:"prompt"` Prompt int `json:"prompt"`
Stream bool `json:"stream"` Stream bool `json:"stream"`
ASRConfig *models.ASRConfig `json:"asr_config"`
} }
func (h *ProviderHandler) TranscribeAudio(c *gin.Context) { func (h *ProviderHandler) TranscribeAudio(c *gin.Context) {
@@ -1101,6 +1102,9 @@ func (h *ProviderHandler) TranscribeAudio(c *gin.Context) {
} }
asrConfig := models.ASRConfig{} asrConfig := models.ASRConfig{}
if req.ASRConfig != nil {
asrConfig = *req.ASRConfig
}
// Check if it's a stream request // Check if it's a stream request
if req.Stream { if req.Stream {