mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
Go: implement TTS for fishaudio, openrouter and asr for fishaudio (#14926)
### What problem does this PR solve?
This PR implements TTS for the FishAudio and OpenRouter providers, passes the
requested audio format through to MiniMax TTS requests, and adds ASR for
FishAudio.
**The following functionalities are now supported:**
**FishAudio:**
- [x] Text To Speech
- [x] Stream Text To Speech
- [x] Audio To Text
**OpenRouter:**
- [x] Text To Speech
**Verified examples from the CLI:**
```plaintext
**FishAudio**
RAGFlow(user)> tts with 's1@test@fishaudio' text 'He who desires but acts not, breeds pestilence.' play format 'wav' save './internal' param '{"reference_id": "90e65eaaf50e4470b8e6d43ee6afd7d5", "temperature": 0.7, "top_p": 0.7, "prosody": {"speed": 1, "volume": 0, "normalize_loudness": true}, "chunk_length": 300, "normalize": true, "sample_rate": 44100, "mp3_bitrate": 128, "latency": "normal", "max_new_tokens": 1024, "repetition_penalty": 1.2, "min_chunk_length": 50, "condition_on_previous_chunks": true, "early_stop_threshold": 1}'
Saved to directory: /home/infiniflow/Documents/development/ragflow/internal/s1_output.wav
SUCCESS
RAGFlow(user)> stream tts with 's1@test@fishaudio' text 'He who desires but acts not, breeds pestilence.' play format 'wav' save './internal' param '{"reference_id": "90e65eaaf50e4470b8e6d43ee6afd7d5", "temperature": 0.7, "top_p": 0.7, "prosody": {"speed": 1, "volume": 0, "normalize_loudness": true}, "chunk_length": 300, "normalize": true, "sample_rate": 44100, "mp3_bitrate": 128, "latency": "normal", "max_new_tokens": 1024, "repetition_penalty": 1.2, "min_chunk_length": 50, "condition_on_previous_chunks": true, "early_stop_threshold": 1}'
Saved to directory: /home/infiniflow/Documents/development/ragflow/internal/s1_output.wav
SUCCESS
RAGFlow(user)> asr with 'transcribe-1@test@fishaudio' audio './internal/test.wav' param '{"language": "en", "ignore_timestamps": true}'
+----------------------------------------------------------------------------------------------------------------------+
| text |
+----------------------------------------------------------------------------------------------------------------------+
| The examination and testimony of the experts enabled the commission to conclude that five shots may have been fired. |
+----------------------------------------------------------------------------------------------------------------------+
```
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
This commit is contained in:
@@ -5,10 +5,32 @@
|
||||
},
|
||||
"url_suffix": {
|
||||
"models": "model",
|
||||
"balance": "self/package"
|
||||
"balance": "self/package",
|
||||
"tts": "v1/tts",
|
||||
"asr": "v1/asr"
|
||||
},
|
||||
"class": "fishaudio",
|
||||
"models": [
|
||||
|
||||
{
|
||||
"name": "s2-pro",
|
||||
"max_tokens": 8192,
|
||||
"model_types": [
|
||||
"tts"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "s1",
|
||||
"max_tokens": 8192,
|
||||
"model_types": [
|
||||
"tts"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "transcribe-1",
|
||||
"max_tokens": 8192,
|
||||
"model_types": [
|
||||
"asr"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -8,7 +8,8 @@
|
||||
"models": "models",
|
||||
"embedding": "embeddings",
|
||||
"rerank": "rerank",
|
||||
"balance": "credits"
|
||||
"balance": "credits",
|
||||
"tts": "audio/speech"
|
||||
},
|
||||
"class": "openrouter",
|
||||
"models": [
|
||||
@@ -44,6 +45,13 @@
|
||||
"default_value": true,
|
||||
"clear_thinking": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "openai/gpt-audio-mini",
|
||||
"max_tokens": 131072,
|
||||
"model_types": [
|
||||
"tts"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -2013,7 +2013,7 @@ func (c *RAGFlowClient) TTSUserCommand(cmd *Command) (ResponseIf, error) {
|
||||
if explicitFormat != "" {
|
||||
ttsConfigPayload["format"] = explicitFormat
|
||||
} else {
|
||||
explicitFormat = "mp3"
|
||||
ttsConfigPayload["format"] = "mp3"
|
||||
}
|
||||
|
||||
if len(ttsConfigPayload) > 0 {
|
||||
@@ -2056,7 +2056,6 @@ func (c *RAGFlowClient) TTSUserCommand(cmd *Command) (ResponseIf, error) {
|
||||
shouldSave, _ := cmd.Params["save"].(bool)
|
||||
saveDir, _ := cmd.Params["save_path"].(string)
|
||||
|
||||
|
||||
fileName := fmt.Sprintf("%s_output.%s", modelName, explicitFormat)
|
||||
|
||||
cwd, err := os.Getwd()
|
||||
@@ -2149,14 +2148,27 @@ func (c *RAGFlowClient) ASRUserCommand(cmd *Command) (ResponseIf, error) {
|
||||
|
||||
audioFile, ok := cmd.Params["audio_file"].(string)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("text not provided")
|
||||
return nil, fmt.Errorf("audio file not provided")
|
||||
}
|
||||
|
||||
payload := map[string]interface{}{
|
||||
"provider_name": providerName,
|
||||
"instance_name": instanceName,
|
||||
"model_name": modelName,
|
||||
"audio_file": audioFile,
|
||||
"file": audioFile,
|
||||
}
|
||||
|
||||
asrConfigPayload := make(map[string]interface{})
|
||||
if paramStr, ok := cmd.Params["param_str"].(string); ok && paramStr != "" {
|
||||
var dynamicParams map[string]interface{}
|
||||
if err := json.Unmarshal([]byte(paramStr), &dynamicParams); err != nil {
|
||||
return nil, fmt.Errorf("param string must be valid JSON. Error: %w", err)
|
||||
}
|
||||
asrConfigPayload["params"] = dynamicParams
|
||||
}
|
||||
|
||||
if len(asrConfigPayload) > 0 {
|
||||
payload["asr_config"] = asrConfigPayload
|
||||
}
|
||||
|
||||
url := "/audio/transcriptions"
|
||||
@@ -2168,13 +2180,23 @@ func (c *RAGFlowClient) ASRUserCommand(cmd *Command) (ResponseIf, error) {
|
||||
if resp.StatusCode != 200 {
|
||||
return nil, fmt.Errorf("failed to ASR document: HTTP %d, body: %s", resp.StatusCode, string(resp.Body))
|
||||
}
|
||||
var result CommonResponse
|
||||
if err = json.Unmarshal(resp.Body, &result); err != nil {
|
||||
var rawResult struct {
|
||||
Code int `json:"code"`
|
||||
Message string `json:"message"`
|
||||
Data map[string]interface{} `json:"data"`
|
||||
}
|
||||
|
||||
if err = json.Unmarshal(resp.Body, &rawResult); err != nil {
|
||||
return nil, fmt.Errorf("ASR document failed: invalid JSON (%w)", err)
|
||||
}
|
||||
if result.Code != 0 {
|
||||
return nil, fmt.Errorf("%s", result.Message)
|
||||
|
||||
if rawResult.Code != 0 {
|
||||
return nil, fmt.Errorf("%s", rawResult.Message)
|
||||
}
|
||||
|
||||
var result CommonResponse
|
||||
result.Code = rawResult.Code
|
||||
result.Message = rawResult.Data["text"].(string) // TODO
|
||||
result.Duration = resp.Duration
|
||||
|
||||
return &result, nil
|
||||
|
||||
@@ -2753,7 +2753,7 @@ func (p *Parser) parseASRCommand() (*Command, error) {
|
||||
if p.curToken.Type != TokenAudio {
|
||||
return nil, fmt.Errorf("expected AUDIO to ASR")
|
||||
}
|
||||
p.nextToken() // consume FILE
|
||||
p.nextToken() // consume AUDIO
|
||||
|
||||
audioFile, err := p.parseQuotedString()
|
||||
if err != nil {
|
||||
@@ -2761,14 +2761,29 @@ func (p *Parser) parseASRCommand() (*Command, error) {
|
||||
}
|
||||
p.nextToken()
|
||||
|
||||
cmd := NewCommand("asr_user_command")
|
||||
cmd.Params["composite_model_name"] = compositeModelName
|
||||
cmd.Params["audio_file"] = audioFile
|
||||
|
||||
for p.curToken.Type != TokenEOF && p.curToken.Type != TokenSemicolon {
|
||||
switch p.curToken.Type {
|
||||
case TokenParam:
|
||||
p.nextToken()
|
||||
if p.curToken.Type != TokenQuotedString {
|
||||
return nil, fmt.Errorf("expect quoted string after 'param'")
|
||||
}
|
||||
cmd.Params["param_str"] = strings.Trim(p.curToken.Value, "\"'")
|
||||
p.nextToken()
|
||||
default:
|
||||
return nil, fmt.Errorf("unexpected token in asr command: %s", p.curToken.Value)
|
||||
}
|
||||
}
|
||||
|
||||
// Semicolon is optional for the ASR command
|
||||
if p.curToken.Type == TokenSemicolon {
|
||||
p.nextToken()
|
||||
}
|
||||
|
||||
cmd := NewCommand("asr_user_command")
|
||||
cmd.Params["composite_model_name"] = compositeModelName
|
||||
cmd.Params["audio_file"] = audioFile
|
||||
return cmd, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -1,10 +1,17 @@
|
||||
package models
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -64,20 +71,273 @@ func (f *FishAudioModel) Rerank(modelName *string, query string, documents []str
|
||||
|
||||
// TranscribeAudio transcribe audio
|
||||
func (f *FishAudioModel) TranscribeAudio(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig) (*ASRResponse, error) {
|
||||
return nil, fmt.Errorf("%s, no such method", f.Name())
|
||||
|
||||
if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" {
|
||||
return nil, fmt.Errorf("FishAudio API key is missing")
|
||||
}
|
||||
|
||||
if file == nil || *file == "" {
|
||||
return nil, fmt.Errorf("file is missing")
|
||||
}
|
||||
|
||||
region := "default"
|
||||
if apiConfig.Region != nil && *apiConfig.Region != "" {
|
||||
region = *apiConfig.Region
|
||||
}
|
||||
|
||||
url := fmt.Sprintf("%s/%s", f.BaseURL[region], f.URLSuffix.ASR)
|
||||
|
||||
var body bytes.Buffer
|
||||
writer := multipart.NewWriter(&body)
|
||||
|
||||
// audio file
|
||||
audioFile, err := os.Open(*file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to open audio file: %w", err)
|
||||
}
|
||||
defer audioFile.Close()
|
||||
|
||||
part, err := writer.CreateFormFile("audio", filepath.Base(*file))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create multipart file: %w", err)
|
||||
}
|
||||
|
||||
if _, err = io.Copy(part, audioFile); err != nil {
|
||||
return nil, fmt.Errorf("failed to copy audio data: %w", err)
|
||||
}
|
||||
|
||||
// extra params
|
||||
if asrConfig != nil && asrConfig.Params != nil {
|
||||
for key, value := range asrConfig.Params {
|
||||
|
||||
var val string
|
||||
|
||||
switch v := value.(type) {
|
||||
case string:
|
||||
val = v
|
||||
case bool:
|
||||
val = strconv.FormatBool(v)
|
||||
case int:
|
||||
val = strconv.Itoa(v)
|
||||
case float64:
|
||||
val = strconv.FormatFloat(v, 'f', -1, 64)
|
||||
default:
|
||||
val = fmt.Sprintf("%v", v)
|
||||
}
|
||||
|
||||
if err := writer.WriteField(key, val); err != nil {
|
||||
return nil, fmt.Errorf("failed to write field %s: %w", key, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if err := writer.Close(); err != nil {
|
||||
return nil, fmt.Errorf("failed to close multipart writer: %w", err)
|
||||
}
|
||||
|
||||
// request
|
||||
req, err := http.NewRequest("POST", url, &body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create request: %w", err)
|
||||
}
|
||||
|
||||
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", *apiConfig.ApiKey))
|
||||
req.Header.Set("Content-Type", writer.FormDataContentType())
|
||||
|
||||
resp, err := f.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to send request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
respBody, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read response body: %w", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf(
|
||||
"FishAudio ASR error: %s - %s",
|
||||
resp.Status,
|
||||
string(respBody),
|
||||
)
|
||||
}
|
||||
|
||||
// result
|
||||
var result struct {
|
||||
Text string `json:"text"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(respBody, &result); err != nil {
|
||||
return nil, fmt.Errorf("failed to unmarshal response: %w", err)
|
||||
}
|
||||
|
||||
return &ASRResponse{
|
||||
Text: result.Text,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (z *FishAudioModel) TranscribeAudioWithSender(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig, sender func(*string, *string) error) error {
|
||||
return fmt.Errorf("%s, no such method", z.Name())
|
||||
func (f *FishAudioModel) TranscribeAudioWithSender(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig, sender func(*string, *string) error) error {
|
||||
return fmt.Errorf("%s, no such method", f.Name())
|
||||
}
|
||||
|
||||
// AudioSpeech convert audio to text
|
||||
func (f *FishAudioModel) AudioSpeech(modelName *string, audioContent *string, apiConfig *APIConfig, asrConfig *TTSConfig) (*TTSResponse, error) {
|
||||
return nil, fmt.Errorf("%s, no such method", f.Name())
|
||||
if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" {
|
||||
return nil, fmt.Errorf("FishAudio API key is missing")
|
||||
}
|
||||
|
||||
if audioContent == nil || *audioContent == "" {
|
||||
return nil, fmt.Errorf("text content is missing")
|
||||
}
|
||||
|
||||
var region = "default"
|
||||
if apiConfig.Region != nil && *apiConfig.Region != "" {
|
||||
region = *apiConfig.Region
|
||||
}
|
||||
|
||||
url := fmt.Sprintf("%s/%s", f.BaseURL[region], f.URLSuffix.TTS)
|
||||
|
||||
reqBody := map[string]interface{}{
|
||||
"text": *audioContent,
|
||||
}
|
||||
|
||||
if asrConfig != nil && asrConfig.Params != nil {
|
||||
for key, value := range asrConfig.Params {
|
||||
reqBody[key] = value
|
||||
}
|
||||
}
|
||||
if asrConfig != nil && asrConfig.Format != "" {
|
||||
reqBody["format"] = asrConfig.Format
|
||||
}
|
||||
|
||||
jsonData, err := json.Marshal(reqBody)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to marshal request: %w", err)
|
||||
}
|
||||
|
||||
req, err := http.NewRequest("POST", url, bytes.NewBuffer(jsonData))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create request: %w", err)
|
||||
}
|
||||
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", *apiConfig.ApiKey))
|
||||
req.Header.Set("model", *modelName)
|
||||
|
||||
resp, err := f.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to send request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read response body: %w", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("%s - %s", resp.Status, string(body))
|
||||
}
|
||||
|
||||
return &TTSResponse{Audio: body}, nil
|
||||
}
|
||||
|
||||
func (z *FishAudioModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error {
|
||||
return fmt.Errorf("%s, no such method", z.Name())
|
||||
func (f *FishAudioModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, asrConfig *TTSConfig, sender func(*string, *string) error) error {
|
||||
if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" {
|
||||
return fmt.Errorf("FishAudio API key is missing")
|
||||
}
|
||||
|
||||
if audioContent == nil || *audioContent == "" {
|
||||
return fmt.Errorf("text content is missing")
|
||||
}
|
||||
|
||||
var region = "default"
|
||||
if apiConfig.Region != nil && *apiConfig.Region != "" {
|
||||
region = *apiConfig.Region
|
||||
}
|
||||
|
||||
url := fmt.Sprintf("%s/%s/%s", f.BaseURL[region], f.URLSuffix.TTS, "stream/with-timestamp")
|
||||
|
||||
reqBody := map[string]interface{}{
|
||||
"text": *audioContent,
|
||||
}
|
||||
|
||||
if asrConfig != nil && asrConfig.Params != nil {
|
||||
for key, value := range asrConfig.Params {
|
||||
reqBody[key] = value
|
||||
}
|
||||
}
|
||||
if asrConfig != nil && asrConfig.Format != "" {
|
||||
reqBody["format"] = asrConfig.Format
|
||||
}
|
||||
|
||||
jsonData, err := json.Marshal(reqBody)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal request: %w", err)
|
||||
}
|
||||
|
||||
// Build Request
|
||||
req, err := http.NewRequest("POST", url, bytes.NewBuffer(jsonData))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create request: %w", err)
|
||||
}
|
||||
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", *apiConfig.ApiKey))
|
||||
req.Header.Set("model", *modelName)
|
||||
|
||||
resp, err := f.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to send request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
buf := make([]byte, 1024)
|
||||
n, _ := resp.Body.Read(buf)
|
||||
return fmt.Errorf("FishAudio stream API error: %d - %s", resp.StatusCode, string(buf[:n]))
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(resp.Body)
|
||||
scanner.Buffer(make([]byte, 64*1024), 8*1024*1024)
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
|
||||
if !strings.HasPrefix(line, "data: ") {
|
||||
continue
|
||||
}
|
||||
|
||||
dataStr := strings.TrimSpace(line[6:])
|
||||
if dataStr == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
var event struct {
|
||||
AudioBase64 string `json:"audio_base64"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal([]byte(dataStr), &event); err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if event.AudioBase64 != "" {
|
||||
audioBytes, err := base64.StdEncoding.DecodeString(event.AudioBase64)
|
||||
if err == nil && len(audioBytes) > 0 {
|
||||
chunk := string(audioBytes)
|
||||
if errSend := sender(&chunk, nil); errSend != nil {
|
||||
return errSend
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if err := scanner.Err(); err != nil {
|
||||
return fmt.Errorf("error reading FishAudio stream: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// OCRFile OCR file
|
||||
|
||||
@@ -478,7 +478,7 @@ func (z *MinimaxModel) AudioSpeech(modelName *string, audioContent *string, apiC
|
||||
}
|
||||
|
||||
url := fmt.Sprintf("%s/%s", z.BaseURL[region], z.URLSuffix.TTS)
|
||||
|
||||
|
||||
reqBody := map[string]interface{}{
|
||||
"model": modelName,
|
||||
"text": audioContent,
|
||||
@@ -488,6 +488,11 @@ func (z *MinimaxModel) AudioSpeech(modelName *string, audioContent *string, apiC
|
||||
reqBody[key] = value
|
||||
}
|
||||
}
|
||||
if asrConfig != nil && asrConfig.Format != "" {
|
||||
reqBody["audio_setting"] = map[string]interface{}{
|
||||
"format": asrConfig.Format,
|
||||
}
|
||||
}
|
||||
reqBody["stream"] = false
|
||||
|
||||
jsonData, err := json.Marshal(reqBody)
|
||||
@@ -547,7 +552,6 @@ func (z *MinimaxModel) AudioSpeech(modelName *string, audioContent *string, apiC
|
||||
}, nil
|
||||
}
|
||||
|
||||
// tts with 'speech-2.8-hd@test@minimax' text 'If that day, out position was switched, would our fate, be different?' voice 'English_expressive_narrator' param '{"voice_setting": {"voice_id": "English_expressive_narrator", "speed": 1, "vol": 1, "pitch": 0}, "audio_setting": {"sample_rate": 32000, "bitrate": 128000, "format": "wav", "channel": 1}, "output_format": "hex"}'
|
||||
func (z *MinimaxModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error {
|
||||
if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" {
|
||||
return fmt.Errorf("MiniMax API key is missing")
|
||||
@@ -581,6 +585,13 @@ func (z *MinimaxModel) AudioSpeechWithSender(modelName *string, audioContent *st
|
||||
}
|
||||
}
|
||||
reqBody["stream"] = false
|
||||
|
||||
if ttsConfig != nil && ttsConfig.Format != "" {
|
||||
reqBody["audio_setting"] = map[string]interface{}{
|
||||
"format": ttsConfig.Format,
|
||||
}
|
||||
}
|
||||
|
||||
reqBody["stream"] = true
|
||||
|
||||
jsonData, err := json.Marshal(reqBody)
|
||||
@@ -658,4 +669,4 @@ func (z *MinimaxModel) AudioSpeechWithSender(modelName *string, audioContent *st
|
||||
// OCRFile OCR file
|
||||
func (m *MinimaxModel) OCRFile(modelName *string, content []byte, url *string, apiConfig *APIConfig, ocrConfig *OCRConfig) (*OCRResponse, error) {
|
||||
return nil, fmt.Errorf("%s, no such method", m.Name())
|
||||
}
|
||||
}
|
||||
@@ -545,7 +545,64 @@ func (z *OpenRouterModel) TranscribeAudioWithSender(modelName *string, file *str
|
||||
|
||||
// AudioSpeech convert audio to text
|
||||
func (o *OpenRouterModel) AudioSpeech(modelName *string, audioContent *string, apiConfig *APIConfig, asrConfig *TTSConfig) (*TTSResponse, error) {
|
||||
return nil, fmt.Errorf("%s, no such method", o.Name())
|
||||
if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" {
|
||||
return nil, fmt.Errorf("OpenRouter API key is missing")
|
||||
}
|
||||
if audioContent == nil || *audioContent == "" {
|
||||
return nil, fmt.Errorf("text content is empty")
|
||||
}
|
||||
|
||||
var region = "default"
|
||||
if apiConfig.Region != nil && *apiConfig.Region != "" {
|
||||
region = *apiConfig.Region
|
||||
}
|
||||
|
||||
url := fmt.Sprintf("%s/%s", o.BaseURL[region], o.URLSuffix.TTS)
|
||||
|
||||
// OpenRouter:response Audio bytes stream
|
||||
reqBody := map[string]interface{}{
|
||||
"model": modelName,
|
||||
"input": audioContent,
|
||||
}
|
||||
|
||||
if asrConfig != nil && asrConfig.Params != nil {
|
||||
for key, value := range asrConfig.Params {
|
||||
reqBody[key] = value
|
||||
}
|
||||
}
|
||||
if asrConfig != nil && asrConfig.Format != "" {
|
||||
reqBody["response_format"] = asrConfig.Format
|
||||
}
|
||||
|
||||
jsonData, err := json.Marshal(reqBody)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to marshal request: %w", err)
|
||||
}
|
||||
|
||||
req, err := http.NewRequest("POST", url, bytes.NewBuffer(jsonData))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create request: %w", err)
|
||||
}
|
||||
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", *apiConfig.ApiKey))
|
||||
|
||||
resp, err := o.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to send request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read response body: %w", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("OpenRouter API error: %s, body: %s", resp.Status, string(body))
|
||||
}
|
||||
|
||||
return &TTSResponse{Audio: body}, nil
|
||||
}
|
||||
|
||||
func (z *OpenRouterModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error {
|
||||
|
||||
@@ -62,6 +62,7 @@ type RerankResponse struct {
|
||||
}
|
||||
|
||||
// ASRResponse is the result of an audio transcription request.
type ASRResponse struct {
	// Text is the transcribed text.
	Text string `json:"text"`
}
|
||||
|
||||
type TTSResponse struct {
|
||||
@@ -85,6 +86,7 @@ type URLSuffix struct {
|
||||
Files string `json:"files"`
|
||||
Status string `json:"status"`
|
||||
TTS string `json:"tts"`
|
||||
ASR string `json:"asr"`
|
||||
}
|
||||
|
||||
type ChatConfig struct {
|
||||
@@ -115,10 +117,12 @@ type RerankConfig struct {
|
||||
}
|
||||
|
||||
// ASRConfig holds per-request options for audio transcription.
type ASRConfig struct {
	// Params carries free-form, provider-specific options.
	Params map[string]interface{} `json:"params"`
}
|
||||
|
||||
// TTSConfig holds per-request options for text-to-speech.
type TTSConfig struct {
	// Format is the requested audio output format.
	Format string `json:"format"`
	// Params carries free-form, provider-specific options.
	Params map[string]interface{} `json:"params"`
}
|
||||
|
||||
type OCRConfig struct {
|
||||
|
||||
@@ -1049,13 +1049,14 @@ func (h *ProviderHandler) RerankDocument(c *gin.Context) {
|
||||
}
|
||||
|
||||
type TranscribeAudioRequest struct {
|
||||
ProviderName *string `json:"provider_name"`
|
||||
InstanceName *string `json:"instance_name"`
|
||||
ModelName *string `json:"model_name"`
|
||||
File *string `json:"file"`
|
||||
Language []string `json:"language"`
|
||||
Prompt int `json:"prompt"`
|
||||
Stream bool `json:"stream"`
|
||||
ProviderName *string `json:"provider_name"`
|
||||
InstanceName *string `json:"instance_name"`
|
||||
ModelName *string `json:"model_name"`
|
||||
File *string `json:"file"`
|
||||
Language []string `json:"language"`
|
||||
Prompt int `json:"prompt"`
|
||||
Stream bool `json:"stream"`
|
||||
ASRConfig *models.ASRConfig `json:"asr_config"`
|
||||
}
|
||||
|
||||
func (h *ProviderHandler) TranscribeAudio(c *gin.Context) {
|
||||
@@ -1101,6 +1102,9 @@ func (h *ProviderHandler) TranscribeAudio(c *gin.Context) {
|
||||
}
|
||||
|
||||
asrConfig := models.ASRConfig{}
|
||||
if req.ASRConfig != nil {
|
||||
asrConfig = *req.ASRConfig
|
||||
}
|
||||
|
||||
// Check if it's a stream request
|
||||
if req.Stream {
|
||||
|
||||
Reference in New Issue
Block a user