// // Copyright 2026 The InfiniFlow Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // package models import ( "bytes" "context" "encoding/base64" "encoding/json" "fmt" "io" "mime/multipart" "net/http" "os" "path/filepath" "strconv" "strings" ) type FishAudioModel struct { baseModel BaseModel } func NewFishAudioModel(baseURL map[string]string, urlSuffix URLSuffix) *FishAudioModel { return &FishAudioModel{ baseModel: BaseModel{ BaseURL: baseURL, URLSuffix: urlSuffix, httpClient: NewDriverHTTPClient(), }, } } func (f *FishAudioModel) NewInstance(baseURL map[string]string) ModelDriver { return NewFishAudioModel(baseURL, f.baseModel.URLSuffix) } func (f *FishAudioModel) Name() string { return "fishaudio" } func (f *FishAudioModel) ChatWithMessages(modelName string, messages []Message, apiConfig *APIConfig, chatModelConfig *ChatConfig) (*ChatResponse, error) { return nil, fmt.Errorf("%s, no such method", f.Name()) } func (f *FishAudioModel) ChatStreamlyWithSender(modelName string, messages []Message, apiConfig *APIConfig, modelConfig *ChatConfig, sender func(*string, *string) error) error { return fmt.Errorf("%s, no such method", f.Name()) } func (f *FishAudioModel) Embed(modelName *string, texts []string, apiConfig *APIConfig, embeddingConfig *EmbeddingConfig) ([]EmbeddingData, error) { return nil, fmt.Errorf("no such method") } func (f *FishAudioModel) Rerank(modelName *string, query string, documents []string, apiConfig *APIConfig, rerankConfig *RerankConfig) (*RerankResponse, error) { return nil, fmt.Errorf("no such method") } // TranscribeAudio transcribe audio func (f *FishAudioModel) TranscribeAudio(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig) (*ASRResponse, error) { if err := f.baseModel.APIConfigCheck(apiConfig); err != nil { return nil, err } if file == nil || *file == "" { return nil, fmt.Errorf("file is missing") } resolvedBaseURL, err := f.baseModel.GetBaseURL(apiConfig) if err != nil { return nil, err } url := fmt.Sprintf("%s/%s", resolvedBaseURL, f.baseModel.URLSuffix.ASR) var body bytes.Buffer writer := multipart.NewWriter(&body) // audio file audioFile, err := os.Open(*file) if err != nil { return nil, fmt.Errorf("failed to open audio file: %w", err) } defer audioFile.Close() part, err := writer.CreateFormFile("audio", filepath.Base(*file)) if err != nil { return nil, fmt.Errorf("failed to create multipart file: %w", err) } if _, err = io.Copy(part, audioFile); err != nil { return nil, fmt.Errorf("failed to copy audio data: %w", err) } // extra params if asrConfig != nil && asrConfig.Params != nil { for key, value := range asrConfig.Params { var val string switch v := value.(type) { case string: val = v case bool: val = strconv.FormatBool(v) case int: val = strconv.Itoa(v) case float64: val = strconv.FormatFloat(v, 'f', -1, 64) default: val = fmt.Sprintf("%v", v) } if err := writer.WriteField(key, val); err != nil { return nil, fmt.Errorf("failed to write field %s: %w", key, err) } } } if err := writer.Close(); err != nil { return nil, fmt.Errorf("failed to close multipart writer: %w", err) } // request ctx, cancel := context.WithTimeout(context.Background(), longOpCallTimeout) defer cancel() req, err := http.NewRequestWithContext(ctx, "POST", url, &body) if err != nil { return nil, fmt.Errorf("failed to create request: %w", err) } req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", *apiConfig.ApiKey)) req.Header.Set("Content-Type", writer.FormDataContentType()) resp, err := f.baseModel.httpClient.Do(req) if err != nil { return nil, fmt.Errorf("failed to send request: %w", err) } defer resp.Body.Close() respBody, err := io.ReadAll(resp.Body) if err != nil { return nil, fmt.Errorf("failed to read response body: %w", err) } if resp.StatusCode != http.StatusOK { return nil, fmt.Errorf("FishAudio ASR error: %s - %s", resp.Status, string(respBody)) } // result var result struct { Text string `json:"text"` } if err := json.Unmarshal(respBody, &result); err != nil { return nil, fmt.Errorf("failed to unmarshal response: %w", err) } return &ASRResponse{ Text: result.Text, }, nil } func (f *FishAudioModel) TranscribeAudioWithSender(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig, sender func(*string, *string) error) error { return fmt.Errorf("%s, no such method", f.Name()) } // AudioSpeech convert text to audio func (f *FishAudioModel) AudioSpeech(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig) (*TTSResponse, error) { if err := f.baseModel.APIConfigCheck(apiConfig); err != nil { return nil, err } if audioContent == nil || *audioContent == "" { return nil, fmt.Errorf("text content is missing") } resolvedBaseURL, err := f.baseModel.GetBaseURL(apiConfig) if err != nil { return nil, err } url := fmt.Sprintf("%s/%s", resolvedBaseURL, f.baseModel.URLSuffix.TTS) reqBody := map[string]interface{}{ "text": *audioContent, } if ttsConfig != nil && ttsConfig.Params != nil { for key, value := range ttsConfig.Params { reqBody[key] = value } } if ttsConfig != nil && ttsConfig.Format != "" { reqBody["format"] = ttsConfig.Format } jsonData, err := json.Marshal(reqBody) if err != nil { return nil, fmt.Errorf("failed to marshal request: %w", err) } ctx, cancel := context.WithTimeout(context.Background(), longOpCallTimeout) defer cancel() req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewBuffer(jsonData)) if err != nil { return nil, fmt.Errorf("failed to create request: %w", err) } req.Header.Set("Content-Type", "application/json") req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", *apiConfig.ApiKey)) req.Header.Set("model", *modelName) resp, err := f.baseModel.httpClient.Do(req) if err != nil { return nil, fmt.Errorf("failed to send request: %w", err) } defer resp.Body.Close() body, err := io.ReadAll(resp.Body) if err != nil { return nil, fmt.Errorf("failed to read response body: %w", err) } if resp.StatusCode != http.StatusOK { return nil, fmt.Errorf("%s - %s", resp.Status, string(body)) } return &TTSResponse{Audio: body}, nil } func (f *FishAudioModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error { if err := f.baseModel.APIConfigCheck(apiConfig); err != nil { return err } if audioContent == nil || *audioContent == "" { return fmt.Errorf("text content is missing") } resolvedBaseURL, err := f.baseModel.GetBaseURL(apiConfig) if err != nil { return err } url := fmt.Sprintf("%s/%s/%s", resolvedBaseURL, f.baseModel.URLSuffix.TTS, "stream/with-timestamp") reqBody := map[string]interface{}{ "text": *audioContent, } if ttsConfig != nil && ttsConfig.Params != nil { for key, value := range ttsConfig.Params { reqBody[key] = value } } if ttsConfig != nil && ttsConfig.Format != "" { reqBody["format"] = ttsConfig.Format } jsonData, err := json.Marshal(reqBody) if err != nil { return fmt.Errorf("failed to marshal request: %w", err) } // Build Request ctx, cancel := context.WithTimeout(context.Background(), streamCallTimeout) defer cancel() req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewBuffer(jsonData)) if err != nil { return fmt.Errorf("failed to create request: %w", err) } req.Header.Set("Content-Type", "application/json") req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", *apiConfig.ApiKey)) req.Header.Set("model", *modelName) resp, err := f.baseModel.httpClient.Do(req) if err != nil { return fmt.Errorf("failed to send request: %w", err) } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { buf := make([]byte, 1024) n, _ := resp.Body.Read(buf) return fmt.Errorf("FishAudio stream API error: %d - %s", resp.StatusCode, string(buf[:n])) } if _, err := ParseSSEStream[struct { AudioBase64 string `json:"audio_base64"` }](resp.Body, func(event struct { AudioBase64 string `json:"audio_base64"` }) error { if event.AudioBase64 != "" { audioBytes, err := base64.StdEncoding.DecodeString(event.AudioBase64) if err == nil && len(audioBytes) > 0 { chunk := string(audioBytes) if errSend := sender(&chunk, nil); errSend != nil { return errSend } } } return nil }); err != nil { return fmt.Errorf("failed to scan response body: %w", err) } return nil } // OCRFile OCR file func (f *FishAudioModel) OCRFile(modelName *string, content []byte, url *string, apiConfig *APIConfig, ocrConfig *OCRConfig) (*OCRFileResponse, error) { return nil, fmt.Errorf("%s, no such method", f.Name()) } // ParseFile parse file func (f *FishAudioModel) ParseFile(modelName *string, content []byte, url *string, apiConfig *APIConfig, parseFileConfig *ParseFileConfig) (*ParseFileResponse, error) { return nil, fmt.Errorf("%s, no such method", f.Name()) } func (f *FishAudioModel) ListModels(apiConfig *APIConfig) ([]ListModelResponse, error) { if err := f.baseModel.APIConfigCheck(apiConfig); err != nil { return nil, err } resolvedBaseURL, err := f.baseModel.GetBaseURL(apiConfig) if err != nil { return nil, err } url := fmt.Sprintf("%s/%s", resolvedBaseURL, f.baseModel.URLSuffix.Models) ctx, cancel := context.WithTimeout(context.Background(), nonStreamCallTimeout) defer cancel() req, err := http.NewRequestWithContext(ctx, "GET", url, nil) if err != nil { return nil, fmt.Errorf("failed to create request: %w", err) } req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", *apiConfig.ApiKey)) resp, err := f.baseModel.httpClient.Do(req) if err != nil { return nil, fmt.Errorf("failed to send request: %w", err) } defer resp.Body.Close() body, err := io.ReadAll(resp.Body) if err != nil { return nil, fmt.Errorf("failed to read response body: %w", err) } if resp.StatusCode != http.StatusOK { return nil, fmt.Errorf("Fish Audio API request failed with status %d: %s", resp.StatusCode, string(body)) } var result struct { Items []struct { ID string `json:"_id"` Title string `json:"title"` } `json:"items"` } if err := json.Unmarshal(body, &result); err != nil { return nil, fmt.Errorf("failed to parse response: %w", err) } models := make([]ListModelResponse, 0, len(result.Items)) for _, item := range result.Items { models = append(models, ListModelResponse{ Name: item.Title, }) } return models, nil } func (f *FishAudioModel) Balance(apiConfig *APIConfig) (map[string]interface{}, error) { if err := f.baseModel.APIConfigCheck(apiConfig); err != nil { return nil, err } baseURL, err := f.baseModel.GetBaseURL(apiConfig) if err != nil { return nil, err } url := fmt.Sprintf("%s/wallet/self/api-credit", strings.TrimSuffix(baseURL, "/")) ctx, cancel := context.WithTimeout(context.Background(), nonStreamCallTimeout) defer cancel() req, err := http.NewRequestWithContext(ctx, "GET", url, nil) if err != nil { return nil, fmt.Errorf("failed to create request: %w", err) } req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", *apiConfig.ApiKey)) resp, err := f.baseModel.httpClient.Do(req) if err != nil { return nil, fmt.Errorf("failed to send request: %w", err) } defer resp.Body.Close() body, err := io.ReadAll(resp.Body) if err != nil { return nil, fmt.Errorf("failed to read response body: %w", err) } if resp.StatusCode != http.StatusOK { return nil, fmt.Errorf("Fish Audio balance API error: status %d, body: %s", resp.StatusCode, string(body)) } var result map[string]interface{} if err := json.Unmarshal(body, &result); err != nil { return nil, fmt.Errorf("failed to parse response: %w", err) } return result, nil } func (f *FishAudioModel) CheckConnection(apiConfig *APIConfig) error { _, err := f.ListModels(apiConfig) return err } func (f *FishAudioModel) ListTasks(apiConfig *APIConfig) ([]ListTaskStatus, error) { return nil, fmt.Errorf("%s, no such method", f.Name()) } func (f *FishAudioModel) ShowTask(taskID string, apiConfig *APIConfig) (*TaskResponse, error) { return nil, fmt.Errorf("%s, no such method", f.Name()) }