From 302f97de5049c0a5340c772cdea9e8e200cc4c29 Mon Sep 17 00:00:00 2001 From: Tohka <72191648+Ltohka@users.noreply.github.com> Date: Fri, 22 May 2026 18:02:30 +0800 Subject: [PATCH] Go: implement reasoning_chat, TTS, ASR for Groq (#15153) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? Go: implement reasoning_chat, TTS, ASR for Groq **Verify from CLI** ``` RAGFlow(user)> think chat with 'qwen/qwen3-32b@test@groq' message 'who r u' Thinking: Okay, the user asked, who r u. I need to determine what the user is asking. They may be asking about my identity. I should introduce my name and basic functions. The user might want to know what I can do, so I should list some common use cases, such as answering questions, creating writing, coding, and expressing opinions. The user may be curious about how they can interact with me, so they can be advised to ask any questions or provide instructions. Keep your answers conversational, avoid overly technical terms, keep answers concise, and encourage further interaction. Check if there's any ambiguity in the answer and make sure it's accurate and meets the user's needs. Also consider if there are other aspects the user may be interested in, such as my training data or performance. But since the question is basic, I'll focus on the essentials first and invite the user to ask more. In summary, respond to the user's questions by introducing yourself, your functions, and encouraging further interaction. Answer: Hello! I'm Qwen. I am a large-scale language model developed by Tongyi Lab, designed to assist you in various ways, such as answering questions, creating text, logical reasoning, programming, and more. I aim to provide clear, accurate, and helpful information and support. How can I assist you today? Feel free to ask any questions or give me tasks! 
😊 Time: 2.199908 RAGFlow(user)> stream think chat with 'openai/gpt-oss-20b@test@groq' message 'who r u' Thinking: to respond politely. Answer: ’m ChatGPT—an AI language model created by OpenAI. I’m here to answer questions, offer explanations, and help with a wide range of topics. How can I assist you today? RAGFlow(user)> tts with 'canopylabs/orpheus-arabic-saudi@test@groq' text 'hello? show yourself' play format 'wav' param '{"voice": "fahad"}' SUCCESS RAGFlow(user)> asr with 'whisper-large-v3-turbo@test@groq' audio './internal/test.wav' param '{"language": "en"}' +----------------------------------------------------------------------------------------------------------------------+ | text | +----------------------------------------------------------------------------------------------------------------------+ | The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired | +----------------------------------------------------------------------------------------------------------------------+ ``` ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- conf/models/groq.json | 35 ++++++- internal/entity/models/groq.go | 180 ++++++++++++++++++++++++++++++++- 2 files changed, 212 insertions(+), 3 deletions(-) diff --git a/conf/models/groq.json b/conf/models/groq.json index 827e3829ef..4ec32c6d2c 100644 --- a/conf/models/groq.json +++ b/conf/models/groq.json @@ -5,7 +5,9 @@ }, "url_suffix": { "chat": "chat/completions", - "models": "models" + "models": "models", + "asr": "audio/transcriptions", + "tts": "audio/speech" }, "class": "groq", "models": [ @@ -51,6 +53,13 @@ "chat" ] }, + { + "name": "openai/gpt-oss-20b", + "max_tokens": 131072, + "model_types": [ + "chat" + ] + }, { "name": "meta-llama/llama-4-scout-17b-16e-instruct", "max_tokens": 131072, @@ -64,6 +73,30 @@ "model_types": [ "chat" ] + }, + { + "name": "canopylabs/orpheus-v1-english", + "model_types": [ + "tts" + ] + }, + 
{ + "name": "canopylabs/orpheus-arabic-saudi", + "model_types": [ + "tts" + ] + }, + { + "name": "whisper-large-v3-turbo", + "model_types": [ + "asr" + ] + }, + { + "name": "whisper-large-v3", + "model_types": [ + "asr" + ] } ] } diff --git a/internal/entity/models/groq.go b/internal/entity/models/groq.go index 87072c95ac..fdf3d0652d 100644 --- a/internal/entity/models/groq.go +++ b/internal/entity/models/groq.go @@ -23,7 +23,11 @@ import ( "encoding/json" "fmt" "io" + "mime/multipart" "net/http" + "os" + "path/filepath" + "strconv" "strings" "time" ) @@ -113,6 +117,16 @@ func groqChatPayload(modelName string, messages []Message, stream bool, chatMode "stream": stream, } + modelLower := strings.ToLower(modelName) + if strings.Contains(modelLower, "gpt-oss") { + reqBody["include_reasoning"] = true + if chatModelConfig != nil && chatModelConfig.Effort != nil { // config is optional; guard before dereferencing + reqBody["reasoning_effort"] = chatModelConfig.Effort + } + } else if strings.Contains(modelLower, "qwen") || strings.Contains(modelLower, "deepseek") { + reqBody["reasoning_format"] = "parsed" + } + if chatModelConfig != nil { if chatModelConfig.MaxTokens != nil { reqBody["max_tokens"] = *chatModelConfig.MaxTokens @@ -126,6 +140,7 @@ func groqChatPayload(modelName string, messages []Message, stream bool, chatMode if chatModelConfig.Stop != nil { reqBody["stop"] = *chatModelConfig.Stop } + } return reqBody @@ -403,7 +418,115 @@ func (g *GroqModel) Balance(apiConfig *APIConfig) (map[string]interface{}, error } func (g *GroqModel) TranscribeAudio(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig) (*ASRResponse, error) { - return nil, fmt.Errorf("%s, no such method", g.Name()) + if file == nil || *file == "" { + return nil, fmt.Errorf("file is missing") + } + + region := "default" + if apiConfig != nil && apiConfig.Region != nil && *apiConfig.Region != "" { + region = *apiConfig.Region + } + + url := fmt.Sprintf("%s/%s", g.BaseURL[region], g.URLSuffix.ASR) + + // multipart body + var body bytes.Buffer + writer := 
multipart.NewWriter(&body) + + // open audio file + audioFile, err := os.Open(*file) + if err != nil { + return nil, fmt.Errorf("failed to open audio file: %w", err) + } + defer audioFile.Close() + + // create multipart file field + part, err := writer.CreateFormFile( + "file", + filepath.Base(*file), + ) + if err != nil { + return nil, fmt.Errorf("failed to create multipart file: %w", err) + } + + // copy file content + if _, err = io.Copy(part, audioFile); err != nil { + return nil, fmt.Errorf("failed to copy audio data: %w", err) + } + + // model field + if err := writer.WriteField("model", *modelName); err != nil { + return nil, fmt.Errorf("failed to write model field: %w", err) + } + + // extra params + if asrConfig != nil && asrConfig.Params != nil { + for key, value := range asrConfig.Params { + + var val string + + switch v := value.(type) { + case string: + val = v + case bool: + val = strconv.FormatBool(v) + case int: + val = strconv.Itoa(v) + case int64: + val = strconv.FormatInt(v, 10) + case float32: + val = strconv.FormatFloat(float64(v), 'f', -1, 32) + case float64: + val = strconv.FormatFloat(v, 'f', -1, 64) + default: + val = fmt.Sprintf("%v", v) + } + + if err = writer.WriteField(key, val); err != nil { + return nil, fmt.Errorf("failed to write field %s: %w", key, err) + } + } + } + + if err = writer.Close(); err != nil { + return nil, fmt.Errorf("failed to close multipart writer: %w", err) + } + + // build request + req, err := http.NewRequest("POST", url, &body) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", *apiConfig.ApiKey)) + req.Header.Set("Content-Type", writer.FormDataContentType()) + + // send request + resp, err := g.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to send request: %w", err) + } + defer resp.Body.Close() + + respBody, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to 
read response body: %w", err) + } + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("Groq ASR error: %s - %s", resp.Status, string(respBody)) + } + + // response + var result struct { + Text string `json:"text"` + } + + if err = json.Unmarshal(respBody, &result); err != nil { + return nil, fmt.Errorf("failed to unmarshal response: %w, body=%s", err, string(respBody)) + } + + return &ASRResponse{Text: result.Text}, nil } func (g *GroqModel) TranscribeAudioWithSender(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig, sender func(*string, *string) error) error { @@ -411,7 +534,60 @@ func (g *GroqModel) TranscribeAudioWithSender(modelName *string, file *string, a } func (g *GroqModel) AudioSpeech(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig) (*TTSResponse, error) { - return nil, fmt.Errorf("%s, no such method", g.Name()) + if audioContent == nil || *audioContent == "" { + return nil, fmt.Errorf("audio content is empty") + } + + var region = "default" + if apiConfig != nil && apiConfig.Region != nil && *apiConfig.Region != "" { + region = *apiConfig.Region + } + + url := fmt.Sprintf("%s/%s", g.BaseURL[region], g.URLSuffix.TTS) + + reqBody := map[string]interface{}{ + "model": *modelName, + "input": *audioContent, + } + + if ttsConfig != nil && ttsConfig.Params != nil { + for key, value := range ttsConfig.Params { + reqBody[key] = value + } + } + if ttsConfig != nil && ttsConfig.Format != "" { + reqBody["response_format"] = ttsConfig.Format + } + + jsonData, err := json.Marshal(reqBody) + if err != nil { + return nil, fmt.Errorf("failed to marshal request: %w", err) + } + + req, err := http.NewRequest("POST", url, bytes.NewBuffer(jsonData)) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", *apiConfig.ApiKey)) + + resp, err := 
g.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to send request: %w", err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read response body: %w", err) + } + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("%s - %s", resp.Status, string(body)) + } + + return &TTSResponse{Audio: body}, nil } func (g *GroqModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error {