From 3a51c27a7559e3cece5cc88928cf597a9659230b Mon Sep 17 00:00:00 2001 From: Jin Hai Date: Tue, 5 May 2026 18:14:39 +0800 Subject: [PATCH] Go: CLI chat with text, image, video (#14573) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? ``` RAGFlow(user)> chat with 'glm-4.6v-flash@test@zhipu-ai' message 'What are the pics talk about?' image 'https://cdn.bigmodel.cn/static/logo/register.png' 'https://cdn.bigmodel.cn/static/logo/api-key.png' Answer: The first picture shows a login/register modal with options for phone number login, account login, and WeChat QR code login, along with a prompt for new users to get a 20 million tokens experience package. The second picture displays the API keys management page of a platform, including a warning about API key security and a table listing existing API keys with details like creation time and usage history. Time: 31.600545 RAGFlow(user)> chat with 'glm-4.6v-flash@test@zhipu-ai' message 'What are the video talk about?' video 'https://cdn.bigmodel.cn/agent-demos/lark/113123.mov' Answer: Based on the sequence of frames provided, the video is a demonstration of a web search and navigation process. 1. The video starts with a blank Google search page. 2. The user types "智谱" (which is the Chinese name for the company Zhipu AI) into the search box. 3. The search is initiated and the page shows "About 0 results". 4. The search results load, showing information about Zhipu AI, including its website. 5. The user clicks on the main website link (www.zhipuai.cn). 6. The video ends by showing the homepage of Zhipu AI's website, titled "Z.ai GLM Large Model Open Platform". In summary, the video is about searching for the company "智谱" (Zhipu AI) on Google and then navigating to its official website. 
Time: 76.582520 ``` ### Type of change - [x] New Feature (non-breaking change which adds functionality) Signed-off-by: Jin Hai --- internal/cli/lexer.go | 8 ++ internal/cli/types.go | 5 +- internal/cli/user_command.go | 137 +++++++++++++++++--- internal/cli/user_parser.go | 238 +++++++++++++++++++++++------------ 4 files changed, 289 insertions(+), 99 deletions(-) diff --git a/internal/cli/lexer.go b/internal/cli/lexer.go index 8e682df547..11b4b8c013 100644 --- a/internal/cli/lexer.go +++ b/internal/cli/lexer.go @@ -301,6 +301,14 @@ func (l *Lexer) lookupIdent(ident string) Token { return Token{Type: TokenChats, Value: ident} case "CHAT": return Token{Type: TokenChat, Value: ident} + case "MESSAGE": + return Token{Type: TokenMessage, Value: ident} + case "IMAGE": + return Token{Type: TokenImage, Value: ident} + case "VIDEO": + return Token{Type: TokenVideo, Value: ident} + case "AUDIO": + return Token{Type: TokenAudio, Value: ident} case "THINK": return Token{Type: TokenThink, Value: ident} case "EFFORT": diff --git a/internal/cli/types.go b/internal/cli/types.go index b6032cd11c..25490797d9 100644 --- a/internal/cli/types.go +++ b/internal/cli/types.go @@ -81,6 +81,10 @@ const ( TokenDefault TokenChats TokenChat + TokenMessage + TokenImage + TokenVideo + TokenAudio TokenStream TokenFiles TokenAs @@ -109,7 +113,6 @@ const ( TokenVector TokenSize TokenName // For ALTER PROVIDER NAME - TokenPool TokenBalance TokenInstance TokenInstances diff --git a/internal/cli/user_command.go b/internal/cli/user_command.go index 2ca0fcca19..5d87b2f643 100644 --- a/internal/cli/user_command.go +++ b/internal/cli/user_command.go @@ -19,8 +19,10 @@ package cli import ( "bufio" "context" + "encoding/base64" "encoding/json" "fmt" + netUrl "net/url" "os" ce "ragflow/internal/cli/filesystem" "strings" @@ -1514,6 +1516,14 @@ func (c *RAGFlowClient) EnableOrDisableModel(cmd *Command, status string) (Respo return &result, nil } +func isValidURL(str string) bool { + u, err := 
netUrl.Parse(str) + if err != nil { + return false + } + return u.Scheme != "" && u.Host != "" +} + func (c *RAGFlowClient) ChatToModel(cmd *Command) (ResponseIf, error) { if c.ServerType != "user" { return nil, fmt.Errorf("this command is only allowed in USER mode") @@ -1539,7 +1549,102 @@ func (c *RAGFlowClient) ChatToModel(cmd *Command) (ResponseIf, error) { return nil, fmt.Errorf("model name not provided and no current model set. Use 'use model' command first") } - message := cmd.Params["message"].(string) + formattedMessages := []map[string]interface{}{} + + messages, ok := cmd.Params["messages"].([]string) + if !ok { + return nil, fmt.Errorf("messages not provided") + } + contents := []map[string]interface{}{} + if len(messages) > 0 { + for _, message := range messages { + contents = append(contents, map[string]interface{}{ + "type": "text", + "text": message, + }) + } + + } + + images, ok := cmd.Params["images"].([]string) + if !ok { + return nil, fmt.Errorf("images not provided") + } + if len(images) > 0 { + for _, image := range images { + if isValidURL(image) { + contents = append(contents, map[string]interface{}{ + "type": "image_url", + "image_url": map[string]string{ + "url": image, + }, + }) + } else { + // image is a path, read the file and turn it into base64 + imageContent, err := os.ReadFile(image) + if err != nil { + return nil, fmt.Errorf("failed to read image: %w", err) + } + contents = append(contents, map[string]interface{}{ + "type": "image_file", + "image_file": map[string]interface{}{ + "content": base64.StdEncoding.EncodeToString(imageContent), + }, + }) + } + } + } + + videos, ok := cmd.Params["videos"].([]string) + if !ok { + return nil, fmt.Errorf("videos not provided") + } + if len(videos) > 0 { + for _, video := range videos { + if isValidURL(video) { + contents = append(contents, map[string]interface{}{ + "type": "video_url", + "video_url": map[string]interface{}{ + "url": video, + }, + }) + } else { + return nil, 
fmt.Errorf("invalid video URL: %s", video) + } + } + } + + //audios, ok := cmd.Params["audios"].([]string) + //if !ok { + // return nil, fmt.Errorf("audios not provided") + //} + + files, ok := cmd.Params["files"].([]string) + if !ok { + return nil, fmt.Errorf("files not provided") + } + + if len(files) > 0 { + for _, file := range files { + if isValidURL(file) { + contents = append(contents, map[string]interface{}{ + "type": "file_url", + "file_url": map[string]interface{}{ + "url": file, + }, + }) + } else { + return nil, fmt.Errorf("invalid file URL: %s", file) + } + } + } + + formattedText := map[string]interface{}{ + "role": "user", + "content": contents, + } + formattedMessages = append(formattedMessages, formattedText) + thinking := cmd.Params["thinking"].(bool) stream := cmd.Params["stream"].(bool) effort := cmd.Params["effort"].(string) @@ -1547,26 +1652,26 @@ func (c *RAGFlowClient) ChatToModel(cmd *Command) (ResponseIf, error) { url := "/chat/completions" - message = strings.TrimSpace(message) - var content interface{} = message - if strings.HasPrefix(message, "[") && strings.HasSuffix(message, "]") { - var parts []map[string]interface{} - if err := json.Unmarshal([]byte(message), &parts); err == nil { - content = parts - } - } - formattedMessage := []map[string]interface{}{ - { - "role": "user", - "content": content, - }, - } + //message = strings.TrimSpace(message) + //var content interface{} = message + //if strings.HasPrefix(message, "[") && strings.HasSuffix(message, "]") { + // var parts []map[string]interface{} + // if err := json.Unmarshal([]byte(message), &parts); err == nil { + // content = parts + // } + //} + //formattedMessage := []map[string]interface{}{ + // { + // "role": "user", + // "content": content, + // }, + //} payload := map[string]interface{}{ "provider_name": providerName, "instance_name": instanceName, "model_name": modelName, - "messages": formattedMessage, + "messages": formattedMessages, "stream": stream, "thinking": 
thinking, } diff --git a/internal/cli/user_parser.go b/internal/cli/user_parser.go index 5496cc3aa1..0e2a5fed54 100644 --- a/internal/cli/user_parser.go +++ b/internal/cli/user_parser.go @@ -2412,102 +2412,176 @@ func (p *Parser) parseDisableCommand() (*Command, error) { return cmd, nil } +// CHAT 'model@instance@provider' 'hello world' +// CHAT WITH 'model@instance@provider' MESSAGE 'hello world' 'who are you' IMAGE 'url1' 'file0' VIDEO "url2.mov" "file1" FILE "url" "path file2" AUDIO "file.wav" func (p *Parser) parseChatCommand() (*Command, error) { p.nextToken() // consume CHAT - var compositeModelName string - var message string - - // Check if we have a quoted string that looks like a model identifier (contains two slashes) - // Format: 'model@instance@provider' or just 'message' - if p.curToken.Type == TokenQuotedString { - firstArg := p.curToken.Value - - // Check if it looks like a model identifier (contains exactly 2 slashes) - slashCount := strings.Count(firstArg, "@") - if slashCount == 2 { - // This is likely a model identifier, expect another quoted string for message - compositeModelName = firstArg - p.nextToken() - - // After model name, expect message - if p.curToken.Type != TokenQuotedString { - return nil, fmt.Errorf("expected message after model name") - } - message = p.curToken.Value - p.nextToken() - } else { - // This is just a message, use current model - message = firstArg - p.nextToken() - } - } else if p.curToken.Type == TokenIdentifier { - // Context engine style: chat - message = p.curToken.Value - p.nextToken() - } else { - return nil, fmt.Errorf("expected model name (quoted string) or message") - } - - cmd := NewCommand("chat_to_model") - + var err error + var compositeModelName string = "" + var messages []string + var images []string + var videos []string + var audios []string + var files []string effort := "default" verbosity := "low" - if p.curToken.Type == TokenWith { - p.nextToken() // pass WITH + +optionsLoop: + for { switch 
p.curToken.Type { + case TokenWith: + p.nextToken() + // 'model@instance@provider' + if compositeModelName != "" { + return nil, fmt.Errorf("model name is already set") + } + compositeModelName, err = p.parseQuotedString() + if err != nil { + return nil, err + } + p.nextToken() + case TokenMessage: + p.nextToken() + if len(messages) != 0 { + return nil, fmt.Errorf("message is already set") + } + messageLoop: + for { + if p.curToken.Type != TokenQuotedString { + break messageLoop + } + var message string + message, err = p.parseQuotedString() + if err != nil { + return nil, err + } + message = strings.TrimSpace(message) + messages = append(messages, message) + p.nextToken() + } + case TokenImage: + p.nextToken() + if len(images) != 0 { + return nil, fmt.Errorf("image is already set") + } + imageLoop: + for { + if p.curToken.Type != TokenQuotedString { + break imageLoop + } + var image string + image, err = p.parseQuotedString() + if err != nil { + return nil, err + } + images = append(images, image) + p.nextToken() + } + case TokenVideo: + p.nextToken() + if len(videos) != 0 { + return nil, fmt.Errorf("video is already set") + } + videoLoop: + for { + if p.curToken.Type != TokenQuotedString { + break videoLoop + } + var video string + video, err = p.parseQuotedString() + if err != nil { + return nil, err + } + videos = append(videos, video) + p.nextToken() + } + case TokenAudio: + p.nextToken() + if len(audios) != 0 { + return nil, fmt.Errorf("audio is already set") + } + audioLoop: + for { + if p.curToken.Type != TokenQuotedString { + break audioLoop + } + var audio string + audio, err = p.parseQuotedString() + if err != nil { + return nil, err + } + audios = append(audios, audio) + p.nextToken() + } + case TokenFile: + p.nextToken() + if len(files) != 0 { + return nil, fmt.Errorf("file is already set") + } + fileLoop: + for { + if p.curToken.Type != TokenQuotedString { + break fileLoop + } + var file string + file, err = p.parseQuotedString() + if err != nil { + 
return nil, err + } + files = append(files, file) + p.nextToken() + } case TokenEffort: - { - p.nextToken() // pass Effort - switch p.curToken.Type { - case TokenNone: - effort = "none" - case TokenMinimal: - effort = "minimal" - case TokenLow: - effort = "low" - case TokenMedium: - effort = "medium" - case TokenHigh: - effort = "high" - case TokenMax: - effort = "max" - default: - return nil, fmt.Errorf("invalid effort level") - } - p.nextToken() - break + p.nextToken() // pass Effort + switch p.curToken.Type { + case TokenNone: + effort = "none" + case TokenMinimal: + effort = "minimal" + case TokenLow: + effort = "low" + case TokenMedium: + effort = "medium" + case TokenHigh: + effort = "high" + case TokenMax: + effort = "max" + default: + return nil, fmt.Errorf("invalid effort level") } + p.nextToken() + break optionsLoop case TokenVerbosity: - { - p.nextToken() // pass VERBOSITY - switch p.curToken.Type { - case TokenLow: - verbosity = "low" - case TokenMedium: - verbosity = "median" - case TokenHigh: - verbosity = "high" - default: - return nil, fmt.Errorf("invalid verbosity level") - } - p.nextToken() - break + p.nextToken() // pass VERBOSITY + switch p.curToken.Type { + case TokenLow: + verbosity = "low" + case TokenMedium: + verbosity = "median" + case TokenHigh: + verbosity = "high" + default: + return nil, fmt.Errorf("invalid verbosity level") } + p.nextToken() + break optionsLoop + case TokenSemicolon: + p.nextToken() + break optionsLoop // done default: - return nil, fmt.Errorf("expected VERBOSITY or EFFORT") + // No more options to process + break optionsLoop } } + cmd := NewCommand("chat_to_model") - // Semicolon is optional - if p.curToken.Type == TokenSemicolon { - p.nextToken() - } - - if compositeModelName != "" { - cmd.Params["composite_model_name"] = compositeModelName - } - cmd.Params["message"] = message + cmd.Params["composite_model_name"] = compositeModelName + cmd.Params["messages"] = messages + cmd.Params["images"] = images + 
cmd.Params["videos"] = videos + cmd.Params["audios"] = audios + cmd.Params["files"] = files cmd.Params["thinking"] = false cmd.Params["stream"] = false cmd.Params["effort"] = effort