Go: CLI chat with text, image, video (#14573)

### What problem does this PR solve?

```
RAGFlow(user)> chat with 'glm-4.6v-flash@test@zhipu-ai' message 'What are the pics talk about?' image 'https://cdn.bigmodel.cn/static/logo/register.png' 'https://cdn.bigmodel.cn/static/logo/api-key.png'
Answer: The first picture shows a login/register modal with options for phone number login, account login, and WeChat QR code login, along with a prompt for new users to get a 20 million tokens experience package. The second picture displays the API keys management page of a platform, including a warning about API key security and a table listing existing API keys with details like creation time and usage history.
Time: 31.600545
RAGFlow(user)> chat with 'glm-4.6v-flash@test@zhipu-ai' message 'What are the video talk about?' video 'https://cdn.bigmodel.cn/agent-demos/lark/113123.mov'
Answer: Based on the sequence of frames provided, the video is a demonstration of a web search and navigation process.

1.  The video starts with a blank Google search page.
2.  The user types "智谱" (which is the Chinese name for the company Zhipu AI) into the search box.
3.  The search is initiated and the page shows "About 0 results".
4.  The search results load, showing information about Zhipu AI, including its website.
5.  The user clicks on the main website link (www.zhipuai.cn).
6.  The video ends by showing the homepage of Zhipu AI's website, titled "Z.ai GLM Large Model Open Platform".

In summary, the video is about searching for the company "智谱" (Zhipu AI) on Google and then navigating to its official website.
Time: 76.582520
```

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
Jin Hai
2026-05-05 18:14:39 +08:00
committed by GitHub
parent 24af0875e5
commit 3a51c27a75
4 changed files with 289 additions and 99 deletions

View File

@@ -301,6 +301,14 @@ func (l *Lexer) lookupIdent(ident string) Token {
return Token{Type: TokenChats, Value: ident}
case "CHAT":
return Token{Type: TokenChat, Value: ident}
case "MESSAGE":
return Token{Type: TokenMessage, Value: ident}
case "IMAGE":
return Token{Type: TokenImage, Value: ident}
case "VIDEO":
return Token{Type: TokenVideo, Value: ident}
case "AUDIO":
return Token{Type: TokenAudio, Value: ident}
case "THINK":
return Token{Type: TokenThink, Value: ident}
case "EFFORT":

View File

@@ -81,6 +81,10 @@ const (
TokenDefault
TokenChats
TokenChat
TokenMessage
TokenImage
TokenVideo
TokenAudio
TokenStream
TokenFiles
TokenAs
@@ -109,7 +113,6 @@ const (
TokenVector
TokenSize
TokenName // For ALTER PROVIDER <name> NAME <new_name>
TokenPool
TokenBalance
TokenInstance
TokenInstances

View File

@@ -19,8 +19,10 @@ package cli
import (
"bufio"
"context"
"encoding/base64"
"encoding/json"
"fmt"
netUrl "net/url"
"os"
ce "ragflow/internal/cli/filesystem"
"strings"
@@ -1514,6 +1516,14 @@ func (c *RAGFlowClient) EnableOrDisableModel(cmd *Command, status string) (Respo
return &result, nil
}
// isValidURL reports whether str parses as a URL that carries both a scheme
// and a host. Anything else (parse failure, bare path, scheme-only or
// host-only reference) yields false.
func isValidURL(str string) bool {
	parsed, parseErr := netUrl.Parse(str)
	if parseErr == nil && parsed.Scheme != "" && parsed.Host != "" {
		return true
	}
	return false
}
func (c *RAGFlowClient) ChatToModel(cmd *Command) (ResponseIf, error) {
if c.ServerType != "user" {
return nil, fmt.Errorf("this command is only allowed in USER mode")
@@ -1539,7 +1549,102 @@ func (c *RAGFlowClient) ChatToModel(cmd *Command) (ResponseIf, error) {
return nil, fmt.Errorf("model name not provided and no current model set. Use 'use model' command first")
}
message := cmd.Params["message"].(string)
formattedMessages := []map[string]interface{}{}
messages, ok := cmd.Params["messages"].([]string)
if !ok {
return nil, fmt.Errorf("messages not provided")
}
contents := []map[string]interface{}{}
if len(messages) > 0 {
for _, message := range messages {
contents = append(contents, map[string]interface{}{
"type": "text",
"text": message,
})
}
}
images, ok := cmd.Params["images"].([]string)
if !ok {
return nil, fmt.Errorf("images not provided")
}
if len(images) > 0 {
for _, image := range images {
if isValidURL(image) {
contents = append(contents, map[string]interface{}{
"type": "image_url",
"image_url": map[string]string{
"url": image,
},
})
} else {
// image is a path, read the file and turn it into base64
imageContent, err := os.ReadFile(image)
if err != nil {
return nil, fmt.Errorf("failed to read image: %w", err)
}
contents = append(contents, map[string]interface{}{
"type": "image_file",
"image_file": map[string]interface{}{
"content": base64.StdEncoding.EncodeToString(imageContent),
},
})
}
}
}
videos, ok := cmd.Params["videos"].([]string)
if !ok {
return nil, fmt.Errorf("images not provided")
}
if len(videos) > 0 {
for _, video := range videos {
if isValidURL(video) {
contents = append(contents, map[string]interface{}{
"type": "video_url",
"video_url": map[string]interface{}{
"url": video,
},
})
} else {
return nil, fmt.Errorf("invalid video URL: %s", video)
}
}
}
//audios, ok := cmd.Params["audios"].([]string)
//if !ok {
// return nil, fmt.Errorf("images not provided")
//}
files, ok := cmd.Params["files"].([]string)
if !ok {
return nil, fmt.Errorf("images not provided")
}
if len(files) > 0 {
for _, file := range files {
if isValidURL(file) {
contents = append(contents, map[string]interface{}{
"type": "file_url",
"file_url": map[string]interface{}{
"url": file,
},
})
} else {
return nil, fmt.Errorf("invalid file URL: %s", file)
}
}
}
formattedText := map[string]interface{}{
"role": "user",
"content": contents,
}
formattedMessages = append(formattedMessages, formattedText)
thinking := cmd.Params["thinking"].(bool)
stream := cmd.Params["stream"].(bool)
effort := cmd.Params["effort"].(string)
@@ -1547,26 +1652,26 @@ func (c *RAGFlowClient) ChatToModel(cmd *Command) (ResponseIf, error) {
url := "/chat/completions"
message = strings.TrimSpace(message)
var content interface{} = message
if strings.HasPrefix(message, "[") && strings.HasSuffix(message, "]") {
var parts []map[string]interface{}
if err := json.Unmarshal([]byte(message), &parts); err == nil {
content = parts
}
}
formattedMessage := []map[string]interface{}{
{
"role": "user",
"content": content,
},
}
//message = strings.TrimSpace(message)
//var content interface{} = message
//if strings.HasPrefix(message, "[") && strings.HasSuffix(message, "]") {
// var parts []map[string]interface{}
// if err := json.Unmarshal([]byte(message), &parts); err == nil {
// content = parts
// }
//}
//formattedMessage := []map[string]interface{}{
// {
// "role": "user",
// "content": content,
// },
//}
payload := map[string]interface{}{
"provider_name": providerName,
"instance_name": instanceName,
"model_name": modelName,
"messages": formattedMessage,
"messages": formattedMessages,
"stream": stream,
"thinking": thinking,
}

View File

@@ -2412,102 +2412,176 @@ func (p *Parser) parseDisableCommand() (*Command, error) {
return cmd, nil
}
// CHAT 'model@instance@provider' 'hello world'
// CHAT WITH 'model@instance@provider' MESSAGE 'hello world' 'who are you' IMAGE 'url1' 'file0' VIDEO "url2.mov" "file1" FILE "url" "path file2" AUDIO "file.wav"
func (p *Parser) parseChatCommand() (*Command, error) {
p.nextToken() // consume CHAT
var compositeModelName string
var message string
// Check if we have a quoted string that looks like a model identifier (contains two '@' separators)
// Format: 'model@instance@provider' or just 'message'
if p.curToken.Type == TokenQuotedString {
firstArg := p.curToken.Value
// Check if it looks like a model identifier (contains exactly 2 '@' separators)
slashCount := strings.Count(firstArg, "@")
if slashCount == 2 {
// This is likely a model identifier, expect another quoted string for message
compositeModelName = firstArg
p.nextToken()
// After model name, expect message
if p.curToken.Type != TokenQuotedString {
return nil, fmt.Errorf("expected message after model name")
}
message = p.curToken.Value
p.nextToken()
} else {
// This is just a message, use current model
message = firstArg
p.nextToken()
}
} else if p.curToken.Type == TokenIdentifier {
// Context engine style: chat <message>
message = p.curToken.Value
p.nextToken()
} else {
return nil, fmt.Errorf("expected model name (quoted string) or message")
}
cmd := NewCommand("chat_to_model")
var err error
var compositeModelName string = ""
var messages []string
var images []string
var videos []string
var audios []string
var files []string
effort := "default"
verbosity := "low"
if p.curToken.Type == TokenWith {
p.nextToken() // pass WITH
optionsLoop:
for {
switch p.curToken.Type {
case TokenWith:
p.nextToken()
// 'model@instance@provider'
if compositeModelName != "" {
return nil, fmt.Errorf("model name is already set")
}
compositeModelName, err = p.parseQuotedString()
if err != nil {
return nil, err
}
p.nextToken()
case TokenMessage:
p.nextToken()
if len(messages) != 0 {
return nil, fmt.Errorf("message is already set")
}
messageLoop:
for {
if p.curToken.Type != TokenQuotedString {
break messageLoop
}
var message string
message, err = p.parseQuotedString()
if err != nil {
return nil, err
}
message = strings.TrimSpace(message)
messages = append(messages, message)
p.nextToken()
}
case TokenImage:
p.nextToken()
if len(images) != 0 {
return nil, fmt.Errorf("image is already set")
}
imageLoop:
for {
if p.curToken.Type != TokenQuotedString {
break imageLoop
}
var image string
image, err = p.parseQuotedString()
if err != nil {
return nil, err
}
images = append(images, image)
p.nextToken()
}
case TokenVideo:
p.nextToken()
if len(videos) != 0 {
return nil, fmt.Errorf("video is already set")
}
videoLoop:
for {
if p.curToken.Type != TokenQuotedString {
break videoLoop
}
var video string
video, err = p.parseQuotedString()
if err != nil {
return nil, err
}
videos = append(videos, video)
p.nextToken()
}
case TokenAudio:
p.nextToken()
if len(audios) != 0 {
return nil, fmt.Errorf("video is already set")
}
audioLoop:
for {
if p.curToken.Type != TokenQuotedString {
break audioLoop
}
var audio string
audio, err = p.parseQuotedString()
if err != nil {
return nil, err
}
audios = append(audios, audio)
p.nextToken()
}
case TokenFile:
p.nextToken()
if len(files) != 0 {
return nil, fmt.Errorf("video is already set")
}
fileLoop:
for {
if p.curToken.Type != TokenQuotedString {
break fileLoop
}
var file string
file, err = p.parseQuotedString()
if err != nil {
return nil, err
}
files = append(files, file)
p.nextToken()
}
case TokenEffort:
{
p.nextToken() // pass Effort
switch p.curToken.Type {
case TokenNone:
effort = "none"
case TokenMinimal:
effort = "minimal"
case TokenLow:
effort = "low"
case TokenMedium:
effort = "medium"
case TokenHigh:
effort = "high"
case TokenMax:
effort = "max"
default:
return nil, fmt.Errorf("invalid effort level")
}
p.nextToken()
break
p.nextToken() // pass Effort
switch p.curToken.Type {
case TokenNone:
effort = "none"
case TokenMinimal:
effort = "minimal"
case TokenLow:
effort = "low"
case TokenMedium:
effort = "medium"
case TokenHigh:
effort = "high"
case TokenMax:
effort = "max"
default:
return nil, fmt.Errorf("invalid effort level")
}
p.nextToken()
break optionsLoop
case TokenVerbosity:
{
p.nextToken() // pass VERBOSITY
switch p.curToken.Type {
case TokenLow:
verbosity = "low"
case TokenMedium:
verbosity = "median"
case TokenHigh:
verbosity = "high"
default:
return nil, fmt.Errorf("invalid verbosity level")
}
p.nextToken()
break
p.nextToken() // pass VERBOSITY
switch p.curToken.Type {
case TokenLow:
verbosity = "low"
case TokenMedium:
verbosity = "median"
case TokenHigh:
verbosity = "high"
default:
return nil, fmt.Errorf("invalid verbosity level")
}
p.nextToken()
break optionsLoop
case TokenSemicolon:
p.nextToken()
break optionsLoop // done
default:
return nil, fmt.Errorf("expected VERBOSITY or EFFORT")
// No more options to process
break optionsLoop
}
}
cmd := NewCommand("chat_to_model")
// Semicolon is optional
if p.curToken.Type == TokenSemicolon {
p.nextToken()
}
if compositeModelName != "" {
cmd.Params["composite_model_name"] = compositeModelName
}
cmd.Params["message"] = message
cmd.Params["composite_model_name"] = compositeModelName
cmd.Params["messages"] = messages
cmd.Params["images"] = images
cmd.Params["videos"] = videos
cmd.Params["audios"] = audios
cmd.Params["files"] = files
cmd.Params["thinking"] = false
cmd.Params["stream"] = false
cmd.Params["effort"] = effort