Implement UpdateDataset and UpdateMetadata in Go (#13928)

### What problem does this PR solve?

Implement UpdateDataset and UpdateMetadata in Go.

Add CLI commands:
UPDATE CHUNK <chunk_id> OF DATASET <dataset_name> SET <update_fields>
REMOVE TAGS 'tag1', 'tag2' FROM DATASET 'dataset_name';
SET METADATA OF DOCUMENT <doc_id> TO <meta>


### Type of change

- [ ] Refactoring
This commit is contained in:
qinling0210
2026-04-07 09:44:51 +08:00
committed by GitHub
parent 60ec5880e5
commit 49386bc1b5
27 changed files with 1620 additions and 171 deletions

View File

@@ -99,6 +99,9 @@ sql_command: login_user
| list_chunks
| insert_dataset_from_file
| insert_metadata_from_file
| update_chunk
| set_metadata
| remove_tags
| create_chat_session
| drop_chat_session
| list_chat_sessions
@@ -114,10 +117,12 @@ sql_command: login_user
// meta command definition
meta_command: "\\" meta_command_name [meta_args]
COMMA: ","
meta_command_name: /[a-zA-Z?]+/
meta_args: (meta_arg)+
meta_arg: /[^\\s"']+/ | quoted_string
meta_arg: /[^\s"',]+/ | quoted_string
// command definition
@@ -215,8 +220,11 @@ SIZE: "SIZE"i
KEYWORDS: "KEYWORDS"i
AVAILABLE: "AVAILABLE"i
FILE: "FILE"i
UPDATE: "UPDATE"i
REMOVE: "REMOVE"i
TAGS: "TAGS"i
login_user: LOGIN USER quoted_string ";"
login_user: LOGIN USER quoted_string (PASSWORD quoted_string)? ";"
list_services: LIST SERVICES ";"
show_service: SHOW SERVICE NUMBER ";"
startup_service: STARTUP SERVICE NUMBER ";"
@@ -299,6 +307,9 @@ user_statement: ping_server
| list_user_default_models
| import_docs_into_dataset
| search_on_datasets
| update_chunk
| set_metadata
| remove_tags
| create_chat_session
| drop_chat_session
| list_chat_sessions
@@ -328,8 +339,8 @@ create_user_dataset_with_pipeline: CREATE DATASET quoted_string WITH EMBEDDING q
drop_user_dataset: DROP DATASET quoted_string ";"
list_user_dataset_files: LIST FILES OF DATASET quoted_string ";"
list_user_dataset_documents: LIST DOCUMENTS OF DATASET quoted_string ";"
list_user_datasets_metadata: LIST METADATA OF DATASETS quoted_string ("," quoted_string)* ";"
list_user_documents_metadata_summary: LIST METADATA SUMMARY OF DATASET quoted_string (DOCUMENTS quoted_string ("," quoted_string)*)? ";"
list_user_datasets_metadata: LIST METADATA OF DATASETS quoted_string (COMMA quoted_string)* ";"
list_user_documents_metadata_summary: LIST METADATA SUMMARY OF DATASET quoted_string (DOCUMENTS quoted_string (COMMA quoted_string)*)? ";"
list_user_agents: LIST AGENTS ";"
list_user_chats: LIST CHATS ";"
create_user_chat: CREATE CHAT quoted_string ";"
@@ -353,11 +364,15 @@ parse_dataset_docs: PARSE quoted_string OF DATASET quoted_string ";"
parse_dataset_sync: PARSE DATASET quoted_string SYNC ";"
parse_dataset_async: PARSE DATASET quoted_string ASYNC ";"
update_chunk: UPDATE CHUNK quoted_string OF DATASET quoted_string SET quoted_string ";"
set_metadata: SET METADATA OF DOCUMENT quoted_string TO quoted_string ";"
remove_tags: REMOVE TAGS quoted_string (COMMA quoted_string)* FROM DATASET quoted_string ";"
// Internal CLI for GO
insert_dataset_from_file: INSERT DATASET FROM FILE quoted_string ";"
insert_metadata_from_file: INSERT METADATA FROM FILE quoted_string ";"
identifier_list: identifier ("," identifier)*
identifier_list: identifier (COMMA identifier)*
identifier: WORD
quoted_string: QUOTED_STRING
@@ -381,7 +396,13 @@ class RAGFlowCLITransformer(Transformer):
def login_user(self, items):
    """Build the ``login_user`` command from a parsed LOGIN USER statement.

    Args:
        items: Parsed children of the rule. ``items[2]`` holds the quoted
            e-mail; when the optional PASSWORD clause is present the rule
            has five children and ``items[4]`` holds the quoted password.

    Returns:
        A dict with ``type`` and ``email``, plus ``password`` only when the
        PASSWORD clause was given.
    """
    email = items[2].children[0].strip("'\"")
    command = {"type": "login_user", "email": email}
    # An unconditional return used to sit here, which made the PASSWORD
    # branch unreachable; the password is now actually forwarded.
    if len(items) == 5:
        # With password: LOGIN USER email PASSWORD password
        command["password"] = items[4].children[0].strip("'\"")
    return command
def ping_server(self, items):
    """Return the command dict for the ping statement; ``items`` is not inspected."""
    command = {"type": "ping_server"}
    return command
@@ -766,6 +787,44 @@ class RAGFlowCLITransformer(Transformer):
file_path = items[4].children[0].strip("'\"")
return {"type": "insert_metadata_from_file", "file_path": file_path}
def update_chunk(self, items):
    """Build the ``update_chunk`` command from a parsed UPDATE CHUNK statement.

    The values are read from positions 2 (chunk id), 5 (dataset name) and
    7 (JSON body) of ``items``; surrounding quote characters are stripped.
    """

    def unquote(node):
        # Nodes with children carry the text in children[0]; anything else
        # is converted to text directly.
        children = getattr(node, "children", None)
        raw = children[0] if children else str(node)
        return raw.strip("'\"")

    command = {"type": "update_chunk"}
    for key, position in (("chunk_id", 2), ("dataset_name", 5), ("json_body", 7)):
        command[key] = unquote(items[position])
    return command
def set_metadata(self, items):
    """Build the ``set_metadata`` command from a parsed SET METADATA statement.

    ``items[4]`` holds the quoted document id and ``items[6]`` the quoted
    metadata text; both are returned with surrounding quotes stripped.
    """

    def unquote(node):
        return node.children[0].strip("'\"")

    return {
        "type": "set_metadata",
        "doc_id": unquote(items[4]),
        "meta": unquote(items[6]),
    }
def remove_tags(self, items):
    """Build the ``remove_tags`` command from a parsed REMOVE TAGS statement.

    Tags are the quoted strings found between the TAGS keyword and the FROM
    token; the dataset name is the quoted string right after the DATASET
    token. ``dataset_name`` stays ``None`` when no DATASET token is present.
    """

    def is_token(node, kind):
        return hasattr(node, "type") and node.type == kind

    # Everything after "REMOVE TAGS" up to FROM; items without children
    # (such as the separators) are ignored.
    tags = []
    for node in items[2:]:
        if is_token(node, "FROM"):
            break
        if hasattr(node, "children") and node.children:
            tags.append(node.children[0].strip("'\""))

    dataset_name = None
    for position, node in enumerate(items):
        if is_token(node, "DATASET"):
            dataset_name = items[position + 1].children[0].strip("'\"")
            break

    return {"type": "remove_tags", "dataset_name": dataset_name, "tags": tags}
def list_chunks(self, items):
doc_id = items[4].children[0].strip("'\"")
result = {"type": "list_chunks", "doc_id": doc_id}

View File

@@ -18,6 +18,9 @@ import sys
import argparse
import base64
import getpass
import os
import atexit
import readline
from cmd import Cmd
from typing import Any, Dict, List
@@ -61,6 +64,12 @@ class RAGFlowCLI(Cmd):
self.port: int = 0
self.mode: str = "admin"
self.ragflow_client = None
# History file for readline persistence
self.history_file = os.path.expanduser("~/.ragflow_cli_history")
# Load existing history
self._load_history()
# Register cleanup to save history on exit
atexit.register(self._save_history)
intro = r"""Type "\h" for help."""
prompt = "ragflow> "
@@ -99,6 +108,7 @@ class RAGFlowCLI(Cmd):
return {"type": "empty"}
self.command_history.append(command_str)
readline.add_history(command_str)
try:
result = self.parser.parse(command_str)
@@ -210,6 +220,21 @@ class RAGFlowCLI(Cmd):
print(separator)
def _load_history(self):
"""Load command history from file."""
try:
if os.path.exists(self.history_file):
readline.read_history_file(self.history_file)
except Exception:
pass # Ignore errors loading history
def _save_history(self):
"""Save command history to file."""
try:
readline.write_history_file(self.history_file)
except Exception:
pass # Ignore errors saving history
def run_interactive(self, args):
if self.verify_auth(args, single_command=False, auth=args["auth"]):
print(r"""

View File

@@ -24,7 +24,6 @@ from http_client import HttpClient
from lark import Tree
from user import encrypt_password, login_user
import getpass
import base64
from Cryptodome.Cipher import PKCS1_v1_5 as Cipher_pkcs1_v1_5
from Cryptodome.PublicKey import RSA
@@ -63,10 +62,16 @@ class RAGFlowClient:
return
email: str = command["email"]
user_password = getpass.getpass(f"password for {email}: ").strip()
user_password: str = command.get("password")
if not user_password:
import getpass
user_password = getpass.getpass("Password: ")
try:
token = login_user(self.http_client, self.server_type, email, user_password)
self.http_client.login_token = token
# Also store as api_key for API endpoint authentication
if self.server_type == "user":
self.http_client.api_key = token
print(f"Login user {email} successfully")
except Exception as e:
print(str(e))
@@ -1506,6 +1511,108 @@ class RAGFlowClient:
else:
print(f"Fail to insert metadata from file, code: {res_json['code']}, message: {res_json['message']}")
def update_chunk(self, command_dict):
    """Update one chunk of a dataset through the HTTP API (USER mode only).

    Args:
        command_dict: Parsed command with ``chunk_id``, ``dataset_name`` and
            ``json_body`` (the update payload as JSON text).

    The outcome is printed; the method returns ``None`` in every case.
    """

    def read_json(resp):
        # An error response may carry a non-JSON body; calling .json() on it
        # raises and would hide the HTTP status reporting below. ValueError is
        # the base class of the standard JSON decode errors.
        try:
            return resp.json()
        except ValueError:
            return {}

    if self.server_type != "user":
        print("This command is only allowed in USER mode")
        return

    chunk_id = command_dict["chunk_id"]
    dataset_name = command_dict["dataset_name"]
    json_body_str = command_dict["json_body"]

    # Get dataset_id from dataset_name
    dataset_id = self._get_dataset_id(dataset_name)
    if dataset_id is None:
        return

    # Get doc_id from chunk_id via GET /chunk/get; the update path below
    # needs the owning document.
    response = self.http_client.request(
        "GET",
        f"/chunk/get?chunk_id={chunk_id}",
        use_api_base=False,
        auth_kind="web",
    )
    res_json = read_json(response)
    if response.status_code != 200:
        print(f"Fail to get chunk info, code: {res_json.get('code')}, message: {res_json.get('message')}")
        return

    doc_id = None
    if res_json.get("code") == 0 and res_json.get("data"):
        doc_id = res_json["data"].get("doc_id")
    if not doc_id:
        print(f"Could not find document_id for chunk {chunk_id}")
        return

    # Parse json_body
    try:
        payload = json.loads(json_body_str)
    except json.JSONDecodeError as e:
        print(f"Invalid JSON body: {e}")
        return

    # Call PUT /datasets/{dataset_id}/documents/{doc_id}/chunks/{chunk_id}
    path = f"/datasets/{dataset_id}/documents/{doc_id}/chunks/{chunk_id}"
    response = self.http_client.request("PUT", path, json_body=payload, use_api_base=True, auth_kind="api")
    if response.status_code != 200:
        print(f"Fail to update chunk, HTTP {response.status_code}")
        return

    res_json = read_json(response)
    if res_json.get("code") == 0:
        print(f"Success to update chunk: {chunk_id}")
    else:
        print(f"Fail to update chunk, code: {res_json.get('code')}, message: {res_json.get('message')}")
def set_metadata(self, command_dict):
    """Set the metadata of a document via POST /document/set_meta (USER mode only).

    Args:
        command_dict: Parsed command with ``doc_id`` and ``meta``; ``meta``
            is forwarded unchanged as a JSON string.

    The outcome is printed; the method returns ``None`` in every case.
    """
    if self.server_type != "user":
        print("This command is only allowed in USER mode")
        return

    doc_id = command_dict["doc_id"]
    meta_json_str = command_dict["meta"]

    # Send meta as JSON string
    payload = {
        "doc_id": doc_id,
        "meta": meta_json_str,
    }
    response = self.http_client.request(
        "POST",
        "/document/set_meta",
        json_body=payload,
        use_api_base=False,
        auth_kind="web",
    )

    # Check the status first: an error response may not carry a JSON body,
    # and decoding it up front would raise instead of reporting the status.
    if response.status_code != 200:
        print(f"Fail to set metadata, HTTP {response.status_code}")
        return

    try:
        res_json = response.json()
    except ValueError:
        # 200 with an undecodable body: report it as a failure, not a crash.
        res_json = {}

    if res_json.get("code") == 0:
        print(f"Success to set metadata for document: {doc_id}")
    else:
        print(f"Fail to set metadata, code: {res_json.get('code')}, message: {res_json.get('message')}")
def remove_tags(self, command_dict):
    """Remove tags from a dataset via POST /kb/{dataset_id}/rm_tags (USER mode only).

    Args:
        command_dict: Parsed command with ``dataset_name`` and ``tags``
            (the list of tag names to remove).

    The outcome is printed; the method returns ``None`` in every case.
    """
    if self.server_type != "user":
        print("This command is only allowed in USER mode")
        return

    dataset_name = command_dict["dataset_name"]
    dataset_id = self._get_dataset_id(dataset_name)
    if dataset_id is None:
        print(f"Dataset not found: {dataset_name}")
        return

    tags = command_dict["tags"]
    payload = {
        "tags": tags,
    }
    response = self.http_client.request(
        "POST",
        f"/kb/{dataset_id}/rm_tags",
        json_body=payload,
        use_api_base=False,
        auth_kind="web",
    )

    # Check the status first: an error response may not carry a JSON body,
    # and decoding it up front would raise instead of reporting the status.
    if response.status_code != 200:
        print(f"Fail to remove tags, HTTP {response.status_code}")
        return

    try:
        res_json = response.json()
    except ValueError:
        # 200 with an undecodable body: report it as a failure, not a crash.
        res_json = {}

    if res_json.get("code") == 0:
        print(f"Success to remove tags from dataset: {dataset_name}")
    else:
        print(f"Fail to remove tags, code: {res_json.get('code')}, message: {res_json.get('message')}")
def list_chunks(self, command_dict):
if self.server_type != "user":
print("This command is only allowed in USER mode")
@@ -1903,6 +2010,12 @@ def run_command(client: RAGFlowClient, command_dict: dict):
return client.insert_dataset_from_file(command_dict)
case "insert_metadata_from_file":
return client.insert_metadata_from_file(command_dict)
case "update_chunk":
return client.update_chunk(command_dict)
case "set_metadata":
return client.set_metadata(command_dict)
case "remove_tags":
return client.remove_tags(command_dict)
case "list_chunks":
return client.list_chunks(command_dict)
case "meta":