mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
Implement metadata search in Infinity in GO (#13706)
### What problem does this PR solve?
Add cli
LIST DOCUMENTS OF DATASET quoted_string ";"
LIST METADATA OF DATASETS quoted_string ("," quoted_string)* ";"
LIST METADATA SUMMARY OF DATASET quoted_string (DOCUMENTS quoted_string
("," quoted_string)*)? ";"
### Type of change
- [x] Refactoring
This commit is contained in:
@@ -77,6 +77,9 @@ sql_command: login_user
|
||||
| drop_user_dataset
|
||||
| list_user_datasets
|
||||
| list_user_dataset_files
|
||||
| list_user_dataset_documents
|
||||
| list_user_datasets_metadata
|
||||
| list_user_documents_metadata_summary
|
||||
| list_user_agents
|
||||
| list_user_chats
|
||||
| create_user_chat
|
||||
@@ -161,10 +164,14 @@ DEFAULT: "DEFAULT"i
|
||||
CHATS: "CHATS"i
|
||||
CHAT: "CHAT"i
|
||||
FILES: "FILES"i
|
||||
DOCUMENTS: "DOCUMENTS"i
|
||||
METADATA: "METADATA"i
|
||||
SUMMARY: "SUMMARY"i
|
||||
AS: "AS"i
|
||||
PARSE: "PARSE"i
|
||||
IMPORT: "IMPORT"i
|
||||
INTO: "INTO"i
|
||||
IN: "IN"i
|
||||
WITH: "WITH"i
|
||||
PARSER: "PARSER"i
|
||||
PIPELINE: "PIPELINE"i
|
||||
@@ -299,6 +306,9 @@ create_user_dataset_with_parser: CREATE DATASET quoted_string WITH EMBEDDING quo
|
||||
create_user_dataset_with_pipeline: CREATE DATASET quoted_string WITH EMBEDDING quoted_string PIPELINE quoted_string ";"
|
||||
drop_user_dataset: DROP DATASET quoted_string ";"
|
||||
list_user_dataset_files: LIST FILES OF DATASET quoted_string ";"
|
||||
list_user_dataset_documents: LIST DOCUMENTS OF DATASET quoted_string ";"
|
||||
list_user_datasets_metadata: LIST METADATA OF DATASETS quoted_string ("," quoted_string)* ";"
|
||||
list_user_documents_metadata_summary: LIST METADATA SUMMARY OF DATASET quoted_string (DOCUMENTS quoted_string ("," quoted_string)*)? ";"
|
||||
list_user_agents: LIST AGENTS ";"
|
||||
list_user_chats: LIST CHATS ";"
|
||||
create_user_chat: CREATE CHAT quoted_string ";"
|
||||
@@ -592,6 +602,28 @@ class RAGFlowCLITransformer(Transformer):
|
||||
dataset_name = items[4].children[0].strip("'\"")
|
||||
return {"type": "list_user_dataset_files", "dataset_name": dataset_name}
|
||||
|
||||
def list_user_dataset_documents(self, items):
|
||||
dataset_name = items[4].children[0].strip("'\"")
|
||||
return {"type": "list_user_dataset_documents", "dataset_name": dataset_name}
|
||||
|
||||
def list_user_datasets_metadata(self, items):
|
||||
dataset_names = []
|
||||
dataset_names.append(items[4].children[0].strip("'\""))
|
||||
for i in range(5, len(items)):
|
||||
if items[i] and hasattr(items[i], 'children') and items[i].children:
|
||||
dataset_names.append(items[i].children[0].strip("'\""))
|
||||
return {"type": "list_user_datasets_metadata", "dataset_names": dataset_names}
|
||||
|
||||
def list_user_documents_metadata_summary(self, items):
|
||||
dataset_name = items[5].children[0].strip("'\"")
|
||||
doc_ids = []
|
||||
if len(items) > 6 and items[6] == "DOCUMENTS":
|
||||
for i in range(7, len(items)):
|
||||
if items[i] and hasattr(items[i], 'children') and items[i].children:
|
||||
doc_id = items[i].children[0].strip("'\"")
|
||||
doc_ids.append(doc_id)
|
||||
return {"type": "list_user_documents_metadata_summary", "dataset_name": dataset_name, "document_ids": doc_ids}
|
||||
|
||||
def list_user_agents(self, items):
|
||||
return {"type": "list_user_agents"}
|
||||
|
||||
|
||||
@@ -63,7 +63,7 @@ class RAGFlowClient:
|
||||
print("Can't access server for login (connection failed)")
|
||||
return
|
||||
|
||||
email : str = command["email"]
|
||||
email: str = command["email"]
|
||||
user_password = getpass.getpass(f"password for {email}: ").strip()
|
||||
try:
|
||||
token = login_user(self.http_client, self.server_type, email, user_password)
|
||||
@@ -597,7 +597,8 @@ class RAGFlowClient:
|
||||
if self.server_type != "admin":
|
||||
print("This command is only allowed in ADMIN mode")
|
||||
license = command["license"]
|
||||
response = self.http_client.request("POST", "/admin/license", json_body={"license": license}, use_api_base=True, auth_kind="admin")
|
||||
response = self.http_client.request("POST", "/admin/license", json_body={"license": license}, use_api_base=True,
|
||||
auth_kind="admin")
|
||||
res_json = response.json()
|
||||
if response.status_code == 200:
|
||||
print("Set license successfully")
|
||||
@@ -609,7 +610,9 @@ class RAGFlowClient:
|
||||
print("This command is only allowed in ADMIN mode")
|
||||
value1 = command["value1"]
|
||||
value2 = command["value2"]
|
||||
response = self.http_client.request("POST", "/admin/license/config", json_body={"value1": value1, "value2": value2}, use_api_base=True, auth_kind="admin")
|
||||
response = self.http_client.request("POST", "/admin/license/config",
|
||||
json_body={"value1": value1, "value2": value2}, use_api_base=True,
|
||||
auth_kind="admin")
|
||||
res_json = response.json()
|
||||
if response.status_code == 200:
|
||||
print("Set license successfully")
|
||||
@@ -636,7 +639,6 @@ class RAGFlowClient:
|
||||
else:
|
||||
print(f"Fail to show license, code: {res_json['code']}, message: {res_json['message']}")
|
||||
|
||||
|
||||
def list_server_configs(self, command):
|
||||
"""List server configs by calling /system/configs API and flattening the JSON response."""
|
||||
response = self.http_client.request("GET", "/system/configs", use_api_base=False, auth_kind="web")
|
||||
@@ -825,6 +827,130 @@ class RAGFlowClient:
|
||||
return
|
||||
self._print_table_simple(res_json)
|
||||
|
||||
def list_user_dataset_documents(self, command_dict):
|
||||
if self.server_type != "user":
|
||||
print("This command is only allowed in USER mode")
|
||||
|
||||
dataset_name = command_dict["dataset_name"]
|
||||
dataset_id = self._get_dataset_id(dataset_name)
|
||||
if dataset_id is None:
|
||||
return
|
||||
|
||||
docs = self._list_documents(dataset_name, dataset_id)
|
||||
if docs is None:
|
||||
return
|
||||
|
||||
if not docs:
|
||||
print(f"No documents found in dataset {dataset_name}")
|
||||
return
|
||||
|
||||
print(f"Documents in dataset: {dataset_name}")
|
||||
print("-" * 60)
|
||||
# Select key fields for display
|
||||
display_docs = []
|
||||
for doc in docs:
|
||||
meta_fields = doc.get("meta_fields", {})
|
||||
# Convert meta_fields dict to string for display
|
||||
meta_fields_str = ""
|
||||
if meta_fields:
|
||||
meta_fields_str = str(meta_fields)
|
||||
display_doc = {
|
||||
"name": doc.get("name", ""),
|
||||
"id": doc.get("id", ""),
|
||||
"size": doc.get("size", 0),
|
||||
"status": doc.get("status", ""),
|
||||
"created_at": doc.get("created_at", ""),
|
||||
}
|
||||
if meta_fields_str:
|
||||
display_doc["meta_fields"] = meta_fields_str
|
||||
display_docs.append(display_doc)
|
||||
self._print_table_simple(display_docs)
|
||||
|
||||
def list_user_datasets_metadata(self, command_dict):
|
||||
if self.server_type != "user":
|
||||
print("This command is only allowed in USER mode")
|
||||
return
|
||||
|
||||
dataset_names = command_dict["dataset_names"]
|
||||
valid_datasets = []
|
||||
for dataset_name in dataset_names:
|
||||
dataset_id = self._get_dataset_id(dataset_name)
|
||||
if dataset_id is None:
|
||||
print(f"Dataset not found: {dataset_name}")
|
||||
continue
|
||||
valid_datasets.append((dataset_name, dataset_id))
|
||||
|
||||
if not valid_datasets:
|
||||
print("No valid datasets found")
|
||||
return
|
||||
|
||||
dataset_ids = [dataset_id for _, dataset_id in valid_datasets]
|
||||
kb_ids_param = ",".join(dataset_ids)
|
||||
response = self.http_client.request("GET", f"/kb/get_meta?kb_ids={kb_ids_param}",
|
||||
use_api_base=False, auth_kind="web")
|
||||
res_json = response.json()
|
||||
if response.status_code != 200:
|
||||
print(f"Fail to get metadata, code: {res_json.get('code')}, message: {res_json.get('message')}")
|
||||
return
|
||||
|
||||
meta = res_json.get("data", {})
|
||||
if not meta:
|
||||
print("No metadata found")
|
||||
return
|
||||
|
||||
table_data = []
|
||||
for field_name, values_dict in meta.items():
|
||||
for value, docs in values_dict.items():
|
||||
table_data.append({
|
||||
"field": field_name,
|
||||
"value": value,
|
||||
"doc_ids": ", ".join(docs)
|
||||
})
|
||||
self._print_table_simple(table_data)
|
||||
|
||||
def list_user_documents_metadata_summary(self, command_dict):
|
||||
if self.server_type != "user":
|
||||
print("This command is only allowed in USER mode")
|
||||
return
|
||||
|
||||
dataset_name = command_dict["dataset_name"]
|
||||
doc_ids = command_dict.get("document_ids", [])
|
||||
|
||||
kb_id = self._get_dataset_id(dataset_name)
|
||||
if kb_id is None:
|
||||
return
|
||||
|
||||
payload = {"kb_id": kb_id}
|
||||
if doc_ids:
|
||||
payload["doc_ids"] = doc_ids
|
||||
response = self.http_client.request("POST", "/document/metadata/summary", json_body=payload,
|
||||
use_api_base=False, auth_kind="web")
|
||||
res_json = response.json()
|
||||
if response.status_code == 200:
|
||||
summary = res_json.get("data", {}).get("summary", {})
|
||||
if not summary:
|
||||
if doc_ids:
|
||||
print(f"No metadata summary found for documents: {', '.join(doc_ids)}")
|
||||
else:
|
||||
print(f"No metadata summary found in dataset {dataset_name}")
|
||||
return
|
||||
if doc_ids:
|
||||
print(f"Metadata summary for document(s): {', '.join(doc_ids)}")
|
||||
else:
|
||||
print(f"Metadata summary for all documents in dataset: {dataset_name}")
|
||||
print("-" * 60)
|
||||
for field_name, field_info in summary.items():
|
||||
field_type = field_info.get("type", "unknown")
|
||||
values = field_info.get("values", [])
|
||||
print(f"\nField: {field_name} (type: {field_type})")
|
||||
print(f" Total unique values: {len(values)}")
|
||||
if values:
|
||||
print(" Values:")
|
||||
for value, count in values:
|
||||
print(f" {value}: {count}")
|
||||
else:
|
||||
print(f"Fail to get metadata summary, code: {res_json.get('code')}, message: {res_json.get('message')}")
|
||||
|
||||
def list_user_agents(self, command):
|
||||
if self.server_type != "user":
|
||||
print("This command is only allowed in USER mode")
|
||||
@@ -1013,7 +1139,8 @@ class RAGFlowClient:
|
||||
if response.status_code == 200 and res_json["code"] == 0:
|
||||
print(f"Success to create chat session for chat: {chat_name}")
|
||||
else:
|
||||
print(f"Fail to create chat session for chat {chat_name}, code: {res_json['code']}, message: {res_json['message']}")
|
||||
print(
|
||||
f"Fail to create chat session for chat {chat_name}, code: {res_json['code']}, message: {res_json['message']}")
|
||||
|
||||
def drop_chat_session(self, command):
|
||||
if self.server_type != "user":
|
||||
@@ -1040,7 +1167,8 @@ class RAGFlowClient:
|
||||
if response.status_code == 200 and res_json["code"] == 0:
|
||||
print(f"Success to drop chat session '{session_id}' from chat: {chat_name}")
|
||||
else:
|
||||
print(f"Fail to drop chat session '{session_id}' from chat {chat_name}, code: {res_json['code']}, message: {res_json['message']}")
|
||||
print(
|
||||
f"Fail to drop chat session '{session_id}' from chat {chat_name}, code: {res_json['code']}, message: {res_json['message']}")
|
||||
|
||||
def list_chat_sessions(self, command):
|
||||
if self.server_type != "user":
|
||||
@@ -1094,7 +1222,8 @@ class RAGFlowClient:
|
||||
try:
|
||||
data_json = json.loads(data_str)
|
||||
if data_json.get("code") != 0:
|
||||
print(f"\nFail to chat on session, code: {data_json.get('code')}, message: {data_json.get('message', '')}")
|
||||
print(
|
||||
f"\nFail to chat on session, code: {data_json.get('code')}, message: {data_json.get('message', '')}")
|
||||
return
|
||||
# Check if it's the final message
|
||||
if data_json.get("data") is True:
|
||||
@@ -1598,6 +1727,12 @@ def run_command(client: RAGFlowClient, command_dict: dict):
|
||||
client.drop_user_dataset(command_dict)
|
||||
case "list_user_dataset_files":
|
||||
return client.list_user_dataset_files(command_dict)
|
||||
case "list_user_dataset_documents":
|
||||
return client.list_user_dataset_documents(command_dict)
|
||||
case "list_user_datasets_metadata":
|
||||
return client.list_user_datasets_metadata(command_dict)
|
||||
case "list_user_documents_metadata_summary":
|
||||
return client.list_user_documents_metadata_summary(command_dict)
|
||||
case "list_user_agents":
|
||||
return client.list_user_agents(command_dict)
|
||||
case "list_user_chats":
|
||||
@@ -1677,6 +1812,13 @@ GENERATE KEY FOR USER <user>
|
||||
LIST KEYS OF <user>
|
||||
DROP KEY <key> OF <user>
|
||||
|
||||
User Commands (use -t user):
|
||||
LIST DATASETS
|
||||
LIST DOCUMENTS OF DATASET <dataset>
|
||||
SEARCH <query> ON DATASETS <dataset>
|
||||
LIST METADATA OF DATASETS <dataset>[, <dataset>]*
|
||||
LIST METADATA SUMMARY OF DATASET <dataset> DOCUMENTS <doc_id>[, <doc_id>]*
|
||||
|
||||
Meta Commands:
|
||||
\\?, \\h, \\help Show this help
|
||||
\\q, \\quit, \\exit Quit the CLI
|
||||
|
||||
Reference in New Issue
Block a user