Implement GetChunk() in Infinity in GO (#13758)

### What problem does this PR solve?

Implement GetChunk() for Infinity in Go.

Add CLI commands:
GET CHUNK 'XXX';
LIST CHUNKS OF DOCUMENT 'XXX';

### Type of change

- [x] Refactoring
This commit is contained in:
qinling0210
2026-03-24 20:10:21 +08:00
committed by GitHub
parent b308cd3a02
commit 7c8927c4fb
11 changed files with 989 additions and 75 deletions

View File

@@ -91,6 +91,8 @@ sql_command: login_user
| parse_dataset_async
| import_docs_into_dataset
| search_on_datasets
| get_chunk
| list_chunks
| create_chat_session
| drop_chat_session
| list_chat_sessions
@@ -164,6 +166,7 @@ DEFAULT: "DEFAULT"i
CHATS: "CHATS"i
CHAT: "CHAT"i
FILES: "FILES"i
DOCUMENT: "DOCUMENT"i
DOCUMENTS: "DOCUMENTS"i
METADATA: "METADATA"i
SUMMARY: "SUMMARY"i
@@ -194,6 +197,13 @@ FINGERPRINT: "FINGERPRINT"i
LICENSE: "LICENSE"i
CHECK: "CHECK"i
CONFIG: "CONFIG"i
CHUNK: "CHUNK"i
CHUNKS: "CHUNKS"i
GET: "GET"i
PAGE: "PAGE"i
SIZE: "SIZE"i
KEYWORDS: "KEYWORDS"i
AVAILABLE: "AVAILABLE"i
login_user: LOGIN USER quoted_string ";"
list_services: LIST SERVICES ";"
@@ -321,6 +331,8 @@ list_user_model_providers: LIST MODEL PROVIDERS ";"
list_user_default_models: LIST DEFAULT MODELS ";"
import_docs_into_dataset: IMPORT quoted_string INTO DATASET quoted_string ";"
search_on_datasets: SEARCH quoted_string ON DATASETS quoted_string ";"
get_chunk: GET CHUNK quoted_string ";"
// Option keywords use the named case-insensitive terminals (PAGE, SIZE, KEYWORDS, AVAILABLE)
// rather than string literals, so they are matched like every other keyword and stay in the parse tree.
list_chunks: LIST CHUNKS OF DOCUMENT quoted_string (PAGE NUMBER)? (SIZE NUMBER)? (KEYWORDS quoted_string)? (AVAILABLE NUMBER)? ";"
parse_dataset_docs: PARSE quoted_string OF DATASET quoted_string ";"
parse_dataset_sync: PARSE DATASET quoted_string SYNC ";"
@@ -698,6 +710,28 @@ class RAGFlowCLITransformer(Transformer):
datasets = datasets.split(" ")
return {"type": "search_on_datasets", "datasets": datasets, "question": question}
def get_chunk(self, items):
    """Build the command dict for a ``GET CHUNK <quoted_string>;`` statement.

    Args:
        items: Parsed children of the ``get_chunk`` rule; ``items[2]`` is the
            ``quoted_string`` subtree whose first child is the quoted id.

    Returns:
        ``{"type": "get_chunk", "chunk_id": <id without surrounding quotes>}``.
    """
    quoted_node = items[2]
    raw_text = quoted_node.children[0]
    # Drop the single or double quotes that delimit the literal.
    return {"type": "get_chunk", "chunk_id": raw_text.strip("'\"")}
def list_chunks(self, items):
    """Build the command dict for a ``LIST CHUNKS OF DOCUMENT ...;`` statement.

    Args:
        items: Parsed children of the ``list_chunks`` rule. ``items[4]`` is the
            ``quoted_string`` subtree holding the document id. Each optional
            keyword token (PAGE, SIZE, KEYWORDS, AVAILABLE) is immediately
            followed by its value.

    Returns:
        A dict with ``type`` and ``doc_id``, plus ``page``, ``size``,
        ``keywords`` and ``available_int`` for every option that was given.
    """
    def _unquote(node):
        # A quoted_string subtree keeps the raw literal as its first child.
        return node.children[0].strip("'\"")

    result = {"type": "list_chunks", "doc_id": _unquote(items[4])}

    # Keyword text (upper-cased) -> (result key, value converter).
    option_parsers = {
        "PAGE": ("page", int),
        "SIZE": ("size", int),
        "KEYWORDS": ("keywords", _unquote),
        "AVAILABLE": ("available_int", int),
    }

    # Walk (token, following item) pairs so the lookahead can never run past
    # the end of the list.
    for item, value in zip(items, items[1:]):
        # Only string-like tokens can be keywords; subtrees are skipped.
        if not isinstance(item, str):
            continue
        # The keyword terminals are case-insensitive, so compare upper-cased.
        option = option_parsers.get(item.upper())
        if option is not None:
            key, convert = option
            result[key] = convert(value)
    return result
def benchmark(self, items):
concurrency: int = int(items[1])
iterations: int = int(items[2])

View File

@@ -1434,6 +1434,61 @@ class RAGFlowClient:
print(
f"Fail to search datasets: {dataset_names}, code: {res_json['code']}, message: {res_json['message']}")
def get_chunk(self, command_dict):
    """Fetch a single chunk by id and print its fields as key-value pairs.

    Args:
        command_dict: Parsed command; must contain ``chunk_id``.

    Returns:
        None. The result or a failure message is printed. The command is
        refused with a notice unless ``self.server_type`` is ``"user"``.
    """
    from urllib.parse import quote

    if self.server_type != "user":
        print("This command is only allowed in USER mode")
        return

    # Percent-encode the id so characters such as '&', '#' or spaces cannot
    # truncate or alter the query string.
    chunk_id = quote(str(command_dict["chunk_id"]), safe="")
    response = self.http_client.request("GET", f"/chunk/get?chunk_id={chunk_id}", use_api_base=False,
                                        auth_kind="web")
    try:
        res_json = response.json()
    except ValueError:
        # The body is not JSON; report the HTTP status instead of crashing.
        print(f"Fail to get chunk, HTTP status: {response.status_code}, response is not valid JSON")
        return

    if response.status_code == 200 and res_json.get("code") == 0:
        self._print_key_value(res_json["data"])
    else:
        print(f"Fail to get chunk, code: {res_json.get('code')}, message: {res_json.get('message')}")
def list_chunks(self, command_dict):
    """List the chunks of a document and print each chunk's fields.

    Args:
        command_dict: Parsed command; must contain ``doc_id`` and may contain
            ``page``, ``size``, ``keywords`` and ``available_int``.

    Returns:
        None. The chunks or a failure message are printed. The command is
        refused with a notice unless ``self.server_type`` is ``"user"``.
    """
    if self.server_type != "user":
        print("This command is only allowed in USER mode")
        return

    payload = {"doc_id": command_dict["doc_id"]}
    # Forward only the options that were explicitly provided.
    for option in ("page", "size"):
        if option in command_dict:
            payload[option] = command_dict[option]
    # An empty keywords string is treated as "not provided".
    if command_dict.get("keywords"):
        payload["keywords"] = command_dict["keywords"]
    if "available_int" in command_dict:
        payload["available_int"] = command_dict["available_int"]

    response = self.http_client.request("POST", "/chunk/list", json_body=payload, use_api_base=False,
                                        auth_kind="web")
    try:
        res_json = response.json()
    except ValueError:
        # The body is not JSON; report the HTTP status instead of crashing.
        print(f"Fail to list chunks, HTTP status: {response.status_code}, response is not valid JSON")
        return

    if response.status_code != 200 or res_json.get("code") != 0:
        print(f"Fail to list chunks, code: {res_json.get('code')}, message: {res_json.get('message')}")
        return

    # Tolerate a missing or null "data"/"chunks" field instead of raising.
    chunks = (res_json.get("data") or {}).get("chunks")
    if not chunks:
        print("No chunks found")
        return

    for index, chunk in enumerate(chunks, start=1):
        print(f"\n--- Chunk {index} ---")
        for key, value in chunk.items():
            print(f" {key}: {value}")
def show_version(self, command):
if self.server_type == "admin":
response = self.http_client.request("GET", "/admin/version", use_api_base=True, auth_kind="admin")
@@ -1618,6 +1673,14 @@ class RAGFlowClient:
print(separator)
def _print_key_value(self, data: dict):
"""Print data as key-value pairs (one per line)"""
if not data:
print("No data to print")
return
for key, value in data.items():
print(f"{key}: {value}")
def run_command(client: RAGFlowClient, command_dict: dict):
command_type = command_dict["type"]
@@ -1761,6 +1824,10 @@ def run_command(client: RAGFlowClient, command_dict: dict):
client.import_docs_into_dataset(command_dict)
case "search_on_datasets":
return client.search_on_datasets(command_dict)
case "get_chunk":
return client.get_chunk(command_dict)
case "list_chunks":
return client.list_chunks(command_dict)
case "meta":
_handle_meta_command(command_dict)
case _:
@@ -1818,6 +1885,8 @@ LIST DOCUMENTS OF DATASET <dataset>
SEARCH <query> ON DATASETS <dataset>
LIST METADATA OF DATASETS <dataset>[, <dataset>]*
LIST METADATA SUMMARY OF DATASET <dataset> DOCUMENTS <doc_id>[, <doc_id>]*
GET CHUNK <chunk_id>
LIST CHUNKS OF DOCUMENT <doc_id> [PAGE <page>] [SIZE <size>] [KEYWORDS <keywords>] [AVAILABLE <0|1>]
Meta Commands:
\\?, \\h, \\help Show this help