Implement GetChunk() in Infinity in Go (#13758)

### What problem does this PR solve?

Implement GetChunk() in Infinity in Go.

Add CLI commands:

- GET CHUNK 'XXX';
- LIST CHUNKS OF DOCUMENT 'XXX';

### Type of change

- [x] Refactoring
2026-06-29 15:31:05 +08:00 · 2026-03-24 20:10:21 +08:00
parent b308cd3a02
commit 7c8927c4fb
11 changed files with 989 additions and 75 deletions
--- a/admin/client/parser.py
+++ b/admin/client/parser.py
@@ -91,6 +91,8 @@ sql_command: login_user
           | parse_dataset_async
           | import_docs_into_dataset
           | search_on_datasets
+           | get_chunk
+           | list_chunks
           | create_chat_session
           | drop_chat_session
           | list_chat_sessions
@@ -164,6 +166,7 @@ DEFAULT: "DEFAULT"i
 CHATS: "CHATS"i
 CHAT: "CHAT"i
 FILES: "FILES"i
+DOCUMENT: "DOCUMENT"i
 DOCUMENTS: "DOCUMENTS"i
 METADATA: "METADATA"i
 SUMMARY: "SUMMARY"i
@@ -194,6 +197,13 @@ FINGERPRINT: "FINGERPRINT"i
 LICENSE: "LICENSE"i
 CHECK: "CHECK"i
 CONFIG: "CONFIG"i
+CHUNK: "CHUNK"i
+CHUNKS: "CHUNKS"i
+GET: "GET"i
+PAGE: "PAGE"i
+SIZE: "SIZE"i
+KEYWORDS: "KEYWORDS"i
+AVAILABLE: "AVAILABLE"i

 login_user: LOGIN USER quoted_string ";"
 list_services: LIST SERVICES ";"
@@ -321,6 +331,8 @@ list_user_model_providers: LIST MODEL PROVIDERS ";"
 list_user_default_models: LIST DEFAULT MODELS ";"
 import_docs_into_dataset: IMPORT quoted_string INTO DATASET quoted_string ";"
 search_on_datasets: SEARCH quoted_string ON DATASETS quoted_string ";"
+get_chunk: GET CHUNK quoted_string ";"
+list_chunks: LIST CHUNKS OF DOCUMENT quoted_string ("PAGE" NUMBER)? ("SIZE" NUMBER)? ("KEYWORDS" quoted_string)? ("AVAILABLE" NUMBER)? ";"

 parse_dataset_docs: PARSE quoted_string OF DATASET quoted_string ";"
 parse_dataset_sync: PARSE DATASET quoted_string SYNC ";"
@@ -698,6 +710,28 @@ class RAGFlowCLITransformer(Transformer):
            datasets = datasets.split(" ")
        return {"type": "search_on_datasets", "datasets": datasets, "question": question}

+    def get_chunk(self, items):
+        chunk_id = items[2].children[0].strip("'\"")
+        return {"type": "get_chunk", "chunk_id": chunk_id}
+
+    def list_chunks(self, items):
+        doc_id = items[4].children[0].strip("'\"")
+        result = {"type": "list_chunks", "doc_id": doc_id}
+
+        # Parse optional parameters: PAGE, SIZE, KEYWORDS, AVAILABLE
+        # items structure varies based on which params are present
+        for i, item in enumerate(items):
+            if str(item) == "PAGE":
+                result["page"] = int(items[i + 1])
+            elif str(item) == "SIZE":
+                result["size"] = int(items[i + 1])
+            elif str(item) == "KEYWORDS":
+                result["keywords"] = items[i + 1].children[0].strip("'\"")
+            elif str(item) == "AVAILABLE":
+                result["available_int"] = int(items[i + 1])
+
+        return result
+
    def benchmark(self, items):
        concurrency: int = int(items[1])
        iterations: int = int(items[2])
--- a/admin/client/ragflow_client.py
+++ b/admin/client/ragflow_client.py
@@ -1434,6 +1434,61 @@ class RAGFlowClient:
                print(
                    f"Fail to search datasets: {dataset_names}, code: {res_json['code']}, message: {res_json['message']}")

+    def get_chunk(self, command_dict):
+        if self.server_type != "user":
+            print("This command is only allowed in USER mode")
+            return
+
+        chunk_id = command_dict["chunk_id"]
+        response = self.http_client.request("GET", f"/chunk/get?chunk_id={chunk_id}", use_api_base=False,
+                                            auth_kind="web")
+        res_json = response.json()
+        if response.status_code == 200:
+            if res_json["code"] == 0:
+                self._print_key_value(res_json["data"])
+            else:
+                print(f"Fail to get chunk, code: {res_json['code']}, message: {res_json['message']}")
+        else:
+            print(f"Fail to get chunk, code: {res_json['code']}, message: {res_json['message']}")
+
+    def list_chunks(self, command_dict):
+        if self.server_type != "user":
+            print("This command is only allowed in USER mode")
+            return
+
+        doc_id = command_dict["doc_id"]
+        payload = {
+            "doc_id": doc_id,
+        }
+
+        # Add optional parameters (only if explicitly provided)
+        if "page" in command_dict:
+            payload["page"] = command_dict["page"]
+        if "size" in command_dict:
+            payload["size"] = command_dict["size"]
+        if "keywords" in command_dict and command_dict["keywords"]:
+            payload["keywords"] = command_dict["keywords"]
+        if "available_int" in command_dict:
+            payload["available_int"] = command_dict["available_int"]
+
+        response = self.http_client.request("POST", "/chunk/list", json_body=payload, use_api_base=False,
+                                            auth_kind="web")
+        res_json = response.json()
+        if response.status_code == 200:
+            if res_json["code"] == 0:
+                chunks = res_json["data"]["chunks"]
+                if chunks:
+                    for i, chunk in enumerate(chunks):
+                        print(f"\n--- Chunk {i+1} ---")
+                        for key, value in chunk.items():
+                            print(f"  {key}: {value}")
+                else:
+                    print("No chunks found")
+            else:
+                print(f"Fail to list chunks, code: {res_json['code']}, message: {res_json['message']}")
+        else:
+            print(f"Fail to list chunks, code: {res_json['code']}, message: {res_json['message']}")
+
    def show_version(self, command):
        if self.server_type == "admin":
            response = self.http_client.request("GET", "/admin/version", use_api_base=True, auth_kind="admin")
@@ -1618,6 +1673,14 @@ class RAGFlowClient:

        print(separator)

+    def _print_key_value(self, data: dict):
+        """Print data as key-value pairs (one per line)"""
+        if not data:
+            print("No data to print")
+            return
+        for key, value in data.items():
+            print(f"{key}: {value}")
+

 def run_command(client: RAGFlowClient, command_dict: dict):
    command_type = command_dict["type"]
@@ -1761,6 +1824,10 @@ def run_command(client: RAGFlowClient, command_dict: dict):
            client.import_docs_into_dataset(command_dict)
        case "search_on_datasets":
            return client.search_on_datasets(command_dict)
+        case "get_chunk":
+            return client.get_chunk(command_dict)
+        case "list_chunks":
+            return client.list_chunks(command_dict)
        case "meta":
            _handle_meta_command(command_dict)
        case _:
@@ -1818,6 +1885,8 @@ LIST DOCUMENTS OF DATASET <dataset>
 SEARCH <query> ON DATASETS <dataset>
 LIST METADATA OF DATASETS <dataset>[, <dataset>]*
 LIST METADATA SUMMARY OF DATASET <dataset> DOCUMENTS <doc_id>[, <doc_id>]*
+GET CHUNK <chunk_id>
+LIST CHUNKS OF DOCUMENT <doc_id> [PAGE <page>] [SIZE <size>] [KEYWORDS <keywords>] [AVAILABLE <0|1>]

 Meta Commands:
 \\?, \\h, \\help     Show this help