feat: support reading tags via API (#12891) (#13732)

### What problem does this PR solve?

Enable reading Tag Set tags via the API by exposing the `tag_kwd` field. The result of a list-chunks query is shown below:

<img width="1422" height="818" alt="image" src="https://github.com/user-attachments/assets/abd1960a-fe34-489e-9d72-525f8e574938" />

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Co-authored-by: heyang.why <heyang.why@alibaba-inc.com>
2026-06-29 15:31:05 +08:00 · 2026-03-29 20:17:01 +08:00
parent cb78ce0a7b
commit 641b319647
13 changed files with 162 additions and 6 deletions
--- a/api/apps/chunk_app.py
+++ b/api/apps/chunk_app.py
@@ -155,6 +155,10 @@ async def set():
        d["question_kwd"] = req["question_kwd"]
        d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["question_kwd"]))
    if "tag_kwd" in req:
+        if not isinstance(req["tag_kwd"], list):
+            return get_data_error_result(message="`tag_kwd` should be a list")
+        if not all(isinstance(t, str) for t in req["tag_kwd"]):
+            return get_data_error_result(message="`tag_kwd` must be a list of strings")
        d["tag_kwd"] = req["tag_kwd"]
    if "tag_feas" in req:
        d["tag_feas"] = req["tag_feas"]
@@ -317,6 +321,12 @@ async def create():
    d["question_tks"] = rag_tokenizer.tokenize("\n".join(d["question_kwd"]))
    d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
    d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
+    if "tag_kwd" in req:
+        if not isinstance(req["tag_kwd"], list):
+            return get_data_error_result(message="`tag_kwd` is required to be a list")
+        if not all(isinstance(t, str) for t in req["tag_kwd"]):
+            return get_data_error_result(message="`tag_kwd` must be a list of strings")
+        d["tag_kwd"] = req["tag_kwd"]
    if "tag_feas" in req:
        d["tag_feas"] = req["tag_feas"]
    image_base64 = req.get("image_base64", None)
--- a/api/apps/sdk/doc.py
+++ b/api/apps/sdk/doc.py
@@ -58,6 +58,7 @@ class Chunk(BaseModel):
    document_id: str = ""
    docnm_kwd: str = ""
    important_keywords: list = Field(default_factory=list)
+    tag_kwd: list = Field(default_factory=list)
    questions: list = Field(default_factory=list)
    question_tks: str = ""
    image_id: str = ""
@@ -1048,6 +1049,11 @@ async def list_chunks(tenant_id, dataset_id, document_id):
                    items:
                      type: string
                    description: Important keywords.
+                  tag_kwd:
+                    type: array
+                    items:
+                      type: string
+                    description: Tag keywords.
                  image_id:
                    type: string
                    description: Image ID associated with the chunk.
@@ -1137,6 +1143,7 @@ async def list_chunks(tenant_id, dataset_id, document_id):
                "document_id": sres.field[id]["doc_id"],
                "docnm_kwd": sres.field[id]["docnm_kwd"],
                "important_keywords": sres.field[id].get("important_kwd", []),
+                "tag_kwd": sres.field[id].get("tag_kwd", []),
                "questions": sres.field[id].get("question_kwd", []),
                "dataset_id": sres.field[id].get("kb_id", sres.field[id].get("dataset_id")),
                "image_id": sres.field[id].get("img_id", ""),
@@ -1251,6 +1258,10 @@ async def add_chunk(tenant_id, dataset_id, document_id):
    d["docnm_kwd"] = doc.name
    d["doc_id"] = document_id
    if "tag_kwd" in req:
+        if not isinstance(req["tag_kwd"], list):
+            return get_error_data_result("`tag_kwd` is required to be a list")
+        if not all(isinstance(t, str) for t in req["tag_kwd"]):
+            return get_error_data_result("`tag_kwd` must be a list of strings")
        d["tag_kwd"] = req["tag_kwd"]
    if "tag_feas" in req:
        d["tag_feas"] = req["tag_feas"]
@@ -1283,6 +1294,7 @@ async def add_chunk(tenant_id, dataset_id, document_id):
        "content_with_weight": "content",
        "doc_id": "document_id",
        "important_kwd": "important_keywords",
+        "tag_kwd": "tag_kwd",
        "question_kwd": "questions",
        "kb_id": "dataset_id",
        "create_timestamp_flt": "create_timestamp",
@@ -1432,6 +1444,11 @@ async def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
              items:
                type: string
              description: Updated important keywords.
+            tag_kwd:
+              type: array
+              items:
+                type: string
+              description: Updated tag keywords.
            available:
              type: boolean
              description: Availability status of the chunk.
@@ -1480,6 +1497,10 @@ async def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
            return get_error_data_result("`positions` should be a list")
        d["position_int"] = req["positions"]
    if "tag_kwd" in req:
+        if not isinstance(req["tag_kwd"], list):
+            return get_error_data_result("`tag_kwd` should be a list")
+        if not all(isinstance(t, str) for t in req["tag_kwd"]):
+            return get_error_data_result("`tag_kwd` must be a list of strings")
        d["tag_kwd"] = req["tag_kwd"]
    if "tag_feas" in req:
        d["tag_feas"] = req["tag_feas"]
--- a/docs/references/http_api_reference.md
+++ b/docs/references/http_api_reference.md
@@ -2005,6 +2005,7 @@ Adds a chunk to a specified document in a specified dataset.
 - Body:
  - `"content"`: `string`
  - `"important_keywords"`: `list[string]`
+  - `"tag_kwd"`: `list[string]`
  - `"image_base64"`: `string`

 ##### Request example
@@ -2031,6 +2032,8 @@ curl --request POST \
  The text content of the chunk.
 - `"important_keywords`(*Body parameter*), `list[string]`
  The key terms or phrases to tag with the chunk.
+- `"tag_kwd"`: (*Body parameter*), `list[string]`
+  Tag keywords to associate with the chunk.
 - `"questions"`(*Body parameter*), `list[string]`
  If there is a given question, the embedded chunks will be based on them
 - `"image_base64"`: (*Body parameter*), `string`
@@ -2053,6 +2056,7 @@ Success:
            "id": "12ccdc56e59837e5",
            "image_id": "",
            "important_keywords": [],
+            "tag_kwd": [],
            "questions": []
        }
    }
@@ -2123,6 +2127,7 @@ Success:
                "id": "b48c170e90f70af998485c1065490726",
                "image_id": "",
                "important_keywords": "",
+                "tag_kwd": [],
                "positions": [
                    ""
                ]
@@ -2267,6 +2272,7 @@ Updates content or configurations for a specified chunk.
 - Body:
  - `"content"`: `string`
  - `"important_keywords"`: `list[string]`
+  - `"tag_kwd"`: `list[string]`
  - `"available"`: `boolean`

 ##### Request example
@@ -2295,6 +2301,8 @@ curl --request PUT \
  The text content of the chunk.
 - `"important_keywords"`: (*Body parameter*), `list[string]`  
  A list of key terms or phrases to tag with the chunk.
+- `"tag_kwd"`: (*Body parameter*), `list[string]`  
+  Updated tag keywords.
 - `"available"`: (*Body parameter*) `boolean`  
  The chunk's availability status in the dataset. Value options:  
  - `true`: Available (default)
@@ -2696,6 +2704,7 @@ Success:
                "important_keywords": [
                    ""
                ],
+                "tag_kwd": [],
                "kb_id": "c7ee74067a2c11efb21c0242ac120006",
                "positions": [
                    ""
--- a/docs/references/python_api_reference.md
+++ b/docs/references/python_api_reference.md
@@ -855,7 +855,7 @@ print("Async bulk parsing cancelled.")
 ### Add chunk

 ```python
-Document.add_chunk(content:str, important_keywords:list[str] = [], image_base64:str = None) -> Chunk
+Document.add_chunk(content:str, important_keywords:list[str] = [], questions:list[str] = [], image_base64:str = None, *, tag_kwd:list[str] = []) -> Chunk
 ```

 Adds a chunk to the current document.
@@ -874,6 +874,10 @@ The key terms or phrases to tag with the chunk.

 A base64-encoded image to associate with the chunk. If the chunk already has an image, the new image will be vertically concatenated below the existing one.

+##### tag_kwd: `list[str]`
+
+Tag keywords to associate with the chunk.
+
 #### Returns

 - Success: A `Chunk` object.
@@ -884,6 +888,7 @@ A `Chunk` object contains the following attributes:
 - `id`: `str`: The chunk ID.
 - `content`: `str` The text content of the chunk.
 - `important_keywords`: `list[str]` A list of key terms or phrases tagged with the chunk.
+- `tag_kwd`: `list[str]` A list of tag keywords associated with the chunk.
 - `image_id`: `str` The image ID associated with the chunk (empty string if no image).
 - `create_time`: `str` The time when the chunk was created (added to the document).
 - `create_timestamp`: `float` The timestamp representing the creation time of the chunk, expressed in seconds since January 1, 1970.
@@ -1024,6 +1029,7 @@ A dictionary representing the attributes to update, with the following keys:

 - `"content"`: `str` The text content of the chunk.
 - `"important_keywords"`: `list[str]` A list of key terms or phrases to tag with the chunk.
+- `"tag_kwd"`: `list[str]` A list of tag keywords to associate with the chunk.
 - `"available"`: `bool` The chunk's availability status in the dataset. Value options:
  - `False`: Unavailable
  - `True`: Available (default)
--- a/internal/service/chunk.go
+++ b/internal/service/chunk.go
@@ -532,6 +532,9 @@ func buildRetrievalTestResults(filteredChunks []map[string]interface{}) []map[st
 		} else if v, ok := chunk["important_keywords"]; ok {
 			result["important_kwd"] = v
 		}
+		if v, ok := chunk["tag_kwd"]; ok {
+			result["tag_kwd"] = v
+		}
 		if v, ok := chunk["similarity"]; ok {
 			result["similarity"] = v
 		}
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -477,6 +477,7 @@ class Dealer:
                "docnm_kwd": dnm,
                "kb_id": chunk["kb_id"],
                "important_kwd": chunk.get("important_kwd", []),
+                "tag_kwd": chunk.get("tag_kwd", []),
                "image_id": chunk.get("img_id", ""),
                "similarity": float(sim_np[i]),
                "vector_similarity": float(vsim[i]),
--- a/sdk/python/ragflow_sdk/modules/chunk.py
+++ b/sdk/python/ragflow_sdk/modules/chunk.py
@@ -28,6 +28,7 @@ class Chunk(Base):
        self.id = ""
        self.content = ""
        self.important_keywords = []
+        self.tag_kwd = []
        self.questions = []
        self.create_time = ""
        self.create_timestamp = 0.0
--- a/sdk/python/ragflow_sdk/modules/document.py
+++ b/sdk/python/ragflow_sdk/modules/document.py
@@ -87,8 +87,8 @@ class Document(Base):
            return chunks
        raise Exception(res.get("message"))

-    def add_chunk(self, content: str, important_keywords: list[str] = [], questions: list[str] = [], image_base64: str | None = None):
-        body = {"content": content, "important_keywords": important_keywords, "questions": questions}
+    def add_chunk(self, content: str, important_keywords: list[str] = [], questions: list[str] = [], image_base64: str | None = None, *, tag_kwd: list[str] = []):
+        body = {"content": content, "important_keywords": important_keywords, "tag_kwd": tag_kwd, "questions": questions}
        if image_base64 is not None:
            body["image_base64"] = image_base64
        res = self.post(f"/datasets/{self.dataset_id}/documents/{self.id}/chunks", body)
--- a/test/testcases/conftest.py
+++ b/test/testcases/conftest.py
@@ -165,7 +165,7 @@ def token(auth):
    response = requests.post(url=url, headers=auth)
    res = response.json()
    if res.get("code") != 0:
-        error_msg = f"access: {url}, POST method, error code: {res.get("code")}, message: {res.get('message')}"
+        error_msg = f"access: {url}, POST method, error code: {res.get('code')}, message: {res.get('message')}"
        raise Exception(error_msg)
    return res["data"].get("token")

--- a/test/testcases/test_http_api/test_chunk_management_within_dataset/test_add_chunk.py
+++ b/test/testcases/test_http_api/test_chunk_management_within_dataset/test_add_chunk.py
@@ -30,6 +30,8 @@ def validate_chunk_details(dataset_id, document_id, payload, res):
        assert chunk["important_keywords"] == payload["important_keywords"]
    if "questions" in payload:
        assert chunk["questions"] == [str(q).strip() for q in payload.get("questions", []) if str(q).strip()]
+    if "tag_kwd" in payload:
+        assert chunk["tag_kwd"] == payload["tag_kwd"]


@pytest.mark.p1
@@ -76,7 +78,7 @@ class TestAddChunk:
            assert False, res
        chunks_count = res["data"]["doc"]["chunk_count"]
        res = add_chunk(HttpApiAuth, dataset_id, document_id, payload)
-        assert res["code"] == expected_code
+        assert res["code"] == expected_code, res
        if expected_code == 0:
            validate_chunk_details(dataset_id, document_id, payload, res)
            res = list_chunks(HttpApiAuth, dataset_id, document_id)
@@ -109,7 +111,9 @@ class TestAddChunk:
            assert False, res
        chunks_count = res["data"]["doc"]["chunk_count"]
        res = add_chunk(HttpApiAuth, dataset_id, document_id, payload)
-        assert res["code"] == expected_code
+        assert res["code"] == expected_code, (
+            f"Expected code: {expected_code}, got: {res['code']}, message: {res.get('message')}"
+        )
        if expected_code == 0:
            validate_chunk_details(dataset_id, document_id, payload, res)
            res = list_chunks(HttpApiAuth, dataset_id, document_id)
@@ -138,6 +142,35 @@ class TestAddChunk:
            assert False, res
        chunks_count = res["data"]["doc"]["chunk_count"]
        res = add_chunk(HttpApiAuth, dataset_id, document_id, payload)
+        assert res["code"] == expected_code, res
+        if expected_code == 0:
+            validate_chunk_details(dataset_id, document_id, payload, res)
+            res = list_chunks(HttpApiAuth, dataset_id, document_id)
+            assert res["data"]["doc"]["chunk_count"] == chunks_count + 1
+        else:
+            assert res["message"] == expected_message
+
+    @pytest.mark.p2
+    @pytest.mark.parametrize(
+        "payload, expected_code, expected_message",
+        [
+            ({"content": "chunk test", "tag_kwd": ["tag1", "tag2"]}, 0, ""),
+            ({"content": "chunk test", "tag_kwd": [""]}, 0, ""),
+            ({"content": "chunk test", "tag_kwd": [1]}, 102, "`tag_kwd` must be a list of strings"),
+            ({"content": "chunk test", "tag_kwd": ["tag", "tag"]}, 0, ""),
+            ({"content": "chunk test", "tag_kwd": "abc"}, 102, "`tag_kwd` is required to be a list"),
+            ({"content": "chunk test", "tag_kwd": 123}, 102, "`tag_kwd` is required to be a list"),
+        ],
+    )
+    def test_tag_kwd(self, HttpApiAuth, add_document, payload, expected_code, expected_message):
+        dataset_id, document_id = add_document
+        res = list_chunks(HttpApiAuth, dataset_id, document_id)
+        if res["code"] != 0:
+            assert False, res
+        chunks_count = res["data"]["doc"]["chunk_count"]
+        res = add_chunk(HttpApiAuth, dataset_id, document_id, payload)
+        if res["code"] != expected_code:
+            print(f"\nFAILED! Expected code: {expected_code}, got: {res['code']}, message: {res.get('message')}")
        assert res["code"] == expected_code
        if expected_code == 0:
            validate_chunk_details(dataset_id, document_id, payload, res)
--- a/test/testcases/test_http_api/test_chunk_management_within_dataset/test_update_chunk.py
+++ b/test/testcases/test_http_api/test_chunk_management_within_dataset/test_update_chunk.py
@@ -115,6 +115,25 @@ class TestUpdatedChunk:
        if expected_code != 0:
            assert res["message"] == expected_message

+    @pytest.mark.p2
+    @pytest.mark.parametrize(
+        "payload, expected_code, expected_message",
+        [
+            ({"tag_kwd": ["tag1", "tag2"]}, 0, ""),
+            ({"tag_kwd": [""]}, 0, ""),
+            ({"tag_kwd": [1]}, 102, "`tag_kwd` must be a list of strings"),
+            ({"tag_kwd": ["tag", "tag"]}, 0, ""),
+            ({"tag_kwd": "tag"}, 102, "`tag_kwd` should be a list"),
+            ({"tag_kwd": 123}, 102, "`tag_kwd` should be a list"),
+        ],
+    )
+    def test_tag_kwd(self, HttpApiAuth, add_chunks, payload, expected_code, expected_message):
+        dataset_id, document_id, chunk_ids = add_chunks
+        res = update_chunk(HttpApiAuth, dataset_id, document_id, chunk_ids[0], payload)
+        assert res["code"] == expected_code
+        if expected_code != 0:
+            assert res["message"] == expected_message
+
    @pytest.mark.p2
    @pytest.mark.parametrize(
        "payload, expected_code, expected_message",
--- a/test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_add_chunk.py
+++ b/test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_add_chunk.py
@@ -28,6 +28,8 @@ def validate_chunk_details(dataset_id: str, document_id: str, payload: dict, chu
        assert chunk.important_keywords == payload["important_keywords"]
    if "questions" in payload:
        assert chunk.questions == [str(q).strip() for q in payload.get("questions", []) if str(q).strip()]
+    if "tag_kwd" in payload:
+        assert chunk.tag_kwd == payload["tag_kwd"]


 class TestAddChunk:
@@ -115,6 +117,34 @@ class TestAddChunk:
            chunks = document.list_chunks()
            assert len(chunks) == chunks_count + 1, str(chunks)

+    @pytest.mark.p2
+    @pytest.mark.parametrize(
+        "payload, expected_message",
+        [
+            ({"content": "chunk test test_tag_kwd 1", "tag_kwd": ["tag1", "tag2"]}, ""),
+            ({"content": "chunk test test_tag_kwd 2", "tag_kwd": [""]}, ""),
+            ({"content": "chunk test test_tag_kwd 3", "tag_kwd": [1]}, "not instance of"),
+            ({"content": "chunk test test_tag_kwd 4", "tag_kwd": ["tag", "tag"]}, ""),
+            ({"content": "chunk test test_tag_kwd 5", "tag_kwd": "abc"}, "not instance of"),
+            ({"content": "chunk test test_tag_kwd 6", "tag_kwd": 123}, "not instance of"),
+        ],
+    )
+    def test_tag_kwd(self, add_document, payload, expected_message):
+        dataset, document = add_document
+        chunks_count = len(document.list_chunks())
+
+        if expected_message:
+            with pytest.raises(Exception) as exception_info:
+                document.add_chunk(**payload)
+            assert expected_message in str(exception_info.value), str(exception_info.value)
+        else:
+            chunk = document.add_chunk(**payload)
+            validate_chunk_details(dataset.id, document.id, payload, chunk)
+
+            sleep(1)
+            chunks = document.list_chunks()
+            assert len(chunks) == chunks_count + 1, str(chunks)
+
    @pytest.mark.p3
    def test_repeated_add_chunk(self, add_document):
        payload = {"content": "chunk test repeated_add_chunk"}
--- a/test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_update_chunk.py
+++ b/test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_update_chunk.py
@@ -102,6 +102,29 @@ class TestUpdatedChunk:
        else:
            chunk.update(payload)

+    @pytest.mark.p2
+    @pytest.mark.parametrize(
+        "payload, expected_message",
+        [
+            ({"tag_kwd": ["tag1", "tag2"]}, ""),
+            ({"tag_kwd": [""]}, ""),
+            ({"tag_kwd": [1]}, "`tag_kwd` must be a list of strings"),
+            ({"tag_kwd": ["tag", "tag"]}, ""),
+            ({"tag_kwd": "tag"}, "`tag_kwd` should be a list"),
+            ({"tag_kwd": 123}, "`tag_kwd` should be a list"),
+        ],
+    )
+    def test_tag_kwd(self, add_chunks, payload, expected_message):
+        _, _, chunks = add_chunks
+        chunk = chunks[0]
+
+        if expected_message:
+            with pytest.raises(Exception) as exception_info:
+                chunk.update(payload)
+            assert expected_message in str(exception_info.value), str(exception_info.value)
+        else:
+            chunk.update(payload)
+
    @pytest.mark.p2
    @pytest.mark.parametrize(
        "payload, expected_message",