feat: support reading tags via API (#12891) (#13732)

### What problem does this PR solve?

Enable reading Tag Set tags via the API by exposing the `tag_kwd` field. The
result of a list-chunks query is shown below:

<img width="1422" height="818" alt="image"
src="https://github.com/user-attachments/assets/abd1960a-fe34-489e-9d72-525f8e574938"
/>


### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Co-authored-by: heyang.why <heyang.why@alibaba-inc.com>
This commit is contained in:
Heyang Wang
2026-03-29 20:17:01 +08:00
committed by GitHub
parent cb78ce0a7b
commit 641b319647
13 changed files with 162 additions and 6 deletions

View File

@@ -155,6 +155,10 @@ async def set():
d["question_kwd"] = req["question_kwd"]
d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["question_kwd"]))
if "tag_kwd" in req:
if not isinstance(req["tag_kwd"], list):
return get_data_error_result(message="`tag_kwd` should be a list")
if not all(isinstance(t, str) for t in req["tag_kwd"]):
return get_data_error_result(message="`tag_kwd` must be a list of strings")
d["tag_kwd"] = req["tag_kwd"]
if "tag_feas" in req:
d["tag_feas"] = req["tag_feas"]
@@ -317,6 +321,12 @@ async def create():
d["question_tks"] = rag_tokenizer.tokenize("\n".join(d["question_kwd"]))
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
if "tag_kwd" in req:
if not isinstance(req["tag_kwd"], list):
return get_data_error_result(message="`tag_kwd` is required to be a list")
if not all(isinstance(t, str) for t in req["tag_kwd"]):
return get_data_error_result(message="`tag_kwd` must be a list of strings")
d["tag_kwd"] = req["tag_kwd"]
if "tag_feas" in req:
d["tag_feas"] = req["tag_feas"]
image_base64 = req.get("image_base64", None)

View File

@@ -58,6 +58,7 @@ class Chunk(BaseModel):
document_id: str = ""
docnm_kwd: str = ""
important_keywords: list = Field(default_factory=list)
tag_kwd: list = Field(default_factory=list)
questions: list = Field(default_factory=list)
question_tks: str = ""
image_id: str = ""
@@ -1048,6 +1049,11 @@ async def list_chunks(tenant_id, dataset_id, document_id):
items:
type: string
description: Important keywords.
tag_kwd:
type: array
items:
type: string
description: Tag keywords.
image_id:
type: string
description: Image ID associated with the chunk.
@@ -1137,6 +1143,7 @@ async def list_chunks(tenant_id, dataset_id, document_id):
"document_id": sres.field[id]["doc_id"],
"docnm_kwd": sres.field[id]["docnm_kwd"],
"important_keywords": sres.field[id].get("important_kwd", []),
"tag_kwd": sres.field[id].get("tag_kwd", []),
"questions": sres.field[id].get("question_kwd", []),
"dataset_id": sres.field[id].get("kb_id", sres.field[id].get("dataset_id")),
"image_id": sres.field[id].get("img_id", ""),
@@ -1251,6 +1258,10 @@ async def add_chunk(tenant_id, dataset_id, document_id):
d["docnm_kwd"] = doc.name
d["doc_id"] = document_id
if "tag_kwd" in req:
if not isinstance(req["tag_kwd"], list):
return get_error_data_result("`tag_kwd` is required to be a list")
if not all(isinstance(t, str) for t in req["tag_kwd"]):
return get_error_data_result("`tag_kwd` must be a list of strings")
d["tag_kwd"] = req["tag_kwd"]
if "tag_feas" in req:
d["tag_feas"] = req["tag_feas"]
@@ -1283,6 +1294,7 @@ async def add_chunk(tenant_id, dataset_id, document_id):
"content_with_weight": "content",
"doc_id": "document_id",
"important_kwd": "important_keywords",
"tag_kwd": "tag_kwd",
"question_kwd": "questions",
"kb_id": "dataset_id",
"create_timestamp_flt": "create_timestamp",
@@ -1432,6 +1444,11 @@ async def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
items:
type: string
description: Updated important keywords.
tag_kwd:
type: array
items:
type: string
description: Updated tag keywords.
available:
type: boolean
description: Availability status of the chunk.
@@ -1480,6 +1497,10 @@ async def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
return get_error_data_result("`positions` should be a list")
d["position_int"] = req["positions"]
if "tag_kwd" in req:
if not isinstance(req["tag_kwd"], list):
return get_error_data_result("`tag_kwd` should be a list")
if not all(isinstance(t, str) for t in req["tag_kwd"]):
return get_error_data_result("`tag_kwd` must be a list of strings")
d["tag_kwd"] = req["tag_kwd"]
if "tag_feas" in req:
d["tag_feas"] = req["tag_feas"]

View File

@@ -2005,6 +2005,7 @@ Adds a chunk to a specified document in a specified dataset.
- Body:
- `"content"`: `string`
- `"important_keywords"`: `list[string]`
- `"tag_kwd"`: `list[string]`
- `"image_base64"`: `string`
##### Request example
@@ -2031,6 +2032,8 @@ curl --request POST \
The text content of the chunk.
- `"important_keywords"`: (*Body parameter*), `list[string]`
The key terms or phrases to tag with the chunk.
- `"tag_kwd"`: (*Body parameter*), `list[string]`
Tag keywords to associate with the chunk.
- `"questions"`: (*Body parameter*), `list[string]`
If questions are provided, the chunk's embedding will be based on them.
- `"image_base64"`: (*Body parameter*), `string`
@@ -2053,6 +2056,7 @@ Success:
"id": "12ccdc56e59837e5",
"image_id": "",
"important_keywords": [],
"tag_kwd": [],
"questions": []
}
}
@@ -2123,6 +2127,7 @@ Success:
"id": "b48c170e90f70af998485c1065490726",
"image_id": "",
"important_keywords": "",
"tag_kwd": [],
"positions": [
""
]
@@ -2267,6 +2272,7 @@ Updates content or configurations for a specified chunk.
- Body:
- `"content"`: `string`
- `"important_keywords"`: `list[string]`
- `"tag_kwd"`: `list[string]`
- `"available"`: `boolean`
##### Request example
@@ -2295,6 +2301,8 @@ curl --request PUT \
The text content of the chunk.
- `"important_keywords"`: (*Body parameter*), `list[string]`
A list of key terms or phrases to tag with the chunk.
- `"tag_kwd"`: (*Body parameter*), `list[string]`
Updated tag keywords.
- `"available"`: (*Body parameter*) `boolean`
The chunk's availability status in the dataset. Value options:
- `true`: Available (default)
@@ -2696,6 +2704,7 @@ Success:
"important_keywords": [
""
],
"tag_kwd": [],
"kb_id": "c7ee74067a2c11efb21c0242ac120006",
"positions": [
""

View File

@@ -855,7 +855,7 @@ print("Async bulk parsing cancelled.")
### Add chunk
```python
Document.add_chunk(content:str, important_keywords:list[str] = [], image_base64:str = None) -> Chunk
Document.add_chunk(content:str, important_keywords:list[str] = [], questions:list[str] = [], image_base64:str | None = None, *, tag_kwd:list[str] = []) -> Chunk
```
Adds a chunk to the current document.
@@ -874,6 +874,10 @@ The key terms or phrases to tag with the chunk.
A base64-encoded image to associate with the chunk. If the chunk already has an image, the new image will be vertically concatenated below the existing one.
##### tag_kwd: `list[str]`
Tag keywords to associate with the chunk.
#### Returns
- Success: A `Chunk` object.
@@ -884,6 +888,7 @@ A `Chunk` object contains the following attributes:
- `id`: `str`: The chunk ID.
- `content`: `str` The text content of the chunk.
- `important_keywords`: `list[str]` A list of key terms or phrases tagged with the chunk.
- `tag_kwd`: `list[str]` A list of tag keywords associated with the chunk.
- `image_id`: `str` The image ID associated with the chunk (empty string if no image).
- `create_time`: `str` The time when the chunk was created (added to the document).
- `create_timestamp`: `float` The timestamp representing the creation time of the chunk, expressed in seconds since January 1, 1970.
@@ -1024,6 +1029,7 @@ A dictionary representing the attributes to update, with the following keys:
- `"content"`: `str` The text content of the chunk.
- `"important_keywords"`: `list[str]` A list of key terms or phrases to tag with the chunk.
- `"tag_kwd"`: `list[str]` A list of tag keywords to associate with the chunk.
- `"available"`: `bool` The chunk's availability status in the dataset. Value options:
- `False`: Unavailable
- `True`: Available (default)

View File

@@ -532,6 +532,9 @@ func buildRetrievalTestResults(filteredChunks []map[string]interface{}) []map[st
} else if v, ok := chunk["important_keywords"]; ok {
result["important_kwd"] = v
}
if v, ok := chunk["tag_kwd"]; ok {
result["tag_kwd"] = v
}
if v, ok := chunk["similarity"]; ok {
result["similarity"] = v
}

View File

@@ -477,6 +477,7 @@ class Dealer:
"docnm_kwd": dnm,
"kb_id": chunk["kb_id"],
"important_kwd": chunk.get("important_kwd", []),
"tag_kwd": chunk.get("tag_kwd", []),
"image_id": chunk.get("img_id", ""),
"similarity": float(sim_np[i]),
"vector_similarity": float(vsim[i]),

View File

@@ -28,6 +28,7 @@ class Chunk(Base):
self.id = ""
self.content = ""
self.important_keywords = []
self.tag_kwd = []
self.questions = []
self.create_time = ""
self.create_timestamp = 0.0

View File

@@ -87,8 +87,8 @@ class Document(Base):
return chunks
raise Exception(res.get("message"))
def add_chunk(self, content: str, important_keywords: list[str] = [], questions: list[str] = [], image_base64: str | None = None):
body = {"content": content, "important_keywords": important_keywords, "questions": questions}
def add_chunk(self, content: str, important_keywords: list[str] = [], questions: list[str] = [], image_base64: str | None = None, *, tag_kwd: list[str] = []):
body = {"content": content, "important_keywords": important_keywords, "tag_kwd": tag_kwd, "questions": questions}
if image_base64 is not None:
body["image_base64"] = image_base64
res = self.post(f"/datasets/{self.dataset_id}/documents/{self.id}/chunks", body)

View File

@@ -165,7 +165,7 @@ def token(auth):
response = requests.post(url=url, headers=auth)
res = response.json()
if res.get("code") != 0:
error_msg = f"access: {url}, POST method, error code: {res.get("code")}, message: {res.get('message')}"
error_msg = f"access: {url}, POST method, error code: {res.get('code')}, message: {res.get('message')}"
raise Exception(error_msg)
return res["data"].get("token")

View File

@@ -30,6 +30,8 @@ def validate_chunk_details(dataset_id, document_id, payload, res):
assert chunk["important_keywords"] == payload["important_keywords"]
if "questions" in payload:
assert chunk["questions"] == [str(q).strip() for q in payload.get("questions", []) if str(q).strip()]
if "tag_kwd" in payload:
assert chunk["tag_kwd"] == payload["tag_kwd"]
@pytest.mark.p1
@@ -76,7 +78,7 @@ class TestAddChunk:
assert False, res
chunks_count = res["data"]["doc"]["chunk_count"]
res = add_chunk(HttpApiAuth, dataset_id, document_id, payload)
assert res["code"] == expected_code
assert res["code"] == expected_code, res
if expected_code == 0:
validate_chunk_details(dataset_id, document_id, payload, res)
res = list_chunks(HttpApiAuth, dataset_id, document_id)
@@ -109,7 +111,9 @@ class TestAddChunk:
assert False, res
chunks_count = res["data"]["doc"]["chunk_count"]
res = add_chunk(HttpApiAuth, dataset_id, document_id, payload)
assert res["code"] == expected_code
assert res["code"] == expected_code, (
f"Expected code: {expected_code}, got: {res['code']}, message: {res.get('message')}"
)
if expected_code == 0:
validate_chunk_details(dataset_id, document_id, payload, res)
res = list_chunks(HttpApiAuth, dataset_id, document_id)
@@ -138,6 +142,35 @@ class TestAddChunk:
assert False, res
chunks_count = res["data"]["doc"]["chunk_count"]
res = add_chunk(HttpApiAuth, dataset_id, document_id, payload)
assert res["code"] == expected_code, res
if expected_code == 0:
validate_chunk_details(dataset_id, document_id, payload, res)
res = list_chunks(HttpApiAuth, dataset_id, document_id)
assert res["data"]["doc"]["chunk_count"] == chunks_count + 1
else:
assert res["message"] == expected_message
@pytest.mark.p2
@pytest.mark.parametrize(
"payload, expected_code, expected_message",
[
({"content": "chunk test", "tag_kwd": ["tag1", "tag2"]}, 0, ""),
({"content": "chunk test", "tag_kwd": [""]}, 0, ""),
({"content": "chunk test", "tag_kwd": [1]}, 102, "`tag_kwd` must be a list of strings"),
({"content": "chunk test", "tag_kwd": ["tag", "tag"]}, 0, ""),
({"content": "chunk test", "tag_kwd": "abc"}, 102, "`tag_kwd` is required to be a list"),
({"content": "chunk test", "tag_kwd": 123}, 102, "`tag_kwd` is required to be a list"),
],
)
def test_tag_kwd(self, HttpApiAuth, add_document, payload, expected_code, expected_message):
dataset_id, document_id = add_document
res = list_chunks(HttpApiAuth, dataset_id, document_id)
if res["code"] != 0:
assert False, res
chunks_count = res["data"]["doc"]["chunk_count"]
res = add_chunk(HttpApiAuth, dataset_id, document_id, payload)
if res["code"] != expected_code:
print(f"\nFAILED! Expected code: {expected_code}, got: {res['code']}, message: {res.get('message')}")
assert res["code"] == expected_code
if expected_code == 0:
validate_chunk_details(dataset_id, document_id, payload, res)

View File

@@ -115,6 +115,25 @@ class TestUpdatedChunk:
if expected_code != 0:
assert res["message"] == expected_message
# Parametrized check of `tag_kwd` handling when updating a chunk over HTTP.
# Lists of strings (including an empty string and duplicate tags) expect code 0;
# a non-string element or a non-list value expects code 102 with the listed message.
@pytest.mark.p2
@pytest.mark.parametrize(
"payload, expected_code, expected_message",
[
({"tag_kwd": ["tag1", "tag2"]}, 0, ""),
({"tag_kwd": [""]}, 0, ""),
({"tag_kwd": [1]}, 102, "`tag_kwd` must be a list of strings"),
({"tag_kwd": ["tag", "tag"]}, 0, ""),
({"tag_kwd": "tag"}, 102, "`tag_kwd` should be a list"),
({"tag_kwd": 123}, 102, "`tag_kwd` should be a list"),
],
)
def test_tag_kwd(self, HttpApiAuth, add_chunks, payload, expected_code, expected_message):
# The add_chunks fixture is unpacked into a dataset ID, a document ID and chunk IDs.
dataset_id, document_id, chunk_ids = add_chunks
# Only the first chunk is updated with the parametrized payload.
res = update_chunk(HttpApiAuth, dataset_id, document_id, chunk_ids[0], payload)
assert res["code"] == expected_code
# The response message is compared only for cases that expect a failure code.
if expected_code != 0:
assert res["message"] == expected_message
@pytest.mark.p2
@pytest.mark.parametrize(
"payload, expected_code, expected_message",

View File

@@ -28,6 +28,8 @@ def validate_chunk_details(dataset_id: str, document_id: str, payload: dict, chu
assert chunk.important_keywords == payload["important_keywords"]
if "questions" in payload:
assert chunk.questions == [str(q).strip() for q in payload.get("questions", []) if str(q).strip()]
if "tag_kwd" in payload:
assert chunk.tag_kwd == payload["tag_kwd"]
class TestAddChunk:
@@ -115,6 +117,34 @@ class TestAddChunk:
chunks = document.list_chunks()
assert len(chunks) == chunks_count + 1, str(chunks)
# Parametrized check of `tag_kwd` handling when adding a chunk through the SDK.
# An empty expected_message marks a case that should succeed; a non-empty one is a
# substring expected in the text of the raised exception.
@pytest.mark.p2
@pytest.mark.parametrize(
"payload, expected_message",
[
({"content": "chunk test test_tag_kwd 1", "tag_kwd": ["tag1", "tag2"]}, ""),
({"content": "chunk test test_tag_kwd 2", "tag_kwd": [""]}, ""),
({"content": "chunk test test_tag_kwd 3", "tag_kwd": [1]}, "not instance of"),
({"content": "chunk test test_tag_kwd 4", "tag_kwd": ["tag", "tag"]}, ""),
({"content": "chunk test test_tag_kwd 5", "tag_kwd": "abc"}, "not instance of"),
({"content": "chunk test test_tag_kwd 6", "tag_kwd": 123}, "not instance of"),
],
)
def test_tag_kwd(self, add_document, payload, expected_message):
dataset, document = add_document
# Record the chunk count before the add so it can be compared afterwards.
chunks_count = len(document.list_chunks())
if expected_message:
with pytest.raises(Exception) as exception_info:
# The payload dict is expanded into keyword arguments of add_chunk.
document.add_chunk(**payload)
assert expected_message in str(exception_info.value), str(exception_info.value)
else:
chunk = document.add_chunk(**payload)
validate_chunk_details(dataset.id, document.id, payload, chunk)
# NOTE(review): presumably waits for the new chunk to become listable — confirm.
sleep(1)
chunks = document.list_chunks()
assert len(chunks) == chunks_count + 1, str(chunks)
@pytest.mark.p3
def test_repeated_add_chunk(self, add_document):
payload = {"content": "chunk test repeated_add_chunk"}

View File

@@ -102,6 +102,29 @@ class TestUpdatedChunk:
else:
chunk.update(payload)
# Parametrized check of `tag_kwd` handling when updating a chunk through the SDK.
# An empty expected_message marks a case where chunk.update is called without
# expecting an exception; otherwise the message must appear in the exception text.
@pytest.mark.p2
@pytest.mark.parametrize(
"payload, expected_message",
[
({"tag_kwd": ["tag1", "tag2"]}, ""),
({"tag_kwd": [""]}, ""),
({"tag_kwd": [1]}, "`tag_kwd` must be a list of strings"),
({"tag_kwd": ["tag", "tag"]}, ""),
({"tag_kwd": "tag"}, "`tag_kwd` should be a list"),
({"tag_kwd": 123}, "`tag_kwd` should be a list"),
],
)
def test_tag_kwd(self, add_chunks, payload, expected_message):
# Only the third element of the add_chunks fixture (the chunks) is used here.
_, _, chunks = add_chunks
# The update is applied to the first chunk only.
chunk = chunks[0]
if expected_message:
with pytest.raises(Exception) as exception_info:
chunk.update(payload)
assert expected_message in str(exception_info.value), str(exception_info.value)
else:
chunk.update(payload)
@pytest.mark.p2
@pytest.mark.parametrize(
"payload, expected_message",