mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
### What problem does this PR solve? Enable reading Tag Set tags via the API by exposing the `tag_kwd` field. The result of a list-chunks query is shown below: <img width="1422" height="818" alt="image" src="https://github.com/user-attachments/assets/abd1960a-fe34-489e-9d72-525f8e574938" /> ### Type of change - [x] New Feature (non-breaking change which adds functionality) Co-authored-by: heyang.why <heyang.why@alibaba-inc.com>
This commit is contained in:
@@ -155,6 +155,10 @@ async def set():
|
||||
d["question_kwd"] = req["question_kwd"]
|
||||
d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["question_kwd"]))
|
||||
if "tag_kwd" in req:
|
||||
if not isinstance(req["tag_kwd"], list):
|
||||
return get_data_error_result(message="`tag_kwd` should be a list")
|
||||
if not all(isinstance(t, str) for t in req["tag_kwd"]):
|
||||
return get_data_error_result(message="`tag_kwd` must be a list of strings")
|
||||
d["tag_kwd"] = req["tag_kwd"]
|
||||
if "tag_feas" in req:
|
||||
d["tag_feas"] = req["tag_feas"]
|
||||
@@ -317,6 +321,12 @@ async def create():
|
||||
d["question_tks"] = rag_tokenizer.tokenize("\n".join(d["question_kwd"]))
|
||||
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
|
||||
d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
|
||||
if "tag_kwd" in req:
|
||||
if not isinstance(req["tag_kwd"], list):
|
||||
return get_data_error_result(message="`tag_kwd` is required to be a list")
|
||||
if not all(isinstance(t, str) for t in req["tag_kwd"]):
|
||||
return get_data_error_result(message="`tag_kwd` must be a list of strings")
|
||||
d["tag_kwd"] = req["tag_kwd"]
|
||||
if "tag_feas" in req:
|
||||
d["tag_feas"] = req["tag_feas"]
|
||||
image_base64 = req.get("image_base64", None)
|
||||
|
||||
@@ -58,6 +58,7 @@ class Chunk(BaseModel):
|
||||
document_id: str = ""
|
||||
docnm_kwd: str = ""
|
||||
important_keywords: list = Field(default_factory=list)
|
||||
tag_kwd: list = Field(default_factory=list)
|
||||
questions: list = Field(default_factory=list)
|
||||
question_tks: str = ""
|
||||
image_id: str = ""
|
||||
@@ -1048,6 +1049,11 @@ async def list_chunks(tenant_id, dataset_id, document_id):
|
||||
items:
|
||||
type: string
|
||||
description: Important keywords.
|
||||
tag_kwd:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
description: Tag keywords.
|
||||
image_id:
|
||||
type: string
|
||||
description: Image ID associated with the chunk.
|
||||
@@ -1137,6 +1143,7 @@ async def list_chunks(tenant_id, dataset_id, document_id):
|
||||
"document_id": sres.field[id]["doc_id"],
|
||||
"docnm_kwd": sres.field[id]["docnm_kwd"],
|
||||
"important_keywords": sres.field[id].get("important_kwd", []),
|
||||
"tag_kwd": sres.field[id].get("tag_kwd", []),
|
||||
"questions": sres.field[id].get("question_kwd", []),
|
||||
"dataset_id": sres.field[id].get("kb_id", sres.field[id].get("dataset_id")),
|
||||
"image_id": sres.field[id].get("img_id", ""),
|
||||
@@ -1251,6 +1258,10 @@ async def add_chunk(tenant_id, dataset_id, document_id):
|
||||
d["docnm_kwd"] = doc.name
|
||||
d["doc_id"] = document_id
|
||||
if "tag_kwd" in req:
|
||||
if not isinstance(req["tag_kwd"], list):
|
||||
return get_error_data_result("`tag_kwd` is required to be a list")
|
||||
if not all(isinstance(t, str) for t in req["tag_kwd"]):
|
||||
return get_error_data_result("`tag_kwd` must be a list of strings")
|
||||
d["tag_kwd"] = req["tag_kwd"]
|
||||
if "tag_feas" in req:
|
||||
d["tag_feas"] = req["tag_feas"]
|
||||
@@ -1283,6 +1294,7 @@ async def add_chunk(tenant_id, dataset_id, document_id):
|
||||
"content_with_weight": "content",
|
||||
"doc_id": "document_id",
|
||||
"important_kwd": "important_keywords",
|
||||
"tag_kwd": "tag_kwd",
|
||||
"question_kwd": "questions",
|
||||
"kb_id": "dataset_id",
|
||||
"create_timestamp_flt": "create_timestamp",
|
||||
@@ -1432,6 +1444,11 @@ async def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
|
||||
items:
|
||||
type: string
|
||||
description: Updated important keywords.
|
||||
tag_kwd:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
description: Updated tag keywords.
|
||||
available:
|
||||
type: boolean
|
||||
description: Availability status of the chunk.
|
||||
@@ -1480,6 +1497,10 @@ async def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
|
||||
return get_error_data_result("`positions` should be a list")
|
||||
d["position_int"] = req["positions"]
|
||||
if "tag_kwd" in req:
|
||||
if not isinstance(req["tag_kwd"], list):
|
||||
return get_error_data_result("`tag_kwd` should be a list")
|
||||
if not all(isinstance(t, str) for t in req["tag_kwd"]):
|
||||
return get_error_data_result("`tag_kwd` must be a list of strings")
|
||||
d["tag_kwd"] = req["tag_kwd"]
|
||||
if "tag_feas" in req:
|
||||
d["tag_feas"] = req["tag_feas"]
|
||||
|
||||
@@ -2005,6 +2005,7 @@ Adds a chunk to a specified document in a specified dataset.
|
||||
- Body:
|
||||
- `"content"`: `string`
|
||||
- `"important_keywords"`: `list[string]`
|
||||
- `"tag_kwd"`: `list[string]`
|
||||
- `"image_base64"`: `string`
|
||||
|
||||
##### Request example
|
||||
@@ -2031,6 +2032,8 @@ curl --request POST \
|
||||
The text content of the chunk.
|
||||
- `"important_keywords"`: (*Body parameter*), `list[string]`
|
||||
The key terms or phrases to tag with the chunk.
|
||||
- `"tag_kwd"`: (*Body parameter*), `list[string]`
|
||||
Tag keywords to associate with the chunk.
|
||||
- `"questions"`: (*Body parameter*), `list[string]`
|
||||
If questions are given, the embedded chunks will be based on them.
|
||||
- `"image_base64"`: (*Body parameter*), `string`
|
||||
@@ -2053,6 +2056,7 @@ Success:
|
||||
"id": "12ccdc56e59837e5",
|
||||
"image_id": "",
|
||||
"important_keywords": [],
|
||||
"tag_kwd": [],
|
||||
"questions": []
|
||||
}
|
||||
}
|
||||
@@ -2123,6 +2127,7 @@ Success:
|
||||
"id": "b48c170e90f70af998485c1065490726",
|
||||
"image_id": "",
|
||||
"important_keywords": "",
|
||||
"tag_kwd": [],
|
||||
"positions": [
|
||||
""
|
||||
]
|
||||
@@ -2267,6 +2272,7 @@ Updates content or configurations for a specified chunk.
|
||||
- Body:
|
||||
- `"content"`: `string`
|
||||
- `"important_keywords"`: `list[string]`
|
||||
- `"tag_kwd"`: `list[string]`
|
||||
- `"available"`: `boolean`
|
||||
|
||||
##### Request example
|
||||
@@ -2295,6 +2301,8 @@ curl --request PUT \
|
||||
The text content of the chunk.
|
||||
- `"important_keywords"`: (*Body parameter*), `list[string]`
|
||||
A list of key terms or phrases to tag with the chunk.
|
||||
- `"tag_kwd"`: (*Body parameter*), `list[string]`
|
||||
Updated tag keywords.
|
||||
- `"available"`: (*Body parameter*), `boolean`
|
||||
The chunk's availability status in the dataset. Value options:
|
||||
- `true`: Available (default)
|
||||
@@ -2696,6 +2704,7 @@ Success:
|
||||
"important_keywords": [
|
||||
""
|
||||
],
|
||||
"tag_kwd": [],
|
||||
"kb_id": "c7ee74067a2c11efb21c0242ac120006",
|
||||
"positions": [
|
||||
""
|
||||
|
||||
@@ -855,7 +855,7 @@ print("Async bulk parsing cancelled.")
|
||||
### Add chunk
|
||||
|
||||
```python
|
||||
Document.add_chunk(content:str, important_keywords:list[str] = [], image_base64:str = None) -> Chunk
|
||||
Document.add_chunk(content:str, important_keywords:list[str] = [], questions:list[str] = [], image_base64:str = None, *, tag_kwd:list[str] = []) -> Chunk
|
||||
```
|
||||
|
||||
Adds a chunk to the current document.
|
||||
@@ -874,6 +874,10 @@ The key terms or phrases to tag with the chunk.
|
||||
|
||||
A base64-encoded image to associate with the chunk. If the chunk already has an image, the new image will be vertically concatenated below the existing one.
|
||||
|
||||
##### tag_kwd: `list[str]`
|
||||
|
||||
Tag keywords to associate with the chunk.
|
||||
|
||||
#### Returns
|
||||
|
||||
- Success: A `Chunk` object.
|
||||
@@ -884,6 +888,7 @@ A `Chunk` object contains the following attributes:
|
||||
- `id`: `str`: The chunk ID.
|
||||
- `content`: `str` The text content of the chunk.
|
||||
- `important_keywords`: `list[str]` A list of key terms or phrases tagged with the chunk.
|
||||
- `tag_kwd`: `list[str]` A list of tag keywords associated with the chunk.
|
||||
- `image_id`: `str` The image ID associated with the chunk (empty string if no image).
|
||||
- `create_time`: `str` The time when the chunk was created (added to the document).
|
||||
- `create_timestamp`: `float` The timestamp representing the creation time of the chunk, expressed in seconds since January 1, 1970.
|
||||
@@ -1024,6 +1029,7 @@ A dictionary representing the attributes to update, with the following keys:
|
||||
|
||||
- `"content"`: `str` The text content of the chunk.
|
||||
- `"important_keywords"`: `list[str]` A list of key terms or phrases to tag with the chunk.
|
||||
- `"tag_kwd"`: `list[str]` A list of tag keywords to associate with the chunk.
|
||||
- `"available"`: `bool` The chunk's availability status in the dataset. Value options:
|
||||
- `False`: Unavailable
|
||||
- `True`: Available (default)
|
||||
|
||||
@@ -532,6 +532,9 @@ func buildRetrievalTestResults(filteredChunks []map[string]interface{}) []map[st
|
||||
} else if v, ok := chunk["important_keywords"]; ok {
|
||||
result["important_kwd"] = v
|
||||
}
|
||||
if v, ok := chunk["tag_kwd"]; ok {
|
||||
result["tag_kwd"] = v
|
||||
}
|
||||
if v, ok := chunk["similarity"]; ok {
|
||||
result["similarity"] = v
|
||||
}
|
||||
|
||||
@@ -477,6 +477,7 @@ class Dealer:
|
||||
"docnm_kwd": dnm,
|
||||
"kb_id": chunk["kb_id"],
|
||||
"important_kwd": chunk.get("important_kwd", []),
|
||||
"tag_kwd": chunk.get("tag_kwd", []),
|
||||
"image_id": chunk.get("img_id", ""),
|
||||
"similarity": float(sim_np[i]),
|
||||
"vector_similarity": float(vsim[i]),
|
||||
|
||||
@@ -28,6 +28,7 @@ class Chunk(Base):
|
||||
self.id = ""
|
||||
self.content = ""
|
||||
self.important_keywords = []
|
||||
self.tag_kwd = []
|
||||
self.questions = []
|
||||
self.create_time = ""
|
||||
self.create_timestamp = 0.0
|
||||
|
||||
@@ -87,8 +87,8 @@ class Document(Base):
|
||||
return chunks
|
||||
raise Exception(res.get("message"))
|
||||
|
||||
def add_chunk(self, content: str, important_keywords: list[str] = [], questions: list[str] = [], image_base64: str | None = None):
|
||||
body = {"content": content, "important_keywords": important_keywords, "questions": questions}
|
||||
def add_chunk(self, content: str, important_keywords: list[str] = [], questions: list[str] = [], image_base64: str | None = None, *, tag_kwd: list[str] = []):
|
||||
body = {"content": content, "important_keywords": important_keywords, "tag_kwd": tag_kwd, "questions": questions}
|
||||
if image_base64 is not None:
|
||||
body["image_base64"] = image_base64
|
||||
res = self.post(f"/datasets/{self.dataset_id}/documents/{self.id}/chunks", body)
|
||||
|
||||
@@ -165,7 +165,7 @@ def token(auth):
|
||||
response = requests.post(url=url, headers=auth)
|
||||
res = response.json()
|
||||
if res.get("code") != 0:
|
||||
error_msg = f"access: {url}, POST method, error code: {res.get("code")}, message: {res.get('message')}"
|
||||
error_msg = f"access: {url}, POST method, error code: {res.get('code')}, message: {res.get('message')}"
|
||||
raise Exception(error_msg)
|
||||
return res["data"].get("token")
|
||||
|
||||
|
||||
@@ -30,6 +30,8 @@ def validate_chunk_details(dataset_id, document_id, payload, res):
|
||||
assert chunk["important_keywords"] == payload["important_keywords"]
|
||||
if "questions" in payload:
|
||||
assert chunk["questions"] == [str(q).strip() for q in payload.get("questions", []) if str(q).strip()]
|
||||
if "tag_kwd" in payload:
|
||||
assert chunk["tag_kwd"] == payload["tag_kwd"]
|
||||
|
||||
|
||||
@pytest.mark.p1
|
||||
@@ -76,7 +78,7 @@ class TestAddChunk:
|
||||
assert False, res
|
||||
chunks_count = res["data"]["doc"]["chunk_count"]
|
||||
res = add_chunk(HttpApiAuth, dataset_id, document_id, payload)
|
||||
assert res["code"] == expected_code
|
||||
assert res["code"] == expected_code, res
|
||||
if expected_code == 0:
|
||||
validate_chunk_details(dataset_id, document_id, payload, res)
|
||||
res = list_chunks(HttpApiAuth, dataset_id, document_id)
|
||||
@@ -109,7 +111,9 @@ class TestAddChunk:
|
||||
assert False, res
|
||||
chunks_count = res["data"]["doc"]["chunk_count"]
|
||||
res = add_chunk(HttpApiAuth, dataset_id, document_id, payload)
|
||||
assert res["code"] == expected_code
|
||||
assert res["code"] == expected_code, (
|
||||
f"Expected code: {expected_code}, got: {res['code']}, message: {res.get('message')}"
|
||||
)
|
||||
if expected_code == 0:
|
||||
validate_chunk_details(dataset_id, document_id, payload, res)
|
||||
res = list_chunks(HttpApiAuth, dataset_id, document_id)
|
||||
@@ -138,6 +142,35 @@ class TestAddChunk:
|
||||
assert False, res
|
||||
chunks_count = res["data"]["doc"]["chunk_count"]
|
||||
res = add_chunk(HttpApiAuth, dataset_id, document_id, payload)
|
||||
assert res["code"] == expected_code, res
|
||||
if expected_code == 0:
|
||||
validate_chunk_details(dataset_id, document_id, payload, res)
|
||||
res = list_chunks(HttpApiAuth, dataset_id, document_id)
|
||||
assert res["data"]["doc"]["chunk_count"] == chunks_count + 1
|
||||
else:
|
||||
assert res["message"] == expected_message
|
||||
|
||||
@pytest.mark.p2
|
||||
@pytest.mark.parametrize(
|
||||
"payload, expected_code, expected_message",
|
||||
[
|
||||
({"content": "chunk test", "tag_kwd": ["tag1", "tag2"]}, 0, ""),
|
||||
({"content": "chunk test", "tag_kwd": [""]}, 0, ""),
|
||||
({"content": "chunk test", "tag_kwd": [1]}, 102, "`tag_kwd` must be a list of strings"),
|
||||
({"content": "chunk test", "tag_kwd": ["tag", "tag"]}, 0, ""),
|
||||
({"content": "chunk test", "tag_kwd": "abc"}, 102, "`tag_kwd` is required to be a list"),
|
||||
({"content": "chunk test", "tag_kwd": 123}, 102, "`tag_kwd` is required to be a list"),
|
||||
],
|
||||
)
|
||||
def test_tag_kwd(self, HttpApiAuth, add_document, payload, expected_code, expected_message):
|
||||
dataset_id, document_id = add_document
|
||||
res = list_chunks(HttpApiAuth, dataset_id, document_id)
|
||||
if res["code"] != 0:
|
||||
assert False, res
|
||||
chunks_count = res["data"]["doc"]["chunk_count"]
|
||||
res = add_chunk(HttpApiAuth, dataset_id, document_id, payload)
|
||||
if res["code"] != expected_code:
|
||||
print(f"\nFAILED! Expected code: {expected_code}, got: {res['code']}, message: {res.get('message')}")
|
||||
assert res["code"] == expected_code
|
||||
if expected_code == 0:
|
||||
validate_chunk_details(dataset_id, document_id, payload, res)
|
||||
|
||||
@@ -115,6 +115,25 @@ class TestUpdatedChunk:
|
||||
if expected_code != 0:
|
||||
assert res["message"] == expected_message
|
||||
|
||||
@pytest.mark.p2
|
||||
@pytest.mark.parametrize(
|
||||
"payload, expected_code, expected_message",
|
||||
[
|
||||
({"tag_kwd": ["tag1", "tag2"]}, 0, ""),
|
||||
({"tag_kwd": [""]}, 0, ""),
|
||||
({"tag_kwd": [1]}, 102, "`tag_kwd` must be a list of strings"),
|
||||
({"tag_kwd": ["tag", "tag"]}, 0, ""),
|
||||
({"tag_kwd": "tag"}, 102, "`tag_kwd` should be a list"),
|
||||
({"tag_kwd": 123}, 102, "`tag_kwd` should be a list"),
|
||||
],
|
||||
)
|
||||
def test_tag_kwd(self, HttpApiAuth, add_chunks, payload, expected_code, expected_message):
|
||||
dataset_id, document_id, chunk_ids = add_chunks
|
||||
res = update_chunk(HttpApiAuth, dataset_id, document_id, chunk_ids[0], payload)
|
||||
assert res["code"] == expected_code
|
||||
if expected_code != 0:
|
||||
assert res["message"] == expected_message
|
||||
|
||||
@pytest.mark.p2
|
||||
@pytest.mark.parametrize(
|
||||
"payload, expected_code, expected_message",
|
||||
|
||||
@@ -28,6 +28,8 @@ def validate_chunk_details(dataset_id: str, document_id: str, payload: dict, chu
|
||||
assert chunk.important_keywords == payload["important_keywords"]
|
||||
if "questions" in payload:
|
||||
assert chunk.questions == [str(q).strip() for q in payload.get("questions", []) if str(q).strip()]
|
||||
if "tag_kwd" in payload:
|
||||
assert chunk.tag_kwd == payload["tag_kwd"]
|
||||
|
||||
|
||||
class TestAddChunk:
|
||||
@@ -115,6 +117,34 @@ class TestAddChunk:
|
||||
chunks = document.list_chunks()
|
||||
assert len(chunks) == chunks_count + 1, str(chunks)
|
||||
|
||||
@pytest.mark.p2
|
||||
@pytest.mark.parametrize(
|
||||
"payload, expected_message",
|
||||
[
|
||||
({"content": "chunk test test_tag_kwd 1", "tag_kwd": ["tag1", "tag2"]}, ""),
|
||||
({"content": "chunk test test_tag_kwd 2", "tag_kwd": [""]}, ""),
|
||||
({"content": "chunk test test_tag_kwd 3", "tag_kwd": [1]}, "not instance of"),
|
||||
({"content": "chunk test test_tag_kwd 4", "tag_kwd": ["tag", "tag"]}, ""),
|
||||
({"content": "chunk test test_tag_kwd 5", "tag_kwd": "abc"}, "not instance of"),
|
||||
({"content": "chunk test test_tag_kwd 6", "tag_kwd": 123}, "not instance of"),
|
||||
],
|
||||
)
|
||||
def test_tag_kwd(self, add_document, payload, expected_message):
|
||||
dataset, document = add_document
|
||||
chunks_count = len(document.list_chunks())
|
||||
|
||||
if expected_message:
|
||||
with pytest.raises(Exception) as exception_info:
|
||||
document.add_chunk(**payload)
|
||||
assert expected_message in str(exception_info.value), str(exception_info.value)
|
||||
else:
|
||||
chunk = document.add_chunk(**payload)
|
||||
validate_chunk_details(dataset.id, document.id, payload, chunk)
|
||||
|
||||
sleep(1)
|
||||
chunks = document.list_chunks()
|
||||
assert len(chunks) == chunks_count + 1, str(chunks)
|
||||
|
||||
@pytest.mark.p3
|
||||
def test_repeated_add_chunk(self, add_document):
|
||||
payload = {"content": "chunk test repeated_add_chunk"}
|
||||
|
||||
@@ -102,6 +102,29 @@ class TestUpdatedChunk:
|
||||
else:
|
||||
chunk.update(payload)
|
||||
|
||||
@pytest.mark.p2
|
||||
@pytest.mark.parametrize(
|
||||
"payload, expected_message",
|
||||
[
|
||||
({"tag_kwd": ["tag1", "tag2"]}, ""),
|
||||
({"tag_kwd": [""]}, ""),
|
||||
({"tag_kwd": [1]}, "`tag_kwd` must be a list of strings"),
|
||||
({"tag_kwd": ["tag", "tag"]}, ""),
|
||||
({"tag_kwd": "tag"}, "`tag_kwd` should be a list"),
|
||||
({"tag_kwd": 123}, "`tag_kwd` should be a list"),
|
||||
],
|
||||
)
|
||||
def test_tag_kwd(self, add_chunks, payload, expected_message):
|
||||
_, _, chunks = add_chunks
|
||||
chunk = chunks[0]
|
||||
|
||||
if expected_message:
|
||||
with pytest.raises(Exception) as exception_info:
|
||||
chunk.update(payload)
|
||||
assert expected_message in str(exception_info.value), str(exception_info.value)
|
||||
else:
|
||||
chunk.update(payload)
|
||||
|
||||
@pytest.mark.p2
|
||||
@pytest.mark.parametrize(
|
||||
"payload, expected_message",
|
||||
|
||||
Reference in New Issue
Block a user