mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
Fix: shared dataset chunk index lookup (#14764)
### What problem does this PR solve? shared dataset chunk index lookup ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@@ -96,12 +96,22 @@ def _strip_chunk_runtime_fields(chunk):
|
||||
return chunk
|
||||
|
||||
|
||||
def _get_dataset_tenant_id(dataset_id):
|
||||
ok, kb = KnowledgebaseService.get_by_id(dataset_id)
|
||||
if not ok:
|
||||
return None
|
||||
return kb.tenant_id
|
||||
|
||||
|
||||
@manager.route("/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["GET"]) # noqa: F821
|
||||
@login_required
|
||||
@add_tenant_id_to_kwargs
|
||||
async def list_chunks(tenant_id, dataset_id, document_id):
|
||||
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
|
||||
return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
|
||||
dataset_tenant_id = _get_dataset_tenant_id(dataset_id)
|
||||
if not dataset_tenant_id:
|
||||
return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
|
||||
doc = DocumentService.query(id=document_id, kb_id=dataset_id)
|
||||
if not doc:
|
||||
return get_error_data_result(message=f"You don't own the document {document_id}.")
|
||||
@@ -122,7 +132,7 @@ async def list_chunks(tenant_id, dataset_id, document_id):
|
||||
|
||||
res = {"total": 0, "chunks": [], "doc": _map_doc(doc)}
|
||||
if req.get("id"):
|
||||
chunk = settings.docStoreConn.get(req.get("id"), search.index_name(tenant_id), [dataset_id])
|
||||
chunk = settings.docStoreConn.get(req.get("id"), search.index_name(dataset_tenant_id), [dataset_id])
|
||||
if not chunk:
|
||||
return get_result(message=f"Chunk not found: {dataset_id}/{req.get('id')}", code=RetCode.DATA_ERROR)
|
||||
if str(chunk.get("doc_id", chunk.get("document_id"))) != str(document_id):
|
||||
@@ -145,10 +155,10 @@ async def list_chunks(tenant_id, dataset_id, document_id):
|
||||
}
|
||||
res["chunks"].append(final_chunk)
|
||||
_ = Chunk(**final_chunk)
|
||||
elif settings.docStoreConn.index_exist(search.index_name(tenant_id), dataset_id):
|
||||
elif settings.docStoreConn.index_exist(search.index_name(dataset_tenant_id), dataset_id):
|
||||
sres = await settings.retriever.search(
|
||||
query,
|
||||
search.index_name(tenant_id),
|
||||
search.index_name(dataset_tenant_id),
|
||||
[dataset_id],
|
||||
emb_mdl=None,
|
||||
highlight=True,
|
||||
@@ -183,11 +193,14 @@ async def list_chunks(tenant_id, dataset_id, document_id):
|
||||
async def get_chunk(tenant_id, dataset_id, document_id, chunk_id):
|
||||
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
|
||||
return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
|
||||
dataset_tenant_id = _get_dataset_tenant_id(dataset_id)
|
||||
if not dataset_tenant_id:
|
||||
return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
|
||||
doc = DocumentService.query(id=document_id, kb_id=dataset_id)
|
||||
if not doc:
|
||||
return get_error_data_result(message=f"You don't own the document {document_id}.")
|
||||
try:
|
||||
chunk = settings.docStoreConn.get(chunk_id, search.index_name(tenant_id), [dataset_id])
|
||||
chunk = settings.docStoreConn.get(chunk_id, search.index_name(dataset_tenant_id), [dataset_id])
|
||||
if chunk is None or str(chunk.get("doc_id", chunk.get("document_id"))) != str(document_id):
|
||||
return get_result(data=False, message="Chunk not found!", code=RetCode.DATA_ERROR)
|
||||
return get_result(data=_strip_chunk_runtime_fields(chunk))
|
||||
@@ -203,6 +216,9 @@ async def get_chunk(tenant_id, dataset_id, document_id, chunk_id):
|
||||
async def add_chunk(tenant_id, dataset_id, document_id):
|
||||
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
|
||||
return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
|
||||
dataset_tenant_id = _get_dataset_tenant_id(dataset_id)
|
||||
if not dataset_tenant_id:
|
||||
return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
|
||||
doc = DocumentService.query(id=document_id, kb_id=dataset_id)
|
||||
if not doc:
|
||||
return get_error_data_result(message=f"You don't own the document {document_id}.")
|
||||
@@ -254,12 +270,12 @@ async def add_chunk(tenant_id, dataset_id, document_id):
|
||||
model_config = get_model_config_by_id(tenant_embd_id)
|
||||
else:
|
||||
embd_id = DocumentService.get_embd_id(document_id)
|
||||
model_config = get_model_config_by_type_and_name(tenant_id, LLMType.EMBEDDING.value, embd_id)
|
||||
model_config = get_model_config_by_type_and_name(dataset_tenant_id, LLMType.EMBEDDING.value, embd_id)
|
||||
embd_mdl = TenantLLMService.model_instance(model_config)
|
||||
v, c = embd_mdl.encode([doc.name, req["content"] if not d["question_kwd"] else "\n".join(d["question_kwd"])])
|
||||
v = 0.1 * v[0] + 0.9 * v[1]
|
||||
d[f"q_{len(v)}_vec"] = v.tolist()
|
||||
settings.docStoreConn.insert([d], search.index_name(tenant_id), dataset_id)
|
||||
settings.docStoreConn.insert([d], search.index_name(dataset_tenant_id), dataset_id)
|
||||
|
||||
if image_base64:
|
||||
store_chunk_image(dataset_id, chunk_id, base64.b64decode(image_base64))
|
||||
@@ -289,6 +305,9 @@ async def add_chunk(tenant_id, dataset_id, document_id):
|
||||
async def rm_chunk(tenant_id, dataset_id, document_id):
|
||||
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
|
||||
return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
|
||||
dataset_tenant_id = _get_dataset_tenant_id(dataset_id)
|
||||
if not dataset_tenant_id:
|
||||
return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
|
||||
docs = DocumentService.query(id=document_id, kb_id=dataset_id)
|
||||
if not docs:
|
||||
return get_error_data_result(message=f"You don't own the document {document_id}.")
|
||||
@@ -300,8 +319,8 @@ async def rm_chunk(tenant_id, dataset_id, document_id):
|
||||
if not chunk_ids:
|
||||
if req.get("delete_all") is True:
|
||||
doc = docs[0]
|
||||
DocumentService.delete_chunk_images(doc, tenant_id)
|
||||
chunk_number = settings.docStoreConn.delete({"doc_id": document_id}, search.index_name(tenant_id), dataset_id)
|
||||
DocumentService.delete_chunk_images(doc, dataset_tenant_id)
|
||||
chunk_number = settings.docStoreConn.delete({"doc_id": document_id}, search.index_name(dataset_tenant_id), dataset_id)
|
||||
if chunk_number != 0:
|
||||
DocumentService.decrement_chunk_num(document_id, dataset_id, 1, chunk_number, 0)
|
||||
return get_result(message=f"deleted {chunk_number} chunks")
|
||||
@@ -310,7 +329,7 @@ async def rm_chunk(tenant_id, dataset_id, document_id):
|
||||
unique_chunk_ids, duplicate_messages = check_duplicate_ids(chunk_ids, "chunk")
|
||||
chunk_number = settings.docStoreConn.delete(
|
||||
{"doc_id": document_id, "id": unique_chunk_ids},
|
||||
search.index_name(tenant_id),
|
||||
search.index_name(dataset_tenant_id),
|
||||
dataset_id,
|
||||
)
|
||||
if chunk_number != 0:
|
||||
@@ -333,11 +352,14 @@ async def rm_chunk(tenant_id, dataset_id, document_id):
|
||||
async def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
|
||||
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
|
||||
return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
|
||||
dataset_tenant_id = _get_dataset_tenant_id(dataset_id)
|
||||
if not dataset_tenant_id:
|
||||
return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
|
||||
doc = DocumentService.query(id=document_id, kb_id=dataset_id)
|
||||
if not doc:
|
||||
return get_error_data_result(message=f"You don't own the document {document_id}.")
|
||||
doc = doc[0]
|
||||
chunk = settings.docStoreConn.get(chunk_id, search.index_name(tenant_id), [dataset_id])
|
||||
chunk = settings.docStoreConn.get(chunk_id, search.index_name(dataset_tenant_id), [dataset_id])
|
||||
if chunk is None or str(chunk.get("doc_id", chunk.get("document_id"))) != str(document_id):
|
||||
return get_error_data_result(f"Can't find this chunk {chunk_id}")
|
||||
req = await get_request_json()
|
||||
@@ -387,7 +409,7 @@ async def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
|
||||
model_config = get_model_config_by_id(tenant_embd_id)
|
||||
else:
|
||||
embd_id = DocumentService.get_embd_id(document_id)
|
||||
model_config = get_model_config_by_type_and_name(tenant_id, LLMType.EMBEDDING.value, embd_id)
|
||||
model_config = get_model_config_by_type_and_name(dataset_tenant_id, LLMType.EMBEDDING.value, embd_id)
|
||||
embd_mdl = TenantLLMService.model_instance(model_config)
|
||||
if doc.parser_id == ParserType.QA:
|
||||
arr = [t for t in re.split(r"[\n\t]", d["content_with_weight"]) if len(t) > 1]
|
||||
@@ -404,7 +426,7 @@ async def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
|
||||
)
|
||||
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
|
||||
d[f"q_{len(v)}_vec"] = v.tolist()
|
||||
settings.docStoreConn.update({"id": chunk_id}, d, search.index_name(tenant_id), dataset_id)
|
||||
settings.docStoreConn.update({"id": chunk_id}, d, search.index_name(dataset_tenant_id), dataset_id)
|
||||
if image_base64:
|
||||
store_chunk_image(dataset_id, chunk_id, base64.b64decode(image_base64))
|
||||
return get_result()
|
||||
@@ -416,6 +438,9 @@ async def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
|
||||
async def switch_chunks(tenant_id, dataset_id, document_id):
|
||||
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
|
||||
return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
|
||||
dataset_tenant_id = _get_dataset_tenant_id(dataset_id)
|
||||
if not dataset_tenant_id:
|
||||
return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
|
||||
req = await get_request_json()
|
||||
if not req.get("chunk_ids"):
|
||||
return get_error_data_result(message="`chunk_ids` is required.")
|
||||
@@ -434,7 +459,7 @@ async def switch_chunks(tenant_id, dataset_id, document_id):
|
||||
if not settings.docStoreConn.update(
|
||||
{"id": cid},
|
||||
{"available_int": available_int},
|
||||
search.index_name(tenant_id),
|
||||
search.index_name(dataset_tenant_id),
|
||||
doc.kb_id,
|
||||
):
|
||||
return get_error_data_result(message="Index updating failure")
|
||||
|
||||
@@ -706,6 +706,36 @@ class TestDocRoutesUnit:
|
||||
assert res["data"]["total"] == 1
|
||||
assert res["data"]["chunks"][0]["id"] == "chunk-1"
|
||||
|
||||
def test_list_chunks_uses_dataset_owner_index_for_team_dataset(self, monkeypatch):
|
||||
module = _load_restful_chunk_module(monkeypatch)
|
||||
seen = {}
|
||||
monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: True)
|
||||
monkeypatch.setattr(
|
||||
module.KnowledgebaseService,
|
||||
"get_by_id",
|
||||
lambda _dataset_id: (True, SimpleNamespace(tenant_id="owner-tenant")),
|
||||
)
|
||||
monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [_DummyDoc(kb_id="ds-1")])
|
||||
monkeypatch.setattr(module, "request", SimpleNamespace(args=_DummyArgs({})))
|
||||
|
||||
def _index_exist(index_name, dataset_id):
|
||||
seen["index_exist"] = (index_name, dataset_id)
|
||||
return True
|
||||
|
||||
class _Retriever:
|
||||
async def search(self, _query, index_name, dataset_ids, *_args, **_kwargs):
|
||||
seen["search"] = (index_name, dataset_ids)
|
||||
return SimpleNamespace(total=0, ids=[], field={}, highlight={})
|
||||
|
||||
_patch_docstore(monkeypatch, module, index_exist=_index_exist)
|
||||
monkeypatch.setattr(module.settings, "retriever", _Retriever())
|
||||
|
||||
res = _run(_route_core(module.list_chunks)("member-tenant", "ds-1", "doc-1"))
|
||||
|
||||
assert res["code"] == 0
|
||||
assert seen["index_exist"] == ("idx-owner-tenant", "ds-1")
|
||||
assert seen["search"] == ("idx-owner-tenant", ["ds-1"])
|
||||
|
||||
def test_add_chunk_access_guard(self, monkeypatch):
|
||||
module = _load_restful_chunk_module(monkeypatch)
|
||||
monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: False)
|
||||
|
||||
@@ -377,7 +377,7 @@ def _load_chunk_module(monkeypatch):
|
||||
|
||||
@staticmethod
|
||||
def get_by_id(_kb_id):
|
||||
return True, SimpleNamespace(pagerank=0.6, tenant_embd_id=2, tenant_llm_id=1)
|
||||
return True, SimpleNamespace(pagerank=0.6, tenant_id="tenant-1", tenant_embd_id=2, tenant_llm_id=1)
|
||||
|
||||
kb_service_mod.KnowledgebaseService = _KnowledgebaseService
|
||||
monkeypatch.setitem(sys.modules, "api.db.services.knowledgebase_service", kb_service_mod)
|
||||
@@ -653,4 +653,3 @@ def test_restful_chunk_guard_branches_unit(monkeypatch):
|
||||
res = _run(_route_core(module.switch_chunks)("tenant-1", "kb-1", "doc-1"))
|
||||
assert res["message"] == "`available_int` or `available` is required.", res
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user