diff --git a/api/apps/restful_apis/chunk_api.py b/api/apps/restful_apis/chunk_api.py index 13b5cb5801..d3a30710e8 100644 --- a/api/apps/restful_apis/chunk_api.py +++ b/api/apps/restful_apis/chunk_api.py @@ -96,12 +96,22 @@ def _strip_chunk_runtime_fields(chunk): return chunk +def _get_dataset_tenant_id(dataset_id): + ok, kb = KnowledgebaseService.get_by_id(dataset_id) + if not ok: + return None + return kb.tenant_id + + @manager.route("/datasets//documents//chunks", methods=["GET"]) # noqa: F821 @login_required @add_tenant_id_to_kwargs async def list_chunks(tenant_id, dataset_id, document_id): if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): return get_error_data_result(message=f"You don't own the dataset {dataset_id}.") + dataset_tenant_id = _get_dataset_tenant_id(dataset_id) + if not dataset_tenant_id: + return get_error_data_result(message=f"You don't own the dataset {dataset_id}.") doc = DocumentService.query(id=document_id, kb_id=dataset_id) if not doc: return get_error_data_result(message=f"You don't own the document {document_id}.") @@ -122,7 +132,7 @@ async def list_chunks(tenant_id, dataset_id, document_id): res = {"total": 0, "chunks": [], "doc": _map_doc(doc)} if req.get("id"): - chunk = settings.docStoreConn.get(req.get("id"), search.index_name(tenant_id), [dataset_id]) + chunk = settings.docStoreConn.get(req.get("id"), search.index_name(dataset_tenant_id), [dataset_id]) if not chunk: return get_result(message=f"Chunk not found: {dataset_id}/{req.get('id')}", code=RetCode.DATA_ERROR) if str(chunk.get("doc_id", chunk.get("document_id"))) != str(document_id): @@ -145,10 +155,10 @@ async def list_chunks(tenant_id, dataset_id, document_id): } res["chunks"].append(final_chunk) _ = Chunk(**final_chunk) - elif settings.docStoreConn.index_exist(search.index_name(tenant_id), dataset_id): + elif settings.docStoreConn.index_exist(search.index_name(dataset_tenant_id), dataset_id): sres = await settings.retriever.search( query, - search.index_name(tenant_id), + search.index_name(dataset_tenant_id), [dataset_id], emb_mdl=None, highlight=True, @@ -183,11 +193,14 @@ async def list_chunks(tenant_id, dataset_id, document_id): async def get_chunk(tenant_id, dataset_id, document_id, chunk_id): if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): return get_error_data_result(message=f"You don't own the dataset {dataset_id}.") + dataset_tenant_id = _get_dataset_tenant_id(dataset_id) + if not dataset_tenant_id: + return get_error_data_result(message=f"You don't own the dataset {dataset_id}.") doc = DocumentService.query(id=document_id, kb_id=dataset_id) if not doc: return get_error_data_result(message=f"You don't own the document {document_id}.") try: - chunk = settings.docStoreConn.get(chunk_id, search.index_name(tenant_id), [dataset_id]) + chunk = settings.docStoreConn.get(chunk_id, search.index_name(dataset_tenant_id), [dataset_id]) if chunk is None or str(chunk.get("doc_id", chunk.get("document_id"))) != str(document_id): return get_result(data=False, message="Chunk not found!", code=RetCode.DATA_ERROR) return get_result(data=_strip_chunk_runtime_fields(chunk)) @@ -203,6 +216,9 @@ async def get_chunk(tenant_id, dataset_id, document_id, chunk_id): async def add_chunk(tenant_id, dataset_id, document_id): if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): return get_error_data_result(message=f"You don't own the dataset {dataset_id}.") + dataset_tenant_id = _get_dataset_tenant_id(dataset_id) + if not dataset_tenant_id: + return get_error_data_result(message=f"You don't own the dataset {dataset_id}.") doc = DocumentService.query(id=document_id, kb_id=dataset_id) if not doc: return get_error_data_result(message=f"You don't own the document {document_id}.") @@ -254,12 +270,12 @@ async def add_chunk(tenant_id, dataset_id, document_id): model_config = get_model_config_by_id(tenant_embd_id) else: embd_id = DocumentService.get_embd_id(document_id) - model_config = get_model_config_by_type_and_name(tenant_id, LLMType.EMBEDDING.value, embd_id) + model_config = get_model_config_by_type_and_name(dataset_tenant_id, LLMType.EMBEDDING.value, embd_id) embd_mdl = TenantLLMService.model_instance(model_config) v, c = embd_mdl.encode([doc.name, req["content"] if not d["question_kwd"] else "\n".join(d["question_kwd"])]) v = 0.1 * v[0] + 0.9 * v[1] d[f"q_{len(v)}_vec"] = v.tolist() - settings.docStoreConn.insert([d], search.index_name(tenant_id), dataset_id) + settings.docStoreConn.insert([d], search.index_name(dataset_tenant_id), dataset_id) if image_base64: store_chunk_image(dataset_id, chunk_id, base64.b64decode(image_base64)) @@ -289,6 +305,9 @@ async def add_chunk(tenant_id, dataset_id, document_id): async def rm_chunk(tenant_id, dataset_id, document_id): if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): return get_error_data_result(message=f"You don't own the dataset {dataset_id}.") + dataset_tenant_id = _get_dataset_tenant_id(dataset_id) + if not dataset_tenant_id: + return get_error_data_result(message=f"You don't own the dataset {dataset_id}.") docs = DocumentService.query(id=document_id, kb_id=dataset_id) if not docs: return get_error_data_result(message=f"You don't own the document {document_id}.") @@ -300,8 +319,8 @@ async def rm_chunk(tenant_id, dataset_id, document_id): if not chunk_ids: if req.get("delete_all") is True: doc = docs[0] - DocumentService.delete_chunk_images(doc, tenant_id) - chunk_number = settings.docStoreConn.delete({"doc_id": document_id}, search.index_name(tenant_id), dataset_id) + DocumentService.delete_chunk_images(doc, dataset_tenant_id) + chunk_number = settings.docStoreConn.delete({"doc_id": document_id}, search.index_name(dataset_tenant_id), dataset_id) if chunk_number != 0: DocumentService.decrement_chunk_num(document_id, dataset_id, 1, chunk_number, 0) return get_result(message=f"deleted {chunk_number} chunks") @@ -310,7 +329,7 @@ async def rm_chunk(tenant_id, dataset_id, document_id): unique_chunk_ids, duplicate_messages = check_duplicate_ids(chunk_ids, "chunk") chunk_number = settings.docStoreConn.delete( {"doc_id": document_id, "id": unique_chunk_ids}, - search.index_name(tenant_id), + search.index_name(dataset_tenant_id), dataset_id, ) if chunk_number != 0: @@ -333,11 +352,14 @@ async def rm_chunk(tenant_id, dataset_id, document_id): async def update_chunk(tenant_id, dataset_id, document_id, chunk_id): if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): return get_error_data_result(message=f"You don't own the dataset {dataset_id}.") + dataset_tenant_id = _get_dataset_tenant_id(dataset_id) + if not dataset_tenant_id: + return get_error_data_result(message=f"You don't own the dataset {dataset_id}.") doc = DocumentService.query(id=document_id, kb_id=dataset_id) if not doc: return get_error_data_result(message=f"You don't own the document {document_id}.") doc = doc[0] - chunk = settings.docStoreConn.get(chunk_id, search.index_name(tenant_id), [dataset_id]) + chunk = settings.docStoreConn.get(chunk_id, search.index_name(dataset_tenant_id), [dataset_id]) if chunk is None or str(chunk.get("doc_id", chunk.get("document_id"))) != str(document_id): return get_error_data_result(f"Can't find this chunk {chunk_id}") req = await get_request_json() @@ -387,7 +409,7 @@ async def update_chunk(tenant_id, dataset_id, document_id, chunk_id): model_config = get_model_config_by_id(tenant_embd_id) else: embd_id = DocumentService.get_embd_id(document_id) - model_config = get_model_config_by_type_and_name(tenant_id, LLMType.EMBEDDING.value, embd_id) + model_config = get_model_config_by_type_and_name(dataset_tenant_id, LLMType.EMBEDDING.value, embd_id) embd_mdl = TenantLLMService.model_instance(model_config) if doc.parser_id == ParserType.QA: arr = [t for t in re.split(r"[\n\t]", d["content_with_weight"]) if len(t) > 1] @@ -404,7 +426,7 @@ async def update_chunk(tenant_id, dataset_id, document_id, chunk_id): ) v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1] d[f"q_{len(v)}_vec"] = v.tolist() - settings.docStoreConn.update({"id": chunk_id}, d, search.index_name(tenant_id), dataset_id) + settings.docStoreConn.update({"id": chunk_id}, d, search.index_name(dataset_tenant_id), dataset_id) if image_base64: store_chunk_image(dataset_id, chunk_id, base64.b64decode(image_base64)) return get_result() @@ -416,6 +438,9 @@ async def update_chunk(tenant_id, dataset_id, document_id, chunk_id): async def switch_chunks(tenant_id, dataset_id, document_id): if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id): return get_error_data_result(message=f"You don't own the dataset {dataset_id}.") + dataset_tenant_id = _get_dataset_tenant_id(dataset_id) + if not dataset_tenant_id: + return get_error_data_result(message=f"You don't own the dataset {dataset_id}.") req = await get_request_json() if not req.get("chunk_ids"): return get_error_data_result(message="`chunk_ids` is required.") @@ -434,7 +459,7 @@ async def switch_chunks(tenant_id, dataset_id, document_id): if not settings.docStoreConn.update( {"id": cid}, {"available_int": available_int}, - search.index_name(tenant_id), + search.index_name(dataset_tenant_id), doc.kb_id, ): return get_error_data_result(message="Index updating failure") diff --git a/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py b/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py index ca440d4ae0..b4ee851745 100644 --- a/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py +++ b/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py @@ -706,6 +706,36 @@ class TestDocRoutesUnit: assert res["data"]["total"] == 1 assert res["data"]["chunks"][0]["id"] == "chunk-1" + def test_list_chunks_uses_dataset_owner_index_for_team_dataset(self, monkeypatch): + module = _load_restful_chunk_module(monkeypatch) + seen = {} + monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: True) + monkeypatch.setattr( + module.KnowledgebaseService, + "get_by_id", + lambda _dataset_id: (True, SimpleNamespace(tenant_id="owner-tenant")), + ) + monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [_DummyDoc(kb_id="ds-1")]) + monkeypatch.setattr(module, "request", SimpleNamespace(args=_DummyArgs({}))) + + def _index_exist(index_name, dataset_id): + seen["index_exist"] = (index_name, dataset_id) + return True + + class _Retriever: + async def search(self, _query, index_name, dataset_ids, *_args, **_kwargs): + seen["search"] = (index_name, dataset_ids) + return SimpleNamespace(total=0, ids=[], field={}, highlight={}) + + _patch_docstore(monkeypatch, module, index_exist=_index_exist) + monkeypatch.setattr(module.settings, "retriever", _Retriever()) + + res = _run(_route_core(module.list_chunks)("member-tenant", "ds-1", "doc-1")) + + assert res["code"] == 0 + assert seen["index_exist"] == ("idx-owner-tenant", "ds-1") + assert seen["search"] == ("idx-owner-tenant", ["ds-1"]) + def test_add_chunk_access_guard(self, monkeypatch): module = _load_restful_chunk_module(monkeypatch) monkeypatch.setattr(module.KnowledgebaseService, "accessible", lambda **_kwargs: False) diff --git a/test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py b/test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py index 339bd19bd0..52c1ea5de6 100644 --- a/test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py +++ b/test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py @@ -377,7 +377,7 @@ def _load_chunk_module(monkeypatch): @staticmethod def get_by_id(_kb_id): - return True, SimpleNamespace(pagerank=0.6, tenant_embd_id=2, tenant_llm_id=1) + return True, SimpleNamespace(pagerank=0.6, tenant_id="tenant-1", tenant_embd_id=2, tenant_llm_id=1) kb_service_mod.KnowledgebaseService = _KnowledgebaseService monkeypatch.setitem(sys.modules, "api.db.services.knowledgebase_service", kb_service_mod) @@ -653,4 +653,3 @@ def test_restful_chunk_guard_branches_unit(monkeypatch): res = _run(_route_core(module.switch_chunks)("tenant-1", "kb-1", "doc-1")) assert res["message"] == "`available_int` or `available` is required.", res -