From 7e83c5f4219e4e11ae92cfda114b18c992f0ad91 Mon Sep 17 00:00:00 2001 From: Sebastion Date: Wed, 6 May 2026 07:55:41 +0100 Subject: [PATCH] fix: authorize beta document downloads by tenant (#14496) ## Summary This fixes a missing authorization check in the beta API document download endpoint: - **CWE:** CWE-862 (Missing Authorization) - **Severity:** Medium - **Affected route/file:** `GET /api/v1/documents/` in `api/apps/sdk/doc.py` - **Data flow:** the route reads a bearer beta API token, resolves the token with `APIToken.query(beta=token)`, accepts `document_id` directly from the URL, loads the document with `DocumentService.query(id=document_id)`, and then fetches the backing object through `File2DocumentService.get_storage_address()` / `settings.STORAGE_IMPL.get()`. Before this change, that flow verified that the API token was valid, but it did not verify that the token's tenant owned the document's knowledge base. A caller with any valid beta API token and a known document ID could therefore reach storage for a document belonging to another tenant. ## Fix The endpoint now takes the tenant ID from the resolved API token and checks the document's knowledge base with: ```python KnowledgebaseService.query(id=doc[0].kb_id, tenant_id=tenant_id) ``` If the knowledge base is not owned by the token tenant, the request returns an access error before any storage lookup occurs. This mirrors the tenant-scoped ownership checks used by the dataset-scoped document download path and keeps the patch small. ## Tests Added unit coverage for `download_doc()` to assert that: - the beta token tenant ID is used in the knowledge-base ownership lookup; - cross-tenant access returns `You do not have access to this document.`; - storage resolution is not called before tenant authorization succeeds; - the existing same-tenant empty-file and successful-download paths still run after the authorization gate passes. I also verified the final patch is limited to `api/apps/sdk/doc.py` and the related document SDK route unit test. A local `pytest` invocation could not complete in this checkout because the shared test fixture attempts to log in to a RAGFlow server at `127.0.0.1:9380`, which was not running in the local environment. ## Security analysis This is exploitable when an attacker has a valid beta API token for their own tenant and obtains or guesses a document ID from another tenant. The token alone should not grant access to other tenants' files, but the direct document route previously authorized only the token itself and not the requested resource. The new tenant-scoped knowledge-base check binds the requested document back to the token tenant before storage is accessed, preventing cross-tenant document downloads through this endpoint. Before submitting, we attempted to disprove this by checking whether existing dataset-scoped routes, token validation, or framework protections already enforced ownership. They do not apply to this direct document-ID route: it bypassed the dataset path parameter and used only `DocumentService.query(id=document_id)` before reading storage. cc @lewiswigmore --- api/apps/sdk/doc.py | 15 ++++++++++++ .../test_doc_sdk_routes_unit.py | 23 ++++++++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py index 9aa641ccfc..cf297c4b25 100644 --- a/api/apps/sdk/doc.py +++ b/api/apps/sdk/doc.py @@ -116,15 +116,30 @@ async def download_doc(document_id): if len(token) != 2: return get_error_data_result(message="Authorization is not valid!") token = token[1] + logging.info("Beta API token lookup attempted for document download") objs = APIToken.query(beta=token) if not objs: + logging.warning("Beta API token lookup failed for document download: invalid API key") return get_error_data_result(message='Authentication error: API key is invalid!"') + if len(objs) > 1: + logging.error("Beta API token lookup is ambiguous for document download: matches=%s", len(objs)) + return get_error_data_result(message="Authentication error: API key configuration is ambiguous.") + tenant_id = objs[0].tenant_id + logging.info("Beta API token authorized for document download: tenant_id=%s", tenant_id) if not document_id: return get_error_data_result(message="Specify document_id please.") doc = DocumentService.query(id=document_id) if not doc: return get_error_data_result(message=f"The dataset not own the document {document_id}.") + if not KnowledgebaseService.query(id=doc[0].kb_id, tenant_id=tenant_id): + logging.warning( + "cross-tenant access denied for document download: tenant_id=%s kb_id=%s document_id=%s", + tenant_id, + doc[0].kb_id, + document_id, + ) + return get_error_data_result(message="You do not have access to this document.") # The process of downloading doc_id, doc_location = File2DocumentService.get_storage_address(doc_id=document_id) # minio address file_stream = settings.STORAGE_IMPL.get(doc_id, doc_location) diff --git a/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py b/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py index 4a6d022c6f..ca440d4ae0 100644 --- a/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py +++ b/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py @@ -516,7 +516,11 @@ class TestDocRoutesUnit: res = _run(module.download_doc("doc-1")) assert "API key is invalid" in res["message"] - monkeypatch.setattr(module.APIToken, "query", lambda **_kwargs: [SimpleNamespace()]) + monkeypatch.setattr(module.APIToken, "query", lambda **_kwargs: [SimpleNamespace(tenant_id="tenant-1"), SimpleNamespace(tenant_id="tenant-2")]) + res = _run(module.download_doc("doc-1")) + assert "API key configuration is ambiguous" in res["message"] + + monkeypatch.setattr(module.APIToken, "query", lambda **_kwargs: [SimpleNamespace(tenant_id="tenant-1")]) res = _run(module.download_doc("")) assert res["message"] == "Specify document_id please." @@ -525,6 +529,23 @@ class TestDocRoutesUnit: assert "not own the document" in res["message"] monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [_DummyDoc()]) + kb_query_calls = [] + + def _deny_kb_query(**kwargs): + kb_query_calls.append(kwargs) + return [] + + monkeypatch.setattr(module.KnowledgebaseService, "query", _deny_kb_query) + monkeypatch.setattr( + module.File2DocumentService, + "get_storage_address", + lambda **_kwargs: (_ for _ in ()).throw(AssertionError("storage lookup must not run before tenant authorization")), + ) + res = _run(module.download_doc("doc-1")) + assert res["message"] == "You do not have access to this document." + assert kb_query_calls == [{"id": "kb-1", "tenant_id": "tenant-1"}] + + monkeypatch.setattr(module.KnowledgebaseService, "query", lambda **_kwargs: [1]) monkeypatch.setattr(module.File2DocumentService, "get_storage_address", lambda **_kwargs: ("b", "n")) _patch_storage(monkeypatch, module, file_stream=b"") res = _run(module.download_doc("doc-1"))