From 2196f2260a6a9b4c87a968b8722cbc9c31975270 Mon Sep 17 00:00:00 2001 From: dripsmvcp <138900956+dripsmvcp@users.noreply.github.com> Date: Wed, 3 Jun 2026 18:58:26 -0700 Subject: [PATCH] fix(api): restore DocumentService.accessible check on /preview (#15508) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Restore the `DocumentService.accessible(doc_id, current_user.id)` check that PR #15146 dropped from the REST document preview handler. Any authenticated caller could download any tenant's document bytes by guessing/knowing the `doc_id`. ## Root cause `api/apps/restful_apis/document_api.py` — the `GET /documents//preview` handler called `DocumentService.get_by_id` and went straight to `File2DocumentService.get_storage_address` + `STORAGE_IMPL.get`, with no tenant check between the lookup and the read. The handler's docstring even promises "user must belong to the tenant that owns the document's knowledge base" — the code didn't enforce it. ## Fix - Add `current_user` to the existing `api.apps` import. - Immediately after `get_by_id`, call `DocumentService.accessible(doc_id, current_user.id)`; on denial, return the **same** `get_data_error_result(message="Document not found!")` shape used for the missing-doc branch. That makes a cross-tenant probe indistinguishable from a missing-doc probe, preventing ID enumeration (the issue body calls this out explicitly). - Emit `logging.warning` with caller user + doc_id for audit. - Restores symmetry with peer routes that already call `accessible(doc_id, user_id)` (e.g. `_run_sync` at `document_api.py:1380`). ## Test plan Adds `test/unit_test/api/apps/restful_apis/test_document_preview_accessible.py`: - **`test_cross_tenant_preview_is_denied`** — owner tenant ≠ caller tenant; asserts the response shape is `Document not found!` and the storage backend (`thread_pool_exec(STORAGE_IMPL.get, ...)`) is **never** invoked. - **`test_missing_doc_returns_not_found`** — missing-doc behaviour unchanged. Stub-loader pattern mirrors `test/unit_test/api/apps/sdk/test_dify_retrieval.py` (added in #15028, passing in CI). ## Provenance — how this fix was produced This PR was authored against a small cited knowledge base committed in the working tree as a `.vouch/` (see [vouchdev/vouch](https://github.com/vouchdev/vouch)). The loop used here: 1. **Grounding first.** Before reading the handler, queried the KB for prior context: `vouch context "tenant scoped accessible authorization"` → retrieved a cited claim distilled from PR #15028 (which restored the same `accessible()` check on `/dify/retrieval`). The retrieved rule: > *ragflow REST endpoints that load by tenant-scoped id must call `.accessible(id, tenant_id)` after `get_by_id` and before storage/DB read; deny with code 109 'No authorization.' and log a warning. Established by PR #15028.* 2. **Applied the pattern with a domain refinement.** For an API/JSON endpoint, `No authorization.` is the right denial shape. For a **byte-streaming, browser-facing** endpoint like `/preview`, leaking *existence* itself enables enumeration — so per the issue's expected behaviour, this PR denies with `Document not found!` (indistinguishable from missing) instead. Same auth check, narrower response. 3. **Recorded the refinement back into the KB** as a new cited claim, so the next IDOR-class issue starts already grounded in both the general pattern and the byte-route nuance. Net effect of the workflow: the fix replicates a known-good pattern instead of reinventing it, *and* the place where the pattern was nuanced is now retrievable for the next pass. Mechanism is fully independent of this PR — it's not a runtime dependency, just process discipline. Closes #15501 --- api/apps/restful_apis/document_api.py | 14 +- .../test_document_preview_accessible.py | 223 ++++++++++++++++++ 2 files changed, 236 insertions(+), 1 deletion(-) create mode 100644 test/unit_test/api/apps/restful_apis/test_document_preview_accessible.py diff --git a/api/apps/restful_apis/document_api.py b/api/apps/restful_apis/document_api.py index 868ab28615..eeb0e8bf94 100644 --- a/api/apps/restful_apis/document_api.py +++ b/api/apps/restful_apis/document_api.py @@ -24,7 +24,7 @@ from quart import request, make_response,send_file from peewee import OperationalError from pydantic import ValidationError -from api.apps import AUTH_JWT, AUTH_API, AUTH_BETA, login_required +from api.apps import AUTH_JWT, AUTH_API, AUTH_BETA, current_user, login_required from api.constants import FILE_NAME_LEN_LIMIT, IMG_BASE64_PREFIX from api.apps.services.document_api_service import validate_document_update_fields, map_doc_keys, \ map_doc_keys_with_run_status, update_document_name_only, update_chunk_method, update_document_status_only, \ @@ -1923,6 +1923,18 @@ async def get(doc_id): e, doc = DocumentService.get_by_id(doc_id) if not e: return get_data_error_result(message="Document not found!") + if not DocumentService.accessible(doc_id, current_user.id): + # Issue #15501: PR #15146 dropped this check, letting any + # authenticated caller download any tenant's document bytes by + # guessing/knowing the doc_id. Return the same "Document not + # found!" shape used for missing docs so the response is + # indistinguishable to a cross-tenant probe (avoids ID + # enumeration). + logging.warning( + "Rejected /documents//preview cross-tenant access: " + "caller_user=%s doc_id=%s", current_user.id, doc_id, + ) + return get_data_error_result(message="Document not found!") b, n = File2DocumentService.get_storage_address(doc_id=doc_id) data = await thread_pool_exec(settings.STORAGE_IMPL.get, b, n) diff --git a/test/unit_test/api/apps/restful_apis/test_document_preview_accessible.py b/test/unit_test/api/apps/restful_apis/test_document_preview_accessible.py new file mode 100644 index 0000000000..590e9eed3a --- /dev/null +++ b/test/unit_test/api/apps/restful_apis/test_document_preview_accessible.py @@ -0,0 +1,223 @@ +# +# Copyright 2026 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Regression test for `/api/v1/documents//preview` (#15501). + +PR #15146 dropped the `DocumentService.accessible(doc_id, user_id)` check +from the REST preview handler. Any authenticated user could then download +any tenant's document bytes by guessing / knowing its doc_id. This test +locks in the restored check by asserting that a caller whose tenant does +NOT own the document gets the same `Document not found!` response a missing +doc would produce, and that the storage backend is never touched. +""" + +import asyncio +import importlib.util +import sys +from pathlib import Path +from types import ModuleType, SimpleNamespace + +import pytest + + +class _PassthroughManager: + def route(self, *_args, **_kwargs): + return lambda func: func + + +def _stub(monkeypatch, name, **attrs): + mod = ModuleType(name) + for key, value in attrs.items(): + setattr(mod, key, value) + monkeypatch.setitem(sys.modules, name, mod) + return mod + + +def _load_document_api( + monkeypatch, + *, + doc_get_by_id, + accessible_fn, + storage_get, +): + """Load api/apps/restful_apis/document_api.py with the minimum stubs.""" + + async def _make_response(payload): + return SimpleNamespace(payload=payload, headers={}) + + _stub( + monkeypatch, "api.apps", + current_user=SimpleNamespace(id="caller-tenant"), + login_required=lambda func: func, + ) + _stub(monkeypatch, "api.constants", FILE_NAME_LEN_LIMIT=128, IMG_BASE64_PREFIX="data:image/") + _stub( + monkeypatch, "api.apps.services.document_api_service", + validate_document_update_fields=lambda *_a, **_k: None, + map_doc_keys=lambda *_a, **_k: None, + map_doc_keys_with_run_status=lambda *_a, **_k: None, + update_document_name_only=lambda *_a, **_k: None, + update_chunk_method=lambda *_a, **_k: None, + update_document_status_only=lambda *_a, **_k: None, + reset_document_for_reparse=lambda *_a, **_k: None, + ) + _stub(monkeypatch, "api.db", VALID_FILE_TYPES=(), FileType=SimpleNamespace(VISUAL="visual")) + _stub(monkeypatch, "api.db.services", duplicate_name=lambda *_a, **_k: "x") + _stub(monkeypatch, "api.db.services.doc_metadata_service", DocMetadataService=SimpleNamespace()) + _stub(monkeypatch, "api.db.db_models", Task=SimpleNamespace()) + _stub( + monkeypatch, "api.db.services.document_service", + DocumentService=SimpleNamespace(get_by_id=lambda _id: doc_get_by_id, accessible=accessible_fn), + ) + _stub( + monkeypatch, "api.db.services.file2document_service", + File2DocumentService=SimpleNamespace(get_storage_address=lambda **_k: ("bucket", "key")), + ) + _stub(monkeypatch, "api.db.services.file_service", FileService=SimpleNamespace()) + _stub(monkeypatch, "api.db.services.knowledgebase_service", KnowledgebaseService=SimpleNamespace()) + _stub(monkeypatch, "api.common.check_team_permission", check_kb_team_permission=lambda *_a, **_k: True) + _stub(monkeypatch, "api.db.services.task_service", TaskService=SimpleNamespace(), cancel_all_task_of=lambda *_a, **_k: None) + _stub( + monkeypatch, "api.utils.api_utils", + construct_json_result=lambda **kw: {"kind": "json", **kw}, + get_data_error_result=lambda message="", code=0, data=False: {"kind": "data_error", "message": message}, + get_error_data_result=lambda *_a, **_k: {"kind": "error"}, + get_result=lambda *_a, **_k: {"kind": "result"}, + get_json_result=lambda *_a, **_k: {"kind": "json_result"}, + server_error_response=lambda e: {"kind": "server_error", "error": str(e)}, + add_tenant_id_to_kwargs=lambda *_a, **_k: None, + get_request_json=lambda: {}, + get_error_argument_result=lambda *_a, **_k: {"kind": "error_argument"}, + check_duplicate_ids=lambda *_a, **_k: True, + ) + _stub(monkeypatch, "api.utils.pagination_utils", validate_rest_api_page_size=lambda *_a, **_k: 10) + _stub( + monkeypatch, "api.utils.validation_utils", + UpdateDocumentReq=type("UpdateDocumentReq", (), {}), + DeleteDocumentReq=type("DeleteDocumentReq", (), {}), + format_validation_error_message=lambda *_a, **_k: "", + validate_and_parse_json_request=lambda *_a, **_k: ({}, None), + ) + _stub(monkeypatch, "common", settings=SimpleNamespace(STORAGE_IMPL=SimpleNamespace(get=storage_get))) + _stub( + monkeypatch, "common.constants", + ParserType=SimpleNamespace(NAIVE="naive"), + RetCode=SimpleNamespace(AUTHENTICATION_ERROR=109, DATA_ERROR=102), + TaskStatus=SimpleNamespace(RUNNING="1"), + SANDBOX_ARTIFACT_BUCKET="sandbox", + ) + _stub( + monkeypatch, "common.metadata_utils", + convert_conditions=lambda c: c, meta_filter=lambda *_a, **_k: [], + turn2jsonschema=lambda *_a, **_k: {}, + ) + _stub( + monkeypatch, "common.misc_utils", + get_uuid=lambda: "uuid", + thread_pool_exec=lambda fn, *a, **k: storage_get(*a, **k), + ) + _stub( + monkeypatch, "api.utils.file_utils", + filename_type=lambda *_a, **_k: "doc", + thumbnail=lambda *_a, **_k: b"", + ) + _stub( + monkeypatch, "api.utils.web_utils", + CONTENT_TYPE_MAP={"pdf": "application/pdf"}, + html2pdf=lambda *_a, **_k: b"", + is_valid_url=lambda *_a, **_k: True, + apply_safe_file_response_headers=lambda *_a, **_k: None, + ) + _stub(monkeypatch, "common.ssrf_guard", assert_url_is_safe=lambda *_a, **_k: None) + + quart_stub = ModuleType("quart") + quart_stub.request = SimpleNamespace(method="GET", args={}) + quart_stub.make_response = _make_response + quart_stub.send_file = lambda *_a, **_k: None + monkeypatch.setitem(sys.modules, "quart", quart_stub) + + # Third-party deps imported at module top — stub if not installed. + if "peewee" not in sys.modules: + _stub(monkeypatch, "peewee", OperationalError=type("OperationalError", (Exception,), {})) + if "pydantic" not in sys.modules: + _stub(monkeypatch, "pydantic", ValidationError=type("ValidationError", (Exception,), {})) + + # api.apps.settings is reached as `settings.STORAGE_IMPL.get`. + _stub( + monkeypatch, "api.apps.settings", + STORAGE_IMPL=SimpleNamespace(get=storage_get), + ) + + # parents[5] = repo root (test/unit_test/api/apps/restful_apis/) + repo_root = Path(__file__).resolve().parents[5] + module_path = repo_root / "api" / "apps" / "restful_apis" / "document_api.py" + spec = importlib.util.spec_from_file_location("test_document_api_module", module_path) + module = importlib.util.module_from_spec(spec) + module.manager = _PassthroughManager() + module.settings = SimpleNamespace(STORAGE_IMPL=SimpleNamespace(get=storage_get)) + monkeypatch.setitem(sys.modules, "test_document_api_module", module) + spec.loader.exec_module(module) + return module + + +@pytest.mark.p1 +class TestDocumentPreviewAccessCheck: + """Regression for #15501: cross-tenant document preview via /documents//preview.""" + + def test_cross_tenant_preview_is_denied(self, monkeypatch: pytest.MonkeyPatch) -> None: + """A caller whose tenant does NOT own the document gets the same + 'Document not found!' response a missing doc would, and storage is + never read.""" + doc = SimpleNamespace(id="DOC_VICTIM", name="secrets.pdf", type="application") + storage_calls: list[tuple] = [] + + def _storage_get(*args, **kwargs): + storage_calls.append((args, kwargs)) + return b"SECRET BYTES" + + def _accessible_only_for_owner(_doc_id, user_id): + return user_id == "tenant-owner" + + module = _load_document_api( + monkeypatch, + doc_get_by_id=(True, doc), + accessible_fn=_accessible_only_for_owner, + storage_get=_storage_get, + ) + + # current_user.id is "caller-tenant" via the api.apps stub above; + # tenant-owner is "tenant-owner" → not owner → must be denied. + result = asyncio.run(module.get("DOC_VICTIM")) + + assert isinstance(result, dict) and result.get("kind") == "data_error", result + assert "not found" in result["message"].lower() + assert storage_calls == [], "storage was read despite cross-tenant request" + + def test_missing_doc_returns_not_found(self, monkeypatch: pytest.MonkeyPatch) -> None: + """Missing-doc behaviour is unchanged: same 'Document not found!' shape.""" + + def _accessible_should_not_be_called(*_a, **_k): + raise AssertionError("accessible() must not be called for a missing doc") + + module = _load_document_api( + monkeypatch, + doc_get_by_id=(False, None), + accessible_fn=_accessible_should_not_be_called, + storage_get=lambda *_a, **_k: b"", + ) + + result = asyncio.run(module.get("DOC_DOES_NOT_EXIST")) + assert isinstance(result, dict) and result.get("kind") == "data_error" + assert "not found" in result["message"].lower()