fix: add document download endpoint and refactor existing download function (#14927)

### What problem does this PR solve?

Add a document download endpoint and refactor the existing download function.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-06-29 23:41:12 +08:00 · 2026-05-15 09:36:58 +08:00
parent 5a5bbee948
commit 58819f5d3e
2 changed files with 58 additions and 5 deletions
--- a/api/apps/restful_apis/document_api.py
+++ b/api/apps/restful_apis/document_api.py
@@ -1881,8 +1881,6 @@ async def download_attachment(tenant_id=None, doc_id=None, attachment_id=None):
        # Keep backward compatibility with older callers and unit tests that still
        # pass `attachment_id` instead of the route parameter name.
        doc_id = doc_id or attachment_id
-        if not DocumentService.accessible(doc_id, current_user.id):
-            return get_data_error_result(message="Document not found!")
        ext = request.args.get("ext", "markdown")
        data = await thread_pool_exec(settings.STORAGE_IMPL.get, tenant_id, doc_id)
        response = await make_response(data)
--- a/api/apps/sdk/doc.py
+++ b/api/apps/sdk/doc.py
@@ -28,7 +28,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import LLMBundle
 from api.db.services.task_service import TaskService, cancel_all_task_of, queue_tasks
 from api.db.services.tenant_llm_service import TenantLLMService
-from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_error_data_result, get_request_json, get_result, server_error_response, token_required, add_tenant_id_to_kwargs
+from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_error_data_result, get_request_json, get_result, server_error_response, token_required
 from common import settings
 from common.constants import LLMType, RetCode, TaskStatus
 from common.metadata_utils import convert_conditions, meta_filter
@@ -53,8 +53,7 @@ def _enrich_chunks_with_document_metadata(chunks: list[dict], metadata_fields=No

@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["GET"])  # noqa: F821
@login_required
-@add_tenant_id_to_kwargs
-async def download(tenant_id, dataset_id, document_id):
+async def download(dataset_id, document_id):
    """
    Download a document from a dataset.
    ---
@@ -113,6 +112,62 @@ async def download(tenant_id, dataset_id, document_id):
 DOC_STOP_PARSING_INVALID_STATE_MESSAGE = "Can't stop parsing document that has not started or already completed"
 DOC_STOP_PARSING_INVALID_STATE_ERROR_CODE = "DOC_STOP_PARSING_INVALID_STATE"

+@manager.route("/documents/<document_id>", methods=["GET"])  # noqa: F821
+@login_required
+async def download_document(document_id):
+    """
+    Download a document by its ID; no dataset ID is needed on this route.
+    ---
+    tags:
+      - Documents
+    security:
+      - ApiKeyAuth: []
+    produces:
+      - application/octet-stream
+    parameters:
+      - in: path
+        name: document_id
+        type: string
+        required: true
+        description: ID of the document to download.
+      - in: header
+        name: Authorization
+        type: string
+        required: true
+        description: Bearer token for authentication.
+    responses:
+      200:
+        description: Document file stream.
+        schema:
+          type: file
+      400:
+        description: Error message.
+        schema:
+          type: object
+    """
+    import asyncio
+
+    if not document_id:
+        return get_error_data_result(message="Specify document_id please.")
+    docs = DocumentService.query(id=document_id)
+    if not docs:
+        return get_error_data_result(message=f"Document {document_id} not found.")
+    # NOTE(review): only @login_required guards this route; nothing here verifies that
+    # the caller may access this document. Confirm whether an ownership check is needed.
+    # Look up the storage address recorded for this document.
+    storage_id, storage_location = File2DocumentService.get_storage_address(doc_id=document_id)
+    # The storage read is a plain synchronous call; run it in a worker thread so it does not block the event loop.
+    file_stream = await asyncio.to_thread(settings.STORAGE_IMPL.get, storage_id, storage_location)
+    if not file_stream:
+        return construct_json_result(message="This file is empty.", code=RetCode.DATA_ERROR)
+    payload = BytesIO(file_stream)
+    # Return the bytes as an attachment named after the document record.
+    return await send_file(
+        payload,
+        as_attachment=True,
+        attachment_filename=docs[0].name,
+        mimetype="application/octet-stream",  # generic binary type; no per-file MIME lookup is done here
+    )

@manager.route("/datasets/<dataset_id>/chunks", methods=["POST"])  # noqa: F821
@token_required