Fix: dataset document download route (#14910)

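### What problem does this PR solve?

Fixes the dataset document download route. The web UI now downloads documents through `GET /datasets/<dataset_id>/documents/<document_id>`, which is authenticated with `@login_required`, and the separate beta-token route `GET /documents/<document_id>` is removed.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)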
2026-06-29 23:41:12 +08:00 · 2026-05-14 10:59:06 +08:00
parent 1c0eaa504b
commit b89878c593
4 changed files with 41 additions and 58 deletions
--- a/api/apps/sdk/doc.py
+++ b/api/apps/sdk/doc.py
@@ -16,9 +16,10 @@
 import logging
 from io import BytesIO

-from quart import request, send_file
+from quart import send_file

-from api.db.db_models import APIToken, Document, Task
+from api.apps import login_required
+from api.db.db_models import Document, Task
 from api.db.joint_services.tenant_model_service import get_model_config_by_id, get_model_config_by_type_and_name, get_tenant_default_model_by_type
 from api.db.services.doc_metadata_service import DocMetadataService
 from api.db.services.document_service import DocumentService
@@ -27,7 +28,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import LLMBundle
 from api.db.services.task_service import TaskService, cancel_all_task_of, queue_tasks
 from api.db.services.tenant_llm_service import TenantLLMService
-from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_error_data_result, get_request_json, get_result, server_error_response, token_required
+from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_error_data_result, get_request_json, get_result, server_error_response, token_required, add_tenant_id_to_kwargs
 from common import settings
 from common.constants import LLMType, RetCode, TaskStatus
 from common.metadata_utils import convert_conditions, meta_filter
@@ -51,7 +52,8 @@ def _enrich_chunks_with_document_metadata(chunks: list[dict], metadata_fields=No


@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["GET"])  # noqa: F821
-@token_required
+@login_required
+@add_tenant_id_to_kwargs
 async def download(tenant_id, dataset_id, document_id):
    """
    Download a document from a dataset.
@@ -90,8 +92,6 @@ async def download(tenant_id, dataset_id, document_id):
    """
    if not document_id:
        return get_error_data_result(message="Specify document_id please.")
-    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
-        return get_error_data_result(message=f"You do not own the dataset {dataset_id}.")
    doc = DocumentService.query(kb_id=dataset_id, id=document_id)
    if not doc:
        return get_error_data_result(message=f"The dataset not own the document {document_id}.")
@@ -110,51 +110,6 @@ async def download(tenant_id, dataset_id, document_id):
    )


-@manager.route("/documents/<document_id>", methods=["GET"])  # noqa: F821
-async def download_doc(document_id):
-    token = request.headers.get("Authorization").split()
-    if len(token) != 2:
-        return get_error_data_result(message="Authorization is not valid!")
-    token = token[1]
-    logging.info("Beta API token lookup attempted for document download")
-    objs = APIToken.query(beta=token)
-    if not objs:
-        logging.warning("Beta API token lookup failed for document download: invalid API key")
-        return get_error_data_result(message='Authentication error: API key is invalid!"')
-    if len(objs) > 1:
-        logging.error("Beta API token lookup is ambiguous for document download: matches=%s", len(objs))
-        return get_error_data_result(message="Authentication error: API key configuration is ambiguous.")
-    tenant_id = objs[0].tenant_id
-    logging.info("Beta API token authorized for document download: tenant_id=%s", tenant_id)
-
-    if not document_id:
-        return get_error_data_result(message="Specify document_id please.")
-    doc = DocumentService.query(id=document_id)
-    if not doc:
-        return get_error_data_result(message=f"The dataset not own the document {document_id}.")
-    if not KnowledgebaseService.query(id=doc[0].kb_id, tenant_id=tenant_id):
-        logging.warning(
-            "cross-tenant access denied for document download: tenant_id=%s kb_id=%s document_id=%s",
-            tenant_id,
-            doc[0].kb_id,
-            document_id,
-        )
-        return get_error_data_result(message="You do not have access to this document.")
-    # The process of downloading
-    doc_id, doc_location = File2DocumentService.get_storage_address(doc_id=document_id)  # minio address
-    file_stream = settings.STORAGE_IMPL.get(doc_id, doc_location)
-    if not file_stream:
-        return construct_json_result(message="This file is empty.", code=RetCode.DATA_ERROR)
-    file = BytesIO(file_stream)
-    # Use send_file with a proper filename and MIME type
-    return await send_file(
-        file,
-        as_attachment=True,
-        attachment_filename=doc[0].name,
-        mimetype="application/octet-stream",  # Set a default MIME type
-    )
-
-
 DOC_STOP_PARSING_INVALID_STATE_MESSAGE = "Can't stop parsing document that has not started or already completed"
 DOC_STOP_PARSING_INVALID_STATE_ERROR_CODE = "DOC_STOP_PARSING_INVALID_STATE"

--- a/web/src/pages/dataset/dataset/dataset-action-cell.tsx
+++ b/web/src/pages/dataset/dataset/dataset-action-cell.tsx
@@ -8,9 +8,10 @@ import {
 import { DocumentType } from '@/constants/knowledge';
 import { useRemoveDocument } from '@/hooks/use-document-request';
 import { IDocumentInfo } from '@/interfaces/database/document';
+import { downloadDatasetDocument } from '@/services/file-manager-service';
 import { formatFileSize } from '@/utils/common-util';
 import { formatDate } from '@/utils/date';
-import { downloadDocument } from '@/utils/file-util';
+import { downloadFileFromBlob } from '@/utils/file-util';
 import { Download, Eye, PenLine, Trash2 } from 'lucide-react';
 import { useCallback } from 'react';
 import { UseRenameDocumentShowType } from './use-rename-document';
@@ -34,12 +35,22 @@ export function DatasetActionCell({

  const { removeDocument } = useRemoveDocument();

-  const onDownloadDocument = useCallback(() => {
-    downloadDocument({
-      id,
-      filename: record.name,
-    });
-  }, [id, record.name]);
+  const onDownloadDocument = useCallback(async () => {
+    try {
+      const ext = record.name.split('.').pop()?.toLowerCase() || 'bin';
+      const response = await downloadDatasetDocument({
+        datasetId: record.dataset_id,
+        docId: id,
+        ext,
+      });
+      const blob = new Blob([response.data], {
+        type: response.data.type,
+      });
+      downloadFileFromBlob(blob, record.name);
+    } catch (error) {
+      console.error('Error downloading document:', error);
+    }
+  }, [id, record.dataset_id, record.name]);

  const handleRemove = useCallback(() => {
    removeDocument(id);
--- a/web/src/services/file-manager-service.ts
+++ b/web/src/services/file-manager-service.ts
@@ -12,6 +12,7 @@ const {
  getDocumentFile,
  getFile,
  moveFile,
+  getDatasetDocumentFileDownload,
  getDocumentFileDownload,
 } = api;

@@ -67,4 +68,18 @@ export const downloadFile = (data: { docId: string; ext: string }) => {
    responseType: 'blob',
  });
 };
+
+export const downloadDatasetDocument = (data: {
+  datasetId: string;
+  docId: string;
+  ext: string;
+}) => {
+  return request.get(
+    getDatasetDocumentFileDownload(data.datasetId, data.docId),
+    {
+      params: { ext: data.ext },
+      responseType: 'blob',
+    },
+  );
+};
 export default fileManagerService;
--- a/web/src/utils/api.ts
+++ b/web/src/utils/api.ts
@@ -126,6 +126,8 @@ export default {
    `${restAPIv1}/datasets/${datasetId}/documents?type=empty`,
  documentChangeParser: (datasetId: string, documentId: string) =>
    `${restAPIv1}/datasets/${datasetId}/documents/${documentId}`,
+  getDatasetDocumentFileDownload: (datasetId: string, documentId: string) =>
+    `${restAPIv1}/datasets/${datasetId}/documents/${documentId}`,
  documentThumbnails: `${restAPIv1}/thumbnails`,
  getDocumentFile: `${restAPIv1}/documents`,
  getDocumentFileDownload: (docId: string) =>