From b89878c593c059a2106494291cfaf5747ad1cddf Mon Sep 17 00:00:00 2001 From: buua436 Date: Thu, 14 May 2026 10:59:06 +0800 Subject: [PATCH] Fix: dataset document download route (#14910) ### What problem does this PR solve? dataset document download route ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- api/apps/sdk/doc.py | 57 ++----------------- .../dataset/dataset/dataset-action-cell.tsx | 25 +++++--- web/src/services/file-manager-service.ts | 15 +++++ web/src/utils/api.ts | 2 + 4 files changed, 41 insertions(+), 58 deletions(-) diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py index a71b901617..85242f77b0 100644 --- a/api/apps/sdk/doc.py +++ b/api/apps/sdk/doc.py @@ -16,9 +16,10 @@ import logging from io import BytesIO -from quart import request, send_file +from quart import send_file -from api.db.db_models import APIToken, Document, Task +from api.apps import login_required +from api.db.db_models import Document, Task from api.db.joint_services.tenant_model_service import get_model_config_by_id, get_model_config_by_type_and_name, get_tenant_default_model_by_type from api.db.services.doc_metadata_service import DocMetadataService from api.db.services.document_service import DocumentService @@ -27,7 +28,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.llm_service import LLMBundle from api.db.services.task_service import TaskService, cancel_all_task_of, queue_tasks from api.db.services.tenant_llm_service import TenantLLMService -from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_error_data_result, get_request_json, get_result, server_error_response, token_required +from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_error_data_result, get_request_json, get_result, server_error_response, token_required, add_tenant_id_to_kwargs from common import settings from common.constants import LLMType, RetCode, TaskStatus from common.metadata_utils import convert_conditions, meta_filter @@ -51,7 +52,8 @@ def _enrich_chunks_with_document_metadata(chunks: list[dict], metadata_fields=No @manager.route("/datasets//documents/", methods=["GET"]) # noqa: F821 -@token_required +@login_required +@add_tenant_id_to_kwargs async def download(tenant_id, dataset_id, document_id): """ Download a document from a dataset. @@ -90,8 +92,6 @@ async def download(tenant_id, dataset_id, document_id): """ if not document_id: return get_error_data_result(message="Specify document_id please.") - if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): - return get_error_data_result(message=f"You do not own the dataset {dataset_id}.") doc = DocumentService.query(kb_id=dataset_id, id=document_id) if not doc: return get_error_data_result(message=f"The dataset not own the document {document_id}.") @@ -110,51 +110,6 @@ async def download(tenant_id, dataset_id, document_id): ) -@manager.route("/documents/", methods=["GET"]) # noqa: F821 -async def download_doc(document_id): - token = request.headers.get("Authorization").split() - if len(token) != 2: - return get_error_data_result(message="Authorization is not valid!") - token = token[1] - logging.info("Beta API token lookup attempted for document download") - objs = APIToken.query(beta=token) - if not objs: - logging.warning("Beta API token lookup failed for document download: invalid API key") - return get_error_data_result(message='Authentication error: API key is invalid!"') - if len(objs) > 1: - logging.error("Beta API token lookup is ambiguous for document download: matches=%s", len(objs)) - return get_error_data_result(message="Authentication error: API key configuration is ambiguous.") - tenant_id = objs[0].tenant_id - logging.info("Beta API token authorized for document download: tenant_id=%s", tenant_id) - - if not document_id: - return get_error_data_result(message="Specify document_id please.") - doc = DocumentService.query(id=document_id) - if not doc: - return get_error_data_result(message=f"The dataset not own the document {document_id}.") - if not KnowledgebaseService.query(id=doc[0].kb_id, tenant_id=tenant_id): - logging.warning( - "cross-tenant access denied for document download: tenant_id=%s kb_id=%s document_id=%s", - tenant_id, - doc[0].kb_id, - document_id, - ) - return get_error_data_result(message="You do not have access to this document.") - # The process of downloading - doc_id, doc_location = File2DocumentService.get_storage_address(doc_id=document_id) # minio address - file_stream = settings.STORAGE_IMPL.get(doc_id, doc_location) - if not file_stream: - return construct_json_result(message="This file is empty.", code=RetCode.DATA_ERROR) - file = BytesIO(file_stream) - # Use send_file with a proper filename and MIME type - return await send_file( - file, - as_attachment=True, - attachment_filename=doc[0].name, - mimetype="application/octet-stream", # Set a default MIME type - ) - - DOC_STOP_PARSING_INVALID_STATE_MESSAGE = "Can't stop parsing document that has not started or already completed" DOC_STOP_PARSING_INVALID_STATE_ERROR_CODE = "DOC_STOP_PARSING_INVALID_STATE" diff --git a/web/src/pages/dataset/dataset/dataset-action-cell.tsx b/web/src/pages/dataset/dataset/dataset-action-cell.tsx index 722fd15ade..296b519ba5 100644 --- a/web/src/pages/dataset/dataset/dataset-action-cell.tsx +++ b/web/src/pages/dataset/dataset/dataset-action-cell.tsx @@ -8,9 +8,10 @@ import { import { DocumentType } from '@/constants/knowledge'; import { useRemoveDocument } from '@/hooks/use-document-request'; import { IDocumentInfo } from '@/interfaces/database/document'; +import { downloadDatasetDocument } from '@/services/file-manager-service'; import { formatFileSize } from '@/utils/common-util'; import { formatDate } from '@/utils/date'; -import { downloadDocument } from '@/utils/file-util'; +import { downloadFileFromBlob } from '@/utils/file-util'; import { Download, Eye, PenLine, Trash2 } from 'lucide-react'; import { useCallback } from 'react'; import { UseRenameDocumentShowType } from './use-rename-document'; @@ -34,12 +35,22 @@ export function DatasetActionCell({ const { removeDocument } = useRemoveDocument(); - const onDownloadDocument = useCallback(() => { - downloadDocument({ - id, - filename: record.name, - }); - }, [id, record.name]); + const onDownloadDocument = useCallback(async () => { + try { + const ext = record.name.split('.').pop()?.toLowerCase() || 'bin'; + const response = await downloadDatasetDocument({ + datasetId: record.dataset_id, + docId: id, + ext, + }); + const blob = new Blob([response.data], { + type: response.data.type, + }); + downloadFileFromBlob(blob, record.name); + } catch (error) { + console.error('Error downloading document:', error); + } + }, [id, record.dataset_id, record.name]); const handleRemove = useCallback(() => { removeDocument(id); diff --git a/web/src/services/file-manager-service.ts b/web/src/services/file-manager-service.ts index ad83a43c46..aca836d100 100644 --- a/web/src/services/file-manager-service.ts +++ b/web/src/services/file-manager-service.ts @@ -12,6 +12,7 @@ const { getDocumentFile, getFile, moveFile, + getDatasetDocumentFileDownload, getDocumentFileDownload, } = api; @@ -67,4 +68,18 @@ export const downloadFile = (data: { docId: string; ext: string }) => { responseType: 'blob', }); }; + +export const downloadDatasetDocument = (data: { + datasetId: string; + docId: string; + ext: string; +}) => { + return request.get( + getDatasetDocumentFileDownload(data.datasetId, data.docId), + { + params: { ext: data.ext }, + responseType: 'blob', + }, + ); +}; export default fileManagerService; diff --git a/web/src/utils/api.ts b/web/src/utils/api.ts index 5dbfc8e369..f3d6456c47 100644 --- a/web/src/utils/api.ts +++ b/web/src/utils/api.ts @@ -126,6 +126,8 @@ export default { `${restAPIv1}/datasets/${datasetId}/documents?type=empty`, documentChangeParser: (datasetId: string, documentId: string) => `${restAPIv1}/datasets/${datasetId}/documents/${documentId}`, + getDatasetDocumentFileDownload: (datasetId: string, documentId: string) => + `${restAPIv1}/datasets/${datasetId}/documents/${documentId}`, documentThumbnails: `${restAPIv1}/thumbnails`, getDocumentFile: `${restAPIv1}/documents`, getDocumentFileDownload: (docId: string) =>