From f703169117687dec3a6550fe21c8ea20c6497e70 Mon Sep 17 00:00:00 2001 From: buua436 Date: Fri, 8 May 2026 13:26:13 +0800 Subject: [PATCH] Refa: migrate document preview/download to RESTful API (#14633) ### What problem does this PR solve? migrate document preview/download to RESTful API ### Type of change - [x] Refactoring --- api/apps/backward_compat.py | 40 +++++++++++ api/apps/document_app.py | 71 ------------------- api/apps/restful_apis/document_api.py | 46 +++++++++++- docs/references/http_api_reference.md | 14 ++-- test/testcases/test_web_api/test_common.py | 26 ++++++- .../test_document_app/conftest.py | 22 +++++- .../test_document_metadata.py | 4 +- web/src/components/document-preview/hooks.ts | 4 +- web/src/hooks/use-document-request.ts | 16 ++--- web/src/pages/document-viewer/index.tsx | 4 +- web/src/utils/api.ts | 4 +- 11 files changed, 155 insertions(+), 96 deletions(-) delete mode 100644 api/apps/document_app.py diff --git a/api/apps/backward_compat.py b/api/apps/backward_compat.py index 6c2b4ee126..026d9b7d8b 100644 --- a/api/apps/backward_compat.py +++ b/api/apps/backward_compat.py @@ -29,6 +29,8 @@ Deprecated APIs and their replacements: - POST /api/v1/file/convert -> POST /api/v1/files/link-to-datasets - GET /api/v1/file/* -> GET /api/v1/files* - POST /api/v1/file/* -> POST /api/v1/files* +- GET /api/v1/document/get/{doc_id} -> GET /api/v1/documents/{doc_id}/preview +- GET /api/v1/document/download/{doc_id} -> GET /api/v1/documents/{doc_id}/download - POST /api/v1/sessions/related_questions -> POST /api/v1/chat/recommandation - PUT (chunk update) -> PATCH (chunk update) """ @@ -394,6 +396,44 @@ async def deprecated_file_upload_info(): tenant_id = current_user.id return await document_api.upload_info(tenant_id=tenant_id) + +# ============================================================================= +# Document APIs +# ============================================================================= + +@manager.route("/document/get/<doc_id>", 
methods=["GET"]) +@login_required +async def deprecated_document_get(doc_id): + """ + Deprecated: Use GET /api/v1/documents/{doc_id}/preview instead. + + Old path: GET /api/v1/document/get/{doc_id} + New path: GET /api/v1/documents/{doc_id}/preview + """ + logging.warning( + "API endpoint /api/v1/document/get/%s is deprecated. " + "Please use /api/v1/documents/%s/preview instead.", + doc_id, doc_id, + ) + return await document_api.get(doc_id) + + +@manager.route("/document/download/<doc_id>", methods=["GET"]) +@login_required +async def deprecated_document_download(doc_id): + """ + Deprecated: Use GET /api/v1/documents/{doc_id}/download instead. + + Old path: GET /api/v1/document/download/{doc_id} + New path: GET /api/v1/documents/{doc_id}/download + """ + logging.warning( + "API endpoint /api/v1/document/download/%s is deprecated. " + "Please use /api/v1/documents/%s/download instead.", + doc_id, doc_id, + ) + return await document_api.download_attachment(doc_id=doc_id) + # ============================================================================= # Agent Chat API # ============================================================================= diff --git a/api/apps/document_app.py b/api/apps/document_app.py deleted file mode 100644 index d48885ec90..0000000000 --- a/api/apps/document_app.py +++ /dev/null @@ -1,71 +0,0 @@ -# -# Copyright 2026 The InfiniFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License -# -import re - -from quart import make_response, request - -from api.apps import current_user, login_required -from api.db import FileType -from api.db.services.document_service import DocumentService -from api.db.services.file2document_service import File2DocumentService -from api.utils.api_utils import ( - get_data_error_result, - server_error_response, -) -from api.utils.web_utils import CONTENT_TYPE_MAP, apply_safe_file_response_headers -from common import settings -from common.misc_utils import thread_pool_exec - - -@manager.route("/get/<doc_id>", methods=["GET"]) # noqa: F821 -@login_required -async def get(doc_id): - try: - e, doc = DocumentService.get_by_id(doc_id) - if not e: - return get_data_error_result(message="Document not found!") - - b, n = File2DocumentService.get_storage_address(doc_id=doc_id) - data = await thread_pool_exec(settings.STORAGE_IMPL.get, b, n) - response = await make_response(data) - - ext = re.search(r"\.([^.]+)$", doc.name.lower()) - ext = ext.group(1) if ext else None - content_type = None - if ext: - fallback_prefix = "image" if doc.type == FileType.VISUAL.value else "application" - content_type = CONTENT_TYPE_MAP.get(ext, f"{fallback_prefix}/{ext}") - apply_safe_file_response_headers(response, content_type, ext) - return response - except Exception as e: - return server_error_response(e) - - -@manager.route("/download/<attachment_id>", methods=["GET"]) # noqa: F821 -@login_required -async def download_attachment(attachment_id): - try: - ext = request.args.get("ext", "markdown") - data = await thread_pool_exec(settings.STORAGE_IMPL.get, current_user.id, attachment_id) - response = await make_response(data) - content_type = CONTENT_TYPE_MAP.get(ext, f"application/{ext}") - apply_safe_file_response_headers(response, content_type, ext) - - return response - - except Exception as e: - return server_error_response(e) - - diff --git 
a/api/apps/restful_apis/document_api.py b/api/apps/restful_apis/document_api.py index 0d3782f431..f57fe6b8ae 100644 --- a/api/apps/restful_apis/document_api.py +++ b/api/apps/restful_apis/document_api.py @@ -33,6 +33,7 @@ from api.db.services import duplicate_name from api.db.services.doc_metadata_service import DocMetadataService from api.db.db_models import Task from api.db.services.document_service import DocumentService +from api.db.services.file2document_service import File2DocumentService from api.db.services.file_service import FileService from api.db.services.knowledgebase_service import KnowledgebaseService from api.common.check_team_permission import check_kb_team_permission @@ -48,7 +49,7 @@ from common.constants import ParserType, RetCode, TaskStatus, SANDBOX_ARTIFACT_B from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema from common.misc_utils import get_uuid, thread_pool_exec from api.utils.file_utils import filename_type, thumbnail -from api.utils.web_utils import html2pdf, is_valid_url, apply_safe_file_response_headers +from api.utils.web_utils import CONTENT_TYPE_MAP, html2pdf, is_valid_url, apply_safe_file_response_headers from common.ssrf_guard import assert_url_is_safe from rag.nlp import search @@ -1854,3 +1855,46 @@ async def batch_update_document_status(tenant_id, dataset_id): if has_error: return get_json_result(data=result, message="Partial failure", code=RetCode.SERVER_ERROR) return get_json_result(data=result) + +@manager.route("/documents/<doc_id>/preview", methods=["GET"]) # noqa: F821 +@login_required +async def get(doc_id): + try: + e, doc = DocumentService.get_by_id(doc_id) + if not e: + return get_data_error_result(message="Document not found!") + + b, n = File2DocumentService.get_storage_address(doc_id=doc_id) + data = await thread_pool_exec(settings.STORAGE_IMPL.get, b, n) + response = await make_response(data) + + ext = re.search(r"\.([^.]+)$", doc.name.lower()) + ext = ext.group(1) if ext else None + 
content_type = None + if ext: + fallback_prefix = "image" if doc.type == FileType.VISUAL.value else "application" + content_type = CONTENT_TYPE_MAP.get(ext, f"{fallback_prefix}/{ext}") + apply_safe_file_response_headers(response, content_type, ext) + return response + except Exception as e: + return server_error_response(e) + + +@manager.route("/documents/<doc_id>/download", methods=["GET"]) # noqa: F821 +@login_required +@add_tenant_id_to_kwargs +async def download_attachment(tenant_id=None, doc_id=None, attachment_id=None): + try: + # Keep backward compatibility with older callers and unit tests that still + # pass `attachment_id` instead of the route parameter name. + doc_id = doc_id or attachment_id + ext = request.args.get("ext", "markdown") + data = await thread_pool_exec(settings.STORAGE_IMPL.get, tenant_id, doc_id) + response = await make_response(data) + content_type = CONTENT_TYPE_MAP.get(ext, f"application/{ext}") + apply_safe_file_response_headers(response, content_type, ext) + + return response + + except Exception as e: + return server_error_response(e) diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md index 8c2eba3a43..496f0ab31b 100644 --- a/docs/references/http_api_reference.md +++ b/docs/references/http_api_reference.md @@ -6879,14 +6879,18 @@ Failure: ### Download attachment -**GET** `/v1/document/download/{attachment_id}` +**GET** `/api/v1/documents/{doc_id}/download` + +:::caution DEPRECATED +The previous endpoint `GET /api/v1/document/download/{doc_id}` is deprecated. Please use this endpoint instead. +::: Downloads a runtime attachment previously uploaded via the [Upload document](#upload-document) method. 
#### Request - Method: GET -- URL: `/v1/document/download/{attachment_id}` +- URL: `/api/v1/documents/{doc_id}/download` - Headers: - `'Authorization: Bearer <YOUR_API_KEY>'` - Query parameter: @@ -6896,15 +6900,15 @@ Downloads a runtime attachment previously uploaded via the [Upload document](#up ```bash curl --request GET \ - --url 'http://{address}/v1/document/download/{attachment_id}?ext=pdf' \ + --url 'http://{address}/api/v1/documents/{doc_id}/download?ext=pdf' \ --header 'Authorization: Bearer <YOUR_API_KEY>' \ --output ./downloaded_attachment.pdf ``` ##### Request parameters -- `attachment_id`: (*Path parameter*), `string`, *Required* - The `id` value returned by the [Upload document](#upload-document) method. +- `doc_id`: (*Path parameter*), `string`, *Required* + The document ID whose attachment should be downloaded. - `ext`: (*Query parameter*), `string`, *Optional* A file extension hint specifying the response's Content-Type. Defaults to `"markdown"`. Available values: - `"markdown"` diff --git a/test/testcases/test_web_api/test_common.py b/test/testcases/test_web_api/test_common.py index 3a8c54ce02..170d530af1 100644 --- a/test/testcases/test_web_api/test_common.py +++ b/test/testcases/test_web_api/test_common.py @@ -26,7 +26,6 @@ from utils.file_utils import create_txt_file HEADERS = {"Content-Type": "application/json"} DATASETS_URL = f"/api/{VERSION}/datasets" -DOCUMENT_APP_URL = f"/{VERSION}/document" CHUNK_APP_URL = f"/{VERSION}/chunk" CHUNK_API_URL = f"/api/{VERSION}/datasets/{{dataset_id}}/documents/{{document_id}}/chunks" # SESSION_WITH_CHAT_ASSISTANT_API_URL = "/api/v1/chats/{chat_id}/sessions" @@ -404,10 +403,33 @@ def document_infos(auth, dataset_id, params=None, payload=None, *, headers=HEADE def document_metadata_summary(auth, payload=None, *, headers=HEADERS, data=None): - res = requests.post(url=f"{HOST_ADDRESS}{DOCUMENT_APP_URL}/metadata/summary", headers=headers, auth=auth, json=payload, data=data) + dataset_id = (payload or {}).get("kb_id") + doc_ids = (payload 
or {}).get("doc_ids") + if not dataset_id: + return {"code": 101, "message": "KB ID is required"} + params = {} + if doc_ids: + params["doc_ids"] = ",".join(doc_ids) + res = requests.get(url=f"{HOST_ADDRESS}{DATASETS_URL}/{dataset_id}/metadata/summary", headers=headers, auth=auth, params=params, data=data) return res.json() +def document_get(auth, document_id, *, headers=HEADERS, data=None): + res = requests.get(url=f"{HOST_ADDRESS}/api/{VERSION}/documents/{document_id}/preview", headers=headers, auth=auth, data=data) + return res + + +def document_download(auth, attachment_id, *, ext="markdown", headers=HEADERS, data=None): + res = requests.get( + url=f"{HOST_ADDRESS}/api/{VERSION}/documents/{attachment_id}/download", + headers=headers, + auth=auth, + params={"ext": ext}, + data=data, + ) + return res + + def document_metadata_update(auth, dataset_id, payload=None, *, headers=HEADERS, data=None): """New unified API for updating document metadata. diff --git a/test/testcases/test_web_api/test_document_app/conftest.py b/test/testcases/test_web_api/test_document_app/conftest.py index 78b5a5fdf8..0e719a1527 100644 --- a/test/testcases/test_web_api/test_document_app/conftest.py +++ b/test/testcases/test_web_api/test_document_app/conftest.py @@ -126,11 +126,31 @@ def document_app_module(monkeypatch): monkeypatch.setitem(sys.modules, "xgboost", ModuleType("xgboost")) stub_apps = ModuleType("api.apps") + stub_apps.__path__ = [str(repo_root / "api" / "apps")] stub_apps.current_user = SimpleNamespace(id="user-1") stub_apps.login_required = lambda func: func monkeypatch.setitem(sys.modules, "api.apps", stub_apps) - module_path = repo_root / "api" / "apps" / "document_app.py" + stub_apps_services = ModuleType("api.apps.services") + stub_apps_services.__path__ = [str(repo_root / "api" / "apps" / "services")] + monkeypatch.setitem(sys.modules, "api.apps.services", stub_apps_services) + + document_api_service_mod = ModuleType("api.apps.services.document_api_service") + 
document_api_service_mod.validate_document_update_fields = lambda *_args, **_kwargs: (None, None) + document_api_service_mod.map_doc_keys = lambda doc: doc.to_dict() if hasattr(doc, "to_dict") else doc + + def _map_doc_keys_with_run_status(doc, run_status="0"): + payload = doc if isinstance(doc, dict) else doc.to_dict() + return {**payload, "run": run_status} + + document_api_service_mod.map_doc_keys_with_run_status = _map_doc_keys_with_run_status + document_api_service_mod.update_document_name_only = lambda *_args, **_kwargs: None + document_api_service_mod.update_chunk_method = lambda *_args, **_kwargs: None + document_api_service_mod.update_document_status_only = lambda *_args, **_kwargs: None + document_api_service_mod.reset_document_for_reparse = lambda *_args, **_kwargs: None + monkeypatch.setitem(sys.modules, "api.apps.services.document_api_service", document_api_service_mod) + + module_path = repo_root / "api" / "apps" / "restful_apis" / "document_api.py" spec = importlib.util.spec_from_file_location("test_document_app_unit", module_path) module = importlib.util.module_from_spec(spec) module.manager = _DummyManager() diff --git a/test/testcases/test_web_api/test_document_app/test_document_metadata.py b/test/testcases/test_web_api/test_document_app/test_document_metadata.py index bb69ef9803..5a843cdc3a 100644 --- a/test/testcases/test_web_api/test_document_app/test_document_metadata.py +++ b/test/testcases/test_web_api/test_document_app/test_document_metadata.py @@ -394,7 +394,7 @@ class TestDocumentMetadataUnit: "apply_safe_file_response_headers", lambda response, content_type, extension: response.headers.update({"content_type": content_type, "extension": extension}), ) - res = _run(module.download_attachment("att1")) + res = _run(module.download_attachment(attachment_id="att1")) assert isinstance(res, _DummyResponse) assert res.data == b"attachment" assert res.headers["content_type"] == "application/abc" @@ -405,7 +405,7 @@ class TestDocumentMetadataUnit: 
monkeypatch.setattr(module, "thread_pool_exec", raise_error) monkeypatch.setattr(module, "server_error_response", lambda e: {"code": 500, "message": str(e)}) - res = _run(module.download_attachment("att1")) + res = _run(module.download_attachment(attachment_id="att1")) assert res["code"] == 500 assert "download boom" in res["message"] diff --git a/web/src/components/document-preview/hooks.ts b/web/src/components/document-preview/hooks.ts index 79f2aa526d..fa17e3b66f 100644 --- a/web/src/components/document-preview/hooks.ts +++ b/web/src/components/document-preview/hooks.ts @@ -1,7 +1,7 @@ import { Authorization } from '@/constants/authorization'; import { useGetKnowledgeSearchParams } from '@/hooks/route-hook'; import { useGetPipelineResultSearchParams } from '@/pages/dataflow-result/hooks'; -import api, { webAPI } from '@/utils/api'; +import api, { restAPIv1 } from '@/utils/api'; import { getAuthorization } from '@/utils/authorization-util'; import jsPreviewExcel from '@js-preview/excel'; import { useSize } from 'ahooks'; @@ -57,7 +57,7 @@ export const useGetDocumentUrl = (isAgent: boolean) => { if (isAgent) { return api.downloadFile + `?id=${id}&created_by=${createdBy}`; } - return `${webAPI}/document/get/${documentId}`; + return `${restAPIv1}/documents/${documentId}/preview`; }, [createdBy, documentId, id, isAgent]); return url; diff --git a/web/src/hooks/use-document-request.ts b/web/src/hooks/use-document-request.ts index bb447e6c64..96ad1b0e2c 100644 --- a/web/src/hooks/use-document-request.ts +++ b/web/src/hooks/use-document-request.ts @@ -26,8 +26,7 @@ import kbService, { uploadDocument, webCrawlDocument, } from '@/services/knowledge-service'; -import { restAPIv1, webAPI } from '@/utils/api'; -import { getSearchValue } from '@/utils/common-util'; +import { restAPIv1 } from '@/utils/api'; import { buildChunkHighlights } from '@/utils/document-util'; import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'; import { useDebounce } from 
'ahooks'; @@ -214,6 +213,7 @@ export const useGetDocumentFilter = (): { const { id } = useParams(); const debouncedSearchString = useDebounce(searchString, { wait: 500 }); const [open, setOpen] = useState(0); + const datasetId = knowledgeId || id; const { data } = useQuery({ queryKey: [ DocumentApiAction.FetchDocumentFilter, @@ -221,7 +221,10 @@ export const useGetDocumentFilter = (): { knowledgeId, ], queryFn: async () => { - const { data } = await documentFilter(knowledgeId || id); + if (!datasetId) { + return; + } + const { data } = await documentFilter(datasetId); if (data.code === 0) { return data.data; } @@ -504,14 +507,11 @@ export const useCreateDocument = () => { }; export const useGetDocumentUrl = (documentId?: string) => { - const auth = getSearchValue('auth'); const getDocumentUrl = useCallback( (id?: string) => { - return auth - ? `${restAPIv1}/documents/${id || documentId}` - : `${webAPI}/document/get/${id || documentId}`; + return `${restAPIv1}/documents/${id || documentId}/preview`; }, - [documentId, auth], + [documentId], ); return getDocumentUrl; diff --git a/web/src/pages/document-viewer/index.tsx b/web/src/pages/document-viewer/index.tsx index d9f7da73a8..02c07faeae 100644 --- a/web/src/pages/document-viewer/index.tsx +++ b/web/src/pages/document-viewer/index.tsx @@ -1,5 +1,5 @@ import { Images } from '@/constants/common'; -import { restAPIv1, webAPI } from '@/utils/api'; +import { restAPIv1 } from '@/utils/api'; import { useParams, useSearchParams } from 'react-router'; // import Docx from './docx'; // import Excel from './excel'; @@ -29,7 +29,7 @@ const DocumentViewer = () => { const api = resource === 'files' ? 
`${restAPIv1}/files/${documentId}` - : `${webAPI}/document/get/${documentId}`; + : `${restAPIv1}/documents/${documentId}/preview`; // request.head if (ext === 'html' && documentId) { diff --git a/web/src/utils/api.ts b/web/src/utils/api.ts index b1c2c3e6e1..2e23727b76 100644 --- a/web/src/utils/api.ts +++ b/web/src/utils/api.ts @@ -127,9 +127,9 @@ export default { documentChangeParser: (datasetId: string, documentId: string) => `${restAPIv1}/datasets/${datasetId}/documents/${documentId}`, documentThumbnails: `${restAPIv1}/thumbnails`, - getDocumentFile: `${webAPI}/document/get`, + getDocumentFile: `${restAPIv1}/documents`, getDocumentFileDownload: (docId: string) => - `${webAPI}/document/download/${docId}`, + `${restAPIv1}/documents/${docId}/download`, documentUpload: (datasetId: string) => `${restAPIv1}/datasets/${datasetId}/documents`, webCrawl: (datasetId: string) =>