From f703169117687dec3a6550fe21c8ea20c6497e70 Mon Sep 17 00:00:00 2001
From: buua436 <sz_buua@foxmail.com>
Date: Fri, 8 May 2026 13:26:13 +0800
Subject: [PATCH] Refa: migrate document preview/download to RESTful API
 (#14633)

### What problem does this PR solve?

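Migrate the document preview and download endpoints to the RESTful API (`GET /api/v1/documents/{doc_id}/preview` and `GET /api/v1/documents/{doc_id}/download`), and add deprecated aliases for them in `api/apps/backward_compat.py`.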

### Type of change
- [x] Refactoring
---
 api/apps/backward_compat.py                   | 40 +++++++++++
 api/apps/document_app.py                      | 71 -------------------
 api/apps/restful_apis/document_api.py         | 46 +++++++++++-
 docs/references/http_api_reference.md         | 14 ++--
 test/testcases/test_web_api/test_common.py    | 26 ++++++-
 .../test_document_app/conftest.py             | 22 +++++-
 .../test_document_metadata.py                 |  4 +-
 web/src/components/document-preview/hooks.ts  |  4 +-
 web/src/hooks/use-document-request.ts         | 16 ++---
 web/src/pages/document-viewer/index.tsx       |  4 +-
 web/src/utils/api.ts                          |  4 +-
 11 files changed, 155 insertions(+), 96 deletions(-)
 delete mode 100644 api/apps/document_app.py

diff --git a/api/apps/backward_compat.py b/api/apps/backward_compat.py
index 6c2b4ee126..026d9b7d8b 100644
--- a/api/apps/backward_compat.py
+++ b/api/apps/backward_compat.py
@@ -29,6 +29,8 @@ Deprecated APIs and their replacements:
 - POST /api/v1/file/convert -> POST /api/v1/files/link-to-datasets
 - GET /api/v1/file/* -> GET /api/v1/files*
 - POST /api/v1/file/* -> POST /api/v1/files*
+- GET /api/v1/document/get/{doc_id} -> GET /api/v1/documents/{doc_id}/preview
+- GET /api/v1/document/download/{doc_id} -> GET /api/v1/documents/{doc_id}/download
 - POST /api/v1/sessions/related_questions -> POST /api/v1/chat/recommandation
 - PUT (chunk update) -> PATCH (chunk update)
 """
@@ -394,6 +396,44 @@ async def deprecated_file_upload_info():
     tenant_id = current_user.id
     return await document_api.upload_info(tenant_id=tenant_id)
 
+
+# =============================================================================
+# Document APIs
+# =============================================================================
+
+@manager.route("/document/get/<doc_id>", methods=["GET"])
+@login_required
+async def deprecated_document_get(doc_id):
+    """
+    Deprecated alias: logs a warning, then delegates to ``document_api.get``.
+
+    Old path: GET /api/v1/document/get/{doc_id}
+    New path: GET /api/v1/documents/{doc_id}/preview
+    """
+    deprecation_notice = (
+        "API endpoint /api/v1/document/get/%s is deprecated. "
+        "Please use /api/v1/documents/%s/preview instead."
+    )
+    logging.warning(deprecation_notice, doc_id, doc_id)
+    return await document_api.get(doc_id)
+
+
+@manager.route("/document/download/<doc_id>", methods=["GET"])
+@login_required
+async def deprecated_document_download(doc_id):
+    """
+    Deprecated alias: logs a warning, then delegates to ``document_api.download_attachment``.
+
+    Old path: GET /api/v1/document/download/{doc_id}
+    New path: GET /api/v1/documents/{doc_id}/download
+    """
+    deprecation_notice = (
+        "API endpoint /api/v1/document/download/%s is deprecated. "
+        "Please use /api/v1/documents/%s/download instead."
+    )
+    logging.warning(deprecation_notice, doc_id, doc_id)
+    return await document_api.download_attachment(doc_id=doc_id)
+
 # =============================================================================
 # Agent Chat API
 # =============================================================================
diff --git a/api/apps/document_app.py b/api/apps/document_app.py
deleted file mode 100644
index d48885ec90..0000000000
--- a/api/apps/document_app.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#
-#  Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License
-#
-import re
-
-from quart import make_response, request
-
-from api.apps import current_user, login_required
-from api.db import FileType
-from api.db.services.document_service import DocumentService
-from api.db.services.file2document_service import File2DocumentService
-from api.utils.api_utils import (
-    get_data_error_result,
-    server_error_response,
-)
-from api.utils.web_utils import CONTENT_TYPE_MAP, apply_safe_file_response_headers
-from common import settings
-from common.misc_utils import thread_pool_exec
-
-
-@manager.route("/get/<doc_id>", methods=["GET"])  # noqa: F821
-@login_required
-async def get(doc_id):
-    try:
-        e, doc = DocumentService.get_by_id(doc_id)
-        if not e:
-            return get_data_error_result(message="Document not found!")
-
-        b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
-        data = await thread_pool_exec(settings.STORAGE_IMPL.get, b, n)
-        response = await make_response(data)
-
-        ext = re.search(r"\.([^.]+)$", doc.name.lower())
-        ext = ext.group(1) if ext else None
-        content_type = None
-        if ext:
-            fallback_prefix = "image" if doc.type == FileType.VISUAL.value else "application"
-            content_type = CONTENT_TYPE_MAP.get(ext, f"{fallback_prefix}/{ext}")
-        apply_safe_file_response_headers(response, content_type, ext)
-        return response
-    except Exception as e:
-        return server_error_response(e)
-
-
-@manager.route("/download/<attachment_id>", methods=["GET"])  # noqa: F821
-@login_required
-async def download_attachment(attachment_id):
-    try:
-        ext = request.args.get("ext", "markdown")
-        data = await thread_pool_exec(settings.STORAGE_IMPL.get, current_user.id, attachment_id)
-        response = await make_response(data)
-        content_type = CONTENT_TYPE_MAP.get(ext, f"application/{ext}")
-        apply_safe_file_response_headers(response, content_type, ext)
-
-        return response
-
-    except Exception as e:
-        return server_error_response(e)
-
diff --git a/api/apps/restful_apis/document_api.py b/api/apps/restful_apis/document_api.py
index 0d3782f431..f57fe6b8ae 100644
--- a/api/apps/restful_apis/document_api.py
+++ b/api/apps/restful_apis/document_api.py
@@ -33,6 +33,7 @@ from api.db.services import duplicate_name
 from api.db.services.doc_metadata_service import DocMetadataService
 from api.db.db_models import Task
 from api.db.services.document_service import DocumentService
+from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.common.check_team_permission import check_kb_team_permission
@@ -48,7 +49,7 @@ from common.constants import ParserType, RetCode, TaskStatus, SANDBOX_ARTIFACT_B
 from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema
 from common.misc_utils import get_uuid, thread_pool_exec
 from api.utils.file_utils import filename_type, thumbnail
-from api.utils.web_utils import html2pdf, is_valid_url, apply_safe_file_response_headers
+from api.utils.web_utils import CONTENT_TYPE_MAP, html2pdf, is_valid_url, apply_safe_file_response_headers
 from common.ssrf_guard import assert_url_is_safe
 from rag.nlp import search
 
@@ -1854,3 +1855,46 @@ async def batch_update_document_status(tenant_id, dataset_id):
     if has_error:
         return get_json_result(data=result, message="Partial failure", code=RetCode.SERVER_ERROR)
     return get_json_result(data=result)
+
+@manager.route("/documents/<doc_id>/preview", methods=["GET"])  # noqa: F821
+@login_required
+async def get(doc_id):
+    """Return the stored file of document ``doc_id`` with headers derived from its name's extension."""
+    try:
+        found, doc = DocumentService.get_by_id(doc_id)
+        if not found:
+            return get_data_error_result(message="Document not found!")
+        bucket, name = File2DocumentService.get_storage_address(doc_id=doc_id)
+        payload = await thread_pool_exec(settings.STORAGE_IMPL.get, bucket, name)
+        response = await make_response(payload)
+        # Extension = text after the last dot of the lower-cased name; None when there is none.
+        suffix = re.search(r"\.([^.]+)$", doc.name.lower())
+        ext = suffix.group(1) if suffix else None
+        content_type = None
+        if ext:
+            default_family = "image" if doc.type == FileType.VISUAL.value else "application"
+            content_type = CONTENT_TYPE_MAP.get(ext, f"{default_family}/{ext}")
+        apply_safe_file_response_headers(response, content_type, ext)
+        return response
+    except Exception as e:
+        return server_error_response(e)
+
+
+@manager.route("/documents/<doc_id>/download", methods=["GET"])  # noqa: F821
+@login_required
+@add_tenant_id_to_kwargs
+async def download_attachment(tenant_id=None, doc_id=None, attachment_id=None):
+    """Return the data read via ``settings.STORAGE_IMPL.get(tenant_id, doc_id)``, typed by the ``ext`` query arg.
+
+    ``attachment_id`` is kept for older callers and unit tests; it is used only when ``doc_id`` is empty.
+    """
+    try:
+        storage_key = doc_id or attachment_id
+        ext = request.args.get("ext", "markdown")
+        content_type = CONTENT_TYPE_MAP.get(ext, f"application/{ext}")
+        blob = await thread_pool_exec(settings.STORAGE_IMPL.get, tenant_id, storage_key)
+        response = await make_response(blob)
+        apply_safe_file_response_headers(response, content_type, ext)
+        return response
+    except Exception as e:
+        return server_error_response(e)
diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md
index 8c2eba3a43..496f0ab31b 100644
--- a/docs/references/http_api_reference.md
+++ b/docs/references/http_api_reference.md
@@ -6879,14 +6879,18 @@ Failure:
 
 ### Download attachment
 
-**GET** `/v1/document/download/{attachment_id}`
+**GET** `/api/v1/documents/{doc_id}/download`
+
+:::caution DEPRECATED
+The previous endpoint `GET /api/v1/document/download/{doc_id}` is deprecated. Please use this endpoint instead.
+:::
 
 Downloads a runtime attachment previously uploaded via the [Upload document](#upload-document) method.
 
 #### Request
 
 - Method: GET
-- URL: `/v1/document/download/{attachment_id}`
+- URL: `/api/v1/documents/{doc_id}/download`
 - Headers:
   - `'Authorization: Bearer <YOUR_API_KEY>'`
 - Query parameter:
@@ -6896,15 +6900,15 @@ Downloads a runtime attachment previously uploaded via the [Upload document](#up
 
 ```bash
 curl --request GET \
-     --url 'http://{address}/v1/document/download/{attachment_id}?ext=pdf' \
+     --url 'http://{address}/api/v1/documents/{doc_id}/download?ext=pdf' \
      --header 'Authorization: Bearer <YOUR_API_KEY>' \
      --output ./downloaded_attachment.pdf
 ```
 
 ##### Request parameters
 
-- `attachment_id`: (*Path parameter*), `string`, *Required*
-  The `id` value returned by the [Upload document](#upload-document) method.
+- `doc_id`: (*Path parameter*), `string`, *Required*
+  The `id` value returned by the [Upload document](#upload-document) method.
 - `ext`: (*Query parameter*), `string`, *Optional*
   A file extension hint specifying the response's Content-Type. Defaults to `"markdown"`. Available values:
   - `"markdown"`
diff --git a/test/testcases/test_web_api/test_common.py b/test/testcases/test_web_api/test_common.py
index 3a8c54ce02..170d530af1 100644
--- a/test/testcases/test_web_api/test_common.py
+++ b/test/testcases/test_web_api/test_common.py
@@ -26,7 +26,6 @@ from utils.file_utils import create_txt_file
 HEADERS = {"Content-Type": "application/json"}
 
 DATASETS_URL = f"/api/{VERSION}/datasets"
-DOCUMENT_APP_URL = f"/{VERSION}/document"
 CHUNK_APP_URL = f"/{VERSION}/chunk"
 CHUNK_API_URL = f"/api/{VERSION}/datasets/{{dataset_id}}/documents/{{document_id}}/chunks"
 # SESSION_WITH_CHAT_ASSISTANT_API_URL = "/api/v1/chats/{chat_id}/sessions"
@@ -404,10 +403,33 @@ def document_infos(auth, dataset_id, params=None, payload=None, *, headers=HEADE
 
 
 def document_metadata_summary(auth, payload=None, *, headers=HEADERS, data=None):
-    res = requests.post(url=f"{HOST_ADDRESS}{DOCUMENT_APP_URL}/metadata/summary", headers=headers, auth=auth, json=payload, data=data)
+    """GET the dataset metadata summary; without ``kb_id`` in ``payload`` a code-101 dict is returned and no request is sent."""
+    body = payload or {}
+    dataset_id, doc_ids = body.get("kb_id"), body.get("doc_ids")
+    if not dataset_id:
+        return {"code": 101, "message": "KB ID is required"}
+    params = {"doc_ids": ",".join(doc_ids)} if doc_ids else {}
+    url = f"{HOST_ADDRESS}{DATASETS_URL}/{dataset_id}/metadata/summary"
+    res = requests.get(url=url, headers=headers, auth=auth, params=params, data=data)
     return res.json()
 
 
+def document_get(auth, document_id, *, headers=HEADERS, data=None):
+    """GET the preview endpoint for ``document_id`` and return the raw ``requests`` response."""
+    return requests.get(url=f"{HOST_ADDRESS}/api/{VERSION}/documents/{document_id}/preview", headers=headers, auth=auth, data=data)
+
+
+def document_download(auth, attachment_id, *, ext="markdown", headers=HEADERS, data=None):
+    """GET the download endpoint for ``attachment_id`` and return the raw ``requests`` response.
+
+    ``ext`` is sent as the ``ext`` query parameter.
+    """
+    download_url = f"{HOST_ADDRESS}/api/{VERSION}/documents/{attachment_id}/download"
+    query = {"ext": ext}
+    response = requests.get(url=download_url, headers=headers, auth=auth, params=query, data=data)
+    return response
+
+
 def document_metadata_update(auth, dataset_id, payload=None, *, headers=HEADERS, data=None):
     """New unified API for updating document metadata.
 
diff --git a/test/testcases/test_web_api/test_document_app/conftest.py b/test/testcases/test_web_api/test_document_app/conftest.py
index 78b5a5fdf8..0e719a1527 100644
--- a/test/testcases/test_web_api/test_document_app/conftest.py
+++ b/test/testcases/test_web_api/test_document_app/conftest.py
@@ -126,11 +126,31 @@ def document_app_module(monkeypatch):
     monkeypatch.setitem(sys.modules, "xgboost", ModuleType("xgboost"))
 
     stub_apps = ModuleType("api.apps")
+    stub_apps.__path__ = [str(repo_root / "api" / "apps")]
     stub_apps.current_user = SimpleNamespace(id="user-1")
     stub_apps.login_required = lambda func: func
     monkeypatch.setitem(sys.modules, "api.apps", stub_apps)
 
-    module_path = repo_root / "api" / "apps" / "document_app.py"
+    stub_apps_services = ModuleType("api.apps.services")
+    stub_apps_services.__path__ = [str(repo_root / "api" / "apps" / "services")]
+    monkeypatch.setitem(sys.modules, "api.apps.services", stub_apps_services)
+
+    document_api_service_mod = ModuleType("api.apps.services.document_api_service")
+    document_api_service_mod.validate_document_update_fields = lambda *_args, **_kwargs: (None, None)
+    document_api_service_mod.map_doc_keys = lambda doc: doc.to_dict() if hasattr(doc, "to_dict") else doc
+
+    def _map_doc_keys_with_run_status(doc, run_status="0"):
+        fields = doc.to_dict() if not isinstance(doc, dict) else doc
+        return dict(fields, run=run_status)
+
+    document_api_service_mod.map_doc_keys_with_run_status = _map_doc_keys_with_run_status
+    document_api_service_mod.update_document_name_only = lambda *_args, **_kwargs: None
+    document_api_service_mod.update_chunk_method = lambda *_args, **_kwargs: None
+    document_api_service_mod.update_document_status_only = lambda *_args, **_kwargs: None
+    document_api_service_mod.reset_document_for_reparse = lambda *_args, **_kwargs: None
+    monkeypatch.setitem(sys.modules, "api.apps.services.document_api_service", document_api_service_mod)
+
+    module_path = repo_root / "api" / "apps" / "restful_apis" / "document_api.py"
     spec = importlib.util.spec_from_file_location("test_document_app_unit", module_path)
     module = importlib.util.module_from_spec(spec)
     module.manager = _DummyManager()
diff --git a/test/testcases/test_web_api/test_document_app/test_document_metadata.py b/test/testcases/test_web_api/test_document_app/test_document_metadata.py
index bb69ef9803..5a843cdc3a 100644
--- a/test/testcases/test_web_api/test_document_app/test_document_metadata.py
+++ b/test/testcases/test_web_api/test_document_app/test_document_metadata.py
@@ -394,7 +394,7 @@ class TestDocumentMetadataUnit:
             "apply_safe_file_response_headers",
             lambda response, content_type, extension: response.headers.update({"content_type": content_type, "extension": extension}),
         )
-        res = _run(module.download_attachment("att1"))
+        res = _run(module.download_attachment(attachment_id="att1"))
         assert isinstance(res, _DummyResponse)
         assert res.data == b"attachment"
         assert res.headers["content_type"] == "application/abc"
@@ -405,7 +405,7 @@ class TestDocumentMetadataUnit:
 
         monkeypatch.setattr(module, "thread_pool_exec", raise_error)
         monkeypatch.setattr(module, "server_error_response", lambda e: {"code": 500, "message": str(e)})
-        res = _run(module.download_attachment("att1"))
+        res = _run(module.download_attachment(attachment_id="att1"))
         assert res["code"] == 500
         assert "download boom" in res["message"]
 
diff --git a/web/src/components/document-preview/hooks.ts b/web/src/components/document-preview/hooks.ts
index 79f2aa526d..fa17e3b66f 100644
--- a/web/src/components/document-preview/hooks.ts
+++ b/web/src/components/document-preview/hooks.ts
@@ -1,7 +1,7 @@
 import { Authorization } from '@/constants/authorization';
 import { useGetKnowledgeSearchParams } from '@/hooks/route-hook';
 import { useGetPipelineResultSearchParams } from '@/pages/dataflow-result/hooks';
-import api, { webAPI } from '@/utils/api';
+import api, { restAPIv1 } from '@/utils/api';
 import { getAuthorization } from '@/utils/authorization-util';
 import jsPreviewExcel from '@js-preview/excel';
 import { useSize } from 'ahooks';
@@ -57,7 +57,7 @@ export const useGetDocumentUrl = (isAgent: boolean) => {
     if (isAgent) {
       return api.downloadFile + `?id=${id}&created_by=${createdBy}`;
     }
-    return `${webAPI}/document/get/${documentId}`;
+    return `${restAPIv1}/documents/${documentId}/preview`;
   }, [createdBy, documentId, id, isAgent]);
 
   return url;
diff --git a/web/src/hooks/use-document-request.ts b/web/src/hooks/use-document-request.ts
index bb447e6c64..96ad1b0e2c 100644
--- a/web/src/hooks/use-document-request.ts
+++ b/web/src/hooks/use-document-request.ts
@@ -26,8 +26,7 @@ import kbService, {
   uploadDocument,
   webCrawlDocument,
 } from '@/services/knowledge-service';
-import { restAPIv1, webAPI } from '@/utils/api';
-import { getSearchValue } from '@/utils/common-util';
+import { restAPIv1 } from '@/utils/api';
 import { buildChunkHighlights } from '@/utils/document-util';
 import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query';
 import { useDebounce } from 'ahooks';
@@ -214,6 +213,7 @@ export const useGetDocumentFilter = (): {
   const { id } = useParams();
   const debouncedSearchString = useDebounce(searchString, { wait: 500 });
   const [open, setOpen] = useState<number>(0);
+  const datasetId = knowledgeId || id;
   const { data } = useQuery({
     queryKey: [
       DocumentApiAction.FetchDocumentFilter,
@@ -221,7 +221,10 @@ export const useGetDocumentFilter = (): {
       knowledgeId,
     ],
     queryFn: async () => {
-      const { data } = await documentFilter(knowledgeId || id);
+      if (!datasetId) {
+        return;
+      }
+      const { data } = await documentFilter(datasetId);
       if (data.code === 0) {
         return data.data;
       }
@@ -504,14 +507,11 @@ export const useCreateDocument = () => {
 };
 
 export const useGetDocumentUrl = (documentId?: string) => {
-  const auth = getSearchValue('auth');
   const getDocumentUrl = useCallback(
     (id?: string) => {
-      return auth
-        ? `${restAPIv1}/documents/${id || documentId}`
-        : `${webAPI}/document/get/${id || documentId}`;
+      return `${restAPIv1}/documents/${id || documentId}/preview`;
     },
-    [documentId, auth],
+    [documentId],
   );
 
   return getDocumentUrl;
diff --git a/web/src/pages/document-viewer/index.tsx b/web/src/pages/document-viewer/index.tsx
index d9f7da73a8..02c07faeae 100644
--- a/web/src/pages/document-viewer/index.tsx
+++ b/web/src/pages/document-viewer/index.tsx
@@ -1,5 +1,5 @@
 import { Images } from '@/constants/common';
-import { restAPIv1, webAPI } from '@/utils/api';
+import { restAPIv1 } from '@/utils/api';
 import { useParams, useSearchParams } from 'react-router';
 // import Docx from './docx';
 // import Excel from './excel';
@@ -29,7 +29,7 @@ const DocumentViewer = () => {
   const api =
     resource === 'files'
       ? `${restAPIv1}/files/${documentId}`
-      : `${webAPI}/document/get/${documentId}`;
+      : `${restAPIv1}/documents/${documentId}/preview`;
   // request.head
 
   if (ext === 'html' && documentId) {
diff --git a/web/src/utils/api.ts b/web/src/utils/api.ts
index b1c2c3e6e1..2e23727b76 100644
--- a/web/src/utils/api.ts
+++ b/web/src/utils/api.ts
@@ -127,9 +127,9 @@ export default {
   documentChangeParser: (datasetId: string, documentId: string) =>
     `${restAPIv1}/datasets/${datasetId}/documents/${documentId}`,
   documentThumbnails: `${restAPIv1}/thumbnails`,
-  getDocumentFile: `${webAPI}/document/get`,
+  getDocumentFile: `${restAPIv1}/documents`,
   getDocumentFileDownload: (docId: string) =>
-    `${webAPI}/document/download/${docId}`,
+    `${restAPIv1}/documents/${docId}/download`,
   documentUpload: (datasetId: string) =>
     `${restAPIv1}/datasets/${datasetId}/documents`,
   webCrawl: (datasetId: string) =>