Refa: migrate document preview/download to RESTful API (#14633)

### What problem does this PR solve?

Migrate document preview/download to RESTful API.

### Type of change

- [x] Refactoring
2026-06-29 23:41:12 +08:00 · 2026-05-08 13:26:13 +08:00
parent 412fae7ac2
commit f703169117
11 changed files with 155 additions and 96 deletions
--- a/api/apps/backward_compat.py
+++ b/api/apps/backward_compat.py
@@ -29,6 +29,8 @@ Deprecated APIs and their replacements:
 - POST /api/v1/file/convert -> POST /api/v1/files/link-to-datasets
 - GET /api/v1/file/* -> GET /api/v1/files*
 - POST /api/v1/file/* -> POST /api/v1/files*
+- GET /api/v1/document/get/{doc_id} -> GET /api/v1/documents/{doc_id}/preview
+- GET /api/v1/document/download/{doc_id} -> GET /api/v1/documents/{doc_id}/download
 - POST /api/v1/sessions/related_questions -> POST /api/v1/chat/recommandation
 - PUT (chunk update) -> PATCH (chunk update)
 """
@@ -394,6 +396,44 @@ async def deprecated_file_upload_info():
    tenant_id = current_user.id
    return await document_api.upload_info(tenant_id=tenant_id)

+
+# =============================================================================
+# Document APIs
+# =============================================================================
+
+@manager.route("/document/get/<doc_id>", methods=["GET"])
+@login_required
+async def deprecated_document_get(doc_id):
+    """
+    Deprecated: Use GET /api/v1/documents/{doc_id}/preview instead.
+
+    Old path: GET /api/v1/document/get/{doc_id}
+    New path: GET /api/v1/documents/{doc_id}/preview
+    """
+    logging.warning(
+        "API endpoint /api/v1/document/get/%s is deprecated. "
+        "Please use /api/v1/documents/%s/preview instead.",
+        doc_id, doc_id,
+    )
+    return await document_api.get(doc_id)
+
+
+@manager.route("/document/download/<doc_id>", methods=["GET"])
+@login_required
+async def deprecated_document_download(doc_id):
+    """
+    Deprecated: Use GET /api/v1/documents/{doc_id}/download instead.
+
+    Old path: GET /api/v1/document/download/{doc_id}
+    New path: GET /api/v1/documents/{doc_id}/download
+    """
+    logging.warning(
+        "API endpoint /api/v1/document/download/%s is deprecated. "
+        "Please use /api/v1/documents/%s/download instead.",
+        doc_id, doc_id,
+    )
+    return await document_api.download_attachment(doc_id=doc_id)
+
 # =============================================================================
 # Agent Chat API
 # =============================================================================
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -1,71 +0,0 @@
-#
-#  Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License
-#
-import re
-
-from quart import make_response, request
-
-from api.apps import current_user, login_required
-from api.db import FileType
-from api.db.services.document_service import DocumentService
-from api.db.services.file2document_service import File2DocumentService
-from api.utils.api_utils import (
-    get_data_error_result,
-    server_error_response,
-)
-from api.utils.web_utils import CONTENT_TYPE_MAP, apply_safe_file_response_headers
-from common import settings
-from common.misc_utils import thread_pool_exec
-
-
-@manager.route("/get/<doc_id>", methods=["GET"])  # noqa: F821
-@login_required
-async def get(doc_id):
-    try:
-        e, doc = DocumentService.get_by_id(doc_id)
-        if not e:
-            return get_data_error_result(message="Document not found!")
-
-        b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
-        data = await thread_pool_exec(settings.STORAGE_IMPL.get, b, n)
-        response = await make_response(data)
-
-        ext = re.search(r"\.([^.]+)$", doc.name.lower())
-        ext = ext.group(1) if ext else None
-        content_type = None
-        if ext:
-            fallback_prefix = "image" if doc.type == FileType.VISUAL.value else "application"
-            content_type = CONTENT_TYPE_MAP.get(ext, f"{fallback_prefix}/{ext}")
-        apply_safe_file_response_headers(response, content_type, ext)
-        return response
-    except Exception as e:
-        return server_error_response(e)
-
-
-@manager.route("/download/<attachment_id>", methods=["GET"])  # noqa: F821
-@login_required
-async def download_attachment(attachment_id):
-    try:
-        ext = request.args.get("ext", "markdown")
-        data = await thread_pool_exec(settings.STORAGE_IMPL.get, current_user.id, attachment_id)
-        response = await make_response(data)
-        content_type = CONTENT_TYPE_MAP.get(ext, f"application/{ext}")
-        apply_safe_file_response_headers(response, content_type, ext)
-
-        return response
-
-    except Exception as e:
-        return server_error_response(e)
-
--- a/api/apps/restful_apis/document_api.py
+++ b/api/apps/restful_apis/document_api.py
@@ -33,6 +33,7 @@ from api.db.services import duplicate_name
 from api.db.services.doc_metadata_service import DocMetadataService
 from api.db.db_models import Task
 from api.db.services.document_service import DocumentService
+from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.common.check_team_permission import check_kb_team_permission
@@ -48,7 +49,7 @@ from common.constants import ParserType, RetCode, TaskStatus, SANDBOX_ARTIFACT_B
 from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema
 from common.misc_utils import get_uuid, thread_pool_exec
 from api.utils.file_utils import filename_type, thumbnail
-from api.utils.web_utils import html2pdf, is_valid_url, apply_safe_file_response_headers
+from api.utils.web_utils import CONTENT_TYPE_MAP, html2pdf, is_valid_url, apply_safe_file_response_headers
 from common.ssrf_guard import assert_url_is_safe
 from rag.nlp import search

@@ -1854,3 +1855,46 @@ async def batch_update_document_status(tenant_id, dataset_id):
    if has_error:
        return get_json_result(data=result, message="Partial failure", code=RetCode.SERVER_ERROR)
    return get_json_result(data=result)
+
+@manager.route("/documents/<doc_id>/preview", methods=["GET"])  # noqa: F821
+@login_required
+async def get(doc_id):
+    try:
+        e, doc = DocumentService.get_by_id(doc_id)
+        if not e:
+            return get_data_error_result(message="Document not found!")
+
+        b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
+        data = await thread_pool_exec(settings.STORAGE_IMPL.get, b, n)
+        response = await make_response(data)
+
+        ext = re.search(r"\.([^.]+)$", doc.name.lower())
+        ext = ext.group(1) if ext else None
+        content_type = None
+        if ext:
+            fallback_prefix = "image" if doc.type == FileType.VISUAL.value else "application"
+            content_type = CONTENT_TYPE_MAP.get(ext, f"{fallback_prefix}/{ext}")
+        apply_safe_file_response_headers(response, content_type, ext)
+        return response
+    except Exception as e:
+        return server_error_response(e)
+
+
+@manager.route("/documents/<doc_id>/download", methods=["GET"])  # noqa: F821
+@login_required
+@add_tenant_id_to_kwargs
+async def download_attachment(tenant_id=None, doc_id=None, attachment_id=None):
+    try:
+        # Keep backward compatibility with older callers and unit tests that still
+        # pass `attachment_id` instead of the route parameter name.
+        doc_id = doc_id or attachment_id
+        ext = request.args.get("ext", "markdown")
+        data = await thread_pool_exec(settings.STORAGE_IMPL.get, tenant_id, doc_id)
+        response = await make_response(data)
+        content_type = CONTENT_TYPE_MAP.get(ext, f"application/{ext}")
+        apply_safe_file_response_headers(response, content_type, ext)
+
+        return response
+
+    except Exception as e:
+        return server_error_response(e)
--- a/docs/references/http_api_reference.md
+++ b/docs/references/http_api_reference.md
@@ -6879,14 +6879,18 @@ Failure:
 ##### Request example

 ```bash
-```
+curl --request GET \
+     --url 'http://{address}/api/v1/documents/{doc_id}/download?ext=pdf' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --output ./downloaded_attachment.pdf
+```

 ##### Request parameters

 - `doc_id`: (*Path parameter*), `string`, *Required*
  The document ID whose attachment should be downloaded.
 - `ext`: (*Query parameter*), `string`, *Optional*
-  A file extension hint specifying the response's Content-Type. Defaults to `"markdown"`. Available values:
+  A file extension hint specifying the response's Content-Type. Defaults to `"markdown"`. Available values:
  - `"markdown"`
  - `"html"`
  - `"pdf"`
@@ -6896,15 +6900,15 @@ Downloads a runtime attachment previously uploaded via the [Upload document](#up

 #### Response

-Success:
+Success:

 Returns the file content as a binary stream with the relevant Content-Type header.

 Failure:

 ```json
-{
-    "code": 500,
+{
+    "code": 500,
    "message": "Internal server error"
 }
 ```
--- a/test/testcases/test_web_api/test_common.py
+++ b/test/testcases/test_web_api/test_common.py
@@ -26,7 +26,6 @@ from utils.file_utils import create_txt_file
 HEADERS = {"Content-Type": "application/json"}

 DATASETS_URL = f"/api/{VERSION}/datasets"
-DOCUMENT_APP_URL = f"/{VERSION}/document"
 CHUNK_APP_URL = f"/{VERSION}/chunk"
 CHUNK_API_URL = f"/api/{VERSION}/datasets/{{dataset_id}}/documents/{{document_id}}/chunks"
 # SESSION_WITH_CHAT_ASSISTANT_API_URL = "/api/v1/chats/{chat_id}/sessions"
@@ -404,10 +403,33 @@ def document_infos(auth, dataset_id, params=None, payload=None, *, headers=HEADE


 def document_metadata_summary(auth, payload=None, *, headers=HEADERS, data=None):
-    res = requests.post(url=f"{HOST_ADDRESS}{DOCUMENT_APP_URL}/metadata/summary", headers=headers, auth=auth, json=payload, data=data)
+    dataset_id = (payload or {}).get("kb_id")
+    doc_ids = (payload or {}).get("doc_ids")
+    if not dataset_id:
+        return {"code": 101, "message": "KB ID is required"}
+    params = {}
+    if doc_ids:
+        params["doc_ids"] = ",".join(doc_ids)
+    res = requests.get(url=f"{HOST_ADDRESS}{DATASETS_URL}/{dataset_id}/metadata/summary", headers=headers, auth=auth, params=params, data=data)
    return res.json()


+def document_get(auth, document_id, *, headers=HEADERS, data=None):
+    res = requests.get(url=f"{HOST_ADDRESS}/api/{VERSION}/documents/{document_id}/preview", headers=headers, auth=auth, data=data)
+    return res
+
+
+def document_download(auth, attachment_id, *, ext="markdown", headers=HEADERS, data=None):
+    res = requests.get(
+        url=f"{HOST_ADDRESS}/api/{VERSION}/documents/{attachment_id}/download",
+        headers=headers,
+        auth=auth,
+        params={"ext": ext},
+        data=data,
+    )
+    return res
+
+
 def document_metadata_update(auth, dataset_id, payload=None, *, headers=HEADERS, data=None):
    """New unified API for updating document metadata.

--- a/test/testcases/test_web_api/test_document_app/conftest.py
+++ b/test/testcases/test_web_api/test_document_app/conftest.py
@@ -126,11 +126,31 @@ def document_app_module(monkeypatch):
    monkeypatch.setitem(sys.modules, "xgboost", ModuleType("xgboost"))

    stub_apps = ModuleType("api.apps")
+    stub_apps.__path__ = [str(repo_root / "api" / "apps")]
    stub_apps.current_user = SimpleNamespace(id="user-1")
    stub_apps.login_required = lambda func: func
    monkeypatch.setitem(sys.modules, "api.apps", stub_apps)

-    module_path = repo_root / "api" / "apps" / "document_app.py"
+    stub_apps_services = ModuleType("api.apps.services")
+    stub_apps_services.__path__ = [str(repo_root / "api" / "apps" / "services")]
+    monkeypatch.setitem(sys.modules, "api.apps.services", stub_apps_services)
+
+    document_api_service_mod = ModuleType("api.apps.services.document_api_service")
+    document_api_service_mod.validate_document_update_fields = lambda *_args, **_kwargs: (None, None)
+    document_api_service_mod.map_doc_keys = lambda doc: doc.to_dict() if hasattr(doc, "to_dict") else doc
+
+    def _map_doc_keys_with_run_status(doc, run_status="0"):
+        payload = doc if isinstance(doc, dict) else doc.to_dict()
+        return {**payload, "run": run_status}
+
+    document_api_service_mod.map_doc_keys_with_run_status = _map_doc_keys_with_run_status
+    document_api_service_mod.update_document_name_only = lambda *_args, **_kwargs: None
+    document_api_service_mod.update_chunk_method = lambda *_args, **_kwargs: None
+    document_api_service_mod.update_document_status_only = lambda *_args, **_kwargs: None
+    document_api_service_mod.reset_document_for_reparse = lambda *_args, **_kwargs: None
+    monkeypatch.setitem(sys.modules, "api.apps.services.document_api_service", document_api_service_mod)
+
+    module_path = repo_root / "api" / "apps" / "restful_apis" / "document_api.py"
    spec = importlib.util.spec_from_file_location("test_document_app_unit", module_path)
    module = importlib.util.module_from_spec(spec)
    module.manager = _DummyManager()
--- a/test/testcases/test_web_api/test_document_app/test_document_metadata.py
+++ b/test/testcases/test_web_api/test_document_app/test_document_metadata.py
@@ -394,7 +394,7 @@ class TestDocumentMetadataUnit:
            "apply_safe_file_response_headers",
            lambda response, content_type, extension: response.headers.update({"content_type": content_type, "extension": extension}),
        )
-        res = _run(module.download_attachment("att1"))
+        res = _run(module.download_attachment(attachment_id="att1"))
        assert isinstance(res, _DummyResponse)
        assert res.data == b"attachment"
        assert res.headers["content_type"] == "application/abc"
@@ -405,7 +405,7 @@ class TestDocumentMetadataUnit:

        monkeypatch.setattr(module, "thread_pool_exec", raise_error)
        monkeypatch.setattr(module, "server_error_response", lambda e: {"code": 500, "message": str(e)})
-        res = _run(module.download_attachment("att1"))
+        res = _run(module.download_attachment(attachment_id="att1"))
        assert res["code"] == 500
        assert "download boom" in res["message"]

--- a/web/src/components/document-preview/hooks.ts
+++ b/web/src/components/document-preview/hooks.ts
@@ -1,7 +1,7 @@
 import { Authorization } from '@/constants/authorization';
 import { useGetKnowledgeSearchParams } from '@/hooks/route-hook';
 import { useGetPipelineResultSearchParams } from '@/pages/dataflow-result/hooks';
-import api, { webAPI } from '@/utils/api';
+import api, { restAPIv1 } from '@/utils/api';
 import { getAuthorization } from '@/utils/authorization-util';
 import jsPreviewExcel from '@js-preview/excel';
 import { useSize } from 'ahooks';
@@ -57,7 +57,7 @@ export const useGetDocumentUrl = (isAgent: boolean) => {
    if (isAgent) {
      return api.downloadFile + `?id=${id}&created_by=${createdBy}`;
    }
-    return `${webAPI}/document/get/${documentId}`;
+    return `${restAPIv1}/documents/${documentId}/preview`;
  }, [createdBy, documentId, id, isAgent]);

  return url;
--- a/web/src/hooks/use-document-request.ts
+++ b/web/src/hooks/use-document-request.ts
@@ -26,8 +26,7 @@ import kbService, {
  uploadDocument,
  webCrawlDocument,
 } from '@/services/knowledge-service';
-import { restAPIv1, webAPI } from '@/utils/api';
-import { getSearchValue } from '@/utils/common-util';
+import { restAPIv1 } from '@/utils/api';
 import { buildChunkHighlights } from '@/utils/document-util';
 import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query';
 import { useDebounce } from 'ahooks';
@@ -214,6 +213,7 @@ export const useGetDocumentFilter = (): {
  const { id } = useParams();
  const debouncedSearchString = useDebounce(searchString, { wait: 500 });
  const [open, setOpen] = useState<number>(0);
+  const datasetId = knowledgeId || id;
  const { data } = useQuery({
    queryKey: [
      DocumentApiAction.FetchDocumentFilter,
@@ -221,7 +221,10 @@ export const useGetDocumentFilter = (): {
      knowledgeId,
    ],
    queryFn: async () => {
-      const { data } = await documentFilter(knowledgeId || id);
+      if (!datasetId) {
+        return;
+      }
+      const { data } = await documentFilter(datasetId);
      if (data.code === 0) {
        return data.data;
      }
@@ -504,14 +507,11 @@ export const useCreateDocument = () => {
 };

 export const useGetDocumentUrl = (documentId?: string) => {
-  const auth = getSearchValue('auth');
  const getDocumentUrl = useCallback(
    (id?: string) => {
-      return auth
-        ? `${restAPIv1}/documents/${id || documentId}`
-        : `${webAPI}/document/get/${id || documentId}`;
+      return `${restAPIv1}/documents/${id || documentId}/preview`;
    },
-    [documentId, auth],
+    [documentId],
  );

  return getDocumentUrl;
--- a/web/src/pages/document-viewer/index.tsx
+++ b/web/src/pages/document-viewer/index.tsx
@@ -1,5 +1,5 @@
 import { Images } from '@/constants/common';
-import { restAPIv1, webAPI } from '@/utils/api';
+import { restAPIv1 } from '@/utils/api';
 import { useParams, useSearchParams } from 'react-router';
 // import Docx from './docx';
 // import Excel from './excel';
@@ -29,7 +29,7 @@ const DocumentViewer = () => {
  const api =
    resource === 'files'
      ? `${restAPIv1}/files/${documentId}`
-      : `${webAPI}/document/get/${documentId}`;
+      : `${restAPIv1}/documents/${documentId}/preview`;
  // request.head

  if (ext === 'html' && documentId) {
--- a/web/src/utils/api.ts
+++ b/web/src/utils/api.ts
@@ -127,9 +127,9 @@ export default {
  documentChangeParser: (datasetId: string, documentId: string) =>
    `${restAPIv1}/datasets/${datasetId}/documents/${documentId}`,
  documentThumbnails: `${restAPIv1}/thumbnails`,
-  getDocumentFile: `${webAPI}/document/get`,
+  getDocumentFile: `${restAPIv1}/documents`,
  getDocumentFileDownload: (docId: string) =>
-    `${webAPI}/document/download/${docId}`,
+    `${restAPIv1}/documents/${docId}/download`,
  documentUpload: (datasetId: string) =>
    `${restAPIv1}/datasets/${datasetId}/documents`,
  webCrawl: (datasetId: string) =>