Refa: migrate document preview/download to RESTful API (#14633)

### What problem does this PR solve?

Migrate the document preview and download endpoints to the RESTful API.

### Type of change
- [x] Refactoring
This commit is contained in:
buua436
2026-05-08 13:26:13 +08:00
committed by GitHub
parent 412fae7ac2
commit f703169117
11 changed files with 155 additions and 96 deletions

View File

@@ -29,6 +29,8 @@ Deprecated APIs and their replacements:
- POST /api/v1/file/convert -> POST /api/v1/files/link-to-datasets
- GET /api/v1/file/* -> GET /api/v1/files*
- POST /api/v1/file/* -> POST /api/v1/files*
- GET /api/v1/document/get/{doc_id} -> GET /api/v1/documents/{doc_id}/preview
- GET /api/v1/document/download/{doc_id} -> GET /api/v1/documents/{doc_id}/download
- POST /api/v1/sessions/related_questions -> POST /api/v1/chat/recommandation
- PUT (chunk update) -> PATCH (chunk update)
"""
@@ -394,6 +396,44 @@ async def deprecated_file_upload_info():
tenant_id = current_user.id
return await document_api.upload_info(tenant_id=tenant_id)
# =============================================================================
# Document APIs
# =============================================================================
@manager.route("/document/get/<doc_id>", methods=["GET"])
@login_required
async def deprecated_document_get(doc_id):
    """
    Deprecated: Use GET /api/v1/documents/{doc_id}/preview instead.

    Old path: GET /api/v1/document/get/{doc_id}
    New path: GET /api/v1/documents/{doc_id}/preview

    Logs a deprecation warning naming both paths, then delegates to
    ``document_api.get`` and returns whatever that handler returns.
    """
    # Kept as a lazy %-style format so logging does the interpolation.
    deprecation_notice = (
        "API endpoint /api/v1/document/get/%s is deprecated. "
        "Please use /api/v1/documents/%s/preview instead."
    )
    logging.warning(deprecation_notice, doc_id, doc_id)

    preview_response = await document_api.get(doc_id)
    return preview_response
@manager.route("/document/download/<doc_id>", methods=["GET"])
@login_required
async def deprecated_document_download(doc_id):
    """
    Deprecated: Use GET /api/v1/documents/{doc_id}/download instead.

    Old path: GET /api/v1/document/download/{doc_id}
    New path: GET /api/v1/documents/{doc_id}/download

    Logs a deprecation warning naming both paths, then delegates to
    ``document_api.download_attachment`` with ``doc_id`` passed by keyword.
    """
    # Kept as a lazy %-style format so logging does the interpolation.
    deprecation_notice = (
        "API endpoint /api/v1/document/download/%s is deprecated. "
        "Please use /api/v1/documents/%s/download instead."
    )
    logging.warning(deprecation_notice, doc_id, doc_id)

    download_response = await document_api.download_attachment(doc_id=doc_id)
    return download_response
# =============================================================================
# Agent Chat API
# =============================================================================

View File

@@ -1,71 +0,0 @@
#
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
#
import re
from quart import make_response, request
from api.apps import current_user, login_required
from api.db import FileType
from api.db.services.document_service import DocumentService
from api.db.services.file2document_service import File2DocumentService
from api.utils.api_utils import (
get_data_error_result,
server_error_response,
)
from api.utils.web_utils import CONTENT_TYPE_MAP, apply_safe_file_response_headers
from common import settings
from common.misc_utils import thread_pool_exec
@manager.route("/get/<doc_id>", methods=["GET"])  # noqa: F821
@login_required
async def get(doc_id):
    """Serve the stored file of a document (legacy ``/get/<doc_id>`` route).

    Args:
        doc_id: Document identifier taken from the URL path.

    Returns:
        The file response with headers set by
        ``apply_safe_file_response_headers``; a data-error result when the
        document lookup fails; a server-error result if anything raises.
    """
    try:
        exists, doc = DocumentService.get_by_id(doc_id)
        if not exists:
            return get_data_error_result(message="Document not found!")

        address = File2DocumentService.get_storage_address(doc_id=doc_id)
        first_part, second_part = address
        raw = await thread_pool_exec(settings.STORAGE_IMPL.get, first_part, second_part)
        response = await make_response(raw)

        # The text after the last dot of the lower-cased name is the extension.
        found = re.search(r"\.([^.]+)$", doc.name.lower())
        ext = None if found is None else found.group(1)

        content_type = None
        if ext is not None:
            # Extensions missing from the map fall back to "<prefix>/<ext>".
            if doc.type == FileType.VISUAL.value:
                prefix = "image"
            else:
                prefix = "application"
            content_type = CONTENT_TYPE_MAP.get(ext, f"{prefix}/{ext}")

        apply_safe_file_response_headers(response, content_type, ext)
        return response
    except Exception as exc:
        return server_error_response(exc)
@manager.route("/download/<attachment_id>", methods=["GET"])  # noqa: F821
@login_required
async def download_attachment(attachment_id):
    """Return an attachment read from storage under the current user's id.

    Args:
        attachment_id: Storage key taken from the URL path.

    The ``ext`` query parameter (default ``"markdown"``) selects the
    Content-Type through ``CONTENT_TYPE_MAP``; unknown values fall back to
    ``application/<ext>``.

    Returns:
        The file response, or a server-error result if anything raises.
    """
    try:
        extension = request.args.get("ext", "markdown")
        blob = await thread_pool_exec(settings.STORAGE_IMPL.get, current_user.id, attachment_id)
        reply = await make_response(blob)

        fallback_type = f"application/{extension}"
        mime_type = CONTENT_TYPE_MAP.get(extension, fallback_type)
        apply_safe_file_response_headers(reply, mime_type, extension)
        return reply
    except Exception as exc:
        return server_error_response(exc)

View File

@@ -33,6 +33,7 @@ from api.db.services import duplicate_name
from api.db.services.doc_metadata_service import DocMetadataService
from api.db.db_models import Task
from api.db.services.document_service import DocumentService
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.common.check_team_permission import check_kb_team_permission
@@ -48,7 +49,7 @@ from common.constants import ParserType, RetCode, TaskStatus, SANDBOX_ARTIFACT_B
from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema
from common.misc_utils import get_uuid, thread_pool_exec
from api.utils.file_utils import filename_type, thumbnail
from api.utils.web_utils import html2pdf, is_valid_url, apply_safe_file_response_headers
from api.utils.web_utils import CONTENT_TYPE_MAP, html2pdf, is_valid_url, apply_safe_file_response_headers
from common.ssrf_guard import assert_url_is_safe
from rag.nlp import search
@@ -1854,3 +1855,46 @@ async def batch_update_document_status(tenant_id, dataset_id):
if has_error:
return get_json_result(data=result, message="Partial failure", code=RetCode.SERVER_ERROR)
return get_json_result(data=result)
@manager.route("/documents/<doc_id>/preview", methods=["GET"])  # noqa: F821
@login_required
async def get(doc_id):
    """Return the stored file behind a document for preview.

    The document is looked up by ``doc_id``, its bytes are fetched from
    ``settings.STORAGE_IMPL`` via ``thread_pool_exec``, and the response
    headers are set by ``apply_safe_file_response_headers``.

    Args:
        doc_id: Document identifier taken from the URL path.

    Returns:
        The file response on success; a data-error result when the document
        lookup fails; a server-error result if any exception is raised.
    """
    # NOTE(review): only login is enforced in this handler; nothing visible
    # here checks that the caller may read this particular document --
    # confirm that access control is applied elsewhere.
    try:
        found, doc = DocumentService.get_by_id(doc_id)
        if not found:
            return get_data_error_result(message="Document not found!")

        bucket, location = File2DocumentService.get_storage_address(doc_id=doc_id)
        payload = await thread_pool_exec(settings.STORAGE_IMPL.get, bucket, location)
        response = await make_response(payload)

        # The extension is whatever follows the last dot of the lower-cased name.
        suffix_match = re.search(r"\.([^.]+)$", doc.name.lower())
        if suffix_match is None:
            ext = None
            content_type = None
        else:
            ext = suffix_match.group(1)
            # Unmapped extensions fall back to "image/<ext>" for visual
            # documents and "application/<ext>" for everything else.
            is_visual = doc.type == FileType.VISUAL.value
            fallback_prefix = "image" if is_visual else "application"
            content_type = CONTENT_TYPE_MAP.get(ext, f"{fallback_prefix}/{ext}")

        apply_safe_file_response_headers(response, content_type, ext)
        return response
    except Exception as exc:
        return server_error_response(exc)
@manager.route("/documents/<doc_id>/download", methods=["GET"])  # noqa: F821
@login_required
@add_tenant_id_to_kwargs
async def download_attachment(tenant_id=None, doc_id=None, attachment_id=None):
    """Return an attachment read from storage under ``tenant_id``.

    Args:
        tenant_id: First argument passed to ``settings.STORAGE_IMPL.get``.
            NOTE(review): presumably injected by ``add_tenant_id_to_kwargs``
            -- confirm against that decorator.
        doc_id: Storage key taken from the URL path.
        attachment_id: Legacy alias for ``doc_id``; used only when ``doc_id``
            is falsy.

    The ``ext`` query parameter (default ``"markdown"``) selects the
    Content-Type through ``CONTENT_TYPE_MAP``; unknown values fall back to
    ``application/<ext>``.

    Returns:
        The file response, or a server-error result if anything raises.
    """
    try:
        # Backward compatibility: older callers and unit tests still pass
        # `attachment_id` rather than the route parameter name.
        if not doc_id:
            doc_id = attachment_id

        extension = request.args.get("ext", "markdown")
        blob = await thread_pool_exec(settings.STORAGE_IMPL.get, tenant_id, doc_id)
        reply = await make_response(blob)

        fallback_type = f"application/{extension}"
        mime_type = CONTENT_TYPE_MAP.get(extension, fallback_type)
        apply_safe_file_response_headers(reply, mime_type, extension)
        return reply
    except Exception as exc:
        return server_error_response(exc)

View File

@@ -6879,14 +6879,18 @@ Failure:
##### Request example
```bash
```
curl --request GET \
--url 'http://{address}/api/v1/documents/{doc_id}/download?ext=pdf' \
--header 'Authorization: Bearer <YOUR_API_KEY>' \
--output ./downloaded_attachment.pdf
```
##### Request parameters
- `doc_id`: (*Path parameter*), `string`, *Required*
The document ID whose attachment should be downloaded.
- `ext`: (*Query parameter*), `string`, *Optional*
A file extension hint specifying the response's Content-Type. Defaults to `"markdown"`. Available values:
A file extension hint specifying the response's Content-Type. Defaults to `"markdown"`. Available values:
- `"markdown"`
- `"html"`
- `"pdf"`
@@ -6896,15 +6900,15 @@ Downloads a runtime attachment previously uploaded via the [Upload document](#up
#### Response
Success:
Success:
Returns the file content as a binary stream with the relevant Content-Type header.
Failure:
```json
{
"code": 500,
{
"code": 500,
"message": "Internal server error"
}
```

View File

@@ -26,7 +26,6 @@ from utils.file_utils import create_txt_file
HEADERS = {"Content-Type": "application/json"}
DATASETS_URL = f"/api/{VERSION}/datasets"
DOCUMENT_APP_URL = f"/{VERSION}/document"
CHUNK_APP_URL = f"/{VERSION}/chunk"
CHUNK_API_URL = f"/api/{VERSION}/datasets/{{dataset_id}}/documents/{{document_id}}/chunks"
# SESSION_WITH_CHAT_ASSISTANT_API_URL = "/api/v1/chats/{chat_id}/sessions"
@@ -404,10 +403,33 @@ def document_infos(auth, dataset_id, params=None, payload=None, *, headers=HEADE
def document_metadata_summary(auth, payload=None, *, headers=HEADERS, data=None):
res = requests.post(url=f"{HOST_ADDRESS}{DOCUMENT_APP_URL}/metadata/summary", headers=headers, auth=auth, json=payload, data=data)
dataset_id = (payload or {}).get("kb_id")
doc_ids = (payload or {}).get("doc_ids")
if not dataset_id:
return {"code": 101, "message": "KB ID is required"}
params = {}
if doc_ids:
params["doc_ids"] = ",".join(doc_ids)
res = requests.get(url=f"{HOST_ADDRESS}{DATASETS_URL}/{dataset_id}/metadata/summary", headers=headers, auth=auth, params=params, data=data)
return res.json()
def document_get(auth, document_id, *, headers=HEADERS, data=None):
    """Send GET ``/api/{VERSION}/documents/{document_id}/preview``.

    Returns the raw ``requests`` response object (not parsed JSON), so callers
    can inspect the status, headers and body themselves.
    """
    preview_url = f"{HOST_ADDRESS}/api/{VERSION}/documents/{document_id}/preview"
    return requests.get(url=preview_url, headers=headers, auth=auth, data=data)
def document_download(auth, attachment_id, *, ext="markdown", headers=HEADERS, data=None):
    """Send GET ``/api/{VERSION}/documents/{attachment_id}/download``.

    ``ext`` is sent as the ``ext`` query parameter. Returns the raw
    ``requests`` response object (not parsed JSON).
    """
    download_url = f"{HOST_ADDRESS}/api/{VERSION}/documents/{attachment_id}/download"
    query = {"ext": ext}
    return requests.get(url=download_url, headers=headers, auth=auth, params=query, data=data)
def document_metadata_update(auth, dataset_id, payload=None, *, headers=HEADERS, data=None):
"""New unified API for updating document metadata.

View File

@@ -126,11 +126,31 @@ def document_app_module(monkeypatch):
monkeypatch.setitem(sys.modules, "xgboost", ModuleType("xgboost"))
stub_apps = ModuleType("api.apps")
stub_apps.__path__ = [str(repo_root / "api" / "apps")]
stub_apps.current_user = SimpleNamespace(id="user-1")
stub_apps.login_required = lambda func: func
monkeypatch.setitem(sys.modules, "api.apps", stub_apps)
module_path = repo_root / "api" / "apps" / "document_app.py"
stub_apps_services = ModuleType("api.apps.services")
stub_apps_services.__path__ = [str(repo_root / "api" / "apps" / "services")]
monkeypatch.setitem(sys.modules, "api.apps.services", stub_apps_services)
document_api_service_mod = ModuleType("api.apps.services.document_api_service")
document_api_service_mod.validate_document_update_fields = lambda *_args, **_kwargs: (None, None)
document_api_service_mod.map_doc_keys = lambda doc: doc.to_dict() if hasattr(doc, "to_dict") else doc
def _map_doc_keys_with_run_status(doc, run_status="0"):
    """Return a new dict of the document's fields with ``"run"`` set.

    ``doc`` is either a dict, used as-is, or an object exposing ``to_dict()``.
    The input mapping is not mutated; an existing ``"run"`` key is overwritten
    with ``run_status``.
    """
    if isinstance(doc, dict):
        fields = doc
    else:
        fields = doc.to_dict()
    mapped = {**fields}
    mapped["run"] = run_status
    return mapped
document_api_service_mod.map_doc_keys_with_run_status = _map_doc_keys_with_run_status
document_api_service_mod.update_document_name_only = lambda *_args, **_kwargs: None
document_api_service_mod.update_chunk_method = lambda *_args, **_kwargs: None
document_api_service_mod.update_document_status_only = lambda *_args, **_kwargs: None
document_api_service_mod.reset_document_for_reparse = lambda *_args, **_kwargs: None
monkeypatch.setitem(sys.modules, "api.apps.services.document_api_service", document_api_service_mod)
module_path = repo_root / "api" / "apps" / "restful_apis" / "document_api.py"
spec = importlib.util.spec_from_file_location("test_document_app_unit", module_path)
module = importlib.util.module_from_spec(spec)
module.manager = _DummyManager()

View File

@@ -394,7 +394,7 @@ class TestDocumentMetadataUnit:
"apply_safe_file_response_headers",
lambda response, content_type, extension: response.headers.update({"content_type": content_type, "extension": extension}),
)
res = _run(module.download_attachment("att1"))
res = _run(module.download_attachment(attachment_id="att1"))
assert isinstance(res, _DummyResponse)
assert res.data == b"attachment"
assert res.headers["content_type"] == "application/abc"
@@ -405,7 +405,7 @@ class TestDocumentMetadataUnit:
monkeypatch.setattr(module, "thread_pool_exec", raise_error)
monkeypatch.setattr(module, "server_error_response", lambda e: {"code": 500, "message": str(e)})
res = _run(module.download_attachment("att1"))
res = _run(module.download_attachment(attachment_id="att1"))
assert res["code"] == 500
assert "download boom" in res["message"]

View File

@@ -1,7 +1,7 @@
import { Authorization } from '@/constants/authorization';
import { useGetKnowledgeSearchParams } from '@/hooks/route-hook';
import { useGetPipelineResultSearchParams } from '@/pages/dataflow-result/hooks';
import api, { webAPI } from '@/utils/api';
import api, { restAPIv1 } from '@/utils/api';
import { getAuthorization } from '@/utils/authorization-util';
import jsPreviewExcel from '@js-preview/excel';
import { useSize } from 'ahooks';
@@ -57,7 +57,7 @@ export const useGetDocumentUrl = (isAgent: boolean) => {
if (isAgent) {
return api.downloadFile + `?id=${id}&created_by=${createdBy}`;
}
return `${webAPI}/document/get/${documentId}`;
return `${restAPIv1}/documents/${documentId}/preview`;
}, [createdBy, documentId, id, isAgent]);
return url;

View File

@@ -26,8 +26,7 @@ import kbService, {
uploadDocument,
webCrawlDocument,
} from '@/services/knowledge-service';
import { restAPIv1, webAPI } from '@/utils/api';
import { getSearchValue } from '@/utils/common-util';
import { restAPIv1 } from '@/utils/api';
import { buildChunkHighlights } from '@/utils/document-util';
import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query';
import { useDebounce } from 'ahooks';
@@ -214,6 +213,7 @@ export const useGetDocumentFilter = (): {
const { id } = useParams();
const debouncedSearchString = useDebounce(searchString, { wait: 500 });
const [open, setOpen] = useState<number>(0);
const datasetId = knowledgeId || id;
const { data } = useQuery({
queryKey: [
DocumentApiAction.FetchDocumentFilter,
@@ -221,7 +221,10 @@ export const useGetDocumentFilter = (): {
knowledgeId,
],
queryFn: async () => {
const { data } = await documentFilter(knowledgeId || id);
if (!datasetId) {
return;
}
const { data } = await documentFilter(datasetId);
if (data.code === 0) {
return data.data;
}
@@ -504,14 +507,11 @@ export const useCreateDocument = () => {
};
export const useGetDocumentUrl = (documentId?: string) => {
const auth = getSearchValue('auth');
const getDocumentUrl = useCallback(
(id?: string) => {
return auth
? `${restAPIv1}/documents/${id || documentId}`
: `${webAPI}/document/get/${id || documentId}`;
return `${restAPIv1}/documents/${id || documentId}/preview`;
},
[documentId, auth],
[documentId],
);
return getDocumentUrl;

View File

@@ -1,5 +1,5 @@
import { Images } from '@/constants/common';
import { restAPIv1, webAPI } from '@/utils/api';
import { restAPIv1 } from '@/utils/api';
import { useParams, useSearchParams } from 'react-router';
// import Docx from './docx';
// import Excel from './excel';
@@ -29,7 +29,7 @@ const DocumentViewer = () => {
const api =
resource === 'files'
? `${restAPIv1}/files/${documentId}`
: `${webAPI}/document/get/${documentId}`;
: `${restAPIv1}/documents/${documentId}/preview`;
// request.head
if (ext === 'html' && documentId) {

View File

@@ -127,9 +127,9 @@ export default {
documentChangeParser: (datasetId: string, documentId: string) =>
`${restAPIv1}/datasets/${datasetId}/documents/${documentId}`,
documentThumbnails: `${restAPIv1}/thumbnails`,
getDocumentFile: `${webAPI}/document/get`,
getDocumentFile: `${restAPIv1}/documents`,
getDocumentFileDownload: (docId: string) =>
`${webAPI}/document/download/${docId}`,
`${restAPIv1}/documents/${docId}/download`,
documentUpload: (datasetId: string) =>
`${restAPIv1}/datasets/${datasetId}/documents`,
webCrawl: (datasetId: string) =>