Refactor: migrate doc upload info used in chat (#14359)

### What problem does this PR solve?

Before migration: POST /v1/document/upload_info/
After migration: POST /api/v1/documentss/upload/

### Type of change

- [x] Refactoring
This commit is contained in:
Jack
2026-04-27 16:58:42 +08:00
committed by GitHub
parent c446c403de
commit 61a24a2c14
7 changed files with 165 additions and 116 deletions

View File

@@ -41,7 +41,6 @@ from common import settings
from common.constants import RetCode, TaskStatus
from common.file_utils import get_project_base_directory
from common.misc_utils import thread_pool_exec
from common.ssrf_guard import assert_url_is_safe
from deepdoc.parser.html_parser import RAGFlowHtmlParser
from rag.nlp import search
@@ -214,7 +213,6 @@ async def run():
except Exception as e:
return server_error_response(e)
@manager.route("/get/<doc_id>", methods=["GET"]) # noqa: F821
@login_required
async def get(doc_id):
@@ -400,38 +398,3 @@ async def parse():
txt = FileService.parse_docs(file_objs, current_user.id)
return get_json_result(data=txt)
@manager.route("/upload_info", methods=["POST"]) # noqa: F821
@login_required
async def upload_info():
files = await request.files
file_objs = files.getlist("file") if files and files.get("file") else []
url = request.args.get("url")
if file_objs and url:
return get_json_result(
data=False,
message="Provide either multipart file(s) or ?url=..., not both.",
code=RetCode.BAD_REQUEST,
)
if not file_objs and not url:
return get_json_result(
data=False,
message="Missing input: provide multipart file(s) or url",
code=RetCode.BAD_REQUEST,
)
try:
if url and not file_objs:
assert_url_is_safe(url)
return get_json_result(data=FileService.upload_info(current_user.id, None, url))
if len(file_objs) == 1:
return get_json_result(data=FileService.upload_info(current_user.id, file_objs[0], None))
results = [FileService.upload_info(current_user.id, f, None) for f in file_objs]
return get_json_result(data=results)
except Exception as e:
return server_error_response(e)

View File

@@ -45,11 +45,76 @@ from common import settings
from common.constants import ParserType, RetCode, SANDBOX_ARTIFACT_BUCKET, TaskStatus
from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema
from common.misc_utils import get_uuid, thread_pool_exec
from common.ssrf_guard import assert_url_is_safe
from api.utils.file_utils import filename_type, thumbnail
from api.utils.web_utils import html2pdf, is_valid_url
from rag.nlp import search
from api.utils.web_utils import apply_safe_file_response_headers
@manager.route("/documents/upload", methods=["POST"]) # noqa: F821
@login_required
@add_tenant_id_to_kwargs
async def upload_info(tenant_id: str):
"""
Upload a document and get its parsed info.
---
tags:
- Documents
security:
- ApiKeyAuth: []
parameters:
- in: header
name: Authorization
type: string
required: true
description: Bearer token for authentication.
- in: formData
name: file
type: file
required: false
description: File to upload.
- in: query
name: url
type: string
required: false
description: URL to fetch file from.
responses:
200:
description: Successful operation.
"""
files = await request.files
file_objs = files.getlist("file") if files and files.get("file") else []
url = request.args.get("url")
if file_objs and url:
return get_error_argument_result("Provide either multipart file(s) or ?url=..., not both.")
if not file_objs and not url:
return get_error_argument_result("Missing input: provide multipart file(s) or url")
try:
if url and not file_objs:
try:
assert_url_is_safe(url)
except ValueError as ve:
logging.warning("upload_info: rejected unsafe url: %s", ve)
return get_error_argument_result(str(ve))
data = await thread_pool_exec(FileService.upload_info, tenant_id, None, url)
return get_result(data=data)
if len(file_objs) == 1:
data = await thread_pool_exec(FileService.upload_info, tenant_id, file_objs[0], None)
return get_result(data=data)
results = [await thread_pool_exec(FileService.upload_info, tenant_id, f, None) for f in file_objs]
return get_result(data=results)
except Exception as e:
logging.exception("upload_info failed")
return server_error_response(e)
@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["PATCH"]) # noqa: F821
@login_required
@add_tenant_id_to_kwargs

View File

@@ -327,6 +327,53 @@ def upload_documents(auth, payload=None, files_path=None, *, filename_override=N
f.close()
def upload_info(auth, files_path=None, *, url=None):
"""
Call the /api/v1/documents/upload endpoint to get upload info.
This is used to get file metadata before actually uploading to a dataset.
Args:
auth: Authentication object
files_path: List of file paths to upload (optional)
url: URL to fetch file from (optional, can be used alone or with files_path to test mixed input rejection)
Returns:
Response JSON with upload info
"""
url_endpoint = f"{HOST_ADDRESS}/api/{VERSION}/documents/upload"
fields = []
file_objects = []
try:
if files_path:
for fp in files_path:
p = Path(fp)
f = p.open("rb")
fields.append(("file", (p.name, f)))
file_objects.append(f)
# Add url as query parameter if provided
if url:
url_endpoint = f"{url_endpoint}?url={url}"
# Handle empty fields (no files) - create empty MultipartEncoder
if not fields:
fields = [("empty", ("", ""))]
m = MultipartEncoder(fields=fields)
res = requests.post(
url=url_endpoint,
headers={"Content-Type": m.content_type},
auth=auth,
data=m,
)
return res.json()
finally:
for f in file_objects:
f.close()
def create_document(auth, payload=None, *, headers=HEADERS, data=None):
kb_id = payload.get("kb_id") if payload else None
request_payload = dict(payload or {})

View File

@@ -15,12 +15,12 @@
#
import asyncio
from pathlib import Path
import importlib.util
import sys
from types import ModuleType
import pytest
from test_common import upload_info
from configs import INVALID_API_TOKEN
from libs.auth import RAGFlowWebApiAuth
from utils.file_utils import create_txt_file
class _AwaitableValue:
@@ -61,81 +61,55 @@ def _run(coro):
return asyncio.run(coro)
def _load_document_app_module(monkeypatch):
repo_root = Path(__file__).resolve().parents[4]
common_mod = ModuleType("common")
common_mod.bulk_upload_documents = lambda *_args, **_kwargs: []
common_mod.delete_document = lambda *_args, **_kwargs: None
common_mod.list_documents = lambda *_args, **_kwargs: {"data": {"docs": []}}
monkeypatch.setitem(sys.modules, "common", common_mod)
module_path = repo_root / "test" / "testcases" / "test_web_api" / "test_document_app" / "conftest.py"
spec = importlib.util.spec_from_file_location("test_document_app_unit_conftest", module_path)
module = importlib.util.module_from_spec(spec)
sys.modules["test_document_app_unit_conftest"] = module
spec.loader.exec_module(module)
return module.document_app_module.__wrapped__(monkeypatch)
# ============================================================================
# End-to-End Tests
# ============================================================================
@pytest.mark.p2
def test_upload_info_rejects_mixed_inputs(monkeypatch):
module = _load_document_app_module(monkeypatch)
monkeypatch.setattr(module, "assert_url_is_safe", lambda url: ("example.com", "93.184.216.34"))
files = _DummyFiles({"file": [_DummyFile("a.txt")]})
monkeypatch.setattr(module, "request", _DummyRequest(files=files, args={"url": "https://example.com/a.txt"}))
class TestUploadInfoE2E:
"""End-to-end tests for the /api/v1/documents/upload endpoint"""
res = _run(module.upload_info())
assert res["code"] == module.RetCode.BAD_REQUEST
assert "not both" in res["message"]
def test_upload_info_requires_file_or_url_e2e(self, WebApiAuth):
"""Test that missing both file and url returns error"""
# Call without files and without url
res = upload_info(WebApiAuth)
assert res["code"] == 101, res
assert "Missing input" in res["message"] or "file" in res["message"].lower() or "url" in res["message"].lower()
def test_upload_info_rejects_mixed_inputs_e2e(self, WebApiAuth, tmp_path):
"""Test that providing both file and url returns error"""
# Create a file
fp = create_txt_file(tmp_path / "test.txt")
@pytest.mark.p2
def test_upload_info_requires_file_or_url(monkeypatch):
module = _load_document_app_module(monkeypatch)
monkeypatch.setattr(module, "request", _DummyRequest(files=_DummyFiles()))
# Call with both file and url - the API should reject this
res = upload_info(WebApiAuth, files_path=[fp], url="https://example.com/test.txt")
# The API should return an error when both file and url are provided
assert res["code"] == 101, res
assert "not both" in res["message"].lower() and "either" in res["message"].lower()
res = _run(module.upload_info())
assert res["code"] == module.RetCode.BAD_REQUEST
assert "Missing input" in res["message"]
def test_upload_info_supports_url_single_and_multiple_files_e2e(self, WebApiAuth, tmp_path):
"""Test that the endpoint supports URL, single file, and multiple files"""
# Test with URL
# Note: Using a real URL might fail if the URL is not accessible
# For E2E testing, we test with actual file uploads
# Test with single file
fp1 = create_txt_file(tmp_path / "single_file.txt")
res = upload_info(WebApiAuth, files_path=[fp1])
assert res["code"] == 0, res
assert "data" in res, res
@pytest.mark.p2
def test_upload_info_supports_url_single_and_multiple_files(monkeypatch):
module = _load_document_app_module(monkeypatch)
monkeypatch.setattr(module, "assert_url_is_safe", lambda url: ("example.com", "93.184.216.34"))
captured = []
# Test with multiple files
fp2 = create_txt_file(tmp_path / "file_a.txt")
fp3 = create_txt_file(tmp_path / "file_b.txt")
res = upload_info(WebApiAuth, files_path=[fp2, fp3])
assert res["code"] == 0, res
assert "data" in res, res
# Should return a list for multiple files
if isinstance(res["data"], list):
assert len(res["data"]) == 2, res
def fake_upload_info(user_id, file_obj, url=None):
captured.append((user_id, getattr(file_obj, "filename", None), url))
if url is not None:
return {"kind": "url", "value": url}
return {"kind": "file", "value": file_obj.filename}
monkeypatch.setattr(module.FileService, "upload_info", fake_upload_info)
monkeypatch.setattr(module, "request", _DummyRequest(files=_DummyFiles(), args={"url": "https://example.com/a.txt"}))
res = _run(module.upload_info())
assert res["code"] == 0
assert res["data"] == {"kind": "url", "value": "https://example.com/a.txt"}
monkeypatch.setattr(module, "request", _DummyRequest(files=_DummyFiles({"file": _DummyFile("single.txt")})))
res = _run(module.upload_info())
assert res["code"] == 0
assert res["data"] == {"kind": "file", "value": "single.txt"}
monkeypatch.setattr(
module,
"request",
_DummyRequest(files=_DummyFiles({"file": [_DummyFile("a.txt"), _DummyFile("b.txt")]})),
)
res = _run(module.upload_info())
assert res["code"] == 0
assert res["data"] == [
{"kind": "file", "value": "a.txt"},
{"kind": "file", "value": "b.txt"},
]
assert captured == [
("user-1", None, "https://example.com/a.txt"),
("user-1", "single.txt", None),
("user-1", "a.txt", None),
("user-1", "b.txt", None),
]
def test_upload_info_invalid_auth(self):
"""Test that invalid authentication returns error"""
res = upload_info(RAGFlowWebApiAuth(INVALID_API_TOKEN), files_path=[])
assert res["code"] == 401, res

View File

@@ -492,9 +492,9 @@ export function useUploadAndParseFile() {
formData.append('file', file);
formData.append('conversation_id', conversationId || id);
const { data } = await chatService.uploadAndParse(
const { data } = await chatService.documentInfoUpload(
{
url: api.uploadAndParse,
url: api.documentInfoUpload,
signal: controller.current.signal,
data: formData,
onUploadProgress: ({ progress }) => {

View File

@@ -19,7 +19,7 @@ const {
chatsTts,
chatsMindmap,
chatsRelatedQuestions,
uploadAndParse,
documentInfoUpload,
fetchExternalChatInfo,
} = api;
@@ -92,9 +92,9 @@ const methods = {
url: chatsRelatedQuestions,
method: 'post',
},
uploadAndParse: {
documentInfoUpload: {
method: 'post',
url: uploadAndParse,
url: documentInfoUpload,
},
fetchExternalChatInfo: {
url: fetchExternalChatInfo,

View File

@@ -130,7 +130,7 @@ export default {
`${restAPIv1}/datasets/${datasetId}/documents`,
webCrawl: (datasetId: string) =>
`${restAPIv1}/datasets/${datasetId}/documents?type=web`,
uploadAndParse: `${webAPI}/document/upload_info`,
documentInfoUpload: `${restAPIv1}/documents/upload`,
setMeta: `${webAPI}/document/set_meta`,
getDatasetFilter: (datasetId: string) =>
`${restAPIv1}/datasets/${datasetId}/documents?type=filter`,