mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-04 01:29:35 +08:00
Refactor: migrate doc upload info used in chat (#14359)
### What problem does this PR solve? Before migration: POST /v1/document/upload_info/ After migration: POST /api/v1/documentss/upload/ ### Type of change - [x] Refactoring
This commit is contained in:
@@ -41,7 +41,6 @@ from common import settings
|
||||
from common.constants import RetCode, TaskStatus
|
||||
from common.file_utils import get_project_base_directory
|
||||
from common.misc_utils import thread_pool_exec
|
||||
from common.ssrf_guard import assert_url_is_safe
|
||||
from deepdoc.parser.html_parser import RAGFlowHtmlParser
|
||||
from rag.nlp import search
|
||||
|
||||
@@ -214,7 +213,6 @@ async def run():
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
|
||||
|
||||
@manager.route("/get/<doc_id>", methods=["GET"]) # noqa: F821
|
||||
@login_required
|
||||
async def get(doc_id):
|
||||
@@ -400,38 +398,3 @@ async def parse():
|
||||
txt = FileService.parse_docs(file_objs, current_user.id)
|
||||
|
||||
return get_json_result(data=txt)
|
||||
|
||||
|
||||
@manager.route("/upload_info", methods=["POST"]) # noqa: F821
|
||||
@login_required
|
||||
async def upload_info():
|
||||
files = await request.files
|
||||
file_objs = files.getlist("file") if files and files.get("file") else []
|
||||
url = request.args.get("url")
|
||||
|
||||
if file_objs and url:
|
||||
return get_json_result(
|
||||
data=False,
|
||||
message="Provide either multipart file(s) or ?url=..., not both.",
|
||||
code=RetCode.BAD_REQUEST,
|
||||
)
|
||||
|
||||
if not file_objs and not url:
|
||||
return get_json_result(
|
||||
data=False,
|
||||
message="Missing input: provide multipart file(s) or url",
|
||||
code=RetCode.BAD_REQUEST,
|
||||
)
|
||||
|
||||
try:
|
||||
if url and not file_objs:
|
||||
assert_url_is_safe(url)
|
||||
return get_json_result(data=FileService.upload_info(current_user.id, None, url))
|
||||
|
||||
if len(file_objs) == 1:
|
||||
return get_json_result(data=FileService.upload_info(current_user.id, file_objs[0], None))
|
||||
|
||||
results = [FileService.upload_info(current_user.id, f, None) for f in file_objs]
|
||||
return get_json_result(data=results)
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
|
||||
@@ -45,11 +45,76 @@ from common import settings
|
||||
from common.constants import ParserType, RetCode, SANDBOX_ARTIFACT_BUCKET, TaskStatus
|
||||
from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema
|
||||
from common.misc_utils import get_uuid, thread_pool_exec
|
||||
from common.ssrf_guard import assert_url_is_safe
|
||||
from api.utils.file_utils import filename_type, thumbnail
|
||||
from api.utils.web_utils import html2pdf, is_valid_url
|
||||
from rag.nlp import search
|
||||
from api.utils.web_utils import apply_safe_file_response_headers
|
||||
|
||||
|
||||
@manager.route("/documents/upload", methods=["POST"]) # noqa: F821
|
||||
@login_required
|
||||
@add_tenant_id_to_kwargs
|
||||
async def upload_info(tenant_id: str):
|
||||
"""
|
||||
Upload a document and get its parsed info.
|
||||
---
|
||||
tags:
|
||||
- Documents
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
parameters:
|
||||
- in: header
|
||||
name: Authorization
|
||||
type: string
|
||||
required: true
|
||||
description: Bearer token for authentication.
|
||||
- in: formData
|
||||
name: file
|
||||
type: file
|
||||
required: false
|
||||
description: File to upload.
|
||||
- in: query
|
||||
name: url
|
||||
type: string
|
||||
required: false
|
||||
description: URL to fetch file from.
|
||||
responses:
|
||||
200:
|
||||
description: Successful operation.
|
||||
"""
|
||||
files = await request.files
|
||||
file_objs = files.getlist("file") if files and files.get("file") else []
|
||||
url = request.args.get("url")
|
||||
|
||||
if file_objs and url:
|
||||
return get_error_argument_result("Provide either multipart file(s) or ?url=..., not both.")
|
||||
|
||||
if not file_objs and not url:
|
||||
return get_error_argument_result("Missing input: provide multipart file(s) or url")
|
||||
|
||||
try:
|
||||
if url and not file_objs:
|
||||
try:
|
||||
assert_url_is_safe(url)
|
||||
except ValueError as ve:
|
||||
logging.warning("upload_info: rejected unsafe url: %s", ve)
|
||||
return get_error_argument_result(str(ve))
|
||||
|
||||
data = await thread_pool_exec(FileService.upload_info, tenant_id, None, url)
|
||||
return get_result(data=data)
|
||||
|
||||
if len(file_objs) == 1:
|
||||
data = await thread_pool_exec(FileService.upload_info, tenant_id, file_objs[0], None)
|
||||
return get_result(data=data)
|
||||
|
||||
results = [await thread_pool_exec(FileService.upload_info, tenant_id, f, None) for f in file_objs]
|
||||
return get_result(data=results)
|
||||
except Exception as e:
|
||||
logging.exception("upload_info failed")
|
||||
return server_error_response(e)
|
||||
|
||||
|
||||
@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["PATCH"]) # noqa: F821
|
||||
@login_required
|
||||
@add_tenant_id_to_kwargs
|
||||
|
||||
@@ -327,6 +327,53 @@ def upload_documents(auth, payload=None, files_path=None, *, filename_override=N
|
||||
f.close()
|
||||
|
||||
|
||||
def upload_info(auth, files_path=None, *, url=None):
|
||||
"""
|
||||
Call the /api/v1/documents/upload endpoint to get upload info.
|
||||
This is used to get file metadata before actually uploading to a dataset.
|
||||
|
||||
Args:
|
||||
auth: Authentication object
|
||||
files_path: List of file paths to upload (optional)
|
||||
url: URL to fetch file from (optional, can be used alone or with files_path to test mixed input rejection)
|
||||
|
||||
Returns:
|
||||
Response JSON with upload info
|
||||
"""
|
||||
url_endpoint = f"{HOST_ADDRESS}/api/{VERSION}/documents/upload"
|
||||
|
||||
fields = []
|
||||
file_objects = []
|
||||
try:
|
||||
if files_path:
|
||||
for fp in files_path:
|
||||
p = Path(fp)
|
||||
f = p.open("rb")
|
||||
fields.append(("file", (p.name, f)))
|
||||
file_objects.append(f)
|
||||
|
||||
# Add url as query parameter if provided
|
||||
if url:
|
||||
url_endpoint = f"{url_endpoint}?url={url}"
|
||||
|
||||
# Handle empty fields (no files) - create empty MultipartEncoder
|
||||
if not fields:
|
||||
fields = [("empty", ("", ""))]
|
||||
|
||||
m = MultipartEncoder(fields=fields)
|
||||
|
||||
res = requests.post(
|
||||
url=url_endpoint,
|
||||
headers={"Content-Type": m.content_type},
|
||||
auth=auth,
|
||||
data=m,
|
||||
)
|
||||
return res.json()
|
||||
finally:
|
||||
for f in file_objects:
|
||||
f.close()
|
||||
|
||||
|
||||
def create_document(auth, payload=None, *, headers=HEADERS, data=None):
|
||||
kb_id = payload.get("kb_id") if payload else None
|
||||
request_payload = dict(payload or {})
|
||||
|
||||
@@ -15,12 +15,12 @@
|
||||
#
|
||||
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
import importlib.util
|
||||
import sys
|
||||
from types import ModuleType
|
||||
|
||||
import pytest
|
||||
from test_common import upload_info
|
||||
from configs import INVALID_API_TOKEN
|
||||
from libs.auth import RAGFlowWebApiAuth
|
||||
from utils.file_utils import create_txt_file
|
||||
|
||||
|
||||
class _AwaitableValue:
|
||||
@@ -61,81 +61,55 @@ def _run(coro):
|
||||
return asyncio.run(coro)
|
||||
|
||||
|
||||
def _load_document_app_module(monkeypatch):
|
||||
repo_root = Path(__file__).resolve().parents[4]
|
||||
common_mod = ModuleType("common")
|
||||
common_mod.bulk_upload_documents = lambda *_args, **_kwargs: []
|
||||
common_mod.delete_document = lambda *_args, **_kwargs: None
|
||||
common_mod.list_documents = lambda *_args, **_kwargs: {"data": {"docs": []}}
|
||||
monkeypatch.setitem(sys.modules, "common", common_mod)
|
||||
module_path = repo_root / "test" / "testcases" / "test_web_api" / "test_document_app" / "conftest.py"
|
||||
spec = importlib.util.spec_from_file_location("test_document_app_unit_conftest", module_path)
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
sys.modules["test_document_app_unit_conftest"] = module
|
||||
spec.loader.exec_module(module)
|
||||
return module.document_app_module.__wrapped__(monkeypatch)
|
||||
|
||||
# ============================================================================
|
||||
# End-to-End Tests
|
||||
# ============================================================================
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_upload_info_rejects_mixed_inputs(monkeypatch):
|
||||
module = _load_document_app_module(monkeypatch)
|
||||
monkeypatch.setattr(module, "assert_url_is_safe", lambda url: ("example.com", "93.184.216.34"))
|
||||
files = _DummyFiles({"file": [_DummyFile("a.txt")]})
|
||||
monkeypatch.setattr(module, "request", _DummyRequest(files=files, args={"url": "https://example.com/a.txt"}))
|
||||
class TestUploadInfoE2E:
|
||||
"""End-to-end tests for the /api/v1/documents/upload endpoint"""
|
||||
|
||||
res = _run(module.upload_info())
|
||||
assert res["code"] == module.RetCode.BAD_REQUEST
|
||||
assert "not both" in res["message"]
|
||||
def test_upload_info_requires_file_or_url_e2e(self, WebApiAuth):
|
||||
"""Test that missing both file and url returns error"""
|
||||
# Call without files and without url
|
||||
res = upload_info(WebApiAuth)
|
||||
assert res["code"] == 101, res
|
||||
assert "Missing input" in res["message"] or "file" in res["message"].lower() or "url" in res["message"].lower()
|
||||
|
||||
def test_upload_info_rejects_mixed_inputs_e2e(self, WebApiAuth, tmp_path):
|
||||
"""Test that providing both file and url returns error"""
|
||||
# Create a file
|
||||
fp = create_txt_file(tmp_path / "test.txt")
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_upload_info_requires_file_or_url(monkeypatch):
|
||||
module = _load_document_app_module(monkeypatch)
|
||||
monkeypatch.setattr(module, "request", _DummyRequest(files=_DummyFiles()))
|
||||
# Call with both file and url - the API should reject this
|
||||
res = upload_info(WebApiAuth, files_path=[fp], url="https://example.com/test.txt")
|
||||
# The API should return an error when both file and url are provided
|
||||
assert res["code"] == 101, res
|
||||
assert "not both" in res["message"].lower() and "either" in res["message"].lower()
|
||||
|
||||
res = _run(module.upload_info())
|
||||
assert res["code"] == module.RetCode.BAD_REQUEST
|
||||
assert "Missing input" in res["message"]
|
||||
def test_upload_info_supports_url_single_and_multiple_files_e2e(self, WebApiAuth, tmp_path):
|
||||
"""Test that the endpoint supports URL, single file, and multiple files"""
|
||||
# Test with URL
|
||||
# Note: Using a real URL might fail if the URL is not accessible
|
||||
# For E2E testing, we test with actual file uploads
|
||||
|
||||
# Test with single file
|
||||
fp1 = create_txt_file(tmp_path / "single_file.txt")
|
||||
res = upload_info(WebApiAuth, files_path=[fp1])
|
||||
assert res["code"] == 0, res
|
||||
assert "data" in res, res
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_upload_info_supports_url_single_and_multiple_files(monkeypatch):
|
||||
module = _load_document_app_module(monkeypatch)
|
||||
monkeypatch.setattr(module, "assert_url_is_safe", lambda url: ("example.com", "93.184.216.34"))
|
||||
captured = []
|
||||
# Test with multiple files
|
||||
fp2 = create_txt_file(tmp_path / "file_a.txt")
|
||||
fp3 = create_txt_file(tmp_path / "file_b.txt")
|
||||
res = upload_info(WebApiAuth, files_path=[fp2, fp3])
|
||||
assert res["code"] == 0, res
|
||||
assert "data" in res, res
|
||||
# Should return a list for multiple files
|
||||
if isinstance(res["data"], list):
|
||||
assert len(res["data"]) == 2, res
|
||||
|
||||
def fake_upload_info(user_id, file_obj, url=None):
|
||||
captured.append((user_id, getattr(file_obj, "filename", None), url))
|
||||
if url is not None:
|
||||
return {"kind": "url", "value": url}
|
||||
return {"kind": "file", "value": file_obj.filename}
|
||||
|
||||
monkeypatch.setattr(module.FileService, "upload_info", fake_upload_info)
|
||||
|
||||
monkeypatch.setattr(module, "request", _DummyRequest(files=_DummyFiles(), args={"url": "https://example.com/a.txt"}))
|
||||
res = _run(module.upload_info())
|
||||
assert res["code"] == 0
|
||||
assert res["data"] == {"kind": "url", "value": "https://example.com/a.txt"}
|
||||
|
||||
monkeypatch.setattr(module, "request", _DummyRequest(files=_DummyFiles({"file": _DummyFile("single.txt")})))
|
||||
res = _run(module.upload_info())
|
||||
assert res["code"] == 0
|
||||
assert res["data"] == {"kind": "file", "value": "single.txt"}
|
||||
|
||||
monkeypatch.setattr(
|
||||
module,
|
||||
"request",
|
||||
_DummyRequest(files=_DummyFiles({"file": [_DummyFile("a.txt"), _DummyFile("b.txt")]})),
|
||||
)
|
||||
res = _run(module.upload_info())
|
||||
assert res["code"] == 0
|
||||
assert res["data"] == [
|
||||
{"kind": "file", "value": "a.txt"},
|
||||
{"kind": "file", "value": "b.txt"},
|
||||
]
|
||||
assert captured == [
|
||||
("user-1", None, "https://example.com/a.txt"),
|
||||
("user-1", "single.txt", None),
|
||||
("user-1", "a.txt", None),
|
||||
("user-1", "b.txt", None),
|
||||
]
|
||||
def test_upload_info_invalid_auth(self):
|
||||
"""Test that invalid authentication returns error"""
|
||||
res = upload_info(RAGFlowWebApiAuth(INVALID_API_TOKEN), files_path=[])
|
||||
assert res["code"] == 401, res
|
||||
|
||||
@@ -492,9 +492,9 @@ export function useUploadAndParseFile() {
|
||||
formData.append('file', file);
|
||||
formData.append('conversation_id', conversationId || id);
|
||||
|
||||
const { data } = await chatService.uploadAndParse(
|
||||
const { data } = await chatService.documentInfoUpload(
|
||||
{
|
||||
url: api.uploadAndParse,
|
||||
url: api.documentInfoUpload,
|
||||
signal: controller.current.signal,
|
||||
data: formData,
|
||||
onUploadProgress: ({ progress }) => {
|
||||
|
||||
@@ -19,7 +19,7 @@ const {
|
||||
chatsTts,
|
||||
chatsMindmap,
|
||||
chatsRelatedQuestions,
|
||||
uploadAndParse,
|
||||
documentInfoUpload,
|
||||
fetchExternalChatInfo,
|
||||
} = api;
|
||||
|
||||
@@ -92,9 +92,9 @@ const methods = {
|
||||
url: chatsRelatedQuestions,
|
||||
method: 'post',
|
||||
},
|
||||
uploadAndParse: {
|
||||
documentInfoUpload: {
|
||||
method: 'post',
|
||||
url: uploadAndParse,
|
||||
url: documentInfoUpload,
|
||||
},
|
||||
fetchExternalChatInfo: {
|
||||
url: fetchExternalChatInfo,
|
||||
|
||||
@@ -130,7 +130,7 @@ export default {
|
||||
`${restAPIv1}/datasets/${datasetId}/documents`,
|
||||
webCrawl: (datasetId: string) =>
|
||||
`${restAPIv1}/datasets/${datasetId}/documents?type=web`,
|
||||
uploadAndParse: `${webAPI}/document/upload_info`,
|
||||
documentInfoUpload: `${restAPIv1}/documents/upload`,
|
||||
setMeta: `${webAPI}/document/set_meta`,
|
||||
getDatasetFilter: (datasetId: string) =>
|
||||
`${restAPIv1}/datasets/${datasetId}/documents?type=filter`,
|
||||
|
||||
Reference in New Issue
Block a user