mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
Consolidateion of document upload API (#14106)
### What problem does this PR solve? Consolidation WEB API & HTTP API for document upload Before consolidation Web API: POST /v1/document/upload Http API - POST /api/v1/datasets/<dataset_id>/documents After consolidation, Restful API -- POST /api/v1/datasets/<dataset_id>/documents ### Type of change - [x] Refactoring
This commit is contained in:
@@ -19,8 +19,8 @@ from quart import request
|
||||
from peewee import OperationalError
|
||||
from pydantic import ValidationError
|
||||
|
||||
from api.apps.services.document_api_service import rename_doc_key, validate_document_update_fields, \
|
||||
update_document_name_only, update_chunk_method_only, update_document_status_only
|
||||
from api.apps.services.document_api_service import map_doc_keys_with_run_status, validate_document_update_fields, \
|
||||
update_document_name_only, update_chunk_method_only, update_document_status_only, map_doc_keys
|
||||
from api.db.services.doc_metadata_service import DocMetadataService
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
@@ -143,7 +143,7 @@ async def update_document(tenant_id, dataset_id, document_id):
|
||||
except OperationalError as e:
|
||||
logging.exception(e)
|
||||
return get_error_data_result(message="Database operation failed")
|
||||
renamed_doc = rename_doc_key(doc)
|
||||
renamed_doc = map_doc_keys(doc)
|
||||
return get_result(data=renamed_doc)
|
||||
|
||||
|
||||
@@ -183,3 +183,136 @@ async def metadata_summary(dataset_id, tenant_id):
|
||||
return get_result(data={"summary": summary})
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
|
||||
|
||||
@manager.route("/datasets/<dataset_id>/documents", methods=["POST"]) # noqa: F821
|
||||
@login_required
|
||||
@add_tenant_id_to_kwargs
|
||||
async def upload_document(dataset_id, tenant_id):
|
||||
"""
|
||||
Upload documents to a dataset.
|
||||
---
|
||||
tags:
|
||||
- Documents
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
parameters:
|
||||
- in: path
|
||||
name: dataset_id
|
||||
type: string
|
||||
required: true
|
||||
description: ID of the dataset.
|
||||
- in: header
|
||||
name: Authorization
|
||||
type: string
|
||||
required: true
|
||||
description: Bearer token for authentication.
|
||||
- in: formData
|
||||
name: file
|
||||
type: file
|
||||
required: true
|
||||
description: Document files to upload.
|
||||
- in: formData
|
||||
name: parent_path
|
||||
type: string
|
||||
description: Optional nested path under the parent folder. Uses '/' separators.
|
||||
- in: query
|
||||
name: return_raw_files
|
||||
type: boolean
|
||||
required: false
|
||||
default: false
|
||||
description: Whether to skip document key mapping and return raw document data
|
||||
responses:
|
||||
200:
|
||||
description: Successfully uploaded documents.
|
||||
schema:
|
||||
type: object
|
||||
properties:
|
||||
data:
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
properties:
|
||||
id:
|
||||
type: string
|
||||
description: Document ID.
|
||||
name:
|
||||
type: string
|
||||
description: Document name.
|
||||
chunk_count:
|
||||
type: integer
|
||||
description: Number of chunks.
|
||||
token_count:
|
||||
type: integer
|
||||
description: Number of tokens.
|
||||
dataset_id:
|
||||
type: string
|
||||
description: ID of the dataset.
|
||||
chunk_method:
|
||||
type: string
|
||||
description: Chunking method used.
|
||||
run:
|
||||
type: string
|
||||
description: Processing status.
|
||||
"""
|
||||
from api.constants import FILE_NAME_LEN_LIMIT
|
||||
from api.common.check_team_permission import check_kb_team_permission
|
||||
from api.db.services.file_service import FileService
|
||||
from common.misc_utils import thread_pool_exec
|
||||
|
||||
form = await request.form
|
||||
files = await request.files
|
||||
|
||||
# Validation
|
||||
if "file" not in files:
|
||||
logging.error("No file part!")
|
||||
return get_error_data_result(message="No file part!", code=RetCode.ARGUMENT_ERROR)
|
||||
|
||||
file_objs = files.getlist("file")
|
||||
for file_obj in file_objs:
|
||||
if file_obj is None or file_obj.filename is None or file_obj.filename == "":
|
||||
logging.error("No file selected!")
|
||||
return get_error_data_result(message="No file selected!", code=RetCode.ARGUMENT_ERROR)
|
||||
if len(file_obj.filename.encode("utf-8")) > FILE_NAME_LEN_LIMIT:
|
||||
msg = f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less."
|
||||
logging.error(msg)
|
||||
return get_error_data_result(message=msg, code=RetCode.ARGUMENT_ERROR)
|
||||
|
||||
# KB Lookup
|
||||
e, kb = KnowledgebaseService.get_by_id(dataset_id)
|
||||
if not e:
|
||||
logging.error(f"Can't find the dataset with ID {dataset_id}!")
|
||||
return get_error_data_result(message=f"Can't find the dataset with ID {dataset_id}!", code=RetCode.DATA_ERROR)
|
||||
|
||||
# Permission Check
|
||||
if not check_kb_team_permission(kb, tenant_id):
|
||||
logging.error("No authorization.")
|
||||
return get_error_data_result(message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)
|
||||
|
||||
# File Upload (async)
|
||||
err, files = await thread_pool_exec(
|
||||
FileService.upload_document, kb, file_objs, tenant_id,
|
||||
parent_path=form.get("parent_path")
|
||||
)
|
||||
if err:
|
||||
msg = "\n".join(err)
|
||||
logging.error(msg)
|
||||
return get_error_data_result(message=msg, code=RetCode.SERVER_ERROR)
|
||||
|
||||
if not files:
|
||||
msg = "There seems to be an issue with your file format. please verify it is correct and not corrupted."
|
||||
logging.error(msg)
|
||||
return get_error_data_result(message=msg, code=RetCode.DATA_ERROR)
|
||||
|
||||
files = [f[0] for f in files] # remove the blob
|
||||
|
||||
# Check if we should return raw files without document key mapping
|
||||
return_raw_files = request.args.get("return_raw_files", "false").lower() == "true"
|
||||
|
||||
if return_raw_files:
|
||||
return get_result(data=files)
|
||||
|
||||
renamed_doc_list = [map_doc_keys_with_run_status(doc, run_status="0") for doc in files]
|
||||
return get_result(data=renamed_doc_list)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user