Files
ragflow/api/apps/restful_apis/file2document_api.py

181 lines
7.2 KiB
Python

#
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import asyncio
import logging
from pathlib import Path
from api.common.check_team_permission import check_file_team_permission, check_kb_team_permission
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.apps import login_required, current_user
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.utils.api_utils import get_data_error_result, get_json_result, get_request_json, server_error_response, validate_request
from common.misc_utils import get_uuid
from api.db import FileType
from api.db.services.document_service import DocumentService
logger = logging.getLogger(__name__)
def _convert_files(file_ids, kb_ids, user_id):
    """Link files to datasets, creating only the links that do not exist yet.

    Synchronous worker. For every file that can be loaded, a new document (and
    the matching file-to-document link) is inserted for each target dataset
    the file is not already linked to. Existing links are never removed or
    replaced, so a multi-select "link to knowledge base" request does not
    overwrite earlier links.

    Args:
        file_ids: IDs of the files to link. IDs that cannot be loaded are
            skipped silently.
        kb_ids: IDs of the target datasets. Duplicate IDs are collapsed and
            IDs that cannot be loaded are skipped silently.
        user_id: Stored as ``created_by`` on every document that is created.
    """
    # Resolve each distinct dataset once. The lookup does not depend on the
    # file, so repeating it per file would cost len(file_ids) * len(kb_ids)
    # queries. dict.fromkeys() de-duplicates while keeping request order.
    target_kbs = []
    for kb_id in dict.fromkeys(kb_ids):
        found, kb = KnowledgebaseService.get_by_id(kb_id)
        if found:
            target_kbs.append(kb)

    for file_id in file_ids:
        found, file = FileService.get_by_id(file_id)
        if not found:
            continue

        # Collect the datasets this file is already linked to, via the
        # documents referenced by its existing file-to-document rows.
        linked_doc_ids = {link.document_id for link in File2DocumentService.get_by_file_id(file_id)}
        linked_kb_ids = set()
        for doc_id in linked_doc_ids:
            found, doc = DocumentService.get_by_id(doc_id)
            if found and doc:
                linked_kb_ids.add(doc.kb_id)

        for kb in target_kbs:
            if kb.id in linked_kb_ids:
                continue
            doc = DocumentService.insert(
                {
                    "id": get_uuid(),
                    "kb_id": kb.id,
                    "parser_id": FileService.get_parser(file.type, file.name, kb.parser_id),
                    "pipeline_id": kb.pipeline_id,
                    "parser_config": kb.parser_config,
                    "created_by": user_id,
                    "type": file.type,
                    "name": file.name,
                    "suffix": Path(file.name).suffix.lstrip("."),
                    "location": file.location,
                    "size": file.size,
                }
            )
            File2DocumentService.insert(
                {
                    "id": get_uuid(),
                    "file_id": file_id,
                    "document_id": doc.id,
                }
            )
            # Remember the new link so the same dataset is never linked twice
            # to this file within a single run.
            linked_kb_ids.add(kb.id)
def _log_convert_failure(future):
    """Done-callback for the background link job: log its failure, if any.

    Checks for cancellation first because ``future.exception()`` raises
    ``CancelledError`` on a cancelled future, which would otherwise escape
    from the callback.
    """
    if future.cancelled():
        logger.warning("_convert_files was cancelled before completing")
        return
    exc = future.exception()
    if exc is not None:
        # exc_info attaches the worker's traceback to the log record.
        logger.error("_convert_files failed: %s", exc, exc_info=exc)


@manager.route("/files/link-to-datasets", methods=["POST"])  # noqa: F821
@login_required
@validate_request("file_ids", "kb_ids")
async def convert():
    """Validate a link-to-datasets request and schedule the linking in the background.

    Reads ``file_ids`` and ``kb_ids`` from the JSON request body, then:

    1. verifies that every requested file and dataset exists,
    2. expands folders into the file IDs returned by
       ``FileService.get_all_innermost_file_ids``,
    3. checks the current user's permission on every expanded file and on
       every dataset,
    4. hands the de-duplicated IDs to ``_convert_files`` on the default
       executor and returns without waiting for it to finish.

    Returns:
        ``get_json_result(data=True)`` once the work has been scheduled;
        ``get_data_error_result(...)`` with "File not found!",
        "Can't find this dataset!" or "No authorization." when validation
        fails; ``server_error_response(e)`` if an unexpected exception is
        raised.
    """
    req = await get_request_json()
    kb_ids = req["kb_ids"]
    file_ids = req["file_ids"]

    try:
        files = FileService.get_by_ids(file_ids)
        files_set = {file.id: file for file in files}

        # Validate all files exist before starting any work
        for file_id in file_ids:
            if not files_set.get(file_id):
                logger.warning(
                    "user_id=%s resource_type=file resource_id=%s action=validate_file_lookup result=not_found file_ids=%s kb_ids=%s",
                    current_user.id,
                    file_id,
                    file_ids,
                    kb_ids,
                )
                return get_data_error_result(message="File not found!")

        # Validate all kb_ids exist before scheduling background work
        kb_map = {}
        for kb_id in kb_ids:
            e, kb = KnowledgebaseService.get_by_id(kb_id)
            if not e:
                logger.warning(
                    "user_id=%s resource_type=dataset resource_id=%s action=validate_dataset_lookup result=not_found file_ids=%s kb_ids=%s",
                    current_user.id,
                    kb_id,
                    file_ids,
                    kb_ids,
                )
                return get_data_error_result(message="Can't find this dataset!")
            kb_map[kb_id] = kb

        # Expand folders to their innermost file IDs
        all_file_ids = []
        for file_id in file_ids:
            file = files_set[file_id]
            if file.type == FileType.FOLDER.value:
                all_file_ids.extend(FileService.get_all_innermost_file_ids(file_id, []))
            else:
                all_file_ids.append(file_id)

        # A file can be listed more than once (repeated in the request, or
        # selected together with its parent folder). Keep the first occurrence
        # so each file is permission-checked and linked only once.
        all_file_ids = list(dict.fromkeys(all_file_ids))
        # kb_map keys are the distinct requested dataset IDs in request order.
        target_kb_ids = list(kb_map)

        user_id = current_user.id
        for file_id in all_file_ids:
            e, file = FileService.get_by_id(file_id)
            if not e or not file:
                logger.warning(
                    "user_id=%s resource_type=file resource_id=%s action=validate_expanded_file_lookup result=not_found file_ids=%s kb_ids=%s",
                    user_id,
                    file_id,
                    file_ids,
                    kb_ids,
                )
                return get_data_error_result(message="File not found!")
            if not check_file_team_permission(file, user_id):
                logger.warning(
                    "user_id=%s resource_type=file resource_id=%s action=authorize_file result=denied file_ids=%s kb_ids=%s",
                    user_id,
                    file_id,
                    file_ids,
                    kb_ids,
                )
                return get_data_error_result(message="No authorization.")

        for kb_id, kb in kb_map.items():
            if not check_kb_team_permission(kb, user_id):
                logger.warning(
                    "user_id=%s resource_type=dataset resource_id=%s action=authorize_dataset result=denied file_ids=%s kb_ids=%s",
                    user_id,
                    kb_id,
                    file_ids,
                    kb_ids,
                )
                return get_data_error_result(message="No authorization.")

        # Run the blocking DB work in a thread so the event loop is not blocked.
        # For large folders this prevents 504 Gateway Timeout by returning as
        # soon as the background task is scheduled.
        loop = asyncio.get_running_loop()
        future = loop.run_in_executor(None, _convert_files, all_file_ids, target_kb_ids, user_id)
        future.add_done_callback(_log_convert_failure)
        logger.info(
            "user_id=%s resource_type=file_to_dataset_link resource_id=batch action=schedule_convert result=scheduled file_ids=%s kb_ids=%s",
            user_id,
            all_file_ids,
            target_kb_ids,
        )
        return get_json_result(data=True)
    except Exception as e:
        return server_error_response(e)