Refactor: Merge document update API (#13962)

### What problem does this PR solve?

Refactor: merge document.rename into document.update_document

### Type of change

- [x] Refactoring


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **New Features**
* Added a unified document update API (PUT) supporting name, metadata,
parser/chunk settings, and status changes.

* **Breaking Changes**
* Legacy single-parameter rename endpoint removed; renames now require
dataset + document identifiers.
  * `/list` now reads dataset id from a different query parameter.

* **Validation / Bug Fixes**
* Stricter meta_fields and parser-config validation; unauthenticated
requests return 401.

* **Frontend**
  * UI now sends dataset id when saving document names.

* **Tests**
* Numerous unit and HTTP tests adjusted or removed to match new API and
validations.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: MkDev11 <94194147+MkDev11@users.noreply.github.com>
Co-authored-by: mkdev11 <YOUR_GITHUB_ID+MkDev11@users.noreply.github.com>
Co-authored-by: mkdev11 <MkDev11@users.noreply.github.com>
Co-authored-by: Qi Wang <wangq8@outlook.com>
Co-authored-by: dataCenter430 <161712630+dataCenter430@users.noreply.github.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
This commit is contained in:
Jack
2026-04-09 11:17:38 +08:00
committed by GitHub
parent c13f8856a1
commit 577c96bf2a
14 changed files with 651 additions and 710 deletions

View File

@@ -15,7 +15,6 @@
#
import json
import os.path
import pathlib
import re
from pathlib import Path, PurePosixPath, PureWindowsPath
@@ -49,7 +48,7 @@ from common.file_utils import get_project_base_directory
from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema
from common.misc_utils import get_uuid, thread_pool_exec
from deepdoc.parser.html_parser import RAGFlowHtmlParser
from rag.nlp import rag_tokenizer, search
from rag.nlp import search
def _is_safe_download_filename(name: str) -> bool:
@@ -666,61 +665,6 @@ async def run():
except Exception as e:
return server_error_response(e)
@manager.route("/rename", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id", "name")
async def rename():
req = await get_request_json()
uid = current_user.id
try:
def _rename_sync():
if not DocumentService.accessible(req["doc_id"], uid):
return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
return get_data_error_result(message="Document not found!")
if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(doc.name.lower()).suffix:
return get_json_result(data=False, message="The extension of file can't be changed", code=RetCode.ARGUMENT_ERROR)
if len(req["name"].encode("utf-8")) > FILE_NAME_LEN_LIMIT:
return get_json_result(data=False, message=f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.", code=RetCode.ARGUMENT_ERROR)
for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
if d.name == req["name"]:
return get_data_error_result(message="Duplicated document name in the same dataset.")
if not DocumentService.update_by_id(req["doc_id"], {"name": req["name"]}):
return get_data_error_result(message="Database error (Document rename)!")
informs = File2DocumentService.get_by_document_id(req["doc_id"])
if informs:
e, file = FileService.get_by_id(informs[0].file_id)
FileService.update_by_id(file.id, {"name": req["name"]})
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
title_tks = rag_tokenizer.tokenize(req["name"])
es_body = {
"docnm_kwd": req["name"],
"title_tks": title_tks,
"title_sm_tks": rag_tokenizer.fine_grained_tokenize(title_tks),
}
if settings.docStoreConn.index_exist(search.index_name(tenant_id), doc.kb_id):
settings.docStoreConn.update(
{"doc_id": req["doc_id"]},
es_body,
search.index_name(tenant_id),
doc.kb_id,
)
return get_json_result(data=True)
return await thread_pool_exec(_rename_sync)
except Exception as e:
return server_error_response(e)
@manager.route("/get/<doc_id>", methods=["GET"]) # noqa: F821
@login_required
async def get(doc_id):

View File

@@ -0,0 +1,146 @@
#
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from peewee import OperationalError
from pydantic import ValidationError
from api.apps.services.document_api_service import rename_doc_key, validate_document_update_fields, \
update_document_name_only, update_chunk_method_only, update_document_status_only
from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.document_service import DocumentService
from api.db.services.knowledgebase_service import KnowledgebaseService
from common.constants import RetCode
from api.apps import login_required
from api.utils.api_utils import get_error_data_result, get_result, add_tenant_id_to_kwargs, get_request_json
from api.utils.validation_utils import (
UpdateDocumentReq, format_validation_error_message,
)
@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["PUT"]) # noqa: F821
@login_required
@add_tenant_id_to_kwargs
async def update_document(tenant_id, dataset_id, document_id):
"""
Update a document within a dataset.
---
tags:
- Documents
security:
- ApiKeyAuth: []
parameters:
- in: path
name: dataset_id
type: string
required: true
description: ID of the dataset.
- in: path
name: document_id
type: string
required: true
description: ID of the document to update.
- in: header
name: Authorization
type: string
required: true
description: Bearer token for authentication.
- in: body
name: body
description: Document update parameters.
required: true
schema:
type: object
properties:
name:
type: string
description: New name of the document.
parser_config:
type: object
description: Parser configuration.
chunk_method:
type: string
description: Chunking method.
enabled:
type: boolean
description: Document status.
responses:
200:
description: Document updated successfully.
schema:
type: object
"""
req = await get_request_json()
# Verify ownership and existence of dataset and document
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
return get_error_data_result(message="You don't own the dataset.")
e, kb = KnowledgebaseService.get_by_id(dataset_id)
if not e:
return get_error_data_result(message="Can't find this dataset!")
# Prepare data for validation
docs = DocumentService.query(kb_id=dataset_id, id=document_id)
if not docs:
return get_error_data_result(message="The dataset doesn't own the document.")
# Validate document update request parameters
try:
update_doc_req = UpdateDocumentReq(**req)
except ValidationError as e:
return get_error_data_result(message=format_validation_error_message(e), code=RetCode.DATA_ERROR)
doc = docs[0]
# further check with inner status (from DB)
error_msg, error_code = validate_document_update_fields(update_doc_req, doc, req)
if error_msg:
return get_error_data_result(message=error_msg, code=error_code)
# All validations passed, now perform all updates
# meta_fields provided, then update it
if "meta_fields" in req:
if not DocMetadataService.update_document_metadata(document_id, update_doc_req.meta_fields):
return get_error_data_result(message="Failed to update metadata")
# doc name provided from request and diff with existing value, update
if "name" in req and req["name"] != doc.name:
if error := update_document_name_only(document_id, req["name"]):
return error
# parser config provided (already validated in UpdateDocumentReq), update it
if update_doc_req.parser_config:
DocumentService.update_parser_config(doc.id, req["parser_config"])
# chunk method provided - the update method will check if it's different with existing one
if update_doc_req.chunk_method:
if error := update_chunk_method_only(req, doc, dataset_id, tenant_id):
return error
if "enabled" in req: # already checked in UpdateDocumentReq - it's int if it's present
# "enabled" flag provided, the update method will check if it's changed and then update if so
if error := update_document_status_only(int(req["enabled"]), doc, kb):
return error
try:
original_doc_id = doc.id
ok, doc = DocumentService.get_by_id(doc.id)
if not ok:
return get_error_data_result(message=f"Can not get document by id:{original_doc_id}")
except OperationalError as e:
logging.exception(e)
return get_error_data_result(message="Database operation failed")
renamed_doc = rename_doc_key(doc)
return get_result(data=renamed_doc)

View File

@@ -15,13 +15,11 @@
#
import datetime
import json
import logging
import re
from io import BytesIO
import xxhash
from peewee import OperationalError
from pydantic import BaseModel, Field, validator, ValidationError
from pydantic import BaseModel, Field, validator
from quart import request, send_file
from api.constants import FILE_NAME_LEN_LIMIT
@@ -35,10 +33,8 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle
from api.db.services.task_service import TaskService, cancel_all_task_of, queue_tasks
from api.db.services.tenant_llm_service import TenantLLMService
from api.utils import validation_utils
from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_error_data_result, get_parser_config, get_request_json, get_result, server_error_response, token_required
from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_error_data_result, get_request_json, get_result, server_error_response, token_required
from api.utils.image_utils import store_chunk_image
from api.utils.validation_utils import format_validation_error_message, UpdateDocumentReq
from common import settings
from common.constants import FileSource, LLMType, ParserType, RetCode, TaskStatus
from common.metadata_utils import convert_conditions, meta_filter
@@ -184,215 +180,6 @@ async def upload(dataset_id, tenant_id):
renamed_doc_list.append(renamed_doc)
return get_result(data=renamed_doc_list)
def _update_document_name_only(document_id, req_doc_name):
"""Update document name only (without validation)."""
if not DocumentService.update_by_id(document_id, {"name": req_doc_name}):
return get_error_data_result(message="Database error (Document rename)!")
informs = File2DocumentService.get_by_document_id(document_id)
if informs:
e, file = FileService.get_by_id(informs[0].file_id)
FileService.update_by_id(file.id, {"name": req_doc_name})
return None
def _update_chunk_method_only(req, doc, dataset_id, tenant_id):
"""Update chunk method only (without validation)."""
if doc.parser_id.lower() != req["chunk_method"].lower():
# if chunk method changed
e = DocumentService.update_by_id(
doc.id,
{
"parser_id": req["chunk_method"],
"progress": 0,
"progress_msg": "",
"run": TaskStatus.UNSTART.value,
},
)
if not e:
return get_error_data_result(message="Document not found!")
if not req.get("parser_config"):
req["parser_config"] = get_parser_config(req["chunk_method"], req.get("parser_config"))
DocumentService.update_parser_config(doc.id, req["parser_config"])
if doc.token_num > 0:
e = DocumentService.increment_chunk_num(
doc.id,
doc.kb_id,
doc.token_num * -1,
doc.chunk_num * -1,
doc.process_duration * -1,
)
if not e:
return get_error_data_result(message="Document not found!")
settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), dataset_id)
return None
def _update_document_status_only(status:int, doc, kb):
"""Update document status only (without validation)."""
if doc.status is None or (int(doc.status) != status):
try:
if not DocumentService.update_by_id(doc.id, {"status": str(status)}):
return get_error_data_result(message="Database error (Document update)!")
settings.docStoreConn.update({"doc_id": doc.id}, {"available_int": status}, search.index_name(kb.tenant_id), doc.kb_id)
except Exception as e:
return server_error_response(e)
return None
@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["PUT"]) # noqa: F821
@token_required
async def update_doc(tenant_id, dataset_id, document_id):
"""
Update a document within a dataset.
---
tags:
- Documents
security:
- ApiKeyAuth: []
parameters:
- in: path
name: dataset_id
type: string
required: true
description: ID of the dataset.
- in: path
name: document_id
type: string
required: true
description: ID of the document to update.
- in: header
name: Authorization
type: string
required: true
description: Bearer token for authentication.
- in: body
name: body
description: Document update parameters.
required: true
schema:
type: object
properties:
name:
type: string
description: New name of the document.
parser_config:
type: object
description: Parser configuration.
chunk_method:
type: string
description: Chunking method.
enabled:
type: boolean
description: Document status.
responses:
200:
description: Document updated successfully.
schema:
type: object
"""
req = await get_request_json()
# Verify ownership and existence of dataset and document
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
return get_error_data_result(message="You don't own the dataset.")
e, kb = KnowledgebaseService.get_by_id(dataset_id)
if not e:
return get_error_data_result(message="Can't find this dataset!")
# Prepare data for validation
docs = DocumentService.query(kb_id=dataset_id, id=document_id)
if not docs:
return get_error_data_result(message="The dataset doesn't own the document.")
# Validate document update request parameters
try:
update_doc_req = UpdateDocumentReq(**req)
except ValidationError as e:
return get_error_data_result(message=format_validation_error_message(e), code=RetCode.DATA_ERROR)
doc = docs[0]
# further check with inner status (from DB)
error_msg, error_code = _validate_document_update_fields(update_doc_req, doc, req)
if error_msg:
return get_error_data_result(message=error_msg, code=error_code)
# All validations passed, now perform all updates
# meta_fields provided, then update it
if update_doc_req.meta_fields:
if not DocMetadataService.update_document_metadata(document_id, update_doc_req.meta_fields):
return get_error_data_result(message="Failed to update metadata")
# doc name provided from request and diff with existing value, update
if "name" in req and req["name"] != doc.name:
if error := _update_document_name_only(document_id, req["name"]):
return error
# parser config provided (already validated in UpdateDocumentReq), update it
if update_doc_req.parser_config:
DocumentService.update_parser_config(doc.id, req["parser_config"])
# chunk method provided - the update method will check if it's different with existing one
if update_doc_req.chunk_method:
if error := _update_chunk_method_only(req, doc, dataset_id, tenant_id):
return error
if "enabled" in req: # already checked in UpdateDocumentReq - it's int if it's present
# "enabled" flag provided, the update method will check if it's changed and then update if so
if error := _update_document_status_only(int(req["enabled"]), doc, kb):
return error
try:
ok, doc = DocumentService.get_by_id(doc.id)
if not ok:
return get_error_data_result(message="Dataset created failed")
except OperationalError as e:
logging.exception(e)
return get_error_data_result(message="Database operation failed")
key_mapping = {
"chunk_num": "chunk_count",
"kb_id": "dataset_id",
"token_num": "token_count",
"parser_id": "chunk_method",
}
run_mapping = {
"0": "UNSTART",
"1": "RUNNING",
"2": "CANCEL",
"3": "DONE",
"4": "FAIL",
}
renamed_doc = {}
for key, value in doc.to_dict().items():
new_key = key_mapping.get(key, key)
renamed_doc[new_key] = value
if key == "run":
renamed_doc["run"] = run_mapping.get(str(value))
return get_result(data=renamed_doc)
def _validate_document_update_fields(update_doc_req:UpdateDocumentReq, doc, req):
"""Validate document update fields in a single method."""
# Validate immutable fields
error_msg, error_code = validation_utils.validate_immutable_fields(update_doc_req, doc)
if error_msg:
return error_msg, error_code
# Validate document name if present
if "name" in req and req["name"] != doc.name:
docs_from_name = DocumentService.query(name=req["name"], kb_id=doc.kb_id)
error_msg, error_code = validation_utils.validate_document_name(req["name"], doc, docs_from_name)
if error_msg:
return error_msg, error_code
# Validate chunk method if present
if "chunk_method" in req:
error_msg, error_code = validation_utils.validate_chunk_method(doc, req["chunk_method"])
if error_msg:
return error_msg, error_code
return None, None
@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["GET"]) # noqa: F821
@token_required
async def download(tenant_id, dataset_id, document_id):

View File

@@ -0,0 +1,201 @@
#
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from api.db.services.document_service import DocumentService
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.utils import validation_utils
from common import settings
from common.constants import TaskStatus
from api.utils.api_utils import get_error_data_result, server_error_response, get_parser_config
from api.utils.validation_utils import UpdateDocumentReq
from rag.nlp import rag_tokenizer, search
def update_document_name_only(document_id, req_doc_name):
"""
Update document name only (without validation).
:param document_id: id (string) of the document
:param req_doc_name: new name (string) from request for the document
:return: None if all are good; otherwise returns the error message in the JSON format
"""
if not DocumentService.update_by_id(document_id, {"name": req_doc_name}):
return get_error_data_result(message="Database error (Document rename)!")
informs = File2DocumentService.get_by_document_id(document_id)
if informs:
e, file = FileService.get_by_id(informs[0].file_id)
FileService.update_by_id(file.id, {"name": req_doc_name})
# Add logic to update index - refer to rename method in document_app.py
tenant_id = DocumentService.get_tenant_id(document_id)
title_tks = rag_tokenizer.tokenize(req_doc_name)
es_body = {
"docnm_kwd": req_doc_name,
"title_tks": title_tks,
"title_sm_tks": rag_tokenizer.fine_grained_tokenize(title_tks),
}
ok, doc = DocumentService.get_by_id(document_id)
if not ok:
return get_error_data_result(message=f"Not able to find document by id:{document_id}")
if settings.docStoreConn.index_exist(search.index_name(tenant_id), doc.kb_id):
settings.docStoreConn.update(
{"doc_id": document_id},
es_body,
search.index_name(tenant_id),
doc.kb_id,
)
return None
def update_chunk_method_only(req, doc, dataset_id, tenant_id):
"""
Update chunk method only (without validation).
Updates the chunk method and parser configuration for a document,
and resets the document's progress if the chunk method changes.
Also clears existing chunks from the document store if the method changes.
Args:
req: The request dictionary containing chunk_method and parser_config.
doc: The document model from the database.
dataset_id: The ID of the dataset containing the document.
tenant_id: The tenant ID for the document store.
Returns:
None if successful, or an error result dictionary if failed.
"""
if doc.parser_id.lower() != req["chunk_method"].lower():
# if chunk method changed
e = DocumentService.update_by_id(
doc.id,
{
"parser_id": req["chunk_method"],
"progress": 0,
"progress_msg": "",
"run": TaskStatus.UNSTART.value,
},
)
if not e:
return get_error_data_result(message="Document not found!")
if not req.get("parser_config"):
req["parser_config"] = get_parser_config(req["chunk_method"], req.get("parser_config"))
DocumentService.update_parser_config(doc.id, req["parser_config"])
if doc.token_num > 0:
e = DocumentService.increment_chunk_num(
doc.id,
doc.kb_id,
doc.token_num * -1,
doc.chunk_num * -1,
doc.process_duration * -1,
)
if not e:
return get_error_data_result(message="Document not found!")
settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), dataset_id)
return None
def update_document_status_only(status:int, doc, kb):
"""
Update document status only (without validation).
Updates the enabled/disabled status of a document and updates
the corresponding index in the document store.
Args:
status: The new status value (0 for disabled, 1 for enabled).
doc: The document model from the database.
kb: The knowledge base model.
Returns:
None if successful, or an error result dictionary if failed.
"""
if doc.status is None or (int(doc.status) != status):
try:
if not DocumentService.update_by_id(doc.id, {"status": str(status)}):
return get_error_data_result(message="Database error (Document update)!")
settings.docStoreConn.update({"doc_id": doc.id}, {"available_int": status}, search.index_name(kb.tenant_id), doc.kb_id)
except Exception as e:
return server_error_response(e)
return None
def validate_document_update_fields(update_doc_req:UpdateDocumentReq, doc, req):
"""
Validate document update fields in a single method.
Performs comprehensive validation of all document update fields,
including immutable fields, document name, and chunk method.
Args:
update_doc_req: The validated update document request.
doc: The document model from the database.
req: The original request dictionary.
Returns:
A tuple of (error_message, error_code) if validation fails,
or (None, None) if validation passes.
"""
# Validate immutable fields
error_msg, error_code = validation_utils.validate_immutable_fields(update_doc_req, doc)
if error_msg:
return error_msg, error_code
# Validate document name if present
if "name" in req and req["name"] != doc.name:
docs_from_name = DocumentService.query(name=req["name"], kb_id=doc.kb_id)
error_msg, error_code = validation_utils.validate_document_name(req["name"], doc, docs_from_name)
if error_msg:
return error_msg, error_code
# Validate chunk method if present
if "chunk_method" in req:
error_msg, error_code = validation_utils.validate_chunk_method(doc, req["chunk_method"])
if error_msg:
return error_msg, error_code
return None, None
def rename_doc_key(doc):
"""
Rename document keys to match API response format.
Converts internal document model field names to the external API
response field names (e.g., 'chunk_num' -> 'chunk_count').
Args:
doc: The document model from the database.
Returns:
A dictionary with renamed keys for API response.
"""
key_mapping = {
"chunk_num": "chunk_count",
"kb_id": "dataset_id",
"token_num": "token_count",
"parser_id": "chunk_method",
}
run_mapping = {
"0": "UNSTART",
"1": "RUNNING",
"2": "CANCEL",
"3": "DONE",
"4": "FAIL",
}
renamed_doc = {}
for key, value in doc.to_dict().items():
new_key = key_mapping.get(key, key)
renamed_doc[new_key] = value
if key == "run":
renamed_doc["run"] = run_mapping.get(str(value))
return renamed_doc

View File

@@ -402,7 +402,14 @@ class ParserConfig(Base):
ext: Annotated[dict, Field(default={})]
class UpdateDocumentReq(Base):
"""
Request model for updating a document.
This model validates the request parameters for updating a document,
including name, chunk method, enabled status, and other metadata.
"""
model_config = ConfigDict(extra='ignore')
name: Annotated[str | None, Field(default=None, max_length=65535)]
chunk_method: Annotated[str | None, Field(default=None, max_length=65535)]
enabled: Annotated[int | None, Field(default=None, ge=0, le=1)]
chunk_count: Annotated[int | None, Field(default=None, ge=0)]
@@ -432,6 +439,22 @@ class UpdateDocumentReq(Base):
return enabled
@field_validator("meta_fields", mode="after")
@classmethod
def validate_document_meta_fields(cls, meta_fields: dict | None):
if meta_fields is None:
return None
if not isinstance(meta_fields, dict):
raise PydanticCustomError("format_invalid", "Only dictionary type supported")
for k, v in meta_fields.items():
if isinstance(v, list):
if not all(isinstance(i, (str, int, float)) for i in v):
raise PydanticCustomError("format_invalid", "The type is not supported in list: {v}", {"v":v})
elif not isinstance(v, (str, int, float)):
raise PydanticCustomError("format_invalid", "The type is not supported: {v}", {"v":v})
return meta_fields
class CreateDatasetReq(Base):
name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(...)]
avatar: Annotated[str | None, Field(default=None, max_length=65535)]
@@ -854,7 +877,20 @@ class ListFileReq(BaseModel):
def validate_immutable_fields(update_doc_req:UpdateDocumentReq, doc):
"""Validate that immutable fields have not been changed."""
"""
Validate that immutable fields have not been changed.
Checks that fields like chunk_count, token_count, and progress
cannot be modified directly by the user.
Args:
update_doc_req: The validated update document request.
doc: The document model from the database.
Returns:
A tuple of (error_message, error_code) if validation fails,
or (None, None) if validation passes.
"""
if update_doc_req.chunk_count and update_doc_req.chunk_count != int(getattr(doc, "chunk_num", -1)):
return "Can't change `chunk_count`.", RetCode.DATA_ERROR
@@ -871,7 +907,24 @@ def validate_immutable_fields(update_doc_req:UpdateDocumentReq, doc):
def validate_document_name(req_doc_name:str, doc, docs_from_name):
"""Validate document name update."""
"""
Validate document name update.
Checks that the new document name is valid:
- Must be a string
- Must not exceed the file name length limit
- File extension cannot be changed
- Must not duplicate an existing document name in the same dataset.
Args:
req_doc_name: The new document name to validate.
doc: The document model from the database.
docs_from_name: Query result for documents with the new name.
Returns:
A tuple of (error_message, error_code) if validation fails,
or (None, None) if validation passes.
"""
if not isinstance(req_doc_name, str):
return f"AttributeError('{type(req_doc_name).__name__}' object has no attribute 'encode')", RetCode.EXCEPTION_ERROR
if len(req_doc_name.encode("utf-8")) > FILE_NAME_LEN_LIMIT:
@@ -885,7 +938,20 @@ def validate_document_name(req_doc_name:str, doc, docs_from_name):
return None, None
def validate_chunk_method(doc, chunk_method=None):
"""Validate chunk method update."""
"""
Validate chunk method update.
Checks if the chunk method is valid for the given document,
particularly for visual documents or specific file types.
Args:
doc: The document model from the database.
chunk_method: The chunk method to validate.
Returns:
A tuple of (error_message, error_code) if validation fails,
or (None, None) if validation passes.
"""
if chunk_method is not None and len(chunk_method) == 0: # will not be detected in UpdateDocumentReq
return "`chunk_method` (empty string) is not valid", RetCode.DATA_ERROR
if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name):

View File

@@ -366,123 +366,6 @@ class TestDocRoutesUnit:
assert res["code"] == module.RetCode.SERVER_ERROR
assert res["message"] == "upload failed"
def test_update_doc_guards_and_error_paths(self, monkeypatch):
module = _load_doc_module(monkeypatch)
doc = _DummyDoc()
monkeypatch.setattr(module.KnowledgebaseService, "query", lambda **_kwargs: [])
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({}))
res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1"))
assert res["message"] == "You don't own the dataset."
monkeypatch.setattr(module.KnowledgebaseService, "query", lambda **_kwargs: [1])
monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _id: (False, None))
res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1"))
assert res["message"] == "Can't find this dataset!"
monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _id: (True, SimpleNamespace(tenant_id="tenant-1")))
monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [])
res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1"))
assert "doesn't own the document" in res["message"]
monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [doc])
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"chunk_count": 100}))
res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1"))
assert "chunk_count" in res["message"]
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"token_count": 100}))
res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1"))
assert "token_count" in res["message"]
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"progress": 100}))
res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1"))
assert "progress" in res["message"]
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"meta_fields": []}))
res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1"))
assert res["message"] == "Field: <meta_fields> - Message: <Input should be a valid dictionary> - Value: <[]>"
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"meta_fields": {"k": "v"}}))
monkeypatch.setattr(module.DocMetadataService, "update_document_metadata", lambda *_args, **_kwargs: False)
res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1"))
assert res["message"] == "Failed to update metadata"
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"name": "a" * (module.FILE_NAME_LEN_LIMIT + 1)}))
res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1"))
assert res["code"] == module.RetCode.ARGUMENT_ERROR
assert "bytes or less" in res["message"]
monkeypatch.setattr(module.DocumentService, "update_by_id", lambda *_args, **_kwargs: False)
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"name": "new.txt"}))
res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1"))
assert "Document rename" in res["message"]
def test_update_doc_chunk_method_enabled_and_db_error(self, monkeypatch):
module = _load_doc_module(monkeypatch)
from api.db import FileType
visual_doc = _DummyDoc(parser_id="naive", doc_type=FileType.VISUAL)
kb = SimpleNamespace(tenant_id="tenant-1")
monkeypatch.setattr(module.KnowledgebaseService, "query", lambda **_kwargs: [1])
monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _id: (True, kb))
monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [visual_doc])
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"chunk_method": "naive"}))
res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1"))
assert res["message"] == "Not supported yet!"
doc = _DummyDoc(token_num=2, chunk_num=1, parser_id="naive")
monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [doc])
monkeypatch.setattr(module.DocumentService, "update_by_id", lambda *_args, **_kwargs: False)
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"chunk_method": "manual"}))
res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1"))
assert res["message"] == "Document not found!"
monkeypatch.setattr(module.DocumentService, "update_by_id", lambda *_args, **_kwargs: True)
monkeypatch.setattr(module.DocumentService, "update_parser_config", lambda *_args, **_kwargs: None)
monkeypatch.setattr(module.DocumentService, "increment_chunk_num", lambda *_args, **_kwargs: False)
_patch_docstore(monkeypatch, module, delete=lambda *_args, **_kwargs: None)
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"chunk_method": "manual"}))
res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1"))
assert res["message"] == "Document not found!"
monkeypatch.setattr(module.DocumentService, "increment_chunk_num", lambda *_args, **_kwargs: True)
doc_for_enabled = _DummyDoc(status=False)
monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [doc_for_enabled])
monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _id: (True, doc_for_enabled))
monkeypatch.setattr(module.DocumentService, "update_by_id", lambda *_args, **_kwargs: False)
_patch_docstore(monkeypatch, module, update=lambda *_args, **_kwargs: None, delete=lambda *_args, **_kwargs: None)
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"enabled": 1}))
res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1"))
assert "Document update" in res["message"]
monkeypatch.setattr(module.DocumentService, "update_by_id", lambda *_args, **_kwargs: True)
_patch_docstore(monkeypatch, module, update=lambda *_args, **_kwargs: (_ for _ in ()).throw(RuntimeError("boom")), delete=lambda *_args, **_kwargs: None)
monkeypatch.setattr(module, "server_error_response", lambda e: {"code": 500, "message": str(e)})
res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1"))
assert res["code"] == 500
assert "boom" in res["message"]
monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _id: (False, None))
_patch_docstore(monkeypatch, module, update=lambda *_args, **_kwargs: None, delete=lambda *_args, **_kwargs: None)
res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1"))
assert res["message"] == "Dataset created failed"
# cover token reset + docStore deletion branch
doc_reset = _DummyDoc(token_num=3, chunk_num=2, parser_id="naive", run=0)
monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [doc_reset])
monkeypatch.setattr(module.DocumentService, "update_by_id", lambda *_args, **_kwargs: True)
monkeypatch.setattr(module.DocumentService, "increment_chunk_num", lambda *_args, **_kwargs: True)
monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _id: (True, doc_reset))
_patch_docstore(monkeypatch, module, delete=lambda *_args, **_kwargs: None, update=lambda *_args, **_kwargs: None)
monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"chunk_method": "manual"}))
res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1"))
assert res["code"] == 0
def _raise_operational_error(_id):
raise module.OperationalError("db down")
monkeypatch.setattr(module.DocumentService, "get_by_id", _raise_operational_error)
res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1"))
assert res["message"] == "Database operation failed"
def test_download_and_download_doc_errors(self, monkeypatch):
module = _load_doc_module(monkeypatch)
_patch_send_file(monkeypatch, module)

View File

@@ -21,16 +21,17 @@ from configs import DOCUMENT_NAME_LIMIT, INVALID_API_TOKEN, INVALID_ID_32
from libs.auth import RAGFlowHttpApiAuth
from configs import DEFAULT_PARSER_CONFIG
@pytest.mark.p1
class TestAuthorization:
@pytest.mark.parametrize(
"invalid_auth, expected_code, expected_message",
[
(None, 0, "`Authorization` can't be empty"),
(None, 401, "<Unauthorized '401: Unauthorized'>"),
(
RAGFlowHttpApiAuth(INVALID_API_TOKEN),
109,
"Authentication error: API key is invalid!",
401,
"<Unauthorized '401: Unauthorized'>",
),
],
)
@@ -53,13 +54,13 @@ class TestDocumentsUpdated:
),
(
0,
100,
"""AttributeError(\'int\' object has no attribute \'encode\')""",
102,
"Field: <name> - Message: <Input should be a valid string> - Value: <0>",
),
(
None,
100,
"""AttributeError(\'NoneType\' object has no attribute \'encode\')""",
"AttributeError('NoneType' object has no attribute 'encode')",
),
(
"",
@@ -93,7 +94,7 @@ class TestDocumentsUpdated:
else:
assert res["message"] == expected_message
@pytest.mark.p3
@pytest.mark.p2
@pytest.mark.parametrize(
"document_id, expected_code, expected_message",
[
@@ -110,7 +111,7 @@ class TestDocumentsUpdated:
assert res["code"] == expected_code
assert res["message"] == expected_message
@pytest.mark.p3
@pytest.mark.p2
@pytest.mark.parametrize(
"dataset_id, expected_code, expected_message",
[
@@ -127,10 +128,28 @@ class TestDocumentsUpdated:
assert res["code"] == expected_code
assert res["message"] == expected_message
@pytest.mark.p3
@pytest.mark.p2
@pytest.mark.parametrize(
"meta_fields, expected_code, expected_message",
[({"test": "test"}, 0, ""), ("test", 102, "meta_fields must be a dictionary")],
[
# Valid meta_fields
({"test": "test"}, 0, ""),
# Valid meta_fields with various types
({"author": "alice", "year": 2024}, 0, ""),
({"tags": ["tag1", "tag2"]}, 0, ""),
({"count": 42, "price": 19.99}, 0, ""),
# Invalid type - string instead of dict
("test", 102, "Field: <meta_fields> - Message: <Input should be a valid dictionary> - Value: <test>"),
# Invalid type - list instead of dict
([], 102, "Field: <meta_fields> - Message: <Input should be a valid dictionary> - Value: <[]>"),
# Invalid - list containing objects (unsupported type in list)
({"tags": [{"x": {"a": "b"}}]}, 102, "Field: <meta_fields> - Message: <The type is not supported in list: [{'x': {'a': 'b'}}]> - Value: <{'tags': [{'x': {'a': 'b'}}]}>"),
({"tags": [{"x": 1}]}, 102, "Field: <meta_fields> - Message: <The type is not supported in list: [{'x': 1}]> - Value: <{'tags': [{'x': 1}]}>"),
# Invalid - nested object with unsupported type
({"obj": {"x": 1}}, 102, "Field: <meta_fields> - Message: <The type is not supported: {'x': 1}> - Value: <{'obj': {'x': 1}}>"),
# Valid types of list
({"tags": [2, 1]}, 0, ""),
],
)
def test_meta_fields(self, HttpApiAuth, add_documents, meta_fields, expected_code, expected_message):
dataset_id, document_ids = add_documents
@@ -139,7 +158,22 @@ class TestDocumentsUpdated:
res = list_documents(HttpApiAuth, dataset_id, {"id": document_ids[0]})
assert res["data"]["docs"][0]["meta_fields"] == meta_fields
else:
assert res["message"] == expected_message
assert expected_message in res["message"] or res["message"] == expected_message
@pytest.mark.p2
@pytest.mark.parametrize(
"meta_fields, expected_code, expected_message",
[
# Test with invalid document ID (not owned by dataset)
({"author": "alice"}, 102, "The dataset doesn't own the document."),
],
)
def test_meta_fields_invalid_document(self, HttpApiAuth, add_documents, meta_fields, expected_code, expected_message):
"""Test meta_fields update with invalid document ID"""
dataset_id, _ = add_documents
res = update_document(HttpApiAuth, dataset_id, "invalid_doc_id_12345678901234567890", {"meta_fields": meta_fields})
assert res["code"] == expected_code
assert expected_message in res["message"]
@pytest.mark.p2
@pytest.mark.parametrize(
@@ -297,6 +331,30 @@ class TestDocumentsUpdated:
assert res["code"] == expected_code
assert res["message"] == expected_message
@pytest.mark.p2
@pytest.mark.parametrize(
"payload, expected_code, expected_message",
[
({"chunk_count": 100}, 102, "Can't change `chunk_count`."),
({"token_count": 100}, 102, "Can't change `token_count`."),
({"progress": 2.0}, 102, "Field: <progress> - Message: <Input should be less than or equal to 1> - Value: <2.0>"),
({"progress": 1.0}, 102, "Can't change `progress`."),
({"meta_fields": []}, 102, "Field: <meta_fields> - Message: <Input should be a valid dictionary> - Value: <[]>"),
],
)
def test_update_doc_guards_and_error_paths(self, HttpApiAuth, add_documents, payload, expected_code, expected_message):
"""
Test various guard conditions and error paths for document update functionality.
This includes testing for invalid dataset ownership, document ownership,
immutable fields, and validation errors.
"""
dataset_id, document_ids = add_documents
document_id = document_ids[0]
res = update_document(HttpApiAuth, dataset_id, document_id, payload)
assert res["code"] == expected_code
if expected_message:
assert expected_message in res["message"] or res["message"] == expected_message
DEFAULT_PARSER_CONFIG_FOR_TEST = {
@@ -328,6 +386,7 @@ DEFAULT_PARSER_CONFIG_FOR_TEST = {
},
}
class TestUpdateDocumentParserConfig:
@pytest.mark.p2
@pytest.mark.parametrize(
@@ -343,37 +402,32 @@ class TestUpdateDocumentParserConfig:
pytest.param(
"naive",
{"chunk_token_num": -1},
100,
"AssertionError('chunk_token_num should be in range from 1 to 100000000')",
marks=pytest.mark.skip(reason="issues/6098"),
102,
"Field: <parser_config.chunk_token_num> - Message: <Input should be greater than or equal to 1> - Value: <-1>",
),
pytest.param(
"naive",
{"chunk_token_num": 0},
100,
"AssertionError('chunk_token_num should be in range from 1 to 100000000')",
marks=pytest.mark.skip(reason="issues/6098"),
102,
"Field: <parser_config.chunk_token_num> - Message: <Input should be greater than or equal to 1> - Value: <0>",
),
pytest.param(
"naive",
{"chunk_token_num": 100000000},
100,
"AssertionError('chunk_token_num should be in range from 1 to 100000000')",
marks=pytest.mark.skip(reason="issues/6098"),
102,
"Field: <parser_config.chunk_token_num> - Message: <Input should be less than or equal to 2048> - Value: <100000000>",
),
pytest.param(
"naive",
{"chunk_token_num": 3.14},
102,
"",
marks=pytest.mark.skip(reason="issues/6098"),
"Field: <parser_config.chunk_token_num> - Message: <Input should be a valid integer> - Value: <3.14>",
),
pytest.param(
"naive",
{"chunk_token_num": "1024"},
100,
"",
marks=pytest.mark.skip(reason="issues/6098"),
102,
"Field: <parser_config.chunk_token_num> - Message: <Input should be a valid integer> - Value: <1024>",
),
(
"naive",
@@ -392,148 +446,135 @@ class TestUpdateDocumentParserConfig:
pytest.param(
"naive",
{"html4excel": 1},
100,
"AssertionError('html4excel should be True or False')",
marks=pytest.mark.skip(reason="issues/6098"),
102,
"Field: <parser_config.html4excel> - Message: <Input should be a valid boolean> - Value: <1>",
),
("naive", {"delimiter": ""}, 102, "Field: <parser_config.delimiter> - Message: <String should have at least 1 character> - Value: <>"),
("naive", {"delimiter": "`##`"}, 0, ""),
pytest.param(
"naive",
{"delimiter": 1},
100,
"",
marks=pytest.mark.skip(reason="issues/6098"),
102,
"Field: <parser_config.delimiter> - Message: <Input should be a valid string> - Value: <1>",
),
pytest.param(
"naive",
{"task_page_size": -1},
100,
"AssertionError('task_page_size should be in range from 1 to 100000000')",
marks=pytest.mark.skip(reason="issues/6098"),
102,
"Field: <parser_config.task_page_size> - Message: <Input should be greater than or equal to 1> - Value: <-1>",
),
pytest.param(
"naive",
{"task_page_size": 0},
100,
"AssertionError('task_page_size should be in range from 1 to 100000000')",
marks=pytest.mark.skip(reason="issues/6098"),
102,
"Field: <parser_config.task_page_size> - Message: <Input should be greater than or equal to 1> - Value: <0>",
),
pytest.param(
"naive",
{"task_page_size": 100000000},
100,
"AssertionError('task_page_size should be in range from 1 to 100000000')",
marks=pytest.mark.skip(reason="issues/6098"),
0,
"",
),
pytest.param(
"naive",
{"task_page_size": 3.14},
100,
"",
marks=pytest.mark.skip(reason="issues/6098"),
102,
"Field: <parser_config.task_page_size> - Message: <Input should be a valid integer> - Value: <3.14>",
),
pytest.param(
"naive",
{"task_page_size": "1024"},
100,
"",
marks=pytest.mark.skip(reason="issues/6098"),
102,
"Field: <parser_config.task_page_size> - Message: <Input should be a valid integer> - Value: <1024>",
),
(
"naive",
{
"raptor": {
"use_raptor": {"a": "b"},
}
},
102,
"Field: <parser_config.raptor.use_raptor> - Message: <Input should be a valid boolean> - Value: <{'a': 'b'}>",
),
("naive", {"raptor": {"use_raptor": {
"a": "b"
},}}, 102, "Field: <parser_config.raptor.use_raptor> - Message: <Input should be a valid boolean> - Value: <{'a': 'b'}>"),
("naive", {"raptor": {"use_raptor": False}}, 0, ""),
pytest.param(
"naive",
{"invalid_key": "invalid_value"},
100,
"""AssertionError("Abnormal \'parser_config\'. Invalid key: invalid_key")""",
marks=pytest.mark.skip(reason="issues/6098"),
102,
"Field: <parser_config.invalid_key> - Message: <Extra inputs are not permitted> - Value: <invalid_value>",
),
pytest.param(
"naive",
{"auto_keywords": -1},
100,
"AssertionError('auto_keywords should be in range from 0 to 32')",
marks=pytest.mark.skip(reason="issues/6098"),
102,
"Field: <parser_config.auto_keywords> - Message: <Input should be greater than or equal to 0> - Value: <-1>",
),
pytest.param(
"naive",
{"auto_keywords": 32},
100,
"AssertionError('auto_keywords should be in range from 0 to 32')",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"auto_questions": 3.14},
100,
0,
"",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"auto_keywords": "1024"},
100,
"",
marks=pytest.mark.skip(reason="issues/6098"),
102,
"Field: <parser_config.auto_keywords> - Message: <Input should be a valid integer> - Value: <1024>",
),
pytest.param(
"naive",
{"auto_keywords": 3.14},
102,
"Field: <parser_config.auto_keywords> - Message: <Input should be a valid integer> - Value: <3.14>",
),
pytest.param(
"naive",
{"auto_questions": -1},
100,
"AssertionError('auto_questions should be in range from 0 to 10')",
marks=pytest.mark.skip(reason="issues/6098"),
102,
"Field: <parser_config.auto_questions> - Message: <Input should be greater than or equal to 0> - Value: <-1>",
),
pytest.param(
"naive",
{"auto_questions": 10},
100,
"AssertionError('auto_questions should be in range from 0 to 10')",
marks=pytest.mark.skip(reason="issues/6098"),
0,
"",
),
pytest.param(
"naive",
{"auto_questions": 3.14},
100,
"",
marks=pytest.mark.skip(reason="issues/6098"),
102,
"Field: <parser_config.auto_questions> - Message: <Input should be a valid integer> - Value: <3.14>",
),
pytest.param(
"naive",
{"auto_questions": "1024"},
100,
"",
marks=pytest.mark.skip(reason="issues/6098"),
102,
"Field: <parser_config.auto_questions> - Message: <Input should be a valid integer> - Value: <1024>",
),
pytest.param(
"naive",
{"topn_tags": -1},
100,
"AssertionError('topn_tags should be in range from 0 to 10')",
marks=pytest.mark.skip(reason="issues/6098"),
102,
"Field: <parser_config.topn_tags> - Message: <Input should be greater than or equal to 1> - Value: <-1>",
),
pytest.param(
"naive",
{"topn_tags": 10},
100,
"AssertionError('topn_tags should be in range from 0 to 10')",
marks=pytest.mark.skip(reason="issues/6098"),
0,
"",
),
pytest.param(
"naive",
{"topn_tags": 3.14},
100,
"",
marks=pytest.mark.skip(reason="issues/6098"),
102,
"Field: <parser_config.topn_tags> - Message: <Input should be a valid integer> - Value: <3.14>",
),
pytest.param(
"naive",
{"topn_tags": "1024"},
100,
"",
marks=pytest.mark.skip(reason="issues/6098"),
102,
"Field: <parser_config.topn_tags> - Message: <Input should be a valid integer> - Value: <1024>",
),
],
)

View File

@@ -67,7 +67,7 @@ class TestDocumentsUpdated:
updated_doc = dataset.list_documents(id=document.id)[0]
assert updated_doc.name == name, str(updated_doc)
@pytest.mark.p3
@pytest.mark.p2
@pytest.mark.parametrize(
"meta_fields, expected_message",
[
@@ -238,27 +238,81 @@ class TestDocumentsUpdated:
document.update(payload)
assert expected_message in str(exception_info.value), str(exception_info.value)
@pytest.mark.p3
def test_immutable_fields_chunk_count(self, add_document):
document, _ = add_document # Unpack the tuple to get the document object
with pytest.raises(Exception) as exception_info:
document.update({"chunk_count": 999}) # Attempt to change immutable field
assert "Can't change `chunk_count`" in str(exception_info.value), str(exception_info.value)
@pytest.mark.p2
@pytest.mark.parametrize(
"payload, expected_message",
[
({"chunk_count": 1}, "Can't change `chunk_count`"),
],
)
def test_immutable_fields_chunk_count(self, add_documents, payload, expected_message):
_, documents = add_documents
document = documents[0]
@pytest.mark.p3
def test_immutable_fields_token_count(self, add_document):
document, _ = add_document # Unpack the tuple to get the document object
with pytest.raises(Exception) as exception_info:
document.update({"token_count": 9999}) # Attempt to change immutable field
assert "Can't change `token_num`" in str(exception_info.value), str(exception_info.value)
document.update(payload)
assert expected_message in str(exception_info.value), str(exception_info.value)
@pytest.mark.p2
@pytest.mark.parametrize(
"payload, expected_message",
[
({"token_count": 9999}, "Can't change `token_count`"), # Attempt to change immutable field
],
)
def test_immutable_fields_token_count(self, add_documents, payload, expected_message):
_, documents = add_documents
document = documents[0]
@pytest.mark.p3
def test_immutable_fields_progress(self, add_document):
document, _ = add_document # Unpack the tuple to get the document object
with pytest.raises(Exception) as exception_info:
document.update({"progress": 0.5}) # Attempt to change immutable field
assert "Can't change `progress`" in str(exception_info.value), str(exception_info.value)
document.update(payload)
assert expected_message in str(exception_info.value), str(exception_info.value)
@pytest.mark.p2
@pytest.mark.parametrize(
"payload, expected_message",
[
({"progress": 0.5}, "Can't change `progress`"), # Attempt to change immutable field
({"progress": 1.5}, "Field: <progress> - Message: <Input should be less than or equal to 1> - Value: <1.5>"), # Attempt to change immutable field
],
)
def test_immutable_fields_progress(self, add_documents, payload, expected_message):
_, documents = add_documents
document = documents[0]
with pytest.raises(Exception) as exception_info:
document.update(payload)
assert expected_message in str(exception_info.value), str(exception_info.value)
DEFAULT_PARSER_CONFIG_FOR_TEST = {
"layout_recognize": "DeepDOC",
"chunk_token_num": 512,
"delimiter": "\n",
"auto_keywords": 0,
"auto_questions": 0,
"html4excel": False,
"topn_tags": 3,
"raptor": {
"use_raptor": True,
"prompt": "Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n {cluster_content}\nThe above is the content you need to summarize.",
"max_token": 256,
"threshold": 0.1,
"max_cluster": 64,
"random_seed": 0,
},
"graphrag": {
"use_graphrag": True,
"entity_types": [
"organization",
"person",
"geo",
"event",
"category",
],
"method": "light",
},
}
class TestUpdateDocumentParserConfig:
@pytest.mark.p2
@@ -268,15 +322,14 @@ class TestUpdateDocumentParserConfig:
("naive", {}, ""),
pytest.param(
"naive",
DEFAULT_PARSER_CONFIG,
DEFAULT_PARSER_CONFIG_FOR_TEST,
"",
marks=pytest.mark.skip(reason="DEFAULT_PARSER_CONFIG contains fields not allowed in document update API"),
),
pytest.param(
"naive",
{"chunk_token_num": -1},
"chunk_token_num should be in range from 1 to 100000000",
marks=pytest.mark.skip(reason="issues/6098"),
"Field: <parser_config.chunk_token_num> - Message: <Input should be greater than or equal to 1> - Value: <-1>",
),
(
"naive",
@@ -327,8 +380,7 @@ class TestUpdateDocumentParserConfig:
pytest.param(
"naive",
{"task_page_size": 100000000},
"task_page_size should be in range from 1 to 100000000",
marks=pytest.mark.skip(reason="API validation differs from expected message"),
"",
),
(
"naive",
@@ -360,8 +412,7 @@ class TestUpdateDocumentParserConfig:
pytest.param(
"naive",
{"auto_keywords": 32},
"auto_keywords should be in range from 0 to 32",
marks=pytest.mark.skip(reason="API validation differs from expected message"),
"",
),
(
"naive",
@@ -381,8 +432,7 @@ class TestUpdateDocumentParserConfig:
pytest.param(
"naive",
{"auto_questions": 10},
"auto_questions should be in range from 0 to 10",
marks=pytest.mark.skip(reason="API validation differs from expected message"),
"",
),
(
"naive",
@@ -402,8 +452,7 @@ class TestUpdateDocumentParserConfig:
pytest.param(
"naive",
{"topn_tags": 10},
"topn_tags should be in range from 0 to 10",
marks=pytest.mark.skip(reason="API validation differs from expected message"),
"",
),
(
"naive",

View File

@@ -435,16 +435,6 @@ def document_change_status(auth, payload=None, *, headers=HEADERS, data=None):
return res.json()
def document_rename(auth, payload=None, *, headers=HEADERS, data=None):
res = requests.post(url=f"{HOST_ADDRESS}{DOCUMENT_APP_URL}/rename", headers=headers, auth=auth, json=payload, data=data)
return res.json()
def document_set_meta(auth, payload=None, *, headers=HEADERS, data=None):
res = requests.post(url=f"{HOST_ADDRESS}{DOCUMENT_APP_URL}/set_meta", headers=headers, auth=auth, json=payload, data=data)
return res.json()
def bulk_upload_documents(auth, kb_id, num, tmp_path):
fps = []
for i in range(num):

View File

@@ -22,8 +22,6 @@ from common import (
document_filter,
document_infos,
document_metadata_summary,
document_rename,
document_set_meta,
document_update_metadata_setting,
)
from configs import INVALID_API_TOKEN
@@ -82,21 +80,6 @@ class TestAuthorization:
assert res["code"] == expected_code, res
assert expected_fragment in res["message"], res
@pytest.mark.p2
@pytest.mark.parametrize("invalid_auth, expected_code, expected_fragment", INVALID_AUTH_CASES)
def test_rename_auth_invalid(self, invalid_auth, expected_code, expected_fragment):
res = document_rename(invalid_auth, {"doc_id": "doc_id", "name": "rename.txt"})
assert res["code"] == expected_code, res
assert expected_fragment in res["message"], res
@pytest.mark.p2
@pytest.mark.parametrize("invalid_auth, expected_code, expected_fragment", INVALID_AUTH_CASES)
def test_set_meta_auth_invalid(self, invalid_auth, expected_code, expected_fragment):
res = document_set_meta(invalid_auth, {"doc_id": "doc_id", "meta": "{}"})
assert res["code"] == expected_code, res
assert expected_fragment in res["message"], res
class TestDocumentMetadata:
@pytest.mark.p2
def test_filter(self, WebApiAuth, add_dataset_func):
@@ -163,28 +146,6 @@ class TestDocumentMetadata:
assert info_res["code"] == 0, info_res
assert info_res["data"][0]["status"] == "1", info_res
@pytest.mark.p2
def test_rename(self, WebApiAuth, add_document_func):
_, doc_id = add_document_func
name = f"renamed_{doc_id}.txt"
res = document_rename(WebApiAuth, {"doc_id": doc_id, "name": name})
assert res["code"] == 0, res
assert res["data"] is True, res
info_res = document_infos(WebApiAuth, {"doc_ids": [doc_id]})
assert info_res["code"] == 0, info_res
assert info_res["data"][0]["name"] == name, info_res
@pytest.mark.p2
def test_set_meta(self, WebApiAuth, add_document_func):
_, doc_id = add_document_func
res = document_set_meta(WebApiAuth, {"doc_id": doc_id, "meta": "{\"author\": \"alice\"}"})
assert res["code"] == 0, res
assert res["data"] is True, res
info_res = document_infos(WebApiAuth, {"doc_ids": [doc_id]})
assert info_res["code"] == 0, info_res
meta_fields = info_res["data"][0].get("meta_fields", {})
assert meta_fields.get("author") == "alice", info_res
class TestDocumentMetadataNegative:
@pytest.mark.p3
@@ -231,20 +192,6 @@ class TestDocumentMetadataNegative:
assert res["code"] == 101, res
assert "Status" in res["message"], res
@pytest.mark.p3
def test_rename_extension_mismatch(self, WebApiAuth, add_document_func):
_, doc_id = add_document_func
res = document_rename(WebApiAuth, {"doc_id": doc_id, "name": "renamed.pdf"})
assert res["code"] == 101, res
assert "extension" in res["message"], res
@pytest.mark.p3
def test_set_meta_invalid_type(self, WebApiAuth, add_document_func):
_, doc_id = add_document_func
res = document_set_meta(WebApiAuth, {"doc_id": doc_id, "meta": "[]"})
assert res["code"] == 101, res
assert "dictionary" in res["message"], res
def _run(coro):
return asyncio.run(coro)
@@ -590,87 +537,6 @@ class TestDocumentMetadataUnit:
assert res["code"] == 0
assert res["data"]["doc1"]["status"] == "1"
def test_rename_branch_matrix_and_exception_unit(self, document_app_module, monkeypatch):
module = document_app_module
file_updates = []
es_updates = []
async def fake_thread_pool_exec(func, *_args, **_kwargs):
return func()
monkeypatch.setattr(module, "thread_pool_exec", fake_thread_pool_exec)
monkeypatch.setattr(module.DocumentService, "get_tenant_id", lambda _doc_id: "tenant1")
monkeypatch.setattr(module.rag_tokenizer, "tokenize", lambda _name: ["token"])
monkeypatch.setattr(module.rag_tokenizer, "fine_grained_tokenize", lambda _tokens: ["fine"])
monkeypatch.setattr(module, "server_error_response", lambda e: {"code": 500, "message": str(e)})
class _DocStore:
def index_exist(self, _index_name, _kb_id):
return True
def update(self, where, payload, _index_name, _kb_id):
es_updates.append((where, payload))
monkeypatch.setattr(module.settings, "docStoreConn", _DocStore())
monkeypatch.setattr(module.search, "index_name", lambda tenant_id: f"idx_{tenant_id}")
def set_req(name):
async def fake_request_json():
return {"doc_id": "doc1", "name": name}
monkeypatch.setattr(module, "get_request_json", fake_request_json)
set_req("renamed.txt")
monkeypatch.setattr(module.DocumentService, "accessible", lambda *_args, **_kwargs: False)
res = _run(module.rename.__wrapped__())
assert res["code"] == module.RetCode.AUTHENTICATION_ERROR
monkeypatch.setattr(module.DocumentService, "accessible", lambda *_args, **_kwargs: True)
monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (False, None))
res = _run(module.rename.__wrapped__())
assert res["code"] == module.RetCode.DATA_ERROR
assert "Document not found!" in res["message"]
monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (True, SimpleNamespace(id="doc1", name="origin.txt", kb_id="kb1")))
set_req("renamed.pdf")
res = _run(module.rename.__wrapped__())
assert res["code"] == module.RetCode.ARGUMENT_ERROR
assert "extension" in res["message"]
too_long = "a" * (module.FILE_NAME_LEN_LIMIT + 1) + ".txt"
set_req(too_long)
res = _run(module.rename.__wrapped__())
assert res["code"] == module.RetCode.ARGUMENT_ERROR
assert "bytes or less" in res["message"]
set_req("dup.txt")
monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [SimpleNamespace(name="dup.txt")])
res = _run(module.rename.__wrapped__())
assert res["code"] == module.RetCode.DATA_ERROR
assert "Duplicated document name" in res["message"]
set_req("ok.txt")
monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [])
monkeypatch.setattr(module.DocumentService, "update_by_id", lambda *_args, **_kwargs: True)
monkeypatch.setattr(module.File2DocumentService, "get_by_document_id", lambda _doc_id: [SimpleNamespace(file_id="file1")])
monkeypatch.setattr(module.FileService, "get_by_id", lambda _file_id: (True, SimpleNamespace(id="file1")))
monkeypatch.setattr(module.FileService, "update_by_id", lambda file_id, payload: file_updates.append((file_id, payload)))
res = _run(module.rename.__wrapped__())
assert res["code"] == 0
assert file_updates == [("file1", {"name": "ok.txt"})]
assert es_updates[0][0] == {"doc_id": "doc1"}
assert es_updates[0][1]["docnm_kwd"] == "ok.txt"
assert es_updates[0][1]["title_tks"] == ["token"]
assert es_updates[0][1]["title_sm_tks"] == ["fine"]
def raise_db_error(*_args, **_kwargs):
raise RuntimeError("rename boom")
monkeypatch.setattr(module.DocumentService, "update_by_id", raise_db_error)
res = _run(module.rename.__wrapped__())
assert res["code"] == 500
assert "rename boom" in res["message"]
def test_get_route_not_found_success_and_exception_unit(self, document_app_module, monkeypatch):
module = document_app_module
monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (False, None))
@@ -932,50 +798,3 @@ class TestDocumentMetadataUnit:
res = _run(module.get_image("bucket-name"))
assert res["code"] == 500
assert "image boom" in res["message"]
def test_set_meta_validation_and_persistence_matrix_unit(self, document_app_module, monkeypatch):
module = document_app_module
def set_req(payload):
async def fake_request_json():
return payload
monkeypatch.setattr(module, "get_request_json", fake_request_json)
set_req({"doc_id": "doc1", "meta": "{}"})
monkeypatch.setattr(module.DocumentService, "accessible", lambda *_args, **_kwargs: False)
res = _run(module.set_meta.__wrapped__())
assert res["code"] == module.RetCode.AUTHENTICATION_ERROR
monkeypatch.setattr(module.DocumentService, "accessible", lambda *_args, **_kwargs: True)
set_req({"doc_id": "doc1", "meta": "[]"})
res = _run(module.set_meta.__wrapped__())
assert res["code"] == module.RetCode.ARGUMENT_ERROR
assert "Only dictionary type supported." in res["message"]
set_req({"doc_id": "doc1", "meta": '{"tags":[{"x":1}]}'})
res = _run(module.set_meta.__wrapped__())
assert res["code"] == module.RetCode.ARGUMENT_ERROR
assert "The type is not supported in list" in res["message"]
set_req({"doc_id": "doc1", "meta": '{"obj":{"x":1}}'})
res = _run(module.set_meta.__wrapped__())
assert res["code"] == module.RetCode.ARGUMENT_ERROR
assert "The type is not supported" in res["message"]
set_req({"doc_id": "doc1", "meta": "{"})
res = _run(module.set_meta.__wrapped__())
assert res["code"] == module.RetCode.ARGUMENT_ERROR
assert "Json syntax error:" in res["message"]
set_req({"doc_id": "doc1", "meta": '{"author":"alice"}'})
monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (False, None))
res = _run(module.set_meta.__wrapped__())
assert res["code"] == module.RetCode.DATA_ERROR
assert "Document not found!" in res["message"]
monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (True, SimpleNamespace(id="doc1")))
monkeypatch.setattr(module.DocMetadataService, "update_document_metadata", lambda *_args, **_kwargs: False)
res = _run(module.set_meta.__wrapped__())
assert res["code"] == module.RetCode.DATA_ERROR
assert "Database error (meta updates)!" in res["message"]

View File

@@ -15,7 +15,10 @@ import {
} from '@/interfaces/request/document';
import i18n from '@/locales/config';
import { EMPTY_METADATA_FIELD } from '@/pages/dataset/dataset/use-select-filters';
import kbService, { listDocument } from '@/services/knowledge-service';
import kbService, {
listDocument,
renameDocument,
} from '@/services/knowledge-service';
import api, { restAPIv1, webAPI } from '@/utils/api';
import { getSearchValue } from '@/utils/common-util';
import { buildChunkHighlights } from '@/utils/document-util';
@@ -336,12 +339,13 @@ export const useSaveDocumentName = () => {
mutationFn: async ({
name,
documentId,
kbId,
}: {
name: string;
documentId: string;
kbId: string;
}) => {
const { data } = await kbService.document_rename({
doc_id: documentId,
const { data } = await renameDocument(kbId, documentId, {
name: name,
});
if (data.code === 0) {

View File

@@ -15,14 +15,18 @@ export const useRenameDocument = () => {
const onRenameOk = useCallback(
async (name: string) => {
if (record?.id) {
const ret = await saveName({ documentId: record.id, name });
if (record?.id && record?.kb_id) {
const ret = await saveName({
documentId: record.id,
name,
kbId: record.kb_id,
});
if (ret === 0) {
hideRenameModal();
}
}
},
[record?.id, saveName, hideRenameModal],
[record?.id, record?.kb_id, saveName, hideRenameModal],
);
const handleShow = useCallback(

View File

@@ -77,7 +77,7 @@ const methods = {
},
document_rename: {
url: document_rename,
method: 'post',
method: 'put',
},
document_create: {
url: document_create,
@@ -251,6 +251,12 @@ export const listDocument = (
export const documentFilter = (kb_id: string) =>
request.post(api.get_dataset_filter, { kb_id });
export const renameDocument = (
datasetId: string,
documentId: string,
data: { name?: string },
) => request.put(api.document_rename(datasetId, documentId), { data });
export const getMetaDataService = ({
kb_id,
doc_ids,

View File

@@ -109,7 +109,8 @@ export default {
document_change_status: `${webAPI}/document/change_status`,
document_rm: `${webAPI}/document/rm`,
document_delete: `${webAPI}/api/document`,
document_rename: `${webAPI}/document/rename`,
document_rename: (datasetId: string, documentId: string) =>
`${restAPIv1}/datasets/${datasetId}/documents/${documentId}`,
document_create: `${webAPI}/document/create`,
document_run: `${webAPI}/document/run`,
document_change_parser: `${webAPI}/document/change_parser`,