diff --git a/api/apps/document_app.py b/api/apps/document_app.py index 580eef6e55..1f443374fb 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -15,7 +15,6 @@ # import json import os.path -import pathlib import re from pathlib import Path, PurePosixPath, PureWindowsPath @@ -49,7 +48,7 @@ from common.file_utils import get_project_base_directory from common.metadata_utils import convert_conditions, meta_filter, turn2jsonschema from common.misc_utils import get_uuid, thread_pool_exec from deepdoc.parser.html_parser import RAGFlowHtmlParser -from rag.nlp import rag_tokenizer, search +from rag.nlp import search def _is_safe_download_filename(name: str) -> bool: @@ -666,61 +665,6 @@ async def run(): except Exception as e: return server_error_response(e) - -@manager.route("/rename", methods=["POST"]) # noqa: F821 -@login_required -@validate_request("doc_id", "name") -async def rename(): - req = await get_request_json() - uid = current_user.id - try: - - def _rename_sync(): - if not DocumentService.accessible(req["doc_id"], uid): - return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR) - - e, doc = DocumentService.get_by_id(req["doc_id"]) - if not e: - return get_data_error_result(message="Document not found!") - if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(doc.name.lower()).suffix: - return get_json_result(data=False, message="The extension of file can't be changed", code=RetCode.ARGUMENT_ERROR) - if len(req["name"].encode("utf-8")) > FILE_NAME_LEN_LIMIT: - return get_json_result(data=False, message=f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.", code=RetCode.ARGUMENT_ERROR) - - for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id): - if d.name == req["name"]: - return get_data_error_result(message="Duplicated document name in the same dataset.") - - if not DocumentService.update_by_id(req["doc_id"], {"name": req["name"]}): - return get_data_error_result(message="Database error (Document rename)!") - - informs = File2DocumentService.get_by_document_id(req["doc_id"]) - if informs: - e, file = FileService.get_by_id(informs[0].file_id) - FileService.update_by_id(file.id, {"name": req["name"]}) - - tenant_id = DocumentService.get_tenant_id(req["doc_id"]) - title_tks = rag_tokenizer.tokenize(req["name"]) - es_body = { - "docnm_kwd": req["name"], - "title_tks": title_tks, - "title_sm_tks": rag_tokenizer.fine_grained_tokenize(title_tks), - } - if settings.docStoreConn.index_exist(search.index_name(tenant_id), doc.kb_id): - settings.docStoreConn.update( - {"doc_id": req["doc_id"]}, - es_body, - search.index_name(tenant_id), - doc.kb_id, - ) - return get_json_result(data=True) - - return await thread_pool_exec(_rename_sync) - - except Exception as e: - return server_error_response(e) - - @manager.route("/get/", methods=["GET"]) # noqa: F821 @login_required async def get(doc_id): diff --git a/api/apps/restful_apis/document_api.py b/api/apps/restful_apis/document_api.py new file mode 100644 index 0000000000..669f895a96 --- /dev/null +++ b/api/apps/restful_apis/document_api.py @@ -0,0 +1,146 @@ +# +# Copyright 2026 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import logging + +from peewee import OperationalError +from pydantic import ValidationError + +from api.apps.services.document_api_service import rename_doc_key, validate_document_update_fields, \ + update_document_name_only, update_chunk_method_only, update_document_status_only +from api.db.services.doc_metadata_service import DocMetadataService +from api.db.services.document_service import DocumentService +from api.db.services.knowledgebase_service import KnowledgebaseService +from common.constants import RetCode +from api.apps import login_required +from api.utils.api_utils import get_error_data_result, get_result, add_tenant_id_to_kwargs, get_request_json +from api.utils.validation_utils import ( + UpdateDocumentReq, format_validation_error_message, +) + +@manager.route("/datasets//documents/", methods=["PUT"]) # noqa: F821 +@login_required +@add_tenant_id_to_kwargs +async def update_document(tenant_id, dataset_id, document_id): + """ + Update a document within a dataset. + --- + tags: + - Documents + security: + - ApiKeyAuth: [] + parameters: + - in: path + name: dataset_id + type: string + required: true + description: ID of the dataset. + - in: path + name: document_id + type: string + required: true + description: ID of the document to update. + - in: header + name: Authorization + type: string + required: true + description: Bearer token for authentication. + - in: body + name: body + description: Document update parameters. + required: true + schema: + type: object + properties: + name: + type: string + description: New name of the document. + parser_config: + type: object + description: Parser configuration. + chunk_method: + type: string + description: Chunking method. + enabled: + type: boolean + description: Document status. + responses: + 200: + description: Document updated successfully. + schema: + type: object + """ + req = await get_request_json() + + # Verify ownership and existence of dataset and document + if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): + return get_error_data_result(message="You don't own the dataset.") + e, kb = KnowledgebaseService.get_by_id(dataset_id) + if not e: + return get_error_data_result(message="Can't find this dataset!") + + # Prepare data for validation + docs = DocumentService.query(kb_id=dataset_id, id=document_id) + if not docs: + return get_error_data_result(message="The dataset doesn't own the document.") + + # Validate document update request parameters + try: + update_doc_req = UpdateDocumentReq(**req) + except ValidationError as e: + return get_error_data_result(message=format_validation_error_message(e), code=RetCode.DATA_ERROR) + + doc = docs[0] + + # further check with inner status (from DB) + error_msg, error_code = validate_document_update_fields(update_doc_req, doc, req) + if error_msg: + return get_error_data_result(message=error_msg, code=error_code) + + # All validations passed, now perform all updates + # meta_fields provided, then update it + if "meta_fields" in req: + if not DocMetadataService.update_document_metadata(document_id, update_doc_req.meta_fields): + return get_error_data_result(message="Failed to update metadata") + # doc name provided from request and diff with existing value, update + if "name" in req and req["name"] != doc.name: + if error := update_document_name_only(document_id, req["name"]): + return error + + # parser config provided (already validated in UpdateDocumentReq), update it + if update_doc_req.parser_config: + DocumentService.update_parser_config(doc.id, req["parser_config"]) + + # chunk method provided - the update method will check if it's different with existing one + if update_doc_req.chunk_method: + if error := update_chunk_method_only(req, doc, dataset_id, tenant_id): + return error + + if "enabled" in req: # already checked in UpdateDocumentReq - it's int if it's present + # "enabled" flag provided, the update method will check if it's changed and then update if so + if error := update_document_status_only(int(req["enabled"]), doc, kb): + return error + + try: + original_doc_id = doc.id + ok, doc = DocumentService.get_by_id(doc.id) + if not ok: + return get_error_data_result(message=f"Can not get document by id:{original_doc_id}") + except OperationalError as e: + logging.exception(e) + return get_error_data_result(message="Database operation failed") + renamed_doc = rename_doc_key(doc) + return get_result(data=renamed_doc) + diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py index c01ac38485..72964ae35a 100644 --- a/api/apps/sdk/doc.py +++ b/api/apps/sdk/doc.py @@ -15,13 +15,11 @@ # import datetime import json -import logging import re from io import BytesIO import xxhash -from peewee import OperationalError -from pydantic import BaseModel, Field, validator, ValidationError +from pydantic import BaseModel, Field, validator from quart import request, send_file from api.constants import FILE_NAME_LEN_LIMIT @@ -35,10 +33,8 @@ from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.llm_service import LLMBundle from api.db.services.task_service import TaskService, cancel_all_task_of, queue_tasks from api.db.services.tenant_llm_service import TenantLLMService -from api.utils import validation_utils -from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_error_data_result, get_parser_config, get_request_json, get_result, server_error_response, token_required +from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_error_data_result, get_request_json, get_result, server_error_response, token_required from api.utils.image_utils import store_chunk_image -from api.utils.validation_utils import format_validation_error_message, UpdateDocumentReq from common import settings from common.constants import FileSource, LLMType, ParserType, RetCode, TaskStatus from common.metadata_utils import convert_conditions, meta_filter @@ -184,215 +180,6 @@ async def upload(dataset_id, tenant_id): renamed_doc_list.append(renamed_doc) return get_result(data=renamed_doc_list) - -def _update_document_name_only(document_id, req_doc_name): - """Update document name only (without validation).""" - if not DocumentService.update_by_id(document_id, {"name": req_doc_name}): - return get_error_data_result(message="Database error (Document rename)!") - - informs = File2DocumentService.get_by_document_id(document_id) - if informs: - e, file = FileService.get_by_id(informs[0].file_id) - FileService.update_by_id(file.id, {"name": req_doc_name}) - return None - -def _update_chunk_method_only(req, doc, dataset_id, tenant_id): - """Update chunk method only (without validation).""" - if doc.parser_id.lower() != req["chunk_method"].lower(): - # if chunk method changed - e = DocumentService.update_by_id( - doc.id, - { - "parser_id": req["chunk_method"], - "progress": 0, - "progress_msg": "", - "run": TaskStatus.UNSTART.value, - }, - ) - if not e: - return get_error_data_result(message="Document not found!") - if not req.get("parser_config"): - req["parser_config"] = get_parser_config(req["chunk_method"], req.get("parser_config")) - DocumentService.update_parser_config(doc.id, req["parser_config"]) - if doc.token_num > 0: - e = DocumentService.increment_chunk_num( - doc.id, - doc.kb_id, - doc.token_num * -1, - doc.chunk_num * -1, - doc.process_duration * -1, - ) - if not e: - return get_error_data_result(message="Document not found!") - settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), dataset_id) - return None - -def _update_document_status_only(status:int, doc, kb): - """Update document status only (without validation).""" - if doc.status is None or (int(doc.status) != status): - try: - if not DocumentService.update_by_id(doc.id, {"status": str(status)}): - return get_error_data_result(message="Database error (Document update)!") - settings.docStoreConn.update({"doc_id": doc.id}, {"available_int": status}, search.index_name(kb.tenant_id), doc.kb_id) - except Exception as e: - return server_error_response(e) - return None - - -@manager.route("/datasets//documents/", methods=["PUT"]) # noqa: F821 -@token_required -async def update_doc(tenant_id, dataset_id, document_id): - """ - Update a document within a dataset. - --- - tags: - - Documents - security: - - ApiKeyAuth: [] - parameters: - - in: path - name: dataset_id - type: string - required: true - description: ID of the dataset. - - in: path - name: document_id - type: string - required: true - description: ID of the document to update. - - in: header - name: Authorization - type: string - required: true - description: Bearer token for authentication. - - in: body - name: body - description: Document update parameters. - required: true - schema: - type: object - properties: - name: - type: string - description: New name of the document. - parser_config: - type: object - description: Parser configuration. - chunk_method: - type: string - description: Chunking method. - enabled: - type: boolean - description: Document status. - responses: - 200: - description: Document updated successfully. - schema: - type: object - """ - req = await get_request_json() - - # Verify ownership and existence of dataset and document - if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): - return get_error_data_result(message="You don't own the dataset.") - e, kb = KnowledgebaseService.get_by_id(dataset_id) - if not e: - return get_error_data_result(message="Can't find this dataset!") - - # Prepare data for validation - docs = DocumentService.query(kb_id=dataset_id, id=document_id) - if not docs: - return get_error_data_result(message="The dataset doesn't own the document.") - - # Validate document update request parameters - try: - update_doc_req = UpdateDocumentReq(**req) - except ValidationError as e: - return get_error_data_result(message=format_validation_error_message(e), code=RetCode.DATA_ERROR) - - doc = docs[0] - - # further check with inner status (from DB) - error_msg, error_code = _validate_document_update_fields(update_doc_req, doc, req) - if error_msg: - return get_error_data_result(message=error_msg, code=error_code) - - # All validations passed, now perform all updates - # meta_fields provided, then update it - if update_doc_req.meta_fields: - if not DocMetadataService.update_document_metadata(document_id, update_doc_req.meta_fields): - return get_error_data_result(message="Failed to update metadata") - # doc name provided from request and diff with existing value, update - if "name" in req and req["name"] != doc.name: - if error := _update_document_name_only(document_id, req["name"]): - return error - - # parser config provided (already validated in UpdateDocumentReq), update it - if update_doc_req.parser_config: - DocumentService.update_parser_config(doc.id, req["parser_config"]) - - # chunk method provided - the update method will check if it's different with existing one - if update_doc_req.chunk_method: - if error := _update_chunk_method_only(req, doc, dataset_id, tenant_id): - return error - - if "enabled" in req: # already checked in UpdateDocumentReq - it's int if it's present - # "enabled" flag provided, the update method will check if it's changed and then update if so - if error := _update_document_status_only(int(req["enabled"]), doc, kb): - return error - - try: - ok, doc = DocumentService.get_by_id(doc.id) - if not ok: - return get_error_data_result(message="Dataset created failed") - except OperationalError as e: - logging.exception(e) - return get_error_data_result(message="Database operation failed") - - key_mapping = { - "chunk_num": "chunk_count", - "kb_id": "dataset_id", - "token_num": "token_count", - "parser_id": "chunk_method", - } - run_mapping = { - "0": "UNSTART", - "1": "RUNNING", - "2": "CANCEL", - "3": "DONE", - "4": "FAIL", - } - renamed_doc = {} - for key, value in doc.to_dict().items(): - new_key = key_mapping.get(key, key) - renamed_doc[new_key] = value - if key == "run": - renamed_doc["run"] = run_mapping.get(str(value)) - - return get_result(data=renamed_doc) - -def _validate_document_update_fields(update_doc_req:UpdateDocumentReq, doc, req): - """Validate document update fields in a single method.""" - # Validate immutable fields - error_msg, error_code = validation_utils.validate_immutable_fields(update_doc_req, doc) - if error_msg: - return error_msg, error_code - - # Validate document name if present - if "name" in req and req["name"] != doc.name: - docs_from_name = DocumentService.query(name=req["name"], kb_id=doc.kb_id) - error_msg, error_code = validation_utils.validate_document_name(req["name"], doc, docs_from_name) - if error_msg: - return error_msg, error_code - - # Validate chunk method if present - if "chunk_method" in req: - error_msg, error_code = validation_utils.validate_chunk_method(doc, req["chunk_method"]) - if error_msg: - return error_msg, error_code - - return None, None - @manager.route("/datasets//documents/", methods=["GET"]) # noqa: F821 @token_required async def download(tenant_id, dataset_id, document_id): diff --git a/api/apps/services/document_api_service.py b/api/apps/services/document_api_service.py new file mode 100644 index 0000000000..f479b0f32e --- /dev/null +++ b/api/apps/services/document_api_service.py @@ -0,0 +1,201 @@ +# +# Copyright 2026 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from api.db.services.document_service import DocumentService +from api.db.services.file2document_service import File2DocumentService +from api.db.services.file_service import FileService +from api.utils import validation_utils +from common import settings +from common.constants import TaskStatus +from api.utils.api_utils import get_error_data_result, server_error_response, get_parser_config +from api.utils.validation_utils import UpdateDocumentReq +from rag.nlp import rag_tokenizer, search + + +def update_document_name_only(document_id, req_doc_name): + """ + Update document name only (without validation). + :param document_id: id (string) of the document + :param req_doc_name: new name (string) from request for the document + :return: None if all are good; otherwise returns the error message in the JSON format + """ + if not DocumentService.update_by_id(document_id, {"name": req_doc_name}): + return get_error_data_result(message="Database error (Document rename)!") + + informs = File2DocumentService.get_by_document_id(document_id) + if informs: + e, file = FileService.get_by_id(informs[0].file_id) + FileService.update_by_id(file.id, {"name": req_doc_name}) + # Add logic to update index - refer to rename method in document_app.py + tenant_id = DocumentService.get_tenant_id(document_id) + title_tks = rag_tokenizer.tokenize(req_doc_name) + es_body = { + "docnm_kwd": req_doc_name, + "title_tks": title_tks, + "title_sm_tks": rag_tokenizer.fine_grained_tokenize(title_tks), + } + ok, doc = DocumentService.get_by_id(document_id) + if not ok: + return get_error_data_result(message=f"Not able to find document by id:{document_id}") + if settings.docStoreConn.index_exist(search.index_name(tenant_id), doc.kb_id): + settings.docStoreConn.update( + {"doc_id": document_id}, + es_body, + search.index_name(tenant_id), + doc.kb_id, + ) + return None + +def update_chunk_method_only(req, doc, dataset_id, tenant_id): + """ + Update chunk method only (without validation). + + Updates the chunk method and parser configuration for a document, + and resets the document's progress if the chunk method changes. + Also clears existing chunks from the document store if the method changes. + + Args: + req: The request dictionary containing chunk_method and parser_config. + doc: The document model from the database. + dataset_id: The ID of the dataset containing the document. + tenant_id: The tenant ID for the document store. + + Returns: + None if successful, or an error result dictionary if failed. + """ + if doc.parser_id.lower() != req["chunk_method"].lower(): + # if chunk method changed + e = DocumentService.update_by_id( + doc.id, + { + "parser_id": req["chunk_method"], + "progress": 0, + "progress_msg": "", + "run": TaskStatus.UNSTART.value, + }, + ) + if not e: + return get_error_data_result(message="Document not found!") + if not req.get("parser_config"): + req["parser_config"] = get_parser_config(req["chunk_method"], req.get("parser_config")) + DocumentService.update_parser_config(doc.id, req["parser_config"]) + if doc.token_num > 0: + e = DocumentService.increment_chunk_num( + doc.id, + doc.kb_id, + doc.token_num * -1, + doc.chunk_num * -1, + doc.process_duration * -1, + ) + if not e: + return get_error_data_result(message="Document not found!") + settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), dataset_id) + return None + +def update_document_status_only(status:int, doc, kb): + """ + Update document status only (without validation). + + Updates the enabled/disabled status of a document and updates + the corresponding index in the document store. + + Args: + status: The new status value (0 for disabled, 1 for enabled). + doc: The document model from the database. + kb: The knowledge base model. + + Returns: + None if successful, or an error result dictionary if failed. + """ + if doc.status is None or (int(doc.status) != status): + try: + if not DocumentService.update_by_id(doc.id, {"status": str(status)}): + return get_error_data_result(message="Database error (Document update)!") + settings.docStoreConn.update({"doc_id": doc.id}, {"available_int": status}, search.index_name(kb.tenant_id), doc.kb_id) + except Exception as e: + return server_error_response(e) + return None + + +def validate_document_update_fields(update_doc_req:UpdateDocumentReq, doc, req): + """ + Validate document update fields in a single method. + + Performs comprehensive validation of all document update fields, + including immutable fields, document name, and chunk method. + + Args: + update_doc_req: The validated update document request. + doc: The document model from the database. + req: The original request dictionary. + + Returns: + A tuple of (error_message, error_code) if validation fails, + or (None, None) if validation passes. + """ + # Validate immutable fields + error_msg, error_code = validation_utils.validate_immutable_fields(update_doc_req, doc) + if error_msg: + return error_msg, error_code + + # Validate document name if present + if "name" in req and req["name"] != doc.name: + docs_from_name = DocumentService.query(name=req["name"], kb_id=doc.kb_id) + error_msg, error_code = validation_utils.validate_document_name(req["name"], doc, docs_from_name) + if error_msg: + return error_msg, error_code + + # Validate chunk method if present + if "chunk_method" in req: + error_msg, error_code = validation_utils.validate_chunk_method(doc, req["chunk_method"]) + if error_msg: + return error_msg, error_code + + return None, None + +def rename_doc_key(doc): + """ + Rename document keys to match API response format. + + Converts internal document model field names to the external API + response field names (e.g., 'chunk_num' -> 'chunk_count'). + + Args: + doc: The document model from the database. + + Returns: + A dictionary with renamed keys for API response. + """ + key_mapping = { + "chunk_num": "chunk_count", + "kb_id": "dataset_id", + "token_num": "token_count", + "parser_id": "chunk_method", + } + run_mapping = { + "0": "UNSTART", + "1": "RUNNING", + "2": "CANCEL", + "3": "DONE", + "4": "FAIL", + } + renamed_doc = {} + for key, value in doc.to_dict().items(): + new_key = key_mapping.get(key, key) + renamed_doc[new_key] = value + if key == "run": + renamed_doc["run"] = run_mapping.get(str(value)) + return renamed_doc + diff --git a/api/utils/validation_utils.py b/api/utils/validation_utils.py index 19033bef8b..acce492627 100644 --- a/api/utils/validation_utils.py +++ b/api/utils/validation_utils.py @@ -402,7 +402,14 @@ class ParserConfig(Base): ext: Annotated[dict, Field(default={})] class UpdateDocumentReq(Base): + """ + Request model for updating a document. + + This model validates the request parameters for updating a document, + including name, chunk method, enabled status, and other metadata. + """ model_config = ConfigDict(extra='ignore') + name: Annotated[str | None, Field(default=None, max_length=65535)] chunk_method: Annotated[str | None, Field(default=None, max_length=65535)] enabled: Annotated[int | None, Field(default=None, ge=0, le=1)] chunk_count: Annotated[int | None, Field(default=None, ge=0)] @@ -432,6 +439,22 @@ class UpdateDocumentReq(Base): return enabled + @field_validator("meta_fields", mode="after") + @classmethod + def validate_document_meta_fields(cls, meta_fields: dict | None): + if meta_fields is None: + return None + + if not isinstance(meta_fields, dict): + raise PydanticCustomError("format_invalid", "Only dictionary type supported") + for k, v in meta_fields.items(): + if isinstance(v, list): + if not all(isinstance(i, (str, int, float)) for i in v): + raise PydanticCustomError("format_invalid", "The type is not supported in list: {v}", {"v":v}) + elif not isinstance(v, (str, int, float)): + raise PydanticCustomError("format_invalid", "The type is not supported: {v}", {"v":v}) + return meta_fields + class CreateDatasetReq(Base): name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(...)] avatar: Annotated[str | None, Field(default=None, max_length=65535)] @@ -854,7 +877,20 @@ class ListFileReq(BaseModel): def validate_immutable_fields(update_doc_req:UpdateDocumentReq, doc): - """Validate that immutable fields have not been changed.""" + """ + Validate that immutable fields have not been changed. + + Checks that fields like chunk_count, token_count, and progress + cannot be modified directly by the user. + + Args: + update_doc_req: The validated update document request. + doc: The document model from the database. + + Returns: + A tuple of (error_message, error_code) if validation fails, + or (None, None) if validation passes. + """ if update_doc_req.chunk_count and update_doc_req.chunk_count != int(getattr(doc, "chunk_num", -1)): return "Can't change `chunk_count`.", RetCode.DATA_ERROR @@ -871,7 +907,24 @@ def validate_immutable_fields(update_doc_req:UpdateDocumentReq, doc): def validate_document_name(req_doc_name:str, doc, docs_from_name): - """Validate document name update.""" + """ + Validate document name update. + + Checks that the new document name is valid: + - Must be a string + - Must not exceed the file name length limit + - File extension cannot be changed + - Must not duplicate an existing document name in the same dataset. + + Args: + req_doc_name: The new document name to validate. + doc: The document model from the database. + docs_from_name: Query result for documents with the new name. + + Returns: + A tuple of (error_message, error_code) if validation fails, + or (None, None) if validation passes. + """ if not isinstance(req_doc_name, str): return f"AttributeError('{type(req_doc_name).__name__}' object has no attribute 'encode')", RetCode.EXCEPTION_ERROR if len(req_doc_name.encode("utf-8")) > FILE_NAME_LEN_LIMIT: @@ -885,7 +938,20 @@ def validate_document_name(req_doc_name:str, doc, docs_from_name): return None, None def validate_chunk_method(doc, chunk_method=None): - """Validate chunk method update.""" + """ + Validate chunk method update. + + Checks if the chunk method is valid for the given document, + particularly for visual documents or specific file types. + + Args: + doc: The document model from the database. + chunk_method: The chunk method to validate. + + Returns: + A tuple of (error_message, error_code) if validation fails, + or (None, None) if validation passes. + """ if chunk_method is not None and len(chunk_method) == 0: # will not be detected in UpdateDocumentReq return "`chunk_method` (empty string) is not valid", RetCode.DATA_ERROR if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name): diff --git a/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py b/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py index 618d4743c2..8f4ea7653d 100644 --- a/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py +++ b/test/testcases/test_http_api/test_file_management_within_dataset/test_doc_sdk_routes_unit.py @@ -366,123 +366,6 @@ class TestDocRoutesUnit: assert res["code"] == module.RetCode.SERVER_ERROR assert res["message"] == "upload failed" - def test_update_doc_guards_and_error_paths(self, monkeypatch): - module = _load_doc_module(monkeypatch) - doc = _DummyDoc() - monkeypatch.setattr(module.KnowledgebaseService, "query", lambda **_kwargs: []) - monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({})) - res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1")) - assert res["message"] == "You don't own the dataset." - - monkeypatch.setattr(module.KnowledgebaseService, "query", lambda **_kwargs: [1]) - monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _id: (False, None)) - res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1")) - assert res["message"] == "Can't find this dataset!" - - monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _id: (True, SimpleNamespace(tenant_id="tenant-1"))) - monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: []) - res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1")) - assert "doesn't own the document" in res["message"] - - monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [doc]) - monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"chunk_count": 100})) - res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1")) - assert "chunk_count" in res["message"] - - monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"token_count": 100})) - res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1")) - assert "token_count" in res["message"] - - monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"progress": 100})) - res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1")) - assert "progress" in res["message"] - - monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"meta_fields": []})) - res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1")) - assert res["message"] == "Field: - Message: - Value: <[]>" - - monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"meta_fields": {"k": "v"}})) - monkeypatch.setattr(module.DocMetadataService, "update_document_metadata", lambda *_args, **_kwargs: False) - res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1")) - assert res["message"] == "Failed to update metadata" - - monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"name": "a" * (module.FILE_NAME_LEN_LIMIT + 1)})) - res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1")) - assert res["code"] == module.RetCode.ARGUMENT_ERROR - assert "bytes or less" in res["message"] - - monkeypatch.setattr(module.DocumentService, "update_by_id", lambda *_args, **_kwargs: False) - monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"name": "new.txt"})) - res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1")) - assert "Document rename" in res["message"] - - def test_update_doc_chunk_method_enabled_and_db_error(self, monkeypatch): - module = _load_doc_module(monkeypatch) - from api.db import FileType - visual_doc = _DummyDoc(parser_id="naive", doc_type=FileType.VISUAL) - kb = SimpleNamespace(tenant_id="tenant-1") - monkeypatch.setattr(module.KnowledgebaseService, "query", lambda **_kwargs: [1]) - monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _id: (True, kb)) - monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [visual_doc]) - monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"chunk_method": "naive"})) - res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1")) - assert res["message"] == "Not supported yet!" - - doc = _DummyDoc(token_num=2, chunk_num=1, parser_id="naive") - monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [doc]) - monkeypatch.setattr(module.DocumentService, "update_by_id", lambda *_args, **_kwargs: False) - monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"chunk_method": "manual"})) - res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1")) - assert res["message"] == "Document not found!" - - monkeypatch.setattr(module.DocumentService, "update_by_id", lambda *_args, **_kwargs: True) - monkeypatch.setattr(module.DocumentService, "update_parser_config", lambda *_args, **_kwargs: None) - monkeypatch.setattr(module.DocumentService, "increment_chunk_num", lambda *_args, **_kwargs: False) - _patch_docstore(monkeypatch, module, delete=lambda *_args, **_kwargs: None) - monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"chunk_method": "manual"})) - res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1")) - assert res["message"] == "Document not found!" - - monkeypatch.setattr(module.DocumentService, "increment_chunk_num", lambda *_args, **_kwargs: True) - doc_for_enabled = _DummyDoc(status=False) - monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [doc_for_enabled]) - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _id: (True, doc_for_enabled)) - monkeypatch.setattr(module.DocumentService, "update_by_id", lambda *_args, **_kwargs: False) - _patch_docstore(monkeypatch, module, update=lambda *_args, **_kwargs: None, delete=lambda *_args, **_kwargs: None) - monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"enabled": 1})) - res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1")) - assert "Document update" in res["message"] - - monkeypatch.setattr(module.DocumentService, "update_by_id", lambda *_args, **_kwargs: True) - _patch_docstore(monkeypatch, module, update=lambda *_args, **_kwargs: (_ for _ in ()).throw(RuntimeError("boom")), delete=lambda *_args, **_kwargs: None) - monkeypatch.setattr(module, "server_error_response", lambda e: {"code": 500, "message": str(e)}) - res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1")) - assert res["code"] == 500 - assert "boom" in res["message"] - - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _id: (False, None)) - _patch_docstore(monkeypatch, module, update=lambda *_args, **_kwargs: None, delete=lambda *_args, **_kwargs: None) - res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1")) - assert res["message"] == "Dataset created failed" - - # cover token reset + docStore deletion branch - doc_reset = _DummyDoc(token_num=3, chunk_num=2, parser_id="naive", run=0) - monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [doc_reset]) - monkeypatch.setattr(module.DocumentService, "update_by_id", lambda *_args, **_kwargs: True) - monkeypatch.setattr(module.DocumentService, "increment_chunk_num", lambda *_args, **_kwargs: True) - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _id: (True, doc_reset)) - _patch_docstore(monkeypatch, module, delete=lambda *_args, **_kwargs: None, update=lambda *_args, **_kwargs: None) - monkeypatch.setattr(module, "get_request_json", lambda: _AwaitableValue({"chunk_method": "manual"})) - res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1")) - assert res["code"] == 0 - - def _raise_operational_error(_id): - raise module.OperationalError("db down") - - monkeypatch.setattr(module.DocumentService, "get_by_id", _raise_operational_error) - res = _run(module.update_doc.__wrapped__("tenant-1", "ds-1", "doc-1")) - assert res["message"] == "Database operation failed" - def test_download_and_download_doc_errors(self, monkeypatch): module = _load_doc_module(monkeypatch) _patch_send_file(monkeypatch, module) diff --git a/test/testcases/test_http_api/test_file_management_within_dataset/test_update_document.py b/test/testcases/test_http_api/test_file_management_within_dataset/test_update_document.py index c153a41193..c022a0e97b 100644 --- a/test/testcases/test_http_api/test_file_management_within_dataset/test_update_document.py +++ b/test/testcases/test_http_api/test_file_management_within_dataset/test_update_document.py @@ -21,16 +21,17 @@ from configs import DOCUMENT_NAME_LIMIT, INVALID_API_TOKEN, INVALID_ID_32 from libs.auth import RAGFlowHttpApiAuth from configs import DEFAULT_PARSER_CONFIG + @pytest.mark.p1 class TestAuthorization: @pytest.mark.parametrize( "invalid_auth, expected_code, expected_message", [ - (None, 0, "`Authorization` can't be empty"), + (None, 401, ""), ( RAGFlowHttpApiAuth(INVALID_API_TOKEN), - 109, - "Authentication error: API key is invalid!", + 401, + "", ), ], ) @@ -53,13 +54,13 @@ class TestDocumentsUpdated: ), ( 0, - 100, - """AttributeError(\'int\' object has no attribute \'encode\')""", + 102, + "Field: - Message: - Value: <0>", ), ( None, 100, - """AttributeError(\'NoneType\' object has no attribute \'encode\')""", + "AttributeError('NoneType' object has no attribute 'encode')", ), ( "", @@ -93,7 +94,7 @@ class TestDocumentsUpdated: else: assert res["message"] == expected_message - @pytest.mark.p3 + @pytest.mark.p2 @pytest.mark.parametrize( "document_id, expected_code, expected_message", [ @@ -110,7 +111,7 @@ class TestDocumentsUpdated: assert res["code"] == expected_code assert res["message"] == expected_message - @pytest.mark.p3 + @pytest.mark.p2 @pytest.mark.parametrize( "dataset_id, expected_code, expected_message", [ @@ -127,10 +128,28 @@ class TestDocumentsUpdated: assert res["code"] == expected_code assert res["message"] == expected_message - @pytest.mark.p3 + @pytest.mark.p2 @pytest.mark.parametrize( "meta_fields, expected_code, expected_message", - [({"test": "test"}, 0, ""), ("test", 102, "meta_fields must be a dictionary")], + [ + # Valid meta_fields + ({"test": "test"}, 0, ""), + # Valid meta_fields with various types + ({"author": "alice", "year": 2024}, 0, ""), + ({"tags": ["tag1", "tag2"]}, 0, ""), + ({"count": 42, "price": 19.99}, 0, ""), + # Invalid type - string instead of dict + ("test", 102, "Field: - Message: - Value: "), + # Invalid type - list instead of dict + ([], 102, "Field: - Message: - Value: <[]>"), + # Invalid - list containing objects (unsupported type in list) + ({"tags": [{"x": {"a": "b"}}]}, 102, "Field: - Message: - Value: <{'tags': [{'x': {'a': 'b'}}]}>"), + ({"tags": [{"x": 1}]}, 102, "Field: - Message: - Value: <{'tags': [{'x': 1}]}>"), + # Invalid - nested object with unsupported type + ({"obj": {"x": 1}}, 102, "Field: - Message: - Value: <{'obj': {'x': 1}}>"), + # Valid types of list + ({"tags": [2, 1]}, 0, ""), + ], ) def test_meta_fields(self, HttpApiAuth, add_documents, meta_fields, expected_code, expected_message): dataset_id, document_ids = add_documents @@ -139,7 +158,22 @@ class TestDocumentsUpdated: res = list_documents(HttpApiAuth, dataset_id, {"id": document_ids[0]}) assert res["data"]["docs"][0]["meta_fields"] == meta_fields else: - assert res["message"] == expected_message + assert expected_message in res["message"] or res["message"] == expected_message + + @pytest.mark.p2 + @pytest.mark.parametrize( + "meta_fields, expected_code, expected_message", + [ + # Test with invalid document ID (not owned by dataset) + ({"author": "alice"}, 102, "The dataset doesn't own the document."), + ], + ) + def test_meta_fields_invalid_document(self, HttpApiAuth, add_documents, meta_fields, expected_code, expected_message): + """Test meta_fields update with invalid document ID""" + dataset_id, _ = add_documents + res = update_document(HttpApiAuth, dataset_id, "invalid_doc_id_12345678901234567890", {"meta_fields": meta_fields}) + assert res["code"] == expected_code + assert expected_message in res["message"] @pytest.mark.p2 @pytest.mark.parametrize( @@ -297,6 +331,30 @@ class TestDocumentsUpdated: assert res["code"] == expected_code assert res["message"] == expected_message + @pytest.mark.p2 + @pytest.mark.parametrize( + "payload, expected_code, expected_message", + [ + ({"chunk_count": 100}, 102, "Can't change `chunk_count`."), + ({"token_count": 100}, 102, "Can't change `token_count`."), + ({"progress": 2.0}, 102, "Field: - Message: - Value: <2.0>"), + ({"progress": 1.0}, 102, "Can't change `progress`."), + ({"meta_fields": []}, 102, "Field: - Message: - Value: <[]>"), + ], + ) + def test_update_doc_guards_and_error_paths(self, HttpApiAuth, add_documents, payload, expected_code, expected_message): + """ + Test various guard conditions and error paths for document update functionality. + This includes testing for invalid dataset ownership, document ownership, + immutable fields, and validation errors. + """ + dataset_id, document_ids = add_documents + document_id = document_ids[0] + + res = update_document(HttpApiAuth, dataset_id, document_id, payload) + assert res["code"] == expected_code + if expected_message: + assert expected_message in res["message"] or res["message"] == expected_message DEFAULT_PARSER_CONFIG_FOR_TEST = { @@ -328,6 +386,7 @@ DEFAULT_PARSER_CONFIG_FOR_TEST = { }, } + class TestUpdateDocumentParserConfig: @pytest.mark.p2 @pytest.mark.parametrize( @@ -343,37 +402,32 @@ class TestUpdateDocumentParserConfig: pytest.param( "naive", {"chunk_token_num": -1}, - 100, - "AssertionError('chunk_token_num should be in range from 1 to 100000000')", - marks=pytest.mark.skip(reason="issues/6098"), + 102, + "Field: - Message: - Value: <-1>", ), pytest.param( "naive", {"chunk_token_num": 0}, - 100, - "AssertionError('chunk_token_num should be in range from 1 to 100000000')", - marks=pytest.mark.skip(reason="issues/6098"), + 102, + "Field: - Message: - Value: <0>", ), pytest.param( "naive", {"chunk_token_num": 100000000}, - 100, - "AssertionError('chunk_token_num should be in range from 1 to 100000000')", - marks=pytest.mark.skip(reason="issues/6098"), + 102, + "Field: - Message: - Value: <100000000>", ), pytest.param( "naive", {"chunk_token_num": 3.14}, 102, - "", - marks=pytest.mark.skip(reason="issues/6098"), + "Field: - Message: - Value: <3.14>", ), pytest.param( "naive", {"chunk_token_num": "1024"}, - 100, - "", - marks=pytest.mark.skip(reason="issues/6098"), + 102, + "Field: - Message: - Value: <1024>", ), ( "naive", @@ -392,148 +446,135 @@ class TestUpdateDocumentParserConfig: pytest.param( "naive", {"html4excel": 1}, - 100, - "AssertionError('html4excel should be True or False')", - marks=pytest.mark.skip(reason="issues/6098"), + 102, + "Field: - Message: - Value: <1>", ), ("naive", {"delimiter": ""}, 102, "Field: - Message: - Value: <>"), ("naive", {"delimiter": "`##`"}, 0, ""), pytest.param( "naive", {"delimiter": 1}, - 100, - "", - marks=pytest.mark.skip(reason="issues/6098"), + 102, + "Field: - Message: - Value: <1>", ), pytest.param( "naive", {"task_page_size": -1}, - 100, - "AssertionError('task_page_size should be in range from 1 to 100000000')", - marks=pytest.mark.skip(reason="issues/6098"), + 102, + "Field: - Message: - Value: <-1>", ), pytest.param( "naive", {"task_page_size": 0}, - 100, - "AssertionError('task_page_size should be in range from 1 to 100000000')", - marks=pytest.mark.skip(reason="issues/6098"), + 102, + "Field: - Message: - Value: <0>", ), pytest.param( "naive", {"task_page_size": 100000000}, - 100, - "AssertionError('task_page_size should be in range from 1 to 100000000')", - marks=pytest.mark.skip(reason="issues/6098"), + 0, + "", ), pytest.param( "naive", {"task_page_size": 3.14}, - 100, - "", - marks=pytest.mark.skip(reason="issues/6098"), + 102, + "Field: - Message: - Value: <3.14>", ), pytest.param( "naive", {"task_page_size": "1024"}, - 100, - "", - marks=pytest.mark.skip(reason="issues/6098"), + 102, + "Field: - Message: - Value: <1024>", + ), + ( + "naive", + { + "raptor": { + "use_raptor": {"a": "b"}, + } + }, + 102, + "Field: - Message: - Value: <{'a': 'b'}>", ), - ("naive", {"raptor": {"use_raptor": { - "a": "b" - },}}, 102, "Field: - Message: - Value: <{'a': 'b'}>"), ("naive", {"raptor": {"use_raptor": False}}, 0, ""), pytest.param( "naive", {"invalid_key": "invalid_value"}, - 100, - """AssertionError("Abnormal \'parser_config\'. Invalid key: invalid_key")""", - marks=pytest.mark.skip(reason="issues/6098"), + 102, + "Field: - Message: - Value: ", ), pytest.param( "naive", {"auto_keywords": -1}, - 100, - "AssertionError('auto_keywords should be in range from 0 to 32')", - marks=pytest.mark.skip(reason="issues/6098"), + 102, + "Field: - Message: - Value: <-1>", ), pytest.param( "naive", {"auto_keywords": 32}, - 100, - "AssertionError('auto_keywords should be in range from 0 to 32')", - marks=pytest.mark.skip(reason="issues/6098"), - ), - pytest.param( - "naive", - {"auto_questions": 3.14}, - 100, + 0, "", - marks=pytest.mark.skip(reason="issues/6098"), ), pytest.param( "naive", {"auto_keywords": "1024"}, - 100, - "", - marks=pytest.mark.skip(reason="issues/6098"), + 102, + "Field: - Message: - Value: <1024>", + ), + pytest.param( + "naive", + {"auto_keywords": 3.14}, + 102, + "Field: - Message: - Value: <3.14>", ), pytest.param( "naive", {"auto_questions": -1}, - 100, - "AssertionError('auto_questions should be in range from 0 to 10')", - marks=pytest.mark.skip(reason="issues/6098"), + 102, + "Field: - Message: - Value: <-1>", ), pytest.param( "naive", {"auto_questions": 10}, - 100, - "AssertionError('auto_questions should be in range from 0 to 10')", - marks=pytest.mark.skip(reason="issues/6098"), + 0, + "", ), pytest.param( "naive", {"auto_questions": 3.14}, - 100, - "", - marks=pytest.mark.skip(reason="issues/6098"), + 102, + "Field: - Message: - Value: <3.14>", ), pytest.param( "naive", {"auto_questions": "1024"}, - 100, - "", - marks=pytest.mark.skip(reason="issues/6098"), + 102, + "Field: - Message: - Value: <1024>", ), pytest.param( "naive", {"topn_tags": -1}, - 100, - "AssertionError('topn_tags should be in range from 0 to 10')", - marks=pytest.mark.skip(reason="issues/6098"), + 102, + "Field: - Message: - Value: <-1>", ), pytest.param( "naive", {"topn_tags": 10}, - 100, - "AssertionError('topn_tags should be in range from 0 to 10')", - marks=pytest.mark.skip(reason="issues/6098"), + 0, + "", ), pytest.param( "naive", {"topn_tags": 3.14}, - 100, - "", - marks=pytest.mark.skip(reason="issues/6098"), + 102, + "Field: - Message: - Value: <3.14>", ), pytest.param( "naive", {"topn_tags": "1024"}, - 100, - "", - marks=pytest.mark.skip(reason="issues/6098"), + 102, + "Field: - Message: - Value: <1024>", ), ], ) diff --git a/test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py index 11abe738a6..dc9ce49b75 100644 --- a/test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py +++ b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py @@ -67,7 +67,7 @@ class TestDocumentsUpdated: updated_doc = dataset.list_documents(id=document.id)[0] assert updated_doc.name == name, str(updated_doc) - @pytest.mark.p3 + @pytest.mark.p2 @pytest.mark.parametrize( "meta_fields, expected_message", [ @@ -238,27 +238,81 @@ class TestDocumentsUpdated: document.update(payload) assert expected_message in str(exception_info.value), str(exception_info.value) - @pytest.mark.p3 - def test_immutable_fields_chunk_count(self, add_document): - document, _ = add_document # Unpack the tuple to get the document object - with pytest.raises(Exception) as exception_info: - document.update({"chunk_count": 999}) # Attempt to change immutable field - assert "Can't change `chunk_count`" in str(exception_info.value), str(exception_info.value) + @pytest.mark.p2 + @pytest.mark.parametrize( + "payload, expected_message", + [ + ({"chunk_count": 1}, "Can't change `chunk_count`"), + ], + ) + def test_immutable_fields_chunk_count(self, add_documents, payload, expected_message): + _, documents = add_documents + document = documents[0] - @pytest.mark.p3 - def test_immutable_fields_token_count(self, add_document): - document, _ = add_document # Unpack the tuple to get the document object with pytest.raises(Exception) as exception_info: - document.update({"token_count": 9999}) # Attempt to change immutable field - assert "Can't change `token_num`" in str(exception_info.value), str(exception_info.value) + document.update(payload) + assert expected_message in str(exception_info.value), str(exception_info.value) + + @pytest.mark.p2 + @pytest.mark.parametrize( + "payload, expected_message", + [ + ({"token_count": 9999}, "Can't change `token_count`"), # Attempt to change immutable field + ], + ) + def test_immutable_fields_token_count(self, add_documents, payload, expected_message): + _, documents = add_documents + document = documents[0] - @pytest.mark.p3 - def test_immutable_fields_progress(self, add_document): - document, _ = add_document # Unpack the tuple to get the document object with pytest.raises(Exception) as exception_info: - document.update({"progress": 0.5}) # Attempt to change immutable field - assert "Can't change `progress`" in str(exception_info.value), str(exception_info.value) + document.update(payload) + assert expected_message in str(exception_info.value), str(exception_info.value) + @pytest.mark.p2 + @pytest.mark.parametrize( + "payload, expected_message", + [ + ({"progress": 0.5}, "Can't change `progress`"), # Attempt to change immutable field + ({"progress": 1.5}, "Field: - Message: - Value: <1.5>"), # Attempt to change immutable field + ], + ) + def test_immutable_fields_progress(self, add_documents, payload, expected_message): + _, documents = add_documents + document = documents[0] + + with pytest.raises(Exception) as exception_info: + document.update(payload) + assert expected_message in str(exception_info.value), str(exception_info.value) + + +DEFAULT_PARSER_CONFIG_FOR_TEST = { + "layout_recognize": "DeepDOC", + "chunk_token_num": 512, + "delimiter": "\n", + "auto_keywords": 0, + "auto_questions": 0, + "html4excel": False, + "topn_tags": 3, + "raptor": { + "use_raptor": True, + "prompt": "Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n {cluster_content}\nThe above is the content you need to summarize.", + "max_token": 256, + "threshold": 0.1, + "max_cluster": 64, + "random_seed": 0, + }, + "graphrag": { + "use_graphrag": True, + "entity_types": [ + "organization", + "person", + "geo", + "event", + "category", + ], + "method": "light", + }, +} class TestUpdateDocumentParserConfig: @pytest.mark.p2 @@ -268,15 +322,14 @@ class TestUpdateDocumentParserConfig: ("naive", {}, ""), pytest.param( "naive", - DEFAULT_PARSER_CONFIG, + DEFAULT_PARSER_CONFIG_FOR_TEST, "", marks=pytest.mark.skip(reason="DEFAULT_PARSER_CONFIG contains fields not allowed in document update API"), ), pytest.param( "naive", {"chunk_token_num": -1}, - "chunk_token_num should be in range from 1 to 100000000", - marks=pytest.mark.skip(reason="issues/6098"), + "Field: - Message: - Value: <-1>", ), ( "naive", @@ -327,8 +380,7 @@ class TestUpdateDocumentParserConfig: pytest.param( "naive", {"task_page_size": 100000000}, - "task_page_size should be in range from 1 to 100000000", - marks=pytest.mark.skip(reason="API validation differs from expected message"), + "", ), ( "naive", @@ -360,8 +412,7 @@ class TestUpdateDocumentParserConfig: pytest.param( "naive", {"auto_keywords": 32}, - "auto_keywords should be in range from 0 to 32", - marks=pytest.mark.skip(reason="API validation differs from expected message"), + "", ), ( "naive", @@ -381,8 +432,7 @@ class TestUpdateDocumentParserConfig: pytest.param( "naive", {"auto_questions": 10}, - "auto_questions should be in range from 0 to 10", - marks=pytest.mark.skip(reason="API validation differs from expected message"), + "", ), ( "naive", @@ -402,8 +452,7 @@ class TestUpdateDocumentParserConfig: pytest.param( "naive", {"topn_tags": 10}, - "topn_tags should be in range from 0 to 10", - marks=pytest.mark.skip(reason="API validation differs from expected message"), + "", ), ( "naive", diff --git a/test/testcases/test_web_api/common.py b/test/testcases/test_web_api/common.py index 8b87698805..490ff5b0a6 100644 --- a/test/testcases/test_web_api/common.py +++ b/test/testcases/test_web_api/common.py @@ -435,16 +435,6 @@ def document_change_status(auth, payload=None, *, headers=HEADERS, data=None): return res.json() -def document_rename(auth, payload=None, *, headers=HEADERS, data=None): - res = requests.post(url=f"{HOST_ADDRESS}{DOCUMENT_APP_URL}/rename", headers=headers, auth=auth, json=payload, data=data) - return res.json() - - -def document_set_meta(auth, payload=None, *, headers=HEADERS, data=None): - res = requests.post(url=f"{HOST_ADDRESS}{DOCUMENT_APP_URL}/set_meta", headers=headers, auth=auth, json=payload, data=data) - return res.json() - - def bulk_upload_documents(auth, kb_id, num, tmp_path): fps = [] for i in range(num): diff --git a/test/testcases/test_web_api/test_document_app/test_document_metadata.py b/test/testcases/test_web_api/test_document_app/test_document_metadata.py index 6d2c788cec..3554b81b67 100644 --- a/test/testcases/test_web_api/test_document_app/test_document_metadata.py +++ b/test/testcases/test_web_api/test_document_app/test_document_metadata.py @@ -22,8 +22,6 @@ from common import ( document_filter, document_infos, document_metadata_summary, - document_rename, - document_set_meta, document_update_metadata_setting, ) from configs import INVALID_API_TOKEN @@ -82,21 +80,6 @@ class TestAuthorization: assert res["code"] == expected_code, res assert expected_fragment in res["message"], res - @pytest.mark.p2 - @pytest.mark.parametrize("invalid_auth, expected_code, expected_fragment", INVALID_AUTH_CASES) - def test_rename_auth_invalid(self, invalid_auth, expected_code, expected_fragment): - res = document_rename(invalid_auth, {"doc_id": "doc_id", "name": "rename.txt"}) - assert res["code"] == expected_code, res - assert expected_fragment in res["message"], res - - @pytest.mark.p2 - @pytest.mark.parametrize("invalid_auth, expected_code, expected_fragment", INVALID_AUTH_CASES) - def test_set_meta_auth_invalid(self, invalid_auth, expected_code, expected_fragment): - res = document_set_meta(invalid_auth, {"doc_id": "doc_id", "meta": "{}"}) - assert res["code"] == expected_code, res - assert expected_fragment in res["message"], res - - class TestDocumentMetadata: @pytest.mark.p2 def test_filter(self, WebApiAuth, add_dataset_func): @@ -163,28 +146,6 @@ class TestDocumentMetadata: assert info_res["code"] == 0, info_res assert info_res["data"][0]["status"] == "1", info_res - @pytest.mark.p2 - def test_rename(self, WebApiAuth, add_document_func): - _, doc_id = add_document_func - name = f"renamed_{doc_id}.txt" - res = document_rename(WebApiAuth, {"doc_id": doc_id, "name": name}) - assert res["code"] == 0, res - assert res["data"] is True, res - info_res = document_infos(WebApiAuth, {"doc_ids": [doc_id]}) - assert info_res["code"] == 0, info_res - assert info_res["data"][0]["name"] == name, info_res - - @pytest.mark.p2 - def test_set_meta(self, WebApiAuth, add_document_func): - _, doc_id = add_document_func - res = document_set_meta(WebApiAuth, {"doc_id": doc_id, "meta": "{\"author\": \"alice\"}"}) - assert res["code"] == 0, res - assert res["data"] is True, res - info_res = document_infos(WebApiAuth, {"doc_ids": [doc_id]}) - assert info_res["code"] == 0, info_res - meta_fields = info_res["data"][0].get("meta_fields", {}) - assert meta_fields.get("author") == "alice", info_res - class TestDocumentMetadataNegative: @pytest.mark.p3 @@ -231,20 +192,6 @@ class TestDocumentMetadataNegative: assert res["code"] == 101, res assert "Status" in res["message"], res - @pytest.mark.p3 - def test_rename_extension_mismatch(self, WebApiAuth, add_document_func): - _, doc_id = add_document_func - res = document_rename(WebApiAuth, {"doc_id": doc_id, "name": "renamed.pdf"}) - assert res["code"] == 101, res - assert "extension" in res["message"], res - - @pytest.mark.p3 - def test_set_meta_invalid_type(self, WebApiAuth, add_document_func): - _, doc_id = add_document_func - res = document_set_meta(WebApiAuth, {"doc_id": doc_id, "meta": "[]"}) - assert res["code"] == 101, res - assert "dictionary" in res["message"], res - def _run(coro): return asyncio.run(coro) @@ -590,87 +537,6 @@ class TestDocumentMetadataUnit: assert res["code"] == 0 assert res["data"]["doc1"]["status"] == "1" - def test_rename_branch_matrix_and_exception_unit(self, document_app_module, monkeypatch): - module = document_app_module - file_updates = [] - es_updates = [] - - async def fake_thread_pool_exec(func, *_args, **_kwargs): - return func() - - monkeypatch.setattr(module, "thread_pool_exec", fake_thread_pool_exec) - monkeypatch.setattr(module.DocumentService, "get_tenant_id", lambda _doc_id: "tenant1") - monkeypatch.setattr(module.rag_tokenizer, "tokenize", lambda _name: ["token"]) - monkeypatch.setattr(module.rag_tokenizer, "fine_grained_tokenize", lambda _tokens: ["fine"]) - monkeypatch.setattr(module, "server_error_response", lambda e: {"code": 500, "message": str(e)}) - - class _DocStore: - def index_exist(self, _index_name, _kb_id): - return True - - def update(self, where, payload, _index_name, _kb_id): - es_updates.append((where, payload)) - - monkeypatch.setattr(module.settings, "docStoreConn", _DocStore()) - monkeypatch.setattr(module.search, "index_name", lambda tenant_id: f"idx_{tenant_id}") - - def set_req(name): - async def fake_request_json(): - return {"doc_id": "doc1", "name": name} - - monkeypatch.setattr(module, "get_request_json", fake_request_json) - - set_req("renamed.txt") - monkeypatch.setattr(module.DocumentService, "accessible", lambda *_args, **_kwargs: False) - res = _run(module.rename.__wrapped__()) - assert res["code"] == module.RetCode.AUTHENTICATION_ERROR - - monkeypatch.setattr(module.DocumentService, "accessible", lambda *_args, **_kwargs: True) - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (False, None)) - res = _run(module.rename.__wrapped__()) - assert res["code"] == module.RetCode.DATA_ERROR - assert "Document not found!" in res["message"] - - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (True, SimpleNamespace(id="doc1", name="origin.txt", kb_id="kb1"))) - set_req("renamed.pdf") - res = _run(module.rename.__wrapped__()) - assert res["code"] == module.RetCode.ARGUMENT_ERROR - assert "extension" in res["message"] - - too_long = "a" * (module.FILE_NAME_LEN_LIMIT + 1) + ".txt" - set_req(too_long) - res = _run(module.rename.__wrapped__()) - assert res["code"] == module.RetCode.ARGUMENT_ERROR - assert "bytes or less" in res["message"] - - set_req("dup.txt") - monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: [SimpleNamespace(name="dup.txt")]) - res = _run(module.rename.__wrapped__()) - assert res["code"] == module.RetCode.DATA_ERROR - assert "Duplicated document name" in res["message"] - - set_req("ok.txt") - monkeypatch.setattr(module.DocumentService, "query", lambda **_kwargs: []) - monkeypatch.setattr(module.DocumentService, "update_by_id", lambda *_args, **_kwargs: True) - monkeypatch.setattr(module.File2DocumentService, "get_by_document_id", lambda _doc_id: [SimpleNamespace(file_id="file1")]) - monkeypatch.setattr(module.FileService, "get_by_id", lambda _file_id: (True, SimpleNamespace(id="file1"))) - monkeypatch.setattr(module.FileService, "update_by_id", lambda file_id, payload: file_updates.append((file_id, payload))) - res = _run(module.rename.__wrapped__()) - assert res["code"] == 0 - assert file_updates == [("file1", {"name": "ok.txt"})] - assert es_updates[0][0] == {"doc_id": "doc1"} - assert es_updates[0][1]["docnm_kwd"] == "ok.txt" - assert es_updates[0][1]["title_tks"] == ["token"] - assert es_updates[0][1]["title_sm_tks"] == ["fine"] - - def raise_db_error(*_args, **_kwargs): - raise RuntimeError("rename boom") - - monkeypatch.setattr(module.DocumentService, "update_by_id", raise_db_error) - res = _run(module.rename.__wrapped__()) - assert res["code"] == 500 - assert "rename boom" in res["message"] - def test_get_route_not_found_success_and_exception_unit(self, document_app_module, monkeypatch): module = document_app_module monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (False, None)) @@ -932,50 +798,3 @@ class TestDocumentMetadataUnit: res = _run(module.get_image("bucket-name")) assert res["code"] == 500 assert "image boom" in res["message"] - - def test_set_meta_validation_and_persistence_matrix_unit(self, document_app_module, monkeypatch): - module = document_app_module - - def set_req(payload): - async def fake_request_json(): - return payload - - monkeypatch.setattr(module, "get_request_json", fake_request_json) - - set_req({"doc_id": "doc1", "meta": "{}"}) - monkeypatch.setattr(module.DocumentService, "accessible", lambda *_args, **_kwargs: False) - res = _run(module.set_meta.__wrapped__()) - assert res["code"] == module.RetCode.AUTHENTICATION_ERROR - - monkeypatch.setattr(module.DocumentService, "accessible", lambda *_args, **_kwargs: True) - set_req({"doc_id": "doc1", "meta": "[]"}) - res = _run(module.set_meta.__wrapped__()) - assert res["code"] == module.RetCode.ARGUMENT_ERROR - assert "Only dictionary type supported." in res["message"] - - set_req({"doc_id": "doc1", "meta": '{"tags":[{"x":1}]}'}) - res = _run(module.set_meta.__wrapped__()) - assert res["code"] == module.RetCode.ARGUMENT_ERROR - assert "The type is not supported in list" in res["message"] - - set_req({"doc_id": "doc1", "meta": '{"obj":{"x":1}}'}) - res = _run(module.set_meta.__wrapped__()) - assert res["code"] == module.RetCode.ARGUMENT_ERROR - assert "The type is not supported" in res["message"] - - set_req({"doc_id": "doc1", "meta": "{"}) - res = _run(module.set_meta.__wrapped__()) - assert res["code"] == module.RetCode.ARGUMENT_ERROR - assert "Json syntax error:" in res["message"] - - set_req({"doc_id": "doc1", "meta": '{"author":"alice"}'}) - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (False, None)) - res = _run(module.set_meta.__wrapped__()) - assert res["code"] == module.RetCode.DATA_ERROR - assert "Document not found!" in res["message"] - - monkeypatch.setattr(module.DocumentService, "get_by_id", lambda _doc_id: (True, SimpleNamespace(id="doc1"))) - monkeypatch.setattr(module.DocMetadataService, "update_document_metadata", lambda *_args, **_kwargs: False) - res = _run(module.set_meta.__wrapped__()) - assert res["code"] == module.RetCode.DATA_ERROR - assert "Database error (meta updates)!" in res["message"] diff --git a/web/src/hooks/use-document-request.ts b/web/src/hooks/use-document-request.ts index e7a135a89d..417167cedf 100644 --- a/web/src/hooks/use-document-request.ts +++ b/web/src/hooks/use-document-request.ts @@ -15,7 +15,10 @@ import { } from '@/interfaces/request/document'; import i18n from '@/locales/config'; import { EMPTY_METADATA_FIELD } from '@/pages/dataset/dataset/use-select-filters'; -import kbService, { listDocument } from '@/services/knowledge-service'; +import kbService, { + listDocument, + renameDocument, +} from '@/services/knowledge-service'; import api, { restAPIv1, webAPI } from '@/utils/api'; import { getSearchValue } from '@/utils/common-util'; import { buildChunkHighlights } from '@/utils/document-util'; @@ -336,12 +339,13 @@ export const useSaveDocumentName = () => { mutationFn: async ({ name, documentId, + kbId, }: { name: string; documentId: string; + kbId: string; }) => { - const { data } = await kbService.document_rename({ - doc_id: documentId, + const { data } = await renameDocument(kbId, documentId, { name: name, }); if (data.code === 0) { diff --git a/web/src/pages/dataset/dataset/use-rename-document.ts b/web/src/pages/dataset/dataset/use-rename-document.ts index 698a3f9e64..95eb7b4e84 100644 --- a/web/src/pages/dataset/dataset/use-rename-document.ts +++ b/web/src/pages/dataset/dataset/use-rename-document.ts @@ -15,14 +15,18 @@ export const useRenameDocument = () => { const onRenameOk = useCallback( async (name: string) => { - if (record?.id) { - const ret = await saveName({ documentId: record.id, name }); + if (record?.id && record?.kb_id) { + const ret = await saveName({ + documentId: record.id, + name, + kbId: record.kb_id, + }); if (ret === 0) { hideRenameModal(); } } }, - [record?.id, saveName, hideRenameModal], + [record?.id, record?.kb_id, saveName, hideRenameModal], ); const handleShow = useCallback( diff --git a/web/src/services/knowledge-service.ts b/web/src/services/knowledge-service.ts index e727fc5005..d26f44da19 100644 --- a/web/src/services/knowledge-service.ts +++ b/web/src/services/knowledge-service.ts @@ -77,7 +77,7 @@ const methods = { }, document_rename: { url: document_rename, - method: 'post', + method: 'put', }, document_create: { url: document_create, @@ -251,6 +251,12 @@ export const listDocument = ( export const documentFilter = (kb_id: string) => request.post(api.get_dataset_filter, { kb_id }); +export const renameDocument = ( + datasetId: string, + documentId: string, + data: { name?: string }, +) => request.put(api.document_rename(datasetId, documentId), { data }); + export const getMetaDataService = ({ kb_id, doc_ids, diff --git a/web/src/utils/api.ts b/web/src/utils/api.ts index b773fee466..7edce6a205 100644 --- a/web/src/utils/api.ts +++ b/web/src/utils/api.ts @@ -109,7 +109,8 @@ export default { document_change_status: `${webAPI}/document/change_status`, document_rm: `${webAPI}/document/rm`, document_delete: `${webAPI}/api/document`, - document_rename: `${webAPI}/document/rename`, + document_rename: (datasetId: string, documentId: string) => + `${restAPIv1}/datasets/${datasetId}/documents/${documentId}`, document_create: `${webAPI}/document/create`, document_run: `${webAPI}/document/run`, document_change_parser: `${webAPI}/document/change_parser`,