Fix: #6098 - Add validation logic for parser_config when update document (#13911)

### What problem does this PR solve? Add validation logic for parser_config. Refactor the processing flow. Before change, validation logics and update logics are mixed up - some validation logis executes followed by some update logic executes and then another such "validation-and-then-update" which is not good. After change, all validation logic executes firstly. Update logic will be executed after ALL validation logic executed. Validation logic for parameters (that come from front end) will be checked using Pydantic. For validation logic that depends on data from DB, they will be in separate methods. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [x] Refactoring
2026-06-29 23:41:12 +08:00 · 2026-04-07 11:33:05 +08:00
parent 5673245134
commit c4b0aaa874
11 changed files with 1079 additions and 193 deletions
--- a/api/utils/validation_utils.py
+++ b/api/utils/validation_utils.py
@@ -13,6 +13,9 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
+import math
+import pathlib
+import re
 from collections import Counter
 import string
 from typing import Annotated, Any, Literal
@@ -32,7 +35,9 @@ from pydantic import (
 from pydantic_core import PydanticCustomError
 from werkzeug.exceptions import BadRequest, UnsupportedMediaType

-from api.constants import DATASET_NAME_LIMIT
+from api.constants import DATASET_NAME_LIMIT, FILE_NAME_LEN_LIMIT
+from api.db import FileType
+from common.constants import RetCode


 async def validate_and_parse_json_request(
@@ -390,6 +395,36 @@ class ParserConfig(Base):
    pages: Annotated[list[list[int]] | None, Field(default=None)]
    ext: Annotated[dict, Field(default={})]

+class UpdateDocumentReq(Base):
+    model_config = ConfigDict(extra='ignore')
+    chunk_method: Annotated[str | None, Field(default=None, max_length=65535)]
+    enabled: Annotated[int | None, Field(default=None, ge=0, le=1)]
+    chunk_count: Annotated[int | None, Field(default=None, ge=0)]
+    token_count: Annotated[int | None, Field(default=None, ge=0)]
+    progress: Annotated[float | None, Field(default=None, ge=0.0, le=1.0)]
+    parser_config: Annotated[ParserConfig | None, Field(default=None)]
+    meta_fields: Annotated[dict | None, Field(default={})]
+
+    @field_validator("chunk_method", mode="after")
+    @classmethod
+    def validate_document_chunk_method(cls, chunk_method: str | None):
+        if chunk_method:
+            # Validate chunk method if present
+            valid_chunk_method = {"naive", "manual", "qa", "table", "paper", "book", "laws", "presentation", "picture", "one", "knowledge_graph", "email", "tag"}
+            if chunk_method not in valid_chunk_method:
+                raise PydanticCustomError("format_invalid", "`chunk_method` {chunk_method} doesn't exist", {"chunk_method":chunk_method})
+
+        return chunk_method
+
+    @field_validator("enabled", mode="after")
+    @classmethod
+    def validate_document_enabled(cls, enabled: str | None):
+        if enabled:
+            converted = int(enabled)
+            if converted < 0 or converted > 1:
+                raise PydanticCustomError("format_invalid", "`enabled` value invalid, only accept 0 or 1 but is {enabled}", {"enabled":enabled})
+
+        return enabled

 class CreateDatasetReq(Base):
    name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(...)]
@@ -810,3 +845,44 @@ class ListFileReq(BaseModel):
    page_size: Annotated[int, Field(default=15, ge=1, le=100)]
    orderby: Annotated[str, Field(default="create_time")]
    desc: Annotated[bool, Field(default=True)]
+
+
+def validate_immutable_fields(update_doc_req:UpdateDocumentReq, doc):
+    """Validate that immutable fields have not been changed."""
+    if update_doc_req.chunk_count and update_doc_req.chunk_count != int(getattr(doc, "chunk_num", -1)):
+        return "Can't change `chunk_count`.", RetCode.DATA_ERROR
+
+    if update_doc_req.token_count and update_doc_req.token_count != int(getattr(doc, "token_num", -1)):
+        return "Can't change `token_count`.", RetCode.DATA_ERROR
+
+    if update_doc_req.progress:
+        progress_from_db = float(getattr(doc, "progress", -1.0))
+        # should not use "==" to compare two float values
+        if not math.isclose(update_doc_req.progress, progress_from_db):
+            return "Can't change `progress`.", RetCode.DATA_ERROR
+
+    return None, None
+
+
+def validate_document_name(req_doc_name:str, doc, docs_from_name):
+    """Validate document name update."""
+    if not isinstance(req_doc_name, str):
+        return f"AttributeError('{type(req_doc_name).__name__}' object has no attribute 'encode')", RetCode.EXCEPTION_ERROR
+    if len(req_doc_name.encode("utf-8")) > FILE_NAME_LEN_LIMIT:
+        return f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.", RetCode.ARGUMENT_ERROR
+    if pathlib.Path(req_doc_name.lower()).suffix != pathlib.Path(doc.name.lower()).suffix:
+        return "The extension of file can't be changed", RetCode.ARGUMENT_ERROR
+
+    for d in docs_from_name:
+        if d.name == req_doc_name:
+            return "Duplicated document name in the same dataset.", RetCode.DATA_ERROR
+    return None, None
+
+def validate_chunk_method(doc, chunk_method=None):
+    """Validate chunk method update."""
+    if chunk_method is not None and len(chunk_method) == 0: # will not be detected in UpdateDocumentReq
+        return "`chunk_method` (empty string) is not valid", RetCode.DATA_ERROR
+    if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name):
+        return "Not supported yet!", RetCode.DATA_ERROR
+    return None, None
+