Fix: #6098 - Add validation logic for parser_config when update document (#13911)

### What problem does this PR solve?

Add validation logic for parser_config.
Refactor the processing flow. Before change, validation logics and
update logics are mixed up - some validation logis executes followed by
some update logic executes and then another such
"validation-and-then-update" which is not good. After change, all
validation logic executes firstly. Update logic will be executed after
ALL validation logic executed.
Validation logic for parameters (that come from front end) will be
checked using Pydantic. For validation logic that depends on data from
DB, they will be in separate methods.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
This commit is contained in:
Jack
2026-04-07 11:33:05 +08:00
committed by GitHub
parent 5673245134
commit c4b0aaa874
11 changed files with 1079 additions and 193 deletions

View File

@@ -13,6 +13,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import math
import pathlib
import re
from collections import Counter
import string
from typing import Annotated, Any, Literal
@@ -32,7 +35,9 @@ from pydantic import (
from pydantic_core import PydanticCustomError
from werkzeug.exceptions import BadRequest, UnsupportedMediaType
from api.constants import DATASET_NAME_LIMIT
from api.constants import DATASET_NAME_LIMIT, FILE_NAME_LEN_LIMIT
from api.db import FileType
from common.constants import RetCode
async def validate_and_parse_json_request(
@@ -390,6 +395,36 @@ class ParserConfig(Base):
pages: Annotated[list[list[int]] | None, Field(default=None)]
ext: Annotated[dict, Field(default={})]
class UpdateDocumentReq(Base):
model_config = ConfigDict(extra='ignore')
chunk_method: Annotated[str | None, Field(default=None, max_length=65535)]
enabled: Annotated[int | None, Field(default=None, ge=0, le=1)]
chunk_count: Annotated[int | None, Field(default=None, ge=0)]
token_count: Annotated[int | None, Field(default=None, ge=0)]
progress: Annotated[float | None, Field(default=None, ge=0.0, le=1.0)]
parser_config: Annotated[ParserConfig | None, Field(default=None)]
meta_fields: Annotated[dict | None, Field(default={})]
@field_validator("chunk_method", mode="after")
@classmethod
def validate_document_chunk_method(cls, chunk_method: str | None):
if chunk_method:
# Validate chunk method if present
valid_chunk_method = {"naive", "manual", "qa", "table", "paper", "book", "laws", "presentation", "picture", "one", "knowledge_graph", "email", "tag"}
if chunk_method not in valid_chunk_method:
raise PydanticCustomError("format_invalid", "`chunk_method` {chunk_method} doesn't exist", {"chunk_method":chunk_method})
return chunk_method
@field_validator("enabled", mode="after")
@classmethod
def validate_document_enabled(cls, enabled: str | None):
if enabled:
converted = int(enabled)
if converted < 0 or converted > 1:
raise PydanticCustomError("format_invalid", "`enabled` value invalid, only accept 0 or 1 but is {enabled}", {"enabled":enabled})
return enabled
class CreateDatasetReq(Base):
name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(...)]
@@ -810,3 +845,44 @@ class ListFileReq(BaseModel):
page_size: Annotated[int, Field(default=15, ge=1, le=100)]
orderby: Annotated[str, Field(default="create_time")]
desc: Annotated[bool, Field(default=True)]
def validate_immutable_fields(update_doc_req:UpdateDocumentReq, doc):
"""Validate that immutable fields have not been changed."""
if update_doc_req.chunk_count and update_doc_req.chunk_count != int(getattr(doc, "chunk_num", -1)):
return "Can't change `chunk_count`.", RetCode.DATA_ERROR
if update_doc_req.token_count and update_doc_req.token_count != int(getattr(doc, "token_num", -1)):
return "Can't change `token_count`.", RetCode.DATA_ERROR
if update_doc_req.progress:
progress_from_db = float(getattr(doc, "progress", -1.0))
# should not use "==" to compare two float values
if not math.isclose(update_doc_req.progress, progress_from_db):
return "Can't change `progress`.", RetCode.DATA_ERROR
return None, None
def validate_document_name(req_doc_name:str, doc, docs_from_name):
"""Validate document name update."""
if not isinstance(req_doc_name, str):
return f"AttributeError('{type(req_doc_name).__name__}' object has no attribute 'encode')", RetCode.EXCEPTION_ERROR
if len(req_doc_name.encode("utf-8")) > FILE_NAME_LEN_LIMIT:
return f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.", RetCode.ARGUMENT_ERROR
if pathlib.Path(req_doc_name.lower()).suffix != pathlib.Path(doc.name.lower()).suffix:
return "The extension of file can't be changed", RetCode.ARGUMENT_ERROR
for d in docs_from_name:
if d.name == req_doc_name:
return "Duplicated document name in the same dataset.", RetCode.DATA_ERROR
return None, None
def validate_chunk_method(doc, chunk_method=None):
"""Validate chunk method update."""
if chunk_method is not None and len(chunk_method) == 0: # will not be detected in UpdateDocumentReq
return "`chunk_method` (empty string) is not valid", RetCode.DATA_ERROR
if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name):
return "Not supported yet!", RetCode.DATA_ERROR
return None, None