From a0f9ae16d2d84660bc5ee7db8acf7a89a697c3e3 Mon Sep 17 00:00:00 2001 From: euvre <93761161+euvre@users.noreply.github.com> Date: Wed, 29 Apr 2026 10:46:28 +0000 Subject: [PATCH] Fix: RAPTOR "Generation scope" reset to "Single file" when selecting "Dataset" (#14477) ## Problem In the Dataset Configuration page, changing the RAPTOR **Generation scope** from "Single file" to "Dataset" and clicking **Save** did not persist the change. After refreshing or re-entering the page, the scope always reverted to "Single file". ## Root Cause 1. **Backend**: The `RaptorConfig` Pydantic model in `api/utils/validation_utils.py` was configured with `extra="forbid"` but did not declare a `scope` field. When the frontend sent `"scope": "dataset"`, Pydantic rejected the request. 2. **Frontend**: The `extractRaptorConfigExt` utility in `web/src/hooks/parser-config-utils.ts` treated `scope` as an unknown field and moved it into the nested `ext` object. Consequently, the backend could not read `raptor_config.get("scope", "file")` correctly, so the default `"file"` was always used. ## Changes - Added `scope: Literal["file", "dataset"]` to the backend `RaptorConfig` model with a default of `"file"`. - Added `scope` to the known-field whitelist in the frontend `extractRaptorConfigExt` helper so it is transmitted as a top-level raptor field instead of being buried in `ext`. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --------- Signed-off-by: noob --- api/utils/validation_utils.py | 47 +++++++++++++--------------- web/src/hooks/parser-config-utils.ts | 2 ++ 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/api/utils/validation_utils.py b/api/utils/validation_utils.py index 0ce4a8b170..8ded91261c 100644 --- a/api/utils/validation_utils.py +++ b/api/utils/validation_utils.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import logging import math import pathlib import re @@ -22,16 +23,7 @@ from typing import Annotated, Any, Literal from uuid import UUID from quart import Request -from pydantic import ( - BaseModel, - ConfigDict, - Field, - StringConstraints, - ValidationError, - field_validator, - model_validator, - ValidationInfo -) +from pydantic import BaseModel, ConfigDict, Field, StringConstraints, ValidationError, field_validator, model_validator, ValidationInfo from pydantic_core import PydanticCustomError from werkzeug.exceptions import BadRequest, UnsupportedMediaType @@ -170,12 +162,13 @@ def validate_and_parse_request_args(request: Request, validator: type[BaseModel] args = request.args.to_dict(flat=True) # Handle ext parameter: parse JSON string to dict if it's a string - if 'ext' in args and isinstance(args['ext'], str): + if "ext" in args and isinstance(args["ext"], str): import json + try: - args['ext'] = json.loads(args['ext']) + args["ext"] = json.loads(args["ext"]) except json.JSONDecodeError: - pass # Keep the string and let validation handle the error + logging.debug("Failed to decode query arg 'ext' as JSON; passing raw value to validator") try: if extras is not None: @@ -350,6 +343,7 @@ class RaptorConfig(Base): threshold: Annotated[float, Field(default=0.1, ge=0.0, le=1.0)] max_cluster: Annotated[int, Field(default=64, ge=1, le=1024)] random_seed: Annotated[int, Field(default=0, ge=0)] + scope: Annotated[Literal["file", "dataset"], Field(default="file")] auto_disable_for_structured_data: Annotated[bool, Field(default=True)] ext: Annotated[dict, Field(default={})] @@ -401,6 +395,7 @@ class ParserConfig(Base): pages: Annotated[list[list[int]] | None, Field(default=None)] ext: Annotated[dict, Field(default={})] + class UpdateDocumentReq(Base): """ Request model for updating a document. @@ -408,7 +403,8 @@ class UpdateDocumentReq(Base): This model validates the request parameters for updating a document, including name, chunk method, enabled status, and other metadata. """ - model_config = ConfigDict(extra='ignore') + + model_config = ConfigDict(extra="ignore") name: Annotated[str | None, Field(default=None, max_length=65535)] chunk_method: Annotated[str | None, Field(default=None, max_length=65535)] pipeline_id: Annotated[str | None, Field(default=None, max_length=65535)] @@ -426,7 +422,7 @@ class UpdateDocumentReq(Base): # Validate chunk method if present valid_chunk_method = {"naive", "manual", "qa", "table", "paper", "book", "laws", "presentation", "picture", "one", "knowledge_graph", "email", "tag"} if chunk_method not in valid_chunk_method: - raise PydanticCustomError("format_invalid", "`chunk_method` {chunk_method} doesn't exist", {"chunk_method":chunk_method}) + raise PydanticCustomError("format_invalid", "`chunk_method` {chunk_method} doesn't exist", {"chunk_method": chunk_method}) return chunk_method @@ -436,7 +432,7 @@ class UpdateDocumentReq(Base): if enabled: converted = int(enabled) if converted < 0 or converted > 1: - raise PydanticCustomError("format_invalid", "`enabled` value invalid, only accept 0 or 1 but is {enabled}", {"enabled":enabled}) + raise PydanticCustomError("format_invalid", "`enabled` value invalid, only accept 0 or 1 but is {enabled}", {"enabled": enabled}) return enabled @@ -451,11 +447,12 @@ class UpdateDocumentReq(Base): for k, v in meta_fields.items(): if isinstance(v, list): if not all(isinstance(i, (str, int, float)) for i in v): - raise PydanticCustomError("format_invalid", "The type is not supported in list: {v}", {"v":v}) + raise PydanticCustomError("format_invalid", "The type is not supported in list: {v}", {"v": v}) elif not isinstance(v, (str, int, float)): - raise PydanticCustomError("format_invalid", "The type is not supported: {v}", {"v":v}) + raise PydanticCustomError("format_invalid", "The type is not supported: {v}", {"v": v}) return meta_fields + class CreateDatasetReq(Base): name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(...)] avatar: Annotated[str | None, Field(default=None, max_length=65535)] @@ -708,8 +705,7 @@ class CreateDatasetReq(Base): @classmethod def validate_chunk_method(cls, v: Any, handler, info: ValidationInfo) -> Any: """Wrap validation to unify error messages, including type errors (e.g. list).""" - allowed = {"naive", "book", "email", "laws", "manual", "one", "paper", "picture", "presentation", "qa", "table", - "tag", "resume"} + allowed = {"naive", "book", "email", "laws", "manual", "one", "paper", "picture", "presentation", "qa", "table", "tag", "resume"} error_msg = "Input should be 'naive', 'book', 'email', 'laws', 'manual', 'one', 'paper', 'picture', 'presentation', 'qa', 'table', 'tag' or 'resume'" try: # Run inner validation (type checking) @@ -864,6 +860,7 @@ class ListDatasetReq(BaseListReq): # ---- File Management Request Models ---- + class CreateFolderReq(Base): name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=255), Field(...)] parent_id: Annotated[str | None, Field(default=None)] @@ -879,7 +876,7 @@ class MoveFileReq(Base): dest_file_id: Annotated[str | None, Field(default=None)] new_name: Annotated[str | None, StringConstraints(strip_whitespace=True, min_length=1, max_length=255), Field(default=None)] - @model_validator(mode='after') + @model_validator(mode="after") def check_operation(self): if not self.dest_file_id and not self.new_name: raise ValueError("At least one of dest_file_id or new_name must be provided") @@ -899,7 +896,7 @@ class ListFileReq(BaseModel): desc: Annotated[bool, Field(default=True)] -def validate_immutable_fields(update_doc_req:UpdateDocumentReq, doc): +def validate_immutable_fields(update_doc_req: UpdateDocumentReq, doc): """ Validate that immutable fields have not been changed. @@ -929,7 +926,7 @@ def validate_immutable_fields(update_doc_req:UpdateDocumentReq, doc): return None, None -def validate_document_name(req_doc_name:str, doc, docs_from_name): +def validate_document_name(req_doc_name: str, doc, docs_from_name): """ Validate document name update. @@ -960,6 +957,7 @@ def validate_document_name(req_doc_name:str, doc, docs_from_name): return "Duplicated document name in the same dataset.", RetCode.DATA_ERROR return None, None + def validate_chunk_method(doc, chunk_method=None): """ Validate chunk method update. @@ -975,9 +973,8 @@ def validate_chunk_method(doc, chunk_method=None): A tuple of (error_message, error_code) if validation fails, or (None, None) if validation passes. """ - if chunk_method is not None and len(chunk_method) == 0: # will not be detected in UpdateDocumentReq + if chunk_method is not None and len(chunk_method) == 0: # will not be detected in UpdateDocumentReq return "`chunk_method` (empty string) is not valid", RetCode.DATA_ERROR if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name): return "Not supported yet!", RetCode.DATA_ERROR return None, None - diff --git a/web/src/hooks/parser-config-utils.ts b/web/src/hooks/parser-config-utils.ts index bc617cfde1..c02a42a01a 100644 --- a/web/src/hooks/parser-config-utils.ts +++ b/web/src/hooks/parser-config-utils.ts @@ -20,6 +20,7 @@ export const extractRaptorConfigExt = ( threshold, max_cluster, random_seed, + scope, auto_disable_for_structured_data, ext, ...raptorExt @@ -31,6 +32,7 @@ export const extractRaptorConfigExt = ( threshold, max_cluster, random_seed, + scope, auto_disable_for_structured_data, ext: { ...ext, ...raptorExt }, };