Fix: RAPTOR "Generation scope" reset to "Single file" when selecting "Dataset" (#14477)

## Problem
In the Dataset Configuration page, changing the RAPTOR **Generation
scope** from "Single file" to "Dataset" and clicking **Save** did not
persist the change. After refreshing or re-entering the page, the scope
always reverted to "Single file".

## Root Cause
1. **Backend**: The `RaptorConfig` Pydantic model in
`api/utils/validation_utils.py` was configured with `extra="forbid"` but
did not declare a `scope` field. When the frontend sent `"scope":
"dataset"`, Pydantic rejected the request.
2. **Frontend**: The `extractRaptorConfigExt` utility in
`web/src/hooks/parser-config-utils.ts` treated `scope` as an unknown
field and moved it into the nested `ext` object. Consequently, the
backend's `raptor_config.get("scope", "file")` lookup found no top-level
`scope` key and always fell back to the default `"file"`.

## Changes
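- Added `scope: Literal["file", "dataset"]` to the backend
`RaptorConfig` model with a default of `"file"`.
- Added `scope` to the known-field whitelist in the frontend
`extractRaptorConfigExt` helper so it is transmitted as a top-level
raptor field instead of being buried in `ext`.
- Incidental cleanup in `api/utils/validation_utils.py`: normalized
string quotes and spacing, collapsed the multi-line `pydantic` import
onto one line, and added a debug log for the case where the `ext` query
argument cannot be decoded as JSON.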

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Signed-off-by: noob <yixiao121314@outlook.com>
This commit is contained in:
euvre
2026-04-29 10:46:28 +00:00
committed by GitHub
parent 1b84892e3a
commit a0f9ae16d2
2 changed files with 24 additions and 25 deletions

View File

@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import math
import pathlib
import re
@@ -22,16 +23,7 @@ from typing import Annotated, Any, Literal
from uuid import UUID
from quart import Request
from pydantic import (
BaseModel,
ConfigDict,
Field,
StringConstraints,
ValidationError,
field_validator,
model_validator,
ValidationInfo
)
from pydantic import BaseModel, ConfigDict, Field, StringConstraints, ValidationError, field_validator, model_validator, ValidationInfo
from pydantic_core import PydanticCustomError
from werkzeug.exceptions import BadRequest, UnsupportedMediaType
@@ -170,12 +162,13 @@ def validate_and_parse_request_args(request: Request, validator: type[BaseModel]
args = request.args.to_dict(flat=True)
# Handle ext parameter: parse JSON string to dict if it's a string
if 'ext' in args and isinstance(args['ext'], str):
if "ext" in args and isinstance(args["ext"], str):
import json
try:
args['ext'] = json.loads(args['ext'])
args["ext"] = json.loads(args["ext"])
except json.JSONDecodeError:
pass # Keep the string and let validation handle the error
logging.debug("Failed to decode query arg 'ext' as JSON; passing raw value to validator")
try:
if extras is not None:
@@ -350,6 +343,7 @@ class RaptorConfig(Base):
threshold: Annotated[float, Field(default=0.1, ge=0.0, le=1.0)]
max_cluster: Annotated[int, Field(default=64, ge=1, le=1024)]
random_seed: Annotated[int, Field(default=0, ge=0)]
scope: Annotated[Literal["file", "dataset"], Field(default="file")]
auto_disable_for_structured_data: Annotated[bool, Field(default=True)]
ext: Annotated[dict, Field(default={})]
@@ -401,6 +395,7 @@ class ParserConfig(Base):
pages: Annotated[list[list[int]] | None, Field(default=None)]
ext: Annotated[dict, Field(default={})]
class UpdateDocumentReq(Base):
"""
Request model for updating a document.
@@ -408,7 +403,8 @@ class UpdateDocumentReq(Base):
This model validates the request parameters for updating a document,
including name, chunk method, enabled status, and other metadata.
"""
model_config = ConfigDict(extra='ignore')
model_config = ConfigDict(extra="ignore")
name: Annotated[str | None, Field(default=None, max_length=65535)]
chunk_method: Annotated[str | None, Field(default=None, max_length=65535)]
pipeline_id: Annotated[str | None, Field(default=None, max_length=65535)]
@@ -426,7 +422,7 @@ class UpdateDocumentReq(Base):
# Validate chunk method if present
valid_chunk_method = {"naive", "manual", "qa", "table", "paper", "book", "laws", "presentation", "picture", "one", "knowledge_graph", "email", "tag"}
if chunk_method not in valid_chunk_method:
raise PydanticCustomError("format_invalid", "`chunk_method` {chunk_method} doesn't exist", {"chunk_method":chunk_method})
raise PydanticCustomError("format_invalid", "`chunk_method` {chunk_method} doesn't exist", {"chunk_method": chunk_method})
return chunk_method
@@ -436,7 +432,7 @@ class UpdateDocumentReq(Base):
if enabled:
converted = int(enabled)
if converted < 0 or converted > 1:
raise PydanticCustomError("format_invalid", "`enabled` value invalid, only accept 0 or 1 but is {enabled}", {"enabled":enabled})
raise PydanticCustomError("format_invalid", "`enabled` value invalid, only accept 0 or 1 but is {enabled}", {"enabled": enabled})
return enabled
@@ -451,11 +447,12 @@ class UpdateDocumentReq(Base):
for k, v in meta_fields.items():
if isinstance(v, list):
if not all(isinstance(i, (str, int, float)) for i in v):
raise PydanticCustomError("format_invalid", "The type is not supported in list: {v}", {"v":v})
raise PydanticCustomError("format_invalid", "The type is not supported in list: {v}", {"v": v})
elif not isinstance(v, (str, int, float)):
raise PydanticCustomError("format_invalid", "The type is not supported: {v}", {"v":v})
raise PydanticCustomError("format_invalid", "The type is not supported: {v}", {"v": v})
return meta_fields
class CreateDatasetReq(Base):
name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(...)]
avatar: Annotated[str | None, Field(default=None, max_length=65535)]
@@ -708,8 +705,7 @@ class CreateDatasetReq(Base):
@classmethod
def validate_chunk_method(cls, v: Any, handler, info: ValidationInfo) -> Any:
"""Wrap validation to unify error messages, including type errors (e.g. list)."""
allowed = {"naive", "book", "email", "laws", "manual", "one", "paper", "picture", "presentation", "qa", "table",
"tag", "resume"}
allowed = {"naive", "book", "email", "laws", "manual", "one", "paper", "picture", "presentation", "qa", "table", "tag", "resume"}
error_msg = "Input should be 'naive', 'book', 'email', 'laws', 'manual', 'one', 'paper', 'picture', 'presentation', 'qa', 'table', 'tag' or 'resume'"
try:
# Run inner validation (type checking)
@@ -864,6 +860,7 @@ class ListDatasetReq(BaseListReq):
# ---- File Management Request Models ----
class CreateFolderReq(Base):
name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=255), Field(...)]
parent_id: Annotated[str | None, Field(default=None)]
@@ -879,7 +876,7 @@ class MoveFileReq(Base):
dest_file_id: Annotated[str | None, Field(default=None)]
new_name: Annotated[str | None, StringConstraints(strip_whitespace=True, min_length=1, max_length=255), Field(default=None)]
@model_validator(mode='after')
@model_validator(mode="after")
def check_operation(self):
if not self.dest_file_id and not self.new_name:
raise ValueError("At least one of dest_file_id or new_name must be provided")
@@ -899,7 +896,7 @@ class ListFileReq(BaseModel):
desc: Annotated[bool, Field(default=True)]
def validate_immutable_fields(update_doc_req:UpdateDocumentReq, doc):
def validate_immutable_fields(update_doc_req: UpdateDocumentReq, doc):
"""
Validate that immutable fields have not been changed.
@@ -929,7 +926,7 @@ def validate_immutable_fields(update_doc_req:UpdateDocumentReq, doc):
return None, None
def validate_document_name(req_doc_name:str, doc, docs_from_name):
def validate_document_name(req_doc_name: str, doc, docs_from_name):
"""
Validate document name update.
@@ -960,6 +957,7 @@ def validate_document_name(req_doc_name:str, doc, docs_from_name):
return "Duplicated document name in the same dataset.", RetCode.DATA_ERROR
return None, None
def validate_chunk_method(doc, chunk_method=None):
"""
Validate chunk method update.
@@ -975,9 +973,8 @@ def validate_chunk_method(doc, chunk_method=None):
A tuple of (error_message, error_code) if validation fails,
or (None, None) if validation passes.
"""
if chunk_method is not None and len(chunk_method) == 0: # will not be detected in UpdateDocumentReq
if chunk_method is not None and len(chunk_method) == 0: # will not be detected in UpdateDocumentReq
return "`chunk_method` (empty string) is not valid", RetCode.DATA_ERROR
if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name):
return "Not supported yet!", RetCode.DATA_ERROR
return None, None