mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
Feat: expose parent-child chunking configuration via HTTP API and Python SDK (#13940)
### What problem does this PR solve?

Closes #13857

Parent-child chunking was introduced in v0.23.0 but is only configurable through the web UI. Users managing datasets programmatically cannot enable it via the HTTP API or Python SDK because `ParserConfig` uses `extra="forbid"`, which rejects the `children_delimiter` field at validation.

### What does this PR change?

Adds a `parent_child` nested config to `ParserConfig`, following the same pattern as `raptor` and `graphrag`:

```json
"parser_config": {
  "parent_child": {
    "use_parent_child": true,
    "children_delimiter": "\n"
  }
}
```

- api/utils/validation_utils.py — new ParentChildConfig model, added to ParserConfig
- api/utils/api_utils.py — naive defaults + flatten to children_delimiter for the execution layer
- api/apps/services/dataset_api_service.py — flatten on the update path
- test/testcases/configs.py — updated DEFAULT_PARSER_CONFIG
- test/testcases/test_http_api/test_dataset_management/test_create_dataset.py — 4 valid + 2 invalid test cases

No changes to the execution layer (rag/app/naive.py, rag/nlp/search.py). Existing UI flow via ext is unaffected.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

* **New Features**
  * Added parent-child chunking configuration for dataset creation and updates with new `use_parent_child` toggle and customizable `children_delimiter` setting to specify how parent chunks are split into child chunks.
* **Documentation**
  * Updated HTTP and Python API references with parent-child chunking configuration details and examples.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
@@ -440,6 +440,10 @@ def get_parser_config(chunk_method, parser_config):
|
||||
],
|
||||
"method": "light",
|
||||
},
|
||||
"parent_child": {
|
||||
"use_parent_child": False,
|
||||
"children_delimiter": "\n",
|
||||
},
|
||||
},
|
||||
"qa": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
|
||||
"tag": None,
|
||||
@@ -467,16 +471,23 @@ def get_parser_config(chunk_method, parser_config):
|
||||
# If no parser_config provided, return default merged with base defaults
|
||||
if not parser_config:
|
||||
if default_config is None:
|
||||
return deep_merge(base_defaults, {})
|
||||
return deep_merge(base_defaults, default_config)
|
||||
merged_config = deep_merge(base_defaults, {})
|
||||
else:
|
||||
merged_config = deep_merge(base_defaults, default_config)
|
||||
elif default_config is None:
|
||||
# If parser_config is provided but no defaults for this method
|
||||
merged_config = deep_merge(base_defaults, parser_config)
|
||||
else:
|
||||
# Ensure raptor and graph_rag fields have default values if not provided
|
||||
merged_config = deep_merge(base_defaults, default_config)
|
||||
merged_config = deep_merge(merged_config, parser_config)
|
||||
|
||||
# If parser_config is provided, merge with defaults to ensure required fields exist
|
||||
if default_config is None:
|
||||
return deep_merge(base_defaults, parser_config)
|
||||
|
||||
# Ensure raptor and graph_rag fields have default values if not provided
|
||||
merged_config = deep_merge(base_defaults, default_config)
|
||||
merged_config = deep_merge(merged_config, parser_config)
|
||||
# Flatten parent_child config into children_delimiter for the execution layer
|
||||
pc = merged_config.get("parent_child", {})
|
||||
if pc.get("use_parent_child"):
|
||||
merged_config["children_delimiter"] = pc.get("children_delimiter", "\n")
|
||||
elif pc:
|
||||
merged_config["children_delimiter"] = ""
|
||||
|
||||
return merged_config
|
||||
|
||||
|
||||
@@ -362,6 +362,11 @@ class GraphragConfig(Base):
|
||||
resolution: Annotated[bool, Field(default=False)]
|
||||
|
||||
|
||||
# Nested parser-config section that controls parent-child chunking.
# NOTE(review): the field lines below carry no indentation and are separated by
# "|" / "||||" rows; this looks like diff-view extraction residue rather than
# real source — confirm against the actual file before relying on this layout.
class ParentChildConfig(Base):
|
||||
# Toggle for parent-child chunking; disabled (False) unless set explicitly.
use_parent_child: Annotated[bool, Field(default=False)]
|
||||
# Delimiter used to split parent chunks into child chunks. Empty strings are
# rejected (min_length=1). The default is the raw two-character string
# backslash + "n", whereas the defaults dict in get_parser_config uses "\n"
# (a real newline).
# NOTE(review): confirm the two layers are meant to differ in escaping.
children_delimiter: Annotated[str, Field(default=r"\n", min_length=1)]
|
||||
|
||||
|
||||
class AutoMetadataField(Base):
|
||||
"""Schema for a single auto-metadata field configuration."""
|
||||
|
||||
@@ -387,6 +392,7 @@ class ParserConfig(Base):
|
||||
graphrag: Annotated[GraphragConfig, Field(default_factory=lambda: GraphragConfig(use_graphrag=False))]
|
||||
html4excel: Annotated[bool, Field(default=False)]
|
||||
layout_recognize: Annotated[str, Field(default="DeepDOC")]
|
||||
parent_child: Annotated[ParentChildConfig, Field(default_factory=lambda: ParentChildConfig(use_parent_child=False))]
|
||||
raptor: Annotated[RaptorConfig, Field(default_factory=lambda: RaptorConfig(use_raptor=False))]
|
||||
tag_kb_ids: Annotated[list[str], Field(default_factory=list)]
|
||||
topn_tags: Annotated[int, Field(default=1, ge=1, le=10)]
|
||||
|
||||
Reference in New Issue
Block a user