diff --git a/api/apps/services/dataset_api_service.py b/api/apps/services/dataset_api_service.py index 5755ed4f77..189a38ae10 100644 --- a/api/apps/services/dataset_api_service.py +++ b/api/apps/services/dataset_api_service.py @@ -211,6 +211,13 @@ async def update_dataset(tenant_id: str, dataset_id: str, req: dict): parser_config.update(req_ext_fields) req["parser_config"] = deep_merge(kb.parser_config, parser_config) + # Flatten parent_child config into children_delimiter for the execution layer + pc = req["parser_config"].get("parent_child", {}) + if pc.get("use_parent_child"): + req["parser_config"]["children_delimiter"] = pc.get("children_delimiter", "\n") + elif pc: + req["parser_config"]["children_delimiter"] = "" + if (chunk_method := req.get("parser_id")) and chunk_method != kb.parser_id: if not req.get("parser_config"): req["parser_config"] = get_parser_config(chunk_method, None) diff --git a/api/utils/api_utils.py b/api/utils/api_utils.py index 4e5d611cdc..fe6f6d0d44 100644 --- a/api/utils/api_utils.py +++ b/api/utils/api_utils.py @@ -440,6 +440,10 @@ def get_parser_config(chunk_method, parser_config): ], "method": "light", }, + "parent_child": { + "use_parent_child": False, + "children_delimiter": "\n", + }, }, "qa": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}}, "tag": None, @@ -467,16 +471,23 @@ def get_parser_config(chunk_method, parser_config): # If no parser_config provided, return default merged with base defaults if not parser_config: if default_config is None: - return deep_merge(base_defaults, {}) - return deep_merge(base_defaults, default_config) + merged_config = deep_merge(base_defaults, {}) + else: + merged_config = deep_merge(base_defaults, default_config) + elif default_config is None: + # If parser_config is provided but no defaults for this method + merged_config = deep_merge(base_defaults, parser_config) + else: + # Ensure raptor and graph_rag fields have default values if not provided + merged_config = deep_merge(base_defaults, default_config) + merged_config = deep_merge(merged_config, parser_config) - # If parser_config is provided, merge with defaults to ensure required fields exist - if default_config is None: - return deep_merge(base_defaults, parser_config) - - # Ensure raptor and graph_rag fields have default values if not provided - merged_config = deep_merge(base_defaults, default_config) - merged_config = deep_merge(merged_config, parser_config) + # Flatten parent_child config into children_delimiter for the execution layer + pc = merged_config.get("parent_child", {}) + if pc.get("use_parent_child"): + merged_config["children_delimiter"] = pc.get("children_delimiter", "\n") + elif pc: + merged_config["children_delimiter"] = "" return merged_config diff --git a/api/utils/validation_utils.py b/api/utils/validation_utils.py index d2454fc266..19033bef8b 100644 --- a/api/utils/validation_utils.py +++ b/api/utils/validation_utils.py @@ -362,6 +362,11 @@ class GraphragConfig(Base): resolution: Annotated[bool, Field(default=False)] +class ParentChildConfig(Base): + use_parent_child: Annotated[bool, Field(default=False)] + children_delimiter: Annotated[str, Field(default=r"\n", min_length=1)] + + class AutoMetadataField(Base): """Schema for a single auto-metadata field configuration.""" @@ -387,6 +392,7 @@ class ParserConfig(Base): graphrag: Annotated[GraphragConfig, Field(default_factory=lambda: GraphragConfig(use_graphrag=False))] html4excel: Annotated[bool, Field(default=False)] layout_recognize: Annotated[str, Field(default="DeepDOC")] + parent_child: Annotated[ParentChildConfig, Field(default_factory=lambda: ParentChildConfig(use_parent_child=False))] raptor: Annotated[RaptorConfig, Field(default_factory=lambda: RaptorConfig(use_raptor=False))] tag_kb_ids: Annotated[list[str], Field(default_factory=list)] topn_tags: Annotated[int, Field(default=1, ge=1, le=10)] diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md index 659f1957b3..ac0fc58035 100644 --- a/docs/references/http_api_reference.md +++ b/docs/references/http_api_reference.md @@ -566,6 +566,9 @@ curl --request POST \ - Defaults to: `{"use_raptor": false}` - `"graphrag"`: `object` GRAPHRAG-specific settings. - Defaults to: `{"use_graphrag": false}` + - `"parent_child"`: `object` Parent-child chunking settings. When enabled, each chunk is further split into smaller child chunks using `children_delimiter`. At retrieval time, matched child chunks are replaced by their parent's full text before being passed to the LLM, giving precise vector matching with broader context. + - `"use_parent_child"`: `bool` Whether to enable parent-child chunking. Defaults to `false`. + - `"children_delimiter"`: `string` The delimiter used to split a parent chunk into child chunks. Only takes effect when `"use_parent_child"` is `true`. Defaults to `"\n"`. - If `"chunk_method"` is `"qa"`, `"manuel"`, `"paper"`, `"book"`, `"laws"`, or `"presentation"`, the `"parser_config"` object contains the following attribute: - `"raptor"`: `object` RAPTOR-specific settings. - Defaults to: `{"use_raptor": false}`. @@ -820,6 +823,9 @@ curl --request PUT \ - Defaults to: `{"use_raptor": false}` - `"graphrag"`: `object` GRAPHRAG-specific settings. - Defaults to: `{"use_graphrag": false}` + - `"parent_child"`: `object` Parent-child chunking settings. When enabled, each chunk is further split into smaller child chunks using `children_delimiter`. At retrieval time, matched child chunks are replaced by their parent's full text before being passed to the LLM, giving precise vector matching with broader context. + - `"use_parent_child"`: `bool` Whether to enable parent-child chunking. Defaults to `false`. + - `"children_delimiter"`: `string` The delimiter used to split a parent chunk into child chunks. Only takes effect when `"use_parent_child"` is `true`. Defaults to `"\n"`. - If `"chunk_method"` is `"qa"`, `"manuel"`, `"paper"`, `"book"`, `"laws"`, or `"presentation"`, the `"parser_config"` object contains the following attribute: - `"raptor"`: `object` RAPTOR-specific settings. - Defaults to: `{"use_raptor": false}`. diff --git a/docs/references/python_api_reference.md b/docs/references/python_api_reference.md index 1709550263..e2e0f7bad0 100644 --- a/docs/references/python_api_reference.md +++ b/docs/references/python_api_reference.md @@ -187,7 +187,7 @@ The chunking method of the dataset to create. Available options: The parser configuration of the dataset. A `ParserConfig` object's attributes vary based on the selected `chunk_method`: - `chunk_method`=`"naive"`: - `{"chunk_token_num":512,"delimiter":"\\n","html4excel":False,"layout_recognize":True,"raptor":{"use_raptor":False}}`. + `{"chunk_token_num":512,"delimiter":"\\n","html4excel":False,"layout_recognize":True,"raptor":{"use_raptor":False},"parent_child":{"use_parent_child":False,"children_delimiter":"\\n"}}`. - `chunk_method`=`"qa"`: `{"raptor": {"use_raptor": False}}` - `chunk_method`=`"manuel"`: @@ -480,7 +480,7 @@ A dictionary representing the attributes to update, with the following keys: - `"email"`: Email - `"parser_config"`: `dict[str, Any]` The parsing configuration for the document. Its attributes vary based on the selected `"chunk_method"`: - `"chunk_method"`=`"naive"`: - `{"chunk_token_num":128,"delimiter":"\\n","html4excel":False,"layout_recognize":True,"raptor":{"use_raptor":False}}`. + `{"chunk_token_num":128,"delimiter":"\\n","html4excel":False,"layout_recognize":True,"raptor":{"use_raptor":False},"parent_child":{"use_parent_child":False,"children_delimiter":"\\n"}}`. - `chunk_method`=`"qa"`: `{"raptor": {"use_raptor": False}}` - `chunk_method`=`"manuel"`: diff --git a/test/testcases/configs.py b/test/testcases/configs.py index 9700da23f2..546cd378c9 100644 --- a/test/testcases/configs.py +++ b/test/testcases/configs.py @@ -66,4 +66,9 @@ DEFAULT_PARSER_CONFIG = { ], "method": "light", }, + "parent_child": { + "use_parent_child": False, + "children_delimiter": "\n", + }, + "children_delimiter": "", } diff --git a/test/testcases/test_http_api/test_dataset_management/test_create_dataset.py b/test/testcases/test_http_api/test_dataset_management/test_create_dataset.py index 5a01155afb..5cada305fb 100644 --- a/test/testcases/test_http_api/test_dataset_management/test_create_dataset.py +++ b/test/testcases/test_http_api/test_dataset_management/test_create_dataset.py @@ -449,6 +449,10 @@ class TestDatasetCreate: ("raptor_max_cluster_mid", {"raptor": {"max_cluster": 512}}), ("raptor_max_cluster_max", {"raptor": {"max_cluster": 1024}}), ("raptor_random_seed_min", {"raptor": {"random_seed": 0}}), + ("parent_child_true", {"parent_child": {"use_parent_child": True}}), + ("parent_child_false", {"parent_child": {"use_parent_child": False}}), + ("parent_child_delimiter", {"parent_child": {"children_delimiter": "\n\n"}}), + ("parent_child_delimiter_custom", {"parent_child": {"use_parent_child": True, "children_delimiter": "。"}}), ], ids=[ "auto_keywords_min", @@ -499,6 +503,10 @@ class TestDatasetCreate: "raptor_max_cluster_mid", "raptor_max_cluster_max", "raptor_random_seed_min", + "parent_child_true", + "parent_child_false", + "parent_child_delimiter", + "parent_child_delimiter_custom", ], ) def test_parser_config(self, HttpApiAuth, name, parser_config): @@ -570,6 +578,8 @@ class TestDatasetCreate: ("raptor_random_seed_float_not_allowed", {"raptor": {"random_seed": 3.14}}, "Input should be a valid integer"), ("raptor_random_seed_type_invalid", {"raptor": {"random_seed": "string"}}, "Input should be a valid integer"), ("parser_config_type_invalid", {"delimiter": "a" * 65536}, "Parser config exceeds size limit (max 65,535 characters)"), + ("parent_child_type_invalid", {"parent_child": {"use_parent_child": "string"}}, "Input should be a valid boolean"), + ("parent_child_delimiter_empty", {"parent_child": {"children_delimiter": ""}}, "String should have at least 1 character"), ], ids=[ "auto_keywords_min_limit", @@ -626,6 +636,8 @@ class TestDatasetCreate: "raptor_random_seed_float_not_allowed", "raptor_random_seed_type_invalid", "parser_config_type_invalid", + "parent_child_type_invalid", + "parent_child_delimiter_empty", ], ) def test_parser_config_invalid(self, HttpApiAuth, name, parser_config, expected_message):