mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
fix(deepdoc): attach lone header lines to the following section when delimiter is set (#16109)
## Summary

Fixes #15487 — lone markdown headers are no longer isolated as empty chunks when a custom `delimiter` is set.

- Merge consecutive lone headers before attaching them to the following prose body
- Skip code fences, tables, lists, and blockquotes via `_is_attachable_body()`
- Unit tests include the `# Title / ## Intro / Body` regression from the CodeRabbit review

## Validation

- `pytest test/unit_test/deepdoc/parser/test_markdown_parser.py` (11 passed locally)

Closes #15487
This commit is contained in:
@@ -144,3 +144,20 @@ class TestMarkdownElementExtractorTables:
|
||||
"<table>\n<tr><td>A</td></tr>\n<tr><td>B</td></tr>\n</table>",
|
||||
"After",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.p2
class TestMarkdownElementExtractorDelimiterHeaders:
    """Header handling of ``extract_elements`` when a custom delimiter is passed.

    Both cases call ``extract_elements`` with a backtick-wrapped newline as
    the ``delimiter`` argument and expect header lines to come back in the
    same section as the body line that follows them.
    """

    def test_custom_delimiter_merges_consecutive_lone_headers_with_body(self, markdown_element_extractor):
        """Two stacked header lines plus a body line yield one section."""
        markdown = "# Title\n## Intro\nBody paragraph"
        expected = ["# Title\n## Intro\nBody paragraph"]

        # The fixture is called with the markdown text and returns the
        # object exposing ``extract_elements``.
        extractor = markdown_element_extractor(markdown)
        chunks = extractor.extract_elements(delimiter="`\n`")

        assert chunks == expected

    def test_custom_delimiter_merges_single_lone_header_with_body(self, markdown_element_extractor):
        """A single header line plus a body line yield one section."""
        markdown = "## Section\nBody paragraph"
        expected = ["## Section\nBody paragraph"]

        extractor = markdown_element_extractor(markdown)
        chunks = extractor.extract_elements(delimiter="`\n`")

        assert chunks == expected
|
||||
|
||||
Reference in New Issue
Block a user