fix(deepdoc): attach lone header lines to the following section when delimiter is set (#16109)

## Summary
Fixes #15487 — lone markdown headers are no longer isolated as empty
chunks when a custom `delimiter` is set.

- Merge consecutive lone headers before attaching to the following prose
body
- Skip code fences, tables, lists, and blockquotes via
`_is_attachable_body()`
- Unit tests include the `# Title / ## Intro / Body` regression from
CodeRabbit review

## Validation
- `pytest test/unit_test/deepdoc/parser/test_markdown_parser.py` (11
passed locally)

Closes #15487
This commit is contained in:
jaso0n0818
2026-06-18 16:24:09 +10:00
committed by GitHub
parent 27d723e13a
commit a70c7e8cc7
2 changed files with 87 additions and 1 deletion

View File

@@ -144,3 +144,20 @@ class TestMarkdownElementExtractorTables:
"<table>\n<tr><td>A</td></tr>\n<tr><td>B</td></tr>\n</table>",
"After",
]
@pytest.mark.p2
class TestMarkdownElementExtractorDelimiterHeaders:
    """Header grouping checks for ``extract_elements`` called with a custom delimiter."""

    def test_custom_delimiter_merges_consecutive_lone_headers_with_body(self, markdown_element_extractor):
        """Two stacked header lines followed by a paragraph come back as one section."""
        markdown = "# Title\n## Intro\nBody paragraph"
        expected = ["# Title\n## Intro\nBody paragraph"]
        extractor = markdown_element_extractor(markdown)
        # The delimiter is a newline wrapped in backticks.
        result = extractor.extract_elements(delimiter="`\n`")
        assert result == expected

    def test_custom_delimiter_merges_single_lone_header_with_body(self, markdown_element_extractor):
        """A single header line followed by a paragraph comes back as one section."""
        markdown = "## Section\nBody paragraph"
        expected = ["## Section\nBody paragraph"]
        extractor = markdown_element_extractor(markdown)
        # The delimiter is a newline wrapped in backticks.
        result = extractor.extract_elements(delimiter="`\n`")
        assert result == expected