From a70c7e8cc7f7c48ab4f181fdc3ee77cef31c4754 Mon Sep 17 00:00:00 2001 From: jaso0n0818 Date: Thu, 18 Jun 2026 16:24:09 +1000 Subject: [PATCH] fix(deepdoc): attach lone header lines to the following section when delimiter is set (#16109) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Fixes #15487 — lone markdown headers are no longer isolated as header-only chunks when a custom `delimiter` is set. - Merge consecutive lone headers before attaching to the following prose body - Skip code fences, tables, lists, and blockquotes via `_is_attachable_body()` - Unit tests include the `# Title / ## Intro / Body` regression from CodeRabbit review ## Validation - `pytest test/unit_test/deepdoc/parser/test_markdown_parser.py` (11 passed locally) Closes #15487 --- deepdoc/parser/markdown_parser.py | 71 ++++++++++++++++++- .../deepdoc/parser/test_markdown_parser.py | 17 +++++ 2 files changed, 87 insertions(+), 1 deletion(-) diff --git a/deepdoc/parser/markdown_parser.py b/deepdoc/parser/markdown_parser.py index a24799e5ab..a79f552142 100644 --- a/deepdoc/parser/markdown_parser.py +++ b/deepdoc/parser/markdown_parser.py @@ -306,7 +306,76 @@ class MarkdownElementExtractor: dels = self.get_delimiters(delimiter) if len(dels) > 0: text = "\n".join(self.lines) - return self._extract_delimited_elements(text, dels, include_meta) + sections = self._extract_delimited_elements(text, dels, include_meta) + # Attach lone header lines to the section that follows them so that + # "## Title\n" never becomes an isolated chunk when the delimiter + # splits at every newline. A header is "lone" when it occupies a + # single line (no embedded newline after stripping). 
+            def _is_lone_header(section_content):  # nested helper: is this section exactly one ATX header line?
+                stripped = section_content.strip()
+                return bool(re.match(r"^#{1,6}\s+\S", stripped)) and "\n" not in stripped  # 1-6 '#', whitespace, text; no embedded newline
+
+            def _is_attachable_body(section_content):
+                """Return True when a chunk may take a preceding header: non-empty and not starting like a fence, table, '<' markup, list item or blockquote."""
+                stripped = section_content.strip()
+                if not stripped:  # whitespace-only chunk: nothing to attach a header to
+                    return False
+                first_line = stripped.split("\n", 1)[0]  # only the first line decides the chunk's kind
+                if self._get_fence_marker(first_line):  # truthy result is taken as a code-fence opener (helper defined outside this hunk)
+                    return False
+                if first_line.lstrip().startswith("|"):  # table row with a leading pipe
+                    return False
+                if re.match(r"^\S+\s*\|", first_line):  # one non-space token then a pipe; presumably a table row without a leading pipe
+                    return False
+                if first_line.lstrip().startswith("<"):  # opens with '<'; presumably raw HTML
+                    return False
+                if re.match(r"^\s*[-*+]\s+", first_line) or re.match(r"^\s*\d+\.\s+", first_line):  # bullet or "N." list item
+                    return False
+                if first_line.lstrip().startswith(">"):  # blockquote
+                    return False
+                return True
+
+            merged = []
+            merged_header_count = 0  # headers folded into a body; only read by the debug log below
+            i = 0
+            while i < len(sections):
+                content = sections[i]["content"] if include_meta else sections[i]  # dict with "content" when include_meta, otherwise the section itself
+                if _is_lone_header(content):
+                    header_parts = [content.strip()]
+                    j = i + 1
+                    while j < len(sections):  # extend over the run of consecutive lone headers
+                        next_content = sections[j]["content"] if include_meta else sections[j]
+                        if not _is_lone_header(next_content):
+                            break
+                        header_parts.append(next_content.strip())
+                        j += 1
+                    if j < len(sections):  # j indexes the first non-header section, if one exists
+                        body_content = sections[j]["content"] if include_meta else sections[j]
+                        if _is_attachable_body(body_content):
+                            combined = "\n".join(header_parts) + "\n" + body_content  # headers are stripped; the body keeps its original whitespace
+                            if include_meta:
+                                merged.append({  # keeps the first header's metadata except content and end_line
+                                    **sections[i],
+                                    "content": combined,
+                                    "end_line": sections[j]["end_line"],  # merged section ends where the body ends
+                                })
+                            else:
+                                merged.append(combined)
+                            merged_header_count += len(header_parts)
+                            i = j + 1  # resume after the consumed body
+                            continue
+                    for k in range(i, j):  # no attachable body, or headers at the very end: keep them as separate sections
+                        merged.append(sections[k])
+                    i = j
+                    continue
+                merged.append(sections[i])  # not a lone header: pass through unchanged
+                i += 1
+            if merged_header_count:
+                logging.debug(
+                    "markdown_parser: merged %d lone header line(s) into following sections",
+                    merged_header_count,
+                )
+            return merged
 
         while i < len(self.lines):
             line = self.lines[i]
diff --git
a/test/unit_test/deepdoc/parser/test_markdown_parser.py b/test/unit_test/deepdoc/parser/test_markdown_parser.py
index fee848be50..54976bc28b 100644
--- a/test/unit_test/deepdoc/parser/test_markdown_parser.py
+++ b/test/unit_test/deepdoc/parser/test_markdown_parser.py
@@ -144,3 +144,20 @@ class TestMarkdownElementExtractorTables:
 "\n\n\n
A
B
", "After", ]
+
+
+@pytest.mark.p2
+class TestMarkdownElementExtractorDelimiterHeaders:  # lone-header merging when a custom delimiter is passed
+    def test_custom_delimiter_merges_consecutive_lone_headers_with_body(self, markdown_element_extractor):
+        text = "# Title\n## Intro\nBody paragraph"  # two stacked headers followed by prose
+
+        sections = markdown_element_extractor(text).extract_elements(delimiter="`\n`")
+
+        assert sections == ["# Title\n## Intro\nBody paragraph"]  # both headers and the body form a single section
+
+    def test_custom_delimiter_merges_single_lone_header_with_body(self, markdown_element_extractor):
+        text = "## Section\nBody paragraph"  # one header directly followed by prose
+
+        sections = markdown_element_extractor(text).extract_elements(delimiter="`\n`")
+
+        assert sections == ["## Section\nBody paragraph"]  # header stays attached to its body