From a70c7e8cc7f7c48ab4f181fdc3ee77cef31c4754 Mon Sep 17 00:00:00 2001
From: jaso0n0818 <hirakawatsuneteru@gmail.com>
Date: Thu, 18 Jun 2026 16:24:09 +1000
Subject: [PATCH] fix(deepdoc): attach lone header lines to the following
 section when delimiter is set (#16109)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary
Fixes #15487 — when a custom `delimiter` is set, a lone markdown header is
attached to the prose section that follows it instead of being emitted as an
isolated, header-only chunk.

- Merge consecutive lone headers before attaching to the following prose
body
- Do not attach headers to code fences, tables, HTML blocks, lists, or
blockquotes; `_is_attachable_body()` rejects those as the following section
- Unit tests include the `# Title / ## Intro / Body` regression from
CodeRabbit review

## Validation
- `pytest test/unit_test/deepdoc/parser/test_markdown_parser.py` (11
passed locally)

Closes #15487
---
 deepdoc/parser/markdown_parser.py             | 71 ++++++++++++++++++-
 .../deepdoc/parser/test_markdown_parser.py    | 17 +++++
 2 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/deepdoc/parser/markdown_parser.py b/deepdoc/parser/markdown_parser.py
index a24799e5ab..a79f552142 100644
--- a/deepdoc/parser/markdown_parser.py
+++ b/deepdoc/parser/markdown_parser.py
@@ -306,7 +306,76 @@ class MarkdownElementExtractor:
             dels = self.get_delimiters(delimiter)
         if len(dels) > 0:
             text = "\n".join(self.lines)
-            return self._extract_delimited_elements(text, dels, include_meta)
+            sections = self._extract_delimited_elements(text, dels, include_meta)
+            # Attach lone header lines to the prose section that follows them so
+            # "## Title\n" is not left as an isolated chunk when the delimiter
+            # splits at every newline.  Headers stay separate before non-prose
+            # sections or at the end.  "Lone" = a single line after stripping.
+            def _is_lone_header(section_content):
+                stripped = section_content.strip()
+                return bool(re.match(r"^#{1,6}\s+\S", stripped)) and "\n" not in stripped
+
+            def _is_attachable_body(section_content):
+                """True when the following chunk is prose body, not code/table/list/etc."""
+                stripped = section_content.strip()
+                if not stripped:
+                    return False
+                first_line = stripped.split("\n", 1)[0]
+                if self._get_fence_marker(first_line):
+                    return False
+                if first_line.lstrip().startswith("|"):
+                    return False
+                if re.match(r"^\S+\s*\|", first_line):
+                    return False
+                if first_line.lstrip().startswith("<"):
+                    return False
+                if re.match(r"^\s*[-*+]\s+", first_line) or re.match(r"^\s*\d+\.\s+", first_line):
+                    return False
+                if first_line.lstrip().startswith(">"):
+                    return False
+                return True
+
+            merged = []
+            merged_header_count = 0
+            i = 0
+            while i < len(sections):
+                content = sections[i]["content"] if include_meta else sections[i]
+                if _is_lone_header(content):
+                    header_parts = [content.strip()]
+                    j = i + 1
+                    while j < len(sections):
+                        next_content = sections[j]["content"] if include_meta else sections[j]
+                        if not _is_lone_header(next_content):
+                            break
+                        header_parts.append(next_content.strip())
+                        j += 1
+                    if j < len(sections):
+                        body_content = sections[j]["content"] if include_meta else sections[j]
+                        if _is_attachable_body(body_content):
+                            combined = "\n".join(header_parts) + "\n" + body_content
+                            if include_meta:
+                                merged.append({
+                                    **sections[i],
+                                    "content": combined,
+                                    "end_line": sections[j]["end_line"],
+                                })
+                            else:
+                                merged.append(combined)
+                            merged_header_count += len(header_parts)
+                            i = j + 1
+                            continue
+                    for k in range(i, j):
+                        merged.append(sections[k])
+                    i = j
+                    continue
+                merged.append(sections[i])
+                i += 1
+            if merged_header_count:
+                logging.debug(
+                    "markdown_parser: merged %d lone header line(s) into following sections",
+                    merged_header_count,
+                )
+            return merged
         while i < len(self.lines):
             line = self.lines[i]
 
diff --git a/test/unit_test/deepdoc/parser/test_markdown_parser.py b/test/unit_test/deepdoc/parser/test_markdown_parser.py
index fee848be50..54976bc28b 100644
--- a/test/unit_test/deepdoc/parser/test_markdown_parser.py
+++ b/test/unit_test/deepdoc/parser/test_markdown_parser.py
@@ -144,3 +144,20 @@ class TestMarkdownElementExtractorTables:
             "<table>\n<tr><td>A</td></tr>\n<tr><td>B</td></tr>\n</table>",
             "After",
         ]
+
+
+@pytest.mark.p2
+class TestMarkdownElementExtractorDelimiterHeaders:
+    def test_custom_delimiter_merges_consecutive_lone_headers_with_body(self, markdown_element_extractor):
+        text = "# Title\n## Intro\nBody paragraph"
+
+        sections = markdown_element_extractor(text).extract_elements(delimiter="`\n`")
+
+        assert sections == ["# Title\n## Intro\nBody paragraph"]
+
+    def test_custom_delimiter_merges_single_lone_header_with_body(self, markdown_element_extractor):
+        text = "## Section\nBody paragraph"
+
+        sections = markdown_element_extractor(text).extract_elements(delimiter="`\n`")
+
+        assert sections == ["## Section\nBody paragraph"]