diff --git a/deepdoc/parser/markdown_parser.py b/deepdoc/parser/markdown_parser.py index e911a22ac8..d144cb75b0 100644 --- a/deepdoc/parser/markdown_parser.py +++ b/deepdoc/parser/markdown_parser.py @@ -15,6 +15,7 @@ # limitations under the License. # +import logging import re from markdown import markdown @@ -132,6 +133,95 @@ class MarkdownElementExtractor: toks = sorted(set(toks), key=lambda x: -len(x)) return "|".join(re.escape(t) for t in toks if t) + def _get_fence_marker(self, line): + match = re.match(r"^[ \t]{0,3}(?P`{3,}|~{3,})(?:.*)$", line) + if not match: + return None + fence = match.group("fence") + return fence[0], len(fence) + + def _is_closing_fence(self, line, fence_char, fence_len): + pattern = r"^[ \t]{0,3}" + re.escape(fence_char) + r"{" + str(fence_len) + r",}\s*$" + return re.match(pattern, line) is not None + + def _line_start_offsets(self, text): + offsets = [] + offset = 0 + for line in self.lines: + offsets.append(offset) + offset += len(line) + 1 + return offsets + + def _fenced_code_ranges(self, text): + ranges = [] + line_offsets = self._line_start_offsets(text) + + i = 0 + while i < len(self.lines): + marker = self._get_fence_marker(self.lines[i]) + if not marker: + i += 1 + continue + + fence_char, fence_len = marker + start_pos = line_offsets[i] + end_line = len(self.lines) - 1 + for j in range(i + 1, len(self.lines)): + if self._is_closing_fence(self.lines[j], fence_char, fence_len): + end_line = j + break + + end_pos = min(len(text), line_offsets[end_line] + len(self.lines[end_line])) + ranges.append((start_pos, end_pos)) + i = end_line + 1 + + return ranges + + def _append_delimited_section(self, sections, text, start, end, include_meta): + part = text[start:end] + if not part or not part.strip(): + return + if include_meta: + sections.append( + { + "content": part.strip(), + "start_line": text.count("\n", 0, start), + "end_line": text.count("\n", 0, end), + } + ) + else: + sections.append(part.strip()) + + def _extract_delimited_elements(self, text, delimiters, include_meta=False): + sections = [] + pattern = re.compile(delimiters) + protected_ranges = self._fenced_code_ranges(text) + if protected_ranges: + logging.debug("markdown_parser: detected %d fenced ranges for delimiter extraction", len(protected_ranges)) + protected_idx = 0 + last_end = 0 + + for match in pattern.finditer(text): + while protected_idx < len(protected_ranges) and protected_ranges[protected_idx][1] <= match.start(): + protected_idx += 1 + + if protected_idx < len(protected_ranges): + start, end = protected_ranges[protected_idx] + if start <= match.start() < end: + logging.debug( + "markdown_parser: skipped delimiter match at pos=%d delimiter=%r inside fenced range %s", + match.start(), + match.group(), + (start, end), + ) + continue + + self._append_delimited_section(sections, text, last_end, match.start(), include_meta) + last_end = match.end() + + self._append_delimited_section(sections, text, last_end, len(text), include_meta) + return sections + def extract_elements(self, delimiter=None, include_meta=False): """Extract individual elements (headers, code blocks, lists, etc.)""" sections = [] @@ -142,34 +232,7 @@ class MarkdownElementExtractor: dels = self.get_delimiters(delimiter) if len(dels) > 0: text = "\n".join(self.lines) - if include_meta: - pattern = re.compile(dels) - last_end = 0 - for m in pattern.finditer(text): - part = text[last_end : m.start()] - if part and part.strip(): - sections.append( - { - "content": part.strip(), - "start_line": text.count("\n", 0, last_end), - "end_line": text.count("\n", 0, m.start()), - } - ) - last_end = m.end() - - part = text[last_end:] - if part and part.strip(): - sections.append( - { - "content": part.strip(), - "start_line": text.count("\n", 0, last_end), - "end_line": text.count("\n", 0, len(text)), - } - ) - else: - parts = re.split(dels, text) - sections = [p.strip() for p in parts if p and p.strip()] - return sections + return self._extract_delimited_elements(text, dels, include_meta) while i < len(self.lines): line = self.lines[i] @@ -178,7 +241,7 @@ class MarkdownElementExtractor: element = self._extract_header(i) sections.append(element if include_meta else element["content"]) i = element["end_line"] + 1 - elif line.strip().startswith("```"): + elif self._get_fence_marker(line): # code block element = self._extract_code_block(i) sections.append(element if include_meta else element["content"]) @@ -218,12 +281,13 @@ class MarkdownElementExtractor: def _extract_code_block(self, start_pos): end_pos = start_pos content_lines = [self.lines[start_pos]] + fence_char, fence_len = self._get_fence_marker(self.lines[start_pos]) # Find the end of the code block for i in range(start_pos + 1, len(self.lines)): content_lines.append(self.lines[i]) end_pos = i - if self.lines[i].strip().startswith("```"): + if self._is_closing_fence(self.lines[i], fence_char, fence_len): break return { @@ -292,13 +356,13 @@ class MarkdownElementExtractor: while i < len(self.lines): line = self.lines[i] # stop if we encounter a block element - if re.match(r"^#{1,6}\s+.*$", line) or line.strip().startswith("```") or re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line) or line.strip().startswith(">"): + if re.match(r"^#{1,6}\s+.*$", line) or self._get_fence_marker(line) or re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line) or line.strip().startswith(">"): break elif not line.strip(): # check if the next line is a block element if i + 1 < len(self.lines) and ( re.match(r"^#{1,6}\s+.*$", self.lines[i + 1]) - or self.lines[i + 1].strip().startswith("```") + or self._get_fence_marker(self.lines[i + 1]) or re.match(r"^\s*[-*+]\s+.*$", self.lines[i + 1]) or re.match(r"^\s*\d+\.\s+.*$", self.lines[i + 1]) or self.lines[i + 1].strip().startswith(">") diff --git a/test/unit_test/deepdoc/parser/test_markdown_parser.py b/test/unit_test/deepdoc/parser/test_markdown_parser.py new file mode 100644 index 0000000000..8d51b32658 --- /dev/null +++ b/test/unit_test/deepdoc/parser/test_markdown_parser.py @@ -0,0 +1,108 @@ +# +# Copyright 2026 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import importlib.util +import sys +import types +from pathlib import Path + +import pytest + +_REPO = Path(__file__).parents[4] + + +@pytest.fixture +def markdown_element_extractor(monkeypatch): + try: + import markdown # noqa: F401 + except ModuleNotFoundError: + markdown_stub = types.ModuleType("markdown") + markdown_stub.markdown = lambda text, extensions=None: text + monkeypatch.setitem(sys.modules, "markdown", markdown_stub) + + spec = importlib.util.spec_from_file_location( + "test_markdown_parser_dynamic", + _REPO / "deepdoc" / "parser" / "markdown_parser.py", + ) + assert spec and spec.loader + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod.MarkdownElementExtractor + + +@pytest.mark.p2 +class TestMarkdownElementExtractorFences: + def test_custom_delimiter_preserves_backtick_fence(self, markdown_element_extractor): + text = "# Title\n```python\nprint('a')\nprint('b')\n```\nAfter" + + sections = markdown_element_extractor(text).extract_elements(delimiter="`\n`", include_meta=True) + + assert [section["content"] for section in sections] == [ + "# Title", + "```python\nprint('a')\nprint('b')\n```", + "After", + ] + assert sections[1]["start_line"] == 1 + assert sections[1]["end_line"] == 4 + + def test_custom_delimiter_still_splits_outside_fences(self, markdown_element_extractor): + text = "Before\n~~~python\nprint('inside')\n~~~\nAfter" + + sections = markdown_element_extractor(text).extract_elements(delimiter="`\n`") + + assert sections == [ + "Before", + "~~~python\nprint('inside')\n~~~", + "After", + ] + + def test_tilde_fence_is_code_block_without_custom_delimiter(self, markdown_element_extractor): + text = "# Title\n~~~python\nprint('a')\n~~~\nAfter" + + sections = markdown_element_extractor(text).extract_elements(include_meta=True) + + assert [section["content"] for section in sections] == [ + "# Title", + "~~~python\nprint('a')\n~~~", + "After", + ] + assert sections[1]["type"] == "code_block" + assert sections[1]["start_line"] == 1 + assert sections[1]["end_line"] == 3 + + def test_longer_outer_fence_preserves_nested_shorter_fence(self, markdown_element_extractor): + text = "````markdown\n```python\nprint('inner')\n```\n````\nAfter" + + sections = markdown_element_extractor(text).extract_elements(include_meta=True) + + assert [section["content"] for section in sections] == [ + "````markdown\n```python\nprint('inner')\n```\n````", + "After", + ] + assert sections[0]["type"] == "code_block" + assert sections[0]["start_line"] == 0 + assert sections[0]["end_line"] == 4 + + def test_custom_delimiter_preserves_longer_outer_fence(self, markdown_element_extractor): + text = "Before\n````markdown\n```python\nprint('inner')\n```\n````\nAfter" + + sections = markdown_element_extractor(text).extract_elements(delimiter="`\n`") + + assert sections == [ + "Before", + "````markdown\n```python\nprint('inner')\n```\n````", + "After", + ]