Fix: markdown fenced code block extraction (#15630)

### What problem does this PR solve?

Markdown extraction currently applies custom delimiters before
respecting fenced code blocks. When a delimiter such as a newline is
configured, fenced code can be split into separate chunks, and longer
outer fences can be closed incorrectly by shorter nested fences.

This PR keeps the fix intentionally narrow for the Markdown chunking
discussion in #15482:

- preserve fenced code blocks when delimiter-based extraction is used
- support both backtick and tilde fences
- respect fence length so longer outer fences can contain shorter inner
fences
- keep delimiter splitting unchanged outside fenced blocks

Refs #15482

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

### Testing

- `ruff check deepdoc/parser/markdown_parser.py test/unit_test/deepdoc/parser/test_markdown_parser.py`
- `python3 run_tests.py -t test/unit_test/deepdoc/parser/test_markdown_parser.py`
This commit is contained in:
bitloi
2026-06-04 02:33:46 -03:00
committed by GitHub
parent c70f19e138
commit 01a5598aa5
2 changed files with 204 additions and 32 deletions

View File

@@ -0,0 +1,108 @@
#
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import importlib.util
import sys
import types
from pathlib import Path
import pytest
# Repository root, found by walking up from this file: parents[0..3] are the
# parser/, deepdoc/, unit_test/ and test/ directories, so parents[4] is the
# checkout root that contains deepdoc/parser/markdown_parser.py.
_REPO = Path(__file__).parents[4]
@pytest.fixture
def markdown_element_extractor(monkeypatch):
    """Provide the ``MarkdownElementExtractor`` class loaded from its source file.

    ``deepdoc/parser/markdown_parser.py`` is executed directly by file path
    under the module name ``test_markdown_parser_dynamic`` instead of being
    imported through its package.

    If the ``markdown`` package is not installed, a stand-in module whose
    ``markdown()`` function returns its input text unchanged is registered in
    ``sys.modules`` through ``monkeypatch`` before the parser file is executed.

    Returns:
        The ``MarkdownElementExtractor`` class object (not an instance).
    """
    try:
        import markdown  # noqa: F401
    except ModuleNotFoundError:
        # Minimal replacement so that `import markdown` succeeds when the
        # parser source is executed below.
        fallback_module = types.ModuleType("markdown")
        fallback_module.markdown = lambda text, extensions=None: text
        monkeypatch.setitem(sys.modules, "markdown", fallback_module)

    parser_source = _REPO / "deepdoc" / "parser" / "markdown_parser.py"
    module_spec = importlib.util.spec_from_file_location(
        "test_markdown_parser_dynamic",
        parser_source,
    )
    assert module_spec and module_spec.loader

    parser_module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(parser_module)
    return parser_module.MarkdownElementExtractor
@pytest.mark.p2
class TestMarkdownElementExtractorFences:
    """Checks how ``extract_elements`` treats fenced code blocks.

    Each test builds an extractor from a small Markdown string and compares
    the returned sections (and, where requested, their metadata) with the
    expected values. Line numbers asserted here count from 0.
    """

    def test_custom_delimiter_preserves_backtick_fence(self, markdown_element_extractor):
        """A backtick fence is returned as one section when a custom delimiter is set."""
        source = "# Title\n```python\nprint('a')\nprint('b')\n```\nAfter"
        extractor = markdown_element_extractor(source)
        elements = extractor.extract_elements(delimiter="`\n`", include_meta=True)

        expected_contents = [
            "# Title",
            "```python\nprint('a')\nprint('b')\n```",
            "After",
        ]
        assert [element["content"] for element in elements] == expected_contents

        # The fence opens on line 1 and closes on line 4 of the input.
        fence = elements[1]
        assert fence["start_line"] == 1
        assert fence["end_line"] == 4

    def test_custom_delimiter_still_splits_outside_fences(self, markdown_element_extractor):
        """Text around a tilde fence is still split by the custom delimiter."""
        source = "Before\n~~~python\nprint('inside')\n~~~\nAfter"
        extractor = markdown_element_extractor(source)
        elements = extractor.extract_elements(delimiter="`\n`")

        expected = [
            "Before",
            "~~~python\nprint('inside')\n~~~",
            "After",
        ]
        assert elements == expected

    def test_tilde_fence_is_code_block_without_custom_delimiter(self, markdown_element_extractor):
        """Without a delimiter, a tilde fence is reported as a ``code_block`` section."""
        source = "# Title\n~~~python\nprint('a')\n~~~\nAfter"
        extractor = markdown_element_extractor(source)
        elements = extractor.extract_elements(include_meta=True)

        expected_contents = [
            "# Title",
            "~~~python\nprint('a')\n~~~",
            "After",
        ]
        assert [element["content"] for element in elements] == expected_contents

        fence = elements[1]
        assert fence["type"] == "code_block"
        assert fence["start_line"] == 1
        assert fence["end_line"] == 3

    def test_longer_outer_fence_preserves_nested_shorter_fence(self, markdown_element_extractor):
        """A four-backtick fence is not closed by the three-backtick fence inside it."""
        source = "````markdown\n```python\nprint('inner')\n```\n````\nAfter"
        extractor = markdown_element_extractor(source)
        elements = extractor.extract_elements(include_meta=True)

        expected_contents = [
            "````markdown\n```python\nprint('inner')\n```\n````",
            "After",
        ]
        assert [element["content"] for element in elements] == expected_contents

        # The outer fence spans input lines 0 through 4.
        outer_fence = elements[0]
        assert outer_fence["type"] == "code_block"
        assert outer_fence["start_line"] == 0
        assert outer_fence["end_line"] == 4

    def test_custom_delimiter_preserves_longer_outer_fence(self, markdown_element_extractor):
        """With a custom delimiter, the longer outer fence still stays in one section."""
        source = "Before\n````markdown\n```python\nprint('inner')\n```\n````\nAfter"
        extractor = markdown_element_extractor(source)
        elements = extractor.extract_elements(delimiter="`\n`")

        expected = [
            "Before",
            "````markdown\n```python\nprint('inner')\n```\n````",
            "After",
        ]
        assert elements == expected