Files
ragflow/test/unit_test/deepdoc/parser/test_markdown_parser.py
bitloi 9f3e289b78 Fix: preserve markdown tables during delimiter extraction (#15632)
### What problem does this PR solve?

Markdown extraction can split tables row by row when delimiter-based
extraction uses a newline delimiter. That loses table structure during
chunking even though delimiters should still split normally outside
tables.

This PR keeps the follow-up to #15482 intentionally narrow:

- preserve Markdown pipe tables during delimiter-based extraction
- preserve borderless pipe tables during delimiter-based extraction
- preserve multiline HTML tables during delimiter-based extraction
- keep delimiter splitting unchanged outside protected table ranges

Refs #15482

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

### Testing

- `ruff check deepdoc/parser/markdown_parser.py test/unit_test/deepdoc/parser/test_markdown_parser.py`
- `python3 run_tests.py -t test/unit_test/deepdoc/parser/test_markdown_parser.py`
- `git diff --check`
2026-06-05 10:35:33 +08:00

147 lines
5.2 KiB
Python

#
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import importlib.util
import sys
import types
from pathlib import Path
import pytest
# Repository root. This file sits four directories deep
# (test/unit_test/deepdoc/parser/), so parents[4] climbs back to the top;
# the parser source under test is located relative to this path.
_REPO = Path(__file__).parents[4]
@pytest.fixture
def markdown_element_extractor(monkeypatch):
    """Return the ``MarkdownElementExtractor`` class loaded directly from its source file.

    The parser module is executed from ``deepdoc/parser/markdown_parser.py``
    under a private module name instead of being imported through its package.
    If the third-party ``markdown`` package is not installed, a pass-through
    stub is registered in ``sys.modules`` for the duration of the test so the
    parser module can still be executed.
    """
    try:
        import markdown  # noqa: F401
    except ModuleNotFoundError:
        # Minimal stand-in: ``markdown.markdown`` simply echoes its input text.
        fallback = types.ModuleType("markdown")
        fallback.markdown = lambda text, extensions=None: text
        # monkeypatch restores sys.modules once the test finishes.
        monkeypatch.setitem(sys.modules, "markdown", fallback)

    parser_path = _REPO / "deepdoc" / "parser" / "markdown_parser.py"
    spec = importlib.util.spec_from_file_location("test_markdown_parser_dynamic", parser_path)
    assert spec and spec.loader

    parser_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(parser_module)
    return parser_module.MarkdownElementExtractor
@pytest.mark.p2
class TestMarkdownElementExtractorFences:
    """Checks that ``extract_elements`` emits each fenced code block as one section."""

    def test_custom_delimiter_preserves_backtick_fence(self, markdown_element_extractor):
        """A backtick fence stays in one piece and reports its line span with a custom delimiter."""
        source = "# Title\n```python\nprint('a')\nprint('b')\n```\nAfter"
        extractor = markdown_element_extractor(source)
        result = extractor.extract_elements(delimiter="`\n`", include_meta=True)

        contents = [item["content"] for item in result]
        assert contents == [
            "# Title",
            "```python\nprint('a')\nprint('b')\n```",
            "After",
        ]

        # The fence occupies input lines 1 through 4 (line 0 is the heading).
        fence = result[1]
        assert fence["start_line"] == 1
        assert fence["end_line"] == 4

    def test_custom_delimiter_still_splits_outside_fences(self, markdown_element_extractor):
        """Text around a tilde fence is still split by the custom delimiter."""
        source = "Before\n~~~python\nprint('inside')\n~~~\nAfter"
        expected = [
            "Before",
            "~~~python\nprint('inside')\n~~~",
            "After",
        ]
        result = markdown_element_extractor(source).extract_elements(delimiter="`\n`")
        assert result == expected

    def test_tilde_fence_is_code_block_without_custom_delimiter(self, markdown_element_extractor):
        """With no delimiter given, a tilde fence is reported as a ``code_block`` section."""
        source = "# Title\n~~~python\nprint('a')\n~~~\nAfter"
        extractor = markdown_element_extractor(source)
        result = extractor.extract_elements(include_meta=True)

        contents = [item["content"] for item in result]
        assert contents == [
            "# Title",
            "~~~python\nprint('a')\n~~~",
            "After",
        ]

        fence = result[1]
        assert fence["type"] == "code_block"
        assert fence["start_line"] == 1
        assert fence["end_line"] == 3

    def test_longer_outer_fence_preserves_nested_shorter_fence(self, markdown_element_extractor):
        """A four-backtick fence keeps the three-backtick fence nested inside it."""
        source = "````markdown\n```python\nprint('inner')\n```\n````\nAfter"
        extractor = markdown_element_extractor(source)
        result = extractor.extract_elements(include_meta=True)

        contents = [item["content"] for item in result]
        assert contents == [
            "````markdown\n```python\nprint('inner')\n```\n````",
            "After",
        ]

        outer = result[0]
        assert outer["type"] == "code_block"
        assert outer["start_line"] == 0
        assert outer["end_line"] == 4

    def test_custom_delimiter_preserves_longer_outer_fence(self, markdown_element_extractor):
        """The nested-fence case also survives extraction with a custom delimiter."""
        source = "Before\n````markdown\n```python\nprint('inner')\n```\n````\nAfter"
        expected = [
            "Before",
            "````markdown\n```python\nprint('inner')\n```\n````",
            "After",
        ]
        result = markdown_element_extractor(source).extract_elements(delimiter="`\n`")
        assert result == expected
@pytest.mark.p2
class TestMarkdownElementExtractorTables:
    """Checks that delimiter-based extraction keeps tables together instead of splitting rows."""

    def test_custom_delimiter_preserves_pipe_table(self, markdown_element_extractor):
        """A bordered pipe table is a single section and reports its line span."""
        source = "# Title\n\n| Name | Value |\n| --- | --- |\n| A | 1 |\n| B | 2 |\n\nAfter"
        extractor = markdown_element_extractor(source)
        result = extractor.extract_elements(delimiter="`\n`", include_meta=True)

        contents = [item["content"] for item in result]
        assert contents == [
            "# Title",
            "| Name | Value |\n| --- | --- |\n| A | 1 |\n| B | 2 |",
            "After",
        ]

        # Input lines 2 through 5 hold the table (heading and blank line come first).
        table = result[1]
        assert table["start_line"] == 2
        assert table["end_line"] == 5

    def test_custom_delimiter_preserves_borderless_pipe_table(self, markdown_element_extractor):
        """A pipe table without leading/trailing pipes is kept whole as well."""
        source = "Before\nName | Value\n--- | ---\nA | 1\nB | 2\nAfter"
        expected = [
            "Before",
            "Name | Value\n--- | ---\nA | 1\nB | 2",
            "After",
        ]
        result = markdown_element_extractor(source).extract_elements(delimiter="`\n`")
        assert result == expected

    def test_custom_delimiter_preserves_html_table(self, markdown_element_extractor):
        """A multi-line HTML ``<table>`` element is emitted as one section."""
        source = "Before\n<table>\n<tr><td>A</td></tr>\n<tr><td>B</td></tr>\n</table>\nAfter"
        expected = [
            "Before",
            "<table>\n<tr><td>A</td></tr>\n<tr><td>B</td></tr>\n</table>",
            "After",
        ]
        result = markdown_element_extractor(source).extract_elements(delimiter="`\n`")
        assert result == expected