fix: deduplicate markdown table chunks (#16143)

This commit is contained in:
helloxjade
2026-06-24 13:22:57 +08:00
committed by GitHub
parent 39b194453d
commit 1b2da645c3
3 changed files with 41 additions and 6 deletions

View File

@@ -43,6 +43,25 @@ def markdown_element_extractor(monkeypatch):
return mod.MarkdownElementExtractor
@pytest.fixture
def markdown_parser_module(monkeypatch):
    """Load ``deepdoc/parser/markdown_parser.py`` as a standalone module.

    The parser file is executed straight from its path under ``_REPO`` instead
    of being imported by name. If the ``markdown`` package cannot be imported,
    a stub module whose ``markdown()`` returns its input text unchanged is
    placed in ``sys.modules`` through ``monkeypatch`` before the parser file
    is executed (presumably so the parser's own ``import markdown`` succeeds).

    Returns:
        The freshly executed module object.
    """
    try:
        import markdown  # noqa: F401
    except ModuleNotFoundError:
        def _passthrough(text, extensions=None):
            # Stand-in for markdown.markdown(): hand the text back untouched.
            return text

        fallback = types.ModuleType("markdown")
        fallback.markdown = _passthrough
        monkeypatch.setitem(sys.modules, "markdown", fallback)

    parser_path = _REPO / "deepdoc" / "parser" / "markdown_parser.py"
    module_spec = importlib.util.spec_from_file_location(
        "test_markdown_parser_dynamic_module",
        parser_path,
    )
    # Fail loudly here if no spec/loader could be created for the path.
    assert module_spec and module_spec.loader

    loaded = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(loaded)
    return loaded
@pytest.mark.p2
class TestMarkdownElementExtractorFences:
def test_custom_delimiter_preserves_backtick_fence(self, markdown_element_extractor):
@@ -147,6 +166,22 @@ class TestMarkdownElementExtractorTables:
@pytest.mark.p2
class TestMarkdownTableDedup:
    """Checks that a table pulled out of markdown is not repeated in the text."""

    def test_separate_tables_removes_pipe_table_from_text_sections(self, markdown_parser_module):
        """A separated pipe table must not reappear in the remaining text chunks."""
        header_row = "| Name | Value |"
        text = "Before\n\n| Name | Value |\n| --- | --- |\n| A | 1 |\n| B | 2 |\n\nAfter"

        md_parser = markdown_parser_module.RAGFlowMarkdownParser()
        leftover, extracted_tables = md_parser.extract_tables_and_remainder(
            f"{text}\n",
            separate_tables=True,
        )
        extractor = markdown_parser_module.MarkdownElementExtractor(leftover)
        chunks = extractor.extract_elements(include_meta=False)

        # Exactly one table is extracted and its header is gone from the remainder.
        assert len(extracted_tables) == 1
        assert header_row not in leftover

        # The surrounding prose ends up in a single chunk without the table.
        assert len(chunks) == 1
        only_chunk = chunks[0]
        assert "Before" in only_chunk
        assert "After" in only_chunk
        assert header_row not in only_chunk
class TestMarkdownElementExtractorDelimiterHeaders:
def test_custom_delimiter_merges_consecutive_lone_headers_with_body(self, markdown_element_extractor):
text = "# Title\n## Intro\nBody paragraph"