From 1b2da645c361912dc98e4ef2d1452dfb45ceddbc Mon Sep 17 00:00:00 2001
From: helloxjade <52953016+helloxjade@users.noreply.github.com>
Date: Wed, 24 Jun 2026 13:22:57 +0800
Subject: [PATCH] fix: deduplicate markdown table chunks (#16143)

When separate_tables is true, run the markdown element extractor and the
image-reference scan over the table-free remainder instead of the full
text, so a table that was split out as its own chunk no longer also shows
up inside the text sections. Both markdown call sites (rag/app/naive.py
and rag/flow/parser/parser.py) now request separate_tables=True.

Add a unit test that checks a pipe table is removed from the remainder
and from the extracted text sections.

---
Review notes (not part of the commit message):
- The new test class was inserted directly under an existing
  @pytest.mark.p2 line, which moved that marker off
  TestMarkdownElementExtractorDelimiterHeaders. The marker is restored on
  that class and the classes are separated by two blank lines.
- The test-file hunks gained three added lines in total, so their @@
  counts and the diffstat were updated; the post-image blob id on that
  file's index line has not been regenerated.

 rag/app/naive.py                              | 10 +++---
 rag/flow/parser/parser.py                     |  2 +-
 .../deepdoc/parser/test_markdown_parser.py    | 38 +++++++++++++++++++
 3 files changed, 44 insertions(+), 6 deletions(-)

diff --git a/rag/app/naive.py b/rag/app/naive.py
index 211ee29bcd..a1903e58ff 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -781,6 +781,7 @@ class Markdown(MarkdownParser):
         return images, cache
 
     def __call__(self, filename, binary=None, separate_tables=True, delimiter=None, return_section_images=False):
+        """Parse markdown into text sections and optional standalone table chunks."""
         if binary:
             encoding = find_codec(binary)
             txt = binary.decode(encoding, errors="ignore")
@@ -789,10 +790,9 @@ class Markdown(MarkdownParser):
                 txt = f.read()
 
         remainder, tables = self.extract_tables_and_remainder(f"{txt}\n", separate_tables=separate_tables)
-        # To eliminate duplicate tables in chunking result, uncomment code below and set separate_tables to True in line 410.
-        # extractor = MarkdownElementExtractor(remainder)
-        extractor = MarkdownElementExtractor(txt)
-        image_refs = self.extract_image_urls_with_lines(txt)
+        parsing_text = remainder if separate_tables else txt
+        extractor = MarkdownElementExtractor(parsing_text)
+        image_refs = self.extract_image_urls_with_lines(parsing_text)
         element_sections = extractor.extract_elements(delimiter, include_meta=True)
 
         sections = []
@@ -1017,7 +1017,7 @@ def chunk(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang=
         sections, tables, section_images = markdown_parser(
             filename,
             binary,
-            separate_tables=False,
+            separate_tables=True,
             delimiter=parser_config.get("delimiter", "\n!?;。;!?"),
             return_section_images=True,
         )
diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py
index 79bf1a63a9..970cfcb498 100644
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@@ -955,7 +955,7 @@ class Parser(ProcessBase):
             sections, tables, section_images = markdown_parser(
                 name,
                 blob,
-                separate_tables=False,
+                separate_tables=True,
                 delimiter=conf.get("delimiter"),
                 return_section_images=True,
             )
diff --git a/test/unit_test/deepdoc/parser/test_markdown_parser.py b/test/unit_test/deepdoc/parser/test_markdown_parser.py
index 54976bc28b..9d8d6c53ed 100644
--- a/test/unit_test/deepdoc/parser/test_markdown_parser.py
+++ b/test/unit_test/deepdoc/parser/test_markdown_parser.py
@@ -43,6 +43,26 @@ def markdown_element_extractor(monkeypatch):
     return mod.MarkdownElementExtractor
 
 
+@pytest.fixture
+def markdown_parser_module(monkeypatch):
+    """Load deepdoc/parser/markdown_parser.py by path, stubbing `markdown` when it is not installed."""
+    try:
+        import markdown  # noqa: F401
+    except ModuleNotFoundError:
+        markdown_stub = types.ModuleType("markdown")
+        markdown_stub.markdown = lambda text, extensions=None: text
+        monkeypatch.setitem(sys.modules, "markdown", markdown_stub)
+
+    spec = importlib.util.spec_from_file_location(
+        "test_markdown_parser_dynamic_module",
+        _REPO / "deepdoc" / "parser" / "markdown_parser.py",
+    )
+    assert spec and spec.loader
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    return mod
+
+
 @pytest.mark.p2
 class TestMarkdownElementExtractorFences:
     def test_custom_delimiter_preserves_backtick_fence(self, markdown_element_extractor):
@@ -147,6 +167,24 @@ class TestMarkdownElementExtractorTables:
 
 
 @pytest.mark.p2
+class TestMarkdownTableDedup:
+    def test_separate_tables_removes_pipe_table_from_text_sections(self, markdown_parser_module):
+        """Ensure separated pipe tables do not leak back into text chunks."""
+        text = "Before\n\n| Name | Value |\n| --- | --- |\n| A | 1 |\n| B | 2 |\n\nAfter"
+
+        parser = markdown_parser_module.RAGFlowMarkdownParser()
+        remainder, tables = parser.extract_tables_and_remainder(f"{text}\n", separate_tables=True)
+        sections = markdown_parser_module.MarkdownElementExtractor(remainder).extract_elements(include_meta=False)
+
+        assert len(tables) == 1
+        assert "| Name | Value |" not in remainder
+        assert len(sections) == 1
+        assert "Before" in sections[0]
+        assert "After" in sections[0]
+        assert "| Name | Value |" not in sections[0]
+
+
+@pytest.mark.p2
 class TestMarkdownElementExtractorDelimiterHeaders:
     def test_custom_delimiter_merges_consecutive_lone_headers_with_body(self, markdown_element_extractor):
         text = "# Title\n## Intro\nBody paragraph"