mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
fix: deduplicate markdown table chunks (#16143)
This commit is contained in:
@@ -781,6 +781,7 @@ class Markdown(MarkdownParser):
|
||||
return images, cache
|
||||
|
||||
def __call__(self, filename, binary=None, separate_tables=True, delimiter=None, return_section_images=False):
|
||||
"""Parse markdown into text sections and optional standalone table chunks."""
|
||||
if binary:
|
||||
encoding = find_codec(binary)
|
||||
txt = binary.decode(encoding, errors="ignore")
|
||||
@@ -789,10 +790,9 @@ class Markdown(MarkdownParser):
|
||||
txt = f.read()
|
||||
|
||||
remainder, tables = self.extract_tables_and_remainder(f"{txt}\n", separate_tables=separate_tables)
|
||||
# To eliminate duplicate tables in chunking result, uncomment code below and set separate_tables to True in line 410.
|
||||
# extractor = MarkdownElementExtractor(remainder)
|
||||
extractor = MarkdownElementExtractor(txt)
|
||||
image_refs = self.extract_image_urls_with_lines(txt)
|
||||
parsing_text = remainder if separate_tables else txt
|
||||
extractor = MarkdownElementExtractor(parsing_text)
|
||||
image_refs = self.extract_image_urls_with_lines(parsing_text)
|
||||
element_sections = extractor.extract_elements(delimiter, include_meta=True)
|
||||
|
||||
sections = []
|
||||
@@ -1017,7 +1017,7 @@ def chunk(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang=
|
||||
sections, tables, section_images = markdown_parser(
|
||||
filename,
|
||||
binary,
|
||||
separate_tables=False,
|
||||
separate_tables=True,
|
||||
delimiter=parser_config.get("delimiter", "\n!?;。;!?"),
|
||||
return_section_images=True,
|
||||
)
|
||||
|
||||
@@ -955,7 +955,7 @@ class Parser(ProcessBase):
|
||||
sections, tables, section_images = markdown_parser(
|
||||
name,
|
||||
blob,
|
||||
separate_tables=False,
|
||||
separate_tables=True,
|
||||
delimiter=conf.get("delimiter"),
|
||||
return_section_images=True,
|
||||
)
|
||||
|
||||
@@ -43,6 +43,25 @@ def markdown_element_extractor(monkeypatch):
|
||||
return mod.MarkdownElementExtractor
|
||||
|
||||
|
||||
@pytest.fixture
def markdown_parser_module(monkeypatch):
    """Execute ``deepdoc/parser/markdown_parser.py`` (under ``_REPO``) and return the module.

    When the ``markdown`` package is not importable, a stand-in module is
    registered in ``sys.modules`` through ``monkeypatch`` before the parser
    source is executed; its ``markdown`` callable returns the given text
    unchanged.
    """
    try:
        import markdown  # noqa: F401
    except ModuleNotFoundError:
        # Identity stub so the parser source can still run its import.
        stub = types.ModuleType("markdown")
        stub.markdown = lambda text, extensions=None: text
        monkeypatch.setitem(sys.modules, "markdown", stub)

    source_path = _REPO / "deepdoc" / "parser" / "markdown_parser.py"
    spec = importlib.util.spec_from_file_location("test_markdown_parser_dynamic_module", source_path)
    assert spec and spec.loader

    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
class TestMarkdownElementExtractorFences:
|
||||
def test_custom_delimiter_preserves_backtick_fence(self, markdown_element_extractor):
|
||||
@@ -147,6 +166,22 @@ class TestMarkdownElementExtractorTables:
|
||||
|
||||
|
||||
@pytest.mark.p2
class TestMarkdownTableDedup:
    """Regression checks for tables that are split out of markdown text."""

    def test_separate_tables_removes_pipe_table_from_text_sections(self, markdown_parser_module):
        """Ensure separated pipe tables do not leak back into text chunks."""
        table_header = "| Name | Value |"
        text = "Before\n\n| Name | Value |\n| --- | --- |\n| A | 1 |\n| B | 2 |\n\nAfter"

        # Split the tables out first, then chunk only what is left over.
        md_parser = markdown_parser_module.RAGFlowMarkdownParser()
        remainder, tables = md_parser.extract_tables_and_remainder(f"{text}\n", separate_tables=True)
        extractor = markdown_parser_module.MarkdownElementExtractor(remainder)
        sections = extractor.extract_elements(include_meta=False)

        # The table is extracted exactly once and is gone from the remainder.
        assert len(tables) == 1
        assert table_header not in remainder

        # The surrounding prose stays in a single section without the table.
        assert len(sections) == 1
        only_section = sections[0]
        assert "Before" in only_section
        assert "After" in only_section
        assert table_header not in only_section
|
||||
|
||||
class TestMarkdownElementExtractorDelimiterHeaders:
|
||||
def test_custom_delimiter_merges_consecutive_lone_headers_with_body(self, markdown_element_extractor):
|
||||
text = "# Title\n## Intro\nBody paragraph"
|
||||
|
||||
Reference in New Issue
Block a user