From d398d617caf9d639d5a0a57e67970bc7cb9bb698 Mon Sep 17 00:00:00 2001 From: monsterDavid Date: Mon, 1 Jun 2026 05:15:04 -0700 Subject: [PATCH] fix(mineru): skip page chrome blocks to prevent duplicate chunks (#15387) ## Summary - Skip MinerU `header`, `footer`, and `page_number` blocks when converting `content_list.json` into sections. - Ignore unsupported block types explicitly so future MinerU output types cannot re-emit the previous text block. Fixes duplicate text in General/naive chunks when parsing PDFs via MinerU (reported with repeated page headers and body text in slices). Closes #15335 ## Test plan - [x] `pytest test/unit_test/deepdoc/parser/test_mineru_parser.py -v` (4/4 passed) --- deepdoc/parser/mineru_parser.py | 17 +++++-- .../mineru/bmw_page_chrome_content_list.json | 11 +++++ .../deepdoc/parser/test_mineru_parser.py | 47 +++++++++++++++++++ 3 files changed, 72 insertions(+), 3 deletions(-) create mode 100644 test/fixtures/mineru/bmw_page_chrome_content_list.json diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index b0c0eafb6e..f96aed32a8 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -52,6 +52,9 @@ class MinerUContentType(StrEnum): EQUATION = "equation" CODE = "code" LIST = "list" + HEADER = "header" + FOOTER = "footer" + PAGE_NUMBER = "page_number" DISCARDED = "discarded" @@ -654,7 +657,7 @@ class MinerUParser(RAGFlowPdfParser): def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None): sections = [] for output in outputs: - match output["type"]: + match output.get("type"): case MinerUContentType.TEXT: section = output.get("text", "") case MinerUContentType.TABLE: @@ -677,8 +680,16 @@ class MinerUParser(RAGFlowPdfParser): section = output.get("code_body", "") + "\n".join(output.get("code_caption", [])) case MinerUContentType.LIST: section = "\n".join(output.get("list_items", [])) - case MinerUContentType.DISCARDED: - continue # Skip discarded blocks entirely + case ( + MinerUContentType.HEADER + | MinerUContentType.FOOTER + | MinerUContentType.PAGE_NUMBER + | MinerUContentType.DISCARDED + ): + continue + case _: + self.logger.debug("[MinerU] Skip unsupported section type=%s", output.get("type")) + continue section = self._sanitize_section_text(section) if not section: diff --git a/test/fixtures/mineru/bmw_page_chrome_content_list.json b/test/fixtures/mineru/bmw_page_chrome_content_list.json new file mode 100644 index 0000000000..fb1c4fe017 --- /dev/null +++ b/test/fixtures/mineru/bmw_page_chrome_content_list.json @@ -0,0 +1,11 @@ +[ + {"type": "text", "text": "打开和关闭", "bbox": [100, 100, 200, 120], "page_idx": 0}, + {"type": "text", "text": "车辆装备", "bbox": [100, 130, 200, 150], "page_idx": 0}, + {"type": "header", "text": "打开和关闭", "bbox": [556, 20, 663, 40], "page_idx": 0}, + {"type": "text", "text": "车辆钥匙", "bbox": [100, 160, 200, 180], "page_idx": 0}, + {"type": "page_number", "text": "77", "bbox": [933, 956, 968, 976], "page_idx": 0}, + {"type": "footer", "text": "Online Edition for Part no. 01405A89DE4 - VI/23", "bbox": [281, 978, 718, 994], "page_idx": 0}, + {"type": "text", "text": "概述", "bbox": [100, 190, 200, 210], "page_idx": 0}, + {"type": "header", "text": "概述", "bbox": [556, 20, 663, 40], "page_idx": 1}, + {"type": "text", "text": "安全提示", "bbox": [100, 220, 200, 240], "page_idx": 1} +] diff --git a/test/unit_test/deepdoc/parser/test_mineru_parser.py b/test/unit_test/deepdoc/parser/test_mineru_parser.py index 9e624e51b1..c7055ed31a 100644 --- a/test/unit_test/deepdoc/parser/test_mineru_parser.py +++ b/test/unit_test/deepdoc/parser/test_mineru_parser.py @@ -66,3 +66,50 @@ def test_transfer_to_sections_logs_sections_dropped_after_sanitization(monkeypat assert sections == [] assert "Skip section after sanitization" in caplog.text assert f"type={module.MinerUContentType.TEXT}" in caplog.text + + +def test_transfer_to_sections_skips_page_chrome_without_duplicating_text(monkeypatch): + module = _load_mineru_parser(monkeypatch) + parser = module.MinerUParser() + fixture_path = Path(__file__).resolve().parents[3] / "fixtures" / "mineru" / "bmw_page_chrome_content_list.json" + outputs = __import__("json").loads(fixture_path.read_text(encoding="utf-8")) + + sections = parser._transfer_to_sections(outputs, parse_method="raw") + texts = [section[0] for section in sections] + + assert texts == ["打开和关闭", "车辆装备", "车辆钥匙", "概述", "安全提示"] + assert texts.count("打开和关闭") == 1 + assert texts.count("概述") == 1 + assert "77" not in texts + assert "Online Edition for Part no." not in " ".join(texts) + + +def test_transfer_to_sections_skips_unknown_types_without_duplicating_text(monkeypatch, caplog): + module = _load_mineru_parser(monkeypatch) + parser = module.MinerUParser() + outputs = [ + { + "type": module.MinerUContentType.TEXT, + "text": "Primary content", + "page_idx": 0, + "bbox": (0, 0, 1, 1), + }, + { + "type": "sidebar", + "text": "Should not repeat previous section", + "page_idx": 0, + "bbox": (0, 0, 1, 1), + }, + { + "type": module.MinerUContentType.TEXT, + "text": "Next content", + "page_idx": 0, + "bbox": (0, 0, 1, 1), + }, + ] + + with caplog.at_level(logging.DEBUG, logger=parser.logger.name): + sections = parser._transfer_to_sections(outputs, parse_method="raw") + + assert [section[0] for section in sections] == ["Primary content", "Next content"] + assert "Skip unsupported section type=sidebar" in caplog.text