From d398d617caf9d639d5a0a57e67970bc7cb9bb698 Mon Sep 17 00:00:00 2001
From: monsterDavid <monsterdavidliu@gmail.com>
Date: Mon, 1 Jun 2026 05:15:04 -0700
Subject: [PATCH] fix(mineru): skip page chrome blocks to prevent duplicate
 chunks (#15387)

## Summary
- Skip MinerU `header`, `footer`, and `page_number` blocks when
converting `content_list.json` into sections.
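- Ignore unsupported block types explicitly. Previously a block whose type
matched no `case` left `section` holding the value from the previous
iteration, so the preceding block's text was emitted a second time. The new
`case _` branch logs and skips such blocks, so future MinerU output types
cannot re-emit the previous text block.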

Fixes duplicate text in General/naive chunks when parsing PDFs via
MinerU (reported with repeated page headers and body text in slices).

Closes #15335

## Test plan
- [x] `pytest test/unit_test/deepdoc/parser/test_mineru_parser.py -v`
(4/4 passed)
---
 deepdoc/parser/mineru_parser.py               | 17 +++++--
 .../mineru/bmw_page_chrome_content_list.json  | 11 +++++
 .../deepdoc/parser/test_mineru_parser.py      | 47 +++++++++++++++++++
 3 files changed, 72 insertions(+), 3 deletions(-)
 create mode 100644 test/fixtures/mineru/bmw_page_chrome_content_list.json

diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py
index b0c0eafb6e..f96aed32a8 100644
--- a/deepdoc/parser/mineru_parser.py
+++ b/deepdoc/parser/mineru_parser.py
@@ -52,6 +52,9 @@ class MinerUContentType(StrEnum):
     EQUATION = "equation"
     CODE = "code"
     LIST = "list"
+    HEADER = "header"
+    FOOTER = "footer"
+    PAGE_NUMBER = "page_number"
     DISCARDED = "discarded"
 
 
@@ -654,7 +657,7 @@ class MinerUParser(RAGFlowPdfParser):
     def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):
         sections = []
         for output in outputs:
-            match output["type"]:
+            match output.get("type"):
                 case MinerUContentType.TEXT:
                     section = output.get("text", "")
                 case MinerUContentType.TABLE:
@@ -677,8 +680,16 @@ class MinerUParser(RAGFlowPdfParser):
                     section = output.get("code_body", "") + "\n".join(output.get("code_caption", []))
                 case MinerUContentType.LIST:
                     section = "\n".join(output.get("list_items", []))
-                case MinerUContentType.DISCARDED:
-                    continue  # Skip discarded blocks entirely
+                case (
+                    MinerUContentType.HEADER
+                    | MinerUContentType.FOOTER
+                    | MinerUContentType.PAGE_NUMBER
+                    | MinerUContentType.DISCARDED
+                ):
+                    continue
+                case _:
+                    self.logger.debug("[MinerU] Skip unsupported section type=%s", output.get("type"))
+                    continue
 
             section = self._sanitize_section_text(section)
             if not section:
diff --git a/test/fixtures/mineru/bmw_page_chrome_content_list.json b/test/fixtures/mineru/bmw_page_chrome_content_list.json
new file mode 100644
index 0000000000..fb1c4fe017
--- /dev/null
+++ b/test/fixtures/mineru/bmw_page_chrome_content_list.json
@@ -0,0 +1,11 @@
+[
+  {"type": "text", "text": "打开和关闭", "bbox": [100, 100, 200, 120], "page_idx": 0},
+  {"type": "text", "text": "车辆装备", "bbox": [100, 130, 200, 150], "page_idx": 0},
+  {"type": "header", "text": "打开和关闭", "bbox": [556, 20, 663, 40], "page_idx": 0},
+  {"type": "text", "text": "车辆钥匙", "bbox": [100, 160, 200, 180], "page_idx": 0},
+  {"type": "page_number", "text": "77", "bbox": [933, 956, 968, 976], "page_idx": 0},
+  {"type": "footer", "text": "Online Edition for Part no. 01405A89DE4 - VI/23", "bbox": [281, 978, 718, 994], "page_idx": 0},
+  {"type": "text", "text": "概述", "bbox": [100, 190, 200, 210], "page_idx": 0},
+  {"type": "header", "text": "概述", "bbox": [556, 20, 663, 40], "page_idx": 1},
+  {"type": "text", "text": "安全提示", "bbox": [100, 220, 200, 240], "page_idx": 1}
+]
diff --git a/test/unit_test/deepdoc/parser/test_mineru_parser.py b/test/unit_test/deepdoc/parser/test_mineru_parser.py
index 9e624e51b1..c7055ed31a 100644
--- a/test/unit_test/deepdoc/parser/test_mineru_parser.py
+++ b/test/unit_test/deepdoc/parser/test_mineru_parser.py
@@ -66,3 +66,50 @@ def test_transfer_to_sections_logs_sections_dropped_after_sanitization(monkeypat
     assert sections == []
     assert "Skip section after sanitization" in caplog.text
     assert f"type={module.MinerUContentType.TEXT}" in caplog.text
+
+
+def test_transfer_to_sections_skips_page_chrome_without_duplicating_text(monkeypatch):
+    module = _load_mineru_parser(monkeypatch)
+    parser = module.MinerUParser()
+    fixture_path = Path(__file__).resolve().parents[3] / "fixtures" / "mineru" / "bmw_page_chrome_content_list.json"
+    outputs = __import__("json").loads(fixture_path.read_text(encoding="utf-8"))
+
+    sections = parser._transfer_to_sections(outputs, parse_method="raw")
+    texts = [section[0] for section in sections]
+
+    assert texts == ["打开和关闭", "车辆装备", "车辆钥匙", "概述", "安全提示"]
+    assert texts.count("打开和关闭") == 1
+    assert texts.count("概述") == 1
+    assert "77" not in texts
+    assert "Online Edition for Part no." not in " ".join(texts)
+
+
+def test_transfer_to_sections_skips_unknown_types_without_duplicating_text(monkeypatch, caplog):
+    module = _load_mineru_parser(monkeypatch)
+    parser = module.MinerUParser()
+    outputs = [
+        {
+            "type": module.MinerUContentType.TEXT,
+            "text": "Primary content",
+            "page_idx": 0,
+            "bbox": (0, 0, 1, 1),
+        },
+        {
+            "type": "sidebar",
+            "text": "Should not repeat previous section",
+            "page_idx": 0,
+            "bbox": (0, 0, 1, 1),
+        },
+        {
+            "type": module.MinerUContentType.TEXT,
+            "text": "Next content",
+            "page_idx": 0,
+            "bbox": (0, 0, 1, 1),
+        },
+    ]
+
+    with caplog.at_level(logging.DEBUG, logger=parser.logger.name):
+        sections = parser._transfer_to_sections(outputs, parse_method="raw")
+
+    assert [section[0] for section in sections] == ["Primary content", "Next content"]
+    assert "Skip unsupported section type=sidebar" in caplog.text