fix(mineru): skip page chrome blocks to prevent duplicate chunks (#15387)

## Summary

- Skip MinerU `header`, `footer`, and `page_number` blocks when converting `content_list.json` into sections.
- Ignore unsupported block types explicitly so future MinerU output types cannot re-emit the previous text block.

Fixes duplicate text in General/naive chunks when parsing PDFs via MinerU (reported with repeated page headers and body text in slices).

Closes #15335

## Test plan

- [x] `pytest test/unit_test/deepdoc/parser/test_mineru_parser.py -v` (4/4 passed)
2026-06-29 15:31:05 +08:00 · 2026-06-01 05:15:04 -07:00
parent f0e4f2d5d8
commit d398d617ca
3 changed files with 72 additions and 3 deletions
--- a/deepdoc/parser/mineru_parser.py
+++ b/deepdoc/parser/mineru_parser.py
@@ -52,6 +52,9 @@ class MinerUContentType(StrEnum):
    EQUATION = "equation"
    CODE = "code"
    LIST = "list"
+    HEADER = "header"
+    FOOTER = "footer"
+    PAGE_NUMBER = "page_number"
    DISCARDED = "discarded"


@@ -654,7 +657,7 @@ class MinerUParser(RAGFlowPdfParser):
    def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):
        sections = []
        for output in outputs:
-            match output["type"]:
+            match output.get("type"):
                case MinerUContentType.TEXT:
                    section = output.get("text", "")
                case MinerUContentType.TABLE:
@@ -677,8 +680,16 @@ class MinerUParser(RAGFlowPdfParser):
                    section = output.get("code_body", "") + "\n".join(output.get("code_caption", []))
                case MinerUContentType.LIST:
                    section = "\n".join(output.get("list_items", []))
-                case MinerUContentType.DISCARDED:
-                    continue  # Skip discarded blocks entirely
+                case (
+                    MinerUContentType.HEADER
+                    | MinerUContentType.FOOTER
+                    | MinerUContentType.PAGE_NUMBER
+                    | MinerUContentType.DISCARDED
+                ):
+                    continue
+                case _:
+                    self.logger.debug("[MinerU] Skip unsupported section type=%s", output.get("type"))
+                    continue

            section = self._sanitize_section_text(section)
            if not section: