fix(mineru): skip page chrome blocks to prevent duplicate chunks (#15387)

## Summary
- Skip MinerU `header`, `footer`, and `page_number` blocks when
converting `content_list.json` into sections.
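- Explicitly skip unsupported block types with a default `case _` branch, so
that an unrecognized (e.g. future) MinerU output type can no longer fall
through the `match` and re-emit the `section` text left over from the
previous block.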

Fixes duplicate text in General/naive chunks when parsing PDFs via
MinerU (reported as page headers and body text being repeated across slices).

Closes #15335

## Test plan
- [x] `pytest test/unit_test/deepdoc/parser/test_mineru_parser.py -v`
(4/4 passed)
This commit is contained in:
monsterDavid
2026-06-01 05:15:04 -07:00
committed by GitHub
parent f0e4f2d5d8
commit d398d617ca
3 changed files with 72 additions and 3 deletions

View File

@@ -52,6 +52,9 @@ class MinerUContentType(StrEnum):
EQUATION = "equation"
CODE = "code"
LIST = "list"
HEADER = "header"
FOOTER = "footer"
PAGE_NUMBER = "page_number"
DISCARDED = "discarded"
@@ -654,7 +657,7 @@ class MinerUParser(RAGFlowPdfParser):
def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):
sections = []
for output in outputs:
match output["type"]:
match output.get("type"):
case MinerUContentType.TEXT:
section = output.get("text", "")
case MinerUContentType.TABLE:
@@ -677,8 +680,16 @@ class MinerUParser(RAGFlowPdfParser):
section = output.get("code_body", "") + "\n".join(output.get("code_caption", []))
case MinerUContentType.LIST:
section = "\n".join(output.get("list_items", []))
case MinerUContentType.DISCARDED:
continue # Skip discarded blocks entirely
case (
MinerUContentType.HEADER
| MinerUContentType.FOOTER
| MinerUContentType.PAGE_NUMBER
| MinerUContentType.DISCARDED
):
continue
case _:
self.logger.debug("[MinerU] Skip unsupported section type=%s", output.get("type"))
continue
section = self._sanitize_section_text(section)
if not section: