mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
fix(mineru): skip page chrome blocks to prevent duplicate chunks (#15387)
## Summary

- Skip MinerU `header`, `footer`, and `page_number` blocks when converting `content_list.json` into sections.
- Ignore unsupported block types explicitly so future MinerU output types cannot re-emit the previous text block.

Fixes duplicate text in General/naive chunks when parsing PDFs via MinerU (reported with repeated page headers and body text in slices).

Closes #15335

## Test plan

- [x] `pytest test/unit_test/deepdoc/parser/test_mineru_parser.py -v` (4/4 passed)
This commit is contained in:
@@ -52,6 +52,9 @@ class MinerUContentType(StrEnum):
|
||||
EQUATION = "equation"
|
||||
CODE = "code"
|
||||
LIST = "list"
|
||||
HEADER = "header"
|
||||
FOOTER = "footer"
|
||||
PAGE_NUMBER = "page_number"
|
||||
DISCARDED = "discarded"
|
||||
|
||||
|
||||
@@ -654,7 +657,7 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):
|
||||
sections = []
|
||||
for output in outputs:
|
||||
match output["type"]:
|
||||
match output.get("type"):
|
||||
case MinerUContentType.TEXT:
|
||||
section = output.get("text", "")
|
||||
case MinerUContentType.TABLE:
|
||||
@@ -677,8 +680,16 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
section = output.get("code_body", "") + "\n".join(output.get("code_caption", []))
|
||||
case MinerUContentType.LIST:
|
||||
section = "\n".join(output.get("list_items", []))
|
||||
case MinerUContentType.DISCARDED:
|
||||
continue # Skip discarded blocks entirely
|
||||
case (
|
||||
MinerUContentType.HEADER
|
||||
| MinerUContentType.FOOTER
|
||||
| MinerUContentType.PAGE_NUMBER
|
||||
| MinerUContentType.DISCARDED
|
||||
):
|
||||
continue
|
||||
case _:
|
||||
self.logger.debug("[MinerU] Skip unsupported section type=%s", output.get("type"))
|
||||
continue
|
||||
|
||||
section = self._sanitize_section_text(section)
|
||||
if not section:
|
||||
|
||||
Reference in New Issue
Block a user