From c502001d9ebf5233eede424c912e0943dd065ea6 Mon Sep 17 00:00:00 2001 From: Idriss Sbaaoui <112825897+6ba3i@users.noreply.github.com> Date: Wed, 6 May 2026 14:03:57 +0800 Subject: [PATCH] Fix MinerU output fallback and NameError regression (#14538) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? This fixes a MinerU parsing failure where output JSON was not found in nested v0.24.0 layouts, and also fixes a `content_names` NameError in `_read_output()`. As a result, successful MinerU API runs no longer end with false “MinerU not found” parsing failures. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- deepdoc/parser/mineru_parser.py | 52 +++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index fd147686a7..2c3f63ae3f 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -514,7 +514,7 @@ class MinerUParser(RAGFlowPdfParser): return sanitized or "unnamed" safe_stem = _sanitize_filename(file_stem) - content_names = (f"{file_stem}_content_list.json", f"{safe_stem}_content_list.json") + content_names = tuple(dict.fromkeys((f"{file_stem}_content_list.json", f"{safe_stem}_content_list.json"))) allowed_names = set(content_names) self.logger.info(f"[MinerU] Expected output files: {', '.join(sorted(allowed_names))}") self.logger.info(f"[MinerU] Searching output in: {output_dir}") @@ -554,13 +554,55 @@ class MinerUParser(RAGFlowPdfParser): for candidate in output_dir.glob(f"**/{parse_subdir}/{content_name}"): self.logger.info(f"[MinerU] Trying parse-method path: {candidate}") attempted.append(candidate) - if candidate.exists(): - subdir = candidate.parent - json_file = candidate - break + subdir = candidate.parent + json_file = candidate + break if json_file: break + if not json_file: + stem_dirs = tuple(dict.fromkeys((file_stem, safe_stem))) + patterns = [] + if parse_subdir: + for stem_dir in stem_dirs: + patterns.extend( + [ + f"**/{stem_dir}/{parse_subdir}/content_list.json", + f"**/{stem_dir}/{parse_subdir}/*_content_list.json", + ] + ) + patterns.extend( + [ + f"**/{parse_subdir}/content_list.json", + f"**/{parse_subdir}/*_content_list.json", + ] + ) + for stem_dir in stem_dirs: + patterns.extend( + [ + f"**/{stem_dir}/content_list.json", + f"**/{stem_dir}/*_content_list.json", + ] + ) + patterns.extend(["**/content_list.json", "**/*_content_list.json"]) + + for pattern in patterns: + for candidate in sorted(output_dir.glob(pattern)): + self.logger.info(f"[MinerU] Trying fallback path: {candidate}") + if candidate.name.endswith("_content_list.json"): + rel_parts = candidate.relative_to(output_dir).parts + in_stem_dir = any(stem_dir in rel_parts for stem_dir in stem_dirs) + stem_match = candidate.stem.startswith(file_stem) or candidate.stem.startswith(safe_stem) + if not (stem_match or in_stem_dir): + self.logger.info(f"[MinerU] Skip unrelated fallback candidate: {candidate}") + continue + attempted.append(candidate) + subdir = candidate.parent + json_file = candidate + break + if json_file: + break + if not json_file: raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(p) for p in attempted)}")