diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 17cfa99e25..548baddcb6 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -288,13 +288,13 @@ class MinerUParser(RAGFlowPdfParser): headers = {"Accept": "application/json"} try: - self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/pdf_parse backend={options.backend} server_url={data.get('server_url')}") + self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse backend={options.backend} server_url={data.get('server_url')}") if callback: - callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/pdf_parse") + callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse") with open(pdf_file_path, "rb") as pdf_file: files = {"files": (pdf_file_name + ".pdf", pdf_file, "application/pdf")} with requests.post( - url=f"{self.mineru_api}/pdf_parse", + url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers, @@ -303,27 +303,22 @@ class MinerUParser(RAGFlowPdfParser): ) as response: response.raise_for_status() content_type = response.headers.get("Content-Type", "") - if content_type.startswith("application/zip"): - self.logger.info(f"[MinerU] zip file returned, saving to {output_zip_path}...") - - if callback: - callback(0.30, f"[MinerU] zip file returned, saving to {output_zip_path}...") - - with open(output_zip_path, "wb") as f: - response.raw.decode_content = True - shutil.copyfileobj(response.raw, f) - - self.logger.info(f"[MinerU] Unzip to {output_path}...") - self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/") - - if callback: - callback(0.40, f"[MinerU] Unzip to {output_path}...") - else: - self.logger.warning(f"[MinerU] not zip returned from api: {content_type}") - except Exception as e: + if not content_type.startswith("application/zip"): + raise RuntimeError(f"[MinerU] not zip returned from api: {content_type}") + self.logger.info(f"[MinerU] zip file returned, saving to {output_zip_path}...") + if callback: + callback(0.30, f"[MinerU] zip file returned, saving to {output_zip_path}...") + with open(output_zip_path, "wb") as f: + response.raw.decode_content = True + shutil.copyfileobj(response.raw, f) + self.logger.info(f"[MinerU] Unzip to {output_path}...") + self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/") + if callback: + callback(0.40, f"[MinerU] Unzip to {output_path}...") + self.logger.info("[MinerU] Api completed successfully.") + return Path(output_path) + except requests.RequestException as e: raise RuntimeError(f"[MinerU] api failed with exception {e}") - self.logger.info("[MinerU] Api completed successfully.") - return Path(output_path) def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None): self.page_from = page_from @@ -517,7 +512,8 @@ class MinerUParser(RAGFlowPdfParser): return sanitized or "unnamed" safe_stem = _sanitize_filename(file_stem) - allowed_names = {f"{file_stem}_content_list.json", f"{safe_stem}_content_list.json"} + content_names = (f"{file_stem}_content_list.json", f"{safe_stem}_content_list.json") + allowed_names = set(content_names) self.logger.info(f"[MinerU] Expected output files: {', '.join(sorted(allowed_names))}") self.logger.info(f"[MinerU] Searching output in: {output_dir}") @@ -542,6 +538,27 @@ class MinerUParser(RAGFlowPdfParser): subdir = nested_alt.parent json_file = nested_alt + if not json_file: + parse_subdir = None + if backend.startswith("pipeline"): + parse_subdir = method + elif backend.startswith("hybrid"): + parse_subdir = f"hybrid_{method}" + elif backend.startswith("vlm"): + parse_subdir = "vlm" + + if parse_subdir: + for content_name in content_names: + for candidate in output_dir.glob(f"**/{parse_subdir}/{content_name}"): + self.logger.info(f"[MinerU] Trying parse-method path: {candidate}") + attempted.append(candidate) + if candidate.exists(): + subdir = candidate.parent + json_file = candidate + break + if json_file: + break + if not json_file: raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(p) for p in attempted)}")