mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
Fix: MinerU 3.x output discovery and API contract (#14282)
### What problem does this PR solve?

Update the MinerU parser to match the most recent MinerU v3 logic.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@@ -288,13 +288,13 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
|
||||
headers = {"Accept": "application/json"}
|
||||
try:
|
||||
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/pdf_parse backend={options.backend} server_url={data.get('server_url')}")
|
||||
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse backend={options.backend} server_url={data.get('server_url')}")
|
||||
if callback:
|
||||
callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/pdf_parse")
|
||||
callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
|
||||
with open(pdf_file_path, "rb") as pdf_file:
|
||||
files = {"files": (pdf_file_name + ".pdf", pdf_file, "application/pdf")}
|
||||
with requests.post(
|
||||
url=f"{self.mineru_api}/pdf_parse",
|
||||
url=f"{self.mineru_api}/file_parse",
|
||||
files=files,
|
||||
data=data,
|
||||
headers=headers,
|
||||
@@ -303,27 +303,22 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
) as response:
|
||||
response.raise_for_status()
|
||||
content_type = response.headers.get("Content-Type", "")
|
||||
if content_type.startswith("application/zip"):
|
||||
self.logger.info(f"[MinerU] zip file returned, saving to {output_zip_path}...")
|
||||
|
||||
if callback:
|
||||
callback(0.30, f"[MinerU] zip file returned, saving to {output_zip_path}...")
|
||||
|
||||
with open(output_zip_path, "wb") as f:
|
||||
response.raw.decode_content = True
|
||||
shutil.copyfileobj(response.raw, f)
|
||||
|
||||
self.logger.info(f"[MinerU] Unzip to {output_path}...")
|
||||
self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/")
|
||||
|
||||
if callback:
|
||||
callback(0.40, f"[MinerU] Unzip to {output_path}...")
|
||||
else:
|
||||
self.logger.warning(f"[MinerU] not zip returned from api: {content_type}")
|
||||
except Exception as e:
|
||||
if not content_type.startswith("application/zip"):
|
||||
raise RuntimeError(f"[MinerU] not zip returned from api: {content_type}")
|
||||
self.logger.info(f"[MinerU] zip file returned, saving to {output_zip_path}...")
|
||||
if callback:
|
||||
callback(0.30, f"[MinerU] zip file returned, saving to {output_zip_path}...")
|
||||
with open(output_zip_path, "wb") as f:
|
||||
response.raw.decode_content = True
|
||||
shutil.copyfileobj(response.raw, f)
|
||||
self.logger.info(f"[MinerU] Unzip to {output_path}...")
|
||||
self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/")
|
||||
if callback:
|
||||
callback(0.40, f"[MinerU] Unzip to {output_path}...")
|
||||
self.logger.info("[MinerU] Api completed successfully.")
|
||||
return Path(output_path)
|
||||
except requests.RequestException as e:
|
||||
raise RuntimeError(f"[MinerU] api failed with exception {e}")
|
||||
self.logger.info("[MinerU] Api completed successfully.")
|
||||
return Path(output_path)
|
||||
|
||||
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
|
||||
self.page_from = page_from
|
||||
@@ -517,7 +512,8 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
return sanitized or "unnamed"
|
||||
|
||||
safe_stem = _sanitize_filename(file_stem)
|
||||
allowed_names = {f"{file_stem}_content_list.json", f"{safe_stem}_content_list.json"}
|
||||
content_names = (f"{file_stem}_content_list.json", f"{safe_stem}_content_list.json")
|
||||
allowed_names = set(content_names)
|
||||
self.logger.info(f"[MinerU] Expected output files: {', '.join(sorted(allowed_names))}")
|
||||
self.logger.info(f"[MinerU] Searching output in: {output_dir}")
|
||||
|
||||
@@ -542,6 +538,27 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
subdir = nested_alt.parent
|
||||
json_file = nested_alt
|
||||
|
||||
if not json_file:
|
||||
parse_subdir = None
|
||||
if backend.startswith("pipeline"):
|
||||
parse_subdir = method
|
||||
elif backend.startswith("hybrid"):
|
||||
parse_subdir = f"hybrid_{method}"
|
||||
elif backend.startswith("vlm"):
|
||||
parse_subdir = "vlm"
|
||||
|
||||
if parse_subdir:
|
||||
for content_name in content_names:
|
||||
for candidate in output_dir.glob(f"**/{parse_subdir}/{content_name}"):
|
||||
self.logger.info(f"[MinerU] Trying parse-method path: {candidate}")
|
||||
attempted.append(candidate)
|
||||
if candidate.exists():
|
||||
subdir = candidate.parent
|
||||
json_file = candidate
|
||||
break
|
||||
if json_file:
|
||||
break
|
||||
|
||||
if not json_file:
|
||||
raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(p) for p in attempted)}")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user