2025-10-17 09:55:39 +08:00
|
|
|
#
|
|
|
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
|
|
|
#
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
|
#
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
#
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
|
# limitations under the License.
|
|
|
|
|
#
|
|
|
|
|
import json
|
2026-05-24 22:06:36 -10:00
|
|
|
import html
|
2025-10-17 09:55:39 +08:00
|
|
|
import logging
|
2025-11-04 16:03:30 +08:00
|
|
|
import os
|
2025-10-17 09:55:39 +08:00
|
|
|
import re
|
2026-01-13 12:24:50 +08:00
|
|
|
import shutil
|
2025-10-17 09:55:39 +08:00
|
|
|
import sys
|
|
|
|
|
import tempfile
|
|
|
|
|
import threading
|
2025-11-04 16:03:30 +08:00
|
|
|
import zipfile
|
2025-12-16 07:15:25 +02:00
|
|
|
from dataclasses import dataclass
|
2025-10-17 09:55:39 +08:00
|
|
|
from io import BytesIO
|
|
|
|
|
from os import PathLike
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
from typing import Any, Callable, Optional
|
|
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
import pdfplumber
|
2025-11-04 16:03:30 +08:00
|
|
|
import requests
|
2025-10-17 09:55:39 +08:00
|
|
|
from PIL import Image
|
2026-05-15 08:40:53 +02:00
|
|
|
from enum import StrEnum
|
2025-10-17 09:55:39 +08:00
|
|
|
|
|
|
|
|
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
2026-04-03 19:26:45 +08:00
|
|
|
from deepdoc.parser.utils import extract_pdf_outlines
|
2025-10-17 09:55:39 +08:00
|
|
|
|
Fix: Remove hardcoded page limits causing parsing failures on large PDFs (>300 pages) (#14382)
### What problem does this PR solve?
Fixes #14196
## Problem
When using DeepDOC to parse large PDFs (over 1000 pages), the parser
silently truncated processing at 300 pages due to a hardcoded default
`page_to=299` in `RAGFlowPdfParser.__images__()`. This caused:
- **Errors** on pages beyond the limit
- **Poor image quality** as the parser attempted to compensate with
missing page data
- **Inconsistent chunk splitting** between full PDF imports and partial
imports
Additionally, the codebase scattered magic numbers (`299`, `600`,
`10000`, `100000`, `100000000`, `10000000000`, `10**9`) across 22 files
as sentinel values for "parse all pages", making future maintenance
error-prone.
## Root Cause
```python
# deepdoc/parser/pdf_parser.py (before)
def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
# Only the first 300 pages were rendered; everything beyond was silently dropped
```
While most callers in `rag/app/*.py` correctly passed `to_page=100000`,
the base class `RAGFlowPdfParser.__call__()` and `parse_into_bboxes()`
invoked `__images__` **without** forwarding `page_from`/`page_to`,
falling back to the restrictive default of 299.
## Solution
### 1. Define constants in `common/constants.py`
```python
MAXIMUM_PAGE_NUMBER = 100000 # Used by the parsing layer
MAXIMUM_TASK_PAGE_NUMBER = MAXIMUM_PAGE_NUMBER * 1000 # Used by the task/DB layer
```
### 2. Replace all hardcoded sentinel values
| Layer | Files Changed | Old Values | New Value |
|---|---|---|---|
| **Deepdoc parsers** | `pdf_parser.py`, `mineru_parser.py`,
`docling_parser.py`, `opendataloader_parser.py`, `paddleocr_parser.py`,
`docx_parser.py` | `299`, `600`, `10**9`, `100000000` |
`MAXIMUM_PAGE_NUMBER` |
| **Chunk parsers** | `naive.py`, `book.py`, `qa.py`, `one.py`,
`manual.py`, `paper.py`, `presentation.py`, `laws.py`, `resume.py`,
`email.py`, `table.py` | `100000`, `10000`, `10000000000` |
`MAXIMUM_PAGE_NUMBER` |
| **Task/DB layer** | `db_models.py`, `task_service.py`,
`document_service.py`, `file_service.py` | `100000000` |
`MAXIMUM_TASK_PAGE_NUMBER` |
### 3. Fix `parse_into_bboxes()` missing parameters
Added `from_page`/`to_page` parameters to `parse_into_bboxes()` so that
the `rag/flow/parser/parser.py` DeepDOC path no longer falls back to the
restrictive default.
## Files Changed (22)
- `common/constants.py`
- `deepdoc/parser/pdf_parser.py`
- `deepdoc/parser/mineru_parser.py`
- `deepdoc/parser/docling_parser.py`
- `deepdoc/parser/opendataloader_parser.py`
- `deepdoc/parser/paddleocr_parser.py`
- `deepdoc/parser/docx_parser.py`
- `rag/app/naive.py`
- `rag/app/book.py`
- `rag/app/qa.py`
- `rag/app/one.py`
- `rag/app/manual.py`
- `rag/app/paper.py`
- `rag/app/presentation.py`
- `rag/app/laws.py`
- `rag/app/resume.py`
- `rag/app/email.py`
- `rag/app/table.py`
- `api/db/db_models.py`
- `api/db/services/task_service.py`
- `api/db/services/document_service.py`
- `api/db/services/file_service.py`
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
---------
Signed-off-by: noob <yixiao121314@outlook.com>
2026-04-27 06:57:20 +00:00
|
|
|
from common.constants import MAXIMUM_PAGE_NUMBER
|
|
|
|
|
|
2025-10-17 09:55:39 +08:00
|
|
|
# Key under which one process-wide threading.Lock is registered in
# ``sys.modules``. The lock is created only if nothing is stored under the key
# yet, so any code that looks up the same key gets the same lock object.
# NOTE(review): the name suggests it guards pdfplumber calls — confirm at the
# call sites.
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
sys.modules.setdefault(LOCK_KEY_pdfplumber, threading.Lock())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MinerUContentType(StrEnum):
|
|
|
|
|
IMAGE = "image"
|
|
|
|
|
TABLE = "table"
|
|
|
|
|
TEXT = "text"
|
|
|
|
|
EQUATION = "equation"
|
2025-10-27 17:04:13 +08:00
|
|
|
CODE = "code"
|
|
|
|
|
LIST = "list"
|
2026-06-01 05:15:04 -07:00
|
|
|
HEADER = "header"
|
|
|
|
|
FOOTER = "footer"
|
|
|
|
|
PAGE_NUMBER = "page_number"
|
2025-10-27 17:04:13 +08:00
|
|
|
DISCARDED = "discarded"
|
2025-10-17 09:55:39 +08:00
|
|
|
|
|
|
|
|
|
2025-12-16 07:15:25 +02:00
|
|
|
# Lookup table: human-readable language name -> MinerU language code.
# Several languages share one script-level code (e.g. "latin", "east_slavic").
LANGUAGE_TO_MINERU_MAP = {
    "English": "en",
    "Chinese": "ch",
    "Traditional Chinese": "chinese_cht",
    "Russian": "east_slavic",
    "Ukrainian": "east_slavic",
    "Indonesian": "latin",
    "Spanish": "latin",
    "Vietnamese": "latin",
    "Japanese": "japan",
    "Korean": "korean",
    "Portuguese BR": "latin",
    "German": "latin",
    "French": "latin",
    "Italian": "latin",
    "Tamil": "ta",
    "Telugu": "te",
    "Kannada": "ka",
    "Thai": "th",
    "Greek": "el",
    "Hindi": "devanagari",
    "Bulgarian": "cyrillic",
    "Turkish": "latin",
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MinerUBackend(StrEnum):
|
|
|
|
|
"""MinerU processing backend options."""
|
|
|
|
|
|
|
|
|
|
PIPELINE = "pipeline" # Traditional multimodel pipeline (default)
|
|
|
|
|
VLM_TRANSFORMERS = "vlm-transformers" # Vision-language model using HuggingFace Transformers
|
|
|
|
|
VLM_MLX_ENGINE = "vlm-mlx-engine" # Faster, requires Apple Silicon and macOS 13.5+
|
|
|
|
|
VLM_VLLM_ENGINE = "vlm-vllm-engine" # Local vLLM engine, requires local GPU
|
|
|
|
|
VLM_VLLM_ASYNC_ENGINE = "vlm-vllm-async-engine" # Asynchronous vLLM engine, new in MinerU API
|
|
|
|
|
VLM_LMDEPLOY_ENGINE = "vlm-lmdeploy-engine" # LMDeploy engine
|
|
|
|
|
VLM_HTTP_CLIENT = "vlm-http-client" # HTTP client for remote vLLM server (CPU only)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MinerULanguage(StrEnum):
|
|
|
|
|
"""MinerU supported languages for OCR (pipeline backend only)."""
|
|
|
|
|
|
|
|
|
|
CH = "ch" # Chinese
|
|
|
|
|
CH_SERVER = "ch_server" # Chinese (server)
|
|
|
|
|
CH_LITE = "ch_lite" # Chinese (lite)
|
|
|
|
|
EN = "en" # English
|
|
|
|
|
KOREAN = "korean" # Korean
|
|
|
|
|
JAPAN = "japan" # Japanese
|
|
|
|
|
CHINESE_CHT = "chinese_cht" # Chinese Traditional
|
|
|
|
|
TA = "ta" # Tamil
|
|
|
|
|
TE = "te" # Telugu
|
|
|
|
|
KA = "ka" # Kannada
|
|
|
|
|
TH = "th" # Thai
|
|
|
|
|
EL = "el" # Greek
|
|
|
|
|
LATIN = "latin" # Latin
|
|
|
|
|
ARABIC = "arabic" # Arabic
|
|
|
|
|
EAST_SLAVIC = "east_slavic" # East Slavic
|
|
|
|
|
CYRILLIC = "cyrillic" # Cyrillic
|
|
|
|
|
DEVANAGARI = "devanagari" # Devanagari
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MinerUParseMethod(StrEnum):
|
|
|
|
|
"""MinerU PDF parsing methods (pipeline backend only)."""
|
|
|
|
|
|
|
|
|
|
AUTO = "auto" # Automatically determine the method based on the file type
|
|
|
|
|
TXT = "txt" # Use text extraction method
|
|
|
|
|
OCR = "ocr" # Use OCR method for image-based PDFs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class MinerUParseOptions:
    """Bundle of settings controlling one MinerU PDF parsing request."""

    # Processing backend; sent to the API as the "backend" form field.
    backend: MinerUBackend = MinerUBackend.PIPELINE
    # language for OCR (pipeline backend only); sent as "lang_list".
    lang: Optional[MinerULanguage] = None
    # Parsing method; sent as "parse_method" and used in the temp dir prefix.
    method: MinerUParseMethod = MinerUParseMethod.AUTO
    # Per-request server URL; when set it takes precedence over the parser's
    # own ``mineru_server_url``.
    server_url: Optional[str] = None
    # NOTE(review): presumably controls removal of the output directory after
    # parsing — not consumed in the visible code, confirm at the caller.
    delete_output: bool = True
    # NOTE(review): distinct from ``method`` above; its consumer is not visible
    # here — confirm what "raw" selects.
    parse_method: str = "raw"
    # Sent to the API as "formula_enable".
    formula_enable: bool = True
    # Sent to the API as "table_enable".
    table_enable: bool = True
|
|
|
|
|
|
|
|
|
|
|
2025-10-17 09:55:39 +08:00
|
|
|
class MinerUParser(RAGFlowPdfParser):
|
2025-12-11 17:33:12 +08:00
|
|
|
def __init__(self, mineru_path: str = "mineru", mineru_api: str = "", mineru_server_url: str = ""):
|
2025-11-04 16:03:30 +08:00
|
|
|
self.mineru_api = mineru_api.rstrip("/")
|
|
|
|
|
self.mineru_server_url = mineru_server_url.rstrip("/")
|
2025-11-18 15:22:52 +08:00
|
|
|
self.outlines = []
|
2025-10-17 09:55:39 +08:00
|
|
|
self.logger = logging.getLogger(self.__class__.__name__)
|
|
|
|
|
|
2026-01-13 12:24:50 +08:00
|
|
|
@staticmethod
|
|
|
|
|
def _is_zipinfo_symlink(member: zipfile.ZipInfo) -> bool:
|
|
|
|
|
return (member.external_attr >> 16) & 0o170000 == 0o120000
|
|
|
|
|
|
2025-10-30 17:31:46 +08:00
|
|
|
def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
|
2025-12-03 17:24:37 +08:00
|
|
|
self.logger.info(f"[MinerU] Extract zip: zip_path={zip_path}, extract_to={extract_to}, root_hint={root_dir}")
|
2026-01-13 12:24:50 +08:00
|
|
|
base_dir = Path(extract_to).resolve()
|
2025-11-04 16:03:30 +08:00
|
|
|
with zipfile.ZipFile(zip_path, "r") as zip_ref:
|
2026-01-13 12:24:50 +08:00
|
|
|
members = zip_ref.infolist()
|
2025-10-30 17:31:46 +08:00
|
|
|
if not root_dir:
|
2026-01-13 12:24:50 +08:00
|
|
|
if members and members[0].filename.endswith("/"):
|
|
|
|
|
root_dir = members[0].filename
|
2025-10-30 17:31:46 +08:00
|
|
|
else:
|
|
|
|
|
root_dir = None
|
2026-01-13 12:24:50 +08:00
|
|
|
if root_dir:
|
|
|
|
|
root_dir = root_dir.replace("\\", "/")
|
|
|
|
|
if not root_dir.endswith("/"):
|
|
|
|
|
root_dir += "/"
|
|
|
|
|
|
|
|
|
|
for member in members:
|
|
|
|
|
if member.flag_bits & 0x1:
|
|
|
|
|
raise RuntimeError(f"[MinerU] Encrypted zip entry not supported: {member.filename}")
|
|
|
|
|
if self._is_zipinfo_symlink(member):
|
|
|
|
|
raise RuntimeError(f"[MinerU] Symlink zip entry not supported: {member.filename}")
|
|
|
|
|
|
|
|
|
|
name = member.filename.replace("\\", "/")
|
|
|
|
|
if root_dir and name == root_dir:
|
2025-10-30 17:31:46 +08:00
|
|
|
self.logger.info("[MinerU] Ignore root folder...")
|
|
|
|
|
continue
|
2026-01-13 12:24:50 +08:00
|
|
|
if root_dir and name.startswith(root_dir):
|
|
|
|
|
name = name[len(root_dir) :]
|
|
|
|
|
if not name:
|
|
|
|
|
continue
|
|
|
|
|
if name.startswith("/") or name.startswith("//") or re.match(r"^[A-Za-z]:", name):
|
|
|
|
|
raise RuntimeError(f"[MinerU] Unsafe zip path (absolute): {member.filename}")
|
|
|
|
|
|
|
|
|
|
parts = [p for p in name.split("/") if p not in ("", ".")]
|
|
|
|
|
if any(p == ".." for p in parts):
|
|
|
|
|
raise RuntimeError(f"[MinerU] Unsafe zip path (traversal): {member.filename}")
|
2025-11-04 16:03:30 +08:00
|
|
|
|
2026-01-13 12:24:50 +08:00
|
|
|
rel_path = os.path.join(*parts) if parts else ""
|
|
|
|
|
dest_path = (Path(extract_to) / rel_path).resolve(strict=False)
|
|
|
|
|
if dest_path != base_dir and base_dir not in dest_path.parents:
|
|
|
|
|
raise RuntimeError(f"[MinerU] Unsafe zip path (escape): {member.filename}")
|
2025-11-04 16:03:30 +08:00
|
|
|
|
2025-10-30 17:31:46 +08:00
|
|
|
if member.is_dir():
|
2026-01-13 12:24:50 +08:00
|
|
|
os.makedirs(dest_path, exist_ok=True)
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
os.makedirs(dest_path.parent, exist_ok=True)
|
|
|
|
|
with zip_ref.open(member) as src, open(dest_path, "wb") as dst:
|
|
|
|
|
shutil.copyfileobj(src, dst)
|
2025-10-30 17:31:46 +08:00
|
|
|
|
2025-12-16 07:15:25 +02:00
|
|
|
@staticmethod
def _is_http_endpoint_valid(url, timeout=5):
    """Probe ``url`` with an HTTP HEAD request and report whether it answers.

    Args:
        url: Endpoint to probe.
        timeout: Timeout passed to the request.

    Returns:
        True when the response status is 200, 301, 302, 307 or 308.
        False for any other status or when the request raises.
    """
    accepted_statuses = (200, 301, 302, 307, 308)
    try:
        head_resp = requests.head(url, timeout=timeout, allow_redirects=True)
        return head_resp.status_code in accepted_statuses
    except Exception:
        # Best-effort probe: every failure is reported as "not valid".
        return False
|
|
|
|
|
|
2026-05-24 22:06:36 -10:00
|
|
|
@staticmethod
|
|
|
|
|
def _sanitize_section_text(section: str) -> str:
|
|
|
|
|
"""Normalize MinerU text blocks before chunking.
|
|
|
|
|
|
|
|
|
|
MinerU may return HTML fragments (e.g. table_body with <tr>/<td>/<br>).
|
|
|
|
|
Keep human-readable text while removing tag noise that hurts chunking.
|
|
|
|
|
"""
|
|
|
|
|
if not section:
|
|
|
|
|
return ""
|
|
|
|
|
section = html.unescape(section)
|
|
|
|
|
# Preserve rough structure before dropping tags.
|
|
|
|
|
section = re.sub(r"(?is)<\s*br\s*/?\s*>", "\n", section)
|
|
|
|
|
section = re.sub(r"(?is)</\s*(p|div|li|tr|h[1-6]|table|caption)\s*>", "\n", section)
|
|
|
|
|
section = re.sub(r"(?is)<[^>]+>", "", section)
|
|
|
|
|
# Collapse whitespace while preserving line boundaries.
|
|
|
|
|
section = re.sub(r"[ \t]+\n", "\n", section)
|
|
|
|
|
section = re.sub(r"\n{3,}", "\n\n", section)
|
|
|
|
|
section = re.sub(r"[ \t]{2,}", " ", section)
|
|
|
|
|
return section.strip()
|
|
|
|
|
|
2025-11-04 16:03:30 +08:00
|
|
|
def check_installation(self, backend: str = "pipeline", server_url: Optional[str] = None) -> tuple[bool, str]:
|
|
|
|
|
reason = ""
|
|
|
|
|
|
2025-12-17 12:58:48 +08:00
|
|
|
valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine", "vlm-mlx-engine", "vlm-vllm-async-engine", "vlm-lmdeploy-engine"]
|
2025-11-04 16:03:30 +08:00
|
|
|
if backend not in valid_backends:
|
2025-12-17 12:58:48 +08:00
|
|
|
reason = f"[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}"
|
2025-12-03 17:24:37 +08:00
|
|
|
self.logger.warning(reason)
|
2025-11-04 16:03:30 +08:00
|
|
|
return False, reason
|
|
|
|
|
|
2025-12-17 12:58:48 +08:00
|
|
|
if not self.mineru_api:
|
|
|
|
|
reason = "[MinerU] MINERU_APISERVER not configured."
|
|
|
|
|
self.logger.warning(reason)
|
|
|
|
|
return False, reason
|
2025-11-04 16:03:30 +08:00
|
|
|
|
2025-12-17 12:58:48 +08:00
|
|
|
api_openapi = f"{self.mineru_api}/openapi.json"
|
2025-10-17 09:55:39 +08:00
|
|
|
try:
|
2025-12-17 12:58:48 +08:00
|
|
|
api_ok = self._is_http_endpoint_valid(api_openapi)
|
|
|
|
|
self.logger.info(f"[MinerU] API openapi.json reachable={api_ok} url={api_openapi}")
|
|
|
|
|
if not api_ok:
|
|
|
|
|
reason = f"[MinerU] MinerU API not accessible: {api_openapi}"
|
|
|
|
|
return False, reason
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
reason = f"[MinerU] MinerU API check failed: {exc}"
|
|
|
|
|
self.logger.warning(reason)
|
|
|
|
|
return False, reason
|
2025-10-30 17:31:46 +08:00
|
|
|
|
2025-12-17 12:58:48 +08:00
|
|
|
if backend == "vlm-http-client":
|
|
|
|
|
resolved_server = server_url or self.mineru_server_url
|
|
|
|
|
if not resolved_server:
|
|
|
|
|
reason = "[MinerU] MINERU_SERVER_URL required for vlm-http-client backend."
|
|
|
|
|
self.logger.warning(reason)
|
2025-12-11 17:33:12 +08:00
|
|
|
return False, reason
|
2025-12-17 12:58:48 +08:00
|
|
|
try:
|
|
|
|
|
server_ok = self._is_http_endpoint_valid(resolved_server)
|
|
|
|
|
self.logger.info(f"[MinerU] vlm-http-client server check reachable={server_ok} url={resolved_server}")
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
self.logger.warning(f"[MinerU] vlm-http-client server probe failed: {resolved_server}: {exc}")
|
2025-10-17 09:55:39 +08:00
|
|
|
|
2025-12-17 12:58:48 +08:00
|
|
|
return True, reason
|
2025-11-04 16:03:30 +08:00
|
|
|
|
2025-12-17 12:58:48 +08:00
|
|
|
def _run_mineru(
    self,
    input_path: Path,
    output_dir: Path,
    options: MinerUParseOptions,
    callback: Optional[Callable] = None,
) -> Path:
    """Run MinerU on ``input_path`` by delegating to the HTTP API path.

    All arguments are forwarded unchanged to ``_run_mineru_api`` and its
    result is returned as-is.
    """
    result_dir = self._run_mineru_api(input_path, output_dir, options, callback)
    return result_dir
|
2025-11-04 16:03:30 +08:00
|
|
|
|
2025-12-17 12:58:48 +08:00
|
|
|
def _run_mineru_api(
    self, input_path: Path, output_dir: Path, options: MinerUParseOptions, callback: Optional[Callable] = None
) -> Path:
    """Upload a PDF to the MinerU ``/file_parse`` endpoint and unpack the result.

    Args:
        input_path: PDF file to upload.
        output_dir: Directory in which a fresh result directory and the
            downloaded zip are created.
        options: Parsing options forwarded as form fields.
        callback: Optional ``callback(progress, message)`` progress hook.

    Returns:
        Path of the directory holding the extracted MinerU output.

    Raises:
        RuntimeError: If the PDF is missing, the API request fails, or the
            response is not a zip archive.
    """
    pdf_file_path = str(input_path)

    if not os.path.exists(pdf_file_path):
        raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")

    pdf_file_name = Path(pdf_file_path).stem.strip()
    output_path = tempfile.mkdtemp(prefix=f"{pdf_file_name}_{options.method}_", dir=str(output_dir))
    # The zip is stored next to the result directory, sharing its unique name.
    output_zip_path = os.path.join(str(output_dir), f"{Path(output_path).name}.zip")

    data = {
        "output_dir": "./output",
        "lang_list": options.lang,
        "backend": options.backend,
        "parse_method": options.method,
        "formula_enable": options.formula_enable,
        "table_enable": options.table_enable,
        "server_url": None,
        "return_md": True,
        "return_middle_json": True,
        "return_model_output": True,
        "return_content_list": True,
        "return_images": True,
        "response_format_zip": True,
        "start_page_id": 0,
        "end_page_id": 99999,
    }

    # A per-request server URL wins over the parser-level default.
    if options.server_url:
        data["server_url"] = options.server_url
    elif self.mineru_server_url:
        data["server_url"] = self.mineru_server_url

    self.logger.info(f"[MinerU] request {data=}")
    self.logger.info(f"[MinerU] request {options=}")

    headers = {"Accept": "application/json"}
    try:
        self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse backend={options.backend} server_url={data.get('server_url')}")
        if callback:
            callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
        with open(pdf_file_path, "rb") as pdf_file:
            files = {"files": (pdf_file_name + ".pdf", pdf_file, "application/pdf")}
            with requests.post(
                url=f"{self.mineru_api}/file_parse",
                files=files,
                data=data,
                headers=headers,
                timeout=1800,
                stream=True,
            ) as response:
                response.raise_for_status()
                content_type = response.headers.get("Content-Type", "")
                if not content_type.startswith("application/zip"):
                    raise RuntimeError(f"[MinerU] not zip returned from api: {content_type}")
                self.logger.info(f"[MinerU] zip file returned, saving to {output_zip_path}...")
                if callback:
                    callback(0.30, f"[MinerU] zip file returned, saving to {output_zip_path}...")
                # Stream the body to disk instead of buffering it in memory.
                with open(output_zip_path, "wb") as f:
                    response.raw.decode_content = True
                    shutil.copyfileobj(response.raw, f)
                self.logger.info(f"[MinerU] Unzip to {output_path}...")
                self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/")
                if callback:
                    callback(0.40, f"[MinerU] Unzip to {output_path}...")
        self.logger.info("[MinerU] Api completed successfully.")
        return Path(output_path)
    except requests.RequestException as e:
        # Chain explicitly so the original HTTP/connection error stays
        # attached as the cause of the RuntimeError callers see.
        raise RuntimeError(f"[MinerU] api failed with exception {e}") from e
|
2025-10-17 09:55:39 +08:00
|
|
|
|
Fix: Remove hardcoded page limits causing parsing failures on large PDFs (>300 pages) (#14382)
### What problem does this PR solve?
Fixes #14196
## Problem
When using DeepDOC to parse large PDFs (over 1000 pages), the parser
silently truncated processing at 300 pages due to a hardcoded default
`page_to=299` in `RAGFlowPdfParser.__images__()`. This caused:
- **Errors** on pages beyond the limit
- **Poor image quality** as the parser attempted to compensate with
missing page data
- **Inconsistent chunk splitting** between full PDF imports and partial
imports
Additionally, the codebase scattered magic numbers (`299`, `600`,
`10000`, `100000`, `100000000`, `10000000000`, `10**9`) across 22 files
as sentinel values for "parse all pages", making future maintenance
error-prone.
## Root Cause
```python
# deepdoc/parser/pdf_parser.py (before)
def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
# Only the first 300 pages were rendered; everything beyond was silently dropped
```
While most callers in `rag/app/*.py` correctly passed `to_page=100000`,
the base class `RAGFlowPdfParser.__call__()` and `parse_into_bboxes()`
invoked `__images__` **without** forwarding `page_from`/`page_to`,
falling back to the restrictive default of 299.
## Solution
### 1. Define constants in `common/constants.py`
```python
MAXIMUM_PAGE_NUMBER = 100000 # Used by the parsing layer
MAXIMUM_TASK_PAGE_NUMBER = MAXIMUM_PAGE_NUMBER * 1000 # Used by the task/DB layer
```
### 2. Replace all hardcoded sentinel values
| Layer | Files Changed | Old Values | New Value |
|---|---|---|---|
| **Deepdoc parsers** | `pdf_parser.py`, `mineru_parser.py`,
`docling_parser.py`, `opendataloader_parser.py`, `paddleocr_parser.py`,
`docx_parser.py` | `299`, `600`, `10**9`, `100000000` |
`MAXIMUM_PAGE_NUMBER` |
| **Chunk parsers** | `naive.py`, `book.py`, `qa.py`, `one.py`,
`manual.py`, `paper.py`, `presentation.py`, `laws.py`, `resume.py`,
`email.py`, `table.py` | `100000`, `10000`, `10000000000` |
`MAXIMUM_PAGE_NUMBER` |
| **Task/DB layer** | `db_models.py`, `task_service.py`,
`document_service.py`, `file_service.py` | `100000000` |
`MAXIMUM_TASK_PAGE_NUMBER` |
### 3. Fix `parse_into_bboxes()` missing parameters
Added `from_page`/`to_page` parameters to `parse_into_bboxes()` so that
the `rag/flow/parser/parser.py` DeepDOC path no longer falls back to the
restrictive default.
## Files Changed (22)
- `common/constants.py`
- `deepdoc/parser/pdf_parser.py`
- `deepdoc/parser/mineru_parser.py`
- `deepdoc/parser/docling_parser.py`
- `deepdoc/parser/opendataloader_parser.py`
- `deepdoc/parser/paddleocr_parser.py`
- `deepdoc/parser/docx_parser.py`
- `rag/app/naive.py`
- `rag/app/book.py`
- `rag/app/qa.py`
- `rag/app/one.py`
- `rag/app/manual.py`
- `rag/app/paper.py`
- `rag/app/presentation.py`
- `rag/app/laws.py`
- `rag/app/resume.py`
- `rag/app/email.py`
- `rag/app/table.py`
- `api/db/db_models.py`
- `api/db/services/task_service.py`
- `api/db/services/document_service.py`
- `api/db/services/file_service.py`
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
---------
Signed-off-by: noob <yixiao121314@outlook.com>
2026-04-27 06:57:20 +00:00
|
|
|
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=MAXIMUM_PAGE_NUMBER, callback=None):
    """Render the requested page range of a PDF into images.

    Args:
        fnm: Path (``str``/``PathLike``) of the PDF, or its raw bytes.
        zoomin: Scale factor; pages are rendered at ``72 * zoomin`` DPI.
        page_from: Index of the first page to render.
        page_to: Slice end index (exclusive) of the pages to render.
        callback: Accepted for interface compatibility; not used here.

    Side effects:
        Sets ``page_from``, ``page_to``, ``pdf``, ``total_page`` and
        ``page_images`` on the instance. On any failure the exception is
        logged, ``page_images`` becomes ``None`` and ``total_page`` 0.
    """
    self.page_from = page_from
    self.page_to = page_to
    try:
        with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf:
            self.pdf = pdf
            # Record the page count on success too; previously it was only
            # assigned (to 0) on the failure path.
            self.total_page = len(pdf.pages)
            self.page_images = [
                page.to_image(resolution=72 * zoomin, antialias=True).original
                for page in pdf.pages[page_from:page_to]
            ]
    except Exception as e:
        self.page_images = None
        self.total_page = 0
        self.logger.exception(e)
|
2025-10-17 09:55:39 +08:00
|
|
|
|
|
|
|
|
def _line_tag(self, bx):
|
|
|
|
|
pn = [bx["page_idx"] + 1]
|
2025-11-19 13:59:32 +08:00
|
|
|
positions = bx.get("bbox", (0, 0, 0, 0))
|
2025-10-17 09:55:39 +08:00
|
|
|
x0, top, x1, bott = positions
|
2026-03-10 09:02:01 +02:00
|
|
|
# Normalize flipped coordinates (MinerU may report inverted bbox for flipped images)
|
|
|
|
|
if x0 > x1:
|
|
|
|
|
x0, x1 = x1, x0
|
|
|
|
|
if top > bott:
|
|
|
|
|
top, bott = bott, top
|
2025-10-17 09:55:39 +08:00
|
|
|
|
|
|
|
|
if hasattr(self, "page_images") and self.page_images and len(self.page_images) > bx["page_idx"]:
|
|
|
|
|
page_width, page_height = self.page_images[bx["page_idx"]].size
|
|
|
|
|
x0 = (x0 / 1000.0) * page_width
|
|
|
|
|
x1 = (x1 / 1000.0) * page_width
|
|
|
|
|
top = (top / 1000.0) * page_height
|
|
|
|
|
bott = (bott / 1000.0) * page_height
|
|
|
|
|
|
|
|
|
|
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), x0, x1, top, bott)
|
|
|
|
|
|
|
|
|
|
def crop(self, text, ZM=1, need_position=False):
|
|
|
|
|
imgs = []
|
|
|
|
|
poss = self.extract_positions(text)
|
|
|
|
|
if not poss:
|
|
|
|
|
if need_position:
|
|
|
|
|
return None, None
|
|
|
|
|
return
|
|
|
|
|
|
2025-11-18 17:01:54 +08:00
|
|
|
if not getattr(self, "page_images", None):
|
|
|
|
|
self.logger.warning("[MinerU] crop called without page images; skipping image generation.")
|
|
|
|
|
if need_position:
|
|
|
|
|
return None, None
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
page_count = len(self.page_images)
|
|
|
|
|
|
|
|
|
|
filtered_poss = []
|
|
|
|
|
for pns, left, right, top, bottom in poss:
|
|
|
|
|
if not pns:
|
|
|
|
|
self.logger.warning("[MinerU] Empty page index list in crop; skipping this position.")
|
|
|
|
|
continue
|
|
|
|
|
valid_pns = [p for p in pns if 0 <= p < page_count]
|
|
|
|
|
if not valid_pns:
|
|
|
|
|
self.logger.warning(f"[MinerU] All page indices {pns} out of range for {page_count} pages; skipping.")
|
|
|
|
|
continue
|
|
|
|
|
filtered_poss.append((valid_pns, left, right, top, bottom))
|
|
|
|
|
|
|
|
|
|
poss = filtered_poss
|
|
|
|
|
if not poss:
|
|
|
|
|
self.logger.warning("[MinerU] No valid positions after filtering; skip cropping.")
|
|
|
|
|
if need_position:
|
|
|
|
|
return None, None
|
|
|
|
|
return
|
|
|
|
|
|
2025-10-17 09:55:39 +08:00
|
|
|
max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
|
|
|
|
|
GAP = 6
|
|
|
|
|
pos = poss[0]
|
2025-11-18 17:01:54 +08:00
|
|
|
first_page_idx = pos[0][0]
|
|
|
|
|
poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
|
2025-10-17 09:55:39 +08:00
|
|
|
pos = poss[-1]
|
2025-11-18 17:01:54 +08:00
|
|
|
last_page_idx = pos[0][-1]
|
|
|
|
|
if not (0 <= last_page_idx < page_count):
|
2025-12-16 07:15:25 +02:00
|
|
|
self.logger.warning(
|
|
|
|
|
f"[MinerU] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
|
2025-11-18 17:01:54 +08:00
|
|
|
if need_position:
|
|
|
|
|
return None, None
|
|
|
|
|
return
|
|
|
|
|
last_page_height = self.page_images[last_page_idx].size[1]
|
|
|
|
|
poss.append(
|
|
|
|
|
(
|
|
|
|
|
[last_page_idx],
|
|
|
|
|
pos[1],
|
|
|
|
|
pos[2],
|
|
|
|
|
min(last_page_height, pos[4] + GAP),
|
|
|
|
|
min(last_page_height, pos[4] + 120),
|
|
|
|
|
)
|
|
|
|
|
)
|
2025-10-17 09:55:39 +08:00
|
|
|
|
|
|
|
|
positions = []
|
|
|
|
|
for ii, (pns, left, right, top, bottom) in enumerate(poss):
|
|
|
|
|
right = left + max_width
|
|
|
|
|
|
|
|
|
|
if bottom <= top:
|
|
|
|
|
bottom = top + 2
|
|
|
|
|
|
|
|
|
|
for pn in pns[1:]:
|
2025-11-18 17:01:54 +08:00
|
|
|
if 0 <= pn - 1 < page_count:
|
|
|
|
|
bottom += self.page_images[pn - 1].size[1]
|
|
|
|
|
else:
|
2025-12-16 07:15:25 +02:00
|
|
|
self.logger.warning(
|
|
|
|
|
f"[MinerU] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")
|
2025-11-18 17:01:54 +08:00
|
|
|
|
|
|
|
|
if not (0 <= pns[0] < page_count):
|
2025-12-16 07:15:25 +02:00
|
|
|
self.logger.warning(
|
|
|
|
|
f"[MinerU] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
|
2025-11-18 17:01:54 +08:00
|
|
|
continue
|
2025-10-17 09:55:39 +08:00
|
|
|
|
|
|
|
|
img0 = self.page_images[pns[0]]
|
|
|
|
|
x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
|
2026-03-10 09:02:01 +02:00
|
|
|
if x0 > x1:
|
|
|
|
|
x0, x1 = x1, x0
|
|
|
|
|
if y0 > y1:
|
|
|
|
|
y0, y1 = y1, y0
|
|
|
|
|
if x1 <= x0 or y1 <= y0:
|
|
|
|
|
continue
|
2025-10-17 09:55:39 +08:00
|
|
|
crop0 = img0.crop((x0, y0, x1, y1))
|
|
|
|
|
imgs.append(crop0)
|
|
|
|
|
if 0 < ii < len(poss) - 1:
|
|
|
|
|
positions.append((pns[0] + self.page_from, x0, x1, y0, y1))
|
|
|
|
|
|
|
|
|
|
bottom -= img0.size[1]
|
|
|
|
|
for pn in pns[1:]:
|
2025-11-18 17:01:54 +08:00
|
|
|
if not (0 <= pn < page_count):
|
2025-12-16 07:15:25 +02:00
|
|
|
self.logger.warning(
|
|
|
|
|
f"[MinerU] Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
|
2025-11-18 17:01:54 +08:00
|
|
|
continue
|
2025-10-17 09:55:39 +08:00
|
|
|
page = self.page_images[pn]
|
|
|
|
|
x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
|
2026-03-10 09:02:01 +02:00
|
|
|
if x0 > x1:
|
|
|
|
|
x0, x1 = x1, x0
|
|
|
|
|
if y0 > y1:
|
|
|
|
|
y0, y1 = y1, y0
|
|
|
|
|
if x1 <= x0 or y1 <= y0:
|
|
|
|
|
bottom -= page.size[1]
|
|
|
|
|
continue
|
2025-10-17 09:55:39 +08:00
|
|
|
cimgp = page.crop((x0, y0, x1, y1))
|
|
|
|
|
imgs.append(cimgp)
|
|
|
|
|
if 0 < ii < len(poss) - 1:
|
|
|
|
|
positions.append((pn + self.page_from, x0, x1, y0, y1))
|
|
|
|
|
bottom -= page.size[1]
|
|
|
|
|
|
|
|
|
|
if not imgs:
|
|
|
|
|
if need_position:
|
|
|
|
|
return None, None
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
height = 0
|
|
|
|
|
for img in imgs:
|
|
|
|
|
height += img.size[1] + GAP
|
|
|
|
|
height = int(height)
|
|
|
|
|
width = int(np.max([i.size[0] for i in imgs]))
|
|
|
|
|
pic = Image.new("RGB", (width, height), (245, 245, 245))
|
|
|
|
|
height = 0
|
|
|
|
|
for ii, img in enumerate(imgs):
|
|
|
|
|
if ii == 0 or ii + 1 == len(imgs):
|
|
|
|
|
img = img.convert("RGBA")
|
|
|
|
|
overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
|
|
|
|
|
overlay.putalpha(128)
|
|
|
|
|
img = Image.alpha_composite(img, overlay).convert("RGB")
|
|
|
|
|
pic.paste(img, (0, int(height)))
|
|
|
|
|
height += img.size[1] + GAP
|
|
|
|
|
|
|
|
|
|
if need_position:
|
|
|
|
|
return pic, positions
|
|
|
|
|
return pic
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def extract_positions(txt: str):
|
|
|
|
|
poss = []
|
|
|
|
|
for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
|
|
|
|
|
pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
|
|
|
|
|
left, right, top, bottom = float(left), float(right), float(top), float(bottom)
|
|
|
|
|
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
|
|
|
|
|
return poss
|
|
|
|
|
|
2025-12-16 07:15:25 +02:00
|
|
|
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[
    dict[str, Any]]:
    """Locate MinerU's content-list JSON for ``file_stem`` and load it.

    The search proceeds in three stages and stops at the first hit:

    1. Fixed locations: ``<stem>_content_list.json`` directly in
       ``output_dir`` (raw stem, then sanitized stem), inside the sanitized
       stem's own sub-directory, and inside ``vlm/`` (raw, then sanitized).
    2. A recursive search for the expected file names inside the directory
       implied by ``backend`` and ``method``: ``<method>`` for ``pipeline*``
       backends, ``hybrid_<method>`` for ``hybrid*`` and ``vlm`` for ``vlm*``.
    3. Progressively broader recursive patterns, ending with any
       ``content_list.json`` / ``*_content_list.json``. A
       ``*_content_list.json`` hit is accepted only when its name starts with
       the stem or one of its path components equals the stem.

    Stems are glob-escaped before being placed in a pattern, so a file name
    containing ``[``, ``*`` or ``?`` is matched literally instead of being
    read as a wildcard (which could select another document's output).

    Args:
        output_dir: Directory MinerU wrote its results to.
        file_stem: PDF file name without extension.
        method: MinerU parse method; only used to derive the stage-2 directory.
        backend: MinerU backend name; only its prefix is inspected.

    Returns:
        The parsed JSON content list. Non-empty ``img_path``,
        ``table_img_path`` and ``equation_img_path`` values are rewritten to
        absolute paths resolved against the directory the JSON was found in
        (``output_dir`` itself for the first two fixed locations).

    Raises:
        FileNotFoundError: no content list was found; the message lists the
            paths that were tried.
    """
    # Function-scope import: only this method needs it.
    from glob import escape as glob_escape

    # mirror MinerU's sanitize_filename to align ZIP naming
    def _sanitize_filename(name: str) -> str:
        sanitized = re.sub(r"[/\\\.]{2,}|[/\\]", "", name)
        sanitized = re.sub(r"[^\w.-]", "_", sanitized, flags=re.UNICODE)
        if sanitized.startswith("."):
            sanitized = "_" + sanitized[1:]
        return sanitized or "unnamed"

    safe_stem = _sanitize_filename(file_stem)
    # dict.fromkeys de-duplicates while keeping order when both stems are equal.
    content_names = tuple(dict.fromkeys((f"{file_stem}_content_list.json", f"{safe_stem}_content_list.json")))
    self.logger.info(f"[MinerU] Expected output files: {', '.join(sorted(set(content_names)))}")
    self.logger.info(f"[MinerU] Searching output in: {output_dir}")

    json_file = None
    subdir = None
    attempted = []

    # Stage 1: fixed locations as (log label, candidate path, base dir for image paths).
    vlm_dir = output_dir / "vlm"  # used by the vlm-http-client backend
    direct_candidates = (
        ("original path", output_dir / f"{file_stem}_content_list.json", output_dir),
        ("sanitized filename", output_dir / f"{safe_stem}_content_list.json", output_dir),
        ("sanitized nested path", output_dir / safe_stem / f"{safe_stem}_content_list.json", output_dir / safe_stem),
        ("vlm subdirectory", vlm_dir / f"{file_stem}_content_list.json", vlm_dir),
        ("vlm subdirectory with sanitized name", vlm_dir / f"{safe_stem}_content_list.json", vlm_dir),
    )
    for label, path, base_dir in direct_candidates:
        self.logger.info(f"[MinerU] Trying {label}: {path}")
        attempted.append(path)
        if path.exists():
            subdir = base_dir
            json_file = path
            break

    if not json_file:
        # Stage 2: look inside the directory named after the parse method / backend.
        if backend.startswith("pipeline"):
            parse_subdir = method
        elif backend.startswith("hybrid"):
            parse_subdir = f"hybrid_{method}"
        elif backend.startswith("vlm"):
            parse_subdir = "vlm"
        else:
            parse_subdir = None

        if parse_subdir:
            for content_name in content_names:
                pattern = f"**/{parse_subdir}/{glob_escape(content_name)}"
                for candidate in output_dir.glob(pattern):
                    self.logger.info(f"[MinerU] Trying parse-method path: {candidate}")
                    attempted.append(candidate)
                    subdir = candidate.parent
                    json_file = candidate
                    break
                if json_file:
                    break

        if not json_file:
            # Stage 3: broader and broader patterns, most specific first.
            stem_dirs = tuple(dict.fromkeys((file_stem, safe_stem)))
            escaped_stem_dirs = tuple(glob_escape(stem_dir) for stem_dir in stem_dirs)
            patterns = []
            if parse_subdir:
                for stem_pattern in escaped_stem_dirs:
                    patterns.extend(
                        [
                            f"**/{stem_pattern}/{parse_subdir}/content_list.json",
                            f"**/{stem_pattern}/{parse_subdir}/*_content_list.json",
                        ]
                    )
                patterns.extend(
                    [
                        f"**/{parse_subdir}/content_list.json",
                        f"**/{parse_subdir}/*_content_list.json",
                    ]
                )
            for stem_pattern in escaped_stem_dirs:
                patterns.extend(
                    [
                        f"**/{stem_pattern}/content_list.json",
                        f"**/{stem_pattern}/*_content_list.json",
                    ]
                )
            patterns.extend(["**/content_list.json", "**/*_content_list.json"])

            for pattern in patterns:
                # sorted() makes the pick deterministic when several files match.
                for candidate in sorted(output_dir.glob(pattern)):
                    self.logger.info(f"[MinerU] Trying fallback path: {candidate}")
                    if candidate.name.endswith("_content_list.json"):
                        # A prefixed file may belong to another document: require
                        # the stem in the file name or in one of the path components.
                        rel_parts = candidate.relative_to(output_dir).parts
                        in_stem_dir = any(stem_dir in rel_parts for stem_dir in stem_dirs)
                        stem_match = candidate.stem.startswith(file_stem) or candidate.stem.startswith(safe_stem)
                        if not (stem_match or in_stem_dir):
                            self.logger.info(f"[MinerU] Skip unrelated fallback candidate: {candidate}")
                            continue
                    attempted.append(candidate)
                    subdir = candidate.parent
                    json_file = candidate
                    break
                if json_file:
                    break

    if not json_file:
        raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(p) for p in attempted)}")

    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Image paths in the JSON are relative to the JSON's directory; make them absolute.
    for item in data:
        for key in ("img_path", "table_img_path", "equation_img_path"):
            if key in item and item[key]:
                item[key] = str((subdir / item[key]).resolve())
    return data
|
|
|
|
|
|
2026-06-17 13:06:07 +08:00
|
|
|
def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: Optional[str] = None, table_enable: bool = False):
    """Turn MinerU content-list items into section tuples.

    Text is assembled per item type:

    * TEXT / EQUATION: the ``text`` field.
    * TABLE: ``table_body`` followed by the newline-joined captions and the
      newline-joined footnotes; a blank result becomes
      ``"FAILED TO PARSE TABLE"``.
    * IMAGE: captions, a newline, footnotes; a non-empty
      ``vlm_description`` is appended on its own line (or used alone when
      caption and footnote are blank).
    * CODE: ``code_body`` followed by the newline-joined captions.
    * LIST: the newline-joined ``list_items``.
    * HEADER / FOOTER / PAGE_NUMBER / DISCARDED and unknown types: skipped.

    Args:
        outputs: Items as returned by ``_read_output``.
        parse_method: Selects the shape of each returned tuple (see Returns).
        table_enable: When false, each section is passed through
            ``self._sanitize_section_text`` before use.

    Returns:
        A list with one tuple per kept item:
        ``(text, type, line_tag)`` for ``"manual"`` / ``"pipeline"``,
        ``(text + line_tag, type)`` for ``"paper"``,
        ``(text, line_tag)`` otherwise.
    """
    sections = []
    for output in outputs:
        match output.get("type"):
            case MinerUContentType.TEXT:
                section = output.get("text", "")
            case MinerUContentType.TABLE:
                section = output.get("table_body", "") + "\n".join(output.get("table_caption", [])) + "\n".join(
                    output.get("table_footnote", []))
                if not section.strip():
                    section = "FAILED TO PARSE TABLE"
            case MinerUContentType.IMAGE:
                section = "".join(output.get("image_caption", [])) + "\n" + "".join(
                    output.get("image_footnote", []))
                # If a vision model enriched this image with a semantic
                # description (see _enhance_images_with_vlm), embed it in
                # the chunk so it becomes searchable / retrievable.
                vlm_description = (output.get("vlm_description") or "").strip()
                if vlm_description:
                    section = (section.strip("\n") + "\n" + vlm_description).strip("\n") if section.strip() else vlm_description
            case MinerUContentType.EQUATION:
                section = output.get("text", "")
            case MinerUContentType.CODE:
                section = output.get("code_body", "") + "\n".join(output.get("code_caption", []))
            case MinerUContentType.LIST:
                section = "\n".join(output.get("list_items", []))
            case (
                MinerUContentType.HEADER
                | MinerUContentType.FOOTER
                | MinerUContentType.PAGE_NUMBER
                | MinerUContentType.DISCARDED
            ):
                # Page furniture and discarded blocks never become sections.
                continue
            case _:
                self.logger.debug("[MinerU] Skip unsupported section type=%s", output.get("type"))
                continue

        # NOTE(review): sanitization is skipped when table_enable is true,
        # presumably to keep table markup intact - confirm against
        # _sanitize_section_text.
        if not table_enable:
            section = self._sanitize_section_text(section)
        # NOTE(review): the nesting of this guard was reconstructed from a view
        # without indentation; it is assumed to sit at loop level - confirm.
        if not section:
            self.logger.debug("[MinerU] Skip section after sanitization: type=%s", output.get("type"))
            continue

        # The tuple layout depends on the consumer selected by parse_method.
        if section and parse_method in {"manual", "pipeline"}:
            sections.append((section, output["type"], self._line_tag(output)))
        elif section and parse_method == "paper":
            sections.append((section + self._line_tag(output), output["type"]))
        else:
            sections.append((section, self._line_tag(output)))
    return sections
|
|
|
|
|
|
|
|
|
|
def _transfer_to_tables(self, outputs: list[dict[str, Any]]):
    """Return the table entries for *outputs*, which is always an empty list.

    *outputs* is accepted but never read.
    """
    tables = []
    return tables
|
|
|
|
|
|
Feat: VLM image descriptions in MinerU parser (#14869) (#14946)
## Summary
Closes #14869.
Adds VLM-based semantic descriptions to **image chunks produced by the
MinerU parser**, closing a long-standing parity gap with the deepdoc
parser's `VisionFigureParser`. A maintainer flagged this in #13342
("We may add the VLM enhancement to MinerU parser as well") and an
earlier proposal exists in #13824; this PR lands the change end-to-end
inside the existing parser plumbing.
## Why
Today the MinerU parser returns image chunks containing only the
native `image_caption` and `image_footnote` strings from MinerU's
JSON. When neither is present (or when both are sparse), the chunk
carries effectively no searchable content for the figure and
retrieval misses it entirely. Users who configured a local VLM
(reporter's case: Gemma-4-31B) had to post-process MinerU's
`tmp/*.json` themselves.
The deepdoc parser already solves this via
[`VisionFigureParser`](deepdoc/parser/figure_parser.py): when the
tenant has an `IMAGE2TEXT` model configured, each figure gets a
semantic description merged into its chunk. This PR brings the same
behavior to MinerU.
## What changed
### `deepdoc/parser/mineru_parser.py`
- **New method `_enhance_images_with_vlm(outputs, vision_model,
callback=None)`** —
collects every `IMAGE` block with a readable `img_path`, runs
`rag.app.picture.vision_llm_chunk` in a 10-worker
`ThreadPoolExecutor` using the existing
`vision_llm_figure_describe_prompt`, and writes the result back as
`vlm_description`. Per-image failures are logged and skipped — they
never abort the run.
- **`_transfer_to_sections` (IMAGE branch)** — folds
`vlm_description` into the section text alongside caption +
footnote, so the description becomes part of the chunk and is
searchable / retrievable.
- **`parse_pdf`** — after `_read_output`, calls
`_enhance_images_with_vlm(outputs, vision_model, callback=callback)`
when a `vision_model` kwarg is supplied. Wrapped in `try / except`
so a VLM outage cannot break parsing.
### `rag/app/naive.py` (`by_mineru`)
After successfully resolving the MinerU OCR parser, also resolves the
tenant's default `LLMType.IMAGE2TEXT` model via
`get_tenant_default_model_by_type`, wraps it in an `LLMBundle`, and
injects it as `kwargs["vision_model"]` before delegating to
`parse_pdf`.
## Behavior
| Tenant config | Behavior |
|---|---|
| `IMAGE2TEXT` model configured | MinerU image chunks contain `caption +
footnote + VLM description`. Retrieval against figures now actually
works. |
| No `IMAGE2TEXT` model configured | Exact same output as today (caption
+ footnote only). Lookup fails silently with an info log; no error, no
regression. |
| VLM call fails for a single image | That image silently falls back to
caption + footnote; other images proceed. |
| Caller already passes `vision_model` in kwargs | We don't override it
— `if "vision_model" not in kwargs` guards the lookup. |
## Files
- `deepdoc/parser/mineru_parser.py` (+56)
- `rag/app/naive.py` (+13)
2026-05-19 01:08:10 -07:00
|
|
|
def _enhance_images_with_vlm(self, outputs: list[dict[str, Any]], vision_model, callback: Optional[Callable] = None):
    """Attach a vision-model description to image items, in place.

    Every item of *outputs* whose type is ``MinerUContentType.IMAGE`` and
    whose ``img_path`` exists on disk is described by ``vision_llm_chunk``
    using the figure-description prompt. A non-empty result is stored under
    the item's ``vlm_description`` key, which ``_transfer_to_sections`` then
    folds into the section text.

    Args:
        outputs: Content-list items; mutated in place.
        vision_model: Model object handed to ``vision_llm_chunk``.
        callback: Optional progress callback, called once as
            ``callback(0.78, message)`` before the descriptions are generated.

    Returns:
        None.

    A failure on one image is logged as a warning and leaves that item
    unchanged; it does not stop the remaining images.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed
    from rag.app.picture import vision_llm_chunk
    from rag.prompts.generator import vision_llm_figure_describe_prompt

    image_jobs = [
        (idx, item)
        for idx, item in enumerate(outputs)
        if item.get("type") == MinerUContentType.IMAGE
        and item.get("img_path")
        and os.path.exists(item["img_path"])
    ]
    if not image_jobs:
        return

    if callback:
        callback(0.78, f"[MinerU] Generating VLM descriptions for {len(image_jobs)} images...")

    prompt = vision_llm_figure_describe_prompt()

    def worker(idx, item):
        # Never raises: any failure is reported as an empty description.
        try:
            with Image.open(item["img_path"]) as img:
                # Force the pixel data to be read while the file is still open.
                img.load()
                desc = vision_llm_chunk(binary=img, vision_model=vision_model, prompt=prompt)
            return idx, (desc or "").strip()
        except Exception as e:
            # Use the parser's own logger, like the rest of this class,
            # instead of the root logger.
            self.logger.warning("[MinerU] VLM description failed for image #%s: %s", idx, e)
            return idx, ""

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(worker, idx, item) for idx, item in image_jobs]
        # Results are written back here, in the calling thread, as they complete.
        for fut in as_completed(futures):
            idx, desc = fut.result()
            if desc:
                outputs[idx]["vlm_description"] = desc
|
|
|
|
|
|
2025-10-17 09:55:39 +08:00
|
|
|
def parse_pdf(
    self,
    filepath: str | PathLike[str],
    binary: BytesIO | bytes,
    callback: Optional[Callable] = None,
    *,
    output_dir: Optional[str] = None,
    backend: str = "pipeline",
    server_url: Optional[str] = None,
    delete_output: bool = True,
    parse_method: str = "raw",
    **kwargs,
) -> tuple:
    """Run MinerU on a PDF and convert its output into sections and tables.

    The PDF is taken from ``binary`` when it carries data (it is then written
    to a private temporary directory); otherwise ``filepath`` is used
    directly. Spaces are stripped from the file name in both cases. When
    parsing from ``filepath`` and the sanitized name differs, the file on
    disk is renamed in place.

    Args:
        filepath: Path of the PDF. Always used to derive the working file
            name; only read from disk when ``binary`` is empty.
        binary: PDF content as ``bytes`` or ``BytesIO``. Falsy or empty
            content falls back to ``filepath``.
        callback: Optional progress reporter called as
            ``callback(progress, message)``; ``-1`` signals a missing PDF.
        output_dir: Directory for MinerU output. When omitted, a temporary
            directory is created (and removed afterwards if
            ``delete_output`` is true).
        backend: Value converted to ``MinerUBackend``.
        server_url: Forwarded to ``MinerUParseOptions``.
        delete_output: Whether a temporary output directory created here is
            deleted on exit. A caller-supplied ``output_dir`` is never
            deleted by this method.
        parse_method: Forwarded to ``MinerUParseOptions`` and to
            ``_transfer_to_sections``.
        **kwargs: Recognized keys are ``parser_config`` (dict with the
            ``mineru_*`` settings), ``lang`` and ``vision_model``.

    Returns:
        A 2-tuple ``(sections, tables)`` as produced by
        ``_transfer_to_sections`` and ``_transfer_to_tables``.

    Raises:
        FileNotFoundError: If no binary content was given and the
            (sanitized) file path does not exist.
    """
    # Normalize the payload once: a BytesIO cannot be handed to file.write(),
    # and getvalue() does not disturb the stream position for other readers.
    payload = binary.getvalue() if isinstance(binary, BytesIO) else binary

    # Extract outlines from the same source that will actually be parsed.
    self.outlines = extract_pdf_outlines(binary if payload else filepath)

    # `or {}` also covers callers that pass parser_config=None explicitly.
    parser_cfg = kwargs.get("parser_config") or {}
    lang = parser_cfg.get("mineru_lang") or kwargs.get("lang", "English")
    mineru_lang_code = LANGUAGE_TO_MINERU_MAP.get(lang, "ch")  # Defaults to Chinese if not matched
    mineru_method_raw_str = parser_cfg.get("mineru_parse_method", "auto")
    enable_formula = parser_cfg.get("mineru_formula_enable", True)
    enable_table = parser_cfg.get("mineru_table_enable", True)

    # remove spaces, or mineru crash, and _read_output fail too
    file_path = Path(filepath)
    pdf_file_name = file_path.stem.replace(" ", "") + ".pdf"
    pdf_file_path_valid = os.path.join(file_path.parent, pdf_file_name)

    temp_dir: Optional[Path] = None
    out_dir: Optional[Path] = None
    created_tmp_dir = False

    # Everything that may create temporary directories runs inside the try
    # so the finally clause can clean up even when an early step fails.
    try:
        if payload:
            temp_dir = Path(tempfile.mkdtemp(prefix="mineru_bin_pdf_"))
            temp_pdf = temp_dir / pdf_file_name
            with open(temp_pdf, "wb") as f:
                f.write(payload)
            pdf = temp_pdf
            self.logger.info(f"[MinerU] Received binary PDF -> {temp_pdf}")
            if callback:
                callback(0.15, f"[MinerU] Received binary PDF -> {temp_pdf}")
        else:
            # Compare as paths: a str-vs-PathLike (or "./a.pdf" vs "a.pdf")
            # comparison would always differ and trigger a pointless move.
            if Path(pdf_file_path_valid) != file_path:
                self.logger.info(f"[MinerU] Remove all space in file name: {pdf_file_path_valid}")
                shutil.move(filepath, pdf_file_path_valid)
            pdf = Path(pdf_file_path_valid)
            if not pdf.exists():
                if callback:
                    callback(-1, f"[MinerU] PDF not found: {pdf}")
                raise FileNotFoundError(f"[MinerU] PDF not found: {pdf}")

        if output_dir:
            out_dir = Path(output_dir)
            out_dir.mkdir(parents=True, exist_ok=True)
        else:
            out_dir = Path(tempfile.mkdtemp(prefix="mineru_pdf_"))
            created_tmp_dir = True

        self.logger.info(f"[MinerU] Output directory: {out_dir} backend={backend} api={self.mineru_api} server_url={server_url or self.mineru_server_url}")
        if callback:
            callback(0.15, f"[MinerU] Output directory: {out_dir}")

        self.__images__(pdf, zoomin=1)

        options = MinerUParseOptions(
            backend=MinerUBackend(backend),
            lang=MinerULanguage(mineru_lang_code),
            method=MinerUParseMethod(mineru_method_raw_str),
            server_url=server_url,
            delete_output=delete_output,
            parse_method=parse_method,
            formula_enable=enable_formula,
            table_enable=enable_table,
        )
        final_out_dir = self._run_mineru(pdf, out_dir, options, callback=callback)
        outputs = self._read_output(final_out_dir, pdf.stem, method=mineru_method_raw_str, backend=backend)
        self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
        if callback:
            callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")

        # Optional enrichment: a failure here must never abort parsing, so
        # it is logged and the blocks are used without descriptions.
        vision_model = kwargs.get("vision_model")
        if vision_model is not None:
            try:
                self._enhance_images_with_vlm(outputs, vision_model, callback=callback)
            except Exception as e:
                self.logger.warning(f"[MinerU] VLM image enhancement failed: {e}. Continuing without descriptions.")

        return self._transfer_to_sections(outputs, parse_method, enable_table), self._transfer_to_tables(outputs)
    finally:
        # Cleanup is best-effort: problems are logged, never raised, so they
        # cannot mask the parse result or the original exception.
        if temp_dir is not None and temp_dir.exists():
            try:
                # rmtree instead of unlink()+rmdir(): the directory is ours,
                # and rmdir() would fail if anything else was written there.
                shutil.rmtree(temp_dir)
            except Exception as e:
                self.logger.warning(f"[MinerU] Failed to remove temporary PDF directory {temp_dir}: {e}")
        if delete_output and created_tmp_dir and out_dir is not None and out_dir.exists():
            try:
                shutil.rmtree(out_dir)
            except Exception as e:
                self.logger.warning(f"[MinerU] Failed to remove temporary output directory {out_dir}: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: pass the PDF to parse as the first command-line
    # argument. (A hard-coded empty path would make open() fail every time.)
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <path-to-pdf>")

    parser = MinerUParser("mineru")
    ok, reason = parser.check_installation()
    print("MinerU available:", ok)
    if not ok:
        # Surface why the check failed instead of discarding the reason.
        print("Reason:", reason)

    filepath = sys.argv[1]
    with open(filepath, "rb") as file:
        pdf_bytes = file.read()

    # parse_pdf returns a (sections, tables) tuple; print each part.
    outputs = parser.parse_pdf(filepath=filepath, binary=pdf_bytes)
    for output in outputs:
        print(output)
|