mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
Fix: Remove hardcoded page limits causing parsing failures on large PDFs (>300 pages) (#14382)
### What problem does this PR solve?

Fixes #14196

## Problem

When using DeepDOC to parse large PDFs (over 1000 pages), the parser silently truncated processing at 300 pages due to a hardcoded default `page_to=299` in `RAGFlowPdfParser.__images__()`. This caused:

- **Errors** on pages beyond the limit
- **Poor image quality** as the parser attempted to compensate with missing page data
- **Inconsistent chunk splitting** between full PDF imports and partial imports

Additionally, the codebase scattered magic numbers (`299`, `600`, `10000`, `100000`, `100000000`, `10000000000`, `10**9`) across 22 files as sentinel values for "parse all pages", making future maintenance error-prone.

## Root Cause

```python
# deepdoc/parser/pdf_parser.py (before)
def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
    # Only the first 300 pages were rendered; everything beyond was silently dropped
```

While most callers in `rag/app/*.py` correctly passed `to_page=100000`, the base class `RAGFlowPdfParser.__call__()` and `parse_into_bboxes()` invoked `__images__` **without** forwarding `page_from`/`page_to`, falling back to the restrictive default of 299.

## Solution

### 1. Define constants in `common/constants.py`

```python
MAXIMUM_PAGE_NUMBER = 100000  # Used by the parsing layer
MAXIMUM_TASK_PAGE_NUMBER = MAXIMUM_PAGE_NUMBER * 1000  # Used by the task/DB layer
```

### 2. Replace all hardcoded sentinel values

| Layer | Files Changed | Old Values | New Value |
|---|---|---|---|
| **Deepdoc parsers** | `pdf_parser.py`, `mineru_parser.py`, `docling_parser.py`, `opendataloader_parser.py`, `paddleocr_parser.py`, `docx_parser.py` | `299`, `600`, `10**9`, `100000000` | `MAXIMUM_PAGE_NUMBER` |
| **Chunk parsers** | `naive.py`, `book.py`, `qa.py`, `one.py`, `manual.py`, `paper.py`, `presentation.py`, `laws.py`, `resume.py`, `email.py`, `table.py` | `100000`, `10000`, `10000000000` | `MAXIMUM_PAGE_NUMBER` |
| **Task/DB layer** | `db_models.py`, `task_service.py`, `document_service.py`, `file_service.py` | `100000000` | `MAXIMUM_TASK_PAGE_NUMBER` |

### 3. Fix `parse_into_bboxes()` missing parameters

Added `from_page`/`to_page` parameters to `parse_into_bboxes()` so that the `rag/flow/parser/parser.py` DeepDOC path no longer falls back to the restrictive default.

## Files Changed (22)

- `common/constants.py`
- `deepdoc/parser/pdf_parser.py`
- `deepdoc/parser/mineru_parser.py`
- `deepdoc/parser/docling_parser.py`
- `deepdoc/parser/opendataloader_parser.py`
- `deepdoc/parser/paddleocr_parser.py`
- `deepdoc/parser/docx_parser.py`
- `rag/app/naive.py`
- `rag/app/book.py`
- `rag/app/qa.py`
- `rag/app/one.py`
- `rag/app/manual.py`
- `rag/app/paper.py`
- `rag/app/presentation.py`
- `rag/app/laws.py`
- `rag/app/resume.py`
- `rag/app/email.py`
- `rag/app/table.py`
- `api/db/db_models.py`
- `api/db/services/task_service.py`
- `api/db/services/document_service.py`
- `api/db/services/file_service.py`

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring

---------

Signed-off-by: noob <yixiao121314@outlook.com>
This commit is contained in:
@@ -30,10 +30,12 @@ import pdfplumber
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
from common.constants import MAXIMUM_PAGE_NUMBER
|
||||
|
||||
try:
|
||||
from docling.document_converter import DocumentConverter
|
||||
except Exception:
|
||||
DocumentConverter = None
|
||||
DocumentConverter = None
|
||||
|
||||
try:
|
||||
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||||
@@ -125,7 +127,7 @@ class DoclingParser(RAGFlowPdfParser):
|
||||
self.logger.error(f"[Docling] init DocumentConverter failed: {e}")
|
||||
return False
|
||||
|
||||
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
|
||||
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=MAXIMUM_PAGE_NUMBER, callback=None):
|
||||
self.page_from = page_from
|
||||
self.page_to = page_to
|
||||
bytes_io = None
|
||||
|
||||
@@ -21,6 +21,7 @@ from collections import Counter
|
||||
from rag.nlp import rag_tokenizer
|
||||
from io import BytesIO
|
||||
import logging
|
||||
from common.constants import MAXIMUM_PAGE_NUMBER
|
||||
from docx.image.exceptions import (
|
||||
InvalidImageStreamError,
|
||||
UnexpectedEndOfFileError,
|
||||
@@ -158,7 +159,7 @@ class RAGFlowDocxParser:
|
||||
return lines
|
||||
return ["\n".join(lines)]
|
||||
|
||||
def __call__(self, fnm, from_page=0, to_page=100000000):
|
||||
def __call__(self, fnm, from_page=0, to_page=MAXIMUM_PAGE_NUMBER):
|
||||
self.doc = Document(fnm) if isinstance(
|
||||
fnm, str) else Document(BytesIO(fnm))
|
||||
pn = 0 # parsed page
|
||||
|
||||
@@ -37,6 +37,8 @@ from strenum import StrEnum
|
||||
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||||
from deepdoc.parser.utils import extract_pdf_outlines
|
||||
|
||||
from common.constants import MAXIMUM_PAGE_NUMBER
|
||||
|
||||
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
|
||||
if LOCK_KEY_pdfplumber not in sys.modules:
|
||||
sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
|
||||
@@ -320,7 +322,7 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
except requests.RequestException as e:
|
||||
raise RuntimeError(f"[MinerU] api failed with exception {e}")
|
||||
|
||||
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
|
||||
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=MAXIMUM_PAGE_NUMBER, callback=None):
|
||||
self.page_from = page_from
|
||||
self.page_to = page_to
|
||||
try:
|
||||
|
||||
@@ -15,6 +15,8 @@ import pdfplumber
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
from common.constants import MAXIMUM_PAGE_NUMBER
|
||||
|
||||
try:
|
||||
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||||
except Exception:
|
||||
@@ -153,7 +155,7 @@ class OpenDataLoaderParser(RAGFlowPdfParser):
|
||||
self.logger.warning(f"[OpenDataLoader] Health check failed: {exc}")
|
||||
return False
|
||||
|
||||
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
|
||||
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=MAXIMUM_PAGE_NUMBER, callback=None):
|
||||
self.page_from = page_from
|
||||
self.page_to = page_to
|
||||
bytes_io = None
|
||||
|
||||
@@ -29,6 +29,8 @@ import pdfplumber
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
from common.constants import MAXIMUM_PAGE_NUMBER
|
||||
|
||||
try:
|
||||
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||||
except Exception:
|
||||
@@ -425,7 +427,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
||||
"""Convert API response to table tuples."""
|
||||
return []
|
||||
|
||||
def __images__(self, fnm, page_from=0, page_to=10**9, callback=None):
|
||||
def __images__(self, fnm, page_from=0, page_to=MAXIMUM_PAGE_NUMBER, callback=None):
|
||||
"""Generate page images from PDF for cropping."""
|
||||
self.page_from = page_from
|
||||
self.page_to = page_to
|
||||
|
||||
@@ -37,6 +37,7 @@ from pypdf import PdfReader as pdf2_read
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.metrics import silhouette_score
|
||||
|
||||
from common.constants import MAXIMUM_PAGE_NUMBER
|
||||
from common.file_utils import get_project_base_directory
|
||||
from deepdoc.vision import OCR, AscendLayoutRecognizer, LayoutRecognizer, Recognizer, TableStructureRecognizer
|
||||
from rag.nlp import rag_tokenizer
|
||||
@@ -1521,7 +1522,7 @@ class RAGFlowPdfParser:
|
||||
except Exception:
|
||||
logging.exception("total_page_number")
|
||||
|
||||
def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
|
||||
def __images__(self, fnm, zoomin=3, page_from=0, page_to=MAXIMUM_PAGE_NUMBER, callback=None):
|
||||
self.lefted_chars = []
|
||||
self.mean_height = []
|
||||
self.mean_width = []
|
||||
@@ -1541,7 +1542,7 @@ class RAGFlowPdfParser:
|
||||
self.page_chars = [[c for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
|
||||
except Exception as e:
|
||||
logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}")
|
||||
self.page_chars = [[] for _ in range(page_to - page_from)] # If failed to extract, using empty list instead.
|
||||
self.page_chars = [[] for _ in range(len(self.page_images))] # If failed to extract, using empty list instead.
|
||||
|
||||
# Detect garbled pages and clear their chars so the OCR
|
||||
# path will be used instead. Two detection strategies:
|
||||
@@ -1694,10 +1695,10 @@ class RAGFlowPdfParser:
|
||||
tbls = self._extract_table_figure(need_image, zoomin, return_html, False)
|
||||
return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
|
||||
|
||||
def parse_into_bboxes(self, fnm, callback=None, zoomin=3):
|
||||
def parse_into_bboxes(self, fnm, callback=None, zoomin=3, from_page=0, to_page=MAXIMUM_PAGE_NUMBER):
|
||||
start = timer()
|
||||
self.outlines = extract_pdf_outlines(fnm)
|
||||
self.__images__(fnm, zoomin, callback=callback)
|
||||
self.__images__(fnm, zoomin, from_page, to_page, callback=callback)
|
||||
if callback:
|
||||
callback(0.40, "OCR finished ({:.2f}s)".format(timer() - start))
|
||||
|
||||
@@ -1943,7 +1944,7 @@ class RAGFlowPdfParser:
|
||||
|
||||
|
||||
class PlainParser:
|
||||
def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
|
||||
def __call__(self, filename, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, **kwargs):
|
||||
lines = []
|
||||
try:
|
||||
self.pdf = pdf2_read(filename if isinstance(filename, str) else BytesIO(filename))
|
||||
@@ -1969,7 +1970,7 @@ class VisionParser(RAGFlowPdfParser):
|
||||
self.vision_model = vision_model
|
||||
self.outlines = []
|
||||
|
||||
def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
|
||||
def __images__(self, fnm, zoomin=3, page_from=0, page_to=MAXIMUM_PAGE_NUMBER, callback=None):
|
||||
try:
|
||||
with sys.modules[LOCK_KEY_pdfplumber]:
|
||||
self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
|
||||
@@ -1980,7 +1981,7 @@ class VisionParser(RAGFlowPdfParser):
|
||||
self.total_page = 0
|
||||
logging.exception("VisionParser __images__")
|
||||
|
||||
def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
|
||||
def __call__(self, filename, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, **kwargs):
|
||||
callback = kwargs.get("callback", lambda prog, msg: None)
|
||||
zoomin = kwargs.get("zoomin", 3)
|
||||
self.__images__(fnm=filename, zoomin=zoomin, page_from=from_page, page_to=to_page, callback=callback)
|
||||
|
||||
Reference in New Issue
Block a user