Fix: Remove hardcoded page limits causing parsing failures on large PDFs (>300 pages) (#14382)

### What problem does this PR solve?

Fixes #14196

## Problem

When using DeepDOC to parse large PDFs (e.g. over 1000 pages), the parser
silently truncated processing at about 300 pages because of a hardcoded
default `page_to=299` in `RAGFlowPdfParser.__images__()`. This caused:

- **Errors** on pages beyond the limit
- **Poor image quality** as the parser attempted to compensate for
missing page data
- **Inconsistent chunk splitting** between full PDF imports and partial
imports

Additionally, magic numbers (`299`, `600`, `10000`, `100000`,
`100000000`, `10000000000`, `10**9`) were scattered across 22 files
as sentinel values for "parse all pages", making future maintenance
error-prone.

## Root Cause

```python
# deepdoc/parser/pdf_parser.py (before)
def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
    # Only the first 300 pages were rendered; everything beyond was silently dropped
```

While most callers in `rag/app/*.py` correctly passed `to_page=100000`,
the base class `RAGFlowPdfParser.__call__()` and `parse_into_bboxes()`
invoked `__images__` **without** forwarding `page_from`/`page_to`,
falling back to the restrictive default of 299.

## Solution

### 1. Define constants in `common/constants.py`

```python
MAXIMUM_PAGE_NUMBER = 100000                        # Used by the parsing layer
MAXIMUM_TASK_PAGE_NUMBER = MAXIMUM_PAGE_NUMBER * 1000  # Used by the task/DB layer
```

### 2. Replace all hardcoded sentinel values

| Layer | Files Changed | Old Values | New Value |
|---|---|---|---|
| **Deepdoc parsers** | `pdf_parser.py`, `mineru_parser.py`, `docling_parser.py`, `opendataloader_parser.py`, `paddleocr_parser.py`, `docx_parser.py` | `299`, `600`, `100000`, `10**9`, `100000000` | `MAXIMUM_PAGE_NUMBER` |
| **Chunk parsers** | `naive.py`, `book.py`, `qa.py`, `one.py`, `manual.py`, `paper.py`, `presentation.py`, `laws.py`, `resume.py`, `email.py`, `table.py` | `100000`, `10000`, `10000000000` | `MAXIMUM_PAGE_NUMBER` |
| **Task/DB layer** | `db_models.py`, `task_service.py`, `document_service.py`, `file_service.py` | `100000000` | `MAXIMUM_TASK_PAGE_NUMBER` |

### 3. Fix `parse_into_bboxes()` missing parameters

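Added `from_page`/`to_page` parameters to `parse_into_bboxes()` so that
the `rag/flow/parser/parser.py` DeepDOC path no longer falls back to the
restrictive default. Relatedly, the character-extraction fallback in
`__images__()` now sizes `page_chars` with `len(self.page_images)`
instead of `page_to - page_from`, so the larger default does not produce
one empty entry per sentinel page.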

## Files Changed (22)

- `common/constants.py`
- `deepdoc/parser/pdf_parser.py`
- `deepdoc/parser/mineru_parser.py`
- `deepdoc/parser/docling_parser.py`
- `deepdoc/parser/opendataloader_parser.py`
- `deepdoc/parser/paddleocr_parser.py`
- `deepdoc/parser/docx_parser.py`
- `rag/app/naive.py`
- `rag/app/book.py`
- `rag/app/qa.py`
- `rag/app/one.py`
- `rag/app/manual.py`
- `rag/app/paper.py`
- `rag/app/presentation.py`
- `rag/app/laws.py`
- `rag/app/resume.py`
- `rag/app/email.py`
- `rag/app/table.py`
- `api/db/db_models.py`
- `api/db/services/task_service.py`
- `api/db/services/document_service.py`
- `api/db/services/file_service.py`

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring

---------

Signed-off-by: noob <yixiao121314@outlook.com>
This commit is contained in:
euvre
2026-04-27 06:57:20 +00:00
committed by GitHub
parent c3eac4103a
commit 2846a93998
24 changed files with 99 additions and 68 deletions

View File

@@ -30,10 +30,12 @@ import pdfplumber
import requests
from PIL import Image
from common.constants import MAXIMUM_PAGE_NUMBER
try:
from docling.document_converter import DocumentConverter
except Exception:
DocumentConverter = None
DocumentConverter = None
try:
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
@@ -125,7 +127,7 @@ class DoclingParser(RAGFlowPdfParser):
self.logger.error(f"[Docling] init DocumentConverter failed: {e}")
return False
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=MAXIMUM_PAGE_NUMBER, callback=None):
self.page_from = page_from
self.page_to = page_to
bytes_io = None

View File

@@ -21,6 +21,7 @@ from collections import Counter
from rag.nlp import rag_tokenizer
from io import BytesIO
import logging
from common.constants import MAXIMUM_PAGE_NUMBER
from docx.image.exceptions import (
InvalidImageStreamError,
UnexpectedEndOfFileError,
@@ -158,7 +159,7 @@ class RAGFlowDocxParser:
return lines
return ["\n".join(lines)]
def __call__(self, fnm, from_page=0, to_page=100000000):
def __call__(self, fnm, from_page=0, to_page=MAXIMUM_PAGE_NUMBER):
self.doc = Document(fnm) if isinstance(
fnm, str) else Document(BytesIO(fnm))
pn = 0 # parsed page

View File

@@ -37,6 +37,8 @@ from strenum import StrEnum
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
from deepdoc.parser.utils import extract_pdf_outlines
from common.constants import MAXIMUM_PAGE_NUMBER
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
if LOCK_KEY_pdfplumber not in sys.modules:
sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
@@ -320,7 +322,7 @@ class MinerUParser(RAGFlowPdfParser):
except requests.RequestException as e:
raise RuntimeError(f"[MinerU] api failed with exception {e}")
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=MAXIMUM_PAGE_NUMBER, callback=None):
self.page_from = page_from
self.page_to = page_to
try:

View File

@@ -15,6 +15,8 @@ import pdfplumber
import requests
from PIL import Image
from common.constants import MAXIMUM_PAGE_NUMBER
try:
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
except Exception:
@@ -153,7 +155,7 @@ class OpenDataLoaderParser(RAGFlowPdfParser):
self.logger.warning(f"[OpenDataLoader] Health check failed: {exc}")
return False
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=MAXIMUM_PAGE_NUMBER, callback=None):
self.page_from = page_from
self.page_to = page_to
bytes_io = None

View File

@@ -29,6 +29,8 @@ import pdfplumber
import requests
from PIL import Image
from common.constants import MAXIMUM_PAGE_NUMBER
try:
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
except Exception:
@@ -425,7 +427,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
"""Convert API response to table tuples."""
return []
def __images__(self, fnm, page_from=0, page_to=10**9, callback=None):
def __images__(self, fnm, page_from=0, page_to=MAXIMUM_PAGE_NUMBER, callback=None):
"""Generate page images from PDF for cropping."""
self.page_from = page_from
self.page_to = page_to

View File

@@ -37,6 +37,7 @@ from pypdf import PdfReader as pdf2_read
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from common.constants import MAXIMUM_PAGE_NUMBER
from common.file_utils import get_project_base_directory
from deepdoc.vision import OCR, AscendLayoutRecognizer, LayoutRecognizer, Recognizer, TableStructureRecognizer
from rag.nlp import rag_tokenizer
@@ -1521,7 +1522,7 @@ class RAGFlowPdfParser:
except Exception:
logging.exception("total_page_number")
def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
def __images__(self, fnm, zoomin=3, page_from=0, page_to=MAXIMUM_PAGE_NUMBER, callback=None):
self.lefted_chars = []
self.mean_height = []
self.mean_width = []
@@ -1541,7 +1542,7 @@ class RAGFlowPdfParser:
self.page_chars = [[c for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
except Exception as e:
logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}")
self.page_chars = [[] for _ in range(page_to - page_from)] # If failed to extract, using empty list instead.
self.page_chars = [[] for _ in range(len(self.page_images))] # If failed to extract, using empty list instead.
# Detect garbled pages and clear their chars so the OCR
# path will be used instead. Two detection strategies:
@@ -1694,10 +1695,10 @@ class RAGFlowPdfParser:
tbls = self._extract_table_figure(need_image, zoomin, return_html, False)
return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
def parse_into_bboxes(self, fnm, callback=None, zoomin=3):
def parse_into_bboxes(self, fnm, callback=None, zoomin=3, from_page=0, to_page=MAXIMUM_PAGE_NUMBER):
start = timer()
self.outlines = extract_pdf_outlines(fnm)
self.__images__(fnm, zoomin, callback=callback)
self.__images__(fnm, zoomin, from_page, to_page, callback=callback)
if callback:
callback(0.40, "OCR finished ({:.2f}s)".format(timer() - start))
@@ -1943,7 +1944,7 @@ class RAGFlowPdfParser:
class PlainParser:
def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
def __call__(self, filename, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, **kwargs):
lines = []
try:
self.pdf = pdf2_read(filename if isinstance(filename, str) else BytesIO(filename))
@@ -1969,7 +1970,7 @@ class VisionParser(RAGFlowPdfParser):
self.vision_model = vision_model
self.outlines = []
def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
def __images__(self, fnm, zoomin=3, page_from=0, page_to=MAXIMUM_PAGE_NUMBER, callback=None):
try:
with sys.modules[LOCK_KEY_pdfplumber]:
self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
@@ -1980,7 +1981,7 @@ class VisionParser(RAGFlowPdfParser):
self.total_page = 0
logging.exception("VisionParser __images__")
def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
def __call__(self, filename, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, **kwargs):
callback = kwargs.get("callback", lambda prog, msg: None)
zoomin = kwargs.get("zoomin", 3)
self.__images__(fnm=filename, zoomin=zoomin, page_from=from_page, page_to=to_page, callback=callback)