2025-01-21 20:52:28 +08:00
|
|
|
|
#
|
|
|
|
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
|
|
|
|
#
|
2024-08-15 09:17:36 +08:00
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
|
|
#
|
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
#
|
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
|
|
# limitations under the License.
|
|
|
|
|
|
#
|
2025-01-21 20:52:28 +08:00
|
|
|
|
|
2024-11-14 17:13:48 +08:00
|
|
|
|
import logging
|
2024-08-15 09:17:36 +08:00
|
|
|
|
import re
|
2024-12-30 10:32:19 +08:00
|
|
|
|
import csv
|
2024-08-15 09:17:36 +08:00
|
|
|
|
from copy import deepcopy
|
|
|
|
|
|
from io import BytesIO
|
|
|
|
|
|
from timeit import default_timer as timer
|
|
|
|
|
|
from openpyxl import load_workbook
|
2024-09-29 10:29:56 +08:00
|
|
|
|
|
Fix: Remove hardcoded page limits causing parsing failures on large PDFs (>300 pages) (#14382)
### What problem does this PR solve?
Fixes #14196
## Problem
When using DeepDOC to parse large PDFs (over 1000 pages), the parser
silently truncated processing at 300 pages due to a hardcoded default
`page_to=299` in `RAGFlowPdfParser.__images__()`. This caused:
- **Errors** on pages beyond the limit
- **Poor image quality** as the parser attempted to compensate with
missing page data
- **Inconsistent chunk splitting** between full PDF imports and partial
imports
Additionally, the codebase scattered magic numbers (`299`, `600`,
`10000`, `100000`, `100000000`, `10000000000`, `10**9`) across 22 files
as sentinel values for "parse all pages", making future maintenance
error-prone.
## Root Cause
```python
# deepdoc/parser/pdf_parser.py (before)
def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
# Only the first 300 pages were rendered; everything beyond was silently dropped
```
While most callers in `rag/app/*.py` correctly passed `to_page=100000`,
the base class `RAGFlowPdfParser.__call__()` and `parse_into_bboxes()`
invoked `__images__` **without** forwarding `page_from`/`page_to`,
falling back to the restrictive default of 299.
## Solution
### 1. Define constants in `common/constants.py`
```python
MAXIMUM_PAGE_NUMBER = 100000 # Used by the parsing layer
MAXIMUM_TASK_PAGE_NUMBER = MAXIMUM_PAGE_NUMBER * 1000 # Used by the task/DB layer
```
### 2. Replace all hardcoded sentinel values
| Layer | Files Changed | Old Values | New Value |
|---|---|---|---|
| **Deepdoc parsers** | `pdf_parser.py`, `mineru_parser.py`, `docling_parser.py`, `opendataloader_parser.py`, `paddleocr_parser.py`, `docx_parser.py` | `299`, `600`, `10**9`, `100000000` | `MAXIMUM_PAGE_NUMBER` |
| **Chunk parsers** | `naive.py`, `book.py`, `qa.py`, `one.py`, `manual.py`, `paper.py`, `presentation.py`, `laws.py`, `resume.py`, `email.py`, `table.py` | `100000`, `10000`, `10000000000` | `MAXIMUM_PAGE_NUMBER` |
| **Task/DB layer** | `db_models.py`, `task_service.py`, `document_service.py`, `file_service.py` | `100000000` | `MAXIMUM_TASK_PAGE_NUMBER` |
### 3. Fix `parse_into_bboxes()` missing parameters
Added `from_page`/`to_page` parameters to `parse_into_bboxes()` so that
the `rag/flow/parser/parser.py` DeepDOC path no longer falls back to the
restrictive default.
## Files Changed (22)
- `common/constants.py`
- `deepdoc/parser/pdf_parser.py`
- `deepdoc/parser/mineru_parser.py`
- `deepdoc/parser/docling_parser.py`
- `deepdoc/parser/opendataloader_parser.py`
- `deepdoc/parser/paddleocr_parser.py`
- `deepdoc/parser/docx_parser.py`
- `rag/app/naive.py`
- `rag/app/book.py`
- `rag/app/qa.py`
- `rag/app/one.py`
- `rag/app/manual.py`
- `rag/app/paper.py`
- `rag/app/presentation.py`
- `rag/app/laws.py`
- `rag/app/resume.py`
- `rag/app/email.py`
- `rag/app/table.py`
- `api/db/db_models.py`
- `api/db/services/task_service.py`
- `api/db/services/document_service.py`
- `api/db/services/file_service.py`
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
---------
Signed-off-by: noob <yixiao121314@outlook.com>
2026-04-27 06:57:20 +00:00
|
|
|
|
from common.constants import MAXIMUM_PAGE_NUMBER
|
2024-09-29 12:47:09 +08:00
|
|
|
|
from deepdoc.parser.utils import get_text
|
|
|
|
|
|
from rag.nlp import is_english, random_choices, qbullets_category, add_positions, has_qbullet, docx_question_level
|
2024-08-15 09:17:36 +08:00
|
|
|
|
from rag.nlp import rag_tokenizer, tokenize_table, concat_img
|
|
|
|
|
|
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
|
|
|
|
|
|
from docx import Document
|
|
|
|
|
|
from markdown import markdown
|
2024-09-29 12:47:09 +08:00
|
|
|
|
|
2025-10-28 09:46:32 +08:00
|
|
|
|
from common.float_utils import get_float
|
2025-03-18 11:13:44 +08:00
|
|
|
|
|
2025-01-09 17:07:21 +08:00
|
|
|
|
|
2024-08-15 09:17:36 +08:00
|
|
|
|
class Excel(ExcelParser):
    """Workbook reader that extracts one (question, answer) pair per row."""

    def __call__(self, fnm, binary=None, callback=None):
        """Collect question/answer pairs from every sheet of a workbook.

        In each row the first non-empty cell becomes the question and the
        second non-empty cell becomes the answer; any further cells are
        ignored.  Cells whose value is falsy (``None``, ``""``, ``0``) count
        as empty.  Rows that do not yield both a question and an answer are
        recorded as failures by their 1-based row index within the sheet.

        Args:
            fnm: Workbook location passed to ``load_workbook``; only used
                when ``binary`` is falsy.
            binary: Raw workbook bytes.  Takes precedence over ``fnm``.
            callback: Optional progress reporter invoked as
                ``callback(progress, message)``.  When omitted, progress
                updates are silently discarded.

        Returns:
            A list of ``(question, answer)`` string tuples in sheet and row
            order.

        Side effects:
            Sets ``self.is_english`` from ``is_english`` applied to a random
            sample (``k=30``) of the extracted questions.
        """
        if callback is None:
            def callback(*_args, **_kwargs):
                """Discard progress updates when no reporter was supplied."""

        wb = load_workbook(BytesIO(binary)) if binary else load_workbook(fnm)
        # Total row count across all sheets; denominator for progress.
        total = sum(len(list(wb[sheetname].rows)) for sheetname in wb.sheetnames)

        res, fails = [], []

        def progress_message():
            """Describe how many pairs were extracted and which rows failed."""
            message = "Extract pairs: {}. ".format(len(res))
            if fails:
                message += f"{len(fails)} failure, line: {','.join(fails[:3])}..."
            return message

        for sheetname in wb.sheetnames:
            for i, row in enumerate(wb[sheetname].rows):
                q, a = "", ""
                for cell in row:
                    if not cell.value:
                        continue
                    if not q:
                        q = str(cell.value)
                    elif not a:
                        a = str(cell.value)
                    else:
                        # Question and answer are both filled; ignore the rest.
                        break
                if q and a:
                    res.append((q, a))
                    # Report only when a new multiple of 999 pairs is reached,
                    # not on every row while the count sits at such a multiple
                    # (including zero).  ``total`` is non-zero here because at
                    # least one row exists.
                    if len(res) % 999 == 0:
                        callback(len(res) * 0.6 / total, progress_message())
                else:
                    fails.append(str(i + 1))

        callback(0.6, progress_message())
        self.is_english = is_english(
            [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
        return res
|
|
|
|
|
|
|
2024-10-15 10:11:09 +08:00
|
|
|
|
|
2024-08-15 09:17:36 +08:00
|
|
|
|
class Pdf(PdfParser):
    """PDF parser that groups recognized text boxes into question/answer pairs."""

    def __call__(self, filename, binary=None, from_page=0,
                 to_page=MAXIMUM_PAGE_NUMBER, zoomin=3, callback=None):
        """Run OCR/layout analysis on a PDF and split it into Q&A items.

        The text boxes in ``self.boxes`` are scanned in order.  A box that
        ``has_qbullet`` recognizes as starting with a question bullet opens a
        new question; the boxes that follow are appended to its answer until
        the next bullet.  Tables/figures returned by
        ``_extract_table_figure`` are sorted by (page, top) and spliced into
        the answer at the position where they fall between the question line
        and the current line.

        Args:
            filename: Path of the PDF; used when ``binary`` is falsy.
            binary: Raw PDF content; takes precedence over ``filename``.
            from_page: First page forwarded to ``__images__``.
            to_page: Last page forwarded to ``__images__``.
            zoomin: Zoom factor forwarded to the OCR/layout helpers.
            callback: Optional progress reporter, called both as
                ``callback(msg=...)`` and ``callback(progress, message)``.
                When omitted, progress updates are discarded.

        Returns:
            ``(qai_list, tbls)`` where ``qai_list`` is a list of
            ``(question, answer, image, positions)`` tuples (the last two
            come from ``self.crop(..., need_position=True)``) and ``tbls`` is
            the sorted table/figure list.

        Raises:
            ValueError: If ``qbullets_category`` cannot find a question
                bullet pattern in the extracted text.
        """
        if callback is None:
            def callback(*_args, **_kwargs):
                """Discard progress updates when no reporter was supplied."""

        def table_in_span(tbl_pn, tbl_top, last_pn, last_bottom, line_pn, line_top):
            """Tell whether a table sits at/after the last question line and at/before the current line."""
            after_question = (tbl_pn == last_pn and tbl_top >= last_bottom) or (tbl_pn > last_pn)
            before_line = (tbl_pn == line_pn and tbl_top <= line_top) or (tbl_pn < line_pn)
            return after_question and before_line

        start = timer()
        callback(msg="OCR started")
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page,
            callback
        )
        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
        logging.debug("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))

        start = timer()
        self._layouts_rec(zoomin, drop=False)
        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))

        start = timer()
        self._table_transformer_job(zoomin)
        callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))

        start = timer()
        self._text_merge()
        callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
        tbls = self._extract_table_figure(True, zoomin, True, True)
        logging.debug("layouts: {}".format(timer() - start))

        sections = [b["text"] for b in self.boxes]
        bull_x0_list = []
        q_bull, reg = qbullets_category(sections)
        if q_bull == -1:
            raise ValueError("Unable to recognize Q&A structure.")

        qai_list = []
        last_q, last_a, last_tag = '', '', ''
        # State threaded through has_qbullet from one box to the next.
        last_index = -1
        last_box = {'text': ''}
        last_bull = None

        def sort_key(element):
            """Order tables by page number, then by top coordinate."""
            tbls_pn = element[1][0][0]
            tbls_top = element[1][0][3]
            return tbls_pn, tbls_top

        tbls.sort(key=sort_key)
        tbl_index = 0
        last_pn, last_bottom = 0, 0
        for box in self.boxes:
            section, line_tag = box['text'], self._line_tag(box, zoomin)
            has_bull, index = has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list)
            last_box, last_index, last_bull = box, index, has_bull
            # The line tag is read as "@@page\t..\t..\ttop\tbottom##".
            line_pn = get_float(line_tag.lstrip('@@').split('\t')[0])
            line_top = get_float(line_tag.rstrip('##').split('\t')[3])
            tbl_pn, _, _, tbl_top, _, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
            if not has_bull:  # No question bullet
                if not last_q:
                    # Text before the first question is skipped; a table that
                    # lies at or before this line is skipped along with it.
                    if tbl_pn < line_pn or (tbl_pn == line_pn and tbl_top <= line_top):  # image passed
                        tbl_index += 1
                    continue
                sum_tag = line_tag
                sum_section = section
                # ``tbl_index < len(tbls)`` guarantees termination: past the
                # end get_tbls_info returns a constant placeholder, and if
                # that placeholder passed the placement test the loop would
                # never exit.
                while tbl_index < len(tbls) and table_in_span(
                        tbl_pn, tbl_top, last_pn, last_bottom, line_pn,
                        line_top):  # add image at the middle of current answer
                    sum_tag = f'{tbl_tag}{sum_tag}'
                    sum_section = f'{tbl_text}{sum_section}'
                    tbl_index += 1
                    tbl_pn, _, _, tbl_top, _, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
                last_a = f'{last_a}{sum_section}'
                last_tag = f'{last_tag}{sum_tag}'
            else:
                if last_q:
                    while tbl_index < len(tbls) and table_in_span(
                            tbl_pn, tbl_top, last_pn, last_bottom, line_pn,
                            line_top):  # add image at the end of last answer
                        last_tag = f'{last_tag}{tbl_tag}'
                        last_a = f'{last_a}{tbl_text}'
                        tbl_index += 1
                        tbl_pn, _, _, tbl_top, _, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
                    image, poss = self.crop(last_tag, need_position=True)
                    qai_list.append((last_q, last_a, image, poss))
                # Start a new question: the bullet match is the question and
                # the remainder of the line opens the answer.
                last_q = has_bull.group()
                _, end = has_bull.span()
                last_a = section[end:]
                last_tag = line_tag
                last_bottom = float(line_tag.rstrip('##').split('\t')[4])
                last_pn = line_pn
        if last_q:
            qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
        return qai_list, tbls

    def get_tbls_info(self, tbls, tbl_index):
        """Return position, tag and text of the table at ``tbl_index``.

        Args:
            tbls: Table list; each entry is indexed as ``entry[1][0]`` for
                ``(page, left, right, top, bottom)`` and ``entry[0][1]`` for
                an iterable of text fragments.
            tbl_index: Index into ``tbls``.

        Returns:
            ``(page, left, right, top, bottom, tag, text)``.  ``page`` is the
            stored page value plus one, ``tag`` has the form
            ``"@@page\\tleft\\tright\\ttop\\tbottom##"`` and ``text`` is the
            concatenated fragments.  When ``tbl_index`` is past the end a
            fixed placeholder ``(1, 0, 0, 0, 0, '@@0\\t0\\t0\\t0\\t0##', '')``
            is returned instead.
        """
        if tbl_index >= len(tbls):
            return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
        position = tbls[tbl_index][1][0]
        tbl_pn = position[0] + 1
        tbl_left = position[1]
        tbl_right = position[2]
        tbl_top = position[3]
        tbl_bottom = position[4]
        tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
        _tbl_text = ''.join(tbls[tbl_index][0][1])
        return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, _tbl_text
|
2024-10-15 10:11:09 +08:00
|
|
|
|
|
|
|
|
|
|
|
2024-08-15 09:17:36 +08:00
|
|
|
|
class Docx(DocxParser):
    """DOCX parser that turns heading-style questions and their body text into Q&A items."""

    def __init__(self):
        # Intentionally empty: no state is set up at construction time.
        pass

    def __call__(self, filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, callback=None):
        """Extract Q&A items and HTML tables from a Word document.

        Paragraphs are walked in order.  ``docx_question_level`` assigns a
        level to each non-blank paragraph inside the page window; levels 1-6
        are treated as questions and kept on a stack so that nested
        questions are joined with their ancestors.  Every other paragraph
        (and its picture, if any) is appended to the current answer.

        Args:
            filename: Path of the document; used when ``binary`` is falsy.
            binary: Raw document bytes; takes precedence over ``filename``.
            from_page: First page (0-based counter of detected page breaks)
                whose paragraphs are classified.
            to_page: Page at which classification stops (exclusive); the
                scan ends once the counter exceeds it.
            callback: Accepted for interface parity; not used here.

        Returns:
            ``(qai_list, tbls)``: ``qai_list`` holds
            ``(question_path, answer_text, image)`` tuples where
            ``question_path`` is the newline-joined question stack, and
            ``tbls`` holds one ``((None, html), "")`` entry per table.
        """
        self.doc = Document(
            filename) if not binary else Document(BytesIO(binary))
        pn = 0
        last_answer, last_image = "", None
        question_stack, level_stack = [], []
        qai_list = []
        for p in self.doc.paragraphs:
            if pn > to_page:
                break
            question_level, p_text = 0, ''
            if from_page <= pn < to_page and p.text.strip():
                question_level, p_text = docx_question_level(p)
            if not question_level or question_level > 6:  # not a question
                last_answer = f'{last_answer}\n{p_text}'
                current_image = self.get_picture(self.doc, p)
                last_image = concat_img(last_image, current_image)
            else:  # is a question
                if last_answer or last_image:
                    # Flush the answer collected for the previous question.
                    sum_question = '\n'.join(question_stack)
                    if sum_question:
                        qai_list.append((sum_question, last_answer, last_image))
                    last_answer, last_image = '', None

                # Drop questions at the same or a deeper level so the stack
                # only holds the ancestors of the new question.
                while question_stack and question_level <= level_stack[-1]:
                    question_stack.pop()
                    level_stack.pop()
                question_stack.append(p_text)
                level_stack.append(question_level)
            # Advance the page counter on rendered or explicit page breaks.
            for run in p.runs:
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1
                    continue
                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
                    pn += 1
        if last_answer:
            sum_question = '\n'.join(question_stack)
            if sum_question:
                qai_list.append((sum_question, last_answer, last_image))

        tbls = []
        for tb in self.doc.tables:
            parts = ["<table>"]
            for r in tb.rows:
                parts.append("<tr>")
                cells = r.cells  # fetch once; reused for every index below
                i = 0
                while i < len(cells):
                    c = cells[i]
                    # Count only the run of *adjacent* cells with the same
                    # text as one merged cell.  Scanning the whole row would
                    # treat a later, unrelated cell with equal text as part
                    # of the merge and drop everything in between.
                    span = 1
                    while i + span < len(cells) and cells[i + span].text == c.text:
                        span += 1
                    i += span
                    parts.append(f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>")
                parts.append("</tr>")
            parts.append("</table>")
            tbls.append(((None, "".join(parts)), ""))
        return qai_list, tbls
|
|
|
|
|
|
|
2024-10-15 10:11:09 +08:00
|
|
|
|
|
2024-08-15 09:17:36 +08:00
|
|
|
|
def rmPrefix(txt):
    """Strip a leading Q&A role label from ``txt``.

    Surrounding whitespace is trimmed first.  Then one leading label
    (问题/答案/回答/问/答, user/assistant, Q/A, Question/Answer; matched
    case-insensitively) is removed together with the separators that
    follow it: tabs, spaces, ASCII colons and full-width colons (U+FF1A).
    A label that is not followed by at least one separator is left alone.

    Args:
        txt: Text of a question or an answer.

    Returns:
        The trimmed text without the leading label.
    """
    # ``\uff1a`` is the full-width colon normally used after the Chinese
    # labels; it is written as an escape so the pattern survives any
    # character normalization of this file.
    return re.sub(
        r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:\uff1a ]+",
        "", txt.strip(), flags=re.IGNORECASE)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def beAdocPdf(d, q, a, eng, image, poss):
    """Fill chunk dict ``d`` with a Q&A pair extracted from a PDF.

    Args:
        d: Chunk dictionary to populate; it is modified in place.
        q: Question text.
        a: Answer text.
        eng: When truthy the English labels are used, otherwise the Chinese ones.
        image: Optional image for the chunk; stored only when truthy.
        poss: Position data handed to ``add_positions``.

    Returns:
        The same dictionary ``d``.
    """
    if eng:
        q_label, a_label = "Question: ", "Answer: "
    else:
        q_label, a_label = "问题:", "回答:"

    labelled_question = q_label + rmPrefix(q)
    labelled_answer = a_label + rmPrefix(a)
    d["content_with_weight"] = labelled_question + "\t" + labelled_answer

    # Only the raw question is tokenized; the fine-grained tokens are derived
    # from the stored coarse tokens.
    d["content_ltks"] = rag_tokenizer.tokenize(q)
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])

    if image:
        d["image"] = image
        d["doc_type_kwd"] = "image"

    add_positions(d, poss)
    return d
|
|
|
|
|
|
|
2024-10-15 10:11:09 +08:00
|
|
|
|
|
2025-01-09 17:07:21 +08:00
|
|
|
|
def beAdocDocx(d, q, a, eng, image, row_num=-1):
    """Fill chunk dict ``d`` with a Q&A pair extracted from a DOCX file.

    Args:
        d: Chunk dictionary to populate; it is modified in place.
        q: Question text.
        a: Answer text.
        eng: When truthy the English labels are used, otherwise the Chinese ones.
        image: Optional image for the chunk; stored only when truthy.
        row_num: Ordinal of the pair; stored as ``top_int`` when non-negative.

    Returns:
        The same dictionary ``d``.
    """
    question_label = "Question: " if eng else "问题:"
    answer_label = "Answer: " if eng else "回答:"

    pieces = (question_label + rmPrefix(q), answer_label + rmPrefix(a))
    d["content_with_weight"] = "\t".join(pieces)

    # Tokens come from the raw question only.
    d["content_ltks"] = rag_tokenizer.tokenize(q)
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])

    if image:
        d["image"] = image
        d["doc_type_kwd"] = "image"

    if not row_num < 0:
        d["top_int"] = [row_num]

    return d
|
|
|
|
|
|
|
2024-10-15 10:11:09 +08:00
|
|
|
|
|
2025-01-09 17:07:21 +08:00
|
|
|
|
def beAdoc(d, q, a, eng, row_num=-1):
    """Fill chunk dict ``d`` with a plain-text Q&A pair.

    Args:
        d: Chunk dictionary to populate; it is modified in place.
        q: Question text.
        a: Answer text.
        eng: When truthy the English labels are used, otherwise the Chinese ones.
        row_num: Ordinal of the pair; stored as ``top_int`` when non-negative.

    Returns:
        The same dictionary ``d``.
    """
    labels = ("Question: ", "Answer: ") if eng else ("问题:", "回答:")
    question_part = labels[0] + rmPrefix(q)
    answer_part = labels[1] + rmPrefix(a)
    d["content_with_weight"] = "\t".join([question_part, answer_part])

    # Coarse tokens are taken from the raw question; fine-grained tokens are
    # derived from the stored coarse tokens.
    d["content_ltks"] = rag_tokenizer.tokenize(q)
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])

    if row_num >= 0:
        d["top_int"] = [row_num]
    return d
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def mdQuestionLevel(s):
    """Split a Markdown line into its heading level and its text.

    The level is the number of ``#`` characters at the very start of the
    line (0 when there are none).  The text is the line with those
    characters and any whitespace that follows them removed.

    Args:
        s: One line of Markdown.

    Returns:
        ``(level, text)``.
    """
    # Counting the stripped characters replaces ``re.match(r'#*', s)``, which
    # always matched and therefore left its fallback branch unreachable.
    without_hashes = s.lstrip('#')
    return len(s) - len(without_hashes), without_hashes.lstrip()
|
|
|
|
|
|
|
2024-10-15 10:11:09 +08:00
|
|
|
|
|
Fix: Remove hardcoded page limits causing parsing failures on large PDFs (>300 pages) (#14382)
### What problem does this PR solve?
Fixes #14196
## Problem
When using DeepDOC to parse large PDFs (over 1000 pages), the parser
silently truncated processing at 300 pages due to a hardcoded default
`page_to=299` in `RAGFlowPdfParser.__images__()`. This caused:
- **Errors** on pages beyond the limit
- **Poor image quality** as the parser attempted to compensate with
missing page data
- **Inconsistent chunk splitting** between full PDF imports and partial
imports
Additionally, the codebase scattered magic numbers (`299`, `600`,
`10000`, `100000`, `100000000`, `10000000000`, `10**9`) across 22 files
as sentinel values for "parse all pages", making future maintenance
error-prone.
## Root Cause
```python
# deepdoc/parser/pdf_parser.py (before)
def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
# Only the first 300 pages were rendered; everything beyond was silently dropped
```
While most callers in `rag/app/*.py` correctly passed `to_page=100000`,
the base class `RAGFlowPdfParser.__call__()` and `parse_into_bboxes()`
invoked `__images__` **without** forwarding `page_from`/`page_to`,
falling back to the restrictive default of 299.
## Solution
### 1. Define constants in `common/constants.py`
```python
MAXIMUM_PAGE_NUMBER = 100000 # Used by the parsing layer
MAXIMUM_TASK_PAGE_NUMBER = MAXIMUM_PAGE_NUMBER * 1000 # Used by the task/DB layer
```
### 2. Replace all hardcoded sentinel values
| Layer | Files Changed | Old Values | New Value |
|---|---|---|---|
| **Deepdoc parsers** | `pdf_parser.py`, `mineru_parser.py`, `docling_parser.py`, `opendataloader_parser.py`, `paddleocr_parser.py`, `docx_parser.py` | `299`, `600`, `10**9`, `100000000` | `MAXIMUM_PAGE_NUMBER` |
| **Chunk parsers** | `naive.py`, `book.py`, `qa.py`, `one.py`, `manual.py`, `paper.py`, `presentation.py`, `laws.py`, `resume.py`, `email.py`, `table.py` | `100000`, `10000`, `10000000000` | `MAXIMUM_PAGE_NUMBER` |
| **Task/DB layer** | `db_models.py`, `task_service.py`, `document_service.py`, `file_service.py` | `100000000` | `MAXIMUM_TASK_PAGE_NUMBER` |
### 3. Fix `parse_into_bboxes()` missing parameters
Added `from_page`/`to_page` parameters to `parse_into_bboxes()` so that
the `rag/flow/parser/parser.py` DeepDOC path no longer falls back to the
restrictive default.
## Files Changed (22)
- `common/constants.py`
- `deepdoc/parser/pdf_parser.py`
- `deepdoc/parser/mineru_parser.py`
- `deepdoc/parser/docling_parser.py`
- `deepdoc/parser/opendataloader_parser.py`
- `deepdoc/parser/paddleocr_parser.py`
- `deepdoc/parser/docx_parser.py`
- `rag/app/naive.py`
- `rag/app/book.py`
- `rag/app/qa.py`
- `rag/app/one.py`
- `rag/app/manual.py`
- `rag/app/paper.py`
- `rag/app/presentation.py`
- `rag/app/laws.py`
- `rag/app/resume.py`
- `rag/app/email.py`
- `rag/app/table.py`
- `api/db/db_models.py`
- `api/db/services/task_service.py`
- `api/db/services/document_service.py`
- `api/db/services/file_service.py`
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
---------
Signed-off-by: noob <yixiao121314@outlook.com>
2026-04-27 06:57:20 +00:00
|
|
|
|
def chunk(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, **kwargs):
|
2024-08-15 09:17:36 +08:00
|
|
|
|
"""
|
|
|
|
|
|
Excel and csv(txt) format files are supported.
|
2025-12-08 12:21:18 +08:00
|
|
|
|
If the file is in Excel format, there should be 2 column question and answer without header.
|
2024-08-15 09:17:36 +08:00
|
|
|
|
And question column is ahead of answer column.
|
|
|
|
|
|
And it's O.K if it has multiple sheets as long as the columns are rightly composed.
|
|
|
|
|
|
|
|
|
|
|
|
If it's in csv format, it should be UTF-8 encoded. Use TAB as delimiter to separate question and answer.
|
|
|
|
|
|
|
|
|
|
|
|
All the deformed lines will be ignored.
|
|
|
|
|
|
Every pair of Q&A will be treated as a chunk.
|
|
|
|
|
|
"""
|
|
|
|
|
|
eng = lang.lower() == "english"
|
|
|
|
|
|
res = []
|
|
|
|
|
|
doc = {
|
|
|
|
|
|
"docnm_kwd": filename,
|
|
|
|
|
|
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
|
|
|
|
|
}
|
|
|
|
|
|
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
|
|
|
|
|
callback(0.1, "Start to parse.")
|
|
|
|
|
|
excel_parser = Excel()
|
2025-01-09 17:07:21 +08:00
|
|
|
|
for ii, (q, a) in enumerate(excel_parser(filename, binary, callback)):
|
|
|
|
|
|
res.append(beAdoc(deepcopy(doc), q, a, eng, ii))
|
2024-08-15 09:17:36 +08:00
|
|
|
|
return res
|
2024-10-29 10:08:08 +08:00
|
|
|
|
|
2024-12-30 10:32:19 +08:00
|
|
|
|
elif re.search(r"\.(txt)$", filename, re.IGNORECASE):
|
2024-08-15 09:17:36 +08:00
|
|
|
|
callback(0.1, "Start to parse.")
|
2024-09-29 12:47:09 +08:00
|
|
|
|
txt = get_text(filename, binary)
|
2024-08-15 09:17:36 +08:00
|
|
|
|
lines = txt.split("\n")
|
|
|
|
|
|
comma, tab = 0, 0
|
2024-12-08 14:21:12 +08:00
|
|
|
|
for line in lines:
|
|
|
|
|
|
if len(line.split(",")) == 2:
|
|
|
|
|
|
comma += 1
|
|
|
|
|
|
if len(line.split("\t")) == 2:
|
|
|
|
|
|
tab += 1
|
2024-08-15 09:17:36 +08:00
|
|
|
|
delimiter = "\t" if tab >= comma else ","
|
|
|
|
|
|
|
|
|
|
|
|
fails = []
|
|
|
|
|
|
question, answer = "", ""
|
|
|
|
|
|
i = 0
|
|
|
|
|
|
while i < len(lines):
|
|
|
|
|
|
arr = lines[i].split(delimiter)
|
|
|
|
|
|
if len(arr) != 2:
|
2024-12-08 14:21:12 +08:00
|
|
|
|
if question:
|
|
|
|
|
|
answer += "\n" + lines[i]
|
2024-08-15 09:17:36 +08:00
|
|
|
|
else:
|
2025-12-29 12:01:18 +08:00
|
|
|
|
fails.append(str(i + 1))
|
2024-08-15 09:17:36 +08:00
|
|
|
|
elif len(arr) == 2:
|
2024-12-08 14:21:12 +08:00
|
|
|
|
if question and answer:
|
2025-01-09 17:07:21 +08:00
|
|
|
|
res.append(beAdoc(deepcopy(doc), question, answer, eng, i))
|
2024-08-15 09:17:36 +08:00
|
|
|
|
question, answer = arr
|
|
|
|
|
|
i += 1
|
|
|
|
|
|
if len(res) % 999 == 0:
|
|
|
|
|
|
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
|
|
|
|
|
|
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
|
|
|
|
|
|
2024-12-08 14:21:12 +08:00
|
|
|
|
if question:
|
2025-01-09 17:07:21 +08:00
|
|
|
|
res.append(beAdoc(deepcopy(doc), question, answer, eng, len(lines)))
|
2024-08-15 09:17:36 +08:00
|
|
|
|
|
|
|
|
|
|
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
|
|
|
|
|
|
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
|
|
|
|
|
|
|
|
|
|
|
return res
|
2024-10-29 10:08:08 +08:00
|
|
|
|
|
2024-12-30 10:32:19 +08:00
|
|
|
|
elif re.search(r"\.(csv)$", filename, re.IGNORECASE):
|
|
|
|
|
|
callback(0.1, "Start to parse.")
|
|
|
|
|
|
txt = get_text(filename, binary)
|
|
|
|
|
|
lines = txt.split("\n")
|
|
|
|
|
|
delimiter = "\t" if any("\t" in line for line in lines) else ","
|
|
|
|
|
|
|
|
|
|
|
|
fails = []
|
|
|
|
|
|
question, answer = "", ""
|
|
|
|
|
|
res = []
|
|
|
|
|
|
reader = csv.reader(lines, delimiter=delimiter)
|
|
|
|
|
|
|
|
|
|
|
|
for i, row in enumerate(reader):
|
|
|
|
|
|
if len(row) != 2:
|
|
|
|
|
|
if question:
|
|
|
|
|
|
answer += "\n" + lines[i]
|
|
|
|
|
|
else:
|
|
|
|
|
|
fails.append(str(i + 1))
|
|
|
|
|
|
elif len(row) == 2:
|
|
|
|
|
|
if question and answer:
|
2025-01-09 17:07:21 +08:00
|
|
|
|
res.append(beAdoc(deepcopy(doc), question, answer, eng, i))
|
2024-12-30 10:32:19 +08:00
|
|
|
|
question, answer = row
|
|
|
|
|
|
if len(res) % 999 == 0:
|
|
|
|
|
|
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
|
|
|
|
|
|
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
|
|
|
|
|
|
|
|
|
|
|
if question:
|
2025-01-24 14:47:19 +08:00
|
|
|
|
res.append(beAdoc(deepcopy(doc), question, answer, eng, len(list(reader))))
|
2024-12-30 10:32:19 +08:00
|
|
|
|
|
|
|
|
|
|
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
|
|
|
|
|
|
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
|
|
|
|
|
return res
|
|
|
|
|
|
|
2024-08-15 09:17:36 +08:00
|
|
|
|
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
|
|
|
|
|
callback(0.1, "Start to parse.")
|
|
|
|
|
|
pdf_parser = Pdf()
|
|
|
|
|
|
qai_list, tbls = pdf_parser(filename if not binary else binary,
|
2025-06-25 10:25:45 +08:00
|
|
|
|
from_page=from_page, to_page=to_page, callback=callback)
|
2024-08-15 09:17:36 +08:00
|
|
|
|
for q, a, image, poss in qai_list:
|
|
|
|
|
|
res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss))
|
|
|
|
|
|
return res
|
2024-10-29 10:08:08 +08:00
|
|
|
|
|
2025-12-29 12:54:31 +08:00
|
|
|
|
elif re.search(r"\.(md|markdown|mdx)$", filename, re.IGNORECASE):
|
2024-08-15 09:17:36 +08:00
|
|
|
|
callback(0.1, "Start to parse.")
|
2024-09-29 12:47:09 +08:00
|
|
|
|
txt = get_text(filename, binary)
|
2024-08-15 09:17:36 +08:00
|
|
|
|
lines = txt.split("\n")
|
2024-12-08 14:21:12 +08:00
|
|
|
|
_last_question, last_answer = "", ""
|
2024-08-15 09:17:36 +08:00
|
|
|
|
question_stack, level_stack = [], []
|
|
|
|
|
|
code_block = False
|
2024-12-08 14:21:12 +08:00
|
|
|
|
for index, line in enumerate(lines):
|
|
|
|
|
|
if line.strip().startswith('```'):
|
2024-08-15 09:17:36 +08:00
|
|
|
|
code_block = not code_block
|
|
|
|
|
|
question_level, question = 0, ''
|
|
|
|
|
|
if not code_block:
|
2024-12-08 14:21:12 +08:00
|
|
|
|
question_level, question = mdQuestionLevel(line)
|
2024-08-15 09:17:36 +08:00
|
|
|
|
|
2025-12-29 12:01:18 +08:00
|
|
|
|
if not question_level or question_level > 6: # not a question
|
2024-12-08 14:21:12 +08:00
|
|
|
|
last_answer = f'{last_answer}\n{line}'
|
2025-12-29 12:01:18 +08:00
|
|
|
|
else: # is a question
|
2024-08-15 09:17:36 +08:00
|
|
|
|
if last_answer.strip():
|
|
|
|
|
|
sum_question = '\n'.join(question_stack)
|
|
|
|
|
|
if sum_question:
|
2025-12-29 12:01:18 +08:00
|
|
|
|
res.append(beAdoc(deepcopy(doc), sum_question,
|
|
|
|
|
|
markdown(last_answer, extensions=['markdown.extensions.tables']), eng, index))
|
2024-08-15 09:17:36 +08:00
|
|
|
|
last_answer = ''
|
|
|
|
|
|
|
|
|
|
|
|
i = question_level
|
|
|
|
|
|
while question_stack and i <= level_stack[-1]:
|
|
|
|
|
|
question_stack.pop()
|
|
|
|
|
|
level_stack.pop()
|
|
|
|
|
|
question_stack.append(question)
|
|
|
|
|
|
level_stack.append(question_level)
|
|
|
|
|
|
if last_answer.strip():
|
|
|
|
|
|
sum_question = '\n'.join(question_stack)
|
|
|
|
|
|
if sum_question:
|
2025-12-29 12:01:18 +08:00
|
|
|
|
res.append(beAdoc(deepcopy(doc), sum_question,
|
|
|
|
|
|
markdown(last_answer, extensions=['markdown.extensions.tables']), eng, index))
|
2024-08-15 09:17:36 +08:00
|
|
|
|
return res
|
2024-10-22 15:25:23 +08:00
|
|
|
|
|
2024-08-15 09:17:36 +08:00
|
|
|
|
elif re.search(r"\.docx$", filename, re.IGNORECASE):
|
|
|
|
|
|
docx_parser = Docx()
|
|
|
|
|
|
qai_list, tbls = docx_parser(filename, binary,
|
Fix: Remove hardcoded page limits causing parsing failures on large PDFs (>300 pages) (#14382)
### What problem does this PR solve?
Fixes #14196
## Problem
When using DeepDOC to parse large PDFs (over 1000 pages), the parser
silently truncated processing at 300 pages due to a hardcoded default
`page_to=299` in `RAGFlowPdfParser.__images__()`. This caused:
- **Errors** on pages beyond the limit
- **Poor image quality** as the parser attempted to compensate for
missing page data
- **Inconsistent chunk splitting** between full PDF imports and partial
imports
Additionally, the codebase scattered magic numbers (`299`, `600`,
`10000`, `100000`, `100000000`, `10000000000`, `10**9`) across 22 files
as sentinel values for "parse all pages", making future maintenance
error-prone.
## Root Cause
```python
# deepdoc/parser/pdf_parser.py (before)
def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
# Only the first 300 pages were rendered; everything beyond was silently dropped
```
While most callers in `rag/app/*.py` correctly passed `to_page=100000`,
the base class `RAGFlowPdfParser.__call__()` and `parse_into_bboxes()`
invoked `__images__` **without** forwarding `page_from`/`page_to`,
falling back to the restrictive default of 299.
## Solution
### 1. Define constants in `common/constants.py`
```python
MAXIMUM_PAGE_NUMBER = 100000 # Used by the parsing layer
MAXIMUM_TASK_PAGE_NUMBER = MAXIMUM_PAGE_NUMBER * 1000 # Used by the task/DB layer
```
### 2. Replace all hardcoded sentinel values
| Layer | Files Changed | Old Values | New Value |
|---|---|---|---|
| **Deepdoc parsers** | `pdf_parser.py`, `mineru_parser.py`,
`docling_parser.py`, `opendataloader_parser.py`, `paddleocr_parser.py`,
`docx_parser.py` | `299`, `600`, `10**9`, `100000000` |
`MAXIMUM_PAGE_NUMBER` |
| **Chunk parsers** | `naive.py`, `book.py`, `qa.py`, `one.py`,
`manual.py`, `paper.py`, `presentation.py`, `laws.py`, `resume.py`,
`email.py`, `table.py` | `100000`, `10000`, `10000000000` |
`MAXIMUM_PAGE_NUMBER` |
| **Task/DB layer** | `db_models.py`, `task_service.py`,
`document_service.py`, `file_service.py` | `100000000` |
`MAXIMUM_TASK_PAGE_NUMBER` |
### 3. Fix `parse_into_bboxes()` missing parameters
Added `from_page`/`to_page` parameters to `parse_into_bboxes()` so that
the `rag/flow/parser/parser.py` DeepDOC path no longer falls back to the
restrictive default.
## Files Changed (22)
- `common/constants.py`
- `deepdoc/parser/pdf_parser.py`
- `deepdoc/parser/mineru_parser.py`
- `deepdoc/parser/docling_parser.py`
- `deepdoc/parser/opendataloader_parser.py`
- `deepdoc/parser/paddleocr_parser.py`
- `deepdoc/parser/docx_parser.py`
- `rag/app/naive.py`
- `rag/app/book.py`
- `rag/app/qa.py`
- `rag/app/one.py`
- `rag/app/manual.py`
- `rag/app/paper.py`
- `rag/app/presentation.py`
- `rag/app/laws.py`
- `rag/app/resume.py`
- `rag/app/email.py`
- `rag/app/table.py`
- `api/db/db_models.py`
- `api/db/services/task_service.py`
- `api/db/services/document_service.py`
- `api/db/services/file_service.py`
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
---------
Signed-off-by: noob <yixiao121314@outlook.com>
2026-04-27 06:57:20 +00:00
|
|
|
|
from_page=0, to_page=MAXIMUM_PAGE_NUMBER, callback=callback)
|
2024-08-15 09:17:36 +08:00
|
|
|
|
res = tokenize_table(tbls, doc, eng)
|
2025-01-09 17:07:21 +08:00
|
|
|
|
for i, (q, a, image) in enumerate(qai_list):
|
|
|
|
|
|
res.append(beAdocDocx(deepcopy(doc), q, a, eng, image, i))
|
2024-08-15 09:17:36 +08:00
|
|
|
|
return res
|
|
|
|
|
|
|
|
|
|
|
|
raise NotImplementedError(
|
|
|
|
|
|
"Excel, csv(txt), pdf, markdown and docx format files are supported.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: run chunk() on the file named by the first
    # command-line argument, passing from_page=0 and to_page=10. The
    # returned chunks are discarded; nothing is printed on success.
    import sys

    def dummy(prog=None, msg=""):
        """No-op progress callback: accepts a progress value and a message and ignores both."""
        pass

    # Exit with a usage hint instead of an IndexError traceback when the
    # file path argument is missing.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <file>")

    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
|