From 2846a939981b41e155ef9975727bfb0e7f7a0ca8 Mon Sep 17 00:00:00 2001 From: euvre <93761161+euvre@users.noreply.github.com> Date: Mon, 27 Apr 2026 06:57:20 +0000 Subject: [PATCH] Fix: Remove hardcoded page limits causing parsing failures on large PDFs (>300 pages) (#14382) ### What problem does this PR solve? Fixes #14196 ## Problem When using DeepDOC to parse large PDFs (for example, over 1000 pages), the parser silently truncated processing after the first 299 pages due to a hardcoded default `page_to=299` (an exclusive slice end) in `RAGFlowPdfParser.__images__()`. This caused: - **Errors** on pages beyond the limit - **Poor image quality** as the parser attempted to compensate for missing page data - **Inconsistent chunk splitting** between full PDF imports and partial imports Additionally, magic numbers (`299`, `600`, `10000`, `100000`, `100000000`, `10000000000`, `10**9`) were scattered across 22 files as sentinel values for "parse all pages", making future maintenance error-prone. ## Root Cause ```python # deepdoc/parser/pdf_parser.py (before) def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None): # Only the first 299 pages (pages[0:299]) were rendered; everything beyond was silently dropped ``` While most callers in `rag/app/*.py` correctly passed `to_page=100000`, the base class `RAGFlowPdfParser.__call__()` and `parse_into_bboxes()` invoked `__images__` **without** forwarding `page_from`/`page_to`, falling back to the restrictive default of 299. ## Solution ### 1. Define constants in `common/constants.py` ```python MAXIMUM_PAGE_NUMBER = 100000 # Used by the parsing layer MAXIMUM_TASK_PAGE_NUMBER = MAXIMUM_PAGE_NUMBER * 1000 # Used by the task/DB layer ``` ### 2. 
Replace all hardcoded sentinel values | Layer | Files Changed | Old Values | New Value | |---|---|---|---| | **Deepdoc parsers** | `pdf_parser.py`, `mineru_parser.py`, `docling_parser.py`, `opendataloader_parser.py`, `paddleocr_parser.py`, `docx_parser.py` | `299`, `600`, `100000`, `10**9`, `100000000` | `MAXIMUM_PAGE_NUMBER` | | **Chunk parsers** | `naive.py`, `book.py`, `qa.py`, `one.py`, `manual.py`, `paper.py`, `presentation.py`, `laws.py`, `resume.py`, `email.py`, `table.py` | `100000`, `10000`, `1000000`, `1000000000`, `10000000000` | `MAXIMUM_PAGE_NUMBER` (`MAXIMUM_TASK_PAGE_NUMBER` for the Excel limit in `one.py` and in `table.py`) | | **Task/DB layer** | `db_models.py`, `task_service.py`, `document_service.py`, `file_service.py` | `100000000`, `10 ** 9`, `10 ** 5`, `100000` | `MAXIMUM_TASK_PAGE_NUMBER` for task page ranges; `MAXIMUM_PAGE_NUMBER` for direct `chunk()` calls and the default page range | ### 3. Fix `parse_into_bboxes()` missing parameters Added `from_page`/`to_page` parameters to `parse_into_bboxes()` so that the `rag/flow/parser/parser.py` DeepDOC path no longer falls back to the restrictive default. ## Files Changed (22, plus 2 unit-test files listed only in the diffstat) - `common/constants.py` - `deepdoc/parser/pdf_parser.py` - `deepdoc/parser/mineru_parser.py` - `deepdoc/parser/docling_parser.py` - `deepdoc/parser/opendataloader_parser.py` - `deepdoc/parser/paddleocr_parser.py` - `deepdoc/parser/docx_parser.py` - `rag/app/naive.py` - `rag/app/book.py` - `rag/app/qa.py` - `rag/app/one.py` - `rag/app/manual.py` - `rag/app/paper.py` - `rag/app/presentation.py` - `rag/app/laws.py` - `rag/app/resume.py` - `rag/app/email.py` - `rag/app/table.py` - `api/db/db_models.py` - `api/db/services/task_service.py` - `api/db/services/document_service.py` - `api/db/services/file_service.py` ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [x] Refactoring --------- Signed-off-by: noob --- api/db/db_models.py | 4 ++-- api/db/services/document_service.py | 8 +++---- api/db/services/file_service.py | 4 ++-- api/db/services/task_service.py | 12 +++++----- common/constants.py | 6 +++++ deepdoc/parser/docling_parser.py | 6 +++-- deepdoc/parser/docx_parser.py | 3 ++- deepdoc/parser/mineru_parser.py | 4 +++- 
deepdoc/parser/opendataloader_parser.py | 4 +++- deepdoc/parser/paddleocr_parser.py | 4 +++- deepdoc/parser/pdf_parser.py | 15 +++++++------ rag/app/book.py | 5 +++-- rag/app/email.py | 3 ++- rag/app/laws.py | 10 ++++----- rag/app/manual.py | 10 ++++----- rag/app/naive.py | 22 +++++++++---------- rag/app/one.py | 7 +++--- rag/app/paper.py | 6 ++--- rag/app/presentation.py | 9 ++++---- rag/app/qa.py | 9 ++++---- rag/app/resume.py | 3 ++- rag/app/table.py | 5 +++-- .../test_chat_sdk_routes_unit.py | 4 ++++ .../test_session_sdk_routes_unit.py | 4 ++++ 24 files changed, 99 insertions(+), 68 deletions(-) diff --git a/api/db/db_models.py b/api/db/db_models.py index 433ed78afe..f1dd46b2bf 100644 --- a/api/db/db_models.py +++ b/api/db/db_models.py @@ -55,7 +55,7 @@ from api.utils.configs import deserialize_b64, serialize_b64 from common.time_utils import current_timestamp, timestamp_to_date, date_string_to_timestamp from common.decorator import singleton -from common.constants import ParserType +from common.constants import ParserType, MAXIMUM_TASK_PAGE_NUMBER from common import settings @@ -945,7 +945,7 @@ class Task(DataBaseModel): id = CharField(max_length=32, primary_key=True) doc_id = CharField(max_length=32, null=False, index=True) from_page = IntegerField(default=0) - to_page = IntegerField(default=100000000) + to_page = IntegerField(default=MAXIMUM_TASK_PAGE_NUMBER) task_type = CharField(max_length=32, null=False, default="") priority = IntegerField(default=0) diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py index c606d07951..2b1a8617b3 100644 --- a/api/db/services/document_service.py +++ b/api/db/services/document_service.py @@ -35,7 +35,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.doc_metadata_service import DocMetadataService from common.misc_utils import get_uuid from common.time_utils import current_timestamp, get_format_time -from common.constants import LLMType, 
ParserType, StatusEnum, TaskStatus, SVR_CONSUMER_GROUP_NAME +from common.constants import LLMType, ParserType, StatusEnum, TaskStatus, SVR_CONSUMER_GROUP_NAME, MAXIMUM_PAGE_NUMBER, MAXIMUM_TASK_PAGE_NUMBER from rag.nlp import rag_tokenizer, search from rag.utils.redis_conn import REDIS_CONN from common.doc_store.doc_store_base import OrderByExpr @@ -1000,8 +1000,8 @@ def queue_raptor_o_graphrag_tasks(sample_doc, ty, priority, fake_doc_id="", doc_ return { "id": get_uuid(), "doc_id": fake_doc_id, - "from_page": 100000000, - "to_page": 100000000, + "from_page": MAXIMUM_TASK_PAGE_NUMBER, + "to_page": MAXIMUM_TASK_PAGE_NUMBER, "task_type": ty, "progress_msg": datetime.now().strftime("%H:%M:%S") + " created task " + ty, "begin_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), @@ -1069,7 +1069,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id): for d, blob in files: doc_nm[d["id"]] = d["name"] for d, blob in files: - kwargs = {"callback": dummy, "parser_config": parser_config, "from_page": 0, "to_page": 100000, "tenant_id": kb.tenant_id, "lang": kb.language} + kwargs = {"callback": dummy, "parser_config": parser_config, "from_page": 0, "to_page": MAXIMUM_PAGE_NUMBER, "tenant_id": kb.tenant_id, "lang": kb.language} threads.append(exe.submit(FACTORY.get(d["parser_id"], naive).chunk, d["name"], blob, **kwargs)) for (docinfo, _), th in zip(files, threads): diff --git a/api/db/services/file_service.py b/api/db/services/file_service.py index 079bf4390c..11a5565b38 100644 --- a/api/db/services/file_service.py +++ b/api/db/services/file_service.py @@ -36,7 +36,7 @@ from api.db.services.document_service import DocumentService from api.db.services.file2document_service import File2DocumentService from common.misc_utils import get_uuid from common.ssrf_guard import assert_url_is_safe -from common.constants import TaskStatus, FileSource, ParserType +from common.constants import TaskStatus, FileSource, ParserType, MAXIMUM_PAGE_NUMBER from 
api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.task_service import TaskService from api.utils.file_utils import filename_type, read_potential_broken_pdf, thumbnail_img, sanitize_path @@ -553,7 +553,7 @@ class FileService(CommonService): FACTORY = {ParserType.PRESENTATION.value: presentation, ParserType.PICTURE.value: picture, ParserType.AUDIO.value: audio, ParserType.EMAIL.value: email} parser_config = {"chunk_token_num": 16096, "delimiter": "\n!?;。;!?", "layout_recognize": layout_recognize or "Plain Text"} - kwargs = {"lang": "English", "callback": dummy, "parser_config": parser_config, "from_page": 0, "to_page": 100000, "tenant_id": current_user.id if current_user else tenant_id} + kwargs = {"lang": "English", "callback": dummy, "parser_config": parser_config, "from_page": 0, "to_page": MAXIMUM_PAGE_NUMBER, "tenant_id": current_user.id if current_user else tenant_id} file_type = filename_type(filename) if img_base64 and file_type == FileType.VISUAL.value: return GptV4.image2base64(blob) diff --git a/api/db/services/task_service.py b/api/db/services/task_service.py index 8081732307..cb9967f08a 100644 --- a/api/db/services/task_service.py +++ b/api/db/services/task_service.py @@ -29,7 +29,7 @@ from api.db.services.common_service import CommonService from api.db.services.document_service import DocumentService from common.misc_utils import get_uuid from common.time_utils import current_timestamp -from common.constants import StatusEnum, TaskStatus +from common.constants import StatusEnum, TaskStatus, MAXIMUM_PAGE_NUMBER, MAXIMUM_TASK_PAGE_NUMBER from deepdoc.parser.excel_parser import RAGFlowExcelParser from rag.utils.redis_conn import REDIS_CONN from common import settings @@ -379,7 +379,7 @@ def queue_tasks(doc: dict, bucket: str, name: str, priority: int): "doc_id": doc["id"], "progress": 0.0, "from_page": 0, - "to_page": 100000000, + "to_page": MAXIMUM_TASK_PAGE_NUMBER, "begin_at": datetime.now().strftime("%Y-%m-%d 
%H:%M:%S"), } @@ -395,8 +395,8 @@ def queue_tasks(doc: dict, bucket: str, name: str, priority: int): if doc["parser_id"] == "paper": page_size = doc["parser_config"].get("task_page_size") or 22 if doc["parser_id"] in ["one", "knowledge_graph"] or do_layout != "DeepDOC" or doc["parser_config"].get("toc_extraction", False): - page_size = 10 ** 9 - page_ranges = doc["parser_config"].get("pages") or [(1, 10 ** 5)] + page_size = MAXIMUM_TASK_PAGE_NUMBER + page_ranges = doc["parser_config"].get("pages") or [(1, MAXIMUM_PAGE_NUMBER)] for s, e in page_ranges: s -= 1 s = max(0, s) @@ -495,7 +495,7 @@ def reuse_prev_task_chunks(task: dict, prev_tasks: list[dict], chunking_config: return 0 task["chunk_ids"] = prev_task["chunk_ids"] task["progress"] = 1.0 - if "from_page" in task and "to_page" in task and int(task['to_page']) - int(task['from_page']) >= 10 ** 6: + if "from_page" in task and "to_page" in task and (int(task['to_page']) - int(task['from_page']) >= 10 ** 6 or (int(task['from_page']) == MAXIMUM_TASK_PAGE_NUMBER and int(task['to_page']) == MAXIMUM_TASK_PAGE_NUMBER)): task["progress_msg"] = f"Page({task['from_page']}~{task['to_page']}): " else: task["progress_msg"] = "" @@ -530,7 +530,7 @@ def queue_dataflow(tenant_id:str, flow_id:str, task_id:str, doc_id:str=CANVAS_DE id=task_id, doc_id=doc_id, from_page=0, - to_page=100000000, + to_page=MAXIMUM_TASK_PAGE_NUMBER, task_type="dataflow" if not rerun else "dataflow_rerun", priority=priority, begin_at= datetime.now().strftime("%Y-%m-%d %H:%M:%S"), diff --git a/common/constants.py b/common/constants.py index 5d5588845a..5ab9acaa50 100644 --- a/common/constants.py +++ b/common/constants.py @@ -244,6 +244,12 @@ SVR_QUEUE_NAME = "rag_flow_svr_queue" SVR_CONSUMER_GROUP_NAME = "rag_flow_svr_task_broker" TAG_FLD = "tag_feas" +# Maximum page number used as "unlimited" sentinel value. +# Parsing layer (chunk/Pdf.__call__) uses MAXIMUM_PAGE_NUMBER. 
+# Task/DB layer (Task model) uses MAXIMUM_PAGE_NUMBER * 1000 to avoid collision with user-specified page ranges. +MAXIMUM_PAGE_NUMBER = 100000 +MAXIMUM_TASK_PAGE_NUMBER = MAXIMUM_PAGE_NUMBER * 1000 + MINERU_ENV_KEYS = ["MINERU_APISERVER", "MINERU_OUTPUT_DIR", "MINERU_BACKEND", "MINERU_SERVER_URL", "MINERU_DELETE_OUTPUT"] MINERU_DEFAULT_CONFIG = { diff --git a/deepdoc/parser/docling_parser.py b/deepdoc/parser/docling_parser.py index 2e7d475148..948a7acb0c 100644 --- a/deepdoc/parser/docling_parser.py +++ b/deepdoc/parser/docling_parser.py @@ -30,10 +30,12 @@ import pdfplumber import requests from PIL import Image +from common.constants import MAXIMUM_PAGE_NUMBER + try: from docling.document_converter import DocumentConverter except Exception: - DocumentConverter = None + DocumentConverter = None try: from deepdoc.parser.pdf_parser import RAGFlowPdfParser @@ -125,7 +127,7 @@ class DoclingParser(RAGFlowPdfParser): self.logger.error(f"[Docling] init DocumentConverter failed: {e}") return False - def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None): + def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=MAXIMUM_PAGE_NUMBER, callback=None): self.page_from = page_from self.page_to = page_to bytes_io = None diff --git a/deepdoc/parser/docx_parser.py b/deepdoc/parser/docx_parser.py index 0257a320f7..2d56729b74 100644 --- a/deepdoc/parser/docx_parser.py +++ b/deepdoc/parser/docx_parser.py @@ -21,6 +21,7 @@ from collections import Counter from rag.nlp import rag_tokenizer from io import BytesIO import logging +from common.constants import MAXIMUM_PAGE_NUMBER from docx.image.exceptions import ( InvalidImageStreamError, UnexpectedEndOfFileError, @@ -158,7 +159,7 @@ class RAGFlowDocxParser: return lines return ["\n".join(lines)] - def __call__(self, fnm, from_page=0, to_page=100000000): + def __call__(self, fnm, from_page=0, to_page=MAXIMUM_PAGE_NUMBER): self.doc = Document(fnm) if isinstance( fnm, str) else Document(BytesIO(fnm)) pn = 
0 # parsed page diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 548baddcb6..fd147686a7 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -37,6 +37,8 @@ from strenum import StrEnum from deepdoc.parser.pdf_parser import RAGFlowPdfParser from deepdoc.parser.utils import extract_pdf_outlines +from common.constants import MAXIMUM_PAGE_NUMBER + LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber" if LOCK_KEY_pdfplumber not in sys.modules: sys.modules[LOCK_KEY_pdfplumber] = threading.Lock() @@ -320,7 +322,7 @@ class MinerUParser(RAGFlowPdfParser): except requests.RequestException as e: raise RuntimeError(f"[MinerU] api failed with exception {e}") - def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None): + def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=MAXIMUM_PAGE_NUMBER, callback=None): self.page_from = page_from self.page_to = page_to try: diff --git a/deepdoc/parser/opendataloader_parser.py b/deepdoc/parser/opendataloader_parser.py index c0e5fa50ba..ed496d1c49 100644 --- a/deepdoc/parser/opendataloader_parser.py +++ b/deepdoc/parser/opendataloader_parser.py @@ -15,6 +15,8 @@ import pdfplumber import requests from PIL import Image +from common.constants import MAXIMUM_PAGE_NUMBER + try: from deepdoc.parser.pdf_parser import RAGFlowPdfParser except Exception: @@ -153,7 +155,7 @@ class OpenDataLoaderParser(RAGFlowPdfParser): self.logger.warning(f"[OpenDataLoader] Health check failed: {exc}") return False - def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None): + def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=MAXIMUM_PAGE_NUMBER, callback=None): self.page_from = page_from self.page_to = page_to bytes_io = None diff --git a/deepdoc/parser/paddleocr_parser.py b/deepdoc/parser/paddleocr_parser.py index a23852e89c..c697971266 100644 --- a/deepdoc/parser/paddleocr_parser.py +++ b/deepdoc/parser/paddleocr_parser.py 
@@ -29,6 +29,8 @@ import pdfplumber import requests from PIL import Image +from common.constants import MAXIMUM_PAGE_NUMBER + try: from deepdoc.parser.pdf_parser import RAGFlowPdfParser except Exception: @@ -425,7 +427,7 @@ class PaddleOCRParser(RAGFlowPdfParser): """Convert API response to table tuples.""" return [] - def __images__(self, fnm, page_from=0, page_to=10**9, callback=None): + def __images__(self, fnm, page_from=0, page_to=MAXIMUM_PAGE_NUMBER, callback=None): """Generate page images from PDF for cropping.""" self.page_from = page_from self.page_to = page_to diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index b3a6adec8b..d1aebef1f3 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -37,6 +37,7 @@ from pypdf import PdfReader as pdf2_read from sklearn.cluster import KMeans from sklearn.metrics import silhouette_score +from common.constants import MAXIMUM_PAGE_NUMBER from common.file_utils import get_project_base_directory from deepdoc.vision import OCR, AscendLayoutRecognizer, LayoutRecognizer, Recognizer, TableStructureRecognizer from rag.nlp import rag_tokenizer @@ -1521,7 +1522,7 @@ class RAGFlowPdfParser: except Exception: logging.exception("total_page_number") - def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None): + def __images__(self, fnm, zoomin=3, page_from=0, page_to=MAXIMUM_PAGE_NUMBER, callback=None): self.lefted_chars = [] self.mean_height = [] self.mean_width = [] @@ -1541,7 +1542,7 @@ class RAGFlowPdfParser: self.page_chars = [[c for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]] except Exception as e: logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}") - self.page_chars = [[] for _ in range(page_to - page_from)] # If failed to extract, using empty list instead. 
+ self.page_chars = [[] for _ in range(len(self.page_images))] # If failed to extract, using empty list instead. # Detect garbled pages and clear their chars so the OCR # path will be used instead. Two detection strategies: @@ -1694,10 +1695,10 @@ class RAGFlowPdfParser: tbls = self._extract_table_figure(need_image, zoomin, return_html, False) return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls - def parse_into_bboxes(self, fnm, callback=None, zoomin=3): + def parse_into_bboxes(self, fnm, callback=None, zoomin=3, from_page=0, to_page=MAXIMUM_PAGE_NUMBER): start = timer() self.outlines = extract_pdf_outlines(fnm) - self.__images__(fnm, zoomin, callback=callback) + self.__images__(fnm, zoomin, from_page, to_page, callback=callback) if callback: callback(0.40, "OCR finished ({:.2f}s)".format(timer() - start)) @@ -1943,7 +1944,7 @@ class RAGFlowPdfParser: class PlainParser: - def __call__(self, filename, from_page=0, to_page=100000, **kwargs): + def __call__(self, filename, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, **kwargs): lines = [] try: self.pdf = pdf2_read(filename if isinstance(filename, str) else BytesIO(filename)) @@ -1969,7 +1970,7 @@ class VisionParser(RAGFlowPdfParser): self.vision_model = vision_model self.outlines = [] - def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None): + def __images__(self, fnm, zoomin=3, page_from=0, page_to=MAXIMUM_PAGE_NUMBER, callback=None): try: with sys.modules[LOCK_KEY_pdfplumber]: self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm)) @@ -1980,7 +1981,7 @@ class VisionParser(RAGFlowPdfParser): self.total_page = 0 logging.exception("VisionParser __images__") - def __call__(self, filename, from_page=0, to_page=100000, **kwargs): + def __call__(self, filename, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, **kwargs): callback = kwargs.get("callback", lambda prog, msg: None) zoomin = kwargs.get("zoomin", 3) self.__images__(fnm=filename, zoomin=zoomin, 
page_from=from_page, page_to=to_page, callback=callback) diff --git a/rag/app/book.py b/rag/app/book.py index b3af3ed9dc..8611f38401 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -21,6 +21,7 @@ from io import BytesIO from deepdoc.parser.utils import get_text from rag.app import naive from rag.app.naive import by_plaintext, PARSERS +from common.constants import MAXIMUM_PAGE_NUMBER from common.parser_config_utils import normalize_layout_recognizer from rag.nlp import bullets_category, is_english, remove_contents_table, hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, tokenize_chunks, attach_media_context from rag.nlp import rag_tokenizer @@ -31,7 +32,7 @@ from rag.utils.lazy_image import LazyImage class Pdf(PdfParser): - def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): + def __call__(self, filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, zoomin=3, callback=None): from timeit import default_timer as timer start = timer() @@ -59,7 +60,7 @@ class Pdf(PdfParser): return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes], tbls -def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): +def chunk(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, **kwargs): """ Supported file formats are docx, pdf, txt. 
Since a book is long and not all the parts are useful, if it's a PDF, diff --git a/rag/app/email.py b/rag/app/email.py index ea01a337e1..9edaddcb79 100644 --- a/rag/app/email.py +++ b/rag/app/email.py @@ -18,6 +18,7 @@ import logging from email import policy from email.parser import BytesParser from rag.app.naive import chunk as naive_chunk +from common.constants import MAXIMUM_PAGE_NUMBER import re from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks from deepdoc.parser import HtmlParser, TxtParser @@ -29,7 +30,7 @@ def chunk( filename, binary=None, from_page=0, - to_page=100000, + to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, **kwargs, diff --git a/rag/app/laws.py b/rag/app/laws.py index eb26c154d8..e2fe885ffa 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -19,7 +19,7 @@ import re from io import BytesIO from docx import Document -from common.constants import ParserType +from common.constants import ParserType, MAXIMUM_PAGE_NUMBER from deepdoc.parser.utils import get_text from rag.nlp import bullets_category, remove_contents_table, make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge from rag.nlp import rag_tokenizer, Node @@ -36,7 +36,7 @@ class Docx(DocxParser): line = re.sub(r"\u3000", " ", line).strip() return line - def old_call(self, filename, binary=None, from_page=0, to_page=100000): + def old_call(self, filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER): self.doc = Document(filename) if not binary else Document(BytesIO(binary)) pn = 0 lines = [] @@ -53,7 +53,7 @@ class Docx(DocxParser): pn += 1 return [line for line in lines if line] - def __call__(self, filename, binary=None, from_page=0, to_page=100000): + def __call__(self, filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER): self.doc = Document(filename) if not binary else Document(BytesIO(binary)) pn = 0 lines = [] @@ -98,7 +98,7 @@ class Pdf(PdfParser): self.model_speciess = ParserType.LAWS.value super().__init__() - def 
__call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): + def __call__(self, filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, zoomin=3, callback=None): from timeit import default_timer as timer start = timer() @@ -117,7 +117,7 @@ class Pdf(PdfParser): return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], None -def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): +def chunk(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, **kwargs): """ Supported file formats are docx, pdf, txt. """ diff --git a/rag/app/manual.py b/rag/app/manual.py index cb946d49ac..576d06fafb 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -18,7 +18,7 @@ import logging import copy import re -from common.constants import ParserType +from common.constants import ParserType, MAXIMUM_PAGE_NUMBER from io import BytesIO from deepdoc.parser.utils import extract_pdf_outlines from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context, concat_img @@ -35,7 +35,7 @@ class Pdf(PdfParser): self.model_speciess = ParserType.MANUAL.value super().__init__() - def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): + def __call__(self, filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, zoomin=3, callback=None): from timeit import default_timer as timer start = timer() @@ -71,7 +71,7 @@ class Docx(DocxParser): def __init__(self): pass - def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None): + def __call__(self, filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, callback=None): self.doc = Document(filename) if not binary else Document(BytesIO(binary)) pn = 0 last_answer, last_image = "", None @@ -134,7 +134,7 @@ class Docx(DocxParser): return ti_list, 
tbls -def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): +def chunk(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, **kwargs): """ Only pdf is supported. """ @@ -276,7 +276,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca elif re.search(r"\.docx?$", filename, re.IGNORECASE): docx_parser = Docx() - ti_list, tbls = docx_parser(filename, binary, from_page=0, to_page=10000, callback=callback) + ti_list, tbls = docx_parser(filename, binary, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, callback=callback) tbls = vision_figure_parser_docx_wrapper(sections=ti_list, tbls=tbls, callback=callback, **kwargs) res = tokenize_table(tbls, doc, eng) for text, image in ti_list: diff --git a/rag/app/naive.py b/rag/app/naive.py index 9218c20c1e..513f503b65 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -29,7 +29,7 @@ from markdown import markdown from PIL import Image from common.token_utils import num_tokens_from_string -from common.constants import LLMType +from common.constants import LLMType, MAXIMUM_PAGE_NUMBER from api.db.services.llm_service import LLMBundle from api.db.joint_services.tenant_model_service import get_model_config_by_type_and_name, get_tenant_default_model_by_type from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html @@ -83,7 +83,7 @@ def _normalize_section_text_for_rtl_presentation_forms(sections): return normalized_sections -def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs): +def by_deepdoc(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, pdf_cls=None, **kwargs): callback = callback binary = binary pdf_parser = pdf_cls() if pdf_cls else Pdf() @@ -102,7 +102,7 @@ def by_mineru( filename, binary=None, from_page=0, - to_page=100000, + 
to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, pdf_cls=None, @@ -148,7 +148,7 @@ def by_mineru( return None, None, None -def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs): +def by_docling(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, pdf_cls=None, **kwargs): pdf_parser = DoclingParser() parse_method = kwargs.get("parse_method", "raw") @@ -173,7 +173,7 @@ def by_opendataloader( filename, binary=None, from_page=0, - to_page=100000, + to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, pdf_cls=None, @@ -217,7 +217,7 @@ def by_opendataloader( return None, None, None -def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs): +def by_tcadp(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, pdf_cls=None, **kwargs): tcadp_parser = TCADPParser() if not tcadp_parser.check_installation(): @@ -232,7 +232,7 @@ def by_paddleocr( filename, binary=None, from_page=0, - to_page=100000, + to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, pdf_cls=None, @@ -279,7 +279,7 @@ def by_paddleocr( return None, None, None -def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): +def by_plaintext(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, callback=None, **kwargs): layout_recognizer = (kwargs.get("layout_recognizer") or "").strip() if (not layout_recognizer) or (layout_recognizer == "Plain Text"): pdf_parser = PlainParser() @@ -423,7 +423,7 @@ class Docx(DocxParser): return "" - def __call__(self, filename, binary=None, from_page=0, to_page=100000): + def __call__(self, filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER): self.doc = Document(filename) if not binary else Document(BytesIO(binary)) pn = 0 lines = [] @@ -586,7 +586,7 @@ class Pdf(PdfParser): def 
__init__(self): super().__init__() - def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None, separate_tables_figures=False): + def __call__(self, filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, zoomin=3, callback=None, separate_tables_figures=False): start = timer() first_start = start callback(msg="OCR started") @@ -775,7 +775,7 @@ def load_from_xml_v2(baseURI, rels_item_xml): return srels -def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): +def chunk(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, **kwargs): """ Supported file formats are docx, pdf, excel, txt. This method apply the naive ways to chunk files. diff --git a/rag/app/one.py b/rag/app/one.py index d8bfdf58b8..d5fbbfcc8a 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -24,11 +24,12 @@ from rag.nlp import rag_tokenizer, tokenize from deepdoc.parser import PdfParser, ExcelParser, HtmlParser from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper_naive from rag.app.naive import by_plaintext, PARSERS +from common.constants import MAXIMUM_PAGE_NUMBER, MAXIMUM_TASK_PAGE_NUMBER from common.parser_config_utils import normalize_layout_recognizer class Pdf(PdfParser): - def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): + def __call__(self, filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, zoomin=3, callback=None): from timeit import default_timer as timer start = timer() @@ -55,7 +56,7 @@ class Pdf(PdfParser): return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))], tbls -def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): +def chunk(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, **kwargs): """ Supported file formats are docx, 
pdf, excel, txt. One file forms a chunk which maintains original text order. @@ -126,7 +127,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca elif re.search(r"\.xlsx?$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") excel_parser = ExcelParser() - sections = excel_parser.html(binary, 1000000000) + sections = excel_parser.html(binary, MAXIMUM_TASK_PAGE_NUMBER) elif re.search(r"\.(txt|md|markdown|mdx)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") diff --git a/rag/app/paper.py b/rag/app/paper.py index 818338d9a5..82ddb8bc83 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -19,7 +19,7 @@ import copy import re from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper -from common.constants import ParserType +from common.constants import ParserType, MAXIMUM_PAGE_NUMBER from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, \ tokenize_chunks, attach_media_context from deepdoc.parser import PdfParser @@ -34,7 +34,7 @@ class Pdf(PdfParser): super().__init__() def __call__(self, filename, binary=None, from_page=0, - to_page=100000, zoomin=3, callback=None): + to_page=MAXIMUM_PAGE_NUMBER, zoomin=3, callback=None): from timeit import default_timer as timer start = timer() callback(msg="OCR started") @@ -146,7 +146,7 @@ class Pdf(PdfParser): } -def chunk(filename, binary=None, from_page=0, to_page=100000, +def chunk(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, **kwargs): """ Only pdf is supported. 
diff --git a/rag/app/presentation.py b/rag/app/presentation.py index 390955041a..e49d1bd2d8 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -25,6 +25,7 @@ from pypdf import PdfReader as pdf2_read from deepdoc.parser import PdfParser, PlainParser from deepdoc.parser.ppt_parser import RAGFlowPptParser from rag.app.naive import by_plaintext, PARSERS +from common.constants import MAXIMUM_PAGE_NUMBER from common.parser_config_utils import normalize_layout_recognizer from rag.nlp import rag_tokenizer from rag.nlp import tokenize @@ -35,7 +36,7 @@ class Pdf(PdfParser): def __init__(self): super().__init__() - def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None, **kwargs): + def __call__(self, filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, zoomin=3, callback=None, **kwargs): # 1. OCR callback(msg="OCR started") self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback) @@ -115,7 +116,7 @@ class Pdf(PdfParser): class PlainPdf(PlainParser): - def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): + def __call__(self, filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, callback=None, **kwargs): self.pdf = pdf2_read(filename if not binary else BytesIO(binary)) page_txt = [] for page in self.pdf.pages[from_page:to_page]: @@ -124,7 +125,7 @@ class PlainPdf(PlainParser): return [(txt, None) for txt in page_txt], [] -def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, parser_config=None, **kwargs): +def chunk(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, parser_config=None, **kwargs): """ The supported file formats are pdf, ppt, pptx. Every page will be treated as a chunk. And the thumbnail of every page will be stored. 
@@ -139,7 +140,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca if re.search(r"\.pptx?$", filename, re.IGNORECASE): try: ppt_parser = RAGFlowPptParser() - for pn, txt in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)): + for pn, txt in enumerate(ppt_parser(filename if not binary else binary, from_page, MAXIMUM_PAGE_NUMBER, callback)): d = copy.deepcopy(doc) pn += from_page d["doc_type_kwd"] = "image" diff --git a/rag/app/qa.py b/rag/app/qa.py index da6d72cf73..8843c0a6e0 100644 --- a/rag/app/qa.py +++ b/rag/app/qa.py @@ -22,6 +22,7 @@ from io import BytesIO from timeit import default_timer as timer from openpyxl import load_workbook +from common.constants import MAXIMUM_PAGE_NUMBER from deepdoc.parser.utils import get_text from rag.nlp import is_english, random_choices, qbullets_category, add_positions, has_qbullet, docx_question_level from rag.nlp import rag_tokenizer, tokenize_table, concat_img @@ -77,7 +78,7 @@ class Excel(ExcelParser): class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, - to_page=100000, zoomin=3, callback=None): + to_page=MAXIMUM_PAGE_NUMBER, zoomin=3, callback=None): start = timer() callback(msg="OCR started") self.__images__( @@ -191,7 +192,7 @@ class Docx(DocxParser): def __init__(self): pass - def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None): + def __call__(self, filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, callback=None): self.doc = Document( filename) if not binary else Document(BytesIO(binary)) pn = 0 @@ -304,7 +305,7 @@ def mdQuestionLevel(s): return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s) -def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): +def chunk(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, **kwargs): """ Excel and csv(txt) format files are 
supported. If the file is in Excel format, there should be 2 column question and answer without header. @@ -449,7 +450,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca elif re.search(r"\.docx$", filename, re.IGNORECASE): docx_parser = Docx() qai_list, tbls = docx_parser(filename, binary, - from_page=0, to_page=10000, callback=callback) + from_page=0, to_page=MAXIMUM_PAGE_NUMBER, callback=callback) res = tokenize_table(tbls, doc, eng) for i, (q, a, image) in enumerate(qai_list): res.append(beAdocDocx(deepcopy(doc), q, a, eng, image, i)) diff --git a/rag/app/resume.py b/rag/app/resume.py index b1225e6a9e..a244c75219 100644 --- a/rag/app/resume.py +++ b/rag/app/resume.py @@ -40,6 +40,7 @@ from typing import Optional import numpy as np from common import settings +from common.constants import MAXIMUM_PAGE_NUMBER # tiktoken for long random string filtering (ref: SmartResume should_remove strategy) try: @@ -2465,7 +2466,7 @@ def _blackout_text_regions(image: "np.ndarray", meta_blocks: list[dict], page_id -def chunk(filename, binary, tenant_id, from_page=0, to_page=100000, +def chunk(filename, binary, tenant_id, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, **kwargs): """ Resume parsing entry function (compatible with task_executor.py) diff --git a/rag/app/table.py b/rag/app/table.py index acdd3b0df5..ea553ca0f9 100644 --- a/rag/app/table.py +++ b/rag/app/table.py @@ -30,6 +30,7 @@ from dateutil.parser import parse as datetime_parse from api.db.services.knowledgebase_service import KnowledgebaseService from deepdoc.parser.figure_parser import vision_figure_parser_figure_xlsx_wrapper +from common.constants import MAXIMUM_TASK_PAGE_NUMBER from deepdoc.parser.utils import get_text from rag.nlp import rag_tokenizer, tokenize, tokenize_table from deepdoc.parser import ExcelParser @@ -37,7 +38,7 @@ from common import settings class Excel(ExcelParser): - def __call__(self, fnm, binary=None, from_page=0, 
to_page=10000000000, callback=None, **kwargs): + def __call__(self, fnm, binary=None, from_page=0, to_page=MAXIMUM_TASK_PAGE_NUMBER, callback=None, **kwargs): if not binary: wb = Excel._load_excel_to_workbook(fnm) else: @@ -357,7 +358,7 @@ def column_data_type(arr): return arr, ty -def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese", callback=None, **kwargs): +def chunk(filename, binary=None, from_page=0, to_page=MAXIMUM_TASK_PAGE_NUMBER, lang="Chinese", callback=None, **kwargs): """ Excel and csv(txt) format files are supported. For csv or txt file, the delimiter between columns is TAB. diff --git a/test/testcases/test_http_api/test_chat_assistant_management/test_chat_sdk_routes_unit.py b/test/testcases/test_http_api/test_chat_assistant_management/test_chat_sdk_routes_unit.py index 9d72a63da6..a8d4f95cba 100644 --- a/test/testcases/test_http_api/test_chat_assistant_management/test_chat_sdk_routes_unit.py +++ b/test/testcases/test_http_api/test_chat_assistant_management/test_chat_sdk_routes_unit.py @@ -210,6 +210,10 @@ def _load_chat_module(monkeypatch): common_constants_mod.LLMType = _StubLLMType common_constants_mod.RetCode = _StubRetCode common_constants_mod.StatusEnum = _StubStatusEnum + # Import pure-Python constants from the real module (no heavy deps) + from common.constants import MAXIMUM_PAGE_NUMBER as _MPN, MAXIMUM_TASK_PAGE_NUMBER as _MTPN + common_constants_mod.MAXIMUM_PAGE_NUMBER = _MPN + common_constants_mod.MAXIMUM_TASK_PAGE_NUMBER = _MTPN monkeypatch.setitem(sys.modules, "common.constants", common_constants_mod) misc_utils_mod = ModuleType("common.misc_utils") diff --git a/test/testcases/test_http_api/test_session_management/test_session_sdk_routes_unit.py b/test/testcases/test_http_api/test_session_management/test_session_sdk_routes_unit.py index 53973614f8..f442db5196 100644 --- a/test/testcases/test_http_api/test_session_management/test_session_sdk_routes_unit.py +++ 
b/test/testcases/test_http_api/test_session_management/test_session_sdk_routes_unit.py @@ -245,6 +245,10 @@ def _load_session_module(monkeypatch): common_constants_mod.SVR_CONSUMER_GROUP_NAME = "rag_flow_svr_task_broker" common_constants_mod.PAGERANK_FLD = "pagerank_fea" common_constants_mod.TAG_FLD = "tag_feas" + # Import pure-Python constants from the real module (no heavy deps) + from common.constants import MAXIMUM_PAGE_NUMBER as _MPN, MAXIMUM_TASK_PAGE_NUMBER as _MTPN + common_constants_mod.MAXIMUM_PAGE_NUMBER = _MPN + common_constants_mod.MAXIMUM_TASK_PAGE_NUMBER = _MTPN monkeypatch.setitem(sys.modules, "common.constants", common_constants_mod) deepdoc_pkg = ModuleType("deepdoc")