From c446c403deb749e8e290de83bbf5f18d29f9a265 Mon Sep 17 00:00:00 2001 From: Zhichang Yu Date: Mon, 27 Apr 2026 16:52:43 +0800 Subject: [PATCH] perf: lazy img_np loading and chunked parse_into_bboxes for large PDFs (#14385) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - **Lazy img_np loading**: the page image is now converted to a NumPy array (`np.asarray(img)`, previously an unconditional `np.array(img)`) only when the first box without text needs an OCR crop, avoiding unnecessary memory allocation for pages whose boxes all already have text. - **Chunked parse_into_bboxes**: Page ranges longer than the batch size (default 50 pages, configurable via `PDF_PARSER_PAGE_BATCH_SIZE`) are processed in batches. Each chunk's boxes are normalized with `_to_global_boxes` to produce globally consistent page numbers and position tags. - **DLA early init**: Move remote-client initialization before model loading in `LayoutRecognizer.__init__` so `DEEPDOC_URL` (or legacy `TENSORRT_DLA_SVR`) short-circuits unnecessary model download for parser containers relying on remote inference. - **Fix outline regression**: Restore `self.outlines = extract_pdf_outlines(fnm)` in `parse_into_bboxes`; this was dropped during refactoring and is required by downstream `remove_toc` and metadata handling in `rag/flow/parser/parser.py`. 
## Test plan - [ ] Small PDF (<=50 pages): verify parse succeeds and `self.outlines` is populated - [ ] Large PDF (>50 pages): verify chunked processing produces globally consistent page numbers - [ ] With `DEEPDOC_URL` set: verify remote DLA client is used and local model is not downloaded - [ ] With legacy `TENSORRT_DLA_SVR` set: verify backward compatibility 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.7 --- deepdoc/parser/pdf_parser.py | 81 +++++++++++++++++++++++++---- deepdoc/vision/layout_recognizer.py | 19 ++++--- 2 files changed, 82 insertions(+), 18 deletions(-) diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index d1aebef1f3..3a5bd16627 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -774,9 +774,11 @@ class RAGFlowPdfParser: logging.info(f"__ocr sorting {len(chars)} chars cost {timer() - start}s") start = timer() boxes_to_reg = [] - img_np = np.array(img) + img_np = None for b in bxs: if not b["text"]: + if img_np is None: + img_np = np.asarray(img) left, right, top, bott = b["x0"] * ZM, b["x1"] * ZM, b["top"] * ZM, b["bottom"] * ZM b["box_image"] = self.ocr.get_rotate_crop_image(img_np, np.array([[left, top], [right, top], [right, bott], [left, bott]], dtype=np.float32)) boxes_to_reg.append(b) @@ -1696,18 +1698,51 @@ class RAGFlowPdfParser: return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls def parse_into_bboxes(self, fnm, callback=None, zoomin=3, from_page=0, to_page=MAXIMUM_PAGE_NUMBER): - start = timer() self.outlines = extract_pdf_outlines(fnm) - self.__images__(fnm, zoomin, from_page, to_page, callback=callback) - if callback: - callback(0.40, "OCR finished ({:.2f}s)".format(timer() - start)) + batch_size = max(1, int(os.getenv("PDF_PARSER_PAGE_BATCH_SIZE", "50"))) + if isinstance(fnm, str): + total_pages = self.total_page_number(fnm) + else: + total_pages = self.total_page_number(fnm, binary=fnm) + if total_pages is 
None: + effective_to_page = to_page + logging.warning( + "parse_into_bboxes: total_page_number returned None; using caller-supplied to_page=%s", + to_page, + ) + else: + effective_to_page = min(to_page, total_pages) + + if effective_to_page - from_page <= batch_size: + self.__images__(fnm, zoomin, page_from=from_page, page_to=effective_to_page, callback=callback) + return self._parse_loaded_window_into_bboxes(zoomin, callback=callback) + + logging.info( + "parse_into_bboxes uses chunk mode: from_page=%s, effective_to_page=%s, batch_size=%s", + from_page, + effective_to_page, + batch_size, + ) + all_boxes = [] + start = timer() + for page_from in range(from_page, effective_to_page, batch_size): + page_to = min(page_from + batch_size, effective_to_page) + self.__images__(fnm, zoomin, page_from=page_from, page_to=page_to, callback=None) + chunk_boxes = self._parse_loaded_window_into_bboxes(zoomin) + all_boxes.extend(self._to_global_boxes(chunk_boxes)) + if callback: + callback((page_to - from_page) / max(1, effective_to_page - from_page), f"Structured: {page_to}/{effective_to_page} pages") + + logging.info("parse_into_bboxes chunk mode cost %.2fs", timer() - start) + return all_boxes + + def _parse_loaded_window_into_bboxes(self, zoomin=3, callback=None): start = timer() self._layouts_rec(zoomin) if callback: callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start)) - # Read table auto-rotation setting from environment variable auto_rotate_tables = os.getenv("TABLE_AUTO_ROTATE", "true").lower() in ("true", "1", "yes") start = timer() @@ -1743,13 +1778,9 @@ class RAGFlowPdfParser: dy = top1 - bottom2 else: dy = 0 - return math.sqrt(dx * dx + dy * dy) # + (pn2-pn1)*10000 + return math.sqrt(dx * dx + dy * dy) for (img, txt), poss in tbls_or_figs: - # Positions coming from _extract_table_figure carry absolute 0-based page - # indices (page_from offset). 
Convert back to chunk-local indices so we - # stay consistent with self.boxes/page_cum_height, which are all relative - # to the current parsing window. local_poss = [] for pn, left, right, top, bott in poss: local_pn = pn - self.page_from @@ -1805,6 +1836,34 @@ class RAGFlowPdfParser: callback(1, "Structured ({:.2f}s)".format(timer() - start)) return deepcopy(self.boxes) + @staticmethod + def _offset_position_tag(text, page_offset): + if not text or page_offset <= 0: + return text + + def _replace(match): + pages = [str(int(p) + page_offset) for p in match.group(1).split("-")] + return f"@@{'-'.join(pages)}\t" + + return re.sub(r"@@([0-9-]+)\t", _replace, text) + + def _to_global_boxes(self, boxes): + if self.page_from <= 0: + return boxes + + for box in boxes: + box["page_number"] = int(box.get("page_number", 1)) + self.page_from + if isinstance(box.get("position_tag"), str): + box["position_tag"] = self._offset_position_tag(box["position_tag"], self.page_from) + if isinstance(box.get("positions"), list): + box["positions"] = [ + [int(pos[0]) + self.page_from, *pos[1:]] + if isinstance(pos, list) and len(pos) > 0 and isinstance(pos[0], (int, float)) + else pos + for pos in box["positions"] + ] + return boxes + @staticmethod def remove_tag(txt): return re.sub(r"@@[\t0-9.-]+?##", "", txt) diff --git a/deepdoc/vision/layout_recognizer.py b/deepdoc/vision/layout_recognizer.py index be1f8667ce..9befbe2936 100644 --- a/deepdoc/vision/layout_recognizer.py +++ b/deepdoc/vision/layout_recognizer.py @@ -46,6 +46,18 @@ class LayoutRecognizer(Recognizer): ] def __init__(self, domain): + self.garbage_layouts = ["footer", "header", "reference"] + self.client = None + + dla_url = os.environ.get("DEEPDOC_URL") or os.environ.get("TENSORRT_DLA_SVR") + if dla_url: + from deepdoc.vision.dla_cli import DLAClient + + self.client = DLAClient(dla_url) + env_used = "DEEPDOC_URL" if os.environ.get("DEEPDOC_URL") else "TENSORRT_DLA_SVR" + logging.info(f"LayoutRecognizer using remote DLA 
client at {dla_url} (via {env_used})") + return + try: model_dir = os.path.join(get_project_base_directory(), "rag/res/deepdoc") super().__init__(self.labels, domain, model_dir) @@ -53,13 +65,6 @@ class LayoutRecognizer(Recognizer): model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc", local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"), local_dir_use_symlinks=False) super().__init__(self.labels, domain, model_dir) - self.garbage_layouts = ["footer", "header", "reference"] - self.client = None - if os.environ.get("TENSORRT_DLA_SVR"): - from deepdoc.vision.dla_cli import DLAClient - - self.client = DLAClient(os.environ["TENSORRT_DLA_SVR"]) - def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16, drop=True): def __is_garbage(b): patt = [r"\(cid\s*:\s*\d+\s*\)"]