From d32967eda864bf44b3fbd1d5abb0d4c3b2615253 Mon Sep 17 00:00:00 2001 From: Stephen Hu <812791840@qq.com> Date: Mon, 23 Mar 2026 21:24:40 +0800 Subject: [PATCH] refactor: let excel use lazy image loader (#13558) ### What problem does this PR solve? let excel use lazy image loader ### Type of change - [x] Refactoring --------- Co-authored-by: Yingfeng --- deepdoc/parser/docx_parser.py | 4 ++-- deepdoc/parser/excel_parser.py | 6 +++--- deepdoc/parser/figure_parser.py | 2 +- rag/app/book.py | 4 ++-- rag/app/table.py | 2 +- rag/nlp/__init__.py | 9 ++++----- rag/utils/lazy_image.py | 21 ++++++++++++--------- 7 files changed, 25 insertions(+), 23 deletions(-) diff --git a/deepdoc/parser/docx_parser.py b/deepdoc/parser/docx_parser.py index a17543cbf4..0257a320f7 100644 --- a/deepdoc/parser/docx_parser.py +++ b/deepdoc/parser/docx_parser.py @@ -26,7 +26,7 @@ from docx.image.exceptions import ( UnexpectedEndOfFileError, UnrecognizedImageError, ) -from rag.utils.lazy_image import LazyDocxImage +from rag.utils.lazy_image import LazyImage class RAGFlowDocxParser: def get_picture(self, document, paragraph): @@ -66,7 +66,7 @@ class RAGFlowDocxParser: image_blobs.append(image_blob) if not image_blobs: return None - return LazyDocxImage(image_blobs) + return LazyImage(image_blobs) def __extract_table_content(self, tb): diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py index b75d31f6a4..acbd98f228 100644 --- a/deepdoc/parser/excel_parser.py +++ b/deepdoc/parser/excel_parser.py @@ -18,9 +18,9 @@ from io import BytesIO import pandas as pd from openpyxl import Workbook, load_workbook -from PIL import Image from rag.nlp import find_codec +from rag.utils.lazy_image import LazyImage # copied from `/openpyxl/cell/cell.py` ILLEGAL_CHARACTERS_RE = re.compile(r"[\000-\010]|[\013-\014]|[\016-\037]") @@ -122,7 +122,7 @@ class RAGFlowExcelParser: for img in images: try: img_bytes = img._data() - pil_img = Image.open(BytesIO(img_bytes)).convert("RGB") + lazy_img = LazyImage([img_bytes]) anchor = img.anchor if hasattr(anchor, "_from") and hasattr(anchor, "_to"): @@ -139,7 +139,7 @@ class RAGFlowExcelParser: item = { "sheet": sheetname or ws.title, - "image": pil_img, + "image": lazy_img, "image_description": "", "row_from": r1, "col_from": c1, diff --git a/deepdoc/parser/figure_parser.py b/deepdoc/parser/figure_parser.py index 3b85e5648f..cd0fa51f51 100644 --- a/deepdoc/parser/figure_parser.py +++ b/deepdoc/parser/figure_parser.py @@ -75,7 +75,7 @@ def vision_figure_parser_figure_xlsx_wrapper(images,callback=None, **kwargs): vision_model = None if vision_model: figures_data = [(( - img["image"], # Image.Image + img["image"], # Image.Image or LazyImage (converted by ensure_pil_image) [img["image_description"]] # description list (must be list) ), [ diff --git a/rag/app/book.py b/rag/app/book.py index 847aec50a3..b3af3ed9dc 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -27,7 +27,7 @@ from rag.nlp import rag_tokenizer from deepdoc.parser import PdfParser, HtmlParser from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper from PIL import Image -from rag.utils.lazy_image import LazyDocxImage +from rag.utils.lazy_image import LazyImage class Pdf(PdfParser): @@ -89,7 +89,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca sections = [ (item[0], item[1] if item[1] is not None else "") for item in sections - if not isinstance(item[1], (Image.Image, LazyDocxImage)) + if not isinstance(item[1], (Image.Image, LazyImage)) ] callback(0.8, "Finish parsing.") diff --git a/rag/app/table.py b/rag/app/table.py index f521ab23d6..acdd3b0df5 100644 --- a/rag/app/table.py +++ b/rag/app/table.py @@ -115,7 +115,7 @@ class Excel(ExcelParser): tables.append( ( ( - img["image"], # Image.Image + img["image"], # Image.Image or LazyImage [img["image_description"]] # description list (must be list) ), [ diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index be1cef05b9..f95cc8266b 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -1212,18 +1212,17 @@ def docx_question_level(p, bull=-1): def concat_img(img1, img2): - from rag.utils.lazy_image import ensure_pil_image, LazyDocxImage + from rag.utils.lazy_image import ensure_pil_image, LazyImage - # Fast path: preserve laziness when both sides are LazyDocxImage or None. - if (img1 is None or isinstance(img1, LazyDocxImage)) and \ - (img2 is None or isinstance(img2, LazyDocxImage)): + if (img1 is None or isinstance(img1, LazyImage)) and \ + (img2 is None or isinstance(img2, LazyImage)): if img1 and not img2: return img1 if not img1 and img2: return img2 if not img1 and not img2: return None - return LazyDocxImage.merge(img1, img2) + return LazyImage.merge(img1, img2) img1 = ensure_pil_image(img1) or img1 img2 = ensure_pil_image(img2) or img2 diff --git a/rag/utils/lazy_image.py b/rag/utils/lazy_image.py index c120bef913..f2164e7c51 100644 --- a/rag/utils/lazy_image.py +++ b/rag/utils/lazy_image.py @@ -6,7 +6,7 @@ from PIL import Image from rag.nlp import concat_img -class LazyDocxImage: +class LazyImage: def __init__(self, blobs, source=None): self._blobs = [b for b in (blobs or []) if b] self.source = source @@ -31,7 +31,7 @@ class LazyDocxImage: try: image = Image.open(BytesIO(blob)).convert("RGB") except Exception as e: - logging.info(f"LazyDocxImage: skip bad image blob: {e}") + logging.info(f"LazyImage: skip bad image blob: {e}") continue if res_img is None: @@ -91,33 +91,36 @@ class LazyDocxImage: @staticmethod def merge(a, b): """ - Merge two LazyDocxImage instances by combining their blob lists. + Merge two LazyImage instances by combining their blob lists. """ - a_blobs = a._blobs if isinstance(a, LazyDocxImage) else [] - b_blobs = b._blobs if isinstance(b, LazyDocxImage) else [] + a_blobs = a._blobs if isinstance(a, LazyImage) else [] + b_blobs = b._blobs if isinstance(b, LazyImage) else [] combined = a_blobs + b_blobs if not combined: return None - merged = LazyDocxImage(combined) + merged = LazyImage(combined) return merged +LazyDocxImage = LazyImage + + def ensure_pil_image(img): if isinstance(img, Image.Image): return img - if isinstance(img, LazyDocxImage): + if isinstance(img, LazyImage): return img.to_pil() return None def is_image_like(img): - return isinstance(img, Image.Image) or isinstance(img, LazyDocxImage) + return isinstance(img, Image.Image) or isinstance(img, LazyImage) def open_image_for_processing(img, *, allow_bytes=False): if isinstance(img, Image.Image): return img, False - if isinstance(img, LazyDocxImage): + if isinstance(img, LazyImage): return img.to_pil_detached(), True if allow_bytes and isinstance(img, (bytes, bytearray)): try: