From d0ca388bece8e0d86cbf7833e0e72aa2459cccad Mon Sep 17 00:00:00 2001
From: eviaaaaa <2278596667@qq.com>
Date: Wed, 11 Mar 2026 10:00:07 +0800
Subject: [PATCH] Refa: implement unified lazy image loading for Docx parsers (qa/manual) (#13329)

## Summary

This PR is the direct successor to the previous `docx` lazy-loading implementation. It addresses the technical debt intentionally deferred in the last PR by fully migrating the `qa` and `manual` parsing strategies to the new lazy-loading model.

Additionally, this PR comprehensively refactors the underlying `docx` parsing pipeline to eliminate significant code redundancy and introduces robust fallback mechanisms to handle completely corrupted image streams safely.

## What's Changed

* **Centralized Abstraction (`docx_parser.py`)**: Moved the `get_picture` extraction logic up to the `RAGFlowDocxParser` base class. Previously, the `naive`, `qa`, and `manual` parsers maintained separate, redundant copies of this method. All downstream strategies now natively gather raw blobs and return `LazyDocxImage` objects automatically.
* **Robust Corrupted Image Fallback (`docx_parser.py`)**: Handled edge cases where `python-docx` encounters critically malformed magic headers. Implemented an explicit `try-except` structure that safely intercepts `UnrecognizedImageError` (and similar exceptions) and seamlessly falls back to retrieving the raw binary via `getattr(related_part, "blob", None)`, preventing parser crashes on damaged documents.
* **Legacy Code & Redundancy Purge**:
    * Removed the duplicate `get_picture` methods from `naive.py`, `qa.py`, and `manual.py`.
    * Removed the standalone, immediate-decoding `concat_img` method in `manual.py`. It has been completely replaced by the globally unified, lazy-loading-compatible `rag.nlp.concat_img`.
    * Cleaned up unused legacy imports (e.g., `PIL.Image`, docx exception packages) across all updated strategy files.

## Scope To keep this PR focused, I have restricted these changes strictly to the unification of `docx` extraction logic and the lazy-load migration of `qa` and `manual`. ## Validation & Testing I've tested this to ensure no regressions and validated the fallback logic: * **Output Consistency**: Compared identical `.docx` inputs using `qa` and `manual` strategies before and after this branch: chunk counts, extracted text, table HTML, and attached images match perfectly. * **Memory Footprint Drop**: Confirmed a noticeable drop in peak memory usage when processing image-dense documents through the `qa` and `manual` pipelines, bringing them up to parity with the `naive` strategy's performance gains. ## Breaking Changes * None. --- deepdoc/parser/docx_parser.py | 47 ++++++++++++++++++++++++++++++++++- rag/app/manual.py | 44 ++------------------------------ rag/app/naive.py | 36 --------------------------- rag/app/qa.py | 12 --------- rag/nlp/__init__.py | 15 +++++++++-- rag/utils/lazy_image.py | 13 ++++++++++ 6 files changed, 74 insertions(+), 93 deletions(-) diff --git a/deepdoc/parser/docx_parser.py b/deepdoc/parser/docx_parser.py index 2a65841e24..a17543cbf4 100644 --- a/deepdoc/parser/docx_parser.py +++ b/deepdoc/parser/docx_parser.py @@ -20,9 +20,54 @@ import pandas as pd from collections import Counter from rag.nlp import rag_tokenizer from io import BytesIO - +import logging +from docx.image.exceptions import ( + InvalidImageStreamError, + UnexpectedEndOfFileError, + UnrecognizedImageError, +) +from rag.utils.lazy_image import LazyDocxImage class RAGFlowDocxParser: + def get_picture(self, document, paragraph): + imgs = paragraph._element.xpath(".//pic:pic") + if not imgs: + return None + image_blobs = [] + for img in imgs: + embed = img.xpath(".//a:blip/@r:embed") + if not embed: + continue + embed = embed[0] + image_blob = None + try: + related_part = document.part.related_parts[embed] + except Exception as e: + logging.warning(f"Skipping image due to 
unexpected error getting related_part: {e}") + continue + + try: + image = related_part.image + if image is not None: + image_blob = image.blob + except ( + UnrecognizedImageError, + UnexpectedEndOfFileError, + InvalidImageStreamError, + UnicodeDecodeError, + ) as e: + logging.info(f"Damaged image encountered, attempting blob fallback: {e}") + except Exception as e: + logging.warning(f"Unexpected error getting image, attempting blob fallback: {e}") + + if image_blob is None: + image_blob = getattr(related_part, "blob", None) + if image_blob: + image_blobs.append(image_blob) + if not image_blobs: + return None + return LazyDocxImage(image_blobs) + def __extract_table_content(self, tb): df = [] diff --git a/rag/app/manual.py b/rag/app/manual.py index 5f3b587920..e2af0706f2 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -20,12 +20,11 @@ import re from common.constants import ParserType from io import BytesIO -from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context +from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context, concat_img from common.token_utils import num_tokens_from_string from deepdoc.parser import PdfParser, DocxParser from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper, vision_figure_parser_docx_wrapper from docx import Document -from PIL import Image from rag.app.naive import by_plaintext, PARSERS from common.parser_config_utils import normalize_layout_recognizer @@ -71,45 +70,6 @@ class Docx(DocxParser): def __init__(self): pass - def get_picture(self, document, paragraph): - img = paragraph._element.xpath(".//pic:pic") - if not img: - return None - try: - img = img[0] - embed = img.xpath(".//a:blip/@r:embed")[0] - related_part = document.part.related_parts[embed] - image = related_part.image - if image is not None: - image 
= Image.open(BytesIO(image.blob)) - return image - elif related_part.blob is not None: - image = Image.open(BytesIO(related_part.blob)) - return image - else: - return None - except Exception: - return None - - def concat_img(self, img1, img2): - if img1 and not img2: - return img1 - if not img1 and img2: - return img2 - if not img1 and not img2: - return None - width1, height1 = img1.size - width2, height2 = img2.size - - new_width = max(width1, width2) - new_height = height1 + height2 - new_image = Image.new("RGB", (new_width, new_height)) - - new_image.paste(img1, (0, 0)) - new_image.paste(img2, (0, height1)) - - return new_image - def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None): self.doc = Document(filename) if not binary else Document(BytesIO(binary)) pn = 0 @@ -125,7 +85,7 @@ class Docx(DocxParser): if not question_level or question_level > 6: # not a question last_answer = f"{last_answer}\n{p_text}" current_image = self.get_picture(self.doc, p) - last_image = self.concat_img(last_image, current_image) + last_image = concat_img(last_image, current_image) else: # is a question if last_answer or last_image: sum_question = "\n".join(question_stack) diff --git a/rag/app/naive.py b/rag/app/naive.py index 0620190744..1d2d0ebbf7 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -21,7 +21,6 @@ from functools import reduce from io import BytesIO from timeit import default_timer as timer from docx import Document -from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship from docx.table import Table as DocxTable from docx.text.paragraph import Paragraph @@ -34,7 +33,6 @@ from common.constants import LLMType from api.db.services.llm_service import LLMBundle from api.db.joint_services.tenant_model_service import get_model_config_by_type_and_name, get_tenant_default_model_by_type from 
rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html -from rag.utils.lazy_image import LazyDocxImage from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, vision_figure_parser_pdf_wrapper from deepdoc.parser.pdf_parser import PlainParser, VisionParser @@ -265,40 +263,6 @@ class Docx(DocxParser): def __init__(self): pass - def get_picture(self, document, paragraph): - imgs = paragraph._element.xpath(".//pic:pic") - if not imgs: - return None - image_blobs = [] - for img in imgs: - embed = img.xpath(".//a:blip/@r:embed") - if not embed: - continue - embed = embed[0] - try: - related_part = document.part.related_parts[embed] - image_blob = related_part.image.blob - except UnrecognizedImageError: - logging.info("Unrecognized image format. Skipping image.") - continue - except UnexpectedEndOfFileError: - logging.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.") - continue - except InvalidImageStreamError: - logging.info("The recognized image stream appears to be corrupted. Skipping image.") - continue - except UnicodeDecodeError: - logging.info("The recognized image stream appears to be corrupted. Skipping image.") - continue - except Exception as e: - logging.warning(f"The recognized image stream appears to be corrupted. 
Skipping image, exception: {e}") - continue - image_blobs.append(image_blob) - - if not image_blobs: - return None - return LazyDocxImage(image_blobs) - def __clean(self, line): line = re.sub(r"\u3000", " ", line).strip() return line diff --git a/rag/app/qa.py b/rag/app/qa.py index 95678faaa2..da6d72cf73 100644 --- a/rag/app/qa.py +++ b/rag/app/qa.py @@ -27,7 +27,6 @@ from rag.nlp import is_english, random_choices, qbullets_category, add_positions from rag.nlp import rag_tokenizer, tokenize_table, concat_img from deepdoc.parser import PdfParser, ExcelParser, DocxParser from docx import Document -from PIL import Image from markdown import markdown from common.float_utils import get_float @@ -192,17 +191,6 @@ class Docx(DocxParser): def __init__(self): pass - def get_picture(self, document, paragraph): - img = paragraph._element.xpath('.//pic:pic') - if not img: - return None - img = img[0] - embed = img.xpath('.//a:blip/@r:embed')[0] - related_part = document.part.related_parts[embed] - image = related_part.image - image = Image.open(BytesIO(image.blob)).convert('RGB') - return image - def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None): self.doc = Document( filename) if not binary else Document(BytesIO(binary)) diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index 364e953881..be1cef05b9 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -1200,7 +1200,7 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。 def docx_question_level(p, bull=-1): txt = re.sub(r"\u3000", " ", p.text).strip() - if p.style.name.startswith('Heading'): + if hasattr(p.style, 'name') and p.style.name and p.style.name.startswith('Heading'): return int(p.style.name.split(' ')[-1]), txt else: if bull < 0: @@ -1212,7 +1212,18 @@ def docx_question_level(p, bull=-1): def concat_img(img1, img2): - from rag.utils.lazy_image import ensure_pil_image + from rag.utils.lazy_image import ensure_pil_image, LazyDocxImage + + # 
Fast path: preserve laziness when both sides are LazyDocxImage or None. + if (img1 is None or isinstance(img1, LazyDocxImage)) and \ + (img2 is None or isinstance(img2, LazyDocxImage)): + if img1 and not img2: + return img1 + if not img1 and img2: + return img2 + if not img1 and not img2: + return None + return LazyDocxImage.merge(img1, img2) img1 = ensure_pil_image(img1) or img1 img2 = ensure_pil_image(img2) or img2 diff --git a/rag/utils/lazy_image.py b/rag/utils/lazy_image.py index 7de2bfd5ce..c120bef913 100644 --- a/rag/utils/lazy_image.py +++ b/rag/utils/lazy_image.py @@ -88,6 +88,19 @@ class LazyDocxImage: self.close() return False + @staticmethod + def merge(a, b): + """ + Merge two LazyDocxImage instances by combining their blob lists. + """ + a_blobs = a._blobs if isinstance(a, LazyDocxImage) else [] + b_blobs = b._blobs if isinstance(b, LazyDocxImage) else [] + combined = a_blobs + b_blobs + if not combined: + return None + merged = LazyDocxImage(combined) + return merged + def ensure_pil_image(img): if isinstance(img, Image.Image):