From d0ca388bece8e0d86cbf7833e0e72aa2459cccad Mon Sep 17 00:00:00 2001
From: eviaaaaa <2278596667@qq.com>
Date: Wed, 11 Mar 2026 10:00:07 +0800
Subject: [PATCH] Refa: implement unified lazy image loading for Docx parsers
 (qa/manual) (#13329)

## Summary
This PR is the direct successor to the previous `docx` lazy-loading
implementation. It pays down the technical debt intentionally deferred
in that PR by fully migrating the `qa` and `manual` parsing
strategies to the new lazy-loading model.

Additionally, this PR comprehensively refactors the underlying `docx`
parsing pipeline to eliminate significant code redundancy and introduces
robust fallback mechanisms to handle completely corrupted image streams
safely.


## What's Changed

* **Centralized Abstraction (`docx_parser.py`)**: Moved the
`get_picture` extraction logic up to the `RAGFlowDocxParser` base class.
Previously, `naive`, `qa`, and `manual` parsers maintained separate,
redundant copies of this method. All downstream strategies now natively
gather raw blobs and return `LazyDocxImage` objects automatically.
* **Robust Corrupted Image Fallback (`docx_parser.py`)**: Handles edge
cases where `python-docx` cannot parse an embedded image (e.g. a
critically malformed magic header). An explicit `try-except` structure
intercepts `UnrecognizedImageError`, `UnexpectedEndOfFileError`,
`InvalidImageStreamError`, and `UnicodeDecodeError` (logged at info
level), as well as any other exception (logged as a warning), and falls
back to retrieving the raw binary via `getattr(related_part, "blob",
None)`, preventing parser crashes on damaged documents. Images whose
relationship cannot be resolved are skipped with a warning.

* **Legacy Code & Redundancy Purge**:
  * Removed the duplicate `get_picture` methods from `naive.py`, `qa.py`,
    and `manual.py`.
  * Removed the standalone, immediate-decoding `concat_img` method in
    `manual.py`. It has been completely replaced by the globally unified
    `rag.nlp.concat_img`, which now preserves laziness: when both inputs
    are `LazyDocxImage` (or `None`), their blob lists are combined via
    the new `LazyDocxImage.merge` instead of being decoded.
  * Cleaned up unused legacy imports (e.g., `PIL.Image` and the
    `docx.image.exceptions` classes) across all updated strategy files.

## Scope
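To keep this PR focused, I have restricted these changes to the
unification of `docx` extraction logic and the lazy-load migration of
`qa` and `manual`. The only incidental change is a guard in
`docx_question_level` for paragraphs whose style has no usable `name`.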

## Validation & Testing
I've tested this to ensure no regressions and validated the fallback
logic:

* **Output Consistency**: Compared identical `.docx` inputs using `qa`
and `manual` strategies before and after this branch: chunk counts,
extracted text, table HTML, and attached images match perfectly.
* **Memory Footprint Drop**: Confirmed a noticeable drop in peak memory
usage when processing image-dense documents through the `qa` and
`manual` pipelines, bringing them up to parity with the `naive`
strategy's performance gains.

## Breaking Changes
* None.
---
 deepdoc/parser/docx_parser.py | 47 ++++++++++++++++++++++++++++++++++-
 rag/app/manual.py             | 44 ++------------------------------
 rag/app/naive.py              | 36 ---------------------------
 rag/app/qa.py                 | 12 ---------
 rag/nlp/__init__.py           | 15 +++++++++--
 rag/utils/lazy_image.py       | 13 ++++++++++
 6 files changed, 74 insertions(+), 93 deletions(-)

diff --git a/deepdoc/parser/docx_parser.py b/deepdoc/parser/docx_parser.py
index 2a65841e24..a17543cbf4 100644
--- a/deepdoc/parser/docx_parser.py
+++ b/deepdoc/parser/docx_parser.py
@@ -20,9 +20,54 @@ import pandas as pd
 from collections import Counter
 from rag.nlp import rag_tokenizer
 from io import BytesIO
-
+import logging
+from docx.image.exceptions import (
+    InvalidImageStreamError,
+    UnexpectedEndOfFileError,
+    UnrecognizedImageError,
+)
+from rag.utils.lazy_image import LazyDocxImage
 
 class RAGFlowDocxParser:
+    def get_picture(self, document, paragraph):
+        imgs = paragraph._element.xpath(".//pic:pic")
+        if not imgs:
+            return None
+        image_blobs = []
+        for img in imgs:
+            embed = img.xpath(".//a:blip/@r:embed")
+            if not embed:
+                continue
+            embed = embed[0]
+            image_blob = None
+            try:
+                related_part = document.part.related_parts[embed]
+            except Exception as e:
+                logging.warning(f"Skipping image due to unexpected error getting related_part: {e}")
+                continue
+
+            try:
+                image = related_part.image
+                if image is not None:
+                    image_blob = image.blob
+            except (
+                UnrecognizedImageError,
+                UnexpectedEndOfFileError,
+                InvalidImageStreamError,
+                UnicodeDecodeError,
+            ) as e:
+                logging.info(f"Damaged image encountered, attempting blob fallback: {e}")
+            except Exception as e:
+                logging.warning(f"Unexpected error getting image, attempting blob fallback: {e}")
+
+            if image_blob is None:
+                image_blob = getattr(related_part, "blob", None)
+            if image_blob:
+                image_blobs.append(image_blob)
+        if not image_blobs:
+            return None
+        return LazyDocxImage(image_blobs)
+
 
     def __extract_table_content(self, tb):
         df = []
diff --git a/rag/app/manual.py b/rag/app/manual.py
index 5f3b587920..e2af0706f2 100644
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -20,12 +20,11 @@ import re
 
 from common.constants import ParserType
 from io import BytesIO
-from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context, concat_img
 from common.token_utils import num_tokens_from_string
 from deepdoc.parser import PdfParser, DocxParser
 from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper, vision_figure_parser_docx_wrapper
 from docx import Document
-from PIL import Image
 from rag.app.naive import by_plaintext, PARSERS
 from common.parser_config_utils import normalize_layout_recognizer
 
@@ -71,45 +70,6 @@ class Docx(DocxParser):
     def __init__(self):
         pass
 
-    def get_picture(self, document, paragraph):
-        img = paragraph._element.xpath(".//pic:pic")
-        if not img:
-            return None
-        try:
-            img = img[0]
-            embed = img.xpath(".//a:blip/@r:embed")[0]
-            related_part = document.part.related_parts[embed]
-            image = related_part.image
-            if image is not None:
-                image = Image.open(BytesIO(image.blob))
-                return image
-            elif related_part.blob is not None:
-                image = Image.open(BytesIO(related_part.blob))
-                return image
-            else:
-                return None
-        except Exception:
-            return None
-
-    def concat_img(self, img1, img2):
-        if img1 and not img2:
-            return img1
-        if not img1 and img2:
-            return img2
-        if not img1 and not img2:
-            return None
-        width1, height1 = img1.size
-        width2, height2 = img2.size
-
-        new_width = max(width1, width2)
-        new_height = height1 + height2
-        new_image = Image.new("RGB", (new_width, new_height))
-
-        new_image.paste(img1, (0, 0))
-        new_image.paste(img2, (0, height1))
-
-        return new_image
-
     def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
         self.doc = Document(filename) if not binary else Document(BytesIO(binary))
         pn = 0
@@ -125,7 +85,7 @@ class Docx(DocxParser):
             if not question_level or question_level > 6:  # not a question
                 last_answer = f"{last_answer}\n{p_text}"
                 current_image = self.get_picture(self.doc, p)
-                last_image = self.concat_img(last_image, current_image)
+                last_image = concat_img(last_image, current_image)
             else:  # is a question
                 if last_answer or last_image:
                     sum_question = "\n".join(question_stack)
diff --git a/rag/app/naive.py b/rag/app/naive.py
index 0620190744..1d2d0ebbf7 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -21,7 +21,6 @@ from functools import reduce
 from io import BytesIO
 from timeit import default_timer as timer
 from docx import Document
-from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
 from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
 from docx.table import Table as DocxTable
 from docx.text.paragraph import Paragraph
@@ -34,7 +33,6 @@ from common.constants import LLMType
 from api.db.services.llm_service import LLMBundle
 from api.db.joint_services.tenant_model_service import get_model_config_by_type_and_name, get_tenant_default_model_by_type
 from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
-from rag.utils.lazy_image import LazyDocxImage
 from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
 from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, vision_figure_parser_pdf_wrapper
 from deepdoc.parser.pdf_parser import PlainParser, VisionParser
@@ -265,40 +263,6 @@ class Docx(DocxParser):
     def __init__(self):
         pass
 
-    def get_picture(self, document, paragraph):
-        imgs = paragraph._element.xpath(".//pic:pic")
-        if not imgs:
-            return None
-        image_blobs = []
-        for img in imgs:
-            embed = img.xpath(".//a:blip/@r:embed")
-            if not embed:
-                continue
-            embed = embed[0]
-            try:
-                related_part = document.part.related_parts[embed]
-                image_blob = related_part.image.blob
-            except UnrecognizedImageError:
-                logging.info("Unrecognized image format. Skipping image.")
-                continue
-            except UnexpectedEndOfFileError:
-                logging.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
-                continue
-            except InvalidImageStreamError:
-                logging.info("The recognized image stream appears to be corrupted. Skipping image.")
-                continue
-            except UnicodeDecodeError:
-                logging.info("The recognized image stream appears to be corrupted. Skipping image.")
-                continue
-            except Exception as e:
-                logging.warning(f"The recognized image stream appears to be corrupted. Skipping image, exception: {e}")
-                continue
-            image_blobs.append(image_blob)
-
-        if not image_blobs:
-            return None
-        return LazyDocxImage(image_blobs)
-
     def __clean(self, line):
         line = re.sub(r"\u3000", " ", line).strip()
         return line
diff --git a/rag/app/qa.py b/rag/app/qa.py
index 95678faaa2..da6d72cf73 100644
--- a/rag/app/qa.py
+++ b/rag/app/qa.py
@@ -27,7 +27,6 @@ from rag.nlp import is_english, random_choices, qbullets_category, add_positions
 from rag.nlp import rag_tokenizer, tokenize_table, concat_img
 from deepdoc.parser import PdfParser, ExcelParser, DocxParser
 from docx import Document
-from PIL import Image
 from markdown import markdown
 
 from common.float_utils import get_float
@@ -192,17 +191,6 @@ class Docx(DocxParser):
     def __init__(self):
         pass
 
-    def get_picture(self, document, paragraph):
-        img = paragraph._element.xpath('.//pic:pic')
-        if not img:
-            return None
-        img = img[0]
-        embed = img.xpath('.//a:blip/@r:embed')[0]
-        related_part = document.part.related_parts[embed]
-        image = related_part.image
-        image = Image.open(BytesIO(image.blob)).convert('RGB')
-        return image
-
     def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
         self.doc = Document(
             filename) if not binary else Document(BytesIO(binary))
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index 364e953881..be1cef05b9 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -1200,7 +1200,7 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
 
 def docx_question_level(p, bull=-1):
     txt = re.sub(r"\u3000", " ", p.text).strip()
-    if p.style.name.startswith('Heading'):
+    if hasattr(p.style, 'name') and p.style.name and p.style.name.startswith('Heading'):
         return int(p.style.name.split(' ')[-1]), txt
     else:
         if bull < 0:
@@ -1212,7 +1212,18 @@ def docx_question_level(p, bull=-1):
 
 
 def concat_img(img1, img2):
-    from rag.utils.lazy_image import ensure_pil_image
+    from rag.utils.lazy_image import ensure_pil_image, LazyDocxImage
+
+    # Fast path: preserve laziness when both sides are LazyDocxImage or None.
+    if (img1 is None or isinstance(img1, LazyDocxImage)) and \
+       (img2 is None or isinstance(img2, LazyDocxImage)):
+        if img1 and not img2:
+            return img1
+        if not img1 and img2:
+            return img2
+        if not img1 and not img2:
+            return None
+        return LazyDocxImage.merge(img1, img2)
 
     img1 = ensure_pil_image(img1) or img1
     img2 = ensure_pil_image(img2) or img2
diff --git a/rag/utils/lazy_image.py b/rag/utils/lazy_image.py
index 7de2bfd5ce..c120bef913 100644
--- a/rag/utils/lazy_image.py
+++ b/rag/utils/lazy_image.py
@@ -88,6 +88,19 @@ class LazyDocxImage:
         self.close()
         return False
 
+    @staticmethod
+    def merge(a, b):
+        """
+        Merge two LazyDocxImage instances by combining their blob lists.
+        """
+        a_blobs = a._blobs if isinstance(a, LazyDocxImage) else []
+        b_blobs = b._blobs if isinstance(b, LazyDocxImage) else []
+        combined = a_blobs + b_blobs
+        if not combined:
+            return None
+        merged = LazyDocxImage(combined)
+        return merged
+
 
 def ensure_pil_image(img):
     if isinstance(img, Image.Image):