2025-11-03 15:20:46 +08:00
|
|
|
#
|
|
|
|
|
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
|
|
|
|
#
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
|
#
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
#
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
|
# limitations under the License.
|
|
|
|
|
#
|
|
|
|
|
|
|
|
|
|
import base64
|
|
|
|
|
import logging
|
|
|
|
|
from functools import partial
|
|
|
|
|
from io import BytesIO
|
|
|
|
|
|
|
|
|
|
from PIL import Image
|
|
|
|
|
|
2026-01-20 13:29:37 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
from common.misc_utils import thread_pool_exec
|
refactor(word): lazy-load DOCX images to reduce peak memory without changing output (#13233)
**Summary**
This PR tackles a significant memory bottleneck when processing
image-heavy Word documents. Previously, our pipeline eagerly decoded
DOCX images into `PIL.Image` objects, which caused high peak memory
usage. To solve this, I've introduced a **lazy-loading approach**:
images are now stored as raw blobs and only decoded exactly when and
where they are consumed.
This successfully reduces the memory footprint while keeping the parsing
output completely identical to before.
**What's Changed**
Instead of a dry file-by-file list, here is the logical breakdown of the
updates:
* **The Core Abstraction (`lazy_image.py`)**: Introduced `LazyDocxImage`
along with helper APIs to handle lazy decoding, image-type checks, and
NumPy compatibility. It also supports `.close()` and detached PIL access
to ensure safe lifecycle management and prevent memory leaks.
* **Pipeline Integration (`naive.py`, `figure_parser.py`, etc.)**:
Updated the general DOCX picture extraction to return these new lazy
images. Downstream consumers (like the figure/VLM flow and base64
encoding paths) now decode images right at the use site using detached
PIL instances, avoiding shared-instance side effects.
* **Compatibility Hooks (`operators.py`, `book.py`, etc.)**: Added
necessary compatibility conversions so these lazy images flow smoothly
through existing merging, filtering, and presentation steps without
breaking.
**Scope & What is Intentionally Left Out**
To keep this PR focused, I have restricted these changes strictly to the
**general Word pipeline** and its downstream consumers.
The `QA` and `manual` Word parsing pipelines are explicitly **not
modified** in this PR. They can be safely migrated to this new lazy-load
model in a subsequent, standalone PR.
**Design Considerations**
I briefly considered adding image compression during processing, but
decided against it to avoid any potential quality degradation in the
derived outputs. I also held off on a massive pipeline re-architecture
to avoid overly invasive changes right now.
**Validation & Testing**
I've tested this to ensure no regressions:
* Compared identical DOCX inputs before and after this branch: chunk
counts, extracted text, table HTML, and image descriptions match
perfectly.
* **Confirmed a noticeable drop in peak memory usage when processing
image-dense documents.** For a 30MB Word document containing 243 1080p
screenshots, memory consumption is reduced by approximately 1.5GB.
**Breaking Changes**
None.
2026-02-28 11:22:31 +08:00
|
|
|
from rag.utils.lazy_image import open_image_for_processing
|
2026-01-20 13:29:37 +08:00
|
|
|
|
2025-11-03 15:20:46 +08:00
|
|
|
test_image_base64 = "iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAIAAAD/gAIDAAAA6ElEQVR4nO3QwQ3AIBDAsIP9d25XIC+EZE8QZc18w5l9O+AlZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBT+IYAHHLHkdEgAAAABJRU5ErkJggg=="
|
|
|
|
|
# Raw image bytes obtained by base64-decoding the sample above.
# NOTE(review): the name suggests a fixture for smoke tests - confirm against callers.
test_image = base64.b64decode(test_image_base64)
|
|
|
|
|
|
2025-12-29 12:01:18 +08:00
|
|
|
|
|
|
|
|
async def image2id(d: dict, storage_put_func: partial, objname: str, bucket: str = "imagetemps"):
    """Upload ``d["image"]`` to object storage and swap it for an ``img_id``.

    The image is encoded through ``thread_pool_exec``, written with
    ``storage_put_func`` and then removed from ``d``. PIL images are saved as
    JPEG (``RGBA``/``P`` images are converted to ``RGB`` first); a ``bytes``
    payload is stored unchanged.

    Args:
        d: Mapping that may carry an ``"image"`` entry; mutated in place.
        storage_put_func: Callable invoked as
            ``storage_put_func(bucket=..., fnm=..., binary=...)``.
        objname: Object key the encoded image is stored under.
        bucket: Bucket name, also used as the prefix of the resulting ID.

    Returns:
        ``None``. Outcomes are reported through ``d``:

        * no ``"image"`` key: ``d`` is left untouched;
        * falsy image, unsupported image type, or an ``OSError`` while
          converting/saving: ``"image"`` is removed and no ``img_id`` is set;
        * success: ``d["img_id"] = f"{bucket}-{objname}"`` and ``"image"`` is
          closed (unless it is ``bytes``) and removed.
    """
    if "image" not in d:
        return
    if not d["image"]:
        del d["image"]
        return

    # Kept function-local as in the original (presumably to avoid an import
    # cycle - confirm before hoisting); placed after the cheap early returns
    # so calls with nothing to upload never trigger the import.
    from rag.svr.task_executor_limiter import minio_limiter

    def release(image) -> None:
        # Best-effort close: failing to free an image must not mask the
        # encoding result or a more relevant exception.
        try:
            image.close()
        except Exception:
            pass

    def encode_image():
        img, close_after = open_image_for_processing(d["image"], allow_bytes=False)

        if isinstance(img, bytes):
            # Already encoded; store as-is.
            return img
        if not isinstance(img, Image.Image):
            return None

        rgb_copy = None
        # True while this function still has to close ``img`` itself.
        source_open = bool(close_after)
        try:
            if img.mode in ("RGBA", "P"):
                # JPEG cannot hold alpha or palette data. The conversion is
                # inside the ``try`` because decoding happens here for lazily
                # opened files, so a broken image is handled like a failed save.
                rgb_copy = img.convert("RGB")
                if source_open:
                    # Drop the source early to keep peak memory low.
                    release(img)
                    source_open = False
            with BytesIO() as buf:
                (img if rgb_copy is None else rgb_copy).save(buf, format="JPEG")
                return buf.getvalue()
        except OSError as e:
            logging.warning(f"Saving image exception: {e}")
            return None
        finally:
            # The RGB copy never leaves this function, so it is always ours
            # to close, regardless of ``close_after``.
            if rgb_copy is not None:
                release(rgb_copy)
            if source_open:
                release(img)

    jpeg_binary = await thread_pool_exec(encode_image)
    if jpeg_binary is None:
        del d["image"]
        return

    # ``minio_limiter`` is used as an async context manager around the upload
    # (presumably bounding concurrent storage writes - confirm).
    async with minio_limiter:
        await thread_pool_exec(
            lambda: storage_put_func(bucket=bucket, fnm=objname, binary=jpeg_binary)
        )

    d["img_id"] = f"{bucket}-{objname}"

    if not isinstance(d["image"], bytes):
        d["image"].close()
    del d["image"]
|
2025-11-03 15:20:46 +08:00
|
|
|
|
|
|
|
|
|
2026-06-01 19:52:51 -07:00
|
|
|
def parse_storage_composite_id(composite_id: str) -> tuple[str, str] | None:
|
|
|
|
|
"""Split a ``{bucket}-{object_key}`` storage ID on the first hyphen only.
|
|
|
|
|
|
|
|
|
|
``image2id`` stores ``img_id`` as ``f"{bucket}-{objname}"``. The object key
|
|
|
|
|
may contain additional hyphens (e.g. ``page-1.jpg``).
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
composite_id: Composite storage identifier.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
``(bucket, object_key)`` when valid, otherwise ``None``.
|
|
|
|
|
"""
|
|
|
|
|
parts = composite_id.split("-", 1)
|
|
|
|
|
if len(parts) != 2 or not parts[0] or not parts[1] or composite_id.endswith("-"):
|
|
|
|
|
return None
|
|
|
|
|
return parts[0], parts[1]
|
|
|
|
|
|
|
|
|
|
|
2025-12-29 12:01:18 +08:00
|
|
|
def id2image(image_id: str | None, storage_get_func: partial):
|
2026-06-01 19:52:51 -07:00
|
|
|
"""Load a PIL image from storage using a composite ``img_id``.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
image_id: Value produced by ``image2id`` (``{bucket}-{object_key}``).
|
|
|
|
|
storage_get_func: Callable ``(bucket=, fnm=)`` returning raw bytes.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
A PIL ``Image`` instance, or ``None`` when the ID is invalid or load fails.
|
|
|
|
|
"""
|
2025-11-03 15:20:46 +08:00
|
|
|
if not image_id:
|
|
|
|
|
return
|
2026-06-01 19:52:51 -07:00
|
|
|
parsed = parse_storage_composite_id(image_id)
|
|
|
|
|
if not parsed:
|
|
|
|
|
logging.debug("Invalid image_id composite format: %s", image_id)
|
2025-11-03 15:20:46 +08:00
|
|
|
return
|
2026-06-01 19:52:51 -07:00
|
|
|
bkt, nm = parsed
|
2025-11-03 15:20:46 +08:00
|
|
|
try:
|
2025-12-24 16:58:14 +08:00
|
|
|
blob = storage_get_func(bucket=bkt, fnm=nm)
|
2025-11-03 15:20:46 +08:00
|
|
|
if not blob:
|
|
|
|
|
return
|
|
|
|
|
return Image.open(BytesIO(blob))
|
|
|
|
|
except Exception as e:
|
|
|
|
|
logging.exception(e)
|