rag/app/presentation.py

#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import copy
import logging
import re
from collections import defaultdict
from io import BytesIO

from pypdf import PdfReader as pdf2_read

from deepdoc.parser import PdfParser, PlainParser
from deepdoc.parser.ppt_parser import RAGFlowPptParser
from rag.app.naive import by_plaintext, PARSERS
from common.parser_config_utils import normalize_layout_recognizer
from rag.nlp import rag_tokenizer
from rag.nlp import tokenize
from rag.utils.lazy_image import ensure_pil_image, is_image_like


class Pdf(PdfParser):
    """PDF parser for presentation-style documents.

    Calling an instance runs the inherited OCR / layout / table steps and
    re-assembles the recognised text boxes and tables/figures into a single
    text per page, paired with that page's image.
    """

    def __init__(self):
        super().__init__()

    def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None, **kwargs):
        """Parse a PDF and return one ``(text, image)`` pair per page image.

        Args:
            filename: Path of the PDF; used when ``binary`` is falsy.
            binary: Raw PDF content; takes precedence over ``filename``.
            from_page: Offset of the first page to parse.
            to_page: Upper page bound forwarded to the image extraction step.
            zoomin: Zoom factor forwarded to the inherited processing steps.
            callback: Optional progress reporter. It is invoked both as
                ``callback(msg=...)`` and as ``callback(progress, msg)``.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            A tuple ``(pages, [])`` where ``pages`` is a list of
            ``(page_text, page_image)`` with one entry per element of
            ``self.page_images``. The second element is always an empty list.
        """

        def notify(*args, **kw):
            # ``callback`` defaults to None, so progress reporting must be optional.
            if callback:
                callback(*args, **kw)

        # 1. OCR
        notify(msg="OCR started")
        self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)

        # 2. Layout Analysis
        notify(msg="Layout Analysis")
        self._layouts_rec(zoomin)

        # 3. Table Analysis
        notify(msg="Table Analysis")
        self._table_transformer_job(zoomin)

        # 4. Text Merge
        self._text_merge()

        # 5. Extract Tables (Force HTML)
        tbls = self._extract_table_figure(True, zoomin, True, True)

        # 6. Re-assemble Page Content, keyed by 1-based global page number
        page_items = defaultdict(list)

        # (A) Add text
        for b in self.boxes:
            # b["page_number"] is a relative page number, so from_page must be added
            global_page_num = b["page_number"] + from_page
            if not (from_page < global_page_num <= to_page + from_page):
                continue
            page_items[global_page_num].append({"top": b["top"], "x0": b["x0"], "text": b["text"], "type": "text"})

        # (B) Add table and figure
        for (img, content), positions in tbls:
            if not positions:
                continue

            if isinstance(content, list):
                final_text = "\n".join(content)
            elif isinstance(content, str):
                final_text = content
            else:
                final_text = str(content)

            try:
                pn_index = positions[0][0]
                if isinstance(pn_index, list):
                    pn_index = pn_index[0]

                # pn_index in tbls is absolute page number
                current_page_num = int(pn_index) + 1
            except Exception as e:
                # Best effort: skip an item whose position cannot be interpreted,
                # but report it through logging rather than stdout.
                logging.warning(f"Error parsing position: {e}")
                continue

            if not (from_page < current_page_num <= to_page + from_page):
                continue

            # NOTE(review): the first position tuple presumably holds
            # (page, left, right, top, bottom) - confirm against PdfParser.
            top = positions[0][3]
            left = positions[0][1]

            page_items[current_page_num].append({"top": top, "x0": left, "text": final_text, "type": "table_or_figure"})

        # 7. Generate result
        res = []
        for i, page_img in enumerate(self.page_images):
            current_pn = from_page + i + 1
            items = page_items.get(current_pn, [])
            # Sort by vertical position, then by horizontal position
            items.sort(key=lambda x: (x["top"], x["x0"]))
            full_page_text = "\n\n".join([item["text"] for item in items])
            if not full_page_text.strip():
                # Keep a placeholder so every page still yields an entry.
                full_page_text = f"[No text or data found in Page {current_pn}]"
            res.append((full_page_text, page_img))

        notify(0.9, "Parsing finished")

        return res, []


class PlainPdf(PlainParser):
    """Text-only PDF reader: returns the extracted text of each page, without images."""

    def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
        """Extract text from pages ``[from_page:to_page]`` of a PDF.

        Args:
            filename: Path of the PDF; used when ``binary`` is falsy.
            binary: Raw PDF bytes; wrapped in ``BytesIO`` when provided.
            from_page: Start index of the page slice.
            to_page: End index (exclusive) of the page slice.
            callback: Optional progress reporter, called as ``callback(0.9, msg)``.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            A tuple ``(pages, [])`` where ``pages`` is a list of ``(text, None)``
            pairs, one per page in the slice.
        """
        self.pdf = pdf2_read(filename if not binary else BytesIO(binary))
        page_txt = [page.extract_text() for page in self.pdf.pages[from_page:to_page]]
        # ``callback`` defaults to None, so only report progress when one was given.
        if callback:
            callback(0.9, "Parsing finished")
        return [(txt, None) for txt in page_txt], []


def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, parser_config=None, **kwargs):
    """
    Split a presentation-like document into one chunk per page/slide.

    The supported file formats are pdf, ppt, pptx.
    Every page will be treated as a chunk. For PDF files the page image is stored
    under the "image" key of the chunk; the PPT branches do not attach an image.
    PPT file will be parsed by using this method automatically, setting-up for every PPT file is not necessary.

    Args:
        filename: Document name; its extension selects the parsing branch.
        binary: Optional raw file content, used instead of reading ``filename``.
        from_page: Page offset added to the chunk page numbers.
        to_page: Upper page bound; forwarded to the PDF parser only (the PPT
            parser is called with a fixed upper bound of 1000000).
        lang: Document language; "english" (case-insensitive) enables the
            English mode of ``tokenize``.
        callback: Optional progress reporter called as ``callback(progress, msg)``.
        parser_config: Optional dict; "layout_recognize" selects the PDF parser.
            Note that "chunk_token_num" is set to 0 on this dict for some parsers.
        **kwargs: Forwarded to the selected PDF parser.

    Returns:
        A list of chunk dicts, or an empty list when the PDF parser yields no sections.

    Raises:
        NotImplementedError: If the file type is unsupported, or a PPT file can be
            parsed neither by python-pptx nor by the tika fallback.
    """
    if parser_config is None:
        parser_config = {}

    def report(*args, **kw):
        # ``callback`` defaults to None, so progress reporting must be optional.
        if callback:
            callback(*args, **kw)

    eng = lang.lower() == "english"
    # Base fields shared by every chunk; the title is the file name without its extension.
    doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    res = []
    if re.search(r"\.pptx?$", filename, re.IGNORECASE):
        try:
            ppt_parser = RAGFlowPptParser()
            for pn, txt in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
                d = copy.deepcopy(doc)
                pn += from_page
                d["doc_type_kwd"] = "image"
                d["page_num_int"] = [pn + 1]
                d["top_int"] = [0]
                d["position_int"] = [(pn + 1, 0, 0, 0, 0)]
                tokenize(d, txt, eng)
                res.append(d)
            return res
        except Exception as e:
            logging.warning(f"python-pptx parsing failed for {filename}: {e}, trying tika as fallback")
            report(0.1, "python-pptx failed, trying tika as fallback")

            # Discard slides chunked before the failure so the fallback output
            # is not mixed with (or duplicated by) partial python-pptx results.
            res = []

            try:
                from tika import parser as tika_parser
            except Exception as tika_error:
                error_msg = f"tika not available: {tika_error}. Unsupported .ppt/.pptx parsing."
                report(0.8, error_msg)
                logging.warning(f"{error_msg} for {filename}.")
                raise NotImplementedError(error_msg) from tika_error

            if binary:
                binary_data = binary
            else:
                with open(filename, 'rb') as f:
                    binary_data = f.read()
            doc_parsed = tika_parser.from_buffer(BytesIO(binary_data))

            if doc_parsed.get("content", None) is not None:
                # Tika returns plain text: every non-blank line becomes its own chunk,
                # numbered sequentially in place of real slide numbers.
                sections = doc_parsed["content"].split("\n")
                sections = [s for s in sections if s.strip()]

                for pn, txt in enumerate(sections):
                    d = copy.deepcopy(doc)
                    pn += from_page
                    d["doc_type_kwd"] = "text"
                    d["page_num_int"] = [pn + 1]
                    d["top_int"] = [0]
                    d["position_int"] = [(pn + 1, 0, 0, 0, 0)]
                    tokenize(d, txt, eng)
                    res.append(d)

                report(0.8, "Finish parsing with tika.")
                return res
            else:
                error_msg = f"tika.parser got empty content from {filename}."
                report(0.8, error_msg)
                logging.warning(error_msg)
                raise NotImplementedError(error_msg) from e
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))

        # A boolean setting maps onto the two built-in recognizer names.
        if isinstance(layout_recognizer, bool):
            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"

        name = layout_recognizer.strip().lower()
        # Unknown recognizer names fall back to plain-text extraction.
        parser = PARSERS.get(name, by_plaintext)
        report(0.1, "Start to parse.")

        sections, _, _ = parser(
            filename=filename,
            binary=binary,
            from_page=from_page,
            to_page=to_page,
            lang=lang,
            callback=callback,
            pdf_cls=Pdf,
            layout_recognizer=layout_recognizer,
            mineru_llm_name=parser_model_name,
            paddleocr_llm_name=parser_model_name,
            **kwargs,
        )

        if not sections:
            return []

        if name in ["tcadp", "docling", "mineru", "paddleocr"]:
            parser_config["chunk_token_num"] = 0

        report(0.8, "Finish parsing.")

        for pn, (txt, img) in enumerate(sections):
            d = copy.deepcopy(doc)
            pn += from_page
            # Only keep values that are image-like; anything else is stored as None.
            if not is_image_like(img):
                img = None
            else:
                img = ensure_pil_image(img)
            d["image"] = img
            d["page_num_int"] = [pn + 1]
            d["top_int"] = [0]
            # The position spans the whole page image when one is available.
            d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
            tokenize(d, txt, eng)
            res.append(d)
        return res

    raise NotImplementedError("file type not supported yet(ppt, pptx, pdf supported)")


if __name__ == "__main__":
    import sys

    def dummy(prog=None, msg=""):
        """No-op progress callback.

        Both parameters are optional because the parsers in this module invoke
        the callback as ``callback(msg=...)`` as well as ``callback(progress, msg)``.
        """
        pass

    # Chunk the file given as the first command-line argument.
    chunk(sys.argv[1], callback=dummy)
-												Update comments (#4569)

### What problem does this PR solve?

Add license statement.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-01-21 20:52:28 +08:00
+								#
 								#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
 								#
-												Format file format from Windows/dos to Unix (#1949)

### What problem does this PR solve?

The related source files are in Windows/DOS format; they are converted to Unix
format.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2024-08-15 09:17:36 +08:00
+								#  Licensed under the Apache License, Version 2.0 (the "License");
 								#  you may not use this file except in compliance with the License.
 								#  You may obtain a copy of the License at
 								#
 								#      http://www.apache.org/licenses/LICENSE-2.0
 								#
 								#  Unless required by applicable law or agreed to in writing, software
 								#  distributed under the License is distributed on an "AS IS" BASIS,
 								#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								#  See the License for the specific language governing permissions and
 								#  limitations under the License.
 								#
-												Update comments (#4569)

### What problem does this PR solve?

Add license statement.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-01-21 20:52:28 +08:00
-												Format file format from Windows/dos to Unix (#1949)

### What problem does this PR solve?

The related source files are in Windows/DOS format; they are converted to Unix
format.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2024-08-15 09:17:36 +08:00
+								import copy
-												feat: Implement legacy .ppt parsing via Tika (alternative to Aspose) (#12932)

## What problem does this PR solve?
This PR implements parsing support for legacy PowerPoint files (`.ppt`,
97-2003 format).
Currently, parsing these files fails because `python-pptx` **natively
lacks support** for the legacy OLE2 binary format.

## **Context:**
I originally used `aspose-slides` for this purpose. However, since
`aspose-slides` is **no longer a project dependency**, I implemented a
fallback mechanism using the existing `tika-server` to ensure
compatibility and stability.

## **Key Changes:**
- **Fallback Logic**: Modified `rag/app/presentation.py` to catch
`python-pptx` failures and automatically fall back to Tika parsing.
- **No New Dependencies**: Utilizes the `tika` service that is already
part of the RAGFlow stack.
- **Note**: Since Tika focuses on text extraction, this implementation
extracts text content but does not generate slide thumbnails.
## 🧪 Test / Verification Results

### 1. Before (The Issue)
I have verified the fix using a legacy `.ppt` file (`math(1).ppt`,
~8MB).
<img width="963" height="970" alt="image"
src="https://github.com/user-attachments/assets/468c4ba8-f90b-4d7b-b969-9c5f5e42c474"
/>

### 2. After (The Fix)
With this PR, the system detects the failure in python-pptx and
successfully falls back to Tika. The text is extracted correctly.
<img width="1467" height="1121" alt="image"
src="https://github.com/user-attachments/assets/fa0fed3b-b923-4c86-ba2c-24b3ce6ee7a6"
/>


**Type of change**
- [x] New Feature (non-breaking change which adds functionality)

Signed-off-by: evilhero <2278596667@qq.com>
Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
											
										
										
											2026-02-02 13:40:51 +08:00
+								import logging
-												Format file format from Windows/dos to Unix (#1949)

### What problem does this PR solve?

The related source files are in Windows/DOS format; they are converted to Unix
format.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2024-08-15 09:17:36 +08:00
+								import re
-												feat: improve presentation PdfParser (#11639)

The old presentation PdfParser lost table format after parse

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-02 17:35:14 +08:00
+								from collections import defaultdict
-												Format file format from Windows/dos to Unix (#1949)

### What problem does this PR solve?

The related source files are in Windows/DOS format; they are converted to Unix
format.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2024-08-15 09:17:36 +08:00
+								from io import BytesIO
-												Fix: upgrade pypdf to 6.7.5 and migrate from deprecated pypdf2 to fix CVE-2026-28804 and CVE-2023-36464 (#13454)

### What problem does this PR solve?

This PR addresses security vulnerabilities in PDF processing
dependencies identified by Trivy security scan:

1. CVE-2026-28804 (MEDIUM): pypdf 6.7.4 vulnerable to inefficient
decoding of ASCIIHexDecode streams
2. CVE-2023-36464 (MEDIUM): pypdf2 3.0.1 susceptible to infinite loop
when parsing malformed comments

Since pypdf2 is deprecated with no available fixes, this PR migrates all
pypdf2 usage to the actively maintained pypdf library (version 6.7.5),
which resolves
both vulnerabilities.


### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-03-09 04:06:00 +00:00
+								from pypdf import PdfReader as pdf2_read
-												Format file format from Windows/dos to Unix (#1949)

### What problem does this PR solve?

The related source files are in Windows/DOS format; they are converted to Unix
format.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2024-08-15 09:17:36 +08:00
-												Refa: remove aspose dependency. (#12910)

### Type of change

- [x] Refactoring
											
										
										
											2026-01-30 14:06:19 +08:00
+								from deepdoc.parser import PdfParser, PlainParser
-												Refa: remove ppt image. (#12909)

### What problem does this PR solve?

remove `aspose`

### Type of change

- [x] Refactoring
											
										
										
											2026-01-30 13:35:42 +08:00
+								from deepdoc.parser.ppt_parser import RAGFlowPptParser
-												Fix: fix pdf_parser ignored in rag/app/naive.py (#11065)

### What problem does this PR solve?

Fix: fix pdf_parser ignored in rag/app/naive.py #11000

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-06 15:20:35 +08:00
+								from rag.app.naive import by_plaintext, PARSERS
-												Fix: model not authorized (#12001)

### What problem does this PR solve?

Fix model not authorized. #11973.


### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-17 19:48:24 +08:00
+								from common.parser_config_utils import normalize_layout_recognizer
-												feat: improve presentation PdfParser (#11639)

The old presentation PdfParser lost table format after parse

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-02 17:35:14 +08:00
+								from rag.nlp import rag_tokenizer
-												Refa: remove aspose dependency. (#12910)

### Type of change

- [x] Refactoring
											
										
										
											2026-01-30 14:06:19 +08:00
+								from rag.nlp import tokenize
-												refactor(word): lazy-load DOCX images to reduce peak memory without changing output (#13233)

**Summary**
This PR tackles a significant memory bottleneck when processing
image-heavy Word documents. Previously, our pipeline eagerly decoded
DOCX images into `PIL.Image` objects, which caused high peak memory
usage. To solve this, I've introduced a **lazy-loading approach**:
images are now stored as raw blobs and only decoded exactly when and
where they are consumed.

This successfully reduces the memory footprint while keeping the parsing
output completely identical to before.

**What's Changed**
Instead of a dry file-by-file list, here is the logical breakdown of the
updates:

* **The Core Abstraction (`lazy_image.py`)**: Introduced `LazyDocxImage`
along with helper APIs to handle lazy decoding, image-type checks, and
NumPy compatibility. It also supports `.close()` and detached PIL access
to ensure safe lifecycle management and prevent memory leaks.
* **Pipeline Integration (`naive.py`, `figure_parser.py`, etc.)**:
Updated the general DOCX picture extraction to return these new lazy
images. Downstream consumers (like the figure/VLM flow and base64
encoding paths) now decode images right at the use site using detached
PIL instances, avoiding shared-instance side effects.
* **Compatibility Hooks (`operators.py`, `book.py`, etc.)**: Added
necessary compatibility conversions so these lazy images flow smoothly
through existing merging, filtering, and presentation steps without
breaking.

**Scope & What is Intentionally Left Out**
To keep this PR focused, I have restricted these changes strictly to the
**general Word pipeline** and its downstream consumers.
The `QA` and `manual` Word parsing pipelines are explicitly **not
modified** in this PR. They can be safely migrated to this new lazy-load
model in a subsequent, standalone PR.

**Design Considerations**
I briefly considered adding image compression during processing, but
decided against it to avoid any potential quality degradation in the
derived outputs. I also held off on a massive pipeline re-architecture
to avoid overly invasive changes right now.

**Validation & Testing**
I've tested this to ensure no regressions:

* Compared identical DOCX inputs before and after this branch: chunk
counts, extracted text, table HTML, and image descriptions match
perfectly.
* **Confirmed a noticeable drop in peak memory usage when processing
image-dense documents.** For a 30MB Word document containing 243 1080p
screenshots, memory consumption is reduced by approximately 1.5GB.

**Breaking Changes**
None.
											
										
										
											2026-02-28 11:22:31 +08:00
+								from rag.utils.lazy_image import ensure_pil_image, is_image_like
-												feat: improve presentation PdfParser (#11639)

The old presentation PdfParser lost table format after parse

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-02 17:35:14 +08:00
-												Format file format from Windows/dos to Unix (#1949)

### What problem does this PR solve?

The related source files are in Windows/DOS format; they are converted to Unix
format.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2024-08-15 09:17:36 +08:00
 								class Pdf(PdfParser):
 								    def __init__(self):
 								        super().__init__()
-												feat: add paddleocr parser (#12513)

### What problem does this PR solve?

Add PaddleOCR as a new PDF parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-01-09 17:48:45 +08:00
+								    def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None, **kwargs):
-												feat: improve presentation PdfParser (#11639)

The old presentation PdfParser lost table format after parse

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-02 17:35:14 +08:00
+								        # 1. OCR
-												Update progress info and start welcome info (#3768)

### What problem does this PR solve?

_Briefly describe what this PR aims to solve. Include background context
that will help reviewers understand the purpose of the PR._

### Type of change

- [x] Refactoring

---------

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-11-30 18:48:06 +08:00
+								        callback(msg="OCR started")
-												feat: add paddleocr parser (#12513)

### What problem does this PR solve?

Add PaddleOCR as a new PDF parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-01-09 17:48:45 +08:00
+								        self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
-												feat: improve presentation PdfParser (#11639)

The old presentation PdfParser lost table format after parse

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-02 17:35:14 +08:00
 								        # 2. Layout Analysis
 								        callback(msg="Layout Analysis")
 								        self._layouts_rec(zoomin)
 								        # 3. Table Analysis
 								        callback(msg="Table Analysis")
 								        self._table_transformer_job(zoomin)
 								        # 4. Text Merge
 								        self._text_merge()
 								        # 5. Extract Tables (Force HTML)
 								        tbls = self._extract_table_figure(True, zoomin, True, True)
 								        # 6. Re-assemble Page Content
 								        page_items = defaultdict(list)
 								        # (A) Add text
 								        for b in self.boxes:
-												Fix: relative page_number in boxes (#11712)

page_number in boxes is a relative page number, so from_page must be added

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-04 11:23:34 +08:00
+								            # b["page_number"] is relative page number，must + from_page
 								            global_page_num = b["page_number"] + from_page
 								            if not (from_page < global_page_num <= to_page + from_page):
-												feat: improve presentation PdfParser (#11639)

The old presentation PdfParser lost table format after parse

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-02 17:35:14 +08:00
+								                continue
-												feat: add paddleocr parser (#12513)

### What problem does this PR solve?

Add PaddleOCR as a new PDF parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-01-09 17:48:45 +08:00
+								            page_items[global_page_num].append({"top": b["top"], "x0": b["x0"], "text": b["text"], "type": "text"})
-												feat: improve presentation PdfParser (#11639)

The old presentation PdfParser lost table format after parse

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-02 17:35:14 +08:00
 								        # (B) Add table and figure
 								        for (img, content), positions in tbls:
 								            if not positions:
 								                continue
 								            if isinstance(content, list):
 								                final_text = "\n".join(content)
 								            elif isinstance(content, str):
 								                final_text = content
 								            else:
 								                final_text = str(content)
 								            try:
 								                pn_index = positions[0][0]
 								                if isinstance(pn_index, list):
 								                    pn_index = pn_index[0]
-												Fix: relative page_number in boxes (#11712)

page_number in boxes is a relative page number, so from_page must be added

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-04 11:23:34 +08:00
 								                # pn_index in tbls is absolute page number
-												feat: improve presentation PdfParser (#11639)

The old presentation PdfParser lost table format after parse

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-02 17:35:14 +08:00
+								                current_page_num = int(pn_index) + 1
 								            except Exception as e:
 								                print(f"Error parsing position: {e}")
 								                continue
 								            if not (from_page < current_page_num <= to_page + from_page):
 								                continue
 								            top = positions[0][3]
 								            left = positions[0][1]
-												feat: add paddleocr parser (#12513)

### What problem does this PR solve?

Add PaddleOCR as a new PDF parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-01-09 17:48:45 +08:00
+								            page_items[current_page_num].append({"top": top, "x0": left, "text": final_text, "type": "table_or_figure"})
-												feat: improve presentation PdfParser (#11639)

The old presentation PdfParser lost table format after parse

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-02 17:35:14 +08:00
 								        # 7. Generate result
-												Format file format from Windows/dos to Unix (#1949)

### What problem does this PR solve?

The related source files are in Windows/DOS format; they are converted to Unix
format.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2024-08-15 09:17:36 +08:00
+								        res = []
-												feat: improve presentation PdfParser (#11639)

The old presentation PdfParser lost table format after parse

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-02 17:35:14 +08:00
+								        for i in range(len(self.page_images)):
 								            current_pn = from_page + i + 1
 								            items = page_items.get(current_pn, [])
 								            # Sort by vertical position
 								            items.sort(key=lambda x: (x["top"], x["x0"]))
 								            full_page_text = "\n\n".join([item["text"] for item in items])
 								            if not full_page_text.strip():
 								                full_page_text = f"[No text or data found in Page {current_pn}]"
 								            page_img = self.page_images[i]
 								            res.append((full_page_text, page_img))
 								        callback(0.9, "Parsing finished")
-												Feat: Support more chunking methods (#11000)

### What problem does this PR solve?

Feat: Support more chunking methods #10772 

This PR enables multiple chunking methods — including books, laws,
naive, one, and presentation — to be used with all existing PDF parsers
(DeepDOC, MinerU, Docling, TCADP, Plain Text, and Vision modes).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-05 13:00:42 +08:00
+								        return res, []
-												Format file format from Windows/dos to Unix (#1949)

### What problem does this PR solve?

The related source files are in Windows/DOS format; they are converted to Unix
format.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2024-08-15 09:17:36 +08:00
 								class PlainPdf(PlainParser):
-												feat: add paddleocr parser (#12513)

### What problem does this PR solve?

Add PaddleOCR as a new PDF parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-01-09 17:48:45 +08:00
+								    def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
-												Format file format from Windows/dos to Unix (#1949)

### What problem does this PR solve?

The related source files are in Windows/DOS format; they are converted to Unix
format.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2024-08-15 09:17:36 +08:00
+								        self.pdf = pdf2_read(filename if not binary else BytesIO(binary))
 								        page_txt = []
-												feat: add paddleocr parser (#12513)

### What problem does this PR solve?

Add PaddleOCR as a new PDF parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-01-09 17:48:45 +08:00
+								        for page in self.pdf.pages[from_page:to_page]:
-												Format file format from Windows/dos to Unix (#1949)

### What problem does this PR solve?

The related source files are in Windows/DOS format; they are converted to Unix
format.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2024-08-15 09:17:36 +08:00
+								            page_txt.append(page.extract_text())
 								        callback(0.9, "Parsing finished")
-												Feat: Support more chunking methods (#11000)

### What problem does this PR solve?

Feat: Support more chunking methods #10772 

This PR enables multiple chunking methods — including books, laws,
naive, one, and presentation — to be used with all existing PDF parsers
(DeepDOC, MinerU, Docling, TCADP, Plain Text, and Vision modes).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-05 13:00:42 +08:00
+								        return [(txt, None) for txt in page_txt], []
-												Format file format from Windows/dos to Unix (#1949)

### What problem does this PR solve?

The related source files are in Windows/DOS format; they are converted to Unix
format.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2024-08-15 09:17:36 +08:00
-												feat: add paddleocr parser (#12513)

### What problem does this PR solve?

Add PaddleOCR as a new PDF parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-01-09 17:48:45 +08:00
def _noop_callback(*args, **kwargs):
    """Progress callback used when the caller does not supply one; ignores everything."""


def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, parser_config=None, **kwargs):
    """
    The supported file formats are pdf, ppt, pptx.
    Every page will be treated as a chunk. And the thumbnail of every page will be stored.
    PPT file will be parsed by using this method automatically, setting-up for every PPT file is not necessary.

    Args:
        filename: Name (or path) of the document; its extension selects the parser.
        binary: Raw file content. When falsy, the file is read from ``filename``.
        from_page: Zero-based index of the first page/slide to process.
        to_page: Upper page bound, forwarded to the PDF parser.
        lang: Document language; ``"english"`` (case-insensitive) switches the
            tokenizer to English mode.
        callback: Optional progress callback, invoked as ``callback(progress, message)``.
            A no-op is substituted when it is ``None``.
        parser_config: Optional parser settings; ``layout_recognize`` selects the
            PDF parser (default ``"DeepDOC"``).
        **kwargs: Extra keyword arguments forwarded to the PDF parser.

    Returns:
        A list of chunk dicts, one per slide/page (one per non-empty text line
        when the Tika fallback is used).

    Raises:
        NotImplementedError: If the file type is unsupported, or if a PowerPoint
            file can be parsed neither by python-pptx nor by Tika.
    """
    if parser_config is None:
        parser_config = {}
    # `callback` defaults to None but is invoked unconditionally further down;
    # substitute a no-op so that calling chunk() without a callback cannot crash.
    if callback is None:
        callback = _noop_callback

    eng = lang.lower() == "english"
    doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    res = []

    if re.search(r"\.pptx?$", filename, re.IGNORECASE):
        try:
            ppt_parser = RAGFlowPptParser()
            # NOTE(review): the upper slide bound is hard-coded to 1000000 and
            # `to_page` is not forwarded here; confirm this is intentional.
            for pn, txt in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
                d = copy.deepcopy(doc)
                pn += from_page
                d["doc_type_kwd"] = "image"
                d["page_num_int"] = [pn + 1]
                d["top_int"] = [0]
                d["position_int"] = [(pn + 1, 0, 0, 0, 0)]
                tokenize(d, txt, eng)
                res.append(d)
            return res
        except Exception as e:
            logging.warning(f"python-pptx parsing failed for {filename}: {e}, trying tika as fallback")
            callback(0.1, "python-pptx failed, trying tika as fallback")
            # Drop any chunks produced before the failure so the fallback result
            # is not mixed with a partial python-pptx result.
            res = []
            try:
                from tika import parser as tika_parser
            except Exception as tika_error:
                error_msg = f"tika not available: {tika_error}. Unsupported .ppt/.pptx parsing."
                callback(0.8, error_msg)
                logging.warning(f"{error_msg} for {filename}.")
                raise NotImplementedError(error_msg) from tika_error

            if binary:
                binary_data = binary
            else:
                with open(filename, 'rb') as f:
                    binary_data = f.read()

            doc_parsed = tika_parser.from_buffer(BytesIO(binary_data))
            if doc_parsed.get("content", None) is None:
                error_msg = f"tika.parser got empty content from {filename}."
                callback(0.8, error_msg)
                logging.warning(error_msg)
                raise NotImplementedError(error_msg) from e

            # The extracted text is split on newlines, so each non-empty line
            # becomes its own text chunk numbered like a page.
            sections = [s for s in doc_parsed["content"].split("\n") if s.strip()]
            for pn, txt in enumerate(sections):
                d = copy.deepcopy(doc)
                pn += from_page
                d["doc_type_kwd"] = "text"
                d["page_num_int"] = [pn + 1]
                d["top_int"] = [0]
                d["position_int"] = [(pn + 1, 0, 0, 0, 0)]
                tokenize(d, txt, eng)
                res.append(d)
            callback(0.8, "Finish parsing with tika.")
            return res

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))

        # A boolean setting is mapped onto the two built-in parser names.
        if isinstance(layout_recognizer, bool):
            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
        name = layout_recognizer.strip().lower()
        # Unknown parser names fall back to plain-text extraction.
        parser = PARSERS.get(name, by_plaintext)
        callback(0.1, "Start to parse.")
        sections, _, _ = parser(
            filename=filename,
            binary=binary,
            from_page=from_page,
            to_page=to_page,
            lang=lang,
            callback=callback,
            pdf_cls=Pdf,
            layout_recognizer=layout_recognizer,
            mineru_llm_name=parser_model_name,
            paddleocr_llm_name=parser_model_name,
            **kwargs,
        )
        if not sections:
            return []

        if name in ["tcadp", "docling", "mineru", "paddleocr"]:
            # NOTE(review): this writes into the caller-supplied dict and the value
            # is not read again in this function; kept for backward compatibility.
            parser_config["chunk_token_num"] = 0

        callback(0.8, "Finish parsing.")

        for pn, (txt, img) in enumerate(sections):
            d = copy.deepcopy(doc)
            pn += from_page
            # Keep only real images, converted to PIL; anything else is dropped.
            if not is_image_like(img):
                img = None
            else:
                img = ensure_pil_image(img)
            d["image"] = img
            d["page_num_int"] = [pn + 1]
            d["top_int"] = [0]
            d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
            tokenize(d, txt, eng)
            res.append(d)
        return res

    raise NotImplementedError("file type not supported yet(ppt, pptx, pdf supported)")
-												Format file format from Windows/dos to Unix (#1949)

### What problem does this PR solve?

The related source files were in Windows/DOS format; they have been converted to
Unix format.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2024-08-15 09:17:36 +08:00
 								if __name__ == "__main__":
 								    import sys
 								    def dummy(a, b):
 								        pass
-												feat: improve presentation PdfParser (#11639)

The old presentation PdfParser lost table format after parse

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-02 17:35:14 +08:00
-												Format file format from Windows/dos to Unix (#1949)

### What problem does this PR solve?

Related source file is in Windows/DOS format, they are format to Unix
format.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2024-08-15 09:17:36 +08:00
+								    chunk(sys.argv[1], callback=dummy)