# deepdoc/parser/docx_parser.py

#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

from docx import Document
import re
import pandas as pd
from collections import Counter
from rag.nlp import rag_tokenizer
from io import BytesIO
import logging
from docx.image.exceptions import (
    InvalidImageStreamError,
    UnexpectedEndOfFileError,
    UnrecognizedImageError,
)
from rag.utils.lazy_image import LazyDocxImage

class RAGFlowDocxParser:
    """Pull paragraph text, flattened table text and embedded image bytes out of a .docx file.

    Calling an instance parses a document and returns ``(sections, tables)``;
    see ``__call__`` for the exact shapes.
    """

    def get_picture(self, document, paragraph):
        """Collect the raw bytes of every picture referenced by *paragraph*.

        Args:
            document: Object exposing ``part.related_parts``, a mapping from
                relationship id to the related part.
            paragraph: Object whose ``_element`` supports ``xpath`` queries.

        Returns:
            ``LazyDocxImage`` wrapping the list of collected blobs, or ``None``
            when the paragraph has no picture or no blob could be recovered.
        """
        pictures = paragraph._element.xpath(".//pic:pic")
        if not pictures:
            return None

        collected = []
        for picture in pictures:
            rel_ids = picture.xpath(".//a:blip/@r:embed")
            if not rel_ids:
                continue
            rel_id = rel_ids[0]

            try:
                part = document.part.related_parts[rel_id]
            except Exception as e:
                # Without the related part there is nothing to fall back on.
                logging.warning(f"Skipping image due to unexpected error getting related_part: {e}")
                continue

            blob = None
            try:
                parsed = part.image
                if parsed is not None:
                    blob = parsed.blob
            except (
                UnrecognizedImageError,
                UnexpectedEndOfFileError,
                InvalidImageStreamError,
                UnicodeDecodeError,
            ) as e:
                logging.info(f"Damaged image encountered, attempting blob fallback: {e}")
            except Exception as e:
                # Deliberately best-effort: any failure still gets the raw-blob fallback below.
                logging.warning(f"Unexpected error getting image, attempting blob fallback: {e}")

            if blob is None:
                # Fall back to the part's own raw bytes when the image could not be read.
                blob = getattr(part, "blob", None)
            if blob:
                collected.append(blob)

        return LazyDocxImage(collected) if collected else None

    def __extract_table_content(self, tb):
        """Read the cell texts of table *tb* and flatten them into text lines.

        Args:
            tb: Object exposing ``rows``; each row exposes ``cells`` and each
                cell exposes ``text``.

        Returns:
            The list produced by ``__compose_table_content``.
        """
        rows = [[cell.text for cell in row.cells] for row in tb.rows]
        # Rows may differ in length. Pad them here with empty strings, otherwise
        # pandas fills the gaps with missing values that later get stringified
        # ("None"/"nan") and leak into the header labels and cell text.
        width = max((len(row) for row in rows), default=0)
        rows = [row + [""] * (width - len(row)) for row in rows]
        return self.__compose_table_content(pd.DataFrame(rows))

    def __compose_table_content(self, df):
        """Render a table as ``"header: value"`` text, one line per data row.

        Row 0 is always treated as a header row. When the dominant cell type of
        rows 1.. is ``"Nu"``, every later row whose own dominant type differs is
        treated as an additional header row. Each data row is labelled with the
        nearest block of consecutive header rows above it.

        Args:
            df: ``pandas.DataFrame`` of cell values (stringified before use).

        Returns:
            ``[]`` when there are fewer than two rows or no columns. Otherwise a
            list with one string per data row when there are more than three
            columns, or a single-element list with all rows joined by newlines.
        """

        def block_type(b):
            """Return a short type label for the cell text *b*."""
            # NOTE(review): the "[ABCDE]" year pattern is labelled "DT" while every
            # other date pattern uses "Dt" - looks like a typo, confirm before changing.
            patterns = [
                ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
                (r"^(20|19)[0-9]{2}年$", "Dt"),
                (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
                ("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
                (r"^第*[一二三四1-4]季度$", "Dt"),
                (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
                (r"^(20|19)[0-9]{2}[ABCDE]$", "DT"),
                ("^[0-9.,+%/ -]+$", "Nu"),
                (r"^[0-9A-Z/\._~-]+$", "Ca"),
                (r"^[A-Z]*[a-z' -]+$", "En"),
                (r"^[0-9.,+-]+[0-9A-Za-z/$￥%<>（）()' -]+$", "NE"),
                (r"^.{1}$", "Sg")
            ]
            # First matching pattern wins, so order matters.
            for regex, label in patterns:
                if re.search(regex, b):
                    return label

            # No pattern matched: classify by the number of multi-character tokens.
            tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
            if len(tks) > 3:
                return "Tx" if len(tks) < 12 else "Lx"

            if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
                return "Nr"

            return "Ot"

        n_rows = len(df)
        n_cols = df.shape[1]
        # A table without columns has no cell to classify; without this guard the
        # majority vote below would call max() on an empty Counter and raise.
        if n_rows < 2 or n_cols == 0:
            return []

        # Classify every cell below the first row once, row by row, and reuse the
        # result for both the table-wide and the per-row majority votes.
        row_types = [
            [block_type(str(df.iloc[r, j])) for j in range(n_cols)]
            for r in range(1, n_rows)
        ]
        type_counts = Counter(label for types in row_types for label in types)
        max_type = max(type_counts.items(), key=lambda kv: kv[1])[0]

        header_rows = [0]  # the header does not necessarily appear only in the first row
        if max_type == "Nu":
            for r, types in enumerate(row_types, start=1):
                row_major = max(Counter(types).items(), key=lambda kv: kv[1])[0]
                if row_major != max_type:
                    header_rows.append(r)

        lines = []
        for i in range(1, n_rows):
            if i in header_rows:
                continue

            # Offsets (negative) from row i to each header row above it.
            offsets = [r - i for r in header_rows if r < i]
            # Keep only the nearest run of consecutive header rows: scan from the
            # end and cut at the first gap larger than one row.
            t = len(offsets) - 1
            while t > 0:
                if offsets[t] - offsets[t - 1] > 1:
                    offsets = offsets[t:]
                    break
                t -= 1

            cells = []
            for j in range(n_cols):
                value = str(df.iloc[i, j])
                if not value:
                    continue
                # Column label: distinct header texts stacked above this cell.
                parts = []
                for off in offsets:
                    text = str(df.iloc[i + off, j]).strip()
                    if text not in parts:
                        parts.append(text)
                label = ",".join(parts)
                if label:
                    label += ": "
                cells.append(label + value)
            lines.append(";".join(cells))

        if n_cols > 3:
            return lines
        return ["\n".join(lines)]

    def __call__(self, fnm, from_page=0, to_page=100000000):
        """Parse a .docx document into paragraph sections and table texts.

        The opened document is also stored on ``self.doc``.

        Args:
            fnm: A ``str`` is passed to ``Document`` directly; anything else is
                wrapped in ``BytesIO`` first.
            from_page: First page (zero-based, inclusive) whose text is kept.
            to_page: Page at which text collection stops (exclusive).

        Returns:
            ``(secs, tbls)``. ``secs`` holds one ``(text, style_name)`` tuple per
            visited paragraph; ``text`` is empty for paragraphs outside the page
            range or without visible text. ``tbls`` holds the flattened content
            of every table in the document, regardless of the page range.
        """
        self.doc = Document(fnm) if isinstance(
            fnm, str) else Document(BytesIO(fnm))
        pn = 0  # current page index
        secs = []  # parsed contents
        for p in self.doc.paragraphs:
            if pn > to_page:
                break

            has_text = None  # paragraph-level check, evaluated at most once
            kept_runs = []  # run texts that fall inside the requested page range
            for run in p.runs:
                if pn > to_page:
                    break
                if from_page <= pn < to_page:
                    if has_text is None:
                        has_text = bool(p.text.strip())
                    if has_text:
                        kept_runs.append(run.text)

                # A run whose XML mentions 'lastRenderedPageBreak' advances the page
                # counter; the run's own text was already attributed to the old page.
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1

            style_name = p.style.name if hasattr(p.style, 'name') else ''
            secs.append(("".join(kept_runs), style_name))

        tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
        return secs, tbls
-												Update comments (#4569)

### What problem does this PR solve?

Add license statement.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-01-21 20:52:28 +08:00
+								#
 								#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
 								#
-												Update readme and add license (#1018)

### What problem does this PR solve?

- Update readme
- Add license

### Type of change

- [x] Documentation Update

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2024-06-01 16:24:10 +08:00
+								#  Licensed under the Apache License, Version 2.0 (the "License");
 								#  you may not use this file except in compliance with the License.
 								#  You may obtain a copy of the License at
 								#
 								#      http://www.apache.org/licenses/LICENSE-2.0
 								#
 								#  Unless required by applicable law or agreed to in writing, software
 								#  distributed under the License is distributed on an "AS IS" BASIS,
 								#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								#  See the License for the specific language governing permissions and
 								#  limitations under the License.
 								#
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								from docx import Document
 								import re
 								import pandas as pd
 								from collections import Counter
-												refine code (#595)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
											
										
										
											2024-04-28 19:13:33 +08:00
+								from rag.nlp import rag_tokenizer
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
+								from io import BytesIO
-												Refa: implement unified lazy image loading for Docx parsers (qa/manual) (#13329)

## Summary
This PR is the direct successor to the previous `docx` lazy-loading
implementation. It addresses the technical debt intentionally left out
in the last PR by fully migrating the `qa` and `manual` parsing
strategies to the new lazy-loading model.

Additionally, this PR comprehensively refactors the underlying `docx`
parsing pipeline to eliminate significant code redundancy and introduces
robust fallback mechanisms to handle completely corrupted image streams
safely.


## What's Changed

* **Centralized Abstraction (`docx_parser.py`)**: Moved the
`get_picture` extraction logic up to the `RAGFlowDocxParser` base class.
Previously, `naive`, `qa`, and `manual` parsers maintained separate,
redundant copies of this method. All downstream strategies now natively
gather raw blobs and return `LazyDocxImage` objects automatically.
* **Robust Corrupted Image Fallback (`docx_parser.py`)**: Handled edge
cases where `python-docx` encounters critically malformed magic headers.
Implemented an explicit `try-except` structure that safely intercepts
`UnrecognizedImageError` (and similar exceptions) and seamlessly falls
back to retrieving the raw binary via `getattr(related_part, "blob",
None)`, preventing parser crashes on damaged documents.

* **Legacy Code & Redundancy Purge**:
* Removed the duplicate `get_picture` methods from `naive.py`, `qa.py`,
and `manual.py`.
* Removed the standalone, immediate-decoding `concat_img` method in
`manual.py`. It has been completely replaced by the globally unified,
lazy-loading-compatible `rag.nlp.concat_img`.
* Cleaned up unused legacy imports (e.g., `PIL.Image`, docx exception
packages) across all updated strategy files.

## Scope
To keep this PR focused, I have restricted these changes strictly to the
unification of `docx` extraction logic and the lazy-load migration of
`qa` and `manual`.

## Validation & Testing
I've tested this to ensure no regressions and validated the fallback
logic:

* **Output Consistency**: Compared identical `.docx` inputs using `qa`
and `manual` strategies before and after this branch: chunk counts,
extracted text, table HTML, and attached images match perfectly.
* **Memory Footprint Drop**: Confirmed a noticeable drop in peak memory
usage when processing image-dense documents through the `qa` and
`manual` pipelines, bringing them up to parity with the `naive`
strategy's performance gains.

## Breaking Changes
* None.
											
										
										
											2026-03-11 10:00:07 +08:00
+								import logging
 								from docx.image.exceptions import (
 								    InvalidImageStreamError,
 								    UnexpectedEndOfFileError,
 								    UnrecognizedImageError,
 								)
 								from rag.utils.lazy_image import LazyDocxImage
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
-												refactor code (#583)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
											
										
										
											2024-04-28 13:19:54 +08:00
+								class RAGFlowDocxParser:
-												Refa: implement unified lazy image loading for Docx parsers (qa/manual) (#13329)

## Summary
This PR is the direct successor to the previous `docx` lazy-loading
implementation. It addresses the technical debt intentionally left out
in the last PR by fully migrating the `qa` and `manual` parsing
strategies to the new lazy-loading model.

Additionally, this PR comprehensively refactors the underlying `docx`
parsing pipeline to eliminate significant code redundancy and introduces
robust fallback mechanisms to handle completely corrupted image streams
safely.


## What's Changed

* **Centralized Abstraction (`docx_parser.py`)**: Moved the
`get_picture` extraction logic up to the `RAGFlowDocxParser` base class.
Previously, `naive`, `qa`, and `manual` parsers maintained separate,
redundant copies of this method. All downstream strategies now natively
gather raw blobs and return `LazyDocxImage` objects automatically.
* **Robust Corrupted Image Fallback (`docx_parser.py`)**: Handled edge
cases where `python-docx` encounters critically malformed magic headers.
Implemented an explicit `try-except` structure that safely intercepts
`UnrecognizedImageError` (and similar exceptions) and seamlessly falls
back to retrieving the raw binary via `getattr(related_part, "blob",
None)`, preventing parser crashes on damaged documents.

* **Legacy Code & Redundancy Purge**:
* Removed the duplicate `get_picture` methods from `naive.py`, `qa.py`,
and `manual.py`.
* Removed the standalone, immediate-decoding `concat_img` method in
`manual.py`. It has been completely replaced by the globally unified,
lazy-loading-compatible `rag.nlp.concat_img`.
* Cleaned up unused legacy imports (e.g., `PIL.Image`, docx exception
packages) across all updated strategy files.

## Scope
To keep this PR focused, I have restricted these changes strictly to the
unification of `docx` extraction logic and the lazy-load migration of
`qa` and `manual`.

## Validation & Testing
I've tested this to ensure no regressions and validated the fallback
logic:

* **Output Consistency**: Compared identical `.docx` inputs using `qa`
and `manual` strategies before and after this branch: chunk counts,
extracted text, table HTML, and attached images match perfectly.
* **Memory Footprint Drop**: Confirmed a noticeable drop in peak memory
usage when processing image-dense documents through the `qa` and
`manual` pipelines, bringing them up to parity with the `naive`
strategy's performance gains.

## Breaking Changes
* None.
											
										
										
											2026-03-11 10:00:07 +08:00
+								    def get_picture(self, document, paragraph):
 								        imgs = paragraph._element.xpath(".//pic:pic")
 								        if not imgs:
 								            return None
 								        image_blobs = []
 								        for img in imgs:
 								            embed = img.xpath(".//a:blip/@r:embed")
 								            if not embed:
 								                continue
 								            embed = embed[0]
 								            image_blob = None
 								            try:
 								                related_part = document.part.related_parts[embed]
 								            except Exception as e:
 								                logging.warning(f"Skipping image due to unexpected error getting related_part: {e}")
 								                continue
 								            try:
 								                image = related_part.image
 								                if image is not None:
 								                    image_blob = image.blob
 								            except (
 								                UnrecognizedImageError,
 								                UnexpectedEndOfFileError,
 								                InvalidImageStreamError,
 								                UnicodeDecodeError,
 								            ) as e:
 								                logging.info(f"Damaged image encountered, attempting blob fallback: {e}")
 								            except Exception as e:
 								                logging.warning(f"Unexpected error getting image, attempting blob fallback: {e}")
 								            if image_blob is None:
 								                image_blob = getattr(related_part, "blob", None)
 								            if image_blob:
 								                image_blobs.append(image_blob)
 								        if not image_blobs:
 								            return None
 								        return LazyDocxImage(image_blobs)
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
 								    def __extract_table_content(self, tb):
 								        df = []
 								        for row in tb.rows:
 								            df.append([c.text for c in row.cells])
 								        return self.__compose_table_content(pd.DataFrame(df))
 								    def __compose_table_content(self, df):
 								        def blockType(b):
-												Refactor parser code (#9042)

### What problem does this PR solve?

Refactor code

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-07-25 12:04:07 +08:00
+								            pattern = [
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								                ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
 								                (r"^(20|19)[0-9]{2}年$", "Dt"),
 								                (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
 								                ("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
 								                (r"^第*[一二三四1-4]季度$", "Dt"),
 								                (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
 								                (r"^(20|19)[0-9]{2}[ABCDE]$", "DT"),
 								                ("^[0-9.,+%/ -]+$", "Nu"),
 								                (r"^[0-9A-Z/\._~-]+$", "Ca"),
 								                (r"^[A-Z]*[a-z' -]+$", "En"),
 								                (r"^[0-9.,+-]+[0-9A-Za-z/$￥%<>（）()' -]+$", "NE"),
 								                (r"^.{1}$", "Sg")
 								            ]
-												Refactor parser code (#9042)

### What problem does this PR solve?

Refactor code

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-07-25 12:04:07 +08:00
+								            for p, n in pattern:
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								                if re.search(p, b):
 								                    return n
-												Edit chunk shall update instead of insert it (#3709)

### What problem does this PR solve?

Edit chunk shall update instead of insert it. Close #3679 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-11-28 13:00:38 +08:00
+								            tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								            if len(tks) > 3:
 								                if len(tks) < 12:
 								                    return "Tx"
 								                else:
 								                    return "Lx"
-												refine code (#595)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
											
										
										
											2024-04-28 19:13:33 +08:00
+								            if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								                return "Nr"
 								            return "Ot"
 								        if len(df) < 2:
 								            return []
 								        max_type = Counter([blockType(str(df.iloc[i, j])) for i in range(
 								            1, len(df)) for j in range(len(df.iloc[i, :]))])
 								        max_type = max(max_type.items(), key=lambda x: x[1])[0]
 								        colnm = len(df.iloc[0, :])
-												Fix typo in code (#8327)

### What problem does this PR solve?

Fix typo in code

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-06-18 09:41:09 +08:00
+								        hdrows = [0]  # header is not necessarily appear in the first line
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        if max_type == "Nu":
 								            for r in range(1, len(df)):
 								                tys = Counter([blockType(str(df.iloc[r, j]))
 								                              for j in range(len(df.iloc[r, :]))])
 								                tys = max(tys.items(), key=lambda x: x[1])[0]
 								                if tys != max_type:
 								                    hdrows.append(r)
 								        lines = []
 								        for i in range(1, len(df)):
 								            if i in hdrows:
 								                continue
 								            hr = [r - i for r in hdrows]
 								            hr = [r for r in hr if r < 0]
 								            t = len(hr) - 1
 								            while t > 0:
 								                if hr[t] - hr[t - 1] > 1:
 								                    hr = hr[t:]
 								                    break
 								                t -= 1
 								            headers = []
 								            for j in range(len(df.iloc[i, :])):
 								                t = []
 								                for h in hr:
 								                    x = str(df.iloc[i + h, j]).strip()
 								                    if x in t:
 								                        continue
 								                    t.append(x)
 								                t = ",".join(t)
 								                if t:
 								                    t += ": "
 								                headers.append(t)
 								            cells = []
 								            for j in range(len(df.iloc[i, :])):
 								                if not str(df.iloc[i, j]):
 								                    continue
 								                cells.append(headers[j] + str(df.iloc[i, j]))
 								            lines.append(";".join(cells))
 								        if colnm > 3:
 								            return lines
 								        return ["\n".join(lines)]
-												Fix:#3230 When parsing a docx file using the Book parsing method, to_page is always -1, resulting in a block count of 0 even if parsing is successful (#3249)

### What problem does this PR solve?

When parsing a docx file using the Book parsing method, to_page is
always -1, resulting in a block count of 0 even if parsing is successful

Fix:#3230

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
											
										
										
											2024-11-08 09:21:42 +08:00
+								    def __call__(self, fnm, from_page=0, to_page=100000000):
-												apply pep8 formalize (#155)


											
										
										
											2024-03-27 11:33:46 +08:00
+								        self.doc = Document(fnm) if isinstance(
 								            fnm, str) else Document(BytesIO(fnm))
-												fix too long query exception (#1195)

### What problem does this PR solve?

#1161 
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-06-18 09:50:59 +08:00
+								        pn = 0 # parsed page
 								        secs = [] # parsed contents
-												Some document API refined. (#53)

Add naive chunking method to RAG
											
										
										
											2024-02-02 19:21:37 +08:00
+								        for p in self.doc.paragraphs:
-												apply pep8 formalize (#155)


											
										
										
											2024-03-27 11:33:46 +08:00
+								            if pn > to_page:
 								                break
-												fix too long query exception (#1195)

### What problem does this PR solve?

#1161 
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-06-18 09:50:59 +08:00
 								            runs_within_single_paragraph = [] # save runs within the range of pages
-												Some document API refined. (#53)

Add naive chunking method to RAG
											
										
										
											2024-02-02 19:21:37 +08:00
+								            for run in p.runs:
-												fix too long query exception (#1195)

### What problem does this PR solve?

#1161 
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-06-18 09:50:59 +08:00
+								                if pn > to_page:
 								                    break
 								                if from_page <= pn < to_page and p.text.strip():
 								                    runs_within_single_paragraph.append(run.text) # append run.text first
 								                # wrap page break checker into a static method
-												fix bug of ragflowdocxpparser (#1642)

### What problem does this PR solve?

#1627

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-07-23 09:25:32 +08:00
+								                if 'lastRenderedPageBreak' in run._element.xml:
-												Some document API refined. (#53)

Add naive chunking method to RAG
											
										
										
											2024-02-02 19:21:37 +08:00
+								                    pn += 1
-												Fix:#3230 When parsing a docx file using the Book parsing method, to_page is always -1, resulting in a block count of 0 even if parsing is successful (#3249)

### What problem does this PR solve?

When parsing a docx file using the Book parsing method, to_page is
always -1, resulting in a block count of 0 even if parsing is successful

Fix:#3230

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
											
										
										
											2024-11-08 09:21:42 +08:00
+								            secs.append(("".join(runs_within_single_paragraph), p.style.name if hasattr(p.style, 'name') else '')) # then concat run.text as part of the paragraph
-												fix too long query exception (#1195)

### What problem does this PR solve?

#1161 
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-06-18 09:50:59 +08:00
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
 								        return secs, tbls