deepdoc/parser/epub_parser.py

#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import logging
import posixpath
import warnings
import zipfile
from io import BytesIO
from urllib.parse import unquote
from xml.etree import ElementTree

from .html_parser import RAGFlowHtmlParser

# OPF XML namespaces
_OPF_NS = "http://www.idpf.org/2007/opf"
_CONTAINER_NS = "urn:oasis:names:tc:opendocument:xmlns:container"

# Media types that contain readable XHTML content
_XHTML_MEDIA_TYPES = {"application/xhtml+xml", "text/html", "text/xml"}

logger = logging.getLogger(__name__)


class RAGFlowEpubParser:
    """Parse EPUB files by extracting XHTML content in spine (reading) order
    and delegating to RAGFlowHtmlParser for chunking.

    The reading order comes from the OPF package document referenced by
    ``META-INF/container.xml``.  When that metadata is missing, cannot be
    parsed, or yields no usable item, every ``.xhtml``/``.html``/``.htm``
    entry outside ``META-INF/`` is used instead, sorted alphabetically.
    """

    def __call__(self, fnm, binary=None, chunk_token_num=512):
        """Parse an EPUB archive and collect the sections of its content items.

        Args:
            fnm: Path (or any object ``zipfile.ZipFile`` accepts) of the EPUB.
                It is only opened when ``binary`` is ``None``; otherwise it is
                used solely in the log message for an empty payload.
            binary: Optional raw bytes of the EPUB archive.
            chunk_token_num: Forwarded unchanged to ``RAGFlowHtmlParser``.

        Returns:
            A list built by extending it, in reading order, with whatever
            ``RAGFlowHtmlParser`` returns for each content item.

        Raises:
            ValueError: If ``binary`` is provided but empty.

        Errors raised by ``zipfile.ZipFile`` for an unreadable archive are not
        caught here and propagate to the caller.
        """
        if binary is not None:
            if not binary:
                logger.warning(
                    "RAGFlowEpubParser received an empty EPUB binary payload for %r",
                    fnm,
                )
                raise ValueError("Empty EPUB binary payload")
            source = BytesIO(binary)
        else:
            source = fnm

        all_sections = []
        # The context manager closes the archive on every exit path.
        with zipfile.ZipFile(source) as zf:
            content_items = self._get_spine_items(zf)
            html_parser = RAGFlowHtmlParser()

            for item_path in content_items:
                try:
                    html_bytes = zf.read(item_path)
                except KeyError:
                    # Best effort: a dangling manifest entry must not abort the
                    # whole book, but it should be visible in the logs.
                    logger.warning(
                        "EPUB content item %r is referenced by the package "
                        "but missing from the archive; skipping.",
                        item_path,
                    )
                    continue
                if not html_bytes:
                    logger.debug("Skipping empty EPUB content item: %s", item_path)
                    continue
                # UserWarnings emitted while parsing a single item are silenced
                # only for the duration of that call.
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", category=UserWarning)
                    sections = html_parser(
                        item_path, binary=html_bytes, chunk_token_num=chunk_token_num
                    )
                all_sections.extend(sections)

        return all_sections

    @staticmethod
    def _get_spine_items(zf):
        """Return content file paths in spine (reading) order.

        Args:
            zf: An open ``zipfile.ZipFile`` for the EPUB archive.

        Returns:
            A list of ZIP member names.  Spine entries whose ``idref`` is not
            in the manifest, or whose media type is not an (X)HTML type, are
            left out.  If the container or package metadata is missing or
            unparsable, or no spine entry survives, the result of
            ``_fallback_xhtml_order`` is returned instead.
        """
        fallback = RAGFlowEpubParser._fallback_xhtml_order

        # 1. Find the OPF file path from META-INF/container.xml
        try:
            container_xml = zf.read("META-INF/container.xml")
        except KeyError:
            return fallback(zf)

        try:
            container_root = ElementTree.fromstring(container_xml)
        except ElementTree.ParseError:
            logger.warning("Failed to parse META-INF/container.xml; falling back to XHTML order.")
            return fallback(zf)

        rootfile_el = container_root.find(f".//{{{_CONTAINER_NS}}}rootfile")
        if rootfile_el is None:
            return fallback(zf)

        opf_path = rootfile_el.get("full-path", "")
        if not opf_path:
            return fallback(zf)

        # Base directory of the OPF file (content paths are relative to it)
        opf_dir = opf_path.rsplit("/", 1)[0] + "/" if "/" in opf_path else ""

        # 2. Parse the OPF file
        try:
            opf_xml = zf.read(opf_path)
        except KeyError:
            return fallback(zf)

        try:
            opf_root = ElementTree.fromstring(opf_xml)
        except ElementTree.ParseError:
            logger.warning("Failed to parse OPF file '%s'; falling back to XHTML order.", opf_path)
            return fallback(zf)

        # 3. Build id->href+mediatype map from <manifest>
        manifest = {}
        for item in opf_root.findall(f".//{{{_OPF_NS}}}item"):
            item_id = item.get("id", "")
            href = item.get("href", "")
            if item_id and href:
                manifest[item_id] = (href, item.get("media-type", ""))

        # 4. Walk <spine> to get reading order
        archive_names = set(zf.namelist())
        spine_items = []
        for itemref in opf_root.findall(f".//{{{_OPF_NS}}}itemref"):
            entry = manifest.get(itemref.get("idref", ""))
            if entry is None:
                continue
            href, media_type = entry
            # Media types are case-insensitive, so normalise before comparing.
            if media_type.strip().lower() not in _XHTML_MEDIA_TYPES:
                continue
            spine_items.append(
                RAGFlowEpubParser._resolve_href(opf_dir, href, archive_names)
            )

        return spine_items if spine_items else fallback(zf)

    @staticmethod
    def _resolve_href(opf_dir, href, archive_names):
        """Map a manifest ``href`` to the name of a ZIP member.

        Manifest hrefs are relative URLs: they may be percent-encoded, contain
        ``..`` segments or carry a ``#fragment``, while ZIP member names are
        plain paths.  The literal concatenation is preferred when it exists in
        the archive, so archives that already resolved keep resolving the same
        way; otherwise the href is decoded and normalised.

        Args:
            opf_dir: Directory prefix of the OPF file, either ``""`` or a
                string ending in ``"/"``.
            href: The raw ``href`` attribute of a manifest item.
            archive_names: Collection of member names present in the archive.

        Returns:
            The member name to read.  It is not guaranteed to exist.
        """
        literal_path = opf_dir + href
        if literal_path in archive_names:
            return literal_path

        # Drop the fragment before decoding so an encoded '%23' in a file name
        # is not mistaken for a fragment separator.
        decoded = unquote(href.split("#", 1)[0])
        if not decoded:
            return literal_path
        return posixpath.normpath(posixpath.join(opf_dir, decoded))

    @staticmethod
    def _fallback_xhtml_order(zf):
        """Fallback: return all .xhtml/.html/.htm files sorted alphabetically.

        Entries under ``META-INF/`` are excluded; the extension check is
        case-insensitive.
        """
        html_suffixes = (".xhtml", ".html", ".htm")
        return sorted(
            name
            for name in zf.namelist()
            if name.lower().endswith(html_suffixes)
            and not name.startswith("META-INF/")
        )
Feat: support epub parsing (#13650)

Closes #1398

### What problem does this PR solve?

Adds native support for EPUB files. EPUB content is extracted in spine (reading) order and parsed using the existing HTML parser. No new dependencies required.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

To check this parser manually:

```python
uv run --python 3.12 python -c "
from deepdoc.parser import EpubParser

with open('$HOME/some_epub_book.epub', 'rb') as f:
    data = f.read()

sections = EpubParser()(None, binary=data, chunk_token_num=512)
print(f'Got {len(sections)} sections')
for i, s in enumerate(sections[:5]):
    print(f'\n--- Section {i} ---')
    print(s[:200])
"
```

2026-03-17 15:14:06 +03:00			`#`
			`# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`

			`import logging`
			`import warnings`
			`import zipfile`
			`from io import BytesIO`
			`from xml.etree import ElementTree`

			`from .html_parser import RAGFlowHtmlParser`

			`# OPF XML namespaces`
			`_OPF_NS = "http://www.idpf.org/2007/opf"`
			`_CONTAINER_NS = "urn:oasis:names:tc:opendocument:xmlns:container"`

			`# Media types that contain readable XHTML content`
			`_XHTML_MEDIA_TYPES = {"application/xhtml+xml", "text/html", "text/xml"}`

			`logger = logging.getLogger(__name__)`


			`class RAGFlowEpubParser:`
			`"""Parse EPUB files by extracting XHTML content in spine (reading) order`
			`and delegating to RAGFlowHtmlParser for chunking."""`

			`def __call__(self, fnm, binary=None, chunk_token_num=512):`
			`if binary is not None:`
			`if not binary:`
			`logger.warning(`
			`"RAGFlowEpubParser received an empty EPUB binary payload for %r",`
			`fnm,`
			`)`
			`raise ValueError("Empty EPUB binary payload")`
			`zf = zipfile.ZipFile(BytesIO(binary))`
			`else:`
			`zf = zipfile.ZipFile(fnm)`

			`try:`
			`content_items = self._get_spine_items(zf)`
			`all_sections = []`
			`html_parser = RAGFlowHtmlParser()`

			`for item_path in content_items:`
			`try:`
			`html_bytes = zf.read(item_path)`
			`except KeyError:`
			`continue`
			`if not html_bytes:`
			`logger.debug("Skipping empty EPUB content item: %s", item_path)`
			`continue`
			`with warnings.catch_warnings():`
			`warnings.filterwarnings("ignore", category=UserWarning)`
			`sections = html_parser(`
			`item_path, binary=html_bytes, chunk_token_num=chunk_token_num`
			`)`
			`all_sections.extend(sections)`

			`return all_sections`
			`finally:`
			`zf.close()`

			`@staticmethod`
			`def _get_spine_items(zf):`
			`"""Return content file paths in spine (reading) order."""`
			`# 1. Find the OPF file path from META-INF/container.xml`
			`try:`
			`container_xml = zf.read("META-INF/container.xml")`
			`except KeyError:`
			`return RAGFlowEpubParser._fallback_xhtml_order(zf)`

			`try:`
			`container_root = ElementTree.fromstring(container_xml)`
			`except ElementTree.ParseError:`
			`logger.warning("Failed to parse META-INF/container.xml; falling back to XHTML order.")`
			`return RAGFlowEpubParser._fallback_xhtml_order(zf)`

			`rootfile_el = container_root.find(f".//{{{_CONTAINER_NS}}}rootfile")`
			`if rootfile_el is None:`
			`return RAGFlowEpubParser._fallback_xhtml_order(zf)`

			`opf_path = rootfile_el.get("full-path", "")`
			`if not opf_path:`
			`return RAGFlowEpubParser._fallback_xhtml_order(zf)`

			`# Base directory of the OPF file (content paths are relative to it)`
			`opf_dir = opf_path.rsplit("/", 1)[0] + "/" if "/" in opf_path else ""`

			`# 2. Parse the OPF file`
			`try:`
			`opf_xml = zf.read(opf_path)`
			`except KeyError:`
			`return RAGFlowEpubParser._fallback_xhtml_order(zf)`

			`try:`
			`opf_root = ElementTree.fromstring(opf_xml)`
			`except ElementTree.ParseError:`
			`logger.warning("Failed to parse OPF file '%s'; falling back to XHTML order.", opf_path)`
			`return RAGFlowEpubParser._fallback_xhtml_order(zf)`

			`# 3. Build id->href+mediatype map from <manifest>`
			`manifest = {}`
			`for item in opf_root.findall(f".//{{{_OPF_NS}}}item"):`
			`item_id = item.get("id", "")`
			`href = item.get("href", "")`
			`media_type = item.get("media-type", "")`
			`if item_id and href:`
			`manifest[item_id] = (href, media_type)`

			`# 4. Walk <spine> to get reading order`
			`spine_items = []`
			`for itemref in opf_root.findall(f".//{{{_OPF_NS}}}itemref"):`
			`idref = itemref.get("idref", "")`
			`if idref not in manifest:`
			`continue`
			`href, media_type = manifest[idref]`
			`if media_type not in _XHTML_MEDIA_TYPES:`
			`continue`
			`spine_items.append(opf_dir + href)`

			`return (`
			`spine_items if spine_items else RAGFlowEpubParser._fallback_xhtml_order(zf)`
			`)`

			`@staticmethod`
			`def _fallback_xhtml_order(zf):`
			`"""Fallback: return all .xhtml/.html files sorted alphabetically."""`
			`return sorted(`
			`n`
			`for n in zf.namelist()`
			`if n.lower().endswith((".xhtml", ".html", ".htm"))`
			`and not n.startswith("META-INF/")`
			`)`