Files
ragflow/deepdoc/parser/epub_parser.py

146 lines
5.2 KiB
Python
Raw Permalink Normal View History

#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import posixpath
import warnings
import zipfile
from io import BytesIO
from urllib.parse import unquote
from xml.etree import ElementTree

from .html_parser import RAGFlowHtmlParser
logger = logging.getLogger(__name__)

# XML namespace URIs used to query META-INF/container.xml and the OPF package file.
_CONTAINER_NS = "urn:oasis:names:tc:opendocument:xmlns:container"
_OPF_NS = "http://www.idpf.org/2007/opf"

# Manifest media types that are treated as readable (X)HTML content documents.
_XHTML_MEDIA_TYPES = {
    "application/xhtml+xml",
    "text/html",
    "text/xml",
}
class RAGFlowEpubParser:
    """Parse EPUB files by extracting XHTML content in spine (reading) order
    and delegating to RAGFlowHtmlParser for chunking.

    An EPUB is a ZIP archive. ``META-INF/container.xml`` points at an OPF
    package file whose ``<manifest>`` lists the resources and whose ``<spine>``
    gives the reading order. When any of that metadata is missing or
    unparsable, every ``.xhtml``/``.html``/``.htm`` member outside
    ``META-INF/`` is used instead, sorted by name.
    """

    def __call__(self, fnm, binary=None, chunk_token_num=512):
        """Parse an EPUB and return the sections of all its content documents.

        Args:
            fnm: Passed to ``zipfile.ZipFile`` when ``binary`` is None;
                otherwise it is only used in the empty-payload log message.
            binary: Optional raw EPUB bytes. When given (not None), the
                archive is read from memory instead of from ``fnm``.
            chunk_token_num: Forwarded unchanged to ``RAGFlowHtmlParser``.

        Returns:
            A list holding the sections returned by ``RAGFlowHtmlParser`` for
            each content document, concatenated in reading order.

        Raises:
            ValueError: If ``binary`` is provided but empty.
        """
        if binary is not None:
            if not binary:
                logger.warning(
                    "RAGFlowEpubParser received an empty EPUB binary payload for %r",
                    fnm,
                )
                raise ValueError("Empty EPUB binary payload")
            zf = zipfile.ZipFile(BytesIO(binary))
        else:
            zf = zipfile.ZipFile(fnm)

        # The context manager closes the archive on every exit path.
        with zf:
            content_items = self._get_spine_items(zf)
            all_sections = []
            html_parser = RAGFlowHtmlParser()
            for item_path in content_items:
                try:
                    html_bytes = zf.read(item_path)
                except KeyError:
                    # Best effort: one broken reference must not fail the whole
                    # book, but it should be visible in the logs.
                    logger.warning(
                        "Skipping EPUB content item missing from archive: %s",
                        item_path,
                    )
                    continue
                if not html_bytes:
                    logger.debug("Skipping empty EPUB content item: %s", item_path)
                    continue
                # Silence UserWarning raised while the HTML parser handles this item.
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", category=UserWarning)
                    sections = html_parser(
                        item_path, binary=html_bytes, chunk_token_num=chunk_token_num
                    )
                all_sections.extend(sections)
            return all_sections

    @staticmethod
    def _resolve_href(opf_dir, href, names):
        """Map a manifest ``href`` to the name of a member of the archive.

        Args:
            opf_dir: Directory of the OPF file inside the archive, either
                empty or ending with ``/``.
            href: The ``href`` attribute of a manifest item.
            names: Collection of member names present in the archive.

        Returns:
            ``opf_dir + href`` when that literal path exists in ``names``.
            Otherwise the href with any ``#fragment`` removed, percent-decoded
            and normalised (``./`` and ``../`` collapsed), provided that path
            exists. If neither exists, the literal path is returned so the
            caller can report it as missing.
        """
        literal = opf_dir + href
        if literal in names:
            return literal
        # hrefs are URL references: they may be percent-encoded, relative
        # ("../Text/ch1.xhtml") or carry a fragment, while ZIP member names
        # are plain paths.
        decoded = unquote(href.split("#", 1)[0])
        candidate = posixpath.normpath(posixpath.join(opf_dir, decoded))
        if candidate in names:
            return candidate
        return literal

    @staticmethod
    def _get_spine_items(zf):
        """Return content file paths in spine (reading) order.

        Args:
            zf: An open ``zipfile.ZipFile`` for the EPUB.

        Returns:
            A list of archive paths for spine entries whose manifest media
            type is in ``_XHTML_MEDIA_TYPES``. Falls back to
            ``_fallback_xhtml_order`` when container.xml or the OPF file is
            missing or unparsable, or when the spine yields no usable item.
        """
        fallback = RAGFlowEpubParser._fallback_xhtml_order

        # 1. Find the OPF file path from META-INF/container.xml
        try:
            container_xml = zf.read("META-INF/container.xml")
        except KeyError:
            return fallback(zf)
        try:
            container_root = ElementTree.fromstring(container_xml)
        except ElementTree.ParseError:
            logger.warning("Failed to parse META-INF/container.xml; falling back to XHTML order.")
            return fallback(zf)

        rootfile_el = container_root.find(f".//{{{_CONTAINER_NS}}}rootfile")
        if rootfile_el is None:
            return fallback(zf)
        opf_path = rootfile_el.get("full-path", "")
        if not opf_path:
            return fallback(zf)

        # Base directory of the OPF file (content paths are relative to it)
        opf_dir = (opf_path.rsplit("/", 1)[0] + "/") if "/" in opf_path else ""

        # 2. Parse the OPF file
        try:
            opf_xml = zf.read(opf_path)
        except KeyError:
            return fallback(zf)
        try:
            opf_root = ElementTree.fromstring(opf_xml)
        except ElementTree.ParseError:
            logger.warning("Failed to parse OPF file '%s'; falling back to XHTML order.", opf_path)
            return fallback(zf)

        # 3. Build id->href+mediatype map from <manifest>
        manifest = {}
        for item in opf_root.findall(f".//{{{_OPF_NS}}}item"):
            item_id = item.get("id", "")
            href = item.get("href", "")
            if item_id and href:
                manifest[item_id] = (href, item.get("media-type", ""))

        # 4. Walk <spine> to get reading order
        names = frozenset(zf.namelist())
        spine_items = []
        for itemref in opf_root.findall(f".//{{{_OPF_NS}}}itemref"):
            entry = manifest.get(itemref.get("idref", ""))
            if entry is None:
                continue
            href, media_type = entry
            if media_type not in _XHTML_MEDIA_TYPES:
                continue
            spine_items.append(
                RAGFlowEpubParser._resolve_href(opf_dir, href, names)
            )

        return spine_items if spine_items else fallback(zf)

    @staticmethod
    def _fallback_xhtml_order(zf):
        """Fallback: return all .xhtml/.html/.htm files sorted alphabetically.

        Members whose name starts with ``META-INF/`` are excluded. The
        extension check is case-insensitive.
        """
        return sorted(
            n
            for n in zf.namelist()
            if n.lower().endswith((".xhtml", ".html", ".htm"))
            and not n.startswith("META-INF/")
        )