# Mirror of https://github.com/infiniflow/ragflow.git
# Synced 2026-06-29 15:31:05 +08:00 (mirrored file: 146 lines, 5.2 KiB, Python)
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import logging
import posixpath
import warnings
import zipfile
from io import BytesIO
from urllib.parse import unquote
from xml.etree import ElementTree

from .html_parser import RAGFlowHtmlParser

# XML namespaces used when querying EPUB metadata:
#   _OPF_NS       - elements of the OPF package document (<item>, <itemref>)
#   _CONTAINER_NS - elements of META-INF/container.xml (<rootfile>)
_OPF_NS = "http://www.idpf.org/2007/opf"
_CONTAINER_NS = "urn:oasis:names:tc:opendocument:xmlns:container"

# Media types that contain readable XHTML content.  Spine entries whose
# manifest media type is not listed here are skipped.  Kept immutable because
# it is a module-wide constant that is only ever used for membership tests.
_XHTML_MEDIA_TYPES = frozenset({"application/xhtml+xml", "text/html", "text/xml"})

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


class RAGFlowEpubParser:
    """Parse EPUB files into sections.

    The XHTML content documents of the book are read in spine (reading) order
    and each one is delegated to RAGFlowHtmlParser for chunking.
    """

    def __call__(self, fnm, binary=None, chunk_token_num=512):
        """Parse an EPUB and return the sections of all its content documents.

        Args:
            fnm: Name of the EPUB.  It is passed to ``zipfile.ZipFile`` when
                ``binary`` is None; otherwise it is only used in log messages.
            binary: Raw bytes of the EPUB archive.  When given, it takes
                precedence over ``fnm``.
            chunk_token_num: Forwarded unchanged to RAGFlowHtmlParser.

        Returns:
            A list holding the items RAGFlowHtmlParser produced for every
            content document, concatenated in reading order.

        Raises:
            ValueError: If ``binary`` is given but empty.
            zipfile.BadZipFile: If the input is not a valid ZIP archive.
        """
        if binary is not None:
            if not binary:
                logger.warning(
                    "RAGFlowEpubParser received an empty EPUB binary payload for %r",
                    fnm,
                )
                raise ValueError("Empty EPUB binary payload")
            source = BytesIO(binary)
        else:
            source = fnm

        # The context manager closes the archive on every exit path.
        with zipfile.ZipFile(source) as zf:
            content_items = self._get_spine_items(zf)
            all_sections = []
            html_parser = RAGFlowHtmlParser()

            for item_path in content_items:
                try:
                    html_bytes = zf.read(item_path)
                except KeyError:
                    # Best effort: a dangling spine entry must not abort the
                    # whole book, but it should not disappear silently either.
                    logger.warning(
                        "EPUB content item %r is referenced by the spine but "
                        "missing from the archive; skipping.",
                        item_path,
                    )
                    continue
                if not html_bytes:
                    logger.debug("Skipping empty EPUB content item: %s", item_path)
                    continue
                with warnings.catch_warnings():
                    # NOTE(review): presumably silences the HTML parser's
                    # warning about XML-looking markup -- confirm.
                    warnings.filterwarnings("ignore", category=UserWarning)
                    sections = html_parser(
                        item_path, binary=html_bytes, chunk_token_num=chunk_token_num
                    )
                all_sections.extend(sections)

            return all_sections

    @staticmethod
    def _resolve_href(opf_dir, href, archive_names):
        """Translate a manifest ``href`` into the name of a ZIP member.

        Manifest hrefs are URL references relative to the OPF file, so they
        may be percent-encoded, contain ``..`` segments or carry a
        ``#fragment``.  The literal concatenation is tried first so archives
        that already resolved keep resolving to exactly the same name.

        Returns:
            The member name to read, or None when the href has no path part.
        """
        literal = opf_dir + href
        if literal in archive_names:
            return literal
        decoded = unquote(href.split("#", 1)[0])
        if not decoded:
            return None
        return posixpath.normpath(posixpath.join(opf_dir, decoded))

    @staticmethod
    def _get_spine_items(zf):
        """Return content file paths in spine (reading) order.

        Falls back to ``_fallback_xhtml_order`` whenever the container file
        or the OPF package document is missing or malformed, or when the
        spine yields no readable XHTML item.

        Args:
            zf: An open ``zipfile.ZipFile`` for the EPUB archive.
        """
        fallback = RAGFlowEpubParser._fallback_xhtml_order

        # 1. Find the OPF file path from META-INF/container.xml
        try:
            container_xml = zf.read("META-INF/container.xml")
        except KeyError:
            return fallback(zf)

        try:
            container_root = ElementTree.fromstring(container_xml)
        except ElementTree.ParseError:
            logger.warning("Failed to parse META-INF/container.xml; falling back to XHTML order.")
            return fallback(zf)

        rootfile_el = container_root.find(f".//{{{_CONTAINER_NS}}}rootfile")
        if rootfile_el is None:
            return fallback(zf)

        opf_path = rootfile_el.get("full-path", "")
        if not opf_path:
            return fallback(zf)

        # Base directory of the OPF file (content paths are relative to it)
        opf_dir = opf_path.rsplit("/", 1)[0] + "/" if "/" in opf_path else ""

        # 2. Parse the OPF file
        try:
            opf_xml = zf.read(opf_path)
        except KeyError:
            return fallback(zf)

        try:
            opf_root = ElementTree.fromstring(opf_xml)
        except ElementTree.ParseError:
            logger.warning("Failed to parse OPF file '%s'; falling back to XHTML order.", opf_path)
            return fallback(zf)

        # 3. Build id -> (href, media type) map from <manifest>
        manifest = {}
        for item in opf_root.findall(f".//{{{_OPF_NS}}}item"):
            item_id = item.get("id", "")
            href = item.get("href", "")
            if item_id and href:
                manifest[item_id] = (href, item.get("media-type", ""))

        # 4. Walk <spine> to get reading order
        archive_names = set(zf.namelist())
        spine_items = []
        for itemref in opf_root.findall(f".//{{{_OPF_NS}}}itemref"):
            entry = manifest.get(itemref.get("idref", ""))
            if entry is None:
                continue
            href, media_type = entry
            if media_type not in _XHTML_MEDIA_TYPES:
                continue
            member = RAGFlowEpubParser._resolve_href(opf_dir, href, archive_names)
            if member:
                spine_items.append(member)

        return spine_items if spine_items else fallback(zf)

    @staticmethod
    def _fallback_xhtml_order(zf):
        """Fallback: return all .xhtml/.html/.htm files sorted alphabetically.

        Members under ``META-INF/`` are excluded; the extension match is
        case-insensitive.
        """
        return sorted(
            n
            for n in zf.namelist()
            if n.lower().endswith((".xhtml", ".html", ".htm"))
            and not n.startswith("META-INF/")
        )