Files
ragflow/rag/app/naive.py
2026-06-24 13:22:57 +08:00

1217 lines
50 KiB
Python

#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import re
import os
from functools import reduce
from io import BytesIO
from timeit import default_timer as timer
from docx import Document
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.table import Table as DocxTable
from docx.text.paragraph import Paragraph
from docx.opc.oxml import parse_xml
from markdown import markdown
from PIL import Image
from common.token_utils import num_tokens_from_string
from common.constants import LLMType, MAXIMUM_PAGE_NUMBER
from api.db.services.llm_service import LLMBundle
from api.db.joint_services.tenant_model_service import (
ensure_mineru_from_env,
ensure_opendataloader_from_env,
ensure_paddleocr_from_env,
get_first_provider_model_name,
get_model_config_from_provider_instance,
get_tenant_default_model_by_type,
)
from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
from deepdoc.parser import DocxParser, EpubParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, vision_figure_parser_pdf_wrapper
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.tcadp_parser import TCADPParser
from common.float_utils import normalize_overlapped_percent
from common.parser_config_utils import normalize_layout_recognizer
from common.text_utils import normalize_arabic_presentation_forms
from rag.nlp import (
concat_img,
find_codec,
naive_merge,
naive_merge_with_images,
naive_merge_docx,
rag_tokenizer,
tokenize_chunks,
doc_tokenize_chunks_with_images,
tokenize_table,
append_context2table_image4pdf,
tokenize_chunks_with_images,
) # noqa: F401
def _is_short_header(text, max_tokens=50):
"""
Check if text is a short markdown header.
Args:
text: The text to check
max_tokens: Maximum tokens for a header to be considered "short"
Returns:
bool: True if text is a short markdown header, False otherwise
"""
if not text or not text.strip():
return False
# Check if it matches markdown header pattern: 1-6 # followed by space
if not re.match(r"^#{1,6}\s+", text.strip()):
return False
# Check if token count is below threshold
return num_tokens_from_string(text) < max_tokens
def _normalize_section_text_for_rtl_presentation_forms(sections):
if not sections:
return sections
normalized_sections = []
for section in sections:
if isinstance(section, tuple):
if not section:
normalized_sections.append(section)
continue
text = section[0]
normalized_text = normalize_arabic_presentation_forms(text)
normalized_sections.append((normalized_text, *section[1:]))
continue
if isinstance(section, list):
if not section:
normalized_sections.append(section)
continue
text = section[0]
normalized_text = normalize_arabic_presentation_forms(text)
normalized_sections.append([normalized_text, *section[1:]])
continue
normalized_sections.append(normalize_arabic_presentation_forms(section))
return normalized_sections
def by_deepdoc(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
    """Parse a PDF with the DeepDoc parser, then post-process its tables/figures.

    Args:
        filename: Path of the PDF; used as the input when ``binary`` is falsy.
        binary: Raw PDF bytes; takes precedence over ``filename`` when truthy.
        from_page: First page passed to the parser.
        to_page: Last page passed to the parser.
        lang: Accepted for signature parity with the other parsers; unused here.
        callback: Progress callback forwarded to the parser and the figure wrapper.
        pdf_cls: Optional parser class to instantiate instead of ``Pdf``.
        **kwargs: Forwarded to ``vision_figure_parser_pdf_wrapper``.

    Returns:
        tuple: ``(sections, tables, pdf_parser)`` where ``tables`` is the value
        returned by ``vision_figure_parser_pdf_wrapper``.
    """
    parser_factory = pdf_cls if pdf_cls else Pdf
    pdf_parser = parser_factory()
    source = binary if binary else filename
    sections, raw_tables = pdf_parser(source, from_page=from_page, to_page=to_page, callback=callback)
    enriched_tables = vision_figure_parser_pdf_wrapper(
        tbls=raw_tables,
        sections=sections,
        callback=callback,
        **kwargs,
    )
    return sections, enriched_tables, pdf_parser
def by_mineru(
    filename,
    binary=None,
    from_page=0,
    to_page=MAXIMUM_PAGE_NUMBER,
    lang="Chinese",
    callback=None,
    pdf_cls=None,
    parse_method: str = "raw",
    mineru_llm_name: str | None = None,
    tenant_id: str | None = None,
    **kwargs,
):
    """Parse a PDF through the tenant's MinerU OCR model.

    Args:
        filename: Path handed to the parser as ``filepath``.
        binary: Raw PDF bytes handed to the parser.
        from_page, to_page, pdf_cls: Accepted for signature parity; unused here.
        lang: Language passed to ``LLMBundle`` and to ``parse_pdf``.
        callback: Optional progress callback; receives ``(-1, "MinerU not found.")``
            when no result could be produced.
        parse_method: Forwarded to ``parse_pdf``.
        mineru_llm_name: Explicit model name; resolved from the tenant's provider
            list or the environment when empty.
        tenant_id: Tenant owning the model; nothing is attempted without it.
        **kwargs: Forwarded to ``parse_pdf``. A ``vision_model`` entry is added
            when absent and an IMAGE2TEXT default model can be loaded.

    Returns:
        tuple: ``(sections, tables, pdf_parser)`` on success, otherwise
        ``(None, None, None)``.
    """
    if tenant_id and not mineru_llm_name:
        try:
            mineru_llm_name = get_first_provider_model_name(tenant_id, "MinerU", LLMType.OCR) or ensure_mineru_from_env(tenant_id)
        except Exception as e:  # best-effort fallback
            logging.warning(f"fallback to env mineru: {e}")

    if tenant_id and mineru_llm_name:
        try:
            model_config = get_model_config_from_provider_instance(tenant_id, LLMType.OCR, mineru_llm_name)
            bundle = LLMBundle(tenant_id=tenant_id, model_config=model_config, lang=lang)
            pdf_parser = bundle.mdl
            # Closes #14869: when the tenant has an IMAGE2TEXT model configured,
            # hand it to the MinerU parser so image chunks can be enriched with
            # VLM-generated descriptions. Best-effort: a missing vision model is
            # only logged.
            if "vision_model" not in kwargs:
                try:
                    vlm_config = get_tenant_default_model_by_type(tenant_id, LLMType.IMAGE2TEXT)
                    kwargs["vision_model"] = LLMBundle(tenant_id=tenant_id, model_config=vlm_config, lang=lang)
                except Exception as vlm_err:
                    logging.info(f"[MinerU] no IMAGE2TEXT model for tenant; skipping image VLM enhancement: {vlm_err}")
            sections, tables = pdf_parser.parse_pdf(
                filepath=filename,
                binary=binary,
                callback=callback,
                parse_method=parse_method,
                lang=lang,
                **kwargs,
            )
        except Exception as e:
            # Fall through to the generic "not found" report below.
            logging.error(f"Failed to parse pdf via LLMBundle MinerU ({mineru_llm_name}): {e}")
        else:
            return sections, tables, pdf_parser

    if callback:
        callback(-1, "MinerU not found.")
    return None, None, None
def by_docling(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
    """Parse a PDF with ``DoclingParser``.

    Args:
        filename: Path handed to the parser as ``filepath``.
        binary: Raw PDF bytes handed to the parser.
        from_page, to_page, lang, pdf_cls: Accepted for signature parity; unused here.
        callback: Optional progress callback; receives ``(-1, "Docling not found.")``
            when the installation check fails.
        **kwargs: Only ``parse_method`` (default ``"raw"``) is read.

    Returns:
        tuple: ``(sections, tables, pdf_parser)``; sections and tables are
        ``None`` when the installation check fails.

    Environment:
        ``DOCLING_OUTPUT_DIR``, ``DOCLING_DELETE_OUTPUT`` (integer flag, default 1)
        and ``DOCLING_SERVER_URL`` are read and forwarded to ``parse_pdf``.
    """
    pdf_parser = DoclingParser()
    parse_method = kwargs.get("parse_method", "raw")

    if not pdf_parser.check_installation():
        if callback:
            callback(-1, "Docling not found.")
        return None, None, pdf_parser

    env = os.environ
    output_dir = env.get("DOCLING_OUTPUT_DIR", "")
    delete_output = bool(int(env.get("DOCLING_DELETE_OUTPUT", 1)))
    server_url = env.get("DOCLING_SERVER_URL", "")
    sections, tables = pdf_parser.parse_pdf(
        filepath=filename,
        binary=binary,
        callback=callback,
        output_dir=output_dir,
        delete_output=delete_output,
        docling_server_url=server_url,
        parse_method=parse_method,
    )
    return sections, tables, pdf_parser
def by_opendataloader(
    filename,
    binary=None,
    from_page=0,
    to_page=MAXIMUM_PAGE_NUMBER,
    lang="Chinese",
    callback=None,
    pdf_cls=None,
    parse_method: str = "raw",
    opendataloader_llm_name: str | None = None,
    tenant_id: str | None = None,
    **kwargs,
):
    """Parse a PDF through the tenant's OpenDataLoader OCR model.

    Args:
        filename: Path handed to the parser as ``filepath``.
        binary: Raw PDF bytes handed to the parser.
        from_page, to_page, pdf_cls: Accepted for signature parity; unused here.
        lang: Language passed to ``LLMBundle``.
        callback: Optional progress callback; receives
            ``(-1, "OpenDataLoader not found.")`` when no result could be produced.
        parse_method: Forwarded to ``parse_pdf``.
        opendataloader_llm_name: Explicit model name; resolved from the tenant's
            provider list or the environment when empty.
        tenant_id: Tenant owning the model; nothing is attempted without it.
        **kwargs: Only ``hybrid``, ``image_output`` and ``sanitize`` are
            forwarded to ``parse_pdf``.

    Returns:
        tuple: ``(sections, tables, pdf_parser)`` on success, otherwise
        ``(None, None, None)``.
    """
    if tenant_id and not opendataloader_llm_name:
        try:
            opendataloader_llm_name = get_first_provider_model_name(tenant_id, "OpenDataLoader", LLMType.OCR) or ensure_opendataloader_from_env(tenant_id)
        except Exception as e:  # best-effort fallback
            logging.warning(f"fallback to env opendataloader: {e}")

    if tenant_id and opendataloader_llm_name:
        try:
            model_config = get_model_config_from_provider_instance(tenant_id, LLMType.OCR, opendataloader_llm_name)
            bundle = LLMBundle(tenant_id=tenant_id, model_config=model_config, lang=lang)
            pdf_parser = bundle.mdl
            # Whitelist of caller options understood by the parser.
            parse_options = {}
            for option in ("hybrid", "image_output", "sanitize"):
                if option in kwargs:
                    parse_options[option] = kwargs[option]
            sections, tables = pdf_parser.parse_pdf(
                filepath=filename,
                binary=binary,
                callback=callback,
                parse_method=parse_method,
                **parse_options,
            )
        except Exception as e:
            # Fall through to the generic "not found" report below.
            logging.error(f"Failed to parse pdf via LLMBundle OpenDataLoader ({opendataloader_llm_name}): {e}")
        else:
            return sections, tables, pdf_parser

    if callback:
        callback(-1, "OpenDataLoader not found.")
    return None, None, None
def by_tcadp(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
    """Parse a PDF with ``TCADPParser``.

    Args:
        filename: Path handed to the parser as ``filepath``.
        binary: Raw PDF bytes handed to the parser.
        from_page, to_page, lang, pdf_cls, **kwargs: Accepted for signature
            parity with the other parsers; unused here.
        callback: Optional progress callback; receives an error report with
            progress ``-1`` when the installation check fails.

    Returns:
        tuple: ``(sections, tables, tcadp_parser)``; sections and tables are
        ``None`` when the installation check fails.

    Environment:
        ``TCADP_OUTPUT_DIR`` is read and forwarded as ``output_dir``.
    """
    tcadp_parser = TCADPParser()
    if not tcadp_parser.check_installation():
        # ``callback`` defaults to None, so guard the call the same way the
        # sibling parsers do instead of raising TypeError.
        if callback:
            callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
        return None, None, tcadp_parser
    sections, tables = tcadp_parser.parse_pdf(
        filepath=filename,
        binary=binary,
        callback=callback,
        output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
        file_type="PDF",
    )
    return sections, tables, tcadp_parser
def by_paddleocr(
    filename,
    binary=None,
    from_page=0,
    to_page=MAXIMUM_PAGE_NUMBER,
    lang="Chinese",
    callback=None,
    pdf_cls=None,
    parse_method: str = "raw",
    paddleocr_llm_name: str | None = None,
    tenant_id: str | None = None,
    **kwargs,
):
    """Parse a PDF through the tenant's PaddleOCR model.

    Args:
        filename: Path handed to the parser as ``filepath``.
        binary: Raw PDF bytes handed to the parser.
        from_page, to_page, pdf_cls: Accepted for signature parity; unused here.
        lang: Language passed to ``LLMBundle``.
        callback: Optional progress callback; receives
            ``(-1, "PaddleOCR not found.")`` when no model could be resolved.
            It is NOT invoked when parsing itself fails.
        parse_method: Forwarded to ``parse_pdf``.
        paddleocr_llm_name: Explicit model name; resolved from the tenant's
            provider list or the environment when empty.
        tenant_id: Tenant owning the model; nothing is attempted without it.
        **kwargs: Forwarded to ``parse_pdf``.

    Returns:
        tuple: ``(sections, tables, pdf_parser)`` on success, otherwise
        ``(None, None, None)``.
    """
    failure = (None, None, None)

    if tenant_id and not paddleocr_llm_name:
        try:
            paddleocr_llm_name = get_first_provider_model_name(tenant_id, "PaddleOCR", LLMType.OCR) or ensure_paddleocr_from_env(tenant_id)
        except Exception as e:  # best-effort fallback
            logging.warning(f"fallback to env paddleocr: {e}")

    if tenant_id and paddleocr_llm_name:
        try:
            model_config = get_model_config_from_provider_instance(tenant_id, LLMType.OCR, paddleocr_llm_name)
            bundle = LLMBundle(tenant_id=tenant_id, model_config=model_config, lang=lang)
            pdf_parser = bundle.mdl
            sections, tables = pdf_parser.parse_pdf(
                filepath=filename,
                binary=binary,
                callback=callback,
                parse_method=parse_method,
                **kwargs,
            )
        except Exception as e:
            # A parse failure is logged and reported only through the return value.
            logging.error(f"Failed to parse pdf via LLMBundle PaddleOCR ({paddleocr_llm_name}): {e}")
            return failure
        return sections, tables, pdf_parser

    if callback:
        callback(-1, "PaddleOCR not found.")
    return failure
def by_plaintext(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, callback=None, **kwargs):
    """Parse a PDF as plain text, or with a vision model named by the caller.

    Args:
        filename: Path of the PDF; used as the input when ``binary`` is falsy.
        binary: Raw PDF bytes; takes precedence over ``filename`` when truthy.
        from_page: First page passed to the parser.
        to_page: Last page passed to the parser.
        callback: Progress callback forwarded to the parser.
        **kwargs: ``layout_recognizer`` selects the mode: empty or any spelling
            of "plain text" uses ``PlainParser``; anything else is treated as
            the name of an IMAGE2TEXT model, which then requires ``tenant_id``.
            ``lang`` (default ``"Chinese"``) is passed to ``LLMBundle``. All
            kwargs are forwarded to ``VisionParser``.

    Returns:
        tuple: ``(sections, tables, pdf_parser)``.

    Raises:
        ValueError: A vision layout recognizer was requested without ``tenant_id``.
    """
    layout_recognizer = (kwargs.get("layout_recognizer") or "").strip()
    # Compare case- and whitespace-insensitively: `chunk` maps a boolean False
    # to "PlainText", and the PARSERS key is "plaintext", so every one of these
    # spellings must select the plain parser, not only the literal "Plain Text".
    recognizer_key = "".join(layout_recognizer.split()).lower()
    if recognizer_key in ("", "plaintext"):
        pdf_parser = PlainParser()
    else:
        tenant_id = kwargs.get("tenant_id")
        if not tenant_id:
            raise ValueError("tenant_id is required when using vision layout recognizer")
        vision_model_config = get_model_config_from_provider_instance(tenant_id, LLMType.IMAGE2TEXT, layout_recognizer)
        vision_model = LLMBundle(
            tenant_id,
            model_config=vision_model_config,
            lang=kwargs.get("lang", "Chinese"),
        )
        pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
    sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
    return sections, tables, pdf_parser
# Dispatch table from lower-cased layout-recognizer name to the PDF parsing
# function. `chunk` looks names up with `PARSERS.get(name, by_plaintext)`, so
# any name missing here falls back to `by_plaintext`.
PARSERS = {
    "deepdoc": by_deepdoc,
    "mineru": by_mineru,
    "docling": by_docling,
    "opendataloader": by_opendataloader,
    "tcadp parser": by_tcadp,
    "paddleocr": by_paddleocr,
    "plaintext": by_plaintext,  # default
}
class Docx(DocxParser):
    """DOCX parser producing text, image and table lines in document order."""

    def __init__(self):
        pass

    def __clean(self, line):
        """Replace ideographic spaces (U+3000) with ASCII spaces and strip."""
        line = re.sub(r"\u3000", " ", line).strip()
        return line

    def __get_nearest_title(self, table_index, filename):
        """Build a ``"doc > heading > sub-heading"`` path for a table.

        Args:
            table_index: Zero-based index of the table among the tables in the
                document body.
            filename: File name; a trailing alphabetic extension is removed and
                the rest is used as the root of the path.

        Returns:
            str: The path joined with ``" > "``, or ``""`` when the table is not
            found, no heading precedes it, or the body cannot be iterated.
        """
        titles = []
        blocks = []
        # Get document name from filename parameter
        doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename)
        if not doc_name:
            doc_name = "Untitled Document"
        # Collect all paragraphs and tables while maintaining document order
        try:
            for i, block in enumerate(self.doc._element.body):
                if block.tag.endswith("p"):  # Paragraph
                    blocks.append(("p", i, Paragraph(block, self.doc)))
                elif block.tag.endswith("tbl"):  # Table (object itself is not needed)
                    blocks.append(("t", i, None))
        except Exception as e:
            logging.error(f"Error collecting blocks: {e}")
            return ""
        # Find the body position of the requested table
        target_table_pos = -1
        table_count = 0
        for block_type, pos, _ in blocks:
            if block_type == "t":
                if table_count == table_index:
                    target_table_pos = pos
                    break
                table_count += 1
        if target_table_pos == -1:
            return ""  # Target table not found

        def heading_paragraphs_before_table():
            """Yield heading paragraphs preceding the table, nearest first."""
            for block_type, pos, block in reversed(blocks):
                if pos >= target_table_pos or block_type != "p":
                    continue
                style = block.style
                # A style may exist without a name; re.search(None) would raise.
                if style and style.name and re.search(r"Heading\s*(\d+)", style.name, re.I):
                    yield block

        # Find the nearest non-empty heading above the table
        nearest_title = None
        for block in heading_paragraphs_before_table():
            try:
                level_match = re.search(r"(\d+)", block.style.name)
                if level_match:
                    level = int(level_match.group(1))
                    if level <= 7:  # Support up to 7 heading levels
                        title_text = block.text.strip()
                        if title_text:  # Avoid empty titles
                            nearest_title = (level, title_text)
                            break
            except Exception as e:
                logging.error(f"Error parsing heading level: {e}")
        if not nearest_title:
            return ""

        titles.append(nearest_title)
        current_level = nearest_title[0]
        # Walk up: repeatedly take the nearest heading whose level number is
        # smaller than the current one (levels may be skipped).
        while current_level > 1:
            found = False
            for block in heading_paragraphs_before_table():
                try:
                    level_match = re.search(r"(\d+)", block.style.name)
                    if level_match:
                        level = int(level_match.group(1))
                        if level < current_level:
                            title_text = block.text.strip()
                            if title_text:  # Avoid empty titles
                                titles.append((level, title_text))
                                current_level = level
                                found = True
                                break
                except Exception as e:
                    logging.error(f"Error parsing parent heading: {e}")
            if not found:  # No parent heading above this level
                break
        # Order from the top-level heading down to the nearest one
        titles.sort(key=lambda x: x[0])
        hierarchy = [doc_name] + [t[1] for t in titles]
        return " > ".join(hierarchy)

    def __call__(self, filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER):
        """Parse the document body into ``(text, image, table_html)`` tuples.

        Args:
            filename: Path of the DOCX; opened only when ``binary`` is falsy.
                Also used to build table captions.
            binary: Raw DOCX bytes; takes precedence over ``filename``.
            from_page: First page (inclusive) whose paragraphs are emitted.
            to_page: Paragraphs are emitted while the page counter is below this
                value; iteration stops once the counter exceeds it.

        Returns:
            list[tuple]: One ``(text, image, table)`` tuple per emitted line.
            Text lines carry text (captions may also carry the preceding
            image), image lines carry only an image, table lines carry only
            the table rendered as HTML.
        """
        self.doc = Document(filename) if not binary else Document(BytesIO(binary))
        pn = 0  # page counter, advanced by page-break markers found in runs
        lines = []
        last_image = None  # image from a text-less paragraph awaiting a caption
        table_idx = 0

        def flush_last_image():
            nonlocal last_image, lines
            if last_image is not None:
                lines.append({"text": "", "image": last_image, "table": None, "style": "Image"})
                last_image = None

        for block in self.doc._element.body:
            if pn > to_page:
                break
            if block.tag.endswith("p"):
                p = Paragraph(block, self.doc)
                if from_page <= pn < to_page:
                    text = p.text.strip()
                    style_name = p.style.name if p.style else ""
                    if text:
                        if style_name == "Caption":
                            former_image = None
                            # Attach the image right before the caption, unless
                            # that line is itself a caption that already owns it.
                            if lines and lines[-1].get("image") and lines[-1].get("style") != "Caption":
                                former_image = lines[-1].get("image")
                                lines.pop()
                            elif last_image is not None:
                                former_image = last_image
                                last_image = None
                            lines.append(
                                {
                                    "text": self.__clean(text),
                                    "image": former_image if former_image else None,
                                    "table": None,
                                    # Tag caption lines so the guard above can
                                    # recognise them; without the tag a second
                                    # consecutive caption popped the first one
                                    # and its text was lost.
                                    "style": "Caption",
                                }
                            )
                        else:
                            flush_last_image()
                            lines.append(
                                {
                                    "text": self.__clean(text),
                                    "image": None,
                                    "table": None,
                                }
                            )
                            current_image = self.get_picture(self.doc, p)
                            if current_image is not None:
                                lines.append(
                                    {
                                        "text": "",
                                        "image": current_image,
                                        "table": None,
                                    }
                                )
                    else:
                        # Picture-only paragraph: hold the image until the next
                        # caption or text line decides where it belongs.
                        current_image = self.get_picture(self.doc, p)
                        if current_image is not None:
                            last_image = current_image
                for run in p.runs:
                    xml = run._element.xml
                    if "lastRenderedPageBreak" in xml:
                        pn += 1
                        continue
                    if "w:br" in xml and 'type="page"' in xml:
                        pn += 1
            elif block.tag.endswith("tbl"):
                if pn < from_page or pn > to_page:
                    # Keep the index aligned with the document's table order.
                    table_idx += 1
                    continue
                flush_last_image()
                tb = DocxTable(block, self.doc)
                title = self.__get_nearest_title(table_idx, filename)
                html = "<table>"
                if title:
                    html += f"<caption>Table Location: {title}</caption>"
                for r in tb.rows:
                    html += "<tr>"
                    col_idx = 0
                    try:
                        # Fetch the cells once per row instead of on every access.
                        cells = r.cells
                        while col_idx < len(cells):
                            span = 1
                            c = cells[col_idx]
                            # Adjacent cells with identical text are rendered
                            # as one cell with a colspan.
                            for j in range(col_idx + 1, len(cells)):
                                if c.text == cells[j].text:
                                    span += 1
                                    col_idx = j
                                else:
                                    break
                            col_idx += 1
                            html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
                    except Exception as e:
                        logging.warning(f"Error parsing table, ignore: {e}")
                    html += "</tr>"
                html += "</table>"
                lines.append({"text": "", "image": None, "table": html})
                table_idx += 1
        flush_last_image()
        new_line = [(line.get("text"), line.get("image"), line.get("table")) for line in lines]
        return new_line

    def to_markdown(self, filename=None, binary=None, inline_images: bool = True):
        """
        Convert a DOCX to markdown via HTML.

        This function uses mammoth, licensed under the BSD 2-Clause License.

        Args:
            filename: Path of the DOCX; opened only when ``binary`` is falsy.
            binary: Raw DOCX bytes; takes precedence over ``filename``.
            inline_images: When True, images are embedded as base64 data URLs.

        Returns:
            str: The markdown produced by ``markdownify`` from mammoth's HTML.
        """
        import base64
        import uuid

        import mammoth
        from markdownify import markdownify

        docx_file = BytesIO(binary) if binary else open(filename, "rb")

        def _convert_image_to_base64(image):
            try:
                with image.open() as image_file:
                    image_bytes = image_file.read()
                encoded = base64.b64encode(image_bytes).decode("utf-8")
                base64_url = f"data:{image.content_type};base64,{encoded}"
                alt_name = f"img_{uuid.uuid4().hex[:8]}"
                return {"src": base64_url, "alt": alt_name}
            except Exception as e:
                logging.warning(f"Failed to convert image to base64: {e}")
                return {"src": "", "alt": "image"}

        try:
            if inline_images:
                result = mammoth.convert_to_html(docx_file, convert_image=mammoth.images.img_element(_convert_image_to_base64))
            else:
                result = mammoth.convert_to_html(docx_file)
            html = result.value
            markdown_text = markdownify(html)
            return markdown_text
        finally:
            # Only the handle opened here from ``filename`` needs closing.
            if not binary:
                docx_file.close()
class Pdf(PdfParser):
    """DeepDoc PDF parser running OCR, layout, table and text-merge stages."""

    def __init__(self):
        super().__init__()

    def __call__(self, filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, zoomin=3, callback=None, separate_tables_figures=False):
        """Run the full parsing pipeline on a PDF.

        Args:
            filename: Path of the PDF; used as the input when ``binary`` is falsy.
            binary: Raw PDF bytes; takes precedence over ``filename`` when truthy.
            from_page: First page handed to the OCR stage.
            to_page: Last page handed to the OCR stage.
            zoomin: Zoom factor passed to every stage.
            callback: Optional progress callback, called as ``callback(msg=...)``
                and ``callback(progress, message)``. May be ``None``.
            separate_tables_figures: When True, tables and figures are returned
                separately and the vertical merge step is skipped.

        Returns:
            tuple: ``(sections, tables)``, or ``(sections, tables, figures)`` when
            ``separate_tables_figures`` is True. ``sections`` is a list of
            ``(text, line_tag)`` pairs, one per box.
        """

        def _noop_callback(*_args, **_kwargs):
            return None

        # ``callback`` defaults to None; report through a no-op in that case
        # instead of raising TypeError on the first progress message.
        report = _noop_callback if callback is None else callback

        start = timer()
        first_start = start
        report(msg="OCR started")
        # The OCR stage receives exactly what the caller passed.
        self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
        report(msg="OCR finished ({:.2f}s)".format(timer() - start))
        logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))

        start = timer()
        self._layouts_rec(zoomin)
        report(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))

        start = timer()
        self._table_transformer_job(zoomin)
        report(0.65, "Table analysis ({:.2f}s)".format(timer() - start))

        start = timer()
        self._text_merge(zoomin=zoomin)
        report(0.67, "Text merged ({:.2f}s)".format(timer() - start))

        if separate_tables_figures:
            tbls, figures = self._extract_table_figure(True, zoomin, True, True, True)
            self._concat_downward()
            logging.info("layouts cost: {}s".format(timer() - first_start))
            return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls, figures

        tbls = self._extract_table_figure(True, zoomin, True, True)
        self._naive_vertical_merge()
        self._concat_downward()
        # self._final_reading_order_merge()
        # self._filter_forpages()
        logging.info("layouts cost: {}s".format(timer() - first_start))
        return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls
# Maximum number of HTTP redirects followed when fetching a remote image
# referenced by a markdown document (each hop is SSRF-validated). The fetch
# loop issues at most MAX_IMAGE_REDIRECTS + 1 requests before giving up.
MAX_IMAGE_REDIRECTS = 5
class Markdown(MarkdownParser):
    """Markdown parser that also resolves images and hyperlinks per section."""

    # Target of a markdown image: ![alt](url ...)
    _MD_IMG_RE = re.compile(r"!\[[^\]]*\]\(([^)\s]+)")
    # Value of an HTML src attribute, up to the closing quote, '>' or whitespace.
    # The previous pattern was over-escaped: inside the character class ``\\s``
    # meant "backslash or the letter s", so URLs were cut at the first "s".
    _HTML_IMG_RE = re.compile(r"""src=["']([^"'>\s]+)""", re.IGNORECASE)

    def md_to_html(self, sections):
        """Render markdown text to a parsed HTML tree.

        Args:
            sections: A markdown string, or a sequence whose first item is one.

        Returns:
            A ``BeautifulSoup`` object, or ``[]`` when there is no text to render.
        """
        if not sections:
            return []
        if isinstance(sections, str):
            text = sections
        elif isinstance(sections[0], str):
            text = sections[0]
        else:
            return []
        from bs4 import BeautifulSoup

        html_content = markdown(text)
        soup = BeautifulSoup(html_content, "html.parser")
        return soup

    def get_hyperlink_urls(self, soup):
        """Return the set of non-empty ``href`` values of all ``<a>`` tags.

        Always returns a set (empty when ``soup`` is falsy) so callers get one
        consistent type.
        """
        if not soup:
            return set()
        return {a.get("href") for a in soup.find_all("a") if a.get("href")}

    def extract_image_urls_with_lines(self, text):
        """Locate every image reference in ``text`` with its zero-based line.

        Markdown image syntax and HTML ``src`` attributes are matched line by
        line; a second pass parses the whole text as HTML to catch ``<img>``
        tags that span several lines.

        Returns:
            list[dict]: ``{"url": ..., "line": ...}`` entries, unique per
            ``(url, line)`` pair, in discovery order.
        """
        urls = []
        seen = set()

        def remember(url, line_no):
            key = (url, line_no)
            if key not in seen:
                urls.append({"url": url, "line": line_no})
                seen.add(key)

        for idx, line in enumerate(text.splitlines()):
            for url in self._MD_IMG_RE.findall(line):
                remember(url, idx)
            for url in self._HTML_IMG_RE.findall(line):
                remember(url, idx)

        # cross-line
        try:
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(text, "html.parser")
            # Offset of each newline; line i ends at newline_offsets[i].
            newline_offsets = [m.start() for m in re.finditer(r"\n", text)] + [len(text)]
            for img_tag in soup.find_all("img"):
                src = img_tag.get("src")
                if not src:
                    continue
                pos = text.find(str(img_tag))
                if pos == -1:
                    # The serialized tag may differ from the raw text; fall
                    # back to the first occurrence of the URL itself.
                    pos = max(text.find(src), 0)
                line_no = 0
                for i, off in enumerate(newline_offsets):
                    if pos <= off:
                        line_no = i
                        break
                remember(src, line_no)
        except Exception as e:
            logging.error("Failed to extract image urls: {}".format(e))
        return urls

    def load_images_from_urls(self, urls, cache=None):
        """Load the images referenced by ``urls`` as RGB ``PIL`` images.

        Args:
            urls: Iterable of ``http(s)://`` URLs or local file paths.
            cache: Optional ``{url: image_or_None}`` mapping of earlier results;
                a falsy value starts a fresh cache.

        Returns:
            tuple: ``(images, cache)`` where ``images`` holds the successfully
            loaded images in input order and ``cache`` also records failures
            as ``None`` so they are not retried.
        """
        import requests
        from pathlib import Path
        from urllib.parse import urljoin
        from common.ssrf_guard import assert_url_is_safe, pin_dns

        cache = cache or {}
        images = []
        for url in urls:
            if url in cache:
                if cache[url]:
                    images.append(cache[url])
                continue
            img_obj = None
            try:
                if url.startswith(("http://", "https://")):
                    # SSRF guard: image references come from the (untrusted) uploaded
                    # document, so validate and DNS-pin every hop before connecting.
                    # Otherwise a markdown image like ![x](http://169.254.169.254/...)
                    # would make the server fetch internal services / cloud metadata.
                    # Redirects are followed manually so each hop is re-validated,
                    # mirroring common/data_source/rss_connector.py.
                    current_hostname, current_ip = assert_url_is_safe(url)
                    current_url = url
                    response = None
                    try:
                        for _ in range(MAX_IMAGE_REDIRECTS + 1):
                            # Release the previous hop before opening the next: with
                            # stream=True the connection isn't returned to the pool
                            # until the body is read or the response is closed.
                            if response is not None:
                                response.close()
                            with pin_dns(current_hostname, current_ip):
                                response = requests.get(current_url, stream=True, timeout=30, allow_redirects=False)
                            if response.status_code not in (301, 302, 303, 307, 308):
                                break
                            location = response.headers.get("Location")
                            if not location:
                                break
                            current_url = urljoin(current_url, location)
                            current_hostname, current_ip = assert_url_is_safe(current_url)
                        else:
                            raise ValueError(f"Exceeded {MAX_IMAGE_REDIRECTS} redirects fetching {url!r}")
                        if response.status_code == 200 and response.headers.get("Content-Type", "").startswith("image/"):
                            img_obj = Image.open(BytesIO(response.content)).convert("RGB")
                    finally:
                        # Always release the final/streamed response, including the
                        # non-image and redirect-cap paths where the body is unread.
                        if response is not None:
                            response.close()
                else:
                    # NOTE(review): the path comes from the document itself and is
                    # opened as-is — confirm callers only pass trusted local paths.
                    local_path = Path(url)
                    if local_path.exists():
                        # convert() returns an independent image, so the file
                        # handle can be closed as soon as the block exits.
                        with Image.open(url) as local_image:
                            img_obj = local_image.convert("RGB")
                    else:
                        logging.warning(f"Local image file not found: {url}")
            except Exception as e:
                logging.error(f"Failed to download/open image from {url}: {e}")
            cache[url] = img_obj
            if img_obj:
                images.append(img_obj)
        return images, cache

    def __call__(self, filename, binary=None, separate_tables=True, delimiter=None, return_section_images=False):
        """Parse markdown into text sections and optional standalone table chunks.

        Args:
            filename: Path of the markdown file; read only when ``binary`` is falsy.
            binary: Raw file bytes; decoded with the codec from ``find_codec``.
            separate_tables: When True, tables are removed from the text that
                is split into sections.
            delimiter: Passed to ``MarkdownElementExtractor.extract_elements``.
            return_section_images: When True, also return one image (or
                ``None``) per section.

        Returns:
            tuple: ``(sections, tables)`` or ``(sections, tables, section_images)``.
            ``sections`` holds ``(content, "")`` pairs and ``tables`` holds
            ``((None, table_html), "")`` entries.
        """
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            # NOTE(review): no explicit encoding, so the platform default is used.
            with open(filename, "r") as f:
                txt = f.read()
        remainder, tables = self.extract_tables_and_remainder(f"{txt}\n", separate_tables=separate_tables)
        parsing_text = remainder if separate_tables else txt
        extractor = MarkdownElementExtractor(parsing_text)
        image_refs = self.extract_image_urls_with_lines(parsing_text)
        element_sections = extractor.extract_elements(delimiter, include_meta=True)

        sections = []
        section_images = []
        image_cache = {}
        for element in element_sections:
            content = element["content"]
            start_line = element["start_line"]
            end_line = element["end_line"]
            # Images whose reference line falls inside this element.
            urls_in_section = [ref["url"] for ref in image_refs if start_line <= ref["line"] <= end_line]
            imgs = []
            if urls_in_section:
                imgs, image_cache = self.load_images_from_urls(urls_in_section, image_cache)
            combined_image = None
            if imgs:
                combined_image = reduce(concat_img, imgs) if len(imgs) > 1 else imgs[0]
            sections.append((content, ""))
            section_images.append(combined_image)

        tbls = []
        for table in tables:
            tbls.append(((None, markdown(table, extensions=["markdown.extensions.tables"])), ""))
        if return_section_images:
            return sections, tbls, section_images
        return sections, tbls
def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Build a |_SerializedRelationships| collection from *rels_item_xml*.

    Relationships whose target is ``"../NULL"`` or ``"NULL"``, or whose target
    starts with ``"#"``, are left out. An empty collection is returned when
    *rels_item_xml* is |None|.
    """
    srels = _SerializedRelationships()
    if rels_item_xml is None:
        return srels

    ignored_targets = ("../NULL", "NULL")
    for rel_elm in parse_xml(rels_item_xml).Relationship_lst:
        target = rel_elm.target_ref
        if target in ignored_targets or target.startswith("#"):
            continue
        srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
    return srels
def chunk(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, excel, txt.
This method apply the naive ways to chunk files.
Successive text will be sliced into pieces using 'delimiter'.
Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
"""
urls = set()
url_res = []
lang = lang or "Chinese"
is_english = lang.lower() == "english" # is_english(cks)
parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
child_deli = (parser_config.get("children_delimiter") or "").encode("utf-8").decode("unicode_escape").encode("latin1").decode("utf-8")
cust_child_deli = re.findall(r"`([^`]+)`", child_deli)
child_deli = "|".join(re.sub(r"`([^`]+)`", "", child_deli))
if cust_child_deli:
cust_child_deli = sorted(set(cust_child_deli), key=lambda x: -len(x))
cust_child_deli = "|".join(re.escape(t) for t in cust_child_deli if t)
child_deli += cust_child_deli
is_markdown = False
table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0))
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
res = []
pdf_parser = None
section_images = None
is_root = kwargs.get("is_root", True)
embed_res = []
if is_root:
# Only extract embedded files at the root call
embeds = []
if binary is not None:
embeds = extract_embed_file(binary)
else:
raise Exception("Embedding extraction from file path is not supported.")
# Recursively chunk each embedded file and collect results
for embed_filename, embed_bytes in embeds:
try:
sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, is_root=False, **kwargs) or []
embed_res.extend(sub_res)
except Exception as e:
error_msg = f"Failed to chunk embed {embed_filename}: {e}"
logging.error(error_msg)
if callback:
callback(0.05, error_msg)
continue
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
if parser_config.get("analyze_hyperlink", False) and is_root:
urls = extract_links_from_docx(binary)
for index, url in enumerate(urls):
html_bytes, metadata = extract_html(url)
if not html_bytes:
continue
try:
sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
except Exception as e:
logging.info(f"Failed to chunk url in registered file type {url}: {e}")
sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
url_res.extend(sub_url_res)
# fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
_SerializedRelationships.load_from_xml = load_from_xml_v2
# sections = (text, image, tables)
sections = Docx()(filename, binary)
sections = _normalize_section_text_for_rtl_presentation_forms(sections)
# chunks list[dict]
# images list - index of image chunk in chunks
chunks, images = naive_merge_docx(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), table_context_size, image_context_size)
vision_figure_parser_docx_wrapper_naive(chunks=chunks, idx_lst=images, callback=callback, **kwargs)
callback(0.8, "Finish parsing.")
st = timer()
res.extend(doc_tokenize_chunks_with_images(chunks, doc, is_english, child_delimiters_pattern=child_deli))
logging.info("naive_merge({}): {}".format(filename, timer() - st))
res.extend(embed_res)
res.extend(url_res)
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
opendataloader_llm_name = kwargs.pop("opendataloader_llm_name", None)
if layout_recognizer == "OpenDataLoader" and parser_model_name:
opendataloader_llm_name = parser_model_name
if parser_config.get("analyze_hyperlink", False) and is_root:
urls = extract_links_from_pdf(binary)
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "PlainText"
name = layout_recognizer.strip().lower()
parser = PARSERS.get(name, by_plaintext)
callback(0.1, "Start to parse.")
sections, tables, pdf_parser = parser(
filename=filename,
binary=binary,
from_page=from_page,
to_page=to_page,
lang=lang,
callback=callback,
layout_recognizer=layout_recognizer,
mineru_llm_name=parser_model_name,
paddleocr_llm_name=parser_model_name,
opendataloader_llm_name=opendataloader_llm_name,
**kwargs,
)
sections = _normalize_section_text_for_rtl_presentation_forms(sections)
if not sections and not tables:
return []
if table_context_size or image_context_size:
tables = append_context2table_image4pdf(sections, tables, image_context_size)
if name in ["tcadp", "docling", "mineru", "paddleocr", "opendataloader"]:
if int(parser_config.get("chunk_token_num", 0)) <= 0:
parser_config["chunk_token_num"] = 0
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
# Check if tcadp_parser is selected for spreadsheet files
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
if layout_recognizer == "TCADP Parser":
table_result_type = parser_config.get("table_result_type", "1")
markdown_image_response_type = parser_config.get("markdown_image_response_type", "1")
tcadp_parser = TCADPParser(table_result_type=table_result_type, markdown_image_response_type=markdown_image_response_type)
if not tcadp_parser.check_installation():
callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
return res
# Determine file type based on extension
file_type = "XLSX" if re.search(r"\.xlsx?$", filename, re.IGNORECASE) else "CSV"
sections, tables = tcadp_parser.parse_pdf(filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), file_type=file_type)
sections = _normalize_section_text_for_rtl_presentation_forms(sections)
parser_config["chunk_token_num"] = 0
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
else:
# Default DeepDOC parser
excel_parser = ExcelParser()
if parser_config.get("html4excel"):
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
parser_config["chunk_token_num"] = 0
else:
sections = [(_, "") for _ in excel_parser(binary) if _]
sections = _normalize_section_text_for_rtl_presentation_forms(sections)
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = TxtParser()(filename, binary, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?;。;!?"))
sections = _normalize_section_text_for_rtl_presentation_forms(sections)
print("\n", "-"*150, "\n")
print(sections)
print("\n", "-"*150, "\n")
callback(0.8, "Finish parsing.")
elif re.search(r"\.(md|markdown|mdx)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
sections, tables, section_images = markdown_parser(
filename,
binary,
separate_tables=True,
delimiter=parser_config.get("delimiter", "\n!?;。;!?"),
return_section_images=True,
)
sections = _normalize_section_text_for_rtl_presentation_forms(sections)
is_markdown = True
try:
vision_model_config = get_tenant_default_model_by_type(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
vision_model = LLMBundle(kwargs["tenant_id"], vision_model_config)
callback(0.2, "Visual model detected. Attempting to enhance figure extraction...")
except Exception as e:
logging.warning(f"Failed to detect figure extraction: {e}")
vision_model = None
if vision_model:
# Process images for each section
for idx, (section_text, _) in enumerate(sections):
images = []
if section_images and len(section_images) > idx and section_images[idx] is not None:
images.append(section_images[idx])
if images and len(images) > 0:
# If multiple images found, combine them using concat_img
combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
if section_images:
section_images[idx] = combined_image
else:
section_images = [None] * len(sections)
section_images[idx] = combined_image
markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=[((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
boosted_figures = markdown_vision_parser(callback=callback)
sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])
else:
logging.warning("No visual model detected. Skipping figure parsing enhancement.")
if parser_config.get("hyperlink_urls", False) and is_root:
for idx, (section_text, _) in enumerate(sections):
soup = markdown_parser.md_to_html(section_text)
hyperlink_urls = markdown_parser.get_hyperlink_urls(soup)
urls.update(hyperlink_urls)
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
chunk_token_num = int(parser_config.get("chunk_token_num", 128))
sections = HtmlParser()(filename, binary, chunk_token_num)
sections = [(_, "") for _ in sections if _]
sections = _normalize_section_text_for_rtl_presentation_forms(sections)
callback(0.8, "Finish parsing.")
elif re.search(r"\.epub$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
chunk_token_num = int(parser_config.get("chunk_token_num", 128))
sections = EpubParser()(filename, binary, chunk_token_num)
sections = [(_, "") for _ in sections if _]
sections = _normalize_section_text_for_rtl_presentation_forms(sections)
callback(0.8, "Finish parsing.")
elif re.search(r"\.(json|jsonl|ldjson)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
chunk_token_num = int(parser_config.get("chunk_token_num", 128))
sections = JsonParser(chunk_token_num)(binary)
sections = [(_, "") for _ in sections if _]
sections = _normalize_section_text_for_rtl_presentation_forms(sections)
callback(0.8, "Finish parsing.")
elif re.search(r"\.doc$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
try:
from tika import parser as tika_parser
except Exception as e:
callback(0.8, f"tika not available: {e}. Unsupported .doc parsing.")
logging.warning(f"tika not available: {e}. Unsupported .doc parsing for {filename}.")
return []
binary = BytesIO(binary)
doc_parsed = tika_parser.from_buffer(binary)
if doc_parsed.get("content", None) is not None:
sections = doc_parsed["content"].split("\n")
sections = [(_, "") for _ in sections if _]
sections = _normalize_section_text_for_rtl_presentation_forms(sections)
callback(0.8, "Finish parsing.")
else:
error_msg = f"tika.parser got empty content from {filename}."
callback(0.8, error_msg)
logging.warning(error_msg)
return []
else:
raise NotImplementedError("file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
st = timer()
overlapped_percent = normalize_overlapped_percent(parser_config.get("overlapped_percent", 0))
if is_markdown:
merged_chunks = []
merged_images = []
chunk_limit = max(0, int(parser_config.get("chunk_token_num", 128)))
current_text = ""
current_tokens = 0
current_image = None
for idx, sec in enumerate(sections):
text = sec[0] if isinstance(sec, tuple) else sec
sec_tokens = num_tokens_from_string(text)
sec_image = section_images[idx] if section_images and idx < len(section_images) else None
# Don't finalize chunk if current_text is a short header (force merge with next section)
if current_text and not _is_short_header(current_text) and current_tokens + sec_tokens > chunk_limit:
merged_chunks.append(current_text)
merged_images.append(current_image)
overlap_part = ""
if overlapped_percent > 0:
overlap_len = int(len(current_text) * overlapped_percent / 100)
if overlap_len > 0:
overlap_part = current_text[-overlap_len:]
current_text = overlap_part
current_tokens = num_tokens_from_string(current_text)
current_image = current_image if overlap_part else None
if current_text:
current_text += "\n" + text
else:
current_text = text
current_tokens += sec_tokens
if sec_image:
current_image = concat_img(current_image, sec_image) if current_image else sec_image
if current_text:
merged_chunks.append(current_text)
merged_images.append(current_image)
chunks = merged_chunks
has_images = merged_images and any(img is not None for img in merged_images)
if has_images:
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images, child_delimiters_pattern=child_deli))
else:
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))
else:
if section_images:
if all(image is None for image in section_images):
section_images = None
if section_images:
chunks, images = naive_merge_with_images(sections, section_images, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), overlapped_percent)
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
else:
chunks = naive_merge(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), overlapped_percent)
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))
if urls and parser_config.get("analyze_hyperlink", False) and is_root:
for index, url in enumerate(urls):
html_bytes, metadata = extract_html(url)
if not html_bytes:
continue
try:
sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
except Exception as e:
logging.info(f"Failed to chunk url in registered file type {url}: {e}")
sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
url_res.extend(sub_url_res)
logging.info("naive_merge({}): {}".format(filename, timer() - st))
if embed_res:
res.extend(embed_res)
if url_res:
res.extend(url_res)
# if table_context_size or image_context_size:
# attach_media_context(res, table_context_size, image_context_size)
# Attach PDF outline as transient metadata on the first chunk.
# task_executor.py will extract and persist it as document metadata.
if res and pdf_parser and getattr(pdf_parser, "outlines", None):
res[0]["__outline__"] = [
{"title": title, "depth": depth}
for title, depth, *_ in pdf_parser.outlines
]
return res
if __name__ == "__main__":
    # Manual entry point: run chunk() on the path given as the first
    # command-line argument, with from_page=0 and to_page=10, and with a
    # progress callback that discards every notification.
    import sys

    def _ignore_progress(prog=None, msg=""):
        """Accept a progress update and do nothing with it."""

    source_path = sys.argv[1]
    chunk(source_path, from_page=0, to_page=10, callback=_ignore_progress)