# # Copyright 2025 The InfiniFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import logging import re import os from functools import reduce from io import BytesIO from timeit import default_timer as timer from docx import Document from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship from docx.table import Table as DocxTable from docx.text.paragraph import Paragraph from docx.opc.oxml import parse_xml from markdown import markdown from PIL import Image from common.token_utils import num_tokens_from_string from common.constants import LLMType, MAXIMUM_PAGE_NUMBER from api.db.services.llm_service import LLMBundle from api.db.joint_services.tenant_model_service import ( ensure_mineru_from_env, ensure_opendataloader_from_env, ensure_paddleocr_from_env, get_first_provider_model_name, get_model_config_from_provider_instance, get_tenant_default_model_by_type, ) from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html from deepdoc.parser import DocxParser, EpubParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, vision_figure_parser_pdf_wrapper from deepdoc.parser.pdf_parser import PlainParser, VisionParser from deepdoc.parser.docling_parser import DoclingParser from deepdoc.parser.tcadp_parser import TCADPParser from common.float_utils import normalize_overlapped_percent from common.parser_config_utils import normalize_layout_recognizer from common.text_utils import normalize_arabic_presentation_forms from rag.nlp import ( concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, doc_tokenize_chunks_with_images, tokenize_table, append_context2table_image4pdf, tokenize_chunks_with_images, ) # noqa: F401 def _is_short_header(text, max_tokens=50): """ Check if text is a short markdown header. Args: text: The text to check max_tokens: Maximum tokens for a header to be considered "short" Returns: bool: True if text is a short markdown header, False otherwise """ if not text or not text.strip(): return False # Check if it matches markdown header pattern: 1-6 # followed by space if not re.match(r"^#{1,6}\s+", text.strip()): return False # Check if token count is below threshold return num_tokens_from_string(text) < max_tokens def _normalize_section_text_for_rtl_presentation_forms(sections): if not sections: return sections normalized_sections = [] for section in sections: if isinstance(section, tuple): if not section: normalized_sections.append(section) continue text = section[0] normalized_text = normalize_arabic_presentation_forms(text) normalized_sections.append((normalized_text, *section[1:])) continue if isinstance(section, list): if not section: normalized_sections.append(section) continue text = section[0] normalized_text = normalize_arabic_presentation_forms(text) normalized_sections.append([normalized_text, *section[1:]]) continue normalized_sections.append(normalize_arabic_presentation_forms(section)) return normalized_sections def by_deepdoc(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, pdf_cls=None, **kwargs): callback = callback binary = binary pdf_parser = pdf_cls() if pdf_cls else Pdf() sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) tables = vision_figure_parser_pdf_wrapper( tbls=tables, sections=sections, callback=callback, **kwargs, ) return sections, tables, pdf_parser def by_mineru( filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, pdf_cls=None, parse_method: str = "raw", mineru_llm_name: str | None = None, tenant_id: str | None = None, **kwargs, ): pdf_parser = None if tenant_id: if not mineru_llm_name: try: mineru_llm_name = get_first_provider_model_name(tenant_id, "MinerU", LLMType.OCR) or ensure_mineru_from_env(tenant_id) except Exception as e: # best-effort fallback logging.warning(f"fallback to env mineru: {e}") if mineru_llm_name: try: ocr_model_config = get_model_config_from_provider_instance(tenant_id, LLMType.OCR, mineru_llm_name) ocr_model = LLMBundle(tenant_id=tenant_id, model_config=ocr_model_config, lang=lang) pdf_parser = ocr_model.mdl # Closes #14869: when the tenant has an IMAGE2TEXT model # configured, let the MinerU parser enrich image chunks with # VLM-generated semantic descriptions (parity with deepdoc's # VisionFigureParser). Best-effort — fall back silently if # no vision model is available. if "vision_model" not in kwargs: try: vision_model_config = get_tenant_default_model_by_type(tenant_id, LLMType.IMAGE2TEXT) kwargs["vision_model"] = LLMBundle(tenant_id=tenant_id, model_config=vision_model_config, lang=lang) except Exception as vlm_err: logging.info(f"[MinerU] no IMAGE2TEXT model for tenant; skipping image VLM enhancement: {vlm_err}") sections, tables = pdf_parser.parse_pdf( filepath=filename, binary=binary, callback=callback, parse_method=parse_method, lang=lang, **kwargs, ) return sections, tables, pdf_parser except Exception as e: logging.error(f"Failed to parse pdf via LLMBundle MinerU ({mineru_llm_name}): {e}") if callback: callback(-1, "MinerU not found.") return None, None, None def by_docling(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, pdf_cls=None, **kwargs): pdf_parser = DoclingParser() parse_method = kwargs.get("parse_method", "raw") if not pdf_parser.check_installation(): if callback: callback(-1, "Docling not found.") return None, None, pdf_parser sections, tables = pdf_parser.parse_pdf( filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("DOCLING_OUTPUT_DIR", ""), delete_output=bool(int(os.environ.get("DOCLING_DELETE_OUTPUT", 1))), docling_server_url=os.environ.get("DOCLING_SERVER_URL", ""), parse_method=parse_method, ) return sections, tables, pdf_parser def by_opendataloader( filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, pdf_cls=None, parse_method: str = "raw", opendataloader_llm_name: str | None = None, tenant_id: str | None = None, **kwargs, ): if tenant_id: if not opendataloader_llm_name: try: opendataloader_llm_name = get_first_provider_model_name(tenant_id, "OpenDataLoader", LLMType.OCR) or ensure_opendataloader_from_env(tenant_id) except Exception as e: # best-effort fallback logging.warning(f"fallback to env opendataloader: {e}") if opendataloader_llm_name: try: ocr_model_config = get_model_config_from_provider_instance(tenant_id, LLMType.OCR, opendataloader_llm_name) ocr_model = LLMBundle(tenant_id=tenant_id, model_config=ocr_model_config, lang=lang) pdf_parser = ocr_model.mdl parse_options = {k: kwargs[k] for k in ("hybrid", "image_output", "sanitize") if k in kwargs} sections, tables = pdf_parser.parse_pdf( filepath=filename, binary=binary, callback=callback, parse_method=parse_method, **parse_options, ) return sections, tables, pdf_parser except Exception as e: logging.error(f"Failed to parse pdf via LLMBundle OpenDataLoader ({opendataloader_llm_name}): {e}") if callback: callback(-1, "OpenDataLoader not found.") return None, None, None def by_tcadp(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, pdf_cls=None, **kwargs): tcadp_parser = TCADPParser() if not tcadp_parser.check_installation(): callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.") return None, None, tcadp_parser sections, tables = tcadp_parser.parse_pdf(filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), file_type="PDF") return sections, tables, tcadp_parser def by_paddleocr( filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, pdf_cls=None, parse_method: str = "raw", paddleocr_llm_name: str | None = None, tenant_id: str | None = None, **kwargs, ): pdf_parser = None if tenant_id: if not paddleocr_llm_name: try: paddleocr_llm_name = get_first_provider_model_name(tenant_id, "PaddleOCR", LLMType.OCR) or ensure_paddleocr_from_env(tenant_id) except Exception as e: # best-effort fallback logging.warning(f"fallback to env paddleocr: {e}") if paddleocr_llm_name: try: ocr_model_config = get_model_config_from_provider_instance(tenant_id, LLMType.OCR, paddleocr_llm_name) ocr_model = LLMBundle(tenant_id=tenant_id, model_config=ocr_model_config, lang=lang) pdf_parser = ocr_model.mdl sections, tables = pdf_parser.parse_pdf( filepath=filename, binary=binary, callback=callback, parse_method=parse_method, **kwargs, ) return sections, tables, pdf_parser except Exception as e: logging.error(f"Failed to parse pdf via LLMBundle PaddleOCR ({paddleocr_llm_name}): {e}") return None, None, None if callback: callback(-1, "PaddleOCR not found.") return None, None, None def by_plaintext(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, callback=None, **kwargs): layout_recognizer = (kwargs.get("layout_recognizer") or "").strip() if (not layout_recognizer) or (layout_recognizer == "Plain Text"): pdf_parser = PlainParser() else: tenant_id = kwargs.get("tenant_id") if not tenant_id: raise ValueError("tenant_id is required when using vision layout recognizer") vision_model_config = get_model_config_from_provider_instance(tenant_id, LLMType.IMAGE2TEXT, layout_recognizer) vision_model = LLMBundle( tenant_id, model_config=vision_model_config, lang=kwargs.get("lang", "Chinese"), ) pdf_parser = VisionParser(vision_model=vision_model, **kwargs) sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) return sections, tables, pdf_parser PARSERS = { "deepdoc": by_deepdoc, "mineru": by_mineru, "docling": by_docling, "opendataloader": by_opendataloader, "tcadp parser": by_tcadp, "paddleocr": by_paddleocr, "plaintext": by_plaintext, # default } class Docx(DocxParser): def __init__(self): pass def __clean(self, line): line = re.sub(r"\u3000", " ", line).strip() return line def __get_nearest_title(self, table_index, filename): """Get the hierarchical title structure before the table""" import re from docx.text.paragraph import Paragraph titles = [] blocks = [] # Get document name from filename parameter doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename) if not doc_name: doc_name = "Untitled Document" # Collect all document blocks while maintaining document order try: # Iterate through all paragraphs and tables in document order for i, block in enumerate(self.doc._element.body): if block.tag.endswith("p"): # Paragraph p = Paragraph(block, self.doc) blocks.append(("p", i, p)) elif block.tag.endswith("tbl"): # Table blocks.append(("t", i, None)) # Table object will be retrieved later except Exception as e: logging.error(f"Error collecting blocks: {e}") return "" # Find the target table position target_table_pos = -1 table_count = 0 for i, (block_type, pos, _) in enumerate(blocks): if block_type == "t": if table_count == table_index: target_table_pos = pos break table_count += 1 if target_table_pos == -1: return "" # Target table not found # Find the nearest heading paragraph in reverse order nearest_title = None for i in range(len(blocks) - 1, -1, -1): block_type, pos, block = blocks[i] if pos >= target_table_pos: # Skip blocks after the table continue if block_type != "p": continue if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I): try: level_match = re.search(r"(\d+)", block.style.name) if level_match: level = int(level_match.group(1)) if level <= 7: # Support up to 7 heading levels title_text = block.text.strip() if title_text: # Avoid empty titles nearest_title = (level, title_text) break except Exception as e: logging.error(f"Error parsing heading level: {e}") if nearest_title: # Add current title titles.append(nearest_title) current_level = nearest_title[0] # Find all parent headings, allowing cross-level search while current_level > 1: found = False for i in range(len(blocks) - 1, -1, -1): block_type, pos, block = blocks[i] if pos >= target_table_pos: # Skip blocks after the table continue if block_type != "p": continue if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I): try: level_match = re.search(r"(\d+)", block.style.name) if level_match: level = int(level_match.group(1)) # Find any heading with a higher level if level < current_level: title_text = block.text.strip() if title_text: # Avoid empty titles titles.append((level, title_text)) current_level = level found = True break except Exception as e: logging.error(f"Error parsing parent heading: {e}") if not found: # Break if no parent heading is found break # Sort by level (ascending, from highest to lowest) titles.sort(key=lambda x: x[0]) # Organize titles (from highest to lowest) hierarchy = [doc_name] + [t[1] for t in titles] return " > ".join(hierarchy) return "" def __call__(self, filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER): self.doc = Document(filename) if not binary else Document(BytesIO(binary)) pn = 0 lines = [] last_image = None table_idx = 0 def flush_last_image(): nonlocal last_image, lines if last_image is not None: lines.append({"text": "", "image": last_image, "table": None, "style": "Image"}) last_image = None for block in self.doc._element.body: if pn > to_page: break if block.tag.endswith("p"): p = Paragraph(block, self.doc) if from_page <= pn < to_page: text = p.text.strip() style_name = p.style.name if p.style else "" if text: if style_name == "Caption": former_image = None if lines and lines[-1].get("image") and lines[-1].get("style") != "Caption": former_image = lines[-1].get("image") lines.pop() elif last_image is not None: former_image = last_image last_image = None lines.append( { "text": self.__clean(text), "image": former_image if former_image else None, "table": None, } ) else: flush_last_image() lines.append( { "text": self.__clean(text), "image": None, "table": None, } ) current_image = self.get_picture(self.doc, p) if current_image is not None: lines.append( { "text": "", "image": current_image, "table": None, } ) else: current_image = self.get_picture(self.doc, p) if current_image is not None: last_image = current_image for run in p.runs: xml = run._element.xml if "lastRenderedPageBreak" in xml: pn += 1 continue if "w:br" in xml and 'type="page"' in xml: pn += 1 elif block.tag.endswith("tbl"): if pn < from_page or pn > to_page: table_idx += 1 continue flush_last_image() tb = DocxTable(block, self.doc) title = self.__get_nearest_title(table_idx, filename) html = "" if title: html += f"" for r in tb.rows: html += "" col_idx = 0 try: while col_idx < len(r.cells): span = 1 c = r.cells[col_idx] for j in range(col_idx + 1, len(r.cells)): if c.text == r.cells[j].text: span += 1 col_idx = j else: break col_idx += 1 html += f"" if span == 1 else f"" except Exception as e: logging.warning(f"Error parsing table, ignore: {e}") html += "" html += "
Table Location: {title}
{c.text}{c.text}
" lines.append({"text": "", "image": None, "table": html}) table_idx += 1 flush_last_image() new_line = [(line.get("text"), line.get("image"), line.get("table")) for line in lines] return new_line def to_markdown(self, filename=None, binary=None, inline_images: bool = True): """ This function uses mammoth, licensed under the BSD 2-Clause License. """ import base64 import uuid import mammoth from markdownify import markdownify docx_file = BytesIO(binary) if binary else open(filename, "rb") def _convert_image_to_base64(image): try: with image.open() as image_file: image_bytes = image_file.read() encoded = base64.b64encode(image_bytes).decode("utf-8") base64_url = f"data:{image.content_type};base64,{encoded}" alt_name = "image" alt_name = f"img_{uuid.uuid4().hex[:8]}" return {"src": base64_url, "alt": alt_name} except Exception as e: logging.warning(f"Failed to convert image to base64: {e}") return {"src": "", "alt": "image"} try: if inline_images: result = mammoth.convert_to_html(docx_file, convert_image=mammoth.images.img_element(_convert_image_to_base64)) else: result = mammoth.convert_to_html(docx_file) html = result.value markdown_text = markdownify(html) return markdown_text finally: if not binary: docx_file.close() class Pdf(PdfParser): def __init__(self): super().__init__() def __call__(self, filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, zoomin=3, callback=None, separate_tables_figures=False): start = timer() first_start = start callback(msg="OCR started") self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback) callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start)) start = timer() self._layouts_rec(zoomin) callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start)) start = timer() self._table_transformer_job(zoomin) callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start)) start = timer() self._text_merge(zoomin=zoomin) callback(0.67, "Text merged ({:.2f}s)".format(timer() - start)) if separate_tables_figures: tbls, figures = self._extract_table_figure(True, zoomin, True, True, True) self._concat_downward() logging.info("layouts cost: {}s".format(timer() - first_start)) return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls, figures else: tbls = self._extract_table_figure(True, zoomin, True, True) self._naive_vertical_merge() self._concat_downward() # self._final_reading_order_merge() # self._filter_forpages() logging.info("layouts cost: {}s".format(timer() - first_start)) return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls class Markdown(MarkdownParser): def md_to_html(self, sections): if not sections: return [] if isinstance(sections, type("")): text = sections elif isinstance(sections[0], type("")): text = sections[0] else: return [] from bs4 import BeautifulSoup html_content = markdown(text) soup = BeautifulSoup(html_content, "html.parser") return soup def get_hyperlink_urls(self, soup): if soup: return set([a.get("href") for a in soup.find_all("a") if a.get("href")]) return [] def extract_image_urls_with_lines(self, text): md_img_re = re.compile(r"!\[[^\]]*\]\(([^)\s]+)") html_img_re = re.compile(r'src=["\\\']([^"\\\'>\\s]+)', re.IGNORECASE) urls = [] seen = set() lines = text.splitlines() for idx, line in enumerate(lines): for url in md_img_re.findall(line): if (url, idx) not in seen: urls.append({"url": url, "line": idx}) seen.add((url, idx)) for url in html_img_re.findall(line): if (url, idx) not in seen: urls.append({"url": url, "line": idx}) seen.add((url, idx)) # cross-line try: from bs4 import BeautifulSoup soup = BeautifulSoup(text, "html.parser") newline_offsets = [m.start() for m in re.finditer(r"\n", text)] + [len(text)] for img_tag in soup.find_all("img"): src = img_tag.get("src") if not src: continue tag_str = str(img_tag) pos = text.find(tag_str) if pos == -1: # fallback pos = max(text.find(src), 0) line_no = 0 for i, off in enumerate(newline_offsets): if pos <= off: line_no = i break if (src, line_no) not in seen: urls.append({"url": src, "line": line_no}) seen.add((src, line_no)) except Exception as e: logging.error("Failed to extract image urls: {}".format(e)) pass return urls def load_images_from_urls(self, urls, cache=None): import requests from pathlib import Path cache = cache or {} images = [] for url in urls: if url in cache: if cache[url]: images.append(cache[url]) continue img_obj = None try: if url.startswith(("http://", "https://")): response = requests.get(url, stream=True, timeout=30) if response.status_code == 200 and response.headers.get("Content-Type", "").startswith("image/"): img_obj = Image.open(BytesIO(response.content)).convert("RGB") else: local_path = Path(url) if local_path.exists(): img_obj = Image.open(url).convert("RGB") else: logging.warning(f"Local image file not found: {url}") except Exception as e: logging.error(f"Failed to download/open image from {url}: {e}") cache[url] = img_obj if img_obj: images.append(img_obj) return images, cache def __call__(self, filename, binary=None, separate_tables=True, delimiter=None, return_section_images=False): if binary: encoding = find_codec(binary) txt = binary.decode(encoding, errors="ignore") else: with open(filename, "r") as f: txt = f.read() remainder, tables = self.extract_tables_and_remainder(f"{txt}\n", separate_tables=separate_tables) # To eliminate duplicate tables in chunking result, uncomment code below and set separate_tables to True in line 410. # extractor = MarkdownElementExtractor(remainder) extractor = MarkdownElementExtractor(txt) image_refs = self.extract_image_urls_with_lines(txt) element_sections = extractor.extract_elements(delimiter, include_meta=True) sections = [] section_images = [] image_cache = {} for element in element_sections: content = element["content"] start_line = element["start_line"] end_line = element["end_line"] urls_in_section = [ref["url"] for ref in image_refs if start_line <= ref["line"] <= end_line] imgs = [] if urls_in_section: imgs, image_cache = self.load_images_from_urls(urls_in_section, image_cache) combined_image = None if imgs: combined_image = reduce(concat_img, imgs) if len(imgs) > 1 else imgs[0] sections.append((content, "")) section_images.append(combined_image) tbls = [] for table in tables: tbls.append(((None, markdown(table, extensions=["markdown.extensions.tables"])), "")) if return_section_images: return sections, tbls, section_images return sections, tbls def load_from_xml_v2(baseURI, rels_item_xml): """ Return |_SerializedRelationships| instance loaded with the relationships contained in *rels_item_xml*. Returns an empty collection if *rels_item_xml* is |None|. """ srels = _SerializedRelationships() if rels_item_xml is not None: rels_elm = parse_xml(rels_item_xml) for rel_elm in rels_elm.Relationship_lst: if rel_elm.target_ref in ("../NULL", "NULL") or rel_elm.target_ref.startswith("#"): continue srels._srels.append(_SerializedRelationship(baseURI, rel_elm)) return srels def chunk(filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER, lang="Chinese", callback=None, **kwargs): """ Supported file formats are docx, pdf, excel, txt. This method apply the naive ways to chunk files. Successive text will be sliced into pieces using 'delimiter'. Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'. """ urls = set() url_res = [] lang = lang or "Chinese" is_english = lang.lower() == "english" # is_english(cks) parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True}) child_deli = (parser_config.get("children_delimiter") or "").encode("utf-8").decode("unicode_escape").encode("latin1").decode("utf-8") cust_child_deli = re.findall(r"`([^`]+)`", child_deli) child_deli = "|".join(re.sub(r"`([^`]+)`", "", child_deli)) if cust_child_deli: cust_child_deli = sorted(set(cust_child_deli), key=lambda x: -len(x)) cust_child_deli = "|".join(re.escape(t) for t in cust_child_deli if t) child_deli += cust_child_deli is_markdown = False table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0)) image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0)) doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))} doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) res = [] pdf_parser = None section_images = None is_root = kwargs.get("is_root", True) embed_res = [] if is_root: # Only extract embedded files at the root call embeds = [] if binary is not None: embeds = extract_embed_file(binary) else: raise Exception("Embedding extraction from file path is not supported.") # Recursively chunk each embedded file and collect results for embed_filename, embed_bytes in embeds: try: sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, is_root=False, **kwargs) or [] embed_res.extend(sub_res) except Exception as e: error_msg = f"Failed to chunk embed {embed_filename}: {e}" logging.error(error_msg) if callback: callback(0.05, error_msg) continue if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") if parser_config.get("analyze_hyperlink", False) and is_root: urls = extract_links_from_docx(binary) for index, url in enumerate(urls): html_bytes, metadata = extract_html(url) if not html_bytes: continue try: sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs) except Exception as e: logging.info(f"Failed to chunk url in registered file type {url}: {e}") sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs) url_res.extend(sub_url_res) # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246 _SerializedRelationships.load_from_xml = load_from_xml_v2 # sections = (text, image, tables) sections = Docx()(filename, binary) sections = _normalize_section_text_for_rtl_presentation_forms(sections) # chunks list[dict] # images list - index of image chunk in chunks chunks, images = naive_merge_docx(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), table_context_size, image_context_size) vision_figure_parser_docx_wrapper_naive(chunks=chunks, idx_lst=images, callback=callback, **kwargs) callback(0.8, "Finish parsing.") st = timer() res.extend(doc_tokenize_chunks_with_images(chunks, doc, is_english, child_delimiters_pattern=child_deli)) logging.info("naive_merge({}): {}".format(filename, timer() - st)) res.extend(embed_res) res.extend(url_res) return res elif re.search(r"\.pdf$", filename, re.IGNORECASE): layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC")) opendataloader_llm_name = kwargs.pop("opendataloader_llm_name", None) if layout_recognizer == "OpenDataLoader" and parser_model_name: opendataloader_llm_name = parser_model_name if parser_config.get("analyze_hyperlink", False) and is_root: urls = extract_links_from_pdf(binary) if isinstance(layout_recognizer, bool): layout_recognizer = "DeepDOC" if layout_recognizer else "PlainText" name = layout_recognizer.strip().lower() parser = PARSERS.get(name, by_plaintext) callback(0.1, "Start to parse.") sections, tables, pdf_parser = parser( filename=filename, binary=binary, from_page=from_page, to_page=to_page, lang=lang, callback=callback, layout_recognizer=layout_recognizer, mineru_llm_name=parser_model_name, paddleocr_llm_name=parser_model_name, opendataloader_llm_name=opendataloader_llm_name, **kwargs, ) sections = _normalize_section_text_for_rtl_presentation_forms(sections) if not sections and not tables: return [] if table_context_size or image_context_size: tables = append_context2table_image4pdf(sections, tables, image_context_size) if name in ["tcadp", "docling", "mineru", "paddleocr", "opendataloader"]: if int(parser_config.get("chunk_token_num", 0)) <= 0: parser_config["chunk_token_num"] = 0 res = tokenize_table(tables, doc, is_english) callback(0.8, "Finish parsing.") elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") # Check if tcadp_parser is selected for spreadsheet files layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") if layout_recognizer == "TCADP Parser": table_result_type = parser_config.get("table_result_type", "1") markdown_image_response_type = parser_config.get("markdown_image_response_type", "1") tcadp_parser = TCADPParser(table_result_type=table_result_type, markdown_image_response_type=markdown_image_response_type) if not tcadp_parser.check_installation(): callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.") return res # Determine file type based on extension file_type = "XLSX" if re.search(r"\.xlsx?$", filename, re.IGNORECASE) else "CSV" sections, tables = tcadp_parser.parse_pdf(filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), file_type=file_type) sections = _normalize_section_text_for_rtl_presentation_forms(sections) parser_config["chunk_token_num"] = 0 res = tokenize_table(tables, doc, is_english) callback(0.8, "Finish parsing.") else: # Default DeepDOC parser excel_parser = ExcelParser() if parser_config.get("html4excel"): sections = [(_, "") for _ in excel_parser.html(binary, 12) if _] parser_config["chunk_token_num"] = 0 else: sections = [(_, "") for _ in excel_parser(binary) if _] sections = _normalize_section_text_for_rtl_presentation_forms(sections) elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") sections = TxtParser()(filename, binary, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?;。;!?")) sections = _normalize_section_text_for_rtl_presentation_forms(sections) print("\n", "-"*150, "\n") print(sections) print("\n", "-"*150, "\n") callback(0.8, "Finish parsing.") elif re.search(r"\.(md|markdown|mdx)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128))) sections, tables, section_images = markdown_parser( filename, binary, separate_tables=False, delimiter=parser_config.get("delimiter", "\n!?;。;!?"), return_section_images=True, ) sections = _normalize_section_text_for_rtl_presentation_forms(sections) is_markdown = True try: vision_model_config = get_tenant_default_model_by_type(kwargs["tenant_id"], LLMType.IMAGE2TEXT) vision_model = LLMBundle(kwargs["tenant_id"], vision_model_config) callback(0.2, "Visual model detected. Attempting to enhance figure extraction...") except Exception as e: logging.warning(f"Failed to detect figure extraction: {e}") vision_model = None if vision_model: # Process images for each section for idx, (section_text, _) in enumerate(sections): images = [] if section_images and len(section_images) > idx and section_images[idx] is not None: images.append(section_images[idx]) if images and len(images) > 0: # If multiple images found, combine them using concat_img combined_image = reduce(concat_img, images) if len(images) > 1 else images[0] if section_images: section_images[idx] = combined_image else: section_images = [None] * len(sections) section_images[idx] = combined_image markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=[((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs) boosted_figures = markdown_vision_parser(callback=callback) sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1]) else: logging.warning("No visual model detected. Skipping figure parsing enhancement.") if parser_config.get("hyperlink_urls", False) and is_root: for idx, (section_text, _) in enumerate(sections): soup = markdown_parser.md_to_html(section_text) hyperlink_urls = markdown_parser.get_hyperlink_urls(soup) urls.update(hyperlink_urls) res = tokenize_table(tables, doc, is_english) callback(0.8, "Finish parsing.") elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") chunk_token_num = int(parser_config.get("chunk_token_num", 128)) sections = HtmlParser()(filename, binary, chunk_token_num) sections = [(_, "") for _ in sections if _] sections = _normalize_section_text_for_rtl_presentation_forms(sections) callback(0.8, "Finish parsing.") elif re.search(r"\.epub$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") chunk_token_num = int(parser_config.get("chunk_token_num", 128)) sections = EpubParser()(filename, binary, chunk_token_num) sections = [(_, "") for _ in sections if _] sections = _normalize_section_text_for_rtl_presentation_forms(sections) callback(0.8, "Finish parsing.") elif re.search(r"\.(json|jsonl|ldjson)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") chunk_token_num = int(parser_config.get("chunk_token_num", 128)) sections = JsonParser(chunk_token_num)(binary) sections = [(_, "") for _ in sections if _] sections = _normalize_section_text_for_rtl_presentation_forms(sections) callback(0.8, "Finish parsing.") elif re.search(r"\.doc$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") try: from tika import parser as tika_parser except Exception as e: callback(0.8, f"tika not available: {e}. Unsupported .doc parsing.") logging.warning(f"tika not available: {e}. Unsupported .doc parsing for {filename}.") return [] binary = BytesIO(binary) doc_parsed = tika_parser.from_buffer(binary) if doc_parsed.get("content", None) is not None: sections = doc_parsed["content"].split("\n") sections = [(_, "") for _ in sections if _] sections = _normalize_section_text_for_rtl_presentation_forms(sections) callback(0.8, "Finish parsing.") else: error_msg = f"tika.parser got empty content from {filename}." callback(0.8, error_msg) logging.warning(error_msg) return [] else: raise NotImplementedError("file type not supported yet(pdf, xlsx, doc, docx, txt supported)") st = timer() overlapped_percent = normalize_overlapped_percent(parser_config.get("overlapped_percent", 0)) if is_markdown: merged_chunks = [] merged_images = [] chunk_limit = max(0, int(parser_config.get("chunk_token_num", 128))) current_text = "" current_tokens = 0 current_image = None for idx, sec in enumerate(sections): text = sec[0] if isinstance(sec, tuple) else sec sec_tokens = num_tokens_from_string(text) sec_image = section_images[idx] if section_images and idx < len(section_images) else None # Don't finalize chunk if current_text is a short header (force merge with next section) if current_text and not _is_short_header(current_text) and current_tokens + sec_tokens > chunk_limit: merged_chunks.append(current_text) merged_images.append(current_image) overlap_part = "" if overlapped_percent > 0: overlap_len = int(len(current_text) * overlapped_percent / 100) if overlap_len > 0: overlap_part = current_text[-overlap_len:] current_text = overlap_part current_tokens = num_tokens_from_string(current_text) current_image = current_image if overlap_part else None if current_text: current_text += "\n" + text else: current_text = text current_tokens += sec_tokens if sec_image: current_image = concat_img(current_image, sec_image) if current_image else sec_image if current_text: merged_chunks.append(current_text) merged_images.append(current_image) chunks = merged_chunks has_images = merged_images and any(img is not None for img in merged_images) if has_images: res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images, child_delimiters_pattern=child_deli)) else: res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli)) else: if section_images: if all(image is None for image in section_images): section_images = None if section_images: chunks, images = naive_merge_with_images(sections, section_images, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), overlapped_percent) res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli)) else: chunks = naive_merge(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), overlapped_percent) res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli)) if urls and parser_config.get("analyze_hyperlink", False) and is_root: for index, url in enumerate(urls): html_bytes, metadata = extract_html(url) if not html_bytes: continue try: sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs) except Exception as e: logging.info(f"Failed to chunk url in registered file type {url}: {e}") sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs) url_res.extend(sub_url_res) logging.info("naive_merge({}): {}".format(filename, timer() - st)) if embed_res: res.extend(embed_res) if url_res: res.extend(url_res) # if table_context_size or image_context_size: # attach_media_context(res, table_context_size, image_context_size) # Attach PDF outline as transient metadata on the first chunk. # task_executor.py will extract and persist it as document metadata. if res and pdf_parser and getattr(pdf_parser, "outlines", None): res[0]["__outline__"] = [ {"title": title, "depth": depth} for title, depth, *_ in pdf_parser.outlines ] return res if __name__ == "__main__": import sys def dummy(prog=None, msg=""): pass chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)