diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py
index b369f9122a..2c35ead98c 100644
--- a/deepdoc/parser/mineru_parser.py
+++ b/deepdoc/parser/mineru_parser.py
@@ -644,6 +644,12 @@ class MinerUParser(RAGFlowPdfParser):
                 case MinerUContentType.IMAGE:
                     section = "".join(output.get("image_caption", [])) + "\n" + "".join(
                         output.get("image_footnote", []))
+                    # If a vision model enriched this image with a semantic
+                    # description (see _enhance_images_with_vlm), embed it in
+                    # the chunk so it becomes searchable / retrievable.
+                    vlm_description = (output.get("vlm_description") or "").strip()
+                    if vlm_description:
+                        section = (section.strip("\n") + "\n" + vlm_description).strip("\n") if section.strip() else vlm_description
                 case MinerUContentType.EQUATION:
                     section = output.get("text", "")
                 case MinerUContentType.CODE:
@@ -664,6 +670,59 @@ class MinerUParser(RAGFlowPdfParser):
     def _transfer_to_tables(self, outputs: list[dict[str, Any]]):
         return []
 
+    def _enhance_images_with_vlm(self, outputs: list[dict[str, Any]], vision_model, callback: Optional[Callable] = None):
+        """Generate semantic descriptions for image blocks via the tenant's
+        IMAGE2TEXT model, mirroring deepdoc's VisionFigureParser. Each
+        IMAGE block with a readable img_path gets a ``vlm_description``
+        field that ``_transfer_to_sections`` then folds into the chunk
+        text — closing issue #14869.
+
+        Args:
+            outputs: Parsed MinerU blocks; matching IMAGE entries are
+                updated in place with a ``vlm_description`` key.
+            vision_model: Model handle forwarded to ``vision_llm_chunk``.
+            callback: Optional progress reporter, called once as
+                ``callback(progress, message)`` before work starts.
+
+        A failure on one image is logged and skipped; it never aborts the
+        remaining images.
+        """
+        from concurrent.futures import ThreadPoolExecutor, as_completed
+        from rag.app.picture import vision_llm_chunk
+        from rag.prompts.generator import vision_llm_figure_describe_prompt
+
+        image_jobs = [
+            (idx, item)
+            for idx, item in enumerate(outputs)
+            if item.get("type") == MinerUContentType.IMAGE
+            and item.get("img_path")
+            and os.path.exists(item["img_path"])
+        ]
+        if not image_jobs:
+            return
+
+        if callback:
+            callback(0.78, f"[MinerU] Generating VLM descriptions for {len(image_jobs)} images...")
+
+        prompt = vision_llm_figure_describe_prompt()
+
+        def worker(idx, item):
+            try:
+                with Image.open(item["img_path"]) as img:
+                    img.load()
+                    desc = vision_llm_chunk(binary=img, vision_model=vision_model, prompt=prompt)
+                return idx, (desc or "").strip()
+            except Exception as e:
+                self.logger.warning(f"[MinerU] VLM description failed for image #{idx}: {e}")
+                return idx, ""
+
+        with ThreadPoolExecutor(max_workers=10) as executor:
+            futures = [executor.submit(worker, idx, item) for idx, item in image_jobs]
+            for fut in as_completed(futures):
+                idx, desc = fut.result()
+                if desc:
+                    outputs[idx]["vlm_description"] = desc
+
     def parse_pdf(
         self,
         filepath: str | PathLike[str],
@@ -744,6 +803,13 @@ class MinerUParser(RAGFlowPdfParser):
             if callback:
                 callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
 
+            vision_model = kwargs.get("vision_model")
+            if vision_model is not None:
+                try:
+                    self._enhance_images_with_vlm(outputs, vision_model, callback=callback)
+                except Exception as e:
+                    self.logger.warning(f"[MinerU] VLM image enhancement failed: {e}. Continuing without descriptions.")
+
             return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs)
         finally:
             if temp_pdf and temp_pdf.exists():
diff --git a/rag/app/naive.py b/rag/app/naive.py
index f91e2a8f94..7bf4743e7d 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -131,6 +131,19 @@ def by_mineru(
                 ocr_model_config = get_model_config_by_type_and_name(tenant_id, LLMType.OCR, mineru_llm_name)
                 ocr_model = LLMBundle(tenant_id=tenant_id, model_config=ocr_model_config, lang=lang)
                 pdf_parser = ocr_model.mdl
+
+                # Closes #14869: when the tenant has an IMAGE2TEXT model
+                # configured, let the MinerU parser enrich image chunks with
+                # VLM-generated semantic descriptions (parity with deepdoc's
+                # VisionFigureParser). Best-effort: if no vision model is
+                # available, log at INFO and parse without enhancement.
+                if "vision_model" not in kwargs:
+                    try:
+                        vision_model_config = get_tenant_default_model_by_type(tenant_id, LLMType.IMAGE2TEXT)
+                        kwargs["vision_model"] = LLMBundle(tenant_id=tenant_id, model_config=vision_model_config, lang=lang)
+                    except Exception as vlm_err:
+                        logging.info(f"[MinerU] no IMAGE2TEXT model for tenant; skipping image VLM enhancement: {vlm_err}")
+
                 sections, tables = pdf_parser.parse_pdf(
                     filepath=filename,
                     binary=binary,