diff --git a/deepdoc/parser/paddleocr_parser.py b/deepdoc/parser/paddleocr_parser.py
index 3acb583787..6e5c63d0e0 100644
--- a/deepdoc/parser/paddleocr_parser.py
+++ b/deepdoc/parser/paddleocr_parser.py
@@ -380,6 +380,14 @@ class PaddleOCRParser(RAGFlowPdfParser):
                 if block_content.strip():
                     texts.append(block_content.strip())
 
+        # Fallback to ocrResults for models like PP-OCRv6
+        if not texts:
+            ocr_results = result.get("ocrResults", [])
+            for ocr_result in ocr_results:
+                pruned = ocr_result.get("prunedResult", {})
+                rec_texts = pruned.get("rec_texts", [])
+                texts.extend(t.strip() for t in rec_texts if t.strip())
+
         if callback:
             callback(0.9, f"[PaddleOCR] image done, blocks: {len(texts)}")
 
@@ -556,11 +564,13 @@ class PaddleOCRParser(RAGFlowPdfParser):
             callback(0.8, "[PaddleOCR] result received")
 
         # Extract raw result (preserving prunedResult with bbox info)
-        combined_result: dict[str, Any] = {"layoutParsingResults": []}
+        combined_result: dict[str, Any] = {"layoutParsingResults": [], "ocrResults": []}
         for line_obj in jsonl_data:
             result = line_obj.get("result", {})
             layout_results = result.get("layoutParsingResults", [])
             combined_result["layoutParsingResults"].extend(layout_results)
+            ocr_results = result.get("ocrResults", [])
+            combined_result["ocrResults"].extend(ocr_results)
 
         return combined_result
 
@@ -571,6 +581,26 @@ class PaddleOCRParser(RAGFlowPdfParser):
         if algorithm in SUPPORTED_PADDLEOCR_ALGORITHMS:
             layout_parsing_results = result.get("layoutParsingResults", [])
 
+            # Fallback to ocrResults for models like PP-OCRv6 that only return text recognition
+            if not layout_parsing_results:
+                ocr_results = result.get("ocrResults", [])
+                for page_idx, ocr_result in enumerate(ocr_results):
+                    pruned = ocr_result.get("prunedResult", {})
+                    rec_texts = pruned.get("rec_texts", [])
+                    rec_boxes = pruned.get("rec_boxes", [])
+                    for i, text in enumerate(rec_texts):
+                        text = text.strip()
+                        if not text:
+                            continue
+                        if i < len(rec_boxes):
+                            box = rec_boxes[i]
+                            left, top, right, bottom = box[0], box[1], box[2], box[3]
+                        else:
+                            left, top, right, bottom = 0, 0, 0, 0
+                        tag = f"@@{page_idx + 1}\t{left // self._ZOOMIN}\t{right // self._ZOOMIN}\t{top // self._ZOOMIN}\t{bottom // self._ZOOMIN}##"
+                        sections.append((text, tag))
+                return sections
+
             for page_idx, layout_result in enumerate(layout_parsing_results):
                 pruned_result = layout_result.get("prunedResult", {})
                 parsing_res_list = pruned_result.get("parsing_res_list", [])
diff --git a/internal/entity/models/paddleocr.go b/internal/entity/models/paddleocr.go
index 2541b3e80b..6a9b147e73 100644
--- a/internal/entity/models/paddleocr.go
+++ b/internal/entity/models/paddleocr.go
@@ -107,6 +107,11 @@ type paddleJsonlLine struct {
 				Text string `json:"text"`
 			} `json:"markdown"`
 		} `json:"layoutParsingResults"`
+		OcrResults []struct {
+			PrunedResult struct {
+				RecTexts []string `json:"rec_texts"`
+			} `json:"prunedResult"`
+		} `json:"ocrResults"`
 	} `json:"result"`
 }
 
@@ -289,6 +294,19 @@ func (p *PaddleOCRModel) OCRFile(modelName *string, content []byte, fileURL *str
 			fullMarkdown.WriteString(layoutRes.Markdown.Text)
 			fullMarkdown.WriteString("\n\n")
 		}
+
+		// Fallback to ocrResults for models like PP-OCRv6
+		if len(lineData.Result.LayoutParsingResults) == 0 {
+			for _, ocrRes := range lineData.Result.OcrResults {
+				for _, text := range ocrRes.PrunedResult.RecTexts {
+					text = strings.TrimSpace(text)
+					if text != "" {
+						fullMarkdown.WriteString(text)
+						fullMarkdown.WriteString("\n")
+					}
+				}
+			}
+		}
 	}
 
 	if err = scanner.Err(); err != nil {
diff --git a/rag/app/picture.py b/rag/app/picture.py
index d58f923eb8..22aa5fd11c 100644
--- a/rag/app/picture.py
+++ b/rag/app/picture.py
@@ -16,14 +16,18 @@
 
 import asyncio
 import io
+import logging
+import os
 import re
+import tempfile
 
 import numpy as np
 from PIL import Image
 
 from api.db.services.llm_service import LLMBundle
-from api.db.joint_services.tenant_model_service import get_tenant_default_model_by_type
+from api.db.joint_services.tenant_model_service import get_tenant_default_model_by_type, get_first_provider_model_name, get_model_config_from_provider_instance, ensure_paddleocr_from_env
 from common.constants import LLMType
+from common.parser_config_utils import normalize_layout_recognizer
 from common.string_utils import clean_markdown_block
 from deepdoc.vision import OCR
 from rag.nlp import attach_media_context, rag_tokenizer, tokenize
@@ -70,8 +74,15 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
                 "doc_type_kwd": "image",
             }
         )
-        bxs = ocr(np.array(img))
-        txt = "\n".join([t[0] for _, t in bxs if t[0]])
+
+        # Try PaddleOCR if configured as layout_recognize
+        txt = _try_paddleocr_image(filename, binary, tenant_id, parser_config, callback)
+
+        if not txt:
+            # Fallback to local deepdoc OCR
+            bxs = ocr(np.array(img))
+            txt = "\n".join([t[0] for _, t in bxs if t[0]])
+
         callback(0.4, "Finish OCR: (%s ...)" % txt[:12])
         if (eng and len(txt.split()) > 32) or len(txt) > 32:
             tokenize(doc, txt, eng)
@@ -96,6 +107,47 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
     return []
 
 
+def _try_paddleocr_image(filename, binary, tenant_id, parser_config, callback):
+    """Try to parse image using PaddleOCR if configured. Returns text or empty string."""
+    layout_recognize = parser_config.get("layout_recognize", "")
+    if not layout_recognize:
+        return ""
+
+    layout_recognizer, parser_model_name = normalize_layout_recognizer(layout_recognize)
+    if layout_recognizer != "PaddleOCR":
+        return ""
+
+    try:
+        paddleocr_llm_name = parser_model_name
+        if not paddleocr_llm_name:
+            paddleocr_llm_name = get_first_provider_model_name(tenant_id, "PaddleOCR", LLMType.OCR) or ensure_paddleocr_from_env(tenant_id)
+
+        if not paddleocr_llm_name:
+            return ""
+
+        ocr_model_config = get_model_config_from_provider_instance(tenant_id, LLMType.OCR, paddleocr_llm_name)
+        ocr_model = LLMBundle(tenant_id=tenant_id, model_config=ocr_model_config)
+        pdf_parser = ocr_model.mdl
+
+        if not hasattr(pdf_parser, "parse_image"):
+            logging.warning("[PaddleOCR] parse_image not available, falling back to local OCR")
+            return ""
+
+        callback(0.2, "Using PaddleOCR to parse image...")
+        with tempfile.NamedTemporaryFile(suffix=os.path.splitext(filename)[1] or ".png", delete=True) as tmp:
+            tmp.write(binary)
+            tmp.flush()
+            txt = pdf_parser.parse_image(filepath=tmp.name, binary=binary, callback=callback)
+
+        if txt:
+            logging.info(f"[PaddleOCR] image parsed successfully: {len(txt)} chars")
+            return txt
+    except Exception as e:
+        logging.warning(f"[PaddleOCR] image parsing failed, falling back to local OCR: {e}")
+
+    return ""
+
+
 def vision_llm_chunk(binary, vision_model, prompt=None, callback=None):
     """
     A simple wrapper to process image to markdown texts via VLM.