fix(paddleocr): support PP-OCRv6 ocrResults fallback and integrate image parsing (#16150)

## Summary

This PR fixes two issues discovered during testing of the PaddleOCR async API refactoring:

### 1. PP-OCRv6 returns `ocrResults` instead of `layoutParsingResults`

Models like PP-OCRv6 are pure text recognition models that return results in `ocrResults.prunedResult.rec_texts` format rather than the `layoutParsingResults.prunedResult.parsing_res_list` format used by layout-aware models (PaddleOCR-VL series).

**Changes:**

- `deepdoc/parser/paddleocr_parser.py`: Extract `ocrResults` alongside `layoutParsingResults` in `_send_request()`, add fallback logic in `_transfer_to_sections()` and `parse_image()`
- `internal/entity/models/paddleocr.go`: Add `ocrResults` struct and fallback extraction in Go OCR handler

### 2. Image parsing not integrated into picture chunker

The `parse_image()` method existed in PaddleOCRParser but was never called from `rag/app/picture.py` (the module that handles image file uploads). Users configuring PaddleOCR as their layout recognizer would still get local deepdoc OCR for images.

**Changes:**

- `rag/app/picture.py`: When `layout_recognize` is set to PaddleOCR, use `PaddleOCROcrModel.parse_image()` instead of local OCR. Falls back gracefully to local OCR on failure.

## Testing

Verified end-to-end in Docker:

- PaddleOCR-VL-1.6 PDF parsing: ✅ (10 text blocks with bbox)
- PaddleOCR-VL-1.6 image parsing: ✅ (219 chars)
- PP-OCRv6 PDF parsing with ocrResults fallback: ✅ (10 text blocks)
- PP-OCRv6 image parsing with ocrResults fallback: ✅ (136 chars)

## Related PRs

- #15967 (merged) - PaddleOCR async Job API refactoring + new models
- #16086 (merged) - PaddleOCR image parsing support
2026-06-29 15:31:05 +08:00 · 2026-06-23 22:02:54 +08:00
parent b4a8a90c73
commit 017adf841f
3 changed files with 104 additions and 4 deletions
--- a/rag/app/picture.py
+++ b/rag/app/picture.py
@@ -16,14 +16,18 @@

 import asyncio
 import io
+import logging
+import os
 import re
+import tempfile

 import numpy as np
 from PIL import Image

 from api.db.services.llm_service import LLMBundle
-from api.db.joint_services.tenant_model_service import get_tenant_default_model_by_type
+from api.db.joint_services.tenant_model_service import get_tenant_default_model_by_type, get_first_provider_model_name, get_model_config_from_provider_instance, ensure_paddleocr_from_env
 from common.constants import LLMType
+from common.parser_config_utils import normalize_layout_recognizer
 from common.string_utils import clean_markdown_block
 from deepdoc.vision import OCR
 from rag.nlp import attach_media_context, rag_tokenizer, tokenize
@@ -70,8 +74,15 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
                "doc_type_kwd": "image",
            }
        )
-        bxs = ocr(np.array(img))
-        txt = "\n".join([t[0] for _, t in bxs if t[0]])
+
+        # Try PaddleOCR if configured as layout_recognize
+        txt = _try_paddleocr_image(filename, binary, tenant_id, parser_config, callback)
+
+        if not txt:
+            # Fallback to local deepdoc OCR
+            bxs = ocr(np.array(img))
+            txt = "\n".join([t[0] for _, t in bxs if t[0]])
+
        callback(0.4, "Finish OCR: (%s ...)" % txt[:12])
        if (eng and len(txt.split()) > 32) or len(txt) > 32:
            tokenize(doc, txt, eng)
@@ -96,6 +107,47 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
    return []


+def _try_paddleocr_image(filename, binary, tenant_id, parser_config, callback):
+    """Parse an image through PaddleOCR when it is the configured layout recognizer.
+
+    Returns the text produced by the OCR model's ``parse_image``. Returns "" when PaddleOCR is not
+    selected, no model name resolves, the model lacks ``parse_image``, or any step raises.
+    """
+    layout_recognize = (parser_config or {}).get("layout_recognize", "")
+    if not layout_recognize:
+        return ""
+    layout_recognizer, parser_model_name = normalize_layout_recognizer(layout_recognize)
+    if layout_recognizer != "PaddleOCR":
+        return ""
+
+    try:
+        paddleocr_llm_name = parser_model_name or get_first_provider_model_name(tenant_id, "PaddleOCR", LLMType.OCR) or ensure_paddleocr_from_env(tenant_id)
+        if not paddleocr_llm_name:
+            return ""
+        ocr_model_config = get_model_config_from_provider_instance(tenant_id, LLMType.OCR, paddleocr_llm_name)
+        image_parser = LLMBundle(tenant_id=tenant_id, model_config=ocr_model_config).mdl
+        if not hasattr(image_parser, "parse_image"):
+            logging.warning("[PaddleOCR] parse_image not available, falling back to local OCR")
+            return ""
+        callback(0.2, "Using PaddleOCR to parse image...")
+        # delete=False and close before parsing: on Windows an open delete=True temp file cannot be reopened by name.
+        tmp = tempfile.NamedTemporaryFile(suffix=os.path.splitext(filename)[1] or ".png", delete=False)
+        try:
+            tmp.write(binary)
+            tmp.close()  # flushes the bytes to disk and releases the handle
+            txt = image_parser.parse_image(filepath=tmp.name, binary=binary, callback=callback)
+        finally:
+            tmp.close()  # no-op when already closed; needed if write() raised
+            os.remove(tmp.name)
+        if txt:
+            logging.info("[PaddleOCR] image parsed successfully: %d chars", len(txt))
+            return txt
+    except Exception as e:
+        # Deliberate best effort: the caller falls back to local OCR; exc_info keeps the traceback.
+        logging.warning("[PaddleOCR] image parsing failed, falling back to local OCR: %s", e, exc_info=True)
+    return ""
+
+
 def vision_llm_chunk(binary, vision_model, prompt=None, callback=None):
    """
    A simple wrapper to process image to markdown texts via VLM.