mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-04 18:45:38 +08:00
fix(paddleocr): support PP-OCRv6 ocrResults fallback and integrate image parsing (#16150)
## Summary

This PR fixes two issues discovered during testing of the PaddleOCR async API refactoring:

### 1. PP-OCRv6 returns `ocrResults` instead of `layoutParsingResults`

Models like PP-OCRv6 are pure text recognition models that return results in `ocrResults.prunedResult.rec_texts` format rather than the `layoutParsingResults.prunedResult.parsing_res_list` format used by layout-aware models (PaddleOCR-VL series).

**Changes:**
- `deepdoc/parser/paddleocr_parser.py`: Extract `ocrResults` alongside `layoutParsingResults` in `_send_request()`, add fallback logic in `_transfer_to_sections()` and `parse_image()`
- `internal/entity/models/paddleocr.go`: Add `ocrResults` struct and fallback extraction in Go OCR handler

### 2. Image parsing not integrated into picture chunker

The `parse_image()` method existed in PaddleOCRParser but was never called from `rag/app/picture.py` (the module that handles image file uploads). Users configuring PaddleOCR as their layout recognizer would still get local deepdoc OCR for images.

**Changes:**
- `rag/app/picture.py`: When `layout_recognize` is set to PaddleOCR, use `PaddleOCROcrModel.parse_image()` instead of local OCR. Falls back gracefully to local OCR on failure.

## Testing

Verified end-to-end in Docker:
- PaddleOCR-VL-1.6 PDF parsing: ✅ (10 text blocks with bbox)
- PaddleOCR-VL-1.6 image parsing: ✅ (219 chars)
- PP-OCRv6 PDF parsing with ocrResults fallback: ✅ (10 text blocks)
- PP-OCRv6 image parsing with ocrResults fallback: ✅ (136 chars)

## Related PRs

- #15967 (merged) - PaddleOCR async Job API refactoring + new models
- #16086 (merged) - PaddleOCR image parsing support
This commit is contained in:
@@ -380,6 +380,14 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
||||
if block_content.strip():
|
||||
texts.append(block_content.strip())
|
||||
|
||||
# Fallback to ocrResults for models like PP-OCRv6
|
||||
if not texts:
|
||||
ocr_results = result.get("ocrResults", [])
|
||||
for ocr_result in ocr_results:
|
||||
pruned = ocr_result.get("prunedResult", {})
|
||||
rec_texts = pruned.get("rec_texts", [])
|
||||
texts.extend(t.strip() for t in rec_texts if t.strip())
|
||||
|
||||
if callback:
|
||||
callback(0.9, f"[PaddleOCR] image done, blocks: {len(texts)}")
|
||||
|
||||
@@ -556,11 +564,13 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
||||
callback(0.8, "[PaddleOCR] result received")
|
||||
|
||||
# Extract raw result (preserving prunedResult with bbox info)
|
||||
combined_result: dict[str, Any] = {"layoutParsingResults": []}
|
||||
combined_result: dict[str, Any] = {"layoutParsingResults": [], "ocrResults": []}
|
||||
for line_obj in jsonl_data:
|
||||
result = line_obj.get("result", {})
|
||||
layout_results = result.get("layoutParsingResults", [])
|
||||
combined_result["layoutParsingResults"].extend(layout_results)
|
||||
ocr_results = result.get("ocrResults", [])
|
||||
combined_result["ocrResults"].extend(ocr_results)
|
||||
|
||||
return combined_result
|
||||
|
||||
@@ -571,6 +581,26 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
||||
if algorithm in SUPPORTED_PADDLEOCR_ALGORITHMS:
|
||||
layout_parsing_results = result.get("layoutParsingResults", [])
|
||||
|
||||
# Fallback to ocrResults for models like PP-OCRv6 that only return text recognition
|
||||
if not layout_parsing_results:
|
||||
ocr_results = result.get("ocrResults", [])
|
||||
for page_idx, ocr_result in enumerate(ocr_results):
|
||||
pruned = ocr_result.get("prunedResult", {})
|
||||
rec_texts = pruned.get("rec_texts", [])
|
||||
rec_boxes = pruned.get("rec_boxes", [])
|
||||
for i, text in enumerate(rec_texts):
|
||||
text = text.strip()
|
||||
if not text:
|
||||
continue
|
||||
if i < len(rec_boxes):
|
||||
box = rec_boxes[i]
|
||||
left, top, right, bottom = box[0], box[1], box[2], box[3]
|
||||
else:
|
||||
left, top, right, bottom = 0, 0, 0, 0
|
||||
tag = f"@@{page_idx + 1}\t{left // self._ZOOMIN}\t{right // self._ZOOMIN}\t{top // self._ZOOMIN}\t{bottom // self._ZOOMIN}##"
|
||||
sections.append((text, tag))
|
||||
return sections
|
||||
|
||||
for page_idx, layout_result in enumerate(layout_parsing_results):
|
||||
pruned_result = layout_result.get("prunedResult", {})
|
||||
parsing_res_list = pruned_result.get("parsing_res_list", [])
|
||||
|
||||
@@ -107,6 +107,11 @@ type paddleJsonlLine struct {
|
||||
Text string `json:"text"`
|
||||
} `json:"markdown"`
|
||||
} `json:"layoutParsingResults"`
|
||||
OcrResults []struct {
|
||||
PrunedResult struct {
|
||||
RecTexts []string `json:"rec_texts"`
|
||||
} `json:"prunedResult"`
|
||||
} `json:"ocrResults"`
|
||||
} `json:"result"`
|
||||
}
|
||||
|
||||
@@ -289,6 +294,19 @@ func (p *PaddleOCRModel) OCRFile(modelName *string, content []byte, fileURL *str
|
||||
fullMarkdown.WriteString(layoutRes.Markdown.Text)
|
||||
fullMarkdown.WriteString("\n\n")
|
||||
}
|
||||
|
||||
// Fallback to ocrResults for models like PP-OCRv6
|
||||
if len(lineData.Result.LayoutParsingResults) == 0 {
|
||||
for _, ocrRes := range lineData.Result.OcrResults {
|
||||
for _, text := range ocrRes.PrunedResult.RecTexts {
|
||||
text = strings.TrimSpace(text)
|
||||
if text != "" {
|
||||
fullMarkdown.WriteString(text)
|
||||
fullMarkdown.WriteString("\n")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if err = scanner.Err(); err != nil {
|
||||
|
||||
@@ -16,14 +16,18 @@
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from api.db.joint_services.tenant_model_service import get_tenant_default_model_by_type
|
||||
from api.db.joint_services.tenant_model_service import get_tenant_default_model_by_type, get_first_provider_model_name, get_model_config_from_provider_instance, ensure_paddleocr_from_env
|
||||
from common.constants import LLMType
|
||||
from common.parser_config_utils import normalize_layout_recognizer
|
||||
from common.string_utils import clean_markdown_block
|
||||
from deepdoc.vision import OCR
|
||||
from rag.nlp import attach_media_context, rag_tokenizer, tokenize
|
||||
@@ -70,8 +74,15 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
|
||||
"doc_type_kwd": "image",
|
||||
}
|
||||
)
|
||||
bxs = ocr(np.array(img))
|
||||
txt = "\n".join([t[0] for _, t in bxs if t[0]])
|
||||
|
||||
# Try PaddleOCR if configured as layout_recognize
|
||||
txt = _try_paddleocr_image(filename, binary, tenant_id, parser_config, callback)
|
||||
|
||||
if not txt:
|
||||
# Fallback to local deepdoc OCR
|
||||
bxs = ocr(np.array(img))
|
||||
txt = "\n".join([t[0] for _, t in bxs if t[0]])
|
||||
|
||||
callback(0.4, "Finish OCR: (%s ...)" % txt[:12])
|
||||
if (eng and len(txt.split()) > 32) or len(txt) > 32:
|
||||
tokenize(doc, txt, eng)
|
||||
@@ -96,6 +107,47 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
|
||||
return []
|
||||
|
||||
|
||||
def _try_paddleocr_image(filename, binary, tenant_id, parser_config, callback):
|
||||
"""Try to parse image using PaddleOCR if configured. Returns text or empty string."""
|
||||
layout_recognize = parser_config.get("layout_recognize", "")
|
||||
if not layout_recognize:
|
||||
return ""
|
||||
|
||||
layout_recognizer, parser_model_name = normalize_layout_recognizer(layout_recognize)
|
||||
if layout_recognizer != "PaddleOCR":
|
||||
return ""
|
||||
|
||||
try:
|
||||
paddleocr_llm_name = parser_model_name
|
||||
if not paddleocr_llm_name:
|
||||
paddleocr_llm_name = get_first_provider_model_name(tenant_id, "PaddleOCR", LLMType.OCR) or ensure_paddleocr_from_env(tenant_id)
|
||||
|
||||
if not paddleocr_llm_name:
|
||||
return ""
|
||||
|
||||
ocr_model_config = get_model_config_from_provider_instance(tenant_id, LLMType.OCR, paddleocr_llm_name)
|
||||
ocr_model = LLMBundle(tenant_id=tenant_id, model_config=ocr_model_config)
|
||||
pdf_parser = ocr_model.mdl
|
||||
|
||||
if not hasattr(pdf_parser, "parse_image"):
|
||||
logging.warning("[PaddleOCR] parse_image not available, falling back to local OCR")
|
||||
return ""
|
||||
|
||||
callback(0.2, "Using PaddleOCR to parse image...")
|
||||
with tempfile.NamedTemporaryFile(suffix=os.path.splitext(filename)[1] or ".png", delete=True) as tmp:
|
||||
tmp.write(binary)
|
||||
tmp.flush()
|
||||
txt = pdf_parser.parse_image(filepath=tmp.name, binary=binary, callback=callback)
|
||||
|
||||
if txt:
|
||||
logging.info(f"[PaddleOCR] image parsed successfully: {len(txt)} chars")
|
||||
return txt
|
||||
except Exception as e:
|
||||
logging.warning(f"[PaddleOCR] image parsing failed, falling back to local OCR: {e}")
|
||||
|
||||
return ""
|
||||
|
||||
|
||||
def vision_llm_chunk(binary, vision_model, prompt=None, callback=None):
|
||||
"""
|
||||
A simple wrapper to process image to markdown texts via VLM.
|
||||
|
||||
Reference in New Issue
Block a user