From 62698725ca09e0ca5cc3995e8b17112c2761dbb1 Mon Sep 17 00:00:00 2001
From: Rander <renchenwei@baidu.com>
Date: Tue, 16 Jun 2026 19:34:38 +0800
Subject: [PATCH] feat(paddleocr): add image parsing support with async Job API
 (#16086)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

Add image parsing capability to the PaddleOCR integration, building on top
of #15967 (async Job API migration).

## Changes

### `deepdoc/parser/paddleocr_parser.py`
- Add `parse_image()` method that uses the same async Job API flow as
`parse_pdf()`
- Extracts each block's `block_content` from `layoutParsingResults` →
`prunedResult` → `parsing_res_list`, skipping empty blocks
- Returns the remaining block contents joined by newlines as a single string

### `rag/llm/ocr_model.py`
- Add `parse_image()` wrapper to `PaddleOCROcrModel` with availability
check and logging

## Relationship to other PRs

- **Depends on**: #15967 (async Job API migration) — this PR is based on
that branch
- **Replaces**: #14826 (original image processing PR based on the old sync
API)

## Notes

This PR uses `base_url` and the async Job API (submit → poll → fetch)
consistent with #15967, rather than the old `api_url` + sync POST
pattern from #14826.
---
 deepdoc/parser/paddleocr_parser.py | 63 ++++++++++++++++++++++++++++++
 rag/llm/ocr_model.py               | 10 +++++
 2 files changed, 73 insertions(+)

diff --git a/deepdoc/parser/paddleocr_parser.py b/deepdoc/parser/paddleocr_parser.py
index be8c2d319c..3acb583787 100644
--- a/deepdoc/parser/paddleocr_parser.py
+++ b/deepdoc/parser/paddleocr_parser.py
@@ -323,6 +323,69 @@ class PaddleOCRParser(RAGFlowPdfParser):
 
         return sections, tables
 
+    def parse_image(
+        self,
+        filepath: str | PathLike[str],
+        binary: BytesIO | bytes | None = None,
+        callback: Optional[Callable[[float, str], None]] = None,
+        *,
+        base_url: Optional[str] = None,
+        access_token: Optional[str] = None,
+        algorithm: Optional[AlgorithmType] = None,
+        request_timeout: Optional[int] = None,
+        prettify_markdown: Optional[bool] = None,
+        show_formula_number: Optional[bool] = None,
+        visualize: Optional[bool] = None,
+        additional_params: Optional[dict[str, Any]] = None,
+        algorithm_config: Optional[dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> str:
+        """Parse an image through the PaddleOCR API and return the non-empty block contents joined by newlines ("" if none)."""
+        self.logger.info(f"[PaddleOCR] parse_image start: {filepath}")
+
+        config_dict = {  # per-call arguments win; None falls back to the instance attribute
+            "base_url": base_url if base_url is not None else self.base_url,
+            "access_token": access_token if access_token is not None else self.access_token,
+            "algorithm": algorithm if algorithm is not None else self.algorithm,
+            "request_timeout": request_timeout if request_timeout is not None else self.request_timeout,
+        }
+        if prettify_markdown is not None:  # optional settings below are forwarded only when explicitly provided
+            config_dict["prettify_markdown"] = prettify_markdown
+        if show_formula_number is not None:
+            config_dict["show_formula_number"] = show_formula_number
+        if visualize is not None:
+            config_dict["visualize"] = visualize
+        if additional_params is not None:
+            config_dict["additional_params"] = additional_params
+        if algorithm_config is not None:
+            config_dict["algorithm_config"] = algorithm_config
+
+        cfg = PaddleOCRConfig.from_dict(config_dict)  # omitted keys are left to from_dict (presumably its defaults — confirm)
+        data_bytes = self._prepare_file_data(filepath, binary)  # request payload as bytes
+
+        if callback:
+            callback(0.1, "[PaddleOCR] submitting image request")
+
+        result = self._send_request(data_bytes, cfg, callback)  # NOTE(review): presumably the same request flow parse_pdf uses; body not shown here
+
+        texts: list[str] = []  # one entry per non-empty block, in response order
+        layout_parsing_results = result.get("layoutParsingResults", [])  # every level defaults to empty, so missing keys yield no text
+        for layout_result in layout_parsing_results:
+            pruned_result = layout_result.get("prunedResult", {})
+            parsing_res_list = pruned_result.get("parsing_res_list", [])
+            for block in parsing_res_list:
+                block_content = block.get("block_content", "").strip()
+                if block_content:
+                    block_content = _remove_images_from_markdown(block_content)  # module-level helper; by its name strips markdown images — verify
+                    if block_content.strip():  # re-check: the block may be empty once the helper has run
+                        texts.append(block_content.strip())
+
+        if callback:
+            callback(0.9, f"[PaddleOCR] image done, blocks: {len(texts)}")  # reports 0.9, not 1.0 — presumably the caller signals completion
+
+        self.logger.info(f"[PaddleOCR] parse_image done: {filepath}, blocks: {len(texts)}")
+        return "\n".join(texts)  # empty string when no block produced text
+
     def _prepare_file_data(self, filepath: str | PathLike[str], binary: BytesIO | bytes | None) -> bytes:
         """Prepare file data for API request."""
         source_path = Path(filepath)
diff --git a/rag/llm/ocr_model.py b/rag/llm/ocr_model.py
index c7a9a28d17..42e67d1bd9 100644
--- a/rag/llm/ocr_model.py
+++ b/rag/llm/ocr_model.py
@@ -148,6 +148,16 @@ class PaddleOCROcrModel(Base, PaddleOCRParser):
         sections, tables = PaddleOCRParser.parse_pdf(self, filepath=filepath, binary=binary, callback=callback, parse_method=parse_method, **kwargs)
         return sections, tables
 
+    def parse_image(self, filepath: str, binary=None, callback=None, **kwargs) -> str:  # wrapper: availability check + logging around PaddleOCRParser.parse_image
+        ok, reason = self.check_available()
+        if not ok:  # fail before any request is made, surfacing the reported reason
+            raise RuntimeError(f"PaddleOCR server not accessible: {reason}")
+
+        logging.info(f"PaddleOCR parse_image start: {filepath}")
+        result = PaddleOCRParser.parse_image(self, filepath=filepath, binary=binary, callback=callback, **kwargs)  # explicit base-class call, same form as the parse_pdf wrapper
+        logging.info(f"PaddleOCR parse_image done: {filepath}, text length: {len(result)}")  # length is in characters of the returned str
+        return result
+
 
 class OpenDataLoaderOcrModel(Base, OpenDataLoaderParser):
     _FACTORY_NAME = "OpenDataLoader"