feat(paddleocr): add image parsing support with async Job API (#16086)

## Summary Add image parsing capability to PaddleOCR integration, building on top of #15967 (async Job API migration). ## Changes ### `deepdoc/parser/paddleocr_parser.py` - Add `parse_image()` method that uses the same async Job API flow as `parse_pdf()` - Extracts text from `layoutParsingResults` → `prunedResult` → `parsing_res_list` - Returns concatenated block content as a single string ### `rag/llm/ocr_model.py` - Add `parse_image()` wrapper to `PaddleOCROcrModel` with availability check and logging ## Relationship to other PRs - **Depends on**: #15967 (async Job API migration) — this PR is based on that branch - **Replaces**: #14826 (original image processing PR based on old sync API) ## Notes This PR uses `base_url` and the async Job API (submit → poll → fetch) consistent with #15967, rather than the old `api_url` + sync POST pattern from #14826.
2026-06-29 15:31:05 +08:00 · 2026-06-16 19:34:38 +08:00
parent 1235da7093
commit 62698725ca
2 changed files with 73 additions and 0 deletions
--- a/deepdoc/parser/paddleocr_parser.py
+++ b/deepdoc/parser/paddleocr_parser.py
@@ -323,6 +323,69 @@ class PaddleOCRParser(RAGFlowPdfParser):

        return sections, tables

+    def parse_image(
+        self,
+        filepath: str | PathLike[str],
+        binary: BytesIO | bytes | None = None,
+        callback: Optional[Callable[[float, str], None]] = None,
+        *,
+        base_url: Optional[str] = None,
+        access_token: Optional[str] = None,
+        algorithm: Optional[AlgorithmType] = None,
+        request_timeout: Optional[int] = None,
+        prettify_markdown: Optional[bool] = None,
+        show_formula_number: Optional[bool] = None,
+        visualize: Optional[bool] = None,
+        additional_params: Optional[dict[str, Any]] = None,
+        algorithm_config: Optional[dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> str:
+        """Parse image using PaddleOCR API. Returns extracted text."""
+        self.logger.info(f"[PaddleOCR] parse_image start: {filepath}")
+
+        config_dict = {
+            "base_url": base_url if base_url is not None else self.base_url,
+            "access_token": access_token if access_token is not None else self.access_token,
+            "algorithm": algorithm if algorithm is not None else self.algorithm,
+            "request_timeout": request_timeout if request_timeout is not None else self.request_timeout,
+        }
+        if prettify_markdown is not None:
+            config_dict["prettify_markdown"] = prettify_markdown
+        if show_formula_number is not None:
+            config_dict["show_formula_number"] = show_formula_number
+        if visualize is not None:
+            config_dict["visualize"] = visualize
+        if additional_params is not None:
+            config_dict["additional_params"] = additional_params
+        if algorithm_config is not None:
+            config_dict["algorithm_config"] = algorithm_config
+
+        cfg = PaddleOCRConfig.from_dict(config_dict)
+        data_bytes = self._prepare_file_data(filepath, binary)
+
+        if callback:
+            callback(0.1, "[PaddleOCR] submitting image request")
+
+        result = self._send_request(data_bytes, cfg, callback)
+
+        texts: list[str] = []
+        layout_parsing_results = result.get("layoutParsingResults", [])
+        for layout_result in layout_parsing_results:
+            pruned_result = layout_result.get("prunedResult", {})
+            parsing_res_list = pruned_result.get("parsing_res_list", [])
+            for block in parsing_res_list:
+                block_content = block.get("block_content", "").strip()
+                if block_content:
+                    block_content = _remove_images_from_markdown(block_content)
+                    if block_content.strip():
+                        texts.append(block_content.strip())
+
+        if callback:
+            callback(0.9, f"[PaddleOCR] image done, blocks: {len(texts)}")
+
+        self.logger.info(f"[PaddleOCR] parse_image done: {filepath}, blocks: {len(texts)}")
+        return "\n".join(texts)
+
    def _prepare_file_data(self, filepath: str | PathLike[str], binary: BytesIO | bytes | None) -> bytes:
        """Prepare file data for API request."""
        source_path = Path(filepath)