From 62698725ca09e0ca5cc3995e8b17112c2761dbb1 Mon Sep 17 00:00:00 2001
From: Rander
Date: Tue, 16 Jun 2026 19:34:38 +0800
Subject: [PATCH] feat(paddleocr): add image parsing support with async Job API (#16086)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

Add image parsing capability to PaddleOCR integration, building on top of #15967 (async Job API migration).

## Changes

### `deepdoc/parser/paddleocr_parser.py`

- Add `parse_image()` method that uses the same async Job API flow as `parse_pdf()`
- Extracts text from `layoutParsingResults` → `prunedResult` → `parsing_res_list`
- Returns concatenated block content as a single string

### `rag/llm/ocr_model.py`

- Add `parse_image()` wrapper to `PaddleOCROcrModel` with availability check and logging

## Relationship to other PRs

- **Depends on**: #15967 (async Job API migration) — this PR is based on that branch
- **Replaces**: #14826 (original image processing PR based on old sync API)

## Notes

This PR uses `base_url` and the async Job API (submit → poll → fetch) consistent with #15967, rather than the old `api_url` + sync POST pattern from #14826.
--- deepdoc/parser/paddleocr_parser.py | 63 ++++++++++++++++++++++++++++++ rag/llm/ocr_model.py | 10 +++++ 2 files changed, 73 insertions(+) diff --git a/deepdoc/parser/paddleocr_parser.py b/deepdoc/parser/paddleocr_parser.py index be8c2d319c..3acb583787 100644 --- a/deepdoc/parser/paddleocr_parser.py +++ b/deepdoc/parser/paddleocr_parser.py @@ -323,6 +323,69 @@ class PaddleOCRParser(RAGFlowPdfParser): return sections, tables + def parse_image( + self, + filepath: str | PathLike[str], + binary: BytesIO | bytes | None = None, + callback: Optional[Callable[[float, str], None]] = None, + *, + base_url: Optional[str] = None, + access_token: Optional[str] = None, + algorithm: Optional[AlgorithmType] = None, + request_timeout: Optional[int] = None, + prettify_markdown: Optional[bool] = None, + show_formula_number: Optional[bool] = None, + visualize: Optional[bool] = None, + additional_params: Optional[dict[str, Any]] = None, + algorithm_config: Optional[dict[str, Any]] = None, + **kwargs: Any, + ) -> str: + """Parse image using PaddleOCR API. 
Returns extracted text.""" + self.logger.info(f"[PaddleOCR] parse_image start: {filepath}") + + config_dict = { + "base_url": base_url if base_url is not None else self.base_url, + "access_token": access_token if access_token is not None else self.access_token, + "algorithm": algorithm if algorithm is not None else self.algorithm, + "request_timeout": request_timeout if request_timeout is not None else self.request_timeout, + } + if prettify_markdown is not None: + config_dict["prettify_markdown"] = prettify_markdown + if show_formula_number is not None: + config_dict["show_formula_number"] = show_formula_number + if visualize is not None: + config_dict["visualize"] = visualize + if additional_params is not None: + config_dict["additional_params"] = additional_params + if algorithm_config is not None: + config_dict["algorithm_config"] = algorithm_config + + cfg = PaddleOCRConfig.from_dict(config_dict) + data_bytes = self._prepare_file_data(filepath, binary) + + if callback: + callback(0.1, "[PaddleOCR] submitting image request") + + result = self._send_request(data_bytes, cfg, callback) + + texts: list[str] = [] + layout_parsing_results = result.get("layoutParsingResults", []) + for layout_result in layout_parsing_results: + pruned_result = layout_result.get("prunedResult", {}) + parsing_res_list = pruned_result.get("parsing_res_list", []) + for block in parsing_res_list: + block_content = block.get("block_content", "").strip() + if block_content: + block_content = _remove_images_from_markdown(block_content) + if block_content.strip(): + texts.append(block_content.strip()) + + if callback: + callback(0.9, f"[PaddleOCR] image done, blocks: {len(texts)}") + + self.logger.info(f"[PaddleOCR] parse_image done: {filepath}, blocks: {len(texts)}") + return "\n".join(texts) + def _prepare_file_data(self, filepath: str | PathLike[str], binary: BytesIO | bytes | None) -> bytes: """Prepare file data for API request.""" source_path = Path(filepath) diff --git 
a/rag/llm/ocr_model.py b/rag/llm/ocr_model.py index c7a9a28d17..42e67d1bd9 100644 --- a/rag/llm/ocr_model.py +++ b/rag/llm/ocr_model.py @@ -148,6 +148,16 @@ class PaddleOCROcrModel(Base, PaddleOCRParser): sections, tables = PaddleOCRParser.parse_pdf(self, filepath=filepath, binary=binary, callback=callback, parse_method=parse_method, **kwargs) return sections, tables + def parse_image(self, filepath: str, binary=None, callback=None, **kwargs) -> str: + ok, reason = self.check_available() + if not ok: + raise RuntimeError(f"PaddleOCR server not accessible: {reason}") + + logging.info(f"PaddleOCR parse_image start: {filepath}") + result = PaddleOCRParser.parse_image(self, filepath=filepath, binary=binary, callback=callback, **kwargs) + logging.info(f"PaddleOCR parse_image done: {filepath}, text length: {len(result)}") + return result + class OpenDataLoaderOcrModel(Base, OpenDataLoaderParser): _FACTORY_NAME = "OpenDataLoader"