mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
feat(paddleocr): add image parsing support with async Job API (#16086)
## Summary

Add image parsing capability to the PaddleOCR integration, building on top of #15967 (async Job API migration).

## Changes

### `deepdoc/parser/paddleocr_parser.py`

- Add a `parse_image()` method that uses the same async Job API flow as `parse_pdf()`
- Extract text from `layoutParsingResults` → `prunedResult` → `parsing_res_list`
- Return the concatenated block content as a single string

### `rag/llm/ocr_model.py`

- Add a `parse_image()` wrapper to `PaddleOCROcrModel` with an availability check and logging

## Relationship to other PRs

- **Depends on**: #15967 (async Job API migration) — this PR is based on that branch
- **Replaces**: #14826 (original image processing PR based on the old sync API)

## Notes

This PR uses `base_url` and the async Job API (submit → poll → fetch), consistent with #15967, rather than the old `api_url` + sync POST pattern from #14826.
This commit is contained in:
@@ -323,6 +323,69 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
||||
|
||||
return sections, tables
|
||||
|
||||
def parse_image(
    self,
    filepath: str | PathLike[str],
    binary: BytesIO | bytes | None = None,
    callback: Optional[Callable[[float, str], None]] = None,
    *,
    base_url: Optional[str] = None,
    access_token: Optional[str] = None,
    algorithm: Optional[AlgorithmType] = None,
    request_timeout: Optional[int] = None,
    prettify_markdown: Optional[bool] = None,
    show_formula_number: Optional[bool] = None,
    visualize: Optional[bool] = None,
    additional_params: Optional[dict[str, Any]] = None,
    algorithm_config: Optional[dict[str, Any]] = None,
    **kwargs: Any,
) -> str:
    """Parse an image through the PaddleOCR API and return its text.

    The image bytes are obtained from ``self._prepare_file_data`` and sent
    with ``self._send_request``. The text of every block found under
    ``layoutParsingResults -> prunedResult -> parsing_res_list`` is
    collected, passed through ``_remove_images_from_markdown``, stripped,
    and joined with newlines.

    Args:
        filepath: Path of the image; forwarded to ``_prepare_file_data``
            and used in log messages.
        binary: Optional in-memory image content; forwarded to
            ``_prepare_file_data``.
        callback: Optional progress reporter called as
            ``callback(progress, message)``: with ``0.1`` before the
            request is sent and ``0.9`` once the blocks are extracted.
            It is also handed to ``_send_request``.
        base_url: Overrides ``self.base_url`` when not ``None``.
        access_token: Overrides ``self.access_token`` when not ``None``.
        algorithm: Overrides ``self.algorithm`` when not ``None``.
        request_timeout: Overrides ``self.request_timeout`` when not
            ``None``.
        prettify_markdown: Added to the request config only when not
            ``None``.
        show_formula_number: Added to the request config only when not
            ``None``.
        visualize: Added to the request config only when not ``None``.
        additional_params: Added to the request config only when not
            ``None``.
        algorithm_config: Added to the request config only when not
            ``None``.
        **kwargs: Accepted for call-site compatibility; not used here.

    Returns:
        The non-empty block texts joined by ``"\\n"``; an empty string
        when the response contains no usable block.
    """
    self.logger.info(f"[PaddleOCR] parse_image start: {filepath}")

    # Per-call arguments win over the instance-level settings.
    config_dict = {
        "base_url": base_url if base_url is not None else self.base_url,
        "access_token": access_token if access_token is not None else self.access_token,
        "algorithm": algorithm if algorithm is not None else self.algorithm,
        "request_timeout": request_timeout if request_timeout is not None else self.request_timeout,
    }
    # Optional switches are only forwarded when the caller set them, so
    # PaddleOCRConfig.from_dict sees no explicit None for an unset option.
    optional_overrides = {
        "prettify_markdown": prettify_markdown,
        "show_formula_number": show_formula_number,
        "visualize": visualize,
        "additional_params": additional_params,
        "algorithm_config": algorithm_config,
    }
    config_dict.update(
        {key: value for key, value in optional_overrides.items() if value is not None}
    )

    cfg = PaddleOCRConfig.from_dict(config_dict)
    data_bytes = self._prepare_file_data(filepath, binary)

    if callback:
        callback(0.1, "[PaddleOCR] submitting image request")

    result = self._send_request(data_bytes, cfg, callback)

    texts: list[str] = []
    # `dict.get(key, default)` returns None when the key is present with a
    # JSON null value, so fall back with `or` to survive null fields.
    for layout_result in result.get("layoutParsingResults") or []:
        pruned_result = (layout_result or {}).get("prunedResult") or {}
        for block in pruned_result.get("parsing_res_list") or []:
            raw_content = ((block or {}).get("block_content") or "").strip()
            if not raw_content:
                continue
            # Image markup carries no text; a block may be empty without it.
            cleaned = _remove_images_from_markdown(raw_content).strip()
            if cleaned:
                texts.append(cleaned)

    if callback:
        callback(0.9, f"[PaddleOCR] image done, blocks: {len(texts)}")

    self.logger.info(f"[PaddleOCR] parse_image done: {filepath}, blocks: {len(texts)}")
    return "\n".join(texts)
|
||||
|
||||
def _prepare_file_data(self, filepath: str | PathLike[str], binary: BytesIO | bytes | None) -> bytes:
|
||||
"""Prepare file data for API request."""
|
||||
source_path = Path(filepath)
|
||||
|
||||
Reference in New Issue
Block a user