mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
## Summary

This PR fixes two issues discovered during testing of the PaddleOCR async API refactoring:

### 1. PP-OCRv6 returns `ocrResults` instead of `layoutParsingResults`

Models like PP-OCRv6 are pure text recognition models that return results in `ocrResults.prunedResult.rec_texts` format rather than the `layoutParsingResults.prunedResult.parsing_res_list` format used by layout-aware models (PaddleOCR-VL series).

**Changes:**
- `deepdoc/parser/paddleocr_parser.py`: Extract `ocrResults` alongside `layoutParsingResults` in `_send_request()`, add fallback logic in `_transfer_to_sections()` and `parse_image()`
- `internal/entity/models/paddleocr.go`: Add `ocrResults` struct and fallback extraction in Go OCR handler

### 2. Image parsing not integrated into picture chunker

The `parse_image()` method existed in PaddleOCRParser but was never called from `rag/app/picture.py` (the module that handles image file uploads). Users configuring PaddleOCR as their layout recognizer would still get local deepdoc OCR for images.

**Changes:**
- `rag/app/picture.py`: When `layout_recognize` is set to PaddleOCR, use `PaddleOCROcrModel.parse_image()` instead of local OCR. Falls back gracefully to local OCR on failure.

## Testing

Verified end-to-end in Docker:
- PaddleOCR-VL-1.6 PDF parsing: ✅ (10 text blocks with bbox)
- PaddleOCR-VL-1.6 image parsing: ✅ (219 chars)
- PP-OCRv6 PDF parsing with ocrResults fallback: ✅ (10 text blocks)
- PP-OCRv6 image parsing with ocrResults fallback: ✅ (136 chars)

## Related PRs

- #15967 (merged) - PaddleOCR async Job API refactoring + new models
- #16086 (merged) - PaddleOCR image parsing support
812 lines
32 KiB
Python
812 lines
32 KiB
Python
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import tempfile
|
|
import time
|
|
from dataclasses import asdict, dataclass, field, fields
|
|
from io import BytesIO
|
|
from os import PathLike
|
|
from pathlib import Path
|
|
from typing import Any, Callable, ClassVar, Literal, Optional, Union, Tuple, List
|
|
|
|
import numpy as np
|
|
import pdfplumber
|
|
import requests
|
|
from PIL import Image
|
|
|
|
from common.constants import MAXIMUM_PAGE_NUMBER
|
|
|
|
# Prefer the real RAGFlow PDF parser as the base class. If importing it fails
# for any reason, fall back to an empty stand-in so that this module can still
# be imported; PaddleOCRParser then inherits nothing from it.
try:
    from deepdoc.parser.pdf_parser import RAGFlowPdfParser
except Exception:

    class RAGFlowPdfParser:
        pass
|
|
|
|
|
|
from deepdoc.parser.utils import extract_pdf_outlines
|
|
|
|
|
|
# Model identifiers accepted as the ``algorithm`` setting (static typing view).
AlgorithmType = Literal["PaddleOCR-VL", "PaddleOCR-VL-1.6", "PP-OCRv5", "PP-OCRv6", "PP-StructureV3", "PaddleOCR-VL-1.5"]
# One parsed text section; the number and meaning of its items depend on the
# parse method (see PaddleOCRParser._transfer_to_sections).
SectionTuple = tuple[str, ...]
# One parsed table entry.
TableTuple = tuple[str, ...]
# What parse_pdf returns: (sections, tables).
ParseResult = tuple[list[SectionTuple], list[TableTuple]]
# Runtime counterpart of AlgorithmType, used to validate configuration values.
# The two lists are maintained by hand and must be kept in sync.
SUPPORTED_PADDLEOCR_ALGORITHMS: tuple[AlgorithmType, ...] = (
    "PaddleOCR-VL",
    "PaddleOCR-VL-1.6",
    "PP-OCRv5",
    "PP-OCRv6",
    "PP-StructureV3",
    "PaddleOCR-VL-1.5",
)
|
|
|
|
|
|
_MARKDOWN_IMAGE_PATTERN = re.compile(
|
|
r"""
|
|
<div[^>]*>\s*
|
|
<img[^>]*/>\s*
|
|
</div>
|
|
|
|
|
<img[^>]*/>
|
|
""",
|
|
re.IGNORECASE | re.VERBOSE | re.DOTALL,
|
|
)
|
|
|
|
|
|
def _remove_images_from_markdown(markdown: str) -> str:
|
|
return _MARKDOWN_IMAGE_PATTERN.sub("", markdown)
|
|
|
|
|
|
def _normalize_bbox(bbox: list[Any] | tuple[Any, ...]) -> tuple[float, float, float, float]:
|
|
if len(bbox) < 4:
|
|
return 0.0, 0.0, 0.0, 0.0
|
|
|
|
left, top, right, bottom = (float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3]))
|
|
if left > right:
|
|
left, right = right, left
|
|
if top > bottom:
|
|
top, bottom = bottom, top
|
|
return left, top, right, bottom
|
|
|
|
|
|
@dataclass
class PaddleOCRVLConfig:
    """Configuration for PaddleOCR-VL algorithm.

    Every field name here is also a key of ``PaddleOCRParser._VL_FIELD_MAPPING``,
    which supplies the camelCase key used in the request payload.
    ``PaddleOCRConfig.from_dict`` uses a default instance of this class as the
    starting point for ``algorithm_config``. When the payload is built, options
    whose value is ``None`` are left out.

    Only ``use_doc_orientation_classify``, ``use_doc_unwarping``,
    ``format_block_content``, ``merge_layout_blocks`` and ``restructure_pages``
    carry a non-``None`` default.
    """

    use_doc_orientation_classify: Optional[bool] = False
    use_doc_unwarping: Optional[bool] = False
    use_layout_detection: Optional[bool] = None
    use_chart_recognition: Optional[bool] = None
    use_seal_recognition: Optional[bool] = None
    use_ocr_for_image_block: Optional[bool] = None
    layout_threshold: Optional[Union[float, dict]] = None
    layout_nms: Optional[bool] = None
    layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]] = None
    layout_merge_bboxes_mode: Optional[Union[str, dict]] = None
    layout_shape_mode: Optional[str] = None
    prompt_label: Optional[str] = None
    format_block_content: Optional[bool] = True
    repetition_penalty: Optional[float] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    min_pixels: Optional[int] = None
    max_pixels: Optional[int] = None
    max_new_tokens: Optional[int] = None
    merge_layout_blocks: Optional[bool] = False
    markdown_ignore_labels: Optional[List[str]] = None
    vlm_extra_args: Optional[dict] = None
    restructure_pages: Optional[bool] = False
    merge_tables: Optional[bool] = None
    relevel_titles: Optional[bool] = None
|
|
|
|
|
|
@dataclass
class PaddleOCRConfig:
    """Main configuration for PaddleOCR parser.

    Holds the service endpoint and credentials, the selected model, the
    overall request timeout in seconds, three output switches, free-form
    ``additional_params`` merged into the request payload, and the
    algorithm-specific options in ``algorithm_config``.
    """

    base_url: str = "https://paddleocr.aistudio-app.com"
    access_token: Optional[str] = None
    algorithm: AlgorithmType = "PaddleOCR-VL"
    request_timeout: int = 600
    prettify_markdown: bool = True
    show_formula_number: bool = True
    visualize: bool = False
    additional_params: dict[str, Any] = field(default_factory=dict)
    algorithm_config: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, config: Optional[dict[str, Any]]) -> "PaddleOCRConfig":
        """Create configuration from dictionary.

        Args:
            config: Mapping of option names to values. Keys that are not
                fields of this class are ignored. ``None`` or an empty mapping
                yields an all-default instance.

        Returns:
            A new instance whose ``algorithm_config`` is the default
            ``PaddleOCRVLConfig`` values overlaid with the non-``None`` entries
            of ``config["algorithm_config"]`` (when that is a dict).

        Raises:
            ValueError: If the requested algorithm is not supported.
        """
        if not config:
            return cls()

        options = config.copy()

        selected = options.get("algorithm", "PaddleOCR-VL")
        if selected not in SUPPORTED_PADDLEOCR_ALGORITHMS:
            raise ValueError(f"Unsupported algorithm: {selected}")

        # Every supported algorithm starts from the same default option set.
        merged_options: dict[str, Any] = asdict(PaddleOCRVLConfig())
        user_options = options.pop("algorithm_config", None)
        if isinstance(user_options, dict):
            for key, value in user_options.items():
                if value is not None:
                    merged_options[key] = value

        # Keep only the entries that correspond to declared fields.
        declared = {f.name for f in fields(cls)}
        init_kwargs: dict[str, Any] = {name: options[name] for name in declared if name in options}
        init_kwargs["algorithm_config"] = merged_options

        return cls(**init_kwargs)

    @classmethod
    def from_kwargs(cls, **kwargs: Any) -> "PaddleOCRConfig":
        """Create configuration from keyword arguments (same rules as ``from_dict``)."""
        return cls.from_dict(kwargs)
|
|
|
|
|
|
# Endpoint used when neither the constructor argument nor the
# PADDLEOCR_BASE_URL environment variable supplies one.
_DEFAULT_BASE_URL = "https://paddleocr.aistudio-app.com"


class PaddleOCRParser(RAGFlowPdfParser):
    """Parser for PDF documents using PaddleOCR API."""

    # Divisor applied to returned box coordinates when position tags are built.
    _ZOOMIN = 2

    # Config attribute name -> request payload key, for the options that are
    # sent regardless of the selected algorithm.
    _COMMON_FIELD_MAPPING: ClassVar[dict[str, str]] = {
        "prettify_markdown": "prettifyMarkdown",
        "show_formula_number": "showFormulaNumber",
        "visualize": "visualize",
    }

    # algorithm_config key -> request payload key.
    _VL_FIELD_MAPPING: ClassVar[dict[str, str]] = {
        "use_doc_orientation_classify": "useDocOrientationClassify",
        "use_doc_unwarping": "useDocUnwarping",
        "use_layout_detection": "useLayoutDetection",
        "use_chart_recognition": "useChartRecognition",
        "use_seal_recognition": "useSealRecognition",
        "use_ocr_for_image_block": "useOcrForImageBlock",
        "layout_threshold": "layoutThreshold",
        "layout_nms": "layoutNms",
        "layout_unclip_ratio": "layoutUnclipRatio",
        "layout_merge_bboxes_mode": "layoutMergeBboxesMode",
        "layout_shape_mode": "layoutShapeMode",
        "prompt_label": "promptLabel",
        "format_block_content": "formatBlockContent",
        "repetition_penalty": "repetitionPenalty",
        "temperature": "temperature",
        "top_p": "topP",
        "min_pixels": "minPixels",
        "max_pixels": "maxPixels",
        "max_new_tokens": "maxNewTokens",
        "merge_layout_blocks": "mergeLayoutBlocks",
        "markdown_ignore_labels": "markdownIgnoreLabels",
        "vlm_extra_args": "vlmExtraArgs",
        "restructure_pages": "restructurePages",
        "merge_tables": "mergeTables",
        "relevel_titles": "relevelTitles",
    }

    # Algorithm name -> option mapping consulted by _build_payload. An
    # algorithm without an entry gets no algorithm-specific options at all.
    # NOTE(review): "PaddleOCR-VL-1.6" and "PP-OCRv6" are listed in
    # SUPPORTED_PADDLEOCR_ALGORITHMS but have no entry here, so their
    # algorithm_config values are never sent -- confirm this is intended.
    _ALGORITHM_FIELD_MAPPINGS: ClassVar[dict[str, dict[str, str]]] = {
        "PaddleOCR-VL": _VL_FIELD_MAPPING,
        "PP-OCRv5": _VL_FIELD_MAPPING,
        "PP-StructureV3": _VL_FIELD_MAPPING,
        "PaddleOCR-VL-1.5": _VL_FIELD_MAPPING,
    }
|
|
|
|
def __init__(
|
|
self,
|
|
base_url: Optional[str] = None,
|
|
access_token: Optional[str] = None,
|
|
algorithm: AlgorithmType = "PaddleOCR-VL",
|
|
*,
|
|
request_timeout: int = 600,
|
|
):
|
|
"""Initialize PaddleOCR parser."""
|
|
self.outlines = []
|
|
self.base_url = base_url.rstrip("/") if base_url else os.getenv("PADDLEOCR_BASE_URL", _DEFAULT_BASE_URL)
|
|
self.access_token = access_token or os.getenv("PADDLEOCR_ACCESS_TOKEN")
|
|
self.algorithm = algorithm
|
|
self.request_timeout = request_timeout
|
|
self.logger = logging.getLogger(self.__class__.__name__)
|
|
|
|
# Force PDF file type
|
|
self.file_type = 0
|
|
|
|
# Initialize page images for cropping
|
|
self.page_images: list[Image.Image] = []
|
|
self.page_from = 0
|
|
|
|
# Public methods
|
|
def check_installation(self) -> tuple[bool, str]:
    """Check if the parser is properly installed and configured.

    Returns:
        ``(True, "")`` when an access token is set, otherwise ``(False, reason)``.
        Only the presence of the token is checked; nothing is sent to the service.
    """
    if self.access_token:
        return True, ""
    return False, "[PaddleOCR] Access token not configured"
|
|
|
|
def parse_pdf(
    self,
    filepath: str | PathLike[str],
    binary: BytesIO | bytes | None = None,
    callback: Optional[Callable[[float, str], None]] = None,
    *,
    parse_method: str = "raw",
    base_url: Optional[str] = None,
    access_token: Optional[str] = None,
    algorithm: Optional[AlgorithmType] = None,
    request_timeout: Optional[int] = None,
    prettify_markdown: Optional[bool] = None,
    show_formula_number: Optional[bool] = None,
    visualize: Optional[bool] = None,
    additional_params: Optional[dict[str, Any]] = None,
    algorithm_config: Optional[dict[str, Any]] = None,
    **kwargs: Any,
) -> ParseResult:
    """Parse PDF document using PaddleOCR API.

    Args:
        filepath: Path of the PDF; read only when *binary* is ``None``.
        binary: In-memory PDF content, used instead of *filepath* when given.
        callback: Optional progress reporter called as ``callback(progress, message)``.
        parse_method: Selects the layout of the returned section tuples.
        base_url, access_token, algorithm, request_timeout: Per-call overrides
            of the values stored on the parser; ``None`` keeps the stored value.
        prettify_markdown, show_formula_number, visualize, additional_params,
            algorithm_config: Optional settings; ``None`` keeps the
            ``PaddleOCRConfig`` default.
        **kwargs: Extra entries; those naming a ``PaddleOCRConfig`` field with
            a non-``None`` value are applied on top, the rest are ignored.

    Returns:
        ``(sections, tables)``.

    Raises:
        RuntimeError: If no base URL is configured, or the request fails.
        ValueError: If the algorithm is not supported.
        FileNotFoundError: If *binary* is ``None`` and *filepath* does not exist.
    """
    self.outlines = extract_pdf_outlines(filepath if binary is None else binary)

    # Settings that always have a value: per-call override, else parser default.
    settings: dict[str, Any] = {
        "base_url": self.base_url if base_url is None else base_url,
        "access_token": self.access_token if access_token is None else access_token,
        "algorithm": self.algorithm if algorithm is None else algorithm,
        "request_timeout": self.request_timeout if request_timeout is None else request_timeout,
    }

    # Settings passed on only when the caller supplied them.
    optional_settings = {
        "prettify_markdown": prettify_markdown,
        "show_formula_number": show_formula_number,
        "visualize": visualize,
        "additional_params": additional_params,
        "algorithm_config": algorithm_config,
    }
    for key, value in optional_settings.items():
        if value is not None:
            settings[key] = value

    # Forward any extra kwargs that match PaddleOCRConfig fields
    known_fields = {f.name for f in fields(PaddleOCRConfig)}
    for key, value in kwargs.items():
        if key in known_fields and value is not None:
            settings[key] = value

    cfg = PaddleOCRConfig.from_dict(settings)
    if not cfg.base_url:
        raise RuntimeError("[PaddleOCR] Base URL missing")

    data_bytes = self._prepare_file_data(filepath, binary)

    # Page images are only needed for crop(); failing to render them must not
    # abort parsing.
    try:
        self.__images__(binary if binary is not None else filepath, callback=callback)
    except Exception as e:
        self.logger.warning(f"[PaddleOCR] Failed to generate page images for cropping: {e}")

    result = self._send_request(data_bytes, cfg, callback)

    sections = self._transfer_to_sections(result, algorithm=cfg.algorithm, parse_method=parse_method)
    if callback:
        callback(0.9, f"[PaddleOCR] done, sections: {len(sections)}")

    tables = self._transfer_to_tables(result)
    if callback:
        callback(1.0, f"[PaddleOCR] done, tables: {len(tables)}")

    return sections, tables
|
|
|
|
def parse_image(
    self,
    filepath: str | PathLike[str],
    binary: BytesIO | bytes | None = None,
    callback: Optional[Callable[[float, str], None]] = None,
    *,
    base_url: Optional[str] = None,
    access_token: Optional[str] = None,
    algorithm: Optional[AlgorithmType] = None,
    request_timeout: Optional[int] = None,
    prettify_markdown: Optional[bool] = None,
    show_formula_number: Optional[bool] = None,
    visualize: Optional[bool] = None,
    additional_params: Optional[dict[str, Any]] = None,
    algorithm_config: Optional[dict[str, Any]] = None,
    **kwargs: Any,
) -> str:
    """Parse image using PaddleOCR API. Returns extracted text.

    Args:
        filepath: Path of the image; read only when *binary* is ``None``.
        binary: In-memory image content, used instead of *filepath* when given.
        callback: Optional progress reporter called as ``callback(progress, message)``.
        base_url, access_token, algorithm, request_timeout: Per-call overrides
            of the values stored on the parser; ``None`` keeps the stored value.
        prettify_markdown, show_formula_number, visualize, additional_params,
            algorithm_config: Optional settings; ``None`` keeps the
            ``PaddleOCRConfig`` default.
        **kwargs: Accepted for call compatibility and ignored.

    Returns:
        The recognised text blocks joined with newlines. Blocks come from
        ``layoutParsingResults``; when those yield no text, the ``rec_texts``
        of ``ocrResults`` are used instead.

    Raises:
        RuntimeError: If no base URL is configured, or the request fails.
        ValueError: If the algorithm is not supported.
        FileNotFoundError: If *binary* is ``None`` and *filepath* does not exist.
    """
    self.logger.info(f"[PaddleOCR] parse_image start: {filepath}")

    config_dict: dict[str, Any] = {
        "base_url": base_url if base_url is not None else self.base_url,
        "access_token": access_token if access_token is not None else self.access_token,
        "algorithm": algorithm if algorithm is not None else self.algorithm,
        "request_timeout": request_timeout if request_timeout is not None else self.request_timeout,
    }
    optional_settings = {
        "prettify_markdown": prettify_markdown,
        "show_formula_number": show_formula_number,
        "visualize": visualize,
        "additional_params": additional_params,
        "algorithm_config": algorithm_config,
    }
    config_dict.update({k: v for k, v in optional_settings.items() if v is not None})

    cfg = PaddleOCRConfig.from_dict(config_dict)

    # Same precondition as parse_pdf: fail early with a clear message instead
    # of letting an empty URL surface later as an opaque submit failure.
    if not cfg.base_url:
        raise RuntimeError("[PaddleOCR] Base URL missing")

    data_bytes = self._prepare_file_data(filepath, binary)

    if callback:
        callback(0.1, "[PaddleOCR] submitting image request")

    result = self._send_request(data_bytes, cfg, callback)

    texts: list[str] = []
    for layout_result in result.get("layoutParsingResults") or []:
        pruned_result = layout_result.get("prunedResult") or {}
        for block in pruned_result.get("parsing_res_list") or []:
            raw_content = block.get("block_content")
            # A null / non-text content value is skipped rather than crashing on .strip().
            if not isinstance(raw_content, str):
                continue
            cleaned = _remove_images_from_markdown(raw_content.strip()).strip()
            if cleaned:
                texts.append(cleaned)

    # Fallback to ocrResults for models like PP-OCRv6
    if not texts:
        for ocr_result in result.get("ocrResults") or []:
            pruned = ocr_result.get("prunedResult") or {}
            for line in pruned.get("rec_texts") or []:
                if isinstance(line, str) and line.strip():
                    texts.append(line.strip())

    if callback:
        callback(0.9, f"[PaddleOCR] image done, blocks: {len(texts)}")

    self.logger.info(f"[PaddleOCR] parse_image done: {filepath}, blocks: {len(texts)}")
    return "\n".join(texts)
|
|
|
|
def _prepare_file_data(self, filepath: str | PathLike[str], binary: BytesIO | bytes | None) -> bytes:
|
|
"""Prepare file data for API request."""
|
|
source_path = Path(filepath)
|
|
|
|
if binary is not None:
|
|
if isinstance(binary, (bytes, bytearray)):
|
|
return binary
|
|
return binary.getbuffer().tobytes()
|
|
|
|
if not source_path.exists():
|
|
raise FileNotFoundError(f"[PaddleOCR] file not found: {source_path}")
|
|
|
|
return source_path.read_bytes()
|
|
|
|
def _build_payload(self, data: bytes, file_type: int, config: PaddleOCRConfig) -> dict[str, Any]:
|
|
"""Build optionalPayload for async Job API request."""
|
|
payload: dict[str, Any] = {}
|
|
|
|
# Add common parameters
|
|
for param_key, param_value in [
|
|
("prettify_markdown", config.prettify_markdown),
|
|
("show_formula_number", config.show_formula_number),
|
|
("visualize", config.visualize),
|
|
]:
|
|
if param_value is not None:
|
|
api_param = self._COMMON_FIELD_MAPPING[param_key]
|
|
payload[api_param] = param_value
|
|
|
|
# Add algorithm-specific parameters
|
|
algorithm_mapping = self._ALGORITHM_FIELD_MAPPINGS.get(config.algorithm, {})
|
|
for param_key, param_value in config.algorithm_config.items():
|
|
if param_value is not None and param_key in algorithm_mapping:
|
|
api_param = algorithm_mapping[param_key]
|
|
payload[api_param] = param_value
|
|
|
|
# Add any additional parameters
|
|
if config.additional_params:
|
|
payload.update(config.additional_params)
|
|
|
|
return payload
|
|
|
|
def _send_request(self, data: bytes, config: PaddleOCRConfig, callback: Optional[Callable[[float, str], None]]) -> dict[str, Any]:
    """Send request to PaddleOCR async Job API (submit → poll → fetch).

    The file is uploaded to ``<base_url>/api/v2/ocr/jobs``, the job is polled
    until it reports ``done`` or ``failed``, and the JSONL result it points to
    is downloaded and merged. ``config.request_timeout`` is one budget shared
    by all three steps.

    Args:
        data: Raw file content to upload.
        config: Effective configuration (endpoint, token, model, options).
        callback: Optional progress reporter called as ``callback(progress, message)``.

    Returns:
        ``{"layoutParsingResults": [...], "ocrResults": [...]}`` holding the
        entries of every result line, concatenated in order.

    Raises:
        RuntimeError: On timeout, transport errors, non-200 responses,
            malformed JSON, a missing job id or result URL, or a failed job.
    """
    optional_payload = self._build_payload(data, self.file_type, config)

    # Prepare headers
    headers: dict[str, str] = {"Client-Platform": "ragflow"}
    if config.access_token:
        headers["Authorization"] = f"Bearer {config.access_token}"

    jobs_url = f"{config.base_url.rstrip('/')}/api/v2/ocr/jobs"
    deadline = time.monotonic() + config.request_timeout

    def _remaining() -> float:
        """Seconds left in the overall budget; raises once it is used up."""
        left = deadline - time.monotonic()
        if left <= 0:
            raise RuntimeError(f"[PaddleOCR] timed out after {config.request_timeout}s")
        return left

    def _as_dict(value: Any) -> dict[str, Any]:
        """Treat JSON null (or any non-object) as an empty object."""
        return value if isinstance(value, dict) else {}

    self.logger.info("[PaddleOCR] submitting job")
    if callback:
        callback(0.1, "[PaddleOCR] submitting request")

    # Step 1: Submit job with file upload.
    # The content is handed to requests directly from memory; it produces the
    # same multipart body as a file object would, without a temp-file round trip.
    # NOTE(review): the upload is always named "document.pdf", including when
    # parse_image sends image bytes -- confirm the service does not rely on the name.
    try:
        form_data = {
            "model": config.algorithm,
            "optionalPayload": json.dumps(optional_payload),
        }
        resp = requests.post(
            jobs_url,
            data=form_data,
            files={"file": ("document.pdf", data)},
            headers=headers,
            timeout=_remaining(),
        )
    except Exception as exc:
        if callback:
            callback(-1, f"[PaddleOCR] submit failed: {exc}")
        raise RuntimeError(f"[PaddleOCR] submit failed: {exc}") from exc

    if resp.status_code != 200:
        raise RuntimeError(f"[PaddleOCR] submit failed: HTTP {resp.status_code} {resp.text}")

    try:
        submit_data = resp.json()
    except ValueError as exc:
        raise RuntimeError(f"[PaddleOCR] submit response is not JSON: {exc}") from exc
    submit_body = _as_dict(submit_data)
    job_id = _as_dict(submit_body.get("data")).get("jobId") or submit_body.get("jobId")
    if not job_id:
        raise RuntimeError(f"[PaddleOCR] job ID not found in response: {submit_data}")

    if callback:
        callback(0.2, f"[PaddleOCR] job submitted: {job_id}")

    # Step 2: Poll until done (exponential backoff)
    poll_url = f"{jobs_url}/{job_id}"
    interval = 3.0
    multiplier = 1.5
    max_interval = 15.0
    self.logger.info(f"[PaddleOCR] polling job {job_id}")

    while True:
        if time.monotonic() >= deadline:
            raise RuntimeError(f"[PaddleOCR] job {job_id} timed out after {config.request_timeout}s")

        try:
            poll_resp = requests.get(poll_url, headers=headers, timeout=_remaining())
        except Exception as exc:
            raise RuntimeError(f"[PaddleOCR] poll failed: {exc}") from exc

        if poll_resp.status_code != 200:
            raise RuntimeError(f"[PaddleOCR] poll failed: HTTP {poll_resp.status_code} {poll_resp.text[:200]}")

        try:
            poll_data = poll_resp.json()
        except ValueError as exc:
            raise RuntimeError(f"[PaddleOCR] poll response is not JSON: {exc}") from exc
        poll_body = _as_dict(poll_data)
        job_info = _as_dict(poll_body.get("data"))
        state = job_info.get("state") or poll_body.get("state")

        if state == "done":
            self.logger.info(f"[PaddleOCR] job {job_id} done")
            if callback:
                callback(0.7, "[PaddleOCR] job done, fetching result")
            break
        if state == "failed":
            error_msg = job_info.get("errorMsg", "Unknown error")
            self.logger.error(f"[PaddleOCR] job {job_id} failed: {error_msg}")
            raise RuntimeError(f"[PaddleOCR] job failed: {error_msg}")

        # Any other state means "not finished yet": wait, never past the deadline.
        sleep_time = min(interval, max(0, deadline - time.monotonic()))
        time.sleep(sleep_time)
        interval = min(interval * multiplier, max_interval)

    # Step 3: Fetch result
    result_json_url = job_info.get("resultJsonUrl") or _as_dict(job_info.get("resultUrl")).get("jsonUrl")
    if not result_json_url:
        raise RuntimeError(f"[PaddleOCR] result URL not found: {poll_data}")

    try:
        result_resp = requests.get(result_json_url, timeout=_remaining())
        result_resp.raise_for_status()
    except Exception as exc:
        raise RuntimeError(f"[PaddleOCR] failed to fetch result: {exc}") from exc

    # Parse JSONL result: one JSON document per non-empty line.
    jsonl_data = []
    for line in result_resp.text.strip().split("\n"):
        line = line.strip()
        if not line:
            continue
        try:
            jsonl_data.append(json.loads(line))
        except ValueError as exc:
            raise RuntimeError(f"[PaddleOCR] result JSONL parse error: {exc}") from exc

    if callback:
        callback(0.8, "[PaddleOCR] result received")

    # Extract raw result (preserving prunedResult with bbox info)
    combined_result: dict[str, Any] = {"layoutParsingResults": [], "ocrResults": []}
    for line_obj in jsonl_data:
        result = _as_dict(_as_dict(line_obj).get("result"))
        combined_result["layoutParsingResults"].extend(result.get("layoutParsingResults") or [])
        combined_result["ocrResults"].extend(result.get("ocrResults") or [])

    return combined_result
|
|
|
|
def _transfer_to_sections(self, result: dict[str, Any], algorithm: AlgorithmType, parse_method: str) -> list[SectionTuple]:
    """Convert API response to section tuples.

    Args:
        result: Combined response with ``layoutParsingResults`` and/or ``ocrResults``.
        algorithm: Model that produced *result*; unsupported names yield ``[]``.
        parse_method: Selects the tuple layout:
            ``"manual"`` / ``"pipeline"`` -> ``(content, label, tag)``;
            ``"paper"`` -> ``(content + tag, label)``;
            anything else -> ``(content, tag)``.

    Returns:
        One tuple per non-empty text block. ``tag`` has the form
        ``@@<page>\\t<left>\\t<right>\\t<top>\\t<bottom>##`` with a 1-based page
        number and coordinates floor-divided by ``_ZOOMIN``. Each entry of the
        result list is treated as one page.

    When ``layoutParsingResults`` is empty, the ``rec_texts`` / ``rec_boxes``
    of ``ocrResults`` are used instead, and they follow the same
    ``parse_method`` layout as layout blocks.
    """
    sections: list[SectionTuple] = []
    if algorithm not in SUPPORTED_PADDLEOCR_ALGORITHMS:
        return sections

    def _append(page_number: int, content: str, label: str, bbox: Any) -> None:
        """Add one section in the tuple layout selected by parse_method."""
        left, top, right, bottom = _normalize_bbox(bbox if bbox is not None else ())
        tag = (
            f"@@{page_number}\t{left // self._ZOOMIN}\t{right // self._ZOOMIN}"
            f"\t{top // self._ZOOMIN}\t{bottom // self._ZOOMIN}##"
        )
        if parse_method in {"manual", "pipeline"}:
            sections.append((content, label, tag))
        elif parse_method == "paper":
            sections.append((content + tag, label))
        else:
            sections.append((content, tag))

    layout_parsing_results = result.get("layoutParsingResults") or []

    # Fallback to ocrResults for models like PP-OCRv6 that only return text recognition
    if not layout_parsing_results:
        for page_idx, ocr_result in enumerate(result.get("ocrResults") or []):
            pruned = ocr_result.get("prunedResult") or {}
            rec_texts = pruned.get("rec_texts") or []
            rec_boxes = pruned.get("rec_boxes") or []
            for i, text in enumerate(rec_texts):
                text = text.strip() if isinstance(text, str) else ""
                if not text:
                    continue
                # A text line without a matching (or complete) box gets a zero box.
                box = rec_boxes[i] if i < len(rec_boxes) else None
                # Plain OCR lines carry no layout label; "text" is assumed as
                # the label for ordinary body text -- TODO confirm against callers.
                _append(page_idx + 1, text, "text", box)
        return sections

    for page_idx, layout_result in enumerate(layout_parsing_results):
        pruned_result = layout_result.get("prunedResult") or {}
        for block in pruned_result.get("parsing_res_list") or []:
            block_content = block.get("block_content")
            if not isinstance(block_content, str) or not block_content.strip():
                continue

            # Remove images
            # NOTE(review): unlike parse_image, content that becomes empty after
            # image removal is still emitted here; confirm callers rely on that.
            block_content = _remove_images_from_markdown(block_content.strip())

            label = block.get("block_label") or ""
            _append(page_idx + 1, block_content, label, block.get("block_bbox"))

    return sections
|
|
|
|
def _transfer_to_tables(self, result: dict[str, Any]) -> list[TableTuple]:
|
|
"""Convert API response to table tuples."""
|
|
return []
|
|
|
|
def __images__(self, fnm, page_from=0, page_to=MAXIMUM_PAGE_NUMBER, callback=None):
    """Generate page images from PDF for cropping.

    Args:
        fnm: PDF source: a path, raw ``bytes`` / ``bytearray``, or a ``BytesIO``.
        page_from: Index of the first page to render.
        page_to: Index one past the last page to render.
        callback: Accepted for interface compatibility; not used here.

    On success ``self.page_images`` holds one image per rendered page (72 dpi).
    On any failure the error is logged and ``self.page_images`` is set to
    ``None``; no exception is propagated.
    """
    self.page_from = page_from
    self.page_to = page_to
    try:
        if isinstance(fnm, (str, PathLike)):
            source = fnm
        elif isinstance(fnm, BytesIO):
            # parse_pdf may pass the caller's stream as-is. Work on a private
            # copy so its read position is neither relied on nor disturbed.
            source = BytesIO(fnm.getvalue())
        else:
            source = BytesIO(fnm)

        with pdfplumber.open(source) as pdf:
            self.pdf = pdf
            self.page_images = [
                page.to_image(resolution=72, antialias=True).original
                for page in pdf.pages[page_from:page_to]
            ]
    except Exception as e:
        self.page_images = None
        self.logger.exception(e)
|
|
|
|
@staticmethod
|
|
def extract_positions(txt: str):
|
|
"""Extract position information from text tags."""
|
|
poss = []
|
|
for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
|
|
pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
|
|
left, right, top, bottom = float(left), float(right), float(top), float(bottom)
|
|
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
|
|
return poss
|
|
|
|
def crop(self, text: str, need_position: bool = False):
    """Crop images from PDF based on position tags in text.

    The regions named by the position tags in *text* are cut out of
    ``self.page_images`` and stacked vertically into one picture, together
    with a strip of context above the first region and below the last one.

    Args:
        text: Text containing position tags (see ``extract_positions``).
        need_position: When true, also return the cropped regions' positions.

    Returns:
        The composed picture, or ``(picture, positions)`` when
        *need_position* is true, where each position is
        ``(page_number, x0, x1, y0, y1)`` with ``self.page_from`` added to the
        page index. When nothing can be cropped (no tags, no page images, no
        valid region) the result is ``None``, or ``(None, None)`` when
        *need_position* is true.
    """
    imgs = []
    poss = self.extract_positions(text)

    if not poss:
        if need_position:
            return None, None
        return

    # page_images may be missing, None (rendering failed) or empty.
    if not getattr(self, "page_images", None):
        self.logger.warning("[PaddleOCR] crop called without page images; skipping image generation.")
        if need_position:
            return None, None
        return

    page_count = len(self.page_images)

    # Keep only positions that refer to at least one rendered page, and drop
    # the out-of-range page indices from those that remain.
    filtered_poss = []
    for pns, left, right, top, bottom in poss:
        if not pns:
            self.logger.warning("[PaddleOCR] Empty page index list in crop; skipping this position.")
            continue
        valid_pns = [p for p in pns if 0 <= p < page_count]
        if not valid_pns:
            self.logger.warning(f"[PaddleOCR] All page indices {pns} out of range for {page_count} pages; skipping.")
            continue
        filtered_poss.append((valid_pns, left, right, top, bottom))

    poss = filtered_poss
    if not poss:
        self.logger.warning("[PaddleOCR] No valid positions after filtering; skip cropping.")
        if need_position:
            return None, None
        return

    # Every crop uses the width of the widest region (at least 6).
    max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
    GAP = 6
    # Context entry above the first region: up to 120 units, ending GAP above it.
    pos = poss[0]
    first_page_idx = pos[0][0]
    poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
    # Context entry below the last region, limited to the page height.
    pos = poss[-1]
    last_page_idx = pos[0][-1]
    if not (0 <= last_page_idx < page_count):
        self.logger.warning(f"[PaddleOCR] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
        if need_position:
            return None, None
        return
    last_page_height = self.page_images[last_page_idx].size[1]
    poss.append(
        (
            [last_page_idx],
            pos[1],
            pos[2],
            min(last_page_height, pos[4] + GAP),
            min(last_page_height, pos[4] + 120),
        )
    )

    positions = []
    for ii, (pns, left, right, top, bottom) in enumerate(poss):
        right = left + max_width

        # Guarantee a non-empty vertical extent.
        if bottom <= top:
            bottom = top + 2

        # For an entry listing several pages, extend bottom by the height of
        # the page at index pn - 1 for each further page.
        for pn in pns[1:]:
            if 0 <= pn - 1 < page_count:
                bottom += self.page_images[pn - 1].size[1]
            else:
                self.logger.warning(f"[PaddleOCR] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")

        if not (0 <= pns[0] < page_count):
            self.logger.warning(f"[PaddleOCR] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
            continue

        # Part of the region on its first page, clamped to the page bounds.
        img0 = self.page_images[pns[0]]
        x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
        if x0 > x1:
            x0, x1 = x1, x0
        if y0 > y1:
            y0, y1 = y1, y0
        x0 = max(0, min(x0, img0.size[0]))
        x1 = max(0, min(x1, img0.size[0]))
        y0 = max(0, min(y0, img0.size[1]))
        y1 = max(0, min(y1, img0.size[1]))
        if x1 <= x0 or y1 <= y0:
            continue
        crop0 = img0.crop((x0, y0, x1, y1))
        imgs.append(crop0)
        # Positions are reported only for entries other than the first and
        # last of poss, i.e. not for the two added context entries.
        if 0 < ii < len(poss) - 1:
            positions.append((pns[0] + self.page_from, x0, x1, y0, y1))

        # Remaining height carries over to the following pages, each cropped
        # from its top edge.
        bottom -= img0.size[1]
        for pn in pns[1:]:
            if not (0 <= pn < page_count):
                self.logger.warning(f"[PaddleOCR] Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
                continue
            page = self.page_images[pn]
            x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
            if x0 > x1:
                x0, x1 = x1, x0
            if y0 > y1:
                y0, y1 = y1, y0
            x0 = max(0, min(x0, page.size[0]))
            x1 = max(0, min(x1, page.size[0]))
            y0 = max(0, min(y0, page.size[1]))
            y1 = max(0, min(y1, page.size[1]))
            if x1 <= x0 or y1 <= y0:
                bottom -= page.size[1]
                continue
            cimgp = page.crop((x0, y0, x1, y1))
            imgs.append(cimgp)
            if 0 < ii < len(poss) - 1:
                positions.append((pn + self.page_from, x0, x1, y0, y1))
            bottom -= page.size[1]

    if not imgs:
        if need_position:
            return None, None
        return

    # Canvas size: widest crop by the sum of all heights, GAP after each crop.
    total_height = 0
    max_width = 0
    img_sizes = []
    for img in imgs:
        w, h = img.size
        img_sizes.append((w, h))
        max_width = max(max_width, w)
        total_height += h + GAP

    pic = Image.new("RGB", (max_width, int(total_height)), (245, 245, 245))
    current_height = 0
    imgs_count = len(imgs)
    for ii, (img, (w, h)) in enumerate(zip(imgs, img_sizes)):
        # The first and last collected images are darkened with a
        # half-transparent black overlay.
        if ii == 0 or ii + 1 == imgs_count:
            img = img.convert("RGBA")
            overlay = Image.new("RGBA", img.size, (0, 0, 0, 128))
            img = Image.alpha_composite(img, overlay).convert("RGB")
        pic.paste(img, (0, int(current_height)))
        current_height += h + GAP

    if need_position:
        return pic, positions
    return pic
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke check: build a parser from the environment and report
    # whether it is configured. No request is sent.
    logging.basicConfig(level=logging.INFO)

    env_base_url = os.getenv("PADDLEOCR_BASE_URL") or None
    env_algorithm = os.getenv("PADDLEOCR_ALGORITHM", "PaddleOCR-VL")
    parser = PaddleOCRParser(base_url=env_base_url, algorithm=env_algorithm)

    ok, reason = parser.check_installation()
    print("PaddleOCR available:", ok, reason)
|