refactor(paddleocr): migrate from sync API to async Job API (#15967)

## Summary

Migrate PaddleOCR integration from the deprecated synchronous HTTP API
to the new asynchronous Job API (`submit → poll → fetch`), aligning with
PaddleOCR 3.6.0+ architecture.

## Changes

### Python (`deepdoc/parser/paddleocr_parser.py`)
- Replace synchronous `requests.post()` with async Job API flow (submit
→ poll → fetch)
- Authentication: `token {token}` → `Bearer {token}`
- File transfer: base64 JSON body → multipart file upload
- Polling: exponential backoff (initial 3s, ×1.5, max 15s, timeout
controlled by `request_timeout`)
- Result: fetch full JSONL from result URL, preserving `prunedResult`
with bbox info for crop functionality
- Rename `api_url` → `base_url` (backward compatible: `api_url` still
accepted as fallback)

### Python (`rag/llm/ocr_model.py`)
- Prefer `paddleocr_base_url` / `PADDLEOCR_BASE_URL`, fallback to
`paddleocr_api_url` / `PADDLEOCR_API_URL`

### Go (`internal/entity/models/paddleocr.go`)
- Add `Client-Platform: ragflow` header to submit and poll requests
- Change polling from fixed 3s to exponential backoff (initial 3s, ×1.5,
max 15s)

### Python (`common/constants.py`)
- Add `PADDLEOCR_BASE_URL` to env keys and default config

## Backward Compatibility

- Old env var `PADDLEOCR_API_URL` still works (used as fallback)
- Frontend field `paddleocr_api_url` still works (backend reads it as
fallback)
- No user-facing configuration changes required for existing setups

## Why not use the `paddleocr` SDK package directly?

RAGFlow's `_transfer_to_sections()` relies on `prunedResult` (containing
`block_bbox`, `block_label`, `parsing_res_list`) from the raw API
response for PDF crop functionality. The SDK's public `parse_document()`
API only returns `DocParsingResult` with `markdown_text`, discarding the
bbox data. Therefore we implement the async Job API flow directly via
HTTP, following the same logic as the SDK internally.
This commit is contained in:
Rander
2026-06-16 19:34:21 +08:00
committed by GitHub
parent 3d8bc76e27
commit 1235da7093
7 changed files with 191 additions and 53 deletions

View File

@@ -276,8 +276,9 @@ MINERU_DEFAULT_CONFIG = {
"MINERU_DELETE_OUTPUT": 1,
}
PADDLEOCR_ENV_KEYS = ["PADDLEOCR_API_URL", "PADDLEOCR_ACCESS_TOKEN", "PADDLEOCR_ALGORITHM"]
PADDLEOCR_ENV_KEYS = ["PADDLEOCR_BASE_URL", "PADDLEOCR_API_URL", "PADDLEOCR_ACCESS_TOKEN", "PADDLEOCR_ALGORITHM"]
PADDLEOCR_DEFAULT_CONFIG = {
"PADDLEOCR_BASE_URL": "",
"PADDLEOCR_API_URL": "",
"PADDLEOCR_ACCESS_TOKEN": None,
"PADDLEOCR_ALGORITHM": "PaddleOCR-VL",

View File

@@ -8,6 +8,13 @@
},
"class": "paddleocr.net",
"models": [
{
"name": "PaddleOCR-VL-1.6",
"max_tokens": 16384,
"model_types": [
"ocr"
]
},
{
"name": "PaddleOCR-VL-1.5",
"max_tokens": 16384,
@@ -15,6 +22,13 @@
"ocr"
]
},
{
"name": "PP-OCRv6",
"max_tokens": 16384,
"model_types": [
"ocr"
]
},
{
"name": "PP-OCRv5",
"max_tokens": 16384,

View File

@@ -14,10 +14,12 @@
#
from __future__ import annotations
import base64
import json
import logging
import os
import re
import tempfile
import time
from dataclasses import asdict, dataclass, field, fields
from io import BytesIO
from os import PathLike
@@ -38,16 +40,19 @@ except Exception:
class RAGFlowPdfParser:
pass
from deepdoc.parser.utils import extract_pdf_outlines
AlgorithmType = Literal["PaddleOCR-VL", "PP-OCRv5", "PP-StructureV3", "PaddleOCR-VL-1.5"]
AlgorithmType = Literal["PaddleOCR-VL", "PaddleOCR-VL-1.6", "PP-OCRv5", "PP-OCRv6", "PP-StructureV3", "PaddleOCR-VL-1.5"]
SectionTuple = tuple[str, ...]
TableTuple = tuple[str, ...]
ParseResult = tuple[list[SectionTuple], list[TableTuple]]
SUPPORTED_PADDLEOCR_ALGORITHMS: tuple[AlgorithmType, ...] = (
"PaddleOCR-VL",
"PaddleOCR-VL-1.6",
"PP-OCRv5",
"PP-OCRv6",
"PP-StructureV3",
"PaddleOCR-VL-1.5",
)
@@ -116,7 +121,7 @@ class PaddleOCRVLConfig:
class PaddleOCRConfig:
"""Main configuration for PaddleOCR parser."""
api_url: str = ""
base_url: str = "https://paddleocr.aistudio-app.com"
access_token: Optional[str] = None
algorithm: AlgorithmType = "PaddleOCR-VL"
request_timeout: int = 600
@@ -168,6 +173,9 @@ class PaddleOCRConfig:
return cls.from_dict(kwargs)
_DEFAULT_BASE_URL = "https://paddleocr.aistudio-app.com"
class PaddleOCRParser(RAGFlowPdfParser):
"""Parser for PDF documents using PaddleOCR API."""
@@ -216,7 +224,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
def __init__(
self,
api_url: Optional[str] = None,
base_url: Optional[str] = None,
access_token: Optional[str] = None,
algorithm: AlgorithmType = "PaddleOCR-VL",
*,
@@ -224,7 +232,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
):
"""Initialize PaddleOCR parser."""
self.outlines = []
self.api_url = api_url.rstrip("/") if api_url else os.getenv("PADDLEOCR_API_URL", "")
self.base_url = base_url.rstrip("/") if base_url else os.getenv("PADDLEOCR_BASE_URL", _DEFAULT_BASE_URL)
self.access_token = access_token or os.getenv("PADDLEOCR_ACCESS_TOKEN")
self.algorithm = algorithm
self.request_timeout = request_timeout
@@ -240,10 +248,8 @@ class PaddleOCRParser(RAGFlowPdfParser):
# Public methods
def check_installation(self) -> tuple[bool, str]:
"""Check if the parser is properly installed and configured."""
if not self.api_url:
return False, "[PaddleOCR] API URL not configured"
# TODO [@Bobholamovic]: Check URL availability and token validity
if not self.access_token:
return False, "[PaddleOCR] Access token not configured"
return True, ""
@@ -254,7 +260,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
callback: Optional[Callable[[float, str], None]] = None,
*,
parse_method: str = "raw",
api_url: Optional[str] = None,
base_url: Optional[str] = None,
access_token: Optional[str] = None,
algorithm: Optional[AlgorithmType] = None,
request_timeout: Optional[int] = None,
@@ -267,9 +273,8 @@ class PaddleOCRParser(RAGFlowPdfParser):
) -> ParseResult:
"""Parse PDF document using PaddleOCR API."""
self.outlines = extract_pdf_outlines(binary if binary is not None else filepath)
# Create configuration - pass all kwargs to capture VL config parameters
config_dict = {
"api_url": api_url if api_url is not None else self.api_url,
"base_url": base_url if base_url is not None else self.base_url,
"access_token": access_token if access_token is not None else self.access_token,
"algorithm": algorithm if algorithm is not None else self.algorithm,
"request_timeout": request_timeout if request_timeout is not None else self.request_timeout,
@@ -285,10 +290,14 @@ class PaddleOCRParser(RAGFlowPdfParser):
if algorithm_config is not None:
config_dict["algorithm_config"] = algorithm_config
# Forward any extra kwargs that match PaddleOCRConfig fields
config_field_names = {f.name for f in fields(PaddleOCRConfig)}
config_dict.update({k: v for k, v in kwargs.items() if k in config_field_names and v is not None})
cfg = PaddleOCRConfig.from_dict(config_dict)
if not cfg.api_url:
raise RuntimeError("[PaddleOCR] API URL missing")
if not cfg.base_url:
raise RuntimeError("[PaddleOCR] Base URL missing")
# Prepare file data and generate page images for cropping
data_bytes = self._prepare_file_data(filepath, binary)
@@ -329,11 +338,8 @@ class PaddleOCRParser(RAGFlowPdfParser):
return source_path.read_bytes()
def _build_payload(self, data: bytes, file_type: int, config: PaddleOCRConfig) -> dict[str, Any]:
"""Build payload for API request."""
payload: dict[str, Any] = {
"file": base64.b64encode(data).decode("ascii"),
"fileType": file_type,
}
"""Build optionalPayload for async Job API request."""
payload: dict[str, Any] = {}
# Add common parameters
for param_key, param_value in [
@@ -359,44 +365,141 @@ class PaddleOCRParser(RAGFlowPdfParser):
return payload
def _send_request(self, data: bytes, config: PaddleOCRConfig, callback: Optional[Callable[[float, str], None]]) -> dict[str, Any]:
"""Send request to PaddleOCR API and parse response."""
# Build payload
payload = self._build_payload(data, self.file_type, config)
"""Send request to PaddleOCR async Job API (submit → poll → fetch)."""
optional_payload = self._build_payload(data, self.file_type, config)
# Prepare headers
headers = {"Content-Type": "application/json", "Client-Platform": "ragflow"}
headers: dict[str, str] = {"Client-Platform": "ragflow"}
if config.access_token:
headers["Authorization"] = f"token {config.access_token}"
headers["Authorization"] = f"Bearer {config.access_token}"
self.logger.info("[PaddleOCR] invoking API")
jobs_url = f"{config.base_url.rstrip('/')}/api/v2/ocr/jobs"
deadline = time.monotonic() + config.request_timeout
def _remaining() -> float:
r = deadline - time.monotonic()
if r <= 0:
raise RuntimeError(f"[PaddleOCR] timed out after {config.request_timeout}s")
return r
self.logger.info("[PaddleOCR] submitting job")
if callback:
callback(0.1, "[PaddleOCR] submitting request")
# Send request
# Step 1: Submit job with file upload
tmp_file = None
try:
resp = requests.post(config.api_url, json=payload, headers=headers, timeout=self.request_timeout)
resp.raise_for_status()
tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
tmp_file.write(data)
tmp_file.close()
form_data = {
"model": config.algorithm,
"optionalPayload": json.dumps(optional_payload),
}
with open(tmp_file.name, "rb") as f:
resp = requests.post(
jobs_url,
data=form_data,
files={"file": ("document.pdf", f)},
headers=headers,
timeout=_remaining(),
)
except Exception as exc:
if callback:
callback(-1, f"[PaddleOCR] request failed: {exc}")
raise RuntimeError(f"[PaddleOCR] request failed: {exc}")
callback(-1, f"[PaddleOCR] submit failed: {exc}")
raise RuntimeError(f"[PaddleOCR] submit failed: {exc}")
finally:
if tmp_file and os.path.exists(tmp_file.name):
os.unlink(tmp_file.name)
if resp.status_code != 200:
raise RuntimeError(f"[PaddleOCR] submit failed: HTTP {resp.status_code} {resp.text}")
# Parse response
try:
response_data = resp.json()
except Exception as exc:
raise RuntimeError(f"[PaddleOCR] response is not JSON: {exc}") from exc
submit_data = resp.json()
except ValueError as exc:
raise RuntimeError(f"[PaddleOCR] submit response is not JSON: {exc}")
job_id = submit_data.get("data", {}).get("jobId") or submit_data.get("jobId")
if not job_id:
raise RuntimeError(f"[PaddleOCR] job ID not found in response: {submit_data}")
if callback:
callback(0.8, "[PaddleOCR] response received")
callback(0.2, f"[PaddleOCR] job submitted: {job_id}")
# Validate response format
if response_data.get("errorCode") != 0 or not isinstance(response_data.get("result"), dict):
if callback:
callback(-1, "[PaddleOCR] invalid response format")
raise RuntimeError("[PaddleOCR] invalid response format")
# Step 2: Poll until done (exponential backoff)
poll_url = f"{jobs_url}/{job_id}"
interval = 3.0
multiplier = 1.5
max_interval = 15.0
self.logger.info(f"[PaddleOCR] polling job {job_id}")
return response_data["result"]
while True:
if time.monotonic() >= deadline:
raise RuntimeError(f"[PaddleOCR] job {job_id} timed out after {config.request_timeout}s")
try:
poll_resp = requests.get(poll_url, headers=headers, timeout=_remaining())
except Exception as exc:
raise RuntimeError(f"[PaddleOCR] poll failed: {exc}")
if poll_resp.status_code != 200:
raise RuntimeError(f"[PaddleOCR] poll failed: HTTP {poll_resp.status_code} {poll_resp.text[:200]}")
try:
poll_data = poll_resp.json()
except ValueError as exc:
raise RuntimeError(f"[PaddleOCR] poll response is not JSON: {exc}")
state = poll_data.get("data", {}).get("state") or poll_data.get("state")
if state == "done":
self.logger.info(f"[PaddleOCR] job {job_id} done")
if callback:
callback(0.7, "[PaddleOCR] job done, fetching result")
break
elif state == "failed":
error_msg = poll_data.get("data", {}).get("errorMsg", "Unknown error")
self.logger.error(f"[PaddleOCR] job {job_id} failed: {error_msg}")
raise RuntimeError(f"[PaddleOCR] job failed: {error_msg}")
sleep_time = min(interval, max(0, deadline - time.monotonic()))
time.sleep(sleep_time)
interval = min(interval * multiplier, max_interval)
# Step 3: Fetch result
result_data = poll_data.get("data", {})
result_json_url = result_data.get("resultJsonUrl") or (result_data.get("resultUrl") or {}).get("jsonUrl")
if not result_json_url:
raise RuntimeError(f"[PaddleOCR] result URL not found: {poll_data}")
try:
result_resp = requests.get(result_json_url, timeout=_remaining())
result_resp.raise_for_status()
except Exception as exc:
raise RuntimeError(f"[PaddleOCR] failed to fetch result: {exc}")
# Parse JSONL result
jsonl_lines = result_resp.text.strip().split("\n")
jsonl_data = []
for line in jsonl_lines:
line = line.strip()
if line:
try:
jsonl_data.append(json.loads(line))
except ValueError as exc:
raise RuntimeError(f"[PaddleOCR] result JSONL parse error: {exc}")
if callback:
callback(0.8, "[PaddleOCR] result received")
# Extract raw result (preserving prunedResult with bbox info)
combined_result: dict[str, Any] = {"layoutParsingResults": []}
for line_obj in jsonl_data:
result = line_obj.get("result", {})
layout_results = result.get("layoutParsingResults", [])
combined_result["layoutParsingResults"].extend(layout_results)
return combined_result
def _transfer_to_sections(self, result: dict[str, Any], algorithm: AlgorithmType, parse_method: str) -> list[SectionTuple]:
"""Convert API response to section tuples."""
@@ -607,6 +710,9 @@ class PaddleOCRParser(RAGFlowPdfParser):
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
parser = PaddleOCRParser(api_url=os.getenv("PADDLEOCR_API_URL", ""), algorithm=os.getenv("PADDLEOCR_ALGORITHM", "PaddleOCR-VL"))
parser = PaddleOCRParser(
base_url=os.getenv("PADDLEOCR_BASE_URL") or None,
algorithm=os.getenv("PADDLEOCR_ALGORITHM", "PaddleOCR-VL"),
)
ok, reason = parser.check_installation()
print("PaddleOCR available:", ok, reason)

View File

@@ -172,6 +172,7 @@ func (p *PaddleOCRModel) OCRFile(modelName *string, content []byte, fileURL *str
if auth := BearerAuth(apiConfig); auth != "" {
req.Header.Set("Authorization", auth)
}
req.Header.Set("Client-Platform", "ragflow")
resp, err := p.baseModel.httpClient.Do(req)
if err != nil {
@@ -197,17 +198,19 @@ func (p *PaddleOCRModel) OCRFile(modelName *string, content []byte, fileURL *str
pollUrl := fmt.Sprintf("%s/%s", url, jobId)
var jsonlUrl string
for {
select {
case <-time.After(3 * time.Second):
case <-ctx.Done():
return nil, ctx.Err()
}
pollInterval := 3 * time.Second
const pollMultiplier = 1.5
maxPollInterval := 15 * time.Second
pollReq, _ := http.NewRequestWithContext(ctx, "GET", pollUrl, nil)
for {
pollReq, err := http.NewRequestWithContext(ctx, "GET", pollUrl, nil)
if err != nil {
return nil, fmt.Errorf("failed to create poll request: %w", err)
}
if auth := BearerAuth(apiConfig); auth != "" {
pollReq.Header.Set("Authorization", auth)
}
pollReq.Header.Set("Client-Platform", "ragflow")
pollResp, err := p.baseModel.httpClient.Do(pollReq)
if err != nil {
@@ -234,6 +237,18 @@ func (p *PaddleOCRModel) OCRFile(modelName *string, content []byte, fileURL *str
} else if state == "failed" {
return nil, fmt.Errorf("ocr job failed on server: %s", pollData.Data.ErrorMsg)
}
// Exponential backoff
pollInterval = time.Duration(float64(pollInterval) * pollMultiplier)
if pollInterval > maxPollInterval {
pollInterval = maxPollInterval
}
select {
case <-time.After(pollInterval):
case <-ctx.Done():
return nil, ctx.Err()
}
}
if jsonlUrl == "" {

View File

@@ -117,7 +117,7 @@ class PaddleOCROcrModel(Base, PaddleOCRParser):
# lower-case keys (UI), upper-case PADDLEOCR_* (env auto-provision), env vars
return config.get(key, config.get(env_key, os.environ.get(env_key, default)))
self.paddleocr_api_url = _resolve_config("paddleocr_api_url", "PADDLEOCR_API_URL", "")
self.paddleocr_base_url = _resolve_config("paddleocr_base_url", "PADDLEOCR_BASE_URL", "") or _resolve_config("paddleocr_api_url", "PADDLEOCR_API_URL", "")
self.paddleocr_algorithm = _resolve_config("paddleocr_algorithm", "PADDLEOCR_ALGORITHM", "PaddleOCR-VL")
self.paddleocr_access_token = _resolve_config("paddleocr_access_token", "PADDLEOCR_ACCESS_TOKEN", None)
@@ -132,7 +132,7 @@ class PaddleOCROcrModel(Base, PaddleOCRParser):
PaddleOCRParser.__init__(
self,
api_url=self.paddleocr_api_url,
base_url=self.paddleocr_base_url or None,
access_token=self.paddleocr_access_token,
algorithm=self.paddleocr_algorithm,
)

View File

@@ -6,7 +6,7 @@ import { buildOptions } from '@/utils/form';
import { useFormContext, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
const algorithmOptions = buildOptions(['PaddleOCR-VL']);
const algorithmOptions = buildOptions(['PaddleOCR-VL-1.6', 'PaddleOCR-VL-1.5', 'PaddleOCR-VL', 'PP-OCRv6', 'PP-OCRv5', 'PP-StructureV3']);
export function PaddleOCROptionsFormField({
namePrefix = 'parser_config',

View File

@@ -746,8 +746,10 @@ export const ProviderConfigMap: Record<string, ProviderConfig> = {
defaultValue: 'PaddleOCR-VL',
placeholder: 'paddleocrSelectAlgorithm',
options: [
{ label: 'PaddleOCR-VL-1.6', value: 'PaddleOCR-VL-1.6' },
{ label: 'PaddleOCR-VL-1.5', value: 'PaddleOCR-VL-1.5' },
{ label: 'PaddleOCR-VL', value: 'PaddleOCR-VL' },
{ label: 'PP-OCRv6', value: 'PP-OCRv6' },
{ label: 'PP-OCRv5', value: 'PP-OCRv5' },
{ label: 'PP-StructureV3', value: 'PP-StructureV3' },
],