refactor(paddleocr): migrate from sync API to async Job API (#15967)

## Summary

Migrate the PaddleOCR integration from the deprecated synchronous HTTP API to the new asynchronous Job API (`submit → poll → fetch`), aligning with the PaddleOCR 3.6.0+ architecture.

## Changes

### Python (`deepdoc/parser/paddleocr_parser.py`)

- Replace synchronous `requests.post()` with the async Job API flow (submit → poll → fetch)
- Authentication: `token {token}` → `Bearer {token}`
- File transfer: base64 JSON body → multipart file upload
- Polling: exponential backoff (initial 3s, ×1.5, max 15s, timeout controlled by `request_timeout`)
- Result: fetch the full JSONL from the result URL, preserving `prunedResult` with bbox info for crop functionality
- Rename `api_url` → `base_url` (backward compatible: `api_url` still accepted as fallback)

### Python (`rag/llm/ocr_model.py`)

- Prefer `paddleocr_base_url` / `PADDLEOCR_BASE_URL`, fall back to `paddleocr_api_url` / `PADDLEOCR_API_URL`

### Go (`internal/entity/models/paddleocr.go`)

- Add `Client-Platform: ragflow` header to submit and poll requests
- Change polling from fixed 3s to exponential backoff (initial 3s, ×1.5, max 15s)

### Python (`common/constants.py`)

- Add `PADDLEOCR_BASE_URL` to env keys and default config

## Backward Compatibility

- Old env var `PADDLEOCR_API_URL` still works (used as fallback)
- Frontend field `paddleocr_api_url` still works (backend reads it as fallback)
- No user-facing configuration changes required for existing setups

## Why not use the `paddleocr` SDK package directly?

RAGFlow's `_transfer_to_sections()` relies on `prunedResult` (containing `block_bbox`, `block_label`, `parsing_res_list`) from the raw API response for PDF crop functionality. The SDK's public `parse_document()` API only returns `DocParsingResult` with `markdown_text`, discarding the bbox data. Therefore we implement the async Job API flow directly via HTTP, following the same logic the SDK uses internally.
2026-06-29 23:41:12 +08:00 · 2026-06-16 19:34:21 +08:00
parent 3d8bc76e27
commit 1235da7093
7 changed files with 191 additions and 53 deletions
--- a/common/constants.py
+++ b/common/constants.py
@@ -276,8 +276,9 @@ MINERU_DEFAULT_CONFIG = {
    "MINERU_DELETE_OUTPUT": 1,
 }

-PADDLEOCR_ENV_KEYS = ["PADDLEOCR_API_URL", "PADDLEOCR_ACCESS_TOKEN", "PADDLEOCR_ALGORITHM"]
+PADDLEOCR_ENV_KEYS = ["PADDLEOCR_BASE_URL", "PADDLEOCR_API_URL", "PADDLEOCR_ACCESS_TOKEN", "PADDLEOCR_ALGORITHM"]
 PADDLEOCR_DEFAULT_CONFIG = {
+    "PADDLEOCR_BASE_URL": "",
    "PADDLEOCR_API_URL": "",
    "PADDLEOCR_ACCESS_TOKEN": None,
    "PADDLEOCR_ALGORITHM": "PaddleOCR-VL",
--- a/conf/models/paddleocr.json
+++ b/conf/models/paddleocr.json
@@ -8,6 +8,13 @@
  },
  "class": "paddleocr.net",
  "models": [
+    {
+      "name": "PaddleOCR-VL-1.6",
+      "max_tokens": 16384,
+      "model_types": [
+        "ocr"
+      ]
+    },
    {
      "name": "PaddleOCR-VL-1.5",
      "max_tokens": 16384,
@@ -15,6 +22,13 @@
        "ocr"
      ]
    },
+    {
+      "name": "PP-OCRv6",
+      "max_tokens": 16384,
+      "model_types": [
+        "ocr"
+      ]
+    },
    {
      "name": "PP-OCRv5",
      "max_tokens": 16384,
--- a/deepdoc/parser/paddleocr_parser.py
+++ b/deepdoc/parser/paddleocr_parser.py
@@ -14,10 +14,12 @@
 #
 from __future__ import annotations

-import base64
+import json
 import logging
 import os
 import re
+import tempfile
+import time
 from dataclasses import asdict, dataclass, field, fields
 from io import BytesIO
 from os import PathLike
@@ -38,16 +40,19 @@ except Exception:
    class RAGFlowPdfParser:
        pass

+
 from deepdoc.parser.utils import extract_pdf_outlines


-AlgorithmType = Literal["PaddleOCR-VL", "PP-OCRv5", "PP-StructureV3", "PaddleOCR-VL-1.5"]
+AlgorithmType = Literal["PaddleOCR-VL", "PaddleOCR-VL-1.6", "PP-OCRv5", "PP-OCRv6", "PP-StructureV3", "PaddleOCR-VL-1.5"]
 SectionTuple = tuple[str, ...]
 TableTuple = tuple[str, ...]
 ParseResult = tuple[list[SectionTuple], list[TableTuple]]
 SUPPORTED_PADDLEOCR_ALGORITHMS: tuple[AlgorithmType, ...] = (
    "PaddleOCR-VL",
+    "PaddleOCR-VL-1.6",
    "PP-OCRv5",
+    "PP-OCRv6",
    "PP-StructureV3",
    "PaddleOCR-VL-1.5",
 )
@@ -116,7 +121,7 @@ class PaddleOCRVLConfig:
 class PaddleOCRConfig:
    """Main configuration for PaddleOCR parser."""

-    api_url: str = ""
+    base_url: str = "https://paddleocr.aistudio-app.com"
    access_token: Optional[str] = None
    algorithm: AlgorithmType = "PaddleOCR-VL"
    request_timeout: int = 600
@@ -168,6 +173,9 @@ class PaddleOCRConfig:
        return cls.from_dict(kwargs)


+_DEFAULT_BASE_URL = "https://paddleocr.aistudio-app.com"
+
+
 class PaddleOCRParser(RAGFlowPdfParser):
    """Parser for PDF documents using PaddleOCR API."""

@@ -216,7 +224,7 @@ class PaddleOCRParser(RAGFlowPdfParser):

    def __init__(
        self,
-        api_url: Optional[str] = None,
+        base_url: Optional[str] = None,
        access_token: Optional[str] = None,
        algorithm: AlgorithmType = "PaddleOCR-VL",
        *,
@@ -224,7 +232,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
    ):
        """Initialize PaddleOCR parser."""
        self.outlines = []
-        self.api_url = api_url.rstrip("/") if api_url else os.getenv("PADDLEOCR_API_URL", "")
+        self.base_url = base_url.rstrip("/") if base_url else os.getenv("PADDLEOCR_BASE_URL", _DEFAULT_BASE_URL)
        self.access_token = access_token or os.getenv("PADDLEOCR_ACCESS_TOKEN")
        self.algorithm = algorithm
        self.request_timeout = request_timeout
@@ -240,10 +248,8 @@ class PaddleOCRParser(RAGFlowPdfParser):
    # Public methods
    def check_installation(self) -> tuple[bool, str]:
        """Check if the parser is properly installed and configured."""
-        if not self.api_url:
-            return False, "[PaddleOCR] API URL not configured"
-
-        # TODO [@Bobholamovic]: Check URL availability and token validity
+        if not self.access_token:
+            return False, "[PaddleOCR] Access token not configured"

        return True, ""

@@ -254,7 +260,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
        callback: Optional[Callable[[float, str], None]] = None,
        *,
        parse_method: str = "raw",
-        api_url: Optional[str] = None,
+        base_url: Optional[str] = None,
        access_token: Optional[str] = None,
        algorithm: Optional[AlgorithmType] = None,
        request_timeout: Optional[int] = None,
@@ -267,9 +273,8 @@ class PaddleOCRParser(RAGFlowPdfParser):
    ) -> ParseResult:
        """Parse PDF document using PaddleOCR API."""
        self.outlines = extract_pdf_outlines(binary if binary is not None else filepath)
-        # Create configuration - pass all kwargs to capture VL config parameters
        config_dict = {
-            "api_url": api_url if api_url is not None else self.api_url,
+            "base_url": base_url if base_url is not None else self.base_url,
            "access_token": access_token if access_token is not None else self.access_token,
            "algorithm": algorithm if algorithm is not None else self.algorithm,
            "request_timeout": request_timeout if request_timeout is not None else self.request_timeout,
@@ -285,10 +290,14 @@ class PaddleOCRParser(RAGFlowPdfParser):
        if algorithm_config is not None:
            config_dict["algorithm_config"] = algorithm_config

+        # Forward any extra kwargs that match PaddleOCRConfig fields
+        config_field_names = {f.name for f in fields(PaddleOCRConfig)}
+        config_dict.update({k: v for k, v in kwargs.items() if k in config_field_names and v is not None})
+
        cfg = PaddleOCRConfig.from_dict(config_dict)

-        if not cfg.api_url:
-            raise RuntimeError("[PaddleOCR] API URL missing")
+        if not cfg.base_url:
+            raise RuntimeError("[PaddleOCR] Base URL missing")

        # Prepare file data and generate page images for cropping
        data_bytes = self._prepare_file_data(filepath, binary)
@@ -329,11 +338,8 @@ class PaddleOCRParser(RAGFlowPdfParser):
        return source_path.read_bytes()

    def _build_payload(self, data: bytes, file_type: int, config: PaddleOCRConfig) -> dict[str, Any]:
-        """Build payload for API request."""
-        payload: dict[str, Any] = {
-            "file": base64.b64encode(data).decode("ascii"),
-            "fileType": file_type,
-        }
+        """Build optionalPayload for async Job API request."""
+        payload: dict[str, Any] = {}

        # Add common parameters
        for param_key, param_value in [
@@ -359,44 +365,141 @@ class PaddleOCRParser(RAGFlowPdfParser):
        return payload

    def _send_request(self, data: bytes, config: PaddleOCRConfig, callback: Optional[Callable[[float, str], None]]) -> dict[str, Any]:
-        """Send request to PaddleOCR API and parse response."""
-        # Build payload
-        payload = self._build_payload(data, self.file_type, config)
+        """Send request to PaddleOCR async Job API (submit → poll → fetch)."""
+        optional_payload = self._build_payload(data, self.file_type, config)

        # Prepare headers
-        headers = {"Content-Type": "application/json", "Client-Platform": "ragflow"}
+        headers: dict[str, str] = {"Client-Platform": "ragflow"}
        if config.access_token:
-            headers["Authorization"] = f"token {config.access_token}"
+            headers["Authorization"] = f"Bearer {config.access_token}"

-        self.logger.info("[PaddleOCR] invoking API")
+        jobs_url = f"{config.base_url.rstrip('/')}/api/v2/ocr/jobs"
+        deadline = time.monotonic() + config.request_timeout
+
+        def _remaining() -> float:
+            r = deadline - time.monotonic()
+            if r <= 0:
+                raise RuntimeError(f"[PaddleOCR] timed out after {config.request_timeout}s")
+            return r
+
+        self.logger.info("[PaddleOCR] submitting job")
        if callback:
            callback(0.1, "[PaddleOCR] submitting request")

-        # Send request
+        # Step 1: Submit job with file upload
+        tmp_file = None
        try:
-            resp = requests.post(config.api_url, json=payload, headers=headers, timeout=self.request_timeout)
-            resp.raise_for_status()
+            tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
+            tmp_file.write(data)
+            tmp_file.close()
+
+            form_data = {
+                "model": config.algorithm,
+                "optionalPayload": json.dumps(optional_payload),
+            }
+            with open(tmp_file.name, "rb") as f:
+                resp = requests.post(
+                    jobs_url,
+                    data=form_data,
+                    files={"file": ("document.pdf", f)},
+                    headers=headers,
+                    timeout=_remaining(),
+                )
        except Exception as exc:
            if callback:
-                callback(-1, f"[PaddleOCR] request failed: {exc}")
-            raise RuntimeError(f"[PaddleOCR] request failed: {exc}")
+                callback(-1, f"[PaddleOCR] submit failed: {exc}")
+            raise RuntimeError(f"[PaddleOCR] submit failed: {exc}")
+        finally:
+            if tmp_file and os.path.exists(tmp_file.name):
+                os.unlink(tmp_file.name)
+
+        if resp.status_code != 200:
+            raise RuntimeError(f"[PaddleOCR] submit failed: HTTP {resp.status_code} {resp.text}")

-        # Parse response
        try:
-            response_data = resp.json()
-        except Exception as exc:
-            raise RuntimeError(f"[PaddleOCR] response is not JSON: {exc}") from exc
+            submit_data = resp.json()
+        except ValueError as exc:
+            raise RuntimeError(f"[PaddleOCR] submit response is not JSON: {exc}")
+        job_id = submit_data.get("data", {}).get("jobId") or submit_data.get("jobId")
+        if not job_id:
+            raise RuntimeError(f"[PaddleOCR] job ID not found in response: {submit_data}")

        if callback:
-            callback(0.8, "[PaddleOCR] response received")
+            callback(0.2, f"[PaddleOCR] job submitted: {job_id}")

-        # Validate response format
-        if response_data.get("errorCode") != 0 or not isinstance(response_data.get("result"), dict):
-            if callback:
-                callback(-1, "[PaddleOCR] invalid response format")
-            raise RuntimeError("[PaddleOCR] invalid response format")
+        # Step 2: Poll until done (exponential backoff)
+        poll_url = f"{jobs_url}/{job_id}"
+        interval = 3.0
+        multiplier = 1.5
+        max_interval = 15.0
+        self.logger.info(f"[PaddleOCR] polling job {job_id}")

-        return response_data["result"]
+        while True:
+            if time.monotonic() >= deadline:
+                raise RuntimeError(f"[PaddleOCR] job {job_id} timed out after {config.request_timeout}s")
+
+            try:
+                poll_resp = requests.get(poll_url, headers=headers, timeout=_remaining())
+            except Exception as exc:
+                raise RuntimeError(f"[PaddleOCR] poll failed: {exc}")
+
+            if poll_resp.status_code != 200:
+                raise RuntimeError(f"[PaddleOCR] poll failed: HTTP {poll_resp.status_code} {poll_resp.text[:200]}")
+
+            try:
+                poll_data = poll_resp.json()
+            except ValueError as exc:
+                raise RuntimeError(f"[PaddleOCR] poll response is not JSON: {exc}")
+            state = poll_data.get("data", {}).get("state") or poll_data.get("state")
+
+            if state == "done":
+                self.logger.info(f"[PaddleOCR] job {job_id} done")
+                if callback:
+                    callback(0.7, "[PaddleOCR] job done, fetching result")
+                break
+            elif state == "failed":
+                error_msg = poll_data.get("data", {}).get("errorMsg", "Unknown error")
+                self.logger.error(f"[PaddleOCR] job {job_id} failed: {error_msg}")
+                raise RuntimeError(f"[PaddleOCR] job failed: {error_msg}")
+
+            sleep_time = min(interval, max(0, deadline - time.monotonic()))
+            time.sleep(sleep_time)
+            interval = min(interval * multiplier, max_interval)
+
+        # Step 3: Fetch result
+        result_data = poll_data.get("data", {})
+        result_json_url = result_data.get("resultJsonUrl") or (result_data.get("resultUrl") or {}).get("jsonUrl")
+        if not result_json_url:
+            raise RuntimeError(f"[PaddleOCR] result URL not found: {poll_data}")
+
+        try:
+            result_resp = requests.get(result_json_url, timeout=_remaining())
+            result_resp.raise_for_status()
+        except Exception as exc:
+            raise RuntimeError(f"[PaddleOCR] failed to fetch result: {exc}")
+
+        # Parse JSONL result
+        jsonl_lines = result_resp.text.strip().split("\n")
+        jsonl_data = []
+        for line in jsonl_lines:
+            line = line.strip()
+            if line:
+                try:
+                    jsonl_data.append(json.loads(line))
+                except ValueError as exc:
+                    raise RuntimeError(f"[PaddleOCR] result JSONL parse error: {exc}")
+
+        if callback:
+            callback(0.8, "[PaddleOCR] result received")
+
+        # Extract raw result (preserving prunedResult with bbox info)
+        combined_result: dict[str, Any] = {"layoutParsingResults": []}
+        for line_obj in jsonl_data:
+            result = line_obj.get("result", {})
+            layout_results = result.get("layoutParsingResults", [])
+            combined_result["layoutParsingResults"].extend(layout_results)
+
+        return combined_result

    def _transfer_to_sections(self, result: dict[str, Any], algorithm: AlgorithmType, parse_method: str) -> list[SectionTuple]:
        """Convert API response to section tuples."""
@@ -607,6 +710,9 @@ class PaddleOCRParser(RAGFlowPdfParser):

 if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
-    parser = PaddleOCRParser(api_url=os.getenv("PADDLEOCR_API_URL", ""), algorithm=os.getenv("PADDLEOCR_ALGORITHM", "PaddleOCR-VL"))
+    parser = PaddleOCRParser(
+        base_url=os.getenv("PADDLEOCR_BASE_URL") or None,
+        algorithm=os.getenv("PADDLEOCR_ALGORITHM", "PaddleOCR-VL"),
+    )
    ok, reason = parser.check_installation()
    print("PaddleOCR available:", ok, reason)
--- a/internal/entity/models/paddleocr.go
+++ b/internal/entity/models/paddleocr.go
@@ -172,6 +172,7 @@ func (p *PaddleOCRModel) OCRFile(modelName *string, content []byte, fileURL *str
 	if auth := BearerAuth(apiConfig); auth != "" {
 		req.Header.Set("Authorization", auth)
 	}
+	req.Header.Set("Client-Platform", "ragflow")

 	resp, err := p.baseModel.httpClient.Do(req)
 	if err != nil {
@@ -197,17 +198,19 @@ func (p *PaddleOCRModel) OCRFile(modelName *string, content []byte, fileURL *str
 	pollUrl := fmt.Sprintf("%s/%s", url, jobId)
 	var jsonlUrl string

-	for {
-		select {
-		case <-time.After(3 * time.Second):
-		case <-ctx.Done():
-			return nil, ctx.Err()
-		}
+	pollInterval := 3 * time.Second
+	const pollMultiplier = 1.5
+	maxPollInterval := 15 * time.Second

-		pollReq, _ := http.NewRequestWithContext(ctx, "GET", pollUrl, nil)
+	for {
+		pollReq, err := http.NewRequestWithContext(ctx, "GET", pollUrl, nil)
+		if err != nil {
+			return nil, fmt.Errorf("failed to create poll request: %w", err)
+		}
 		if auth := BearerAuth(apiConfig); auth != "" {
 			pollReq.Header.Set("Authorization", auth)
 		}
+		pollReq.Header.Set("Client-Platform", "ragflow")

 		pollResp, err := p.baseModel.httpClient.Do(pollReq)
 		if err != nil {
@@ -234,6 +237,18 @@ func (p *PaddleOCRModel) OCRFile(modelName *string, content []byte, fileURL *str
 		} else if state == "failed" {
 			return nil, fmt.Errorf("ocr job failed on server: %s", pollData.Data.ErrorMsg)
 		}
+
+		// Exponential backoff
+		pollInterval = time.Duration(float64(pollInterval) * pollMultiplier)
+		if pollInterval > maxPollInterval {
+			pollInterval = maxPollInterval
+		}
+
+		select {
+		case <-time.After(pollInterval):
+		case <-ctx.Done():
+			return nil, ctx.Err()
+		}
 	}

 	if jsonlUrl == "" {
--- a/rag/llm/ocr_model.py
+++ b/rag/llm/ocr_model.py
@@ -117,7 +117,7 @@ class PaddleOCROcrModel(Base, PaddleOCRParser):
            # lower-case keys (UI), upper-case PADDLEOCR_* (env auto-provision), env vars
            return config.get(key, config.get(env_key, os.environ.get(env_key, default)))

-        self.paddleocr_api_url = _resolve_config("paddleocr_api_url", "PADDLEOCR_API_URL", "")
+        self.paddleocr_base_url = _resolve_config("paddleocr_base_url", "PADDLEOCR_BASE_URL", "") or _resolve_config("paddleocr_api_url", "PADDLEOCR_API_URL", "")
        self.paddleocr_algorithm = _resolve_config("paddleocr_algorithm", "PADDLEOCR_ALGORITHM", "PaddleOCR-VL")
        self.paddleocr_access_token = _resolve_config("paddleocr_access_token", "PADDLEOCR_ACCESS_TOKEN", None)

@@ -132,7 +132,7 @@ class PaddleOCROcrModel(Base, PaddleOCRParser):

        PaddleOCRParser.__init__(
            self,
-            api_url=self.paddleocr_api_url,
+            base_url=self.paddleocr_base_url or None,
            access_token=self.paddleocr_access_token,
            algorithm=self.paddleocr_algorithm,
        )
--- a/web/src/components/paddleocr-options-form-field.tsx
+++ b/web/src/components/paddleocr-options-form-field.tsx
@@ -6,7 +6,7 @@ import { buildOptions } from '@/utils/form';
 import { useFormContext, useWatch } from 'react-hook-form';
 import { useTranslation } from 'react-i18next';

-const algorithmOptions = buildOptions(['PaddleOCR-VL']);
+const algorithmOptions = buildOptions(['PaddleOCR-VL-1.6', 'PaddleOCR-VL-1.5', 'PaddleOCR-VL', 'PP-OCRv6', 'PP-OCRv5', 'PP-StructureV3']);

 export function PaddleOCROptionsFormField({
  namePrefix = 'parser_config',
--- a/web/src/pages/user-setting/setting-model/modal/provider-modal/field-config/provider-config-map.ts
+++ b/web/src/pages/user-setting/setting-model/modal/provider-modal/field-config/provider-config-map.ts
@@ -746,8 +746,10 @@ export const ProviderConfigMap: Record<string, ProviderConfig> = {
        defaultValue: 'PaddleOCR-VL',
        placeholder: 'paddleocrSelectAlgorithm',
        options: [
+          { label: 'PaddleOCR-VL-1.6', value: 'PaddleOCR-VL-1.6' },
          { label: 'PaddleOCR-VL-1.5', value: 'PaddleOCR-VL-1.5' },
          { label: 'PaddleOCR-VL', value: 'PaddleOCR-VL' },
+          { label: 'PP-OCRv6', value: 'PP-OCRv6' },
          { label: 'PP-OCRv5', value: 'PP-OCRv5' },
          { label: 'PP-StructureV3', value: 'PP-StructureV3' },
        ],