Files
ragflow/internal/entity/models/paddleocr.go
Rander 1235da7093 refactor(paddleocr): migrate from sync API to async Job API (#15967)
## Summary

Migrate PaddleOCR integration from the deprecated synchronous HTTP API
to the new asynchronous Job API (`submit → poll → fetch`), aligning with
PaddleOCR 3.6.0+ architecture.

## Changes

### Python (`deepdoc/parser/paddleocr_parser.py`)
- Replace synchronous `requests.post()` with async Job API flow (submit
→ poll → fetch)
- Authentication: `token {token}` → `Bearer {token}`
- File transfer: base64 JSON body → multipart file upload
- Polling: exponential backoff (initial 3s, ×1.5, max 15s, timeout
controlled by `request_timeout`)
- Result: fetch full JSONL from result URL, preserving `prunedResult`
with bbox info for crop functionality
- Rename `api_url` → `base_url` (backward compatible: `api_url` still
accepted as fallback)

### Python (`rag/llm/ocr_model.py`)
- Prefer `paddleocr_base_url` / `PADDLEOCR_BASE_URL`, fallback to
`paddleocr_api_url` / `PADDLEOCR_API_URL`

### Go (`internal/entity/models/paddleocr.go`)
- Add `Client-Platform: ragflow` header to submit and poll requests
- Change polling from fixed 3s to exponential backoff (initial 3s, ×1.5,
max 15s)

### Python (`common/constants.py`)
- Add `PADDLEOCR_BASE_URL` to env keys and default config

## Backward Compatibility

- Old env var `PADDLEOCR_API_URL` still works (used as fallback)
- Frontend field `paddleocr_api_url` still works (backend reads it as
fallback)
- No user-facing configuration changes required for existing setups

## Why not use the `paddleocr` SDK package directly?

RAGFlow's `_transfer_to_sections()` relies on `prunedResult` (containing
`block_bbox`, `block_label`, `parsing_res_list`) from the raw API
response for PDF crop functionality. The SDK's public `parse_document()`
API only returns `DocParsingResult` with `markdown_text`, discarding the
bbox data. Therefore we implement the async Job API flow directly via
HTTP, following the same logic as the SDK internally.
2026-06-16 19:34:21 +08:00

326 lines
9.8 KiB
Go

//
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package models
import (
"bufio"
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"mime/multipart"
"net/http"
"strings"
"time"
)
// PaddleOCRModel is the model driver for the PaddleOCR service.
// OCRFile is the operation it actually implements; the chat, embedding,
// rerank, audio and task methods defined on it return a "no such method" error.
type PaddleOCRModel struct {
// baseModel carries the base URL map, URL suffixes and HTTP client that
// NewPaddleOCRModel configures for this driver.
baseModel BaseModel
}
// NewPaddleOCRModel builds a PaddleOCR driver from the given base URL map and
// URL suffixes. The driver is created with AllowEmptyAPIKey set to true and a
// fresh client from NewDriverHTTPClient.
func NewPaddleOCRModel(baseURL map[string]string, urlSuffix URLSuffix) *PaddleOCRModel {
	model := new(PaddleOCRModel)
	model.baseModel = BaseModel{
		BaseURL:          baseURL,
		URLSuffix:        urlSuffix,
		AllowEmptyAPIKey: true,
		httpClient:       NewDriverHTTPClient(),
	}
	return model
}
// NewInstance returns a new PaddleOCR driver bound to baseURL that reuses the
// URL suffixes of the receiver.
func (p PaddleOCRModel) NewInstance(baseURL map[string]string) ModelDriver {
	suffix := p.baseModel.URLSuffix
	instance := NewPaddleOCRModel(baseURL, suffix)
	return instance
}
// Name returns the identifier of this driver, which is also used as the prefix
// of the "no such method" errors produced by the unsupported operations.
func (p *PaddleOCRModel) Name() string {
	const driverName = "paddle_ocr.net"
	return driverName
}
// ChatWithMessages is not supported by the PaddleOCR driver. It ignores its
// arguments and always returns a nil response with a "no such method" error.
func (p *PaddleOCRModel) ChatWithMessages(modelName string, messages []Message, apiConfig *APIConfig, chatModelConfig *ChatConfig) (*ChatResponse, error) {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return nil, unsupported
}
// ChatStreamlyWithSender is not supported by the PaddleOCR driver. The sender
// is never invoked; the call always returns a "no such method" error.
func (p *PaddleOCRModel) ChatStreamlyWithSender(modelName string, messages []Message, apiConfig *APIConfig, modelConfig *ChatConfig, sender func(*string, *string) error) error {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return unsupported
}
// Embed is not supported by the PaddleOCR driver. It always returns nil
// embeddings together with a "no such method" error.
func (p *PaddleOCRModel) Embed(modelName *string, texts []string, apiConfig *APIConfig, embeddingConfig *EmbeddingConfig) ([]EmbeddingData, error) {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return nil, unsupported
}
// Rerank is not supported by the PaddleOCR driver. It always returns a nil
// response together with a "no such method" error.
func (p *PaddleOCRModel) Rerank(modelName *string, query string, documents []string, apiConfig *APIConfig, rerankConfig *RerankConfig) (*RerankResponse, error) {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return nil, unsupported
}
// TranscribeAudio is not supported by the PaddleOCR driver. It always returns
// a nil response together with a "no such method" error.
func (p *PaddleOCRModel) TranscribeAudio(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig) (*ASRResponse, error) {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return nil, unsupported
}
// TranscribeAudioWithSender is not supported by the PaddleOCR driver. The
// sender is never invoked; the call always returns a "no such method" error.
func (p *PaddleOCRModel) TranscribeAudioWithSender(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig, sender func(*string, *string) error) error {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return unsupported
}
// AudioSpeech is not supported by the PaddleOCR driver. It always returns a
// nil response together with a "no such method" error.
func (p *PaddleOCRModel) AudioSpeech(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig) (*TTSResponse, error) {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return nil, unsupported
}
// AudioSpeechWithSender is not supported by the PaddleOCR driver. The sender
// is never invoked; the call always returns a "no such method" error.
func (p *PaddleOCRModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return unsupported
}
// paddleSubmitData is the "data" object of a job-submit response.
type paddleSubmitData struct {
	// JobId is the identifier OCRFile appends to the submit URL when polling.
	JobId string `json:"jobId"`
}

// paddleSubmitResponse is the JSON body decoded from the job-submit response.
type paddleSubmitResponse struct {
	Data paddleSubmitData `json:"data"`
}
// paddleResultURL is the "resultUrl" object of a poll response.
type paddleResultURL struct {
	// JsonUrl is the location OCRFile downloads the JSONL result from.
	JsonUrl string `json:"jsonUrl"`
}

// paddlePollData is the "data" object of a poll response.
type paddlePollData struct {
	// State is the job state; OCRFile stops polling on "done" or "failed".
	State string `json:"state"`
	// ErrorMsg is reported to the caller when State is "failed".
	ErrorMsg  string          `json:"errorMsg"`
	ResultUrl paddleResultURL `json:"resultUrl"`
}

// paddlePollResponse is the JSON body decoded from a job-status poll response.
type paddlePollResponse struct {
	Data paddlePollData `json:"data"`
}
// paddleMarkdown is the "markdown" object of one layout parsing result.
type paddleMarkdown struct {
	// Text is the Markdown fragment OCRFile appends to its output.
	Text string `json:"text"`
}

// paddleLayoutParsingResult is one entry of "layoutParsingResults".
type paddleLayoutParsingResult struct {
	Markdown paddleMarkdown `json:"markdown"`
}

// paddleJsonlResult is the "result" object of one JSONL line.
type paddleJsonlResult struct {
	LayoutParsingResults []paddleLayoutParsingResult `json:"layoutParsingResults"`
}

// paddleJsonlLine is the subset of one line of the downloaded JSONL result
// that OCRFile reads; any other fields on the line are ignored when decoding.
type paddleJsonlLine struct {
	Result paddleJsonlResult `json:"result"`
}
// OCRFile submits a document to the PaddleOCR asynchronous Job API, waits for
// the job to finish and returns the recognised text as Markdown.
//
// The operation has three phases, all bounded by one longOpCallTimeout
// deadline:
//
//  1. Submit: POST to "<base URL>/<OCR suffix>". When fileURL starts with
//     "http" the job is described by a JSON body referencing that URL;
//     otherwise content is uploaded as the multipart form file "document.pdf".
//  2. Poll: GET "<submit URL>/<jobId>" until the reported state is "done" or
//     "failed". The wait between polls starts at 3s and grows by x1.5 up to
//     a maximum of 15s.
//  3. Fetch: download the JSONL document named by resultUrl.jsonUrl and join
//     every layoutParsingResults[].markdown.text with blank lines.
//
// Parameters:
//   - modelName: model identifier sent with the submit request; must not be nil.
//   - content: raw file bytes, uploaded when fileURL is not an http(s) URL.
//   - fileURL: optional remote document URL; it takes precedence over content
//     when it starts with "http".
//   - apiConfig: validated with APIConfigCheck and used to resolve the base
//     URL and the Authorization header.
//   - ocrConfig: currently unused.
//
// It returns the trimmed Markdown text, or an error when the inputs are
// invalid, any HTTP step fails or answers with a non-200 status, the server
// reports the job as failed, or the deadline expires while polling.
func (p *PaddleOCRModel) OCRFile(modelName *string, content []byte, fileURL *string, apiConfig *APIConfig, ocrConfig *OCRConfig) (*OCRFileResponse, error) {
	if err := p.baseModel.APIConfigCheck(apiConfig); err != nil {
		return nil, err
	}
	if len(content) == 0 && (fileURL == nil || *fileURL == "") {
		return nil, fmt.Errorf("content and fileURL cannot be both empty")
	}
	if modelName == nil {
		return nil, fmt.Errorf("model name cannot be nil")
	}
	useRemoteFile := fileURL != nil && strings.HasPrefix(*fileURL, "http")
	if !useRemoteFile && len(content) == 0 {
		// fileURL is non-empty here (checked above) but cannot be handed to the
		// service, and there are no bytes to upload in its place.
		return nil, fmt.Errorf("fileURL %q is not an http(s) URL and content is empty", *fileURL)
	}

	resolvedBaseURL, err := p.baseModel.GetBaseURL(apiConfig)
	if err != nil {
		return nil, err
	}
	jobsURL := fmt.Sprintf("%s/%s", resolvedBaseURL, p.baseModel.URLSuffix.OCR)

	optionalPayload := map[string]bool{
		"useDocOrientationClassify": false,
		"useDocUnwarping":           false,
		"useChartRecognition":       false,
	}

	// One generous deadline bounds the whole OCR operation (submit + poll +
	// result download), so the poll loop below cannot spin forever.
	ctx, cancel := context.WithTimeout(context.Background(), longOpCallTimeout)
	defer cancel()

	// --- Submit -------------------------------------------------------------
	var (
		submitBody  io.Reader
		contentType string
	)
	if useRemoteFile {
		jsonData, err := json.Marshal(map[string]interface{}{
			"fileUrl":         *fileURL,
			"model":           *modelName,
			"optionalPayload": optionalPayload,
		})
		if err != nil {
			return nil, fmt.Errorf("failed to marshal json: %w", err)
		}
		submitBody = bytes.NewBuffer(jsonData)
		contentType = "application/json"
	} else {
		// In the multipart form the optional payload travels as a JSON string.
		optBytes, err := json.Marshal(optionalPayload)
		if err != nil {
			return nil, fmt.Errorf("failed to marshal optional payload: %w", err)
		}
		form := &bytes.Buffer{}
		writer := multipart.NewWriter(form)
		if err := writer.WriteField("model", *modelName); err != nil {
			return nil, fmt.Errorf("failed to write form field model: %w", err)
		}
		if err := writer.WriteField("optionalPayload", string(optBytes)); err != nil {
			return nil, fmt.Errorf("failed to write form field optionalPayload: %w", err)
		}
		part, err := writer.CreateFormFile("file", "document.pdf")
		if err != nil {
			return nil, fmt.Errorf("failed to create form file: %w", err)
		}
		if _, err := part.Write(content); err != nil {
			return nil, fmt.Errorf("failed to write form file: %w", err)
		}
		// Close writes the terminating boundary; without it the body is invalid.
		if err := writer.Close(); err != nil {
			return nil, fmt.Errorf("failed to finalize multipart body: %w", err)
		}
		submitBody = form
		contentType = writer.FormDataContentType()
	}

	req, err := http.NewRequestWithContext(ctx, "POST", jobsURL, submitBody)
	if err != nil {
		// Must be checked before touching req: on error req is nil.
		return nil, fmt.Errorf("failed to create submit request: %w", err)
	}
	req.Header.Set("Content-Type", contentType)
	if auth := BearerAuth(apiConfig); auth != "" {
		req.Header.Set("Authorization", auth)
	}
	req.Header.Set("Client-Platform", "ragflow")

	resp, err := p.baseModel.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to submit job: %w", err)
	}
	defer resp.Body.Close()
	respBody, readErr := io.ReadAll(resp.Body)
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("submit job failed: %s", string(respBody))
	}
	if readErr != nil {
		return nil, fmt.Errorf("failed to read submit response: %w", readErr)
	}
	var submitResp paddleSubmitResponse
	if err := json.Unmarshal(respBody, &submitResp); err != nil {
		return nil, fmt.Errorf("failed to parse submit response: %w", err)
	}
	jobID := submitResp.Data.JobId
	if jobID == "" {
		return nil, fmt.Errorf("failed to get jobId from response")
	}

	// --- Poll ---------------------------------------------------------------
	const (
		initialPollInterval = 3 * time.Second
		maxPollInterval     = 15 * time.Second
		pollMultiplier      = 1.5
	)
	pollURL := fmt.Sprintf("%s/%s", jobsURL, jobID)
	pollInterval := initialPollInterval
	var jsonlURL string
	for {
		pollReq, err := http.NewRequestWithContext(ctx, "GET", pollURL, nil)
		if err != nil {
			return nil, fmt.Errorf("failed to create poll request: %w", err)
		}
		if auth := BearerAuth(apiConfig); auth != "" {
			pollReq.Header.Set("Authorization", auth)
		}
		pollReq.Header.Set("Client-Platform", "ragflow")

		pollResp, err := p.baseModel.httpClient.Do(pollReq)
		if err != nil {
			return nil, fmt.Errorf("failed to poll job status: %w", err)
		}
		// Closed explicitly rather than deferred: a defer inside the loop would
		// keep every poll response open until the function returns.
		pollBody, readErr := io.ReadAll(pollResp.Body)
		pollResp.Body.Close()
		if pollResp.StatusCode != http.StatusOK {
			return nil, fmt.Errorf("poll job failed: %s", string(pollBody))
		}
		if readErr != nil {
			return nil, fmt.Errorf("failed to read poll response: %w", readErr)
		}
		var pollData paddlePollResponse
		if err := json.Unmarshal(pollBody, &pollData); err != nil {
			return nil, fmt.Errorf("failed to parse poll response: %w", err)
		}

		// Only "done" and "failed" are terminal; any other state keeps polling.
		state := pollData.Data.State
		if state == "done" {
			jsonlURL = pollData.Data.ResultUrl.JsonUrl
			break
		}
		if state == "failed" {
			return nil, fmt.Errorf("ocr job failed on server: %s", pollData.Data.ErrorMsg)
		}

		// Wait for the current interval first, then grow it, so the first wait
		// really is the initial interval.
		timer := time.NewTimer(pollInterval)
		select {
		case <-timer.C:
		case <-ctx.Done():
			timer.Stop()
			return nil, ctx.Err()
		}
		pollInterval = time.Duration(float64(pollInterval) * pollMultiplier)
		if pollInterval > maxPollInterval {
			pollInterval = maxPollInterval
		}
	}
	if jsonlURL == "" {
		return nil, fmt.Errorf("job done but jsonl url is empty")
	}

	// --- Fetch --------------------------------------------------------------
	// The result URL is requested without the Authorization or Client-Platform
	// headers. NOTE(review): presumably it is a pre-signed link — confirm.
	resReq, err := http.NewRequestWithContext(ctx, "GET", jsonlURL, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request for jsonl: %w", err)
	}
	resResp, err := p.baseModel.httpClient.Do(resReq)
	if err != nil {
		return nil, fmt.Errorf("failed to download jsonl result: %w", err)
	}
	defer resResp.Body.Close()
	if resResp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("failed to download jsonl, status: %d", resResp.StatusCode)
	}

	var fullMarkdown strings.Builder
	scanner := bufio.NewScanner(resResp.Body)
	// Allow JSONL lines of up to 1 MiB; a longer line surfaces through
	// scanner.Err() below.
	scanner.Buffer(make([]byte, 64*1024), 1024*1024)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			continue
		}
		var lineData paddleJsonlLine
		if err := json.Unmarshal([]byte(line), &lineData); err != nil {
			// Best effort: a line that is not valid JSON is skipped rather than
			// failing the whole document.
			continue
		}
		for _, layoutRes := range lineData.Result.LayoutParsingResults {
			fullMarkdown.WriteString(layoutRes.Markdown.Text)
			fullMarkdown.WriteString("\n\n")
		}
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error reading jsonl: %w", err)
	}

	extractedText := strings.TrimSpace(fullMarkdown.String())
	return &OCRFileResponse{Text: &extractedText}, nil
}
// ParseFile is not supported by the PaddleOCR driver. It always returns a nil
// response together with a "no such method" error.
func (p *PaddleOCRModel) ParseFile(modelName *string, content []byte, url *string, apiConfig *APIConfig, parseFileConfig *ParseFileConfig) (*ParseFileResponse, error) {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return nil, unsupported
}
// ListModels is not supported by the PaddleOCR driver. It always returns a nil
// list together with a "no such method" error.
func (p *PaddleOCRModel) ListModels(apiConfig *APIConfig) ([]ListModelResponse, error) {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return nil, unsupported
}
// Balance is not supported by the PaddleOCR driver. It always returns a nil
// map together with a "no such method" error.
func (p *PaddleOCRModel) Balance(apiConfig *APIConfig) (map[string]interface{}, error) {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return nil, unsupported
}
// CheckConnection is not supported by the PaddleOCR driver. No request is
// made; the call always returns a "no such method" error.
func (p *PaddleOCRModel) CheckConnection(apiConfig *APIConfig) error {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return unsupported
}
// ListTasks is not supported by the PaddleOCR driver. It always returns a nil
// list together with a "no such method" error.
func (p *PaddleOCRModel) ListTasks(apiConfig *APIConfig) ([]ListTaskStatus, error) {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return nil, unsupported
}
// ShowTask is not supported by the PaddleOCR driver. It always returns a nil
// response together with a "no such method" error.
func (p *PaddleOCRModel) ShowTask(taskID string, apiConfig *APIConfig) (*TaskResponse, error) {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return nil, unsupported
}