mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-04 18:45:38 +08:00
## Summary
Migrate PaddleOCR integration from the deprecated synchronous HTTP API
to the new asynchronous Job API (`submit → poll → fetch`), aligning with
PaddleOCR 3.6.0+ architecture.
## Changes
### Python (`deepdoc/parser/paddleocr_parser.py`)
- Replace synchronous `requests.post()` with async Job API flow (submit
→ poll → fetch)
- Authentication: `token {token}` → `Bearer {token}`
- File transfer: base64 JSON body → multipart file upload
- Polling: exponential backoff (initial 3s, ×1.5, max 15s, timeout
controlled by `request_timeout`)
- Result: fetch full JSONL from result URL, preserving `prunedResult`
with bbox info for crop functionality
- Rename `api_url` → `base_url` (backward compatible: `api_url` still
accepted as fallback)
### Python (`rag/llm/ocr_model.py`)
- Prefer `paddleocr_base_url` / `PADDLEOCR_BASE_URL`, fallback to
`paddleocr_api_url` / `PADDLEOCR_API_URL`
### Go (`internal/entity/models/paddleocr.go`)
- Add `Client-Platform: ragflow` header to submit and poll requests
- Change polling from fixed 3s to exponential backoff (initial 3s, ×1.5,
max 15s)
### Python (`common/constants.py`)
- Add `PADDLEOCR_BASE_URL` to env keys and default config
## Backward Compatibility
- Old env var `PADDLEOCR_API_URL` still works (used as fallback)
- Frontend field `paddleocr_api_url` still works (backend reads it as
fallback)
- No user-facing configuration changes required for existing setups
## Why not use the `paddleocr` SDK package directly?
RAGFlow's `_transfer_to_sections()` relies on `prunedResult` (containing
`block_bbox`, `block_label`, `parsing_res_list`) from the raw API
response for PDF crop functionality. The SDK's public `parse_document()`
API only returns `DocParsingResult` with `markdown_text`, discarding the
bbox data. Therefore we implement the async Job API flow directly via
HTTP, following the same logic as the SDK internally.
326 lines
9.8 KiB
Go
326 lines
9.8 KiB
Go
//
|
|
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
|
|
package models
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"mime/multipart"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
type PaddleOCRModel struct {
|
|
baseModel BaseModel
|
|
}
|
|
|
|
func NewPaddleOCRModel(baseURL map[string]string, urlSuffix URLSuffix) *PaddleOCRModel {
|
|
return &PaddleOCRModel{
|
|
baseModel: BaseModel{
|
|
BaseURL: baseURL,
|
|
URLSuffix: urlSuffix,
|
|
AllowEmptyAPIKey: true,
|
|
httpClient: NewDriverHTTPClient(),
|
|
},
|
|
}
|
|
}
|
|
|
|
func (p PaddleOCRModel) NewInstance(baseURL map[string]string) ModelDriver {
|
|
return NewPaddleOCRModel(baseURL, p.baseModel.URLSuffix)
|
|
}
|
|
|
|
func (p *PaddleOCRModel) Name() string {
|
|
return "paddle_ocr.net"
|
|
}
|
|
|
|
func (p *PaddleOCRModel) ChatWithMessages(modelName string, messages []Message, apiConfig *APIConfig, chatModelConfig *ChatConfig) (*ChatResponse, error) {
|
|
return nil, fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) ChatStreamlyWithSender(modelName string, messages []Message, apiConfig *APIConfig, modelConfig *ChatConfig, sender func(*string, *string) error) error {
|
|
return fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) Embed(modelName *string, texts []string, apiConfig *APIConfig, embeddingConfig *EmbeddingConfig) ([]EmbeddingData, error) {
|
|
return nil, fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) Rerank(modelName *string, query string, documents []string, apiConfig *APIConfig, rerankConfig *RerankConfig) (*RerankResponse, error) {
|
|
return nil, fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) TranscribeAudio(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig) (*ASRResponse, error) {
|
|
return nil, fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) TranscribeAudioWithSender(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig, sender func(*string, *string) error) error {
|
|
return fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) AudioSpeech(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig) (*TTSResponse, error) {
|
|
return nil, fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error {
|
|
return fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
type paddleSubmitResponse struct {
|
|
Data struct {
|
|
JobId string `json:"jobId"`
|
|
} `json:"data"`
|
|
}
|
|
|
|
type paddlePollResponse struct {
|
|
Data struct {
|
|
State string `json:"state"`
|
|
ErrorMsg string `json:"errorMsg"`
|
|
ResultUrl struct {
|
|
JsonUrl string `json:"jsonUrl"`
|
|
} `json:"resultUrl"`
|
|
} `json:"data"`
|
|
}
|
|
|
|
type paddleJsonlLine struct {
|
|
Result struct {
|
|
LayoutParsingResults []struct {
|
|
Markdown struct {
|
|
Text string `json:"text"`
|
|
} `json:"markdown"`
|
|
} `json:"layoutParsingResults"`
|
|
} `json:"result"`
|
|
}
|
|
|
|
func (p *PaddleOCRModel) OCRFile(modelName *string, content []byte, fileURL *string, apiConfig *APIConfig, ocrConfig *OCRConfig) (*OCRFileResponse, error) {
|
|
if err := p.baseModel.APIConfigCheck(apiConfig); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if (content == nil || len(content) == 0) && (fileURL == nil || *fileURL == "") {
|
|
return nil, fmt.Errorf("content and fileURL cannot be both empty")
|
|
}
|
|
|
|
resolvedBaseURL, err := p.baseModel.GetBaseURL(apiConfig)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
url := fmt.Sprintf("%s/%s", resolvedBaseURL, p.baseModel.URLSuffix.OCR)
|
|
|
|
optionalPayload := map[string]bool{
|
|
"useDocOrientationClassify": false,
|
|
"useDocUnwarping": false,
|
|
"useChartRecognition": false,
|
|
}
|
|
optBytes, _ := json.Marshal(optionalPayload)
|
|
|
|
// One generous deadline bounds the whole OCR operation (submit + poll +
|
|
// result download), so the poll loop below can no longer spin forever.
|
|
ctx, cancel := context.WithTimeout(context.Background(), longOpCallTimeout)
|
|
defer cancel()
|
|
|
|
var req *http.Request
|
|
|
|
if fileURL != nil && strings.HasPrefix(*fileURL, "http") {
|
|
reqData := map[string]interface{}{
|
|
"fileUrl": *fileURL,
|
|
"model": *modelName,
|
|
"optionalPayload": optionalPayload,
|
|
}
|
|
jsonData, err := json.Marshal(reqData)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to marshal json: %w", err)
|
|
}
|
|
req, err = http.NewRequestWithContext(ctx, "POST", url, bytes.NewBuffer(jsonData))
|
|
req.Header.Set("Content-Type", "application/json")
|
|
} else {
|
|
body := &bytes.Buffer{}
|
|
writer := multipart.NewWriter(body)
|
|
|
|
_ = writer.WriteField("model", *modelName)
|
|
_ = writer.WriteField("optionalPayload", string(optBytes))
|
|
|
|
part, err := writer.CreateFormFile("file", "document.pdf")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create form file: %w", err)
|
|
}
|
|
part.Write(content)
|
|
writer.Close()
|
|
|
|
req, err = http.NewRequestWithContext(ctx, "POST", url, body)
|
|
req.Header.Set("Content-Type", writer.FormDataContentType())
|
|
}
|
|
|
|
if auth := BearerAuth(apiConfig); auth != "" {
|
|
req.Header.Set("Authorization", auth)
|
|
}
|
|
req.Header.Set("Client-Platform", "ragflow")
|
|
|
|
resp, err := p.baseModel.httpClient.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to submit job: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
respBody, _ := io.ReadAll(resp.Body)
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("submit job failed: %s", string(respBody))
|
|
}
|
|
|
|
var submitResp paddleSubmitResponse
|
|
if err := json.Unmarshal(respBody, &submitResp); err != nil {
|
|
return nil, fmt.Errorf("failed to parse submit response: %w", err)
|
|
}
|
|
|
|
jobId := submitResp.Data.JobId
|
|
if jobId == "" {
|
|
return nil, fmt.Errorf("failed to get jobId from response")
|
|
}
|
|
|
|
pollUrl := fmt.Sprintf("%s/%s", url, jobId)
|
|
var jsonlUrl string
|
|
|
|
pollInterval := 3 * time.Second
|
|
const pollMultiplier = 1.5
|
|
maxPollInterval := 15 * time.Second
|
|
|
|
for {
|
|
pollReq, err := http.NewRequestWithContext(ctx, "GET", pollUrl, nil)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create poll request: %w", err)
|
|
}
|
|
if auth := BearerAuth(apiConfig); auth != "" {
|
|
pollReq.Header.Set("Authorization", auth)
|
|
}
|
|
pollReq.Header.Set("Client-Platform", "ragflow")
|
|
|
|
pollResp, err := p.baseModel.httpClient.Do(pollReq)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to poll job status: %w", err)
|
|
}
|
|
|
|
pollBody, _ := io.ReadAll(pollResp.Body)
|
|
pollResp.Body.Close()
|
|
|
|
if pollResp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("poll job failed: %s", string(pollBody))
|
|
}
|
|
|
|
var pollData paddlePollResponse
|
|
if err = json.Unmarshal(pollBody, &pollData); err != nil {
|
|
return nil, fmt.Errorf("failed to parse poll response: %w", err)
|
|
}
|
|
|
|
// end if 'done' or 'failed'
|
|
state := pollData.Data.State
|
|
if state == "done" {
|
|
jsonlUrl = pollData.Data.ResultUrl.JsonUrl
|
|
break
|
|
} else if state == "failed" {
|
|
return nil, fmt.Errorf("ocr job failed on server: %s", pollData.Data.ErrorMsg)
|
|
}
|
|
|
|
// Exponential backoff
|
|
pollInterval = time.Duration(float64(pollInterval) * pollMultiplier)
|
|
if pollInterval > maxPollInterval {
|
|
pollInterval = maxPollInterval
|
|
}
|
|
|
|
select {
|
|
case <-time.After(pollInterval):
|
|
case <-ctx.Done():
|
|
return nil, ctx.Err()
|
|
}
|
|
}
|
|
|
|
if jsonlUrl == "" {
|
|
return nil, fmt.Errorf("job done but jsonl url is empty")
|
|
}
|
|
|
|
resReq, err := http.NewRequestWithContext(ctx, "GET", jsonlUrl, nil)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create request for jsonl: %w", err)
|
|
}
|
|
|
|
resResp, err := p.baseModel.httpClient.Do(resReq)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to download jsonl result: %w", err)
|
|
}
|
|
defer resResp.Body.Close()
|
|
|
|
if resResp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("failed to download jsonl, status: %d", resResp.StatusCode)
|
|
}
|
|
|
|
var fullMarkdown strings.Builder
|
|
scanner := bufio.NewScanner(resResp.Body)
|
|
scanner.Buffer(make([]byte, 64*1024), 1024*1024)
|
|
|
|
for scanner.Scan() {
|
|
line := strings.TrimSpace(scanner.Text())
|
|
if line == "" {
|
|
continue
|
|
}
|
|
|
|
var lineData paddleJsonlLine
|
|
if err := json.Unmarshal([]byte(line), &lineData); err != nil {
|
|
continue
|
|
}
|
|
|
|
for _, layoutRes := range lineData.Result.LayoutParsingResults {
|
|
fullMarkdown.WriteString(layoutRes.Markdown.Text)
|
|
fullMarkdown.WriteString("\n\n")
|
|
}
|
|
}
|
|
|
|
if err = scanner.Err(); err != nil {
|
|
return nil, fmt.Errorf("error reading jsonl: %w", err)
|
|
}
|
|
|
|
extractedText := strings.TrimSpace(fullMarkdown.String())
|
|
|
|
return &OCRFileResponse{Text: &extractedText}, nil
|
|
}
|
|
|
|
func (p *PaddleOCRModel) ParseFile(modelName *string, content []byte, url *string, apiConfig *APIConfig, parseFileConfig *ParseFileConfig) (*ParseFileResponse, error) {
|
|
return nil, fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) ListModels(apiConfig *APIConfig) ([]ListModelResponse, error) {
|
|
return nil, fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) Balance(apiConfig *APIConfig) (map[string]interface{}, error) {
|
|
return nil, fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) CheckConnection(apiConfig *APIConfig) error {
|
|
return fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) ListTasks(apiConfig *APIConfig) ([]ListTaskStatus, error) {
|
|
return nil, fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) ShowTask(taskID string, apiConfig *APIConfig) (*TaskResponse, error) {
|
|
return nil, fmt.Errorf("%s, no such method", p.Name())
|
|
}
|