Files
ragflow/internal/entity/models/paddleocr.go
Rander 017adf841f fix(paddleocr): support PP-OCRv6 ocrResults fallback and integrate image parsing (#16150)
## Summary

This PR fixes two issues discovered during testing of the PaddleOCR
async API refactoring:

### 1. PP-OCRv6 returns `ocrResults` instead of `layoutParsingResults`

Models like PP-OCRv6 are pure text recognition models that return
results in `ocrResults.prunedResult.rec_texts` format rather than the
`layoutParsingResults.prunedResult.parsing_res_list` format used by
layout-aware models (PaddleOCR-VL series).

**Changes:**
- `deepdoc/parser/paddleocr_parser.py`: Extract `ocrResults` alongside
`layoutParsingResults` in `_send_request()`, add fallback logic in
`_transfer_to_sections()` and `parse_image()`
- `internal/entity/models/paddleocr.go`: Add `ocrResults` struct and
fallback extraction in Go OCR handler

### 2. Image parsing not integrated into picture chunker

The `parse_image()` method existed in PaddleOCRParser but was never
called from `rag/app/picture.py` (the module that handles image file
uploads). Users configuring PaddleOCR as their layout recognizer would
still get local deepdoc OCR for images.

**Changes:**
- `rag/app/picture.py`: When `layout_recognize` is set to PaddleOCR, use
`PaddleOCROcrModel.parse_image()` instead of local OCR. Falls back
gracefully to local OCR on failure.

## Testing

Verified end-to-end in Docker:
- PaddleOCR-VL-1.6 PDF parsing: passed (10 text blocks with bbox)
- PaddleOCR-VL-1.6 image parsing: passed (219 chars)
- PP-OCRv6 PDF parsing with ocrResults fallback: passed (10 text blocks)
- PP-OCRv6 image parsing with ocrResults fallback: passed (136 chars)

## Related PRs

- #15967 (merged) - PaddleOCR async Job API refactoring + new models
- #16086 (merged) - PaddleOCR image parsing support
2026-06-23 22:02:54 +08:00

344 lines
10 KiB
Go

//
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package models
import (
"bufio"
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"mime/multipart"
"net/http"
"strings"
"time"
)
// PaddleOCRModel is the driver type for the PaddleOCR job API. In this file
// only OCRFile does real work; every other driver method returns a
// "no such method" error.
type PaddleOCRModel struct {
// baseModel holds the base URL map, URL suffixes, API-key policy and HTTP
// client that NewPaddleOCRModel fills in.
baseModel BaseModel
}
// NewPaddleOCRModel builds a PaddleOCR driver for the given base URL map and
// URL suffixes. The driver accepts an empty API key and gets its own HTTP
// client from NewDriverHTTPClient.
func NewPaddleOCRModel(baseURL map[string]string, urlSuffix URLSuffix) *PaddleOCRModel {
	model := &PaddleOCRModel{}
	model.baseModel = BaseModel{
		BaseURL:          baseURL,
		URLSuffix:        urlSuffix,
		AllowEmptyAPIKey: true,
		httpClient:       NewDriverHTTPClient(),
	}
	return model
}
// NewInstance returns a fresh PaddleOCR driver that targets baseURL and reuses
// the receiver's URL suffixes.
//
// Note that this is the only method declared on a value receiver; the
// receiver is read, never modified.
func (p PaddleOCRModel) NewInstance(baseURL map[string]string) ModelDriver {
	suffix := p.baseModel.URLSuffix
	return NewPaddleOCRModel(baseURL, suffix)
}
// Name returns the fixed identifier of this driver. The same string prefixes
// the "no such method" errors produced by the unsupported methods.
func (p *PaddleOCRModel) Name() string {
	const driverName = "paddle_ocr.net"
	return driverName
}
// ChatWithMessages is not supported by the PaddleOCR driver. It ignores its
// arguments and always returns a nil response with a "no such method" error.
func (p *PaddleOCRModel) ChatWithMessages(modelName string, messages []Message, apiConfig *APIConfig, chatModelConfig *ChatConfig) (*ChatResponse, error) {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return nil, unsupported
}
// ChatStreamlyWithSender is not supported by the PaddleOCR driver. The sender
// callback is never invoked; a "no such method" error is always returned.
func (p *PaddleOCRModel) ChatStreamlyWithSender(modelName string, messages []Message, apiConfig *APIConfig, modelConfig *ChatConfig, sender func(*string, *string) error) error {
	driver := p.Name()
	return fmt.Errorf("%s, no such method", driver)
}
// Embed is not supported by the PaddleOCR driver. It always returns nil
// embeddings with a "no such method" error.
func (p *PaddleOCRModel) Embed(modelName *string, texts []string, apiConfig *APIConfig, embeddingConfig *EmbeddingConfig) ([]EmbeddingData, error) {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return nil, unsupported
}
// Rerank is not supported by the PaddleOCR driver. It always returns a nil
// response with a "no such method" error.
func (p *PaddleOCRModel) Rerank(modelName *string, query string, documents []string, apiConfig *APIConfig, rerankConfig *RerankConfig) (*RerankResponse, error) {
	driver := p.Name()
	return nil, fmt.Errorf("%s, no such method", driver)
}
// TranscribeAudio is not supported by the PaddleOCR driver. It always returns
// a nil response with a "no such method" error.
func (p *PaddleOCRModel) TranscribeAudio(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig) (*ASRResponse, error) {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return nil, unsupported
}
// TranscribeAudioWithSender is not supported by the PaddleOCR driver. The
// sender callback is never invoked; a "no such method" error is always
// returned.
func (p *PaddleOCRModel) TranscribeAudioWithSender(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig, sender func(*string, *string) error) error {
	driver := p.Name()
	return fmt.Errorf("%s, no such method", driver)
}
// AudioSpeech is not supported by the PaddleOCR driver. It always returns a
// nil response with a "no such method" error.
func (p *PaddleOCRModel) AudioSpeech(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig) (*TTSResponse, error) {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return nil, unsupported
}
// AudioSpeechWithSender is not supported by the PaddleOCR driver. The sender
// callback is never invoked; a "no such method" error is always returned.
func (p *PaddleOCRModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error {
	driver := p.Name()
	return fmt.Errorf("%s, no such method", driver)
}
// paddleSubmitResponse is the JSON body OCRFile decodes after submitting an
// OCR job. Only the job identifier is read.
type paddleSubmitResponse struct {
Data struct {
// JobId identifies the submitted job; OCRFile appends it to the submit URL
// to build the polling URL and treats an empty value as an error.
JobId string `json:"jobId"`
} `json:"data"`
}
// paddlePollResponse is the JSON body OCRFile decodes from each job-status
// poll.
type paddlePollResponse struct {
Data struct {
// State is the job state. OCRFile stops polling on "done" or "failed" and
// keeps polling on any other value.
State string `json:"state"`
// ErrorMsg is included in the error OCRFile returns for a "failed" job.
ErrorMsg string `json:"errorMsg"`
ResultUrl struct {
// JsonUrl is the address OCRFile downloads the JSONL result from once the
// job is "done"; an empty value is reported as an error.
JsonUrl string `json:"jsonUrl"`
} `json:"resultUrl"`
} `json:"data"`
}
// paddleJsonlLine is one line of the JSONL result file downloaded by OCRFile.
// A line may carry layout-parsing results, plain OCR results, or both; OCRFile
// only reads the OCR results when the layout-parsing list is empty.
type paddleJsonlLine struct {
Result struct {
// LayoutParsingResults holds markdown produced by layout-aware models.
// Each entry's Markdown.Text is appended to the output followed by a
// blank line.
LayoutParsingResults []struct {
Markdown struct {
Text string `json:"text"`
} `json:"markdown"`
} `json:"layoutParsingResults"`
// OcrResults is the fallback shape (per the in-code comment, used by
// models like PP-OCRv6): recognised strings live in
// prunedResult.rec_texts and are emitted one per line.
OcrResults []struct {
PrunedResult struct {
RecTexts []string `json:"rec_texts"`
} `json:"prunedResult"`
} `json:"ocrResults"`
} `json:"result"`
}
// OCRFile runs a document through the PaddleOCR asynchronous job API and
// returns the recognised text.
//
// The call has three phases, all bounded by one longOpCallTimeout deadline:
// submit the job, poll it until the server reports "done" or "failed", then
// download the JSONL result and flatten it to text.
//
// Parameters:
//   - modelName: model to run; must not be nil.
//   - content: raw document bytes, uploaded as a multipart form when fileURL
//     is not an http(s) URL.
//   - fileURL: when it starts with "http" it is sent to the server as a
//     reference and content is ignored.
//   - apiConfig: validated with APIConfigCheck, then used to resolve the base
//     URL and the optional bearer token.
//   - ocrConfig: currently unused.
//
// An error is returned for invalid arguments, transport failures, non-200
// responses, a job that ends in the "failed" state, or an expired deadline.
func (p *PaddleOCRModel) OCRFile(modelName *string, content []byte, fileURL *string, apiConfig *APIConfig, ocrConfig *OCRConfig) (*OCRFileResponse, error) {
	if err := p.baseModel.APIConfigCheck(apiConfig); err != nil {
		return nil, err
	}
	// modelName is dereferenced below; reject nil instead of panicking.
	if modelName == nil {
		return nil, fmt.Errorf("model name cannot be nil")
	}
	if len(content) == 0 && (fileURL == nil || *fileURL == "") {
		return nil, fmt.Errorf("content and fileURL cannot be both empty")
	}
	resolvedBaseURL, err := p.baseModel.GetBaseURL(apiConfig)
	if err != nil {
		return nil, err
	}
	jobsURL := fmt.Sprintf("%s/%s", resolvedBaseURL, p.baseModel.URLSuffix.OCR)

	// One generous deadline bounds the whole OCR operation (submit + poll +
	// result download), so the poll loop cannot spin forever.
	ctx, cancel := context.WithTimeout(context.Background(), longOpCallTimeout)
	defer cancel()

	req, err := newPaddleSubmitRequest(ctx, jobsURL, *modelName, content, fileURL)
	if err != nil {
		return nil, err
	}
	if auth := BearerAuth(apiConfig); auth != "" {
		req.Header.Set("Authorization", auth)
	}
	req.Header.Set("Client-Platform", "ragflow")

	resp, err := p.baseModel.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to submit job: %w", err)
	}
	// Read and close right away so the body is not held open while polling.
	respBody, readErr := io.ReadAll(resp.Body)
	resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("submit job failed: %s", string(respBody))
	}
	if readErr != nil {
		return nil, fmt.Errorf("failed to read submit response: %w", readErr)
	}

	var submitResp paddleSubmitResponse
	if err := json.Unmarshal(respBody, &submitResp); err != nil {
		return nil, fmt.Errorf("failed to parse submit response: %w", err)
	}
	jobID := submitResp.Data.JobId
	if jobID == "" {
		return nil, fmt.Errorf("failed to get jobId from response")
	}

	jsonlURL, err := p.waitForPaddleJob(ctx, fmt.Sprintf("%s/%s", jobsURL, jobID), apiConfig)
	if err != nil {
		return nil, err
	}
	extractedText, err := p.fetchPaddleText(ctx, jsonlURL)
	if err != nil {
		return nil, err
	}
	return &OCRFileResponse{Text: &extractedText}, nil
}

// newPaddleSubmitRequest builds the job-submission POST: a JSON body that
// references fileURL when it starts with "http", otherwise a multipart upload
// of content. Authentication headers are left to the caller.
func newPaddleSubmitRequest(ctx context.Context, jobsURL, model string, content []byte, fileURL *string) (*http.Request, error) {
	optionalPayload := map[string]bool{
		"useDocOrientationClassify": false,
		"useDocUnwarping":           false,
		"useChartRecognition":       false,
	}

	if fileURL != nil && strings.HasPrefix(*fileURL, "http") {
		jsonData, err := json.Marshal(map[string]interface{}{
			"fileUrl":         *fileURL,
			"model":           model,
			"optionalPayload": optionalPayload,
		})
		if err != nil {
			return nil, fmt.Errorf("failed to marshal json: %w", err)
		}
		req, err := http.NewRequestWithContext(ctx, http.MethodPost, jobsURL, bytes.NewReader(jsonData))
		if err != nil {
			return nil, fmt.Errorf("failed to create submit request: %w", err)
		}
		req.Header.Set("Content-Type", "application/json")
		return req, nil
	}

	// A non-http fileURL cannot be forwarded, so there must be bytes to upload.
	if len(content) == 0 {
		return nil, fmt.Errorf("content is empty and fileURL is not an http(s) url")
	}
	optBytes, err := json.Marshal(optionalPayload)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal json: %w", err)
	}

	body := &bytes.Buffer{}
	writer := multipart.NewWriter(body)
	if err := writer.WriteField("model", model); err != nil {
		return nil, fmt.Errorf("failed to write form field: %w", err)
	}
	if err := writer.WriteField("optionalPayload", string(optBytes)); err != nil {
		return nil, fmt.Errorf("failed to write form field: %w", err)
	}
	// NOTE(review): the part is always named document.pdf, even when content
	// is an image — confirm the server does not rely on the extension.
	part, err := writer.CreateFormFile("file", "document.pdf")
	if err != nil {
		return nil, fmt.Errorf("failed to create form file: %w", err)
	}
	if _, err := part.Write(content); err != nil {
		return nil, fmt.Errorf("failed to write form file: %w", err)
	}
	// Close writes the trailing boundary; without it the body is malformed.
	if err := writer.Close(); err != nil {
		return nil, fmt.Errorf("failed to finalize multipart body: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, jobsURL, body)
	if err != nil {
		return nil, fmt.Errorf("failed to create submit request: %w", err)
	}
	req.Header.Set("Content-Type", writer.FormDataContentType())
	return req, nil
}

// waitForPaddleJob polls pollURL until the job is "done" and returns the URL
// of its JSONL result, or an error if the job fails or ctx expires.
func (p *PaddleOCRModel) waitForPaddleJob(ctx context.Context, pollURL string, apiConfig *APIConfig) (string, error) {
	const (
		initialPollInterval = 3 * time.Second
		maxPollInterval     = 15 * time.Second
		pollMultiplier      = 1.5
	)

	auth := BearerAuth(apiConfig)
	interval := initialPollInterval
	for {
		pollReq, err := http.NewRequestWithContext(ctx, http.MethodGet, pollURL, nil)
		if err != nil {
			return "", fmt.Errorf("failed to create poll request: %w", err)
		}
		if auth != "" {
			pollReq.Header.Set("Authorization", auth)
		}
		pollReq.Header.Set("Client-Platform", "ragflow")

		pollResp, err := p.baseModel.httpClient.Do(pollReq)
		if err != nil {
			return "", fmt.Errorf("failed to poll job status: %w", err)
		}
		pollBody, readErr := io.ReadAll(pollResp.Body)
		pollResp.Body.Close()
		if pollResp.StatusCode != http.StatusOK {
			return "", fmt.Errorf("poll job failed: %s", string(pollBody))
		}
		if readErr != nil {
			return "", fmt.Errorf("failed to read poll response: %w", readErr)
		}

		var pollData paddlePollResponse
		if err := json.Unmarshal(pollBody, &pollData); err != nil {
			return "", fmt.Errorf("failed to parse poll response: %w", err)
		}

		// "done" and "failed" are terminal; anything else means keep waiting.
		switch pollData.Data.State {
		case "done":
			if pollData.Data.ResultUrl.JsonUrl == "" {
				return "", fmt.Errorf("job done but jsonl url is empty")
			}
			return pollData.Data.ResultUrl.JsonUrl, nil
		case "failed":
			return "", fmt.Errorf("ocr job failed on server: %s", pollData.Data.ErrorMsg)
		}

		// Sleep for the current interval, then grow it, so the first wait
		// really is initialPollInterval.
		timer := time.NewTimer(interval)
		select {
		case <-timer.C:
		case <-ctx.Done():
			timer.Stop()
			return "", ctx.Err()
		}
		interval = time.Duration(float64(interval) * pollMultiplier)
		if interval > maxPollInterval {
			interval = maxPollInterval
		}
	}
}

// fetchPaddleText downloads the JSONL result at jsonlURL and returns the text
// extracted from it. No Authorization header is sent on this request.
func (p *PaddleOCRModel) fetchPaddleText(ctx context.Context, jsonlURL string) (string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, jsonlURL, nil)
	if err != nil {
		return "", fmt.Errorf("failed to create request for jsonl: %w", err)
	}
	resp, err := p.baseModel.httpClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("failed to download jsonl result: %w", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("failed to download jsonl, status: %d", resp.StatusCode)
	}
	return extractPaddleText(resp.Body)
}

// extractPaddleText flattens a PaddleOCR JSONL stream into text: markdown from
// layoutParsingResults when present, otherwise rec_texts from ocrResults.
func extractPaddleText(r io.Reader) (string, error) {
	var out strings.Builder
	scanner := bufio.NewScanner(r)
	// Lines longer than 1 MiB make the scanner stop with an error.
	scanner.Buffer(make([]byte, 64*1024), 1024*1024)
	for scanner.Scan() {
		line := bytes.TrimSpace(scanner.Bytes())
		if len(line) == 0 {
			continue
		}
		var lineData paddleJsonlLine
		if err := json.Unmarshal(line, &lineData); err != nil {
			// Best effort: a malformed line is skipped rather than failing
			// the whole document.
			continue
		}
		for _, layoutRes := range lineData.Result.LayoutParsingResults {
			out.WriteString(layoutRes.Markdown.Text)
			out.WriteString("\n\n")
		}
		if len(lineData.Result.LayoutParsingResults) > 0 {
			continue
		}
		// Fallback to ocrResults for models like PP-OCRv6.
		for _, ocrRes := range lineData.Result.OcrResults {
			for _, text := range ocrRes.PrunedResult.RecTexts {
				if text = strings.TrimSpace(text); text != "" {
					out.WriteString(text)
					out.WriteString("\n")
				}
			}
		}
	}
	if err := scanner.Err(); err != nil {
		return "", fmt.Errorf("error reading jsonl: %w", err)
	}
	return strings.TrimSpace(out.String()), nil
}
// ParseFile is not supported by the PaddleOCR driver. It always returns a nil
// response with a "no such method" error.
func (p *PaddleOCRModel) ParseFile(modelName *string, content []byte, url *string, apiConfig *APIConfig, parseFileConfig *ParseFileConfig) (*ParseFileResponse, error) {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return nil, unsupported
}
// ListModels is not supported by the PaddleOCR driver. It always returns a nil
// slice with a "no such method" error.
func (p *PaddleOCRModel) ListModels(apiConfig *APIConfig) ([]ListModelResponse, error) {
	driver := p.Name()
	return nil, fmt.Errorf("%s, no such method", driver)
}
// Balance is not supported by the PaddleOCR driver. It always returns a nil
// map with a "no such method" error.
func (p *PaddleOCRModel) Balance(apiConfig *APIConfig) (map[string]interface{}, error) {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return nil, unsupported
}
// CheckConnection is not supported by the PaddleOCR driver. No request is
// made; a "no such method" error is always returned.
func (p *PaddleOCRModel) CheckConnection(apiConfig *APIConfig) error {
	driver := p.Name()
	return fmt.Errorf("%s, no such method", driver)
}
// ListTasks is not supported by the PaddleOCR driver. It always returns a nil
// slice with a "no such method" error.
func (p *PaddleOCRModel) ListTasks(apiConfig *APIConfig) ([]ListTaskStatus, error) {
	unsupported := fmt.Errorf("%s, no such method", p.Name())
	return nil, unsupported
}
// ShowTask is not supported by the PaddleOCR driver. It always returns a nil
// response with a "no such method" error.
func (p *PaddleOCRModel) ShowTask(taskID string, apiConfig *APIConfig) (*TaskResponse, error) {
	driver := p.Name()
	return nil, fmt.Errorf("%s, no such method", driver)
}