mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
## Summary

This PR fixes two issues discovered during testing of the PaddleOCR async API refactoring:

### 1. PP-OCRv6 returns `ocrResults` instead of `layoutParsingResults`

Models like PP-OCRv6 are pure text recognition models that return results in `ocrResults.prunedResult.rec_texts` format rather than the `layoutParsingResults.prunedResult.parsing_res_list` format used by layout-aware models (PaddleOCR-VL series).

**Changes:**
- `deepdoc/parser/paddleocr_parser.py`: Extract `ocrResults` alongside `layoutParsingResults` in `_send_request()`, add fallback logic in `_transfer_to_sections()` and `parse_image()`
- `internal/entity/models/paddleocr.go`: Add `ocrResults` struct and fallback extraction in Go OCR handler

### 2. Image parsing not integrated into picture chunker

The `parse_image()` method existed in PaddleOCRParser but was never called from `rag/app/picture.py` (the module that handles image file uploads). Users configuring PaddleOCR as their layout recognizer would still get local deepdoc OCR for images.

**Changes:**
- `rag/app/picture.py`: When `layout_recognize` is set to PaddleOCR, use `PaddleOCROcrModel.parse_image()` instead of local OCR. Falls back gracefully to local OCR on failure.

## Testing

Verified end-to-end in Docker:
- PaddleOCR-VL-1.6 PDF parsing: ✅ (10 text blocks with bbox)
- PaddleOCR-VL-1.6 image parsing: ✅ (219 chars)
- PP-OCRv6 PDF parsing with ocrResults fallback: ✅ (10 text blocks)
- PP-OCRv6 image parsing with ocrResults fallback: ✅ (136 chars)

## Related PRs

- #15967 (merged) - PaddleOCR async Job API refactoring + new models
- #16086 (merged) - PaddleOCR image parsing support
344 lines
10 KiB
Go
344 lines
10 KiB
Go
//
|
|
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
|
|
package models
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"mime/multipart"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
type PaddleOCRModel struct {
|
|
baseModel BaseModel
|
|
}
|
|
|
|
func NewPaddleOCRModel(baseURL map[string]string, urlSuffix URLSuffix) *PaddleOCRModel {
|
|
return &PaddleOCRModel{
|
|
baseModel: BaseModel{
|
|
BaseURL: baseURL,
|
|
URLSuffix: urlSuffix,
|
|
AllowEmptyAPIKey: true,
|
|
httpClient: NewDriverHTTPClient(),
|
|
},
|
|
}
|
|
}
|
|
|
|
func (p PaddleOCRModel) NewInstance(baseURL map[string]string) ModelDriver {
|
|
return NewPaddleOCRModel(baseURL, p.baseModel.URLSuffix)
|
|
}
|
|
|
|
func (p *PaddleOCRModel) Name() string {
|
|
return "paddle_ocr.net"
|
|
}
|
|
|
|
func (p *PaddleOCRModel) ChatWithMessages(modelName string, messages []Message, apiConfig *APIConfig, chatModelConfig *ChatConfig) (*ChatResponse, error) {
|
|
return nil, fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) ChatStreamlyWithSender(modelName string, messages []Message, apiConfig *APIConfig, modelConfig *ChatConfig, sender func(*string, *string) error) error {
|
|
return fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) Embed(modelName *string, texts []string, apiConfig *APIConfig, embeddingConfig *EmbeddingConfig) ([]EmbeddingData, error) {
|
|
return nil, fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) Rerank(modelName *string, query string, documents []string, apiConfig *APIConfig, rerankConfig *RerankConfig) (*RerankResponse, error) {
|
|
return nil, fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) TranscribeAudio(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig) (*ASRResponse, error) {
|
|
return nil, fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) TranscribeAudioWithSender(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig, sender func(*string, *string) error) error {
|
|
return fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) AudioSpeech(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig) (*TTSResponse, error) {
|
|
return nil, fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error {
|
|
return fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
// paddleSubmitResponse is the subset of the job-submission reply that the
// driver reads.
type paddleSubmitResponse struct {
	// Data wraps the payload found under the "data" key.
	Data struct {
		// JobId is decoded from "jobId"; OCRFile appends it to the jobs
		// URL to build the polling URL.
		JobId string `json:"jobId"`
	} `json:"data"`
}
|
|
|
|
// paddlePollResponse is the subset of the job-status reply that the driver
// reads while polling.
type paddlePollResponse struct {
	// Data wraps the payload found under the "data" key.
	Data struct {
		// State is decoded from "state". OCRFile stops polling on "done"
		// or "failed" and keeps polling on any other value.
		State string `json:"state"`
		// ErrorMsg is decoded from "errorMsg" and is reported to the
		// caller when State is "failed".
		ErrorMsg string `json:"errorMsg"`
		// ResultUrl holds the download locations of the finished job.
		ResultUrl struct {
			// JsonUrl is decoded from "jsonUrl"; OCRFile downloads the
			// JSONL result from it.
			JsonUrl string `json:"jsonUrl"`
		} `json:"resultUrl"`
	} `json:"data"`
}
|
|
|
|
// paddleJsonlLine is the subset of one line of the JSONL result file that the
// driver reads.
type paddleJsonlLine struct {
	// Result wraps the payload found under the "result" key.
	Result struct {
		// LayoutParsingResults is decoded from "layoutParsingResults".
		// OCRFile prefers these entries and concatenates their Markdown
		// text.
		LayoutParsingResults []struct {
			Markdown struct {
				// Text is decoded from "markdown.text".
				Text string `json:"text"`
			} `json:"markdown"`
		} `json:"layoutParsingResults"`
		// OcrResults is decoded from "ocrResults". OCRFile uses it only
		// when a line has no layout parsing results (the existing code
		// comment names PP-OCRv6 as such a model).
		OcrResults []struct {
			PrunedResult struct {
				// RecTexts is decoded from "prunedResult.rec_texts".
				RecTexts []string `json:"rec_texts"`
			} `json:"prunedResult"`
		} `json:"ocrResults"`
	} `json:"result"`
}
|
|
|
|
func (p *PaddleOCRModel) OCRFile(modelName *string, content []byte, fileURL *string, apiConfig *APIConfig, ocrConfig *OCRConfig) (*OCRFileResponse, error) {
|
|
if err := p.baseModel.APIConfigCheck(apiConfig); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if (content == nil || len(content) == 0) && (fileURL == nil || *fileURL == "") {
|
|
return nil, fmt.Errorf("content and fileURL cannot be both empty")
|
|
}
|
|
|
|
resolvedBaseURL, err := p.baseModel.GetBaseURL(apiConfig)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
url := fmt.Sprintf("%s/%s", resolvedBaseURL, p.baseModel.URLSuffix.OCR)
|
|
|
|
optionalPayload := map[string]bool{
|
|
"useDocOrientationClassify": false,
|
|
"useDocUnwarping": false,
|
|
"useChartRecognition": false,
|
|
}
|
|
optBytes, _ := json.Marshal(optionalPayload)
|
|
|
|
// One generous deadline bounds the whole OCR operation (submit + poll +
|
|
// result download), so the poll loop below can no longer spin forever.
|
|
ctx, cancel := context.WithTimeout(context.Background(), longOpCallTimeout)
|
|
defer cancel()
|
|
|
|
var req *http.Request
|
|
|
|
if fileURL != nil && strings.HasPrefix(*fileURL, "http") {
|
|
reqData := map[string]interface{}{
|
|
"fileUrl": *fileURL,
|
|
"model": *modelName,
|
|
"optionalPayload": optionalPayload,
|
|
}
|
|
jsonData, err := json.Marshal(reqData)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to marshal json: %w", err)
|
|
}
|
|
req, err = http.NewRequestWithContext(ctx, "POST", url, bytes.NewBuffer(jsonData))
|
|
req.Header.Set("Content-Type", "application/json")
|
|
} else {
|
|
body := &bytes.Buffer{}
|
|
writer := multipart.NewWriter(body)
|
|
|
|
_ = writer.WriteField("model", *modelName)
|
|
_ = writer.WriteField("optionalPayload", string(optBytes))
|
|
|
|
part, err := writer.CreateFormFile("file", "document.pdf")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create form file: %w", err)
|
|
}
|
|
part.Write(content)
|
|
writer.Close()
|
|
|
|
req, err = http.NewRequestWithContext(ctx, "POST", url, body)
|
|
req.Header.Set("Content-Type", writer.FormDataContentType())
|
|
}
|
|
|
|
if auth := BearerAuth(apiConfig); auth != "" {
|
|
req.Header.Set("Authorization", auth)
|
|
}
|
|
req.Header.Set("Client-Platform", "ragflow")
|
|
|
|
resp, err := p.baseModel.httpClient.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to submit job: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
respBody, _ := io.ReadAll(resp.Body)
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("submit job failed: %s", string(respBody))
|
|
}
|
|
|
|
var submitResp paddleSubmitResponse
|
|
if err := json.Unmarshal(respBody, &submitResp); err != nil {
|
|
return nil, fmt.Errorf("failed to parse submit response: %w", err)
|
|
}
|
|
|
|
jobId := submitResp.Data.JobId
|
|
if jobId == "" {
|
|
return nil, fmt.Errorf("failed to get jobId from response")
|
|
}
|
|
|
|
pollUrl := fmt.Sprintf("%s/%s", url, jobId)
|
|
var jsonlUrl string
|
|
|
|
pollInterval := 3 * time.Second
|
|
const pollMultiplier = 1.5
|
|
maxPollInterval := 15 * time.Second
|
|
|
|
for {
|
|
pollReq, err := http.NewRequestWithContext(ctx, "GET", pollUrl, nil)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create poll request: %w", err)
|
|
}
|
|
if auth := BearerAuth(apiConfig); auth != "" {
|
|
pollReq.Header.Set("Authorization", auth)
|
|
}
|
|
pollReq.Header.Set("Client-Platform", "ragflow")
|
|
|
|
pollResp, err := p.baseModel.httpClient.Do(pollReq)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to poll job status: %w", err)
|
|
}
|
|
|
|
pollBody, _ := io.ReadAll(pollResp.Body)
|
|
pollResp.Body.Close()
|
|
|
|
if pollResp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("poll job failed: %s", string(pollBody))
|
|
}
|
|
|
|
var pollData paddlePollResponse
|
|
if err = json.Unmarshal(pollBody, &pollData); err != nil {
|
|
return nil, fmt.Errorf("failed to parse poll response: %w", err)
|
|
}
|
|
|
|
// end if 'done' or 'failed'
|
|
state := pollData.Data.State
|
|
if state == "done" {
|
|
jsonlUrl = pollData.Data.ResultUrl.JsonUrl
|
|
break
|
|
} else if state == "failed" {
|
|
return nil, fmt.Errorf("ocr job failed on server: %s", pollData.Data.ErrorMsg)
|
|
}
|
|
|
|
// Exponential backoff
|
|
pollInterval = time.Duration(float64(pollInterval) * pollMultiplier)
|
|
if pollInterval > maxPollInterval {
|
|
pollInterval = maxPollInterval
|
|
}
|
|
|
|
select {
|
|
case <-time.After(pollInterval):
|
|
case <-ctx.Done():
|
|
return nil, ctx.Err()
|
|
}
|
|
}
|
|
|
|
if jsonlUrl == "" {
|
|
return nil, fmt.Errorf("job done but jsonl url is empty")
|
|
}
|
|
|
|
resReq, err := http.NewRequestWithContext(ctx, "GET", jsonlUrl, nil)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create request for jsonl: %w", err)
|
|
}
|
|
|
|
resResp, err := p.baseModel.httpClient.Do(resReq)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to download jsonl result: %w", err)
|
|
}
|
|
defer resResp.Body.Close()
|
|
|
|
if resResp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("failed to download jsonl, status: %d", resResp.StatusCode)
|
|
}
|
|
|
|
var fullMarkdown strings.Builder
|
|
scanner := bufio.NewScanner(resResp.Body)
|
|
scanner.Buffer(make([]byte, 64*1024), 1024*1024)
|
|
|
|
for scanner.Scan() {
|
|
line := strings.TrimSpace(scanner.Text())
|
|
if line == "" {
|
|
continue
|
|
}
|
|
|
|
var lineData paddleJsonlLine
|
|
if err := json.Unmarshal([]byte(line), &lineData); err != nil {
|
|
continue
|
|
}
|
|
|
|
for _, layoutRes := range lineData.Result.LayoutParsingResults {
|
|
fullMarkdown.WriteString(layoutRes.Markdown.Text)
|
|
fullMarkdown.WriteString("\n\n")
|
|
}
|
|
|
|
// Fallback to ocrResults for models like PP-OCRv6
|
|
if len(lineData.Result.LayoutParsingResults) == 0 {
|
|
for _, ocrRes := range lineData.Result.OcrResults {
|
|
for _, text := range ocrRes.PrunedResult.RecTexts {
|
|
text = strings.TrimSpace(text)
|
|
if text != "" {
|
|
fullMarkdown.WriteString(text)
|
|
fullMarkdown.WriteString("\n")
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if err = scanner.Err(); err != nil {
|
|
return nil, fmt.Errorf("error reading jsonl: %w", err)
|
|
}
|
|
|
|
extractedText := strings.TrimSpace(fullMarkdown.String())
|
|
|
|
return &OCRFileResponse{Text: &extractedText}, nil
|
|
}
|
|
|
|
func (p *PaddleOCRModel) ParseFile(modelName *string, content []byte, url *string, apiConfig *APIConfig, parseFileConfig *ParseFileConfig) (*ParseFileResponse, error) {
|
|
return nil, fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) ListModels(apiConfig *APIConfig) ([]ListModelResponse, error) {
|
|
return nil, fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) Balance(apiConfig *APIConfig) (map[string]interface{}, error) {
|
|
return nil, fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) CheckConnection(apiConfig *APIConfig) error {
|
|
return fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) ListTasks(apiConfig *APIConfig) ([]ListTaskStatus, error) {
|
|
return nil, fmt.Errorf("%s, no such method", p.Name())
|
|
}
|
|
|
|
func (p *PaddleOCRModel) ShowTask(taskID string, apiConfig *APIConfig) (*TaskResponse, error) {
|
|
return nil, fmt.Errorf("%s, no such method", p.Name())
|
|
}
|