mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
### What problem does this PR solve? Http API based on onnx model. pdf_parser.py to golang ### Type of change - [x] Refactoring
89 lines
2.7 KiB
Go
89 lines
2.7 KiB
Go
//go:build cgo
|
|
|
|
package parser
|
|
|
|
import (
|
|
"context"
|
|
"image"
|
|
"os"
|
|
"path/filepath"
|
|
"testing"
|
|
)
|
|
|
|
func TestParse_PdfiumRender(t *testing.T) {
|
|
// Use a small controlled test PDF from the testdata/pdfs directory.
|
|
pdfPath := filepath.Join("testdata", "pdfs", "01_english_simple.pdf")
|
|
data, err := os.ReadFile(pdfPath)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
eng, err := NewEngine(data)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
defer eng.Close()
|
|
|
|
// Verify RawData is available and correct.
|
|
raw := eng.RawData()
|
|
if len(raw) == 0 {
|
|
t.Fatal("RawData() returned empty data")
|
|
}
|
|
if len(raw) != len(data) {
|
|
t.Fatalf("RawData() length %d != original %d", len(raw), len(data))
|
|
}
|
|
|
|
// Render a page through pdfium (via the parser's renderPageToImage).
|
|
img, err := renderPageToImage(eng, 0)
|
|
if err != nil {
|
|
t.Skipf("pdfium render not available: %v", err)
|
|
}
|
|
b := img.Bounds()
|
|
t.Logf("01_english_simple.pdf page 0: %dx%d", b.Dx(), b.Dy())
|
|
if b.Dx() <= 0 || b.Dy() <= 0 {
|
|
t.Errorf("expected non-zero dimensions from pdfium render, got %dx%d", b.Dx(), b.Dy())
|
|
}
|
|
|
|
// Run Parse with pdfium rendering — BATCH_SKIP_DEEPDOC=1 to avoid HTTP calls.
|
|
t.Setenv("BATCH_SKIP_DEEPDOC", "1")
|
|
cfg := DefaultParserConfig()
|
|
p := NewParser(cfg, &MockDocAnalyzer{Healthy: true, Model: ModelSaas})
|
|
result, err := p.Parse(context.Background(), eng)
|
|
if err != nil {
|
|
t.Fatalf("Parse: %v", err)
|
|
}
|
|
t.Logf("Parse: %d sections, %d tables, %d page images", len(result.Sections), len(result.Tables), len(result.PageImages))
|
|
|
|
if len(result.Sections) == 0 {
|
|
t.Error("expected at least one section")
|
|
}
|
|
if len(result.PageImages) == 0 {
|
|
t.Error("expected at least one page image")
|
|
}
|
|
}
|
|
|
|
func TestParse_PdfiumRender_NoData(t *testing.T) {
|
|
// When engine has no raw PDF bytes, renderPageToImage falls back to
|
|
// engine.RenderPageImage(). Stub returns (nil, nil) → guard converts
|
|
// to ErrNoPDFData so callers never receive a nil image with nil error.
|
|
img, err := renderPageToImage(&pythonCharEngineStub{}, 0)
|
|
if err != ErrNoPDFData {
|
|
t.Errorf("expected ErrNoPDFData, got %v", err)
|
|
}
|
|
if img != nil {
|
|
t.Error("expected nil image")
|
|
}
|
|
}
|
|
|
|
// pythonCharEngineStub implements PDFEngine with RawData() returning nil.
|
|
type pythonCharEngineStub struct{}
|
|
|
|
func (e *pythonCharEngineStub) ExtractChars(_ int) ([]TextChar, error) { return nil, nil }
|
|
func (e *pythonCharEngineStub) RenderPage(_ int, _ float64) ([]byte, error) { return nil, nil }
|
|
func (e *pythonCharEngineStub) RenderPageImage(_ int, _ float64) (image.Image, error) {
|
|
return nil, nil
|
|
}
|
|
func (e *pythonCharEngineStub) RawData() []byte { return nil }
|
|
func (e *pythonCharEngineStub) PageCount() (int, error) { return 0, nil }
|
|
func (e *pythonCharEngineStub) Close() error { return nil }
|