mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
### What problem does this PR solve? Http API based on onnx model. pdf_parser.py to golang ### Type of change - [x] Refactoring
242 lines
6.2 KiB
Go
242 lines
6.2 KiB
Go
package pdfium
|
||
|
||
import (
|
||
"image"
|
||
"math"
|
||
"os"
|
||
"path/filepath"
|
||
"sync"
|
||
"testing"
|
||
)
|
||
|
||
// testdataDir is the relative path to the shared test-pdf directory
// ("..", "parser", "testdata", "pdfs" joined with the OS path separator)
// from which the tests in this file load their PDF fixtures.
|
||
var testdataDir = filepath.Join("..", "parser", "testdata", "pdfs")
|
||
|
||
func readPDF(t *testing.T, name string) []byte {
|
||
t.Helper()
|
||
data, err := os.ReadFile(filepath.Join(testdataDir, name))
|
||
if err != nil {
|
||
t.Fatalf("read %s: %v", name, err)
|
||
}
|
||
return data
|
||
}
|
||
|
||
func TestRenderPage_EnglishSimple(t *testing.T) {
|
||
data := readPDF(t, "01_english_simple.pdf")
|
||
img, err := RenderPage(data, 0, 72)
|
||
if err != nil {
|
||
t.Fatal(err)
|
||
}
|
||
b := img.Bounds()
|
||
t.Logf("01_english_simple.pdf @ 72 DPI: %dx%d", b.Dx(), b.Dy())
|
||
if b.Dx() <= 0 || b.Dy() <= 0 {
|
||
t.Errorf("expected non-zero dimensions, got %dx%d", b.Dx(), b.Dy())
|
||
}
|
||
// Must not be pure white (text should be present).
|
||
if isPureWhite(img) {
|
||
t.Error("rendered page is pure white — expected text content")
|
||
}
|
||
}
|
||
|
||
func TestRenderPage_ChineseSimple(t *testing.T) {
|
||
data := readPDF(t, "02_chinese_simple.pdf")
|
||
img, err := RenderPage(data, 0, 72)
|
||
if err != nil {
|
||
t.Fatal(err)
|
||
}
|
||
b := img.Bounds()
|
||
t.Logf("02_chinese_simple.pdf @ 72 DPI: %dx%d", b.Dx(), b.Dy())
|
||
if b.Dx() <= 0 || b.Dy() <= 0 {
|
||
t.Errorf("expected non-zero dimensions, got %dx%d", b.Dx(), b.Dy())
|
||
}
|
||
if isPureWhite(img) {
|
||
t.Error("rendered page is pure white — expected text content")
|
||
}
|
||
}
|
||
|
||
func TestRenderPage_MultiPage(t *testing.T) {
|
||
data := readPDF(t, "03_multipage.pdf")
|
||
// Render both pages.
|
||
for pg := 0; pg < 2; pg++ {
|
||
img, err := RenderPage(data, pg, 72)
|
||
if err != nil {
|
||
t.Fatalf("page %d: %v", pg, err)
|
||
}
|
||
b := img.Bounds()
|
||
t.Logf("03_multipage.pdf page %d @ 72 DPI: %dx%d", pg, b.Dx(), b.Dy())
|
||
if b.Dx() <= 0 || b.Dy() <= 0 {
|
||
t.Errorf("page %d: expected non-zero dimensions", pg)
|
||
}
|
||
}
|
||
}
|
||
|
||
func TestRenderPage_OutOfRange(t *testing.T) {
|
||
data := readPDF(t, "01_english_simple.pdf")
|
||
_, err := RenderPage(data, 99, 72)
|
||
if err == nil {
|
||
t.Error("expected error for out-of-range page index")
|
||
}
|
||
}
|
||
|
||
func TestRenderPage_InvalidPDF(t *testing.T) {
|
||
_, err := RenderPage([]byte("not a pdf"), 0, 72)
|
||
if err == nil {
|
||
t.Error("expected error for invalid PDF data")
|
||
}
|
||
}
|
||
|
||
func TestRenderPage_EmptyData(t *testing.T) {
|
||
_, err := RenderPage(nil, 0, 72)
|
||
if err == nil {
|
||
t.Error("expected error for nil data")
|
||
}
|
||
_, err = RenderPage([]byte{}, 0, 72)
|
||
if err == nil {
|
||
t.Error("expected error for empty data")
|
||
}
|
||
}
|
||
|
||
func TestRenderPage_DPI(t *testing.T) {
|
||
data := readPDF(t, "01_english_simple.pdf")
|
||
|
||
// Higher DPI → larger image.
|
||
low, err := RenderPage(data, 0, 72)
|
||
if err != nil {
|
||
t.Fatal(err)
|
||
}
|
||
high, err := RenderPage(data, 0, 144)
|
||
if err != nil {
|
||
t.Fatal(err)
|
||
}
|
||
lw, lh := low.Bounds().Dx(), low.Bounds().Dy()
|
||
hw, hh := high.Bounds().Dx(), high.Bounds().Dy()
|
||
t.Logf("72 DPI: %dx%d 144 DPI: %dx%d", lw, lh, hw, hh)
|
||
|
||
if hw < lw*2-2 || hw > lw*2+2 {
|
||
t.Errorf("144 DPI width %d not ≈ 2× 72 DPI width %d", hw, lw)
|
||
}
|
||
if hh < lh*2-2 || hh > lh*2+2 {
|
||
t.Errorf("144 DPI height %d not ≈ 2× 72 DPI height %d", hh, lh)
|
||
}
|
||
}
|
||
|
||
func TestRenderPage_AllTestPDFs(t *testing.T) {
|
||
entries, err := os.ReadDir(testdataDir)
|
||
if err != nil {
|
||
t.Skipf("testdata dir not found: %v", err)
|
||
}
|
||
for _, e := range entries {
|
||
if e.IsDir() || filepath.Ext(e.Name()) != ".pdf" {
|
||
continue
|
||
}
|
||
data, err := os.ReadFile(filepath.Join(testdataDir, e.Name()))
|
||
if err != nil {
|
||
t.Errorf("%s: read: %v", e.Name(), err)
|
||
continue
|
||
}
|
||
img, err := RenderPage(data, 0, 72)
|
||
if err != nil {
|
||
t.Errorf("%s: RenderPage: %v", e.Name(), err)
|
||
continue
|
||
}
|
||
b := img.Bounds()
|
||
if b.Dx() <= 0 || b.Dy() <= 0 {
|
||
t.Errorf("%s: zero dimensions %dx%d", e.Name(), b.Dx(), b.Dy())
|
||
}
|
||
t.Logf("%s: %dx%d", e.Name(), b.Dx(), b.Dy())
|
||
}
|
||
}
|
||
|
||
// isPureWhite reports whether every pixel of img is white or near-white,
// meaning each of the red, green and blue channels, reduced to 8 bits, is at
// least 250. It returns false at the first pixel that falls below that
// threshold, and true for an image whose bounds contain no pixels.
func isPureWhite(img image.Image) bool {
	// whiteFloor is the lowest 8-bit channel value still treated as white.
	const whiteFloor = 250

	bounds := img.Bounds()
	for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
		for x := bounds.Min.X; x < bounds.Max.X; x++ {
			// RGBA() returns alpha-premultiplied values in [0, 65535];
			// shifting right by 8 maps each channel to [0, 255].
			red, green, blue, _ := img.At(x, y).RGBA()
			if red>>8 < whiteFloor || green>>8 < whiteFloor || blue>>8 < whiteFloor {
				return false
			}
		}
	}
	return true
}
|
||
|
||
func TestPageSize(t *testing.T) {
|
||
// Non-rotated A4: expect ~595×842
|
||
data := readPDF(t, "rotate_0.pdf")
|
||
w, h, err := PageSize(data, 0)
|
||
if err != nil {
|
||
t.Fatal(err)
|
||
}
|
||
if w < 500 || w > 700 || h < 700 || h > 900 {
|
||
t.Errorf("rotate_0.pdf: got %.1f×%.1f, want ~595×842", w, h)
|
||
}
|
||
t.Logf("rotate_0.pdf: %.1f×%.1f pts", w, h)
|
||
|
||
// Rotate=90 A4: expect swapped ~842×595
|
||
data90 := readPDF(t, "rotate_90.pdf")
|
||
w90, h90, err := PageSize(data90, 0)
|
||
if err != nil {
|
||
t.Fatal(err)
|
||
}
|
||
if w90 < 700 || w90 > 950 || h90 < 500 || h90 > 700 {
|
||
t.Errorf("rotate_90.pdf: got %.1f×%.1f, want ~842×595 (swapped)", w90, h90)
|
||
}
|
||
t.Logf("rotate_90.pdf: %.1f×%.1f pts (post-rotation)", w90, h90)
|
||
|
||
// Verify dimensions ARE swapped relative to Rotate=0
|
||
if math.Abs(w-w90) < 50 {
|
||
t.Errorf("Rotate=90 width %.1f not significantly different from Rotate=0 width %.1f — rotation not reflected?", w90, w)
|
||
}
|
||
if math.Abs(w-h90) > 2 || math.Abs(h-w90) > 2 {
|
||
t.Errorf("Rotate=90 dimensions (%.1f×%.1f) are not swapped from Rotate=0 (%.1f×%.1f)", w90, h90, w, h)
|
||
}
|
||
|
||
// Invalid page index
|
||
_, _, err = PageSize(data, 999)
|
||
if err == nil {
|
||
t.Error("expected error for out-of-range page")
|
||
}
|
||
|
||
// Empty data
|
||
_, _, err = PageSize([]byte{}, 0)
|
||
if err == nil {
|
||
t.Error("expected error for empty PDF data")
|
||
}
|
||
}
|
||
|
||
// TestPdfiumConcurrentSafety verifies that the pdfiumMu mutex prevents
|
||
// SIGSEGV from concurrent pdfium access. Without the mutex, 10 goroutines
|
||
// calling PageSize/RenderPage simultaneously causes heap corruption within
|
||
// milliseconds (empirically proven). If this test completes without
|
||
// crashing, the mutex is working.
|
||
func TestPdfiumConcurrentSafety(t *testing.T) {
|
||
data := readPDF(t, "01_english_simple.pdf")
|
||
|
||
const goroutines = 10
|
||
const iterations = 3
|
||
|
||
var wg sync.WaitGroup
|
||
for i := 0; i < goroutines; i++ {
|
||
wg.Add(1)
|
||
go func() {
|
||
defer wg.Done()
|
||
for j := 0; j < iterations; j++ {
|
||
if _, _, err := PageSize(data, 0); err != nil {
|
||
t.Errorf("PageSize: %v", err)
|
||
return
|
||
}
|
||
if img, err := RenderPage(data, 0, 72); err != nil {
|
||
t.Errorf("RenderPage: %v", err)
|
||
return
|
||
} else if img.Bounds().Dx() <= 0 {
|
||
t.Error("RenderPage returned zero-width image")
|
||
return
|
||
}
|
||
}
|
||
}()
|
||
}
|
||
wg.Wait()
|
||
// Reaching here without SIGSEGV = mutex is effective.
|
||
}
|