Files
ragflow/internal/deepdoc/parser/pdf/rotate_test.go

610 lines
18 KiB
Go
Raw Normal View History

//go:build cgo
package parser
import (
"image"
"math"
"os"
"path/filepath"
"sort"
"testing"
"ragflow/internal/deepdoc/parser/pdf/pdfium"
"ragflow/internal/deepdoc/parser/pdf/pdfoxide"
)
// ── helpers ──────────────────────────────────────────────────────────────
// pdfiumPtSize returns the post-rotation dimensions of page 0 of the document
// held by eng, as reported by pdfium.PageSize.
//
// When the engine exposes no raw bytes the helper falls back to pdf_oxide's
// page size, which is the pre-rotation size. Any failure to obtain a size
// aborts the calling test: silently returning 0x0 would make every later
// bounds check fail with a misleading "out of bounds" message.
//
// file is only used to label failure messages.
func pdfiumPtSize(eng PDFEngine, file string, t *testing.T) (w, h float64) {
	t.Helper()

	if raw := eng.RawData(); raw != nil {
		pw, ph, err := pdfium.PageSize(raw, 0)
		if err != nil {
			t.Fatalf("%s: pdfium.PageSize: %v", file, err)
		}
		return pw, ph
	}

	// Fallback: use pdf_oxide pre-rotation size.
	pe, ok := eng.(*pdfoxideEngine)
	if !ok {
		t.Fatalf("%s: engine %T exposes no raw data and has no page-size fallback", file, eng)
	}
	pw, ph, err := pe.inner.PageSize(0)
	if err != nil {
		t.Fatalf("%s: pdf_oxide PageSize: %v", file, err)
	}
	return pw, ph
}
// openPDF loads the fixture dir/name into memory and opens the same bytes
// twice: directly through pdfoxide.OpenBytes and through NewEngine. Both
// handles are returned to the caller.
//
// The pdfoxide document is closed by a t.Cleanup hook. An unreadable file, a
// pdfoxide open failure, or an engine construction failure aborts the calling
// test via t.Fatalf.
func openPDF(t *testing.T, dir, name string) (PDFEngine, *pdfoxide.Document) {
	t.Helper()

	fixturePath := filepath.Join(dir, name)
	raw, readErr := os.ReadFile(fixturePath)
	if readErr != nil {
		t.Fatalf("read %s: %v", name, readErr)
	}

	document, openErr := pdfoxide.OpenBytes(raw)
	if openErr != nil {
		t.Fatalf("OpenBytes: %v", openErr)
	}
	t.Cleanup(func() {
		document.Close()
	})

	engine, engErr := NewEngine(raw)
	if engErr != nil {
		t.Fatalf("NewEngine: %v", engErr)
	}
	return engine, document
}
// openRotatePDF opens the named fixture from the testdata/pdfs directory by
// delegating to openPDF, and returns the engine and pdfoxide document it yields.
func openRotatePDF(t *testing.T, name string) (PDFEngine, *pdfoxide.Document) {
	t.Helper()

	const fixtureDir = "testdata/pdfs"
	engine, document := openPDF(t, fixtureDir, name)
	return engine, document
}
// ── Test 1: pdf_oxide page size is A4 for all test PDFs ──────────────────

// TestRotation_PageInfo checks, for page 0 of each single-page rotation
// fixture, that the size reported by the pdfoxide document has a width within
// [500, 700] and a height within [700, 900].
func TestRotation_PageInfo(t *testing.T) {
	fixtures := [...]string{"rotate_0.pdf", "rotate_90.pdf", "rotate_180.pdf", "rotate_270.pdf"}
	for i := range fixtures {
		name := fixtures[i]
		t.Run(name, func(t *testing.T) {
			_, document := openRotatePDF(t, name)
			width, height, sizeErr := document.PageSize(0)
			if sizeErr != nil {
				t.Fatalf("PageSize: %v", sizeErr)
			}
			switch {
			case width < 500, width > 700, height < 700, height > 900:
				t.Errorf("unexpected pdf_oxide page size: %.1f x %.1f", width, height)
			}
		})
	}
}
// ── Test 2: Char extent after rotation ───────────────────────────────────

// TestRotation_CharExtent verifies that ExtractChars reports characters in
// post-rotation space. For each fixture it takes the largest X1 over all chars
// on page 0 and requires it to lie strictly between the case's two bounds.
func TestRotation_CharExtent(t *testing.T) {
	type extentCase struct {
		file      string
		maxXAbove float64 // largest X1 must be strictly greater than this
		maxXBelow float64 // largest X1 must be strictly less than this
	}
	cases := []extentCase{
		{file: "rotate_0.pdf", maxXAbove: 0, maxXBelow: 600},    // portrait A4
		{file: "rotate_90.pdf", maxXAbove: 600, maxXBelow: 850}, // landscape (text near right edge after CW)
		{file: "rotate_180.pdf", maxXAbove: 0, maxXBelow: 600},  // still portrait (180° flips within bounds)
		{file: "rotate_270.pdf", maxXAbove: 0, maxXBelow: 600},  // landscape (text near left edge after CCW)
	}

	for _, tc := range cases {
		t.Run(tc.file, func(t *testing.T) {
			engine, _ := openRotatePDF(t, tc.file)
			chars, err := engine.ExtractChars(0)
			switch {
			case err != nil:
				t.Fatal(err)
			case len(chars) == 0:
				t.Fatal("no chars")
			}

			// Track the right-most character edge, starting from zero.
			rightmost := 0.0
			for i := range chars {
				if x1 := chars[i].X1; x1 > rightmost {
					rightmost = x1
				}
			}
			t.Logf("maxX=%.1f (need >%.0f and <%.0f)", rightmost, tc.maxXAbove, tc.maxXBelow)

			if rightmost <= tc.maxXAbove {
				t.Errorf("maxX=%.1f <= %.0f: rotation not applied to char coordinates", rightmost, tc.maxXAbove)
			}
			if rightmost >= tc.maxXBelow {
				t.Errorf("maxX=%.1f >= %.0f: chars out of expected range", rightmost, tc.maxXBelow)
			}
		})
	}
}
// ── Test 3: All chars within page bounds ─────────────────────────────────

// TestRotation_CharsInBounds checks that every char extracted from page 0 of
// each rotation fixture lies inside the page rectangle (with a 1-unit slack on
// every side) and has a non-degenerate box (X0 < X1 and Top < Bottom).
//
// An empty extraction is treated as a failure; otherwise the test would report
// "all 0 chars in bounds" and pass without checking anything.
func TestRotation_CharsInBounds(t *testing.T) {
	files := []string{"rotate_0.pdf", "rotate_90.pdf", "rotate_180.pdf", "rotate_270.pdf"}
	for _, file := range files {
		t.Run(file, func(t *testing.T) {
			eng, _ := openRotatePDF(t, file)
			// Use pdfium.PageSize for post-rotation page dimensions,
			// since chars from ExtractChars are now in post-rotation space.
			pageW, pageH := pdfiumPtSize(eng, file, t)
			chars, err := eng.ExtractChars(0)
			if err != nil {
				t.Fatal(err)
			}
			if len(chars) == 0 {
				t.Fatal("no chars")
			}

			oob := 0
			for _, c := range chars {
				if c.X0 < -1 || c.X1 > pageW+1 || c.Top < -1 || c.Bottom > pageH+1 {
					oob++
					// Report only the first few offenders to keep output readable.
					if oob <= 3 {
						t.Errorf("OOB char %q: X=[%.1f,%.1f] Y=[%.1f,%.1f] page=%.1fx%.1f",
							c.Text, c.X0, c.X1, c.Top, c.Bottom, pageW, pageH)
					}
				}
				if c.X0 >= c.X1 {
					t.Errorf("char %q: X0=%.2f >= X1=%.2f", c.Text, c.X0, c.X1)
				}
				if c.Top >= c.Bottom {
					t.Errorf("char %q: Top=%.2f >= Bottom=%.2f", c.Text, c.Top, c.Bottom)
				}
			}

			if oob > 0 {
				t.Errorf("%d/%d chars OOB (%.1f%%)", oob, len(chars), float64(oob)/float64(len(chars))*100)
				return
			}
			t.Logf("all %d chars in bounds [%.0f x %.0f]", len(chars), pageW, pageH)
		})
	}
}
// ── Test 4: Same-line chars preserved after rotation ─────────────────────

// TestRotation_SameLinePreserved groups the chars of page 0 into lines with
// groupCharsToLines and checks that, within each line, every char's Bottom
// stays within a tolerance of the first char's Bottom.
//
// An empty extraction is treated as a failure; with no chars there are no
// lines and the test would otherwise pass without checking anything.
func TestRotation_SameLinePreserved(t *testing.T) {
	for _, file := range []string{"rotate_0.pdf", "rotate_90.pdf", "rotate_270.pdf"} {
		t.Run(file, func(t *testing.T) {
			eng, _ := openRotatePDF(t, file)
			chars, err := eng.ExtractChars(0)
			if err != nil {
				t.Fatal(err)
			}
			if len(chars) == 0 {
				t.Fatal("no chars")
			}

			// After rotation, same-baseline chars have slightly different
			// Bottom values because the rotation maps char Width to post-rot
			// Y-height. The tolerance is a fixed constant per case: tight for
			// the unrotated fixture, generous for the rotated ones.
			tolerance := 0.5
			if file != "rotate_0.pdf" {
				tolerance = 15.0 // char widths vary ~10-13pts on same line
			}

			lines := groupCharsToLines(chars, false)
			violations := 0
			for li, line := range lines {
				if len(line) <= 1 {
					continue
				}
				refBottom := line[0].Bottom
				for _, c := range line[1:] {
					diff := math.Abs(c.Bottom - refBottom)
					if diff <= tolerance {
						continue
					}
					violations++
					// Report only the first few offenders to keep output readable.
					if violations <= 3 {
						t.Errorf("line %d: char %q Bottom=%.2f ref=%.2f diff=%.2f",
							li, c.Text, c.Bottom, refBottom, diff)
					}
				}
			}
			if violations > 0 {
				t.Errorf("%d same-line Bottom violations (tolerance=%.1f)", violations, tolerance)
			}
		})
	}
}
// ── Test 5: Multi-page with mixed rotation ───────────────────────────────

// TestRotation_MultiPageMixed opens multi_rotate.pdf, requires exactly three
// pages, and for each page checks that the largest char X1 is strictly above
// the page's lower bound and not above its upper bound. Page metadata from
// pdf_oxide is logged for diagnostics only.
func TestRotation_MultiPageMixed(t *testing.T) {
	engine, document := openRotatePDF(t, "multi_rotate.pdf")

	pages, err := engine.PageCount()
	if err != nil {
		t.Fatal(err)
	}
	if pages != 3 {
		t.Fatalf("expected 3 pages, got %d", pages)
	}

	// Page 0: Rotate=0 → portrait. Page 1-2: Rotate=90/270 → landscape.
	type pageBounds struct {
		page      int
		maxXAbove float64
		maxXBelow float64
	}
	wanted := []pageBounds{
		{page: 0, maxXAbove: 0, maxXBelow: 600},
		{page: 1, maxXAbove: 600, maxXBelow: 850},
		{page: 2, maxXAbove: 0, maxXBelow: 600}, // Rotate=270 → CCW, text near left edge
	}

	for _, want := range wanted {
		pageInfo, infoErr := document.Inner.PageInfo(want.page)
		if infoErr != nil {
			t.Fatalf("PageInfo page %d: %v", want.page, infoErr)
		}
		t.Logf("Page %d: Rotation=%d, W=%.1f H=%.1f", want.page, pageInfo.Rotation, pageInfo.Width, pageInfo.Height)

		chars, charErr := engine.ExtractChars(want.page)
		if charErr != nil {
			t.Fatalf("ExtractChars page %d: %v", want.page, charErr)
		}
		if len(chars) == 0 {
			t.Errorf("page %d: no chars", want.page)
			continue
		}

		// Track the right-most character edge, starting from zero.
		rightmost := 0.0
		for i := range chars {
			if x1 := chars[i].X1; x1 > rightmost {
				rightmost = x1
			}
		}
		t.Logf("Page %d: %d chars, maxX=%.1f", want.page, len(chars), rightmost)

		if rightmost <= want.maxXAbove {
			t.Errorf("Page %d: maxX=%.1f <= %.0f — rotation not applied",
				want.page, rightmost, want.maxXAbove)
		}
		if rightmost > want.maxXBelow {
			t.Errorf("Page %d: maxX=%.1f > %.0f — out of range",
				want.page, rightmost, want.maxXBelow)
		}
	}
}
// ── Test 6: CropBox with rotation ────────────────────────────────────────

// TestRotation_CropBoxWithRotate exercises a fixture that combines a custom
// /CropBox with page rotation.
//
// pdf_oxide does not read /CropBox from the page dictionary (same limitation
// as /Rotate); it always reports MediaBox values, which are logged here for
// diagnostics. The bounds check itself uses the page size returned by
// pdfiumPtSize and tolerates up to 40% out-of-bounds chars. When a render is
// available, at least 70% of the sampled char boxes must contain dark pixels.
// If the render check cannot run, the reason is logged rather than silently
// ignored.
func TestRotation_CropBoxWithRotate(t *testing.T) {
	eng, doc := openRotatePDF(t, "cropbox_rotate.pdf")
	info, err := doc.Inner.PageInfo(0)
	if err != nil {
		t.Fatal(err)
	}
	// pdf_oxide reports MediaBox (not our custom CropBox [30,20,575,832]).
	t.Logf("pdf_oxide: W=%.1f H=%.1f CropBox=(%.1f,%.1f,%.1f,%.1f) Rotation=%d",
		info.Width, info.Height,
		info.CropBox.X, info.CropBox.Y, info.CropBox.Width, info.CropBox.Height,
		info.Rotation)

	chars, err := eng.ExtractChars(0)
	if err != nil {
		t.Fatal(err)
	}
	if len(chars) == 0 {
		t.Fatal("no chars")
	}

	// Use pdfium dimensions (accounts for rotation) for bounds check.
	pageW, pageH := pdfiumPtSize(eng, "cropbox_rotate.pdf", t)
	oob := 0
	for _, c := range chars {
		if c.X0 < -1 || c.X1 > pageW+1 || c.Top < -1 || c.Bottom > pageH+1 {
			oob++
		}
	}
	oobRate := float64(oob) / float64(len(chars)) * 100
	t.Logf("OOB: %d/%d (%.1f%%), page=%.1fx%.1f", oob, len(chars), oobRate, pageW, pageH)
	// CropBox excludes content from the page edges; chars near the
	// CropBox boundary may end up outside the effective page after rotation.
	if oobRate > 40 {
		t.Errorf("too many OOB chars: %.1f%%", oobRate)
	}

	// Verify render alignment. This part is best-effort: it is skipped (with a
	// log line) when no render can be produced.
	raw := eng.RawData()
	if raw == nil {
		t.Log("render alignment check skipped: engine exposes no raw PDF data")
		return
	}
	img, err := pdfium.RenderPage(raw, 0, 216)
	if err != nil {
		t.Logf("render alignment check skipped: pdfium.RenderPage: %v", err)
		return
	}
	scale := 216.0 / 72.0
	hit, checked := bboxDarkPixelHitRate(t, chars, img, scale)
	if checked == 0 {
		t.Log("render alignment check skipped: no char box could be sampled")
		return
	}
	hitRate := float64(hit) / float64(checked) * 100
	t.Logf("CropBox+Rotate render align: %d/%d (%.1f%%)", hit, checked, hitRate)
	if hitRate < 70 {
		t.Errorf("CropBox+Rotate render alignment: %.1f%% < 70%%", hitRate)
	}
}
// ── Test 7: Render alignment — dark-pixel bbox verification ──────────────
// Chars are now in post-rotation space (rotation handled by ExtractChars),
// so we use the identity mapper for all rotations.
func TestRotation_RenderAlignment(t *testing.T) {
const dpi = 216.0
const scale = dpi / 72.0
identityMap := func(c TextChar, _, _ float64) (px0, py0, px1, py1 int) {
return int(math.Round(c.X0 * scale)),
int(math.Round(c.Top * scale)),
int(math.Round(c.X1 * scale)),
int(math.Round(c.Bottom * scale))
}
for _, file := range []string{"rotate_0.pdf", "rotate_90.pdf", "rotate_270.pdf"} {
t.Run(file, func(t *testing.T) {
eng, _ := openRotatePDF(t, file)
raw := eng.RawData()
if raw == nil {
t.Fatal("no raw data")
}
chars, err := eng.ExtractChars(0)
if err != nil {
t.Fatal(err)
}
img, err := pdfium.RenderPage(raw, 0, dpi)
if err != nil {
t.Skipf("pdfium not available: %v", err)
}
imgW, imgH := img.Bounds().Dx(), img.Bounds().Dy()
pdfiumPtW := float64(imgW) / scale
pdfiumPtH := float64(imgH) / scale
n := len(chars)
if n == 0 {
t.Fatal("no chars")
}
step := max(1, n/200)
var hit, miss, oob int
var dratios []float64
for i := 0; i < n; i += step {
c := chars[i]
px0, py0, px1, py1 := identityMap(c, pdfiumPtW, pdfiumPtH)
if px0 > px1 {
px0, px1 = px1, px0
}
if py0 > py1 {
py0, py1 = py1, py0
}
if px0 < 0 || py0 < 0 || px1 > imgW || py1 > imgH || px0 >= px1 || py0 >= py1 {
oob++
continue
}
if px1-px0 < 2 || py1-py0 < 2 {
continue
}
dark, total := 0, 0
for y := py0; y <= py1; y++ {
for x := px0; x <= px1; x++ {
r, g, b, _ := img.At(x, y).RGBA()
bright := (float64(r>>8) + float64(g>>8) + float64(b>>8)) / 3.0
if bright < 128 {
dark++
}
total++
}
}
ratio := float64(dark) / float64(total) * 100
dratios = append(dratios, ratio)
if ratio > 2.0 {
hit++
} else {
miss++
}
}
if len(dratios) == 0 {
t.Fatal("no bboxes tested")
}
sort.Float64s(dratios)
var sum float64
for _, r := range dratios {
sum += r
}
avg := sum / float64(len(dratios))
p95 := dratios[len(dratios)*95/100]
hitRate := float64(hit) / float64(len(dratios)) * 100
t.Logf("avg=%.1f%% p95=%.1f%% hit=%d/%d (%.1f%%) oob=%d",
avg, p95, hit, len(dratios), hitRate, oob)
if hitRate < 70 {
t.Errorf("hit rate %.1f%% < 70%% — bbox/render misalignment", hitRate)
}
if float64(oob)/float64(len(dratios)+oob) > 0.05 {
t.Errorf("OOB rate > 5%%")
}
})
}
}
// ── Test 8: Letter size + Rotate 90 ──────────────────────────────────────

// TestRotation_LetterSize checks a Letter-sized page with Rotate=90: both
// pdf_oxide page dimensions must be at least 600, no char may have a negative
// X0 or Top, and the largest X1 must exceed 650 (which only holds once the
// rotation has been applied). When a render is available, at least 70% of the
// sampled char boxes must contain dark pixels. If the render check cannot run,
// the reason is logged rather than silently ignored.
func TestRotation_LetterSize(t *testing.T) {
	eng, doc := openRotatePDF(t, "letter_rotate.pdf")
	w, h, err := doc.PageSize(0)
	if err != nil {
		t.Fatal(err)
	}
	t.Logf("Letter (pdf_oxide): %.1f x %.1f", w, h)
	if w < 600 || h < 600 {
		t.Errorf("unexpected Letter dimensions: %.1f x %.1f", w, h)
	}

	chars, err := eng.ExtractChars(0)
	if err != nil {
		t.Fatal(err)
	}
	if len(chars) == 0 {
		t.Fatal("no chars")
	}
	t.Logf("%d chars", len(chars))

	// After fix: Letter landscape (792×612), maxX should be > 650
	var maxX float64
	for _, c := range chars {
		if c.X1 > maxX {
			maxX = c.X1
		}
		if c.X0 < 0 || c.Top < 0 {
			t.Errorf("negative coord: %q X=%.1f Top=%.1f", c.Text, c.X0, c.Top)
		}
	}
	t.Logf("maxX=%.1f", maxX)
	if maxX <= 650 {
		t.Errorf("maxX=%.1f <= 650: rotation not applied for Letter+Rotate90", maxX)
	}

	// Render alignment check (chars from ExtractChars are post-rotation).
	// This part is best-effort: it is skipped (with a log line) when no render
	// can be produced.
	raw := eng.RawData()
	if raw == nil {
		t.Log("render alignment check skipped: engine exposes no raw PDF data")
		return
	}
	img, err := pdfium.RenderPage(raw, 0, 216)
	if err != nil {
		t.Logf("render alignment check skipped: pdfium.RenderPage: %v", err)
		return
	}
	imgW, imgH := img.Bounds().Dx(), img.Bounds().Dy()
	scale := 216.0 / 72.0
	t.Logf("pdfium render: %.0fx%.0f pts", float64(imgW)/scale, float64(imgH)/scale)
	hit, checked := bboxDarkPixelHitRate(t, chars, img, scale)
	if checked == 0 {
		t.Log("render alignment check skipped: no char box could be sampled")
		return
	}
	hitRate := float64(hit) / float64(checked) * 100
	t.Logf("Letter render alignment: %d/%d hit (%.1f%%)", hit, checked, hitRate)
	if hitRate < 70 {
		t.Errorf("Letter render hit rate %.1f%% < 70%%", hitRate)
	}
}
// ── Test 9: Rotate=180 ──────────────────────────────────────────────────

// TestRotation_Rotate180_NotYetHandled checks that chars extracted from the
// Rotate=180 fixture are reported in post-rotation space: the largest X1 must
// not exceed 600 and the smallest Top must be at least 300. When a render is
// available, at least 70% of the sampled char boxes must contain dark pixels.
//
// An empty extraction is treated as a failure, since the extent checks cannot
// say anything about zero chars. The render hit rate is only computed when at
// least one box was sampled, which avoids a 0/0 (NaN) rate that could never
// fail the threshold.
func TestRotation_Rotate180_NotYetHandled(t *testing.T) {
	eng, _ := openRotatePDF(t, "rotate_180.pdf")
	chars, err := eng.ExtractChars(0)
	if err != nil {
		t.Fatal(err)
	}
	if len(chars) == 0 {
		t.Fatal("no chars")
	}

	// After the fix, chars should be in post-rotation space (180° inverted).
	// X range: still 0–600 (portrait width unchanged).
	// Y range: chars originally near top → now near bottom.
	maxX, minTop, maxBottom := chars[0].X1, chars[0].Top, chars[0].Bottom
	for _, c := range chars[1:] {
		if c.X1 > maxX {
			maxX = c.X1
		}
		if c.Top < minTop {
			minTop = c.Top
		}
		if c.Bottom > maxBottom {
			maxBottom = c.Bottom
		}
	}
	t.Logf("Rotate=180: maxX=%.1f minTop=%.1f maxBottom=%.1f", maxX, minTop, maxBottom)

	// 180° flips content upside down: top-half chars move to bottom half.
	// For our test PDF (A4 portrait 595×842), pre-rot text was near top
	// (minTop≈28). After fix: minTop ≈ 842-382 ≈ 460 (near bottom).
	if maxX > 600 {
		t.Errorf("maxX=%.1f > 600: Rotate=180 should stay in portrait width", maxX)
	}
	if minTop < 300 {
		t.Errorf("minTop=%.1f < 300: Rotate=180 not inverted (chars still at top)", minTop)
	}

	// Render alignment check. This part is best-effort: it is skipped (with a
	// log line) when no render can be produced.
	raw := eng.RawData()
	if raw == nil {
		t.Log("render alignment check skipped: engine exposes no raw PDF data")
		return
	}
	img, err := pdfium.RenderPage(raw, 0, 216)
	if err != nil {
		t.Logf("render alignment check skipped: pdfium.RenderPage: %v", err)
		return
	}
	scale := 216.0 / 72.0
	hit, checked := bboxDarkPixelHitRate(t, chars, img, scale)
	if checked == 0 {
		t.Log("render alignment check skipped: no char box could be sampled")
		return
	}
	hitRate := float64(hit) / float64(checked) * 100
	t.Logf("Rotate=180 render alignment: %d/%d (%.1f%%)", hit, checked, hitRate)
	if hitRate < 70 {
		t.Errorf("Rotate=180 render alignment: %.1f%% < 70%%", hitRate)
	}
}
// ── Test 10: Document.PageSize ───────────────────────────────────────────

// TestRotation_DocumentPageSize checks three properties of Document.PageSize:
// the unrotated fixture reports a width within [500, 700] and a height within
// [700, 900]; the Rotate=90 fixture reports the same size to within 0.1 (the
// size is pre-rotation); and calling PageSize after Close yields an error.
//
// NOTE(review): the cleanup hook registered by openPDF will call Close on the
// rotated document a second time when the test ends. This presumably relies on
// Close being safe to call twice — confirm against pdfoxide.Document.
func TestRotation_DocumentPageSize(t *testing.T) {
	_, plain := openRotatePDF(t, "rotate_0.pdf")
	baseW, baseH, err := plain.PageSize(0)
	if err != nil {
		t.Fatal(err)
	}
	switch {
	case baseW < 500, baseW > 700, baseH < 700, baseH > 900:
		t.Errorf("rotate_0.pdf: unexpected size %.1f×%.1f", baseW, baseH)
	}

	// Rotate=90 must report same pre-rotation size
	_, rotated := openRotatePDF(t, "rotate_90.pdf")
	rotW, rotH, err := rotated.PageSize(0)
	if err != nil {
		t.Fatal(err)
	}
	deltaW := math.Abs(baseW - rotW)
	deltaH := math.Abs(baseH - rotH)
	if deltaW > 0.1 || deltaH > 0.1 {
		t.Errorf("pre-rotation size differs: %.1f×%.1f vs %.1f×%.1f", baseW, baseH, rotW, rotH)
	}

	// Closed document returns error
	rotated.Close()
	if _, _, closedErr := rotated.PageSize(0); closedErr == nil {
		t.Error("expected error from closed document")
	}
}
// ── bboxDarkPixelHitRate helper ─────────────────────────────────────────

// bboxDarkPixelHitRate samples about 50 evenly spaced chars, maps each char
// box to pixel coordinates by multiplying by scale, and counts how many boxes
// contain rendered ink in img.
//
// A sampled box is skipped (not counted in checked) when it falls outside the
// image, is empty, or is thinner than two pixels in either direction. A
// counted box is a hit when more than 2% of its pixels are dark (mean of R, G,
// B below 128).
//
// It returns the number of hits and the number of boxes actually checked.
// With no chars it returns (0, 0) without touching img; previously the step
// computation divided by zero in that case.
//
// Each box is scanned as the half-open pixel range [px0, px1) x [py0, py1).
// The bounds check admits px1 == imgW and py1 == imgH, so an inclusive scan
// would read one column/row outside the image, where the zero colour would be
// miscounted as dark.
func bboxDarkPixelHitRate(t *testing.T, chars []TextChar, img *image.RGBA, scale float64) (hit, checked int) {
	t.Helper()

	n := len(chars)
	if n == 0 {
		return 0, 0
	}
	const maxSamples = 50
	step := max(1, n/min(maxSamples, n))

	bounds := img.Bounds()
	imgW, imgH := bounds.Dx(), bounds.Dy()

	// toPixel converts a char coordinate to the nearest pixel index.
	toPixel := func(v float64) int { return int(math.Round(v * scale)) }

	for i := 0; i < n; i += step {
		c := chars[i]
		px0, py0 := toPixel(c.X0), toPixel(c.Top)
		px1, py1 := toPixel(c.X1), toPixel(c.Bottom)
		if px0 > px1 {
			px0, px1 = px1, px0
		}
		if py0 > py1 {
			py0, py1 = py1, py0
		}
		if px0 < 0 || py0 < 0 || px1 > imgW || py1 > imgH || px0 >= px1 || py0 >= py1 {
			continue
		}
		// Boxes thinner than two pixels are too small to measure.
		if px1-px0 < 2 || py1-py0 < 2 {
			continue
		}

		dark, total := 0, 0
		for y := py0; y < py1; y++ {
			for x := px0; x < px1; x++ {
				// Offset by bounds.Min so a non-zero image origin is handled.
				px := img.RGBAAt(bounds.Min.X+x, bounds.Min.Y+y)
				if (float64(px.R)+float64(px.G)+float64(px.B))/3.0 < 128 {
					dark++
				}
				total++
			}
		}
		if total > 0 && float64(dark)/float64(total)*100 > 2.0 {
			hit++
		}
		checked++
	}
	return hit, checked
}