Files
ragflow/internal/deepdoc/parser/pdf/crop_test.go
Jack 304d9e02bb Refactor: migrate pdf_parser.py to golang (#16323)
### What problem does this PR solve?

Http API based on onnx model.
pdf_parser.py to golang

### Type of change

- [x] Refactoring
2026-06-25 20:16:16 +08:00

392 lines
13 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package parser
import (
"bytes"
"encoding/base64"
"image"
"image/color"
"image/png"
"math"
"testing"
)
// makeTestPageImage creates a solid-color RGBA PNG and returns the encoded bytes.
func makeTestPageImage(w, h int, c color.Color) image.Image {
img := image.NewRGBA(image.Rect(0, 0, w, h))
for y := 0; y < h; y++ {
for x := 0; x < w; x++ {
img.Set(x, y, c)
}
}
return img
}
func decodePNG(t *testing.T, data []byte) image.Image {
t.Helper()
img, err := png.Decode(bytes.NewReader(data))
if err != nil {
t.Fatalf("decode png: %v", err)
}
return img
}
func TestCropSectionImage_SinglePage(t *testing.T) {
pageImages := map[int]image.Image{
0: makeTestPageImage(200, 300, color.RGBA{255, 0, 0, 255}),
}
posTag := FormatPositionTag(0, 10, 100, 20, 150)
b64 := cropSectionImage(posTag, pageImages, 1)
if b64 == "" {
t.Fatal("expected non-empty base64 image")
}
decoded, err := base64.StdEncoding.DecodeString(b64)
if err != nil {
t.Fatalf("base64 decode: %v", err)
}
img := decodePNG(t, decoded)
bounds := img.Bounds()
if bounds.Dx() != 90 {
t.Errorf("width: got %d, want 90", bounds.Dx())
}
if bounds.Dy() != 276 {
t.Errorf("height: got %d, want 276", bounds.Dy())
}
}
func TestCropSectionImage_EmptyImages(t *testing.T) {
posTag := FormatPositionTag(0, 10, 100, 20, 150)
if b64 := cropSectionImage(posTag, nil, 1); b64 != "" {
t.Error("nil pageImages should return empty string")
}
if b64 := cropSectionImage(posTag, map[int]image.Image{}, 1); b64 != "" {
t.Error("empty pageImages should return empty string")
}
}
func TestCropSectionImage_OutOfBounds(t *testing.T) {
pageImages := map[int]image.Image{
0: makeTestPageImage(200, 300, color.RGBA{255, 0, 0, 255}),
}
posTag := FormatPositionTag(5, 10, 100, 20, 150)
if b64 := cropSectionImage(posTag, pageImages, 1); b64 != "" {
t.Error("out-of-bounds page should return empty string")
}
}
func TestCropSectionImage_InvalidTag(t *testing.T) {
pageImages := map[int]image.Image{
0: makeTestPageImage(200, 300, color.RGBA{255, 0, 0, 255}),
}
if b64 := cropSectionImage("invalid", pageImages, 1); b64 != "" {
t.Error("invalid position tag should return empty string")
}
if b64 := cropSectionImage("", pageImages, 1); b64 != "" {
t.Error("empty position tag should return empty string")
}
}
func TestCropSectionImage_ContextPadding(t *testing.T) {
pageImages := map[int]image.Image{
0: makeTestPageImage(200, 800, color.RGBA{255, 0, 0, 255}),
}
posTag := FormatPositionTag(0, 20, 120, 300, 400)
b64 := cropSectionImage(posTag, pageImages, 1)
if b64 == "" {
t.Fatal("expected non-empty result")
}
decoded, _ := base64.StdEncoding.DecodeString(b64)
img := decodePNG(t, decoded)
bounds := img.Bounds()
if bounds.Dy() != 346 {
t.Errorf("height with context: got %d, want 346", bounds.Dy())
}
}
func TestCropSectionImage_ZoomScaling(t *testing.T) {
pageImages := map[int]image.Image{
0: makeTestPageImage(400, 600, color.RGBA{255, 0, 0, 255}),
}
posTag := FormatPositionTag(0, 10, 100, 20, 150)
b64 := cropSectionImage(posTag, pageImages, 2)
if b64 == "" {
t.Fatal("expected non-empty result")
}
decoded, _ := base64.StdEncoding.DecodeString(b64)
img := decodePNG(t, decoded)
bounds := img.Bounds()
if bounds.Dx() != 180 {
t.Errorf("width at zoom 2: got %d, want 180", bounds.Dx())
}
}
func TestRotateImageCW(t *testing.T) {
// Create a 3x2 image with known colors: (0,0)=red, (1,0)=green, (2,0)=blue,
// (0,1)=white, (1,1)=black, (2,1)=gray
img := image.NewRGBA(image.Rect(0, 0, 3, 2))
r, g, b, w, bl, gr := color.RGBA{255, 0, 0, 255}, color.RGBA{0, 255, 0, 255}, color.RGBA{0, 0, 255, 255}, color.RGBA{255, 255, 255, 255}, color.RGBA{0, 0, 0, 255}, color.RGBA{128, 128, 128, 255}
img.Set(0, 0, r)
img.Set(1, 0, g)
img.Set(2, 0, b)
img.Set(0, 1, w)
img.Set(1, 1, bl)
img.Set(2, 1, gr)
t.Run("0 degrees", func(t *testing.T) {
rot := rotateImageCW(img, 0)
if rot == nil {
t.Fatal("nil result")
}
if rot.Bounds().Dx() != 3 || rot.Bounds().Dy() != 2 {
t.Errorf("size: got %dx%d, want 3x2", rot.Bounds().Dx(), rot.Bounds().Dy())
}
if !colorEqual(rot.At(0, 0), r) || !colorEqual(rot.At(2, 1), gr) {
t.Error("pixels shifted for 0° rotation")
}
})
t.Run("90 degrees", func(t *testing.T) {
rot := rotateImageCW(img, 90)
if rot == nil {
t.Fatal("nil result")
}
if rot.Bounds().Dx() != 2 || rot.Bounds().Dy() != 3 {
t.Errorf("size: got %dx%d, want 2x3", rot.Bounds().Dx(), rot.Bounds().Dy())
}
// 90° CW: (0,0) of dst = (h-1-y, x) = (1, 0) = original (0,1)=white
if !colorEqual(rot.At(0, 0), w) {
t.Error("90° CW top-left should be original (0,1)=white")
}
// 90° CW: (1, 2) of dst = (h-1-y, x) = (1-1-2=-2...) → wait
// (x=1, y=2): dst_x = h-1-y = 2-1-2 = -1? No. h=2, dst_x = 2-1-y = 1-y.
// For y=2: dst_x = 1-2 = -1. That's wrong.
// Actually 90° CW maps (orig_x, orig_y) → (h-1-orig_y, orig_x).
// So original (2,1)=gray → dst (2-1-1=0, 2) = (0,2)
if !colorEqual(rot.At(0, 2), gr) {
t.Error("90° CW: original (2,1)=gray should be at (0,2)")
}
// Original (0,0)=red → dst (2-1-0=1, 0) = (1,0)
if !colorEqual(rot.At(1, 0), r) {
t.Error("90° CW: original (0,0)=red should be at (1,0)")
}
})
t.Run("180 degrees", func(t *testing.T) {
rot := rotateImageCW(img, 180)
if rot == nil {
t.Fatal("nil result")
}
if rot.Bounds().Dx() != 3 || rot.Bounds().Dy() != 2 {
t.Errorf("size: got %dx%d, want 3x2", rot.Bounds().Dx(), rot.Bounds().Dy())
}
if !colorEqual(rot.At(0, 0), gr) {
t.Error("180°: (0,0) should be original (2,1)=gray")
}
if !colorEqual(rot.At(2, 1), r) {
t.Error("180°: (2,1) should be original (0,0)=red")
}
})
t.Run("270 degrees", func(t *testing.T) {
rot := rotateImageCW(img, 270)
if rot == nil {
t.Fatal("nil result")
}
if rot.Bounds().Dx() != 2 || rot.Bounds().Dy() != 3 {
t.Errorf("size: got %dx%d, want 2x3", rot.Bounds().Dx(), rot.Bounds().Dy())
}
})
t.Run("invalid angle", func(t *testing.T) {
if rotateImageCW(img, 45) != nil {
t.Error("expected nil for invalid angle")
}
})
}
func TestMapRotatedPointToOriginal_RoundTrip(t *testing.T) {
// Verify that forward (rotateImageCW) → inverse (mapRotatedPointToOriginal)
// recovers the original coordinates for all rotation angles.
origW, origH := 200, 100
for _, angle := range []int{0, 90, 180, 270} {
for _, ox := range []float64{0, 50, 199} {
for _, oy := range []float64{0, 30, 99} {
rx, ry := rotateCoordCW(ox, oy, origW, origH, angle)
gotX, gotY := mapRotatedPointToOriginal(rx, ry, angle, origW, origH)
if math.Abs(gotX-ox) > 0.01 || math.Abs(gotY-oy) > 0.01 {
t.Errorf("angle=%d orig(%.0f,%.0f) → rot(%.0f,%.0f) → got(%.1f,%.1f)",
angle, ox, oy, rx, ry, gotX, gotY)
}
}
}
}
}
func TestMapRotatedPointToOriginal(t *testing.T) {
// Verify alignment with Python's _map_rotated_point formulas.
// Original 200x100; rotW,rotH swap for 90/270.
tests := []struct {
angle int
rx, ry float64
origW, origH int
wantX, wantY float64
}{
{0, 50, 30, 200, 100, 50, 30},
{90, 50, 30, 200, 100, 30, 49}, // rotH=100: forward (100-1-oy,ox)
{180, 50, 30, 200, 100, 149, 69}, // (199-50, 99-30)
{270, 50, 30, 200, 100, 169, 50}, // rotW=200: inverse (199-30,50)
}
for _, tt := range tests {
gotX, gotY := mapRotatedPointToOriginal(tt.rx, tt.ry, tt.angle, tt.origW, tt.origH)
if math.Abs(gotX-tt.wantX) > 0.01 || math.Abs(gotY-tt.wantY) > 0.01 {
t.Errorf("angle=%d (%f,%f) got(%f,%f) want(%f,%f)",
tt.angle, tt.rx, tt.ry, gotX, gotY, tt.wantX, tt.wantY)
}
}
}
func colorEqual(a, b color.Color) bool {
ar, ag, ab, aa := a.RGBA()
br, bg, bb, ba := b.RGBA()
return ar == br && ag == bg && ab == bb && aa == ba
}
// TestCropSectionImage_MultiPage verifies the bottomRemaining fix for 3+ page
// positions where page heights differ. Regression test for Bug #3.
func TestCropSectionImage_MultiPage(t *testing.T) {
// Page 0: tall (2000px), Page 1: short (800px), Page 2: short (800px)
// Content spans all 3 pages. The old bug subtracted full pageH2 from
// bottomRemaining instead of the actual clamped value, causing negative
// y1 on the last page → 1×1 placeholder crop.
pageImages := map[int]image.Image{
0: makeTestPageImage(100, 2000, color.RGBA{200, 0, 0, 255}),
1: makeTestPageImage(100, 800, color.RGBA{0, 200, 0, 255}),
2: makeTestPageImage(100, 800, color.RGBA{0, 0, 200, 255}),
}
// Position spans pages 0-2, bottom reaches into page 2.
posTag := "@@1-3\t0.0\t100.0\t0.0\t500.0##"
b64 := cropSectionImage(posTag, pageImages, 1)
if b64 == "" {
t.Fatal("expected non-empty result for multi-page position")
}
// Decode and check height: content 500pt + bottom on page 1 clamped
// to 800 → page 1 crop 0-800, page 2 crop 0-200. Total with 2x6px gaps
// should be ~2000 + 200 + 12 = 2212.
decoded, _ := base64.StdEncoding.DecodeString(b64)
img := decodePNG(t, decoded)
h := img.Bounds().Dy()
// Without the fix, page 2 gets negative y1 → 1x1 output (~100 + gap).
// With fix, proper crop from all 3 pages.
if h < 500 {
t.Errorf("multi-page height too small: got %d, want >= 500 (bug: bottomRemaining over-subtraction)", h)
}
t.Logf("multi-page stitch height: %d", h)
}
// TestCropSectionImage_LargePageSpan verifies 2-page case was not broken.
func TestCropSectionImage_LargePageSpan(t *testing.T) {
pageImages := map[int]image.Image{
0: makeTestPageImage(100, 800, color.RGBA{200, 0, 0, 255}),
1: makeTestPageImage(100, 600, color.RGBA{0, 200, 0, 255}),
}
posTag := "@@1-2\t0.0\t100.0\t0.0\t900.0##"
b64 := cropSectionImage(posTag, pageImages, 1)
if b64 == "" {
t.Fatal("expected non-empty result")
}
decoded, _ := base64.StdEncoding.DecodeString(b64)
img := decodePNG(t, decoded)
if img.Bounds().Dy() < 500 {
t.Errorf("2-page height too small: %d", img.Bounds().Dy())
}
}
// TestCropSectionByDLA tests that figure sections get cropped using the
// best-overlapping DLA region instead of the text-box PositionTag.
func TestCropSectionByDLA(t *testing.T) {
// Create a test page image (216 DPI scale = 3x PDF points).
// The image is 300x450 px, which is 100x150 in PDF points at scale 3.
pageImages := map[int]image.Image{
0: makeTestPageImage(300, 450, color.RGBA{255, 0, 0, 255}),
}
// DLA regions in pixel space (216 DPI).
// Figure region at (30, 60, 270, 420) — a large area covering most of the image.
// Text region at (10, 400, 100, 440) — a small text box near the bottom.
dlaDebug := []DLAPageRegions{{
Page: 0,
Regions: []DLARegion{
{X0: 10, Y0: 400, X1: 100, Y1: 440, Label: "text"},
{X0: 30, Y0: 60, X1: 270, Y1: 420, Label: "figure"},
{X0: 5, Y0: 5, X1: 290, Y1: 55, Label: "title"},
},
}}
// Section with a text-box-sized bbox (PDF points, 72 DPI).
// In pixel space at scale 3: (60, 1200, 150, 1320) → (20, 400, 50, 440).
// This overlaps with the "figure" DLA region.
sec := Section{
Positions: []Position{{
PageNumbers: []int{0},
Left: 20, Right: 50,
Top: 400 / 3.0, Bottom: 440 / 3.0,
}},
LayoutType: "figure",
}
result := cropSectionByDLA(sec, dlaDebug, pageImages)
if result == "" {
t.Fatal("expected non-empty result for figure overlapping DLA region")
}
// Decode and verify.
decoded, _ := base64.StdEncoding.DecodeString(result)
img := decodePNG(t, decoded)
// The DLA figure region is (30,60)-(270,420) with 3% margin.
// Expected: ~(30-7.2, 60-10.8)-(270+7.2, 420+10.8) ≈ (22.8, 49.2)-(277.2, 430.8)
// width ≈ 254px, height ≈ 381px
w, h := img.Bounds().Dx(), img.Bounds().Dy()
t.Logf("cropSectionByDLA result: %dx%d", w, h)
if w < 200 || h < 300 {
t.Errorf("unexpected crop size %dx%d, want >= 200x300 (DLA region based)", w, h)
}
}
// TestCropSectionByDLA_NoMatch returns empty when no DLA region overlaps.
func TestCropSectionByDLA_NoMatch(t *testing.T) {
pageImages := map[int]image.Image{
0: makeTestPageImage(300, 450, color.RGBA{255, 0, 0, 255}),
}
dlaDebug := []DLAPageRegions{{
Page: 0,
Regions: []DLARegion{
{X0: 10, Y0: 10, X1: 100, Y1: 50, Label: "title"},
{X0: 10, Y0: 60, X1: 100, Y1: 100, Label: "text"},
},
}}
// Section whose bbox doesn't overlap any figure/equation DLA region.
sec := Section{
Positions: []Position{{
PageNumbers: []int{0},
Left: 20, Right: 50, Top: 20, Bottom: 50,
}},
LayoutType: "figure",
}
result := cropSectionByDLA(sec, dlaDebug, pageImages)
if result != "" {
t.Errorf("expected empty result when no figure/equation DLA region found, got length %d", len(result))
}
}
// TestCropSectionByDLA_EmptyInputs returns empty for edge cases.
func TestCropSectionByDLA_EmptyInputs(t *testing.T) {
// Empty positions.
if got := cropSectionByDLA(Section{}, nil, nil); got != "" {
t.Error("expected empty for empty positions")
}
// Empty page numbers.
sec := Section{Positions: []Position{{PageNumbers: nil}}}
if got := cropSectionByDLA(sec, nil, nil); got != "" {
t.Error("expected empty for empty page numbers")
}
}