Files
ragflow/internal/deepdoc/parser/pdf/util/geometry_test.go
Jack 98323e7910 Refactor: oss parser go refactor (#16391)
### What problem does this PR solve?

Package refactor and PDF post process.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Claude <noreply@anthropic.com>
2026-06-29 18:46:41 +08:00

216 lines
5.4 KiB
Go

package util
import (
pdf "ragflow/internal/deepdoc/parser/pdf/type"
"testing"
)
// TestCharWidth exercises CharWidth on three inputs: a one-character
// TextChar, a two-character TextChar, and an empty TextChar whose X0 and X1
// coincide.
func TestCharWidth(t *testing.T) {
	// One character over a horizontal span of 8 is expected to yield 8.
	single := pdf.TextChar{X0: 50, X1: 58, Text: "A"}
	got := CharWidth(single)
	if got != 8.0 {
		t.Errorf("CharWidth = %v, want 8.0", got)
	}

	// Two characters over a horizontal span of 20 are expected to yield 10.
	pair := pdf.TextChar{X0: 50, X1: 70, Text: "hi"}
	got = CharWidth(pair)
	if got != 10.0 {
		t.Errorf("CharWidth = %v, want 10.0", got)
	}

	// Empty text with a zero-length span is expected to yield 0.
	blank := pdf.TextChar{X0: 50, X1: 50, Text: ""}
	got = CharWidth(blank)
	if got != 0 {
		t.Errorf("CharWidth empty = %v, want 0", got)
	}
}
// TestCharHeight verifies that CharHeight reports 12 for a TextChar whose
// Top is 200 and Bottom is 212.
func TestCharHeight(t *testing.T) {
	c := pdf.TextChar{Top: 200, Bottom: 212}
	// The failure message states the same expectation the comparison uses,
	// so a failing run does not point the reader at the wrong target value.
	if h := CharHeight(c); h != 12.0 {
		t.Errorf("CharHeight = %v, want 12.0", h)
	}
}
// TestXDis checks XDis for two horizontally adjacent characters separated
// by a gap of 2.
func TestXDis(t *testing.T) {
	left := pdf.TextChar{X0: 50, X1: 58}
	right := pdf.TextChar{X0: 60, X1: 68}

	// Expected value per the candidates listed here:
	// min(|58-60|=2, |50-68|=18, |108-128|/2=10)
	want := 2.0

	got := XDis(left, right)
	if got != want {
		t.Errorf("XDis = %v, want %v", got, want)
	}
}
// TestYDis checks YDis for two characters stacked vertically: the first
// spans Top 100 to Bottom 112, the second Top 114 to Bottom 126.
func TestYDis(t *testing.T) {
	upper := pdf.TextChar{Top: 100, Bottom: 112}
	lower := pdf.TextChar{Top: 114, Bottom: 126}

	// (114 + 126 - 100 - 112) / 2
	want := 14.0

	got := YDis(upper, lower)
	if got != want {
		t.Errorf("YDis = %v, want %v", got, want)
	}
}
// TestSortXByPage feeds SortXByPage four boxes (three on page 1, one on
// page 0) and checks the resulting order: the page-0 box first, then the
// page-1 boxes "B", "A", "C".
//
// NOTE(review): the second argument (3) is presumably an X tolerance used
// while ordering — confirm against SortXByPage.
func TestSortXByPage(t *testing.T) {
	boxes := []pdf.TextBox{
		{PageNumber: 1, X0: 100, Top: 50, Text: "C"},
		{PageNumber: 1, X0: 50, Top: 100, Text: "A"},
		{PageNumber: 1, X0: 50, Top: 30, Text: "B"},
		{PageNumber: 0, X0: 0, Top: 0, Text: "D"},
	}
	result := SortXByPage(boxes, 3)

	// Guard the index accesses below: a short result should be reported as
	// a test failure, not as an index-out-of-range panic.
	if len(result) != len(boxes) {
		t.Fatalf("SortXByPage returned %d boxes, want %d", len(result), len(boxes))
	}

	if result[0].Text != "D" {
		t.Errorf("first should be page 0: got %q", result[0].Text)
	}
	if result[1].Text != "B" || result[2].Text != "A" {
		t.Errorf("page 1 ordering wrong: %q, %q", result[1].Text, result[2].Text)
	}
	// With D, B, A occupying the first three slots, the only remaining
	// input box is "C"; assert it so the full ordering is pinned.
	if result[3].Text != "C" {
		t.Errorf("last should be %q: got %q", "C", result[3].Text)
	}
}
// TestOverlapX checks OverlapX for a partially overlapping pair of boxes
// (accepted when the result lies strictly between 0.5 and 0.8) and for a
// disjoint pair (expected to be exactly 0).
func TestOverlapX(t *testing.T) {
	// [50, 200] and [100, 250] share the horizontal range [100, 200].
	left := &pdf.TextBox{X0: 50, X1: 200}
	right := &pdf.TextBox{X0: 100, X1: 250}
	ratio := OverlapX(left, right)
	if ratio >= 0.8 || ratio <= 0.5 {
		t.Errorf("OverlapX = %v, want ~0.667", ratio)
	}

	// [50, 100] and [200, 250] do not intersect horizontally.
	near := &pdf.TextBox{X0: 50, X1: 100}
	far := &pdf.TextBox{X0: 200, X1: 250}
	disjoint := OverlapX(near, far)
	if disjoint != 0 {
		t.Errorf("non-overlapping should be 0: got %v", disjoint)
	}
}
// TestMedianCharHeight checks MedianCharHeight on two characters of height
// 10 and 20 (expected 15) and on a nil slice (expected 10).
func TestMedianCharHeight(t *testing.T) {
	glyphs := []pdf.TextChar{
		{Top: 0, Bottom: 10},
		{Top: 0, Bottom: 20},
	}
	got := MedianCharHeight(glyphs)
	if got != 15.0 {
		t.Errorf("MedianCharHeight = %v, want 15.0", got)
	}

	// With no input the test expects the value 10.
	fallback := MedianCharHeight(nil)
	if fallback != 10.0 {
		t.Errorf("MedianCharHeight(empty) = %v, want 10.0", fallback)
	}
}
// TestMedianHeight checks MedianHeight on three boxes of height 10, 20 and
// 30 (expected 20) and on a nil slice (expected 10).
func TestMedianHeight(t *testing.T) {
	input := []pdf.TextBox{
		{Top: 0, Bottom: 10},
		{Top: 0, Bottom: 20},
		{Top: 0, Bottom: 30},
	}
	got := MedianHeight(input)
	if got != 20.0 {
		t.Errorf("MedianHeight = %v, want 20.0", got)
	}

	// With no input the test expects the value 10.
	fallback := MedianHeight(nil)
	if fallback != 10.0 {
		t.Errorf("MedianHeight(empty) = %v, want 10.0", fallback)
	}
}
// TestBoxWidth checks that BoxWidth reports 150 for a box with X0 50 and
// X1 200.
func TestBoxWidth(t *testing.T) {
	box := pdf.TextBox{X0: 50, X1: 200}
	got := BoxWidth(box)
	if got != 150 {
		t.Errorf("BoxWidth = %v, want 150", got)
	}
}
// TestBoxHeight checks that BoxHeight reports 30 for a box with Top 100
// and Bottom 130.
func TestBoxHeight(t *testing.T) {
	box := pdf.TextBox{Top: 100, Bottom: 130}
	got := BoxHeight(box)
	if got != 30 {
		t.Errorf("BoxHeight = %v, want 30", got)
	}
}
// TestBoxXDis checks that BoxXDis reports 10 for two boxes where the first
// ends at X1 100 and the second starts at X0 110.
func TestBoxXDis(t *testing.T) {
	left := pdf.TextBox{X0: 50, X1: 100}
	right := pdf.TextBox{X0: 110, X1: 200}
	got := BoxXDis(left, right)
	if got != 10 {
		t.Errorf("BoxXDis = %v, want 10", got)
	}
}
// TestBoxYDis checks BoxYDis for two boxes stacked vertically: the first
// spans Top 100 to Bottom 112, the second Top 114 to Bottom 126.
func TestBoxYDis(t *testing.T) {
	upper := pdf.TextBox{Top: 100, Bottom: 112}
	lower := pdf.TextBox{Top: 114, Bottom: 126}

	// (114 + 126 - 100 - 112) / 2
	want := 14.0

	got := BoxYDis(upper, lower)
	if got != want {
		t.Errorf("BoxYDis = %v, want %v", got, want)
	}
}
// TestMedianCharWidth checks MedianCharWidth on two TextChars (a span of 8
// holding one character and a span of 16 holding two; expected 8) and on a
// nil slice (expected 5).
func TestMedianCharWidth(t *testing.T) {
	glyphs := []pdf.TextChar{
		{X0: 0, X1: 8, Text: "A"},
		{X0: 0, X1: 16, Text: "AB"},
	}
	got := MedianCharWidth(glyphs)
	if got != 8 {
		t.Errorf("MedianCharWidth = %v, want 8", got)
	}

	// With no input the test expects the value 5.
	fallback := MedianCharWidth(nil)
	if fallback != 5 {
		t.Errorf("MedianCharWidth(empty) = %v, want 5", fallback)
	}
}
// textBox is a minimal test-only implementation of Rectangular: it stores
// four coordinates and hands them back verbatim from Bounds.
type textBox struct {
	x0, y0 float64
	x1, y1 float64
}

// Bounds returns the stored coordinates in the order x0, y0, x1, y1.
func (tb textBox) Bounds() (float64, float64, float64, float64) {
	return tb.x0, tb.y0, tb.x1, tb.y1
}
// TestArea runs Area over a table of rectangles: a regular one, two with a
// zero-length side, and one whose second corner precedes its first. All but
// the regular rectangle are expected to have area 0.
func TestArea(t *testing.T) {
	type areaCase struct {
		name string
		r    pdf.Rectangular
		want float64
	}
	cases := []areaCase{
		{name: "normal", r: textBox{0, 0, 10, 20}, want: 200},
		{name: "zero width", r: textBox{5, 0, 5, 10}, want: 0},
		{name: "zero height", r: textBox{0, 5, 10, 5}, want: 0},
		{name: "degenerate", r: textBox{10, 10, 5, 5}, want: 0},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := Area(tc.r)
			if got != tc.want {
				t.Errorf("Area = %v, want %v", got, tc.want)
			}
		})
	}
}
// TestOverlapInter runs OverlapInter over a table of rectangle pairs:
// identical, partially overlapping, fully disjoint, and sharing only an
// edge. The last two are expected to yield 0.
func TestOverlapInter(t *testing.T) {
	type interCase struct {
		name string
		a, b pdf.Rectangular
		want float64
	}
	cases := []interCase{
		{name: "full overlap", a: textBox{0, 0, 10, 10}, b: textBox{0, 0, 10, 10}, want: 100},
		{name: "partial", a: textBox{0, 0, 10, 10}, b: textBox{5, 5, 15, 15}, want: 25},
		{name: "no overlap", a: textBox{0, 0, 10, 10}, b: textBox{20, 20, 30, 30}, want: 0},
		{name: "edge touching", a: textBox{0, 0, 10, 10}, b: textBox{10, 0, 20, 10}, want: 0},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := OverlapInter(tc.a, tc.b)
			if got != tc.want {
				t.Errorf("OverlapInter = %v, want %v", got, tc.want)
			}
		})
	}
}
// TestOverlapRatioA checks OverlapRatioA for a pair of rectangles that
// overlap on a quarter of the first one's area (expected 0.25) and for a
// disjoint pair (expected 0).
func TestOverlapRatioA(t *testing.T) {
	// The first rectangle has area 100; the region shared with the second
	// is 5 x 5 = 25.
	first := textBox{0, 0, 10, 10}
	second := textBox{5, 5, 15, 15}
	ratio := OverlapRatioA(first, second)
	if ratio != 0.25 {
		t.Errorf("OverlapRatioA = %v, want 0.25", ratio)
	}

	// Rectangles with no shared region.
	near := textBox{0, 0, 10, 10}
	far := textBox{20, 20, 30, 30}
	disjoint := OverlapRatioA(near, far)
	if disjoint != 0 {
		t.Errorf("OverlapRatioA no overlap = %v, want 0", disjoint)
	}
}