Files
ragflow/internal/deepdoc/parser/pdf/geometry_test.go
Jack 304d9e02bb Refactor: migrate pdf_parser.py to golang (#16323)
### What problem does this PR solve?

HTTP API based on an ONNX model.
Migrate pdf_parser.py to Go.

### Type of change

- [x] Refactoring
2026-06-25 20:16:16 +08:00

186 lines
5.0 KiB
Go

package parser
import (
"strings"
"testing"
)
// TestCharWidth exercises CharWidth on three characters: a single-rune
// character spanning 8 units, a two-rune text spanning 20 units (expected
// width 10), and an empty, zero-width character.
func TestCharWidth(t *testing.T) {
	single := TextChar{X0: 50, X1: 58, Text: "A"}
	got := CharWidth(single)
	if got != 8.0 {
		t.Errorf("CharWidth = %v, want 8.0", got)
	}

	// Two runes over a 20-unit span: the expected width is 10.
	double := TextChar{X0: 50, X1: 70, Text: "hi"}
	got = CharWidth(double)
	if got != 10.0 {
		t.Errorf("CharWidth = %v, want 10.0", got)
	}

	// Empty text with a zero-length span must yield 0.
	blank := TextChar{X0: 50, X1: 50, Text: ""}
	got = CharWidth(blank)
	if got != 0 {
		t.Errorf("CharWidth empty = %v, want 0", got)
	}
}
// TestCharHeight expects CharHeight to return 12 for a character whose
// vertical extent runs from Top=200 to Bottom=212.
func TestCharHeight(t *testing.T) {
	c := TextChar{Top: 200, Bottom: 212}
	if h := CharHeight(c); h != 12.0 {
		// The message states the same expectation the condition checks (12.0).
		t.Errorf("CharHeight = %v, want 12.0", h)
	}
}
// TestXDis checks the horizontal distance between two characters that sit
// side by side with a 2-unit gap between them.
func TestXDis(t *testing.T) {
	// Candidate distances: |58-60|=2, |50-68|=18, |108-128|/2=10; the
	// smallest of them is expected.
	const want = 2.0

	left := TextChar{X0: 50, X1: 58}
	right := TextChar{X0: 60, X1: 68}

	got := XDis(left, right)
	if got != want {
		t.Errorf("XDis = %v, want %v", got, want)
	}
}
// TestYDis checks the vertical distance between two stacked characters.
func TestYDis(t *testing.T) {
	// (b.Top + b.Bottom - a.Top - a.Bottom) / 2 evaluates to 14 here.
	const want = (114.0 + 126.0 - 100.0 - 112.0) / 2

	upper := TextChar{Top: 100, Bottom: 112}
	lower := TextChar{Top: 114, Bottom: 126}

	got := YDis(upper, lower)
	if got != want {
		t.Errorf("YDis = %v, want %v", got, want)
	}
}
// TestSortXByPage checks that SortXByPage places the page-0 box first and
// that, on page 1, the two boxes sharing X0=50 come out with the smaller Top
// ("B") ahead of the larger Top ("A").
func TestSortXByPage(t *testing.T) {
	boxes := []TextBox{
		{PageNumber: 1, X0: 100, Top: 50, Text: "C"},
		{PageNumber: 1, X0: 50, Top: 100, Text: "A"},
		{PageNumber: 1, X0: 50, Top: 30, Text: "B"},
		{PageNumber: 0, X0: 0, Top: 0, Text: "D"},
	}
	result := SortXByPage(boxes, 3)

	// Stop here on a short result: the indexed checks below would otherwise
	// panic with index-out-of-range instead of reporting a readable failure.
	if len(result) != len(boxes) {
		t.Fatalf("SortXByPage returned %d boxes, want %d", len(result), len(boxes))
	}

	if result[0].Text != "D" {
		t.Errorf("first should be page 0: got %q", result[0].Text)
	}
	if result[1].Text != "B" || result[2].Text != "A" {
		t.Errorf("page 1 ordering wrong: %q, %q", result[1].Text, result[2].Text)
	}
}
// TestOverlapX covers two cases for OverlapX: a pair of boxes that share part
// of their horizontal extent, and a pair that are horizontally disjoint.
func TestOverlapX(t *testing.T) {
	// Boxes [50,200] and [100,250] intersect; the result must fall strictly
	// between 0.5 and 0.8 (the message cites ~0.667 as the target).
	first := TextBox{X0: 50, X1: 200}
	second := TextBox{X0: 100, X1: 250}
	ratio := OverlapX(&first, &second)
	if ratio <= 0.5 || ratio >= 0.8 {
		t.Errorf("OverlapX = %v, want ~0.667", ratio)
	}

	// Boxes [50,100] and [200,250] do not touch, so the result must be 0.
	near := TextBox{X0: 50, X1: 100}
	far := TextBox{X0: 200, X1: 250}
	disjoint := OverlapX(&near, &far)
	if disjoint != 0 {
		t.Errorf("non-overlapping should be 0: got %v", disjoint)
	}
}
// TestMedianCharHeight checks MedianCharHeight on a two-element input
// (heights 10 and 20, expected 15) and on nil input (expected 10).
func TestMedianCharHeight(t *testing.T) {
	sample := []TextChar{
		{Top: 0, Bottom: 10},
		{Top: 0, Bottom: 20},
	}
	got := MedianCharHeight(sample)
	if got != 15.0 {
		t.Errorf("MedianCharHeight = %v, want 15.0", got)
	}

	// With no characters the function is expected to return 10.
	fallback := MedianCharHeight(nil)
	if fallback != 10.0 {
		t.Errorf("MedianCharHeight(empty) = %v, want 10.0", fallback)
	}
}
// TestMedianHeight checks MedianHeight on three boxes of heights 10, 20 and
// 30 (expected 20) and on nil input (expected 10).
func TestMedianHeight(t *testing.T) {
	sample := []TextBox{
		{Top: 0, Bottom: 10},
		{Top: 0, Bottom: 20},
		{Top: 0, Bottom: 30},
	}
	got := MedianHeight(sample)
	if got != 20.0 {
		t.Errorf("MedianHeight = %v, want 20.0", got)
	}

	// With no boxes the function is expected to return 10.
	fallback := MedianHeight(nil)
	if fallback != 10.0 {
		t.Errorf("MedianHeight(empty) = %v, want 10.0", fallback)
	}
}
// TestNaiveVerticalMerge feeds NaiveVerticalMerge two vertically adjacent
// boxes in the same page, column and layout, and expects a single merged box
// that still contains the first box's text.
func TestNaiveVerticalMerge(t *testing.T) {
	input := []TextBox{
		{PageNumber: 0, ColID: 0, X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "第一段", LayoutNo: "1", LayoutType: "text"},
		{PageNumber: 0, ColID: 0, X0: 50, X1: 550, Top: 114, Bottom: 126, Text: "续文", LayoutNo: "1", LayoutType: "text"},
	}

	// Per-page mean height and mean width, keyed by page number.
	merged := NaiveVerticalMerge(input, map[int]float64{0: 12}, map[int]float64{0: 5}, false)

	// Small vertical gap, identical horizontal extent, same layout: the two
	// boxes should collapse into one.
	if n := len(merged); n != 1 {
		t.Errorf("expected 1 merged box, got %d: %v", n, merged)
	}

	// Nothing to inspect when the result is empty.
	if len(merged) == 0 {
		return
	}
	if !strings.Contains(merged[0].Text, "第一段") {
		t.Errorf("merged text should contain '第一段': got %q", merged[0].Text)
	}
}
// TestNaiveVerticalMergeNonMerge feeds NaiveVerticalMerge two boxes that are
// far apart vertically and expects both to be kept as separate boxes.
func TestNaiveVerticalMergeNonMerge(t *testing.T) {
	// The second box starts 188 units below the bottom of the first, a gap
	// far larger than the mean height of 12 passed in.
	input := []TextBox{
		{PageNumber: 0, ColID: 0, X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "第一段。", LayoutNo: "1", LayoutType: "text"},
		{PageNumber: 0, ColID: 0, X0: 50, X1: 550, Top: 300, Bottom: 312, Text: "第二段。", LayoutNo: "1", LayoutType: "text"},
	}

	// Per-page mean height and mean width, keyed by page number.
	out := NaiveVerticalMerge(input, map[int]float64{0: 12}, map[int]float64{0: 5}, false)

	if n := len(out); n != 2 {
		t.Errorf("expected 2 separate boxes (large gap), got %d", n)
	}
}
// TestBoxWidth expects BoxWidth to return 150 for a box spanning X0=50 to
// X1=200.
func TestBoxWidth(t *testing.T) {
	box := TextBox{X0: 50, X1: 200}
	got := BoxWidth(box)
	if got != 150 {
		t.Errorf("BoxWidth = %v, want 150", got)
	}
}
// TestBoxHeight expects BoxHeight to return 30 for a box spanning Top=100 to
// Bottom=130.
func TestBoxHeight(t *testing.T) {
	box := TextBox{Top: 100, Bottom: 130}
	got := BoxHeight(box)
	if got != 30 {
		t.Errorf("BoxHeight = %v, want 30", got)
	}
}
// TestBoxXDis expects BoxXDis to return 10 for two boxes separated by a
// 10-unit horizontal gap (the first ends at 100, the second starts at 110).
func TestBoxXDis(t *testing.T) {
	left := TextBox{X0: 50, X1: 100}
	right := TextBox{X0: 110, X1: 200}
	got := BoxXDis(left, right)
	if got != 10 {
		t.Errorf("BoxXDis = %v, want 10", got)
	}
}
// TestBoxYDis checks the vertical distance between two stacked boxes.
func TestBoxYDis(t *testing.T) {
	// (b2.Top + b2.Bottom - b1.Top - b1.Bottom) / 2 evaluates to 14 here.
	const want = (114.0 + 126.0 - 100.0 - 112.0) / 2

	upper := TextBox{Top: 100, Bottom: 112}
	lower := TextBox{Top: 114, Bottom: 126}

	got := BoxYDis(upper, lower)
	if got != want {
		t.Errorf("BoxYDis = %v, want %v", got, want)
	}
}
// TestMedianCharWidth checks MedianCharWidth on two characters (an 8-unit
// span with one rune and a 16-unit span with two runes, expected 8) and on
// nil input (expected 5).
func TestMedianCharWidth(t *testing.T) {
	sample := []TextChar{
		{X0: 0, X1: 8, Text: "A"},
		{X0: 0, X1: 16, Text: "AB"},
	}
	got := MedianCharWidth(sample)
	if got != 8 {
		t.Errorf("MedianCharWidth = %v, want 8", got)
	}

	// With no characters the function is expected to return 5.
	fallback := MedianCharWidth(nil)
	if fallback != 5 {
		t.Errorf("MedianCharWidth(empty) = %v, want 5", fallback)
	}
}