mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
### What problem does this PR solve? Http API based on onnx model. pdf_parser.py to golang ### Type of change - [x] Refactoring
186 lines
5.0 KiB
Go
186 lines
5.0 KiB
Go
package parser
|
|
|
|
import (
|
|
"strings"
|
|
"testing"
|
|
)
|
|
|
|
func TestCharWidth(t *testing.T) {
|
|
c := TextChar{X0: 50, X1: 58, Text: "A"}
|
|
if w := CharWidth(c); w != 8.0 {
|
|
t.Errorf("CharWidth = %v, want 8.0", w)
|
|
}
|
|
|
|
c2 := TextChar{X0: 50, X1: 70, Text: "hi"}
|
|
if w := CharWidth(c2); w != 10.0 {
|
|
t.Errorf("CharWidth = %v, want 10.0", w)
|
|
}
|
|
|
|
c3 := TextChar{X0: 50, X1: 50, Text: ""}
|
|
if w := CharWidth(c3); w != 0 {
|
|
t.Errorf("CharWidth empty = %v, want 0", w)
|
|
}
|
|
}
|
|
|
|
func TestCharHeight(t *testing.T) {
|
|
c := TextChar{Top: 200, Bottom: 212}
|
|
if h := CharHeight(c); h != 12.0 {
|
|
t.Errorf("CharHeight = %v, want 8.0", h)
|
|
}
|
|
}
|
|
|
|
func TestXDis(t *testing.T) {
|
|
a := TextChar{X0: 50, X1: 58}
|
|
b := TextChar{X0: 60, X1: 68}
|
|
d := XDis(a, b)
|
|
expected := 2.0 // min(|58-60|=2, |50-68|=18, |108-128|/2=10)
|
|
if d != expected {
|
|
t.Errorf("XDis = %v, want %v", d, expected)
|
|
}
|
|
}
|
|
|
|
func TestYDis(t *testing.T) {
|
|
a := TextChar{Top: 100, Bottom: 112}
|
|
b := TextChar{Top: 114, Bottom: 126}
|
|
d := YDis(a, b)
|
|
expected := (114.0 + 126.0 - 100.0 - 112.0) / 2 // 14
|
|
if d != expected {
|
|
t.Errorf("YDis = %v, want %v", d, expected)
|
|
}
|
|
}
|
|
|
|
func TestSortXByPage(t *testing.T) {
|
|
boxes := []TextBox{
|
|
{PageNumber: 1, X0: 100, Top: 50, Text: "C"},
|
|
{PageNumber: 1, X0: 50, Top: 100, Text: "A"},
|
|
{PageNumber: 1, X0: 50, Top: 30, Text: "B"},
|
|
{PageNumber: 0, X0: 0, Top: 0, Text: "D"},
|
|
}
|
|
result := SortXByPage(boxes, 3)
|
|
if result[0].Text != "D" {
|
|
t.Errorf("first should be page 0: got %q", result[0].Text)
|
|
}
|
|
if result[1].Text != "B" || result[2].Text != "A" {
|
|
t.Errorf("page 1 ordering wrong: %q, %q", result[1].Text, result[2].Text)
|
|
}
|
|
}
|
|
|
|
func TestOverlapX(t *testing.T) {
|
|
b1 := TextBox{X0: 50, X1: 200}
|
|
b2 := TextBox{X0: 100, X1: 250}
|
|
overlap := OverlapX(&b1, &b2)
|
|
if overlap <= 0.5 || overlap >= 0.8 {
|
|
t.Errorf("OverlapX = %v, want ~0.667", overlap)
|
|
}
|
|
|
|
b3 := TextBox{X0: 50, X1: 100}
|
|
b4 := TextBox{X0: 200, X1: 250}
|
|
if overlap := OverlapX(&b3, &b4); overlap != 0 {
|
|
t.Errorf("non-overlapping should be 0: got %v", overlap)
|
|
}
|
|
}
|
|
|
|
func TestMedianCharHeight(t *testing.T) {
|
|
chars := []TextChar{
|
|
{Top: 0, Bottom: 10},
|
|
{Top: 0, Bottom: 20},
|
|
}
|
|
h := MedianCharHeight(chars)
|
|
if h != 15.0 {
|
|
t.Errorf("MedianCharHeight = %v, want 15.0", h)
|
|
}
|
|
if h2 := MedianCharHeight(nil); h2 != 10.0 {
|
|
t.Errorf("MedianCharHeight(empty) = %v, want 10.0", h2)
|
|
}
|
|
}
|
|
|
|
func TestMedianHeight(t *testing.T) {
|
|
boxes := []TextBox{
|
|
{Top: 0, Bottom: 10},
|
|
{Top: 0, Bottom: 20},
|
|
{Top: 0, Bottom: 30},
|
|
}
|
|
if mh := MedianHeight(boxes); mh != 20.0 {
|
|
t.Errorf("MedianHeight = %v, want 20.0", mh)
|
|
}
|
|
if mh2 := MedianHeight(nil); mh2 != 10.0 {
|
|
t.Errorf("MedianHeight(empty) = %v, want 10.0", mh2)
|
|
}
|
|
}
|
|
|
|
func TestNaiveVerticalMerge(t *testing.T) {
|
|
boxes := []TextBox{
|
|
{PageNumber: 0, ColID: 0, X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "第一段", LayoutNo: "1", LayoutType: "text"},
|
|
{PageNumber: 0, ColID: 0, X0: 50, X1: 550, Top: 114, Bottom: 126, Text: "续文", LayoutNo: "1", LayoutType: "text"},
|
|
}
|
|
meanH := map[int]float64{0: 12}
|
|
meanW := map[int]float64{0: 5}
|
|
result := NaiveVerticalMerge(boxes, meanH, meanW, false)
|
|
// These should merge: small vertical gap, overlapping horizontally, same layout
|
|
if len(result) != 1 {
|
|
t.Errorf("expected 1 merged box, got %d: %v", len(result), result)
|
|
}
|
|
if len(result) > 0 && !strings.Contains(result[0].Text, "第一段") {
|
|
t.Errorf("merged text should contain '第一段': got %q", result[0].Text)
|
|
}
|
|
}
|
|
|
|
func TestNaiveVerticalMergeNonMerge(t *testing.T) {
|
|
// Large gap — should not merge
|
|
boxes := []TextBox{
|
|
{PageNumber: 0, ColID: 0, X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "第一段。", LayoutNo: "1", LayoutType: "text"},
|
|
{PageNumber: 0, ColID: 0, X0: 50, X1: 550, Top: 300, Bottom: 312, Text: "第二段。", LayoutNo: "1", LayoutType: "text"},
|
|
}
|
|
meanH := map[int]float64{0: 12}
|
|
meanW := map[int]float64{0: 5}
|
|
result := NaiveVerticalMerge(boxes, meanH, meanW, false)
|
|
if len(result) != 2 {
|
|
t.Errorf("expected 2 separate boxes (large gap), got %d", len(result))
|
|
}
|
|
}
|
|
|
|
func TestBoxWidth(t *testing.T) {
|
|
b := TextBox{X0: 50, X1: 200}
|
|
if w := BoxWidth(b); w != 150 {
|
|
t.Errorf("BoxWidth = %v, want 150", w)
|
|
}
|
|
}
|
|
|
|
func TestBoxHeight(t *testing.T) {
|
|
b := TextBox{Top: 100, Bottom: 130}
|
|
if h := BoxHeight(b); h != 30 {
|
|
t.Errorf("BoxHeight = %v, want 30", h)
|
|
}
|
|
}
|
|
|
|
func TestBoxXDis(t *testing.T) {
|
|
b1 := TextBox{X0: 50, X1: 100}
|
|
b2 := TextBox{X0: 110, X1: 200}
|
|
if d := BoxXDis(b1, b2); d != 10 {
|
|
t.Errorf("BoxXDis = %v, want 10", d)
|
|
}
|
|
}
|
|
|
|
func TestBoxYDis(t *testing.T) {
|
|
b1 := TextBox{Top: 100, Bottom: 112}
|
|
b2 := TextBox{Top: 114, Bottom: 126}
|
|
d := BoxYDis(b1, b2)
|
|
expected := (114.0 + 126.0 - 100.0 - 112.0) / 2
|
|
if d != expected {
|
|
t.Errorf("BoxYDis = %v, want %v", d, expected)
|
|
}
|
|
}
|
|
|
|
func TestMedianCharWidth(t *testing.T) {
|
|
chars := []TextChar{
|
|
{X0: 0, X1: 8, Text: "A"},
|
|
{X0: 0, X1: 16, Text: "AB"},
|
|
}
|
|
if w := MedianCharWidth(chars); w != 8 {
|
|
t.Errorf("MedianCharWidth = %v, want 8", w)
|
|
}
|
|
if w := MedianCharWidth(nil); w != 5 {
|
|
t.Errorf("MedianCharWidth(empty) = %v, want 5", w)
|
|
}
|
|
}
|