mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
### What problem does this PR solve?

HTTP API based on an ONNX model; ports pdf_parser.py to Go.

### Type of change

- [x] Refactoring
555 lines
21 KiB
Go
555 lines
21 KiB
Go
package parser
|
||
|
||
import (
|
||
"sort"
|
||
"testing"
|
||
)
|
||
|
||
// ── Mock TSR data ──────────────────────────────────────────────────────
|
||
|
||
// makeMockTableCells returns a 2x3 table with header, rows, and spanning cell.
|
||
// Layout:
|
||
//
|
||
// +----------+----------+
|
||
// | col A | col B | ← column headers (Y=10..30)
|
||
// | (span) | | ← spanning cell covers both
|
||
// +----------+----------+
|
||
// | row 1A | row 1B | ← row 1 (Y=30..50)
|
||
// +----------+----------+
|
||
// | row 2A | row 2B | ← row 2 (Y=50..70)
|
||
// +----------+----------+
|
||
func makeMockTableCells() []TSRCell {
|
||
return []TSRCell{
|
||
{X0: 10, Y0: 10, X1: 50, Y1: 30, Label: "table column header"},
|
||
{X0: 50, Y0: 10, X1: 90, Y1: 30, Label: "table column header"},
|
||
{X0: 70, Y0: 30, X1: 90, Y1: 50, Label: "table row"},
|
||
{X0: 10, Y0: 30, X1: 70, Y1: 50, Label: "table row"},
|
||
{X0: 10, Y0: 50, X1: 50, Y1: 70, Label: "table row"},
|
||
{X0: 50, Y0: 50, X1: 90, Y1: 70, Label: "table row"},
|
||
{X0: 10, Y0: 10, X1: 90, Y1: 30, Label: "table spanning cell"},
|
||
}
|
||
}
|
||
|
||
func makeMockBoxes() []TextBox {
|
||
return []TextBox{
|
||
{X0: 10, X1: 90, Top: 25, Bottom: 55, LayoutType: "table", Text: "test table"},
|
||
// row at Y=30..50 overlaps ~80% → should match
|
||
}
|
||
}
|
||
|
||
func TestSortYFirstly(t *testing.T) {
|
||
t.Run("basic sort", func(t *testing.T) {
|
||
cells := []TSRCell{
|
||
{X0: 10, Y0: 50, Label: "c"},
|
||
{X0: 10, Y0: 10, Label: "a"},
|
||
{X0: 10, Y0: 30, Label: "b"},
|
||
}
|
||
sortYFirstly(cells, 5)
|
||
if cells[0].Label != "a" || cells[1].Label != "b" || cells[2].Label != "c" {
|
||
t.Errorf("sort order wrong: %v", cells)
|
||
}
|
||
})
|
||
|
||
t.Run("same Y sorts by X", func(t *testing.T) {
|
||
cells := []TSRCell{
|
||
{X0: 90, Y0: 10, Label: "right"},
|
||
{X0: 10, Y0: 10, Label: "left"},
|
||
}
|
||
sortYFirstly(cells, 5)
|
||
if cells[0].Label != "left" || cells[1].Label != "right" {
|
||
t.Errorf("same Y should sort X ascending: %v", cells)
|
||
}
|
||
})
|
||
}
|
||
|
||
// ── layoutCleanup ──────────────────────────────────────────────────────
|
||
|
||
func TestLayoutCleanup(t *testing.T) {
|
||
boxes := makeMockBoxes()
|
||
|
||
t.Run("no overlap different types", func(t *testing.T) {
|
||
cells := []TSRCell{
|
||
{X0: 10, Y0: 10, X1: 50, Y1: 30, Label: "table column header"},
|
||
{X0: 10, Y0: 10, X1: 50, Y1: 30, Label: "table row"},
|
||
}
|
||
result := layoutCleanup(cells, boxes, 2, 0.7)
|
||
if len(result) != 2 {
|
||
t.Errorf("different types should both keep: got %d", len(result))
|
||
}
|
||
})
|
||
|
||
t.Run("overlap same type keeps one", func(t *testing.T) {
|
||
cells := []TSRCell{
|
||
{X0: 10, Y0: 10, X1: 50, Y1: 30, Label: "table row"},
|
||
{X0: 12, Y0: 12, X1: 48, Y1: 28, Label: "table row"}, // mostly contained
|
||
}
|
||
result := layoutCleanup(cells, boxes, 2, 0.7)
|
||
if len(result) != 1 {
|
||
t.Errorf("overlapping same type should dedup: got %d", len(result))
|
||
}
|
||
})
|
||
|
||
t.Run("non overlapping same type keeps both", func(t *testing.T) {
|
||
cells := []TSRCell{
|
||
{X0: 10, Y0: 10, X1: 50, Y1: 30, Label: "table row"},
|
||
{X0: 200, Y0: 10, X1: 250, Y1: 30, Label: "table row"}, // far away
|
||
}
|
||
result := layoutCleanup(cells, boxes, 2, 0.7)
|
||
if len(result) != 2 {
|
||
t.Errorf("non-overlapping same type should keep both: got %d", len(result))
|
||
}
|
||
})
|
||
|
||
t.Run("empty boxes", func(t *testing.T) {
|
||
result := layoutCleanup(nil, nil, 2, 0.7)
|
||
if len(result) != 0 {
|
||
t.Errorf("empty input should return empty: got %d", len(result))
|
||
}
|
||
})
|
||
}
|
||
|
||
// ── findOverlappedWithThreshold ────────────────────────────────────────
|
||
|
||
func TestFindOverlappedWithThreshold(t *testing.T) {
|
||
cells := []TSRCell{
|
||
{X0: 10, Y0: 10, X1: 50, Y1: 30},
|
||
{X0: 50, Y0: 30, X1: 90, Y1: 50},
|
||
{X0: 10, Y0: 50, X1: 50, Y1: 70},
|
||
}
|
||
|
||
t.Run("exact match", func(t *testing.T) {
|
||
box := TextBox{X0: 10, X1: 50, Top: 10, Bottom: 30}
|
||
if idx := findOverlappedWithThreshold(box, cells, 0.3); idx != 0 {
|
||
t.Errorf("expected idx=0, got %d", idx)
|
||
}
|
||
})
|
||
|
||
t.Run("no match", func(t *testing.T) {
|
||
box := TextBox{X0: 200, X1: 250, Top: 200, Bottom: 230}
|
||
if idx := findOverlappedWithThreshold(box, cells, 0.3); idx != -1 {
|
||
t.Errorf("expected idx=-1, got %d", idx)
|
||
}
|
||
})
|
||
|
||
t.Run("zero area box", func(t *testing.T) {
|
||
box := TextBox{X0: 10, X1: 10, Top: 10, Bottom: 10}
|
||
if idx := findOverlappedWithThreshold(box, cells, 0.3); idx != -1 {
|
||
t.Errorf("zero-area box should return -1: got %d", idx)
|
||
}
|
||
})
|
||
}
|
||
|
||
// ── annotateTableBoxes ─────────────────────────────────────────────────
|
||
|
||
func TestAnnotateTableBoxes(t *testing.T) {
|
||
cells := makeMockTableCells()
|
||
boxes := makeMockBoxes()
|
||
|
||
annotateTableBoxes(boxes, groupTSRCellsToRowsLabeled(cells))
|
||
|
||
b := boxes[0]
|
||
|
||
// Check header annotation
|
||
if b.H < 0 {
|
||
t.Error("header index should be >= 0 for a table with headers")
|
||
}
|
||
|
||
// Check row annotation
|
||
if b.R == 0 {
|
||
t.Error("row index should be set")
|
||
}
|
||
|
||
// Column annotation (2 columns)
|
||
if b.C < 0 {
|
||
t.Error("col index should be >= 0")
|
||
}
|
||
}
|
||
|
||
// ── groupTSRCellsToRowsLabeled ─────────────────────────────────────────
|
||
|
||
func TestGroupTSRCellsToRowsLabeled(t *testing.T) {
|
||
cells := makeMockTableCells()
|
||
|
||
t.Run("label-based grouping", func(t *testing.T) {
|
||
rows := groupTSRCellsToRowsLabeled(cells)
|
||
if len(rows) < 2 {
|
||
t.Errorf("expected >= 2 rows, got %d", len(rows))
|
||
}
|
||
// Each row should be sorted by X
|
||
for ri, row := range rows {
|
||
if !sort.SliceIsSorted(row, func(i, j int) bool { return row[i].X0 < row[j].X0 }) {
|
||
t.Errorf("row %d not sorted by X", ri)
|
||
}
|
||
}
|
||
})
|
||
|
||
t.Run("fallback to Y-based", func(t *testing.T) {
|
||
unlabeled := []TSRCell{
|
||
{X0: 10, Y0: 10, X1: 50, Y1: 20, Label: ""},
|
||
{X0: 10, Y0: 30, X1: 50, Y1: 40, Label: ""},
|
||
}
|
||
rows := groupTSRCellsToRowsLabeled(unlabeled)
|
||
if len(rows) < 2 {
|
||
t.Errorf("fallback: expected >= 2 rows, got %d", len(rows))
|
||
}
|
||
})
|
||
|
||
t.Run("single cell", func(t *testing.T) {
|
||
cells := []TSRCell{{X0: 0, Y0: 0, X1: 10, Y1: 10, Label: "table row"}}
|
||
rows := groupTSRCellsToRowsLabeled(cells)
|
||
if len(rows) != 1 {
|
||
t.Errorf("expected 1 row, got %d", len(rows))
|
||
}
|
||
})
|
||
}
|
||
|
||
// TestAnnotateTableBoxes_PixelSpace verifies that boxes in pixel space
|
||
// (as from DLA-scaled coordinates) correctly match TSR cells. Regression test for Bug #1.
|
||
func TestAnnotateTableBoxes_PixelSpace(t *testing.T) {
|
||
boxes := []TextBox{
|
||
{X0: 150, X1: 750, Top: 300, Bottom: 420, LayoutType: "table"},
|
||
}
|
||
cells := []TSRCell{
|
||
{X0: 150, Y0: 300, X1: 750, Y1: 350, Label: "table column header"},
|
||
{X0: 150, Y0: 350, X1: 750, Y1: 380, Label: "table row"},
|
||
{X0: 150, Y0: 380, X1: 750, Y1: 420, Label: "table row"},
|
||
}
|
||
annotateTableBoxes(boxes, groupTSRCellsToRowsLabeled(cells))
|
||
if boxes[0].R < 0 {
|
||
t.Error("row index should be set (pixel-space matching)")
|
||
}
|
||
if boxes[0].H < 0 {
|
||
t.Error("header index should be set")
|
||
}
|
||
}
|
||
|
||
// TestFindHorizontallyTightestFit verifies the edge-distance matching
|
||
// (Python's minimum edge distance, not Go's old containment check).
|
||
func TestFindHorizontallyTightestFit(t *testing.T) {
|
||
clmns := []TSRCell{
|
||
{X0: 0, Y0: 0, X1: 100, Y1: 50},
|
||
{X0: 100, Y0: 0, X1: 200, Y1: 50},
|
||
}
|
||
|
||
t.Run("exact match left edge", func(t *testing.T) {
|
||
box := TextBox{X0: 100, X1: 150, Top: 0, Bottom: 50}
|
||
if idx := findHorizontallyTightestFit(box, clmns); idx != 1 {
|
||
t.Errorf("box at col 1 left edge: got idx=%d, want 1", idx)
|
||
}
|
||
})
|
||
|
||
t.Run("partial containment — still matches nearest", func(t *testing.T) {
|
||
// Box mostly in col 0 but spills into col 1. Old containment check
|
||
// would fail; distance check matches col 0 (closer edges).
|
||
box := TextBox{X0: 80, X1: 120, Top: 0, Bottom: 50}
|
||
if idx := findHorizontallyTightestFit(box, clmns); idx != 0 {
|
||
t.Errorf("spill box: got idx=%d, want 0 (nearest edges)", idx)
|
||
}
|
||
})
|
||
|
||
t.Run("empty columns", func(t *testing.T) {
|
||
if idx := findHorizontallyTightestFit(TextBox{}, nil); idx != -1 {
|
||
t.Errorf("empty: got %d, want -1", idx)
|
||
}
|
||
})
|
||
}
|
||
|
||
// TestFindOverlappedWithThreshold_BestMatch verifies the best-match
|
||
// (bidirectional overlap) replaces the old first-match behavior.
|
||
func TestFindOverlappedWithThreshold_BestMatch(t *testing.T) {
|
||
// Two cells overlap the same box. Cell 1 has MORE overlap → should win.
|
||
cells := []TSRCell{
|
||
{X0: 0, Y0: 0, X1: 50, Y1: 50}, // 30% overlap
|
||
{X0: 0, Y0: 0, X1: 100, Y1: 100}, // 100% overlap — best match
|
||
}
|
||
box := TextBox{X0: 0, X1: 100, Top: 0, Bottom: 100}
|
||
if idx := findOverlappedWithThreshold(box, cells, 0.2); idx != 1 {
|
||
t.Errorf("best-match: got idx=%d, want 1 (100%% overlap beats 30%%)", idx)
|
||
}
|
||
}
|
||
|
||
// TestFindOverlappedWithThreshold_BidirectionalGate verifies that the gate
|
||
// uses max(boxRatio, cellRatio) — matching Python's bidirectional check.
|
||
// A large box that fully contains a tiny cell should match because the
|
||
// cell-perspective ratio is 1.0 (the cell is entirely inside the box).
|
||
// Python: max(overlap/boxArea, overlap/cellArea) = max(0.02, 1.0) = 1.0 ≥ 0.3 ✓
|
||
// Old Go (box-only gate): overlap/boxArea = 0.02 > 0.3? → NO MATCH ✗
|
||
func TestFindOverlappedWithThreshold_BidirectionalGate(t *testing.T) {
|
||
// Large box fully contains a tiny cell.
|
||
box := TextBox{X0: 0, X1: 500, Top: 0, Bottom: 20} // area = 10000
|
||
cells := []TSRCell{
|
||
{X0: 0, Y0: 0, X1: 10, Y1: 20}, // area = 200, entirely inside box
|
||
}
|
||
// boxRatio = 200/10000 = 0.02, cellRatio = 200/200 = 1.0
|
||
// Python: max(0.02, 1.0) = 1.0 ≥ 0.3 → match!
|
||
idx := findOverlappedWithThreshold(box, cells, 0.3)
|
||
if idx != 0 {
|
||
t.Errorf("bidirectional gate: cell fully inside large box should match (cellRatio=1.0 ≥ 0.3). got idx=%d, want 0", idx)
|
||
}
|
||
}
|
||
|
||
// TestFindOverlappedWithThreshold_MaxScoring verifies that scoring uses
|
||
// max(boxRatio, cellRatio) — NOT sum. Python picks the cell with the
|
||
// highest max(boxRatio, cellRatio).
|
||
//
|
||
// Cell A: boxRatio=0.60, cellRatio=0.05 → max=0.60, sum=0.65
|
||
// Cell B: boxRatio=0.40, cellRatio=0.40 → max=0.40, sum=0.80
|
||
// Python (max): picks A (0.60 > 0.40). Old Go (sum): picks B (0.80 > 0.65).
|
||
func TestFindOverlappedWithThreshold_MaxScoring(t *testing.T) {
|
||
box := TextBox{X0: 0, X1: 100, Top: 0, Bottom: 100} // area = 10000
|
||
cells := []TSRCell{
|
||
// Cell A: narrow but tall (60×2000), covers 60% of box width.
|
||
// boxRatio=60*100/10000=0.60, cellRatio=60*100/(60*2000)=0.05, max=0.60
|
||
{X0: 0, Y0: 0, X1: 60, Y1: 2000},
|
||
// Cell B: moderate width (35×100), covers 35% of box. cellRatio=1.0.
|
||
// boxRatio=35*100/10000=0.35, cellRatio=35*100/(35*100)=1.0, max=1.0
|
||
// Hmm that gives cellRatio=1.0. Need to adjust for max=0.4 not 1.0.
|
||
// Actually cell B should be: overlap/boxArea=0.35, overlap/cellArea=0.4.
|
||
// overlap=3500, cellArea=3500/0.4=8750 → e.g., 35×250.
|
||
{X0: 0, Y0: 0, X1: 35, Y1: 250},
|
||
}
|
||
// Cell A: overlap=6000, boxRatio=0.60, cellRatio=6000/120000=0.05, max=0.60
|
||
// Cell B: overlap=3500, boxRatio=0.35, cellRatio=3500/8750=0.40, max=0.40
|
||
// Python picks A (0.60 > 0.40). Old Go picks B (0.75 > 0.65).
|
||
idx := findOverlappedWithThreshold(box, cells, 0.3)
|
||
if idx != 0 {
|
||
t.Errorf("max scoring: cell A (max=0.60) should beat cell B (max=0.40). got idx=%d, want 0 (Python uses max, not sum)", idx)
|
||
}
|
||
}
|
||
|
||
// TestGroupTSRCellsToRowsLabeled_FallbackY verifies the fallback
|
||
// Y-based grouping path when all cells have label "table" (real
|
||
// DeepDoc HTTP API with wrong TSR model). Must produce correct
|
||
// row×col structure even without row/column labels.
|
||
func TestGroupTSRCellsToRowsLabeled_FallbackY(t *testing.T) {
|
||
// 4 rows × 5 cols = 20 cells, all label="table".
|
||
cells := make([]TSRCell, 20)
|
||
for r := 0; r < 4; r++ {
|
||
for c := 0; c < 5; c++ {
|
||
cells[r*5+c] = TSRCell{
|
||
X0: float64(c * 100), Y0: float64(r * 30),
|
||
X1: float64(c*100 + 80), Y1: float64(r*30 + 25),
|
||
Label: "table",
|
||
}
|
||
}
|
||
}
|
||
rows := groupTSRCellsToRowsLabeled(cells)
|
||
if len(rows) != 4 {
|
||
t.Fatalf("fallback Y-grouping: expected 4 rows, got %d", len(rows))
|
||
}
|
||
for i, row := range rows {
|
||
if len(row) != 5 {
|
||
t.Errorf("row %d: expected 5 columns, got %d", i, len(row))
|
||
}
|
||
}
|
||
// Verify X-order within each row.
|
||
for i, row := range rows {
|
||
for j := 1; j < len(row); j++ {
|
||
if row[j].X0 < row[j-1].X0 {
|
||
t.Errorf("row %d: cells not sorted by X (cell %d at X=%.0f, cell %d at X=%.0f)",
|
||
i, j-1, row[j-1].X0, j, row[j].X0)
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// TestGroupTSRCellsToRowsLabeled_Irregular verifies Y-grouping
|
||
// tolerates irregular cell layouts: overlapping rows, missing
|
||
// cells, varying sizes. Real DeepDoc output is not always a
|
||
// clean 4×5 grid.
|
||
func TestGroupTSRCellsToRowsLabeled_Irregular(t *testing.T) {
|
||
// Irregular layout: row 0 has 3 cells, row 1 has 5, row 2 has 2.
|
||
// Cells within a row have slightly different Y (within threshold).
|
||
cells := []TSRCell{
|
||
// Row 0 — 3 cells at ~Y=0 (slightly staggered tops).
|
||
{X0: 0, Y0: 0, X1: 80, Y1: 25, Label: "table"},
|
||
{X0: 90, Y0: 2, X1: 170, Y1: 27, Label: "table"},
|
||
{X0: 180, Y0: 1, X1: 260, Y1: 26, Label: "table"},
|
||
// Row 1 — 5 cells at ~Y=30.
|
||
{X0: 0, Y0: 30, X1: 80, Y1: 55, Label: "table"},
|
||
{X0: 90, Y0: 31, X1: 170, Y1: 56, Label: "table"},
|
||
{X0: 180, Y0: 30, X1: 260, Y1: 55, Label: "table"},
|
||
{X0: 270, Y0: 32, X1: 350, Y1: 57, Label: "table"},
|
||
{X0: 360, Y0: 30, X1: 440, Y1: 55, Label: "table"},
|
||
// Row 2 — 2 cells at ~Y=60.
|
||
{X0: 0, Y0: 60, X1: 80, Y1: 85, Label: "table"},
|
||
{X0: 90, Y0: 61, X1: 170, Y1: 86, Label: "table"},
|
||
}
|
||
rows := groupTSRCellsToRowsLabeled(cells)
|
||
if len(rows) != 3 {
|
||
t.Fatalf("irregular: expected 3 rows, got %d", len(rows))
|
||
}
|
||
if len(rows[0]) != 5 {
|
||
t.Errorf("row 0: expected 5 cols (padded), got %d", len(rows[0]))
|
||
}
|
||
if len(rows[1]) != 5 {
|
||
t.Errorf("row 1: expected 5 cols, got %d", len(rows[1]))
|
||
}
|
||
if len(rows[2]) != 5 {
|
||
t.Errorf("row 2: expected 5 cols (padded), got %d", len(rows[2]))
|
||
}
|
||
}
|
||
|
||
// TestFillCellTextFromBoxes_PreservesTSRText verifies that
|
||
// fillCellTextFromBoxes only overwrites a cell when matching box
|
||
// text is found. When no box overlaps the cell, the cell keeps
|
||
// its existing Text (from TSR or previous steps).
|
||
func TestFillCellTextFromBoxes_PreservesTSRText(t *testing.T) {
|
||
// Cell already has text from TSR. No box overlaps it.
|
||
cells := []TSRCell{
|
||
{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "TSR-provided"},
|
||
}
|
||
boxes := []TextBox{
|
||
{X0: 500, X1: 600, Top: 500, Bottom: 550, Text: "far away"},
|
||
}
|
||
fillCellTextFromBoxes(cells, boxes)
|
||
if cells[0].Text != "TSR-provided" {
|
||
t.Errorf("TSR text overwritten: got %q, want 'TSR-provided'", cells[0].Text)
|
||
}
|
||
|
||
// Cell with TSR text, box covers >85% — should be overwritten.
|
||
cells2 := []TSRCell{
|
||
{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "TSR-provided"},
|
||
}
|
||
boxes2 := []TextBox{
|
||
{X0: 1, X1: 99, Top: 1, Bottom: 49, Text: "box-text"},
|
||
}
|
||
fillCellTextFromBoxes(cells2, boxes2)
|
||
if cells2[0].Text != "box-text" {
|
||
t.Errorf("box text should override TSR text: got %q, want 'box-text'", cells2[0].Text)
|
||
}
|
||
}
|
||
|
||
// TestFillCellTextFromBoxes_PartialOverlap verifies that when a cell
|
||
// has NO existing text, even a box with partial overlap (< 85% of box
|
||
// area inside the cell) fills the cell. Simulates real DeepDoc TSR
|
||
// where cell boundaries are approximate and box coordinates may have
|
||
// slight offsets. Regression test for qa.pdf SKIP_OCR empty cells.
|
||
func TestFillCellTextFromBoxes_PartialOverlap(t *testing.T) {
|
||
// Empty cell (no TSR text). Box only has ~55% of its area inside
|
||
// the cell (spills across the boundary). Python's 0.3 threshold
|
||
// accepts this; Go's 0.85 rejects it → empty cell.
|
||
cells := []TSRCell{
|
||
{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: ""},
|
||
}
|
||
boxes := []TextBox{
|
||
// Box: 60% inside cell, 40% outside. Overlap ratio = 60%.
|
||
{X0: 40, X1: 140, Top: 5, Bottom: 15, Text: "spill text"},
|
||
}
|
||
// Cell (0,0)-(100,50). Box (40,5)-(140,15).
|
||
// Overlap: X=(40,100) Y=(5,15) → 60×10=600.
|
||
// Box area: 100×10=1000. ratio = 600/1000 = 60%.
|
||
// Old 85% threshold → rejected. Python's 0.3 → accepted.
|
||
fillCellTextFromBoxes(cells, boxes)
|
||
if cells[0].Text != "spill text" {
|
||
t.Errorf("partial overlap (<85%%) on empty cell should still fill: got %q, want 'spill text'", cells[0].Text)
|
||
}
|
||
}
|
||
|
||
// TestGroupTSRCellsToRowsLabeled_ColumnAlignment verifies that all
|
||
// rows have the same column count after grouping, even with spanning
|
||
// cells. Python's construct_table ensures R×C matrix alignment;
|
||
// Go's Y-grouping can produce jagged rows when spanning cells make
|
||
// some rows appear shorter.
|
||
func TestGroupTSRCellsToRowsLabeled_ColumnAlignment(t *testing.T) {
|
||
// 2-row table: row 0 has a spanning cell (covers 2 columns) → 2 visible cells.
|
||
// row 1 has 3 normal cells.
|
||
// Python construct_table: both rows padded to 3 cols.
|
||
// Go Y-grouping (current): row 0 has 2 cols, row 1 has 3 → JAGGED.
|
||
cells := []TSRCell{
|
||
// Row 0 — spanning cell + 1 normal cell (= 2 cells)
|
||
{X0: 0, Y0: 0, X1: 200, Y1: 30, Label: "table spanning cell"},
|
||
{X0: 200, Y0: 0, X1: 300, Y1: 30, Label: "table row"},
|
||
// Row 1 — 3 normal cells
|
||
{X0: 0, Y0: 30, X1: 100, Y1: 60, Label: "table row"},
|
||
{X0: 100, Y0: 30, X1: 200, Y1: 60, Label: "table row"},
|
||
{X0: 200, Y0: 30, X1: 300, Y1: 60, Label: "table row"},
|
||
}
|
||
rows := groupTSRCellsToRowsLabeled(cells)
|
||
if len(rows) != 2 {
|
||
t.Fatalf("expected 2 rows, got %d", len(rows))
|
||
}
|
||
// BUG: row 0 only has 2 cells (spanning cell covers 2 columns but
|
||
// appears as 1 cell in Y-grouping). Python's construct_table pads
|
||
// to 3 columns.
|
||
if len(rows[0]) != len(rows[1]) {
|
||
t.Errorf("column alignment broken: row0=%d cols, row1=%d cols — "+
|
||
"Python construct_table ensures all rows have equal columns", len(rows[0]), len(rows[1]))
|
||
}
|
||
}
|
||
|
||
// TestAnnotateTableBoxes_RealTSRLabels verifies that annotateTableBoxes
|
||
// assigns correct R/C annotations with real TSR labels ("table" + "table column").
|
||
// Python assigns R/C by spatial overlap, independent of label.
|
||
func TestAnnotateTableBoxes_RealTSRLabels(t *testing.T) {
|
||
// Simulate a 2×3 table: 2 rows, 3 columns.
|
||
// TSR cells with label "table" (default TSR class 0) — like 公司差旅费.
|
||
cells := []TSRCell{
|
||
{X0: 0, Y0: 0, X1: 100, Y1: 30, Label: "table"},
|
||
{X0: 101, Y0: 0, X1: 200, Y1: 30, Label: "table"},
|
||
{X0: 201, Y0: 0, X1: 300, Y1: 30, Label: "table"},
|
||
{X0: 0, Y0: 35, X1: 100, Y1: 65, Label: "table"},
|
||
{X0: 101, Y0: 35, X1: 200, Y1: 65, Label: "table"},
|
||
{X0: 201, Y0: 35, X1: 300, Y1: 65, Label: "table"},
|
||
}
|
||
boxes := []TextBox{
|
||
{X0: 10, X1: 90, Top: 0, Bottom: 30, Text: "A", LayoutType: "table"},
|
||
{X0: 110, X1: 190, Top: 0, Bottom: 30, Text: "B", LayoutType: "table"},
|
||
{X0: 210, X1: 290, Top: 0, Bottom: 30, Text: "C", LayoutType: "table"},
|
||
{X0: 10, X1: 90, Top: 35, Bottom: 65, Text: "D", LayoutType: "table"},
|
||
{X0: 110, X1: 190, Top: 35, Bottom: 65, Text: "E", LayoutType: "table"},
|
||
{X0: 210, X1: 290, Top: 35, Bottom: 65, Text: "F", LayoutType: "table"},
|
||
}
|
||
annotateTableBoxes(boxes, groupTSRCellsToRowsLabeled(cells))
|
||
|
||
// Verify R (row) assignments — should be 0 for top row, 1 for bottom row.
|
||
for i, b := range boxes {
|
||
expectedR := i / 3
|
||
if b.R != expectedR {
|
||
t.Errorf("box[%d] %q: R=%d, want %d", i, b.Text, b.R, expectedR)
|
||
}
|
||
}
|
||
// Verify C (column) assignments — 0,1,2 within each row.
|
||
for i, b := range boxes {
|
||
expectedC := i % 3
|
||
if b.C != expectedC {
|
||
t.Errorf("box[%d] %q: C=%d, want %d", i, b.Text, b.C, expectedC)
|
||
}
|
||
}
|
||
}
|
||
|
||
// TestTsrBoxOverlap_ReturnsTrueWhenDisjoint verifies that tsrBoxOverlap
|
||
// returns true when the box and cell do NOT overlap (are separated in
|
||
// at least one dimension). Despite the name "Overlap", the function
|
||
// tests for disjointness. All callers must negate it to check for
|
||
// actual overlap. This test locks in the semantics so future readers
|
||
// and static analysis tools can rely on the behaviour.
|
||
func TestTsrBoxOverlap_ReturnsTrueWhenDisjoint(t *testing.T) {
|
||
box := TextBox{X0: 50, X1: 100, Top: 0, Bottom: 50}
|
||
|
||
// Separated in X (cell to the right) → disjoint → true.
|
||
if !tsrBoxOverlap(box, TSRCell{X0: 150, Y0: 0, X1: 200, Y1: 50}) {
|
||
t.Error("cell to the right (separated in X): expected true")
|
||
}
|
||
// Separated in X (cell to the left) → disjoint → true.
|
||
if !tsrBoxOverlap(box, TSRCell{X0: 0, Y0: 0, X1: 30, Y1: 50}) {
|
||
t.Error("cell to the left (separated in X): expected true")
|
||
}
|
||
// Separated in Y (cell below) → disjoint → true.
|
||
if !tsrBoxOverlap(box, TSRCell{X0: 50, Y0: 100, X1: 100, Y1: 150}) {
|
||
t.Error("cell below (separated in Y): expected true")
|
||
}
|
||
// Separated in Y (cell above) → disjoint → true.
|
||
if !tsrBoxOverlap(box, TSRCell{X0: 50, Y0: -50, X1: 100, Y1: -10}) {
|
||
t.Error("cell above (separated in Y): expected true")
|
||
}
|
||
// Fully enclosing cell → overlaps in both X and Y → NOT disjoint → false.
|
||
if tsrBoxOverlap(box, TSRCell{X0: 0, Y0: 0, X1: 200, Y1: 100}) {
|
||
t.Error("cell fully enclosing box (overlaps): expected false")
|
||
}
|
||
// Partially overlapping cell → overlaps in both dims → false.
|
||
if tsrBoxOverlap(box, TSRCell{X0: 25, Y0: 25, X1: 75, Y1: 75}) {
|
||
t.Error("cell partially overlapping: expected false")
|
||
}
|
||
}
|