Files
ragflow/internal/deepdoc/parser/pdf/table_test.go
Jack 304d9e02bb Refactor: migrate pdf_parser.py to golang (#16323)
### What problem does this PR solve?

HTTP API based on the ONNX model.
Migrate pdf_parser.py to Go.

### Type of change

- [x] Refactoring
2026-06-25 20:16:16 +08:00

1863 lines
69 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package parser
import (
"context"
"image"
"strings"
"testing"
)
// ---- groupTSRCellsToRows ----

// TestGroupTSRCellsToRows_Empty checks that both a nil slice and an empty,
// non-nil slice of cells produce a nil result.
func TestGroupTSRCellsToRows_Empty(t *testing.T) {
	fromNil := groupTSRCellsToRows(nil)
	if fromNil != nil {
		t.Errorf("nil input: expected nil, got %d rows", len(fromNil))
	}

	fromEmpty := groupTSRCellsToRows([]TSRCell{})
	if fromEmpty != nil {
		t.Errorf("empty input: expected nil, got %d rows", len(fromEmpty))
	}
}
// TestGroupTSRCellsToRows_SingleCell checks that one cell comes back as a
// single row holding exactly that cell.
func TestGroupTSRCellsToRows_SingleCell(t *testing.T) {
	input := []TSRCell{{X0: 0, Y0: 0, X1: 10, Y1: 10, Text: "A"}}
	rows := groupTSRCellsToRows(input)

	// Short-circuit evaluation keeps the index expressions safe.
	asExpected := len(rows) == 1 && len(rows[0]) == 1 && rows[0][0].Text == "A"
	if !asExpected {
		t.Errorf("single cell: expected [[A]], got %v", rows)
	}
}
// TestGroupTSRCellsToRows_TwoRows feeds a 2x2 grid whose rows are clearly
// separated vertically and expects two rows, each ordered left to right.
func TestGroupTSRCellsToRows_TwoRows(t *testing.T) {
	cells := []TSRCell{
		{X0: 0, Y0: 0, X1: 10, Y1: 10, Text: "A1"},
		{X0: 20, Y0: 0, X1: 30, Y1: 10, Text: "B1"},
		{X0: 0, Y0: 30, X1: 10, Y1: 40, Text: "A2"},
		{X0: 20, Y0: 30, X1: 30, Y1: 40, Text: "B2"},
	}
	rows := groupTSRCellsToRows(cells)
	if len(rows) != 2 {
		t.Fatalf("expected 2 rows, got %d", len(rows))
	}
	// Fatal rather than Error: the order checks below index two cells per
	// row and would panic (aborting the whole test binary) on a short row.
	if len(rows[0]) != 2 || len(rows[1]) != 2 {
		t.Fatalf("expected 2 cells per row, got %d/%d", len(rows[0]), len(rows[1]))
	}
	// Row 0 sorted by X0.
	if rows[0][0].Text != "A1" || rows[0][1].Text != "B1" {
		t.Errorf("row 0 order wrong: %v", tsrCellTexts(rows[0]))
	}
	// Row 1 sorted by X0.
	if rows[1][0].Text != "A2" || rows[1][1].Text != "B2" {
		t.Errorf("row 1 order wrong: %v", tsrCellTexts(rows[1]))
	}
}
// TestGroupTSRCellsToRows_CloseRows checks that two vertically adjacent cells
// separated by only a 1-unit gap are still reported as two rows.
func TestGroupTSRCellsToRows_CloseRows(t *testing.T) {
	// Both cells are 8 units tall; Row2 starts at Y0=9, one unit below Row1's
	// bottom edge (Y1=8).
	// NOTE(review): the original note says the grouping threshold is half the
	// median height (4) and that it is compared with the Y0 distance
	// (9 - 0 = 9), not the edge gap (1) — confirm against groupTSRCellsToRows.
	stacked := []TSRCell{
		{X0: 0, Y0: 0, X1: 10, Y1: 8, Text: "Row1"},
		{X0: 0, Y0: 9, X1: 10, Y1: 17, Text: "Row2"},
	}
	if rows := groupTSRCellsToRows(stacked); len(rows) != 2 {
		t.Errorf("close rows: expected 2, got %d", len(rows))
	}
}
// TestGroupTSRCellsToRows_VaryingHeights mixes one short cell with two tall
// cells that share the same Y range and expects the tall pair to be grouped.
func TestGroupTSRCellsToRows_VaryingHeights(t *testing.T) {
	// Heights are 5, 20, 20, so the median height is 20.
	// NOTE(review): the original note gives the row threshold as 10 (half the
	// median) — confirm against groupTSRCellsToRows. B and C start at Y0=50,
	// far below A (Y0=0, Y1=5), and have identical Y0, so the expected
	// grouping is [A] and [B, C].
	input := []TSRCell{
		{X0: 0, Y0: 0, X1: 10, Y1: 5, Text: "A"},   // height 5
		{X0: 0, Y0: 50, X1: 10, Y1: 70, Text: "B"}, // height 20
		{X0: 0, Y0: 50, X1: 10, Y1: 70, Text: "C"}, // height 20, same row as B
	}
	grouped := groupTSRCellsToRows(input)
	if len(grouped) != 2 {
		t.Fatalf("varying heights: expected 2 rows, got %d", len(grouped))
	}

	first, second := grouped[0], grouped[1]
	if len(first) != 1 || first[0].Text != "A" {
		t.Errorf("row 0: expected [A], got %v", tsrCellTexts(first))
	}
	if len(second) != 2 {
		t.Errorf("row 1: expected 2 cells, got %v", tsrCellTexts(second))
	}
}
// tsrCellTexts returns the Text field of every cell, in input order. The
// result is always a non-nil slice, even for empty input.
func tsrCellTexts(cells []TSRCell) []string {
	texts := make([]string, 0, len(cells))
	for idx := range cells {
		texts = append(texts, cells[idx].Text)
	}
	return texts
}
// ---- boxOverlapsCell ----

// TestBoxOverlapsCell_FullOverlap checks that boxes lying entirely inside the
// cell are reported as overlapping, whether they fill the cell or only part
// of it.
func TestBoxOverlapsCell_FullOverlap(t *testing.T) {
	cell := TSRCell{X0: 0, Y0: 0, X1: 100, Y1: 50}
	cases := []struct {
		box     TextBox
		failure string
	}{
		// Box coincides with the cell.
		{
			TextBox{X0: 0, X1: 100, Top: 0, Bottom: 50, Text: "hello"},
			"full overlap should return true",
		},
		// Box is smaller than the cell but 100% of its own area is inside.
		{
			TextBox{X0: 10, X1: 90, Top: 10, Bottom: 40, Text: "partial"},
			"box entirely inside cell (100% of box) should match",
		},
	}
	for _, tc := range cases {
		if !boxOverlapsCell(cell, tc.box) {
			t.Error(tc.failure)
		}
	}
}
// TestBoxOverlapsCell_NoOverlap checks that a box horizontally disjoint from
// the cell is not reported as overlapping.
func TestBoxOverlapsCell_NoOverlap(t *testing.T) {
	var (
		cell    = TSRCell{X0: 0, Y0: 0, X1: 100, Y1: 50}
		faraway = TextBox{X0: 200, X1: 300, Top: 10, Bottom: 40, Text: "away"}
	)
	matched := boxOverlapsCell(cell, faraway)
	if matched {
		t.Error("no X overlap should return false")
	}
}
// TestBoxOverlapsCell_PartialOverlap contrasts a small box fully inside the
// cell (match) with a box that mostly spills outside it (no match).
func TestBoxOverlapsCell_PartialOverlap(t *testing.T) {
	cell := TSRCell{X0: 0, Y0: 0, X1: 100, Y1: 50}

	// Per the original note the check is box→cell: at least 85% of the box's
	// own area must lie inside the cell. This box is 100% inside.
	inside := TextBox{X0: 0, X1: 30, Top: 0, Bottom: 25, Text: "small"}
	if matched := boxOverlapsCell(cell, inside); !matched {
		t.Error("box entirely inside cell should match")
	}

	// Only X 80..100 of this 100-wide box is inside the cell (20%).
	spilling := TextBox{X0: 80, X1: 180, Top: 0, Bottom: 25, Text: "spill"}
	if matched := boxOverlapsCell(cell, spilling); matched {
		t.Error("box straddling boundary (<85% inside) should NOT match")
	}
}
// TestBoxOverlapsCell_ZeroArea checks that a degenerate cell (zero width)
// never matches a box.
func TestBoxOverlapsCell_ZeroArea(t *testing.T) {
	degenerate := TSRCell{X0: 0, Y0: 0, X1: 0, Y1: 50} // X0 == X1 → zero width
	probe := TextBox{X0: 0, X1: 10, Top: 0, Bottom: 10, Text: "x"}

	matched := boxOverlapsCell(degenerate, probe)
	if matched {
		t.Error("zero cell area should return false")
	}
}
// ---- fillCellTextFromBoxes ----

// TestFillCellTextFromBoxes_Simple pairs two side-by-side cells with two
// boxes that coincide with them exactly and expects each cell to receive the
// text of its own box.
func TestFillCellTextFromBoxes_Simple(t *testing.T) {
	grid := []TSRCell{
		{X0: 0, Y0: 0, X1: 100, Y1: 50},
		{X0: 100, Y0: 0, X1: 200, Y1: 50},
	}
	words := []TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 50, Text: "cell1"},
		{X0: 100, X1: 200, Top: 0, Bottom: 50, Text: "cell2"},
	}

	// The cells are filled in place.
	fillCellTextFromBoxes(grid, words)

	if left := grid[0].Text; left != "cell1" {
		t.Errorf("cell 0: got %q, want 'cell1'", left)
	}
	if right := grid[1].Text; right != "cell2" {
		t.Errorf("cell 1: got %q, want 'cell2'", right)
	}
}
// TestFillCellTextFromBoxes_MultipleBoxesPerCell places two boxes inside one
// cell and expects the cell text to contain the text of both.
func TestFillCellTextFromBoxes_MultipleBoxesPerCell(t *testing.T) {
	// Each box lies entirely inside the cell and also covers most of it
	// (95x47 of 100x50), so both should match whichever direction the
	// overlap ratio is measured in.
	cells := []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 50}}
	boxes := []TextBox{
		{X0: 0, X1: 95, Top: 0, Bottom: 47, Text: "part1"},
		{X0: 5, X1: 100, Top: 3, Bottom: 50, Text: "part2"},
	}
	fillCellTextFromBoxes(cells, boxes)

	got := cells[0].Text
	if got == "" {
		t.Fatal("expected non-empty cell text")
	}
	// A non-empty check alone would pass if only one box were picked up;
	// require both fragments without pinning the separator or the order.
	for _, fragment := range []string{"part1", "part2"} {
		if !strings.Contains(got, fragment) {
			t.Errorf("cell text %q is missing %q", got, fragment)
		}
	}
}
func TestFillCellTextFromBoxes_EmptyBoxText(t *testing.T) {
cells := []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 50}}
boxes := []TextBox{
{X0: 5, X1: 95, Top: 5, Bottom: 45, Text: " "},
}
fillCellTextFromBoxes(cells, boxes)
if cells[0].Text != "" {
t.Errorf("empty box text: got %q, want empty", cells[0].Text)
}
}
func TestFillCellTextFromBoxes_NoMatchingBox(t *testing.T) {
cells := []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 50}}
boxes := []TextBox{
{X0: 500, X1: 600, Top: 500, Bottom: 550, Text: "far away"},
}
fillCellTextFromBoxes(cells, boxes)
if cells[0].Text != "" {
t.Errorf("no match: got %q, want empty", cells[0].Text)
}
}
// ---- regionOverlapsBox ----

// TestRegionOverlapsBox_StrongOverlap checks that a region covering a large
// share of the box is reported as overlapping.
func TestRegionOverlapsBox_StrongOverlap(t *testing.T) {
	const scale = 3.0
	// Region is in DLA coordinates (216 DPI per the original note); the box
	// is in unscaled space.
	dla := DLARegion{X0: 0, Y0: 0, X1: 216, Y1: 108}
	text := TextBox{X0: 0, X1: 100, Top: 0, Bottom: 50}

	if matched := regionOverlapsBox(dla, text, scale); !matched {
		t.Error("full overlap should match")
	}
}
// TestRegionOverlapsBox_NoOverlap checks that a box far from the region is
// not reported as overlapping.
func TestRegionOverlapsBox_NoOverlap(t *testing.T) {
	const scale = 3.0
	dla := DLARegion{X0: 0, Y0: 0, X1: 216, Y1: 108}
	distant := TextBox{X0: 500, X1: 600, Top: 500, Bottom: 550}

	if matched := regionOverlapsBox(dla, distant, scale); matched {
		t.Error("no overlap should return false")
	}
}
func TestRegionOverlapsBox_WeakOverlap(t *testing.T) {
// Overlap at 30% → below 40% threshold → false.
region := DLARegion{X0: 0, Y0: 0, X1: 90, Y1: 90} // 30x30 at scale 3
box := TextBox{X0: 0, X1: 100, Top: 0, Bottom: 100} // overlap = 30*30/10000 = 9%? No: 30x30=900 / 10000 = 9%
if regionOverlapsBox(region, box, 3.0) {
t.Error("9% overlap should return false")
}
// Overlap ≥ 40% → should match (Python thr=0.4).
// box 100x100=10000 area; region 100x40=4000 → exactly 40%.
region2 := DLARegion{X0: 0, Y0: 0, X1: 300, Y1: 120, Label: "table"} // 100x40 at scale 3
if !regionOverlapsBox(region2, box, 3.0) {
t.Error("40% overlap should match (>= 0.4)")
}
// Region that covers most of the box → should match
region3 := DLARegion{X0: 0, Y0: 0, X1: 270, Y1: 270} // 90x90 at scale 3
if !regionOverlapsBox(region3, box, 3.0) {
t.Error("81% overlap should match")
}
}
// TestRegionOverlapsBox_ThresholdAt040 pins the boundary of the overlap
// threshold: exactly 40% matches, 39% does not.
func TestRegionOverlapsBox_ThresholdAt040(t *testing.T) {
	const scale = 3.0
	// 100x100 box → area 10000, so 40% corresponds to an overlap area of 4000.
	box := TextBox{X0: 0, X1: 100, Top: 0, Bottom: 100}

	// 100x40 at scale 3 → area 4000 → exactly 40%.
	atThreshold := DLARegion{X0: 0, Y0: 0, X1: 300, Y1: 120, Label: "table"}
	if !regionOverlapsBox(atThreshold, box, scale) {
		t.Error("exact 40% overlap should match (>= 0.4)")
	}

	// 100x39 at scale 3 → area 3900 → 39%.
	justBelow := DLARegion{X0: 0, Y0: 0, X1: 300, Y1: 117, Label: "table"}
	if regionOverlapsBox(justBelow, box, scale) {
		t.Error("39% overlap should NOT match")
	}
}
// ---- annotateBoxLayouts ----

// TestAnnotateBoxLayouts_SetsLabel checks that each box receives the label of
// the DLA region covering it.
func TestAnnotateBoxLayouts_SetsLabel(t *testing.T) {
	input := []TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 20},
		{X0: 0, X1: 100, Top: 30, Bottom: 50},
	}
	// Region coordinates are 3x the box coordinates (scale 3).
	layout := []DLARegion{
		{X0: 0, Y0: 0, X1: 300, Y1: 60, Label: "title"},  // covers box 0
		{X0: 0, Y0: 90, X1: 300, Y1: 150, Label: "text"}, // covers box 1
	}

	annotated := annotateBoxLayouts(input, layout, 3.0, 0)

	if got := annotated[0].LayoutType; got != "title" {
		t.Errorf("box 0: got %q, want 'title'", got)
	}
	if got := annotated[1].LayoutType; got != "text" {
		t.Errorf("box 1: got %q, want 'text'", got)
	}
}
// TestAnnotateBoxLayouts_NoMatch checks that a box with no overlapping region
// keeps an empty LayoutType.
func TestAnnotateBoxLayouts_NoMatch(t *testing.T) {
	input := []TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 20}}
	// The only region sits far away from the box, so nothing overlaps.
	layout := []DLARegion{
		{X0: 900, Y0: 900, X1: 1000, Y1: 1000, Label: "far"},
	}

	annotated := annotateBoxLayouts(input, layout, 3.0, 0)

	if got := annotated[0].LayoutType; got != "" {
		t.Errorf("no match: expected empty, got %q", got)
	}
}
// TestAnnotateBoxLayouts_EmptyRegions runs the annotation first with a nil
// region slice and then with an empty one, and expects the box to stay
// unlabelled.
func TestAnnotateBoxLayouts_EmptyRegions(t *testing.T) {
	boxes := []TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 20}}

	// Same call order as before: nil slice, then empty non-nil slice.
	for _, regions := range [][]DLARegion{nil, {}} {
		boxes = annotateBoxLayouts(boxes, regions, 3.0, 0)
	}

	if got := boxes[0].LayoutType; got != "" {
		t.Errorf("empty regions: got %q, want empty", got)
	}
}
// TestAnnotateBoxLayouts_PriorityOverMaxArea checks that label priority beats
// overlap size: a "table" region that meets the threshold wins over a "text"
// region with larger overlap.
func TestAnnotateBoxLayouts_PriorityOverMaxArea(t *testing.T) {
	input := []TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 50}}

	// Lower priority, but covers the whole box (100% overlap).
	textRegion := DLARegion{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "text"}
	// Higher priority; 45x50 of the 100x50 box → 45%, above the 40% threshold.
	tableRegion := DLARegion{X0: 0, Y0: 0, X1: 45 * 3, Y1: 50 * 3, Label: "table"}

	annotated := annotateBoxLayouts(input, []DLARegion{textRegion, tableRegion}, 3.0, 0)

	if got := annotated[0].LayoutType; got != "table" {
		t.Errorf("priority: 'table' should win over 'text' when both meet threshold, got %q", got)
	}
}
// TestAnnotateBoxLayouts_OverlapThreshold checks that a region overlapping
// less than 40% of the box does not label it.
func TestAnnotateBoxLayouts_OverlapThreshold(t *testing.T) {
	input := []TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 50}}
	// 30x30 at scale 3 against a 100x50 box → 900/5000 = 18%, below 0.4.
	layout := []DLARegion{
		{X0: 0, Y0: 0, X1: 30 * 3, Y1: 30 * 3, Label: "table"},
	}

	annotated := annotateBoxLayouts(input, layout, 3.0, 0)

	if got := annotated[0].LayoutType; got != "" {
		t.Errorf("threshold: overlap < 40%% should not match, got %q", got)
	}
}
// TestAnnotateBoxLayouts_CIDGarbage checks that a box whose text is a CID
// pattern is removed from the result (Python: bxs.pop(i)), while the normal
// box is kept and labelled.
func TestAnnotateBoxLayouts_CIDGarbage(t *testing.T) {
	input := []TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 20, Text: "(cid:123)"},
		{X0: 0, X1: 100, Top: 30, Bottom: 50, Text: "normal text"},
	}
	layout := []DLARegion{
		{X0: 0, Y0: 0, X1: 300, Y1: 60, Label: "text", Confidence: 0.9},
		{X0: 0, Y0: 90, X1: 300, Y1: 150, Label: "text", Confidence: 0.9},
	}

	remaining := annotateBoxLayouts(input, layout, 3.0, 0)

	// Only the non-garbled box should be left.
	if n := len(remaining); n != 1 {
		t.Fatalf("CID-garbled box should be popped, got %d boxes", n)
	}
	if got := remaining[0].LayoutType; got != "text" {
		t.Errorf("CID: remaining box should be 'text', got %q", got)
	}
}
// TestAnnotateBoxLayouts_LayoutNoFormat checks the LayoutNo format
// "{type}-{per_type_index}", where the index is the matched region's position
// among regions of the same type. Two boxes matching the same region must
// share one LayoutNo so they can be merged later.
func TestAnnotateBoxLayouts_LayoutNoFormat(t *testing.T) {
	const want = "text-0"

	input := []TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 20},
		{X0: 0, X1: 100, Top: 30, Bottom: 50},
	}
	// A single text region that covers both boxes.
	layout := []DLARegion{
		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "text"},
	}

	annotated := annotateBoxLayouts(input, layout, 3.0, 0)

	if annotated[0].LayoutNo != want {
		t.Errorf("box 0 layoutno: got %q, want %q", annotated[0].LayoutNo, want)
	}
	if annotated[1].LayoutNo != want {
		t.Errorf("box 1 layoutno should share same per-type index: got %q, want %q", annotated[1].LayoutNo, want)
	}
}
// TestAnnotateBoxLayouts_LayoutNoDifferentRegions checks that boxes matching
// different regions of the same type receive different per-type indices.
func TestAnnotateBoxLayouts_LayoutNoDifferentRegions(t *testing.T) {
	input := []TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 20},
		{X0: 0, X1: 100, Top: 100, Bottom: 120},
	}
	layout := []DLARegion{
		{X0: 0, Y0: 0, X1: 300, Y1: 60, Label: "text"},    // per-type index 0
		{X0: 0, Y0: 300, X1: 300, Y1: 360, Label: "text"}, // per-type index 1
	}

	annotated := annotateBoxLayouts(input, layout, 3.0, 0)

	if got := annotated[0].LayoutNo; got != "text-0" {
		t.Errorf("box 0: got %q, want 'text-0'", got)
	}
	if got := annotated[1].LayoutNo; got != "text-1" {
		t.Errorf("box 1: got %q, want 'text-1'", got)
	}
}
// TestAnnotateBoxLayouts_ConfidenceFilter verifies that DLA regions with
// low confidence (< 0.4) for garbage layout types are excluded from matching.
// Python: float(b["score"]) >= 0.4 filter in LayoutRecognizer.
func TestAnnotateBoxLayouts_ConfidenceFilter(t *testing.T) {
	boxes := []TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 50}}
	regions := []DLARegion{
		// Low-confidence footer — should be filtered out.
		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "footer", Confidence: 0.2},
		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "text", Confidence: 0.9},
	}
	boxes = annotateBoxLayouts(boxes, regions, 3.0, 0)

	// annotateBoxLayouts can drop boxes; guard the index so a regression
	// fails this test cleanly instead of panicking the whole test binary.
	if len(boxes) != 1 {
		t.Fatalf("expected the box to be kept, got %d boxes", len(boxes))
	}
	// Footer region filtered (low confidence) → box matches "text" instead.
	if boxes[0].LayoutType != "text" {
		t.Errorf("low-confidence footer filtered → box should get 'text', got %q", boxes[0].LayoutType)
	}
}
// TestAnnotateBoxLayouts_GarbageFooterRejected checks that a box labelled
// "footer" near the page bottom is treated as decoration and removed
// (Python: bxs.pop(i)).
func TestAnnotateBoxLayouts_GarbageFooterRejected(t *testing.T) {
	// Per the original note, pageImgHeight 300 at scale 3 gives a page height
	// of 100, so the 90% footer line sits at 90; the box (Top 280, Bottom
	// 290) lies beyond it.
	const pageImgHeight = 300

	input := []TextBox{{X0: 0, X1: 100, Top: 280, Bottom: 290}}
	layout := []DLARegion{
		// Y 840..870 maps to 280..290 after dividing by the scale.
		{X0: 0, Y0: 840, X1: 300, Y1: 870, Label: "footer", Confidence: 0.9},
	}

	remaining := annotateBoxLayouts(input, layout, 3.0, pageImgHeight)

	if n := len(remaining); n != 0 {
		t.Errorf("footer at bottom: should be popped as decoration, got %d boxes left", n)
	}
}
// TestAnnotateBoxLayouts_HeaderRemovedAtTop checks that a box labelled
// "header" at the very top of the page is treated as decoration and removed
// (Python: bxs.pop(i)).
func TestAnnotateBoxLayouts_HeaderRemovedAtTop(t *testing.T) {
	// Per the original note: 300px page → page height 100; the box top (5)
	// is within the first 10% of the page.
	const pageImgHeight = 300

	input := []TextBox{{X0: 0, X1: 100, Top: 5, Bottom: 20}}
	layout := []DLARegion{
		// Y 15..60 maps to 5..20 after dividing by the scale.
		{X0: 0, Y0: 15, X1: 300, Y1: 60, Label: "header", Confidence: 0.9},
	}

	remaining := annotateBoxLayouts(input, layout, 3.0, pageImgHeight)

	if n := len(remaining); n != 0 {
		t.Errorf("header at very top: should be popped as decoration, got %d boxes left", n)
	}
}
// TestAnnotateBoxLayouts_HeaderKeptInMiddle checks that a "header" label in
// the middle of the page is treated as a DLA false positive: the box is kept
// and carries the "header" layout type.
func TestAnnotateBoxLayouts_HeaderKeptInMiddle(t *testing.T) {
	// Per the original note: 300px page → page height 100; the box top (50)
	// is well past the top 10% band.
	boxes := []TextBox{{X0: 0, X1: 100, Top: 50, Bottom: 70}}
	regions := []DLARegion{
		{X0: 0, Y0: 150, X1: 300, Y1: 210, Label: "header", Confidence: 0.9}, // y=50-70 after /3
	}
	boxes = annotateBoxLayouts(boxes, regions, 3.0, 300)

	// The scenario under test is "box must not be dropped"; without this
	// guard a regression would panic on boxes[0] and abort the test binary.
	if len(boxes) != 1 {
		t.Fatalf("header in middle of page: expected the box to be kept, got %d boxes", len(boxes))
	}
	if boxes[0].LayoutType != "header" {
		t.Errorf("header in middle of page: DLA false positive, keep text, got %q", boxes[0].LayoutType)
	}
}
// TestAnnotateBoxLayouts_FooterRemovedAtBottom checks that a box labelled
// "footer" in the bottom band of the page is treated as decoration and
// removed.
func TestAnnotateBoxLayouts_FooterRemovedAtBottom(t *testing.T) {
	// Per the original note: 300px page → page height 100; the box (95..100)
	// sits past the 90% line.
	const pageImgHeight = 300

	input := []TextBox{{X0: 0, X1: 100, Top: 95, Bottom: 100}}
	layout := []DLARegion{
		// Y 285..300 maps to 95..100 after dividing by the scale.
		{X0: 0, Y0: 285, X1: 300, Y1: 300, Label: "footer", Confidence: 0.9},
	}

	remaining := annotateBoxLayouts(input, layout, 3.0, pageImgHeight)

	if n := len(remaining); n != 0 {
		t.Errorf("footer at very bottom: should be popped as decoration, got %d boxes left", n)
	}
}
// TestAnnotateBoxLayouts_FooterKeptInMiddle checks that a "footer" label in
// the middle of the page is treated as a DLA false positive: the box is kept
// and carries the "footer" layout type.
func TestAnnotateBoxLayouts_FooterKeptInMiddle(t *testing.T) {
	// Per the original note: 300px page → page height 100; the box (50..70)
	// is above the 90% footer line.
	boxes := []TextBox{{X0: 0, X1: 100, Top: 50, Bottom: 70}}
	regions := []DLARegion{
		{X0: 0, Y0: 150, X1: 300, Y1: 210, Label: "footer", Confidence: 0.9}, // y=50-70 after /3
	}
	boxes = annotateBoxLayouts(boxes, regions, 3.0, 300)

	// The scenario under test is "box must not be dropped"; without this
	// guard a regression would panic on boxes[0] and abort the test binary.
	if len(boxes) != 1 {
		t.Fatalf("footer in middle of page: expected the box to be kept, got %d boxes", len(boxes))
	}
	if boxes[0].LayoutType != "footer" {
		t.Errorf("footer in middle of page: DLA false positive, keep text, got %q", boxes[0].LayoutType)
	}
}
// TestAnnotateBoxLayouts_ReferenceAlwaysGarbage checks that a box labelled
// "reference" is removed even in the middle of the page (no positional
// keep rule applies to this type).
func TestAnnotateBoxLayouts_ReferenceAlwaysGarbage(t *testing.T) {
	input := []TextBox{{X0: 0, X1: 100, Top: 50, Bottom: 70}}
	layout := []DLARegion{
		{X0: 0, Y0: 150, X1: 300, Y1: 210, Label: "reference", Confidence: 0.9},
	}

	remaining := annotateBoxLayouts(input, layout, 3.0, 300)

	// Indexing is safe inside the branch because the slice is non-empty there.
	if len(remaining) > 0 {
		t.Errorf("reference: should always be garbage-filtered, got %q", remaining[0].LayoutType)
	}
}
// TestAnnotateBoxLayouts_NonGarbageTypeUnaffected checks that the "text"
// type, which is not a garbage type, is assigned to the box it covers.
func TestAnnotateBoxLayouts_NonGarbageTypeUnaffected(t *testing.T) {
	input := []TextBox{{X0: 0, X1: 100, Top: 200, Bottom: 220}}
	layout := []DLARegion{
		{X0: 0, Y0: 600, X1: 300, Y1: 660, Label: "text"},
	}

	annotated := annotateBoxLayouts(input, layout, 3.0, 300)

	if got := annotated[0].LayoutType; got != "text" {
		t.Errorf("non-garbage type: should be assigned, got %q", got)
	}
}
// TestAnnotateBoxLayouts_ZeroPageHeightDisablesGarbage checks that passing a
// page image height of 0 turns the positional garbage check off, so a
// "header" label is assigned and the box is kept.
func TestAnnotateBoxLayouts_ZeroPageHeightDisablesGarbage(t *testing.T) {
	boxes := []TextBox{{X0: 0, X1: 100, Top: 100, Bottom: 120}}
	regions := []DLARegion{
		{X0: 0, Y0: 300, X1: 300, Y1: 360, Label: "header", Confidence: 0.9},
	}
	boxes = annotateBoxLayouts(boxes, regions, 3.0, 0)

	// If the garbage check ran anyway the box would be dropped; fail cleanly
	// instead of panicking on boxes[0] and aborting the test binary.
	if len(boxes) != 1 {
		t.Fatalf("zero page height: expected the box to be kept, got %d boxes", len(boxes))
	}
	if boxes[0].LayoutType != "header" {
		t.Errorf("zero page height: garbage check disabled, got %q", boxes[0].LayoutType)
	}
}
// TestAnnotateBoxLayouts_SyntheticFigure checks that figure regions matched
// by no text box are turned into synthetic, text-less figure boxes
// (Python: dla_cli.py:187-195).
func TestAnnotateBoxLayouts_SyntheticFigure(t *testing.T) {
	input := []TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 20, Text: "text box"},
	}
	// One text region that matches the box, plus two unmatched figure regions.
	layout := []DLARegion{
		{X0: 0, Y0: 0, X1: 150, Y1: 60, Label: "text", Confidence: 0.9},        // matches text box → visited
		{X0: 300, Y0: 300, X1: 600, Y1: 600, Label: "figure", Confidence: 0.9}, // no box overlaps → synthetic
		{X0: 600, Y0: 0, X1: 900, Y1: 300, Label: "figure", Confidence: 0.9},   // no box overlaps → synthetic
	}

	result := annotateBoxLayouts(input, layout, 3.0, 0)

	// 1 original text box + 2 synthetic figure boxes.
	if len(result) != 3 {
		t.Fatalf("expected 3 boxes (1 original + 2 synthetic figures), got %d", len(result))
	}

	var sawFirst, sawSecond bool
	for _, box := range result {
		// Synthetic boxes are figure-typed and carry no text.
		if box.LayoutType != "figure" || box.Text != "" {
			continue
		}
		switch box.LayoutNo {
		case "figure-0":
			sawFirst = true
			// Region X range 300..600 divided by the scale.
			if box.X0 != 100 || box.X1 != 200 {
				t.Errorf("synthetic figure-0: expected x0=100,x1=200 (300/3,600/3), got x0=%v,x1=%v", box.X0, box.X1)
			}
		case "figure-1":
			sawSecond = true
		}
	}
	if !sawFirst {
		t.Error("missing synthetic figure-0 box")
	}
	if !sawSecond {
		t.Error("missing synthetic figure-1 box")
	}
}
// TestAnnotateBoxLayouts_EquationMappedToFigure checks that a box matched by
// an "equation" region gets LayoutType "figure" while its LayoutNo keeps the
// "equation" prefix (Python behavior).
func TestAnnotateBoxLayouts_EquationMappedToFigure(t *testing.T) {
	input := []TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 20}}
	layout := []DLARegion{
		{X0: 0, Y0: 0, X1: 300, Y1: 60, Label: "equation", Confidence: 0.9},
	}

	result := annotateBoxLayouts(input, layout, 3.0, 0)
	if len(result) != 1 {
		t.Fatalf("expected 1 box, got %d", len(result))
	}

	only := result[0]
	if only.LayoutType != "figure" {
		t.Errorf("equation → LayoutType: got %q, want 'figure'", only.LayoutType)
	}
	if only.LayoutNo != "equation-0" {
		t.Errorf("equation → LayoutNo: got %q, want 'equation-0'", only.LayoutNo)
	}
}
// TestAnnotateBoxLayouts_MixedTypesLayoutNo checks that LayoutNo indices are
// counted separately per region type when several types are present.
func TestAnnotateBoxLayouts_MixedTypesLayoutNo(t *testing.T) {
	input := []TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 20},     // overlaps text region 0
		{X0: 0, X1: 100, Top: 200, Bottom: 220},  // overlaps text region 1
		{X0: 200, X1: 300, Top: 0, Bottom: 20},   // overlaps figure region 0 only
	}
	layout := []DLARegion{
		{X0: 0, Y0: 0, X1: 150, Y1: 60, Label: "text", Confidence: 0.9},     // text-0
		{X0: 0, Y0: 600, X1: 150, Y1: 660, Label: "text", Confidence: 0.9},  // text-1
		{X0: 600, Y0: 0, X1: 900, Y1: 60, Label: "figure", Confidence: 0.9}, // figure-0 (PDF: x0=200, x1=300)
	}

	result := annotateBoxLayouts(input, layout, 3.0, 0)
	if len(result) != 3 {
		t.Fatalf("expected 3 boxes, got %d", len(result))
	}

	// The figure counter starts at 0 regardless of how many text regions exist.
	if got := result[0].LayoutNo; got != "text-0" {
		t.Errorf("box 0: got %q, want 'text-0'", got)
	}
	if got := result[1].LayoutNo; got != "text-1" {
		t.Errorf("box 1: got %q, want 'text-1'", got)
	}
	if got := result[2].LayoutNo; got != "figure-0" {
		t.Errorf("box 2: got %q, want 'figure-0' (independent from text counter)", got)
	}
}
// ---- Mock-integration: DLA→TSR pipeline with MockDeepDoc ----

// TestExtractTableBoxes_PriorityPreservesTable runs the table-extraction step
// against a mock analyzer in which one box overlaps both a full-page "text"
// region and a smaller "table" region. The "table" label must win so that at
// least one table item is produced.
func TestExtractTableBoxes_PriorityPreservesTable(t *testing.T) {
	page := image.NewRGBA(image.Rect(0, 0, 900, 900))
	input := []TextBox{
		{X0: 200, X1: 400, Top: 200, Bottom: 400, Text: "cell content"},
	}
	analyzer := &MockDocAnalyzer{
		Healthy: true,
		DLARegions: []DLARegion{
			{X0: 0, Y0: 0, X1: 2700, Y1: 2700, Label: "text"},       // full-page, 3x scale
			{X0: 300, Y0: 300, X1: 1500, Y1: 1500, Label: "table"}, // partial, 3x scale
		},
		TSRCells: []TSRCell{{X0: 200, Y0: 200, X1: 400, Y1: 400, Text: "cell1"}},
	}

	parser := NewParser(DefaultParserConfig(), analyzer)
	tables := parser.extractTableBoxesFromImage(context.Background(), input, page, 0, 0)

	if len(tables) == 0 {
		t.Error("priority: table should win over text, got 0 tables")
	}
}
// TestExtractTableBoxes_OverlapBelowThresholdNoTable checks that a "table"
// region overlapping less than 40% of the box matches no box and therefore
// yields no table items.
func TestExtractTableBoxes_OverlapBelowThresholdNoTable(t *testing.T) {
	page := image.NewRGBA(image.Rect(0, 0, 900, 900))
	input := []TextBox{
		{X0: 200, X1: 400, Top: 200, Bottom: 400, Text: "content"},
	}
	// The region is 120x120 in image pixels; assuming the 3x scale used by
	// the sibling tests (TODO confirm) that is 40x40 in box space, i.e. only
	// a small corner of the 200x200 box.
	analyzer := &MockDocAnalyzer{
		Healthy: true,
		DLARegions: []DLARegion{
			{X0: 600, Y0: 600, X1: 720, Y1: 720, Label: "table"}, // tiny corner
		},
		TSRCells: []TSRCell{},
	}

	parser := NewParser(DefaultParserConfig(), analyzer)
	tables := parser.extractTableBoxesFromImage(context.Background(), input, page, 0, 0)

	if n := len(tables); n != 0 {
		t.Errorf("threshold: overlap < 40%% should produce 0 tables, got %d", n)
	}
}
// TestExtractTableBoxes_FooterGarbageNotTriggerTable checks that a footer
// region at the bottom of the page, the only region present, does not lead
// to any table items.
func TestExtractTableBoxes_FooterGarbageNotTriggerTable(t *testing.T) {
	// 900px-tall image; the original note gives 900/3 = 300 as the PDF height.
	page := image.NewRGBA(image.Rect(0, 0, 900, 900))
	input := []TextBox{
		{X0: 100, X1: 300, Top: 280, Bottom: 295, Text: "page 1"},
	}
	analyzer := &MockDocAnalyzer{
		Healthy: true,
		DLARegions: []DLARegion{
			// Y 840..885 corresponds to 280..295 in PDF space.
			{X0: 300, Y0: 840, X1: 900, Y1: 885, Label: "footer", Confidence: 0.9},
		},
	}

	parser := NewParser(DefaultParserConfig(), analyzer)
	tables := parser.extractTableBoxesFromImage(context.Background(), input, page, 0, 0)

	// No table region exists, so nothing should be extracted.
	if n := len(tables); n != 0 {
		t.Errorf("footer garbage: should not produce tables, got %d", n)
	}
}
// ---- helpers ----

// TestCellTexts checks that tsrCellTexts returns the cell texts in input
// order.
func TestCellTexts(t *testing.T) {
	input := []TSRCell{{Text: "A"}, {Text: "B"}, {Text: "C"}}

	joined := strings.Join(tsrCellTexts(input), ",")
	if joined != "A,B,C" {
		t.Errorf("cellTexts: got %q, want 'A,B,C'", joined)
	}
}
// ── constructTable unit tests ─────────────────────────────────────────

// TestConstructTable_Simple3x2 renders a 3-column, 2-row grid of pre-filled
// cells and checks the table tag, the cell texts, and the row/cell counts in
// the resulting HTML.
func TestConstructTable_Simple3x2(t *testing.T) {
	// Cells already carry their text, as they would after
	// extractTableBoxesFromImage.
	grid := []TSRCell{
		{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "A", Label: "table row"},
		{X0: 101, Y0: 0, X1: 200, Y1: 50, Text: "B", Label: "table row"},
		{X0: 201, Y0: 0, X1: 300, Y1: 50, Text: "C", Label: "table row"},
		{X0: 0, Y0: 51, X1: 100, Y1: 100, Text: "D", Label: "table row"},
		{X0: 101, Y0: 51, X1: 200, Y1: 100, Text: "E", Label: "table row"},
		{X0: 201, Y0: 51, X1: 300, Y1: 100, Text: "F", Label: "table row"},
	}
	markup := constructTable(grid, []TextBox{}, "", nil)

	if !strings.Contains(markup, "<table>") {
		t.Error("expected <table> tag")
	}
	hasFirstRow := strings.Contains(markup, "A") &&
		strings.Contains(markup, "B") &&
		strings.Contains(markup, "C")
	if !hasFirstRow {
		t.Error("expected cell texts A, B, C in HTML")
	}
	// Two rows of three cells each.
	if rowTags := strings.Count(markup, "<tr>"); rowTags != 2 {
		t.Errorf("expected 2 <tr> rows, got %d", rowTags)
	}
	if cellTags := strings.Count(markup, "<td "); cellTags != 6 {
		t.Errorf("expected 6 <td > cells, got %d", cellTags)
	}
	t.Logf("HTML:\n%s", markup)
}
// TestConstructTable_EmptyCells checks that constructTable returns an empty
// string both for nil inputs and for empty, non-nil slices.
func TestConstructTable_EmptyCells(t *testing.T) {
	if fromNil := constructTable(nil, nil, "", nil); fromNil != "" {
		t.Errorf("expected empty string for empty cells, got %q", fromNil)
	}

	if fromEmpty := constructTable([]TSRCell{}, []TextBox{}, "", nil); fromEmpty != "" {
		t.Errorf("expected empty string for empty cells slice, got %q", fromEmpty)
	}
}
// TestConstructTable_NoMatchingBox checks that a cell without text still
// produces its own <td > element next to a cell that has text.
func TestConstructTable_NoMatchingBox(t *testing.T) {
	grid := []TSRCell{
		{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "Has text", Label: "table row"},
		{X0: 101, Y0: 0, X1: 200, Y1: 50, Label: "table row"}, // no text
	}
	markup := constructTable(grid, []TextBox{}, "", nil)

	if !strings.Contains(markup, "Has text") {
		t.Error("expected first cell text")
	}
	// Both cells must be rendered, including the empty one.
	if cellTags := strings.Count(markup, "<td "); cellTags != 2 {
		t.Errorf("expected 2 <td > cells, got %d. HTML:\n%s", cellTags, markup)
	}
}
// TestConstructTable_WithCaption checks that a non-empty caption argument is
// emitted as a <caption> element in the HTML.
func TestConstructTable_WithCaption(t *testing.T) {
	single := []TSRCell{
		{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "X", Label: "table row"},
	}
	markup := constructTable(single, nil, "表1测试标题", nil)

	hasCaption := strings.Contains(markup, "<caption>表1测试标题</caption>")
	if !hasCaption {
		t.Errorf("expected caption, got:\n%s", markup)
	}
	t.Logf("HTML:\n%s", markup)
}
// TestConstructTable_SingleRow checks that two side-by-side cells render as
// one row with two cells.
func TestConstructTable_SingleRow(t *testing.T) {
	pair := []TSRCell{
		{X0: 0, Y0: 0, X1: 50, Y1: 40, Text: "Col1", Label: "table row"},
		{X0: 51, Y0: 0, X1: 100, Y1: 40, Text: "Col2", Label: "table row"},
	}
	markup := constructTable(pair, nil, "", nil)

	if rowTags := strings.Count(markup, "<tr>"); rowTags != 1 {
		t.Errorf("expected 1 row, got %d", rowTags)
	}
	if cellTags := strings.Count(markup, "<td "); cellTags != 2 {
		t.Errorf("expected 2 cells, got %d", cellTags)
	}
}
// TestConstructTable_CellsTextFilledAfterCall checks that constructTable
// leaves the text of the caller's cells untouched. Per the original notes,
// text filling now happens earlier (in extractTableBoxesFromImage), so the
// cells arrive pre-filled and must come out unchanged.
func TestConstructTable_CellsTextFilledAfterCall(t *testing.T) {
	grid := []TSRCell{
		{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "A1", Label: "table row"},
		{X0: 101, Y0: 0, X1: 200, Y1: 50, Text: "B1", Label: "table row"},
		{X0: 0, Y0: 51, X1: 100, Y1: 100, Text: "A2", Label: "table row"},
		{X0: 101, Y0: 51, X1: 200, Y1: 100, Text: "B2", Label: "table row"},
	}

	// Only the effect on the input cells matters here; the HTML is discarded.
	_ = constructTable(grid, nil, "", nil)

	if got := grid[0].Text; got != "A1" {
		t.Errorf("cell[0] text = %q, want %q", got, "A1")
	}
	if got := grid[1].Text; got != "B1" {
		t.Errorf("cell[1] text = %q, want %q", got, "B1")
	}
}
// TestConstructTable_YBasedFallback renders cells labelled "table" (rather
// than "table row") and expects two rows, with the shorter second row padded
// to the width of the first.
func TestConstructTable_YBasedFallback(t *testing.T) {
	ragged := []TSRCell{
		{X0: 0, Y0: 0, X1: 50, Y1: 30, Text: "R1C1", Label: "table"},
		{X0: 51, Y0: 0, X1: 100, Y1: 30, Text: "R1C2", Label: "table"},
		{X0: 0, Y0: 31, X1: 50, Y1: 60, Text: "R2C1", Label: "table"},
	}
	markup := constructTable(ragged, nil, "", nil)

	if rowTags := strings.Count(markup, "<tr>"); rowTags != 2 {
		t.Errorf("expected 2 rows from Y-fallback, got %d", rowTags)
	}
	// Rows are padded to maxCols=2: 2 cells in row 1, 2 in row 2.
	if cellTags := strings.Count(markup, "<td "); cellTags != 4 {
		t.Errorf("expected 4 cells (padded), got %d", cellTags)
	}
}
// TestExtractTableAndReplace_CellTextFilled verifies that extractTableAndReplace
// fills cell text correctly with realistic coordinate transforms (Scale=3, CropOff≠0).
// This simulates the real pipeline where TSR cells are in crop pixel space and
// post-merge boxes are in PDF point space.
func TestExtractTableAndReplace_CellTextFilled(t *testing.T) {
	// Coordinates modelled on page 0 of a company travel-expense document.
	// DLA region: X0=217, X1=1584, Y0=985, Y1=1599 at 216 DPI → PDF: 72-528 x 328-533
	// Scale = 216/72 = 3.0
	// cropOff ≈ region.X - region.W*0.03
	const scale = 3.0
	const cropOffX = 176.0
	const cropOffY = 967.0

	// Post-merge boxes in PDF point space (inside the table region).
	// PDF Y=470 → crop Top = 470*3-967 = 443 → overlaps crop cell at Y0=441.
	// Boxes must have R (row) and C (col) annotations matching cells,
	// matching Python's construct_table which assigns boxes to cells by R/C.
	boxes := []TextBox{
		{X0: 80, X1: 210, Top: 470, Bottom: 490, Text: "标职务", LayoutType: "table", PageNumber: 0, R: 0, C: 0},
		{X0: 220, X1: 270, Top: 470, Bottom: 490, Text: "飞机", LayoutType: "table", PageNumber: 0, R: 0, C: 1},
		{X0: 80, X1: 210, Top: 492, Bottom: 512, Text: "公司级领导", LayoutType: "table", PageNumber: 0, R: 1, C: 0},
		{X0: 220, X1: 270, Top: 492, Bottom: 512, Text: "经济舱位", LayoutType: "table", PageNumber: 0, R: 1, C: 1},
	}
	// TSR cells in crop pixel space (matching real TSR output).
	// Cells pre-filled (extractTableBoxesFromImage already ran fillText + OCR).
	cells := []TSRCell{
		{X0: 35, Y0: 441, X1: 456, Y1: 500, Text: "标职务", Label: "table row"},
		{X0: 460, Y0: 442, X1: 630, Y1: 500, Text: "飞机", Label: "table row"},
		{X0: 35, Y0: 501, X1: 456, Y1: 560, Text: "公司级领导", Label: "table row"},
		{X0: 460, Y0: 502, X1: 630, Y1: 560, Text: "经济舱位", Label: "table row"},
	}
	tables := []TableItem{{
		Cells:     cells,
		Positions: []Position{{Left: 80, Right: 500, Top: 480, Bottom: 560}},
		Scale:     scale,
		CropOffX:  cropOffX,
		CropOffY:  cropOffY,
	}}

	result := extractTableAndReplace(boxes, tables)
	if len(result) != 1 {
		t.Fatalf("expected 1 output box (HTML table), got %d", len(result))
	}
	if !strings.Contains(result[0].Text, "<table>") {
		t.Error("output should contain HTML table")
	}

	// Key assertion: tables[0].Rows is populated as a side effect of the call.
	want := [][]string{
		{"标职务", "飞机"},
		{"公司级领导", "经济舱位"},
	}
	rows := tables[0].Rows
	if len(rows) != len(want) {
		t.Fatalf("expected 2 rows, got %d", len(rows))
	}
	for r, wantRow := range want {
		// Guard the column count so a short row fails this test cleanly
		// instead of panicking and aborting the whole test binary.
		if len(rows[r]) < len(wantRow) {
			t.Fatalf("row %d: expected at least %d columns, got %d", r, len(wantRow), len(rows[r]))
		}
		for c, wantText := range wantRow {
			if rows[r][c] != wantText {
				t.Errorf("row %d col %d = %q, want %q", r, c, rows[r][c], wantText)
			}
		}
	}
}
// TestConstructTable_FromBoxesRC builds HTML directly from boxes with R/C
// annotations, matching Python's construct_table. No cells needed for text.
func TestConstructTable_FromBoxesRC(t *testing.T) {
	// Boxes with R (row) and C (col) annotations — like the output of
	// annotateTableBoxes after layout cleanup.
	boxes := []TextBox{
		{X0: 50, X1: 150, Top: 100, Bottom: 130, Text: "姓名", R: 0, C: 0},
		{X0: 155, X1: 255, Top: 100, Bottom: 130, Text: "年龄", R: 0, C: 1},
		{X0: 50, X1: 150, Top: 135, Bottom: 165, Text: "张三", R: 1, C: 0},
		{X0: 155, X1: 255, Top: 135, Bottom: 165, Text: "25", R: 1, C: 1},
	}
	// constructTable should build HTML directly from boxes by R/C grouping,
	// ignoring cell text (matching Python's construct_table).
	item := &TableItem{}
	html := constructTable(nil, boxes, "", item)
	// Log first so the HTML is available even if a fatal check below fires.
	t.Logf("HTML: %s", html)

	if !strings.Contains(html, "姓名") || !strings.Contains(html, "张三") {
		t.Errorf("HTML missing box text: %s", html)
	}
	// 2 rows, 2 cols.
	if strings.Count(html, "<tr>") != 2 {
		t.Errorf("expected 2 rows, got %d. HTML: %s", strings.Count(html, "<tr>"), html)
	}
	if strings.Count(html, "<td ") != 4 {
		t.Errorf("expected 4 cells, got %d. HTML: %s", strings.Count(html, "<td "), html)
	}

	// Verify Rows output. The dimensions are computed defensively: indexing
	// item.Rows[0] on an empty result would panic and abort the test binary.
	gotRows, gotCols := len(item.Rows), 0
	if gotRows > 0 {
		gotCols = len(item.Rows[0])
	}
	if gotRows != 2 || gotCols != 2 {
		t.Fatalf("Rows: expected 2x2, got %dx%d", gotRows, gotCols)
	}
	if item.Rows[0][0] != "姓名" {
		t.Errorf("Rows[0][0] = %q, want %q", item.Rows[0][0], "姓名")
	}
}
// TestFillCellTextFromBoxes_RCAnnotations runs both text-filling strategies
// over the same 2x2 layout: the coordinate-based fillCellTextFromBoxes and the
// annotation-based fillCellTextFromAnnotations. The boxes sit a few pixels
// inside their cells. The spatial pass is only required to fill at least one
// cell, whereas the R/C pass must put every box in the cell named by its
// annotation.
func TestFillCellTextFromBoxes_RCAnnotations(t *testing.T) {
	// newGrid returns a fresh, text-less 2x2 cell layout so the two passes do
	// not see each other's writes.
	newGrid := func() []TSRCell {
		return []TSRCell{
			{X0: 10, Y0: 10, X1: 200, Y1: 50},
			{X0: 210, Y0: 10, X1: 400, Y1: 50},
			{X0: 10, Y0: 55, X1: 200, Y1: 95},
			{X0: 210, Y0: 55, X1: 400, Y1: 95},
		}
	}
	boxes := []TextBox{
		{X0: 12, X1: 198, Top: 12, Bottom: 48, Text: "A", R: 0, C: 0},
		{X0: 215, X1: 395, Top: 12, Bottom: 48, Text: "B", R: 0, C: 1},
		{X0: 12, X1: 198, Top: 58, Bottom: 92, Text: "C", R: 1, C: 0},
		{X0: 215, X1: 350, Top: 58, Bottom: 90, Text: "D", R: 1, C: 1}, // noticeably narrower than its cell
	}

	// Pass 1: spatial fill. Only a weak guarantee is asserted here because the
	// result depends on coordinate overlap.
	spatial := newGrid()
	fillCellTextFromBoxes(spatial, boxes)
	filled := 0
	for i := range spatial {
		if spatial[i].Text != "" {
			filled++
		}
	}
	if filled == 0 {
		t.Error("fillCellTextFromBoxes should fill text from spatially overlapping boxes with R/C")
	}

	// Pass 2: annotation fill over an explicit row/column grid.
	flat := newGrid()
	rows := [][]TSRCell{{flat[0], flat[1]}, {flat[2], flat[3]}}
	fillCellTextFromAnnotations(rows, boxes)
	want := [2][2]string{{"A", "B"}, {"C", "D"}}
	for r := range want {
		for c := range want[r] {
			if got := rows[r][c].Text; got != want[r][c] {
				t.Errorf("R/C: row%d col%d = %q, want %q", r, c, got, want[r][c])
			}
		}
	}
}
// TestConstructTable_SingleRowMultiCol covers boxes that all carry R=0 but
// span several columns (a header-only table). constructTable must still treat
// the annotations as valid and emit three cells.
func TestConstructTable_SingleRowMultiCol(t *testing.T) {
	boxes := []TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 30, Text: "姓名", R: 0, C: 0},
		{X0: 101, X1: 200, Top: 0, Bottom: 30, Text: "年龄", R: 0, C: 1},
		{X0: 201, X1: 300, Top: 0, Bottom: 30, Text: "性别", R: 0, C: 2},
	}
	item := &TableItem{}
	html := constructTable(nil, boxes, "", item)
	if n := strings.Count(html, "<td "); n != 3 {
		t.Errorf("expected 3 cells, got %d. HTML: %s", n, html)
	}
	// Guard before indexing: a missing or short first row must fail the test,
	// not panic it.
	if len(item.Rows) == 0 || len(item.Rows[0]) < 3 {
		t.Fatalf("Rows: expected a first row with 3 cells, got %v", item.Rows)
	}
	if item.Rows[0][0] != "姓名" || item.Rows[0][1] != "年龄" || item.Rows[0][2] != "性别" {
		t.Errorf("wrong row text: %v", item.Rows[0])
	}
}
// TestConstructTable_MultiRowSingleCol covers boxes that all carry C=0 but
// span several rows (a vertical list). constructTable must still treat the
// annotations as valid and emit three rows.
func TestConstructTable_MultiRowSingleCol(t *testing.T) {
	boxes := []TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 30, Text: "第一行", R: 0, C: 0},
		{X0: 0, X1: 100, Top: 35, Bottom: 65, Text: "第二行", R: 1, C: 0},
		{X0: 0, X1: 100, Top: 70, Bottom: 100, Text: "第三行", R: 2, C: 0},
	}
	item := &TableItem{}
	html := constructTable(nil, boxes, "", item)
	if n := strings.Count(html, "<tr>"); n != 3 {
		t.Errorf("expected 3 rows, got %d. HTML: %s", n, html)
	}
	// Guard before indexing: fewer than three rows, or an empty row, must fail
	// the test rather than panic it.
	if len(item.Rows) < 3 {
		t.Fatalf("Rows: expected at least 3 rows, got %d: %v", len(item.Rows), item.Rows)
	}
	for i := 0; i < 3; i++ {
		if len(item.Rows[i]) == 0 {
			t.Fatalf("Rows[%d] is empty: %v", i, item.Rows)
		}
	}
	if item.Rows[0][0] != "第一行" || item.Rows[1][0] != "第二行" || item.Rows[2][0] != "第三行" {
		t.Errorf("wrong text: row0=%q row1=%q row2=%q", item.Rows[0][0], item.Rows[1][0], item.Rows[2][0])
	}
}
// TestConstructTable_RCAfterMerge feeds constructTable boxes shaped like the
// result of a text merge: the first box is wider than its column neighbours
// and carries the full concatenated text together with a single R/C pair. The
// table must still come out as two rows with the merged text in cell [0][0].
func TestConstructTable_RCAfterMerge(t *testing.T) {
	const merged = "公司级领导人员(含公司董事、总监)"
	postMerge := []TextBox{
		{X0: 0, X1: 350, Top: 0, Bottom: 30, Text: merged, R: 0, C: 0},
		{X0: 355, X1: 500, Top: 0, Bottom: 30, Text: "经济舱位", R: 0, C: 1},
		{X0: 0, X1: 200, Top: 35, Bottom: 65, Text: "其他工作人员", R: 1, C: 0},
		{X0: 355, X1: 500, Top: 35, Bottom: 65, Text: "经济舱位", R: 1, C: 1},
	}
	item := &TableItem{}
	html := constructTable(nil, postMerge, "", item)
	if !strings.Contains(html, "公司级领导") {
		t.Errorf("missing merged text: %s", html)
	}
	if n := strings.Count(html, "<tr>"); n != 2 {
		t.Errorf("expected 2 rows, got %d", n)
	}
	// Guard before indexing so an empty grid fails the test instead of
	// panicking it.
	if len(item.Rows) == 0 || len(item.Rows[0]) == 0 {
		t.Fatalf("Rows: expected a non-empty first row, got %v", item.Rows)
	}
	if item.Rows[0][0] != merged {
		t.Errorf("row 0 col 0 = %q", item.Rows[0][0])
	}
}
// TestGroupTSRCellsToRowsLabeled_DefaultTableLabel checks that cells labelled
// plain "table" are still grouped into rows. Four cells laid out as a 2x2 grid
// must come back as two rows of two cells; the failure message points at
// deepDocReRowHdr as the suspected cause when they do not.
func TestGroupTSRCellsToRowsLabeled_DefaultTableLabel(t *testing.T) {
	const label = "table"
	grid := []TSRCell{
		{X0: 10, Y0: 0, X1: 100, Y1: 30, Label: label},
		{X0: 101, Y0: 0, X1: 200, Y1: 30, Label: label},
		{X0: 10, Y0: 35, X1: 100, Y1: 65, Label: label},
		{X0: 101, Y0: 35, X1: 200, Y1: 65, Label: label},
	}
	grouped := groupTSRCellsToRowsLabeled(grid)
	if got := len(grouped); got != 2 {
		t.Fatalf("label %q: expected 2 rows, got %d (BUG: deepDocReRowHdr does not match %q)", label, got, label)
	}
	top, bottom := len(grouped[0]), len(grouped[1])
	if top != 2 || bottom != 2 {
		t.Errorf("expected 2 cols/row, got %d/%d", top, bottom)
	}
}
// TestGroupBoxesByRC_RDiffSplitsRows checks that groupBoxesByRC starts a new
// row for every distinct R value. The six boxes occupy only two Y bands but
// carry R=0..5, so six rows are expected.
func TestGroupBoxesByRC_RDiffSplitsRows(t *testing.T) {
	input := []TextBox{
		{X0: 10, X1: 90, Top: 0, Bottom: 30, Text: "A", R: 0, C: 0},
		{X0: 110, X1: 190, Top: 0, Bottom: 30, Text: "B", R: 1, C: 1},
		{X0: 210, X1: 290, Top: 0, Bottom: 30, Text: "C", R: 2, C: 2},
		{X0: 10, X1: 90, Top: 35, Bottom: 65, Text: "D", R: 3, C: 0},
		{X0: 110, X1: 190, Top: 35, Bottom: 65, Text: "E", R: 4, C: 1},
		{X0: 210, X1: 290, Top: 35, Bottom: 65, Text: "F", R: 5, C: 2},
	}
	// One row per R value, regardless of the shared Y coordinates.
	if got := len(groupBoxesByRC(input)); got != 6 {
		t.Fatalf("expected 6 rows (R differs → split), got %d", got)
	}
}
// TestGroupBoxesByRC_MergesCloseCols checks column handling inside each R
// group: two rows (R=0 and R=1), each holding C=0 and C=1, must come back as
// two rows of two boxes in left-to-right order.
func TestGroupBoxesByRC_MergesCloseCols(t *testing.T) {
	boxes := []TextBox{
		{X0: 10, X1: 90, Top: 0, Bottom: 30, Text: "A", R: 0, C: 0},
		{X0: 110, X1: 190, Top: 0, Bottom: 30, Text: "B", R: 0, C: 1},
		{X0: 10, X1: 90, Top: 35, Bottom: 65, Text: "C", R: 1, C: 0},
		{X0: 110, X1: 190, Top: 35, Bottom: 65, Text: "D", R: 1, C: 1},
	}
	rows := groupBoxesByRC(boxes)
	if len(rows) != 2 {
		t.Fatalf("expected 2 rows (R diff), got %d", len(rows))
	}
	// Fatalf rather than Errorf: the assertions below index column 1, which
	// would panic on a row with fewer than two boxes.
	if len(rows[0]) != 2 || len(rows[1]) != 2 {
		t.Fatalf("expected 2 cols/row, got %d/%d", len(rows[0]), len(rows[1]))
	}
	if rows[0][0].Text != "A" || rows[0][1].Text != "B" {
		t.Errorf("row0 wrong: %q %q", rows[0][0].Text, rows[0][1].Text)
	}
	if rows[1][0].Text != "C" || rows[1][1].Text != "D" {
		t.Errorf("row1 wrong: %q %q", rows[1][0].Text, rows[1][1].Text)
	}
}
// TestGroupBoxesByRC_RDiffSplitsRow checks that boxes with different R values
// land in separate rows even when their Y ranges coincide, and that the rows
// come back in R order (A before B).
func TestGroupBoxesByRC_RDiffSplitsRow(t *testing.T) {
	// R=0 and R=1 share Y=0..30; R=2 and R=3 share Y=35..65.
	boxes := []TextBox{
		{X0: 10, X1: 90, Top: 0, Bottom: 30, Text: "A", R: 0, C: 0},
		{X0: 110, X1: 190, Top: 0, Bottom: 30, Text: "B", R: 1, C: 1},
		{X0: 10, X1: 90, Top: 35, Bottom: 65, Text: "C", R: 2, C: 0},
		{X0: 110, X1: 190, Top: 35, Bottom: 65, Text: "D", R: 3, C: 1},
	}
	rows := groupBoxesByRC(boxes)
	// Four distinct R values → four rows.
	if len(rows) != 4 {
		t.Fatalf("expected 4 rows (R differs → split), got %d", len(rows))
	}
	// Guard before indexing so an empty row fails the test instead of
	// panicking it.
	if len(rows[0]) == 0 || len(rows[1]) == 0 {
		t.Fatalf("rows 0 and 1 must each hold a box, got %d/%d", len(rows[0]), len(rows[1]))
	}
	if rows[0][0].Text != "A" || rows[1][0].Text != "B" {
		t.Errorf("row0/1 wrong: A=%q B=%q", rows[0][0].Text, rows[1][0].Text)
	}
}
// TestFillCellTextFromBoxes_RCOnly checks that indexing cells by a box's R/C
// annotation puts its text into exactly one cell, even when the box
// geometrically straddles two cells.
func TestFillCellTextFromBoxes_RCOnly(t *testing.T) {
	cells := []TSRCell{
		{X0: 0, Y0: 0, X1: 100, Y1: 50, Label: "table"},
		{X0: 90, Y0: 0, X1: 200, Y1: 50, Label: "table"},
	}
	// This box (X=80..120) overlaps both cell 0 (X=0..100) and cell 1
	// (X=90..200); its annotation assigns it to R=0, C=0 only.
	boxes := []TextBox{
		{X0: 80, X1: 120, Top: 0, Bottom: 50, Text: "TEXT", LayoutType: "table", R: 0, C: 0},
	}
	rows := groupTSRCellsToRowsLabeled(cells)
	for _, b := range boxes {
		// Named "text" rather than "t" so the *testing.T is not shadowed.
		text := strings.TrimSpace(b.Text)
		if text == "" {
			continue
		}
		if b.R >= 0 && b.R < len(rows) && b.C >= 0 && b.C < len(rows[b.R]) {
			rows[b.R][b.C].Text = text
		}
	}
	// Both cells must have been grouped into row 0; otherwise the indexed
	// assertions below would panic instead of failing.
	if len(rows) == 0 || len(rows[0]) < 2 {
		t.Fatalf("expected both cells in row 0, got %d rows: %v", len(rows), rows)
	}
	// Cell 0 should have text, cell 1 should NOT.
	if rows[0][0].Text != "TEXT" {
		t.Errorf("cell[0][0] = %q, want %q", rows[0][0].Text, "TEXT")
	}
	if rows[0][1].Text != "" {
		t.Errorf("cell[0][1] = %q, should be empty (spatial overlap leak)", rows[0][1].Text)
	}
}
// TestRowsToHTML_HeaderRows checks that constructTable renders cells labelled
// "table column header" with <th > and ordinary "table row" cells with <td >.
func TestRowsToHTML_HeaderRows(t *testing.T) {
	const (
		headerLabel = "table column header"
		rowLabel    = "table row"
	)
	grid := []TSRCell{
		{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "Name", Label: headerLabel},
		{X0: 101, Y0: 0, X1: 200, Y1: 30, Text: "Age", Label: headerLabel},
		{X0: 0, Y0: 35, X1: 100, Y1: 65, Text: "John", Label: rowLabel},
		{X0: 101, Y0: 35, X1: 200, Y1: 65, Text: "30", Label: rowLabel},
	}
	html := constructTable(grid, nil, "", &TableItem{})

	// One header row of two cells, one data row of two cells.
	thCount := strings.Count(html, "<th ")
	tdCount := strings.Count(html, "<td ")
	if !strings.Contains(html, "<th >") {
		t.Errorf("expected <th > for header row. HTML: %s", html)
	}
	if thCount != 2 {
		t.Errorf("expected 2 <th > cells, got %d. HTML: %s", thCount, html)
	}
	if tdCount != 2 {
		t.Errorf("expected 2 <td > cells (data row), got %d", tdCount)
	}
}
// TestExtractTableAndReplace_OnlyTableBoxes checks that text from a box whose
// LayoutType is not "table" does not end up in the generated table HTML, even
// though the box lies inside the table's position and carries R/C values.
func TestExtractTableAndReplace_OnlyTableBoxes(t *testing.T) {
	boxes := []TextBox{
		{X0: 10, X1: 90, Top: 0, Bottom: 30, Text: "A", R: 0, C: 0, LayoutType: "table"},
		{X0: 110, X1: 190, Top: 0, Bottom: 30, Text: "B", R: 0, C: 1, LayoutType: "table"},
		{X0: 10, X1: 90, Top: 35, Bottom: 65, Text: "NOT_TABLE", R: 0, C: 0, LayoutType: "text"}, // non-table, R/C=0
		{X0: 110, X1: 190, Top: 35, Bottom: 65, Text: "D", R: 1, C: 1, LayoutType: "table"},
	}
	tables := []TableItem{{
		Cells:     []TSRCell{{Label: "table"}},
		Positions: []Position{{Left: 0, Right: 200, Top: 0, Bottom: 70}},
		Scale:     1.0,
	}}
	result := extractTableAndReplace(boxes, tables)
	// Guard before indexing so an empty result fails the test instead of
	// panicking it.
	if len(result) == 0 {
		t.Fatal("extractTableAndReplace returned no boxes; expected the HTML table box at index 0")
	}
	html := result[0].Text
	// The HTML must carry the table boxes' text but never "NOT_TABLE".
	if !strings.Contains(html, "A") || !strings.Contains(html, "D") {
		t.Errorf("missing table box text: %s", html)
	}
	if strings.Contains(html, "NOT_TABLE") {
		t.Errorf("non-table box leaked into HTML: %s", html)
	}
}
// TestFillCellText_RCOverSpatial contrasts the two fill strategies on a wide
// box that overlaps three neighbouring cells: fillCellTextFromBoxes is
// expected to write the text into more than one cell, while indexing by the
// box's R/C annotation must write it into exactly one.
func TestFillCellText_RCOverSpatial(t *testing.T) {
	// The box spans X=30..270 and so overlaps every cell below; its annotation
	// (R=0, C=1) names the middle one.
	template := []TSRCell{
		{X0: 0, Y0: 0, X1: 100, Y1: 30, Label: "table"},
		{X0: 90, Y0: 0, X1: 200, Y1: 30, Label: "table"},
		{X0: 180, Y0: 0, X1: 300, Y1: 30, Label: "table"},
	}
	boxes := []TextBox{
		{X0: 30, X1: 270, Top: 0, Bottom: 30, Text: "TEXT", LayoutType: "table", R: 0, C: 1},
	}

	// Spatial fill on a private copy: duplication across cells is expected.
	bySpace := append([]TSRCell(nil), template...)
	fillCellTextFromBoxes(bySpace, boxes)
	spatialCount := 0
	for i := range bySpace {
		if bySpace[i].Text != "" {
			spatialCount++
		}
	}
	if spatialCount <= 1 {
		t.Errorf("spatial fill: expected >1 cells with text, got %d", spatialCount)
	}
	t.Logf("spatial fill: %d cells (WRONG — duplication)", spatialCount)

	// R/C fill on another private copy: only the annotated cell is written.
	byIndex := append([]TSRCell(nil), template...)
	rows := groupTSRCellsToRowsLabeled(byIndex)
	for _, b := range boxes {
		if b.R < 0 || b.R >= len(rows) {
			continue
		}
		if b.C < 0 || b.C >= len(rows[b.R]) {
			continue
		}
		rows[b.R][b.C].Text = strings.TrimSpace(b.Text)
	}
	rcCount := 0
	for r := range rows {
		for c := range rows[r] {
			if rows[r][c].Text == "TEXT" {
				rcCount++
			}
		}
	}
	if rcCount != 1 {
		t.Errorf("R/C fill: expected 1 cell with 'TEXT', got %d", rcCount)
	}
}
// TestIsCaptionBox is a table-driven check of isCaptionBox (called with an
// empty second argument): strings that open with a table/figure caption
// prefix are captions, ordinary sentences and the empty string are not.
func TestIsCaptionBox(t *testing.T) {
	type captionCase struct {
		text string
		want bool
	}
	cases := []captionCase{
		{text: "表1交通工具等级", want: true},
		{text: "Table 1: Transport Levels", want: true},
		{text: "图表 1: 测试", want: true},
		{text: "公司领导班子成员、出差地", want: false}, // plain text, not a caption
		{text: "第十条到厂矿单位出差", want: false},   // normal paragraph
		{text: "", want: false},
	}
	for _, tc := range cases {
		got := isCaptionBox(tc.text, "")
		if got == tc.want {
			continue
		}
		t.Errorf("isCaptionBox(%q) = %v, want %v", tc.text, got, tc.want)
	}
}
// TestFillCellTextFromBoxes_SkipsCaption checks that fillCellTextFromBoxes
// leaves a cell empty when the only box covering it holds caption-style text,
// while still filling the neighbouring cell from an ordinary data box.
func TestFillCellTextFromBoxes_SkipsCaption(t *testing.T) {
	const (
		caption = "表1交通工具等级"
		data    = "数据行"
	)
	cells := []TSRCell{
		{X0: 0, Y0: 0, X1: 200, Y1: 30, Label: "table"},
		{X0: 0, Y0: 35, X1: 200, Y1: 65, Label: "table"},
	}
	// Each box coincides exactly with one cell: the caption with cell 0, the
	// data with cell 1.
	boxes := []TextBox{
		{X0: 0, X1: 200, Top: 0, Bottom: 30, Text: caption},
		{X0: 0, X1: 200, Top: 35, Bottom: 65, Text: data},
	}
	fillCellTextFromBoxes(cells, boxes)
	if got := cells[0].Text; got != "" {
		t.Errorf("caption leaked into cell 0: %q", got)
	}
	if got := cells[1].Text; got != data {
		t.Errorf("data not in cell 1: %q", got)
	}
}
// TestFillCellText_RCPreventsCrossCellLeak checks that a single box annotated
// R=0, C=0 ends up in the first cell of a two-row table and nowhere else, both
// with the spatial fill and with an R/C-indexed fill.
func TestFillCellText_RCPreventsCrossCellLeak(t *testing.T) {
	const want = "公司领导班子成员、出差地"
	// Two stacked full-width cells. The box (Y=12..28) lies inside the first
	// cell (Y=0..30) and does not reach the second (Y=35..65).
	cells := []TSRCell{
		{X0: 0, Y0: 0, X1: 300, Y1: 30, Label: "table"},
		{X0: 0, Y0: 35, X1: 300, Y1: 65, Label: "table"},
	}
	boxes := []TextBox{
		{X0: 10, X1: 200, Top: 12, Bottom: 28, Text: want, R: 0, C: 0},
	}

	// Spatial fill: the second cell must stay empty.
	cellsSp := make([]TSRCell, len(cells))
	copy(cellsSp, cells)
	fillCellTextFromBoxes(cellsSp, boxes)
	if cellsSp[1].Text != "" {
		t.Errorf("spatial fill: caption leaked to cell[1]: %q", cellsSp[1].Text)
	}

	// R/C fill: write through the grouped rows, first writer wins.
	cellsRC := make([]TSRCell, len(cells))
	copy(cellsRC, cells)
	rows := groupTSRCellsToRowsLabeled(cellsRC)
	for _, b := range boxes {
		if b.R < 0 || b.R >= len(rows) || b.C < 0 || b.C >= len(rows[b.R]) {
			continue
		}
		if rows[b.R][b.C].Text == "" {
			rows[b.R][b.C].Text = strings.TrimSpace(b.Text)
		}
	}
	// Assert on rows, the structure that was actually written. cellsRC only
	// reflects the fill if the grouping aliases its input, so checking it
	// alone could pass without the fill having happened at all.
	if len(rows) == 0 || len(rows[0]) == 0 {
		t.Fatalf("R/C fill: grouping produced no cell at [0][0] (%d rows)", len(rows))
	}
	if rows[0][0].Text != want {
		t.Errorf("R/C fill: cell[0][0] = %q, want %q", rows[0][0].Text, want)
	}
	for r := range rows {
		for c := range rows[r] {
			if (r != 0 || c != 0) && rows[r][c].Text != "" {
				t.Errorf("R/C fill: caption leaked to cell[%d][%d]: %q", r, c, rows[r][c].Text)
			}
		}
	}
	if cellsRC[1].Text != "" {
		t.Errorf("R/C fill: caption leaked to cell[1]: %q", cellsRC[1].Text)
	}
}
// TestGroupBoxesByRC_FallbackToYXWhenNoAnnotations checks that groupBoxesByRC
// still produces rows when every box carries R=-1/C=-1: the four boxes occupy
// two Y bands, so two rows are expected from coordinate-based grouping.
func TestGroupBoxesByRC_FallbackToYXWhenNoAnnotations(t *testing.T) {
	const none = -1 // marks a box without row/column annotation
	unannotated := []TextBox{
		{X0: 10, X1: 90, Top: 0, Bottom: 30, Text: "A", R: none, C: none},
		{X0: 110, X1: 190, Top: 0, Bottom: 30, Text: "B", R: none, C: none},
		{X0: 10, X1: 90, Top: 35, Bottom: 65, Text: "C", R: none, C: none},
		{X0: 110, X1: 190, Top: 35, Bottom: 65, Text: "D", R: none, C: none},
	}
	// An R-indexed grid would have no rows here, so the function has to group
	// by coordinates instead.
	switch n := len(groupBoxesByRC(unannotated)); {
	case n == 0:
		t.Fatal("groupBoxesByRC returned 0 rows when R=-1 — no YX fallback")
	case n != 2:
		t.Errorf("expected 2 rows (Y-split), got %d", n)
	}
}
// TestRowsToHTML_Colspan runs the groupBoxesByRC → calSpans → rowsToHTML
// pipeline on a grid whose first header box declares a horizontal extent
// (HLeft..HRight) wider than its own column, and expects a colspan attribute
// in the resulting HTML.
func TestRowsToHTML_Colspan(t *testing.T) {
	input := []TextBox{
		// "Name" is 80 wide but its H extent reaches X=190, covering the
		// neighbouring column as well.
		{X0: 10, X1: 90, Top: 0, Bottom: 30, Text: "Name", R: 0, C: 0, H: 1, HLeft: 10, HRight: 190},
		{X0: 110, X1: 190, Top: 0, Bottom: 30, Text: "", R: 0, C: 1, SP: 1},
		{X0: 10, X1: 90, Top: 35, Bottom: 65, Text: "John", R: 1, C: 0},
		{X0: 110, X1: 190, Top: 35, Bottom: 65, Text: "30", R: 1, C: 1},
	}
	grid := groupBoxesByRC(input)
	spans, covered := calSpans(grid)
	out := rowsToHTML(grid, "", nil, spans, covered)
	if hasSpan := strings.Contains(out, "colspan"); !hasSpan {
		t.Errorf("expected colspan attribute, got: %s", out)
	}
	t.Logf("HTML: %s", out)
}
// TestStripCaptionFromCells_ClearsCaptionPattern checks that
// stripCaptionFromCells blanks a cell whose text looks like a table caption
// and leaves an ordinary data cell untouched.
func TestStripCaptionFromCells_ClearsCaptionPattern(t *testing.T) {
	const (
		captionIdx = 0
		dataIdx    = 2
	)
	grid := []TSRCell{
		{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "表1差旅费标准"},
		{X0: 100, Y0: 0, X1: 200, Y1: 50, Text: ""},
		{X0: 0, Y0: 60, X1: 100, Y1: 110, Text: "张三"},
		{X0: 100, Y0: 60, X1: 200, Y1: 110, Text: "100"},
	}
	stripCaptionFromCells(grid)
	if got := grid[captionIdx].Text; got != "" {
		t.Errorf("caption cell should be cleared, got %q", got)
	}
	if got := grid[dataIdx].Text; got != "张三" {
		t.Errorf("data cell should be preserved, got %q", got)
	}
}
// TestStripCaptionFromCells_PreservesData checks that stripCaptionFromCells
// changes nothing when no cell text resembles a caption.
func TestStripCaptionFromCells_PreservesData(t *testing.T) {
	grid := []TSRCell{
		{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "姓名"},
		{X0: 100, Y0: 0, X1: 200, Y1: 50, Text: "年龄"},
		{X0: 0, Y0: 60, X1: 100, Y1: 110, Text: "张三"},
		{X0: 100, Y0: 60, X1: 200, Y1: 110, Text: "25"},
	}
	// Snapshot the texts first so the comparison is against the pre-call state.
	before := make([]string, 0, len(grid))
	for i := range grid {
		before = append(before, grid[i].Text)
	}
	stripCaptionFromCells(grid)
	for i, was := range before {
		if now := grid[i].Text; now != was {
			t.Errorf("cell[%d] changed: %q -> %q", i, was, now)
		}
	}
}
// TestStripCaptionFromCells_Empty is a no-op on empty cells.
func TestStripCaptionFromCells_Empty(t *testing.T) {
cells := []TSRCell{}
stripCaptionFromCells(cells) // must not panic
}
// TestConstructTable_StripsCaptionFromCells checks that constructTable drops
// caption-like cell text from the HTML it produces while keeping ordinary cell
// text. It is called with a nil *TableItem.
func TestConstructTable_StripsCaptionFromCells(t *testing.T) {
	// The left cell carries a caption, the right cell carries real data.
	pair := []TSRCell{
		{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "表1标题"},
		{X0: 100, Y0: 0, X1: 200, Y1: 50, Text: "数据"},
	}
	out := constructTable(pair, nil, "", nil)
	captionPresent := strings.Contains(out, "表1")
	dataPresent := strings.Contains(out, "数据")
	if captionPresent {
		t.Errorf("caption text '表1标题' should be stripped: %s", out)
	}
	if !dataPresent {
		t.Errorf("data text '数据' should be preserved: %s", out)
	}
	t.Logf("HTML: %s", out)
}
// TestCalSpans_NonSpanningCellsNotPolluted is a regression test for calSpans.
// Row 0 holds a regular cell, a wide "table spanning cell" (X=0..200) and a
// second regular cell. The regular cell at [0,0] must not receive a colspan
// merely because the spanning neighbour starts at the same X0; the spanning
// cell at [0,1] must get colspan 2 and mark [0,2] as covered.
func TestCalSpans_NonSpanningCellsNotPolluted(t *testing.T) {
	grid := [][]TSRCell{
		{
			{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "Q1", Label: "table row"},
			{X0: 0, Y0: 0, X1: 200, Y1: 30, Text: "部门开支汇总", Label: "table spanning cell"},
			{X0: 101, Y0: 0, X1: 200, Y1: 30, Text: "Q2", Label: "table row"},
		},
		{
			{X0: 0, Y0: 35, X1: 100, Y1: 65, Text: "100", Label: "table row"},
			{X0: 101, Y0: 35, X1: 200, Y1: 65, Text: "200", Label: "table row"},
		},
	}
	var (
		q1Pos   = [2]int{0, 0}
		spanPos = [2]int{0, 1}
		q2Pos   = [2]int{0, 2}
	)
	spans, covered := calSpans(grid)

	// Q1 (X=0..100) fits its own column and must have no span entry.
	if s, found := spans[q1Pos]; found {
		t.Errorf("Q1 at [0,0] should NOT have colspan, got %v. "+
			"Spanning cell at [0,1] polluted column boundaries", s)
	}
	// The wide cell (X=0..200) must be recorded with a colspan of 2.
	switch s, found := spans[spanPos]; {
	case !found:
		t.Error("部门开支汇总 at [0,1] should have colspan=2 (covers X=0-200)")
	case s[0] != 2:
		t.Errorf("部门开支汇总 colspan = %d, want 2", s[0])
	}
	// Q2 lies under the wide cell and must be flagged as covered.
	if isCovered := covered[q2Pos]; !isCovered {
		t.Error("Q2 at [0,2] should be covered by spanning cell at [0,1]")
	}
	t.Logf("spans: %v, covered: %v", spans, covered)
}
// ── coordinate space conversion helpers ─────────────────────────────────
// TestCellToPageSpace checks that cellToPageSpace maps a cell's coordinates as
// (coord + offset) / scale and carries Text and Label over unchanged.
func TestCellToPageSpace(t *testing.T) {
	// near compares floats with a small tolerance: two of the expected values
	// (115/3 and 425/3) are not exactly representable, so bit-exact equality
	// would tie the test to one particular order of floating-point operations.
	// NOTE(review): assumes the TSRCell coordinates are float64 — confirm.
	near := func(got, want float64) bool {
		const eps = 1e-9
		d := got - want
		if d < 0 {
			d = -d
		}
		return d <= eps
	}
	cell := TSRCell{X0: 100, Y0: 200, X1: 300, Y1: 400, Text: "hello", Label: "table"}
	got := cellToPageSpace(cell, 15, 25, 3.0)
	// X: (100+15)/3 and (300+15)/3; Y: (200+25)/3 and (400+25)/3.
	if !near(got.X0, 115.0/3) || !near(got.Y0, 75) || !near(got.X1, 105) || !near(got.Y1, 425.0/3) {
		t.Errorf("cellToPageSpace: got (%f,%f,%f,%f), want (38.33,75,105,141.67)", got.X0, got.Y0, got.X1, got.Y1)
	}
	if got.Text != "hello" || got.Label != "table" {
		t.Error("cellToPageSpace should preserve Text and Label")
	}
}
// TestCellAddOffset checks that cellAddOffset shifts X coordinates by the
// first offset and Y coordinates by the second, and keeps Text.
func TestCellAddOffset(t *testing.T) {
	in := TSRCell{X0: 100, Y0: 200, X1: 300, Y1: 400, Text: "hello"}
	out := cellAddOffset(in, 15, 25)
	shifted := out.X0 == 115 && out.Y0 == 225 && out.X1 == 315 && out.Y1 == 425
	if !shifted {
		t.Errorf("cellAddOffset: got (%f,%f,%f,%f)", out.X0, out.Y0, out.X1, out.Y1)
	}
	if out.Text != "hello" {
		t.Error("cellAddOffset should preserve Text")
	}
}
// TestBoxToCropSpace checks boxToCropSpace on one box with scale 3 and offsets
// (10, 20): the expected values equal coord*scale - offset on each axis, and
// Text must be carried over.
func TestBoxToCropSpace(t *testing.T) {
	in := TextBox{X0: 50, X1: 200, Top: 100, Bottom: 300, Text: "text"}
	out := boxToCropSpace(in, 3.0, 10, 20)
	// 50*3-10, 100*3-20, 200*3-10, 300*3-20.
	converted := out.X0 == 140 && out.Top == 280 && out.X1 == 590 && out.Bottom == 880
	if !converted {
		t.Errorf("boxToCropSpace: got (%f,%f,%f,%f)", out.X0, out.Top, out.X1, out.Bottom)
	}
	if out.Text != "text" {
		t.Error("boxToCropSpace should preserve Text")
	}
}
// TestCopyBoxAnnotations checks that copyBoxAnnotations transfers every table
// annotation field (row, column, header, span extents and SP) from the source
// box to a zero-valued destination.
func TestCopyBoxAnnotations(t *testing.T) {
	// Every field gets a distinct non-zero value so a missed or crossed copy
	// is detectable.
	from := &TextBox{
		R: 1, C: 2, RTop: 10, RBott: 20,
		H: 3, HTop: 30, HBott: 40,
		HLeft: 50, HRight: 60, CLeft: 70, CRight: 80,
		SP: 4,
	}
	to := &TextBox{}
	copyBoxAnnotations(to, from)

	rowColOK := to.R == 1 && to.C == 2 && to.RTop == 10 && to.RBott == 20
	headerOK := to.H == 3 && to.HTop == 30 && to.HBott == 40
	extentOK := to.HLeft == 50 && to.HRight == 60 && to.CLeft == 70 && to.CRight == 80
	if !rowColOK {
		t.Error("R/C fields not copied")
	}
	if !headerOK {
		t.Error("H fields not copied")
	}
	if !extentOK {
		t.Error("spanning fields not copied")
	}
	if to.SP != 4 {
		t.Error("SP not copied")
	}
}
// TestAnnotateBoxLayouts_CompactionPreservesWriteBackMapping is a regression
// test for the interaction between annotateBoxLayouts and a caller that
// writes per-page results back to a global box slice by index.
//
// Scenario: three boxes are copied into a per-page slice, annotateBoxLayouts
// is run on that slice with one "reference" region (covering the first box)
// and two "text" regions (covering the other two), and the per-page results
// are then copied back position-by-position through an index table.
//
// The bug being guarded against, as described by the original author:
// annotateBoxLayouts compacts the slice in place when it drops a box, shifting
// survivors forward, so pageBoxes[1] ends up holding what used to be
// pageBoxes[2] and the positional write-back lands annotations on the wrong
// global box.
func TestAnnotateBoxLayouts_CompactionPreservesWriteBackMapping(t *testing.T) {
// ── Simulate the exact enrichWithDeepDoc write-back pattern ──
// Global boxes on a page: B0, B1, B2 (indices 0, 1, 2 in the PDF-space
// boxes slice).
boxes := []TextBox{
{X0: 0, X1: 100, Top: 0, Bottom: 50, Text: "will be dropped via reference match"},
{X0: 0, X1: 100, Top: 60, Bottom: 110, Text: "text box A"},
{X0: 110, X1: 200, Top: 60, Bottom: 110, Text: "text box B"},
}
// Per-page subset (what enrichWithDeepDoc constructs from byPage[pg]).
// indices[i] is the position in boxes that pageBoxes[i] was copied from.
indices := []int{0, 1, 2}
pageBoxes := make([]TextBox, len(indices))
for i, idx := range indices {
pageBoxes[i] = boxes[idx] // value copy
}
// DLA regions: one reference (garbage type → matched boxes are dropped
// unless at page edge), two text regions for the surviving boxes.
// scale=1.0 so DLA pixel coords == PDF point coords.
regions := []DLARegion{
{Label: "reference", Confidence: 0.9, X0: 0, Y0: 0, X1: 100, Y1: 50},
{Label: "text", Confidence: 0.9, X0: 0, Y0: 60, X1: 100, Y1: 110},
{Label: "text", Confidence: 0.9, X0: 110, Y0: 60, X1: 200, Y1: 110},
}
pageImgHeight := 200.0
// The function under test. Its return value is deliberately discarded: the
// write-back below reads pageBoxes by position, which is exactly the access
// pattern this test is about.
_ = annotateBoxLayouts(pageBoxes, regions, 1.0, pageImgHeight)
// Simulate enrichWithDeepDoc write-back (table.go:52-58).
// Layout fields are copied only when a layout was assigned; the table
// annotations are copied unconditionally.
for i, idx := range indices {
if pageBoxes[i].LayoutType != "" {
boxes[idx].LayoutType = pageBoxes[i].LayoutType
boxes[idx].LayoutNo = pageBoxes[i].LayoutNo
}
copyBoxAnnotations(&boxes[idx], &pageBoxes[i])
}
// ── Assertions ──
// B0 matched a "reference" region far from page edge → must be dropped,
// so no layout may have been written back to it.
if boxes[0].LayoutType != "" {
t.Errorf("B0 was dropped (reference region) but got LayoutType=%q from a shifted survivor",
boxes[0].LayoutType)
}
// B1 matched the first text region → must be text-0.
if boxes[1].LayoutType != "text" {
t.Errorf("B1 LayoutType = %q, want text", boxes[1].LayoutType)
}
if boxes[1].LayoutNo != "text-0" {
t.Errorf("B1 LayoutNo = %q, want text-0 (compaction shifted B2 into position 1)", boxes[1].LayoutNo)
}
// B2 matched the second text region → must be text-1.
if boxes[2].LayoutType != "text" {
t.Errorf("B2 LayoutType = %q, want text", boxes[2].LayoutType)
}
if boxes[2].LayoutNo != "text-1" {
t.Errorf("B2 LayoutNo = %q, want text-1 (stale element at position 2 after compaction)", boxes[2].LayoutNo)
}
}
// ── matchTableRegions unit tests ─────────────────────────────────────
// TestMatchTableRegions_SingleMatch checks that matchTableRegions returns one
// match for the single "table" region and attaches only the box that region
// covers. Regions are given in pixels at scale 3, boxes in page units.
func TestMatchTableRegions_SingleMatch(t *testing.T) {
	const scale = 3.0
	pageBoxes := []TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 50},
		{X0: 200, X1: 300, Top: 0, Bottom: 50},
	}
	layout := []DLARegion{
		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "table"},  // 0..100 x 0..50 in page units: box 0
		{X0: 600, Y0: 0, X1: 900, Y1: 150, Label: "text"}, // not a table, must be ignored
	}
	found := matchTableRegions(pageBoxes, layout, scale)
	if n := len(found); n != 1 {
		t.Fatalf("expected 1 match, got %d", n)
	}
	idx := found[0].boxIdx
	if !(len(idx) == 1 && idx[0] == 0) {
		t.Errorf("expected box 0 matched, got %v", idx)
	}
}
// TestMatchTableRegions_NoTableLabel checks that regions labelled anything
// other than "table" never produce a match, even when they fully cover a box.
func TestMatchTableRegions_NoTableLabel(t *testing.T) {
	pageBoxes := []TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 50}}
	// Both regions cover the box exactly at scale 3, but neither is a table.
	layout := []DLARegion{
		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "text"},
		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "figure"},
	}
	if n := len(matchTableRegions(pageBoxes, layout, 3.0)); n != 0 {
		t.Errorf("non-table labels: expected 0 matches, got %d", n)
	}
}
// TestMatchTableRegions_MultipleBoxesSameTable checks that one table region
// covering two boxes yields a single match listing both boxes.
func TestMatchTableRegions_MultipleBoxesSameTable(t *testing.T) {
	pageBoxes := []TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 50},   // box 0
		{X0: 110, X1: 210, Top: 0, Bottom: 50}, // box 1
	}
	// 630x150 px at scale 3 is 210x50 in page units, enclosing both boxes.
	layout := []DLARegion{
		{X0: 0, Y0: 0, X1: 630, Y1: 150, Label: "table"},
	}
	found := matchTableRegions(pageBoxes, layout, 3.0)
	if n := len(found); n != 1 {
		t.Fatalf("expected 1 match, got %d", n)
	}
	if idx := found[0].boxIdx; len(idx) != 2 {
		t.Errorf("expected 2 boxes matched, got %d: %v", len(idx), idx)
	}
}
func TestMatchTableRegions_ImageOnlyPDF(t *testing.T) {
// Zero boxes — image-only PDF. Python processes every table DLA region
// regardless of text box overlap.
var boxes []TextBox // nil
regions := []DLARegion{
{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "table"},
{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "text"},
}
matches := matchTableRegions(boxes, regions, 3.0)
if len(matches) != 1 {
t.Fatalf("image-only: expected 1 table match, got %d", len(matches))
}
if len(matches[0].boxIdx) != 0 {
t.Errorf("image-only: expected empty boxIdx, got %d", len(matches[0].boxIdx))
}
}
// TestMatchTableRegions_BelowThreshold checks that a table region covering
// only a small corner of the sole text box produces no match at all.
func TestMatchTableRegions_BelowThreshold(t *testing.T) {
	pageBoxes := []TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 100}}
	// 90x90 px at scale 3 is 30x30 in page units: 900 of the box's 10000
	// square units, i.e. 9% of its area.
	layout := []DLARegion{
		{X0: 0, Y0: 0, X1: 90, Y1: 90, Label: "table"},
	}
	if n := len(matchTableRegions(pageBoxes, layout, 3.0)); n != 0 {
		t.Errorf("below threshold: expected 0 matches, got %d", n)
	}
}
// TestCellSliceToPageSpace checks that cellSliceToPageSpace converts every
// cell in the slice, using (X0 + offset) / scale for the left edge.
func TestCellSliceToPageSpace(t *testing.T) {
	// near compares floats with a small tolerance: 115/3 and 415/3 are not
	// exactly representable, so bit-exact equality would tie the test to one
	// particular order of floating-point operations.
	// NOTE(review): assumes the TSRCell coordinates are float64 — confirm.
	near := func(got, want float64) bool {
		const eps = 1e-9
		d := got - want
		if d < 0 {
			d = -d
		}
		return d <= eps
	}
	cells := []TSRCell{
		{X0: 100, Y0: 200, X1: 300, Y1: 400},
		{X0: 400, Y0: 200, X1: 600, Y1: 400},
	}
	got := cellSliceToPageSpace(cells, 15, 25, 3)
	if len(got) != 2 {
		t.Fatalf("expected 2 cells, got %d", len(got))
	}
	// (100+15)/3 and (400+15)/3.
	if !near(got[0].X0, 115.0/3) || !near(got[1].X0, 415.0/3) {
		t.Errorf("wrong conversion: X0 = (%f, %f), want (38.33, 138.33)", got[0].X0, got[1].X0)
	}
}
// MockTableBuilder is a test double for the table builder passed to
// writeTableAnnotations. Only the grouping step is configurable; the other
// methods return fixed values.
type MockTableBuilder struct {
	// GroupCellsFn, when set, provides the result of GroupCells.
	GroupCellsFn func(cells []TSRCell) [][]TSRCell
}

// Name reports the fixed builder name "mock".
func (m *MockTableBuilder) Name() string {
	return "mock"
}

// DetectCells ignores both arguments and reports no cells and no error.
func (m *MockTableBuilder) DetectCells(_ context.Context, _ image.Image) ([]TSRCell, error) {
	return nil, nil
}

// GroupCells delegates to GroupCellsFn, or returns nil when it is unset.
func (m *MockTableBuilder) GroupCells(cells []TSRCell) [][]TSRCell {
	if m.GroupCellsFn == nil {
		return nil
	}
	return m.GroupCellsFn(cells)
}
// ── writeTableAnnotations unit tests ──────────────────────────────────
// TestWriteTableAnnotations_WriteBack checks that writeTableAnnotations writes
// row/column annotations back only to the boxes listed in boxIdx: boxes 0 and
// 2 are selected and must get R=0 and R=1 respectively, while box 1 must keep
// its zero values.
func TestWriteTableAnnotations_WriteBack(t *testing.T) {
	boxes := []TextBox{
		{X0: 10, X1: 100, Top: 10, Bottom: 30, Text: "A", LayoutType: "table"},
		{X0: 110, X1: 200, Top: 10, Bottom: 30, Text: "B", LayoutType: "table"},
		{X0: 10, X1: 100, Top: 35, Bottom: 55, Text: "C", LayoutType: "table"},
	}
	selected := []int{0, 2}
	// Cell coordinates are in crop pixels; with scale 3 they correspond to
	// page-space rows 10..30 and ~36.7..56.7.
	tsr := []TSRCell{
		{X0: 30, Y0: 30, X1: 300, Y1: 90, Label: "table row"},
		{X0: 30, Y0: 110, X1: 300, Y1: 170, Label: "table row"},
	}
	// The mock puts each cell in its own row.
	oneCellPerRow := func(in []TSRCell) [][]TSRCell {
		return [][]TSRCell{{in[0]}, {in[1]}}
	}
	builder := &MockTableBuilder{GroupCellsFn: oneCellPerRow}
	scale := 3.0
	writeTableAnnotations(boxes, selected, tsr, scale, 0, 0, builder)

	if got := boxes[0].R; got != 0 {
		t.Errorf("box 0 R = %d, want 0", got)
	}
	if got := boxes[0].C; got != 0 {
		t.Errorf("box 0 C = %d, want 0", got)
	}
	// Box 1 was not selected, so it must not have been annotated.
	if skipped := boxes[1]; !(skipped.R == 0 && skipped.C == 0) {
		t.Errorf("box 1 should not be annotated: R=%d C=%d", skipped.R, skipped.C)
	}
	if got := boxes[2].R; got != 1 {
		t.Errorf("box 2 R = %d, want 1", got)
	}
}
// TestWriteTableAnnotations_ScaleDown checks that writeTableAnnotations leaves
// a non-zero RTop on the annotated box when the TSR cell is given in crop
// pixels at scale 3. Only "non-zero" is asserted, not the exact value.
func TestWriteTableAnnotations_ScaleDown(t *testing.T) {
	boxes := []TextBox{
		{X0: 10, X1: 100, Top: 10, Bottom: 50, Text: "X", LayoutType: "table"},
	}
	tsr := []TSRCell{
		{X0: 30, Y0: 30, X1: 300, Y1: 150, Label: "table row"},
	}
	// The mock returns the single cell as a one-row, one-column grid.
	builder := &MockTableBuilder{
		GroupCellsFn: func(in []TSRCell) [][]TSRCell {
			return [][]TSRCell{{in[0]}}
		},
	}
	scale := 3.0
	writeTableAnnotations(boxes, []int{0}, tsr, scale, 0, 0, builder)
	// The cell's top edge is 30 px, which is 10 in page units at scale 3.
	if annotated := boxes[0]; annotated.RTop == 0 {
		t.Error("RTop should be non-zero after annotation")
	}
}
func TestWriteTableAnnotations_EmptyCells(t *testing.T) {
boxes := []TextBox{{X0: 10, X1: 100, Top: 10, Bottom: 50, Text: "X", LayoutType: "table"}}
boxIdx := []int{0}
var cells []TSRCell
tb := &MockTableBuilder{GroupCellsFn: func(cells []TSRCell) [][]TSRCell {
return nil
}}
// Should not panic with empty cells.
writeTableAnnotations(boxes, boxIdx, cells, 3.0, 0, 0, tb)
if boxes[0].R != 0 || boxes[0].C != 0 {
t.Errorf("empty cells: R=%d C=%d, want 0,0", boxes[0].R, boxes[0].C)
}
}
// ── markNoMergeTables unit tests ─────────────────────────────────────
// TestMarkNoMergeTables_CaptionAfterTable checks that a table box immediately
// followed by a "table caption" box causes the matching TableItem to be
// flagged NoMerge.
func TestMarkNoMergeTables_CaptionAfterTable(t *testing.T) {
	tableBox := TextBox{X0: 0, X1: 100, Top: 0, Bottom: 30, LayoutType: "table"}
	captionBox := TextBox{X0: 0, X1: 100, Top: 35, Bottom: 50, LayoutType: "table caption", Text: "表1标题"}

	// One table whose position coincides with tableBox.
	tables := []TableItem{
		{Positions: []Position{{Left: 0, Right: 100, Top: 0, Bottom: 30}}},
	}

	markNoMergeTables([]TextBox{tableBox, captionBox}, tables)

	if marked := tables[0].NoMerge; !marked {
		t.Error("table followed by caption should be marked NoMerge")
	}
}
func TestMarkNoMergeTables_TitleAfterTable(t *testing.T) {
boxes := []TextBox{
{X0: 0, X1: 100, Top: 0, Bottom: 30, LayoutType: "table"},
{X0: 0, X1: 100, Top: 35, Bottom: 50, LayoutType: "title"},
}
tables := []TableItem{
{Positions: []Position{{Left: 0, Right: 100, Top: 0, Bottom: 30}}},
}
markNoMergeTables(boxes, tables)
if !tables[0].NoMerge {
t.Error("table followed by title should be marked NoMerge")
}
}
// TestMarkNoMergeTables_NoCaptionAfter checks that neither table is flagged
// NoMerge when the first table is followed by ordinary text and the second
// table is the last box.
func TestMarkNoMergeTables_NoCaptionAfter(t *testing.T) {
	layout := []TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 30, LayoutType: "table"},
		{X0: 0, X1: 100, Top: 35, Bottom: 50, LayoutType: "text"},
		{X0: 0, X1: 100, Top: 55, Bottom: 70, LayoutType: "table"},
	}
	tables := []TableItem{
		{Positions: []Position{{Left: 0, Right: 100, Top: 0, Bottom: 30}}},
		{Positions: []Position{{Left: 0, Right: 100, Top: 55, Bottom: 70}}},
	}

	markNoMergeTables(layout, tables)

	// One failure message per table, in table order.
	failures := [...]string{
		"table followed by text should NOT be marked NoMerge",
		"last table should NOT be marked NoMerge",
	}
	for i, msg := range failures {
		if tables[i].NoMerge {
			t.Error(msg)
		}
	}
}
// TestMarkNoMergeTables_StaleLastTableTI guards against a stale "last table"
// index inside markNoMergeTables.
//
// Scenario: a table box that does NOT overlap any TableItem position should
// reset the remembered table. Otherwise the next caption/title marks the
// wrong (non-adjacent) table as NoMerge.
//
//	Box 0: "table", overlaps tables[0]  → remembered table = 0
//	Box 1: "table", overlaps no table   → remembered table should reset
//	Box 2: "title"                      → should be a no-op (no adjacent table)
//
// Because the title must be a no-op, no table at all may end up flagged; the
// test therefore checks tables[1] as well as tables[0].
func TestMarkNoMergeTables_StaleLastTableTI(t *testing.T) {
	boxes := []TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 30, LayoutType: "table"},
		{X0: 500, X1: 600, Top: 100, Bottom: 130, LayoutType: "table"}, // far away, no overlap
		{X0: 0, X1: 100, Top: 140, Bottom: 160, LayoutType: "title"},
	}
	tables := []TableItem{
		{Positions: []Position{{Left: 0, Right: 100, Top: 0, Bottom: 30}}},  // table 0
		{Positions: []Position{{Left: 0, Right: 100, Top: 35, Bottom: 65}}}, // table 1 — overlapped by no box
	}

	markNoMergeTables(boxes, tables)

	// tables[0] should NOT be NoMerge: the title follows a non-matching
	// table box, not tables[0] directly.
	if tables[0].NoMerge {
		t.Error("stale lastTableTI: table[0] incorrectly marked NoMerge — " +
			"the non-overlapping table box (box 1) should have reset lastTableTI")
	}
	// tables[1] is overlapped by no box, so nothing can legitimately mark it.
	if tables[1].NoMerge {
		t.Error("table[1] incorrectly marked NoMerge — no box overlaps it, " +
			"so the title should have been a no-op")
	}
}
func TestMarkNoMergeTables_EmptyInputs(t *testing.T) {
// Should not panic with empty inputs.
markNoMergeTables(nil, nil)
markNoMergeTables([]TextBox{}, []TableItem{})
}