Files
ragflow/internal/deepdoc/parser/pdf/table/table_annotate_test.go
Jack 98323e7910 Refactor: oss parser go refactor (#16391)
### What problem does this PR solve?

Package refactor and PDF post process.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Claude <noreply@anthropic.com>
2026-06-29 18:46:41 +08:00

605 lines
22 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package table
import (
"context"
"image"
"testing"
pdf "ragflow/internal/deepdoc/parser/pdf/type"
)
// TestAnnotateBoxLayouts_SetsLabel checks that each box receives the label of
// the region that covers it. Region coordinates are given at 3x the box
// coordinates, matching the 3.0 scale passed to AnnotateBoxLayouts.
func TestAnnotateBoxLayouts_SetsLabel(t *testing.T) {
	regions := []pdf.DLARegion{
		{X0: 0, Y0: 0, X1: 300, Y1: 60, Label: "title"},  // box 0 once divided by 3
		{X0: 0, Y0: 90, X1: 300, Y1: 150, Label: "text"}, // box 1 once divided by 3
	}
	boxes := AnnotateBoxLayouts([]pdf.TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 20},
		{X0: 0, X1: 100, Top: 30, Bottom: 50},
	}, regions, 3.0, 0)

	if got := boxes[0].LayoutType; got != "title" {
		t.Errorf("box 0: got %q, want 'title'", got)
	}
	if got := boxes[1].LayoutType; got != "text" {
		t.Errorf("box 1: got %q, want 'text'", got)
	}
}
// TestAnnotateBoxLayouts_NoMatch checks that a box keeps an empty LayoutType
// when the only region does not intersect it.
func TestAnnotateBoxLayouts_NoMatch(t *testing.T) {
	// The region spans 900..1000 on both axes, nowhere near the box.
	farRegion := pdf.DLARegion{X0: 900, Y0: 900, X1: 1000, Y1: 1000, Label: "far"}
	box := pdf.TextBox{X0: 0, X1: 100, Top: 0, Bottom: 20}

	boxes := AnnotateBoxLayouts([]pdf.TextBox{box}, []pdf.DLARegion{farRegion}, 3.0, 0)
	if got := boxes[0].LayoutType; got != "" {
		t.Errorf("no match: expected empty, got %q", got)
	}
}
// TestAnnotateBoxLayouts_EmptyRegions checks that both a nil and an empty
// region slice leave the box without a LayoutType.
//
// Each case runs as its own subtest with a fresh box, so the nil case is
// asserted on its own rather than being overwritten by the empty-slice call.
func TestAnnotateBoxLayouts_EmptyRegions(t *testing.T) {
	cases := []struct {
		name    string
		regions []pdf.DLARegion
	}{
		{name: "nil", regions: nil},
		{name: "empty", regions: []pdf.DLARegion{}},
	}
	for _, tc := range cases {
		regions := tc.regions
		t.Run(tc.name, func(t *testing.T) {
			boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 20}}
			boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 0)
			// Guard the index so a dropped box fails the test instead of panicking.
			if len(boxes) == 0 {
				t.Fatal("empty regions: box was dropped, want it kept")
			}
			if boxes[0].LayoutType != "" {
				t.Errorf("empty regions: got %q, want empty", boxes[0].LayoutType)
			}
		})
	}
}
// TestAnnotateBoxLayouts_PriorityOverMaxArea checks that label priority beats
// overlap size: a "table" region covering 45% of the box (above the 40%
// threshold) is expected to win over a "text" region covering all of it.
func TestAnnotateBoxLayouts_PriorityOverMaxArea(t *testing.T) {
	// Full coverage of the 100x50 box, but the lower-priority label.
	textRegion := pdf.DLARegion{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "text"}
	// 45x50 of the 100x50 box once divided by the scale: 45% overlap.
	tableRegion := pdf.DLARegion{X0: 0, Y0: 0, X1: 45 * 3, Y1: 50 * 3, Label: "table"}

	boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 50}}
	boxes = AnnotateBoxLayouts(boxes, []pdf.DLARegion{textRegion, tableRegion}, 3.0, 0)

	if got := boxes[0].LayoutType; got != "table" {
		t.Errorf("priority: 'table' should win over 'text' when both meet threshold, got %q", got)
	}
}
// TestAnnotateBoxLayouts_OverlapThreshold checks that a region overlapping
// less than 40% of a box does not label it.
//
// After dividing by the scale the region is 30x30, which covers 900 of the
// box's 5000 square units (18%), well under the 0.4 threshold.
func TestAnnotateBoxLayouts_OverlapThreshold(t *testing.T) {
	boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 50}}
	regions := []pdf.DLARegion{
		{X0: 0, Y0: 0, X1: 30 * 3, Y1: 30 * 3, Label: "table"}, // 30x30 of the 100x50 box = 18%
	}
	boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 0)
	// Guard the index so a dropped box fails the test instead of panicking.
	if len(boxes) == 0 {
		t.Fatal("threshold: box was dropped, want it kept without a layout type")
	}
	if boxes[0].LayoutType != "" {
		t.Errorf("threshold: overlap < 40%% should not match, got %q", boxes[0].LayoutType)
	}
}
// TestAnnotateBoxLayouts_CIDGarbage checks that a box whose text is a
// "(cid:N)" pattern is removed from the result (mirroring Python's
// bxs.pop(i)), while the ordinary box survives and is labelled.
func TestAnnotateBoxLayouts_CIDGarbage(t *testing.T) {
	garbled := pdf.TextBox{X0: 0, X1: 100, Top: 0, Bottom: 20, Text: "(cid:123)"}
	normal := pdf.TextBox{X0: 0, X1: 100, Top: 30, Bottom: 50, Text: "normal text"}
	regions := []pdf.DLARegion{
		{X0: 0, Y0: 0, X1: 300, Y1: 60, Label: "text", Confidence: 0.9},
		{X0: 0, Y0: 90, X1: 300, Y1: 150, Label: "text", Confidence: 0.9},
	}

	boxes := AnnotateBoxLayouts([]pdf.TextBox{garbled, normal}, regions, 3.0, 0)

	// Only the non-garbled box should be left.
	if n := len(boxes); n != 1 {
		t.Fatalf("CID-garbled box should be popped, got %d boxes", n)
	}
	if got := boxes[0].LayoutType; got != "text" {
		t.Errorf("CID: remaining box should be 'text', got %q", got)
	}
}
// TestAnnotateBoxLayouts_LayoutNoFormat checks the LayoutNo format
// "{type}-{per_type_index}", where the index counts regions of the same type
// rather than all regions. Two boxes inside the same text region must get the
// same LayoutNo so that later stages can merge them.
func TestAnnotateBoxLayouts_LayoutNoFormat(t *testing.T) {
	const want = "text-0"

	// A single text region large enough to contain both boxes.
	regions := []pdf.DLARegion{
		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "text"},
	}
	boxes := AnnotateBoxLayouts([]pdf.TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 20},
		{X0: 0, X1: 100, Top: 30, Bottom: 50},
	}, regions, 3.0, 0)

	if got := boxes[0].LayoutNo; got != want {
		t.Errorf("box 0 layoutno: got %q, want %q", got, want)
	}
	if got := boxes[1].LayoutNo; got != want {
		t.Errorf("box 1 layoutno should share same per-type index: got %q, want %q", got, want)
	}
}
// TestAnnotateBoxLayouts_LayoutNoDifferentRegions checks that boxes matched to
// two distinct text regions receive distinct LayoutNo values.
func TestAnnotateBoxLayouts_LayoutNoDifferentRegions(t *testing.T) {
	regions := []pdf.DLARegion{
		{X0: 0, Y0: 0, X1: 300, Y1: 60, Label: "text"},    // first text region → index 0
		{X0: 0, Y0: 300, X1: 300, Y1: 360, Label: "text"}, // second text region → index 1
	}
	boxes := AnnotateBoxLayouts([]pdf.TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 20},
		{X0: 0, X1: 100, Top: 100, Bottom: 120},
	}, regions, 3.0, 0)

	if got := boxes[0].LayoutNo; got != "text-0" {
		t.Errorf("box 0: got %q, want 'text-0'", got)
	}
	if got := boxes[1].LayoutNo; got != "text-1" {
		t.Errorf("box 1: got %q, want 'text-1'", got)
	}
}
// TestAnnotateBoxLayouts_ConfidenceFilter verifies that DLA regions with
// low confidence (< 0.4) for garbage layout types are excluded from matching.
// Python: float(b["score"]) >= 0.4 filter in LayoutRecognizer.
//
// Both regions cover the box completely; the footer has confidence 0.2 and
// the text region 0.9, so the box is expected to end up labelled "text".
func TestAnnotateBoxLayouts_ConfidenceFilter(t *testing.T) {
	boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 50}}
	regions := []pdf.DLARegion{
		// Low-confidence footer — should be filtered out.
		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "footer", Confidence: 0.2},
		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "text", Confidence: 0.9},
	}
	boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 0)
	// If the footer were not filtered the box could be dropped; fail cleanly
	// instead of panicking on the index below.
	if len(boxes) == 0 {
		t.Fatal("low-confidence footer: box was dropped, want it kept as 'text'")
	}
	// Footer region filtered (low confidence) → box matches "text" instead.
	if boxes[0].LayoutType != "text" {
		t.Errorf("low-confidence footer filtered → box should get 'text', got %q", boxes[0].LayoutType)
	}
}
// TestAnnotateBoxLayouts_GarbageFooterRejected checks that a box matched to a
// high-confidence "footer" region far down the page is removed from the
// result (Python: bxs.pop(i)).
//
// The page image height is 300 at scale 3.0. The box sits at Top=280,
// Bottom=290 and the region at Y 840..870, i.e. 280..290 after dividing by
// the scale.
// NOTE(review): the original comment mixed pixel and PDF units ("Bottom(290)
// > 270" versus a PDF height of 100); confirm which coordinate space the
// position check uses.
func TestAnnotateBoxLayouts_GarbageFooterRejected(t *testing.T) {
	footer := pdf.DLARegion{X0: 0, Y0: 840, X1: 300, Y1: 870, Label: "footer", Confidence: 0.9}
	boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 280, Bottom: 290}}

	boxes = AnnotateBoxLayouts(boxes, []pdf.DLARegion{footer}, 3.0, 300)

	if n := len(boxes); n != 0 {
		t.Errorf("footer at bottom: should be popped as decoration, got %d boxes left", n)
	}
}
// TestAnnotateBoxLayouts_HeaderRemovedAtTop checks that a box labelled
// "header" near the top edge of the page is treated as page decoration and
// removed (Python: bxs.pop(i)).
//
// With a 300 px page image at scale 3.0, the box's Top of 5 falls inside the
// top 10% of the page.
func TestAnnotateBoxLayouts_HeaderRemovedAtTop(t *testing.T) {
	// Region Y 15..60 is 5..20 after dividing by the scale, matching the box.
	header := pdf.DLARegion{X0: 0, Y0: 15, X1: 300, Y1: 60, Label: "header", Confidence: 0.9}
	boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 5, Bottom: 20}}

	boxes = AnnotateBoxLayouts(boxes, []pdf.DLARegion{header}, 3.0, 300)

	if n := len(boxes); n != 0 {
		t.Errorf("header at very top: should be popped as decoration, got %d boxes left", n)
	}
}
// TestAnnotateBoxLayouts_HeaderKeptInMiddle checks that a "header" label in
// the middle of the page is treated as a DLA false positive: the box is kept
// and carries the "header" LayoutType.
//
// With a 300 px page image at scale 3.0, the box's Top of 50 is well below
// the top 10% of the page.
func TestAnnotateBoxLayouts_HeaderKeptInMiddle(t *testing.T) {
	boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 50, Bottom: 70}}
	regions := []pdf.DLARegion{
		{X0: 0, Y0: 150, X1: 300, Y1: 210, Label: "header", Confidence: 0.9}, // y=50-70 after /3
	}
	boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 300)
	// A dropped box is exactly the regression this test guards against, so
	// report it as a failure rather than panicking on boxes[0].
	if len(boxes) == 0 {
		t.Fatal("header in middle of page: box was dropped, want it kept as 'header'")
	}
	if boxes[0].LayoutType != "header" {
		t.Errorf("header in middle of page: DLA false positive, keep text, got %q", boxes[0].LayoutType)
	}
}
// TestAnnotateBoxLayouts_FooterRemovedAtBottom checks that a box labelled
// "footer" at the very bottom of the page is treated as page decoration and
// removed.
//
// With a 300 px page image at scale 3.0, the box spans 95..100, past 90% of
// the page.
func TestAnnotateBoxLayouts_FooterRemovedAtBottom(t *testing.T) {
	// Region Y 285..300 is 95..100 after dividing by the scale.
	footer := pdf.DLARegion{X0: 0, Y0: 285, X1: 300, Y1: 300, Label: "footer", Confidence: 0.9}
	boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 95, Bottom: 100}}

	boxes = AnnotateBoxLayouts(boxes, []pdf.DLARegion{footer}, 3.0, 300)

	if n := len(boxes); n != 0 {
		t.Errorf("footer at very bottom: should be popped as decoration, got %d boxes left", n)
	}
}
// TestAnnotateBoxLayouts_FooterKeptInMiddle checks that a "footer" label in
// the middle of the page is treated as a DLA false positive: the box is kept
// and carries the "footer" LayoutType.
//
// With a 300 px page image at scale 3.0, the box spans 50..70, well short of
// the bottom 10% of the page.
func TestAnnotateBoxLayouts_FooterKeptInMiddle(t *testing.T) {
	boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 50, Bottom: 70}}
	regions := []pdf.DLARegion{
		{X0: 0, Y0: 150, X1: 300, Y1: 210, Label: "footer", Confidence: 0.9}, // y=50-70 after /3
	}
	boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 300)
	// A dropped box is exactly the regression this test guards against, so
	// report it as a failure rather than panicking on boxes[0].
	if len(boxes) == 0 {
		t.Fatal("footer in middle of page: box was dropped, want it kept as 'footer'")
	}
	if boxes[0].LayoutType != "footer" {
		t.Errorf("footer in middle of page: DLA false positive, keep text, got %q", boxes[0].LayoutType)
	}
}
// TestAnnotateBoxLayouts_ReferenceAlwaysGarbage checks that a box matched to a
// "reference" region is removed wherever it sits on the page; here it is in
// the middle (50..70 on a page of height 100).
func TestAnnotateBoxLayouts_ReferenceAlwaysGarbage(t *testing.T) {
	reference := pdf.DLARegion{X0: 0, Y0: 150, X1: 300, Y1: 210, Label: "reference", Confidence: 0.9}
	boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 50, Bottom: 70}}

	boxes = AnnotateBoxLayouts(boxes, []pdf.DLARegion{reference}, 3.0, 300)

	// Indexing boxes[0] is safe here because the branch only runs when a box is left.
	if len(boxes) > 0 {
		t.Errorf("reference: should always be garbage-filtered, got %q", boxes[0].LayoutType)
	}
}
// TestAnnotateBoxLayouts_NonGarbageTypeUnaffected checks that "text", which is
// not a garbage type, is assigned to the box even when a page height is
// supplied and garbage filtering is therefore active.
func TestAnnotateBoxLayouts_NonGarbageTypeUnaffected(t *testing.T) {
	textRegion := pdf.DLARegion{X0: 0, Y0: 600, X1: 300, Y1: 660, Label: "text"}
	boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 200, Bottom: 220}}

	boxes = AnnotateBoxLayouts(boxes, []pdf.DLARegion{textRegion}, 3.0, 300)

	if got := boxes[0].LayoutType; got != "text" {
		t.Errorf("non-garbage type: should be assigned, got %q", got)
	}
}
// TestAnnotateBoxLayouts_ZeroPageHeightDisablesGarbage checks that passing a
// page image height of 0 turns the garbage check off, so even a "header"
// label is assigned to the box and the box is kept.
func TestAnnotateBoxLayouts_ZeroPageHeightDisablesGarbage(t *testing.T) {
	boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 100, Bottom: 120}}
	regions := []pdf.DLARegion{
		{X0: 0, Y0: 300, X1: 300, Y1: 360, Label: "header", Confidence: 0.9},
	}
	boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 0)
	// If the garbage check ran anyway the box could be dropped; fail cleanly
	// instead of panicking on boxes[0].
	if len(boxes) == 0 {
		t.Fatal("zero page height: box was dropped, want it kept as 'header'")
	}
	if boxes[0].LayoutType != "header" {
		t.Errorf("zero page height: garbage check disabled, got %q", boxes[0].LayoutType)
	}
}
// TestAnnotateBoxLayouts_SyntheticFigure checks that figure regions with no
// overlapping text box produce synthetic, text-less figure boxes in the
// result (Python: dla_cli.py:187-195).
func TestAnnotateBoxLayouts_SyntheticFigure(t *testing.T) {
	regions := []pdf.DLARegion{
		// Overlaps the text box, so no synthetic box is expected for it.
		{X0: 0, Y0: 0, X1: 150, Y1: 60, Label: "text", Confidence: 0.9},
		// Neither figure region overlaps any box; each should yield a synthetic box.
		{X0: 300, Y0: 300, X1: 600, Y1: 600, Label: "figure", Confidence: 0.9},
		{X0: 600, Y0: 0, X1: 900, Y1: 300, Label: "figure", Confidence: 0.9},
	}
	boxes := AnnotateBoxLayouts([]pdf.TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 20, Text: "text box"},
	}, regions, 3.0, 0)

	// One original box plus two synthetic figure boxes.
	if n := len(boxes); n != 3 {
		t.Fatalf("expected 3 boxes (1 original + 2 synthetic figures), got %d", n)
	}

	// Look for the two synthetic boxes by their LayoutNo.
	var sawFig0, sawFig1 bool
	for _, b := range boxes {
		if b.LayoutType != "figure" || b.Text != "" {
			continue
		}
		switch b.LayoutNo {
		case "figure-0":
			sawFig0 = true
			// The region's X range 300..600 becomes 100..200 after dividing by 3.
			if b.X0 != 100 || b.X1 != 200 {
				t.Errorf("synthetic figure-0: expected x0=100,x1=200 (300/3,600/3), got x0=%v,x1=%v", b.X0, b.X1)
			}
		case "figure-1":
			sawFig1 = true
		}
	}
	if !sawFig0 {
		t.Error("missing synthetic figure-0 box")
	}
	if !sawFig1 {
		t.Error("missing synthetic figure-1 box")
	}
}
// TestAnnotateBoxLayouts_EquationMappedToFigure checks that a box matched to
// an "equation" region gets LayoutType "figure" while its LayoutNo still uses
// the "equation" prefix, as in the Python implementation.
func TestAnnotateBoxLayouts_EquationMappedToFigure(t *testing.T) {
	equation := pdf.DLARegion{X0: 0, Y0: 0, X1: 300, Y1: 60, Label: "equation", Confidence: 0.9}
	boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 20}}

	boxes = AnnotateBoxLayouts(boxes, []pdf.DLARegion{equation}, 3.0, 0)

	if n := len(boxes); n != 1 {
		t.Fatalf("expected 1 box, got %d", n)
	}
	only := boxes[0]
	if only.LayoutType != "figure" {
		t.Errorf("equation → LayoutType: got %q, want 'figure'", only.LayoutType)
	}
	if only.LayoutNo != "equation-0" {
		t.Errorf("equation → LayoutNo: got %q, want 'equation-0'", only.LayoutNo)
	}
}
// TestAnnotateBoxLayouts_MixedTypesLayoutNo checks that LayoutNo indices are
// counted separately per region type when text and figure regions are mixed.
func TestAnnotateBoxLayouts_MixedTypesLayoutNo(t *testing.T) {
	regions := []pdf.DLARegion{
		{X0: 0, Y0: 0, X1: 150, Y1: 60, Label: "text", Confidence: 0.9},     // first text region
		{X0: 0, Y0: 600, X1: 150, Y1: 660, Label: "text", Confidence: 0.9},  // second text region
		{X0: 600, Y0: 0, X1: 900, Y1: 60, Label: "figure", Confidence: 0.9}, // first figure region, X 200..300 after /3
	}
	boxes := AnnotateBoxLayouts([]pdf.TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 20},     // inside the first text region
		{X0: 0, X1: 100, Top: 200, Bottom: 220},  // inside the second text region
		{X0: 200, X1: 300, Top: 0, Bottom: 20},   // inside the figure region only
	}, regions, 3.0, 0)

	if n := len(boxes); n != 3 {
		t.Fatalf("expected 3 boxes, got %d", n)
	}

	// The figure counter must start at 0 regardless of how many text regions came first.
	if got := boxes[0].LayoutNo; got != "text-0" {
		t.Errorf("box 0: got %q, want 'text-0'", got)
	}
	if got := boxes[1].LayoutNo; got != "text-1" {
		t.Errorf("box 1: got %q, want 'text-1'", got)
	}
	if got := boxes[2].LayoutNo; got != "figure-0" {
		t.Errorf("box 2: got %q, want 'figure-0' (independent from text counter)", got)
	}
}
// NOTE(review): the paragraph below documents
// TestAnnotateBoxLayouts_CompactionPreservesWriteBackMapping, which is not
// defined at this point in the file. It is kept here, detached from any
// declaration, as a record of the bug that test was written for.
//
// That test verifies that when annotateBoxLayouts drops some boxes (CID
// garbage or garbage-layout at non-edge positions), the compaction step does
// not corrupt the caller's ability to write annotations back to the correct
// global box indices.
//
// The bug: annotateBoxLayouts compacts boxes in place in the shared backing
// array, shifting survivors forward. enrichWithDeepDoc then iterates
// len(indices) positions and writes pageBoxes[i] back to boxes[indices[i]],
// but after compaction pageBoxes[1] holds what was originally pageBoxes[2],
// so annotations land on the wrong global box.

// TestMatchTableRegions_SingleMatch checks that MatchTableRegions returns one
// match for the single "table" region and that the match lists only box 0.
// The "text" region is not a table and must be ignored; box 1 (X 200..300)
// lies outside the table region, which spans X 0..100 after dividing by the
// scale.
func TestMatchTableRegions_SingleMatch(t *testing.T) {
	boxes := []pdf.TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 50},
		{X0: 200, X1: 300, Top: 0, Bottom: 50},
	}
	regions := []pdf.DLARegion{
		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "table"},  // covers box 0 at scale 3
		{X0: 600, Y0: 0, X1: 900, Y1: 150, Label: "text"}, // non-table, ignored
	}
	matches := MatchTableRegions(boxes, regions, 3.0)
	if len(matches) != 1 {
		t.Fatalf("expected 1 match, got %d", len(matches))
	}
	if len(matches[0].BoxIdx) != 1 || matches[0].BoxIdx[0] != 0 {
		t.Errorf("expected box 0 matched, got %v", matches[0].BoxIdx)
	}
}
// TestMatchTableRegions_NoTableLabel checks that regions labelled anything
// other than "table" produce no matches, even when they fully cover a box.
func TestMatchTableRegions_NoTableLabel(t *testing.T) {
	regions := []pdf.DLARegion{
		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "text"},
		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "figure"},
	}
	boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 50}}

	if n := len(MatchTableRegions(boxes, regions, 3.0)); n != 0 {
		t.Errorf("non-table labels: expected 0 matches, got %d", n)
	}
}
// TestMatchTableRegions_MultipleBoxesSameTable checks that two boxes lying in
// the same table region are reported together in a single match.
func TestMatchTableRegions_MultipleBoxesSameTable(t *testing.T) {
	// X 0..630 is 0..210 after dividing by 3, wide enough for both boxes.
	table := pdf.DLARegion{X0: 0, Y0: 0, X1: 630, Y1: 150, Label: "table"}
	boxes := []pdf.TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 50},   // first box
		{X0: 110, X1: 210, Top: 0, Bottom: 50}, // second box
	}

	matches := MatchTableRegions(boxes, []pdf.DLARegion{table}, 3.0)
	if n := len(matches); n != 1 {
		t.Fatalf("expected 1 match, got %d", n)
	}
	if idx := matches[0].BoxIdx; len(idx) != 2 {
		t.Errorf("expected 2 boxes matched, got %d: %v", len(idx), idx)
	}
}
// TestMatchTableRegions_ImageOnlyPDF covers an image-only PDF, which has no
// text boxes at all. Every table region must still be returned as a match
// (the Python implementation processes table regions regardless of box
// overlap), with an empty BoxIdx.
func TestMatchTableRegions_ImageOnlyPDF(t *testing.T) {
	regions := []pdf.DLARegion{
		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "table"},
		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "text"},
	}
	var noBoxes []pdf.TextBox // deliberately nil

	matches := MatchTableRegions(noBoxes, regions, 3.0)
	if n := len(matches); n != 1 {
		t.Fatalf("image-only: expected 1 table match, got %d", n)
	}
	if n := len(matches[0].BoxIdx); n != 0 {
		t.Errorf("image-only: expected empty BoxIdx, got %d", n)
	}
}
// TestMatchTableRegions_BelowThreshold checks that a table region covering
// less than 40% of a box does not match it.
func TestMatchTableRegions_BelowThreshold(t *testing.T) {
	// 90x90 is 30x30 after dividing by 3: 9% of the 100x100 box.
	sliver := pdf.DLARegion{X0: 0, Y0: 0, X1: 90, Y1: 90, Label: "table"}
	boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 100}}

	if n := len(MatchTableRegions(boxes, []pdf.DLARegion{sliver}, 3.0)); n != 0 {
		t.Errorf("below threshold: expected 0 matches, got %d", n)
	}
}
// MockTableBuilder is a test double for the table builder passed to
// WriteTableAnnotations. Only cell grouping is configurable; detection is a
// no-op.
type MockTableBuilder struct {
	// GroupCellsFn, when set, supplies the result of GroupCells.
	GroupCellsFn func(cells []pdf.TSRCell) [][]pdf.TSRCell
}

// Name returns the fixed identifier "mock".
func (m *MockTableBuilder) Name() string {
	return "mock"
}

// DetectCells ignores its arguments and reports no cells and no error.
func (m *MockTableBuilder) DetectCells(context.Context, image.Image) ([]pdf.TSRCell, error) {
	return nil, nil
}

// GroupCells delegates to GroupCellsFn, or returns nil when no function is
// configured.
func (m *MockTableBuilder) GroupCells(cells []pdf.TSRCell) [][]pdf.TSRCell {
	if m.GroupCellsFn == nil {
		return nil
	}
	return m.GroupCellsFn(cells)
}
// ── writeTableAnnotations unit tests ──────────────────────────────────
// TestWriteTableAnnotations_WriteBack checks that row/column annotations are
// written to the boxes named in the index list (0 and 2) and that box 1,
// which is not listed, stays at its zero values.
//
// NOTE(review): the expectations for box 0 and box 1 are all zero values, so
// only the box 2 check can tell a written annotation from an untouched box.
func TestWriteTableAnnotations_WriteBack(t *testing.T) {
	const scale = 3.0

	boxes := []pdf.TextBox{
		{X0: 10, X1: 100, Top: 10, Bottom: 30, Text: "A", LayoutType: "table"},
		{X0: 110, X1: 200, Top: 10, Bottom: 30, Text: "B", LayoutType: "table"},
		{X0: 10, X1: 100, Top: 35, Bottom: 55, Text: "C", LayoutType: "table"},
	}
	boxIdx := []int{0, 2}
	rowCells := []pdf.TSRCell{
		{X0: 30, Y0: 30, X1: 300, Y1: 90, Label: "table row"},
		{X0: 30, Y0: 110, X1: 300, Y1: 170, Label: "table row"},
	}
	// Put each detected cell in a group of its own.
	builder := &MockTableBuilder{
		GroupCellsFn: func(in []pdf.TSRCell) [][]pdf.TSRCell {
			return [][]pdf.TSRCell{{in[0]}, {in[1]}}
		},
	}

	WriteTableAnnotations(boxes, boxIdx, rowCells, scale, 0, 0, builder)

	if r := boxes[0].R; r != 0 {
		t.Errorf("box 0 R = %d, want 0", r)
	}
	if c := boxes[0].C; c != 0 {
		t.Errorf("box 0 C = %d, want 0", c)
	}
	// Box 1 is absent from the index list and must stay unannotated.
	if skipped := boxes[1]; skipped.R != 0 || skipped.C != 0 {
		t.Errorf("box 1 should not be annotated: R=%d C=%d", skipped.R, skipped.C)
	}
	if r := boxes[2].R; r != 1 {
		t.Errorf("box 2 R = %d, want 1", r)
	}
}
// TestWriteTableAnnotations_ScaleDown checks that annotating a box with a
// single row cell leaves a non-zero RTop on the box.
//
// NOTE(review): the original comment expects RTop to be about 10 (the cell's
// Y0 of 30 divided by the scale), but the assertion only requires non-zero.
func TestWriteTableAnnotations_ScaleDown(t *testing.T) {
	const scale = 3.0

	boxes := []pdf.TextBox{
		{X0: 10, X1: 100, Top: 10, Bottom: 50, Text: "X", LayoutType: "table"},
	}
	rowCells := []pdf.TSRCell{
		{X0: 30, Y0: 30, X1: 300, Y1: 150, Label: "table row"},
	}
	builder := &MockTableBuilder{
		GroupCellsFn: func(in []pdf.TSRCell) [][]pdf.TSRCell {
			return [][]pdf.TSRCell{{in[0]}}
		},
	}

	WriteTableAnnotations(boxes, []int{0}, rowCells, scale, 0, 0, builder)

	if boxes[0].RTop == 0 {
		t.Error("RTop should be non-zero after annotation")
	}
}
// TestWriteTableAnnotations_EmptyCells checks that calling
// WriteTableAnnotations with no cells neither panics nor changes the box's
// row and column fields.
func TestWriteTableAnnotations_EmptyCells(t *testing.T) {
	var noCells []pdf.TSRCell
	boxes := []pdf.TextBox{{X0: 10, X1: 100, Top: 10, Bottom: 50, Text: "X", LayoutType: "table"}}
	builder := &MockTableBuilder{
		GroupCellsFn: func([]pdf.TSRCell) [][]pdf.TSRCell { return nil },
	}

	// The call itself is part of the test: it must complete without panicking.
	WriteTableAnnotations(boxes, []int{0}, noCells, 3.0, 0, 0, builder)

	if b := boxes[0]; b.R != 0 || b.C != 0 {
		t.Errorf("empty cells: R=%d C=%d, want 0,0", b.R, b.C)
	}
}
// ── markNoMergeTables unit tests ─────────────────────────────────────
// TestMarkNoMergeTables_CaptionAfterTable checks that a table whose box is
// immediately followed by a "table caption" box is flagged NoMerge.
func TestMarkNoMergeTables_CaptionAfterTable(t *testing.T) {
	tables := []pdf.TableItem{
		{Positions: []pdf.Position{{Left: 0, Right: 100, Top: 0, Bottom: 30}}},
	}
	boxes := []pdf.TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 30, LayoutType: "table"},
		{X0: 0, X1: 100, Top: 35, Bottom: 50, LayoutType: "table caption", Text: "表1标题"},
	}

	MarkNoMergeTables(boxes, tables)

	if marked := tables[0].NoMerge; !marked {
		t.Error("table followed by caption should be marked NoMerge")
	}
}
// TestMarkNoMergeTables_TitleAfterTable checks that a table whose box is
// immediately followed by a "title" box is flagged NoMerge.
func TestMarkNoMergeTables_TitleAfterTable(t *testing.T) {
	tables := []pdf.TableItem{
		{Positions: []pdf.Position{{Left: 0, Right: 100, Top: 0, Bottom: 30}}},
	}
	boxes := []pdf.TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 30, LayoutType: "table"},
		{X0: 0, X1: 100, Top: 35, Bottom: 50, LayoutType: "title"},
	}

	MarkNoMergeTables(boxes, tables)

	if marked := tables[0].NoMerge; !marked {
		t.Error("table followed by title should be marked NoMerge")
	}
}
// TestMarkNoMergeTables_NoCaptionAfter checks that neither table is flagged
// NoMerge when the first is followed by plain text and the second is the
// last box on the page.
func TestMarkNoMergeTables_NoCaptionAfter(t *testing.T) {
	tables := []pdf.TableItem{
		{Positions: []pdf.Position{{Left: 0, Right: 100, Top: 0, Bottom: 30}}},
		{Positions: []pdf.Position{{Left: 0, Right: 100, Top: 55, Bottom: 70}}},
	}
	boxes := []pdf.TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 30, LayoutType: "table"},
		{X0: 0, X1: 100, Top: 35, Bottom: 50, LayoutType: "text"},
		{X0: 0, X1: 100, Top: 55, Bottom: 70, LayoutType: "table"},
	}

	MarkNoMergeTables(boxes, tables)

	first, last := tables[0], tables[1]
	if first.NoMerge {
		t.Error("table followed by text should NOT be marked NoMerge")
	}
	if last.NoMerge {
		t.Error("last table should NOT be marked NoMerge")
	}
}
// TestMarkNoMergeTables_StaleLastTableTI guards against a stale "last table"
// index. A table box that overlaps no TableItem position must clear the
// remembered table, so that a following title does not flag an earlier,
// non-adjacent table as NoMerge.
//
// Box sequence:
//   - box 0: "table", overlaps tables[0]
//   - box 1: "table", overlaps nothing, so the remembered table should be cleared
//   - box 2: "title", which should then have no adjacent table to flag
func TestMarkNoMergeTables_StaleLastTableTI(t *testing.T) {
	tables := []pdf.TableItem{
		{Positions: []pdf.Position{{Left: 0, Right: 100, Top: 0, Bottom: 30}}},
		// Second table: box 0 does not overlap this one either.
		{Positions: []pdf.Position{{Left: 0, Right: 100, Top: 35, Bottom: 65}}},
	}
	boxes := []pdf.TextBox{
		{X0: 0, X1: 100, Top: 0, Bottom: 30, LayoutType: "table"},
		{X0: 500, X1: 600, Top: 100, Bottom: 130, LayoutType: "table"}, // far from both tables
		{X0: 0, X1: 100, Top: 140, Bottom: 160, LayoutType: "title"},
	}

	MarkNoMergeTables(boxes, tables)

	// The title comes after the non-matching table box, not directly after
	// tables[0], so tables[0] must be left unflagged.
	if flagged := tables[0].NoMerge; flagged {
		t.Error("stale lastTableTI: table[0] incorrectly marked NoMerge — " +
			"the non-overlapping table box (box 1) should have reset lastTableTI")
	}
}
// TestMarkNoMergeTables_EmptyInputs exercises MarkNoMergeTables with nil and
// with allocated-but-empty slices. There is nothing to assert: the test
// passes as long as neither call panics.
func TestMarkNoMergeTables_EmptyInputs(t *testing.T) {
	// nil inputs
	MarkNoMergeTables(nil, nil)

	// non-nil, zero-length inputs
	emptyBoxes, emptyTables := []pdf.TextBox{}, []pdf.TableItem{}
	MarkNoMergeTables(emptyBoxes, emptyTables)
}