mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-01 16:25:44 +08:00
### What problem does this PR solve? Package refactor and PDF post process. ### Type of change - [x] Refactoring --------- Co-authored-by: Claude <noreply@anthropic.com>
605 lines
22 KiB
Go
605 lines
22 KiB
Go
package table
|
||
|
||
import (
|
||
"context"
|
||
"image"
|
||
"testing"
|
||
|
||
pdf "ragflow/internal/deepdoc/parser/pdf/type"
|
||
)
|
||
|
||
func TestAnnotateBoxLayouts_SetsLabel(t *testing.T) {
|
||
boxes := []pdf.TextBox{
|
||
{X0: 0, X1: 100, Top: 0, Bottom: 20},
|
||
{X0: 0, X1: 100, Top: 30, Bottom: 50},
|
||
}
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 0, X1: 300, Y1: 60, Label: "title"}, // covers box 0 at scale 3
|
||
{X0: 0, Y0: 90, X1: 300, Y1: 150, Label: "text"}, // covers box 1 at scale 3
|
||
}
|
||
boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 0)
|
||
if boxes[0].LayoutType != "title" {
|
||
t.Errorf("box 0: got %q, want 'title'", boxes[0].LayoutType)
|
||
}
|
||
if boxes[1].LayoutType != "text" {
|
||
t.Errorf("box 1: got %q, want 'text'", boxes[1].LayoutType)
|
||
}
|
||
}
|
||
|
||
func TestAnnotateBoxLayouts_NoMatch(t *testing.T) {
|
||
// Region far away from the box — no overlap
|
||
boxes := []pdf.TextBox{
|
||
{X0: 0, X1: 100, Top: 0, Bottom: 20},
|
||
}
|
||
regions := []pdf.DLARegion{
|
||
{X0: 900, Y0: 900, X1: 1000, Y1: 1000, Label: "far"}, // completely outside
|
||
}
|
||
boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 0)
|
||
if boxes[0].LayoutType != "" {
|
||
t.Errorf("no match: expected empty, got %q", boxes[0].LayoutType)
|
||
}
|
||
}
|
||
|
||
func TestAnnotateBoxLayouts_EmptyRegions(t *testing.T) {
|
||
boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 20}}
|
||
boxes = AnnotateBoxLayouts(boxes, nil, 3.0, 0)
|
||
boxes = AnnotateBoxLayouts(boxes, []pdf.DLARegion{}, 3.0, 0)
|
||
if boxes[0].LayoutType != "" {
|
||
t.Errorf("empty regions: got %q, want empty", boxes[0].LayoutType)
|
||
}
|
||
}
|
||
|
||
func TestAnnotateBoxLayouts_PriorityOverMaxArea(t *testing.T) {
|
||
// "table" type checked before "text" in priority order.
|
||
// Even if "text" region has larger overlap, "table" wins if it meets threshold (≥40%).
|
||
boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 50}}
|
||
regions := []pdf.DLARegion{
|
||
// text region: full coverage (100% overlap) — but lower priority
|
||
{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "text"},
|
||
// table region: 45% overlap (45x50 out of 100x50) — higher priority, meets threshold
|
||
{X0: 0, Y0: 0, X1: 45 * 3, Y1: 50 * 3, Label: "table"},
|
||
}
|
||
boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 0)
|
||
if boxes[0].LayoutType != "table" {
|
||
t.Errorf("priority: 'table' should win over 'text' when both meet threshold, got %q", boxes[0].LayoutType)
|
||
}
|
||
}
|
||
|
||
func TestAnnotateBoxLayouts_OverlapThreshold(t *testing.T) {
|
||
// Region overlaps only 30% of box — below 0.4 threshold — should NOT match.
|
||
boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 50}}
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 0, X1: 30 * 3, Y1: 30 * 3, Label: "table"}, // covers ~30% of box
|
||
}
|
||
boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 0)
|
||
if boxes[0].LayoutType != "" {
|
||
t.Errorf("threshold: overlap < 40%% should not match, got %q", boxes[0].LayoutType)
|
||
}
|
||
}
|
||
|
||
func TestAnnotateBoxLayouts_CIDGarbage(t *testing.T) {
|
||
// CID-pattern boxes should be popped entirely (Python: bxs.pop(i)).
|
||
boxes := []pdf.TextBox{
|
||
{X0: 0, X1: 100, Top: 0, Bottom: 20, Text: "(cid:123)"},
|
||
{X0: 0, X1: 100, Top: 30, Bottom: 50, Text: "normal text"},
|
||
}
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 0, X1: 300, Y1: 60, Label: "text", Confidence: 0.9},
|
||
{X0: 0, Y0: 90, X1: 300, Y1: 150, Label: "text", Confidence: 0.9},
|
||
}
|
||
boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 0)
|
||
// CID-garbled box was popped → only 1 box remains.
|
||
if len(boxes) != 1 {
|
||
t.Fatalf("CID-garbled box should be popped, got %d boxes", len(boxes))
|
||
}
|
||
if boxes[0].LayoutType != "text" {
|
||
t.Errorf("CID: remaining box should be 'text', got %q", boxes[0].LayoutType)
|
||
}
|
||
}
|
||
|
||
func TestAnnotateBoxLayouts_LayoutNoFormat(t *testing.T) {
|
||
// layoutno uses Python format: "{type}-{per_type_index}" where per_type_index
|
||
// is the index of the matched DLA region within its type (not global).
|
||
// Two boxes overlapping the SAME text region share the same layoutno → VM can merge them.
|
||
boxes := []pdf.TextBox{
|
||
{X0: 0, X1: 100, Top: 0, Bottom: 20},
|
||
{X0: 0, X1: 100, Top: 30, Bottom: 50},
|
||
}
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "text"}, // covers both boxes
|
||
}
|
||
boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 0)
|
||
want := "text-0"
|
||
if boxes[0].LayoutNo != want {
|
||
t.Errorf("box 0 layoutno: got %q, want %q", boxes[0].LayoutNo, want)
|
||
}
|
||
if boxes[1].LayoutNo != want {
|
||
t.Errorf("box 1 layoutno should share same per-type index: got %q, want %q", boxes[1].LayoutNo, want)
|
||
}
|
||
}
|
||
|
||
func TestAnnotateBoxLayouts_LayoutNoDifferentRegions(t *testing.T) {
|
||
// Two boxes in different text regions → different layoutno.
|
||
boxes := []pdf.TextBox{
|
||
{X0: 0, X1: 100, Top: 0, Bottom: 20},
|
||
{X0: 0, X1: 100, Top: 100, Bottom: 120},
|
||
}
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 0, X1: 300, Y1: 60, Label: "text"}, // per-type index 0
|
||
{X0: 0, Y0: 300, X1: 300, Y1: 360, Label: "text"}, // per-type index 1
|
||
}
|
||
boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 0)
|
||
if boxes[0].LayoutNo != "text-0" {
|
||
t.Errorf("box 0: got %q, want 'text-0'", boxes[0].LayoutNo)
|
||
}
|
||
if boxes[1].LayoutNo != "text-1" {
|
||
t.Errorf("box 1: got %q, want 'text-1'", boxes[1].LayoutNo)
|
||
}
|
||
}
|
||
|
||
// TestAnnotateBoxLayouts_ConfidenceFilter verifies that DLA regions with
|
||
// low confidence (< 0.4) for garbage layout types are excluded from matching.
|
||
// Python: float(b["score"]) >= 0.4 filter in LayoutRecognizer.
|
||
func TestAnnotateBoxLayouts_ConfidenceFilter(t *testing.T) {
|
||
boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 50}}
|
||
// Low-confidence footer — should be filtered out.
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "footer", Confidence: 0.2},
|
||
{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "text", Confidence: 0.9},
|
||
}
|
||
boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 0)
|
||
// Footer region filtered (low confidence) → box matches "text" instead.
|
||
if boxes[0].LayoutType != "text" {
|
||
t.Errorf("low-confidence footer filtered → box should get 'text', got %q", boxes[0].LayoutType)
|
||
}
|
||
}
|
||
|
||
func TestAnnotateBoxLayouts_GarbageFooterRejected(t *testing.T) {
|
||
// Footer at page bottom: Bottom(290) > 270 (90% of 300px→PDF height 100→90% of 100=90)
|
||
// → real footer decoration → garbage → pop (Python: bxs.pop(i)).
|
||
boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 280, Bottom: 290}}
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 840, X1: 300, Y1: 870, Label: "footer", Confidence: 0.9}, // y=280-290 after /3, PDF 93-97
|
||
}
|
||
boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 300) // PDF height = 300/3 = 100
|
||
if len(boxes) != 0 {
|
||
t.Errorf("footer at bottom: should be popped as decoration, got %d boxes left", len(boxes))
|
||
}
|
||
}
|
||
|
||
func TestAnnotateBoxLayouts_HeaderRemovedAtTop(t *testing.T) {
|
||
// Header at page top edge (y=5 in 300px page → PDF height 100 → 5 < 10% of 100)
|
||
// → real header decoration → garbage → pop (Python: bxs.pop(i)).
|
||
boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 5, Bottom: 20}}
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 15, X1: 300, Y1: 60, Label: "header", Confidence: 0.9}, // y=5-20 after /3
|
||
}
|
||
boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 300)
|
||
if len(boxes) != 0 {
|
||
t.Errorf("header at very top: should be popped as decoration, got %d boxes left", len(boxes))
|
||
}
|
||
}
|
||
|
||
func TestAnnotateBoxLayouts_HeaderKeptInMiddle(t *testing.T) {
|
||
// Header in middle of page (y=50 in 300px page → PDF height 100 → 50 > 10)
|
||
// → DLA false positive → KEEP the text.
|
||
boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 50, Bottom: 70}}
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 150, X1: 300, Y1: 210, Label: "header", Confidence: 0.9}, // y=50-70 after /3
|
||
}
|
||
boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 300)
|
||
if boxes[0].LayoutType != "header" {
|
||
t.Errorf("header in middle of page: DLA false positive, keep text, got %q", boxes[0].LayoutType)
|
||
}
|
||
}
|
||
|
||
func TestAnnotateBoxLayouts_FooterRemovedAtBottom(t *testing.T) {
|
||
// Footer at page bottom (y=95 in 300px page → PDF height 100 → 95 > 90% of 100)
|
||
// → real footer decoration → garbage → REMOVE.
|
||
boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 95, Bottom: 100}}
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 285, X1: 300, Y1: 300, Label: "footer", Confidence: 0.9}, // y=95-100 after /3
|
||
}
|
||
boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 300)
|
||
if len(boxes) != 0 {
|
||
t.Errorf("footer at very bottom: should be popped as decoration, got %d boxes left", len(boxes))
|
||
}
|
||
}
|
||
|
||
func TestAnnotateBoxLayouts_FooterKeptInMiddle(t *testing.T) {
|
||
// Footer in middle of page (y=50 in 300px page → PDF height 100 → 50 < 90)
|
||
// → DLA false positive → KEEP the text.
|
||
boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 50, Bottom: 70}}
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 150, X1: 300, Y1: 210, Label: "footer", Confidence: 0.9}, // y=50-70 after /3
|
||
}
|
||
boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 300)
|
||
if boxes[0].LayoutType != "footer" {
|
||
t.Errorf("footer in middle of page: DLA false positive, keep text, got %q", boxes[0].LayoutType)
|
||
}
|
||
}
|
||
|
||
func TestAnnotateBoxLayouts_ReferenceAlwaysGarbage(t *testing.T) {
|
||
// Reference type is always garbage regardless of position (no keep_feat).
|
||
boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 50, Bottom: 70}}
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 150, X1: 300, Y1: 210, Label: "reference", Confidence: 0.9},
|
||
}
|
||
boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 300)
|
||
if len(boxes) != 0 {
|
||
t.Errorf("reference: should always be garbage-filtered, got %q", boxes[0].LayoutType)
|
||
}
|
||
}
|
||
|
||
func TestAnnotateBoxLayouts_NonGarbageTypeUnaffected(t *testing.T) {
|
||
// "text" type is NOT a garbage type — should always be assigned.
|
||
boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 200, Bottom: 220}}
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 600, X1: 300, Y1: 660, Label: "text"},
|
||
}
|
||
boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 300)
|
||
if boxes[0].LayoutType != "text" {
|
||
t.Errorf("non-garbage type: should be assigned, got %q", boxes[0].LayoutType)
|
||
}
|
||
}
|
||
|
||
func TestAnnotateBoxLayouts_ZeroPageHeightDisablesGarbage(t *testing.T) {
|
||
// pageImgHeight=0 → garbage check disabled → all types assigned.
|
||
boxes := []pdf.TextBox{{X0: 0, X1: 100, Top: 100, Bottom: 120}}
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 300, X1: 300, Y1: 360, Label: "header", Confidence: 0.9},
|
||
}
|
||
boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 0)
|
||
if boxes[0].LayoutType != "header" {
|
||
t.Errorf("zero page height: garbage check disabled, got %q", boxes[0].LayoutType)
|
||
}
|
||
}
|
||
|
||
// TestAnnotateBoxLayouts_SyntheticFigure creates synthetic figure boxes for
|
||
// unmatched figure/equation DLA regions (Python: dla_cli.py:187-195).
|
||
func TestAnnotateBoxLayouts_SyntheticFigure(t *testing.T) {
|
||
boxes := []pdf.TextBox{
|
||
{X0: 0, X1: 100, Top: 0, Bottom: 20, Text: "text box"},
|
||
}
|
||
// Two figure regions, one text region
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 0, X1: 150, Y1: 60, Label: "text", Confidence: 0.9}, // matches text box → visited
|
||
{X0: 300, Y0: 300, X1: 600, Y1: 600, Label: "figure", Confidence: 0.9}, // no box overlaps → synthetic
|
||
{X0: 600, Y0: 0, X1: 900, Y1: 300, Label: "figure", Confidence: 0.9}, // no box overlaps → synthetic
|
||
}
|
||
boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 0)
|
||
// Original text box + 2 synthetic figure boxes = 3
|
||
if len(boxes) != 3 {
|
||
t.Fatalf("expected 3 boxes (1 original + 2 synthetic figures), got %d", len(boxes))
|
||
}
|
||
// Check synthetic boxes
|
||
foundFig0, foundFig1 := false, false
|
||
for _, b := range boxes {
|
||
if b.LayoutType == "figure" && b.Text == "" {
|
||
if b.LayoutNo == "figure-0" {
|
||
foundFig0 = true
|
||
if b.X0 != 100 || b.X1 != 200 {
|
||
t.Errorf("synthetic figure-0: expected x0=100,x1=200 (300/3,600/3), got x0=%v,x1=%v", b.X0, b.X1)
|
||
}
|
||
}
|
||
if b.LayoutNo == "figure-1" {
|
||
foundFig1 = true
|
||
}
|
||
}
|
||
}
|
||
if !foundFig0 {
|
||
t.Error("missing synthetic figure-0 box")
|
||
}
|
||
if !foundFig1 {
|
||
t.Error("missing synthetic figure-1 box")
|
||
}
|
||
}
|
||
|
||
// TestAnnotateBoxLayouts_EquationMappedToFigure verifies equation DLA regions
|
||
// get LayoutType="figure" but LayoutNo keeps "equation" prefix (Python behavior).
|
||
func TestAnnotateBoxLayouts_EquationMappedToFigure(t *testing.T) {
|
||
boxes := []pdf.TextBox{
|
||
{X0: 0, X1: 100, Top: 0, Bottom: 20},
|
||
}
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 0, X1: 300, Y1: 60, Label: "equation", Confidence: 0.9},
|
||
}
|
||
boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 0)
|
||
if len(boxes) != 1 {
|
||
t.Fatalf("expected 1 box, got %d", len(boxes))
|
||
}
|
||
if boxes[0].LayoutType != "figure" {
|
||
t.Errorf("equation → LayoutType: got %q, want 'figure'", boxes[0].LayoutType)
|
||
}
|
||
if boxes[0].LayoutNo != "equation-0" {
|
||
t.Errorf("equation → LayoutNo: got %q, want 'equation-0'", boxes[0].LayoutNo)
|
||
}
|
||
}
|
||
|
||
// TestAnnotateBoxLayouts_MixedTypesLayoutNo verifies per-type LayoutNo counting
|
||
// with multiple region types present.
|
||
func TestAnnotateBoxLayouts_MixedTypesLayoutNo(t *testing.T) {
|
||
boxes := []pdf.TextBox{
|
||
{X0: 0, X1: 100, Top: 0, Bottom: 20}, // overlaps text region 0
|
||
{X0: 0, X1: 100, Top: 200, Bottom: 220}, // overlaps text region 1
|
||
{X0: 200, X1: 300, Top: 0, Bottom: 20}, // overlaps figure region 0 only
|
||
}
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 0, X1: 150, Y1: 60, Label: "text", Confidence: 0.9}, // text-0
|
||
{X0: 0, Y0: 600, X1: 150, Y1: 660, Label: "text", Confidence: 0.9}, // text-1
|
||
{X0: 600, Y0: 0, X1: 900, Y1: 60, Label: "figure", Confidence: 0.9}, // figure-0 (PDF: x0=200, x1=300)
|
||
}
|
||
boxes = AnnotateBoxLayouts(boxes, regions, 3.0, 0)
|
||
if len(boxes) != 3 {
|
||
t.Fatalf("expected 3 boxes, got %d", len(boxes))
|
||
}
|
||
// Check that text and figure indices are independent
|
||
if boxes[0].LayoutNo != "text-0" {
|
||
t.Errorf("box 0: got %q, want 'text-0'", boxes[0].LayoutNo)
|
||
}
|
||
if boxes[1].LayoutNo != "text-1" {
|
||
t.Errorf("box 1: got %q, want 'text-1'", boxes[1].LayoutNo)
|
||
}
|
||
if boxes[2].LayoutNo != "figure-0" {
|
||
t.Errorf("box 2: got %q, want 'figure-0' (independent from text counter)", boxes[2].LayoutNo)
|
||
}
|
||
}
|
||
|
||
// NOTE: the test described below has no function body in this file; the
// description is kept as a record of the scenario it was meant to cover.
//
// TestAnnotateBoxLayouts_CompactionPreservesWriteBackMapping verifies that
// when AnnotateBoxLayouts drops some boxes (CID garbage or garbage-layout
// at non-edge positions), the compaction step does not corrupt the caller's
// ability to write annotations back to the correct global box indices.
//
// The bug: AnnotateBoxLayouts compacts boxes in place in the shared backing
// array, shifting survivors forward. enrichWithDeepDoc then iterates
// len(indices) positions and writes pageBoxes[i] back to boxes[indices[i]],
// but after compaction pageBoxes[1] holds what was originally pageBoxes[2],
// so annotations land on the wrong global box.
|
||
|
||
func TestMatchTableRegions_SingleMatch(t *testing.T) {
|
||
boxes := []pdf.TextBox{
|
||
{X0: 0, X1: 100, Top: 0, Bottom: 50},
|
||
{X0: 200, X1: 300, Top: 0, Bottom: 50},
|
||
}
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "table"}, // covers box 0 at scale 3
|
||
{X0: 600, Y0: 0, X1: 900, Y1: 150, Label: "text"}, // non-table, ignored
|
||
}
|
||
matches := MatchTableRegions(boxes, regions, 3.0)
|
||
if len(matches) != 1 {
|
||
t.Fatalf("expected 1 match, got %d", len(matches))
|
||
}
|
||
if len(matches[0].BoxIdx) != 1 || matches[0].BoxIdx[0] != 0 {
|
||
t.Errorf("expected box 0 matched, got %v", matches[0].BoxIdx)
|
||
}
|
||
}
|
||
|
||
func TestMatchTableRegions_NoTableLabel(t *testing.T) {
|
||
boxes := []pdf.TextBox{
|
||
{X0: 0, X1: 100, Top: 0, Bottom: 50},
|
||
}
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "text"},
|
||
{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "figure"},
|
||
}
|
||
matches := MatchTableRegions(boxes, regions, 3.0)
|
||
if len(matches) != 0 {
|
||
t.Errorf("non-table labels: expected 0 matches, got %d", len(matches))
|
||
}
|
||
}
|
||
|
||
func TestMatchTableRegions_MultipleBoxesSameTable(t *testing.T) {
|
||
boxes := []pdf.TextBox{
|
||
{X0: 0, X1: 100, Top: 0, Bottom: 50}, // box 0
|
||
{X0: 110, X1: 210, Top: 0, Bottom: 50}, // box 1
|
||
}
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 0, X1: 630, Y1: 150, Label: "table"}, // covers both boxes at scale 3
|
||
}
|
||
matches := MatchTableRegions(boxes, regions, 3.0)
|
||
if len(matches) != 1 {
|
||
t.Fatalf("expected 1 match, got %d", len(matches))
|
||
}
|
||
if len(matches[0].BoxIdx) != 2 {
|
||
t.Errorf("expected 2 boxes matched, got %d: %v", len(matches[0].BoxIdx), matches[0].BoxIdx)
|
||
}
|
||
}
|
||
|
||
func TestMatchTableRegions_ImageOnlyPDF(t *testing.T) {
|
||
// Zero boxes — image-only PDF. Python processes every table DLA region
|
||
// regardless of text box overlap.
|
||
var boxes []pdf.TextBox // nil
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "table"},
|
||
{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "text"},
|
||
}
|
||
matches := MatchTableRegions(boxes, regions, 3.0)
|
||
if len(matches) != 1 {
|
||
t.Fatalf("image-only: expected 1 table match, got %d", len(matches))
|
||
}
|
||
if len(matches[0].BoxIdx) != 0 {
|
||
t.Errorf("image-only: expected empty BoxIdx, got %d", len(matches[0].BoxIdx))
|
||
}
|
||
}
|
||
|
||
func TestMatchTableRegions_BelowThreshold(t *testing.T) {
|
||
// Region overlaps only a sliver of the box (<40%) → no match.
|
||
boxes := []pdf.TextBox{
|
||
{X0: 0, X1: 100, Top: 0, Bottom: 100},
|
||
}
|
||
regions := []pdf.DLARegion{
|
||
{X0: 0, Y0: 0, X1: 90, Y1: 90, Label: "table"}, // 30x30 at scale 3 → 9% overlap
|
||
}
|
||
matches := MatchTableRegions(boxes, regions, 3.0)
|
||
if len(matches) != 0 {
|
||
t.Errorf("below threshold: expected 0 matches, got %d", len(matches))
|
||
}
|
||
}
|
||
|
||
// MockTableBuilder is a test-only pdf.TableBuilder with a configurable GroupCells.
|
||
type MockTableBuilder struct {
|
||
GroupCellsFn func(cells []pdf.TSRCell) [][]pdf.TSRCell
|
||
}
|
||
|
||
func (m *MockTableBuilder) Name() string { return "mock" }
|
||
func (m *MockTableBuilder) DetectCells(_ context.Context, _ image.Image) ([]pdf.TSRCell, error) {
|
||
return nil, nil
|
||
}
|
||
func (m *MockTableBuilder) GroupCells(cells []pdf.TSRCell) [][]pdf.TSRCell {
|
||
if m.GroupCellsFn != nil {
|
||
return m.GroupCellsFn(cells)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// ── WriteTableAnnotations unit tests ──────────────────────────────────
|
||
|
||
func TestWriteTableAnnotations_WriteBack(t *testing.T) {
|
||
boxes := []pdf.TextBox{
|
||
{X0: 10, X1: 100, Top: 10, Bottom: 30, Text: "A", LayoutType: "table"},
|
||
{X0: 110, X1: 200, Top: 10, Bottom: 30, Text: "B", LayoutType: "table"},
|
||
{X0: 10, X1: 100, Top: 35, Bottom: 55, Text: "C", LayoutType: "table"},
|
||
}
|
||
BoxIdx := []int{0, 2}
|
||
cells := []pdf.TSRCell{
|
||
{X0: 30, Y0: 30, X1: 300, Y1: 90, Label: "table row"},
|
||
{X0: 30, Y0: 110, X1: 300, Y1: 170, Label: "table row"},
|
||
}
|
||
scale := 3.0
|
||
|
||
tb := &MockTableBuilder{GroupCellsFn: func(cells []pdf.TSRCell) [][]pdf.TSRCell {
|
||
return [][]pdf.TSRCell{{cells[0]}, {cells[1]}}
|
||
}}
|
||
WriteTableAnnotations(boxes, BoxIdx, cells, scale, 0, 0, tb)
|
||
|
||
if boxes[0].R != 0 {
|
||
t.Errorf("box 0 R = %d, want 0", boxes[0].R)
|
||
}
|
||
if boxes[0].C != 0 {
|
||
t.Errorf("box 0 C = %d, want 0", boxes[0].C)
|
||
}
|
||
// Box 1 was not in BoxIdx — should NOT be annotated
|
||
if boxes[1].R != 0 || boxes[1].C != 0 {
|
||
t.Errorf("box 1 should not be annotated: R=%d C=%d", boxes[1].R, boxes[1].C)
|
||
}
|
||
if boxes[2].R != 1 {
|
||
t.Errorf("box 2 R = %d, want 1", boxes[2].R)
|
||
}
|
||
}
|
||
|
||
func TestWriteTableAnnotations_ScaleDown(t *testing.T) {
|
||
boxes := []pdf.TextBox{
|
||
{X0: 10, X1: 100, Top: 10, Bottom: 50, Text: "X", LayoutType: "table"},
|
||
}
|
||
BoxIdx := []int{0}
|
||
cells := []pdf.TSRCell{
|
||
{X0: 30, Y0: 30, X1: 300, Y1: 150, Label: "table row"},
|
||
}
|
||
scale := 3.0
|
||
|
||
tb := &MockTableBuilder{GroupCellsFn: func(cells []pdf.TSRCell) [][]pdf.TSRCell {
|
||
return [][]pdf.TSRCell{{cells[0]}}
|
||
}}
|
||
WriteTableAnnotations(boxes, BoxIdx, cells, scale, 0, 0, tb)
|
||
|
||
// After scale-down: RTop / 3 should be in PDF space (~10).
|
||
if boxes[0].RTop == 0 {
|
||
t.Error("RTop should be non-zero after annotation")
|
||
}
|
||
}
|
||
|
||
func TestWriteTableAnnotations_EmptyCells(t *testing.T) {
|
||
boxes := []pdf.TextBox{{X0: 10, X1: 100, Top: 10, Bottom: 50, Text: "X", LayoutType: "table"}}
|
||
BoxIdx := []int{0}
|
||
var cells []pdf.TSRCell
|
||
|
||
tb := &MockTableBuilder{GroupCellsFn: func(cells []pdf.TSRCell) [][]pdf.TSRCell {
|
||
return nil
|
||
}}
|
||
// Should not panic with empty cells.
|
||
WriteTableAnnotations(boxes, BoxIdx, cells, 3.0, 0, 0, tb)
|
||
if boxes[0].R != 0 || boxes[0].C != 0 {
|
||
t.Errorf("empty cells: R=%d C=%d, want 0,0", boxes[0].R, boxes[0].C)
|
||
}
|
||
}
|
||
|
||
// ── MarkNoMergeTables unit tests ─────────────────────────────────────
|
||
|
||
func TestMarkNoMergeTables_CaptionAfterTable(t *testing.T) {
|
||
boxes := []pdf.TextBox{
|
||
{X0: 0, X1: 100, Top: 0, Bottom: 30, LayoutType: "table"},
|
||
{X0: 0, X1: 100, Top: 35, Bottom: 50, LayoutType: "table caption", Text: "表1:标题"},
|
||
}
|
||
tables := []pdf.TableItem{
|
||
{Positions: []pdf.Position{{Left: 0, Right: 100, Top: 0, Bottom: 30}}},
|
||
}
|
||
MarkNoMergeTables(boxes, tables)
|
||
if !tables[0].NoMerge {
|
||
t.Error("table followed by caption should be marked NoMerge")
|
||
}
|
||
}
|
||
|
||
func TestMarkNoMergeTables_TitleAfterTable(t *testing.T) {
|
||
boxes := []pdf.TextBox{
|
||
{X0: 0, X1: 100, Top: 0, Bottom: 30, LayoutType: "table"},
|
||
{X0: 0, X1: 100, Top: 35, Bottom: 50, LayoutType: "title"},
|
||
}
|
||
tables := []pdf.TableItem{
|
||
{Positions: []pdf.Position{{Left: 0, Right: 100, Top: 0, Bottom: 30}}},
|
||
}
|
||
MarkNoMergeTables(boxes, tables)
|
||
if !tables[0].NoMerge {
|
||
t.Error("table followed by title should be marked NoMerge")
|
||
}
|
||
}
|
||
|
||
func TestMarkNoMergeTables_NoCaptionAfter(t *testing.T) {
|
||
boxes := []pdf.TextBox{
|
||
{X0: 0, X1: 100, Top: 0, Bottom: 30, LayoutType: "table"},
|
||
{X0: 0, X1: 100, Top: 35, Bottom: 50, LayoutType: "text"},
|
||
{X0: 0, X1: 100, Top: 55, Bottom: 70, LayoutType: "table"},
|
||
}
|
||
tables := []pdf.TableItem{
|
||
{Positions: []pdf.Position{{Left: 0, Right: 100, Top: 0, Bottom: 30}}},
|
||
{Positions: []pdf.Position{{Left: 0, Right: 100, Top: 55, Bottom: 70}}},
|
||
}
|
||
MarkNoMergeTables(boxes, tables)
|
||
if tables[0].NoMerge {
|
||
t.Error("table followed by text should NOT be marked NoMerge")
|
||
}
|
||
if tables[1].NoMerge {
|
||
t.Error("last table should NOT be marked NoMerge")
|
||
}
|
||
}
|
||
|
||
func TestMarkNoMergeTables_StaleLastTableTI(t *testing.T) {
|
||
// Scenario: table box that does NOT overlap any pdf.TableItem.Position
|
||
// should reset lastTableTI. Otherwise the next caption marks the
|
||
// wrong (non-adjacent) table as NoMerge.
|
||
// Box 0: "table", overlaps table[0] → lastTableTI = 0
|
||
// Box 1: "table", no overlap → lastTableTI should reset to -1
|
||
// Box 2: "title" → should be a no-op (no adjacent table)
|
||
boxes := []pdf.TextBox{
|
||
{X0: 0, X1: 100, Top: 0, Bottom: 30, LayoutType: "table"},
|
||
{X0: 500, X1: 600, Top: 100, Bottom: 130, LayoutType: "table"}, // far away, no overlap
|
||
{X0: 0, X1: 100, Top: 140, Bottom: 160, LayoutType: "title"},
|
||
}
|
||
tables := []pdf.TableItem{
|
||
{Positions: []pdf.Position{{Left: 0, Right: 100, Top: 0, Bottom: 30}}}, // table 0
|
||
{Positions: []pdf.Position{{Left: 0, Right: 100, Top: 35, Bottom: 65}}}, // table 1 — box 0 doesn't overlap this either
|
||
}
|
||
MarkNoMergeTables(boxes, tables)
|
||
// table[0] should NOT be NoMerge: the title follows a non-matching
|
||
// table box, not table[0] directly.
|
||
if tables[0].NoMerge {
|
||
t.Error("stale lastTableTI: table[0] incorrectly marked NoMerge — " +
|
||
"the non-overlapping table box (box 1) should have reset lastTableTI")
|
||
}
|
||
}
|
||
|
||
func TestMarkNoMergeTables_EmptyInputs(t *testing.T) {
|
||
// Should not panic with empty inputs.
|
||
MarkNoMergeTables(nil, nil)
|
||
MarkNoMergeTables([]pdf.TextBox{}, []pdf.TableItem{})
|
||
}
|