Files
ragflow/internal/deepdoc/parser/pdf/deepdoc_test.go
Jack 304d9e02bb Refactor: migrate pdf_parser.py to golang (#16323)
### What problem does this PR solve?

Http API based on onnx model.
pdf_parser.py to golang

### Type of change

- [x] Refactoring
2026-06-25 20:16:16 +08:00

905 lines
31 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//go:build cgo
package parser
import (
"context"
"fmt"
"image"
"strings"
"testing"
)
// ── MockDocAnalyzer tests ──────────────────────────────────────────────
// TestMockDocAnalyzer verifies that MockDocAnalyzer reports the configured
// health flag and returns the canned DLA regions and TSR cells unchanged.
func TestMockDocAnalyzer(t *testing.T) {
	mock := &MockDocAnalyzer{
		Healthy: true,
		DLARegions: []DLARegion{
			{X0: 0, Y0: 0, X1: 100, Y1: 100, Label: "table", Confidence: 0.95},
		},
		TSRCells: []TSRCell{
			{X0: 0, Y0: 0, X1: 50, Y1: 30, Text: "A"},
		},
	}
	if !mock.Health() {
		t.Error("mock should be healthy")
	}

	// No error is configured on this mock, so a non-nil error is a failure in
	// its own right rather than something to discard.
	regions, err := mock.DLA(context.Background(), nil)
	if err != nil {
		t.Errorf("mock DLA returned unexpected error: %v", err)
	}
	if len(regions) != 1 || regions[0].Label != "table" {
		t.Error("mock DLA returned wrong data")
	}

	cells, err := mock.TSR(context.Background(), nil)
	if err != nil {
		t.Errorf("mock TSR returned unexpected error: %v", err)
	}
	if len(cells) != 1 || cells[0].Text != "A" {
		t.Error("mock TSR returned wrong data")
	}

	// OCRDetect + OCRRecognize replaces deprecated OCR — tested in TestOCR_scanPage/TestOCR_fallback.
	// They are referenced here only so the build breaks if either member is removed.
	_ = mock.OCRDetect
	_ = mock.OCRRecognize

	// Unhealthy mock
	unhealthy := &MockDocAnalyzer{Healthy: false}
	if unhealthy.Health() {
		t.Error("unhealthy mock should return false")
	}
}
// ── groupTSRCellsToRows ────────────────────────────────────────────────
func TestGroupTSRCellsToRows(t *testing.T) {
t.Run("empty", func(t *testing.T) {
if rows := groupTSRCellsToRows(nil); rows != nil {
t.Error("nil → nil")
}
if rows := groupTSRCellsToRows([]TSRCell{}); rows != nil {
t.Error("empty → nil")
}
})
t.Run("single cell", func(t *testing.T) {
cells := []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "A"}}
rows := groupTSRCellsToRows(cells)
if len(rows) != 1 || rows[0][0].Text != "A" {
t.Error("single cell not preserved")
}
})
t.Run("two rows two cols", func(t *testing.T) {
cells := []TSRCell{
{X0: 0, Y0: 0, X1: 50, Y1: 30, Text: "A"},
{X0: 50, Y0: 0, X1: 100, Y1: 30, Text: "B"},
{X0: 0, Y0: 50, X1: 50, Y1: 80, Text: "C"},
{X0: 50, Y0: 50, X1: 100, Y1: 80, Text: "D"},
}
rows := groupTSRCellsToRows(cells)
if len(rows) != 2 {
t.Fatalf("2 rows expected, got %d", len(rows))
}
if rows[0][0].Text != "A" || rows[0][1].Text != "B" {
t.Errorf("row0: %v", cellTexts(rows[0]))
}
if rows[1][0].Text != "C" || rows[1][1].Text != "D" {
t.Errorf("row1: %v", cellTexts(rows[1]))
}
})
t.Run("unsorted input", func(t *testing.T) {
cells := []TSRCell{
{X0: 50, Y0: 50, X1: 100, Y1: 80, Text: "D"},
{X0: 0, Y0: 0, X1: 50, Y1: 30, Text: "A"},
{X0: 0, Y0: 50, X1: 50, Y1: 80, Text: "C"},
{X0: 50, Y0: 0, X1: 100, Y1: 30, Text: "B"},
}
rows := groupTSRCellsToRows(cells)
if len(rows) != 2 {
t.Fatalf("unsorted: 2 rows expected, got %d", len(rows))
}
if rows[0][0].Text != "A" || rows[0][1].Text != "B" {
t.Errorf("unsorted row0: %v", cellTexts(rows[0]))
}
})
t.Run("tall merged cell", func(t *testing.T) {
cells := []TSRCell{
{X0: 0, Y0: 0, X1: 50, Y1: 100, Text: "merged"},
{X0: 50, Y0: 0, X1: 100, Y1: 30, Text: "B"},
{X0: 50, Y0: 50, X1: 100, Y1: 80, Text: "D"},
}
rows := groupTSRCellsToRows(cells)
// merged cell starts Y0=0 → row 0; Y0=50 cell → row 1
if len(rows) != 2 {
t.Fatalf("merged cell: 2 rows expected, got %d", len(rows))
}
})
t.Run("large gap different rows", func(t *testing.T) {
cells := []TSRCell{
{X0: 0, Y0: 0, X1: 50, Y1: 30, Text: "top"},
{X0: 0, Y0: 200, X1: 50, Y1: 230, Text: "far"},
}
rows := groupTSRCellsToRows(cells)
if len(rows) != 2 {
t.Fatalf("large gap: 2 rows expected, got %d", len(rows))
}
})
}
// ── fillCellTextFromBoxes ──────────────────────────────────────────────
// TestFillCellTextFromBoxes exercises fillCellTextFromBoxes, which is called
// for its side effect on the cells slice: each subtest inspects the Text field
// of the cells after the call.
func TestFillCellTextFromBoxes(t *testing.T) {
	t.Run("exact match", func(t *testing.T) {
		// Two side-by-side cells, each paired with a box of identical geometry.
		grid := []TSRCell{
			{X0: 0, Y0: 0, X1: 100, Y1: 50},
			{X0: 100, Y0: 0, X1: 200, Y1: 50},
		}
		fillCellTextFromBoxes(grid, []TextBox{
			{X0: 0, X1: 100, Top: 0, Bottom: 50, Text: "A"},
			{X0: 100, X1: 200, Top: 0, Bottom: 50, Text: "B"},
		})
		left, right := grid[0].Text, grid[1].Text
		if !(left == "A" && right == "B") {
			t.Errorf("got %q/%q, want A/B", left, right)
		}
	})

	t.Run("empty cells", func(t *testing.T) {
		// Only the first cell has a matching box; the second must stay blank.
		grid := []TSRCell{
			{X0: 0, Y0: 0, X1: 100, Y1: 50},
			{X0: 100, Y0: 0, X1: 200, Y1: 50},
		}
		fillCellTextFromBoxes(grid, []TextBox{
			{X0: 0, X1: 100, Top: 0, Bottom: 50, Text: "only first"},
		})
		if got := grid[0].Text; got != "only first" {
			t.Errorf("cell[0]: got %q", got)
		}
		if got := grid[1].Text; got != "" {
			t.Errorf("cell[1] should be empty, got %q", got)
		}
	})

	t.Run("partial cell coverage — empty cell filled from any overlapping box", func(t *testing.T) {
		// The box spans 80 of the cell's 200 units of width, i.e. 40% of the
		// cell area. Per the original author's note, older code rejected this
		// (<85% cell coverage) while the current code lets an empty cell take
		// the box (≥30% of the box area inside the cell).
		grid := []TSRCell{{X0: 0, Y0: 0, X1: 200, Y1: 50}}
		fillCellTextFromBoxes(grid, []TextBox{{X0: 0, X1: 80, Top: 0, Bottom: 50, Text: "partial"}})
		if got := grid[0].Text; got != "partial" {
			t.Errorf("empty cell should be filled from overlapping box, got %q", got)
		}
	})

	t.Run("box inside cell >85%", func(t *testing.T) {
		// Box is inset by 10 units on every side of a 500x300 cell.
		grid := []TSRCell{{X0: 0, Y0: 0, X1: 500, Y1: 300}}
		fillCellTextFromBoxes(grid, []TextBox{{X0: 10, X1: 490, Top: 10, Bottom: 290, Text: "inside"}})
		if got := grid[0].Text; got != "inside" {
			t.Errorf("got %q", got)
		}
	})

	t.Run("concatenate two boxes to same cell", func(t *testing.T) {
		// Two boxes with the same geometry land in one cell; their texts are
		// expected to be joined with a single space, in input order.
		grid := []TSRCell{{X0: 0, Y0: 0, X1: 200, Y1: 100}}
		fillCellTextFromBoxes(grid, []TextBox{
			{X0: 5, X1: 195, Top: 2, Bottom: 98, Text: "hello"},
			{X0: 5, X1: 195, Top: 2, Bottom: 98, Text: "world"},
		})
		if got := grid[0].Text; got != "hello world" {
			t.Errorf("got %q, want 'hello world'", got)
		}
	})

	t.Run("empty inputs", func(t *testing.T) {
		// nil and zero-length arguments must simply not panic.
		fillCellTextFromBoxes(nil, nil)
		fillCellTextFromBoxes([]TSRCell{}, []TextBox{})

		lone := []TSRCell{{X0: 0, Y0: 0, X1: 1, Y1: 1}}
		fillCellTextFromBoxes(lone, nil)
		if lone[0].Text != "" {
			t.Error("no boxes → text empty")
		}
	})
}
// ── regionOverlapsBox ──────────────────────────────────────────────────
// TestRegionOverlapsBox is a table-driven check of regionOverlapsBox. Every
// case is evaluated with the same scale factor of 3.0, which is passed to
// regionOverlapsBox alongside the region and the box.
func TestRegionOverlapsBox(t *testing.T) {
	var scale = 3.0

	type overlapCase struct {
		name   string
		region DLARegion
		box    TextBox
		want   bool
	}
	cases := []overlapCase{
		{
			name:   "full overlap",
			region: DLARegion{X0: 0, Y0: 300, X1: 1500, Y1: 2300, Label: "table", Confidence: 0.9},
			box:    TextBox{X0: 50, X1: 500, Top: 100, Bottom: 760, Text: "x", PageNumber: 0},
			want:   true,
		},
		{
			name:   "no overlap",
			region: DLARegion{X0: 0, Y0: 3000, X1: 1500, Y1: 5000, Label: "table", Confidence: 0.9},
			box:    TextBox{X0: 50, X1: 500, Top: 0, Bottom: 10, Text: "x", PageNumber: 0},
			want:   false,
		},
		{
			name:   "no Y overlap",
			region: DLARegion{X0: 150, Y0: 300, X1: 1650, Y1: 336, Label: "table", Confidence: 0.9},
			box:    TextBox{X0: 50, X1: 550, Top: 500, Bottom: 520, Text: "x", PageNumber: 0},
			want:   false,
		},
		{
			name:   "zero area box",
			region: DLARegion{X0: 0, Y0: 300, X1: 1500, Y1: 2300, Label: "table", Confidence: 0.9},
			box:    TextBox{X0: 50, X1: 50, Top: 50, Bottom: 50, Text: "x", PageNumber: 0},
			want:   false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := regionOverlapsBox(tc.region, tc.box, scale)
			if got != tc.want {
				t.Errorf("= %v, want %v", got, tc.want)
			}
		})
	}
}
// ── enrichWithDeepDoc noop ─────────────────────────────────────────────
func TestEnrichWithDeepDoc_Noop(t *testing.T) {
boxes := []TextBox{
{PageNumber: 0, X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "text"},
}
eng := &mockEngine{pageCount: 1}
p := NewParser(DefaultParserConfig(), &MockDocAnalyzer{Healthy: false, Model: ModelSaas})
tables := p.enrichWithDeepDoc(context.Background(), eng, boxes, nil)
if len(tables) != 0 {
t.Error("unhealthy DeepDoc → 0 Tables")
}
}
// ── extractTableBoxesFromImage with mock ───────────────────────────────
func TestExtractTableBoxes_Mock(t *testing.T) {
boxes := []TextBox{
{PageNumber: 0, X0: 80, X1: 500, Top: 200, Bottom: 550, Text: "cell 1"},
{PageNumber: 0, X0: 80, X1: 500, Top: 550, Bottom: 760, Text: "cell 2"},
{PageNumber: 0, X0: 50, X1: 550, Top: 100, Bottom: 180, Text: "heading"},
{PageNumber: 0, X0: 50, X1: 550, Top: 780, Bottom: 850, Text: "below"},
}
mock := &MockDocAnalyzer{
Healthy: true,
DLARegions: []DLARegion{
{X0: 250, Y0: 600, X1: 1500, Y1: 2300, Label: "table", Confidence: 0.95},
},
TSRCells: []TSRCell{
{X0: 0, Y0: 0, X1: 600, Y1: 400, Text: "A1"},
{X0: 600, Y0: 0, X1: 1240, Y1: 400, Text: "B1"},
{X0: 0, Y0: 410, X1: 600, Y1: 800, Text: "A2"},
{X0: 600, Y0: 410, X1: 1240, Y1: 800, Text: "B2"},
},
}
p := NewParser(DefaultParserConfig(), mock)
dummyImg := image.NewRGBA(image.Rect(0, 0, 2000, 3000))
tables := p.extractTableBoxesFromImage(context.Background(), boxes, dummyImg, 0, 0)
if len(tables) != 1 {
t.Fatalf("expected 1 TableItem, got %d", len(tables))
}
tbl := tables[0]
if len(tbl.Cells) != 4 {
t.Errorf("expected 4 cells, got %d", len(tbl.Cells))
}
// Rows populated later by constructTable via extractTableAndReplace.
if tbl.ImageB64 == "" {
t.Error("ImageB64 empty")
}
if len(tbl.Positions) != 2 {
t.Errorf("expected 2 Positions, got %d", len(tbl.Positions))
}
}
// TestExtractTableBoxes_NoTables: a healthy analyzer that reports zero DLA
// regions must produce zero tables.
func TestExtractTableBoxes_NoTables(t *testing.T) {
	analyzer := &MockDocAnalyzer{Healthy: true, DLARegions: []DLARegion{}}
	parser := NewParser(DefaultParserConfig(), analyzer)
	pageImg := image.NewRGBA(image.Rect(0, 0, 1000, 1000))

	found := parser.extractTableBoxesFromImage(context.Background(), nil, pageImg, 0, 0)
	if n := len(found); n != 0 {
		t.Errorf("0 tables expected, got %d", n)
	}
}
func TestExtractTableBoxes_NonTableRegions(t *testing.T) {
mock := &MockDocAnalyzer{
Healthy: true,
DLARegions: []DLARegion{
{X0: 150, Y0: 300, X1: 1650, Y1: 336, Label: "text", Confidence: 0.9},
{X0: 150, Y0: 600, X1: 1650, Y1: 900, Label: "figure", Confidence: 0.8},
},
}
p := NewParser(DefaultParserConfig(), mock)
dummy := image.NewRGBA(image.Rect(0, 0, 2000, 2000))
tables := p.extractTableBoxesFromImage(context.Background(), nil, dummy, 0, 0)
if len(tables) != 0 {
t.Errorf("non-table regions → 0 tables, got %d", len(tables))
}
}
func TestExtractTableBoxes_NoOverlap(t *testing.T) {
boxes := []TextBox{
{PageNumber: 0, X0: 50, X1: 550, Top: 10, Bottom: 30, Text: "far away"},
}
mock := &MockDocAnalyzer{
Healthy: true,
DLARegions: []DLARegion{
{X0: 150, Y0: 1500, X1: 1500, Y1: 2300, Label: "table", Confidence: 0.95},
},
}
p := NewParser(DefaultParserConfig(), mock)
dummy := image.NewRGBA(image.Rect(0, 0, 2000, 3000))
tables := p.extractTableBoxesFromImage(context.Background(), boxes, dummy, 0, 0)
if len(tables) != 0 {
t.Errorf("no overlap → 0 tables, got %d", len(tables))
}
}
func TestExtractTableBoxes_TSRError(t *testing.T) {
boxes := []TextBox{
{PageNumber: 0, X0: 80, X1: 500, Top: 210, Bottom: 660, Text: "cell"},
}
mock := &MockDocAnalyzer{
Healthy: true,
DLARegions: []DLARegion{
{X0: 250, Y0: 600, X1: 1500, Y1: 2000, Label: "table", Confidence: 0.95},
},
TSRCells: nil, // TSR returns nothing
}
p := NewParser(DefaultParserConfig(), mock)
dummy := image.NewRGBA(image.Rect(0, 0, 2000, 3000))
tables := p.extractTableBoxesFromImage(context.Background(), boxes, dummy, 0, 0)
if len(tables) != 1 {
t.Fatalf("TSR failure: expected 1 TableItem with image+positions, got %d", len(tables))
}
if tables[0].ImageB64 == "" {
t.Error("should have image despite TSR failure")
}
if len(tables[0].Positions) == 0 {
t.Error("should have positions despite TSR failure")
}
if len(tables[0].Rows) != 0 {
t.Errorf("TSR failure → 0 rows, got %d", len(tables[0].Rows))
}
}
// TestGroupTSRCellsToRows_SameHeight feeds three cells of identical height
// (30): A and B share Y0=0, and C starts at Y0=31, one unit below their bottom
// edge. A and B must form the first row and C a row of its own.
//
// Original author's reasoning (the threshold rule itself lives in
// groupTSRCellsToRows and is not visible here): medianH=30 gives a threshold
// of 15; C.Y0=31 exceeds curY(0)+15, so C opens a new row.
func TestGroupTSRCellsToRows_SameHeight(t *testing.T) {
	input := []TSRCell{
		{X0: 0, Y0: 0, X1: 50, Y1: 30, Text: "A"},
		{X0: 50, Y0: 0, X1: 100, Y1: 30, Text: "B"},
		{X0: 0, Y0: 31, X1: 50, Y1: 61, Text: "C"}, // only 1 unit below A/B's bottom edge
	}

	grouped := groupTSRCellsToRows(input)
	if n := len(grouped); n != 2 {
		t.Fatalf("expected 2 rows, got %d", n)
	}

	first, second := len(grouped[0]), len(grouped[1])
	if !(first == 2 && second == 1) {
		t.Errorf("row sizes: %d %d, want 2 1", first, second)
	}
}
// TestFillCellTextFromBoxes_WhitespaceTrim: surrounding spaces in the box text
// must be stripped before the text lands in the cell.
func TestFillCellTextFromBoxes_WhitespaceTrim(t *testing.T) {
	grid := []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 100}}
	padded := []TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 100, Text: " hello "}}

	fillCellTextFromBoxes(grid, padded)

	if got := grid[0].Text; got != "hello" {
		t.Errorf("got %q, want 'hello'", got)
	}
}
// TestFillCellTextFromBoxes_EmptyBoxIgnored: a box whose text is only
// whitespace must leave the cell text empty.
func TestFillCellTextFromBoxes_EmptyBoxIgnored(t *testing.T) {
	grid := []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 100}}
	blank := []TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 100, Text: " "}} // all whitespace

	fillCellTextFromBoxes(grid, blank)

	if got := grid[0].Text; got != "" {
		t.Errorf("whitespace text should produce empty, got %q", got)
	}
}
// TestExtractTableBoxes_DLAError: DLA returns only a non-table ("text")
// region, so no tables are expected.
//
// NOTE(review): despite the name, no DLA error is injected here (DLAErr is not
// set); the scenario is the same kind as TestExtractTableBoxes_NonTableRegions.
func TestExtractTableBoxes_DLAError(t *testing.T) {
	analyzer := &MockDocAnalyzer{
		Healthy: true,
		DLARegions: []DLARegion{
			{X0: 0, Y0: 0, X1: 100, Y1: 100, Label: "text", Confidence: 0.9},
		},
	}
	parser := NewParser(DefaultParserConfig(), analyzer)
	pageImg := image.NewRGBA(image.Rect(0, 0, 1000, 1000))

	found := parser.extractTableBoxesFromImage(context.Background(), nil, pageImg, 0, 0)
	if n := len(found); n != 0 {
		t.Errorf("non-table DLA → 0 tables, got %d", n)
	}
}
func TestAnnotateBoxLayouts(t *testing.T) {
boxes := []TextBox{
{X0: 50, X1: 200, Top: 100, Bottom: 200, Text: "title text"},
{X0: 250, X1: 500, Top: 100, Bottom: 200, Text: "body"},
{X0: 50, X1: 500, Top: 300, Bottom: 600, Text: "table content"},
{X0: 50, X1: 500, Top: 700, Bottom: 800, Text: "unmatched"},
}
regions := []DLARegion{
{X0: 150, Y0: 300, X1: 600, Y1: 600, Label: "title", Confidence: 0.9}, // PDF pts: X50-200,Y100-200 → only box[0]
{X0: 750, Y0: 300, X1: 1500, Y1: 600, Label: "text", Confidence: 0.8}, // PDF pts: X250-500,Y100-200 → box[1]
{X0: 150, Y0: 900, X1: 1500, Y1: 1800, Label: "table", Confidence: 0.95}, // PDF pts: X50-500,Y300-600 → box[2]
}
scale := 3.0
annotateBoxLayouts(boxes, regions, scale, 0)
if boxes[0].LayoutType != "title" {
t.Errorf("box[0] = %q, want title", boxes[0].LayoutType)
}
if boxes[1].LayoutType != "text" {
t.Errorf("box[1] = %q, want text", boxes[1].LayoutType)
}
if boxes[2].LayoutType != "table" {
t.Errorf("box[2] = %q, want table", boxes[2].LayoutType)
}
if boxes[3].LayoutType != "" {
t.Errorf("box[3] = %q, want empty (no matching region)", boxes[3].LayoutType)
}
}
func TestAnnotateBoxLayouts_Figure(t *testing.T) {
// Figure region → box gets "figure" layout type (no TSR needed)
boxes := []TextBox{
{X0: 50, X1: 500, Top: 100, Bottom: 400, Text: "chart image"},
}
regions := []DLARegion{
{X0: 50, Y0: 200, X1: 2000, Y1: 1000, Label: "figure", Confidence: 0.85},
}
annotateBoxLayouts(boxes, regions, 3.0, 0)
if boxes[0].LayoutType != "figure" {
t.Errorf("LayoutType = %q, want 'figure'", boxes[0].LayoutType)
}
}
// TestAnnotateBoxLayouts_Empty: with no regions at all, boxes stay unlabelled.
func TestAnnotateBoxLayouts_Empty(t *testing.T) {
	lone := []TextBox{{Text: "x"}}

	annotateBoxLayouts(lone, nil, 3.0, 0)

	if got := lone[0].LayoutType; got != "" {
		t.Error("empty regions → no annotation")
	}
}
// TestBoxesToSections_PassesLayoutType: boxesToSections must carry each box's
// LayoutType over to the section produced from it, in order.
func TestBoxesToSections_PassesLayoutType(t *testing.T) {
	input := []TextBox{
		{PageNumber: 0, X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "标题", LayoutType: "title"},
		{PageNumber: 0, X0: 50, X1: 550, Top: 200, Bottom: 212, Text: "表格", LayoutType: "table"},
		{PageNumber: 0, X0: 50, X1: 550, Top: 300, Bottom: 312, Text: "正文", LayoutType: "text"},
	}

	got := boxesToSections(input, nil)
	if n := len(got); n != 3 {
		t.Fatalf("expected 3 sections, got %d", n)
	}

	for i, want := range []string{"title", "table", "text"} {
		if lt := got[i].LayoutType; lt != want {
			t.Errorf("section[%d].LayoutType = %q, want '%s'", i, lt, want)
		}
	}
}
// TestBoxesToSections_PreservesTableLayout: boxesToSections should emit a
// section for every non-empty box whatever its LayoutType (the fourth box has
// empty text and is expected to be dropped), and section text must not contain
// the "@@" position-tag marker.
func TestBoxesToSections_PreservesTableLayout(t *testing.T) {
	input := []TextBox{
		{PageNumber: 0, X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "标题", LayoutType: "title"},
		{PageNumber: 0, X0: 50, X1: 550, Top: 200, Bottom: 212, Text: "表格文字", LayoutType: "table"},
		{PageNumber: 0, X0: 50, X1: 550, Top: 300, Bottom: 312, Text: "正文", LayoutType: "text"},
		{PageNumber: 0, X0: 50, X1: 550, Top: 400, Bottom: 412, Text: ""},
	}

	got := boxesToSections(input, nil)
	count := len(got)
	if count != 3 {
		t.Errorf("expected 3 sections (1 empty skipped), got %d", count)
	}

	for i := range got {
		if strings.Contains(got[i].Text, "@@") {
			t.Error("section text should NOT contain position tag")
		}
	}
	t.Logf("boxesToSections: %d sections (all LayoutTypes passed through)", count)
}
// TestEnrichWithDeepDoc_PreservesBoxes re-enacts, inside the test, the
// write-back pattern attributed to enrichWithDeepDoc:
//
//  1. copy the boxes of one page into a scratch slice,
//  2. run annotateBoxLayouts on the copies,
//  3. copy any non-empty LayoutType back onto the originals.
//
// Only annotateBoxLayouts is real production code here; the copy and
// write-back steps are performed by the test itself.
//
// NOTE(review): the same regions are applied to both pages and the page-1 box
// (original[4]) is never asserted on; the final length check cannot fail since
// nothing in this test resizes the slice.
func TestEnrichWithDeepDoc_PreservesBoxes(t *testing.T) {
	original := []TextBox{
		{PageNumber: 0, X0: 50, X1: 200, Top: 50, Bottom: 80, Text: "title", LayoutType: ""},
		{PageNumber: 0, X0: 50, X1: 200, Top: 100, Bottom: 200, Text: "text before", LayoutType: ""},
		{PageNumber: 0, X0: 50, X1: 500, Top: 250, Bottom: 700, Text: "table cell", LayoutType: ""},
		{PageNumber: 0, X0: 50, X1: 200, Top: 750, Bottom: 800, Text: "text after", LayoutType: ""},
		{PageNumber: 1, X0: 50, X1: 200, Top: 50, Bottom: 80, Text: "page2", LayoutType: ""},
	}
	// Page number → indices into original.
	byPage := map[int][]int{0: {0, 1, 2, 3}, 1: {4}}
	regions := []DLARegion{
		{X0: 150, Y0: 150, X1: 600, Y1: 240, Label: "title", Confidence: 0.9},    // PDF: X50-200,Y50-80 → box[0]
		{X0: 150, Y0: 750, X1: 1500, Y1: 2100, Label: "table", Confidence: 0.95}, // PDF: X50-500,Y250-700 → box[2]
	}

	for _, members := range byPage {
		// Steps 1-2: annotate copies, never the originals directly.
		scratch := make([]TextBox, 0, len(members))
		for _, src := range members {
			scratch = append(scratch, original[src])
		}
		annotateBoxLayouts(scratch, regions, 3.0, 0)

		// Step 3: write back (this is what enrichWithDeepDoc now does).
		for pos, src := range members {
			if lt := scratch[pos].LayoutType; lt != "" {
				original[src].LayoutType = lt
			}
		}
	}

	if got := original[0].LayoutType; got != "title" {
		t.Errorf("box[0] LayoutType = %q, want 'title'", got)
	}
	if got := original[2].LayoutType; got != "table" {
		t.Errorf("box[2] LayoutType = %q, want 'table'", got)
	}
	if got := original[1].LayoutType; got != "" {
		t.Errorf("box[1] LayoutType = %q, want '' (no matching region)", got)
	}
	// All boxes still present
	if n := len(original); n != 5 {
		t.Errorf("all boxes preserved: got %d, want 5", n)
	}
	t.Logf("Write-back verified: box[0]=%q box[2]=%q", original[0].LayoutType, original[2].LayoutType)
}
// TestBoxesToSections_PositionsFromTag checks that a section produced by
// boxesToSections carries a non-empty PositionTag and a Positions slice whose
// first entry reproduces the source box's coordinates.
func TestBoxesToSections_PositionsFromTag(t *testing.T) {
	boxes := []TextBox{
		{PageNumber: 0, X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "标题段落"},
	}
	sections := boxesToSections(boxes, nil)
	// Everything below indexes sections[0]; stop with a normal failure rather
	// than an index-out-of-range panic if no section came back.
	if len(sections) == 0 {
		t.Fatal("expected at least 1 section, got 0")
	}

	first := sections[0]
	if first.PositionTag == "" {
		t.Error("PositionTag should not be empty")
	}
	if len(first.Positions) == 0 {
		t.Error("Positions should be parsed from PositionTag — BUG: ExtractPositions not called")
	} else {
		pos := first.Positions[0]
		if pos.Left != 50 || pos.Right != 550 || pos.Top != 100 || pos.Bottom != 112 {
			t.Errorf("position coords wrong: got (%.0f,%.0f,%.0f,%.0f)", pos.Left, pos.Right, pos.Top, pos.Bottom)
		}
	}
	t.Logf("Positions: %v", first.Positions)
}
func TestParse_TableLinkedToSections(t *testing.T) {
// Simulate enrichWithDeepDoc → extractTableAndReplace → boxesToSections:
// table boxes are popped and replaced with one HTML box.
boxes := []TextBox{
{PageNumber: 0, X0: 50, X1: 200, Top: 50, Bottom: 80, Text: "heading"},
{PageNumber: 0, X0: 50, X1: 500, Top: 250, Bottom: 400, Text: "table text", LayoutType: "table"},
{PageNumber: 0, X0: 50, X1: 200, Top: 450, Bottom: 480, Text: "after"},
}
tableItem := TableItem{
Cells: []TSRCell{
{X0: 0, Y0: 0, X1: 200, Y1: 50, Label: "table row"},
{X0: 0, Y0: 51, X1: 200, Y1: 100, Label: "table row"},
},
Positions: []Position{{PageNumbers: []int{0}, Left: 50, Right: 500, Top: 250, Bottom: 400}},
Scale: 1.0,
}
boxes = extractTableAndReplace(boxes, []TableItem{tableItem})
sections := boxesToSections(boxes, nil)
// 3 boxes (heading, table, after) → 3 sections (heading, HTML, after).
if len(sections) != 3 {
t.Errorf("expected 3 sections, got %d", len(sections))
}
tableFound := false
for _, s := range sections {
if s.LayoutType == "table" && strings.Contains(s.Text, "<table>") {
tableFound = true
}
}
if !tableFound {
t.Errorf("expected at least one section with HTML table")
for _, s := range sections {
t.Logf(" section text=%q LayoutType=%q", s.Text[:min(40, len(s.Text))], s.LayoutType)
}
}
}
// cellTexts returns the Text of every cell, in order. It is used to print a
// row compactly in failure messages.
func cellTexts(cells []TSRCell) []string {
	texts := make([]string, 0, len(cells))
	for i := range cells {
		texts = append(texts, cells[i].Text)
	}
	return texts
}
// ── cropImageRegion ────────────────────────────────────────────────────
// TestCropImageRegion exercises cropImageRegion on a 200x300 image: one valid
// region whose crop size is pinned, and three regions that must be rejected
// with an error.
func TestCropImageRegion(t *testing.T) {
	img := image.NewRGBA(image.Rect(0, 0, 200, 300))

	t.Run("normal crop", func(t *testing.T) {
		region := DLARegion{X0: 10, Y0: 20, X1: 100, Y1: 150}
		cropped, err := cropImageRegion(img, region)
		if err != nil {
			t.Fatalf("unexpected error: %v", err)
		}
		// The raw region is 90x130. Per the original note the crop adds a 3%
		// proportional margin (≈3px / ≈4px), giving 95x137; the exact rounding
		// lives in cropImageRegion — confirm there.
		bounds := cropped.Bounds()
		if w, h := bounds.Dx(), bounds.Dy(); w != 95 || h != 137 {
			t.Errorf("size %v, want 95x137", cropped.Bounds())
		}
	})

	rejected := []struct {
		name   string
		region DLARegion
		msg    string
	}{
		{
			// Inverted horizontally: X0 (110) lies to the right of X1 (50).
			name:   "x0 >= x1 returns error",
			region: DLARegion{X0: 110, Y0: 20, X1: 50, Y1: 150},
			msg:    "expected error for x0 >= x1, got nil",
		},
		{
			// Inverted vertically: Y0 (150) lies below Y1 (20).
			name:   "y0 >= y1 returns error",
			region: DLARegion{X0: 10, Y0: 150, X1: 100, Y1: 20},
			msg:    "expected error for y0 >= y1, got nil",
		},
		{
			// Entirely beyond the 200x300 image. Original note: clamped to the
			// image bounds → zero width/height → error.
			name:   "region fully outside image bounds",
			region: DLARegion{X0: 300, Y0: 400, X1: 500, Y1: 600},
			msg:    "expected error for region outside image bounds",
		},
	}
	for _, tc := range rejected {
		t.Run(tc.name, func(t *testing.T) {
			if _, err := cropImageRegion(img, tc.region); err == nil {
				t.Fatal(tc.msg)
			}
		})
	}
}
// ── extractTableBoxesFromImage: invalid DLA region ─────────────────────
func TestExtractTableBoxes_InvalidRegion(t *testing.T) {
// DLA returns a table region with x1 < x0. The pipeline should skip
// this table gracefully (Python raises ValueError from PIL.Image.crop).
mock := &MockDocAnalyzer{
Healthy: true,
DLARegions: []DLARegion{
{X0: 500, Y0: 100, X1: 100, Y1: 300, Label: "table", Confidence: 0.9},
},
}
p := NewParser(DefaultParserConfig(), mock)
dummy := image.NewRGBA(image.Rect(0, 0, 1000, 1000))
tables := p.extractTableBoxesFromImage(context.Background(), nil, dummy, 0, 0)
if len(tables) != 0 {
t.Errorf("invalid DLA region should be skipped, got %d tables", len(tables))
}
}
// ── DLA → figure end-to-end ───────────────────────────────────────────
// TestParse_CollectsFigures is an end-to-end run of Parse with a mock analyzer
// whose single DLA region is labelled "figure". The result must contain at
// least one section and exactly one figure, labelled "figure" with non-empty
// text.
func TestParse_CollectsFigures(t *testing.T) {
	engine := &mockEngine{
		pageCount: 1,
		chars: map[int][]TextChar{
			0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "chart image"}},
		},
	}
	analyzer := &MockDocAnalyzer{
		Healthy: true,
		DLARegions: []DLARegion{
			{X0: 50, Y0: 200, X1: 2000, Y1: 1000, Label: "figure", Confidence: 0.85},
		},
	}

	parser := NewParser(DefaultParserConfig(), analyzer)
	out, err := parser.Parse(context.Background(), engine)
	if err != nil {
		t.Fatalf("Parse: %v", err)
	}
	if len(out.Sections) == 0 {
		t.Fatal("expected at least 1 section")
	}
	if n := len(out.Figures); n != 1 {
		t.Fatalf("expected 1 figure, got %d", n)
	}

	fig := out.Figures[0]
	if fig.LayoutType != "figure" {
		t.Errorf("figure LayoutType = %q, want 'figure'", fig.LayoutType)
	}
	if fig.Text == "" {
		t.Error("figure Text should not be empty")
	}
}
// TestParse_NoFigures runs Parse with a mock analyzer whose only DLA region is
// labelled "text" and expects result.Figures to stay empty.
func TestParse_NoFigures(t *testing.T) {
	// Parse() with no DLA figure regions → p.Figures should be empty.
	eng := &mockEngine{pageCount: 1, chars: map[int][]TextChar{0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "just text"}}}}
	mock := &MockDocAnalyzer{
		// The analyzer must report healthy, as in TestParse_CollectsFigures;
		// otherwise (cf. TestEnrichWithDeepDoc_Noop) the DLA region below is
		// presumably never consulted and the assertion would hold vacuously.
		Healthy: true,
		DLARegions: []DLARegion{
			{X0: 150, Y0: 300, X1: 1500, Y1: 600, Label: "text", Confidence: 0.8},
		},
	}
	p := NewParser(DefaultParserConfig(), mock)
	result, err := p.Parse(context.Background(), eng)
	if err != nil {
		t.Fatalf("Parse: %v", err)
	}
	if len(result.Figures) != 0 {
		t.Fatalf("expected 0 figures, got %d", len(result.Figures))
	}
}
// TestParse_NoDeepDoc_NoFigures: Parse with a healthy ModelSaas mock that
// defines no DLA regions must report zero figures.
func TestParse_NoDeepDoc_NoFigures(t *testing.T) {
	engine := &mockEngine{
		pageCount: 1,
		chars: map[int][]TextChar{
			0: {{X0: 50, X1: 550, Top: 100, Bottom: 112, Text: "text"}},
		},
	}
	analyzer := &MockDocAnalyzer{Healthy: true, Model: ModelSaas}

	out, err := NewParser(DefaultParserConfig(), analyzer).Parse(context.Background(), engine)
	if err != nil {
		t.Fatalf("Parse: %v", err)
	}
	if n := len(out.Figures); n != 0 {
		t.Fatalf("expected 0 Figures (no DLA-detected figures), got %d", n)
	}
}
// ── Parse + ocrMergeChars (full-page detect) ──────────────────────────
// TestParse_UsesOCRDetectForEmbeddedChars runs Parse on a page that has one
// embedded char while the mock analyzer offers one OCR detection box. Per the
// original note, Parse is meant to go through ocrMergeChars
// (detect → merge → recognize) in this situation.
//
// NOTE(review): the assertions (at least one section, BoxesInitial > 0) do not
// by themselves distinguish the OCR-detect path from a plain charsToBoxes
// fallback — confirm whether a stricter check is possible.
func TestParse_UsesOCRDetectForEmbeddedChars(t *testing.T) {
	engine := &mockEngine{
		pageCount: 1,
		chars: map[int][]TextChar{
			0: {
				{X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0},
			},
		},
	}
	analyzer := &MockDocAnalyzer{
		Healthy: true,
		OCRBoxes: []OCRBox{
			// Four corner points of a 45x45 quad enclosing the embedded char.
			{X0: 5, Y0: 5, X1: 50, Y1: 5, X2: 50, Y2: 50, X3: 5, Y3: 50},
		},
	}

	parser := NewParser(DefaultParserConfig(), analyzer)
	out, err := parser.Parse(context.Background(), engine)
	if err != nil {
		t.Fatalf("Parse: %v", err)
	}
	if len(out.Sections) == 0 {
		t.Fatal("expected at least 1 section")
	}
	// The box should come from OCR detect, not charsToBoxes.
	if out.Metrics.BoxesInitial == 0 {
		t.Error("expected BoxesInitial > 0 (OCR detect path)")
	}
}
// TestParse_FallsBackToCharsToBoxes_NoDeepDoc: with a ModelSaas mock and no
// OCR boxes configured, Parse must still turn the embedded char into at least
// one section (the original note attributes this to charsToBoxes).
func TestParse_FallsBackToCharsToBoxes_NoDeepDoc(t *testing.T) {
	engine := &mockEngine{
		pageCount: 1,
		chars: map[int][]TextChar{
			0: {
				{X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0},
			},
		},
	}
	analyzer := &MockDocAnalyzer{Healthy: true, Model: ModelSaas}

	out, err := NewParser(DefaultParserConfig(), analyzer).Parse(context.Background(), engine)
	if err != nil {
		t.Fatalf("Parse: %v", err)
	}
	if len(out.Sections) == 0 {
		t.Fatal("expected at least 1 section (charsToBoxes)")
	}
}
// TestParse_FallsBackToCharsToBoxes_EmptyOCRBoxes: the mock's OCR detection
// list is empty, yet Parse must still produce a section from the embedded
// char (the original note describes this as falling through to charsToBoxes).
func TestParse_FallsBackToCharsToBoxes_EmptyOCRBoxes(t *testing.T) {
	engine := &mockEngine{
		pageCount: 1,
		chars: map[int][]TextChar{
			0: {
				{X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0},
			},
		},
	}
	analyzer := &MockDocAnalyzer{
		Healthy:  true,
		OCRBoxes: []OCRBox{}, // empty detect
	}

	out, err := NewParser(DefaultParserConfig(), analyzer).Parse(context.Background(), engine)
	if err != nil {
		t.Fatalf("Parse: %v", err)
	}
	if len(out.Sections) == 0 {
		t.Fatal("expected at least 1 section (charsToBoxes fallback)")
	}
}
// ── Error path coverage ────────────────────────────────────────────────
func TestMockDocAnalyzer_DLAError_DoesNotCrash(t *testing.T) {
p := NewParser(DefaultParserConfig(), &MockDocAnalyzer{
Healthy: true,
DLAErr: fmt.Errorf("DLA service unavailable"),
})
eng := &mockEngine{pageCount: 1}
img := image.NewRGBA(image.Rect(0, 0, 100, 100))
pageImages := map[int]image.Image{0: img}
boxes := []TextBox{
{PageNumber: 0, X0: 0, X1: 100, Top: 0, Bottom: 50, Text: "text"},
}
// enrichWithDeepDoc should return nil (not panic) on DLA error.
tables := p.enrichWithDeepDoc(context.Background(), eng, boxes, pageImages)
if len(tables) != 0 {
t.Errorf("DLA error should produce 0 tables, got %d", len(tables))
}
}
// TestMockDocAnalyzer_TSRError_DoesNotCrash covers the case where DLA succeeds
// but TSR fails: the detected table region must still yield one TableItem,
// just without cells, and the pipeline must not panic.
func TestMockDocAnalyzer_TSRError_DoesNotCrash(t *testing.T) {
	p := NewParser(DefaultParserConfig(), &MockDocAnalyzer{
		Healthy: true,
		DLARegions: []DLARegion{
			{X0: 0, Y0: 0, X1: 400, Y1: 400, Label: "table", Confidence: 0.95},
		},
		TSRErr: fmt.Errorf("TSR model timeout"),
	})
	eng := &mockEngine{pageCount: 1}
	img := image.NewRGBA(image.Rect(0, 0, 100, 100))
	pageImages := map[int]image.Image{0: img}
	boxes := []TextBox{
		{PageNumber: 0, X0: 10, X1: 90, Top: 10, Bottom: 90, Text: "in table region"},
	}
	tables := p.enrichWithDeepDoc(context.Background(), eng, boxes, pageImages)
	// DLA detects the table region → 1 TableItem is created. TSR failure
	// means it has no cells, but the pipeline must not panic.
	// Fatalf, not Errorf: tables[0] is indexed right after, and an empty
	// result would otherwise panic the test itself.
	if len(tables) != 1 {
		t.Fatalf("TSR error: expected 1 table (DLA region found), got %d", len(tables))
	}
	if len(tables[0].Cells) != 0 {
		t.Errorf("TSR error: Cells should be empty, got %d", len(tables[0].Cells))
	}
}
// TestMockDocAnalyzer_OCRDetectError_DoesNotCrash: the page has no embedded
// chars and the mock's OCRDetect is configured to fail. Parse is expected to
// return without error. Per the original note, extractPages reaches
// doc.OCRDetect through ocrDetectAndRecognize and simply skips the page when
// detection fails.
func TestMockDocAnalyzer_OCRDetectError_DoesNotCrash(t *testing.T) {
	analyzer := &MockDocAnalyzer{
		Healthy:      true,
		OCRDetectErr: fmt.Errorf("OCR model OOM"),
	}
	engine := &mockEngine{
		pageCount: 1,
		chars:     map[int][]TextChar{}, // empty → triggers OCR path
	}

	parser := NewParser(DefaultParserConfig(), analyzer)
	// Only the error is of interest; the parse result itself is not inspected.
	if _, err := parser.Parse(context.Background(), engine); err != nil {
		t.Fatalf("Parse returned error: %v", err)
	}
}
// TestTSRLabels pins the contents and order of defaultTSRLabels.
// According to the original note, the list mirrors the labels in Python's
// table_structure_recognizer.py and the order must be exact, because the ONNX
// model returns class IDs that index into this array.
func TestTSRLabels(t *testing.T) {
	expected := []string{
		"table",
		"table column",
		"table row",
		"table column header",
		"table projected row header",
		"table spanning cell",
	}

	// A length mismatch is fatal: the per-index comparison below would
	// otherwise run past the end of the shorter list.
	if got := len(defaultTSRLabels); got != len(expected) {
		t.Fatalf("defaultTSRLabels length %d, want %d", got, len(expected))
	}
	for idx, label := range expected {
		if got := defaultTSRLabels[idx]; got != label {
			t.Errorf("defaultTSRLabels[%d] = %q, want %q", idx, got, label)
		}
	}
}