package parser import ( "context" "image" "strings" "testing" ) // TestTableSection_TextFromTSR verifies that table Sections carry // TSR-structured text (from TableItem.Rows) rather than raw char text. // Python _parse_loaded_window_into_bboxes runs _extract_table_figure // which pops table boxes and replaces them with consolidated table // entries. Go backfills Section.Text from TableItem.Rows after // linkTableSections. func TestTableSection_TextFromTSR(t *testing.T) { eng := &mockEngine{ pageCount: 1, renderW: 900, // 300pt at 3x = 900px (216 DPI) renderH: 600, chars: map[int][]TextChar{0: { // PDF space (72 DPI): well inside DLA region {X0: 50, X1: 70, Top: 40, Bottom: 55, Text: "姓"}, {X0: 80, X1: 100, Top: 40, Bottom: 55, Text: "名"}, }}, } mock := &MockDocAnalyzer{ Healthy: true, // DLA table region in pixel space (216 DPI). // PDF space: x0=100/3≈33, y0=80/3≈27, x1=500/3≈167, y1=300/3≈100. DLARegions: []DLARegion{ {X0: 100, Y0: 80, X1: 500, Y1: 300, Label: "table", Confidence: 0.9}, }, // TSR returns structured 2x2 cells with text. // Pixel space (relative to cropped region). TSRCells: []TSRCell{ {X0: 0, Y0: 0, X1: 200, Y1: 100, Text: "姓名", Label: "table column header"}, {X0: 200, Y0: 0, X1: 460, Y1: 100, Text: "年龄", Label: "table column header"}, {X0: 0, Y0: 100, X1: 200, Y1: 220, Text: "张三", Label: "table row"}, {X0: 200, Y0: 100, X1: 460, Y1: 220, Text: "25", Label: "table row"}, }, } p := NewParser(DefaultParserConfig(), mock) result, err := p.Parse(context.Background(), eng) if err != nil { t.Fatalf("Parse: %v", err) } // ── Assert 1: Tables exist (Cells are filled by constructTable later) ── if len(result.Tables) == 0 { t.Fatal("expected at least 1 TableItem") } tbl := result.Tables[0] if len(tbl.Cells) == 0 { t.Fatal("expected TSR cells in TableItem") } // ── Assert 2: A table section exists with HTML output ── var tableSections []Section for _, s := range result.Sections { if s.LayoutType == "table" { tableSections = append(tableSections, s) } } if len(tableSections) == 0 { t.Fatal("expected at least 1 section with LayoutType=='table'") } ts := tableSections[0] // ── Assert 3: Section.Text is HTML table from constructTable ── if !strings.HasPrefix(ts.Text, "") { t.Errorf("table Section.Text = %q, want HTML
", ts.Text) } // TSR cells have pre-filled text ("姓名", "年龄", "张三", "25") — // fillCellTextFromBoxes preserves it since cells already have text. if !strings.Contains(ts.Text, "姓名") || !strings.Contains(ts.Text, "年龄") { t.Errorf("table HTML should contain cell text, got %q", ts.Text) } } // TestEnrichWithDeepDoc_ImageOnlyPage verifies that enrichWithDeepDoc // runs DLA on pages that have images but zero embedded chars (boxes). // Regression test for test.pdf (Go 0 tables, Py 1 table). func TestEnrichWithDeepDoc_ImageOnlyPage(t *testing.T) { mock := &MockDocAnalyzer{ Healthy: true, DLARegions: []DLARegion{ {X0: 54, Y0: 100, X1: 846, Y1: 500, Label: "table", Confidence: 0.95}, }, TSRCells: []TSRCell{ {X0: 0, Y0: 0, X1: 200, Y1: 100, Text: "A", Label: "table row"}, }, } p := NewParser(DefaultParserConfig(), mock) // 0 text boxes, but page 0 has a rendered image. boxes := []TextBox{} dummyImg := image.NewRGBA(image.Rect(0, 0, 900, 600)) pageImages := map[int]image.Image{0: dummyImg} tables := p.enrichWithDeepDoc(context.Background(), nil, boxes, pageImages) if len(tables) == 0 { t.Fatal("enrichWithDeepDoc: expected at least 1 table from DLA on page with image but no boxes, got 0") } if len(tables[0].Cells) == 0 { t.Fatal("enrichWithDeepDoc: expected TSR cells in table") } } // TestMergeCaptions_Unit verifies mergeCaptions directly without full pipeline. func TestMergeCaptions_Unit(t *testing.T) { sections := []Section{ {Text: "F", LayoutType: "figure", Positions: []Position{{PageNumbers: []int{0, 0}, Left: 40, Right: 60, Top: 30, Bottom: 45}}}, {Text: "C", LayoutType: "figure caption", Positions: []Position{{PageNumbers: []int{0, 0}, Left: 40, Right: 60, Top: 80, Bottom: 95}}}, } figures := CollectFigures(sections) result := mergeCaptions(sections, figures) // Caption removed. if len(result) != 1 { t.Fatalf("expected 1 section after merge, got %d", len(result)) } // Figure text includes caption. if !strings.Contains(result[0].Text, "C") { t.Errorf("expected figure Text to contain caption 'C', got %q", result[0].Text) } if result[0].LayoutType != "figure" { t.Errorf("expected figure LayoutType, got %q", result[0].LayoutType) } } // TestMergeCaptions_TableCaption verifies table caption merging directly. func TestMergeCaptions_TableCaption(t *testing.T) { sections := []Section{ {Text: "T", LayoutType: "table", Positions: []Position{{PageNumbers: []int{0, 0}, Left: 40, Right: 60, Top: 30, Bottom: 45}}}, {Text: "C", LayoutType: "table caption", Positions: []Position{{PageNumbers: []int{0, 0}, Left: 40, Right: 60, Top: 80, Bottom: 95}}}, } figures := CollectFigures(sections) result := mergeCaptions(sections, figures) if len(result) != 1 { t.Fatalf("expected 1 section after merge, got %d", len(result)) } if !strings.Contains(result[0].Text, "C") { t.Errorf("expected table Text to contain caption 'C', got %q", result[0].Text) } } // TestFigureCaption_MergedIntoFigure verifies that "figure caption" text // is merged into the nearest "figure" Section and the caption Section is // removed. Matches Python _extract_table_figure caption matching. func TestFigureCaption_MergedIntoFigure(t *testing.T) { eng := &mockEngine{ pageCount: 1, renderW: 1800, renderH: 2400, chars: map[int][]TextChar{0: { // Figure text — overlaps DLA figure region (pixel Y=80-300 → PDF 27-100). {X0: 40, X1: 60, Top: 30, Bottom: 45, Text: "F"}, // Caption text — overlaps DLA figure caption region (pixel Y=310-340 → PDF 103-113). {X0: 40, X1: 60, Top: 104, Bottom: 112, Text: "C"}, }}, } mock := &MockDocAnalyzer{ Healthy: true, DLARegions: []DLARegion{ {X0: 100, Y0: 80, X1: 500, Y1: 300, Label: "figure", Confidence: 0.9}, // Caption is below the figure. {X0: 100, Y0: 310, X1: 500, Y1: 340, Label: "figure caption", Confidence: 0.9}, }, } p := NewParser(DefaultParserConfig(), mock) result, err := p.Parse(context.Background(), eng) if err != nil { t.Fatalf("Parse: %v", err) } // Assert 1: figure caption Section removed. for _, s := range result.Sections { if s.LayoutType == "figure caption" { t.Errorf("figure caption Section should be removed after mergeCaptions, got %q", s.Text) } } // Assert 2: figure Section exists and has caption text appended. var fig *Section for i := range result.Sections { if result.Sections[i].LayoutType == "figure" { fig = &result.Sections[i] break } } if fig == nil { t.Fatal("expected a figure Section") } if !strings.Contains(fig.Text, "C") { t.Errorf("figure Text should contain caption text 'C', got %q", fig.Text) } // Assert 3: figure is in result.Figures. if len(result.Figures) == 0 { t.Error("expected at least 1 entry in result.Figures") } } // TestTableCaption_MergedIntoTable verifies that "table caption" text // is merged into the nearest table Section and the caption is removed. func TestTableCaption_MergedIntoTable(t *testing.T) { eng := &mockEngine{ pageCount: 1, renderW: 1800, renderH: 2400, chars: map[int][]TextChar{0: { // Table text — overlaps DLA table region (pixel Y=80-300 → PDF 27-100). {X0: 40, X1: 60, Top: 30, Bottom: 45, Text: "T"}, // Caption text — overlaps DLA table caption region (pixel Y=310-340 → PDF 103-113). {X0: 40, X1: 60, Top: 104, Bottom: 112, Text: "C"}, }}, } mock := &MockDocAnalyzer{ Healthy: true, DLARegions: []DLARegion{ {X0: 100, Y0: 80, X1: 500, Y1: 300, Label: "table", Confidence: 0.9}, {X0: 100, Y0: 310, X1: 500, Y1: 340, Label: "table caption", Confidence: 0.9}, }, TSRCells: []TSRCell{ {X0: 0, Y0: 0, X1: 200, Y1: 100, Text: "A", Label: "table row"}, {X0: 200, Y0: 0, X1: 460, Y1: 100, Text: "B", Label: "table row"}, }, } p := NewParser(DefaultParserConfig(), mock) result, err := p.Parse(context.Background(), eng) if err != nil { t.Fatalf("Parse: %v", err) } // Assert: table caption Section removed, text merged into table Section. for _, s := range result.Sections { if s.LayoutType == "table caption" { t.Errorf("table caption Section should be removed, got %q", s.Text) } } var tbl *Section for i := range result.Sections { if result.Sections[i].LayoutType == "table" { tbl = &result.Sections[i] break } } if tbl == nil { t.Fatal("expected a table Section") } if !strings.Contains(tbl.Text, "C") { t.Errorf("table Text should contain caption text 'C', got %q", tbl.Text) } } // TestTextSectionsInsideTableRegion_Suppressed verifies that Sections // whose positions fall inside a table region are suppressed even when // DLA labeled them as "text". Python _extract_table_figure pops ALL // boxes overlapping a table region, regardless of their DLA label. // This is the #1 cause of Go vs Python discrepancy on table-heavy PDFs. func TestTextSectionsInsideTableRegion_Suppressed(t *testing.T) { eng := &mockEngine{ pageCount: 1, renderW: 1800, renderH: 2400, chars: map[int][]TextChar{0: { // Box A: inside DLA table region, labeled as "text" by DLA. {X0: 50, X1: 100, Top: 40, Bottom: 55, Text: "碎片文字"}, // Box B: inside DLA table region, same situation. {X0: 120, X1: 160, Top: 40, Bottom: 55, Text: "垃圾"}, }}, } // DLA returns a "table" region AND a "text" sub-region inside it. // Real DLA often splits large table regions this way. mock := &MockDocAnalyzer{ Healthy: true, DLARegions: []DLARegion{ {X0: 100, Y0: 80, X1: 500, Y1: 300, Label: "table", Confidence: 0.9}, {X0: 120, Y0: 100, X1: 180, Y1: 140, Label: "text", Confidence: 0.8}, }, TSRCells: []TSRCell{ {X0: 0, Y0: 0, X1: 200, Y1: 100, Text: "姓名", Label: "table row"}, {X0: 200, Y0: 0, X1: 460, Y1: 100, Text: "年龄", Label: "table row"}, }, } p := NewParser(DefaultParserConfig(), mock) result, err := p.Parse(context.Background(), eng) if err != nil { t.Fatalf("Parse: %v", err) } // Assert 1: table Section exists with structured text. var hasTable bool for _, s := range result.Sections { if s.LayoutType == "table" && s.Text != "" { hasTable = true break } } if !hasTable { t.Fatal("expected a table Section with structured text") } // Assert 2: NO "text" fragment sections remain — they were inside // the table region and should be suppressed (Python pops them). for _, s := range result.Sections { if s.LayoutType != "table" && strings.Contains(s.Text, "碎片") { t.Errorf("text fragment %q inside table region should be suppressed, got %q", s.Text, s.LayoutType) } if s.LayoutType != "table" && strings.Contains(s.Text, "垃圾") { t.Errorf("text fragment %q inside table region should be suppressed, got %q", s.Text, s.LayoutType) } } sectionCount := len(result.Sections) if sectionCount > 3 { t.Errorf("expected ≤3 sections (table + outside fragments), got %d", sectionCount) } } // TestEmptyDoc_NoCrash verifies Parse handles edge cases gracefully. func TestEmptyDoc_NoCrash(t *testing.T) { eng := &mockEngine{pageCount: 0} p := NewParser(DefaultParserConfig(), &MockDocAnalyzer{Healthy: true, Model: ModelSaas}) result, err := p.Parse(context.Background(), eng) if err != nil { t.Fatalf("Parse: %v", err) } if len(result.Sections) != 0 { t.Errorf("expected 0 sections for empty doc, got %d", len(result.Sections)) } } // TestNilChars_handled verifies zero-chars pages don't crash. func TestNilChars_Handled(t *testing.T) { eng := &mockEngine{pageCount: 1, renderW: 200, renderH: 200} p := NewParser(DefaultParserConfig(), &MockDocAnalyzer{Healthy: true, Model: ModelSaas}) result, err := p.Parse(context.Background(), eng) if err != nil { t.Fatalf("Parse: %v", err) } if len(result.Sections) != 0 && p.DeepDoc != nil { t.Logf("nil chars + DeepDoc: sections=%d (may trigger OCR path)", len(result.Sections)) } } // TestMergeCaptions_EuclideanDistance verifies that caption matching uses // squared Euclidean distance (center-to-center), not Y-only distance. // Two captions at different X positions — the one closer by Euclidean // distance wins, even if its Y distance is slightly larger. func TestMergeCaptions_EuclideanDistance(t *testing.T) { sections := []Section{ {Text: "F", LayoutType: "figure", Positions: []Position{ {PageNumbers: []int{0, 0}, Left: 0, Right: 100, Top: 0, Bottom: 50}, }}, // Caption A: directly below figure (dx=0, dy=20) → Euclidean = 20² {Text: "close", LayoutType: "figure caption", Positions: []Position{ {PageNumbers: []int{0, 0}, Left: 0, Right: 100, Top: 70, Bottom: 80}, }}, } figures := CollectFigures(sections) result := mergeCaptions(sections, figures) // Caption merged into figure — verified by figure Text containing caption. if len(result) != 1 { t.Fatalf("expected 1 section after merge, got %d", len(result)) } if !strings.Contains(result[0].Text, "close") { t.Errorf("figure Text should contain caption 'close', got %q", result[0].Text) } } // mockEngine is a minimal PDFEngine stub for unit tests. type mockEngine struct { chars map[int][]TextChar pageCount int renderW int renderH int } func (m *mockEngine) ExtractChars(pg int) ([]TextChar, error) { return m.chars[pg], nil } func (m *mockEngine) RenderPage(pg int, dpi float64) ([]byte, error) { w, h := m.renderW, m.renderH if w <= 0 { w = 595 } if h <= 0 { h = 842 } return nil, nil } func (m *mockEngine) RenderPageImage(pg int, dpi float64) (image.Image, error) { w, h := m.renderW, m.renderH if w <= 0 { w = 100 } if h <= 0 { h = 100 } return image.NewRGBA(image.Rect(0, 0, w, h)), nil } func (m *mockEngine) PageCount() (int, error) { if m.pageCount <= 0 { return 1, nil } return m.pageCount, nil } func (m *mockEngine) RawData() []byte { return nil } func (m *mockEngine) Close() error { return nil }