Files

575 lines
22 KiB
Go
Raw Permalink Normal View History

package parser
import (
"context"
"image"
"strings"
"sync"
"testing"
lyt "ragflow/internal/deepdoc/parser/pdf/layout"
tbl "ragflow/internal/deepdoc/parser/pdf/table"
pdf "ragflow/internal/deepdoc/parser/pdf/type"
util "ragflow/internal/deepdoc/parser/pdf/util"
)
// ── OCR fallback ──────────────────────────────────────────────────────
func TestOCR_Fallback(t *testing.T) {
dummyImg := image.NewRGBA(image.Rect(0, 0, 100, 100))
t.Run("nil image", func(t *testing.T) {
if got := ocrDetectAndRecognize(context.Background(), nil, &MockDocAnalyzer{Healthy: true}, 0, "garbled page"); got != nil {
t.Error("nil image → nil")
}
})
t.Run("detect returns no boxes", func(t *testing.T) {
mock := &MockDocAnalyzer{Healthy: true, OCRBoxes: nil}
if got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "garbled page"); got != nil {
t.Error("no det boxes → nil")
}
})
t.Run("detect + recognize success", func(t *testing.T) {
mock := &MockDocAnalyzer{
Healthy: true,
OCRBoxes: []pdf.OCRBox{{X0: 10, Y0: 20, X1: 90, Y1: 20, X2: 90, Y2: 40, X3: 10, Y3: 40}},
OCRTexts: []pdf.OCRText{{Text: "Hello", Confidence: 0.9}},
}
got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "garbled page")
if len(got) != 1 {
t.Fatalf("expected 1 pdf.TextChar, got %d", len(got))
}
if got[0].Text != "Hello" {
t.Errorf("text = %q, want Hello", got[0].Text)
}
})
t.Run("detect boxes but rec returns empty text", func(t *testing.T) {
mock := &MockDocAnalyzer{
Healthy: true,
OCRBoxes: []pdf.OCRBox{{X0: 10, Y0: 20, X1: 90, Y1: 20, X2: 90, Y2: 40, X3: 10, Y3: 40}},
OCRTexts: []pdf.OCRText{{Text: "", Confidence: 0.1}},
}
got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "garbled page")
if len(got) != 0 {
t.Error("empty rec text → empty result")
}
})
}
// garbledSample (defined further down, after TestOCR_TableCell) returns chars
// that trigger IsGarbledByFontEncoding: ≥30% subset font, <5% CJK,
// >40% ASCII punctuation.
// ── OCR scan page ──────────────────────────────────────────────────────
func TestOCR_ScanPage(t *testing.T) {
dummyImg := image.NewRGBA(image.Rect(0, 0, 100, 100))
t.Run("nil image", func(t *testing.T) {
if got := ocrDetectAndRecognize(context.Background(), nil, &MockDocAnalyzer{Healthy: true}, 0, "scan page"); got != nil {
t.Error("nil image → nil")
}
})
t.Run("detect returns no boxes", func(t *testing.T) {
mock := &MockDocAnalyzer{Healthy: true, OCRBoxes: nil}
if got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "scan page"); got != nil {
t.Error("no det boxes → nil")
}
})
t.Run("detect + recognize success", func(t *testing.T) {
mock := &MockDocAnalyzer{
Healthy: true,
OCRBoxes: []pdf.OCRBox{
{X0: 10, Y0: 20, X1: 90, Y1: 20, X2: 90, Y2: 40, X3: 10, Y3: 40},
{X0: 10, Y0: 50, X1: 90, Y1: 50, X2: 90, Y2: 70, X3: 10, Y3: 70},
},
OCRTexts: []pdf.OCRText{{Text: "Hello", Confidence: 0.9}, {Text: "World", Confidence: 0.8}},
}
got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "scan page")
if len(got) < 1 {
t.Error("expected at least 1 pdf.TextChar")
}
})
t.Run("detect success but rec returns empty", func(t *testing.T) {
mock := &MockDocAnalyzer{
Healthy: true,
OCRBoxes: []pdf.OCRBox{{X0: 10, Y0: 20, X1: 90, Y1: 20, X2: 90, Y2: 40, X3: 10, Y3: 40}},
OCRTexts: []pdf.OCRText{},
}
got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "scan page")
if len(got) != 0 {
t.Error("no rec text → empty")
}
})
}
// ── OCR table cell ─────────────────────────────────────────────────────
// TestOCR_TableCell checks ocrTableCells: empty cells are filled from the
// mocked OCR text, cells that already carry text are left alone, and the call
// leaves cells untouched (without panicking) when the analyzer, the image, or
// the OCR result is missing.
func TestOCR_TableCell(t *testing.T) {
	ctx := context.Background()
	// blankCell returns a fresh slice holding one empty 100x50 cell.
	blankCell := func() []pdf.TSRCell {
		return []pdf.TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: ""}}
	}

	t.Run("fill single empty cell", func(t *testing.T) {
		row := []pdf.TSRCell{
			{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: ""},
			{X0: 100, Y0: 0, X1: 200, Y1: 50, Text: "已有"},
		}
		analyzer := &MockDocAnalyzer{
			Healthy:  true,
			OCRTexts: []pdf.OCRText{{Text: "识别结果", Confidence: 0.9}},
		}
		tableImg := image.NewRGBA(image.Rect(0, 0, 200, 50))
		ocrTableCells(ctx, row, tableImg, analyzer)
		if got := row[0].Text; got != "识别结果" {
			t.Errorf("empty cell not filled: %q", got)
		}
		if got := row[1].Text; got != "已有" {
			t.Errorf("filled cell changed: %q", got)
		}
	})

	t.Run("all cells already filled — no OCR", func(t *testing.T) {
		row := []pdf.TSRCell{
			{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "A"},
			{X0: 100, Y0: 0, X1: 200, Y1: 50, Text: "B"},
		}
		ocrTableCells(ctx, row, nil, nil) // should not panic
		if !(row[0].Text == "A" && row[1].Text == "B") {
			t.Error("filled cells should not change")
		}
	})

	t.Run("empty cells list", func(t *testing.T) {
		// Neither a nil nor an empty slice may panic.
		ocrTableCells(ctx, nil, nil, nil)
		ocrTableCells(ctx, []pdf.TSRCell{}, nil, nil)
	})

	t.Run("no DeepDoc — skip", func(t *testing.T) {
		row := blankCell()
		ocrTableCells(ctx, row, nil, nil)
		if row[0].Text != "" {
			t.Error("without DeepDoc, cell should stay empty")
		}
	})

	t.Run("no cropped image — skip", func(t *testing.T) {
		row := blankCell()
		analyzer := &MockDocAnalyzer{
			Healthy:  true,
			OCRTexts: []pdf.OCRText{{Text: "x", Confidence: 0.5}},
		}
		ocrTableCells(ctx, row, nil, analyzer)
		if row[0].Text != "" {
			t.Error("without image, cell should stay empty")
		}
	})

	t.Run("OCR returns empty string", func(t *testing.T) {
		row := blankCell()
		analyzer := &MockDocAnalyzer{Healthy: true, OCRTexts: []pdf.OCRText{}}
		tableImg := image.NewRGBA(image.Rect(0, 0, 100, 50))
		ocrTableCells(ctx, row, tableImg, analyzer)
		if row[0].Text != "" {
			t.Error("empty OCR result → cell stays empty")
		}
	})

	t.Run("cell out of image bounds", func(t *testing.T) {
		row := []pdf.TSRCell{{X0: 500, Y0: 500, X1: 600, Y1: 600, Text: ""}}
		analyzer := &MockDocAnalyzer{
			Healthy:  true,
			OCRTexts: []pdf.OCRText{{Text: "out of bounds", Confidence: 0.9}},
		}
		tableImg := image.NewRGBA(image.Rect(0, 0, 100, 100))
		// The only requirement is that the call does not panic; the resulting
		// text is logged, not asserted.
		ocrTableCells(ctx, row, tableImg, analyzer)
		t.Logf("out-of-bounds cell: text=%q", row[0].Text)
	})
}
// garbledSample returns one pdf.TextChar per ASCII punctuation mark, all set
// in the subset-prefixed font "ABCDEF+SimSun" and laid out left to right on a
// single line of page 0. Tests in this file use it as a page that
// util.IsGarbledPage reports as garbled.
func garbledSample() []pdf.TextChar {
	punctuation := []string{"!", "#", "$", "%", "&", "*", "+", "-", ".", "/",
		":", ";", "<", ">", "=", "?", "@", "^", "_", "~"}
	// Size the result from the list itself so that editing the list can never
	// cause an out-of-range write or leave zero-value chars at the tail.
	chars := make([]pdf.TextChar, len(punctuation))
	for i, p := range punctuation {
		// Each glyph is 8 units wide and placed on a 10-unit pitch.
		offset := float64(i * 10)
		chars[i] = pdf.TextChar{
			X0: 50 + offset, X1: 58 + offset,
			Top: 100, Bottom: 112,
			Text: p, FontName: "ABCDEF+SimSun", PageNumber: 0,
		}
	}
	return chars
}
// ── OCR fallback integration through Parse ─────────────────────────────
// TestOCR_FallbackIntegration is a placeholder: it performs no assertions and
// only logs where the OCR fallback path is actually covered.
func TestOCR_FallbackIntegration(t *testing.T) {
	// The OCR fallback logic itself is tested via TestOCR_Fallback.
	// The render+OCR path in Parse requires a real PDF + DeepDoc service, so it
	// is not exercised here. Garbled chars passing through Parse with only the
	// mock analyzer are covered by TestOCR_FallbackIntegration_NoDeepDoc.
	t.Log("OCR fallback Parse integration: tested via TestOCR_Fallback (logic) + live DeepDoc testing")
}
func TestOCR_FallbackIntegration_NoDeepDoc(t *testing.T) {
chars := garbledSample()
mockEng := &mockEngine{chars: map[int][]pdf.TextChar{0: chars}, pageCount: 1}
cfg := pdf.DefaultParserConfig()
p := NewParser(cfg, &MockDocAnalyzer{Healthy: true})
result, err := p.Parse(context.Background(), mockEng)
if err != nil {
t.Fatal(err)
}
t.Logf("garbled chars: %d sections", len(result.Sections))
}
// TestNoDeepDoc_PdfOxideUnmapped_KeepsChars parses a single page that mixes
// real CJK text with a short run of "###"-style unmapped-glyph noise, all in a
// non-subset font, and expects Parse to keep the chars (at least one section).
func TestNoDeepDoc_PdfOxideUnmapped_KeepsChars(t *testing.T) {
	// pdf_oxide ### unmapped glyphs mixed with real CJK text.
	// Without DeepDoc, isGarbledPage should return false (isScanNoise gate),
	// so chars are kept and sections > 0.
	const textLines = 20
	// Unmapped-glyph noise (no subset fonts), placed on the first text line.
	noise := []string{"#", "#", "#", "D", "_", "8", "-", ".", "*", "用"}

	chars := make([]pdf.TextChar, 0, textLines+len(noise))
	for line := 0; line < textLines; line++ {
		chars = append(chars, pdf.TextChar{
			Text: "测试文本", FontName: "SimSun",
			X0: 50, X1: 128,
			Top: float64(100 + line*15), Bottom: float64(112 + line*15),
		})
	}
	for k, glyph := range noise {
		// Each noise glyph is 8 units wide, starting at x=130.
		chars = append(chars, pdf.TextChar{
			Text: glyph, FontName: "SimSun",
			X0: float64(130 + 8*k), X1: float64(138 + 8*k),
			Top: 100, Bottom: 112,
		})
	}

	engine := &mockEngine{chars: map[int][]pdf.TextChar{0: chars}, pageCount: 1}
	psr := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true})
	result, err := psr.Parse(context.Background(), engine)
	if err != nil {
		t.Fatal(err)
	}
	sectionCount := len(result.Sections)
	if sectionCount == 0 {
		t.Error("pdf_oxide unmapped + CJK: expected >0 sections, got 0")
	}
	t.Logf("pdf_oxide unmapped + CJK: %d sections (chars kept)", sectionCount)
}
// TestIsGarbledPage is a table of pages and the verdict util.IsGarbledPage is
// expected to give for each: PUA-only and subset-font punctuation pages are
// garbled; plain text, CJK mixed with unmapped-glyph noise, and near-empty
// pages are not.
func TestIsGarbledPage(t *testing.T) {
	// repeated builds n identical page-0 chars carrying only the given text.
	repeated := func(text string, n int) []pdf.TextChar {
		out := make([]pdf.TextChar, n)
		for i := range out {
			out[i] = pdf.TextChar{Text: text, PageNumber: 0}
		}
		return out
	}

	// ### unmapped glyphs + real CJK text (no subset fonts), one char per rune.
	// isScanNoise returns false (≥2 consecutive CJK chars: "护理全科").
	var mixed []pdf.TextChar
	for _, r := range "和蔘语言###D_8-.*/*护理全科引用" {
		mixed = append(mixed, pdf.TextChar{Text: string(r), PageNumber: 0})
	}

	cases := []struct {
		name    string
		chars   []pdf.TextChar
		garbled bool
		failMsg string
	}{
		{"PUA dominant", repeated(string(rune(0xE000)), 50), true, "100% PUA → garbled"},
		{"font encoding", garbledSample(), true, "subset font → garbled"},
		{"normal text", repeated("a", 50), false, "normal text → not garbled"},
		{
			"pdf oxide unmapped + CJK — not garbled", mixed, false,
			"### unmapped + CJK text should NOT be garbled (no subset fonts)",
		},
		{"too few chars", []pdf.TextChar{{Text: " ", PageNumber: 0}}, false, "< 20 chars → not garbled"},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if util.IsGarbledPage(tc.chars) != tc.garbled {
				t.Error(tc.failMsg)
			}
		})
	}
}
// TestOCR_Fallback_PUAGarbled ties the two halves of the PUA fallback
// together: a page made only of Private Use Area code points must be
// classified as garbled, and the OCR path invoked with the "garbled page"
// reason must return the recognized text.
func TestOCR_Fallback_PUAGarbled(t *testing.T) {
	pua := make([]pdf.TextChar, 50)
	for i := range pua {
		pua[i] = pdf.TextChar{Text: string(rune(0xE000 + i%10)), PageNumber: 0}
	}
	// Precondition: the sample was previously built and then never used, so
	// the test never established that this input is what triggers the fallback.
	if !util.IsGarbledPage(pua) {
		t.Fatal("PUA-only page should be detected as garbled")
	}

	dummyImg := image.NewRGBA(image.Rect(0, 0, 100, 100))
	mock := &MockDocAnalyzer{
		Healthy:  true,
		OCRBoxes: []pdf.OCRBox{{X0: 10, Y0: 20, X1: 90, Y1: 20, X2: 90, Y2: 40, X3: 10, Y3: 40}},
		OCRTexts: []pdf.OCRText{{Text: "PUA OCR text", Confidence: 0.9}},
	}
	got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "garbled page")
	if len(got) != 1 || got[0].Text != "PUA OCR text" {
		t.Errorf("PUA garbled should trigger OCR, got %v", got)
	}
}
// ── ocrMergeChars ─────────────────────────────────────────────────────
// TestOCR_MergeChars covers ocrMergeChars, which combines OCR-detected boxes
// with the chars embedded in the PDF. The subtests pin down when a box takes
// its text from overlapping embedded chars and when it takes it from OCR
// recognition instead.
func TestOCR_MergeChars(t *testing.T) {
	ctx := context.Background()
	page := image.NewRGBA(image.Rect(0, 0, 600, 600))

	t.Run("nil image", func(t *testing.T) {
		embedded := []pdf.TextChar{{X0: 10, Top: 10, X1: 20, Bottom: 30, Text: "A", PageNumber: 0}}
		merged := ocrMergeChars(ctx, nil, embedded, &MockDocAnalyzer{Healthy: true}, 0)
		if merged != nil {
			t.Error("nil image → nil")
		}
	})

	t.Run("detect returns no boxes", func(t *testing.T) {
		analyzer := &MockDocAnalyzer{Healthy: true, OCRBoxes: []pdf.OCRBox{}}
		embedded := []pdf.TextChar{{X0: 10, Top: 10, X1: 20, Bottom: 30, Text: "A", PageNumber: 0}}
		merged := ocrMergeChars(ctx, page, embedded, analyzer, 0)
		if merged != nil {
			t.Error("no detect boxes → nil")
		}
	})

	t.Run("detect boxes — all overlap with chars (chars used, Python-aligned)", func(t *testing.T) {
		analyzer := &MockDocAnalyzer{
			Healthy: true,
			OCRBoxes: []pdf.OCRBox{
				{X0: 15, Y0: 15, X1: 150, Y1: 15, X2: 150, Y2: 150, X3: 15, Y3: 150},
			},
			OCRTexts: []pdf.OCRText{{Text: "Hello OCR", Confidence: 0.9}},
		}
		embedded := []pdf.TextChar{
			{X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0},
		}
		merged := ocrMergeChars(ctx, page, embedded, analyzer, 0)
		if n := len(merged); n != 1 {
			t.Fatalf("expected 1 box, got %d", n)
		}
		// The embedded chars win over the OCR text because they are more precise.
		if got := merged[0].Text; got != "Hello" {
			t.Errorf("expected char text 'Hello', got %q", got)
		}
	})

	t.Run("detect boxes — none overlap with chars", func(t *testing.T) {
		analyzer := &MockDocAnalyzer{
			Healthy: true,
			OCRBoxes: []pdf.OCRBox{
				{X0: 240, Y0: 240, X1: 270, Y1: 240, X2: 270, Y2: 270, X3: 240, Y3: 270},
			},
			OCRTexts: []pdf.OCRText{{Text: "OCR", Confidence: 0.9}},
		}
		embedded := []pdf.TextChar{
			{X0: 10, X1: 20, Top: 10, Bottom: 20, Text: "A", PageNumber: 0},
		}
		merged := ocrMergeChars(ctx, page, embedded, analyzer, 0)
		if n := len(merged); n != 1 {
			t.Fatalf("expected 1 box (OCR), got %d", n)
		}
		if got := merged[0].Text; got != "OCR" {
			t.Errorf("expected OCR text 'OCR', got %q", got)
		}
	})

	t.Run("detect box — no chars and OCR returns empty", func(t *testing.T) {
		analyzer := &MockDocAnalyzer{
			Healthy: true,
			OCRBoxes: []pdf.OCRBox{
				{X0: 240, Y0: 240, X1: 270, Y1: 240, X2: 270, Y2: 270, X3: 240, Y3: 270},
			},
			OCRTexts: []pdf.OCRText{},
		}
		embedded := []pdf.TextChar{
			{X0: 10, X1: 20, Top: 10, Bottom: 20, Text: "A", PageNumber: 0},
		}
		merged := ocrMergeChars(ctx, page, embedded, analyzer, 0)
		if n := len(merged); n != 0 {
			t.Fatalf("expected 0 boxes (empty OCR), got %d", n)
		}
	})

	t.Run("multiple detect boxes — one with chars, one OCR", func(t *testing.T) {
		// The first box overlaps the embedded chars and uses their text; the
		// second box overlaps nothing and is filled by OCR.
		analyzer := &MockDocAnalyzer{
			Healthy: true,
			OCRBoxes: []pdf.OCRBox{
				{X0: 15, Y0: 15, X1: 150, Y1: 15, X2: 150, Y2: 150, X3: 15, Y3: 150},
				{X0: 240, Y0: 240, X1: 270, Y1: 240, X2: 270, Y2: 270, X3: 240, Y3: 270},
			},
			OCRTexts: []pdf.OCRText{
				{Text: "box 1 text", Confidence: 0.9},
			},
		}
		embedded := []pdf.TextChar{
			{X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0},
		}
		merged := ocrMergeChars(ctx, page, embedded, analyzer, 0)
		if n := len(merged); n != 2 {
			t.Fatalf("expected 2 boxes, got %d", n)
		}
		// Index 0 has chars, so it carries the char text.
		if got := merged[0].Text; got != "Hello" {
			t.Errorf("box[0] expected char text 'Hello', got %q", got)
		}
		// Index 1 has no chars, so it carries the OCR text.
		if got := merged[1].Text; got != "box 1 text" {
			t.Errorf("box[1] expected OCR 'box 1 text', got %q", got)
		}
	})

	t.Run("chars in box — sorted by reading order (top→x0)", func(t *testing.T) {
		// Box 1 (pixel Y=30-90 → PDF 10-30) overlaps char "a" at (10,10-30).
		// Box 2 (pixel Y=330-390 → PDF 110-130) overlaps char "c" at (70,110-130).
		// The chars are supplied in the opposite order on purpose.
		analyzer := &MockDocAnalyzer{
			Healthy: true,
			OCRBoxes: []pdf.OCRBox{
				{X0: 15, Y0: 30, X1: 90, Y1: 30, X2: 90, Y2: 90, X3: 15, Y3: 90},
				{X0: 75, Y0: 330, X1: 300, Y1: 330, X2: 300, Y2: 390, X3: 75, Y3: 390},
			},
		}
		embedded := []pdf.TextChar{
			{X0: 70, X1: 90, Top: 110, Bottom: 130, Text: "c", PageNumber: 0},
			{X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "a", PageNumber: 0},
		}
		merged := ocrMergeChars(ctx, page, embedded, analyzer, 0)
		if n := len(merged); n != 2 {
			t.Fatalf("expected 2 detect boxes, got %d", n)
		}
		// Every box ends up with the text of the char it overlaps.
		if got := merged[0].Text; got != "a" {
			t.Errorf("box[0] expected 'a', got %q", got)
		}
		if got := merged[1].Text; got != "c" {
			t.Errorf("box[1] expected 'c', got %q", got)
		}
	})

	t.Run("height mismatch — chars with very different height excluded", func(t *testing.T) {
		// Box pixel Y=75-165 → PDF 25-55, height=30.
		// Char A: height=20, diff=10/30=0.33 < 0.7 → kept.
		// Char B: height=100, diff=70/100=0.70 ≥ 0.7 → excluded.
		analyzer := &MockDocAnalyzer{
			Healthy: true,
			OCRBoxes: []pdf.OCRBox{
				{X0: 15, Y0: 75, X1: 150, Y1: 75, X2: 150, Y2: 165, X3: 15, Y3: 165},
			},
			OCRTexts: []pdf.OCRText{{Text: "OCR height test", Confidence: 0.9}},
		}
		embedded := []pdf.TextChar{
			{X0: 10, X1: 30, Top: 30, Bottom: 50, Text: "A", PageNumber: 0},
			{X0: 40, X1: 60, Top: 20, Bottom: 120, Text: "B", PageNumber: 0},
		}
		merged := ocrMergeChars(ctx, page, embedded, analyzer, 0)
		if n := len(merged); n != 1 {
			t.Fatalf("expected 1 box, got %d", n)
		}
		// 'A' is the only match; the height gate drops 'B'.
		if got := merged[0].Text; got != "A" {
			t.Errorf("expected 'A' (B excluded by height gate), got %q", got)
		}
	})

	t.Run("garbled chars — box text cleared for OCR recognize", func(t *testing.T) {
		analyzer := &MockDocAnalyzer{
			Healthy: true,
			OCRBoxes: []pdf.OCRBox{
				{X0: 15, Y0: 15, X1: 450, Y1: 15, X2: 450, Y2: 450, X3: 15, Y3: 450},
			},
			OCRTexts: []pdf.OCRText{{Text: "OCR result", Confidence: 0.9}},
		}
		// NOTE(review): the first two chars carry an empty Text here — confirm
		// that this is the intended "garbled" input for this case.
		embedded := []pdf.TextChar{
			{X0: 10, X1: 20, Top: 10, Bottom: 20, Text: "", PageNumber: 0},
			{X0: 30, X1: 40, Top: 10, Bottom: 20, Text: "", PageNumber: 0},
			{X0: 50, X1: 60, Top: 10, Bottom: 20, Text: "a", PageNumber: 0},
		}
		merged := ocrMergeChars(ctx, page, embedded, analyzer, 0)
		if n := len(merged); n != 1 {
			t.Fatalf("expected 1 box, got %d", n)
		}
		if got := merged[0].Text; got != "OCR result" {
			t.Errorf("expected 'OCR result' (garbled majority -> OCR), got %q", got)
		}
	})

	t.Run("OCR text preserves word spacing", func(t *testing.T) {
		// Detect box at (pixel 30,30 → 90,90 → PDF 10,10 → 30,30).
		// Both chars at (10,10-25) fall within that region, so the char text
		// "do" is used (Python-aligned: embedded chars are more precise than
		// OCR) and the longer OCR string is ignored.
		analyzer := &MockDocAnalyzer{
			Healthy: true,
			OCRBoxes: []pdf.OCRBox{
				{X0: 30, Y0: 30, X1: 90, Y1: 30, X2: 90, Y2: 90, X3: 30, Y3: 90},
			},
			OCRTexts: []pdf.OCRText{{Text: "docker commit infiniflow", Confidence: 0.95}},
		}
		embedded := []pdf.TextChar{
			{Text: "d", X0: 10, X1: 20, Top: 10, Bottom: 25, PageNumber: 0},
			{Text: "o", X0: 21, X1: 30, Top: 10, Bottom: 25, PageNumber: 0},
		}
		merged := ocrMergeChars(ctx, page, embedded, analyzer, 0)
		if n := len(merged); n != 1 {
			t.Fatalf("expected 1 box, got %d", n)
		}
		// Char text is used (Python-aligned).
		if got := merged[0].Text; got != "do" {
			t.Errorf("expected char text 'do', got %q", got)
		}
	})
}
// TestTableSectionCaptionInHTML verifies that MergeCaptions prepends the table
// caption text before the HTML table, matching Python's caption handling.
// It fails cleanly (instead of panicking on an index out of range) if the
// pipeline yields no sections at all.
func TestTableSectionCaptionInHTML(t *testing.T) {
	// Simulate pipeline order: extractTableAndReplace → boxesToSections → mergeCaptions
	boxes := []pdf.TextBox{
		{X0: 100, X1: 500, Top: 200, Bottom: 400, LayoutType: "table", PageNumber: 0},
	}
	ti := pdf.TableItem{
		Cells: []pdf.TSRCell{
			{X0: 0, Y0: 0, X1: 200, Y1: 50, Label: "table row", Text: "飞机"},
			{X0: 0, Y0: 51, X1: 200, Y1: 100, Label: "table row", Text: "火车"},
		},
		Positions: []pdf.Position{{Left: 100, Right: 500, Top: 200, Bottom: 400}},
		Scale:     1.0,
	}
	// Step 1: extractTableAndReplace → HTML box with table text
	boxes = tbl.ExtractTableAndReplace(boxes, []pdf.TableItem{ti})
	sections := lyt.BoxesToSections(boxes, nil)
	// Add the caption section, positioned just above the table.
	sections = append(sections, pdf.Section{
		LayoutType: "table caption",
		Positions:  []pdf.Position{{Left: 100, Right: 500, Top: 180, Bottom: 198}},
		Text:       "表1: 交通工具等级",
	})
	// Step 2: mergeCaptions prepends caption before HTML
	figures := pdf.CollectFigures(sections)
	sections = tbl.MergeCaptions(sections, figures)
	// Guard the index below: an empty result is a test failure, not a panic.
	if len(sections) == 0 {
		t.Fatal("expected at least one section after MergeCaptions, got 0")
	}
	if !strings.HasPrefix(sections[0].Text, "表1: 交通工具等级<table>") {
		t.Errorf("expected caption before table HTML, got %q", sections[0].Text)
	}
}
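// NOTE(review): the description below is for TestBoxMatchesCell_FalsePositive,
// which does not follow in this file — presumably it was moved or removed;
// TODO confirm and relocate or drop this comment.
// TestBoxMatchesCell_FalsePositive verifies that boxMatchesCell rejects
// text boxes that are mostly OUTSIDE the cell, even with cellIsEmpty=true.
// The 0.3 threshold should not match a wide box that barely touches a
// narrow cell — this would cause body text to leak into table cells.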
// TestParser_ConcurrentSafety hammers a single shared Parser from several
// goroutines at once: 8 workers each call Parse 5 times. It makes no
// assertions of its own and is meant to be run with -race.
func TestParser_ConcurrentSafety(t *testing.T) {
	const (
		workers        = 8
		callsPerWorker = 5
	)
	shared := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: false})

	var wg sync.WaitGroup
	wg.Add(workers)
	for w := 0; w < workers; w++ {
		go func() {
			defer wg.Done()
			for call := 0; call < callsPerWorker; call++ {
				// Every call gets its own engine; only the Parser is shared.
				// The result and error are deliberately discarded: this test
				// only looks for data races, not for parse outcomes.
				engine := &mockEngine{pageCount: 2}
				_, _ = shared.Parse(context.Background(), engine)
			}
		}()
	}
	wg.Wait()
}