package parser import ( "context" "image" "strings" "sync" "testing" lyt "ragflow/internal/deepdoc/parser/pdf/layout" tbl "ragflow/internal/deepdoc/parser/pdf/table" pdf "ragflow/internal/deepdoc/parser/pdf/type" util "ragflow/internal/deepdoc/parser/pdf/util" ) // ── OCR fallback ────────────────────────────────────────────────────── func TestOCR_Fallback(t *testing.T) { dummyImg := image.NewRGBA(image.Rect(0, 0, 100, 100)) t.Run("nil image", func(t *testing.T) { if got := ocrDetectAndRecognize(context.Background(), nil, &MockDocAnalyzer{Healthy: true}, 0, "garbled page"); got != nil { t.Error("nil image → nil") } }) t.Run("detect returns no boxes", func(t *testing.T) { mock := &MockDocAnalyzer{Healthy: true, OCRBoxes: nil} if got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "garbled page"); got != nil { t.Error("no det boxes → nil") } }) t.Run("detect + recognize success", func(t *testing.T) { mock := &MockDocAnalyzer{ Healthy: true, OCRBoxes: []pdf.OCRBox{{X0: 10, Y0: 20, X1: 90, Y1: 20, X2: 90, Y2: 40, X3: 10, Y3: 40}}, OCRTexts: []pdf.OCRText{{Text: "Hello", Confidence: 0.9}}, } got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "garbled page") if len(got) != 1 { t.Fatalf("expected 1 pdf.TextChar, got %d", len(got)) } if got[0].Text != "Hello" { t.Errorf("text = %q, want Hello", got[0].Text) } }) t.Run("detect boxes but rec returns empty text", func(t *testing.T) { mock := &MockDocAnalyzer{ Healthy: true, OCRBoxes: []pdf.OCRBox{{X0: 10, Y0: 20, X1: 90, Y1: 20, X2: 90, Y2: 40, X3: 10, Y3: 40}}, OCRTexts: []pdf.OCRText{{Text: "", Confidence: 0.1}}, } got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "garbled page") if len(got) != 0 { t.Error("empty rec text → empty result") } }) } // garbledSample returns chars that trigger IsGarbledByFontEncoding: // ≥30% subset font, <5% CJK, >40% ASCII punctuation. // ── OCR scan page ────────────────────────────────────────────────────── func TestOCR_ScanPage(t *testing.T) { dummyImg := image.NewRGBA(image.Rect(0, 0, 100, 100)) t.Run("nil image", func(t *testing.T) { if got := ocrDetectAndRecognize(context.Background(), nil, &MockDocAnalyzer{Healthy: true}, 0, "scan page"); got != nil { t.Error("nil image → nil") } }) t.Run("detect returns no boxes", func(t *testing.T) { mock := &MockDocAnalyzer{Healthy: true, OCRBoxes: nil} if got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "scan page"); got != nil { t.Error("no det boxes → nil") } }) t.Run("detect + recognize success", func(t *testing.T) { mock := &MockDocAnalyzer{ Healthy: true, OCRBoxes: []pdf.OCRBox{ {X0: 10, Y0: 20, X1: 90, Y1: 20, X2: 90, Y2: 40, X3: 10, Y3: 40}, {X0: 10, Y0: 50, X1: 90, Y1: 50, X2: 90, Y2: 70, X3: 10, Y3: 70}, }, OCRTexts: []pdf.OCRText{{Text: "Hello", Confidence: 0.9}, {Text: "World", Confidence: 0.8}}, } got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "scan page") if len(got) < 1 { t.Error("expected at least 1 pdf.TextChar") } }) t.Run("detect success but rec returns empty", func(t *testing.T) { mock := &MockDocAnalyzer{ Healthy: true, OCRBoxes: []pdf.OCRBox{{X0: 10, Y0: 20, X1: 90, Y1: 20, X2: 90, Y2: 40, X3: 10, Y3: 40}}, OCRTexts: []pdf.OCRText{}, } got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "scan page") if len(got) != 0 { t.Error("no rec text → empty") } }) } // ── OCR table cell ───────────────────────────────────────────────────── func TestOCR_TableCell(t *testing.T) { t.Run("fill single empty cell", func(t *testing.T) { cells := []pdf.TSRCell{ {X0: 0, Y0: 0, X1: 100, Y1: 50, Text: ""}, {X0: 100, Y0: 0, X1: 200, Y1: 50, Text: "已有"}, } mock := &MockDocAnalyzer{Healthy: true, OCRTexts: []pdf.OCRText{{Text: "识别结果", Confidence: 0.9}}} dummy := image.NewRGBA(image.Rect(0, 0, 200, 50)) ocrTableCells(context.Background(), cells, dummy, mock) if cells[0].Text != "识别结果" { t.Errorf("empty cell not filled: %q", cells[0].Text) } if cells[1].Text != "已有" { t.Errorf("filled cell changed: %q", cells[1].Text) } }) t.Run("all cells already filled — no OCR", func(t *testing.T) { cells := []pdf.TSRCell{ {X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "A"}, {X0: 100, Y0: 0, X1: 200, Y1: 50, Text: "B"}, } ocrTableCells(context.Background(), cells, nil, nil) // should not panic if cells[0].Text != "A" || cells[1].Text != "B" { t.Error("filled cells should not change") } }) t.Run("empty cells list", func(t *testing.T) { ocrTableCells(context.Background(), nil, nil, nil) // should not panic ocrTableCells(context.Background(), []pdf.TSRCell{}, nil, nil) }) t.Run("no DeepDoc — skip", func(t *testing.T) { cells := []pdf.TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: ""}} ocrTableCells(context.Background(), cells, nil, nil) if cells[0].Text != "" { t.Error("without DeepDoc, cell should stay empty") } }) t.Run("no cropped image — skip", func(t *testing.T) { cells := []pdf.TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: ""}} mock := &MockDocAnalyzer{Healthy: true, OCRTexts: []pdf.OCRText{{Text: "x", Confidence: 0.5}}} ocrTableCells(context.Background(), cells, nil, mock) if cells[0].Text != "" { t.Error("without image, cell should stay empty") } }) t.Run("OCR returns empty string", func(t *testing.T) { cells := []pdf.TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: ""}} mock := &MockDocAnalyzer{Healthy: true, OCRTexts: []pdf.OCRText{}} dummy := image.NewRGBA(image.Rect(0, 0, 100, 50)) ocrTableCells(context.Background(), cells, dummy, mock) if cells[0].Text != "" { t.Error("empty OCR result → cell stays empty") } }) t.Run("cell out of image bounds", func(t *testing.T) { cells := []pdf.TSRCell{{X0: 500, Y0: 500, X1: 600, Y1: 600, Text: ""}} mock := &MockDocAnalyzer{Healthy: true, OCRTexts: []pdf.OCRText{{Text: "out of bounds", Confidence: 0.9}}} dummy := image.NewRGBA(image.Rect(0, 0, 100, 100)) // Should not panic — gracefully degrade ocrTableCells(context.Background(), cells, dummy, mock) t.Logf("out-of-bounds cell: text=%q", cells[0].Text) }) } func garbledSample() []pdf.TextChar { punctuation := []string{"!", "#", "$", "%", "&", "*", "+", "-", ".", "/", ":", ";", "<", ">", "=", "?", "@", "^", "_", "~"} chars := make([]pdf.TextChar, 20) for i, p := range punctuation { chars[i] = pdf.TextChar{ X0: 50 + float64(i*10), X1: 58 + float64(i*10), Top: 100, Bottom: 112, Text: p, FontName: "ABCDEF+SimSun", PageNumber: 0, } } return chars } // ── OCR fallback integration through Parse ───────────────────────────── func TestOCR_FallbackIntegration(t *testing.T) { // ocrFallback logic is tested via TestOCR_fallback. // The render+OCR path in Parse requires a real PDF + DeepDoc service. // This test verifies the wiring compiles and that garbled chars without // DeepDoc pass through gracefully (covered by TestOCR_FallbackIntegration_NoDeepDoc). t.Log("OCR fallback Parse integration: tested via TestOCR_fallback (logic) + live DeepDoc testing") } func TestOCR_FallbackIntegration_NoDeepDoc(t *testing.T) { chars := garbledSample() mockEng := &mockEngine{chars: map[int][]pdf.TextChar{0: chars}, pageCount: 1} cfg := pdf.DefaultParserConfig() p := NewParser(cfg, &MockDocAnalyzer{Healthy: true}) result, err := p.Parse(context.Background(), mockEng) if err != nil { t.Fatal(err) } t.Logf("garbled chars: %d sections", len(result.Sections)) } func TestNoDeepDoc_PdfOxideUnmapped_KeepsChars(t *testing.T) { // pdf_oxide ### unmapped glyphs mixed with real CJK text. // Without DeepDoc, isGarbledPage should return false (isScanNoise gate), // so chars are kept and sections > 0. chars := make([]pdf.TextChar, 30) for i := 0; i < 20; i++ { chars[i] = pdf.TextChar{ Text: "测试文本", FontName: "SimSun", X0: 50, X1: 128, Top: float64(100 + i*15), Bottom: float64(112 + i*15), } } // Insert ### unmapped glyph noise (no subset fonts) chars[20] = pdf.TextChar{Text: "#", FontName: "SimSun", X0: 130, X1: 138, Top: 100, Bottom: 112} chars[21] = pdf.TextChar{Text: "#", FontName: "SimSun", X0: 138, X1: 146, Top: 100, Bottom: 112} chars[22] = pdf.TextChar{Text: "#", FontName: "SimSun", X0: 146, X1: 154, Top: 100, Bottom: 112} chars[23] = pdf.TextChar{Text: "D", FontName: "SimSun", X0: 154, X1: 162, Top: 100, Bottom: 112} chars[24] = pdf.TextChar{Text: "_", FontName: "SimSun", X0: 162, X1: 170, Top: 100, Bottom: 112} chars[25] = pdf.TextChar{Text: "8", FontName: "SimSun", X0: 170, X1: 178, Top: 100, Bottom: 112} chars[26] = pdf.TextChar{Text: "-", FontName: "SimSun", X0: 178, X1: 186, Top: 100, Bottom: 112} chars[27] = pdf.TextChar{Text: ".", FontName: "SimSun", X0: 186, X1: 194, Top: 100, Bottom: 112} chars[28] = pdf.TextChar{Text: "*", FontName: "SimSun", X0: 194, X1: 202, Top: 100, Bottom: 112} chars[29] = pdf.TextChar{Text: "用", FontName: "SimSun", X0: 202, X1: 210, Top: 100, Bottom: 112} mockEng := &mockEngine{chars: map[int][]pdf.TextChar{0: chars}, pageCount: 1} p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true}) result, err := p.Parse(context.Background(), mockEng) if err != nil { t.Fatal(err) } if len(result.Sections) == 0 { t.Error("pdf_oxide unmapped + CJK: expected >0 sections, got 0") } t.Logf("pdf_oxide unmapped + CJK: %d sections (chars kept)", len(result.Sections)) } func TestIsGarbledPage(t *testing.T) { t.Run("PUA dominant", func(t *testing.T) { chars := make([]pdf.TextChar, 50) for i := range chars { chars[i] = pdf.TextChar{Text: string(rune(0xE000)), PageNumber: 0} } if !util.IsGarbledPage(chars) { t.Error("100% PUA → garbled") } }) t.Run("font encoding", func(t *testing.T) { if !util.IsGarbledPage(garbledSample()) { t.Error("subset font → garbled") } }) t.Run("normal text", func(t *testing.T) { chars := make([]pdf.TextChar, 50) for i := range chars { chars[i] = pdf.TextChar{Text: "a", PageNumber: 0} } if util.IsGarbledPage(chars) { t.Error("normal text → not garbled") } }) t.Run("pdf oxide unmapped + CJK — not garbled", func(t *testing.T) { // ### unmapped glyphs + real CJK text (no subset fonts). // isScanNoise returns false (≥2 consecutive CJK chars: "护理全科"). chars := []pdf.TextChar{ {Text: "和", PageNumber: 0}, {Text: "蔘", PageNumber: 0}, {Text: "语", PageNumber: 0}, {Text: "言", PageNumber: 0}, {Text: "#", PageNumber: 0}, {Text: "#", PageNumber: 0}, {Text: "#", PageNumber: 0}, {Text: "D", PageNumber: 0}, {Text: "_", PageNumber: 0}, {Text: "8", PageNumber: 0}, {Text: "-", PageNumber: 0}, {Text: ".", PageNumber: 0}, {Text: "*", PageNumber: 0}, {Text: "/", PageNumber: 0}, {Text: "*", PageNumber: 0}, {Text: "护", PageNumber: 0}, {Text: "理", PageNumber: 0}, {Text: "全", PageNumber: 0}, {Text: "科", PageNumber: 0}, {Text: "引", PageNumber: 0}, {Text: "用", PageNumber: 0}, } if util.IsGarbledPage(chars) { t.Error("### unmapped + CJK text should NOT be garbled (no subset fonts)") } }) t.Run("too few chars", func(t *testing.T) { if util.IsGarbledPage([]pdf.TextChar{{Text: " ", PageNumber: 0}}) { t.Error("< 20 chars → not garbled") } }) } func TestOCR_Fallback_PUAGarbled(t *testing.T) { pua := make([]pdf.TextChar, 50) for i := range pua { pua[i] = pdf.TextChar{Text: string(rune(0xE000 + i%10)), PageNumber: 0} } dummyImg := image.NewRGBA(image.Rect(0, 0, 100, 100)) mock := &MockDocAnalyzer{ Healthy: true, OCRBoxes: []pdf.OCRBox{{X0: 10, Y0: 20, X1: 90, Y1: 20, X2: 90, Y2: 40, X3: 10, Y3: 40}}, OCRTexts: []pdf.OCRText{{Text: "PUA OCR text", Confidence: 0.9}}, } got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "garbled page") if len(got) != 1 || got[0].Text != "PUA OCR text" { t.Errorf("PUA garbled should trigger OCR, got %v", got) } } // ── ocrMergeChars ───────────────────────────────────────────────────── func TestOCR_MergeChars(t *testing.T) { dummyImg := image.NewRGBA(image.Rect(0, 0, 600, 600)) t.Run("nil image", func(t *testing.T) { chars := []pdf.TextChar{{X0: 10, Top: 10, X1: 20, Bottom: 30, Text: "A", PageNumber: 0}} if boxes := ocrMergeChars(context.Background(), nil, chars, &MockDocAnalyzer{Healthy: true}, 0); boxes != nil { t.Error("nil image → nil") } }) t.Run("detect returns no boxes", func(t *testing.T) { mock := &MockDocAnalyzer{Healthy: true, OCRBoxes: []pdf.OCRBox{}} chars := []pdf.TextChar{{X0: 10, Top: 10, X1: 20, Bottom: 30, Text: "A", PageNumber: 0}} if boxes := ocrMergeChars(context.Background(), dummyImg, chars, mock, 0); boxes != nil { t.Error("no detect boxes → nil") } }) t.Run("detect boxes — all overlap with chars (chars used, Python-aligned)", func(t *testing.T) { mock := &MockDocAnalyzer{ Healthy: true, OCRBoxes: []pdf.OCRBox{{X0: 15, Y0: 15, X1: 150, Y1: 15, X2: 150, Y2: 150, X3: 15, Y3: 150}}, OCRTexts: []pdf.OCRText{{Text: "Hello OCR", Confidence: 0.9}}, } chars := []pdf.TextChar{{X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0}} boxes := ocrMergeChars(context.Background(), dummyImg, chars, mock, 0) if len(boxes) != 1 { t.Fatalf("expected 1 box, got %d", len(boxes)) } // Embedded chars override OCR — char text is more precise. if boxes[0].Text != "Hello" { t.Errorf("expected char text 'Hello', got %q", boxes[0].Text) } }) t.Run("detect boxes — none overlap with chars", func(t *testing.T) { mock := &MockDocAnalyzer{ Healthy: true, OCRBoxes: []pdf.OCRBox{{X0: 240, Y0: 240, X1: 270, Y1: 240, X2: 270, Y2: 270, X3: 240, Y3: 270}}, OCRTexts: []pdf.OCRText{{Text: "OCR", Confidence: 0.9}}, } chars := []pdf.TextChar{{X0: 10, X1: 20, Top: 10, Bottom: 20, Text: "A", PageNumber: 0}} boxes := ocrMergeChars(context.Background(), dummyImg, chars, mock, 0) if len(boxes) != 1 { t.Fatalf("expected 1 box (OCR), got %d", len(boxes)) } if boxes[0].Text != "OCR" { t.Errorf("expected OCR text 'OCR', got %q", boxes[0].Text) } }) t.Run("detect box — no chars and OCR returns empty", func(t *testing.T) { mock := &MockDocAnalyzer{ Healthy: true, OCRBoxes: []pdf.OCRBox{{X0: 240, Y0: 240, X1: 270, Y1: 240, X2: 270, Y2: 270, X3: 240, Y3: 270}}, OCRTexts: []pdf.OCRText{}, } chars := []pdf.TextChar{{X0: 10, X1: 20, Top: 10, Bottom: 20, Text: "A", PageNumber: 0}} boxes := ocrMergeChars(context.Background(), dummyImg, chars, mock, 0) if len(boxes) != 0 { t.Fatalf("expected 0 boxes (empty OCR), got %d", len(boxes)) } }) t.Run("multiple detect boxes — one with chars, one OCR", func(t *testing.T) { // Box 1 overlaps chars → uses char text. Box 2 has no chars → OCR. mock := &MockDocAnalyzer{ Healthy: true, OCRBoxes: []pdf.OCRBox{ {X0: 15, Y0: 15, X1: 150, Y1: 15, X2: 150, Y2: 150, X3: 15, Y3: 150}, {X0: 240, Y0: 240, X1: 270, Y1: 240, X2: 270, Y2: 270, X3: 240, Y3: 270}, }, OCRTexts: []pdf.OCRText{ {Text: "box 1 text", Confidence: 0.9}, }, } chars := []pdf.TextChar{{X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0}} boxes := ocrMergeChars(context.Background(), dummyImg, chars, mock, 0) if len(boxes) != 2 { t.Fatalf("expected 2 boxes, got %d", len(boxes)) } // Box 0 has chars → uses char text. if boxes[0].Text != "Hello" { t.Errorf("box[0] expected char text 'Hello', got %q", boxes[0].Text) } // Box 1 has no chars → OCR. if boxes[1].Text != "box 1 text" { t.Errorf("box[1] expected OCR 'box 1 text', got %q", boxes[1].Text) } }) t.Run("chars in box — sorted by reading order (top→x0)", func(t *testing.T) { // Box 1 (pixel Y=30-90 → PDF 10-30) overlaps char "a" at (10,10-30). // Box 2 (pixel Y=330-390 → PDF 110-130) overlaps char "c" at (70,110-130). mock := &MockDocAnalyzer{ Healthy: true, OCRBoxes: []pdf.OCRBox{ {X0: 15, Y0: 30, X1: 90, Y1: 30, X2: 90, Y2: 90, X3: 15, Y3: 90}, {X0: 75, Y0: 330, X1: 300, Y1: 330, X2: 300, Y2: 390, X3: 75, Y3: 390}, }, } chars := []pdf.TextChar{ {X0: 70, X1: 90, Top: 110, Bottom: 130, Text: "c", PageNumber: 0}, {X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "a", PageNumber: 0}, } boxes := ocrMergeChars(context.Background(), dummyImg, chars, mock, 0) if len(boxes) != 2 { t.Fatalf("expected 2 detect boxes, got %d", len(boxes)) } // Each box gets its overlapping char text. if boxes[0].Text != "a" { t.Errorf("box[0] expected 'a', got %q", boxes[0].Text) } if boxes[1].Text != "c" { t.Errorf("box[1] expected 'c', got %q", boxes[1].Text) } }) t.Run("height mismatch — chars with very different height excluded", func(t *testing.T) { // Box pixel Y=75-165 → PDF 25-55, height=30. Char A height=20, diff=10/30=0.33 < 0.7 → kept. // Char B height=100, diff=70/100=0.70 ≥ 0.7 → excluded. mock := &MockDocAnalyzer{ Healthy: true, OCRBoxes: []pdf.OCRBox{ {X0: 15, Y0: 75, X1: 150, Y1: 75, X2: 150, Y2: 165, X3: 15, Y3: 165}, }, OCRTexts: []pdf.OCRText{{Text: "OCR height test", Confidence: 0.9}}, } chars := []pdf.TextChar{ {X0: 10, X1: 30, Top: 30, Bottom: 50, Text: "A", PageNumber: 0}, {X0: 40, X1: 60, Top: 20, Bottom: 120, Text: "B", PageNumber: 0}, } boxes := ocrMergeChars(context.Background(), dummyImg, chars, mock, 0) if len(boxes) != 1 { t.Fatalf("expected 1 box, got %d", len(boxes)) } // Only 'A' matches; 'B' excluded by height gate. if boxes[0].Text != "A" { t.Errorf("expected 'A' (B excluded by height gate), got %q", boxes[0].Text) } }) t.Run("garbled chars — box text cleared for OCR recognize", func(t *testing.T) { mock := &MockDocAnalyzer{ Healthy: true, OCRBoxes: []pdf.OCRBox{ {X0: 15, Y0: 15, X1: 450, Y1: 15, X2: 450, Y2: 450, X3: 15, Y3: 450}, }, OCRTexts: []pdf.OCRText{{Text: "OCR result", Confidence: 0.9}}, } chars := []pdf.TextChar{ {X0: 10, X1: 20, Top: 10, Bottom: 20, Text: "", PageNumber: 0}, {X0: 30, X1: 40, Top: 10, Bottom: 20, Text: "", PageNumber: 0}, {X0: 50, X1: 60, Top: 10, Bottom: 20, Text: "a", PageNumber: 0}, } boxes := ocrMergeChars(context.Background(), dummyImg, chars, mock, 0) if len(boxes) != 1 { t.Fatalf("expected 1 box, got %d", len(boxes)) } if boxes[0].Text != "OCR result" { t.Errorf("expected 'OCR result' (garbled majority -> OCR), got %q", boxes[0].Text) } }) t.Run("OCR text preserves word spacing", func(t *testing.T) { // Detect box at (pixel 30,30 → 90,90 → PDF 10,10 → 30,30). // Chars at (10,10-25) → within the box region. Char text "do" is // used (Python-aligned: embedded chars are more precise than OCR). mock := &MockDocAnalyzer{ Healthy: true, OCRBoxes: []pdf.OCRBox{{X0: 30, Y0: 30, X1: 90, Y1: 30, X2: 90, Y2: 90, X3: 30, Y3: 90}}, OCRTexts: []pdf.OCRText{{Text: "docker commit infiniflow", Confidence: 0.95}}, } chars := []pdf.TextChar{ {Text: "d", X0: 10, X1: 20, Top: 10, Bottom: 25, PageNumber: 0}, {Text: "o", X0: 21, X1: 30, Top: 10, Bottom: 25, PageNumber: 0}, } boxes := ocrMergeChars(context.Background(), dummyImg, chars, mock, 0) if len(boxes) != 1 { t.Fatalf("expected 1 box, got %d", len(boxes)) } // Char text used (Python-aligned). if boxes[0].Text != "do" { t.Errorf("expected char text 'do', got %q", boxes[0].Text) } }) } // TestTableSectionCaptionInHTML verifies mergeCaptions prepends table // caption text before the HTML table, matching Python's caption handling. func TestTableSectionCaptionInHTML(t *testing.T) { // Simulate pipeline order: extractTableAndReplace → boxesToSections → mergeCaptions boxes := []pdf.TextBox{ {X0: 100, X1: 500, Top: 200, Bottom: 400, LayoutType: "table", PageNumber: 0}, } ti := pdf.TableItem{ Cells: []pdf.TSRCell{ {X0: 0, Y0: 0, X1: 200, Y1: 50, Label: "table row", Text: "飞机"}, {X0: 0, Y0: 51, X1: 200, Y1: 100, Label: "table row", Text: "火车"}, }, Positions: []pdf.Position{{Left: 100, Right: 500, Top: 200, Bottom: 400}}, Scale: 1.0, } // Step 1: extractTableAndReplace → HTML box with table text boxes = tbl.ExtractTableAndReplace(boxes, []pdf.TableItem{ti}) sections := lyt.BoxesToSections(boxes, nil) // Add caption section sections = append(sections, pdf.Section{ LayoutType: "table caption", Positions: []pdf.Position{{Left: 100, Right: 500, Top: 180, Bottom: 198}}, Text: "表1: 交通工具等级", }) // Step 2: mergeCaptions prepends caption before HTML figures := pdf.CollectFigures(sections) sections = tbl.MergeCaptions(sections, figures) if !strings.HasPrefix(sections[0].Text, "表1: 交通工具等级") { t.Errorf("expected caption before table HTML, got %q", sections[0].Text) } } // TestBoxMatchesCell_FalsePositive verifies that boxMatchesCell rejects // text boxes that are mostly OUTSIDE the cell, even with cellIsEmpty=true. // The 0.3 threshold should not match a wide box that barely touches a // narrow cell — this would cause body text to leak into table cells. // TestParser_ConcurrentSafety verifies that Parser.Parse() is safe for // concurrent use. 8 goroutines each call Parse 5 times on the same Parser // instance. Run with -race. func TestParser_ConcurrentSafety(t *testing.T) { p := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: false}) var wg sync.WaitGroup n := 8 for range n { wg.Add(1) go func() { defer wg.Done() for range 5 { eng := &mockEngine{pageCount: 2} _, _ = p.Parse(context.Background(), eng) } }() } wg.Wait() }