ragflow/internal/deepdoc/parser/pdf/parser_test.go

package parser

import (
	"context"
	"image"
	"strings"
	"sync"
	"testing"

	lyt "ragflow/internal/deepdoc/parser/pdf/layout"
	tbl "ragflow/internal/deepdoc/parser/pdf/table"
	pdf "ragflow/internal/deepdoc/parser/pdf/type"
	util "ragflow/internal/deepdoc/parser/pdf/util"
)

// ── OCR fallback ──────────────────────────────────────────────────────

func TestOCR_Fallback(t *testing.T) {
	dummyImg := image.NewRGBA(image.Rect(0, 0, 100, 100))

	t.Run("nil image", func(t *testing.T) {
		if got := ocrDetectAndRecognize(context.Background(), nil, &MockDocAnalyzer{Healthy: true}, 0, "garbled page"); got != nil {
			t.Error("nil image → nil")
		}
	})

	t.Run("detect returns no boxes", func(t *testing.T) {
		mock := &MockDocAnalyzer{Healthy: true, OCRBoxes: nil}
		if got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "garbled page"); got != nil {
			t.Error("no det boxes → nil")
		}
	})

	t.Run("detect + recognize success", func(t *testing.T) {
		mock := &MockDocAnalyzer{
			Healthy:  true,
			OCRBoxes: []pdf.OCRBox{{X0: 10, Y0: 20, X1: 90, Y1: 20, X2: 90, Y2: 40, X3: 10, Y3: 40}},
			OCRTexts: []pdf.OCRText{{Text: "Hello", Confidence: 0.9}},
		}
		got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "garbled page")
		if len(got) != 1 {
			t.Fatalf("expected 1 pdf.TextChar, got %d", len(got))
		}
		if got[0].Text != "Hello" {
			t.Errorf("text = %q, want Hello", got[0].Text)
		}
	})

	t.Run("detect boxes but rec returns empty text", func(t *testing.T) {
		mock := &MockDocAnalyzer{
			Healthy:  true,
			OCRBoxes: []pdf.OCRBox{{X0: 10, Y0: 20, X1: 90, Y1: 20, X2: 90, Y2: 40, X3: 10, Y3: 40}},
			OCRTexts: []pdf.OCRText{{Text: "", Confidence: 0.1}},
		}
		got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "garbled page")
		if len(got) != 0 {
			t.Error("empty rec text → empty result")
		}
	})
}

// NOTE: this describes garbledSample, which is defined further down in this
// file. garbledSample returns chars that trigger IsGarbledByFontEncoding:
// ≥30% subset font, <5% CJK, >40% ASCII punctuation.
// ── OCR scan page ──────────────────────────────────────────────────────

func TestOCR_ScanPage(t *testing.T) {
	dummyImg := image.NewRGBA(image.Rect(0, 0, 100, 100))

	t.Run("nil image", func(t *testing.T) {
		if got := ocrDetectAndRecognize(context.Background(), nil, &MockDocAnalyzer{Healthy: true}, 0, "scan page"); got != nil {
			t.Error("nil image → nil")
		}
	})

	t.Run("detect returns no boxes", func(t *testing.T) {
		mock := &MockDocAnalyzer{Healthy: true, OCRBoxes: nil}
		if got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "scan page"); got != nil {
			t.Error("no det boxes → nil")
		}
	})

	t.Run("detect + recognize success", func(t *testing.T) {
		mock := &MockDocAnalyzer{
			Healthy: true,
			OCRBoxes: []pdf.OCRBox{
				{X0: 10, Y0: 20, X1: 90, Y1: 20, X2: 90, Y2: 40, X3: 10, Y3: 40},
				{X0: 10, Y0: 50, X1: 90, Y1: 50, X2: 90, Y2: 70, X3: 10, Y3: 70},
			},
			OCRTexts: []pdf.OCRText{{Text: "Hello", Confidence: 0.9}, {Text: "World", Confidence: 0.8}},
		}
		got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "scan page")
		if len(got) < 1 {
			t.Error("expected at least 1 pdf.TextChar")
		}
	})

	t.Run("detect success but rec returns empty", func(t *testing.T) {
		mock := &MockDocAnalyzer{
			Healthy:  true,
			OCRBoxes: []pdf.OCRBox{{X0: 10, Y0: 20, X1: 90, Y1: 20, X2: 90, Y2: 40, X3: 10, Y3: 40}},
			OCRTexts: []pdf.OCRText{},
		}
		got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "scan page")
		if len(got) != 0 {
			t.Error("no rec text → empty")
		}
	})
}

// ── OCR table cell ─────────────────────────────────────────────────────

// TestOCR_TableCell checks how ocrTableCells fills table cells in place:
// empty cells receive the recognizer's text, cells that already have text
// are left untouched, and missing inputs (no cells, no analyzer, no image,
// empty recognizer output, out-of-bounds geometry) must neither panic nor
// alter the cells.
func TestOCR_TableCell(t *testing.T) {
	ctx := context.Background()

	// blankCell returns a fresh one-element slice holding a single empty
	// 100x50 cell at the origin.
	blankCell := func() []pdf.TSRCell {
		return []pdf.TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: ""}}
	}

	t.Run("fill single empty cell", func(t *testing.T) {
		row := []pdf.TSRCell{
			{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: ""},
			{X0: 100, Y0: 0, X1: 200, Y1: 50, Text: "已有"},
		}
		analyzer := &MockDocAnalyzer{
			Healthy:  true,
			OCRTexts: []pdf.OCRText{{Text: "识别结果", Confidence: 0.9}},
		}
		crop := image.NewRGBA(image.Rect(0, 0, 200, 50))

		ocrTableCells(ctx, row, crop, analyzer)

		if got := row[0].Text; got != "识别结果" {
			t.Errorf("empty cell not filled: %q", got)
		}
		if got := row[1].Text; got != "已有" {
			t.Errorf("filled cell changed: %q", got)
		}
	})

	t.Run("all cells already filled — no OCR", func(t *testing.T) {
		row := []pdf.TSRCell{
			{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "A"},
			{X0: 100, Y0: 0, X1: 200, Y1: 50, Text: "B"},
		}
		// Neither image nor analyzer is supplied; the call must not panic.
		ocrTableCells(ctx, row, nil, nil)
		if !(row[0].Text == "A" && row[1].Text == "B") {
			t.Error("filled cells should not change")
		}
	})

	t.Run("empty cells list", func(t *testing.T) {
		// Both a nil slice and an empty slice must be tolerated without panicking.
		ocrTableCells(ctx, nil, nil, nil)
		ocrTableCells(ctx, []pdf.TSRCell{}, nil, nil)
	})

	t.Run("no DeepDoc — skip", func(t *testing.T) {
		row := blankCell()
		ocrTableCells(ctx, row, nil, nil)
		if row[0].Text != "" {
			t.Error("without DeepDoc, cell should stay empty")
		}
	})

	t.Run("no cropped image — skip", func(t *testing.T) {
		row := blankCell()
		analyzer := &MockDocAnalyzer{
			Healthy:  true,
			OCRTexts: []pdf.OCRText{{Text: "x", Confidence: 0.5}},
		}
		ocrTableCells(ctx, row, nil, analyzer)
		if row[0].Text != "" {
			t.Error("without image, cell should stay empty")
		}
	})

	t.Run("OCR returns empty string", func(t *testing.T) {
		row := blankCell()
		analyzer := &MockDocAnalyzer{Healthy: true, OCRTexts: []pdf.OCRText{}}
		crop := image.NewRGBA(image.Rect(0, 0, 100, 50))
		ocrTableCells(ctx, row, crop, analyzer)
		if row[0].Text != "" {
			t.Error("empty OCR result → cell stays empty")
		}
	})

	t.Run("cell out of image bounds", func(t *testing.T) {
		row := []pdf.TSRCell{{X0: 500, Y0: 500, X1: 600, Y1: 600, Text: ""}}
		analyzer := &MockDocAnalyzer{
			Healthy:  true,
			OCRTexts: []pdf.OCRText{{Text: "out of bounds", Confidence: 0.9}},
		}
		crop := image.NewRGBA(image.Rect(0, 0, 100, 100))
		// The cell lies entirely outside the 100x100 image. Only the absence
		// of a panic is checked; the resulting text is logged, not asserted.
		ocrTableCells(ctx, row, crop, analyzer)
		t.Logf("out-of-bounds cell: text=%q", row[0].Text)
	})
}

// garbledSample builds a one-line page fixture made solely of ASCII
// punctuation glyphs, each tagged with the subset-style font name
// "ABCDEF+SimSun" and laid out left to right at 10-unit steps on page 0.
// TestIsGarbledPage expects util.IsGarbledPage to report this sample as
// garbled.
//
// The result has exactly one char per entry in the punctuation list; the
// slice is sized from the list so that editing the list cannot cause an
// out-of-range write.
func garbledSample() []pdf.TextChar {
	punctuation := []string{"!", "#", "$", "%", "&", "*", "+", "-", ".", "/",
		":", ";", "<", ">", "=", "?", "@", "^", "_", "~"}
	chars := make([]pdf.TextChar, len(punctuation))
	for i, p := range punctuation {
		// Each glyph is 8 units wide and starts 10 units after the previous one.
		offset := float64(i * 10)
		chars[i] = pdf.TextChar{
			X0: 50 + offset, X1: 58 + offset,
			Top: 100, Bottom: 112,
			Text: p, FontName: "ABCDEF+SimSun", PageNumber: 0,
		}
	}
	return chars
}

// ── OCR fallback integration through Parse ─────────────────────────────

// TestOCR_FallbackIntegration is a placeholder: it performs no assertions and
// only logs where the OCR fallback behaviour is actually covered.
func TestOCR_FallbackIntegration(t *testing.T) {
	// The OCR fallback logic itself is tested via TestOCR_Fallback.
	// The render+OCR path in Parse requires a real PDF + DeepDoc service.
	// This test verifies the wiring compiles and that garbled chars without
	// DeepDoc pass through gracefully (covered by TestOCR_FallbackIntegration_NoDeepDoc).
	//
	// The log message names TestOCR_Fallback with its exact spelling so it
	// can be pasted straight into `go test -run`.
	t.Log("OCR fallback Parse integration: tested via TestOCR_Fallback (logic) + live DeepDoc testing")
}

func TestOCR_FallbackIntegration_NoDeepDoc(t *testing.T) {
	chars := garbledSample()
	mockEng := &mockEngine{chars: map[int][]pdf.TextChar{0: chars}, pageCount: 1}

	cfg := pdf.DefaultParserConfig()
	p := NewParser(cfg, &MockDocAnalyzer{Healthy: true})
	result, err := p.Parse(context.Background(), mockEng)
	if err != nil {
		t.Fatal(err)
	}
	t.Logf("garbled chars: %d sections", len(result.Sections))
}

// TestNoDeepDoc_PdfOxideUnmapped_KeepsChars parses a page that mixes real CJK
// text with a short run of "###"-style unmapped-glyph noise (all in a
// non-subset font) and requires that Parse still produces at least one
// section, i.e. the chars are kept rather than discarded.
//
// The original note on this test says that without DeepDoc isGarbledPage
// should return false (isScanNoise gate) for such a page.
// NOTE(review): a MockDocAnalyzer with Healthy: true is supplied here even
// though the name says "NoDeepDoc" — confirm that is intended.
func TestNoDeepDoc_PdfOxideUnmapped_KeepsChars(t *testing.T) {
	const (
		cjkLines = 20
		font     = "SimSun"
	)
	// Unmapped-glyph noise followed by one real CJK char, placed on the
	// first text line to the right of the CJK run.
	noise := []string{"#", "#", "#", "D", "_", "8", "-", ".", "*", "用"}

	page := make([]pdf.TextChar, 0, cjkLines+len(noise))

	// Twenty stacked lines of real CJK text, 15 units apart.
	for row := 0; row < cjkLines; row++ {
		page = append(page, pdf.TextChar{
			Text: "测试文本", FontName: font,
			X0: 50, X1: 128, Top: float64(100 + row*15), Bottom: float64(112 + row*15),
		})
	}

	// Noise glyphs are 8 units wide and laid out contiguously from x=130.
	for k, glyph := range noise {
		left := float64(130 + 8*k)
		page = append(page, pdf.TextChar{
			Text: glyph, FontName: font,
			X0: left, X1: left + 8, Top: 100, Bottom: 112,
		})
	}

	engine := &mockEngine{chars: map[int][]pdf.TextChar{0: page}, pageCount: 1}
	psr := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: true})

	parsed, err := psr.Parse(context.Background(), engine)
	if err != nil {
		t.Fatal(err)
	}
	sectionCount := len(parsed.Sections)
	if sectionCount == 0 {
		t.Error("pdf_oxide unmapped + CJK: expected >0 sections, got 0")
	}
	t.Logf("pdf_oxide unmapped + CJK: %d sections (chars kept)", sectionCount)
}

// TestIsGarbledPage pins util.IsGarbledPage on five fixtures: a page of
// private-use code points (garbled), the garbledSample subset-font page
// (garbled), a page of plain "a" chars (not garbled), a page mixing "###"
// noise with real CJK text (not garbled), and a single-char page (not
// garbled).
func TestIsGarbledPage(t *testing.T) {
	// uniform returns n chars that all carry the same text on page 0.
	uniform := func(text string, n int) []pdf.TextChar {
		out := make([]pdf.TextChar, n)
		for i := range out {
			out[i] = pdf.TextChar{Text: text, PageNumber: 0}
		}
		return out
	}

	t.Run("PUA dominant", func(t *testing.T) {
		page := uniform(string(rune(0xE000)), 50)
		if !util.IsGarbledPage(page) {
			t.Error("100% PUA → garbled")
		}
	})

	t.Run("font encoding", func(t *testing.T) {
		if !util.IsGarbledPage(garbledSample()) {
			t.Error("subset font → garbled")
		}
	})

	t.Run("normal text", func(t *testing.T) {
		page := uniform("a", 50)
		if util.IsGarbledPage(page) {
			t.Error("normal text → not garbled")
		}
	})

	t.Run("pdf oxide unmapped + CJK — not garbled", func(t *testing.T) {
		// "###"-style unmapped glyphs surrounded by real CJK text, with no
		// subset fonts. Per the original note, isScanNoise returns false
		// because of the consecutive CJK run ("护理全科").
		glyphs := []string{
			"和", "蔘", "语", "言",
			"#", "#", "#", "D", "_", "8", "-", ".", "*", "/", "*",
			"护", "理", "全", "科", "引", "用",
		}
		page := make([]pdf.TextChar, len(glyphs))
		for i, g := range glyphs {
			page[i] = pdf.TextChar{Text: g, PageNumber: 0}
		}
		if util.IsGarbledPage(page) {
			t.Error("### unmapped + CJK text should NOT be garbled (no subset fonts)")
		}
	})

	t.Run("too few chars", func(t *testing.T) {
		page := []pdf.TextChar{{Text: " ", PageNumber: 0}}
		if util.IsGarbledPage(page) {
			t.Error("< 20 chars → not garbled")
		}
	})
}

// TestOCR_Fallback_PUAGarbled ties the two halves of the garbled-page OCR
// fallback together for a page made of private-use code points: first the
// page must be classified as garbled, then ocrDetectAndRecognize (with the
// "garbled page" label) must return the recognizer's text for the page image.
func TestOCR_Fallback_PUAGarbled(t *testing.T) {
	// Fifty chars cycling through U+E000..U+E009.
	pua := make([]pdf.TextChar, 50)
	for i := range pua {
		pua[i] = pdf.TextChar{Text: string(rune(0xE000 + i%10)), PageNumber: 0}
	}
	// Precondition: the fixture really is a garbled page. Previously the
	// slice was built but never inspected, so the test never touched the
	// PUA scenario its name promises.
	if !util.IsGarbledPage(pua) {
		t.Error("PUA-only page should be detected as garbled")
	}

	dummyImg := image.NewRGBA(image.Rect(0, 0, 100, 100))
	mock := &MockDocAnalyzer{
		Healthy:  true,
		OCRBoxes: []pdf.OCRBox{{X0: 10, Y0: 20, X1: 90, Y1: 20, X2: 90, Y2: 40, X3: 10, Y3: 40}},
		OCRTexts: []pdf.OCRText{{Text: "PUA OCR text", Confidence: 0.9}},
	}
	got := ocrDetectAndRecognize(context.Background(), dummyImg, mock, 0, "garbled page")
	if len(got) != 1 || got[0].Text != "PUA OCR text" {
		t.Errorf("PUA garbled should trigger OCR, got %v", got)
	}
}

// ── ocrMergeChars ─────────────────────────────────────────────────────

// TestOCR_MergeChars covers ocrMergeChars, which is given a page image, the
// page's embedded chars and an analyzer, and returns text boxes. The subtests
// pin the following expectations:
//   - a nil image or an empty detection result yields nil;
//   - a detected box that overlaps embedded chars takes the chars' text;
//   - a detected box with no overlapping chars takes the recognizer's text,
//     and is dropped when the recognizer returns nothing;
//   - chars whose height differs too much from the box are ignored;
//   - a box whose chars are mostly empty takes the recognizer's text instead.
//
// NOTE(review): the coordinate comments below imply that detector boxes are
// in pixels at 3× the PDF coordinates used by chars — confirm against
// ocrMergeChars before changing any fixture numbers.
func TestOCR_MergeChars(t *testing.T) {
	// A 600x600 page image shared by every subtest that needs one.
	dummyImg := image.NewRGBA(image.Rect(0, 0, 600, 600))

	t.Run("nil image", func(t *testing.T) {
		chars := []pdf.TextChar{{X0: 10, Top: 10, X1: 20, Bottom: 30, Text: "A", PageNumber: 0}}
		if boxes := ocrMergeChars(context.Background(), nil, chars, &MockDocAnalyzer{Healthy: true}, 0); boxes != nil {
			t.Error("nil image → nil")
		}
	})

	t.Run("detect returns no boxes", func(t *testing.T) {
		// The detector returns an empty (non-nil) slice of boxes.
		mock := &MockDocAnalyzer{Healthy: true, OCRBoxes: []pdf.OCRBox{}}
		chars := []pdf.TextChar{{X0: 10, Top: 10, X1: 20, Bottom: 30, Text: "A", PageNumber: 0}}
		if boxes := ocrMergeChars(context.Background(), dummyImg, chars, mock, 0); boxes != nil {
			t.Error("no detect boxes → nil")
		}
	})

	t.Run("detect boxes — all overlap with chars (chars used, Python-aligned)", func(t *testing.T) {
		mock := &MockDocAnalyzer{
			Healthy:  true,
			OCRBoxes: []pdf.OCRBox{{X0: 15, Y0: 15, X1: 150, Y1: 15, X2: 150, Y2: 150, X3: 15, Y3: 150}},
			OCRTexts: []pdf.OCRText{{Text: "Hello OCR", Confidence: 0.9}},
		}
		chars := []pdf.TextChar{{X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0}}
		boxes := ocrMergeChars(context.Background(), dummyImg, chars, mock, 0)
		if len(boxes) != 1 {
			t.Fatalf("expected 1 box, got %d", len(boxes))
		}
		// Embedded chars override OCR — char text is more precise.
		if boxes[0].Text != "Hello" {
			t.Errorf("expected char text 'Hello', got %q", boxes[0].Text)
		}
	})

	t.Run("detect boxes — none overlap with chars", func(t *testing.T) {
		// The detected box sits far away from the only embedded char, so the
		// recognizer's text is expected.
		mock := &MockDocAnalyzer{
			Healthy:  true,
			OCRBoxes: []pdf.OCRBox{{X0: 240, Y0: 240, X1: 270, Y1: 240, X2: 270, Y2: 270, X3: 240, Y3: 270}},
			OCRTexts: []pdf.OCRText{{Text: "OCR", Confidence: 0.9}},
		}
		chars := []pdf.TextChar{{X0: 10, X1: 20, Top: 10, Bottom: 20, Text: "A", PageNumber: 0}}
		boxes := ocrMergeChars(context.Background(), dummyImg, chars, mock, 0)
		if len(boxes) != 1 {
			t.Fatalf("expected 1 box (OCR), got %d", len(boxes))
		}
		if boxes[0].Text != "OCR" {
			t.Errorf("expected OCR text 'OCR', got %q", boxes[0].Text)
		}
	})

	t.Run("detect box — no chars and OCR returns empty", func(t *testing.T) {
		// Same geometry as above, but the recognizer returns no texts: the
		// box has nothing to carry and must be dropped.
		mock := &MockDocAnalyzer{
			Healthy:  true,
			OCRBoxes: []pdf.OCRBox{{X0: 240, Y0: 240, X1: 270, Y1: 240, X2: 270, Y2: 270, X3: 240, Y3: 270}},
			OCRTexts: []pdf.OCRText{},
		}
		chars := []pdf.TextChar{{X0: 10, X1: 20, Top: 10, Bottom: 20, Text: "A", PageNumber: 0}}
		boxes := ocrMergeChars(context.Background(), dummyImg, chars, mock, 0)
		if len(boxes) != 0 {
			t.Fatalf("expected 0 boxes (empty OCR), got %d", len(boxes))
		}
	})

	t.Run("multiple detect boxes — one with chars, one OCR", func(t *testing.T) {
		// boxes[0] overlaps chars → uses char text. boxes[1] has no chars → OCR.
		// Only one OCR text is supplied; it is expected to land on boxes[1].
		mock := &MockDocAnalyzer{
			Healthy: true,
			OCRBoxes: []pdf.OCRBox{
				{X0: 15, Y0: 15, X1: 150, Y1: 15, X2: 150, Y2: 150, X3: 15, Y3: 150},
				{X0: 240, Y0: 240, X1: 270, Y1: 240, X2: 270, Y2: 270, X3: 240, Y3: 270},
			},
			OCRTexts: []pdf.OCRText{
				{Text: "box 1 text", Confidence: 0.9},
			},
		}
		chars := []pdf.TextChar{{X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "Hello", PageNumber: 0}}
		boxes := ocrMergeChars(context.Background(), dummyImg, chars, mock, 0)
		if len(boxes) != 2 {
			t.Fatalf("expected 2 boxes, got %d", len(boxes))
		}
		// boxes[0] has chars → uses char text.
		if boxes[0].Text != "Hello" {
			t.Errorf("box[0] expected char text 'Hello', got %q", boxes[0].Text)
		}
		// boxes[1] has no chars → OCR.
		if boxes[1].Text != "box 1 text" {
			t.Errorf("box[1] expected OCR 'box 1 text', got %q", boxes[1].Text)
		}
	})

	t.Run("chars in box — sorted by reading order (top→x0)", func(t *testing.T) {
		// First box (pixel Y=30-90 → PDF 10-30) overlaps char "a" at (10,10-30).
		// Second box (pixel Y=330-390 → PDF 110-130) overlaps char "c" at (70,110-130).
		// No OCRTexts are supplied: both boxes must be satisfied by chars alone.
		mock := &MockDocAnalyzer{
			Healthy: true,
			OCRBoxes: []pdf.OCRBox{
				{X0: 15, Y0: 30, X1: 90, Y1: 30, X2: 90, Y2: 90, X3: 15, Y3: 90},
				{X0: 75, Y0: 330, X1: 300, Y1: 330, X2: 300, Y2: 390, X3: 75, Y3: 390},
			},
		}
		// Chars are deliberately listed out of reading order ("c" before "a").
		chars := []pdf.TextChar{
			{X0: 70, X1: 90, Top: 110, Bottom: 130, Text: "c", PageNumber: 0},
			{X0: 10, X1: 30, Top: 10, Bottom: 30, Text: "a", PageNumber: 0},
		}
		boxes := ocrMergeChars(context.Background(), dummyImg, chars, mock, 0)
		if len(boxes) != 2 {
			t.Fatalf("expected 2 detect boxes, got %d", len(boxes))
		}
		// Each box gets its overlapping char text.
		if boxes[0].Text != "a" {
			t.Errorf("box[0] expected 'a', got %q", boxes[0].Text)
		}
		if boxes[1].Text != "c" {
			t.Errorf("box[1] expected 'c', got %q", boxes[1].Text)
		}
	})

	t.Run("height mismatch — chars with very different height excluded", func(t *testing.T) {
		// Box pixel Y=75-165 → PDF 25-55, height=30. Char A height=20, diff=10/30=0.33 < 0.7 → kept.
		// Char B height=100, diff=70/100=0.70 ≥ 0.7 → excluded.
		mock := &MockDocAnalyzer{
			Healthy: true,
			OCRBoxes: []pdf.OCRBox{
				{X0: 15, Y0: 75, X1: 150, Y1: 75, X2: 150, Y2: 165, X3: 15, Y3: 165},
			},
			OCRTexts: []pdf.OCRText{{Text: "OCR height test", Confidence: 0.9}},
		}
		chars := []pdf.TextChar{
			{X0: 10, X1: 30, Top: 30, Bottom: 50, Text: "A", PageNumber: 0},
			{X0: 40, X1: 60, Top: 20, Bottom: 120, Text: "B", PageNumber: 0},
		}
		boxes := ocrMergeChars(context.Background(), dummyImg, chars, mock, 0)
		if len(boxes) != 1 {
			t.Fatalf("expected 1 box, got %d", len(boxes))
		}
		// Only 'A' matches; 'B' excluded by height gate.
		if boxes[0].Text != "A" {
			t.Errorf("expected 'A' (B excluded by height gate), got %q", boxes[0].Text)
		}
	})

	t.Run("garbled chars — box text cleared for OCR recognize", func(t *testing.T) {
		mock := &MockDocAnalyzer{
			Healthy: true,
			OCRBoxes: []pdf.OCRBox{
				{X0: 15, Y0: 15, X1: 450, Y1: 15, X2: 450, Y2: 450, X3: 15, Y3: 450},
			},
			OCRTexts: []pdf.OCRText{{Text: "OCR result", Confidence: 0.9}},
		}
		// Two of the three chars carry empty text and only one is readable.
		// NOTE(review): the empty strings presumably stand in for unreadable
		// glyphs — confirm they are not literals whose content was lost.
		chars := []pdf.TextChar{
			{X0: 10, X1: 20, Top: 10, Bottom: 20, Text: "", PageNumber: 0},
			{X0: 30, X1: 40, Top: 10, Bottom: 20, Text: "", PageNumber: 0},
			{X0: 50, X1: 60, Top: 10, Bottom: 20, Text: "a", PageNumber: 0},
		}
		boxes := ocrMergeChars(context.Background(), dummyImg, chars, mock, 0)
		if len(boxes) != 1 {
			t.Fatalf("expected 1 box, got %d", len(boxes))
		}
		// The recognizer's text wins over the mostly-empty embedded chars.
		if boxes[0].Text != "OCR result" {
			t.Errorf("expected 'OCR result' (garbled majority -> OCR), got %q", boxes[0].Text)
		}
	})

	t.Run("OCR text preserves word spacing", func(t *testing.T) {
		// Detect box at (pixel 30,30 → 90,90 → PDF 10,10 → 30,30).
		// Chars at (10,10-25) → within the box region.  Char text "do" is
		// used (Python-aligned: embedded chars are more precise than OCR).
		// NOTE(review): despite the subtest name, the assertion below checks
		// that the embedded char text is chosen, not the spaced OCR text.
		mock := &MockDocAnalyzer{
			Healthy:  true,
			OCRBoxes: []pdf.OCRBox{{X0: 30, Y0: 30, X1: 90, Y1: 30, X2: 90, Y2: 90, X3: 30, Y3: 90}},
			OCRTexts: []pdf.OCRText{{Text: "docker commit infiniflow", Confidence: 0.95}},
		}
		chars := []pdf.TextChar{
			{Text: "d", X0: 10, X1: 20, Top: 10, Bottom: 25, PageNumber: 0},
			{Text: "o", X0: 21, X1: 30, Top: 10, Bottom: 25, PageNumber: 0},
		}
		boxes := ocrMergeChars(context.Background(), dummyImg, chars, mock, 0)
		if len(boxes) != 1 {
			t.Fatalf("expected 1 box, got %d", len(boxes))
		}
		// Char text used (Python-aligned).
		if boxes[0].Text != "do" {
			t.Errorf("expected char text 'do', got %q", boxes[0].Text)
		}
	})
}

// TestTableSectionCaptionInHTML verifies mergeCaptions prepends table
// caption text before the HTML table, matching Python's caption handling.
//
// It builds one "table"-typed text box plus a two-row TableItem, runs them
// through tbl.ExtractTableAndReplace and lyt.BoxesToSections, appends a
// "table caption" section positioned just above the table, and then checks
// that after tbl.MergeCaptions the first section's text starts with the
// caption immediately followed by "<table>".
func TestTableSectionCaptionInHTML(t *testing.T) {
	// Simulate pipeline order: extractTableAndReplace → boxesToSections → mergeCaptions
	boxes := []pdf.TextBox{
		{X0: 100, X1: 500, Top: 200, Bottom: 400, LayoutType: "table", PageNumber: 0},
	}
	ti := pdf.TableItem{
		Cells: []pdf.TSRCell{
			{X0: 0, Y0: 0, X1: 200, Y1: 50, Label: "table row", Text: "飞机"},
			{X0: 0, Y0: 51, X1: 200, Y1: 100, Label: "table row", Text: "火车"},
		},
		Positions: []pdf.Position{{Left: 100, Right: 500, Top: 200, Bottom: 400}},
		Scale:     1.0,
	}

	// Step 1: extractTableAndReplace → HTML box with table text
	boxes = tbl.ExtractTableAndReplace(boxes, []pdf.TableItem{ti})
	sections := lyt.BoxesToSections(boxes, nil)

	// Add caption section
	sections = append(sections, pdf.Section{
		LayoutType: "table caption",
		Positions:  []pdf.Position{{Left: 100, Right: 500, Top: 180, Bottom: 198}},
		Text:       "表1: 交通工具等级",
	})

	// Step 2: mergeCaptions prepends caption before HTML
	figures := pdf.CollectFigures(sections)
	sections = tbl.MergeCaptions(sections, figures)

	// Guard the index below so a regression that drops every section is
	// reported as a test failure rather than an index-out-of-range panic.
	if len(sections) == 0 {
		t.Fatal("expected at least one section after MergeCaptions, got 0")
	}
	if !strings.HasPrefix(sections[0].Text, "表1: 交通工具等级<table>") {
		t.Errorf("expected caption before table HTML, got %q", sections[0].Text)
	}
}

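// NOTE(review): the next paragraph documents TestBoxMatchesCell_FalsePositive,
// but no such test follows it here — restore the test or move this comment
// next to it.
//
// TestBoxMatchesCell_FalsePositive verifies that boxMatchesCell rejects
// text boxes that are mostly OUTSIDE the cell, even with cellIsEmpty=true.
// The 0.3 threshold should not match a wide box that barely touches a
// narrow cell — this would cause body text to leak into table cells.
//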
// TestParser_ConcurrentSafety hammers a single shared Parser from several
// goroutines at once: 8 workers, 5 Parse calls each, every call with its own
// fresh two-page mockEngine. It makes no assertions of its own — run it with
// -race so the race detector can flag unsynchronised access.
func TestParser_ConcurrentSafety(t *testing.T) {
	const (
		workers        = 8
		callsPerWorker = 5
	)
	shared := NewParser(pdf.DefaultParserConfig(), &MockDocAnalyzer{Healthy: false})

	var wg sync.WaitGroup
	wg.Add(workers)
	for range workers {
		go func() {
			defer wg.Done()
			for range callsPerWorker {
				// Results and errors are deliberately discarded; only the
				// absence of data races and panics matters here.
				_, _ = shared.Parse(context.Background(), &mockEngine{pageCount: 2})
			}
		}()
	}
	wg.Wait()
}