// ragflow/internal/deepdoc/parser/pdf/table_parity_issues_test.go

//go:build manual

package parser

import (
	"bytes"
	"context"
	"encoding/base64"
	"image"
	"regexp"
	"strings"
	"testing"
)

// =============================================================================
// Issue 1: Figure insertion strategy
// Python's insert_table_figures(figs, "figure") inserts figure boxes back into
// self.boxes. Go's extractTableAndReplace only handles LayoutType=="table",
// leaving figure boxes in the list. This test documents the current behavior.
// =============================================================================

// TestExtractTableAndReplace_IgnoresFigures pins how extractTableAndReplace
// treats figure boxes: a box with LayoutType "figure" must come out of the call
// still present and with its text untouched. Python's _extract_table_figure
// instead pops figure boxes and re-inserts them, with cropped images, through
// insert_table_figures.
func TestExtractTableAndReplace_IgnoresFigures(t *testing.T) {
	const figureText = "Figure text"

	figureBox := TextBox{X0: 10, X1: 200, Top: 0, Bottom: 50, Text: figureText, LayoutType: "figure", PageNumber: 0}
	tableBox := TextBox{X0: 10, X1: 200, Top: 60, Bottom: 80, Text: "表1：标题", LayoutType: "table", PageNumber: 0}

	// The table item is given a cell so that extractTableAndReplace has
	// something to render as HTML.
	tableItem := TableItem{
		Cells:     []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "A", Label: "table row"}},
		Positions: []Position{{Left: 0, Right: 300, Top: 0, Bottom: 100}},
		Scale:     1.0,
	}

	out := extractTableAndReplace([]TextBox{figureBox, tableBox}, []TableItem{tableItem})

	// Every surviving figure box must still carry the raw input text; Python
	// would have replaced it with a consolidated image+text block.
	figureSeen := false
	for i := range out {
		if out[i].LayoutType != "figure" {
			continue
		}
		figureSeen = true
		if got := out[i].Text; got != figureText {
			t.Errorf("figure text should be unchanged, got %q", got)
		}
	}
	if !figureSeen {
		t.Error("BUG EXPOSED: extractTableAndReplace removed figure box (unexpected)")
	}
	t.Log("NOTE: Figure box remains in list as raw text. Python inserts figures back with cropped images via insert_table_figures. Go collects figures separately via CollectFigures without re-inserting.")
}

// TestBoxesToSections_FiguresNotReinserted feeds one text box and one figure
// box through boxesToSections and CollectFigures and checks that exactly one
// figure with LayoutType "figure" is collected. It then logs whether that
// figure already carries an Image. Python's insert_table_figures would attach a
// consolidated cropped image and re-insert the figure among the text boxes.
func TestBoxesToSections_FiguresNotReinserted(t *testing.T) {
	// Boxes as they look after extractTableAndReplace: the figure is still
	// part of the list.
	input := []TextBox{
		{X0: 10, X1: 200, Top: 0, Bottom: 50, Text: "Some text", LayoutType: "text", PageNumber: 0},
		{X0: 10, X1: 200, Top: 60, Bottom: 100, Text: "Figure description", LayoutType: "figure", PageNumber: 0},
	}

	sections := boxesToSections(input, nil)
	figures := CollectFigures(sections)

	// Python's insert_table_figures(figs, "figure") builds new boxes with
	// layout_type="figure" and image=cropped_img and places them next to the
	// nearest text box. Here the figures are only collected on the side.
	if n := len(figures); n != 1 {
		t.Fatalf("expected 1 figure, got %d", n)
	}
	fig := figures[0]
	if got := fig.LayoutType; got != "figure" {
		t.Errorf("expected LayoutType 'figure', got %q", got)
	}

	// Either outcome is only logged; an empty Image is expected at this stage.
	if fig.Image == "" {
		t.Log("NOTE: Figure section has no Image yet. Python's cropout creates a consolidated cropped image for the entire figure region before insert_table_figures.")
	} else {
		t.Log("figure has image (cropSectionImage already ran)")
	}

	t.Logf("Sections count: %d (figure present as raw text section)", len(sections))
	t.Logf("Figures count: %d (collected separately, Python re-inserts them)", len(figures))
}

// =============================================================================
// Issue 2a: blockType classification
// Python's construct_table classifies each cell into one of 10 types (Dt/Nu/Ca/
// En/NE/Sg/Tx/Lx/Nr/Ot). The dominant type drives header detection: if max_type
// is "Nu" (numeric), numeric cells don't count as headers. Go's headerSet
// originally checked only TSR labels, with no cell content type analysis; the
// tests below exercise the blockType-aware path and its TSR-label fallback.
// =============================================================================

// TestConstructTable_HeaderDetection_NoBlockType builds a 2×2 table whose
// header row is numeric ("2020", "2021") but labelled "table column header",
// and expects constructTable to still emit two "<th " cells. Per the notes this
// test was written against, block-type-aware detection skips numeric headers
// when the dominant type is "Nu", and the TSR-label fallback then marks the
// header row.
func TestConstructTable_HeaderDetection_NoBlockType(t *testing.T) {
	const (
		headerLabel = "table column header"
		rowLabel    = "table row"
	)

	// Every cell is numeric, so the dominant block type is expected to be Nu
	// and the primary pass should find no headers; only the labels say
	// "header".
	grid := []TSRCell{
		{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "2020", Label: headerLabel},
		{X0: 101, Y0: 0, X1: 200, Y1: 30, Text: "2021", Label: headerLabel},
		{X0: 0, Y0: 35, X1: 100, Y1: 65, Text: "100", Label: rowLabel},
		{X0: 101, Y0: 35, X1: 200, Y1: 65, Text: "200", Label: rowLabel},
	}

	html := constructTable(grid, nil, "", &TableItem{})

	// Both header-labelled cells must be rendered as <th >.
	if got := strings.Count(html, "<th "); got != 2 {
		t.Errorf("expected 2 <th >, got %d. HTML: %s", got, html)
	}

	t.Log("FIX: blockType classification added. maxType=Nu skips Nu headers in primary pass.")
	t.Log("TSR label fallback still marks header rows with 'header' in label.")
}

// TestConstructTable_BlockType_DominantTypeMissing builds a table with two
// short-text header cells over three rows of numeric data and expects
// constructTable to render exactly two "<th " cells. The intent recorded by the
// test is that non-numeric headers still count as headers when numeric cells
// dominate the table. It also logs what blockType returns for one header and
// one data value.
func TestConstructTable_BlockType_DominantTypeMissing(t *testing.T) {
	const (
		headerLabel = "table column header"
		rowLabel    = "table row"
	)

	// Header row: "年份"/"金额" (expected Tx). Data rows: all numeric (expected
	// Nu), which makes Nu the expected dominant type.
	cells := []TSRCell{
		{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "年份", Label: headerLabel},
		{X0: 101, Y0: 0, X1: 200, Y1: 30, Text: "金额", Label: headerLabel},
		{X0: 0, Y0: 35, X1: 100, Y1: 65, Text: "2020", Label: rowLabel},
		{X0: 101, Y0: 35, X1: 200, Y1: 65, Text: "1000", Label: rowLabel},
		{X0: 0, Y0: 70, X1: 100, Y1: 100, Text: "2021", Label: rowLabel},
		{X0: 101, Y0: 70, X1: 200, Y1: 100, Text: "2000", Label: rowLabel},
		{X0: 0, Y0: 105, X1: 100, Y1: 135, Text: "2022", Label: rowLabel},
		{X0: 101, Y0: 105, X1: 200, Y1: 135, Text: "3000", Label: rowLabel},
	}

	html := constructTable(cells, nil, "", &TableItem{})

	if got := strings.Count(html, "<th "); got != 2 {
		t.Errorf("expected 2 <th > for non-numeric headers under Nu-dominant table, got %d. HTML: %s", got, html)
	}

	t.Log("FIX: blockType classifies '年份'/'金额' as non-Nu headers, '2020'/'1000' as Nu data.")
	t.Logf("blockType('年份')=%q blockType('2020')=%q", blockType("年份"), blockType("2020"))
}

// TestConstructTable_BlockTypeChangesHeaderDetection renders a table in which
// every cell carries the plain "table row" label, so header detection has no
// TSR header label to rely on. The test makes no assertions: it only logs the
// HTML that constructTable produces, for manual inspection of which row ends up
// as the header.
func TestConstructTable_BlockTypeChangesHeaderDetection(t *testing.T) {
	const rowLabel = "table row"

	// Expected block types, per the analysis this test was written against:
	//   "姓名"(Tx), "年龄"(Tx), "张三"(Ot), "25"(Nu), "李四"(Ot), "30"(Nu),
	//   "王五"(Ot), "28"(Nu)
	// so the dominant type could be Ot(3), Nu(3), or Tx(2). A fallback is
	// expected to cover the case where the block-type path finds no headers.
	cells := []TSRCell{
		{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "姓名", Label: rowLabel},
		{X0: 101, Y0: 0, X1: 200, Y1: 30, Text: "年龄", Label: rowLabel},
		{X0: 0, Y0: 35, X1: 100, Y1: 65, Text: "张三", Label: rowLabel},
		{X0: 101, Y0: 35, X1: 200, Y1: 65, Text: "25", Label: rowLabel},
		{X0: 0, Y0: 70, X1: 100, Y1: 100, Text: "李四", Label: rowLabel},
		{X0: 101, Y0: 70, X1: 200, Y1: 100, Text: "30", Label: rowLabel},
		{X0: 0, Y0: 105, X1: 100, Y1: 135, Text: "王五", Label: rowLabel},
		{X0: 101, Y0: 105, X1: 200, Y1: 135, Text: "28", Label: rowLabel},
	}

	// The item is pre-populated with the row grouping of the same cells.
	item := &TableItem{Grid: groupTSRCellsToRowsLabeled(cells)}
	rendered := constructTable(cells, nil, "", item)

	t.Logf("HTML:\n%s", rendered)
	t.Log("FIX: blockType+fallback header detection works for tables without TSR header labels")
}

// =============================================================================
// Issue 2b: colspan/rowspan
// Python's __cal_spans computes colspan/rowspan from spanning cells by
// clustering column centers and row centers. Go's rowsToHTML, when called
// without span information, produces a flat grid with no spanning attributes;
// constructTable is expected to emit colspan for spanning cells via calSpans.
// =============================================================================

// TestRowsToHTML_NoColspanRowspan calls rowsToHTML directly, with nil for its
// last three arguments, on a 2×2 grid whose first cell is labelled
// "table spanning cell". It asserts that the output contains neither "colspan"
// nor "rowspan", then logs how many "<td " cells were rendered.
func TestRowsToHTML_NoColspanRowspan(t *testing.T) {
	// Row 0 holds the spanning cell plus a padded empty cell; in Python a
	// spanning cell covering columns 0-1 would receive colspan=2.
	header := []TSRCell{
		{Text: "跨列标题", Label: "table spanning cell"},
		{Text: "", Label: ""}, // padded cell
	}
	data := []TSRCell{
		{Text: "数据A", Label: "table row"},
		{Text: "数据B", Label: "table row"},
	}

	html := rowsToHTML([][]TSRCell{header, data}, "", nil, nil, nil)

	// Neither spanning attribute may show up in the flat output.
	for _, check := range []struct{ attr, msg string }{
		{"colspan", "unexpected: colspan found in output (should not be present without __cal_spans)"},
		{"rowspan", "unexpected: rowspan found in output (should not be present without __cal_spans)"},
	} {
		if strings.Contains(html, check.attr) {
			t.Error(check.msg)
		}
	}

	// The cell count is informational only: the spanning cell and its padded
	// neighbour are both rendered, where Python would merge them.
	if n := strings.Count(html, "<td "); n != 4 {
		t.Logf("Got %d <td > cells. HTML:\n%s", n, html)
	} else {
		t.Logf("Got %d <td > cells (flat grid, spanning cell + padded empty cell both rendered)", n)
	}

	t.Log("NOTE: Python's __cal_spans clusters column centers within spanning cells")
	t.Log("to compute colspan/rowspan. Go outputs a flat grid without spanning attributes.")
}

// TestConstructTable_SpannedTable_NoMerge runs the full constructTable path on
// a table containing a spanning cell and asserts that the resulting HTML
// carries a colspan attribute and still contains both the spanning cell's text
// and the "Q1" cell it overlaps. Despite the historical name, the test expects
// spans to be detected.
func TestConstructTable_SpannedTable_NoMerge(t *testing.T) {
	// The spanning cell shares its Y range with the row-0 cells so that row
	// grouping places all three in the same row. It covers X=0-200, i.e. both
	// columns; Python's __cal_spans would assign it colspan=2.
	spanning := TSRCell{X0: 0, Y0: 0, X1: 200, Y1: 30, Text: "部门开支汇总", Label: "table spanning cell"}
	cells := []TSRCell{
		spanning,
		// Row 0: regular cells underneath the span.
		{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "Q1", Label: "table row"},
		{X0: 101, Y0: 0, X1: 200, Y1: 30, Text: "Q2", Label: "table row"},
		// Row 1: data.
		{X0: 0, Y0: 35, X1: 100, Y1: 65, Text: "100", Label: "table row"},
		{X0: 101, Y0: 35, X1: 200, Y1: 65, Text: "200", Label: "table row"},
	}

	html := constructTable(cells, nil, "", &TableItem{})

	// Each entry pairs a substring that must be present with the failure
	// message reported when it is absent.
	required := []struct{ needle, msg string }{
		{"colspan", "expected colspan on spanning cell, calSpans should detect it"},
		{"部门开支汇总", "spanning cell text missing"},
		{"Q1", "Q1 cell should still be present (covered by span)"},
	}
	for _, r := range required {
		if !strings.Contains(html, r.needle) {
			t.Error(r.msg)
		}
	}
	t.Logf("HTML:\n%s", html)
}

// =============================================================================
// Issue 2c: Single column/row cleanup missing
// Python's construct_table removes orphan columns (only one non-empty cell)
// when ≥4 rows, and orphan rows when ≥4 columns. Go has no such cleanup.
// =============================================================================

// TestConstructTable_OrphanColumn_NotCleanedUp renders a 4-row × 3-column table
// whose middle column has a single non-empty cell and logs the resulting row
// and cell counts together with the HTML. It makes no assertions; it records
// the output so it can be compared with Python's construct_table, which drops
// such orphan columns.
func TestConstructTable_OrphanColumn_NotCleanedUp(t *testing.T) {
	const (
		headerLabel = "table column header"
		rowLabel    = "table row"
	)

	// Column index 1 is populated only in the first row ("备注"); Python
	// would relocate or merge that orphan column.
	cells := []TSRCell{
		{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "姓名", Label: headerLabel},
		{X0: 101, Y0: 0, X1: 200, Y1: 30, Text: "备注", Label: rowLabel}, // orphan col
		{X0: 201, Y0: 0, X1: 300, Y1: 30, Text: "年龄", Label: headerLabel},
		{X0: 0, Y0: 35, X1: 100, Y1: 65, Text: "张三", Label: rowLabel},
		{X0: 101, Y0: 35, X1: 200, Y1: 65, Text: "", Label: rowLabel}, // col 1 empty
		{X0: 201, Y0: 35, X1: 300, Y1: 65, Text: "25", Label: rowLabel},
		{X0: 0, Y0: 70, X1: 100, Y1: 100, Text: "李四", Label: rowLabel},
		{X0: 101, Y0: 70, X1: 200, Y1: 100, Text: "", Label: rowLabel}, // col 1 empty
		{X0: 201, Y0: 70, X1: 300, Y1: 100, Text: "30", Label: rowLabel},
		{X0: 0, Y0: 105, X1: 100, Y1: 135, Text: "王五", Label: rowLabel},
		{X0: 101, Y0: 105, X1: 200, Y1: 135, Text: "", Label: rowLabel}, // col 1 empty
		{X0: 201, Y0: 105, X1: 300, Y1: 135, Text: "28", Label: rowLabel},
	}

	html := constructTable(cells, nil, "", &TableItem{})

	// Count rows and cells (data plus header) purely for the log output.
	rows := strings.Count(html, "<tr>")
	cellTotal := strings.Count(html, "<td ")
	cellTotal += strings.Count(html, "<th ")

	t.Logf("Rows: %d, Total cells: %d (Python would cleanup orphan columns)", rows, cellTotal)
	t.Log("NOTE: Python's construct_table removes columns with only one non-empty cell")
	t.Log("when there are ≥4 rows, and removes rows with only one non-empty cell")
	t.Log("when there are ≥4 columns. Go has no equivalent cleanup.")
	t.Logf("HTML:\n%s", html)
}

// =============================================================================
// Issue 2d: is_caption pattern matching in mergeCaptions
// Python's is_caption detects captions by text patterns (图表, Fig., Table, etc.)
// AND layout_type. Go's mergeCaptions originally checked only LayoutType, so a
// caption that DLA labels as "text" was missed; the tests below expect such a
// caption to be caught through isCaptionBox text-pattern matching.
// =============================================================================

// TestMergeCaptions_NoIsCaptionPatternMatch checks that mergeCaptions
// recognizes a caption by its text even when its LayoutType is "text". A table
// section followed by the section "表1：测试数据" goes in; afterwards the
// caption text must appear inside a table section and the standalone caption
// section must be gone.
func TestMergeCaptions_NoIsCaptionPatternMatch(t *testing.T) {
	// The table section, followed by a section that reads like a table
	// caption but that DLA labelled "text" (happens with imperfect DLA).
	// Python's is_caption matches the "表1：测试数据" pattern regardless of
	// layout_type.
	tableSection := Section{Text: "T", LayoutType: "table", Positions: []Position{
		{PageNumbers: []int{0, 0}, Left: 10, Right: 100, Top: 0, Bottom: 30},
	}}
	captionSection := Section{Text: "表1：测试数据", LayoutType: "text", Positions: []Position{
		{PageNumbers: []int{0, 0}, Left: 10, Right: 100, Top: 40, Bottom: 55},
	}}
	sections := []Section{tableSection, captionSection}

	out := mergeCaptions(sections, CollectFigures(sections))

	// First pass: the caption text must now live inside a table section.
	captionMerged := false
	for i := range out {
		if out[i].LayoutType != "table" || !strings.Contains(out[i].Text, "表1：测试数据") {
			continue
		}
		captionMerged = true
		t.Log("FIX VERIFIED: caption with LayoutType='text' detected via isCaptionBox and merged into table")
	}
	if !captionMerged {
		t.Error("FIX FAILED: caption '表1：测试数据' should be merged into table via isCaptionBox pattern matching")
	}

	// Second pass: no standalone caption section may remain.
	for i := range out {
		if out[i].LayoutType == "text" && out[i].Text == "表1：测试数据" {
			t.Error("FIX FAILED: caption section should be removed after merge")
		}
	}
}

// TestIsCaptionBox_MatchesChinesePattern is a table-driven check of
// isCaptionBox. It covers Chinese and English caption prefixes with an empty
// layout type, ordinary text that must not match, and the two caption layout
// types that must match whatever the text is.
func TestIsCaptionBox_MatchesChinesePattern(t *testing.T) {
	type captionCase struct {
		text       string
		layoutType string
		want       bool
	}

	cases := []captionCase{
		// Text-pattern detection with no layout hint.
		{text: "表1：交通工具等级", want: true},
		{text: "表 1：测试数据", want: true},
		{text: "图1：系统架构", want: true},
		{text: "图表 3: 实验结果", want: true},
		{text: "Fig. 1: Architecture", want: true},
		{text: "Figure 2: Pipeline", want: true},
		{text: "Table 3: Results", want: true},
		{text: "普通文本", want: false},
		{text: "", want: false},
		{text: "第一章 概述", want: false},
		// LayoutType-based detection.
		{text: "anything", layoutType: "figure caption", want: true},
		{text: "anything", layoutType: "table caption", want: true},
	}

	for i := range cases {
		c := cases[i]
		if got := isCaptionBox(c.text, c.layoutType); got != c.want {
			t.Errorf("isCaptionBox(%q, %q) = %v, want %v", c.text, c.layoutType, got, c.want)
		}
	}

	t.Log("NOTE: isCaptionBox is now called by mergeCaptions via captionKind for DLA-mislabeled captions.")
}

// TestFigureInsertion_EndToEnd runs the full Parse pipeline on a PDF with
// a figure DLA region containing TWO text boxes far enough apart that
// NaiveVerticalMerge won't merge them.  Python's _extract_table_figure +
// insert_table_figures pops ALL figure boxes and re-inserts ONE unified
// figure block regardless of text box positions.  Go leaves the individual
// text boxes as separate sections — this test FAILS to expose that.
func TestFigureInsertion_EndToEnd(t *testing.T) {
	eng := &mockEngine{
		pageCount: 1,
		renderW:   1800, renderH: 2400,
		chars: map[int][]TextChar{0: {
			// Two text boxes in the SAME figure DLA region, but far apart.
			// DLA pixel: X=100-500 Y=80-600 → PDF 33-167 x 27-200.
			// Box 1 near top, box 2 near bottom.
			{X0: 50, X1: 150, Top: 40, Bottom: 55, Text: "架构图"},
			{X0: 50, X1: 150, Top: 170, Bottom: 185, Text: "系统模块"},
		}},
	}
	mock := &MockDocAnalyzer{
		Healthy: true,
		DLARegions: []DLARegion{
			// Large figure region covering both text boxes.
			{X0: 100, Y0: 80, X1: 500, Y1: 600, Label: "figure", Confidence: 0.9},
		},
	}
	p := NewParser(DefaultParserConfig(), mock)

	result, err := p.Parse(context.Background(), eng)
	if err != nil {
		t.Fatalf("Parse: %v", err)
	}

	// ── Python behavior: _extract_table_figure + insert_table_figures ──
	// Pops ALL figure boxes regardless of position, cropout creates ONE
	// consolidated image covering the entire DLA figure region, and
	// insert_table_figures re-inserts ONE figure block.
	// Expected: 1 figure section with combined text + cropped image.

	// ── Go current behavior ──
	// Figure boxes stay in list.  NaiveVerticalMerge may NOT merge them
	// if the gap is too large (> 1.5 × median_height ≈ 15pt).
	// Each figure text box → separate section in result.Sections.
	// CollectFigures collects them into result.Figures but doesn't re-insert.

	var figureSections []Section
	for _, s := range result.Sections {
		if s.LayoutType == "figure" {
			figureSections = append(figureSections, s)
		}
	}

	// Assert 1: Python expects exactly 1 consolidated figure section.
	// Go currently produces 2 (one per unmerged text box) — this FAILS.
	if len(figureSections) != 1 {
		t.Errorf("FIGURE INSERTION BUG: expected 1 consolidated figure section (Python insert_table_figures), got %d. Go does not consolidate figure text boxes into a single block.", len(figureSections))
	}

	// Assert 2: The single figure section must contain BOTH text fragments.
	if len(figureSections) == 1 {
		combined := figureSections[0].Text
		if !strings.Contains(combined, "架构图") || !strings.Contains(combined, "系统模块") {
			t.Errorf("FIGURE INSERTION BUG: figure section text=%q should contain both fragments. Python merges all figure-region text.", combined)
		}
	}

	t.Logf("figure sections in Sections: %d", len(figureSections))
	t.Logf("result.Figures count: %d", len(result.Figures))
	t.Logf("result.Sections total: %d", len(result.Sections))
	for i, s := range result.Sections {
		t.Logf("  section[%d] layout=%q text=%q", i, s.LayoutType, s.Text)
	}
}

// =============================================================================
// Issue 3: Multi-page table merging
// Python's _extract_table_figure merges tables with same layoutno across
// consecutive pages (gap ≤ 1 page, Y-dis ≤ 23× median height).
// Go's extractTableAndReplace does NOT merge tables across pages.
// =============================================================================

// TestExtractTableAndReplace_NoCrossPageMerge checks Python parity for tables
// that continue across a page break: two table boxes with the same LayoutNo on
// consecutive pages, each backed by its own TableItem, should come out of
// extractTableAndReplace as exactly one HTML table.
//
// The assertion requires exactly one "<table>" box. Failing only on a count of
// two would let a result with zero tables (nothing rendered) or more than two
// pass unnoticed.
func TestExtractTableAndReplace_NoCrossPageMerge(t *testing.T) {
	// A table spanning pages 0 and 1. Python merges these because they share a
	// layoutno, sit on consecutive pages, and are within 23× median_height of
	// each other vertically.
	boxes := []TextBox{
		{X0: 10, X1: 200, Top: 500, Bottom: 530, Text: "续表内容", LayoutType: "table", PageNumber: 0, LayoutNo: "0"},
		{X0: 10, X1: 200, Top: 50, Bottom: 80, Text: "表尾内容", LayoutType: "table", PageNumber: 1, LayoutNo: "0"},
	}

	// Two separate TableItems, one per page. Python would merge these before
	// insert_table_figures.
	tables := []TableItem{
		{
			Cells:     []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "Page0", Label: "table row"}},
			Positions: []Position{{PageNumbers: []int{0}, Left: 0, Right: 300, Top: 500, Bottom: 530}},
			Scale:     1.0,
		},
		{
			Cells:     []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "Page1", Label: "table row"}},
			Positions: []Position{{PageNumbers: []int{1}, Left: 0, Right: 300, Top: 50, Bottom: 80}},
			Scale:     1.0,
		},
	}

	result := extractTableAndReplace(boxes, tables)

	// Count the boxes whose text was replaced by rendered table HTML.
	tableCount := 0
	for _, b := range result {
		if strings.Contains(b.Text, "<table>") {
			tableCount++
		}
	}
	if tableCount != 1 {
		t.Errorf("CROSS-PAGE TABLE MERGE BUG: expected 1 merged HTML table, got %d. Python would merge same-layoutno tables on consecutive pages into 1 consolidated table.", tableCount)
	}
	t.Logf("table HTML boxes: %d (Python would merge into 1)", tableCount)
}

// =============================================================================
// Issue 3a: nomerge_lout_no — don't merge tables separated by captions
// Python's _extract_table_figure tracks nomerge_lout_no: when a table box
// is followed by a caption/title/reference, the table's key is added to
// nomerge_lout_no. Later, cross-page merge skips tables in nomerge_lout_no.
//
// Example:
//   Page 0: table "0-table-3" → caption "表1：..." → table "0-table-4"
//   Page 1: table "1-table-3" (same layoutNo)
//   → Page 0's table-3 should NOT merge with Page 1's table-3,
//     because the caption on page 0 indicates the table ended.
//   → Go's mergeTablesAcrossPages originally had no nomerge_lout_no check; the
//     test below expects TableItem.NoMerge to provide it.
// =============================================================================

// TestMergeTablesAcrossPages_NomergeAfterCaption_Missing verifies that
// mergeTablesAcrossPages honors TableItem.NoMerge, the Go counterpart of
// Python's nomerge_lout_no. A page-0 table flagged NoMerge and a page-1 table
// at the same position must stay two separate groups.
//
// The test stops at the failed assertion so that the success message is only
// logged when the check actually passed.
func TestMergeTablesAcrossPages_NomergeAfterCaption_Missing(t *testing.T) {
	// Scenario: page 0 has a table followed by a caption, and page 1 has a
	// table that would otherwise look like its continuation. In Python the
	// page-0 table lands in nomerge_lout_no because the next box is a caption,
	// so it is excluded from cross-page merging.
	tables := []TableItem{
		{
			Cells: []TSRCell{{Text: "Page0-first", Label: "table row"}},
			Positions: []Position{{
				PageNumbers: []int{0},
				Left:        0, Right: 300,
				Top: 0, Bottom: 50,
			}},
			NoMerge: true, // Set when a caption follows this table on the page.
		},
		{
			Cells: []TSRCell{{Text: "Page1-cont", Label: "table row"}},
			Positions: []Position{{
				PageNumbers: []int{1},
				Left:        0, Right: 300,
				Top: 0, Bottom: 50,
			}},
		},
	}

	result := mergeTablesAcrossPages(tables, nil)

	// NoMerge must keep the two tables apart.
	if len(result) != 2 {
		t.Fatalf("NOMERGE BUG: expected 2 separate table groups, got %d.", len(result))
	}
	t.Log("NoMerge flag correctly prevents cross-page merge.")
}

// =============================================================================
// Issue 3b: insert position — min_rectangle_distance vs anchor
// Python's insert_table_figures uses min_rectangle_distance to find the
// spatially nearest text box and inserts the table/figure next to it.
// Go's extractTableAndReplace uses the first replaced table box index as
// the anchor (insert position).
//
// When the DLA table region extends beyond the anchor box's bottom and
// overlaps a text box below the table, Python puts the table next to that
// overlapping text box (distance=0); Go puts it at the anchor position.
// =============================================================================

// TestExtractTableAndReplace_InsertionPosition_DistanceBug checks where
// extractTableAndReplace places the rendered table relative to the surrounding
// text boxes. Python's insert_table_figures picks the spatially nearest text
// box via min_rectangle_distance; for this layout that is L0, so the table is
// expected directly after L0.
//
// Both L0 and the table box must be found in the result before their positions
// are compared. Without that guard a result that lost L0 but had the table at
// index 0 would satisfy tableIdx == l0Idx+1 and pass vacuously.
func TestExtractTableAndReplace_InsertionPosition_DistanceBug(t *testing.T) {
	// Two text boxes above the table: L0 (left, same column as the table) and
	// R0 (right, far away horizontally).
	// Python: nearest to the table is L0 (dx=0, dy=70). L0 bottom=30 < table
	// top=100, i.e. L0 lies above the table → insert AFTER L0.
	//   Result: [L0, table, R0, R1, L2].
	// Go with a first-table-box anchor (the table box is at index 2):
	//   Result: [L0, R0, table, R1, L2] — one position off.
	boxes := []TextBox{
		{X0: 10, X1: 100, Top: 10, Bottom: 30, Text: "L0", LayoutType: "text", PageNumber: 0},
		{X0: 300, X1: 400, Top: 10, Bottom: 30, Text: "R0", LayoutType: "text", PageNumber: 0},
		{X0: 10, X1: 100, Top: 100, Bottom: 130, Text: "table", LayoutType: "table", PageNumber: 0},
		{X0: 300, X1: 400, Top: 100, Bottom: 130, Text: "R1", LayoutType: "text", PageNumber: 0},
		{X0: 10, X1: 100, Top: 250, Bottom: 270, Text: "L2", LayoutType: "text", PageNumber: 0},
	}

	tables := []TableItem{{
		Cells:      []TSRCell{{Text: "cell", Label: "table row"}},
		Positions:  []Position{{Left: 10, Right: 100, Top: 100, Bottom: 130, PageNumbers: []int{0}}},
		Scale:      1.0,
		RegionLeft: 10, RegionRight: 100, RegionTop: 100, RegionBottom: 130,
	}}

	result := extractTableAndReplace(boxes, tables)

	// Locate L0 and the table box; -1 means "not found". If several boxes
	// match, the last one wins.
	l0Idx, tableIdx := -1, -1
	for i, b := range result {
		if strings.TrimSpace(b.Text) == "L0" {
			l0Idx = i
		}
		if b.LayoutType == "table" {
			tableIdx = i
		}
	}
	if l0Idx < 0 || tableIdx < 0 {
		t.Fatalf("result is missing an expected box: L0 idx=%d, table idx=%d (result has %d boxes)", l0Idx, tableIdx, len(result))
	}

	// The table should immediately follow L0 (nearest neighbour, inserted
	// after it because L0 is above the table).
	if tableIdx != l0Idx+1 {
		t.Errorf("INSERTION POSITION BUG: table (idx=%d) should immediately follow L0 (idx=%d). "+
			"Python's min_rectangle_distance finds L0 as nearest text box and inserts table after it. "+
			"Go anchors at first table box position (between R0 and R1).", tableIdx, l0Idx)
	}
	t.Logf("L0 at idx=%d, table at idx=%d", l0Idx, tableIdx)
	t.Log("Fix: replace first-replaced-box anchor with min_rectangle_distance nearest-neighbor (Python pdf_parser.py:1608-1655).")
}

// =============================================================================
// Issue 4: page_cum_height coordinate system
// Python tracks cumulative page image heights for cross-page position tags
// and image cropping. Go uses per-page coordinates only.
// =============================================================================

// TestBoxesToSections_PerPageCoordinates confirms that position tags use
// page-relative coordinates: two boxes with identical Top/Bottom on pages 0 and
// 1 must yield sections whose first Position has the same Top and Bottom.
// Python's _line_tag also produces local coordinates (it subtracts
// page_cum_height), so the page number alone tells the pages apart.
//
// If either section carries no Positions there is nothing to compare. The test
// then reports itself as skipped instead of passing silently, and the "OK"
// message is only logged when the comparison succeeded.
func TestBoxesToSections_PerPageCoordinates(t *testing.T) {
	boxes := []TextBox{
		{X0: 10, X1: 100, Top: 40, Bottom: 60, Text: "Page 0 text", LayoutType: "text", PageNumber: 0},
		{X0: 10, X1: 100, Top: 40, Bottom: 60, Text: "Page 1 text", LayoutType: "text", PageNumber: 1},
	}
	sections := boxesToSections(boxes, nil)
	if len(sections) != 2 {
		t.Fatalf("expected 2 sections, got %d", len(sections))
	}

	s0, s1 := sections[0], sections[1]
	if len(s0.Positions) == 0 || len(s1.Positions) == 0 {
		// Make the vacuous case visible without turning it into a failure.
		t.Skipf("sections carry no positions (page 0: %d, page 1: %d); nothing to compare", len(s0.Positions), len(s1.Positions))
	}

	p0, p1 := s0.Positions[0], s1.Positions[0]
	t.Logf("page 0: Page=%v Top=%.0f Bottom=%.0f", p0.PageNumbers, p0.Top, p0.Bottom)
	t.Logf("page 1: Page=%v Top=%.0f Bottom=%.0f", p1.PageNumbers, p1.Top, p1.Bottom)

	// Both Python and Go use local (page-relative) coordinates. Python's
	// _line_tag computes top = bx["top"] - page_cum_height[pn-1].
	if p0.Top != p1.Top || p0.Bottom != p1.Bottom {
		t.Fatalf("expected same local coords, got Top=(%.0f,%.0f) Bottom=(%.0f,%.0f)", p0.Top, p1.Top, p0.Bottom, p1.Bottom)
	}
	t.Log("OK: position tags use page-relative coordinates in both Go and Python.")
}

// =============================================================================
// Issue 6: cropSectionImage padding logic
// Python's self.crop adds 120px context above first segment, 120px context
// below last segment, 6px gap between pages, and overlay transparency.
// Go has simpler crop logic.
// =============================================================================

// TestCropSectionImage_PaddingVsPython crops a small text region out of a
// blank page image with cropSectionImage, decodes the returned base64 payload
// and asserts that the cropped image is taller than the bare text region, i.e.
// that some context padding was added. Python's self.crop adds its padding in
// pixel space; the exact padded height is not asserted here.
//
// An empty result aborts the test immediately. Continuing would decode an
// empty payload and report a second, misleading "failed to decode PNG" failure.
func TestCropSectionImage_PaddingVsPython(t *testing.T) {
	// 300×800 page image at zoom=3 → PDF 100×267.
	img := image.NewRGBA(image.Rect(0, 0, 300, 800))
	pageImages := map[int]image.Image{0: img}

	// Position tag for a small text box near the top of the page.
	posTag := FormatPositionTag(0, 50.0, 100.0, 10.0, 30.0)

	result := cropSectionImage(posTag, pageImages, 3.0)
	if result == "" {
		t.Fatal("cropSectionImage returned empty string for valid position")
	}

	// Decode the result to inspect the image dimensions.
	data, err := base64.StdEncoding.DecodeString(result)
	if err != nil {
		t.Fatalf("failed to decode base64: %v", err)
	}
	// NOTE(review): image.Decode only recognizes formats whose decoder has
	// been registered by importing the matching image/* package. This file
	// does not import image/png itself, so it presumably relies on another
	// file of the package doing so — confirm.
	cropped, _, err := image.Decode(bytes.NewReader(data))
	if err != nil {
		t.Fatalf("failed to decode PNG: %v", err)
	}
	croppedH := cropped.Bounds().Dy()

	// Original text region: Top=10, Bottom=30 → height=20 PDF points, which is
	// 60px at zoom=3.
	// Python adds 120px context above + 120px below + 6px gap → ~306px.
	// Go is described as padding by contextPad=120 PDF points on each side
	// (360+60+360=780px at zoom=3); only "taller than the text region" is
	// checked here.
	const expectedMin = 60 // bare minimum: the text region itself
	if croppedH <= expectedMin {
		t.Errorf("CROP PADDING BUG: cropped image height=%dpx, expected >%dpx with context padding. Python adds 120px above and below for context.", croppedH, expectedMin)
	}
	t.Logf("cropped image: %dx%d (text region 60px, expecting padding)", cropped.Bounds().Dx(), croppedH)
	t.Log("NOTE: Python's self.crop adds 120px context padding in pixel space, multi-page stitching, and overlay transparency. Go's cropSectionImage uses PDF-point padding and simpler stitching.")
}

// =============================================================================
// Issue 7: Data-source filter missing
// Python's _extract_table_figure pops table/figure boxes matching
// r"(数据|资料|图表)*来源[:： ]" (pdf_parser.py:1040-1042, 1050-1052).
// These boxes are discarded — not extracted, not inserted back.
// Go has no equivalent filter in extractTableAndReplace or consolidateFigures.
// =============================================================================

// dataSourcePattern is a Go translation of Python's
// r"(数据|资料|图表)*来源[:： ]" used with re.match (anchored at start).
// The leading ^ reproduces re.match's start-of-string anchoring, because
// Go's MatchString would otherwise report a match anywhere in the input.
var dataSourcePattern = `^(数据|资料|图表)*来源[:： ]`

// TestDataSourcePattern_RegexCoverage validates the Python regex behavior
// that should be adopted. Documents which strings match and which don't.
// Each table row pairs an input text with the truthiness Python's re.match
// yields for it; the test fails for every row where dataSourcePattern
// disagrees.
func TestDataSourcePattern_RegexCoverage(t *testing.T) {
	// Compile once up front: the pattern is the same for every table row.
	re := regexp.MustCompile(dataSourcePattern)

	tests := []struct {
		text string
		want bool // Python re.match truthiness
	}{
		// ── Matching patterns (should be filtered) ──
		{"数据来源：国家统计局", true}, // 数据 + 来源 + fullwidth colon
		{"资料来源: 某报告", true},  // 资料 + 来源 + halfwidth colon
		{"图表来源：某数据库", true},  // 图表 + 来源 + fullwidth colon
		{"来源：权威机构", true},    // zero prefix + 来源 + fullwidth colon
		{"来源: 参考数据", true},   // zero prefix + 来源 + halfwidth colon
		{"数据来源 说明", true},    // 数据 + 来源 + space

		// ── Non-matching patterns (should NOT be filtered) ──
		{"数据来源明细", false},          // 来源 followed by 明, not :：space
		{"普通来源说明", false},          // doesn't start with keyword
		{"数据", false},              // too short
		{"来源", false},              // 来源 but no :：space after
		{"资料来源说明", false},          // 来源 followed by 说, not :：space
		{"", false},                // empty
		{"TABLE 1: 数据来源统计", false}, // doesn't start with keyword
	}

	for _, tt := range tests {
		if matched := re.MatchString(tt.text); matched != tt.want {
			t.Errorf("dataSourcePattern.MatchString(%q) = %v, want %v", tt.text, matched, tt.want)
		}
	}
	t.Log("NOTE: Python re.match(r\"(数据|资料|图表)*来源[:： ]\", text) — anchored at start.")
	t.Log("Go regexp.MatchString equivalent with ^ prefix.")
}

// TestExtractTableAndReplace_DataSourceFilter_Missing exposes that Go does NOT
// filter out table boxes whose text matches r"(数据|资料|图表)*来源[:： ]".
// Python's _extract_table_figure pops these boxes from self.boxes without
// adding them to the tables dict (pdf_parser.py:1040-1042).
func TestExtractTableAndReplace_DataSourceFilter_Missing(t *testing.T) {
	// A table box with data-source text and a normal table box.
	// Both overlap a TableItem position, so both would be replaced with HTML.
	boxes := []TextBox{
		{X0: 10, X1: 200, Top: 0, Bottom: 50, Text: "数据来源：国家统计局", LayoutType: "table", PageNumber: 0},
		{X0: 10, X1: 200, Top: 60, Bottom: 80, Text: "表1：正常数据", LayoutType: "table", PageNumber: 0},
	}

	// Two TableItems — one per table box — so each would independently produce HTML.
	tables := []TableItem{
		{
			Cells:     []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "来源", Label: "table row"}},
			Positions: []Position{{Left: 0, Right: 300, Top: 0, Bottom: 50}},
			Scale:     1.0,
		},
		{
			Cells:     []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "正常", Label: "table row"}},
			Positions: []Position{{Left: 0, Right: 300, Top: 60, Bottom: 80}},
			Scale:     1.0,
		},
	}

	result := extractTableAndReplace(boxes, tables)

	// Python behavior: "数据来源：国家统计局" is popped from self.boxes,
	// NOT added to tables dict, NOT replaced with HTML. Gone entirely.
	// "表1：正常数据" is replaced with HTML as usual.
	// Expected result: exactly 1 HTML table box for the normal table.
	//
	// BUG: Go replaces both boxes with HTML tables. The data-source box
	// produces an HTML table with cell text "来源" — this should NOT exist.
	htmlTableCount := 0
	hasDataSourceTable := false
	for _, b := range result {
		if strings.Contains(b.Text, "<table>") {
			htmlTableCount++
			// The data-source table's cell text "来源" ends up in the HTML.
			// c.f. constructTable which uses TSRCell text, not box text.
			if strings.Contains(b.Text, ">来源<") {
				hasDataSourceTable = true
			}
		}
	}
	if htmlTableCount != 1 {
		t.Errorf("DATA SOURCE FILTER BUG: expected 1 HTML table (normal only), got %d. Python pops data-source table box entirely in _extract_table_figure (pdf_parser.py:1040-1042). Go replaces it with an HTML table.", htmlTableCount)
	}
	if hasDataSourceTable {
		t.Errorf("DATA SOURCE FILTER BUG: data-source table should NOT produce HTML output. Cell '来源' appears in HTML: Python discards these boxes, Go incorrectly constructs a table for them.")
	}

	t.Log("NOTE: Python filters table boxes matching r\"(数据|资料|图表)*来源[:： ]\" in _extract_table_figure.")
	t.Log("Go's extractTableAndReplace has no equivalent filter — data-source boxes get replaced with HTML instead of being discarded.")
}

// TestExtractTableAndReplace_DataSourceVariants tests multiple variants of
// the data-source pattern that should all be filtered.
func TestExtractTableAndReplace_DataSourceVariants(t *testing.T) {
	variants := []string{
		"数据来源：国家统计局",
		"资料来源: 某报告",
		"图表来源：某数据库",
		"来源：权威机构",
		"来源: 参考数据",
	}

	for _, variant := range variants {
		t.Run(variant, func(t *testing.T) {
			boxes := []TextBox{
				{X0: 10, X1: 200, Top: 0, Bottom: 50, Text: variant, LayoutType: "table", PageNumber: 0},
			}

			tables := []TableItem{{
				Cells:     []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "A", Label: "table row"}},
				Positions: []Position{{Left: 0, Right: 300, Top: 0, Bottom: 50}},
				Scale:     1.0,
			}}

			result := extractTableAndReplace(boxes, tables)

			// BUG: box with data-source text should be REMOVED entirely —
			// zero HTML output. Python pops these boxes without replacement.
			for _, b := range result {
				if strings.Contains(b.Text, "<table>") {
					t.Errorf("DATA SOURCE FILTER BUG: variant %q should be removed without HTML replacement. Python pops data-source table boxes entirely.", variant)
				}
			}
		})
	}
	t.Log("NOTE: All variants of r\"(数据|资料|图表)*来源[:： ]\" should be filtered by extractTableAndReplace.")
}

// TestConsolidateFigures_DataSourceFilter_Missing exposes that Go does NOT
// filter out figure boxes whose text matches r"(数据|资料|图表)*来源[:： ]".
// Python's _extract_table_figure pops these boxes from self.boxes without
// adding them to the figures dict (pdf_parser.py:1050-1052).
func TestConsolidateFigures_DataSourceFilter_Missing(t *testing.T) {
	boxes := []TextBox{
		{X0: 10, X1: 200, Top: 0, Bottom: 50, Text: "数据来源：某机构", LayoutType: "figure", PageNumber: 0, LayoutNo: "figure-0"},
		{X0: 10, X1: 200, Top: 60, Bottom: 80, Text: "架构图", LayoutType: "figure", PageNumber: 0, LayoutNo: "figure-0"},
	}

	result := consolidateFigures(boxes)

	// Python behavior: "数据来源：某机构" is popped from self.boxes,
	// NOT added to figures dict → gone entirely.
	// "架构图" is extracted normally.
	// Expected result: exactly 1 figure box with "架构图" text only.
	for _, b := range result {
		if strings.Contains(b.Text, "数据来源") || strings.Contains(b.Text, "某机构") {
			t.Errorf("DATA SOURCE FIGURE FILTER BUG: '数据来源：某机构' figure box should be removed entirely. Python pops data-source figure boxes in _extract_table_figure (pdf_parser.py:1050-1052). Go still includes it.")
		}
	}

	// Verify the normal figure box IS still present.
	foundFigure := false
	for _, b := range result {
		if strings.Contains(b.Text, "架构图") {
			foundFigure = true
		}
	}
	if !foundFigure {
		t.Error("normal figure box '架构图' should still be present")
	}

	t.Log("NOTE: Python filters figure boxes matching r\"(数据|资料|图表)*来源[:： ]\" in _extract_table_figure.")
	t.Log("Go's consolidateFigures has no equivalent filter.")
}