"))
}
if item.Rows[0][0] != "公司级领导人员(含公司董事、总监)" {
t.Errorf("row 0 col 0 = %q", item.Rows[0][0])
}
}
// TestGroupTSRCellsToRowsLabeled_DefaultTableLabel verifies that cells with
// the real TSR default label "table" (class 0) are grouped correctly.
// The current deepDocReRowHdr regex only matches ".* (row|header)" — it misses
// the default "table" label, causing gatherTSR to return empty and forcing
// a fallback to pure Y-based grouping (which loses R/C annotations).
func TestGroupTSRCellsToRowsLabeled_DefaultTableLabel(t *testing.T) {
cells := []pdf.TSRCell{
{X0: 10, Y0: 0, X1: 100, Y1: 30, Label: "table"},
{X0: 101, Y0: 0, X1: 200, Y1: 30, Label: "table"},
{X0: 10, Y0: 35, X1: 100, Y1: 65, Label: "table"},
{X0: 101, Y0: 35, X1: 200, Y1: 65, Label: "table"},
}
rows := GroupTSRCellsToRows(cells)
if len(rows) != 2 {
t.Fatalf("label %q: expected 2 rows, got %d (BUG: deepDocReRowHdr does not match %q)", "table", len(rows), "table")
}
if len(rows[0]) != 2 || len(rows[1]) != 2 {
t.Errorf("expected 2 cols/row, got %d/%d", len(rows[0]), len(rows[1]))
}
}
// TestGroupBoxesByRC_RDiffSplitsRows checks that GroupBoxesByRC yields one
// row per distinct R value: six boxes annotated R=0..5 must come back as six
// rows, even though they only occupy two Y bands.
func TestGroupBoxesByRC_RDiffSplitsRows(t *testing.T) {
	// Two visual lines (Y=0-30 and Y=35-65), three boxes each, every box
	// with its own R annotation.
	input := []pdf.TextBox{
		{X0: 10, X1: 90, Top: 0, Bottom: 30, Text: "A", R: 0, C: 0},
		{X0: 110, X1: 190, Top: 0, Bottom: 30, Text: "B", R: 1, C: 1},
		{X0: 210, X1: 290, Top: 0, Bottom: 30, Text: "C", R: 2, C: 2},
		{X0: 10, X1: 90, Top: 35, Bottom: 65, Text: "D", R: 3, C: 0},
		{X0: 110, X1: 190, Top: 35, Bottom: 65, Text: "E", R: 4, C: 1},
		{X0: 210, X1: 290, Top: 35, Bottom: 65, Text: "F", R: 5, C: 2},
	}

	got := len(GroupBoxesByRC(input))
	if got == 6 {
		return
	}
	t.Fatalf("expected 6 rows (R differs → split), got %d", got)
}
// TestGroupBoxesByRC_MergesCloseCols verifies that C compression works
// within each R group — merging different C values that are close in X.
func TestGroupBoxesByRC_MergesCloseCols(t *testing.T) {
// R=0 has C=0,1. R=1 has C=0,1. C compression → 2 cols each.
boxes := []pdf.TextBox{
{X0: 10, X1: 90, Top: 0, Bottom: 30, Text: "A", R: 0, C: 0},
{X0: 110, X1: 190, Top: 0, Bottom: 30, Text: "B", R: 0, C: 1},
{X0: 10, X1: 90, Top: 35, Bottom: 65, Text: "C", R: 1, C: 0},
{X0: 110, X1: 190, Top: 35, Bottom: 65, Text: "D", R: 1, C: 1},
}
rows := GroupBoxesByRC(boxes)
if len(rows) != 2 {
t.Fatalf("expected 2 rows (R diff), got %d", len(rows))
}
if len(rows[0]) != 2 || len(rows[1]) != 2 {
t.Errorf("expected 2 cols/row, got %d/%d", len(rows[0]), len(rows[1]))
}
if rows[0][0].Text != "A" || rows[0][1].Text != "B" {
t.Errorf("row0 wrong: %q %q", rows[0][0].Text, rows[0][1].Text)
}
if rows[1][0].Text != "C" || rows[1][1].Text != "D" {
t.Errorf("row1 wrong: %q %q", rows[1][0].Text, rows[1][1].Text)
}
}
// TestGroupBoxesByRC_RDiffSplitsRow checks that boxes sharing a Y band but
// carrying different R annotations land in different rows: four boxes with
// R=0..3 must give four rows, with "A" leading row 0 and "B" leading row 1.
func TestGroupBoxesByRC_RDiffSplitsRow(t *testing.T) {
	input := []pdf.TextBox{
		// Same Y band, R=0 and R=1.
		{X0: 10, X1: 90, Top: 0, Bottom: 30, Text: "A", R: 0, C: 0},
		{X0: 110, X1: 190, Top: 0, Bottom: 30, Text: "B", R: 1, C: 1},
		// Same Y band, R=2 and R=3.
		{X0: 10, X1: 90, Top: 35, Bottom: 65, Text: "C", R: 2, C: 0},
		{X0: 110, X1: 190, Top: 35, Bottom: 65, Text: "D", R: 3, C: 1},
	}

	grid := GroupBoxesByRC(input)
	if n := len(grid); n != 4 {
		t.Fatalf("expected 4 rows (R differs → split), got %d", n)
	}

	lead0, lead1 := grid[0][0].Text, grid[1][0].Text
	if !(lead0 == "A" && lead1 == "B") {
		t.Errorf("row0/1 wrong: A=%q B=%q", lead0, lead1)
	}
}
// TestFillCellTextFromBoxes_RCOnly verifies that box text goes to exactly
// one cell via R/C annotations, not multiple cells via spatial overlap.
// A box overlapping two cells should only fill the one matching its R/C.
func TestFillCellTextFromBoxes_RCOnly(t *testing.T) {
	// Two cells on the same Y band whose X ranges overlap between 90 and 100.
	cells := []pdf.TSRCell{
		{X0: 0, Y0: 0, X1: 100, Y1: 50, Label: "table"},
		{X0: 90, Y0: 0, X1: 200, Y1: 50, Label: "table"},
	}
	// This box straddles cell 0 (X=0-100) and cell 1 (X=90-200), so a purely
	// spatial match would hit both. Its annotations name R=0, C=0 only.
	boxes := []pdf.TextBox{
		{X0: 80, X1: 120, Top: 0, Bottom: 50, Text: "TEXT", LayoutType: "table", R: 0, C: 0},
	}

	rows := GroupTSRCellsToRows(cells)
	// The assertions below index rows[0][0] and rows[0][1]; report a wrong
	// grid shape as a test failure instead of an index-out-of-range panic.
	if len(rows) == 0 {
		t.Fatal("GroupTSRCellsToRows returned no rows")
	}
	if len(rows[0]) < 2 {
		t.Fatalf("row 0 has %d cells, want at least 2", len(rows[0]))
	}

	// R/C fill: each non-blank box writes to the single cell it is annotated with.
	for _, b := range boxes {
		// Named text (not t) so the *testing.T parameter is not shadowed.
		text := strings.TrimSpace(b.Text)
		if text == "" {
			continue
		}
		if b.R >= 0 && b.R < len(rows) && b.C >= 0 && b.C < len(rows[b.R]) {
			rows[b.R][b.C].Text = text
		}
	}

	// Cell 0 should have text, cell 1 should NOT.
	if rows[0][0].Text != "TEXT" {
		t.Errorf("cell[0][0] = %q, want %q", rows[0][0].Text, "TEXT")
	}
	if rows[0][1].Text != "" {
		t.Errorf("cell[0][1] = %q, should be empty (spatial overlap leak)", rows[0][1].Text)
	}
}
// TestRowsToHTML_HeaderRows verifies that header rows use <th> instead of <td>.
func TestRowsToHTML_HeaderRows(t *testing.T) {
cells := []pdf.TSRCell{
{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "Name", Label: "table column header"},
{X0: 101, Y0: 0, X1: 200, Y1: 30, Text: "Age", Label: "table column header"},
{X0: 0, Y0: 35, X1: 100, Y1: 65, Text: "John", Label: "table row"},
{X0: 101, Y0: 35, X1: 200, Y1: 65, Text: "30", Label: "table row"},
}
// constructTable should produce | for header row.
item := &pdf.TableItem{}
html := ConstructTable(cells, nil, "", item)
// Header row should use | , data row | .
if !strings.Contains(html, " | ") {
t.Errorf("expected | for header row. HTML: %s", html)
}
if strings.Count(html, " | cells, got %d. HTML: %s", strings.Count(html, " | cells (data row), got %d", strings.Count(html, " | 30% each — spatial fills ALL).
// With R/C, it belongs only to cell[1] (R=0, C=1).
cells := []pdf.TSRCell{
{X0: 0, Y0: 0, X1: 100, Y1: 30, Label: "table"},
{X0: 90, Y0: 0, X1: 200, Y1: 30, Label: "table"},
{X0: 180, Y0: 0, X1: 300, Y1: 30, Label: "table"},
}
boxes := []pdf.TextBox{
{X0: 30, X1: 270, Top: 0, Bottom: 30, Text: "TEXT", LayoutType: "table", R: 0, C: 1},
}
// Spatial fill: fills ALL overlapping cells —> duplication.
cellsCopy := make([]pdf.TSRCell, 3)
copy(cellsCopy, cells)
FillCellTextFromBoxes(cellsCopy, boxes)
spatialCount := 0
for _, c := range cellsCopy {
if c.Text != "" {
spatialCount++
}
}
if spatialCount <= 1 {
t.Errorf("spatial fill: expected >1 cells with text, got %d", spatialCount)
}
t.Logf("spatial fill: %d cells (WRONG — duplication)", spatialCount)
// R/C fill: only cell matching box.R/C gets text.
cellsRC := make([]pdf.TSRCell, 3)
copy(cellsRC, cells)
rows := GroupTSRCellsToRows(cellsRC)
for _, b := range boxes {
if b.R >= 0 && b.R < len(rows) && b.C >= 0 && b.C < len(rows[b.R]) {
rows[b.R][b.C].Text = strings.TrimSpace(b.Text)
}
}
rcCount := 0
for _, row := range rows {
for _, c := range row {
if c.Text == "TEXT" {
rcCount++
}
}
}
if rcCount != 1 {
t.Errorf("R/C fill: expected 1 cell with 'TEXT', got %d", rcCount)
}
}
// TestIsCaptionBox runs IsCaptionBox (with an empty second argument) over a
// table of sample strings and checks the caption verdict for each.
func TestIsCaptionBox(t *testing.T) {
	type probe struct {
		input   string
		caption bool
	}
	cases := []probe{
		{input: "表1:交通工具等级", caption: true},
		{input: "Table 1: Transport Levels", caption: true},
		{input: "图表 1: 测试", caption: true},
		{input: "公司领导班子成员、出差地", caption: false}, // plain text, not a caption
		{input: "第十条到厂矿单位出差", caption: false},   // ordinary paragraph
		{input: "", caption: false},
	}

	for i := range cases {
		c := cases[i]
		got := IsCaptionBox(c.input, "")
		if got == c.caption {
			continue
		}
		t.Errorf("IsCaptionBox(%q) = %v, want %v", c.input, got, c.caption)
	}
}
// TestFillCellTextFromBoxes_SkipsCaption checks that FillCellTextFromBoxes
// leaves a cell empty when the only box over it reads like a caption, while
// still copying ordinary text into the cell below.
func TestFillCellTextFromBoxes_SkipsCaption(t *testing.T) {
	grid := []pdf.TSRCell{
		{X0: 0, Y0: 0, X1: 200, Y1: 30, Label: "table"},
		{X0: 0, Y0: 35, X1: 200, Y1: 65, Label: "table"},
	}

	FillCellTextFromBoxes(grid, []pdf.TextBox{
		// Caption-style text sitting exactly over the first cell.
		{X0: 0, X1: 200, Top: 0, Bottom: 30, Text: "表1:交通工具等级"},
		// Regular data sitting exactly over the second cell.
		{X0: 0, X1: 200, Top: 35, Bottom: 65, Text: "数据行"},
	})

	captionCell, dataCell := grid[0].Text, grid[1].Text
	if captionCell != "" {
		t.Errorf("caption leaked into cell 0: %q", captionCell)
	}
	if dataCell != "数据行" {
		t.Errorf("data not in cell 1: %q", dataCell)
	}
}
// TestFillCellText_RCPreventsCrossCellLeak checks that a text box lying
// inside the first of two stacked cells never ends up in the second cell,
// neither with the spatial fill (FillCellTextFromBoxes) nor with the
// R/C-indexed fill performed inline below.
func TestFillCellText_RCPreventsCrossCellLeak(t *testing.T) {
	// Two full-width cells stacked vertically: Y=0-30 and Y=35-65.
	cells := []pdf.TSRCell{
		{X0: 0, Y0: 0, X1: 300, Y1: 30, Label: "table"},
		{X0: 0, Y0: 35, X1: 300, Y1: 65, Label: "table"},
	}
	// The box covers Y=12-28, i.e. it sits inside cell[0] and does not reach
	// cell[1]; its annotations point at R=0, C=0.
	boxes := []pdf.TextBox{
		{X0: 10, X1: 200, Top: 12, Bottom: 28, Text: "公司领导班子成员、出差地", R: 0, C: 0},
	}

	// Spatial fill: cell[1] must stay empty.
	cellsSp := make([]pdf.TSRCell, len(cells))
	copy(cellsSp, cells)
	FillCellTextFromBoxes(cellsSp, boxes)
	if cellsSp[1].Text != "" {
		t.Errorf("spatial fill: caption leaked to cell[1]: %q", cellsSp[1].Text)
	}

	// R/C fill: only the cell addressed by the box's R/C may receive text,
	// and an already-filled cell is not overwritten.
	cellsRC := make([]pdf.TSRCell, len(cells))
	copy(cellsRC, cells)
	rows := GroupTSRCellsToRows(cellsRC)
	for _, b := range boxes {
		if b.R >= 0 && b.R < len(rows) && b.C >= 0 && b.C < len(rows[b.R]) {
			if rows[b.R][b.C].Text == "" {
				rows[b.R][b.C].Text = strings.TrimSpace(b.Text)
			}
		}
	}
	if cellsRC[1].Text != "" {
		t.Errorf("R/C fill: caption leaked to cell[1]: %q", cellsRC[1].Text)
	}

	// The loop above writes into rows, so the outcome must also be checked
	// there: exactly one grid cell may hold text after the R/C fill.
	// NOTE(review): whether rows aliases cellsRC depends on
	// GroupTSRCellsToRows, which is not visible here.
	rcCount := 0
	for _, row := range rows {
		for _, c := range row {
			if c.Text != "" {
				rcCount++
			}
		}
	}
	if rcCount != 1 {
		t.Errorf("R/C fill: expected 1 cell with text, got %d", rcCount)
	}
}
// TestGroupBoxesByRC_FallbackToYXWhenNoAnnotations passes four boxes that
// all carry R=-1 and C=-1 and expects GroupBoxesByRC to still return two
// rows, one per Y band, rather than an empty grid.
func TestGroupBoxesByRC_FallbackToYXWhenNoAnnotations(t *testing.T) {
	unannotated := []pdf.TextBox{
		// Y band 0-30.
		{X0: 10, X1: 90, Top: 0, Bottom: 30, Text: "A", R: -1, C: -1},
		{X0: 110, X1: 190, Top: 0, Bottom: 30, Text: "B", R: -1, C: -1},
		// Y band 35-65.
		{X0: 10, X1: 90, Top: 35, Bottom: 65, Text: "C", R: -1, C: -1},
		{X0: 110, X1: 190, Top: 35, Bottom: 65, Text: "D", R: -1, C: -1},
	}

	switch n := len(GroupBoxesByRC(unannotated)); n {
	case 2:
		// Expected: the boxes were split by Y position.
	case 0:
		// An empty result means no coordinate-based fallback happened at all.
		t.Fatal("groupBoxesByRC returned 0 rows when R=-1 — no YX fallback")
	default:
		t.Errorf("expected 2 rows (Y-split), got %d", n)
	}
}
func TestRowsToHTML_Colspan(t *testing.T) {
// Box spanning 2 columns: SP annotation with HLeft/HRight covering cols 0-1.
boxes := []pdf.TextBox{
{X0: 10, X1: 90, Top: 0, Bottom: 30, Text: "Name", R: 0, C: 0, H: 1, HLeft: 10, HRight: 190},
{X0: 110, X1: 190, Top: 0, Bottom: 30, Text: "", R: 0, C: 1, SP: 1},
{X0: 10, X1: 90, Top: 35, Bottom: 65, Text: "John", R: 1, C: 0},
{X0: 110, X1: 190, Top: 35, Bottom: 65, Text: "30", R: 1, C: 1},
}
rows := GroupBoxesByRC(boxes)
spans, covered := CalSpans(rows)
html := RowsToHTML(rows, "", nil, spans, covered)
if !strings.Contains(html, "colspan") {
t.Errorf("expected colspan attribute, got: %s", html)
}
t.Logf("HTML: %s", html)
}
// TestStripCaptionFromCells_ClearsCaptionPattern checks that
// StripCaptionFromCells blanks a cell whose text looks like a table caption
// while leaving an ordinary data cell untouched.
func TestStripCaptionFromCells_ClearsCaptionPattern(t *testing.T) {
	grid := []pdf.TSRCell{
		// Index 0: caption-style text.
		{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "表1:差旅费标准"},
		{X0: 100, Y0: 0, X1: 200, Y1: 50, Text: ""},
		// Index 2: regular data.
		{X0: 0, Y0: 60, X1: 100, Y1: 110, Text: "张三"},
		{X0: 100, Y0: 60, X1: 200, Y1: 110, Text: "100"},
	}

	StripCaptionFromCells(grid)

	if got := grid[0].Text; got != "" {
		t.Errorf("caption cell should be cleared, got %q", got)
	}
	if got := grid[2].Text; got != "张三" {
		t.Errorf("data cell should be preserved, got %q", got)
	}
}
// TestStripCaptionFromCells_PreservesData checks that StripCaptionFromCells
// leaves every cell text unchanged when none of the cells holds
// caption-like text.
func TestStripCaptionFromCells_PreservesData(t *testing.T) {
	grid := []pdf.TSRCell{
		{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "姓名"},
		{X0: 100, Y0: 0, X1: 200, Y1: 50, Text: "年龄"},
		{X0: 0, Y0: 60, X1: 100, Y1: 110, Text: "张三"},
		{X0: 100, Y0: 60, X1: 200, Y1: 110, Text: "25"},
	}

	// Snapshot the texts before the call so changes can be detected afterwards.
	before := make([]string, 0, len(grid))
	for i := range grid {
		before = append(before, grid[i].Text)
	}

	StripCaptionFromCells(grid)

	for i, want := range before {
		if got := grid[i].Text; got != want {
			t.Errorf("cell[%d] changed: %q -> %q", i, want, got)
		}
	}
}
// TestStripCaptionFromCells_Empty is a no-op on empty cells.
func TestStripCaptionFromCells_Empty(t *testing.T) {
cells := []pdf.TSRCell{}
StripCaptionFromCells(cells) // must not panic
}
// TestConstructTable_StripsCaptionFromCells verifies that constructTable
// strips caption text from cells before building HTML.
func TestConstructTable_StripsCaptionFromCells(t *testing.T) {
// Cell[0] has caption text "表1:标题"; cell[1] has real data.
cells := []pdf.TSRCell{
{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "表1:标题"},
{X0: 100, Y0: 0, X1: 200, Y1: 50, Text: "数据"},
}
html := ConstructTable(cells, nil, "", nil)
// "表1:标题" should NOT appear in the HTML (stripped as caption).
if strings.Contains(html, "表1") {
t.Errorf("caption text '表1:标题' should be stripped: %s", html)
}
// "数据" should still be there.
if !strings.Contains(html, "数据") {
t.Errorf("data text '数据' should be preserved: %s", html)
}
t.Logf("HTML: %s", html)
}
// TestCalSpans_NonSpanningCellsNotPolluted is a regression guard for
// CalSpans. Row 0 holds a regular cell (X=0-100), a spanning cell that
// reaches back to X=0 (X=0-200) and another regular cell (X=101-200).
// The test expects:
//   - no span entry for the regular cell at [0,0],
//   - a column span of 2 for the spanning cell at [0,1],
//   - the cell at [0,2] to be reported as covered.
func TestCalSpans_NonSpanningCellsNotPolluted(t *testing.T) {
	header := []pdf.TSRCell{
		{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "Q1", Label: "table row"},
		{X0: 0, Y0: 0, X1: 200, Y1: 30, Text: "部门开支汇总", Label: "table spanning cell"},
		{X0: 101, Y0: 0, X1: 200, Y1: 30, Text: "Q2", Label: "table row"},
	}
	data := []pdf.TSRCell{
		{X0: 0, Y0: 35, X1: 100, Y1: 65, Text: "100", Label: "table row"},
		{X0: 101, Y0: 35, X1: 200, Y1: 65, Text: "200", Label: "table row"},
	}
	spans, covered := CalSpans([][]pdf.TSRCell{header, data})

	var (
		q1Key   = [2]int{0, 0}
		spanKey = [2]int{0, 1}
		q2Key   = [2]int{0, 2}
	)

	// The regular cell only covers its own column, so it must have no entry.
	if extra, found := spans[q1Key]; found {
		t.Errorf("Q1 at [0,0] should NOT have colspan, got %v. Spanning cell at [0,1] polluted column boundaries", extra)
	}

	// The wide cell covers X=0-200 and must be reported with a column span of 2.
	span, found := spans[spanKey]
	switch {
	case !found:
		t.Error("部门开支汇总 at [0,1] should have colspan=2 (covers X=0-200)")
	case span[0] != 2:
		t.Errorf("部门开支汇总 colspan = %d, want 2", span[0])
	}

	// The last cell of row 0 lies inside X=0-200 and must be marked covered.
	if isCovered := covered[q2Key]; !isCovered {
		t.Error("Q2 at [0,2] should be covered by spanning cell at [0,1]")
	}

	t.Logf("spans: %v, covered: %v", spans, covered)
}
// ── coordinate space conversion helpers ─────────────────────────────────
func TestRowsToHTML(t *testing.T) {
// rowsToHTML takes [][]pdf.TSRCell instead of [][]string (tableToHTML removed).
toCells := func(rows [][]string) [][]pdf.TSRCell {
out := make([][]pdf.TSRCell, len(rows))
for ri, row := range rows {
out[ri] = make([]pdf.TSRCell, len(row))
for ci, s := range row {
out[ri][ci] = pdf.TSRCell{Text: s}
}
}
return out
}
t.Run("simple 2x2 table", func(t *testing.T) {
rows := toCells([][]string{
{"姓名", "年龄"},
{"张三", "25"},
})
html := RowsToHTML(rows, "", nil, nil, nil)
expected := ""
if html != expected {
t.Errorf("got %q\nwant %q", html, expected)
}
})
t.Run("empty table", func(t *testing.T) {
html := RowsToHTML(nil, "", nil, nil, nil)
if html != "" {
t.Errorf("expected '', got %q", html)
}
})
t.Run("single cell", func(t *testing.T) {
rows := toCells([][]string{{"X"}})
html := RowsToHTML(rows, "", nil, nil, nil)
expected := ""
if html != expected {
t.Errorf("got %q\nwant %q", html, expected)
}
})
t.Run("matches Python format for 公司差旅费", func(t *testing.T) {
rows := toCells([][]string{
{"标职务", "飞机", "火车", "轮船", "其他交通工具(不含的士)"},
{"公司级领导人员", "经济舱位", "火车软席", "二等舱位", "按实报销"},
{"其他工作人员", "经济舱位", "火车硬席", "三等舱位", "按实报销"},
})
html := RowsToHTML(rows, "", nil, nil, nil)
if !strings.HasPrefix(html, "") || !strings.HasSuffix(html, " ") {
t.Errorf("not valid HTML: %s", html)
}
if !strings.Contains(html, " | 标职务 | ") {
t.Errorf("missing cell '标职务': %s", html)
}
if strings.Count(html, "