package parser import ( "log/slog" "math" "regexp" "sort" "strings" ) // ── TSR cell grouping ────────────────────────────────────────────────── func groupTSRCellsToRows(cells []TSRCell) [][]TSRCell { if len(cells) == 0 { return nil } if len(cells) == 1 { return [][]TSRCell{{cells[0]}} } heights := make([]float64, len(cells)) for i, c := range cells { heights[i] = c.Y1 - c.Y0 } sort.Float64s(heights) medianH := heights[len(heights)/2] if medianH <= 0 { medianH = 10 } rowThreshold := medianH * 0.5 sort.Slice(cells, func(i, j int) bool { if math.Abs(cells[i].Y0-cells[j].Y0) < rowThreshold { return cells[i].X0 < cells[j].X0 } return cells[i].Y0 < cells[j].Y0 }) var rows [][]TSRCell var curRow []TSRCell curY := 0.0 for _, c := range cells { if len(curRow) == 0 { curRow = append(curRow, c) curY = c.Y0 continue } if c.Y0-curY > rowThreshold { rows = append(rows, curRow) curRow = []TSRCell{c} curY = c.Y0 } else { curRow = append(curRow, c) } } if len(curRow) > 0 { rows = append(rows, curRow) } for _, row := range rows { sort.Slice(row, func(i, j int) bool { return row[i].X0 < row[j].X0 }) } return rows } // ── cell text filling ────────────────────────────────────────────────── func fillCellTextFromBoxes(cells []TSRCell, boxes []TextBox) { slog.Debug("fillCellTextFromBoxes", "cells", len(cells), "boxes", len(boxes)) if len(cells) > 0 && len(boxes) > 0 { c0 := cells[0] slog.Debug("fillCellTextFromBoxes cell[0]", "x0", c0.X0, "y0", c0.Y0, "x1", c0.X1, "y1", c0.Y1) b0 := boxes[0] slog.Debug("fillCellTextFromBoxes box[0]", "x0", b0.X0, "y0", b0.Top, "x1", b0.X1, "y1", b0.Bottom, "text_len", len(b0.Text)) } matched, filled := 0, 0 for ci := range cells { var matches []string for _, b := range boxes { if isCaptionBox(b.Text, b.LayoutType) { continue } if boxMatchesCell(cells[ci], b, cells[ci].Text == "") { matched++ t := strings.TrimSpace(b.Text) if t != "" { matches = append(matches, t) } } } if len(matches) > 0 { cells[ci].Text = strings.Join(matches, " ") filled++ } } slog.Debug("fillCellTextFromBoxes done", "cell_box_matches", matched, "cells_filled", filled) } // boxMatchesCell reports whether a text box's text should be assigned // to a TSR cell. When the cell already has text (from TSR), the box // must be mostly inside the cell (≥85% of box area). When the cell // is empty, any overlap suffices — matching Python's _table_transformer_job // which fills cells from overlapping PDF boxes with thr=0.3. func boxMatchesCell(cell TSRCell, box TextBox, cellIsEmpty bool) bool { inter := OverlapInter(&cell, &box) boxArea := Area(&box) if boxArea <= 0 { return false } if cellIsEmpty { return inter/boxArea >= 0.3 // Python's find_overlapped_with_threshold default } return inter/boxArea >= 0.85 } // boxOverlapsCell is kept for backward compat — same as boxMatchesCell // with cellIsEmpty=false (strict 85% threshold). func boxOverlapsCell(cell TSRCell, box TextBox) bool { return boxMatchesCell(cell, box, false) } // isCaptionBox checks if a text box is a table/figure caption, // matching Python is_caption(). Captions should not enter table cells. var reCaption = regexp.MustCompile(`^[图表]+[ 0-9::]{2,}|(?i)Fig\.?\s*\d+|(?i)Figure\s+\d+|(?i)Table\s+\d+`) func isCaptionBox(text string, layoutType string) bool { if strings.Contains(layoutType, "caption") { return true } return reCaption.MatchString(strings.TrimSpace(text)) } // reTableCaptionText matches text patterns that indicate a table caption // (as opposed to a figure caption). Python is_caption uses the same set. var reTableCaptionText = regexp.MustCompile(`^表|(?i)Table\s+\d+`) // reFigureCaptionText matches text patterns that indicate a figure caption. var reFigureCaptionText = regexp.MustCompile(`^图|(?i)Fig\.?\s*\d+|(?i)Figure\s+\d+`) // captionKind returns "table" if the section is a table caption, // "figure" if a figure caption, or "" if not a caption. // Matches Python's is_caption check: text patterns OR layout_type containing "caption". func captionKind(s Section) string { lt := s.LayoutType if lt == DLALabelTableCaption || (strings.Contains(lt, "caption") && reTableCaptionText.MatchString(strings.TrimSpace(s.Text))) { return LayoutTypeTable } if lt == DLALabelFigureCaption || strings.Contains(lt, "caption") { return LayoutTypeFigure } // DLA may label captions as "text" or other types — check text patterns. t := strings.TrimSpace(s.Text) if reTableCaptionText.MatchString(t) { return LayoutTypeTable } if reFigureCaptionText.MatchString(t) { return LayoutTypeFigure } // "图表" pattern could be either — check if isCaptionBox matches. if isCaptionBox(t, "") { return LayoutTypeTable } return "" } // ── blockType: cell content classification (Python: TableStructureRecognizer.blockType) ── // Compiled once at package init. var blockTypePatterns = []struct { re *regexp.Regexp kind string }{ // Dt (date) patterns — Python blockType lines 161-168. {regexp.MustCompile(`^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$`), "Dt"}, {regexp.MustCompile(`^(20|19)[0-9]{2}年$`), "Dt"}, {regexp.MustCompile(`^(20|19)[0-9]{2}[年-][0-9]{1,2}月*$`), "Dt"}, {regexp.MustCompile(`^[0-9]{1,2}[月-][0-9]{1,2}日*$`), "Dt"}, {regexp.MustCompile(`^第*[一二三四1-4]季度$`), "Dt"}, {regexp.MustCompile(`^(20|19)[0-9]{2}年*[一二三四1-4]季度$`), "Dt"}, {regexp.MustCompile(`^(20|19)[0-9]{2}[ABCDE]$`), "Dt"}, // Nu (numeric) — Python blockType line 169. {regexp.MustCompile(`^[0-9.,+%/ -]+$`), "Nu"}, // Ca (categorical) — Python blockType line 170. {regexp.MustCompile(`^[0-9A-Z/\._~-]+$`), "Ca"}, // En (English) — Python blockType line 171. {regexp.MustCompile(`^[A-Z]*[a-z' -]+$`), "En"}, // NE (named entity — mixed alphanumeric) — Python blockType line 172. {regexp.MustCompile(`^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$`), "NE"}, // Sg (single character) — Python blockType line 173. {regexp.MustCompile(`^.{1}$`), "Sg"}, } // blockType classifies cell text into one of 9+1 types, matching Python's // TableStructureRecognizer.blockType. Types: Dt (date), Nu (numeric), // Ca (categorical), En (English), NE (named entity), Sg (single char), // Tx (short text), Lx (long text), Nr (person name), Ot (other). func blockType(text string) string { t := strings.TrimSpace(text) for _, p := range blockTypePatterns { if p.re.MatchString(t) { return p.kind } } // Token-based classification: >3 tokens, <12 → Tx, >=12 → Lx. // Uses simple token counting (whitespace split + individual CJK chars). tkn := simpleTokenCount(t) if tkn > 3 { if tkn < 12 { return "Tx" } return "Lx" } // Single token with POS tag "nr" → "Nr" (requires tokenizer — not available). // Default: "Ot" (other). return "Ot" } // simpleTokenCount estimates token count: splits on whitespace and counts // CJK characters individually (each CJK char ≈ one token in Chinese). func simpleTokenCount(text string) int { count := 0 for _, r := range text { if isCJK(r) { count++ } else if r == ' ' || r == '\t' { // whitespace tokenizes boundaries already counted via words } } // Also count space-separated words. words := strings.Fields(text) for _, w := range words { if !containsCJK(w) { count++ } } return count } func containsCJK(s string) bool { for _, r := range s { if isCJK(r) { return true } } return false } // headerSetWithBlockType returns rows that should be header rows, using both // TSR cell labels AND block-type classification. Matches Python's // construct_table header detection (table_structure_recognizer.py:370-384). func headerSetWithBlockType(rows [][]TSRCell) map[int]bool { // Compute dominant block type across all cells. typeCounts := make(map[string]int) for _, row := range rows { for _, cell := range row { t := strings.TrimSpace(cell.Text) if t != "" { typeCounts[blockType(t)]++ } } } maxType := "" maxCount := 0 for t, c := range typeCounts { if c > maxCount { maxType = t maxCount = c } } hdrs := make(map[int]bool) for ri, row := range rows { cnt, h := 0, 0 for _, cell := range row { t := strings.TrimSpace(cell.Text) if t == "" { continue } cnt++ bt := blockType(t) // Python: if max_type == "Nu" and cell btype == "Nu" → skip if maxType == "Nu" && bt == "Nu" { continue } // Python: max_type == "Nu" and cell btype != "Nu" → header if maxType == "Nu" && bt != "Nu" { h++ } } if cnt > 0 && float64(h)/float64(cnt) > 0.5 { hdrs[ri] = true } } // Fallback: if block-type found no headers, check for model-agnostic // "header" substring in cell labels (works across different TSR models). if len(hdrs) == 0 { for ri, row := range rows { for _, cell := range row { if strings.Contains(cell.Label, "header") || strings.Contains(cell.Label, "Header") { hdrs[ri] = true break } } } } return hdrs }