package parser import ( "math" "sort" ) // ── Post-TSR layout annotation (Python: pdf_parser.py gather/layouts_cleanup) ── // sortYFirstly sorts cells by top, with fuzzy threshold: if two cells are // within threshold Y pixels, sort by X instead (same-row ordering). // Python: Recognizer.sort_Y_firstly(arr, threshold) func sortYFirstly(cells []TSRCell, threshold float64) { sort.Slice(cells, func(i, j int) bool { diff := cells[i].Y0 - cells[j].Y0 if math.Abs(diff) < threshold { return cells[i].X0 < cells[j].X0 } return diff < 0 }) } // sortXFirstly sorts cells by x0, with fuzzy threshold for top. func sortXFirstly(cells []TSRCell, threshold float64) { sort.Slice(cells, func(i, j int) bool { diff := cells[i].X0 - cells[j].X0 if math.Abs(diff) < threshold { return cells[i].Y0 < cells[j].Y0 } return diff < 0 }) } // layoutCleanup removes duplicate/overlapping cells of the same type. // Python: Recognizer.layouts_cleanup(boxes, layouts, far=2, thr=0.7) // // For each cell, checks the next `far` cells; if they overlap significantly // AND have the same label type, the one with lower score (or less box overlap // area) is removed. func layoutCleanup(cells []TSRCell, boxes []TextBox, far int, thr float64) []TSRCell { // cells are assumed pre-sorted (caller sorts before passing) out := make([]TSRCell, len(cells)) copy(out, cells) i := 0 for i+1 < len(out) { j := i + 1 limit := i + far if limit > len(out) { limit = len(out) } for j < limit && (out[i].Label != "" && out[i].Label != out[j].Label || notOverlapped(out[i], out[j])) { j++ } if j >= limit { i++ continue } // Cells i and j overlap and have same type. Keep one. areaI := OverlapRatioA(&out[i], &out[j]) areaJ := OverlapRatioA(&out[j], &out[i]) if areaI < thr && areaJ < thr { i++ continue } // Prefer the one that overlaps more with text boxes. boxAreaI, boxAreaJ := 0.0, 0.0 for _, b := range boxes { if !tsrBoxOverlap(b, out[i]) { boxAreaI += OverlapInter(&b, &out[i]) } if !tsrBoxOverlap(b, out[j]) { boxAreaJ += OverlapInter(&b, &out[j]) } } if boxAreaI >= boxAreaJ { out = append(out[:j], out[j+1:]...) } else { out = append(out[:i], out[i+1:]...) } } return out } // notOverlapped returns true if cells a and b do NOT overlap. func notOverlapped(a, b TSRCell) bool { return a.X1 < b.X0 || a.X0 > b.X1 || a.Y1 < b.Y0 || a.Y0 > b.Y1 } // tsrBoxOverlap returns true if a TextBox and a TSRCell do NOT overlap. func tsrBoxOverlap(b TextBox, c TSRCell) bool { return b.X1 < c.X0 || b.X0 > c.X1 || b.Bottom < c.Y0 || b.Top > c.Y1 } // findOverlappedWithThreshold returns the index of the cell with the best // bidirectional overlap >= thr, or -1 if none. // Python: Recognizer.find_overlapped_with_threshold(box, boxes, thr=0.3) // Python uses max(boxRatio, cellRatio) for both gate and scoring. func findOverlappedWithThreshold(box TextBox, cells []TSRCell, thr float64) int { boxArea := Area(&box) if boxArea <= 0 { return -1 } bestIdx := -1 bestOverlap := thr // Python: max_overlap starts at thr for i, c := range cells { cellArea := Area(&c) if cellArea <= 0 { continue } ol := OverlapInter(&box, &c) if ol <= 0 { continue } boxRatio := ol / boxArea cellRatio := ol / cellArea // Python: max(cls.overlapped_area(box, layout), cls.overlapped_area(layout, box)) overlap := math.Max(boxRatio, cellRatio) if overlap >= bestOverlap { bestOverlap = overlap bestIdx = i } } return bestIdx } // findHorizontallyTightestFit returns the index of the column cell that // horizontally contains the box with minimal width difference. // Python: Recognizer.find_horizontally_tightest_fit(b, clmns) // findHorizontallyTightestFit returns the column index with minimum // edge distance to the box. Python: Recognizer.find_horizontally_tightest_fit. func findHorizontallyTightestFit(box TextBox, clmns []TSRCell) int { best := -1 bestDist := float64(1<<63 - 1) for i, c := range clmns { // Minimum edge distance between box and column boundaries. dl := math.Abs(box.X0 - c.X0) dr := math.Abs(box.X1 - c.X1) d := math.Min(dl, dr) if d < bestDist { bestDist = d best = i } } return best } // annotateTableBoxes tags table boxes with row/header/column indices using // TSR cell labels. Matching Python's R/H/C/SP annotation logic. // // Python: pdf_parser.py:518-554 func annotateTableBoxes(boxes []TextBox, grid [][]TSRCell) { // grid[0] is the header row. Spans are computed by calSpans later. var headers, spans []TSRCell var clmns []TSRCell if len(grid) > 0 { headers = grid[0] clmns = append(clmns, grid[0]...) } sortYFirstly(headers, 10) sortXFirstly(clmns, 10) for i := range boxes { if boxes[i].LayoutType != LayoutTypeTable { continue } // Grid-based R/C: match box to the row and column it overlaps. for ri, row := range grid { if idx := findOverlappedWithThreshold(boxes[i], row, 0.3); idx >= 0 { boxes[i].R = ri boxes[i].RTop = row[0].Y0 boxes[i].RBott = row[0].Y1 for ci, cell := range row { if !tsrBoxOverlap(boxes[i], cell) { boxes[i].C = ci boxes[i].CLeft = cell.X0 boxes[i].CRight = cell.X1 break } } break } } if idx := findOverlappedWithThreshold(boxes[i], headers, 0.3); idx >= 0 { boxes[i].HTop = headers[idx].Y0 boxes[i].HBott = headers[idx].Y1 boxes[i].HLeft = headers[idx].X0 boxes[i].HRight = headers[idx].X1 boxes[i].H = idx } if len(clmns) > 1 { if idx := findHorizontallyTightestFit(boxes[i], clmns); idx >= 0 { boxes[i].C = idx boxes[i].CLeft = clmns[idx].X0 boxes[i].CRight = clmns[idx].X1 } } if idx := findOverlappedWithThreshold(boxes[i], spans, 0.3); idx >= 0 { boxes[i].SP = idx } } // Two-pass C fallback: after all R values are assigned, compute C by X-order within each row. // This matches Python's behavior when TSR provides few "table column" cells. if len(clmns) <= 1 { // Collect all table boxes grouped by R. rBoxes := make(map[int][]int) for i := range boxes { if boxes[i].LayoutType == LayoutTypeTable { rBoxes[boxes[i].R] = append(rBoxes[boxes[i].R], i) } } for _, indices := range rBoxes { sort.Slice(indices, func(a, b int) bool { return boxes[indices[a]].X0 < boxes[indices[b]].X0 }) for ci, bi := range indices { boxes[bi].C = ci } } } }