Files
ragflow/internal/deepdoc/parser/pdf/table_cells.go
Jack 304d9e02bb Refactor: migrate pdf_parser.py to golang (#16323)
### What problem does this PR solve?

HTTP API based on an ONNX model.
Migrate pdf_parser.py to Go.

### Type of change

- [x] Refactoring
2026-06-25 20:16:16 +08:00

306 lines
9.1 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package parser
import (
"log/slog"
"math"
"regexp"
"sort"
"strings"
)
// ── TSR cell grouping ──────────────────────────────────────────────────
// groupTSRCellsToRows clusters TSR cells into visual rows ordered top to
// bottom, with the cells of every row ordered left to right.
//
// The row tolerance is half of the median cell height; a fallback height of
// 10 is used when the median is not positive. Cells are visited in ascending
// Y0 order, and a cell whose Y0 lies more than the tolerance below the Y0 of
// the first cell of the current row starts a new row.
//
// The cells slice itself is sorted in place (by Y0, then X0). The returned
// rows hold copies of the cells. Empty input yields nil.
func groupTSRCellsToRows(cells []TSRCell) [][]TSRCell {
	switch len(cells) {
	case 0:
		return nil
	case 1:
		return [][]TSRCell{{cells[0]}}
	}

	heights := make([]float64, len(cells))
	for i := range cells {
		heights[i] = cells[i].Y1 - cells[i].Y0
	}
	sort.Float64s(heights)
	medianH := heights[len(heights)/2]
	if medianH <= 0 {
		medianH = 10
	}
	rowThreshold := medianH * 0.5

	// Order strictly by Y0, using X0 only to break exact ties. A comparator
	// that treats "Y0 within the threshold" as equal is not transitive, which
	// leaves the sort result unspecified and lets the grouping loop below see
	// Y0 values out of order.
	sort.SliceStable(cells, func(i, j int) bool {
		if cells[i].Y0 != cells[j].Y0 {
			return cells[i].Y0 < cells[j].Y0
		}
		return cells[i].X0 < cells[j].X0
	})

	var rows [][]TSRCell
	curRow := []TSRCell{cells[0]}
	curY := cells[0].Y0 // Y0 of the first (topmost) cell of the current row
	for _, c := range cells[1:] {
		if c.Y0-curY > rowThreshold {
			rows = append(rows, curRow)
			curRow = []TSRCell{c}
			curY = c.Y0
			continue
		}
		curRow = append(curRow, c)
	}
	rows = append(rows, curRow)

	// Within a row the reading order is left to right; the stable sort keeps
	// the top-to-bottom order for cells that share the same X0.
	for _, row := range rows {
		sort.SliceStable(row, func(i, j int) bool { return row[i].X0 < row[j].X0 })
	}
	return rows
}
// ── cell text filling ──────────────────────────────────────────────────
// fillCellTextFromBoxes assigns text from PDF text boxes to TSR cells, in
// place.
//
// Boxes recognised as captions by isCaptionBox are ignored. For every cell,
// each remaining box accepted by boxMatchesCell contributes its trimmed text;
// the non-empty contributions are joined with a single space and replace the
// cell's Text. The lenient match rule is used when the cell has no text yet,
// the strict rule otherwise. Cells without any non-empty contribution are
// left untouched.
//
// Debug-level log records report the input sizes, the geometry of the first
// cell and box, and the number of matches and filled cells.
func fillCellTextFromBoxes(cells []TSRCell, boxes []TextBox) {
	slog.Debug("fillCellTextFromBoxes", "cells", len(cells), "boxes", len(boxes))
	if len(cells) > 0 && len(boxes) > 0 {
		c0 := cells[0]
		slog.Debug("fillCellTextFromBoxes cell[0]", "x0", c0.X0, "y0", c0.Y0, "x1", c0.X1, "y1", c0.Y1)
		b0 := boxes[0]
		slog.Debug("fillCellTextFromBoxes box[0]", "x0", b0.X0, "y0", b0.Top, "x1", b0.X1, "y1", b0.Bottom, "text_len", len(b0.Text))
	}

	// Caption detection and trimming depend only on the box, so run them once
	// per box instead of once per (cell, box) pair.
	type candidate struct {
		idx  int    // index into boxes
		text string // trimmed box text, possibly empty
	}
	candidates := make([]candidate, 0, len(boxes))
	for i := range boxes {
		if isCaptionBox(boxes[i].Text, boxes[i].LayoutType) {
			continue
		}
		candidates = append(candidates, candidate{idx: i, text: strings.TrimSpace(boxes[i].Text)})
	}

	matched, filled := 0, 0
	for ci := range cells {
		// The cell's text is only rewritten after all boxes were examined, so
		// the emptiness check is constant for the inner loop.
		cellIsEmpty := cells[ci].Text == ""
		var matches []string
		for _, cand := range candidates {
			if !boxMatchesCell(cells[ci], boxes[cand.idx], cellIsEmpty) {
				continue
			}
			// Geometric matches are counted even when the box has no text.
			matched++
			if cand.text != "" {
				matches = append(matches, cand.text)
			}
		}
		if len(matches) > 0 {
			cells[ci].Text = strings.Join(matches, " ")
			filled++
		}
	}
	slog.Debug("fillCellTextFromBoxes done", "cell_box_matches", matched, "cells_filled", filled)
}
// boxMatchesCell reports whether the text of box should be assigned to cell.
//
// The decision uses the fraction of the box's area that overlaps the cell:
//   - cellIsEmpty == true:  the fraction must be at least 0.3 (the original
//     author notes this mirrors the default threshold of Python's
//     find_overlapped_with_threshold);
//   - cellIsEmpty == false: the fraction must be at least 0.85, i.e. the box
//     must lie mostly inside the cell.
//
// A box whose area is not positive never matches.
func boxMatchesCell(cell TSRCell, box TextBox, cellIsEmpty bool) bool {
	const (
		emptyCellRatio  = 0.3
		filledCellRatio = 0.85
	)

	overlap := OverlapInter(&cell, &box)
	area := Area(&box)
	if area <= 0 {
		return false
	}

	ratio := overlap / area
	if !cellIsEmpty {
		return ratio >= filledCellRatio
	}
	return ratio >= emptyCellRatio
}
// boxOverlapsCell is a backward-compatibility wrapper around boxMatchesCell
// that always applies the rule for cells that already contain text (the
// strict 85% threshold).
func boxOverlapsCell(cell TSRCell, box TextBox) bool {
	const cellIsEmpty = false
	return boxMatchesCell(cell, box, cellIsEmpty)
}
// reCaption matches caption-like text. The first alternative is anchored at
// the start of the text: a run of 图/表 followed by at least two characters
// out of space, digits and ':'. The remaining alternatives are unanchored and
// case-insensitive: "Fig"/"Figure"/"Table" followed by a number anywhere in
// the text.
// NOTE(review): because the English alternatives are unanchored, body text
// such as "see Table 3" also matches — confirm this is intended.
var reCaption = regexp.MustCompile(`^[图表]+[ 0-9:]{2,}|(?i)Fig\.?\s*\d+|(?i)Figure\s+\d+|(?i)Table\s+\d+`)

// isCaptionBox reports whether a text box is a table or figure caption, so
// that callers can keep captions out of table cells. A box is a caption when
// its layout type contains "caption" or when its whitespace-trimmed text
// matches reCaption.
func isCaptionBox(text string, layoutType string) bool {
	return strings.Contains(layoutType, "caption") ||
		reCaption.MatchString(strings.TrimSpace(text))
}
var (
	// reTableCaptionText recognises table-caption wording: text that starts
	// with 表, or that contains "Table" followed by whitespace and a number
	// (case-insensitive, anywhere in the text).
	reTableCaptionText = regexp.MustCompile(`^表|(?i)Table\s+\d+`)

	// reFigureCaptionText recognises figure-caption wording: text that starts
	// with 图, or that contains "Fig"/"Fig."/"Figure" followed by a number
	// (case-insensitive, anywhere in the text).
	reFigureCaptionText = regexp.MustCompile(`^图|(?i)Fig\.?\s*\d+|(?i)Figure\s+\d+`)
)
// captionKind classifies a section as a table caption (LayoutTypeTable), a
// figure caption (LayoutTypeFigure) or not a caption at all ("").
//
// Rules are evaluated in this order, first hit wins:
//  1. layout type equals DLALabelTableCaption → table;
//  2. layout type contains "caption" and the trimmed text matches
//     reTableCaptionText → table;
//  3. layout type equals DLALabelFigureCaption, or contains "caption" → figure;
//  4. trimmed text matches reTableCaptionText → table;
//  5. trimmed text matches reFigureCaptionText → figure;
//  6. trimmed text is a caption according to isCaptionBox → table.
//
// NOTE(review): with the patterns currently defined in this file every text
// accepted by rule 6 already satisfies rule 4 or 5 (e.g. "图表 1" starts with
// 图 and is therefore reported as a figure), so rule 6 looks unreachable —
// confirm whether "图表" captions were meant to be tables.
func captionKind(s Section) string {
	layout := s.LayoutType
	text := strings.TrimSpace(s.Text)
	captionLayout := strings.Contains(layout, "caption")

	switch {
	case layout == DLALabelTableCaption:
		return LayoutTypeTable
	case captionLayout && reTableCaptionText.MatchString(text):
		return LayoutTypeTable
	case layout == DLALabelFigureCaption, captionLayout:
		return LayoutTypeFigure
	// From here on the layout label gave no hint; rely on the wording alone.
	case reTableCaptionText.MatchString(text):
		return LayoutTypeTable
	case reFigureCaptionText.MatchString(text):
		return LayoutTypeFigure
	case isCaptionBox(text, ""):
		return LayoutTypeTable
	}
	return ""
}
// ── blockType: cell content classification (Python: TableStructureRecognizer.blockType) ──
// blockTypePatterns is the ordered rule table used to classify cell text.
// Each rule pairs a regular expression with the block-type code it yields;
// rules are meant to be tried top to bottom, so earlier rules take precedence
// (for example "2020A" is a date, not a categorical value). The expressions
// are compiled once when the package is initialised.
var blockTypePatterns = []struct {
	re   *regexp.Regexp
	kind string
}{
	// Dt — dates: full dates, year, year-month, month-day, quarters and
	// year-plus-letter codes.
	{re: regexp.MustCompile(`^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$`), kind: "Dt"},
	{re: regexp.MustCompile(`^(20|19)[0-9]{2}年$`), kind: "Dt"},
	{re: regexp.MustCompile(`^(20|19)[0-9]{2}[年-][0-9]{1,2}月*$`), kind: "Dt"},
	{re: regexp.MustCompile(`^[0-9]{1,2}[月-][0-9]{1,2}日*$`), kind: "Dt"},
	{re: regexp.MustCompile(`^第*[一二三四1-4]季度$`), kind: "Dt"},
	{re: regexp.MustCompile(`^(20|19)[0-9]{2}年*[一二三四1-4]季度$`), kind: "Dt"},
	{re: regexp.MustCompile(`^(20|19)[0-9]{2}[ABCDE]$`), kind: "Dt"},

	// Nu — numeric: digits with separators, signs, percent, slash, spaces.
	{re: regexp.MustCompile(`^[0-9.,+%/ -]+$`), kind: "Nu"},

	// Ca — categorical: digits, upper-case letters and a few symbols.
	{re: regexp.MustCompile(`^[0-9A-Z/\._~-]+$`), kind: "Ca"},

	// En — English words: optional capitals followed by lower-case text.
	{re: regexp.MustCompile(`^[A-Z]*[a-z' -]+$`), kind: "En"},

	// NE — named entity: a numeric prefix followed by mixed alphanumerics.
	{re: regexp.MustCompile(`^[0-9.,+-]+[0-9A-Za-z/$¥%<>()' -]+$`), kind: "NE"},

	// Sg — exactly one character.
	{re: regexp.MustCompile(`^.{1}$`), kind: "Sg"},
}
// blockType classifies cell text into a block-type code.
//
// The whitespace-trimmed text is first tested against blockTypePatterns in
// order; the first matching rule decides the result (Dt date, Nu numeric,
// Ca categorical, En English, NE named entity, Sg single character).
// Otherwise the estimate from simpleTokenCount is used: more than 3 and fewer
// than 12 tokens give "Tx" (short text), 12 or more give "Lx" (long text).
// Everything else is "Ot" (other).
//
// The "Nr" (person name) type of the Python implementation needs a
// part-of-speech tagger and is never produced here.
func blockType(text string) string {
	trimmed := strings.TrimSpace(text)
	for i := range blockTypePatterns {
		if rule := &blockTypePatterns[i]; rule.re.MatchString(trimmed) {
			return rule.kind
		}
	}

	switch n := simpleTokenCount(trimmed); {
	case n >= 12:
		return "Lx"
	case n > 3:
		return "Tx"
	default:
		return "Ot"
	}
}
// simpleTokenCount estimates the number of tokens in text without running a
// real tokenizer. The estimate is the sum of:
//   - the number of runes for which isCJK reports true, and
//   - the number of whitespace-separated words that contain no such rune.
//
// A word that mixes CJK and non-CJK characters therefore contributes only its
// CJK runes; its non-CJK part is not counted separately.
func simpleTokenCount(text string) int {
	count := 0
	// Each CJK rune stands for one token.
	for _, r := range text {
		if isCJK(r) {
			count++
		}
	}
	// Words without any CJK rune count as one token each. Words containing
	// CJK runes were already accounted for above.
	for _, w := range strings.Fields(text) {
		if !containsCJK(w) {
			count++
		}
	}
	return count
}
// containsCJK reports whether s holds at least one rune for which isCJK
// returns true. The empty string yields false.
func containsCJK(s string) bool {
	return strings.ContainsFunc(s, isCJK)
}
// headerSetWithBlockType returns the set of row indices that should be
// treated as header rows. The result is never nil; rows that are not headers
// are simply absent from the map.
//
// Detection works in two stages:
//
//  1. Block types. Every cell with non-blank text is classified with
//     blockType and the most frequent type across the table is determined
//     (on a tie, the type that appears first in row-major order wins). Only
//     when that dominant type is "Nu" (numeric) can block types mark headers:
//     a row is a header when more than half of its non-blank cells are not
//     "Nu".
//  2. Labels (fallback). If stage 1 found no header row, every row holding at
//     least one cell whose Label contains "header" or "Header" is a header.
func headerSetWithBlockType(rows [][]TSRCell) map[int]bool {
	// Classify each non-blank cell exactly once, collecting the per-row
	// figures needed later together with the table-wide type histogram.
	type rowStat struct {
		filled     int // cells with non-blank text
		nonNumeric int // of those, cells whose block type is not "Nu"
	}
	stats := make([]rowStat, len(rows))
	typeCounts := make(map[string]int)
	var firstSeen []string // block types in order of first appearance
	for ri, row := range rows {
		for ci := range row {
			text := strings.TrimSpace(row[ci].Text)
			if text == "" {
				continue
			}
			bt := blockType(text)
			if typeCounts[bt] == 0 {
				firstSeen = append(firstSeen, bt)
			}
			typeCounts[bt]++
			stats[ri].filled++
			if bt != "Nu" {
				stats[ri].nonNumeric++
			}
		}
	}

	// Choose the dominant type deterministically. Ranging over the map would
	// make ties depend on Go's randomised map iteration order and thus make
	// the detected headers vary from run to run.
	maxType, maxCount := "", 0
	for _, bt := range firstSeen {
		if c := typeCounts[bt]; c > maxCount {
			maxType, maxCount = bt, c
		}
	}

	hdrs := make(map[int]bool)
	// Block types only tell headers apart in predominantly numeric tables.
	if maxType == "Nu" {
		for ri, st := range stats {
			if st.filled > 0 && float64(st.nonNumeric)/float64(st.filled) > 0.5 {
				hdrs[ri] = true
			}
		}
	}

	// Fallback: if block types found no headers, look for a model-agnostic
	// "header" substring in the cell labels.
	if len(hdrs) == 0 {
		for ri, row := range rows {
			for ci := range row {
				label := row[ci].Label
				if strings.Contains(label, "header") || strings.Contains(label, "Header") {
					hdrs[ri] = true
					break
				}
			}
		}
	}
	return hdrs
}