// Source: ragflow/internal/deepdoc/parser/pdf/table_cells.go (Go, 306 lines, 9.1 KiB)

package parser
import (
"log/slog"
"math"
"regexp"
"sort"
"strings"
)
// ── TSR cell grouping ──────────────────────────────────────────────────
// groupTSRCellsToRows partitions TSR cells into visual rows, returned top to
// bottom with the cells of each row ordered left to right.
//
// The row threshold is half the median cell height; a fallback height of 10
// is used when the median is non-positive or not finite. Cells are ordered by
// their top edge (Y0, ties broken by X0) and a new row is started whenever a
// cell's top edge lies more than the threshold below the top edge of the
// first (topmost) cell of the current row.
//
// The sort key is a strict total order on purpose: comparing Y0 "within a
// tolerance" inside the sort callback is not transitive, which breaks the
// ordering contract of the sort package and can leave cells of different rows
// interleaved before grouping.
//
// cells is reordered in place; the returned rows hold copies of the cells.
// Empty input yields nil.
func groupTSRCellsToRows(cells []TSRCell) [][]TSRCell {
	switch len(cells) {
	case 0:
		return nil
	case 1:
		return [][]TSRCell{{cells[0]}}
	}

	heights := make([]float64, len(cells))
	for i := range cells {
		heights[i] = cells[i].Y1 - cells[i].Y0
	}
	sort.Float64s(heights)
	medianH := heights[len(heights)/2]
	// A NaN or infinite median would make every threshold comparison below
	// meaningless (all cells would collapse into one row), so treat it like
	// a degenerate height.
	if medianH <= 0 || math.IsNaN(medianH) || math.IsInf(medianH, 0) {
		medianH = 10
	}
	rowThreshold := medianH * 0.5

	// Strict (Y0, X0) ordering; the row tolerance is applied while grouping.
	sort.Slice(cells, func(i, j int) bool {
		if cells[i].Y0 != cells[j].Y0 {
			return cells[i].Y0 < cells[j].Y0
		}
		return cells[i].X0 < cells[j].X0
	})

	var rows [][]TSRCell
	curRow := []TSRCell{cells[0]}
	curY := cells[0].Y0
	for _, c := range cells[1:] {
		if c.Y0-curY > rowThreshold {
			rows = append(rows, curRow)
			curRow = []TSRCell{c}
			curY = c.Y0
			continue
		}
		curRow = append(curRow, c)
	}
	rows = append(rows, curRow)

	// Within a row only the horizontal position matters. The stable sort keeps
	// cells that share an X0 in top-to-bottom order, so output is deterministic.
	for _, row := range rows {
		sort.SliceStable(row, func(i, j int) bool { return row[i].X0 < row[j].X0 })
	}
	return rows
}
// ── cell text filling ──────────────────────────────────────────────────
// fillCellTextFromBoxes assigns text to TSR cells from the text boxes that
// overlap them, modifying cells in place.
//
// For every cell, each non-caption box accepted by boxMatchesCell contributes
// its trimmed text; the non-empty contributions are joined with a single
// space and replace the cell's Text. Cells with no non-empty contribution are
// left untouched. Cells whose Text is currently empty are matched with the
// lenient rule of boxMatchesCell, all others with the strict one.
//
// Debug log lines report the input sizes, the geometry of the first cell and
// box, and the number of cell/box matches and cells filled.
func fillCellTextFromBoxes(cells []TSRCell, boxes []TextBox) {
	slog.Debug("fillCellTextFromBoxes", "cells", len(cells), "boxes", len(boxes))
	if len(cells) > 0 && len(boxes) > 0 {
		c0 := cells[0]
		slog.Debug("fillCellTextFromBoxes cell[0]", "x0", c0.X0, "y0", c0.Y0, "x1", c0.X1, "y1", c0.Y1)
		b0 := boxes[0]
		slog.Debug("fillCellTextFromBoxes box[0]", "x0", b0.X0, "y0", b0.Top, "x1", b0.X1, "y1", b0.Bottom, "text_len", len(b0.Text))
	}

	// Whether a box is a caption depends only on the box, so decide it once
	// per box instead of re-running the caption regex for every cell.
	var candidates []int
	if len(cells) > 0 {
		candidates = make([]int, 0, len(boxes))
		for bi := range boxes {
			if !isCaptionBox(boxes[bi].Text, boxes[bi].LayoutType) {
				candidates = append(candidates, bi)
			}
		}
	}

	matched, filled := 0, 0
	for ci := range cells {
		// The cell text is only overwritten after all boxes were examined,
		// so its emptiness is fixed for the whole inner loop.
		cellIsEmpty := cells[ci].Text == ""
		var matches []string
		for _, bi := range candidates {
			if !boxMatchesCell(cells[ci], boxes[bi], cellIsEmpty) {
				continue
			}
			matched++
			if t := strings.TrimSpace(boxes[bi].Text); t != "" {
				matches = append(matches, t)
			}
		}
		if len(matches) > 0 {
			cells[ci].Text = strings.Join(matches, " ")
			filled++
		}
	}
	slog.Debug("fillCellTextFromBoxes done", "cell_box_matches", matched, "cells_filled", filled)
}
// boxMatchesCell reports whether the text of box should be assigned to cell.
//
// The decision is based on the share of the box's area that intersects the
// cell (OverlapInter divided by Area of the box):
//   - cellIsEmpty == true:  at least 30% of the box must overlap the cell;
//   - cellIsEmpty == false: at least 85% of the box must overlap the cell.
//
// A box whose area is zero or negative never matches.
func boxMatchesCell(cell TSRCell, box TextBox, cellIsEmpty bool) bool {
	const (
		emptyCellMinCoverage  = 0.3  // lenient rule for cells without text
		filledCellMinCoverage = 0.85 // strict rule for cells that already have text
	)

	overlap := OverlapInter(&cell, &box)
	area := Area(&box)
	if area <= 0 {
		return false
	}

	coverage := overlap / area
	if !cellIsEmpty {
		return coverage >= filledCellMinCoverage
	}
	return coverage >= emptyCellMinCoverage
}
// boxOverlapsCell reports whether box lies mostly inside cell. It is retained
// for backward compatibility and simply applies boxMatchesCell's strict rule
// (the one used for cells that already have text, i.e. 85% of the box area).
func boxOverlapsCell(cell TSRCell, box TextBox) bool {
	const treatCellAsEmpty = false
	return boxMatchesCell(cell, box, treatCellAsEmpty)
}
// reCaption recognises caption-style text. It matches either
//   - a leading run of 图/表 followed by at least two characters drawn from
//     space, ASCII digits and ":", or
//   - case-insensitively and anywhere in the text, "Fig", "Fig.", "Figure" or
//     "Table" followed by a number.
//
// The \b guards require the English keywords to start a word, so ordinary
// content such as "Config 3", "Configure 12" or "Stable 5" is not mistaken
// for a caption.
var reCaption = regexp.MustCompile(`^[图表]+[ 0-9:]{2,}|(?i)\bFig\.?\s*\d+|(?i)\bFigure\s+\d+|(?i)\bTable\s+\d+`)

// isCaptionBox reports whether a text box is a table/figure caption, so that
// callers can keep captions out of table cells.
//
// A box is a caption when its layout type contains "caption", or when its
// text, with surrounding whitespace removed, matches reCaption.
func isCaptionBox(text string, layoutType string) bool {
	if strings.Contains(layoutType, "caption") {
		return true
	}
	return reCaption.MatchString(strings.TrimSpace(text))
}
var (
	// reTableCaptionText matches text that indicates a table caption (as
	// opposed to a figure caption): text starting with 表, or containing —
	// case-insensitively — the word "Table" followed by whitespace and a
	// number. The \b guard keeps words that merely end in "table"
	// ("Vegetable 2", "Timetable 10") from matching.
	reTableCaptionText = regexp.MustCompile(`^表|(?i)\bTable\s+\d+`)

	// reFigureCaptionText matches text that indicates a figure caption: text
	// starting with 图, or containing — case-insensitively — "Fig", "Fig." or
	// "Figure" followed by a number. The \b guards keep words that merely
	// contain "fig" ("Config 3", "Configure 4") from matching.
	reFigureCaptionText = regexp.MustCompile(`^图|(?i)\bFig\.?\s*\d+|(?i)\bFigure\s+\d+`)
)
// captionKind classifies a section as a table caption (LayoutTypeTable), a
// figure caption (LayoutTypeFigure), or not a caption at all ("").
//
// Decision order:
//  1. A DLALabelTableCaption layout is a table caption.
//  2. Any other layout containing "caption" is a table caption when the
//     trimmed text matches reTableCaptionText, otherwise a figure caption.
//  3. A DLALabelFigureCaption layout is a figure caption.
//  4. For every other layout the trimmed text decides: reTableCaptionText,
//     then reFigureCaptionText, and finally isCaptionBox, whose remaining
//     matches are reported as table captions.
func captionKind(s Section) string {
	layout := s.LayoutType
	if layout == DLALabelTableCaption {
		return LayoutTypeTable
	}

	text := strings.TrimSpace(s.Text)
	if strings.Contains(layout, "caption") {
		if reTableCaptionText.MatchString(text) {
			return LayoutTypeTable
		}
		return LayoutTypeFigure
	}
	if layout == DLALabelFigureCaption {
		return LayoutTypeFigure
	}

	// The layout gave no hint (captions may carry a generic layout label),
	// so fall back to the text patterns.
	switch {
	case reTableCaptionText.MatchString(text):
		return LayoutTypeTable
	case reFigureCaptionText.MatchString(text):
		return LayoutTypeFigure
	case isCaptionBox(text, ""):
		// Caption-like text that neither pattern above attributed.
		return LayoutTypeTable
	}
	return ""
}
// ── blockType: cell content classification (Python: TableStructureRecognizer.blockType) ──
// blockTypePatterns is the ordered pattern table consulted by blockType.
// Patterns are tried top to bottom and the first one matching the trimmed
// cell text decides the kind, so the order is significant: for example the
// date patterns must stay ahead of the numeric pattern, which would also
// accept text such as "2020-01-02". Every pattern is anchored with ^ and $,
// i.e. it has to match the whole text.
// Compiled once at package init.
var blockTypePatterns = []struct {
re *regexp.Regexp
kind string
}{
// Dt (date) patterns — Python blockType lines 161-168.
// Year 19xx/20xx, month and day, separated by 年/月, "/" or "-", with optional trailing 日.
{regexp.MustCompile(`^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$`), "Dt"},
// Year 19xx/20xx followed by 年.
{regexp.MustCompile(`^(20|19)[0-9]{2}年$`), "Dt"},
// Year and month, separated by 年 or "-", with optional trailing 月.
{regexp.MustCompile(`^(20|19)[0-9]{2}[年-][0-9]{1,2}月*$`), "Dt"},
// Month and day, separated by 月 or "-", with optional trailing 日.
{regexp.MustCompile(`^[0-9]{1,2}[月-][0-9]{1,2}日*$`), "Dt"},
// Quarter (一..四 or 1..4 followed by 季度), with optional leading 第.
{regexp.MustCompile(`^第*[一二三四1-4]季度$`), "Dt"},
// Year 19xx/20xx, optional 年, then a quarter.
{regexp.MustCompile(`^(20|19)[0-9]{2}年*[一二三四1-4]季度$`), "Dt"},
// Year 19xx/20xx followed by a single letter A-E.
{regexp.MustCompile(`^(20|19)[0-9]{2}[ABCDE]$`), "Dt"},
// Nu (numeric) — Python blockType line 169.
// Only digits, ".", ",", "+", "%", "/", space and "-".
{regexp.MustCompile(`^[0-9.,+%/ -]+$`), "Nu"},
// Ca (categorical) — Python blockType line 170.
// Only digits, uppercase ASCII letters, "/", ".", "_", "~" and "-".
{regexp.MustCompile(`^[0-9A-Z/\._~-]+$`), "Ca"},
// En (English) — Python blockType line 171.
// Optional uppercase prefix followed by lowercase letters, apostrophes, spaces or hyphens.
{regexp.MustCompile(`^[A-Z]*[a-z' -]+$`), "En"},
// NE (named entity — mixed alphanumeric) — Python blockType line 172.
// Starts with digits or ".,+-" and continues with alphanumerics and a few symbols.
{regexp.MustCompile(`^[0-9.,+-]+[0-9A-Za-z/$¥%<>()' -]+$`), "NE"},
// Sg (single character) — Python blockType line 173.
{regexp.MustCompile(`^.{1}$`), "Sg"},
}
// blockType classifies cell text into a short kind code.
//
// The text is trimmed and tested against blockTypePatterns in order; the
// first matching pattern determines the kind (Dt date, Nu numeric,
// Ca categorical, En English, NE named entity, Sg single character).
// Text matching no pattern is classified by its estimated token count
// (see simpleTokenCount):
//
//	more than 3 and fewer than 12 tokens → "Tx" (short text)
//	12 or more tokens                    → "Lx" (long text)
//	anything else                        → "Ot" (other)
//
// The "Nr" (person name) kind would need a part-of-speech tagger, which is
// not available here, so it is never returned.
func blockType(text string) string {
	trimmed := strings.TrimSpace(text)
	for i := range blockTypePatterns {
		if blockTypePatterns[i].re.MatchString(trimmed) {
			return blockTypePatterns[i].kind
		}
	}

	switch tokens := simpleTokenCount(trimmed); {
	case tokens >= 12:
		return "Lx"
	case tokens > 3:
		return "Tx"
	default:
		return "Ot"
	}
}
// simpleTokenCount estimates token count: splits on whitespace and counts
// CJK characters individually (each CJK char ≈ one token in Chinese).
func simpleTokenCount(text string) int {
count := 0
for _, r := range text {
if isCJK(r) {
count++
} else if r == ' ' || r == '\t' {
// whitespace tokenizes boundaries already counted via words
}
}
// Also count space-separated words.
words := strings.Fields(text)
for _, w := range words {
if !containsCJK(w) {
count++
}
}
return count
}
// containsCJK reports whether at least one rune of s satisfies isCJK.
// It returns false for the empty string.
func containsCJK(s string) bool {
	return strings.ContainsFunc(s, isCJK)
}
// headerSetWithBlockType returns the set of row indices (into rows) that
// should be treated as header rows.
//
// Block-type rule: every non-blank cell is classified with blockType and the
// most frequent kind across the table is determined. When that dominant kind
// is "Nu" (numeric), a row is a header if more than half of its non-blank
// cells are not numeric. When the dominant kind is anything else, this rule
// marks no row.
//
// Ties for the dominant kind are resolved in favour of the kind that appears
// first when reading the table row by row, left to right. Picking the maximum
// by ranging over the count map would depend on Go's unspecified map
// iteration order and make the result nondeterministic.
//
// Fallback: if the block-type rule marked no row, every row containing a cell
// whose Label contains "header" or "Header" is marked instead.
//
// The returned map is never nil; it only contains entries with value true.
func headerSetWithBlockType(rows [][]TSRCell) map[int]bool {
	typeCounts := make(map[string]int)
	var seenOrder []string // kinds in order of first appearance
	filled := make([]int, len(rows))
	numeric := make([]int, len(rows))

	// Classify each non-blank cell exactly once and keep the per-row tallies
	// needed for the header decision.
	for ri, row := range rows {
		for ci := range row {
			t := strings.TrimSpace(row[ci].Text)
			if t == "" {
				continue
			}
			bt := blockType(t)
			if typeCounts[bt] == 0 {
				seenOrder = append(seenOrder, bt)
			}
			typeCounts[bt]++
			filled[ri]++
			if bt == "Nu" {
				numeric[ri]++
			}
		}
	}

	maxType, maxCount := "", 0
	for _, bt := range seenOrder {
		if c := typeCounts[bt]; c > maxCount {
			maxType, maxCount = bt, c
		}
	}

	hdrs := make(map[int]bool)
	if maxType == "Nu" {
		for ri := range rows {
			nonNumeric := filled[ri] - numeric[ri]
			// nonNumeric/filled > 0.5, written without division; rows with
			// no non-blank cell can never satisfy it.
			if 2*nonNumeric > filled[ri] {
				hdrs[ri] = true
			}
		}
	}

	// Fallback: if block-type found no headers, check for a model-agnostic
	// "header" substring in cell labels (works across different TSR models).
	if len(hdrs) == 0 {
		for ri, row := range rows {
			for ci := range row {
				label := row[ci].Label
				if strings.Contains(label, "header") || strings.Contains(label, "Header") {
					hdrs[ri] = true
					break
				}
			}
		}
	}
	return hdrs
}