Files
ragflow/internal/deepdoc/parser/pdf/table/deepdoc_table_builder.go

147 lines
4.0 KiB
Go
Raw Normal View History

package table
import (
"context"
"image"
"sort"
"strings"
pdf "ragflow/internal/deepdoc/parser/pdf/type"
)
// DeepDocTableBuilder is the pdf.TableBuilder implementation backed by the
// DeepDoc table structure recognition service. It does not inject labels
// itself; that is done by the NewTableBuilderFor factory.
type DeepDocTableBuilder struct {
	// doc is the analyzer whose TSR method DetectCells delegates to.
	doc pdf.DocAnalyzer
}
// NewDeepDocTableBuilder returns a DeepDocTableBuilder that wraps doc.
//
// The constructor does not configure labels: the caller is expected to set
// them on the underlying client beforehand (see NewTableBuilderFor in
// deepdoc.go).
func NewDeepDocTableBuilder(doc pdf.DocAnalyzer) *DeepDocTableBuilder {
	builder := new(DeepDocTableBuilder)
	builder.doc = doc
	return builder
}
// Name returns the fixed identifier of this builder, "deepdoc".
func (b *DeepDocTableBuilder) Name() string {
	const name = "deepdoc"
	return name
}
// DetectCells forwards the cropped table image to the wrapped analyzer's TSR
// method and returns its cells and error unchanged.
func (b *DeepDocTableBuilder) DetectCells(ctx context.Context, cropped image.Image) ([]pdf.TSRCell, error) {
	cells, err := b.doc.TSR(ctx, cropped)
	return cells, err
}
// GroupCells builds a row×column grid from structural cells.
//
// Input: structural cells whose Label ends in "table row", "table column" or
// "table column header", or contains "spanning" (case-insensitive). Cells
// with any other label are ignored. If several header cells are present the
// last one in cells wins.
//
// Returns nil when cells is empty or contains no row cells. Otherwise the
// result holds one slice per row, each with one entry per column.
//
// Algorithm:
//  1. Collect rows, columns, spans and the header; order rows with
//     SortYFirstly and columns with SortXFirstly.
//  2. If no column cells were detected, synthesize one wide column from the
//     bounding box of all rows.
//  3. Cross-product: grid[r][c] takes X0/X1 from cols[c] and Y0/Y1 from
//     rows[r].
//  4. Header propagation: rows contained in or overlapping the header's Y
//     range get Label = "table column header".
//  5. Span injection: for each spanning cell, find grid positions whose
//     row/column center falls inside the span bbox. The top-left position
//     gets the span label and bbox; the remaining ones are zeroed (covered).
//     Spans that cover fewer than two positions are ignored.
func (b *DeepDocTableBuilder) GroupCells(cells []pdf.TSRCell) [][]pdf.TSRCell {
	if len(cells) == 0 {
		return nil
	}

	// 1. Collect and sort structural elements.
	var rows, cols, spans []pdf.TSRCell
	var header *pdf.TSRCell
	for _, c := range cells {
		switch {
		case strings.HasSuffix(c.Label, "table row"):
			rows = append(rows, c)
		case strings.HasSuffix(c.Label, "table column"):
			cols = append(cols, c)
		case strings.Contains(strings.ToLower(c.Label), "spanning"):
			spans = append(spans, c)
		case strings.HasSuffix(c.Label, "table column header"):
			h := c
			header = &h
		}
	}
	if len(rows) == 0 {
		return nil
	}
	// NOTE(review): the second argument looks like a coordinate tolerance;
	// confirm against the SortYFirstly / SortXFirstly definitions.
	SortYFirstly(rows, 10)
	SortXFirstly(cols, 10)

	// 2. If no column cells, synthesize one wide column from row extents.
	// Use the bounding box of every row, not just the first one, so a
	// narrower leading row cannot clip the synthesized column.
	if len(cols) == 0 {
		wide := pdf.TSRCell{
			X0:    rows[0].X0,
			Y0:    rows[0].Y0,
			X1:    rows[0].X1,
			Y1:    rows[0].Y1,
			Label: "table column",
		}
		for _, r := range rows[1:] {
			if r.X0 < wide.X0 {
				wide.X0 = r.X0
			}
			if r.Y0 < wide.Y0 {
				wide.Y0 = r.Y0
			}
			if r.X1 > wide.X1 {
				wide.X1 = r.X1
			}
			if r.Y1 > wide.Y1 {
				wide.Y1 = r.Y1
			}
		}
		cols = []pdf.TSRCell{wide}
	}

	// 3. Cross-product to build grid.
	grid := make([][]pdf.TSRCell, len(rows))
	for r := range rows {
		grid[r] = make([]pdf.TSRCell, len(cols))
		for c := range cols {
			grid[r][c] = pdf.TSRCell{
				X0: cols[c].X0,
				Y0: rows[r].Y0,
				X1: cols[c].X1,
				Y1: rows[r].Y1,
			}
		}
	}

	// 4. Header propagation.
	if header != nil {
		for ri := range rows {
			// The explicit containment test also catches zero-height rows,
			// which the strict comparison in overlapsY would reject.
			contained := rows[ri].Y0 >= header.Y0 && rows[ri].Y1 <= header.Y1
			if contained || overlapsY(rows[ri], *header) {
				for cj := range grid[ri] {
					grid[ri][cj].Label = "table column header"
				}
			}
		}
	}

	// 5. Span injection.
	//
	// Hit-testing uses the immutable row/column geometry rather than the
	// grid cells, because earlier spans rewrite the grid: a zeroed cell has
	// its center at the origin and a span anchor has its center moved to the
	// span's center, either of which would skew the test for later spans.
	// blanked remembers positions already covered so they are not reclaimed.
	type cellIdx struct{ r, c int }
	blanked := make([][]bool, len(rows))
	for r := range blanked {
		blanked[r] = make([]bool, len(cols))
	}
	for _, sp := range spans {
		var covered []cellIdx
		for ri := range rows {
			cy := (rows[ri].Y0 + rows[ri].Y1) / 2
			if cy < sp.Y0 || cy > sp.Y1 {
				continue
			}
			for cj := range cols {
				if blanked[ri][cj] {
					continue
				}
				cx := (cols[cj].X0 + cols[cj].X1) / 2
				if cx >= sp.X0 && cx <= sp.X1 {
					covered = append(covered, cellIdx{ri, cj})
				}
			}
		}
		// A span over a single position adds no structure.
		if len(covered) < 2 {
			continue
		}
		// Row-major order puts the top-left position first.
		sort.Slice(covered, func(i, j int) bool {
			if covered[i].r != covered[j].r {
				return covered[i].r < covered[j].r
			}
			return covered[i].c < covered[j].c
		})

		first := covered[0]
		anchor := &grid[first.r][first.c]
		anchor.X0 = sp.X0
		anchor.Y0 = sp.Y0
		anchor.X1 = sp.X1
		anchor.Y1 = sp.Y1
		anchor.Label = sp.Label
		for _, idx := range covered[1:] {
			grid[idx.r][idx.c] = pdf.TSRCell{}
			blanked[idx.r][idx.c] = true
		}
	}
	return grid
}
// overlapsY reports whether the Y ranges of a and b intersect. Both
// comparisons are strict, so ranges that merely touch at an edge do not
// count as overlapping.
func overlapsY(a, b pdf.TSRCell) bool {
	startsBeforeEnd := a.Y0 < b.Y1
	endsAfterStart := a.Y1 > b.Y0
	return startsBeforeEnd && endsAfterStart
}