2026-06-29 18:46:41 +08:00
|
|
|
|
package table
|
2026-06-25 20:16:16 +08:00
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
|
"context"
|
|
|
|
|
|
"image"
|
|
|
|
|
|
"sort"
|
|
|
|
|
|
"strings"
|
|
|
|
|
|
|
2026-06-29 18:46:41 +08:00
|
|
|
|
pdf "ragflow/internal/deepdoc/parser/pdf/type"
|
|
|
|
|
|
)
|
2026-06-25 20:16:16 +08:00
|
|
|
|
|
2026-06-29 18:46:41 +08:00
|
|
|
|
// DeepDocTableBuilder implements pdf.TableBuilder for the DeepDoc
|
|
|
|
|
|
// table structure recognition service. Label injection is handled by the
|
|
|
|
|
|
// NewTableBuilderFor factory.
|
|
|
|
|
|
type DeepDocTableBuilder struct {
|
|
|
|
|
|
doc pdf.DocAnalyzer
|
2026-06-25 20:16:16 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2026-06-29 18:46:41 +08:00
|
|
|
|
// NewDeepDocTableBuilder creates a TableBuilder. Labels must be set on the
|
|
|
|
|
|
// underlying client by the caller (see deepdoc.go NewTableBuilderFor).
|
|
|
|
|
|
func NewDeepDocTableBuilder(doc pdf.DocAnalyzer) *DeepDocTableBuilder {
|
|
|
|
|
|
return &DeepDocTableBuilder{doc: doc}
|
2026-06-25 20:16:16 +08:00
|
|
|
|
}
|
2026-06-29 18:46:41 +08:00
|
|
|
|
// Name returns the fixed identifier of this builder, "deepdoc".
func (b *DeepDocTableBuilder) Name() string {
	const name = "deepdoc"
	return name
}
|
|
|
|
|
|
func (b *DeepDocTableBuilder) DetectCells(ctx context.Context, cropped image.Image) ([]pdf.TSRCell, error) {
|
2026-06-25 20:16:16 +08:00
|
|
|
|
return b.doc.TSR(ctx, cropped)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-06-29 18:46:41 +08:00
|
|
|
|
// GroupCells builds a row×column grid from structural cells.
|
2026-06-25 20:16:16 +08:00
|
|
|
|
//
|
|
|
|
|
|
// Input: structural cells with labels "table row", "table column",
|
|
|
|
|
|
// "table column header", "table spanning cell".
|
|
|
|
|
|
//
|
|
|
|
|
|
// Algorithm:
|
|
|
|
|
|
// 1. Extract row boundaries from "table row" cells, sort by Y.
|
|
|
|
|
|
// 2. Extract column boundaries from "table column" cells, sort by X.
|
|
|
|
|
|
// 3. Cross-product: grid[r][c].X0/Y0/X1/Y1 = col[c] × row[r].
|
|
|
|
|
|
// 4. Header propagation: rows overlapping the header cell's Y range
|
|
|
|
|
|
// get Label = "table column header".
|
|
|
|
|
|
// 5. Span injection: for each "table spanning cell", find grid cells
|
|
|
|
|
|
// whose center falls inside the span bbox. The top-left cell gets
|
|
|
|
|
|
// the span label + extended bbox; remaining cells are zeroed (covered).
|
2026-06-29 18:46:41 +08:00
|
|
|
|
func (b *DeepDocTableBuilder) GroupCells(cells []pdf.TSRCell) [][]pdf.TSRCell {
|
2026-06-25 20:16:16 +08:00
|
|
|
|
if len(cells) == 0 {
|
|
|
|
|
|
return nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 1. Collect and sort structural elements.
|
2026-06-29 18:46:41 +08:00
|
|
|
|
var rows, cols, spans []pdf.TSRCell
|
|
|
|
|
|
var header *pdf.TSRCell
|
2026-06-25 20:16:16 +08:00
|
|
|
|
|
|
|
|
|
|
for _, c := range cells {
|
|
|
|
|
|
switch {
|
|
|
|
|
|
case strings.HasSuffix(c.Label, "table row"):
|
|
|
|
|
|
rows = append(rows, c)
|
|
|
|
|
|
case strings.HasSuffix(c.Label, "table column"):
|
|
|
|
|
|
cols = append(cols, c)
|
|
|
|
|
|
case strings.Contains(strings.ToLower(c.Label), "spanning"):
|
|
|
|
|
|
spans = append(spans, c)
|
|
|
|
|
|
case strings.HasSuffix(c.Label, "table column header"):
|
|
|
|
|
|
h := c
|
|
|
|
|
|
header = &h
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if len(rows) == 0 {
|
|
|
|
|
|
return nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-06-29 18:46:41 +08:00
|
|
|
|
SortYFirstly(rows, 10)
|
|
|
|
|
|
SortXFirstly(cols, 10)
|
2026-06-25 20:16:16 +08:00
|
|
|
|
|
|
|
|
|
|
// 2. If no column cells, synthesize one wide column from row extents.
|
|
|
|
|
|
if len(cols) == 0 {
|
|
|
|
|
|
x0 := rows[0].X0
|
|
|
|
|
|
x1 := rows[0].X1
|
2026-06-29 18:46:41 +08:00
|
|
|
|
cols = []pdf.TSRCell{{X0: x0, Y0: rows[0].Y0, X1: x1, Y1: rows[len(rows)-1].Y1, Label: "table column"}}
|
2026-06-25 20:16:16 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 3. Cross-product to build grid.
|
2026-06-29 18:46:41 +08:00
|
|
|
|
grid := make([][]pdf.TSRCell, len(rows))
|
2026-06-25 20:16:16 +08:00
|
|
|
|
for r := range rows {
|
2026-06-29 18:46:41 +08:00
|
|
|
|
grid[r] = make([]pdf.TSRCell, len(cols))
|
2026-06-25 20:16:16 +08:00
|
|
|
|
for c := range cols {
|
2026-06-29 18:46:41 +08:00
|
|
|
|
grid[r][c] = pdf.TSRCell{
|
2026-06-25 20:16:16 +08:00
|
|
|
|
X0: cols[c].X0,
|
|
|
|
|
|
Y0: rows[r].Y0,
|
|
|
|
|
|
X1: cols[c].X1,
|
|
|
|
|
|
Y1: rows[r].Y1,
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 4. Header propagation.
|
|
|
|
|
|
if header != nil {
|
|
|
|
|
|
for ri := range rows {
|
|
|
|
|
|
if rows[ri].Y0 >= header.Y0 && rows[ri].Y1 <= header.Y1 ||
|
|
|
|
|
|
overlapsY(rows[ri], *header) {
|
|
|
|
|
|
for cj := range grid[ri] {
|
|
|
|
|
|
grid[ri][cj].Label = "table column header"
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 5. Span injection.
|
|
|
|
|
|
for _, sp := range spans {
|
|
|
|
|
|
type cellIdx struct{ r, c int }
|
|
|
|
|
|
var covered []cellIdx
|
|
|
|
|
|
for ri := range grid {
|
|
|
|
|
|
for cj := range grid[ri] {
|
|
|
|
|
|
cell := grid[ri][cj]
|
|
|
|
|
|
cx := (cell.X0 + cell.X1) / 2
|
|
|
|
|
|
cy := (cell.Y0 + cell.Y1) / 2
|
|
|
|
|
|
if cx >= sp.X0 && cx <= sp.X1 && cy >= sp.Y0 && cy <= sp.Y1 {
|
|
|
|
|
|
covered = append(covered, cellIdx{ri, cj})
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
if len(covered) < 2 {
|
|
|
|
|
|
continue
|
|
|
|
|
|
}
|
|
|
|
|
|
sort.Slice(covered, func(a, b int) bool {
|
|
|
|
|
|
if covered[a].r != covered[b].r {
|
|
|
|
|
|
return covered[a].r < covered[b].r
|
|
|
|
|
|
}
|
|
|
|
|
|
return covered[a].c < covered[b].c
|
|
|
|
|
|
})
|
|
|
|
|
|
first := covered[0]
|
|
|
|
|
|
grid[first.r][first.c].X0 = sp.X0
|
|
|
|
|
|
grid[first.r][first.c].Y0 = sp.Y0
|
|
|
|
|
|
grid[first.r][first.c].X1 = sp.X1
|
|
|
|
|
|
grid[first.r][first.c].Y1 = sp.Y1
|
|
|
|
|
|
grid[first.r][first.c].Label = sp.Label
|
|
|
|
|
|
for _, idx := range covered[1:] {
|
2026-06-29 18:46:41 +08:00
|
|
|
|
grid[idx.r][idx.c] = pdf.TSRCell{}
|
2026-06-25 20:16:16 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return grid
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// overlapsY reports whether two cells overlap in the Y dimension.
|
2026-06-29 18:46:41 +08:00
|
|
|
|
func overlapsY(a, b pdf.TSRCell) bool {
|
2026-06-25 20:16:16 +08:00
|
|
|
|
return a.Y0 < b.Y1 && a.Y1 > b.Y0
|
|
|
|
|
|
}
|