Files
ragflow/internal/deepdoc/parser/pdf/table/deepdoc_table_builder.go
Jack 98323e7910 Refactor: oss parser go refactor (#16391)
### What problem does this PR solve?

Package refactor and PDF post-processing.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Claude <noreply@anthropic.com>
2026-06-29 18:46:41 +08:00

147 lines
4.0 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package table
import (
"context"
"image"
"sort"
"strings"
pdf "ragflow/internal/deepdoc/parser/pdf/type"
)
// DeepDocTableBuilder is the pdf.TableBuilder implementation backed by the
// DeepDoc table structure recognition service.
//
// The builder does not inject labels itself; that is done by the
// NewTableBuilderFor factory.
type DeepDocTableBuilder struct {
	// doc is the analyzer whose TSR method DetectCells delegates to.
	doc pdf.DocAnalyzer
}
// NewDeepDocTableBuilder returns a builder that wraps doc.
//
// The caller is responsible for setting labels on the underlying client
// (see deepdoc.go NewTableBuilderFor); this constructor only stores doc.
func NewDeepDocTableBuilder(doc pdf.DocAnalyzer) *DeepDocTableBuilder {
	builder := new(DeepDocTableBuilder)
	builder.doc = doc
	return builder
}
// Name returns the fixed identifier of this builder, "deepdoc".
func (b *DeepDocTableBuilder) Name() string {
	const builderName = "deepdoc"
	return builderName
}
// DetectCells forwards the cropped image to the wrapped analyzer's TSR
// method and returns its cells and error unchanged.
func (b *DeepDocTableBuilder) DetectCells(ctx context.Context, cropped image.Image) ([]pdf.TSRCell, error) {
	detected, err := b.doc.TSR(ctx, cropped)
	return detected, err
}
// GroupCells builds a row×column grid from structural cells.
//
// Input: structural cells with labels "table row", "table column",
// "table column header", "table spanning cell". Cells with any other label
// are ignored. If several header cells are present, the last one wins.
//
// Returns nil when cells is empty or contains no "table row" cell.
// Otherwise the result has one slice per row and one entry per column.
//
// Algorithm:
//  1. Extract row boundaries from "table row" cells, sort by Y.
//  2. Extract column boundaries from "table column" cells, sort by X.
//     When there are none, a single column spanning the horizontal union
//     of all rows is synthesized.
//  3. Cross-product: grid[r][c].X0/Y0/X1/Y1 = col[c] × row[r].
//  4. Header propagation: rows overlapping the header cell's Y range
//     get Label = "table column header".
//  5. Span injection: for each "table spanning cell", find grid cells
//     whose center falls inside the span bbox. The top-left cell gets
//     the span label + extended bbox; remaining cells are zeroed (covered).
//     Cells already covered by an earlier span are never matched again.
func (b *DeepDocTableBuilder) GroupCells(cells []pdf.TSRCell) [][]pdf.TSRCell {
	if len(cells) == 0 {
		return nil
	}

	// 1. Collect and sort structural elements.
	var rows, cols, spans []pdf.TSRCell
	var header *pdf.TSRCell
	for _, c := range cells {
		switch {
		case strings.HasSuffix(c.Label, "table row"):
			rows = append(rows, c)
		case strings.HasSuffix(c.Label, "table column"):
			cols = append(cols, c)
		case strings.Contains(strings.ToLower(c.Label), "spanning"):
			spans = append(spans, c)
		case strings.HasSuffix(c.Label, "table column header"):
			// Copy before taking the address so the pointer does not alias
			// the loop variable.
			h := c
			header = &h
		}
	}
	if len(rows) == 0 {
		return nil
	}
	SortYFirstly(rows, 10)
	SortXFirstly(cols, 10)

	// 2. If no column cells, synthesize one wide column from row extents.
	// The horizontal extent is the union over all rows, so a narrower first
	// row cannot clip the cells of the rows below it.
	if len(cols) == 0 {
		x0, x1 := rows[0].X0, rows[0].X1
		for i := 1; i < len(rows); i++ {
			if rows[i].X0 < x0 {
				x0 = rows[i].X0
			}
			if rows[i].X1 > x1 {
				x1 = rows[i].X1
			}
		}
		cols = []pdf.TSRCell{{X0: x0, Y0: rows[0].Y0, X1: x1, Y1: rows[len(rows)-1].Y1, Label: "table column"}}
	}

	// 3. Cross-product to build grid. hidden mirrors the grid and records
	// which cells have been zeroed by a span.
	grid := make([][]pdf.TSRCell, len(rows))
	hidden := make([][]bool, len(rows))
	for r := range rows {
		grid[r] = make([]pdf.TSRCell, len(cols))
		hidden[r] = make([]bool, len(cols))
		for c := range cols {
			grid[r][c] = pdf.TSRCell{
				X0: cols[c].X0,
				Y0: rows[r].Y0,
				X1: cols[c].X1,
				Y1: rows[r].Y1,
			}
		}
	}

	// 4. Header propagation.
	if header != nil {
		for ri := range rows {
			if rows[ri].Y0 >= header.Y0 && rows[ri].Y1 <= header.Y1 ||
				overlapsY(rows[ri], *header) {
				for cj := range grid[ri] {
					grid[ri][cj].Label = "table column header"
				}
			}
		}
	}

	// 5. Span injection.
	type cellIdx struct{ r, c int }
	for _, sp := range spans {
		var covered []cellIdx
		for ri := range grid {
			for cj := range grid[ri] {
				// A zeroed cell has its center at the origin; without this
				// guard any span whose bbox contains (0,0) would match it
				// again and inflate the covered count.
				if hidden[ri][cj] {
					continue
				}
				cell := grid[ri][cj]
				cx := (cell.X0 + cell.X1) / 2
				cy := (cell.Y0 + cell.Y1) / 2
				if cx >= sp.X0 && cx <= sp.X1 && cy >= sp.Y0 && cy <= sp.Y1 {
					covered = append(covered, cellIdx{ri, cj})
				}
			}
		}
		// A span that covers fewer than two cells merges nothing.
		if len(covered) < 2 {
			continue
		}
		// Row-major order puts the top-left covered cell first.
		sort.Slice(covered, func(a, b int) bool {
			if covered[a].r != covered[b].r {
				return covered[a].r < covered[b].r
			}
			return covered[a].c < covered[b].c
		})
		first := covered[0]
		anchor := &grid[first.r][first.c]
		anchor.X0 = sp.X0
		anchor.Y0 = sp.Y0
		anchor.X1 = sp.X1
		anchor.Y1 = sp.Y1
		anchor.Label = sp.Label
		for _, idx := range covered[1:] {
			grid[idx.r][idx.c] = pdf.TSRCell{}
			hidden[idx.r][idx.c] = true
		}
	}
	return grid
}
// overlapsY reports whether the Y ranges of a and b intersect. The
// comparison is strict, so ranges that merely touch at an edge do not
// count as overlapping.
func overlapsY(a, b pdf.TSRCell) bool {
	startsBeforeOtherEnds := a.Y0 < b.Y1
	endsAfterOtherStarts := a.Y1 > b.Y0
	return startsBeforeOtherEnds && endsAfterOtherStarts
}