mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
222 lines
6.3 KiB
Go
222 lines
6.3 KiB
Go
|
|
package parser
|
||
|
|
|
||
|
|
import (
|
||
|
|
"math"
|
||
|
|
"sort"
|
||
|
|
)
|
||
|
|
|
||
|
|
// ── Post-TSR layout annotation (Python: pdf_parser.py gather/layouts_cleanup) ──
|
||
|
|
|
||
|
|
// sortYFirstly sorts cells by top, with fuzzy threshold: if two cells are
|
||
|
|
// within threshold Y pixels, sort by X instead (same-row ordering).
|
||
|
|
// Python: Recognizer.sort_Y_firstly(arr, threshold)
|
||
|
|
func sortYFirstly(cells []TSRCell, threshold float64) {
|
||
|
|
sort.Slice(cells, func(i, j int) bool {
|
||
|
|
diff := cells[i].Y0 - cells[j].Y0
|
||
|
|
if math.Abs(diff) < threshold {
|
||
|
|
return cells[i].X0 < cells[j].X0
|
||
|
|
}
|
||
|
|
return diff < 0
|
||
|
|
})
|
||
|
|
}
|
||
|
|
|
||
|
|
// sortXFirstly sorts cells by x0, with fuzzy threshold for top.
|
||
|
|
func sortXFirstly(cells []TSRCell, threshold float64) {
|
||
|
|
sort.Slice(cells, func(i, j int) bool {
|
||
|
|
diff := cells[i].X0 - cells[j].X0
|
||
|
|
if math.Abs(diff) < threshold {
|
||
|
|
return cells[i].Y0 < cells[j].Y0
|
||
|
|
}
|
||
|
|
return diff < 0
|
||
|
|
})
|
||
|
|
}
|
||
|
|
|
||
|
|
// layoutCleanup removes duplicate/overlapping cells of the same type.
|
||
|
|
// Python: Recognizer.layouts_cleanup(boxes, layouts, far=2, thr=0.7)
|
||
|
|
//
|
||
|
|
// For each cell, checks the next `far` cells; if they overlap significantly
|
||
|
|
// AND have the same label type, the one with lower score (or less box overlap
|
||
|
|
// area) is removed.
|
||
|
|
func layoutCleanup(cells []TSRCell, boxes []TextBox, far int, thr float64) []TSRCell {
|
||
|
|
// cells are assumed pre-sorted (caller sorts before passing)
|
||
|
|
out := make([]TSRCell, len(cells))
|
||
|
|
copy(out, cells)
|
||
|
|
|
||
|
|
i := 0
|
||
|
|
for i+1 < len(out) {
|
||
|
|
j := i + 1
|
||
|
|
limit := i + far
|
||
|
|
if limit > len(out) {
|
||
|
|
limit = len(out)
|
||
|
|
}
|
||
|
|
for j < limit && (out[i].Label != "" && out[i].Label != out[j].Label || notOverlapped(out[i], out[j])) {
|
||
|
|
j++
|
||
|
|
}
|
||
|
|
if j >= limit {
|
||
|
|
i++
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
// Cells i and j overlap and have same type. Keep one.
|
||
|
|
areaI := OverlapRatioA(&out[i], &out[j])
|
||
|
|
areaJ := OverlapRatioA(&out[j], &out[i])
|
||
|
|
if areaI < thr && areaJ < thr {
|
||
|
|
i++
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
|
||
|
|
// Prefer the one that overlaps more with text boxes.
|
||
|
|
boxAreaI, boxAreaJ := 0.0, 0.0
|
||
|
|
for _, b := range boxes {
|
||
|
|
if !tsrBoxOverlap(b, out[i]) {
|
||
|
|
boxAreaI += OverlapInter(&b, &out[i])
|
||
|
|
}
|
||
|
|
if !tsrBoxOverlap(b, out[j]) {
|
||
|
|
boxAreaJ += OverlapInter(&b, &out[j])
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if boxAreaI >= boxAreaJ {
|
||
|
|
out = append(out[:j], out[j+1:]...)
|
||
|
|
} else {
|
||
|
|
out = append(out[:i], out[i+1:]...)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return out
|
||
|
|
}
|
||
|
|
|
||
|
|
// notOverlapped returns true if cells a and b do NOT overlap.
|
||
|
|
func notOverlapped(a, b TSRCell) bool {
|
||
|
|
return a.X1 < b.X0 || a.X0 > b.X1 || a.Y1 < b.Y0 || a.Y0 > b.Y1
|
||
|
|
}
|
||
|
|
|
||
|
|
// tsrBoxOverlap returns true if a TextBox and a TSRCell do NOT overlap.
|
||
|
|
func tsrBoxOverlap(b TextBox, c TSRCell) bool {
|
||
|
|
return b.X1 < c.X0 || b.X0 > c.X1 || b.Bottom < c.Y0 || b.Top > c.Y1
|
||
|
|
}
|
||
|
|
|
||
|
|
// findOverlappedWithThreshold returns the index of the cell with the best
|
||
|
|
// bidirectional overlap >= thr, or -1 if none.
|
||
|
|
// Python: Recognizer.find_overlapped_with_threshold(box, boxes, thr=0.3)
|
||
|
|
// Python uses max(boxRatio, cellRatio) for both gate and scoring.
|
||
|
|
func findOverlappedWithThreshold(box TextBox, cells []TSRCell, thr float64) int {
|
||
|
|
boxArea := Area(&box)
|
||
|
|
if boxArea <= 0 {
|
||
|
|
return -1
|
||
|
|
}
|
||
|
|
bestIdx := -1
|
||
|
|
bestOverlap := thr // Python: max_overlap starts at thr
|
||
|
|
for i, c := range cells {
|
||
|
|
cellArea := Area(&c)
|
||
|
|
if cellArea <= 0 {
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
ol := OverlapInter(&box, &c)
|
||
|
|
if ol <= 0 {
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
boxRatio := ol / boxArea
|
||
|
|
cellRatio := ol / cellArea
|
||
|
|
// Python: max(cls.overlapped_area(box, layout), cls.overlapped_area(layout, box))
|
||
|
|
overlap := math.Max(boxRatio, cellRatio)
|
||
|
|
if overlap >= bestOverlap {
|
||
|
|
bestOverlap = overlap
|
||
|
|
bestIdx = i
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return bestIdx
|
||
|
|
}
|
||
|
|
|
||
|
|
// findHorizontallyTightestFit returns the index of the column cell that
|
||
|
|
// horizontally contains the box with minimal width difference.
|
||
|
|
// Python: Recognizer.find_horizontally_tightest_fit(b, clmns)
|
||
|
|
// findHorizontallyTightestFit returns the column index with minimum
|
||
|
|
// edge distance to the box. Python: Recognizer.find_horizontally_tightest_fit.
|
||
|
|
func findHorizontallyTightestFit(box TextBox, clmns []TSRCell) int {
|
||
|
|
best := -1
|
||
|
|
bestDist := float64(1<<63 - 1)
|
||
|
|
for i, c := range clmns {
|
||
|
|
// Minimum edge distance between box and column boundaries.
|
||
|
|
dl := math.Abs(box.X0 - c.X0)
|
||
|
|
dr := math.Abs(box.X1 - c.X1)
|
||
|
|
d := math.Min(dl, dr)
|
||
|
|
if d < bestDist {
|
||
|
|
bestDist = d
|
||
|
|
best = i
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return best
|
||
|
|
}
|
||
|
|
|
||
|
|
// annotateTableBoxes tags table boxes with row/header/column indices using
|
||
|
|
// TSR cell labels. Matching Python's R/H/C/SP annotation logic.
|
||
|
|
//
|
||
|
|
// Python: pdf_parser.py:518-554
|
||
|
|
func annotateTableBoxes(boxes []TextBox, grid [][]TSRCell) {
|
||
|
|
// grid[0] is the header row. Spans are computed by calSpans later.
|
||
|
|
var headers, spans []TSRCell
|
||
|
|
var clmns []TSRCell
|
||
|
|
if len(grid) > 0 {
|
||
|
|
headers = grid[0]
|
||
|
|
clmns = append(clmns, grid[0]...)
|
||
|
|
}
|
||
|
|
sortYFirstly(headers, 10)
|
||
|
|
sortXFirstly(clmns, 10)
|
||
|
|
|
||
|
|
for i := range boxes {
|
||
|
|
if boxes[i].LayoutType != LayoutTypeTable {
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
// Grid-based R/C: match box to the row and column it overlaps.
|
||
|
|
for ri, row := range grid {
|
||
|
|
if idx := findOverlappedWithThreshold(boxes[i], row, 0.3); idx >= 0 {
|
||
|
|
boxes[i].R = ri
|
||
|
|
boxes[i].RTop = row[0].Y0
|
||
|
|
boxes[i].RBott = row[0].Y1
|
||
|
|
for ci, cell := range row {
|
||
|
|
if !tsrBoxOverlap(boxes[i], cell) {
|
||
|
|
boxes[i].C = ci
|
||
|
|
boxes[i].CLeft = cell.X0
|
||
|
|
boxes[i].CRight = cell.X1
|
||
|
|
break
|
||
|
|
}
|
||
|
|
}
|
||
|
|
break
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if idx := findOverlappedWithThreshold(boxes[i], headers, 0.3); idx >= 0 {
|
||
|
|
boxes[i].HTop = headers[idx].Y0
|
||
|
|
boxes[i].HBott = headers[idx].Y1
|
||
|
|
boxes[i].HLeft = headers[idx].X0
|
||
|
|
boxes[i].HRight = headers[idx].X1
|
||
|
|
boxes[i].H = idx
|
||
|
|
}
|
||
|
|
if len(clmns) > 1 {
|
||
|
|
if idx := findHorizontallyTightestFit(boxes[i], clmns); idx >= 0 {
|
||
|
|
boxes[i].C = idx
|
||
|
|
boxes[i].CLeft = clmns[idx].X0
|
||
|
|
boxes[i].CRight = clmns[idx].X1
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if idx := findOverlappedWithThreshold(boxes[i], spans, 0.3); idx >= 0 {
|
||
|
|
boxes[i].SP = idx
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Two-pass C fallback: after all R values are assigned, compute C by X-order within each row.
|
||
|
|
// This matches Python's behavior when TSR provides few "table column" cells.
|
||
|
|
if len(clmns) <= 1 {
|
||
|
|
// Collect all table boxes grouped by R.
|
||
|
|
rBoxes := make(map[int][]int)
|
||
|
|
for i := range boxes {
|
||
|
|
if boxes[i].LayoutType == LayoutTypeTable {
|
||
|
|
rBoxes[boxes[i].R] = append(rBoxes[boxes[i].R], i)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
for _, indices := range rBoxes {
|
||
|
|
sort.Slice(indices, func(a, b int) bool { return boxes[indices[a]].X0 < boxes[indices[b]].X0 })
|
||
|
|
for ci, bi := range indices {
|
||
|
|
boxes[bi].C = ci
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|