Files
ragflow/internal/deepdoc/parser/pdf/table/table_annotate.go
Jack 98323e7910 Refactor: oss parser go refactor (#16391)
### What problem does this PR solve?

Package refactor and PDF post process.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Claude <noreply@anthropic.com>
2026-06-29 18:46:41 +08:00

282 lines
9.2 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package table
import (
"fmt"
"math"
pdf "ragflow/internal/deepdoc/parser/pdf/type"
"ragflow/internal/deepdoc/parser/pdf/util"
)
// ── region matching ────────────────────────────────────────────────────
// TableMatch pairs a DLA table region with the indices of the boxes that
// overlap it.
type TableMatch struct {
// Region is the DLA region exactly as it was passed to MatchTableRegions
// (its coordinates are not rescaled).
Region pdf.DLARegion
// BoxIdx holds positions into the boxes slice given to MatchTableRegions.
// It is nil when no box matched the region.
BoxIdx []int
}
// ── region matching ────────────────────────────────────────────────────
// regionOverlapsBox reports whether region, after its coordinates are divided
// by scale, intersects box over at least 40% of the box's area. A box whose
// area is not positive never matches.
func regionOverlapsBox(region pdf.DLARegion, box pdf.TextBox, scale float64) bool {
	const minCoverage = 0.4 // matches Python thr=0.4

	// Bring the region into the box's coordinate space before intersecting.
	inBoxSpace := pdf.DLARegion{
		X0: region.X0 / scale,
		Y0: region.Y0 / scale,
		X1: region.X1 / scale,
		Y1: region.Y1 / scale,
	}
	overlap := util.OverlapInter(&inBoxSpace, &box)
	area := util.Area(&box)
	if area <= 0 {
		return false
	}
	covered := overlap / area
	return covered >= minCoverage
}
// MatchTableRegions collects, for every region labelled as a table, the
// indices of the boxes that regionOverlapsBox accepts for it (the scaled
// region covers at least 40% of the box's area).
//
// A table region is reported when at least one box matched it, or when boxes
// is empty (image-only PDF); in the latter case BoxIdx is nil. According to
// the original author this mirrors Python's _table_transformer_job, which
// processes every table DLA region. Regions with any other label are ignored.
// The result is nil when no region is reported.
func MatchTableRegions(boxes []pdf.TextBox, regions []pdf.DLARegion, scale float64) []TableMatch {
	noBoxes := len(boxes) == 0

	var result []TableMatch
	for ri := range regions {
		region := regions[ri]
		if region.Label != pdf.LayoutTypeTable {
			continue
		}

		var hits []int
		for bi := range boxes {
			if regionOverlapsBox(region, boxes[bi], scale) {
				hits = append(hits, bi)
			}
		}

		// A table without any overlapping box is only kept for box-less pages.
		if len(hits) == 0 && !noBoxes {
			continue
		}
		result = append(result, TableMatch{Region: region, BoxIdx: hits})
	}
	return result
}
// ── layout annotation ──────────────────────────────────────────────────
// AnnotateBoxLayouts sets LayoutType and LayoutNo on each box from the DLA
// regions and returns the surviving boxes followed by synthetic figure boxes.
//
// Per the original author it follows Python's LayoutRecognizer.__call__
// (reached from _layouts_rec, pdf_parser.py:827), which runs
//
//	for lt in priority_order: findLayout(lt)
//
// Layout types are tried in priority order (footer→header→…→equation). For
// each type, every box that has no LayoutType yet is compared with the usable
// regions of that type; the region with the largest overlap is selected and
// accepted when the overlap is at least 40% of the box's area. The first type
// to match wins. Boxes with a non-positive area are never annotated.
//
// Boxes are removed from the result (Python: bxs.pop) when
//   - they have no LayoutType and their text matches util.CIDPattern
//     (e.g. "(cid:123)"), or
//   - they match a garbage layout type (footer/header/reference),
//     pageImgHeight > 0, and garbageKeepFeat returns false for them.
//
// Region coordinates are divided by scale before comparison, and
// pageImgHeight is divided by scale before it is handed to garbageKeepFeat.
// Regions of a garbage layout type with Confidence below 0.4 are ignored.
//
// Figure/equation regions that no kept box matched are appended as text-less
// boxes with LayoutType figure.
//
// When regions is empty, boxes is returned unchanged — no CID filtering is
// applied in that case. Otherwise annotations are written in place on the
// input elements and a freshly allocated slice is returned.
func AnnotateBoxLayouts(boxes []pdf.TextBox, regions []pdf.DLARegion, scale float64, pageImgHeight float64) []pdf.TextBox {
if len(regions) == 0 {
return boxes
}
// Scale all regions to PDF space once.
type scaledRegion struct {
x0, y0, x1, y1 float64
label string
}
scaled := make([]scaledRegion, len(regions))
for i, r := range regions {
scaled[i] = scaledRegion{
x0: r.X0 / scale, y0: r.Y0 / scale,
x1: r.X1 / scale, y1: r.Y1 / scale,
label: r.Label,
}
}
// DLA confidence filter (Python: `score >= 0.4`). A region is usable when
// its confidence is at least 0.4 OR its label is not a garbage layout type,
// so only low-confidence footer/header/reference regions are discarded.
regionOK := make([]bool, len(regions))
for i, r := range regions {
regionOK[i] = r.Confidence >= 0.4 || !isGarbageLayoutType(r.Label)
}
// Pre-compute per-type index for each region (Python: matched index within
// filtered layouts_of_type list). "text" regions get 0,1,2... independent
// of "figure" regions. Regions rejected by regionOK do not consume an index.
typeIndex := make([]int, len(regions))
typeCounters := make(map[string]int)
for j, r := range scaled {
if regionOK[j] {
typeIndex[j] = typeCounters[r.label]
typeCounters[r.label]++
}
}
// Track visited regions (Python: layout["visited"] = True). Only consulted
// below when deciding which figure/equation regions get a synthetic box.
visited := make([]bool, len(regions))
// Marks for Python-style pop removal.
dropped := make([]bool, len(boxes))
// Priority order matching Python's findLayout loop.
priorityOrder := []string{
pdf.LayoutTypeFooter, pdf.LayoutTypeHeader, pdf.LayoutTypeReference,
pdf.DLALabelFigureCaption, pdf.DLALabelTableCaption,
pdf.LayoutTypeTitle, pdf.LayoutTypeTable, pdf.LayoutTypeText,
pdf.LayoutTypeFigure, pdf.LayoutTypeEquation,
}
for _, ty := range priorityOrder {
for i := range boxes {
// Already annotated (by an earlier type or by the caller) or removed.
if boxes[i].LayoutType != "" || dropped[i] {
continue
}
// CID garbage: pop the box entirely (Python: bxs.pop(i)).
if util.CIDPattern.MatchString(boxes[i].Text) {
dropped[i] = true
continue
}
boxArea := (boxes[i].X1 - boxes[i].X0) * (boxes[i].Bottom - boxes[i].Top)
if boxArea <= 0 {
continue
}
// Find the usable region of type ty with the largest overlap ratio.
bestOverlap := 0.0
bestJ := -1
for j, r := range scaled {
if r.label != ty || !regionOK[j] {
continue
}
// Intersection rectangle of the region and the box.
ix0 := math.Max(r.x0, boxes[i].X0)
iy0 := math.Max(r.y0, boxes[i].Top)
ix1 := math.Min(r.x1, boxes[i].X1)
iy1 := math.Min(r.y1, boxes[i].Bottom)
if ix0 < ix1 && iy0 < iy1 {
ov := (ix1 - ix0) * (iy1 - iy0) / boxArea
if ov > bestOverlap {
bestOverlap = ov
bestJ = j
}
}
}
if bestJ >= 0 && bestOverlap >= 0.4 {
// Garbage layout that garbageKeepFeat does not exempt → pop (Python:
// bxs.pop(i)). garbageKeepFeat exempts a footer whose Bottom is below
// 90% of the page height and a header whose Top is above 10% of it;
// reference boxes are never exempted. The check is skipped entirely
// when pageImgHeight is not positive.
if isGarbageLayoutType(ty) && pageImgHeight > 0 && !garbageKeepFeat(ty, boxes[i], pageImgHeight/scale) {
dropped[i] = true
continue
}
// Reached only for boxes that are kept, so a region matched solely by
// dropped boxes stays unvisited.
visited[bestJ] = true
// Python: equation mapped to "figure" for layout_type
if ty == pdf.LayoutTypeEquation {
boxes[i].LayoutType = pdf.LayoutTypeFigure
} else {
boxes[i].LayoutType = ty
}
// Python: f"{layout_type}-{matched}" where matched is per-type index.
// The prefix is ty itself, so equation boxes get "equation-N" here even
// though their LayoutType is figure.
boxes[i].LayoutNo = fmt.Sprintf("%s-%d", ty, typeIndex[bestJ])
}
}
}
// Compact: remove popped boxes into a new backing array (Python
// bxs.pop). Allocating a fresh slice is deliberate: annotations were
// set in-place on the input elements, and callers (enrichWithDeepDoc)
// rely on positional stability of the original slice for their
// write-back loop. Reusing the input backing array would shift
// survivors forward and break that index mapping.
survivors := 0
for i := range boxes {
if !dropped[i] {
survivors++
}
}
compacted := make([]pdf.TextBox, 0, survivors)
for i := range boxes {
if !dropped[i] {
compacted = append(compacted, boxes[i])
}
}
boxes = compacted
// Synthetic figure boxes for unmatched figure/equation regions (Python:
// dla_cli.py:187-195). Use a fresh per-type counter for synthetic boxes.
// NOTE(review): this counter restarts at 0 and is independent of typeIndex,
// so a synthetic "figure-N" can coincide with the LayoutNo of a matched
// figure box — confirm that downstream grouping tolerates this.
synthIdx := 0
for j, r := range scaled {
if !regionOK[j] || visited[j] {
continue
}
if r.label != pdf.LayoutTypeFigure && r.label != pdf.LayoutTypeEquation {
continue
}
boxes = append(boxes, pdf.TextBox{
X0: r.x0,
X1: r.x1,
Top: r.y0,
Bottom: r.y1,
Text: "",
LayoutType: pdf.LayoutTypeFigure,
LayoutNo: fmt.Sprintf("figure-%d", synthIdx),
})
synthIdx++
}
return boxes
}
// ── garbage layout helpers ────────────────────────────────────────────
// garbageLayoutTypes lists the layout types that are handled as garbage
// (Python: self.garbage_layouts). Every listed type maps to true.
var garbageLayoutTypes = map[string]bool{
	pdf.LayoutTypeFooter:    true,
	pdf.LayoutTypeHeader:    true,
	pdf.LayoutTypeReference: true,
}
// isGarbageLayoutType reports whether ty is flagged in garbageLayoutTypes.
func isGarbageLayoutType(ty string) bool {
	flagged, known := garbageLayoutTypes[ty]
	return known && flagged
}
// garbageKeepFeat reports whether a box matched to a garbage layout type is
// exempt from removal (Python: keep_feats in LayoutRecognizer.__call__).
//
//   - footer: exempt when box.Bottom < 0.9 × page height, i.e. the box ends
//     before the last 10% of the page (assuming y grows downward).
//   - header: exempt when box.Top > 0.1 × page height, i.e. the box starts
//     after the first 10% of the page.
//   - any other type (e.g. reference): never exempt.
//
// In other words, footers/headers that actually sit in the page's edge bands
// are NOT exempt; presumably the exempted ones are body content that the DLA
// mislabelled — verify against the Python reference.
//
// pageImgHeight must be expressed in the same coordinate space as box; the
// caller in AnnotateBoxLayouts passes the image height divided by scale.
func garbageKeepFeat(ty string, box pdf.TextBox, pageImgHeight float64) bool {
	if ty == pdf.LayoutTypeFooter {
		bottomBand := pageImgHeight * 0.9
		return box.Bottom < bottomBand
	}
	if ty == pdf.LayoutTypeHeader {
		topBand := pageImgHeight * 0.1
		return box.Top > topBand
	}
	return false
}
// WriteTableAnnotations fills in the table grid fields (R/C/H/SP and their
// extents) on the boxes selected by boxIdx.
//
// Steps:
//  1. every cell is shifted by (cropOffX, cropOffY) via CellAddOffset;
//  2. the selected boxes are copied with their coordinates multiplied by
//     scale (presumably to match the cells' coordinate space), carrying over
//     only geometry, LayoutType and Text;
//  3. tb.GroupCells arranges the shifted cells into a grid and
//     AnnotateTableBoxes annotates the scaled copies against it;
//  4. the annotation fields are copied back onto boxes, with the extent
//     fields (RTop/RBott/HTop/HBott/HLeft/HRight/CLeft/CRight) divided by
//     scale again.
//
// Only those annotation fields are written on boxes; every idx in boxIdx must
// be a valid index into boxes.
func WriteTableAnnotations(boxes []pdf.TextBox, boxIdx []int, cells []pdf.TSRCell, scale, cropOffX, cropOffY float64, tb pdf.TableBuilder) {
	shifted := make([]pdf.TSRCell, 0, len(cells))
	for _, cell := range cells {
		shifted = append(shifted, CellAddOffset(cell, cropOffX, cropOffY))
	}

	scaledBoxes := make([]pdf.TextBox, 0, len(boxIdx))
	for _, idx := range boxIdx {
		src := &boxes[idx]
		scaledBoxes = append(scaledBoxes, pdf.TextBox{
			X0:         src.X0 * scale,
			X1:         src.X1 * scale,
			Top:        src.Top * scale,
			Bottom:     src.Bottom * scale,
			LayoutType: src.LayoutType,
			Text:       src.Text,
		})
	}

	grid := tb.GroupCells(shifted)
	AnnotateTableBoxes(scaledBoxes, grid)

	// Copy the annotations back, undoing the scaling on positional fields.
	for k, idx := range boxIdx {
		annotated := &scaledBoxes[k]
		target := &boxes[idx]

		target.R = annotated.R
		target.RTop = annotated.RTop / scale
		target.RBott = annotated.RBott / scale

		target.H = annotated.H
		target.HTop = annotated.HTop / scale
		target.HBott = annotated.HBott / scale
		target.HLeft = annotated.HLeft / scale
		target.HRight = annotated.HRight / scale

		target.C = annotated.C
		target.CLeft = annotated.CLeft / scale
		target.CRight = annotated.CRight / scale

		target.SP = annotated.SP
	}
}