mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-02 08:45:42 +08:00
### What problem does this PR solve? Package refactor and PDF post process. ### Type of change - [x] Refactoring --------- Co-authored-by: Claude <noreply@anthropic.com>
350 lines
11 KiB
Go
350 lines
11 KiB
Go
package table
|
||
|
||
import (
|
||
"log/slog"
|
||
"math"
|
||
"sort"
|
||
|
||
pdf "ragflow/internal/deepdoc/parser/pdf/type"
|
||
)
|
||
|
||
// NOTE: the comment below documents ExtractTableAndReplace, which is defined
// further down in this file.
//
// ExtractTableAndReplace pops table boxes and replaces them with consolidated
// HTML boxes (one per table). This matches Python's _extract_table_figure which
// pops all boxes inside a table DLA region and inserts a single HTML box.
//
// Table boxes whose text matches the data-source discard pattern
// (r"(数据|资料|图表)*来源[:: ]") are removed entirely without replacement —
// matching Python's _extract_table_figure discard behavior.
|
||
|
||
// MarkNoMergeTables traverses boxes in page order. When a caption, title, or
|
||
// reference immediately follows a table, the preceding table is marked NoMerge
|
||
// to prevent cross-page merge. Matches Python's nomerge_lout_no.
|
||
func MarkNoMergeTables(boxes []pdf.TextBox, tables []pdf.TableItem) {
|
||
var lastTableTI int = -1
|
||
for i := range boxes {
|
||
lt := boxes[i].LayoutType
|
||
if lt == pdf.LayoutTypeTable {
|
||
matched := false
|
||
for ti := range tables {
|
||
for _, tp := range tables[ti].Positions {
|
||
if boxOverlapsPosition(boxes[i], tp) {
|
||
lastTableTI = ti
|
||
matched = true
|
||
break
|
||
}
|
||
}
|
||
}
|
||
if !matched {
|
||
lastTableTI = -1
|
||
}
|
||
continue
|
||
}
|
||
if lastTableTI >= 0 && (lt == pdf.LayoutTypeTitle || lt == pdf.DLALabelTableCaption || lt == pdf.DLALabelFigureCaption || lt == pdf.LayoutTypeReference || IsCaptionBox(boxes[i].Text, lt)) {
|
||
tables[lastTableTI].NoMerge = true
|
||
}
|
||
}
|
||
}
|
||
|
||
// Note (applies to ExtractTableAndReplace): boxes must be post-TextMerge +
// post-VerticalMerge. pdf.TableItem.Cells are in crop pixel space; boxes are in
// PDF point space — conversion via Scale/CropOff.

// replacement pairs a table index (tableIdx, an index into the tables slice)
// with the index of a box (boxIdx, an index into the boxes slice) that the
// table's HTML replaces.
type replacement struct {
	tableIdx, boxIdx int
}
|
||
|
||
// buildReplacements scans for data-source-attribution boxes to remove and maps
|
||
// each table to overlapping table-layout boxes, producing the replacement list.
|
||
func buildReplacements(boxes []pdf.TextBox, tables []pdf.TableItem) (map[int]bool, []replacement) {
|
||
removeSet := make(map[int]bool)
|
||
for i := range boxes {
|
||
if boxes[i].LayoutType == pdf.LayoutTypeTable && isDataSourceBox(boxes[i].Text) {
|
||
removeSet[i] = true
|
||
}
|
||
}
|
||
var reps []replacement
|
||
for ti := range tables {
|
||
for i := range boxes {
|
||
if boxes[i].LayoutType != pdf.LayoutTypeTable || removeSet[i] {
|
||
continue
|
||
}
|
||
for _, tp := range tables[ti].Positions {
|
||
if boxOverlapsPosition(boxes[i], tp) {
|
||
reps = append(reps, replacement{tableIdx: ti, boxIdx: i})
|
||
break
|
||
}
|
||
}
|
||
}
|
||
}
|
||
return removeSet, reps
|
||
}
|
||
|
||
func ExtractTableAndReplace(boxes []pdf.TextBox, tables []pdf.TableItem) []pdf.TextBox {
|
||
if len(tables) == 0 {
|
||
return boxes
|
||
}
|
||
// Pre-merge nomerge detection: match Python's nomerge_lout_no.
|
||
// Traverse boxes in page order. When a caption/title/reference is
|
||
// found, mark the preceding table group as NoMerge, preventing
|
||
// cross-page merge when a caption ends a table group.
|
||
// Python: if is_caption(c) or layout_type in ["table caption", "title",
|
||
// "figure caption", "reference"]: nomerge_lout_no.append(lst_lout_no)
|
||
MarkNoMergeTables(boxes, tables)
|
||
|
||
// Merge same-layoutno tables across consecutive pages (Python _extract_table_figure).
|
||
tables = MergeTablesAcrossPages(tables, nil)
|
||
|
||
// Pre-scan: mark data-source-attribution table boxes for removal.
|
||
// Python: if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
|
||
// self.boxes.pop(i); continue — box discarded, no HTML replacement.
|
||
removeSet, replacements := buildReplacements(boxes, tables)
|
||
|
||
// Image-only PDFs (0 boxes) may have tables with cells but no
|
||
// overlapping LayoutType=="table" boxes — generate HTML directly.
|
||
if len(replacements) == 0 && len(boxes) == 0 {
|
||
var out []pdf.TextBox
|
||
for ti := range tables {
|
||
if len(tables[ti].Cells) == 0 {
|
||
continue
|
||
}
|
||
s := tables[ti].Scale
|
||
pageGlobalCells := CellSliceToPageSpace(tables[ti].Cells, tables[ti].CropOffX, tables[ti].CropOffY, s)
|
||
var tableBoxes []pdf.TextBox
|
||
html := ConstructTable(pageGlobalCells, tableBoxes, tables[ti].Caption, &tables[ti])
|
||
if html != "" {
|
||
out = append(out, pdf.TextBox{
|
||
Text: html, LayoutType: "table", PageNumber: 0,
|
||
})
|
||
}
|
||
}
|
||
return out
|
||
}
|
||
if len(replacements) == 0 {
|
||
// No HTML replacements, but data-source boxes still need removal.
|
||
if len(removeSet) == 0 {
|
||
return boxes
|
||
}
|
||
out := make([]pdf.TextBox, 0, len(boxes)-len(removeSet))
|
||
for i, b := range boxes {
|
||
if !removeSet[i] {
|
||
out = append(out, b)
|
||
}
|
||
}
|
||
return out
|
||
}
|
||
|
||
// Distance-based anchor selection (Python's min_rectangle_distance).
|
||
// Find the spatially nearest non-table text box for each table and
|
||
// use that as the anchor, matching insert_table_figures behavior.
|
||
replacedByTable := make(map[int]int)
|
||
for ti := range tables {
|
||
if len(tables[ti].Cells) == 0 {
|
||
continue
|
||
}
|
||
tbl := &tables[ti]
|
||
tblLeft, tblRight := tbl.RegionLeft, tbl.RegionRight
|
||
tblTop, tblBottom := tbl.RegionTop, tbl.RegionBottom
|
||
tblPg := 0
|
||
if len(tbl.Positions) > 0 {
|
||
p := tbl.Positions[0]
|
||
if len(p.PageNumbers) > 0 {
|
||
tblPg = p.PageNumbers[0]
|
||
}
|
||
if tblLeft == 0 && tblRight == 0 && tblTop == 0 && tblBottom == 0 {
|
||
tblLeft, tblRight = p.Left, p.Right
|
||
tblTop, tblBottom = p.Top, p.Bottom
|
||
}
|
||
}
|
||
bestDist := math.MaxFloat64
|
||
bestIdx := -1
|
||
for i, b := range boxes {
|
||
if b.LayoutType == pdf.LayoutTypeTable || b.LayoutType == pdf.LayoutTypeFigure {
|
||
continue
|
||
}
|
||
if b.PageNumber != tblPg {
|
||
continue
|
||
}
|
||
dist := minRectangleDistance(
|
||
b.X0, b.X1, b.Top, b.Bottom,
|
||
tblLeft, tblRight, tblTop, tblBottom,
|
||
)
|
||
if dist < bestDist {
|
||
bestDist = dist
|
||
bestIdx = i
|
||
}
|
||
}
|
||
if bestIdx >= 0 {
|
||
if boxes[bestIdx].Bottom < tblTop {
|
||
bestIdx++
|
||
}
|
||
replacedByTable[ti] = bestIdx
|
||
} else {
|
||
for _, r := range replacements {
|
||
if r.tableIdx == ti {
|
||
if _, ok := replacedByTable[ti]; !ok || r.boxIdx < replacedByTable[ti] {
|
||
replacedByTable[ti] = r.boxIdx
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
for _, r := range replacements {
|
||
removeSet[r.boxIdx] = true
|
||
}
|
||
|
||
// Build HTML for each table using post-merge boxes converted to crop space.
|
||
htmlByTable := make(map[int]string)
|
||
for ti := range tables {
|
||
if len(tables[ti].Cells) == 0 {
|
||
continue
|
||
}
|
||
// Convert TSR cells from crop-pixel space to page-global 72 DPI,
|
||
// matching Python's coordinate space. Text boxes are already in
|
||
// page-global 72 DPI (from ocrMergeChars), so no conversion needed.
|
||
s := tables[ti].Scale
|
||
pageGlobalCells := CellSliceToPageSpace(tables[ti].Cells, tables[ti].CropOffX, tables[ti].CropOffY, s)
|
||
// Collect only table-labelled boxes (Python: filters by layout_type).
|
||
var tableBoxes []pdf.TextBox
|
||
for i := range boxes {
|
||
if boxes[i].LayoutType != pdf.LayoutTypeTable {
|
||
continue
|
||
}
|
||
for _, tp := range tables[ti].Positions {
|
||
if boxOverlapsPosition(boxes[i], tp) {
|
||
tableBoxes = append(tableBoxes, boxes[i])
|
||
break
|
||
}
|
||
}
|
||
}
|
||
slog.Debug("extractTableAndReplace constructTable", "table", ti, "cells", len(pageGlobalCells), "boxes", len(tableBoxes))
|
||
htmlByTable[ti] = ConstructTable(pageGlobalCells, tableBoxes, tables[ti].Caption, &tables[ti])
|
||
}
|
||
|
||
// Sort anchors by position for stable insertion.
|
||
anchorList := make([]struct{ ti, pos int }, 0, len(replacedByTable))
|
||
for ti, pos := range replacedByTable {
|
||
anchorList = append(anchorList, struct{ ti, pos int }{ti, pos})
|
||
}
|
||
sort.Slice(anchorList, func(i, j int) bool { return anchorList[i].pos < anchorList[j].pos })
|
||
|
||
out := make([]pdf.TextBox, 0, len(boxes)-len(removeSet)+len(replacedByTable))
|
||
anchorIdx := 0
|
||
for i, b := range boxes {
|
||
// Insert any HTML boxes whose anchor position is before or at i.
|
||
for anchorIdx < len(anchorList) && anchorList[anchorIdx].pos <= i {
|
||
ti := anchorList[anchorIdx].ti
|
||
html := htmlByTable[ti]
|
||
if html != "" {
|
||
tbl := &tables[ti]
|
||
out = append(out, tableRegionBox(tbl, &b, html))
|
||
}
|
||
anchorIdx++
|
||
}
|
||
if !removeSet[i] {
|
||
out = append(out, b)
|
||
}
|
||
}
|
||
// Remaining anchors after last box.
|
||
for anchorIdx < len(anchorList) {
|
||
ti := anchorList[anchorIdx].ti
|
||
html := htmlByTable[ti]
|
||
if html != "" {
|
||
tbl := &tables[ti]
|
||
last := &boxes[len(boxes)-1]
|
||
out = append(out, tableRegionBox(tbl, last, html))
|
||
}
|
||
anchorIdx++
|
||
}
|
||
return out
|
||
}
|
||
|
||
// consolidateFigures merges figure boxes that share the same LayoutNo
|
||
// (i.e., belong to the same DLA figure region) into a single pdf.TextBox.
|
||
// Matches Python's _extract_table_figure + insert_table_figures which pops
|
||
// individual figure boxes and re-inserts one consolidated figure block
|
||
// per DLA region with combined text.
|
||
//
|
||
// Figure boxes whose text matches the data-source discard pattern
|
||
// (r"(数据|资料|图表)*来源[:: ]") are removed entirely — matching Python's
|
||
// _extract_table_figure discard behavior (pdf_parser.py:1050-1052).
|
||
func ConsolidateFigures(boxes []pdf.TextBox) []pdf.TextBox {
|
||
// Pre-scan: mark data-source-attribution figure boxes for removal.
|
||
// Python: if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
|
||
// self.boxes.pop(i); continue — box discarded.
|
||
removeSet := make(map[int]bool)
|
||
for i, b := range boxes {
|
||
if b.LayoutType == pdf.LayoutTypeFigure && isDataSourceBox(b.Text) {
|
||
removeSet[i] = true
|
||
}
|
||
}
|
||
|
||
// Group figure boxes by (page, layoutno).
|
||
type figKey struct {
|
||
page int
|
||
ln string
|
||
}
|
||
groups := make(map[figKey][]int)
|
||
for i, b := range boxes {
|
||
if b.LayoutType != pdf.LayoutTypeFigure || removeSet[i] {
|
||
continue
|
||
}
|
||
key := figKey{b.PageNumber, b.LayoutNo}
|
||
groups[key] = append(groups[key], i)
|
||
}
|
||
|
||
if len(groups) == 0 {
|
||
// Still need to filter out data-source figure boxes.
|
||
if len(removeSet) == 0 {
|
||
return boxes
|
||
}
|
||
out := make([]pdf.TextBox, 0, len(boxes)-len(removeSet))
|
||
for i, b := range boxes {
|
||
if !removeSet[i] {
|
||
out = append(out, b)
|
||
}
|
||
}
|
||
return out
|
||
}
|
||
|
||
// Collect indices to remove (all group members except the first).
|
||
for _, indices := range groups {
|
||
if len(indices) <= 1 {
|
||
continue
|
||
}
|
||
// Merge into the first box of the group.
|
||
anchor := indices[0]
|
||
for _, idx := range indices[1:] {
|
||
b := boxes[idx]
|
||
boxes[anchor].Text += "\n" + b.Text
|
||
boxes[anchor].X0 = math.Min(boxes[anchor].X0, b.X0)
|
||
boxes[anchor].X1 = math.Max(boxes[anchor].X1, b.X1)
|
||
boxes[anchor].Top = math.Min(boxes[anchor].Top, b.Top)
|
||
boxes[anchor].Bottom = math.Max(boxes[anchor].Bottom, b.Bottom)
|
||
removeSet[idx] = true
|
||
}
|
||
}
|
||
|
||
if len(removeSet) == 0 {
|
||
return boxes
|
||
}
|
||
|
||
out := make([]pdf.TextBox, 0, len(boxes)-len(removeSet))
|
||
for i, b := range boxes {
|
||
if !removeSet[i] {
|
||
out = append(out, b)
|
||
}
|
||
}
|
||
return out
|
||
}
|
||
|
||
// boxOverlapsPosition checks if a pdf.TextBox overlaps a pdf.Position with margin.
|
||
func boxOverlapsPosition(box pdf.TextBox, pos pdf.Position) bool {
|
||
const margin = 2.0
|
||
return box.X0 <= pos.Right+margin && box.X1 >= pos.Left-margin &&
|
||
box.Top <= pos.Bottom+margin && box.Bottom >= pos.Top-margin
|
||
}
|
||
|
||
// rowsToHTML converts grouped TSR cell rows to an HTML table string.
|
||
// spanInfo maps (row,col) → (colspan, rowspan) for spanning cells;
|
||
// covered marks cells hidden by a span. Both may be nil.
|