Files
ragflow/internal/deepdoc/parser/pdf/layout/chars_boxes.go
Jack 98323e7910 Refactor: oss parser go refactor (#16391)
### What problem does this PR solve?

Package refactor and PDF post process.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Claude <noreply@anthropic.com>
2026-06-29 18:46:41 +08:00

221 lines
6.1 KiB
Go

package layout
import (
"math"
"regexp"
"sort"
"strings"
pdf "ragflow/internal/deepdoc/parser/pdf/type"
util "ragflow/internal/deepdoc/parser/pdf/util"
)
// CharsToBoxes converts raw characters to initial text boxes by grouping
// characters into lines based on vertical overlap.
//
// Python: pdf_parser.__images__ producing self.boxes
func CharsToBoxes(chars []pdf.TextChar, pageNum int, sortByTop bool) []pdf.TextBox {
if len(chars) == 0 {
return nil
}
lines := GroupCharsToLines(chars, sortByTop)
// Page-level column gap threshold from ALL inter-char gaps.
// Falls back to per-line threshold when page has too few gaps.
threshold := pageXGapThreshold(lines)
boxes := make([]pdf.TextBox, 0, len(lines))
for _, line := range lines {
thr := threshold
if thr > 100 {
// No significant column gaps on this page → use per-line threshold.
thr = perLineXGapThreshold(line)
}
subLines := splitLineByXGap(line, thr)
for _, sub := range subLines {
box := LineToTextBox(sub)
box.PageNumber = pageNum
boxes = append(boxes, box)
}
}
return boxes
}
// perLineXGapThreshold computes a dynamic X-gap threshold for column
// splitting within a single line (fallback when page has few gaps).
func perLineXGapThreshold(chars []pdf.TextChar) float64 {
if len(chars) <= 1 {
return 1e9
}
var gaps []float64
for i := 1; i < len(chars); i++ {
g := chars[i].X0 - chars[i-1].X1
gaps = append(gaps, g)
}
if len(gaps) == 0 {
return 1e9
}
sort.Float64s(gaps)
medianGap := gaps[len(gaps)/2]
if medianGap < 6 {
medianGap = 6
}
return medianGap * 2.5
}
// pageXGapThreshold computes a global X-gap column threshold from all
// inter-char gaps across all lines on the page. 95th percentile catches
// column boundaries while excluding word-level gaps.
// Returns a value > 100 when there are too few gaps for reliable p95,
// signalling the caller to fall back to perLineXGapThreshold.
func pageXGapThreshold(lines [][]pdf.TextChar) float64 {
var allGaps []float64
for _, line := range lines {
for i := 1; i < len(line); i++ {
g := line[i].X0 - line[i-1].X1
allGaps = append(allGaps, g)
}
}
if len(allGaps) < 10 {
return 1e9 // too few gaps for reliable p95 → fall back to per-line
}
sort.Float64s(allGaps)
// 95th percentile: only the largest 5% of gaps are column boundaries.
p95 := allGaps[len(allGaps)*95/100]
if p95 < 30 {
p95 = 30 // floor: column gaps are ≥30pt in practice
}
return p95
}
// splitLineByXGap splits a character line into sub-lines where X gaps
// meet or exceed the threshold (column boundaries). Uses >= to match the
// p95 boundary value — a gap exactly at the 95th percentile is a column gap,
// not a word gap.
func splitLineByXGap(chars []pdf.TextChar, threshold float64) [][]pdf.TextChar {
if len(chars) <= 1 {
return [][]pdf.TextChar{chars}
}
var result [][]pdf.TextChar
start := 0
for i := 1; i < len(chars); i++ {
gap := chars[i].X0 - chars[i-1].X1
if gap >= threshold {
result = append(result, chars[start:i])
start = i
}
}
result = append(result, chars[start:])
return result
}
// ---- internal helpers ----
// groupCharsToLines groups characters into horizontal lines based on vertical overlap.
func GroupCharsToLines(chars []pdf.TextChar, sortByTop bool) [][]pdf.TextChar {
if len(chars) == 0 {
return nil
}
key := func(c pdf.TextChar) float64 { return c.Bottom }
if sortByTop {
key = func(c pdf.TextChar) float64 { return c.Top }
}
// Sort by vertical key (Bottom or Top) then x0 using sort.SliceStable.
// Guard against NaN: a NaN key sorts after everything else.
sort.SliceStable(chars, func(i, j int) bool {
ki, kj := key(chars[i]), key(chars[j])
if ki != kj && !math.IsNaN(ki) && !math.IsNaN(kj) {
return ki < kj
}
if math.IsNaN(ki) != math.IsNaN(kj) {
return !math.IsNaN(ki) // non-NaN before NaN
}
return chars[i].X0 < chars[j].X0
})
var lines [][]pdf.TextChar
var currentLine []pdf.TextChar
for _, c := range chars {
if len(currentLine) == 0 {
currentLine = append(currentLine, c)
continue
}
if verticalOverlap(currentLine[len(currentLine)-1], c) {
currentLine = append(currentLine, c)
} else {
if len(currentLine) > 0 {
lines = append(lines, currentLine)
}
currentLine = []pdf.TextChar{c}
}
}
if len(currentLine) > 0 {
lines = append(lines, currentLine)
}
return lines
}
// verticalOverlap checks if two characters are on the same horizontal line.
func verticalOverlap(a, b pdf.TextChar) bool {
mh := math.Max(util.CharHeight(a), util.CharHeight(b))
if mh <= 0 {
mh = 1.0
}
return math.Abs(a.Top-b.Top) < mh*0.5
}
// lineToTextBox converts a line of characters to a single pdf.TextBox.
// asciiWordPattern matches strings composed entirely of ASCII word
// characters. Python uses re.match (prefix match) — the stricter
// full-string match here is equivalent in practice because each
// pdf.TextChar.Text is a single rune, so prevText+currText ≤ 2 chars.
// Python: pdf_parser.py:1528 re.match(r"[0-9a-zA-Z,.:;!%]+", ...)
var asciiWordPattern = regexp.MustCompile(`^[0-9a-zA-Z,.:;!%]+$`)
func LineToTextBox(chars []pdf.TextChar) pdf.TextBox {
if len(chars) == 0 {
return pdf.TextBox{}
}
box := pdf.TextBox{
X0: chars[0].X0,
X1: chars[0].X1,
Top: chars[0].Top,
Bottom: chars[0].Bottom,
}
var textParts []string
for i, c := range chars {
// Insert space between adjacent ASCII words with a visible gap.
// Python: pdf_parser.py:1524-1532 __img_ocr space insertion.
if i > 0 {
prev := chars[i-1]
prevText := strings.TrimSpace(prev.Text)
currText := strings.TrimSpace(c.Text)
if prevText != "" && currText != "" {
gap := c.X0 - prev.X1
minWidth := math.Min(c.X1-c.X0, prev.X1-prev.X0)
if gap >= minWidth/2 &&
asciiWordPattern.MatchString(prevText+currText) {
textParts = append(textParts, " ")
}
}
}
box.X0 = math.Min(box.X0, c.X0)
box.X1 = math.Max(box.X1, c.X1)
box.Top = math.Min(box.Top, c.Top)
box.Bottom = math.Max(box.Bottom, c.Bottom)
textParts = append(textParts, c.Text)
if c.LayoutType != "" {
box.LayoutType = c.LayoutType
}
if c.LayoutNo != "" {
box.LayoutNo = c.LayoutNo
}
}
box.Text = strings.Join(textParts, "")
return box
}