mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
### What problem does this PR solve?
Provides an HTTP API based on an ONNX model, and ports pdf_parser.py to Go.
### Type of change
- [x] Refactoring
382 lines
10 KiB
Go
382 lines
10 KiB
Go
package parser
|
|
|
|
import (
|
|
"log/slog"
|
|
"math"
|
|
"regexp"
|
|
"slices"
|
|
"sort"
|
|
"strings"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// ---- Column assignment ----
|
|
|
|
// AssignColumn groups boxes into columns on each page by KMeans x0 clustering
|
|
// with silhouette score selection, matching Python's _assign_column().
|
|
//
|
|
// Python: pdf_parser.py:739 _assign_column()
|
|
func AssignColumn(boxes []TextBox, zoom float64) []TextBox {
|
|
if len(boxes) == 0 {
|
|
return boxes
|
|
}
|
|
|
|
pageGroups := make(map[int][]int)
|
|
for i, b := range boxes {
|
|
pageGroups[b.PageNumber] = append(pageGroups[b.PageNumber], i)
|
|
}
|
|
|
|
result := make([]TextBox, len(boxes))
|
|
copy(result, boxes)
|
|
|
|
// Step A: per-page best k using silhouette score.
|
|
pageCols := make(map[int]int)
|
|
for pg, indices := range pageGroups {
|
|
n := len(indices)
|
|
if n < 2 {
|
|
pageCols[pg] = 1
|
|
for _, idx := range indices {
|
|
result[idx].ColID = 0
|
|
}
|
|
continue
|
|
}
|
|
|
|
// Extract x0 values and apply indent tolerance (12% of page width).
|
|
x0s := make([]float64, n)
|
|
minX0 := math.MaxFloat64
|
|
maxX1 := 0.0
|
|
for i, idx := range indices {
|
|
x0s[i] = boxes[idx].X0
|
|
if x0s[i] < minX0 {
|
|
minX0 = x0s[i]
|
|
}
|
|
if boxes[idx].X1 > maxX1 {
|
|
maxX1 = boxes[idx].X1
|
|
}
|
|
}
|
|
pageWidth := maxX1 - minX0
|
|
indentTol := pageWidth * 0.12
|
|
|
|
for i := range x0s {
|
|
if math.Abs(x0s[i]-minX0) < indentTol {
|
|
x0s[i] = minX0
|
|
}
|
|
}
|
|
|
|
// Try k = 1 .. min(4, n), pick best by silhouette.
|
|
maxTry := min(4, n)
|
|
if maxTry < 2 {
|
|
maxTry = 1
|
|
}
|
|
bestK, bestScore := 1, -1.0
|
|
|
|
for k := 1; k <= maxTry; k++ {
|
|
labels, _ := kmeans1D(x0s, k)
|
|
var score float64
|
|
if k > 1 {
|
|
score = silhouette1D(x0s, labels)
|
|
}
|
|
// score = 0 for k=1; score = -1 if silhouette undefined.
|
|
if score > bestScore {
|
|
bestScore = score
|
|
bestK = k
|
|
}
|
|
}
|
|
pageCols[pg] = bestK
|
|
}
|
|
|
|
// Step B: assign col_id per page using per-page best k.
|
|
// Labels are remapped by centroid x-order: leftmost column → 0.
|
|
for pg, indices := range pageGroups {
|
|
if len(indices) == 0 {
|
|
continue
|
|
}
|
|
k := pageCols[pg]
|
|
if len(indices) < k {
|
|
k = 1
|
|
}
|
|
|
|
x0s := make([]float64, len(indices))
|
|
for i, idx := range indices {
|
|
x0s[i] = boxes[idx].X0
|
|
}
|
|
|
|
labels, centroids := kmeans1D(x0s, k)
|
|
|
|
// Sort centroids by x position, remap labels left→right.
|
|
type clPair struct {
|
|
center float64
|
|
label int
|
|
}
|
|
var pairs []clPair
|
|
for lbl, c := range centroids {
|
|
pairs = append(pairs, clPair{c, lbl})
|
|
}
|
|
sort.Slice(pairs, func(i, j int) bool { return pairs[i].center < pairs[j].center })
|
|
remap := make(map[int]int, k)
|
|
for newL, p := range pairs {
|
|
remap[p.label] = newL
|
|
}
|
|
|
|
for i, idx := range indices {
|
|
result[idx].ColID = remap[labels[i]]
|
|
}
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// ---- Text merge (horizontal) ----
|
|
|
|
// TextMerge horizontally merges adjacent boxes at similar vertical positions.
|
|
//
|
|
// Python: pdf_parser.py:888 _text_merge()
|
|
func TextMerge(boxes []TextBox, medianHeights map[int]float64, zoom float64) []TextBox {
|
|
if len(boxes) < 2 {
|
|
return boxes
|
|
}
|
|
// Build output via collect: O(n) instead of O(n²) slice-element removal.
|
|
out := make([]TextBox, 0, len(boxes))
|
|
i := 0
|
|
for i < len(boxes) {
|
|
cur := boxes[i]
|
|
i++
|
|
for i < len(boxes) {
|
|
nxt := boxes[i]
|
|
if cur.PageNumber != nxt.PageNumber || cur.ColID != nxt.ColID {
|
|
break
|
|
}
|
|
// Python: b.get("layoutno", "0") != b_.get("layoutno", "1") —
|
|
// asymmetric defaults mean empty/missing layoutno never merge horizontally.
|
|
if cur.LayoutNo != nxt.LayoutNo || cur.LayoutNo == "" || nxt.LayoutNo == "" ||
|
|
cur.LayoutType == LayoutTypeTable || cur.LayoutType == LayoutTypeFigure || cur.LayoutType == LayoutTypeEquation {
|
|
break
|
|
}
|
|
mh := medianHeights[cur.PageNumber]
|
|
if mh <= 0 {
|
|
mh = 10
|
|
}
|
|
if math.Abs(BoxYDis(cur, nxt)) < mh/3 {
|
|
cur.X1 = nxt.X1
|
|
cur.Top = (cur.Top + nxt.Top) / 2
|
|
cur.Bottom = (cur.Bottom + nxt.Bottom) / 2
|
|
cur.Text += nxt.Text
|
|
i++
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
out = append(out, cur)
|
|
}
|
|
return out
|
|
}
|
|
|
|
// ---- Naive vertical merge ----
|
|
|
|
// NaiveVerticalMerge vertically merges boxes on the same page/column.
|
|
//
|
|
// Python: pdf_parser.py:926 _naive_vertical_merge()
|
|
func NaiveVerticalMerge(boxes []TextBox, medianHeights map[int]float64, medianWidths map[int]float64, isEnglish bool) []TextBox {
|
|
if len(boxes) < 2 {
|
|
return boxes
|
|
}
|
|
// Group by page only — matches Python's _naive_vertical_merge which
|
|
// hardcodes col="x" (pdf_parser.py:868), ignoring column assignment.
|
|
// Cross-column merges are prevented by the 30% horizontal overlap check.
|
|
groups := make(map[int][]int)
|
|
for i, b := range boxes {
|
|
groups[b.PageNumber] = append(groups[b.PageNumber], i)
|
|
}
|
|
// Sort page keys for deterministic output order (Python dict preserves
|
|
// insertion order since 3.7, Go map iteration is random).
|
|
pageKeys := make([]int, 0, len(groups))
|
|
for pg := range groups {
|
|
pageKeys = append(pageKeys, pg)
|
|
}
|
|
sort.Ints(pageKeys)
|
|
|
|
var result []TextBox
|
|
for _, pg := range pageKeys {
|
|
indices := groups[pg]
|
|
sort.Slice(indices, func(i, j int) bool {
|
|
bi, bj := boxes[indices[i]], boxes[indices[j]]
|
|
if bi.Top != bj.Top {
|
|
return bi.Top < bj.Top
|
|
}
|
|
return bi.X0 < bj.X0
|
|
})
|
|
bxs := make([]TextBox, len(indices))
|
|
for i, idx := range indices {
|
|
bxs[i] = boxes[idx]
|
|
}
|
|
|
|
mh := medianHeights[pg]
|
|
if mh <= 0 {
|
|
mh = MedianHeight(bxs)
|
|
}
|
|
mw := medianWidths[pg]
|
|
if mw <= 0 {
|
|
mw = 8 // Python fallback: np.median([...]) if chars else 8 (pdf_parser.py:1465)
|
|
}
|
|
|
|
// Collect pattern: build output slice, merging into last element when appropriate.
|
|
out := make([]TextBox, 0, len(bxs))
|
|
for i := 0; i < len(bxs); i++ {
|
|
b := bxs[i]
|
|
// Cross-page suffix (e.g. page number on previous page): skip.
|
|
if i > 0 && bxs[i-1].PageNumber < b.PageNumber && pageNumSuffixPattern.MatchString(bxs[i-1].Text) {
|
|
continue
|
|
}
|
|
if strings.TrimSpace(b.Text) == "" {
|
|
// Whitespace gap bridge: absorb into prev box if gap/xov pass,
|
|
// extending prev.Bottom. This matches Python's while/pop which
|
|
// keeps whitespace inline and lets it extend the previous box.
|
|
if len(out) > 0 {
|
|
prev := &out[len(out)-1]
|
|
if b.Top-prev.Bottom <= mh*1.5 && OverlapX(prev, &b) >= 0.3 {
|
|
// TODO: prev.Bottom = math.Max(prev.Bottom, b.Bottom) — direct assignment
|
|
// can shrink a tall merged box when a short whitespace box overlaps.
|
|
// Matches Python behavior (also direct assignment). Defer fix until
|
|
// pipeline alignment is shipped. See TestNaiveVerticalMerge_BottomShrink.
|
|
prev.Bottom = b.Bottom
|
|
}
|
|
}
|
|
continue
|
|
}
|
|
if len(out) == 0 {
|
|
out = append(out, b)
|
|
continue
|
|
}
|
|
prev := &out[len(out)-1]
|
|
if prev.LayoutNo != b.LayoutNo || strings.TrimSpace(b.Text) == "" {
|
|
slog.Debug("vm reject", "reason", "layout_no", "prevLayout", prev.LayoutNo, "bLayout", b.LayoutNo)
|
|
out = append(out, b)
|
|
continue
|
|
}
|
|
gap := b.Top - prev.Bottom
|
|
if gap > mh*1.5 {
|
|
slog.Debug("vm reject", "reason", "gap", "gap", gap, "threshold", mh*1.5, "mh", mh)
|
|
out = append(out, b)
|
|
continue
|
|
}
|
|
ov := OverlapX(prev, &b)
|
|
if ov < 0.3 {
|
|
slog.Debug("vm reject", "reason", "ovX", "ov", ov, "threshold", 0.3)
|
|
out = append(out, b)
|
|
continue
|
|
}
|
|
|
|
// Strip text before checking first/last characters (matching Python's
|
|
// b["text"].strip()[-1] / b_["text"].strip()[0]).
|
|
prevText := strings.TrimSpace(prev.Text)
|
|
bText := strings.TrimSpace(b.Text)
|
|
|
|
concatting := []bool{
|
|
endsWithOneOf(prevText, ",;:\",、‘“;:-"),
|
|
endsSecondLastOneOf(prevText, ",;:\",、‘“;:"),
|
|
startsWithOneOf(bText, "。;?!”)),,、:"),
|
|
}
|
|
anti := []bool{
|
|
endsWithOneOf(prevText, "。?!?"),
|
|
isEnglish && endsWithOneOf(prevText, ".!?"),
|
|
prev.PageNumber == b.PageNumber && b.Top-prev.Bottom > mh*1.5,
|
|
prev.PageNumber < b.PageNumber && math.Abs(prev.X0-b.X0) > mw*4,
|
|
}
|
|
detach := []bool{prev.X1 < b.X0, prev.X0 > b.X1}
|
|
if (slices.Contains(anti, true) && !slices.Contains(concatting, true)) || slices.Contains(detach, true) {
|
|
out = append(out, b)
|
|
continue
|
|
}
|
|
|
|
slog.Debug("vm merge", "gap", gap, "ovX", ov, "mh", mh, "prev", prevText[:min(40, len(prevText))], "next", bText[:min(40, len(bText))])
|
|
// Python: (b["text"].rstrip() + " " + b_["text"].lstrip()).strip()
|
|
prev.Text = strings.TrimSpace(strings.TrimRight(prevText, " \t") + " " + strings.TrimLeft(bText, " \t"))
|
|
// Preserve the taller bottom when merging (prev.Bottom may already
|
|
// extend beyond b.Bottom from a previous merge step).
|
|
prev.Bottom = math.Max(prev.Bottom, b.Bottom)
|
|
prev.X0 = math.Min(prev.X0, b.X0)
|
|
prev.X1 = math.Max(prev.X1, b.X1)
|
|
}
|
|
result = append(result, out...)
|
|
}
|
|
slog.Debug("vm result", "in", len(boxes), "out", len(result))
|
|
return result
|
|
}
|
|
|
|
// ---- Reading order ----
|
|
|
|
// FinalReadingOrderMerge sorts boxes by page → column → top → x0.
|
|
//
|
|
// Python: pdf_parser.py:1007 _final_reading_order_merge()
|
|
func FinalReadingOrderMerge(boxes []TextBox) []TextBox {
|
|
if len(boxes) == 0 {
|
|
return boxes
|
|
}
|
|
sort.Slice(boxes, func(i, j int) bool {
|
|
bi, bj := boxes[i], boxes[j]
|
|
if bi.PageNumber != bj.PageNumber {
|
|
return bi.PageNumber < bj.PageNumber
|
|
}
|
|
if bi.ColID != bj.ColID {
|
|
return bi.ColID < bj.ColID
|
|
}
|
|
if bi.Top != bj.Top {
|
|
return bi.Top < bj.Top
|
|
}
|
|
return bi.X0 < bj.X0
|
|
})
|
|
return boxes
|
|
}
|
|
|
|
// pageNumSuffixPattern matches text that ends in a run of one or more digits,
// spaces, bullets (•), "一", em dashes or hyphens. It is compiled once at
// package scope and used by NaiveVerticalMerge to spot page-number-like boxes.
//
// NOTE(review): the pattern is anchored at the end only, so any text merely
// ending in such a run matches (e.g. "Chapter 1"). If the Python original
// uses re.match, that is anchored at the start — confirm whether a leading
// "^" is intended here.
var pageNumSuffixPattern = regexp.MustCompile(`[0-9 •一—-]+$`)
|
|
|
|
// ---- rune-based text helpers (CJK-safe) ----
|
|
|
|
// lastRune returns the final rune of s, or 0 when s is empty.
//
// Invalid trailing UTF-8 yields utf8.RuneError, as reported by
// utf8.DecodeLastRuneInString. Returning 0 for the empty string lets callers
// use "r == 0" as their "no rune" test.
func lastRune(s string) rune {
	r, size := utf8.DecodeLastRuneInString(s)
	if size == 0 {
		// Empty input: the decoder reports (RuneError, 0); map that to 0.
		return 0
	}
	return r
}
|
|
|
|
// firstRune returns the leading rune of s, or 0 when s is empty.
//
// Invalid leading UTF-8 yields utf8.RuneError, as reported by
// utf8.DecodeRuneInString. Returning 0 for the empty string lets callers use
// "r == 0" as their "no rune" test.
func firstRune(s string) rune {
	r, size := utf8.DecodeRuneInString(s)
	if size == 0 {
		// Empty input: the decoder reports (RuneError, 0); map that to 0.
		return 0
	}
	return r
}
|
|
|
|
// secondLastRune returns the rune immediately before the final rune of s.
//
// It returns 0 when s holds fewer than two runes (empty or a single rune).
// Invalid UTF-8 at that position yields utf8.RuneError, as reported by
// utf8.DecodeLastRuneInString.
func secondLastRune(s string) rune {
	_, lastSize := utf8.DecodeLastRuneInString(s)
	if lastSize == 0 {
		// Empty string: there is no last rune, let alone a second-last one.
		return 0
	}
	r, size := utf8.DecodeLastRuneInString(s[:len(s)-lastSize])
	if size == 0 {
		// Only one rune in s: nothing precedes it.
		return 0
	}
	return r
}
|
|
|
|
func endsWithOneOf(s, set string) bool {
|
|
r := lastRune(s)
|
|
if r == 0 {
|
|
return false
|
|
}
|
|
return strings.ContainsRune(set, r)
|
|
}
|
|
|
|
func endsSecondLastOneOf(s, set string) bool {
|
|
r := secondLastRune(s)
|
|
if r == 0 {
|
|
return false
|
|
}
|
|
return strings.ContainsRune(set, r)
|
|
}
|
|
|
|
func startsWithOneOf(s, set string) bool {
|
|
r := firstRune(s)
|
|
if r == 0 {
|
|
return false
|
|
}
|
|
return strings.ContainsRune(set, r)
|
|
}
|
|
|
|
// containsRune reports whether the rune r occurs anywhere in set.
func containsRune(set string, r rune) bool {
	found := strings.IndexRune(set, r) >= 0
	return found
}
|