Files
ragflow/internal/deepdoc/parser/pdf/cleanup.go
Jack 304d9e02bb Refactor: migrate pdf_parser.py to golang (#16323)
### What problem does this PR solve?

HTTP API based on an ONNX model.
Migrates pdf_parser.py to Go.

### Type of change

- [x] Refactoring
2026-06-25 20:16:16 +08:00

75 lines
1.7 KiB
Go

package parser
import (
"strings"
"unicode"
)
// ---- MergeSameBullet (Python: pdf_parser.py _merge_same_bullet) ----

// MergeSameBullet collapses runs of consecutive boxes whose text starts with
// the same leading character (typically a bullet or list marker) into a single
// box, joining the texts with "\n".
//
// Boxes whose text is empty or whitespace-only are dropped from the result.
// A box is merged into the current run only if all of the following hold:
//   - its first non-space rune equals the run's first non-space rune;
//   - that rune is not in the Latin script;
//   - isChinese does not report that rune as Chinese (tok is forwarded to it);
//   - the run's Top is not greater than the box's Bottom.
//
// The merged box is a copy of the run's first box with Text extended, X0/X1
// widened to cover both boxes, and Bottom taken from the last merged box.
//
// Inputs with fewer than two boxes are returned as-is (same slice, without
// filtering). Otherwise a newly allocated slice is returned and the elements
// of boxes are left untouched.
func MergeSameBullet(boxes []TextBox, tok Tokenizer) []TextBox {
	if len(boxes) < 2 {
		return boxes
	}

	// Single forward pass that collects results into a fresh slice instead of
	// deleting elements in place.
	out := make([]TextBox, 0, len(boxes))
	for i := 0; i < len(boxes); {
		if strings.TrimSpace(boxes[i].Text) == "" {
			i++
			continue
		}

		// Start a run at boxes[i]. cur is a copy, so the input stays intact.
		cur := boxes[i]
		i++

		// Merging only ever appends to cur.Text, so the run's leading rune is
		// fixed for the whole run. Compute it once rather than re-scanning the
		// growing text for every candidate box.
		lead := firstRuneString(cur.Text)

		for i < len(boxes) {
			nxt := &boxes[i]
			if strings.TrimSpace(nxt.Text) == "" {
				// Blank boxes never terminate a run; they are simply skipped.
				i++
				continue
			}

			// Any of these conditions ends the run; nxt then starts the next one.
			if lead != firstRuneString(nxt.Text) ||
				unicode.Is(unicode.Latin, lead) ||
				isChinese(lead, tok) ||
				cur.Top > nxt.Bottom {
				break
			}

			// Fold nxt into the run.
			cur.Text = cur.Text + "\n" + nxt.Text
			cur.X0 = min(cur.X0, nxt.X0)
			cur.X1 = max(cur.X1, nxt.X1)
			cur.Bottom = nxt.Bottom
			i++
		}
		out = append(out, cur)
	}
	return out
}
// ---- Helpers ----
// firstRuneString returns the first rune of s that is not white space (as
// defined by unicode.IsSpace), or 0 if s is empty or contains only white
// space.
//
// The string is scanned in place and scanning stops at the first match, so no
// intermediate []rune copy of s is built. A leading byte sequence that is not
// valid UTF-8 is reported as U+FFFD, as the range loop decodes it.
func firstRuneString(s string) rune {
	for _, r := range s {
		if !unicode.IsSpace(r) {
			return r
		}
	}
	return 0
}
// isChinese reports whether r is treated as a Chinese character.
//
// With a non-nil tokenizer the answer comes from tok: r is accepted when the
// tag returned for the single-rune string contains "n". With a nil tokenizer
// r is matched against the CJK Unified Ideographs block (U+4E00–U+9FFF) and
// its Extension A (U+3400–U+4DBF) and Extension B (U+20000–U+2A6DF) blocks.
func isChinese(r rune, tok Tokenizer) bool {
	if tok == nil {
		// Inclusive [low, high] code-point spans checked in the fallback path.
		spans := [...][2]rune{
			{0x4E00, 0x9FFF},
			{0x3400, 0x4DBF},
			{0x20000, 0x2A6DF},
		}
		for _, span := range spans {
			if span[0] <= r && r <= span[1] {
				return true
			}
		}
		return false
	}

	tag := tok.Tag(string(r))
	return strings.Contains(tag, "n")
}