mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
### What problem does this PR solve? Http API based on onnx model. pdf_parser.py to golang ### Type of change - [x] Refactoring
75 lines
1.7 KiB
Go
75 lines
1.7 KiB
Go
package parser
|
|
|
|
import (
|
|
"strings"
|
|
"unicode"
|
|
)
|
|
|
|
// ---- MergeSameBullet (Python: pdf_parser.py _merge_same_bullet) ----
|
|
|
|
// MergeSameBullet merges adjacent boxes that start with the same bullet/number
|
|
// character, combining their text with a newline separator.
|
|
func MergeSameBullet(boxes []TextBox, tok Tokenizer) []TextBox {
|
|
if len(boxes) < 2 {
|
|
return boxes
|
|
}
|
|
// Build output via two-pointer collect: O(n) instead of O(n²) slice-element removal.
|
|
out := make([]TextBox, 0, len(boxes))
|
|
i := 0
|
|
for i < len(boxes) {
|
|
if strings.TrimSpace(boxes[i].Text) == "" {
|
|
i++
|
|
continue
|
|
}
|
|
// Start a merge chain from position i.
|
|
cur := boxes[i]
|
|
i++
|
|
for i < len(boxes) {
|
|
if strings.TrimSpace(boxes[i].Text) == "" {
|
|
i++
|
|
continue
|
|
}
|
|
nxt := boxes[i]
|
|
firstCur := firstRuneString(cur.Text)
|
|
firstNxt := firstRuneString(nxt.Text)
|
|
|
|
// Conditions to NOT merge:
|
|
if firstCur != firstNxt ||
|
|
unicode.Is(unicode.Latin, firstCur) ||
|
|
isChinese(firstCur, tok) ||
|
|
cur.Top > nxt.Bottom {
|
|
break
|
|
}
|
|
|
|
// Merge nxt into cur.
|
|
cur.Text = cur.Text + "\n" + nxt.Text
|
|
cur.X0 = min(cur.X0, nxt.X0)
|
|
cur.X1 = max(cur.X1, nxt.X1)
|
|
cur.Bottom = nxt.Bottom
|
|
i++
|
|
}
|
|
out = append(out, cur)
|
|
}
|
|
return out
|
|
}
|
|
|
|
// ---- Helpers ----
|
|
|
|
// firstRuneString returns the first non-whitespace rune of s, or 0 when s is
// empty or consists solely of whitespace.
//
// Whitespace is decided by unicode.IsSpace, the same predicate
// strings.TrimSpace applies, so the result equals the first rune of
// strings.TrimSpace(s). The string is scanned in place: only the leading
// whitespace and one further rune are decoded, and no []rune copy of the
// string is allocated. An invalid UTF-8 byte at the first non-space position
// is reported as U+FFFD.
func firstRuneString(s string) rune {
	for _, r := range s {
		if !unicode.IsSpace(r) {
			return r
		}
	}
	return 0
}
|
|
|
|
// isChinese checks if a rune is a Chinese character (CJK Unified Ideograph).
|
|
func isChinese(r rune, tok Tokenizer) bool {
|
|
if tok != nil {
|
|
return strings.Contains(tok.Tag(string(r)), "n")
|
|
}
|
|
return (r >= 0x4E00 && r <= 0x9FFF) ||
|
|
(r >= 0x3400 && r <= 0x4DBF) ||
|
|
(r >= 0x20000 && r <= 0x2A6DF)
|
|
}
|