mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
### What problem does this PR solve? Http API based on onnx model. pdf_parser.py to golang ### Type of change - [x] Refactoring
278 lines
6.5 KiB
Go
278 lines
6.5 KiB
Go
package tools
|
||
|
||
import (
|
||
"sort"
|
||
"strings"
|
||
"unicode"
|
||
)
|
||
|
||
// StripMeta returns s cut off just before the last "\n#@meta" it contains,
// i.e. it drops the final line starting with "#@meta" and everything after
// it. A string without that marker is returned unchanged; note that a
// "#@meta" at the very start of s (no preceding newline) is not a match.
func StripMeta(s string) string {
	cut := strings.LastIndex(s, "\n#@meta")
	if cut < 0 {
		return s
	}
	return s[:cut]
}
|
||
|
||
func CharSimilarity(a, b string) float64 {
|
||
a = StripMeta(a)
|
||
b = StripMeta(b)
|
||
extract := func(s string) map[rune]int {
|
||
m := make(map[rune]int)
|
||
for _, r := range s {
|
||
if !unicode.IsSpace(r) {
|
||
m[r]++
|
||
}
|
||
}
|
||
return m
|
||
}
|
||
ca, cb := extract(a), extract(b)
|
||
if len(ca) == 0 && len(cb) == 0 {
|
||
return 100
|
||
}
|
||
common, totalA, totalB := 0, 0, 0
|
||
for r, n := range ca {
|
||
totalA += n
|
||
if n2, ok := cb[r]; ok {
|
||
common += min(n, n2)
|
||
}
|
||
}
|
||
for _, n := range cb {
|
||
totalB += n
|
||
}
|
||
if totalA+totalB == 0 {
|
||
return 100
|
||
}
|
||
return float64(common*2) / float64(totalA+totalB) * 100
|
||
}
|
||
|
||
// lcsRunes returns the length of the longest common subsequence of a and b.
//
// It runs the classic dynamic program while keeping only two rows of the
// table. The rows are sized by the shorter input, so extra memory is
// proportional to min(len(a), len(b)).
func lcsRunes(a, b []rune) int {
	long, short := a, b
	if len(long) < len(short) {
		long, short = short, long
	}

	width := len(short)
	above := make([]int, width+1) // row for the previous rune of long
	row := make([]int, width+1)   // row being filled; index 0 stays 0
	for _, x := range long {
		for j, y := range short {
			if x == y {
				row[j+1] = above[j] + 1
				continue
			}
			row[j+1] = max(row[j], above[j+1])
		}
		// The finished row becomes the reference for the next iteration; the
		// old reference row is recycled and fully overwritten.
		above, row = row, above
	}
	return above[width]
}
|
||
|
||
func LcsSimilarity(a, b string) float64 {
|
||
a = StripMeta(a)
|
||
b = StripMeta(b)
|
||
ra := make([]rune, 0)
|
||
for _, r := range a {
|
||
if !unicode.IsSpace(r) {
|
||
ra = append(ra, r)
|
||
}
|
||
}
|
||
rb := make([]rune, 0)
|
||
for _, r := range b {
|
||
if !unicode.IsSpace(r) {
|
||
rb = append(rb, r)
|
||
}
|
||
}
|
||
if len(ra) == 0 && len(rb) == 0 {
|
||
return 100
|
||
}
|
||
if len(ra) == 0 || len(rb) == 0 {
|
||
return 0
|
||
}
|
||
return float64(lcsRunes(ra, rb)) / float64(max(len(ra), len(rb))) * 100
|
||
}
|
||
|
||
// RawCharSimilarity is CharSimilarity without space stripping — spaces
|
||
// count as characters. Still strips #@meta lines.
|
||
func RawCharSimilarity(a, b string) float64 {
|
||
a = StripMeta(a)
|
||
b = StripMeta(b)
|
||
ca := make(map[rune]int)
|
||
for _, r := range a {
|
||
ca[r]++
|
||
}
|
||
cb := make(map[rune]int)
|
||
for _, r := range b {
|
||
cb[r]++
|
||
}
|
||
if len(ca) == 0 && len(cb) == 0 {
|
||
return 100
|
||
}
|
||
common, totalA, totalB := 0, 0, 0
|
||
for r, n := range ca {
|
||
totalA += n
|
||
if n2, ok := cb[r]; ok {
|
||
common += min(n, n2)
|
||
}
|
||
}
|
||
for _, n := range cb {
|
||
totalB += n
|
||
}
|
||
if totalA+totalB == 0 {
|
||
return 100
|
||
}
|
||
return float64(common*2) / float64(totalA+totalB) * 100
|
||
}
|
||
|
||
// RawLcsSimilarity is LcsSimilarity without space stripping — whitespace
|
||
// is kept in the LCS comparison. Still strips #@meta lines.
|
||
func RawLcsSimilarity(a, b string) float64 {
|
||
a = StripMeta(a)
|
||
b = StripMeta(b)
|
||
ra := []rune(a)
|
||
rb := []rune(b)
|
||
if len(ra) == 0 && len(rb) == 0 {
|
||
return 100
|
||
}
|
||
if len(ra) == 0 || len(rb) == 0 {
|
||
return 0
|
||
}
|
||
return float64(lcsRunes(ra, rb)) / float64(max(len(ra), len(rb))) * 100
|
||
}
|
||
|
||
// SectionAlignedScore computes a two-phase LCS similarity:
|
||
//
|
||
// Phase 1: One-to-one section matching — pair Go and Python sections by
|
||
// CharSimilarity (greedy, highest first). For matched pairs, compute
|
||
// per-section LCS ratio.
|
||
//
|
||
// Phase 2: Residual — concatenate all unmatched sections from both sides
|
||
// into one string each, compute LCS ratio once. This handles cases where
|
||
// one side merges sections that the other side keeps separate.
|
||
//
|
||
// Final score is a char-weighted average of matched and residual scores.
|
||
func SectionAlignedScore(goText, pyText string) float64 {
|
||
split := func(s string) []string {
|
||
s = StripMeta(s)
|
||
return strings.Split(strings.TrimSpace(s), "\n")
|
||
}
|
||
gs := split(goText)
|
||
ps := split(pyText)
|
||
if len(gs) == 0 && len(ps) == 0 {
|
||
return 100
|
||
}
|
||
if len(gs) == 0 || len(ps) == 0 {
|
||
return 0
|
||
}
|
||
|
||
// Phase 1: Position-window greedy matching.
|
||
// Sections are ordered top-to-bottom by page position, so a global
|
||
// match beyond a small positional offset is extremely unlikely.
|
||
// Constrain candidates to ±window to avoid O(n×m) blow-up on large docs.
|
||
const alignWindow = 5
|
||
type candidate struct {
|
||
gi, pi int
|
||
sim float64
|
||
}
|
||
// Precompute rune lengths for length-ratio gating.
|
||
glens := make([]int, len(gs))
|
||
plens := make([]int, len(ps))
|
||
for i, s := range gs {
|
||
glens[i] = len([]rune(s))
|
||
}
|
||
for i, s := range ps {
|
||
plens[i] = len([]rune(s))
|
||
}
|
||
|
||
candidates := make([]candidate, 0, len(gs)*(alignWindow*2+1))
|
||
for i, g := range gs {
|
||
lo := max(0, i-alignWindow)
|
||
hi := min(len(ps)-1, i+alignWindow)
|
||
for j := lo; j <= hi; j++ {
|
||
// Skip pairs with >2x length difference — a 500-char section
|
||
// matching a 30-char section produces near-zero LCS.
|
||
if glens[i] > plens[j]*2 || plens[j] > glens[i]*2 {
|
||
continue
|
||
}
|
||
if sim := CharSimilarity(g, ps[j]); sim > 30 {
|
||
candidates = append(candidates, candidate{i, j, sim})
|
||
}
|
||
}
|
||
}
|
||
// Sort descending by similarity — best matches first.
|
||
sort.Slice(candidates, func(a, b int) bool {
|
||
return candidates[a].sim > candidates[b].sim
|
||
})
|
||
|
||
goUsed := make([]bool, len(gs))
|
||
pyUsed := make([]bool, len(ps))
|
||
matchedScore := 0.0
|
||
matchedChars := 0
|
||
|
||
for _, c := range candidates {
|
||
if goUsed[c.gi] || pyUsed[c.pi] {
|
||
continue
|
||
}
|
||
goUsed[c.gi] = true
|
||
pyUsed[c.pi] = true
|
||
|
||
// Compute LCS ratio for matched pair.
|
||
ra := nonSpaceRunes(gs[c.gi])
|
||
rb := nonSpaceRunes(ps[c.pi])
|
||
lcsScore := 0.0
|
||
if len(ra) > 0 && len(rb) > 0 {
|
||
lcsScore = float64(lcsRunes(ra, rb)) / float64(max(len(ra), len(rb))) * 100
|
||
} else if len(ra) == 0 && len(rb) == 0 {
|
||
lcsScore = 100
|
||
}
|
||
chars := max(len(ra), len(rb))
|
||
matchedScore += lcsScore * float64(chars)
|
||
matchedChars += chars
|
||
}
|
||
|
||
// Phase 2: Residual — concat unmatched sections, compute LCS once.
|
||
var goRes, pyRes strings.Builder
|
||
for i, g := range gs {
|
||
if !goUsed[i] {
|
||
goRes.WriteString(g)
|
||
goRes.WriteByte(' ')
|
||
}
|
||
}
|
||
for j, p := range ps {
|
||
if !pyUsed[j] {
|
||
pyRes.WriteString(p)
|
||
pyRes.WriteByte(' ')
|
||
}
|
||
}
|
||
|
||
residualScore := 0.0
|
||
residualChars := 0
|
||
goResRunes := nonSpaceRunes(goRes.String())
|
||
pyResRunes := nonSpaceRunes(pyRes.String())
|
||
residualChars = max(len(goResRunes), len(pyResRunes))
|
||
if residualChars > 0 {
|
||
if len(goResRunes) > 5000 || len(pyResRunes) > 5000 {
|
||
// Residual too large for O(n²) LCS — fall back to CharSimilarity.
|
||
residualScore = CharSimilarity(goRes.String(), pyRes.String())
|
||
} else {
|
||
residualScore = float64(lcsRunes(goResRunes, pyResRunes)) / float64(residualChars) * 100
|
||
}
|
||
} else if len(goResRunes) == 0 && len(pyResRunes) == 0 {
|
||
residualScore = 100
|
||
}
|
||
|
||
// Weighted average.
|
||
totalChars := matchedChars + residualChars
|
||
if totalChars == 0 {
|
||
return 100
|
||
}
|
||
return (matchedScore + residualScore*float64(residualChars)) / float64(totalChars)
|
||
}
|
||
|
||
// nonSpaceRunes returns the runes of s in order, leaving out every rune for
// which unicode.IsSpace reports true. The result is never nil.
func nonSpaceRunes(s string) []rune {
	// len(s) is a byte count, which is an upper bound on the rune count, so
	// the slice never has to grow.
	kept := make([]rune, 0, len(s))
	for _, r := range s {
		if unicode.IsSpace(r) {
			continue
		}
		kept = append(kept, r)
	}
	return kept
}
|