Files
ragflow/internal/deepdoc/parser/pdf/tools/similarity.go

278 lines
6.5 KiB
Go
Raw Normal View History

package tools
import (
"sort"
"strings"
"unicode"
)
// StripMeta returns s truncated immediately before the last occurrence of
// "\n#@meta", i.e. the final line that begins with "#@meta" and everything
// after it. The newline introducing that line is removed as well.
//
// If s contains no such marker it is returned unchanged. A marker at the very
// start of s (with no preceding newline) is not matched, and earlier
// "#@meta" lines before the last one are kept.
func StripMeta(s string) string {
	cut := strings.LastIndex(s, "\n#@meta")
	if cut < 0 {
		return s
	}
	return s[:cut]
}
// CharSimilarity scores how similar the character content of a and b is, on a
// 0-100 scale, ignoring both character order and all whitespace.
//
// Each input is first passed through StripMeta. The remaining runes for which
// unicode.IsSpace is false are counted per rune, and the result is the Dice
// coefficient of the two rune multisets:
//
//	2 * |A ∩ B| / (|A| + |B|) * 100
//
// where |A ∩ B| sums, for every rune, the smaller of its two counts. Two
// inputs without any non-space rune are treated as identical and score 100.
func CharSimilarity(a, b string) float64 {
	// count returns the histogram of non-space runes in s (meta stripped)
	// together with the total number of runes counted.
	count := func(s string) (map[rune]int, int) {
		hist := make(map[rune]int)
		total := 0
		for _, r := range StripMeta(s) {
			if unicode.IsSpace(r) {
				continue
			}
			hist[r]++
			total++
		}
		return hist, total
	}

	histA, totalA := count(a)
	histB, totalB := count(b)

	// Both sides empty is the only way the denominator can be zero, so this
	// single guard covers the division below.
	if totalA+totalB == 0 {
		return 100
	}

	common := 0
	for r, n := range histA {
		// A rune absent from histB reads as 0 and therefore adds nothing.
		common += min(n, histB[r])
	}
	return float64(common*2) / float64(totalA+totalB) * 100
}
// lcsRunes returns the length of the longest common subsequence of a and b.
//
// It evaluates the usual dynamic-programming recurrence with two rolling rows.
// The rows are indexed by the shorter input, so the extra memory used is
// proportional to the shorter length while the work done is proportional to
// len(a) * len(b). Either input may be empty, in which case the result is 0.
func lcsRunes(a, b []rune) int {
	long, short := a, b
	if len(long) < len(short) {
		long, short = short, long
	}

	width := len(short) + 1
	above := make([]int, width) // results for the previous rune of long
	row := make([]int, width)   // row being filled; slot 0 always stays 0

	for _, x := range long {
		for j, y := range short {
			switch {
			case x == y:
				// Extend the best subsequence that ends before both runes.
				row[j+1] = above[j] + 1
			default:
				// Drop one rune from either side and keep the better result.
				row[j+1] = max(row[j], above[j+1])
			}
		}
		// The row just filled becomes the "previous" row; the old one is
		// recycled and fully overwritten on the next pass.
		above, row = row, above
	}
	return above[width-1]
}
// LcsSimilarity scores a against b, on a 0-100 scale, by the length of their
// longest common subsequence relative to the longer input.
//
// Each input is passed through StripMeta and then reduced to the runes for
// which unicode.IsSpace is false, so whitespace never affects the score but
// character order does. Two inputs that are both empty after this reduction
// score 100; if exactly one is empty the score is 0.
func LcsSimilarity(a, b string) float64 {
	// keep returns the non-whitespace runes of s with the meta trailer removed.
	keep := func(s string) []rune {
		out := make([]rune, 0)
		for _, r := range StripMeta(s) {
			if unicode.IsSpace(r) {
				continue
			}
			out = append(out, r)
		}
		return out
	}

	ra, rb := keep(a), keep(b)
	switch {
	case len(ra) == 0 && len(rb) == 0:
		return 100
	case len(ra) == 0 || len(rb) == 0:
		return 0
	}

	longer := max(len(ra), len(rb))
	return float64(lcsRunes(ra, rb)) / float64(longer) * 100
}
// RawCharSimilarity is CharSimilarity without space stripping: whitespace
// runes are counted like any other character. Each input is still passed
// through StripMeta first.
//
// The score, on a 0-100 scale, is the Dice coefficient of the two rune
// multisets:
//
//	2 * |A ∩ B| / (|A| + |B|) * 100
//
// where |A ∩ B| sums, for every rune, the smaller of its two counts. Two
// inputs that are both empty after StripMeta score 100.
func RawCharSimilarity(a, b string) float64 {
	// count returns the rune histogram of s (meta stripped) together with
	// the total number of runes counted.
	count := func(s string) (map[rune]int, int) {
		hist := make(map[rune]int)
		total := 0
		for _, r := range StripMeta(s) {
			hist[r]++
			total++
		}
		return hist, total
	}

	histA, totalA := count(a)
	histB, totalB := count(b)

	// Both sides empty is the only way the denominator can be zero, so this
	// single guard covers the division below.
	if totalA+totalB == 0 {
		return 100
	}

	common := 0
	for r, n := range histA {
		// A rune absent from histB reads as 0 and therefore adds nothing.
		common += min(n, histB[r])
	}
	return float64(common*2) / float64(totalA+totalB) * 100
}
// RawLcsSimilarity is LcsSimilarity without space stripping: whitespace runes
// take part in the LCS comparison like any other character. Each input is
// still passed through StripMeta first.
//
// The score is the LCS length as a percentage of the longer input. Two inputs
// that are both empty after StripMeta score 100; if exactly one is empty the
// score is 0.
func RawLcsSimilarity(a, b string) float64 {
	ra, rb := []rune(StripMeta(a)), []rune(StripMeta(b))
	longer := max(len(ra), len(rb))
	shorter := min(len(ra), len(rb))

	switch {
	case longer == 0:
		// Neither side has any content.
		return 100
	case shorter == 0:
		// Exactly one side has content.
		return 0
	}
	return float64(lcsRunes(ra, rb)) / float64(longer) * 100
}
// SectionAlignedScore computes a two-phase LCS similarity, on a 0-100 scale,
// between goText and pyText. Both texts are passed through StripMeta, trimmed
// of surrounding whitespace and split on newlines; every line is one section.
//
// Phase 1: One-to-one section matching — Go section i is compared with the
// Python sections at indices i-5 through i+5. Pairs whose rune lengths differ
// by more than 2x, or whose CharSimilarity is 30 or less, are discarded. The
// surviving candidates are paired greedily, highest similarity first; ties
// are broken by the smaller index offset and then by index, so the pairing is
// deterministic. Each matched pair contributes its LCS ratio over non-space
// runes.
//
// Phase 2: Residual — concatenate all unmatched sections from both sides
// into one string each, compute LCS ratio once. This handles cases where
// one side merges sections that the other side keeps separate. When either
// residual holds more than 5000 non-space runes, CharSimilarity is used in
// place of the LCS ratio.
//
// Final score is a char-weighted average of matched and residual scores,
// where a pair's weight is the non-space rune count of its longer side.
// Two empty texts score 100; exactly one empty text scores 0.
func SectionAlignedScore(goText, pyText string) float64 {
	const (
		alignWindow    = 5    // candidate index offsets tried on either side
		minSimilarity  = 30   // CharSimilarity must exceed this to be a candidate
		maxResidualLCS = 5000 // above this many runes the residual LCS is skipped
	)

	// split strips the meta trailer and breaks the text into sections.
	// strings.Split never returns an empty slice (empty input yields [""]),
	// so empty text is mapped to nil here; otherwise the emptiness guards
	// below could never fire.
	split := func(s string) []string {
		s = strings.TrimSpace(StripMeta(s))
		if s == "" {
			return nil
		}
		return strings.Split(s, "\n")
	}
	// lcsRatio is the LCS length as a percentage of the longer input.
	// Callers must ensure at least one input is non-empty.
	lcsRatio := func(a, b []rune) float64 {
		return float64(lcsRunes(a, b)) / float64(max(len(a), len(b))) * 100
	}

	gs := split(goText)
	ps := split(pyText)
	if len(gs) == 0 && len(ps) == 0 {
		return 100
	}
	if len(gs) == 0 || len(ps) == 0 {
		return 0
	}

	// Phase 1: Position-window greedy matching.
	// Sections are expected to be ordered top-to-bottom by page position, so
	// a match beyond a small positional offset is treated as unlikely.
	// Constraining candidates to ±alignWindow avoids an O(n×m) blow-up on
	// large documents.
	type candidate struct {
		gi, pi int
		sim    float64
	}

	// Precompute rune lengths for length-ratio gating.
	glens := make([]int, len(gs))
	plens := make([]int, len(ps))
	for i, s := range gs {
		glens[i] = len([]rune(s))
	}
	for i, s := range ps {
		plens[i] = len([]rune(s))
	}

	candidates := make([]candidate, 0, len(gs)*(alignWindow*2+1))
	for i, g := range gs {
		lo := max(0, i-alignWindow)
		hi := min(len(ps)-1, i+alignWindow)
		for j := lo; j <= hi; j++ {
			// Skip pairs with >2x length difference — a 500-char section
			// matched against a 30-char section yields a near-zero LCS ratio.
			if glens[i] > plens[j]*2 || plens[j] > glens[i]*2 {
				continue
			}
			if sim := CharSimilarity(g, ps[j]); sim > minSimilarity {
				candidates = append(candidates, candidate{i, j, sim})
			}
		}
	}

	// Best matches first. sort.Slice is not stable, so the comparator is a
	// total order: equal similarities fall back to the smaller positional
	// offset and finally to the indices themselves.
	offset := func(c candidate) int {
		if c.gi > c.pi {
			return c.gi - c.pi
		}
		return c.pi - c.gi
	}
	sort.Slice(candidates, func(x, y int) bool {
		cx, cy := candidates[x], candidates[y]
		if cx.sim != cy.sim {
			return cx.sim > cy.sim
		}
		if dx, dy := offset(cx), offset(cy); dx != dy {
			return dx < dy
		}
		if cx.gi != cy.gi {
			return cx.gi < cy.gi
		}
		return cx.pi < cy.pi
	})

	goUsed := make([]bool, len(gs))
	pyUsed := make([]bool, len(ps))
	matchedScore := 0.0
	matchedChars := 0
	for _, c := range candidates {
		if goUsed[c.gi] || pyUsed[c.pi] {
			continue
		}
		goUsed[c.gi] = true
		pyUsed[c.pi] = true

		ra := nonSpaceRunes(gs[c.gi])
		rb := nonSpaceRunes(ps[c.pi])
		chars := max(len(ra), len(rb))
		if chars == 0 {
			// Two blank sections pair up but carry no weight.
			continue
		}
		matchedScore += lcsRatio(ra, rb) * float64(chars)
		matchedChars += chars
	}

	// Phase 2: Residual — concat unmatched sections, compute LCS once.
	var goRes, pyRes strings.Builder
	for i, g := range gs {
		if !goUsed[i] {
			goRes.WriteString(g)
			goRes.WriteByte(' ')
		}
	}
	for j, p := range ps {
		if !pyUsed[j] {
			pyRes.WriteString(p)
			pyRes.WriteByte(' ')
		}
	}
	goResRunes := nonSpaceRunes(goRes.String())
	pyResRunes := nonSpaceRunes(pyRes.String())
	residualChars := max(len(goResRunes), len(pyResRunes))

	residualScore := 0.0
	switch {
	case residualChars == 0:
		// Nothing left over; the residual term has zero weight.
	case len(goResRunes) > maxResidualLCS || len(pyResRunes) > maxResidualLCS:
		// Residual too large for the quadratic LCS — fall back to
		// CharSimilarity.
		residualScore = CharSimilarity(goRes.String(), pyRes.String())
	default:
		residualScore = lcsRatio(goResRunes, pyResRunes)
	}

	// Weighted average.
	totalChars := matchedChars + residualChars
	if totalChars == 0 {
		return 100
	}
	return (matchedScore + residualScore*float64(residualChars)) / float64(totalChars)
}
// nonSpaceRunes returns the runes of s in their original order, leaving out
// every rune that unicode.IsSpace reports as whitespace. The returned slice is
// never nil; its capacity is reserved from the byte length of s, which is an
// upper bound on the number of runes.
func nonSpaceRunes(s string) []rune {
	kept := make([]rune, 0, len(s))
	for _, r := range s {
		if unicode.IsSpace(r) {
			continue
		}
		kept = append(kept, r)
	}
	return kept
}