mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
278 lines
6.5 KiB
Go
278 lines
6.5 KiB
Go
|
|
package tools
|
|||
|
|
|
|||
|
|
import (
|
|||
|
|
"sort"
|
|||
|
|
"strings"
|
|||
|
|
"unicode"
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
func StripMeta(s string) string {
|
|||
|
|
if idx := strings.LastIndex(s, "\n#@meta"); idx >= 0 {
|
|||
|
|
return s[:idx]
|
|||
|
|
}
|
|||
|
|
return s
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func CharSimilarity(a, b string) float64 {
|
|||
|
|
a = StripMeta(a)
|
|||
|
|
b = StripMeta(b)
|
|||
|
|
extract := func(s string) map[rune]int {
|
|||
|
|
m := make(map[rune]int)
|
|||
|
|
for _, r := range s {
|
|||
|
|
if !unicode.IsSpace(r) {
|
|||
|
|
m[r]++
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
return m
|
|||
|
|
}
|
|||
|
|
ca, cb := extract(a), extract(b)
|
|||
|
|
if len(ca) == 0 && len(cb) == 0 {
|
|||
|
|
return 100
|
|||
|
|
}
|
|||
|
|
common, totalA, totalB := 0, 0, 0
|
|||
|
|
for r, n := range ca {
|
|||
|
|
totalA += n
|
|||
|
|
if n2, ok := cb[r]; ok {
|
|||
|
|
common += min(n, n2)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
for _, n := range cb {
|
|||
|
|
totalB += n
|
|||
|
|
}
|
|||
|
|
if totalA+totalB == 0 {
|
|||
|
|
return 100
|
|||
|
|
}
|
|||
|
|
return float64(common*2) / float64(totalA+totalB) * 100
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func lcsRunes(a, b []rune) int {
|
|||
|
|
if len(a) < len(b) {
|
|||
|
|
a, b = b, a
|
|||
|
|
}
|
|||
|
|
m, n := len(b), len(a)
|
|||
|
|
prev := make([]int, m+1)
|
|||
|
|
cur := make([]int, m+1)
|
|||
|
|
for i := 1; i <= n; i++ {
|
|||
|
|
for j := 1; j <= m; j++ {
|
|||
|
|
if a[i-1] == b[j-1] {
|
|||
|
|
cur[j] = prev[j-1] + 1
|
|||
|
|
} else {
|
|||
|
|
cur[j] = max(cur[j-1], prev[j])
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
prev, cur = cur, prev
|
|||
|
|
}
|
|||
|
|
return prev[m]
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func LcsSimilarity(a, b string) float64 {
|
|||
|
|
a = StripMeta(a)
|
|||
|
|
b = StripMeta(b)
|
|||
|
|
ra := make([]rune, 0)
|
|||
|
|
for _, r := range a {
|
|||
|
|
if !unicode.IsSpace(r) {
|
|||
|
|
ra = append(ra, r)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
rb := make([]rune, 0)
|
|||
|
|
for _, r := range b {
|
|||
|
|
if !unicode.IsSpace(r) {
|
|||
|
|
rb = append(rb, r)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
if len(ra) == 0 && len(rb) == 0 {
|
|||
|
|
return 100
|
|||
|
|
}
|
|||
|
|
if len(ra) == 0 || len(rb) == 0 {
|
|||
|
|
return 0
|
|||
|
|
}
|
|||
|
|
return float64(lcsRunes(ra, rb)) / float64(max(len(ra), len(rb))) * 100
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// RawCharSimilarity is CharSimilarity without space stripping — spaces
|
|||
|
|
// count as characters. Still strips #@meta lines.
|
|||
|
|
func RawCharSimilarity(a, b string) float64 {
|
|||
|
|
a = StripMeta(a)
|
|||
|
|
b = StripMeta(b)
|
|||
|
|
ca := make(map[rune]int)
|
|||
|
|
for _, r := range a {
|
|||
|
|
ca[r]++
|
|||
|
|
}
|
|||
|
|
cb := make(map[rune]int)
|
|||
|
|
for _, r := range b {
|
|||
|
|
cb[r]++
|
|||
|
|
}
|
|||
|
|
if len(ca) == 0 && len(cb) == 0 {
|
|||
|
|
return 100
|
|||
|
|
}
|
|||
|
|
common, totalA, totalB := 0, 0, 0
|
|||
|
|
for r, n := range ca {
|
|||
|
|
totalA += n
|
|||
|
|
if n2, ok := cb[r]; ok {
|
|||
|
|
common += min(n, n2)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
for _, n := range cb {
|
|||
|
|
totalB += n
|
|||
|
|
}
|
|||
|
|
if totalA+totalB == 0 {
|
|||
|
|
return 100
|
|||
|
|
}
|
|||
|
|
return float64(common*2) / float64(totalA+totalB) * 100
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// RawLcsSimilarity is LcsSimilarity without space stripping — whitespace
|
|||
|
|
// is kept in the LCS comparison. Still strips #@meta lines.
|
|||
|
|
func RawLcsSimilarity(a, b string) float64 {
|
|||
|
|
a = StripMeta(a)
|
|||
|
|
b = StripMeta(b)
|
|||
|
|
ra := []rune(a)
|
|||
|
|
rb := []rune(b)
|
|||
|
|
if len(ra) == 0 && len(rb) == 0 {
|
|||
|
|
return 100
|
|||
|
|
}
|
|||
|
|
if len(ra) == 0 || len(rb) == 0 {
|
|||
|
|
return 0
|
|||
|
|
}
|
|||
|
|
return float64(lcsRunes(ra, rb)) / float64(max(len(ra), len(rb))) * 100
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// SectionAlignedScore computes a two-phase LCS similarity:
|
|||
|
|
//
|
|||
|
|
// Phase 1: One-to-one section matching — pair Go and Python sections by
|
|||
|
|
// CharSimilarity (greedy, highest first). For matched pairs, compute
|
|||
|
|
// per-section LCS ratio.
|
|||
|
|
//
|
|||
|
|
// Phase 2: Residual — concatenate all unmatched sections from both sides
|
|||
|
|
// into one string each, compute LCS ratio once. This handles cases where
|
|||
|
|
// one side merges sections that the other side keeps separate.
|
|||
|
|
//
|
|||
|
|
// Final score is a char-weighted average of matched and residual scores.
|
|||
|
|
func SectionAlignedScore(goText, pyText string) float64 {
|
|||
|
|
split := func(s string) []string {
|
|||
|
|
s = StripMeta(s)
|
|||
|
|
return strings.Split(strings.TrimSpace(s), "\n")
|
|||
|
|
}
|
|||
|
|
gs := split(goText)
|
|||
|
|
ps := split(pyText)
|
|||
|
|
if len(gs) == 0 && len(ps) == 0 {
|
|||
|
|
return 100
|
|||
|
|
}
|
|||
|
|
if len(gs) == 0 || len(ps) == 0 {
|
|||
|
|
return 0
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Phase 1: Position-window greedy matching.
|
|||
|
|
// Sections are ordered top-to-bottom by page position, so a global
|
|||
|
|
// match beyond a small positional offset is extremely unlikely.
|
|||
|
|
// Constrain candidates to ±window to avoid O(n×m) blow-up on large docs.
|
|||
|
|
const alignWindow = 5
|
|||
|
|
type candidate struct {
|
|||
|
|
gi, pi int
|
|||
|
|
sim float64
|
|||
|
|
}
|
|||
|
|
// Precompute rune lengths for length-ratio gating.
|
|||
|
|
glens := make([]int, len(gs))
|
|||
|
|
plens := make([]int, len(ps))
|
|||
|
|
for i, s := range gs {
|
|||
|
|
glens[i] = len([]rune(s))
|
|||
|
|
}
|
|||
|
|
for i, s := range ps {
|
|||
|
|
plens[i] = len([]rune(s))
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
candidates := make([]candidate, 0, len(gs)*(alignWindow*2+1))
|
|||
|
|
for i, g := range gs {
|
|||
|
|
lo := max(0, i-alignWindow)
|
|||
|
|
hi := min(len(ps)-1, i+alignWindow)
|
|||
|
|
for j := lo; j <= hi; j++ {
|
|||
|
|
// Skip pairs with >2x length difference — a 500-char section
|
|||
|
|
// matching a 30-char section produces near-zero LCS.
|
|||
|
|
if glens[i] > plens[j]*2 || plens[j] > glens[i]*2 {
|
|||
|
|
continue
|
|||
|
|
}
|
|||
|
|
if sim := CharSimilarity(g, ps[j]); sim > 30 {
|
|||
|
|
candidates = append(candidates, candidate{i, j, sim})
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
// Sort descending by similarity — best matches first.
|
|||
|
|
sort.Slice(candidates, func(a, b int) bool {
|
|||
|
|
return candidates[a].sim > candidates[b].sim
|
|||
|
|
})
|
|||
|
|
|
|||
|
|
goUsed := make([]bool, len(gs))
|
|||
|
|
pyUsed := make([]bool, len(ps))
|
|||
|
|
matchedScore := 0.0
|
|||
|
|
matchedChars := 0
|
|||
|
|
|
|||
|
|
for _, c := range candidates {
|
|||
|
|
if goUsed[c.gi] || pyUsed[c.pi] {
|
|||
|
|
continue
|
|||
|
|
}
|
|||
|
|
goUsed[c.gi] = true
|
|||
|
|
pyUsed[c.pi] = true
|
|||
|
|
|
|||
|
|
// Compute LCS ratio for matched pair.
|
|||
|
|
ra := nonSpaceRunes(gs[c.gi])
|
|||
|
|
rb := nonSpaceRunes(ps[c.pi])
|
|||
|
|
lcsScore := 0.0
|
|||
|
|
if len(ra) > 0 && len(rb) > 0 {
|
|||
|
|
lcsScore = float64(lcsRunes(ra, rb)) / float64(max(len(ra), len(rb))) * 100
|
|||
|
|
} else if len(ra) == 0 && len(rb) == 0 {
|
|||
|
|
lcsScore = 100
|
|||
|
|
}
|
|||
|
|
chars := max(len(ra), len(rb))
|
|||
|
|
matchedScore += lcsScore * float64(chars)
|
|||
|
|
matchedChars += chars
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Phase 2: Residual — concat unmatched sections, compute LCS once.
|
|||
|
|
var goRes, pyRes strings.Builder
|
|||
|
|
for i, g := range gs {
|
|||
|
|
if !goUsed[i] {
|
|||
|
|
goRes.WriteString(g)
|
|||
|
|
goRes.WriteByte(' ')
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
for j, p := range ps {
|
|||
|
|
if !pyUsed[j] {
|
|||
|
|
pyRes.WriteString(p)
|
|||
|
|
pyRes.WriteByte(' ')
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
residualScore := 0.0
|
|||
|
|
residualChars := 0
|
|||
|
|
goResRunes := nonSpaceRunes(goRes.String())
|
|||
|
|
pyResRunes := nonSpaceRunes(pyRes.String())
|
|||
|
|
residualChars = max(len(goResRunes), len(pyResRunes))
|
|||
|
|
if residualChars > 0 {
|
|||
|
|
if len(goResRunes) > 5000 || len(pyResRunes) > 5000 {
|
|||
|
|
// Residual too large for O(n²) LCS — fall back to CharSimilarity.
|
|||
|
|
residualScore = CharSimilarity(goRes.String(), pyRes.String())
|
|||
|
|
} else {
|
|||
|
|
residualScore = float64(lcsRunes(goResRunes, pyResRunes)) / float64(residualChars) * 100
|
|||
|
|
}
|
|||
|
|
} else if len(goResRunes) == 0 && len(pyResRunes) == 0 {
|
|||
|
|
residualScore = 100
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Weighted average.
|
|||
|
|
totalChars := matchedChars + residualChars
|
|||
|
|
if totalChars == 0 {
|
|||
|
|
return 100
|
|||
|
|
}
|
|||
|
|
return (matchedScore + residualScore*float64(residualChars)) / float64(totalChars)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func nonSpaceRunes(s string) []rune {
|
|||
|
|
out := make([]rune, 0, len(s))
|
|||
|
|
for _, r := range s {
|
|||
|
|
if !unicode.IsSpace(r) {
|
|||
|
|
out = append(out, r)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
return out
|
|||
|
|
}
|