Files
ragflow/internal/deepdoc/parser/pdf/tools/similarity.go
Jack 304d9e02bb Refactor: migrate pdf_parser.py to golang (#16323)
### What problem does this PR solve?

HTTP API based on an ONNX model.
Migrate pdf_parser.py to Go.

### Type of change

- [x] Refactoring
2026-06-25 20:16:16 +08:00

278 lines
6.5 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package tools
import (
"sort"
"strings"
"unicode"
)
// StripMeta returns s cut off immediately before the last "\n#@meta"
// sequence it contains; the newline that introduces the marker is removed
// together with everything after it. When s has no such sequence it is
// returned unchanged. A marker at the very start of s (with no preceding
// newline) is not matched.
func StripMeta(s string) string {
	cut := strings.LastIndex(s, "\n#@meta")
	if cut < 0 {
		return s
	}
	return s[:cut]
}
// CharSimilarity scores a and b on a 0–100 scale by comparing them as bags
// of characters: twice the number of characters the two inputs share
// (counting multiplicity) divided by the total number of characters in both.
// Character order is ignored. Metadata is removed with StripMeta first and
// whitespace runes are not counted. Two inputs with no countable characters
// score 100.
func CharSimilarity(a, b string) float64 {
	// tally returns the per-rune counts of s and the total number of runes
	// counted, skipping whitespace.
	tally := func(s string) (map[rune]int, int) {
		counts := make(map[rune]int)
		size := 0
		for _, ch := range StripMeta(s) {
			if unicode.IsSpace(ch) {
				continue
			}
			counts[ch]++
			size++
		}
		return counts, size
	}

	countsA, sizeA := tally(a)
	countsB, sizeB := tally(b)
	if sizeA+sizeB == 0 {
		return 100
	}

	shared := 0
	for ch, inA := range countsA {
		// A rune missing from countsB reads as zero and adds nothing.
		shared += min(inA, countsB[ch])
	}
	return float64(shared*2) / float64(sizeA+sizeB) * 100
}
// lcsRunes returns the length of the longest common subsequence of a and b.
// Only two rows of the dynamic-programming table are kept, each sized by the
// shorter input, while the longer input drives the outer loop.
func lcsRunes(a, b []rune) int {
	long, short := a, b
	if len(long) < len(short) {
		long, short = short, long
	}

	above := make([]int, len(short)+1) // row for the previous rune of long
	row := make([]int, len(short)+1)   // row being filled in
	for _, x := range long {
		for j, y := range short {
			if x == y {
				row[j+1] = above[j] + 1
			} else {
				row[j+1] = max(row[j], above[j+1])
			}
		}
		// The row just completed becomes the reference for the next rune;
		// index 0 of both rows is never written and stays zero.
		above, row = row, above
	}
	return above[len(short)]
}
// LcsSimilarity scores a and b on a 0–100 scale as the length of their
// longest common subsequence divided by the length of the longer input.
// Metadata is removed with StripMeta first and whitespace runes are dropped
// before comparing. Two inputs with nothing left to compare score 100; if
// only one of them is empty the score is 0.
func LcsSimilarity(a, b string) float64 {
	// compact returns the non-whitespace runes of s after metadata removal.
	compact := func(s string) []rune {
		kept := make([]rune, 0)
		for _, ch := range StripMeta(s) {
			if unicode.IsSpace(ch) {
				continue
			}
			kept = append(kept, ch)
		}
		return kept
	}

	left, right := compact(a), compact(b)
	switch {
	case len(left) == 0 && len(right) == 0:
		return 100
	case len(left) == 0, len(right) == 0:
		return 0
	}
	longer := max(len(left), len(right))
	return float64(lcsRunes(left, right)) / float64(longer) * 100
}
// RawCharSimilarity is the whitespace-preserving variant of CharSimilarity:
// every rune, including spaces and newlines, counts as a character. Metadata
// is still removed with StripMeta. The score is twice the number of shared
// characters (counting multiplicity) over the combined character count, on a
// 0–100 scale; two empty inputs score 100.
func RawCharSimilarity(a, b string) float64 {
	// tally returns the per-rune counts of s and the total number of runes.
	tally := func(s string) (map[rune]int, int) {
		counts := make(map[rune]int)
		size := 0
		for _, ch := range StripMeta(s) {
			counts[ch]++
			size++
		}
		return counts, size
	}

	countsA, sizeA := tally(a)
	countsB, sizeB := tally(b)
	if sizeA+sizeB == 0 {
		return 100
	}

	shared := 0
	for ch, inA := range countsA {
		// A rune missing from countsB reads as zero and adds nothing.
		shared += min(inA, countsB[ch])
	}
	return float64(shared*2) / float64(sizeA+sizeB) * 100
}
// RawLcsSimilarity is the whitespace-preserving variant of LcsSimilarity:
// whitespace runes take part in the longest-common-subsequence comparison.
// Metadata is still removed with StripMeta. The score is the LCS length over
// the length of the longer input, on a 0–100 scale. Two empty inputs score
// 100; if only one of them is empty the score is 0.
func RawLcsSimilarity(a, b string) float64 {
	left := []rune(StripMeta(a))
	right := []rune(StripMeta(b))

	longer := max(len(left), len(right))
	if longer == 0 {
		return 100
	}
	if min(len(left), len(right)) == 0 {
		return 0
	}
	return float64(lcsRunes(left, right)) / float64(longer) * 100
}
// SectionAlignedScore compares two texts section by section, where a section
// is one line, and returns a similarity score on a 0–100 scale.
//
// Both inputs have their metadata removed with StripMeta and are trimmed of
// surrounding whitespace before being split on "\n". Two empty texts score
// 100; if only one of them is empty the score is 0.
//
// Phase 1: one-to-one section matching. A Go section and a Python section
// become a candidate pair when their indices differ by at most alignWindow,
// neither is more than twice as long as the other (in runes), and their
// CharSimilarity exceeds minCandidateSim. Candidates are accepted greedily,
// highest similarity first. Ties are broken by the smaller index offset and
// then by the indices themselves, so the outcome never depends on how
// sort.Slice happens to order equal elements. Each accepted pair contributes
// its LCS ratio, computed over non-space runes.
//
// Phase 2: residual. All unmatched sections from each side are concatenated
// into one string per side and compared once. This handles cases where one
// side merges sections that the other side keeps separate. When either
// residual is longer than maxResidualRunes the quadratic LCS is replaced by
// CharSimilarity.
//
// The final score is the average of the matched-pair scores and the residual
// score, each weighted by its character count (the longer side's non-space
// rune count).
func SectionAlignedScore(goText, pyText string) float64 {
	const (
		// alignWindow is the largest index offset allowed between paired
		// sections; it keeps candidate generation linear in document size.
		alignWindow = 5
		// minCandidateSim is the CharSimilarity a pair must exceed to be
		// considered for matching.
		minCandidateSim = 30
		// maxResidualRunes is the residual size above which LCS is skipped.
		maxResidualRunes = 5000
	)

	goBody := strings.TrimSpace(StripMeta(goText))
	pyBody := strings.TrimSpace(StripMeta(pyText))
	// strings.Split never returns an empty slice, so emptiness has to be
	// decided on the trimmed text rather than on the section count.
	switch {
	case goBody == "" && pyBody == "":
		return 100
	case goBody == "" || pyBody == "":
		return 0
	}
	gs := strings.Split(goBody, "\n")
	ps := strings.Split(pyBody, "\n")

	// Rune lengths (whitespace included) drive the length-ratio gate.
	runeLens := func(sections []string) []int {
		lens := make([]int, len(sections))
		for i, s := range sections {
			lens[i] = len([]rune(s))
		}
		return lens
	}
	glens, plens := runeLens(gs), runeLens(ps)

	// Phase 1: position-window greedy matching.
	type candidate struct {
		gi, pi int     // section indices on the Go and Python side
		offset int     // |gi - pi|, used to break similarity ties
		sim    float64 // CharSimilarity of the two sections
	}
	candidates := make([]candidate, 0, min(len(gs), len(ps)))
	for i, g := range gs {
		lo := max(0, i-alignWindow)
		hi := min(len(ps)-1, i+alignWindow)
		for j := lo; j <= hi; j++ {
			// Skip pairs with more than a 2x length difference: such a pair
			// cannot reach an LCS ratio above 50.
			if glens[i] > plens[j]*2 || plens[j] > glens[i]*2 {
				continue
			}
			sim := CharSimilarity(g, ps[j])
			if sim <= minCandidateSim {
				continue
			}
			offset := i - j
			if offset < 0 {
				offset = -offset
			}
			candidates = append(candidates, candidate{gi: i, pi: j, offset: offset, sim: sim})
		}
	}

	// Best matches first. CharSimilarity ignores character order, so exact
	// ties are common (repeated or anagram-equivalent lines); prefer the
	// positionally closest pair and fall back to the indices so that the
	// ordering is total and the result is deterministic.
	sort.Slice(candidates, func(x, y int) bool {
		cx, cy := candidates[x], candidates[y]
		if cx.sim != cy.sim {
			return cx.sim > cy.sim
		}
		if cx.offset != cy.offset {
			return cx.offset < cy.offset
		}
		if cx.gi != cy.gi {
			return cx.gi < cy.gi
		}
		return cx.pi < cy.pi
	})

	// lcsRatio returns the LCS length over the longer input, scaled to 0–100.
	// Callers must ensure that at least one input is non-empty.
	lcsRatio := func(a, b []rune) float64 {
		return float64(lcsRunes(a, b)) / float64(max(len(a), len(b))) * 100
	}

	goUsed := make([]bool, len(gs))
	pyUsed := make([]bool, len(ps))
	matchedScore := 0.0
	matchedChars := 0
	for _, c := range candidates {
		if goUsed[c.gi] || pyUsed[c.pi] {
			continue
		}
		goUsed[c.gi], pyUsed[c.pi] = true, true

		ra := nonSpaceRunes(gs[c.gi])
		rb := nonSpaceRunes(ps[c.pi])
		weight := max(len(ra), len(rb))
		if weight == 0 {
			// Two blank sections pair up but carry no weight.
			continue
		}
		matchedScore += lcsRatio(ra, rb) * float64(weight)
		matchedChars += weight
	}

	// Phase 2: residual — concatenate unmatched sections, compare once.
	var goRes, pyRes strings.Builder
	for i, g := range gs {
		if !goUsed[i] {
			goRes.WriteString(g)
			goRes.WriteByte(' ')
		}
	}
	for j, p := range ps {
		if !pyUsed[j] {
			pyRes.WriteString(p)
			pyRes.WriteByte(' ')
		}
	}
	goResRunes := nonSpaceRunes(goRes.String())
	pyResRunes := nonSpaceRunes(pyRes.String())
	residualChars := max(len(goResRunes), len(pyResRunes))
	residualScore := 0.0
	switch {
	case residualChars == 0:
		// Nothing left over; the residual carries no weight.
	case len(goResRunes) > maxResidualRunes || len(pyResRunes) > maxResidualRunes:
		// Residual too large for the quadratic LCS.
		residualScore = CharSimilarity(goRes.String(), pyRes.String())
	default:
		residualScore = lcsRatio(goResRunes, pyResRunes)
	}

	// Character-weighted average of both phases.
	totalChars := matchedChars + residualChars
	if totalChars == 0 {
		return 100
	}
	return (matchedScore + residualScore*float64(residualChars)) / float64(totalChars)
}
// nonSpaceRunes returns the runes of s, in order, with every rune for which
// unicode.IsSpace reports true left out. The result is never nil; its
// capacity is reserved from the byte length of s, an upper bound on the
// rune count.
func nonSpaceRunes(s string) []rune {
	kept := make([]rune, 0, len(s))
	for _, ch := range s {
		if unicode.IsSpace(ch) {
			continue
		}
		kept = append(kept, ch)
	}
	return kept
}