// Source: ragflow/internal/deepdoc/parser/pdf/pipeline_parity_test.go
// (265 lines, 7.9 KiB, Go)

//go:build cgo && manual
package parser
import (
"context"
"os"
"path/filepath"
"ragflow/internal/deepdoc/parser/pdf/tools"
"sort"
"strings"
"testing"
)
// TestPipelineParity checks that the Go pipeline reproduces the Python
// pipeline's text output.
//
// For every *.json file under testdata/charspy/ (Python pdfplumber chars) it
// runs the Go parser with Top-based sorting, so ordering matches Python, and
// compares the joined section text against the reference file of the same
// base name under testdata/output/py/noocr/text/.
//
// Setting BATCH_PARITY_FILTER restricts the run to file names containing that
// substring. Files without a Python reference are logged and not counted.
// CharSim must be 100%; anything lower means the Go pipeline logic differs
// from Python's.
func TestPipelineParity(t *testing.T) {
	inputDir := filepath.Join("testdata", "charspy")
	refDir := filepath.Join("testdata", "output", "py", "noocr", "text")

	files, err := os.ReadDir(inputDir)
	if err != nil {
		t.Skipf("charspy/ not found: %v", err)
	}

	only := os.Getenv("BATCH_PARITY_FILTER")
	var compared, matched int

	for _, f := range files {
		fileName := f.Name()
		if f.IsDir() || !strings.HasSuffix(fileName, ".json") {
			continue
		}
		if only != "" && !strings.Contains(fileName, only) {
			continue
		}
		doc := strings.TrimSuffix(fileName, ".json")

		// Input: chars previously extracted on the Python side.
		chars, err := LoadPythonChars(filepath.Join(inputDir, fileName))
		if err != nil {
			t.Errorf("%s: LoadPythonChars: %v", doc, err)
			continue
		}

		// Go pipeline with a mock analyzer (SKIP_OCR — no DeepDoc).
		opts := DefaultParserConfig()
		opts.SortByTop = true
		pipeline := NewParser(opts, &MockDocAnalyzer{Healthy: true, Model: ModelSaas})
		parsed, err := pipeline.Parse(context.Background(), chars)
		if err != nil {
			t.Errorf("%s: Parse: %v", doc, err)
			continue
		}

		// Python reference text; a missing reference is not a failure.
		refPath := filepath.Join(refDir, doc+".txt")
		refBytes, err := os.ReadFile(refPath)
		if err != nil {
			t.Logf("%s: no Python reference at %s — skip", doc, refPath)
			continue
		}

		// Go text: one section per line.
		var joined strings.Builder
		for _, sec := range parsed.Sections {
			joined.WriteString(sec.Text)
			joined.WriteByte('\n')
		}

		got := joined.String()
		want := tools.StripMeta(string(refBytes))
		sim := tools.CharSimilarity(got, want)
		compared++

		if sim >= 100.0 {
			matched++
			t.Logf("PASS %s: CharSim=%.1f%% boxes:%d->%d->%d->%d",
				doc, sim, parsed.Metrics.BoxesInitial, parsed.Metrics.BoxesTextMerge, parsed.Metrics.BoxesVertMerge, len(parsed.Sections))
			continue
		}
		t.Errorf("FAIL %s: CharSim=%.1f%% (must be 100%%) boxes:%d->%d->%d->%d",
			doc, sim, parsed.Metrics.BoxesInitial, parsed.Metrics.BoxesTextMerge, parsed.Metrics.BoxesVertMerge, len(parsed.Sections))
	}

	if compared == 0 {
		t.Skip("no charspy/ files found")
	}
	t.Logf("Pipeline parity: %d/%d passed", matched, compared)
	if matched < compared {
		t.Errorf("%d/%d parity tests failed — Go pipeline differs from Python", compared-matched, compared)
	}
}
// TestVMWhitespaceGapBridge reproduces the RAG PDF divergence with synthetic
// boxes and checks that production NaiveVerticalMerge follows the Python-like
// behavior.
//
// A whitespace-only box (non-zero width, gap to the previous box just below
// the merge threshold) gets merged into the preceding content box, extending
// that box's Bottom by the whitespace height. The extension flips the next
// gap check from reject to merge, creating a cascade that reduces the section
// count by 1.
//
// Two in-test simulations bracket the behavior: one keeps the whitespace box
// (Python-like), the other removes it before merging (the old Go pre-filter,
// where the bottom extension never happens and the cascade fails to start).
// The simulations must disagree, and NaiveVerticalMerge must produce the same
// section count as the Python-like one.
func TestVMWhitespaceGapBridge(t *testing.T) {
	// Coordinates extracted from RAG PDF charspy data, "服务体系" region.
	boxes := []TextBox{
		// Content A: merged result of 3 preceding lines.
		{X0: 37.6, X1: 491.0, Top: 339.35, Bottom: 382.39,
			Text: "生成文本再用standard分词建立索引", PageNumber: 1},
		// Whitespace: U+00A0 non-breaking space, has non-zero width. Written
		// as an escape so it cannot be mistaken for (or silently rewritten
		// to) an ASCII space.
		{X0: 37.6, X1: 40.3, Top: 396.39, Bottom: 406.79,
			Text: "\u00a0", PageNumber: 1},
		// Content B: would be rejected without whitespace gap bridge.
		{X0: 37.6, X1: 543.3, Top: 420.16, Bottom: 431.19,
			Text: "直接用rag分词建立索引", PageNumber: 1},
		// Content C: cascades after B merges.
		{X0: 37.6, X1: 526.4, Top: 436.16, Bottom: 447.20,
			Text: "是在原文中并没有这样的文字", PageNumber: 1},
	}

	const minOverlap = 0.3 // minimum horizontal overlap required to merge
	mh := 9.361            // RAG PDF char median
	thr := mh * 1.5        // maximum vertical gap that still merges

	// isBlank reports whether a box carries only whitespace.
	isBlank := func(b TextBox) bool {
		return strings.TrimSpace(b.Text) == ""
	}

	// mergeContent folds content box b into the last box of out when layout,
	// vertical gap and horizontal overlap all allow it; otherwise b starts a
	// new section. It returns the updated slice.
	mergeContent := func(out []TextBox, b TextBox) []TextBox {
		if len(out) == 0 {
			return append(out, b)
		}
		prev := &out[len(out)-1]
		if prev.LayoutNo != b.LayoutNo ||
			b.Top-prev.Bottom > thr ||
			OverlapX(prev, &b) < minOverlap {
			return append(out, b)
		}
		prev.Text = strings.TrimSpace(strings.TrimSpace(prev.Text) + " " + strings.TrimSpace(b.Text))
		prev.Bottom = b.Bottom
		if prev.X0 > b.X0 {
			prev.X0 = b.X0
		}
		if prev.X1 < b.X1 {
			prev.X1 = b.X1
		}
		return out
	}

	// simulate runs a minimal vertical merge over a private copy of boxes and
	// returns the resulting section count.
	//
	// With prefilter == false, whitespace boxes stay in the input
	// (Python-like). Python's while/pop merges whitespace at the b_ position
	// into b (extending b.bottom), then compares the same b against the next
	// content box; that is modelled here by absorbing a whitespace box into
	// the previous output box when the gap and overlap checks pass.
	//
	// With prefilter == true, whitespace boxes are dropped before merging
	// (old Go behavior), so no bottom extension can happen.
	simulate := func(prefilter bool) int {
		bxs := make([]TextBox, 0, len(boxes))
		for _, b := range boxes {
			if prefilter && isBlank(b) {
				continue
			}
			bxs = append(bxs, b)
		}
		sort.Slice(bxs, func(i, j int) bool {
			if bxs[i].Top != bxs[j].Top {
				return bxs[i].Top < bxs[j].Top
			}
			return bxs[i].X0 < bxs[j].X0
		})

		out := make([]TextBox, 0, len(bxs))
		for _, b := range bxs {
			if !isBlank(b) {
				out = mergeContent(out, b)
				continue
			}
			// Whitespace never becomes a section of its own.
			if len(out) == 0 {
				continue // nothing to extend
			}
			prev := &out[len(out)-1]
			// Gap passes AND overlap passes → whitespace is absorbed into
			// prev, extending its bottom (the "gap bridge").
			if b.Top-prev.Bottom <= thr && OverlapX(prev, &b) >= minOverlap {
				prev.Bottom = b.Bottom
			}
		}
		return len(out)
	}

	nWS := simulate(false)
	nNoWS := simulate(true)
	t.Logf("With whitespace (Python-like): %d sections", nWS)
	t.Logf("Without whitespace (Go pre-filter): %d sections", nNoWS)
	t.Logf("Gap without bridge: 420.16 - 382.39 = %.2f > %.2f = REJECT", 420.16-382.39, thr)
	t.Logf("Gap with bridge: 420.16 - 406.79 = %.2f < %.2f = MERGE", 420.16-406.79, thr)

	// The two simulations must still differ — the mechanism is real. But
	// production NaiveVerticalMerge now handles whitespace inline (gap
	// bridge), matching Python.
	if nWS == nNoWS {
		t.Error("Manual implementations should differ — the gap bridge mechanism is real")
	}

	// Verify production NaiveVerticalMerge matches the Python-like simulation.
	mhMap := map[int]float64{1: mh}
	mwMap := map[int]float64{1: 5}
	vmResult := NaiveVerticalMerge(boxes, mhMap, mwMap, false)
	t.Logf("NaiveVerticalMerge (production): %d sections", len(vmResult))
	if len(vmResult) != nWS {
		t.Errorf("NaiveVerticalMerge produced %d sections, want %d (Python-like with gap bridge)", len(vmResult), nWS)
	}
}