mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
265 lines
7.9 KiB
Go
265 lines
7.9 KiB
Go
|
|
//go:build cgo && manual
|
|||
|
|
|
|||
|
|
package parser
|
|||
|
|
|
|||
|
|
import (
|
|||
|
|
"context"
|
|||
|
|
"os"
|
|||
|
|
"path/filepath"
|
|||
|
|
"ragflow/internal/deepdoc/parser/pdf/tools"
|
|||
|
|
"sort"
|
|||
|
|
"strings"
|
|||
|
|
"testing"
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
// TestPipelineParity verifies Go pipeline logic equivalence with Python.
|
|||
|
|
// It loads Python pdfplumber chars (from charspy/), runs the Go pipeline
|
|||
|
|
// with Top-based sorting to match Python's ordering, and compares sections
|
|||
|
|
// against Python's output/py/noocr/text/ output.
|
|||
|
|
//
|
|||
|
|
// CharSim must be 100% — if not, Go pipeline logic differs from Python's.
|
|||
|
|
func TestPipelineParity(t *testing.T) {
|
|||
|
|
charspyDir := filepath.Join("testdata", "charspy")
|
|||
|
|
pyTextDir := filepath.Join("testdata", "output", "py", "noocr", "text")
|
|||
|
|
|
|||
|
|
entries, err := os.ReadDir(charspyDir)
|
|||
|
|
if err != nil {
|
|||
|
|
t.Skipf("charspy/ not found: %v", err)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
filter := os.Getenv("BATCH_PARITY_FILTER")
|
|||
|
|
|
|||
|
|
total, passed := 0, 0
|
|||
|
|
for _, e := range entries {
|
|||
|
|
if e.IsDir() || !strings.HasSuffix(e.Name(), ".json") {
|
|||
|
|
continue
|
|||
|
|
}
|
|||
|
|
name := strings.TrimSuffix(e.Name(), ".json")
|
|||
|
|
if filter != "" && !strings.Contains(e.Name(), filter) {
|
|||
|
|
continue
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Load Python chars
|
|||
|
|
jsonPath := filepath.Join(charspyDir, e.Name())
|
|||
|
|
engine, err := LoadPythonChars(jsonPath)
|
|||
|
|
if err != nil {
|
|||
|
|
t.Errorf("%s: LoadPythonChars: %v", name, err)
|
|||
|
|
continue
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Run Go pipeline (SKIP_OCR — no DeepDoc)
|
|||
|
|
cfg := DefaultParserConfig()
|
|||
|
|
cfg.SortByTop = true
|
|||
|
|
p := NewParser(cfg, &MockDocAnalyzer{Healthy: true, Model: ModelSaas})
|
|||
|
|
result, err := p.Parse(context.Background(), engine)
|
|||
|
|
if err != nil {
|
|||
|
|
t.Errorf("%s: Parse: %v", name, err)
|
|||
|
|
continue
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Read Python sections
|
|||
|
|
pyPath := filepath.Join(pyTextDir, name+".txt")
|
|||
|
|
pyData, err := os.ReadFile(pyPath)
|
|||
|
|
if err != nil {
|
|||
|
|
t.Logf("%s: no Python reference at %s — skip", name, pyPath)
|
|||
|
|
continue
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Build Go text
|
|||
|
|
var goText strings.Builder
|
|||
|
|
for _, s := range result.Sections {
|
|||
|
|
goText.WriteString(s.Text)
|
|||
|
|
goText.WriteByte('\n')
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Compare
|
|||
|
|
sim := tools.CharSimilarity(goText.String(), tools.StripMeta(string(pyData)))
|
|||
|
|
total++
|
|||
|
|
if sim >= 100.0 {
|
|||
|
|
passed++
|
|||
|
|
t.Logf("PASS %s: CharSim=%.1f%% boxes:%d->%d->%d->%d",
|
|||
|
|
name, sim, result.Metrics.BoxesInitial, result.Metrics.BoxesTextMerge, result.Metrics.BoxesVertMerge, len(result.Sections))
|
|||
|
|
} else {
|
|||
|
|
t.Errorf("FAIL %s: CharSim=%.1f%% (must be 100%%) boxes:%d->%d->%d->%d",
|
|||
|
|
name, sim, result.Metrics.BoxesInitial, result.Metrics.BoxesTextMerge, result.Metrics.BoxesVertMerge, len(result.Sections))
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if total == 0 {
|
|||
|
|
t.Skip("no charspy/ files found")
|
|||
|
|
}
|
|||
|
|
t.Logf("Pipeline parity: %d/%d passed", passed, total)
|
|||
|
|
if passed < total {
|
|||
|
|
t.Errorf("%d/%d parity tests failed — Go pipeline differs from Python", total-passed, total)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// TestVMWhitespaceGapBridge reproduces the exact RAG PDF divergence
|
|||
|
|
// with synthetic boxes. A whitespace box (width > 0, gap just below
|
|||
|
|
// threshold) gets merged into a content box, extending its bottom by
|
|||
|
|
// the whitespace height. This flips the next gap from reject to merge,
|
|||
|
|
// creating a cascade that reduces the section count by 1.
|
|||
|
|
//
|
|||
|
|
// Go's whitespace pre-filter removes this box before VM, so the
|
|||
|
|
// bottom extension never happens and the cascade fails to start.
|
|||
|
|
func TestVMWhitespaceGapBridge(t *testing.T) {
|
|||
|
|
// Coordinates extracted from RAG PDF charspy data, "服务体系" region.
|
|||
|
|
boxes := []TextBox{
|
|||
|
|
// Content A: merged result of 3 preceding lines
|
|||
|
|
{X0: 37.6, X1: 491.0, Top: 339.35, Bottom: 382.39,
|
|||
|
|
Text: "生成文本再用standard分词建立索引", PageNumber: 1},
|
|||
|
|
// Whitespace: U+00A0 non-breaking space, has non-zero width
|
|||
|
|
{X0: 37.6, X1: 40.3, Top: 396.39, Bottom: 406.79,
|
|||
|
|
Text: " ", PageNumber: 1},
|
|||
|
|
// Content B: would be rejected without whitespace gap bridge
|
|||
|
|
{X0: 37.6, X1: 543.3, Top: 420.16, Bottom: 431.19,
|
|||
|
|
Text: "直接用rag分词建立索引", PageNumber: 1},
|
|||
|
|
// Content C: cascades after B merges
|
|||
|
|
{X0: 37.6, X1: 526.4, Top: 436.16, Bottom: 447.20,
|
|||
|
|
Text: "是在原文中并没有这样的文字", PageNumber: 1},
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
mh := 9.361 // RAG PDF char median
|
|||
|
|
thr := mh * 1.5
|
|||
|
|
|
|||
|
|
// Run VM with whitespace PRESENT (Python-like, no pre-filter).
|
|||
|
|
// Python's while/pop merges whitespace at b_ position into b
|
|||
|
|
// (extending b.bottom), then compares same b against next content.
|
|||
|
|
// We simulate this by letting whitespace through gap/xov checks
|
|||
|
|
// and absorbing it into prev when the checks pass.
|
|||
|
|
vWithWS := func() int {
|
|||
|
|
bxs := make([]TextBox, len(boxes))
|
|||
|
|
copy(bxs, boxes)
|
|||
|
|
sort.Slice(bxs, func(i, j int) bool {
|
|||
|
|
if bxs[i].Top != bxs[j].Top {
|
|||
|
|
return bxs[i].Top < bxs[j].Top
|
|||
|
|
}
|
|||
|
|
return bxs[i].X0 < bxs[j].X0
|
|||
|
|
})
|
|||
|
|
out := make([]TextBox, 0, len(bxs))
|
|||
|
|
for i := 0; i < len(bxs); i++ {
|
|||
|
|
b := bxs[i]
|
|||
|
|
isWS := strings.TrimSpace(b.Text) == ""
|
|||
|
|
// Whitespace in b position (current box): pop (skip).
|
|||
|
|
// In Python: bxs.pop(i); continue; i stays.
|
|||
|
|
if isWS && len(out) == 0 {
|
|||
|
|
continue // nothing to extend
|
|||
|
|
}
|
|||
|
|
if isWS && len(out) > 0 {
|
|||
|
|
prev := &out[len(out)-1]
|
|||
|
|
gap := b.Top - prev.Bottom
|
|||
|
|
ov := OverlapX(prev, &b)
|
|||
|
|
// Python: gap passes AND xov passes → whitespace merged
|
|||
|
|
// into prev, extending bottom. i advances (Go for-loop).
|
|||
|
|
if gap <= thr && ov >= 0.3 {
|
|||
|
|
prev.Bottom = b.Bottom
|
|||
|
|
}
|
|||
|
|
continue
|
|||
|
|
}
|
|||
|
|
if len(out) == 0 {
|
|||
|
|
out = append(out, b)
|
|||
|
|
continue
|
|||
|
|
}
|
|||
|
|
prev := &out[len(out)-1]
|
|||
|
|
if prev.LayoutNo != b.LayoutNo {
|
|||
|
|
out = append(out, b)
|
|||
|
|
continue
|
|||
|
|
}
|
|||
|
|
gap := b.Top - prev.Bottom
|
|||
|
|
ov := OverlapX(prev, &b)
|
|||
|
|
if gap > thr {
|
|||
|
|
out = append(out, b)
|
|||
|
|
continue
|
|||
|
|
}
|
|||
|
|
if ov < 0.3 {
|
|||
|
|
out = append(out, b)
|
|||
|
|
continue
|
|||
|
|
}
|
|||
|
|
pt := strings.TrimSpace(prev.Text)
|
|||
|
|
bt := strings.TrimSpace(b.Text)
|
|||
|
|
prev.Text = strings.TrimSpace(strings.TrimRight(pt, " \t") + " " + strings.TrimLeft(bt, " \t"))
|
|||
|
|
prev.Bottom = b.Bottom
|
|||
|
|
if prev.X0 > b.X0 {
|
|||
|
|
prev.X0 = b.X0
|
|||
|
|
}
|
|||
|
|
if prev.X1 < b.X1 {
|
|||
|
|
prev.X1 = b.X1
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
return len(out)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Run VM with whitespace PRE-FILTERED (Go current behavior).
|
|||
|
|
vNoWS := func() int {
|
|||
|
|
bxs := make([]TextBox, 0, len(boxes))
|
|||
|
|
for _, b := range boxes {
|
|||
|
|
if strings.TrimSpace(b.Text) != "" {
|
|||
|
|
bxs = append(bxs, b)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
sort.Slice(bxs, func(i, j int) bool {
|
|||
|
|
if bxs[i].Top != bxs[j].Top {
|
|||
|
|
return bxs[i].Top < bxs[j].Top
|
|||
|
|
}
|
|||
|
|
return bxs[i].X0 < bxs[j].X0
|
|||
|
|
})
|
|||
|
|
out := make([]TextBox, 0, len(bxs))
|
|||
|
|
for i := 0; i < len(bxs); i++ {
|
|||
|
|
b := bxs[i]
|
|||
|
|
if len(out) == 0 {
|
|||
|
|
out = append(out, b)
|
|||
|
|
continue
|
|||
|
|
}
|
|||
|
|
prev := &out[len(out)-1]
|
|||
|
|
if prev.LayoutNo != b.LayoutNo {
|
|||
|
|
out = append(out, b)
|
|||
|
|
continue
|
|||
|
|
}
|
|||
|
|
gap := b.Top - prev.Bottom
|
|||
|
|
ov := OverlapX(prev, &b)
|
|||
|
|
if gap > thr {
|
|||
|
|
out = append(out, b)
|
|||
|
|
continue
|
|||
|
|
}
|
|||
|
|
if ov < 0.3 {
|
|||
|
|
out = append(out, b)
|
|||
|
|
continue
|
|||
|
|
}
|
|||
|
|
pt := strings.TrimSpace(prev.Text)
|
|||
|
|
bt := strings.TrimSpace(b.Text)
|
|||
|
|
prev.Text = strings.TrimSpace(strings.TrimRight(pt, " \t") + " " + strings.TrimLeft(bt, " \t"))
|
|||
|
|
prev.Bottom = b.Bottom
|
|||
|
|
if prev.X0 > b.X0 {
|
|||
|
|
prev.X0 = b.X0
|
|||
|
|
}
|
|||
|
|
if prev.X1 < b.X1 {
|
|||
|
|
prev.X1 = b.X1
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
return len(out)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
nWS := vWithWS()
|
|||
|
|
nNoWS := vNoWS()
|
|||
|
|
t.Logf("With whitespace (Python-like): %d sections", nWS)
|
|||
|
|
t.Logf("Without whitespace (Go pre-filter): %d sections", nNoWS)
|
|||
|
|
t.Logf("Gap without bridge: 420.16 - 382.39 = %.2f > %.2f = REJECT", 420.16-382.39, thr)
|
|||
|
|
t.Logf("Gap with bridge: 420.16 - 406.79 = %.2f < %.2f = MERGE", 420.16-406.79, thr)
|
|||
|
|
|
|||
|
|
// The manual vWithWS (Python-like) and vNoWS (old Go pre-filter) still
|
|||
|
|
// differ — the mechanism is real. But production NaiveVerticalMerge now
|
|||
|
|
// handles whitespace inline (gap bridge), matching Python.
|
|||
|
|
if nWS == nNoWS {
|
|||
|
|
t.Error("Manual implementations should differ — the gap bridge mechanism is real")
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Verify production NaiveVerticalMerge matches vWithWS (Python behavior).
|
|||
|
|
mhMap := map[int]float64{1: mh}
|
|||
|
|
mwMap := map[int]float64{1: 5}
|
|||
|
|
vmResult := NaiveVerticalMerge(boxes, mhMap, mwMap, false)
|
|||
|
|
t.Logf("NaiveVerticalMerge (production): %d sections", len(vmResult))
|
|||
|
|
if len(vmResult) != nWS {
|
|||
|
|
t.Errorf("NaiveVerticalMerge produced %d sections, want %d (Python-like with gap bridge)", len(vmResult), nWS)
|
|||
|
|
}
|
|||
|
|
}
|