Files
ragflow/internal/deepdoc/parser/pdf/compare_test.go
Jack 304d9e02bb Refactor: migrate pdf_parser.py to golang (#16323)
### What problem does this PR solve?

HTTP API based on an ONNX model.
Migrate pdf_parser.py to Go.

### Type of change

- [x] Refactoring
2026-06-25 20:16:16 +08:00

66 lines
2.3 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//go:build manual
package parser
import (
"log/slog"
"os"
"path/filepath"
"testing"
"ragflow/internal/deepdoc/parser/pdf/tools"
)
// TestBatchCompareWithPython compares existing Go output against the Python
// reference output under testdata/output. It runs four comparisons: text,
// tables, DLA, and TSR raw.
//
// This function itself only builds directory paths and delegates to the
// tools package helpers. It is intended as a read-only check (no generation,
// no CGO/DeepDoc dependency); NOTE(review): that property depends on the
// tools helpers, which are not visible from this file.
//
// Environment variables:
//   - BATCH_LOG_LEVEL: "debug" or "warn" selects the slog level; any other
//     value leaves the level at info.
//   - BATCH_SKIP_OCR: "1" compares the "noocr" Go variant instead of "ocr".
//   - PY_OCR_SUFFIX: overrides the Python variant; when empty, the Python
//     variant is the same as the Go variant.
func TestBatchCompareWithPython(t *testing.T) {
	level := slog.LevelInfo
	switch os.Getenv("BATCH_LOG_LEVEL") {
	case "debug":
		level = slog.LevelDebug
	case "warn":
		level = slog.LevelWarn
	}
	slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: level})))

	goVariant := "ocr"
	if os.Getenv("BATCH_SKIP_OCR") == "1" {
		goVariant = "noocr"
	}
	pyVariant := os.Getenv("PY_OCR_SUFFIX")
	if pyVariant == "" {
		pyVariant = goVariant
	}

	// Every artifact lives at testdata/output/<lang>/<variant>/<kind>.
	goDir := func(kind string) string {
		return filepath.Join("testdata", "output", "go", goVariant, kind)
	}
	pyDir := func(kind string) string {
		return filepath.Join("testdata", "output", "py", pyVariant, kind)
	}

	goTextDir := goDir("text")
	pyTextDir := pyDir("text")

	// Read Go text files' #@meta (no aggregate JSON dependency).
	// A read failure and an empty directory are reported separately so the
	// failure message names the actual cause.
	goResults, err := tools.ReadGoTextMeta(goTextDir)
	if err != nil {
		t.Fatalf("Reading Go text files in %s: %v", goTextDir, err)
	}
	if len(goResults) == 0 {
		t.Fatalf("No Go text files in %s", goTextDir)
	}

	// Read Python text files' #@meta.
	pyResults, err := tools.ReadPythonTextMeta(pyTextDir)
	if err != nil {
		t.Fatalf("Reading Python text files in %s: %v", pyTextDir, err)
	}
	if len(pyResults) == 0 {
		t.Fatalf("No Python text files in %s", pyTextDir)
	}

	t.Logf("Comparing %d Go × %d Python", len(goResults), len(pyResults))
	tools.CompareWithPython(t, goResults, pyResults, goTextDir, pyTextDir)

	// Compare tables.
	tools.CompareTablesWithPython(t, goDir("tables"), pyDir("tables"))

	// Compare DLA + TSR raw intermediates.
	tools.CompareDLAWithPython(t, goDir("dla"), pyDir("dla"))
	tools.CompareTSRRawWithPython(t, goDir("tsr_raw"), pyDir("tsr_raw"))
}