mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
### What problem does this PR solve? Http API based on onnx model. pdf_parser.py to golang ### Type of change - [x] Refactoring
66 lines
2.3 KiB
Go
66 lines
2.3 KiB
Go
//go:build manual
|
||
|
||
package parser
|
||
|
||
import (
|
||
"log/slog"
|
||
"os"
|
||
"path/filepath"
|
||
"testing"
|
||
|
||
"ragflow/internal/deepdoc/parser/pdf/tools"
|
||
)
|
||
|
||
// TestBatchCompareWithPython compares Go output against Python reference
|
||
// across 4 dimensions (text, tables, DLA, TSR raw). It is read-only —
|
||
// no generation, no CGO/DeepDoc dependency. Use BATCH_SKIP_OCR=1 to
|
||
// compare the noocr variant; PY_OCR_SUFFIX to override the Python variant.
|
||
func TestBatchCompareWithPython(t *testing.T) {
|
||
level := slog.LevelInfo
|
||
if os.Getenv("BATCH_LOG_LEVEL") == "debug" {
|
||
level = slog.LevelDebug
|
||
}
|
||
if os.Getenv("BATCH_LOG_LEVEL") == "warn" {
|
||
level = slog.LevelWarn
|
||
}
|
||
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: level})))
|
||
|
||
goVariant := "ocr"
|
||
if os.Getenv("BATCH_SKIP_OCR") == "1" {
|
||
goVariant = "noocr"
|
||
}
|
||
pyVariant := os.Getenv("PY_OCR_SUFFIX")
|
||
if pyVariant == "" {
|
||
pyVariant = goVariant
|
||
}
|
||
goTextDir := filepath.Join("testdata", "output", "go", goVariant, "text")
|
||
pyTextDir := filepath.Join("testdata", "output", "py", pyVariant, "text")
|
||
|
||
// Read Go text files' #@meta (no aggregate JSON dependency).
|
||
goResults, err := tools.ReadGoTextMeta(goTextDir)
|
||
if err != nil || len(goResults) == 0 {
|
||
t.Fatalf("No Go text files in %s: %v", goTextDir, err)
|
||
}
|
||
|
||
// Read Python text files' #@meta
|
||
pyResults, err := tools.ReadPythonTextMeta(pyTextDir)
|
||
if err != nil || len(pyResults) == 0 {
|
||
t.Fatalf("No Python text files in %s: %v", pyTextDir, err)
|
||
}
|
||
|
||
t.Logf("Comparing %d Go × %d Python", len(goResults), len(pyResults))
|
||
tools.CompareWithPython(t, goResults, pyResults, goTextDir, pyTextDir)
|
||
|
||
// Compare tables.
|
||
goTablesDir := filepath.Join("testdata", "output", "go", goVariant, "tables")
|
||
pyTablesDir2 := filepath.Join("testdata", "output", "py", pyVariant, "tables")
|
||
tools.CompareTablesWithPython(t, goTablesDir, pyTablesDir2)
|
||
// Compare DLA + TSR raw intermediates.
|
||
goDLADir := filepath.Join("testdata", "output", "go", goVariant, "dla")
|
||
pyDLADir := filepath.Join("testdata", "output", "py", pyVariant, "dla")
|
||
tools.CompareDLAWithPython(t, goDLADir, pyDLADir)
|
||
goTSRRawDir := filepath.Join("testdata", "output", "go", goVariant, "tsr_raw")
|
||
pyTSRRawDir := filepath.Join("testdata", "output", "py", pyVariant, "tsr_raw")
|
||
tools.CompareTSRRawWithPython(t, goTSRRawDir, pyTSRRawDir)
|
||
}
|