Files
ragflow/internal/deepdoc/parser/pdf/generate_test.go
Jack 304d9e02bb Refactor: migrate pdf_parser.py to golang (#16323)
### What problem does this PR solve?

HTTP API based on ONNX model.
Migrate pdf_parser.py to Go.

### Type of change

- [x] Refactoring
2026-06-25 20:16:16 +08:00

355 lines
9.8 KiB
Go

//go:build cgo && manual
package parser
import (
"context"
"encoding/json"
"fmt"
"log/slog"
"math"
"os"
"path/filepath"
"ragflow/internal/deepdoc/parser/pdf/tools"
"regexp"
"sort"
"strconv"
"strings"
"testing"
"time"
"unicode/utf8"
)
// TestBatchResults parses the real PDFs found in testdata/real_pdfs with
// Parse() and writes, for every PDF:
//
//	testdata/output/go/{variant}/text/{pdf}.txt     — per-section text + #@meta
//	testdata/output/go/{variant}/tables/{pdf}.json  — table cells
//	testdata/output/go/{variant}/dla/{pdf}.json     — DLA regions (debug)
//	testdata/output/go/{variant}/tsr_raw/{pdf}.json — TSR raw cells (debug)
//
// DeepDoc is mandatory (DLA+TSR are inseparable from the pipeline): the test
// fails immediately when the client cannot be created or its health check
// does not pass. The service address is read from DEEPDOC_URL.
//
// Environment switches:
//
//	BATCH_SKIP_OCR=1   skip image OCR (DLA+TSR kept); selects the "noocr" variant
//	BATCH_COUNT=N      limit to first N PDFs (by file size, smallest first)
//	BATCH_SINGLE=name  process exactly one PDF (full filename)
//	BATCH_LOG_LEVEL    "debug" or "warn"; anything else logs at info level
//
// For read-only comparison, see compare_test.go (no CGO needed).
func TestBatchResults(t *testing.T) {
	setupLogger()

	// Select the PDFs to process: all of them, the first N, or one by name.
	pdfDir := filepath.Join("testdata", "real_pdfs")
	candidates := listRealPDFs(t, pdfDir)
	limit := countFromEnv("BATCH_COUNT", len(candidates))
	if only := os.Getenv("BATCH_SINGLE"); only != "" {
		candidates = filterSingle(candidates, only, t)
		limit = 1
	}
	selected := candidates[:min(limit, len(candidates))]

	// The DeepDoc service must be reachable before any parsing starts.
	client, err := NewDeepDocClient(os.Getenv("DEEPDOC_URL"))
	if err != nil {
		t.Fatal(err)
	}
	if !client.Health() {
		t.Fatalf("DeepDoc service not available at %s (DLA+TSR required)", client.baseURL)
	}

	variant := variantFromEnv()
	ocrNote := ", OCR enabled"
	if variant == "noocr" {
		ocrNote = ", image OCR skipped"
	}
	t.Logf("DeepDoc available — DLA+TSR%s enabled (%d PDFs)", ocrNote, len(selected))

	processPDFs(t, pdfDir, selected, DocAnalyzer(client), variant, mkOutputDirs(variant))
}
// ── helpers ─────────────────────────────────────────────────────────
// setupLogger installs a text slog handler on stderr as the process-wide
// default logger. The minimum level comes from BATCH_LOG_LEVEL: "debug" and
// "warn" select those levels, every other value (including unset) means info.
func setupLogger() {
	named := map[string]slog.Level{
		"debug": slog.LevelDebug,
		"warn":  slog.LevelWarn,
	}
	level, known := named[os.Getenv("BATCH_LOG_LEVEL")]
	if !known {
		level = slog.LevelInfo
	}
	opts := &slog.HandlerOptions{Level: level}
	slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, opts)))
}
// variantFromEnv names the output variant for this run: "noocr" when
// BATCH_SKIP_OCR is exactly "1", "ocr" for any other value or when unset.
func variantFromEnv() string {
	switch os.Getenv("BATCH_SKIP_OCR") {
	case "1":
		return "noocr"
	default:
		return "ocr"
	}
}
// outputDirs holds the four directories a batch run writes into.
type outputDirs struct {
	text   string // {pdf}.txt files: section text followed by the #@meta line
	tables string // {pdf}.json files: table rows and positions
	dla    string // {pdf}.json files: DLA debug data
	tsrRaw string // {pdf}.json files: TSR debug data
}
// mkOutputDirs builds the output directory layout for variant under
// testdata/output/go/{variant}/ (text, tables, dla, tsr_raw), creates each
// directory with mode 0755, and returns the paths.
//
// A directory that cannot be created is reported through slog instead of
// being dropped silently; the paths are still returned so the caller's
// behaviour is unchanged.
func mkOutputDirs(variant string) outputDirs {
	root := filepath.Join("testdata", "output", "go", variant)
	d := outputDirs{
		text:   filepath.Join(root, "text"),
		tables: filepath.Join(root, "tables"),
		dla:    filepath.Join(root, "dla"),
		tsrRaw: filepath.Join(root, "tsr_raw"),
	}
	for _, dir := range []string{d.text, d.tables, d.dla, d.tsrRaw} {
		if err := os.MkdirAll(dir, 0o755); err != nil {
			slog.Error("creating output directory", "dir", dir, "err", err)
		}
	}
	return d
}
// countFromEnv reads the environment variable key as an integer limit.
// It returns that integer only when it parses cleanly and lies strictly
// between 0 and ceiling; in every other case (unset, empty, malformed,
// non-positive, or not below ceiling) it returns ceiling.
func countFromEnv(key string, ceiling int) int {
	raw := os.Getenv(key)
	if raw == "" {
		return ceiling
	}
	n, err := strconv.Atoi(raw)
	if err != nil || n <= 0 || n >= ceiling {
		return ceiling
	}
	return n
}
// listRealPDFs returns the names of the regular (non-directory) entries in
// dir whose name ends in ".pdf" (case-insensitive), ordered by file size,
// smallest first, so small PDFs give fast feedback. It fails the test when
// dir cannot be read.
//
// Each file is stat'ed exactly once before sorting rather than on every
// comparison. Ordering is total and therefore deterministic: files whose
// size is known come first (by size, then by name), followed by files whose
// size could not be determined (by name).
func listRealPDFs(t *testing.T, dir string) []string {
	t.Helper()
	entries, err := os.ReadDir(dir)
	if err != nil {
		t.Fatal(err)
	}

	type pdfFile struct {
		name  string
		size  int64
		known bool // false when os.Stat failed for this file
	}
	files := make([]pdfFile, 0, len(entries))
	for _, e := range entries {
		if e.IsDir() || !strings.HasSuffix(strings.ToLower(e.Name()), ".pdf") {
			continue
		}
		f := pdfFile{name: e.Name()}
		// os.Stat (not DirEntry.Info) so symlinks are sized by their target.
		if info, statErr := os.Stat(filepath.Join(dir, f.name)); statErr == nil {
			f.size, f.known = info.Size(), true
		}
		files = append(files, f)
	}

	sort.Slice(files, func(i, j int) bool {
		a, b := files[i], files[j]
		switch {
		case a.known != b.known:
			return a.known
		case a.known && a.size != b.size:
			return a.size < b.size
		default:
			return a.name < b.name
		}
	})

	var pdfs []string
	for _, f := range files {
		pdfs = append(pdfs, f.name)
	}
	return pdfs
}
// filterSingle narrows pdfs to the single entry equal to name and returns it
// as a one-element slice. When name is not present the test is aborted via
// t.Fatalf.
func filterSingle(pdfs []string, name string, t *testing.T) []string {
	t.Helper()
	found := false
	for i := 0; i < len(pdfs) && !found; i++ {
		found = pdfs[i] == name
	}
	if !found {
		t.Fatalf("BATCH_SINGLE: %s not found in real_pdfs/", name)
		return nil
	}
	return []string{name}
}
// extractPageStats walks every page of eng and returns the total number of
// extracted characters and the total number of boxes charsToBoxes builds
// from them. Pages whose characters cannot be extracted contribute nothing
// to either total.
func extractPageStats(eng PDFEngine) (chars, boxes int) {
	// The PageCount error is deliberately ignored: the loop simply runs over
	// whatever page count was returned alongside it.
	total, _ := eng.PageCount()
	for page := 0; page < total; page++ {
		if glyphs, err := eng.ExtractChars(page); err == nil {
			chars += len(glyphs)
			// NOTE(review): meaning of the trailing `false` flag is defined by charsToBoxes — confirm there.
			boxes += len(charsToBoxes(glyphs, page, false))
		}
	}
	return chars, boxes
}
// textLenFromOutput returns the number of runes in a text output file,
// not counting anything from the last "\n#@meta" marker onwards. When the
// marker is absent the whole input is counted.
func textLenFromOutput(data []byte) int {
	end := len(data)
	if cut := strings.LastIndex(string(data), "\n#@meta"); cut >= 0 {
		end = cut
	}
	return utf8.RuneCount(data[:end])
}
// ── main processing loop ────────────────────────────────────────────
// processPDFs handles each PDF in pdfs in order and returns one
// tools.BatchResult per file:
//
//   - when cached output already exists (see tryLoadCached) the stored
//     result is reused and the file is not parsed again;
//   - otherwise the file is parsed with parseOne and its outputs are written
//     with writeOutputs;
//   - a parse failure is logged and recorded as a result carrying only the
//     file name and error text; it does not fail the test.
//
// Whether OCR is skipped is read from BATCH_SKIP_OCR here; the variant
// parameter is currently not used by this function.
func processPDFs(t *testing.T, pdfDir string, pdfs []string, deepDoc DocAnalyzer, variant string, dirs outputDirs) []tools.BatchResult {
	t.Helper()

	const clock = "15:04:05"
	skipOCR := os.Getenv("BATCH_SKIP_OCR") == "1"

	var (
		results    []tools.BatchResult
		totalChars int
	)
	for idx, name := range pdfs {
		label := fmt.Sprintf("[%d/%d] %s", idx+1, len(pdfs), name)

		// Reuse results from an earlier run when both output files exist.
		if hit := tryLoadCached(dirs, name); hit != nil {
			results = append(results, *hit)
			totalChars += hit.TextLen
			t.Logf("%s %s — SKIP (cached, %d chars, %d sections)",
				time.Now().Format(clock), label, hit.TextLen, hit.Sections)
			continue
		}

		one, err := parseOne(pdfDir, name, deepDoc, skipOCR)
		if err != nil {
			t.Logf("%s — %v", label, err)
			results = append(results, tools.BatchResult{File: name, Error: err.Error()})
			continue
		}

		writeOutputs(dirs, name, &one.result, one)
		results = append(results, one.BatchResult)
		totalChars += one.TextLen
		t.Logf("%s %s — chars=%d boxes:%d→%d→%d→%d text=%d (%.1fs)",
			time.Now().Format(clock), label, one.Chars,
			one.BoxesInitial, one.BoxesTextMerg, one.BoxesVertMerg, one.Sections,
			one.TextLen, one.TimeS)
	}

	t.Logf("\nDone. %d PDFs, %d chars. Output: %s/", len(results), totalChars, dirs.text)
	return results
}
// parseOneResult is what parseOne returns for a single PDF: the summary
// statistics together with the full parser output they were computed from.
type parseOneResult struct {
// Embedded so the summary fields (File, Chars, TextLen, ...) are promoted
// and can be read directly on a parseOneResult.
tools.BatchResult
// result is a copy of the ParseResult returned by Parse; writeOutputs
// serializes its sections, tables and debug data.
result ParseResult
}
// parseOne reads pdfDir/name, opens it with NewEngine and runs a parser
// built from DefaultParserConfig (with SkipOCR set from skipOCR) and deepDoc.
//
// On success it returns the parse result together with summary statistics:
// page count, extracted character count, the box counts reported in the
// parser metrics, number of sections, total rune count of all section texts,
// and the wall-clock duration of Parse in seconds rounded to two decimals.
//
// Errors from reading the file, creating the engine, or parsing are wrapped
// with the prefixes "read:", "engine:" and "parse:" respectively.
func parseOne(pdfDir, name string, deepDoc DocAnalyzer, skipOCR bool) (*parseOneResult, error) {
	raw, err := os.ReadFile(filepath.Join(pdfDir, name))
	if err != nil {
		return nil, fmt.Errorf("read: %w", err)
	}

	eng, err := NewEngine(raw)
	if err != nil {
		return nil, fmt.Errorf("engine: %w", err)
	}
	defer eng.Close()

	// Statistics gathered straight from the engine, before parsing.
	// The PageCount error is ignored; the returned count is recorded as-is.
	pages, _ := eng.PageCount()
	// Only the character total is kept; the box total is discarded.
	glyphs, _ := extractPageStats(eng)

	cfg := DefaultParserConfig()
	cfg.SkipOCR = skipOCR
	pdfParser := NewParser(cfg, deepDoc)

	// Only the Parse call itself is timed.
	started := time.Now()
	parsed, err := pdfParser.Parse(context.Background(), eng)
	seconds := time.Since(started).Seconds()
	if err != nil {
		return nil, fmt.Errorf("parse: %w", err)
	}

	out := &parseOneResult{result: *parsed}
	out.File = name
	out.Pages = pages
	out.Chars = glyphs
	out.BoxesInitial = parsed.Metrics.BoxesInitial
	out.BoxesTextMerg = parsed.Metrics.BoxesTextMerge
	out.BoxesVertMerg = parsed.Metrics.BoxesVertMerge
	out.Sections = len(parsed.Sections)
	for _, sec := range parsed.Sections {
		out.TextLen += utf8.RuneCountInString(sec.Text)
	}
	out.TimeS = math.Round(seconds*100) / 100
	return out, nil
}
// tryLoadCached returns the result stored by an earlier run for the PDF
// called name, or nil when there is nothing usable.
//
// A cached result is only accepted when both the text file and the tables
// file exist, the text file can be read, it contains a "\n#@meta" marker,
// and the bytes after the last marker decode as JSON into a
// tools.BatchResult. A meta line with no newline before it (i.e. at the very
// start of the file) is therefore not recognised.
func tryLoadCached(dirs outputDirs, name string) *tools.BatchResult {
	const marker = "\n#@meta"

	textPath := filepath.Join(dirs.text, name+".txt")
	tablesPath := filepath.Join(dirs.tables, name+".json")
	if !(tools.FileExists(textPath) && tools.FileExists(tablesPath)) {
		return nil
	}

	data, err := os.ReadFile(textPath)
	if err != nil {
		return nil
	}
	at := strings.LastIndex(string(data), marker)
	if at < 0 {
		return nil
	}

	cached := tools.BatchResult{File: name}
	if err := json.Unmarshal(data[at+len(marker):], &cached); err != nil {
		return nil
	}
	// TextLen is recomputed from the text-only portion of the file (everything
	// before the #@meta line) rather than taken from the stored JSON.
	cached.TextLen = textLenFromOutput(data)
	return &cached
}
// Patterns used by htmlToRows, compiled once at package initialisation
// instead of on every call.
//
// NOTE(review): `.` does not match newlines in these patterns, so a row or
// cell that spans several lines is not captured — confirm this matches the
// Python reference implementation.
var (
	htmlToRowsRowRe  = regexp.MustCompile(`<tr>(.*?)</tr>`)
	htmlToRowsCellRe = regexp.MustCompile(`<t[dh][^>]*>(.*?)</t[dh]>`)
)

// htmlToRows extracts cell text rows from an HTML <table> string,
// matching Python's html_to_rows in dump_py_results.py.
//
// Every "<tr>…</tr>" span yields one row; within it, every <td> or <th>
// element (attributes allowed) yields one cell holding the element's raw
// inner text. A row without cells is returned as a nil slice, and input
// without any rows yields nil.
func htmlToRows(html string) [][]string {
	var rows [][]string
	for _, tr := range htmlToRowsRowRe.FindAllStringSubmatch(html, -1) {
		var cells []string
		for _, cell := range htmlToRowsCellRe.FindAllStringSubmatch(tr[1], -1) {
			cells = append(cells, cell[1])
		}
		rows = append(rows, cells)
	}
	return rows
}
// writeOutputs persists everything produced for one PDF:
//
//   - dirs.text/{name}.txt     — each section's text on its own line, followed
//     by a "#@meta" line holding res.BatchResult as JSON
//   - dirs.tables/{name}.json  — rows and positions of every parsed table
//   - dirs.dla/{name}.json     — parsed.DLADebug, only when non-nil
//   - dirs.tsrRaw/{name}.json  — parsed.TSRDebug, only when non-nil
//
// parsed is the parser output and res the summary for the same PDF (the
// caller in this file passes &res.result as parsed).
//
// The function has no error return, so encoding and write failures are
// reported through slog rather than being discarded; the remaining outputs
// are still attempted.
func writeOutputs(dirs outputDirs, name string, parsed *ParseResult, res *parseOneResult) {
	writeFile := func(path string, data []byte) {
		if err := os.WriteFile(path, data, 0o644); err != nil {
			slog.Error("writing batch output", "path", path, "err", err)
		}
	}
	writeJSON := func(path string, v any) {
		b, err := json.MarshalIndent(v, "", " ")
		if err != nil {
			slog.Error("encoding batch output", "path", path, "err", err)
			return
		}
		writeFile(path, b)
	}

	// ── text + #@meta ──
	textPath := filepath.Join(dirs.text, name+".txt")
	var sb strings.Builder
	for _, s := range parsed.Sections {
		sb.WriteString(s.Text)
		sb.WriteByte('\n')
	}
	if meta, err := json.Marshal(res.BatchResult); err != nil {
		// Without the meta line the file will not be accepted as a cache hit later.
		slog.Error("encoding batch meta", "path", textPath, "err", err)
	} else {
		sb.WriteString("#@meta")
		sb.Write(meta)
		sb.WriteByte('\n')
	}
	writeFile(textPath, []byte(sb.String()))

	// ── tables JSON — extract rows from section HTML (matching Python html_to_rows) ──
	type slimTable struct {
		Rows      [][]string `json:"rows"`
		Positions []Position `json:"positions,omitempty"`
	}
	// Collect all table sections in order (index-matched to parsed.Tables).
	var tableSections []Section
	for _, s := range parsed.Sections {
		if s.LayoutType == "table" && strings.HasPrefix(s.Text, "<table>") {
			tableSections = append(tableSections, s)
		}
	}
	slim := make([]slimTable, len(parsed.Tables))
	for j, tbl := range parsed.Tables {
		slim[j].Rows = tbl.Rows
		slim[j].Positions = tbl.Positions
		// Fallback: a table without rows takes them from the HTML of the
		// table section at the same index, when there is one.
		if len(slim[j].Rows) == 0 && j < len(tableSections) {
			slim[j].Rows = htmlToRows(tableSections[j].Text)
		}
	}
	writeJSON(filepath.Join(dirs.tables, name+".json"), slim)

	// ── DLA + TSR debug intermediates ──
	if parsed.DLADebug != nil {
		writeJSON(filepath.Join(dirs.dla, name+".json"), parsed.DLADebug)
	}
	if parsed.TSRDebug != nil {
		writeJSON(filepath.Join(dirs.tsrRaw, name+".json"), parsed.TSRDebug)
	}
}