ragflow/internal/deepdoc/parser/pdf/generate_test.go

//go:build cgo && manual

package parser

import (
	"context"
	"encoding/json"
	"fmt"
	"log/slog"
	"math"
	"os"
	"path/filepath"
	"ragflow/internal/deepdoc/parser/pdf/tools"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"testing"
	"time"
	"unicode/utf8"
)

// TestBatchResults parses the PDFs under testdata/real_pdfs with Parse() and
// writes, per PDF:
//
//	output/go/{variant}/text/{pdf}.txt     — per-section text + #@meta
//	output/go/{variant}/tables/{pdf}.json  — table cells
//	output/go/{variant}/dla/{pdf}.json     — DLA regions (debug)
//	output/go/{variant}/tsr_raw/{pdf}.json — TSR raw cells (debug)
//
// The DeepDoc service is mandatory (DLA+TSR are inseparable from the
// pipeline); the test fails immediately when its health check does not pass.
//
// Environment:
//
//	DEEPDOC_URL         passed to NewDeepDocClient
//	BATCH_SKIP_OCR=1    skip image OCR (DLA+TSR kept); selects the "noocr" variant
//	BATCH_COUNT=N       limit to first N PDFs (by file size, smallest first)
//	BATCH_SINGLE=name   process exactly one PDF (full filename)
//	BATCH_LOG_LEVEL     "debug" or "warn"; anything else logs at info
//
// For read-only comparison, see compare_test.go (no CGO needed).
func TestBatchResults(t *testing.T) {
	setupLogger()

	pdfDir := filepath.Join("testdata", "real_pdfs")
	candidates := listRealPDFs(t, pdfDir)

	// BATCH_SINGLE wins over BATCH_COUNT: the list collapses to one entry.
	limit := countFromEnv("BATCH_COUNT", len(candidates))
	if only := os.Getenv("BATCH_SINGLE"); only != "" {
		candidates = filterSingle(candidates, only, t)
		limit = 1
	}
	pdfs := candidates[:min(limit, len(candidates))]

	ddClient, err := NewDeepDocClient(os.Getenv("DEEPDOC_URL"))
	if err != nil {
		t.Fatal(err)
	}
	if !ddClient.Health() {
		t.Fatalf("DeepDoc service not available at %s (DLA+TSR required)", ddClient.baseURL)
	}
	deepDoc := DocAnalyzer(ddClient)

	variant := variantFromEnv()
	ocrNote := ", OCR enabled"
	if variant == "noocr" {
		ocrNote = ", image OCR skipped"
	}
	t.Logf("DeepDoc available — DLA+TSR%s enabled (%d PDFs)", ocrNote, len(pdfs))

	processPDFs(t, pdfDir, pdfs, deepDoc, variant, mkOutputDirs(variant))
}

// ── helpers ─────────────────────────────────────────────────────────

// setupLogger installs a text slog handler on stderr as the process default.
// BATCH_LOG_LEVEL selects the level: "debug" or "warn"; any other value
// (including unset) leaves it at info.
func setupLogger() {
	byName := map[string]slog.Level{
		"debug": slog.LevelDebug,
		"warn":  slog.LevelWarn,
	}
	lvl, known := byName[os.Getenv("BATCH_LOG_LEVEL")]
	if !known {
		lvl = slog.LevelInfo
	}
	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: lvl})
	slog.SetDefault(slog.New(handler))
}

// variantFromEnv names the output variant: "noocr" when BATCH_SKIP_OCR is
// exactly "1", "ocr" for every other value (including unset).
func variantFromEnv() string {
	variant := "ocr"
	if skip := os.Getenv("BATCH_SKIP_OCR"); skip == "1" {
		variant = "noocr"
	}
	return variant
}

// outputDirs holds the four per-variant output directories written by the
// batch run: section text, table JSON, and the DLA / raw-TSR debug dumps.
type outputDirs struct {
	text, tables, dla, tsrRaw string
}

// mkOutputDirs builds the output directory paths for variant under
// testdata/output/go/{variant}/ and creates each of them (with parents).
//
// Creation is best-effort: a failure is logged through slog and the paths are
// still returned, so the caller's signature stays error-free while a broken
// output location no longer goes unnoticed.
func mkOutputDirs(variant string) outputDirs {
	base := filepath.Join("testdata", "output", "go", variant)
	d := outputDirs{
		text:   filepath.Join(base, "text"),
		tables: filepath.Join(base, "tables"),
		dla:    filepath.Join(base, "dla"),
		tsrRaw: filepath.Join(base, "tsr_raw"),
	}
	for _, dir := range []string{d.text, d.tables, d.dla, d.tsrRaw} {
		if err := os.MkdirAll(dir, 0755); err != nil {
			slog.Error("batch output: cannot create directory", "dir", dir, "err", err)
		}
	}
	return d
}

// countFromEnv reads a positive integer limit from the environment variable
// key. The parsed value is returned only when it is strictly between 0 and
// ceiling; an unset, malformed, non-positive or too-large value yields ceiling.
func countFromEnv(key string, ceiling int) int {
	raw := os.Getenv(key)
	if raw == "" {
		return ceiling
	}
	limit, err := strconv.Atoi(raw)
	switch {
	case err != nil, limit <= 0, limit >= ceiling:
		return ceiling
	}
	return limit
}

// listRealPDFs returns the names of the regular entries in dir whose name ends
// in ".pdf" (case-insensitive), ordered by file size, smallest first, so that
// small PDFs give fast feedback. Files of equal size are ordered by name,
// which keeps the BATCH_COUNT selection deterministic.
//
// Each file is stat'ed once up front rather than inside the sort comparator.
// A file whose size cannot be read sorts before all others (by name).
// The test is failed via t.Fatal when dir cannot be read.
func listRealPDFs(t *testing.T, dir string) []string {
	t.Helper()
	entries, err := os.ReadDir(dir)
	if err != nil {
		t.Fatal(err)
	}

	type sizedPDF struct {
		name string
		size int64 // -1 when the stat failed
	}
	var found []sizedPDF
	for _, e := range entries {
		if e.IsDir() || !strings.HasSuffix(strings.ToLower(e.Name()), ".pdf") {
			continue
		}
		size := int64(-1)
		// os.Stat (not e.Info) so that symlinked PDFs report the target's size.
		if info, statErr := os.Stat(filepath.Join(dir, e.Name())); statErr == nil {
			size = info.Size()
		}
		found = append(found, sizedPDF{name: e.Name(), size: size})
	}

	sort.Slice(found, func(i, j int) bool {
		if found[i].size != found[j].size {
			return found[i].size < found[j].size
		}
		return found[i].name < found[j].name
	})

	var pdfs []string
	for _, f := range found {
		pdfs = append(pdfs, f.name)
	}
	return pdfs
}

// filterSingle narrows pdfs to the one entry equal to name and returns it as
// a fresh single-element slice. When no entry matches, the test is aborted
// with t.Fatalf.
func filterSingle(pdfs []string, name string, t *testing.T) []string {
	t.Helper()
	for i := range pdfs {
		if pdfs[i] != name {
			continue
		}
		return []string{pdfs[i]}
	}
	t.Fatalf("BATCH_SINGLE: %s not found in real_pdfs/", name)
	return nil
}

// extractPageStats walks every page of eng and totals the extracted
// characters (chars) and the boxes that charsToBoxes builds from them (boxes).
// Pages whose character extraction fails contribute nothing; a PageCount
// error is ignored and the reported count is used as-is.
func extractPageStats(eng PDFEngine) (chars, boxes int) {
	pageTotal, _ := eng.PageCount()
	for page := 0; page < pageTotal; page++ {
		glyphs, err := eng.ExtractChars(page)
		if err == nil {
			chars += len(glyphs)
			boxes += len(charsToBoxes(glyphs, page, false))
		}
	}
	return chars, boxes
}

// textLenFromOutput counts the runes of a text output file, excluding
// everything from the last "\n#@meta" marker onward. When the marker is
// absent the whole input is counted.
func textLenFromOutput(data []byte) int {
	text := string(data)
	cut := strings.LastIndex(text, "\n#@meta")
	if cut < 0 {
		return utf8.RuneCountInString(text)
	}
	return utf8.RuneCountInString(text[:cut])
}

// ── main processing loop ────────────────────────────────────────────

// processPDFs handles each file name in pdfs (relative to pdfDir) in order and
// returns one BatchResult per PDF:
//
//   - when outputs for the PDF already exist in dirs, the cached metadata is
//     reused and the PDF is not parsed again;
//   - otherwise the PDF is parsed and its outputs are written to dirs;
//   - a PDF that fails to parse is recorded with only File and Error set, and
//     processing continues with the next one.
//
// variant selects the parse mode: "noocr" skips image OCR, anything else keeps
// it. Deriving the switch from variant (instead of re-reading BATCH_SKIP_OCR)
// keeps the parse mode in step with the variant whose directories are passed
// in dirs.
func processPDFs(t *testing.T, pdfDir string, pdfs []string, deepDoc DocAnalyzer, variant string, dirs outputDirs) []tools.BatchResult {
	t.Helper()
	var results []tools.BatchResult
	totalChars := 0
	skipOCR := variant == "noocr"

	for i, name := range pdfs {
		label := fmt.Sprintf("[%d/%d] %s", i+1, len(pdfs), name)

		// ── cached? ──
		if cached := tryLoadCached(dirs, name); cached != nil {
			results = append(results, *cached)
			totalChars += cached.TextLen
			t.Logf("%s %s — SKIP (cached, %d chars, %d sections)",
				time.Now().Format("15:04:05"), label, cached.TextLen, cached.Sections)
			continue
		}

		// ── parse ──
		res, err := parseOne(pdfDir, name, deepDoc, skipOCR)
		if err != nil {
			results = append(results, tools.BatchResult{File: name, Error: err.Error()})
			t.Logf("%s — %v", label, err)
			continue
		}

		writeOutputs(dirs, name, &res.result, res)
		results = append(results, res.BatchResult)
		totalChars += res.TextLen

		t.Logf("%s %s — chars=%d boxes:%d→%d→%d→%d text=%d (%.1fs)",
			time.Now().Format("15:04:05"), label, res.Chars,
			res.BoxesInitial, res.BoxesTextMerg, res.BoxesVertMerg, res.Sections,
			res.TextLen, res.TimeS)
	}

	t.Logf("\nDone. %d PDFs, %d chars. Output: %s/", len(results), totalChars, dirs.text)
	return results
}

// parseOneResult is what parseOne returns for a successfully parsed PDF: the
// summary metrics plus the parser output they were computed from.
type parseOneResult struct {
	// Embedded summary row; writeOutputs serializes it after the "#@meta" marker.
	tools.BatchResult
	// result is a copy of the value returned by Parse.
	result ParseResult
}

// parseOne reads pdfDir/name, opens it with NewEngine, and runs the parser
// over it with image OCR disabled when skipOCR is set.
//
// The returned summary carries the page count, the raw character count from
// extractPageStats, the box counts reported in the parse metrics, the number
// of sections, the total rune count of all section texts, and the Parse wall
// time in seconds rounded to two decimals (parser construction is not timed).
//
// Errors are wrapped with the failing stage: "read", "engine" or "parse".
// Errors from PageCount are ignored.
func parseOne(pdfDir, name string, deepDoc DocAnalyzer, skipOCR bool) (*parseOneResult, error) {
	raw, err := os.ReadFile(filepath.Join(pdfDir, name))
	if err != nil {
		return nil, fmt.Errorf("read: %w", err)
	}

	eng, err := NewEngine(raw)
	if err != nil {
		return nil, fmt.Errorf("engine: %w", err)
	}
	defer eng.Close()

	// Pre-parse statistics are gathered before Parse touches the engine.
	pages, _ := eng.PageCount()
	charCount, _ := extractPageStats(eng)

	cfg := DefaultParserConfig()
	cfg.SkipOCR = skipOCR
	parser := NewParser(cfg, deepDoc)

	started := time.Now()
	parsed, err := parser.Parse(context.Background(), eng)
	seconds := time.Since(started).Seconds()
	if err != nil {
		return nil, fmt.Errorf("parse: %w", err)
	}

	runes := 0
	for i := range parsed.Sections {
		runes += utf8.RuneCountInString(parsed.Sections[i].Text)
	}

	summary := tools.BatchResult{
		File:          name,
		Pages:         pages,
		Chars:         charCount,
		BoxesInitial:  parsed.Metrics.BoxesInitial,
		BoxesTextMerg: parsed.Metrics.BoxesTextMerge,
		BoxesVertMerg: parsed.Metrics.BoxesVertMerge,
		Sections:      len(parsed.Sections),
		TextLen:       runes,
		TimeS:         math.Round(seconds*100) / 100,
	}
	return &parseOneResult{BatchResult: summary, result: *parsed}, nil
}

// tryLoadCached returns the BatchResult stored in a previous run's text
// output for name, or nil when there is nothing usable: the text or tables
// file is missing, the text file cannot be read, it has no "\n#@meta" marker,
// or the JSON after the last marker does not decode.
//
// File is pre-set to name before decoding, and TextLen is always recomputed
// from the text portion of the file (the #@meta line is excluded).
func tryLoadCached(dirs outputDirs, name string) *tools.BatchResult {
	const metaMarker = "\n#@meta"

	textPath := filepath.Join(dirs.text, name+".txt")
	if !tools.FileExists(textPath) {
		return nil
	}
	if !tools.FileExists(filepath.Join(dirs.tables, name+".json")) {
		return nil
	}

	raw, err := os.ReadFile(textPath)
	if err != nil {
		return nil
	}
	at := strings.LastIndex(string(raw), metaMarker)
	if at < 0 {
		return nil
	}

	cached := tools.BatchResult{File: name}
	if err := json.Unmarshal(raw[at+len(metaMarker):], &cached); err != nil {
		return nil
	}
	cached.TextLen = textLenFromOutput(raw)
	return &cached
}

// Patterns used by htmlToRows, compiled once instead of on every call.
var (
	// htmlRowRe captures the content of each attribute-less <tr>…</tr> pair.
	htmlRowRe = regexp.MustCompile(`<tr>(.*?)</tr>`)
	// htmlCellRe captures the content of each <td>/<th> cell (attributes allowed).
	htmlCellRe = regexp.MustCompile(`<t[dh][^>]*>(.*?)</t[dh]>`)
)

// htmlToRows extracts cell text rows from an HTML <table> string,
// matching Python's html_to_rows in dump_py_results.py.
//
// One entry is returned per <tr>…</tr> match, holding the raw inner text of
// its cells (nested markup is not stripped); a row without cells yields a nil
// entry. Because "." does not match a newline, a row or cell that spans
// several lines is not matched. The result is nil when no row is found.
func htmlToRows(html string) [][]string {
	var rows [][]string
	for _, tr := range htmlRowRe.FindAllStringSubmatch(html, -1) {
		var cells []string
		for _, m := range htmlCellRe.FindAllStringSubmatch(tr[1], -1) {
			cells = append(cells, m[1])
		}
		rows = append(rows, cells)
	}
	return rows
}

// writeOutputs writes the output files of one parsed PDF into dirs:
//
//   - text/{name}.txt: every section text followed by a newline, then a
//     "#@meta" line carrying res.BatchResult as JSON;
//   - tables/{name}.json: rows and positions of every parsed table;
//   - dla/{name}.json and tsr_raw/{name}.json: debug dumps, only when
//     parsed.DLADebug / parsed.TSRDebug are non-nil.
//
// Writing is best-effort: a failed marshal or write is logged through slog and
// the remaining outputs are still attempted, so a broken output location is
// visible in the log instead of silently producing no files.
func writeOutputs(dirs outputDirs, name string, parsed *ParseResult, res *parseOneResult) {
	// save writes one file and logs, rather than drops, a failure.
	save := func(path string, data []byte) {
		if err := os.WriteFile(path, data, 0644); err != nil {
			slog.Error("batch output: write failed", "path", path, "err", err)
		}
	}
	// saveJSON writes v as indented JSON; nothing is written when encoding fails.
	saveJSON := func(path string, v any) {
		b, err := json.MarshalIndent(v, "", "  ")
		if err != nil {
			slog.Error("batch output: marshal failed", "path", path, "err", err)
			return
		}
		save(path, b)
	}

	// ── text + #@meta ──
	textPath := filepath.Join(dirs.text, name+".txt")
	var sb strings.Builder
	for _, s := range parsed.Sections {
		sb.WriteString(s.Text)
		sb.WriteByte('\n')
	}
	if meta, err := json.Marshal(res.BatchResult); err != nil {
		// The text is still written, just without its metadata line.
		slog.Error("batch output: marshal failed", "path", textPath, "err", err)
	} else {
		sb.WriteString("#@meta")
		sb.Write(meta)
		sb.WriteByte('\n')
	}
	save(textPath, []byte(sb.String()))

	// ── tables JSON — extract rows from section HTML (matching Python html_to_rows) ──
	type slimTable struct {
		Rows      [][]string `json:"rows"`
		Positions []Position `json:"positions,omitempty"`
	}
	// Collect all table sections in order. They are paired with parsed.Tables
	// by index below — this assumes both lists share the same order.
	var tableSections []Section
	for _, s := range parsed.Sections {
		if s.LayoutType == "table" && strings.HasPrefix(s.Text, "<table>") {
			tableSections = append(tableSections, s)
		}
	}
	slim := make([]slimTable, len(parsed.Tables))
	for j, tbl := range parsed.Tables {
		slim[j].Rows = tbl.Rows
		slim[j].Positions = tbl.Positions
		// Fallback: extract rows from section HTML (index-matched).
		if len(slim[j].Rows) == 0 && j < len(tableSections) {
			slim[j].Rows = htmlToRows(tableSections[j].Text)
		}
	}
	saveJSON(filepath.Join(dirs.tables, name+".json"), slim)

	// ── DLA + TSR debug intermediates ──
	if parsed.DLADebug != nil {
		saveJSON(filepath.Join(dirs.dla, name+".json"), parsed.DLADebug)
	}
	if parsed.TSRDebug != nil {
		saveJSON(filepath.Join(dirs.tsrRaw, name+".json"), parsed.TSRDebug)
	}
}