Files
ragflow/internal/deepdoc/parser/pdf/deepdoc_integration_test.go
Jack 304d9e02bb Refactor: migrate pdf_parser.py to golang (#16323)
### What problem does this PR solve?

Http API based on onnx model.
pdf_parser.py to golang

### Type of change

- [x] Refactoring
2026-06-25 20:16:16 +08:00

765 lines
22 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//go:build cgo && integration
package parser
import (
"bytes"
"context"
"encoding/base64"
"encoding/json"
"image"
_ "image/png"
"os"
"path/filepath"
"strings"
"testing"
)
// ── helpers ────────────────────────────────────────────────────────────────
// mustConnectDeepDoc returns a DeepDocClient; skips the test if unavailable.
func mustConnectDeepDoc(t *testing.T) *DeepDocClient {
t.Helper()
url := os.Getenv("DEEPDOC_URL")
if url == "" {
url = "http://localhost:9390"
}
client, err := NewDeepDocClient(url)
if err != nil {
t.Fatal(err)
}
if !client.Health() {
t.Fatalf("DeepDoc not available at %s", url)
}
return client
}
// mustOpenEngine opens a PDF from testdata/pdfs/ and returns a PDFEngine.
func mustOpenEngine(t *testing.T, name string) PDFEngine {
t.Helper()
pdfPath := filepath.Join("testdata", "pdfs", name)
data, err := os.ReadFile(pdfPath)
if err != nil {
t.Fatalf("read fixture %s: %v", name, err)
}
eng, err := NewEngine(data)
if err != nil {
t.Fatalf("open engine %s: %v", name, err)
}
return eng
}
// ── golden-file helpers ────────────────────────────────────────────────────
// sectionGolden is the snapshot format for section output.
type sectionGolden struct {
Text string `json:"text"`
LayoutType string `json:"layout_type"`
}
// tableGolden is the snapshot format for table output.
type tableGolden struct {
Rows [][]string `json:"rows"`
}
func goldenPath(name string) string {
return filepath.Join("testdata", "integration", name)
}
func readGolden[T any](t *testing.T, path string) []T {
t.Helper()
data, err := os.ReadFile(path)
if err != nil {
t.Fatalf("read golden %s: %v", path, err)
}
var result []T
if err := json.Unmarshal(data, &result); err != nil {
t.Fatalf("parse golden %s: %v", path, err)
}
return result
}
func writeGolden(t *testing.T, path string, v any) {
t.Helper()
dir := filepath.Dir(path)
if err := os.MkdirAll(dir, 0755); err != nil {
t.Fatalf("mkdir %s: %v", dir, err)
}
f, err := os.Create(path)
if err != nil {
t.Fatalf("create golden %s: %v", path, err)
}
defer f.Close()
enc := json.NewEncoder(f)
enc.SetIndent("", " ")
if err := enc.Encode(v); err != nil {
t.Fatalf("write golden %s: %v", path, err)
}
}
func updateGolden() bool {
return os.Getenv("UPDATE_GOLDEN") == "1"
}
// sectionsToGolden converts []Section to the snapshot format.
func sectionsToGolden(sections []Section) []sectionGolden {
result := make([]sectionGolden, len(sections))
for i, s := range sections {
result[i] = sectionGolden{
Text: s.Text,
LayoutType: s.LayoutType,
}
}
return result
}
// tablesToGolden converts []TableItem to the snapshot format.
func tablesToGolden(tables []TableItem) []tableGolden {
result := make([]tableGolden, len(tables))
for i, t := range tables {
result[i] = tableGolden{Rows: t.Rows}
}
return result
}
// ── tests ──────────────────────────────────────────────────────────────────
// TestIntegration_SectionsText verifies section text output matches golden.
func TestIntegration_SectionsText(t *testing.T) {
client := mustConnectDeepDoc(t)
eng := mustOpenEngine(t, "01_english_simple.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
if len(result.Sections) == 0 {
t.Fatal("expected at least one section")
}
golden := goldenPath("01_english_simple.sections.json")
got := sectionsToGolden(result.Sections)
if updateGolden() {
writeGolden(t, golden, got)
t.Logf("golden written: %s (%d sections)", golden, len(got))
return
}
expected := readGolden[sectionGolden](t, golden)
if len(expected) != len(got) {
t.Errorf("section count mismatch: golden=%d got=%d", len(expected), len(got))
}
n := len(expected)
if len(got) < n {
n = len(got)
}
for i := 0; i < n; i++ {
if expected[i].Text != got[i].Text {
t.Errorf("section[%d] text mismatch:\n golden: %q\n got: %q", i, expected[i].Text, got[i].Text)
}
if expected[i].LayoutType != got[i].LayoutType {
t.Errorf("section[%d] layout_type mismatch: golden=%q got=%q",
i, expected[i].LayoutType, got[i].LayoutType)
}
}
}
// TestIntegration_SectionsCount verifies section count is stable.
func TestIntegration_SectionsCount(t *testing.T) {
client := mustConnectDeepDoc(t)
eng := mustOpenEngine(t, "01_english_simple.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
// Read back from golden to get expected count.
golden := goldenPath("01_english_simple.sections.json")
expected := readGolden[sectionGolden](t, golden)
if len(result.Sections) != len(expected) {
// Log section layout types to help debug divergence.
var types []string
for _, s := range result.Sections {
types = append(types, s.LayoutType)
}
t.Errorf("section count: golden=%d got=%d (types: %v)", len(expected), len(result.Sections), types)
}
}
// TestIntegration_TableStructure verifies table rows and cell text match golden.
func TestIntegration_TableStructure(t *testing.T) {
client := mustConnectDeepDoc(t)
eng := mustOpenEngine(t, "06_table_content.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
if len(result.Tables) == 0 {
t.Skip("DLA did not detect any tables in fixture — skipping table structure check")
}
golden := goldenPath("06_table_content.tables.json")
got := tablesToGolden(result.Tables)
if updateGolden() {
writeGolden(t, golden, got)
t.Logf("golden written: %s (%d tables)", golden, len(got))
return
}
expected := readGolden[tableGolden](t, golden)
if len(expected) != len(got) {
t.Errorf("table count mismatch: golden=%d got=%d", len(expected), len(got))
}
n := len(expected)
if len(got) < n {
n = len(got)
}
for i := 0; i < n; i++ {
if len(expected[i].Rows) != len(got[i].Rows) {
t.Errorf("table[%d] row count mismatch: golden=%d got=%d", i, len(expected[i].Rows), len(got[i].Rows))
continue
}
for ri := 0; ri < len(expected[i].Rows); ri++ {
if len(expected[i].Rows[ri]) != len(got[i].Rows[ri]) {
t.Errorf("table[%d] row[%d] cell count mismatch: golden=%d got=%d", i, ri, len(expected[i].Rows[ri]), len(got[i].Rows[ri]))
continue
}
for ci := 0; ci < len(expected[i].Rows[ri]); ci++ {
goldenCell := strings.TrimSpace(expected[i].Rows[ri][ci])
gotCell := strings.TrimSpace(got[i].Rows[ri][ci])
if goldenCell != gotCell {
t.Errorf("table[%d] row[%d] cell[%d] mismatch:\n golden: %q\n got: %q",
i, ri, ci, goldenCell, gotCell)
}
}
}
}
}
// TestIntegration_TableImageB64 verifies table ImageB64 is valid base64 PNG.
func TestIntegration_TableImageB64(t *testing.T) {
client := mustConnectDeepDoc(t)
eng := mustOpenEngine(t, "06_table_content.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
if len(result.Tables) == 0 {
t.Skip("DLA did not detect any tables in fixture — skipping image check")
}
for i, tbl := range result.Tables {
if tbl.ImageB64 == "" {
t.Errorf("table[%d] ImageB64 is empty", i)
continue
}
// Verify base64 decodable.
raw, err := base64.StdEncoding.DecodeString(tbl.ImageB64)
if err != nil {
t.Errorf("table[%d] ImageB64: not valid base64: %v", i, err)
continue
}
// Verify it's a valid image.
img, _, err := image.Decode(bytes.NewReader(raw))
if err != nil {
t.Errorf("table[%d] ImageB64: not a valid image: %v", i, err)
continue
}
b := img.Bounds()
if b.Dx() <= 0 || b.Dy() <= 0 {
t.Errorf("table[%d] ImageB64: zero-size image %dx%d", i, b.Dx(), b.Dy())
}
}
}
// TestIntegration_LayoutTypes verifies DLA labels boxes with expected types.
func TestIntegration_LayoutTypes(t *testing.T) {
client := mustConnectDeepDoc(t)
eng := mustOpenEngine(t, "06_table_content.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
golden := goldenPath("06_table_content.layouts.json")
got := sectionsToGolden(result.Sections)
if updateGolden() {
writeGolden(t, golden, got)
t.Logf("golden written: %s (%d sections)", golden, len(got))
return
}
expected := readGolden[sectionGolden](t, golden)
if len(expected) != len(got) {
t.Errorf("section count mismatch: golden=%d got=%d", len(expected), len(got))
}
// Count layout types on both sides.
goldenTypes := map[string]int{}
gotTypes := map[string]int{}
for _, s := range expected {
goldenTypes[s.LayoutType]++
}
for _, s := range got {
gotTypes[s.LayoutType]++
}
for typ, gc := range goldenTypes {
if gotTypes[typ] != gc {
t.Errorf("LayoutType %q count mismatch: golden=%d got=%d", typ, gc, gotTypes[typ])
}
}
for typ, gc := range gotTypes {
if goldenTypes[typ] == 0 {
t.Errorf("LayoutType %q count mismatch: golden=0 got=%d", typ, gc)
}
}
}
// ── Idempotency tests ─────────────────────────────────────────────────
// TestIntegration_Idempotency verifies that DeepDoc APIs return consistent
// results when called multiple times with the same image. This validates
// that the ML inference is deterministic (or at least semantically stable).
func TestIntegration_Idempotency(t *testing.T) {
client := mustConnectDeepDoc(t)
// Render a fixture page as the stable input image.
eng := mustOpenEngine(t, "06_table_content.pdf")
defer eng.Close()
pageImg, err := eng.RenderPageImage(0, 216)
if err != nil {
t.Fatalf("render page: %v", err)
}
const N = 5
t.Run("DLA", func(t *testing.T) {
var all [][]DLARegion
for i := 0; i < N; i++ {
regions, err := client.DLA(context.Background(), pageImg)
if err != nil {
t.Fatalf("run %d: %v", i, err)
}
all = append(all, regions)
}
checkDLAIdempotent(t, all)
})
t.Run("TSR", func(t *testing.T) {
// Crop a table region from the page for TSR input.
// Use a fixed crop area (approximate table location in 06_table_content.pdf).
cropped := cropImageRect(pageImg, 50, 200, 550, 400)
var all [][]TSRCell
for i := 0; i < N; i++ {
cells, err := client.TSR(context.Background(), cropped)
if err != nil {
t.Fatalf("run %d: %v", i, err)
}
all = append(all, cells)
}
checkTSRIdempotent(t, all)
})
t.Run("OCRDetect", func(t *testing.T) {
var all [][]OCRBox
for i := 0; i < N; i++ {
boxes, err := client.OCRDetect(context.Background(), pageImg)
if err != nil {
t.Fatalf("run %d: %v", i, err)
}
all = append(all, boxes)
}
checkOCRDetectIdempotent(t, all)
})
t.Run("OCRRecognize", func(t *testing.T) {
cropped := cropImageRect(pageImg, 50, 100, 400, 130)
var all [][]OCRText
for i := 0; i < N; i++ {
texts, err := client.OCRRecognize(context.Background(), cropped)
if err != nil {
t.Fatalf("run %d: %v", i, err)
}
all = append(all, texts)
}
checkOCRRecognizeIdempotent(t, all)
})
}
// cropImageRect crops a rectangular region from an image.
func cropImageRect(img image.Image, x0, y0, x1, y1 int) image.Image {
b := img.Bounds()
if x0 < b.Min.X {
x0 = b.Min.X
}
if y0 < b.Min.Y {
y0 = b.Min.Y
}
if x1 > b.Max.X {
x1 = b.Max.X
}
if y1 > b.Max.Y {
y1 = b.Max.Y
}
out := image.NewRGBA(image.Rect(0, 0, x1-x0, y1-y0))
for y := y0; y < y1; y++ {
for x := x0; x < x1; x++ {
out.Set(x-x0, y-y0, img.At(x, y))
}
}
return out
}
const coordEpsilon = 1.0 // pixels
const confEpsilon = 0.01
func checkDLAIdempotent(t *testing.T, all [][]DLARegion) {
t.Helper()
ref := all[0]
strictEqual := 0
for i := 1; i < len(all); i++ {
if len(all[i]) != len(ref) {
t.Errorf("run %d: %d regions (run 0: %d) — NOT idempotent", i, len(all[i]), len(ref))
continue
}
strict := true
for j := range ref {
if ref[j].Label != all[i][j].Label {
t.Errorf("run %d region %d: label %q != %q", i, j, all[i][j].Label, ref[j].Label)
strict = false
}
if !coordClose(ref[j].X0, all[i][j].X0) || !coordClose(ref[j].Y0, all[i][j].Y0) ||
!coordClose(ref[j].X1, all[i][j].X1) || !coordClose(ref[j].Y1, all[i][j].Y1) {
t.Errorf("run %d region %d: coords differ beyond epsilon", i, j)
strict = false
}
if !floatClose(ref[j].Confidence, all[i][j].Confidence, confEpsilon) {
strict = false // confidence jitter is acceptable
}
}
if strict {
strictEqual++
}
}
t.Logf("DLA: %d regions, %d/%d runs strictly equal", len(ref), strictEqual+1, len(all))
}
func checkTSRIdempotent(t *testing.T, all [][]TSRCell) {
t.Helper()
ref := all[0]
strictEqual := 0
for i := 1; i < len(all); i++ {
if len(all[i]) != len(ref) {
t.Errorf("run %d: %d cells (run 0: %d) — NOT idempotent", i, len(all[i]), len(ref))
continue
}
strict := true
for j := range ref {
if !coordClose(ref[j].X0, all[i][j].X0) || !coordClose(ref[j].Y0, all[i][j].Y0) ||
!coordClose(ref[j].X1, all[i][j].X1) || !coordClose(ref[j].Y1, all[i][j].Y1) {
t.Errorf("run %d cell %d: coords differ beyond epsilon", i, j)
strict = false
}
}
if strict {
strictEqual++
}
}
t.Logf("TSR: %d cells, %d/%d runs strictly equal", len(ref), strictEqual+1, len(all))
}
func checkOCRDetectIdempotent(t *testing.T, all [][]OCRBox) {
t.Helper()
ref := all[0]
strictEqual := 0
for i := 1; i < len(all); i++ {
if len(all[i]) != len(ref) {
t.Errorf("run %d: %d boxes (run 0: %d) — NOT idempotent", i, len(all[i]), len(ref))
continue
}
strict := true
for j := range ref {
if !coordClose(ref[j].X0, all[i][j].X0) || !coordClose(ref[j].Y0, all[i][j].Y0) {
strict = false
}
}
if strict {
strictEqual++
}
}
t.Logf("OCRDetect: %d boxes, %d/%d runs strictly equal", len(ref), strictEqual+1, len(all))
}
func checkOCRRecognizeIdempotent(t *testing.T, all [][]OCRText) {
t.Helper()
ref := all[0]
strictEqual := 0
for i := 1; i < len(all); i++ {
if len(all[i]) != len(ref) {
t.Errorf("run %d: %d texts (run 0: %d) — NOT idempotent", i, len(all[i]), len(ref))
continue
}
strict := true
for j := range ref {
if ref[j].Text != all[i][j].Text {
t.Errorf("run %d text %d: %q != %q — NOT idempotent", i, j, all[i][j].Text, ref[j].Text)
strict = false
}
if !floatClose(ref[j].Confidence, all[i][j].Confidence, confEpsilon) {
strict = false
}
}
if strict {
strictEqual++
}
}
t.Logf("OCRRecognize: %d texts, %d/%d runs strictly equal", len(ref), strictEqual+1, len(all))
}
func coordClose(a, b float64) bool {
d := a - b
if d < 0 {
d = -d
}
return d <= coordEpsilon
}
func floatClose(a, b, eps float64) bool {
d := a - b
if d < 0 {
d = -d
}
return d <= eps
}
// ── Alignment Integration Tests ─────────────────────────────────────────
// Run with: go test -v -run TestIntegration_Alignment -tags=integration -count=1 ./internal/parser/
// TestIntegration_TableAlign verifies table text backfill, text-fragment
// suppression inside table regions, and caption removal — the key alignment
// fixes from the Python→Go migration.
func TestIntegration_TableAlign(t *testing.T) {
client := mustConnectDeepDoc(t)
eng := mustOpenEngine(t, "18_table_caption.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
// Assert 1: No caption sections remain (merged into parent or removed).
for _, s := range result.Sections {
if s.LayoutType == "table caption" || s.LayoutType == "figure caption" {
t.Errorf("caption Section should be removed: layout=%s text=%q", s.LayoutType, s.Text)
}
}
// Assert 2: Table sections have TSR-structured text (not raw OCR fragments).
var hasTable bool
for _, s := range result.Sections {
if s.LayoutType == "table" && s.TableItem != nil && len(s.TableItem.Rows) > 0 {
hasTable = true
// Structured text should contain tabs (\t) for column separation.
if !strings.Contains(s.Text, "\t") {
t.Logf("table Section.Text may not be structured: %q", s.Text[:min(80, len(s.Text))])
}
break
}
}
if !hasTable {
t.Log("no table with TSR rows found — may need different PDF layout")
}
t.Logf("Sections: %d, Tables: %d, Figures: %d",
len(result.Sections), len(result.Tables), len(result.Figures))
}
// TestIntegration_GarbageLayout verifies CID-garbled and garbage-layout
// (header/footer/reference) boxes are popped from output.
func TestIntegration_GarbageLayout(t *testing.T) {
client := mustConnectDeepDoc(t)
eng := mustOpenEngine(t, "17_garbage_layout.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
// Assert: No CID-garbled text survives.
for _, s := range result.Sections {
if strings.Contains(s.Text, "(cid:") {
t.Errorf("CID garbage should be popped: %q", s.Text)
}
}
// Assert: No header/footer/reference sections in output.
for _, s := range result.Sections {
if s.LayoutType == "header" || s.LayoutType == "footer" || s.LayoutType == "reference" {
t.Logf("garbage layout %q survived with text %q — may be legitimate page decoration",
s.LayoutType, s.Text[:min(60, len(s.Text))])
}
}
t.Logf("Sections: %d", len(result.Sections))
}
// TestIntegration_MultiChunk verifies chunked processing for large documents.
func TestIntegration_MultiChunk(t *testing.T) {
client := mustConnectDeepDoc(t)
eng := mustOpenEngine(t, "19_multipage_chunk.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
cfg.ChunkSize = 10 // small chunks to force multi-chunk path
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
// 52 pages with 10-page chunks → >= 6 chunks.
if len(result.Sections) == 0 {
t.Error("multi-chunk should produce sections")
}
t.Logf("52 pages × chunkSize=10: %d sections, %d tables",
len(result.Sections), len(result.Tables))
}
// TestIntegration_NoRegression runs a few snapshot PDFs and checks basic
// invariants — no panic, sections produced, no CID garbage.
func TestIntegration_NoRegression(t *testing.T) {
client := mustConnectDeepDoc(t)
for _, name := range []string{
"01_english_simple.pdf",
"02_chinese_simple.pdf",
"06_table_content.pdf",
"07_mixed_content.pdf",
} {
t.Run(name, func(t *testing.T) {
eng := mustOpenEngine(t, name)
defer eng.Close()
cfg := DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
if len(result.Sections) == 0 {
t.Error("expected at least 1 section")
}
for _, s := range result.Sections {
if strings.Contains(s.Text, "(cid:") {
t.Errorf("CID garbage in %s: %q", name, s.Text)
}
}
t.Logf("%s: %d sections", name, len(result.Sections))
})
}
}
// TestIntegration_TableRotation verifies that evaluateTableOrientation
// correctly detects rotation using region-count scoring.
func TestIntegration_TableRotation(t *testing.T) {
client := mustConnectDeepDoc(t)
t.Run("upright_table", func(t *testing.T) {
eng := mustOpenEngine(t, "rotate_0.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
if len(result.Sections) == 0 {
t.Error("expected sections from upright table")
}
t.Logf("rotate_0: %d sections, %d tables", len(result.Sections), len(result.Tables))
})
t.Run("rotated_90_table", func(t *testing.T) {
eng := mustOpenEngine(t, "rotate_90.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
// DeepDoc DLA does not yet correctly annotate boxes on rotated
// pages (regions and characters are in different coordinate
// spaces post-rotation). Character extraction and rotation are
// verified via the charsToBoxes path.
cfg.SkipOCR = true
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
if len(result.Sections) == 0 {
t.Error("expected sections from rotated table")
}
t.Logf("rotate_90: %d sections, %d tables", len(result.Sections), len(result.Tables))
})
}
// TestIntegration_WordSpacing verifies space insertion between ASCII word
// characters with a visible gap (Python __img_ocr space insertion).
func TestIntegration_WordSpacing(t *testing.T) {
client := mustConnectDeepDoc(t)
eng := mustOpenEngine(t, "01_english_simple.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
// Assert: no "word1word2" concatenation — ASCII words should be
// space-separated (either by embedded-char spacing or OCR gaps).
for _, s := range result.Sections {
run := 0
for _, r := range s.Text {
if r >= 'a' && r <= 'z' {
run++
if run > 15 {
t.Logf("long lowercase run (no space): section text=%q",
s.Text[:min(80, len(s.Text))])
break
}
} else {
run = 0
}
}
}
t.Logf("word spacing check: %d sections", len(result.Sections))
}