Refactor: migrate pdf_parser.py to golang (#16323)

### What problem does this PR solve?

Http API based on onnx model.
pdf_parser.py to golang

### Type of change

- [x] Refactoring
This commit is contained in:
Jack
2026-06-25 20:16:16 +08:00
committed by GitHub
parent c7052f4dd1
commit 304d9e02bb
98 changed files with 24591 additions and 8 deletions

View File

@@ -0,0 +1,764 @@
//go:build cgo && integration
package parser
import (
"bytes"
"context"
"encoding/base64"
"encoding/json"
"image"
_ "image/png"
"os"
"path/filepath"
"strings"
"testing"
)
// ── helpers ────────────────────────────────────────────────────────────────
// mustConnectDeepDoc returns a DeepDocClient; skips the test if unavailable.
func mustConnectDeepDoc(t *testing.T) *DeepDocClient {
t.Helper()
url := os.Getenv("DEEPDOC_URL")
if url == "" {
url = "http://localhost:9390"
}
client, err := NewDeepDocClient(url)
if err != nil {
t.Fatal(err)
}
if !client.Health() {
t.Fatalf("DeepDoc not available at %s", url)
}
return client
}
// mustOpenEngine opens a PDF from testdata/pdfs/ and returns a PDFEngine.
func mustOpenEngine(t *testing.T, name string) PDFEngine {
t.Helper()
pdfPath := filepath.Join("testdata", "pdfs", name)
data, err := os.ReadFile(pdfPath)
if err != nil {
t.Fatalf("read fixture %s: %v", name, err)
}
eng, err := NewEngine(data)
if err != nil {
t.Fatalf("open engine %s: %v", name, err)
}
return eng
}
// ── golden-file helpers ────────────────────────────────────────────────────
// sectionGolden is the snapshot format for section output.
type sectionGolden struct {
Text string `json:"text"`
LayoutType string `json:"layout_type"`
}
// tableGolden is the snapshot format for table output.
type tableGolden struct {
Rows [][]string `json:"rows"`
}
func goldenPath(name string) string {
return filepath.Join("testdata", "integration", name)
}
func readGolden[T any](t *testing.T, path string) []T {
t.Helper()
data, err := os.ReadFile(path)
if err != nil {
t.Fatalf("read golden %s: %v", path, err)
}
var result []T
if err := json.Unmarshal(data, &result); err != nil {
t.Fatalf("parse golden %s: %v", path, err)
}
return result
}
func writeGolden(t *testing.T, path string, v any) {
t.Helper()
dir := filepath.Dir(path)
if err := os.MkdirAll(dir, 0755); err != nil {
t.Fatalf("mkdir %s: %v", dir, err)
}
f, err := os.Create(path)
if err != nil {
t.Fatalf("create golden %s: %v", path, err)
}
defer f.Close()
enc := json.NewEncoder(f)
enc.SetIndent("", " ")
if err := enc.Encode(v); err != nil {
t.Fatalf("write golden %s: %v", path, err)
}
}
func updateGolden() bool {
return os.Getenv("UPDATE_GOLDEN") == "1"
}
// sectionsToGolden converts []Section to the snapshot format.
func sectionsToGolden(sections []Section) []sectionGolden {
result := make([]sectionGolden, len(sections))
for i, s := range sections {
result[i] = sectionGolden{
Text: s.Text,
LayoutType: s.LayoutType,
}
}
return result
}
// tablesToGolden converts []TableItem to the snapshot format.
func tablesToGolden(tables []TableItem) []tableGolden {
result := make([]tableGolden, len(tables))
for i, t := range tables {
result[i] = tableGolden{Rows: t.Rows}
}
return result
}
// ── tests ──────────────────────────────────────────────────────────────────
// TestIntegration_SectionsText verifies section text output matches golden.
func TestIntegration_SectionsText(t *testing.T) {
client := mustConnectDeepDoc(t)
eng := mustOpenEngine(t, "01_english_simple.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
if len(result.Sections) == 0 {
t.Fatal("expected at least one section")
}
golden := goldenPath("01_english_simple.sections.json")
got := sectionsToGolden(result.Sections)
if updateGolden() {
writeGolden(t, golden, got)
t.Logf("golden written: %s (%d sections)", golden, len(got))
return
}
expected := readGolden[sectionGolden](t, golden)
if len(expected) != len(got) {
t.Errorf("section count mismatch: golden=%d got=%d", len(expected), len(got))
}
n := len(expected)
if len(got) < n {
n = len(got)
}
for i := 0; i < n; i++ {
if expected[i].Text != got[i].Text {
t.Errorf("section[%d] text mismatch:\n golden: %q\n got: %q", i, expected[i].Text, got[i].Text)
}
if expected[i].LayoutType != got[i].LayoutType {
t.Errorf("section[%d] layout_type mismatch: golden=%q got=%q",
i, expected[i].LayoutType, got[i].LayoutType)
}
}
}
// TestIntegration_SectionsCount verifies section count is stable.
func TestIntegration_SectionsCount(t *testing.T) {
client := mustConnectDeepDoc(t)
eng := mustOpenEngine(t, "01_english_simple.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
// Read back from golden to get expected count.
golden := goldenPath("01_english_simple.sections.json")
expected := readGolden[sectionGolden](t, golden)
if len(result.Sections) != len(expected) {
// Log section layout types to help debug divergence.
var types []string
for _, s := range result.Sections {
types = append(types, s.LayoutType)
}
t.Errorf("section count: golden=%d got=%d (types: %v)", len(expected), len(result.Sections), types)
}
}
// TestIntegration_TableStructure verifies table rows and cell text match golden.
func TestIntegration_TableStructure(t *testing.T) {
client := mustConnectDeepDoc(t)
eng := mustOpenEngine(t, "06_table_content.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
if len(result.Tables) == 0 {
t.Skip("DLA did not detect any tables in fixture — skipping table structure check")
}
golden := goldenPath("06_table_content.tables.json")
got := tablesToGolden(result.Tables)
if updateGolden() {
writeGolden(t, golden, got)
t.Logf("golden written: %s (%d tables)", golden, len(got))
return
}
expected := readGolden[tableGolden](t, golden)
if len(expected) != len(got) {
t.Errorf("table count mismatch: golden=%d got=%d", len(expected), len(got))
}
n := len(expected)
if len(got) < n {
n = len(got)
}
for i := 0; i < n; i++ {
if len(expected[i].Rows) != len(got[i].Rows) {
t.Errorf("table[%d] row count mismatch: golden=%d got=%d", i, len(expected[i].Rows), len(got[i].Rows))
continue
}
for ri := 0; ri < len(expected[i].Rows); ri++ {
if len(expected[i].Rows[ri]) != len(got[i].Rows[ri]) {
t.Errorf("table[%d] row[%d] cell count mismatch: golden=%d got=%d", i, ri, len(expected[i].Rows[ri]), len(got[i].Rows[ri]))
continue
}
for ci := 0; ci < len(expected[i].Rows[ri]); ci++ {
goldenCell := strings.TrimSpace(expected[i].Rows[ri][ci])
gotCell := strings.TrimSpace(got[i].Rows[ri][ci])
if goldenCell != gotCell {
t.Errorf("table[%d] row[%d] cell[%d] mismatch:\n golden: %q\n got: %q",
i, ri, ci, goldenCell, gotCell)
}
}
}
}
}
// TestIntegration_TableImageB64 verifies table ImageB64 is valid base64 PNG.
func TestIntegration_TableImageB64(t *testing.T) {
client := mustConnectDeepDoc(t)
eng := mustOpenEngine(t, "06_table_content.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
if len(result.Tables) == 0 {
t.Skip("DLA did not detect any tables in fixture — skipping image check")
}
for i, tbl := range result.Tables {
if tbl.ImageB64 == "" {
t.Errorf("table[%d] ImageB64 is empty", i)
continue
}
// Verify base64 decodable.
raw, err := base64.StdEncoding.DecodeString(tbl.ImageB64)
if err != nil {
t.Errorf("table[%d] ImageB64: not valid base64: %v", i, err)
continue
}
// Verify it's a valid image.
img, _, err := image.Decode(bytes.NewReader(raw))
if err != nil {
t.Errorf("table[%d] ImageB64: not a valid image: %v", i, err)
continue
}
b := img.Bounds()
if b.Dx() <= 0 || b.Dy() <= 0 {
t.Errorf("table[%d] ImageB64: zero-size image %dx%d", i, b.Dx(), b.Dy())
}
}
}
// TestIntegration_LayoutTypes verifies DLA labels boxes with expected types.
func TestIntegration_LayoutTypes(t *testing.T) {
client := mustConnectDeepDoc(t)
eng := mustOpenEngine(t, "06_table_content.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
golden := goldenPath("06_table_content.layouts.json")
got := sectionsToGolden(result.Sections)
if updateGolden() {
writeGolden(t, golden, got)
t.Logf("golden written: %s (%d sections)", golden, len(got))
return
}
expected := readGolden[sectionGolden](t, golden)
if len(expected) != len(got) {
t.Errorf("section count mismatch: golden=%d got=%d", len(expected), len(got))
}
// Count layout types on both sides.
goldenTypes := map[string]int{}
gotTypes := map[string]int{}
for _, s := range expected {
goldenTypes[s.LayoutType]++
}
for _, s := range got {
gotTypes[s.LayoutType]++
}
for typ, gc := range goldenTypes {
if gotTypes[typ] != gc {
t.Errorf("LayoutType %q count mismatch: golden=%d got=%d", typ, gc, gotTypes[typ])
}
}
for typ, gc := range gotTypes {
if goldenTypes[typ] == 0 {
t.Errorf("LayoutType %q count mismatch: golden=0 got=%d", typ, gc)
}
}
}
// ── Idempotency tests ─────────────────────────────────────────────────
// TestIntegration_Idempotency verifies that DeepDoc APIs return consistent
// results when called multiple times with the same image. This validates
// that the ML inference is deterministic (or at least semantically stable).
func TestIntegration_Idempotency(t *testing.T) {
client := mustConnectDeepDoc(t)
// Render a fixture page as the stable input image.
eng := mustOpenEngine(t, "06_table_content.pdf")
defer eng.Close()
pageImg, err := eng.RenderPageImage(0, 216)
if err != nil {
t.Fatalf("render page: %v", err)
}
const N = 5
t.Run("DLA", func(t *testing.T) {
var all [][]DLARegion
for i := 0; i < N; i++ {
regions, err := client.DLA(context.Background(), pageImg)
if err != nil {
t.Fatalf("run %d: %v", i, err)
}
all = append(all, regions)
}
checkDLAIdempotent(t, all)
})
t.Run("TSR", func(t *testing.T) {
// Crop a table region from the page for TSR input.
// Use a fixed crop area (approximate table location in 06_table_content.pdf).
cropped := cropImageRect(pageImg, 50, 200, 550, 400)
var all [][]TSRCell
for i := 0; i < N; i++ {
cells, err := client.TSR(context.Background(), cropped)
if err != nil {
t.Fatalf("run %d: %v", i, err)
}
all = append(all, cells)
}
checkTSRIdempotent(t, all)
})
t.Run("OCRDetect", func(t *testing.T) {
var all [][]OCRBox
for i := 0; i < N; i++ {
boxes, err := client.OCRDetect(context.Background(), pageImg)
if err != nil {
t.Fatalf("run %d: %v", i, err)
}
all = append(all, boxes)
}
checkOCRDetectIdempotent(t, all)
})
t.Run("OCRRecognize", func(t *testing.T) {
cropped := cropImageRect(pageImg, 50, 100, 400, 130)
var all [][]OCRText
for i := 0; i < N; i++ {
texts, err := client.OCRRecognize(context.Background(), cropped)
if err != nil {
t.Fatalf("run %d: %v", i, err)
}
all = append(all, texts)
}
checkOCRRecognizeIdempotent(t, all)
})
}
// cropImageRect crops a rectangular region from an image.
func cropImageRect(img image.Image, x0, y0, x1, y1 int) image.Image {
b := img.Bounds()
if x0 < b.Min.X {
x0 = b.Min.X
}
if y0 < b.Min.Y {
y0 = b.Min.Y
}
if x1 > b.Max.X {
x1 = b.Max.X
}
if y1 > b.Max.Y {
y1 = b.Max.Y
}
out := image.NewRGBA(image.Rect(0, 0, x1-x0, y1-y0))
for y := y0; y < y1; y++ {
for x := x0; x < x1; x++ {
out.Set(x-x0, y-y0, img.At(x, y))
}
}
return out
}
const coordEpsilon = 1.0 // pixels
const confEpsilon = 0.01
func checkDLAIdempotent(t *testing.T, all [][]DLARegion) {
t.Helper()
ref := all[0]
strictEqual := 0
for i := 1; i < len(all); i++ {
if len(all[i]) != len(ref) {
t.Errorf("run %d: %d regions (run 0: %d) — NOT idempotent", i, len(all[i]), len(ref))
continue
}
strict := true
for j := range ref {
if ref[j].Label != all[i][j].Label {
t.Errorf("run %d region %d: label %q != %q", i, j, all[i][j].Label, ref[j].Label)
strict = false
}
if !coordClose(ref[j].X0, all[i][j].X0) || !coordClose(ref[j].Y0, all[i][j].Y0) ||
!coordClose(ref[j].X1, all[i][j].X1) || !coordClose(ref[j].Y1, all[i][j].Y1) {
t.Errorf("run %d region %d: coords differ beyond epsilon", i, j)
strict = false
}
if !floatClose(ref[j].Confidence, all[i][j].Confidence, confEpsilon) {
strict = false // confidence jitter is acceptable
}
}
if strict {
strictEqual++
}
}
t.Logf("DLA: %d regions, %d/%d runs strictly equal", len(ref), strictEqual+1, len(all))
}
func checkTSRIdempotent(t *testing.T, all [][]TSRCell) {
t.Helper()
ref := all[0]
strictEqual := 0
for i := 1; i < len(all); i++ {
if len(all[i]) != len(ref) {
t.Errorf("run %d: %d cells (run 0: %d) — NOT idempotent", i, len(all[i]), len(ref))
continue
}
strict := true
for j := range ref {
if !coordClose(ref[j].X0, all[i][j].X0) || !coordClose(ref[j].Y0, all[i][j].Y0) ||
!coordClose(ref[j].X1, all[i][j].X1) || !coordClose(ref[j].Y1, all[i][j].Y1) {
t.Errorf("run %d cell %d: coords differ beyond epsilon", i, j)
strict = false
}
}
if strict {
strictEqual++
}
}
t.Logf("TSR: %d cells, %d/%d runs strictly equal", len(ref), strictEqual+1, len(all))
}
func checkOCRDetectIdempotent(t *testing.T, all [][]OCRBox) {
t.Helper()
ref := all[0]
strictEqual := 0
for i := 1; i < len(all); i++ {
if len(all[i]) != len(ref) {
t.Errorf("run %d: %d boxes (run 0: %d) — NOT idempotent", i, len(all[i]), len(ref))
continue
}
strict := true
for j := range ref {
if !coordClose(ref[j].X0, all[i][j].X0) || !coordClose(ref[j].Y0, all[i][j].Y0) {
strict = false
}
}
if strict {
strictEqual++
}
}
t.Logf("OCRDetect: %d boxes, %d/%d runs strictly equal", len(ref), strictEqual+1, len(all))
}
func checkOCRRecognizeIdempotent(t *testing.T, all [][]OCRText) {
t.Helper()
ref := all[0]
strictEqual := 0
for i := 1; i < len(all); i++ {
if len(all[i]) != len(ref) {
t.Errorf("run %d: %d texts (run 0: %d) — NOT idempotent", i, len(all[i]), len(ref))
continue
}
strict := true
for j := range ref {
if ref[j].Text != all[i][j].Text {
t.Errorf("run %d text %d: %q != %q — NOT idempotent", i, j, all[i][j].Text, ref[j].Text)
strict = false
}
if !floatClose(ref[j].Confidence, all[i][j].Confidence, confEpsilon) {
strict = false
}
}
if strict {
strictEqual++
}
}
t.Logf("OCRRecognize: %d texts, %d/%d runs strictly equal", len(ref), strictEqual+1, len(all))
}
func coordClose(a, b float64) bool {
d := a - b
if d < 0 {
d = -d
}
return d <= coordEpsilon
}
func floatClose(a, b, eps float64) bool {
d := a - b
if d < 0 {
d = -d
}
return d <= eps
}
// ── Alignment Integration Tests ─────────────────────────────────────────
// Run with: go test -v -run TestIntegration_Alignment -tags=integration -count=1 ./internal/parser/
// TestIntegration_TableAlign verifies table text backfill, text-fragment
// suppression inside table regions, and caption removal — the key alignment
// fixes from the Python→Go migration.
func TestIntegration_TableAlign(t *testing.T) {
client := mustConnectDeepDoc(t)
eng := mustOpenEngine(t, "18_table_caption.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
// Assert 1: No caption sections remain (merged into parent or removed).
for _, s := range result.Sections {
if s.LayoutType == "table caption" || s.LayoutType == "figure caption" {
t.Errorf("caption Section should be removed: layout=%s text=%q", s.LayoutType, s.Text)
}
}
// Assert 2: Table sections have TSR-structured text (not raw OCR fragments).
var hasTable bool
for _, s := range result.Sections {
if s.LayoutType == "table" && s.TableItem != nil && len(s.TableItem.Rows) > 0 {
hasTable = true
// Structured text should contain tabs (\t) for column separation.
if !strings.Contains(s.Text, "\t") {
t.Logf("table Section.Text may not be structured: %q", s.Text[:min(80, len(s.Text))])
}
break
}
}
if !hasTable {
t.Log("no table with TSR rows found — may need different PDF layout")
}
t.Logf("Sections: %d, Tables: %d, Figures: %d",
len(result.Sections), len(result.Tables), len(result.Figures))
}
// TestIntegration_GarbageLayout verifies CID-garbled and garbage-layout
// (header/footer/reference) boxes are popped from output.
func TestIntegration_GarbageLayout(t *testing.T) {
client := mustConnectDeepDoc(t)
eng := mustOpenEngine(t, "17_garbage_layout.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
// Assert: No CID-garbled text survives.
for _, s := range result.Sections {
if strings.Contains(s.Text, "(cid:") {
t.Errorf("CID garbage should be popped: %q", s.Text)
}
}
// Assert: No header/footer/reference sections in output.
for _, s := range result.Sections {
if s.LayoutType == "header" || s.LayoutType == "footer" || s.LayoutType == "reference" {
t.Logf("garbage layout %q survived with text %q — may be legitimate page decoration",
s.LayoutType, s.Text[:min(60, len(s.Text))])
}
}
t.Logf("Sections: %d", len(result.Sections))
}
// TestIntegration_MultiChunk verifies chunked processing for large documents.
func TestIntegration_MultiChunk(t *testing.T) {
client := mustConnectDeepDoc(t)
eng := mustOpenEngine(t, "19_multipage_chunk.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
cfg.ChunkSize = 10 // small chunks to force multi-chunk path
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
// 52 pages with 10-page chunks → >= 6 chunks.
if len(result.Sections) == 0 {
t.Error("multi-chunk should produce sections")
}
t.Logf("52 pages × chunkSize=10: %d sections, %d tables",
len(result.Sections), len(result.Tables))
}
// TestIntegration_NoRegression runs a few snapshot PDFs and checks basic
// invariants — no panic, sections produced, no CID garbage.
func TestIntegration_NoRegression(t *testing.T) {
client := mustConnectDeepDoc(t)
for _, name := range []string{
"01_english_simple.pdf",
"02_chinese_simple.pdf",
"06_table_content.pdf",
"07_mixed_content.pdf",
} {
t.Run(name, func(t *testing.T) {
eng := mustOpenEngine(t, name)
defer eng.Close()
cfg := DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
if len(result.Sections) == 0 {
t.Error("expected at least 1 section")
}
for _, s := range result.Sections {
if strings.Contains(s.Text, "(cid:") {
t.Errorf("CID garbage in %s: %q", name, s.Text)
}
}
t.Logf("%s: %d sections", name, len(result.Sections))
})
}
}
// TestIntegration_TableRotation verifies that evaluateTableOrientation
// correctly detects rotation using region-count scoring.
func TestIntegration_TableRotation(t *testing.T) {
client := mustConnectDeepDoc(t)
t.Run("upright_table", func(t *testing.T) {
eng := mustOpenEngine(t, "rotate_0.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
if len(result.Sections) == 0 {
t.Error("expected sections from upright table")
}
t.Logf("rotate_0: %d sections, %d tables", len(result.Sections), len(result.Tables))
})
t.Run("rotated_90_table", func(t *testing.T) {
eng := mustOpenEngine(t, "rotate_90.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
// DeepDoc DLA does not yet correctly annotate boxes on rotated
// pages (regions and characters are in different coordinate
// spaces post-rotation). Character extraction and rotation are
// verified via the charsToBoxes path.
cfg.SkipOCR = true
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
if len(result.Sections) == 0 {
t.Error("expected sections from rotated table")
}
t.Logf("rotate_90: %d sections, %d tables", len(result.Sections), len(result.Tables))
})
}
// TestIntegration_WordSpacing verifies space insertion between ASCII word
// characters with a visible gap (Python __img_ocr space insertion).
func TestIntegration_WordSpacing(t *testing.T) {
client := mustConnectDeepDoc(t)
eng := mustOpenEngine(t, "01_english_simple.pdf")
defer eng.Close()
cfg := DefaultParserConfig()
p := NewParser(cfg, client)
result, err := p.Parse(context.Background(), eng)
if err != nil {
t.Fatalf("Parse: %v", err)
}
// Assert: no "word1word2" concatenation — ASCII words should be
// space-separated (either by embedded-char spacing or OCR gaps).
for _, s := range result.Sections {
run := 0
for _, r := range s.Text {
if r >= 'a' && r <= 'z' {
run++
if run > 15 {
t.Logf("long lowercase run (no space): section text=%q",
s.Text[:min(80, len(s.Text))])
break
}
} else {
run = 0
}
}
}
t.Logf("word spacing check: %d sections", len(result.Sections))
}