mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
Refactor: migrate pdf_parser.py to golang (#16323)
### What problem does this PR solve? Http API based on onnx model. pdf_parser.py to golang ### Type of change - [x] Refactoring
This commit is contained in:
764
internal/deepdoc/parser/pdf/deepdoc_integration_test.go
Normal file
764
internal/deepdoc/parser/pdf/deepdoc_integration_test.go
Normal file
@@ -0,0 +1,764 @@
|
||||
//go:build cgo && integration
|
||||
|
||||
package parser
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"image"
|
||||
_ "image/png"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// ── helpers ────────────────────────────────────────────────────────────────
|
||||
|
||||
// mustConnectDeepDoc returns a DeepDocClient; skips the test if unavailable.
|
||||
func mustConnectDeepDoc(t *testing.T) *DeepDocClient {
|
||||
t.Helper()
|
||||
url := os.Getenv("DEEPDOC_URL")
|
||||
if url == "" {
|
||||
url = "http://localhost:9390"
|
||||
}
|
||||
client, err := NewDeepDocClient(url)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !client.Health() {
|
||||
t.Fatalf("DeepDoc not available at %s", url)
|
||||
}
|
||||
return client
|
||||
}
|
||||
|
||||
// mustOpenEngine opens a PDF from testdata/pdfs/ and returns a PDFEngine.
|
||||
func mustOpenEngine(t *testing.T, name string) PDFEngine {
|
||||
t.Helper()
|
||||
pdfPath := filepath.Join("testdata", "pdfs", name)
|
||||
data, err := os.ReadFile(pdfPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read fixture %s: %v", name, err)
|
||||
}
|
||||
eng, err := NewEngine(data)
|
||||
if err != nil {
|
||||
t.Fatalf("open engine %s: %v", name, err)
|
||||
}
|
||||
return eng
|
||||
}
|
||||
|
||||
// ── golden-file helpers ────────────────────────────────────────────────────
|
||||
|
||||
// Snapshot record types used by the golden-file comparisons.
type (
	// sectionGolden is one section entry in a sections/layouts golden file.
	sectionGolden struct {
		// Text is the section text, serialized under the "text" key.
		Text string `json:"text"`
		// LayoutType is the section's layout label, serialized under
		// the "layout_type" key.
		LayoutType string `json:"layout_type"`
	}

	// tableGolden is one table entry in a tables golden file.
	tableGolden struct {
		// Rows holds the table cells as rows of cell strings, serialized
		// under the "rows" key.
		Rows [][]string `json:"rows"`
	}
)
|
||||
|
||||
// goldenPath returns the location of the golden file called name inside the
// testdata/integration directory (relative to the package directory).
func goldenPath(name string) string {
	goldenDir := filepath.Join("testdata", "integration")
	return filepath.Join(goldenDir, name)
}
|
||||
|
||||
// readGolden loads the golden file at path and decodes it as a JSON array of
// T. A missing/unreadable file or malformed JSON aborts the calling test via
// t.Fatalf.
func readGolden[T any](t *testing.T, path string) []T {
	t.Helper()

	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		t.Fatalf("read golden %s: %v", path, readErr)
	}

	var records []T
	parseErr := json.Unmarshal(raw, &records)
	if parseErr != nil {
		t.Fatalf("parse golden %s: %v", path, parseErr)
	}
	return records
}
|
||||
|
||||
// writeGolden serializes v as indented JSON into the file at path, creating
// the parent directory (mode 0755) when it does not exist and truncating any
// existing file.
//
// Every failure — directory creation, file creation, encoding, or closing the
// file — aborts the calling test via t.Fatalf. The close error is checked
// explicitly because a failed close on a freshly written file can mean the
// golden data never reached disk.
func writeGolden(t *testing.T, path string, v any) {
	t.Helper()

	dir := filepath.Dir(path)
	if err := os.MkdirAll(dir, 0755); err != nil {
		t.Fatalf("mkdir %s: %v", dir, err)
	}

	f, err := os.Create(path)
	if err != nil {
		t.Fatalf("create golden %s: %v", path, err)
	}

	enc := json.NewEncoder(f)
	enc.SetIndent("", " ")
	encErr := enc.Encode(v)

	// Close before reporting so the handle is released on every path and the
	// close result itself can be verified.
	closeErr := f.Close()

	if encErr != nil {
		t.Fatalf("write golden %s: %v", path, encErr)
	}
	if closeErr != nil {
		t.Fatalf("close golden %s: %v", path, closeErr)
	}
}
|
||||
|
||||
// updateGolden reports whether golden files should be regenerated, which is
// the case only when the UPDATE_GOLDEN environment variable is exactly "1".
func updateGolden() bool {
	value, present := os.LookupEnv("UPDATE_GOLDEN")
	return present && value == "1"
}
|
||||
|
||||
// sectionsToGolden converts []Section to the snapshot format.
|
||||
func sectionsToGolden(sections []Section) []sectionGolden {
|
||||
result := make([]sectionGolden, len(sections))
|
||||
for i, s := range sections {
|
||||
result[i] = sectionGolden{
|
||||
Text: s.Text,
|
||||
LayoutType: s.LayoutType,
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// tablesToGolden converts []TableItem to the snapshot format.
|
||||
func tablesToGolden(tables []TableItem) []tableGolden {
|
||||
result := make([]tableGolden, len(tables))
|
||||
for i, t := range tables {
|
||||
result[i] = tableGolden{Rows: t.Rows}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// ── tests ──────────────────────────────────────────────────────────────────
|
||||
|
||||
// TestIntegration_SectionsText verifies section text output matches golden.
|
||||
func TestIntegration_SectionsText(t *testing.T) {
|
||||
client := mustConnectDeepDoc(t)
|
||||
eng := mustOpenEngine(t, "01_english_simple.pdf")
|
||||
defer eng.Close()
|
||||
|
||||
cfg := DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
if len(result.Sections) == 0 {
|
||||
t.Fatal("expected at least one section")
|
||||
}
|
||||
|
||||
golden := goldenPath("01_english_simple.sections.json")
|
||||
got := sectionsToGolden(result.Sections)
|
||||
|
||||
if updateGolden() {
|
||||
writeGolden(t, golden, got)
|
||||
t.Logf("golden written: %s (%d sections)", golden, len(got))
|
||||
return
|
||||
}
|
||||
|
||||
expected := readGolden[sectionGolden](t, golden)
|
||||
if len(expected) != len(got) {
|
||||
t.Errorf("section count mismatch: golden=%d got=%d", len(expected), len(got))
|
||||
}
|
||||
n := len(expected)
|
||||
if len(got) < n {
|
||||
n = len(got)
|
||||
}
|
||||
for i := 0; i < n; i++ {
|
||||
if expected[i].Text != got[i].Text {
|
||||
t.Errorf("section[%d] text mismatch:\n golden: %q\n got: %q", i, expected[i].Text, got[i].Text)
|
||||
}
|
||||
if expected[i].LayoutType != got[i].LayoutType {
|
||||
t.Errorf("section[%d] layout_type mismatch: golden=%q got=%q",
|
||||
i, expected[i].LayoutType, got[i].LayoutType)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestIntegration_SectionsCount verifies section count is stable.
|
||||
func TestIntegration_SectionsCount(t *testing.T) {
|
||||
client := mustConnectDeepDoc(t)
|
||||
eng := mustOpenEngine(t, "01_english_simple.pdf")
|
||||
defer eng.Close()
|
||||
|
||||
cfg := DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
|
||||
// Read back from golden to get expected count.
|
||||
golden := goldenPath("01_english_simple.sections.json")
|
||||
expected := readGolden[sectionGolden](t, golden)
|
||||
|
||||
if len(result.Sections) != len(expected) {
|
||||
// Log section layout types to help debug divergence.
|
||||
var types []string
|
||||
for _, s := range result.Sections {
|
||||
types = append(types, s.LayoutType)
|
||||
}
|
||||
t.Errorf("section count: golden=%d got=%d (types: %v)", len(expected), len(result.Sections), types)
|
||||
}
|
||||
}
|
||||
|
||||
// TestIntegration_TableStructure verifies table rows and cell text match golden.
|
||||
func TestIntegration_TableStructure(t *testing.T) {
|
||||
client := mustConnectDeepDoc(t)
|
||||
eng := mustOpenEngine(t, "06_table_content.pdf")
|
||||
defer eng.Close()
|
||||
|
||||
cfg := DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
if len(result.Tables) == 0 {
|
||||
t.Skip("DLA did not detect any tables in fixture — skipping table structure check")
|
||||
}
|
||||
|
||||
golden := goldenPath("06_table_content.tables.json")
|
||||
got := tablesToGolden(result.Tables)
|
||||
|
||||
if updateGolden() {
|
||||
writeGolden(t, golden, got)
|
||||
t.Logf("golden written: %s (%d tables)", golden, len(got))
|
||||
return
|
||||
}
|
||||
|
||||
expected := readGolden[tableGolden](t, golden)
|
||||
if len(expected) != len(got) {
|
||||
t.Errorf("table count mismatch: golden=%d got=%d", len(expected), len(got))
|
||||
}
|
||||
n := len(expected)
|
||||
if len(got) < n {
|
||||
n = len(got)
|
||||
}
|
||||
for i := 0; i < n; i++ {
|
||||
if len(expected[i].Rows) != len(got[i].Rows) {
|
||||
t.Errorf("table[%d] row count mismatch: golden=%d got=%d", i, len(expected[i].Rows), len(got[i].Rows))
|
||||
continue
|
||||
}
|
||||
for ri := 0; ri < len(expected[i].Rows); ri++ {
|
||||
if len(expected[i].Rows[ri]) != len(got[i].Rows[ri]) {
|
||||
t.Errorf("table[%d] row[%d] cell count mismatch: golden=%d got=%d", i, ri, len(expected[i].Rows[ri]), len(got[i].Rows[ri]))
|
||||
continue
|
||||
}
|
||||
for ci := 0; ci < len(expected[i].Rows[ri]); ci++ {
|
||||
goldenCell := strings.TrimSpace(expected[i].Rows[ri][ci])
|
||||
gotCell := strings.TrimSpace(got[i].Rows[ri][ci])
|
||||
if goldenCell != gotCell {
|
||||
t.Errorf("table[%d] row[%d] cell[%d] mismatch:\n golden: %q\n got: %q",
|
||||
i, ri, ci, goldenCell, gotCell)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestIntegration_TableImageB64 verifies table ImageB64 is valid base64 PNG.
|
||||
func TestIntegration_TableImageB64(t *testing.T) {
|
||||
client := mustConnectDeepDoc(t)
|
||||
eng := mustOpenEngine(t, "06_table_content.pdf")
|
||||
defer eng.Close()
|
||||
|
||||
cfg := DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
if len(result.Tables) == 0 {
|
||||
t.Skip("DLA did not detect any tables in fixture — skipping image check")
|
||||
}
|
||||
|
||||
for i, tbl := range result.Tables {
|
||||
if tbl.ImageB64 == "" {
|
||||
t.Errorf("table[%d] ImageB64 is empty", i)
|
||||
continue
|
||||
}
|
||||
// Verify base64 decodable.
|
||||
raw, err := base64.StdEncoding.DecodeString(tbl.ImageB64)
|
||||
if err != nil {
|
||||
t.Errorf("table[%d] ImageB64: not valid base64: %v", i, err)
|
||||
continue
|
||||
}
|
||||
// Verify it's a valid image.
|
||||
img, _, err := image.Decode(bytes.NewReader(raw))
|
||||
if err != nil {
|
||||
t.Errorf("table[%d] ImageB64: not a valid image: %v", i, err)
|
||||
continue
|
||||
}
|
||||
b := img.Bounds()
|
||||
if b.Dx() <= 0 || b.Dy() <= 0 {
|
||||
t.Errorf("table[%d] ImageB64: zero-size image %dx%d", i, b.Dx(), b.Dy())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestIntegration_LayoutTypes verifies DLA labels boxes with expected types.
|
||||
func TestIntegration_LayoutTypes(t *testing.T) {
|
||||
client := mustConnectDeepDoc(t)
|
||||
eng := mustOpenEngine(t, "06_table_content.pdf")
|
||||
defer eng.Close()
|
||||
|
||||
cfg := DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
|
||||
golden := goldenPath("06_table_content.layouts.json")
|
||||
got := sectionsToGolden(result.Sections)
|
||||
|
||||
if updateGolden() {
|
||||
writeGolden(t, golden, got)
|
||||
t.Logf("golden written: %s (%d sections)", golden, len(got))
|
||||
return
|
||||
}
|
||||
|
||||
expected := readGolden[sectionGolden](t, golden)
|
||||
if len(expected) != len(got) {
|
||||
t.Errorf("section count mismatch: golden=%d got=%d", len(expected), len(got))
|
||||
}
|
||||
|
||||
// Count layout types on both sides.
|
||||
goldenTypes := map[string]int{}
|
||||
gotTypes := map[string]int{}
|
||||
for _, s := range expected {
|
||||
goldenTypes[s.LayoutType]++
|
||||
}
|
||||
for _, s := range got {
|
||||
gotTypes[s.LayoutType]++
|
||||
}
|
||||
for typ, gc := range goldenTypes {
|
||||
if gotTypes[typ] != gc {
|
||||
t.Errorf("LayoutType %q count mismatch: golden=%d got=%d", typ, gc, gotTypes[typ])
|
||||
}
|
||||
}
|
||||
for typ, gc := range gotTypes {
|
||||
if goldenTypes[typ] == 0 {
|
||||
t.Errorf("LayoutType %q count mismatch: golden=0 got=%d", typ, gc)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Idempotency tests ─────────────────────────────────────────────────
|
||||
|
||||
// TestIntegration_Idempotency verifies that DeepDoc APIs return consistent
|
||||
// results when called multiple times with the same image. This validates
|
||||
// that the ML inference is deterministic (or at least semantically stable).
|
||||
func TestIntegration_Idempotency(t *testing.T) {
|
||||
client := mustConnectDeepDoc(t)
|
||||
|
||||
// Render a fixture page as the stable input image.
|
||||
eng := mustOpenEngine(t, "06_table_content.pdf")
|
||||
defer eng.Close()
|
||||
pageImg, err := eng.RenderPageImage(0, 216)
|
||||
if err != nil {
|
||||
t.Fatalf("render page: %v", err)
|
||||
}
|
||||
|
||||
const N = 5
|
||||
|
||||
t.Run("DLA", func(t *testing.T) {
|
||||
var all [][]DLARegion
|
||||
for i := 0; i < N; i++ {
|
||||
regions, err := client.DLA(context.Background(), pageImg)
|
||||
if err != nil {
|
||||
t.Fatalf("run %d: %v", i, err)
|
||||
}
|
||||
all = append(all, regions)
|
||||
}
|
||||
checkDLAIdempotent(t, all)
|
||||
})
|
||||
|
||||
t.Run("TSR", func(t *testing.T) {
|
||||
// Crop a table region from the page for TSR input.
|
||||
// Use a fixed crop area (approximate table location in 06_table_content.pdf).
|
||||
cropped := cropImageRect(pageImg, 50, 200, 550, 400)
|
||||
var all [][]TSRCell
|
||||
for i := 0; i < N; i++ {
|
||||
cells, err := client.TSR(context.Background(), cropped)
|
||||
if err != nil {
|
||||
t.Fatalf("run %d: %v", i, err)
|
||||
}
|
||||
all = append(all, cells)
|
||||
}
|
||||
checkTSRIdempotent(t, all)
|
||||
})
|
||||
|
||||
t.Run("OCRDetect", func(t *testing.T) {
|
||||
var all [][]OCRBox
|
||||
for i := 0; i < N; i++ {
|
||||
boxes, err := client.OCRDetect(context.Background(), pageImg)
|
||||
if err != nil {
|
||||
t.Fatalf("run %d: %v", i, err)
|
||||
}
|
||||
all = append(all, boxes)
|
||||
}
|
||||
checkOCRDetectIdempotent(t, all)
|
||||
})
|
||||
|
||||
t.Run("OCRRecognize", func(t *testing.T) {
|
||||
cropped := cropImageRect(pageImg, 50, 100, 400, 130)
|
||||
var all [][]OCRText
|
||||
for i := 0; i < N; i++ {
|
||||
texts, err := client.OCRRecognize(context.Background(), cropped)
|
||||
if err != nil {
|
||||
t.Fatalf("run %d: %v", i, err)
|
||||
}
|
||||
all = append(all, texts)
|
||||
}
|
||||
checkOCRRecognizeIdempotent(t, all)
|
||||
})
|
||||
}
|
||||
|
||||
// cropImageRect copies the part of img covered by the rectangle
// [x0, x1) × [y0, y1) into a new RGBA image whose origin is (0, 0).
//
// The requested rectangle is clipped to img's bounds. When nothing remains
// after clipping — the request lies entirely outside the image, or is inverted
// (x0 >= x1 or y0 >= y1) — an empty image is returned. Building the output
// from the raw clamped extents would instead let image.Rect swap negative
// extents into a blank, non-empty image with a negative origin.
func cropImageRect(img image.Image, x0, y0, x1, y1 int) image.Image {
	// A Rectangle literal is used on purpose: image.Rect would silently
	// reorder an inverted request into a valid one.
	requested := image.Rectangle{Min: image.Pt(x0, y0), Max: image.Pt(x1, y1)}
	region := requested.Intersect(img.Bounds()) // zero Rectangle when disjoint/empty

	out := image.NewRGBA(image.Rect(0, 0, region.Dx(), region.Dy()))
	for y := region.Min.Y; y < region.Max.Y; y++ {
		for x := region.Min.X; x < region.Max.X; x++ {
			out.Set(x-region.Min.X, y-region.Min.Y, img.At(x, y))
		}
	}
	return out
}
|
||||
|
||||
// Tolerances applied by the idempotency checks.
const (
	// coordEpsilon is the largest coordinate difference, in pixels, that
	// coordClose still treats as equal.
	coordEpsilon = 1.0
	// confEpsilon is the tolerance passed to floatClose when comparing
	// Confidence values.
	confEpsilon = 0.01
)
|
||||
|
||||
func checkDLAIdempotent(t *testing.T, all [][]DLARegion) {
|
||||
t.Helper()
|
||||
ref := all[0]
|
||||
strictEqual := 0
|
||||
for i := 1; i < len(all); i++ {
|
||||
if len(all[i]) != len(ref) {
|
||||
t.Errorf("run %d: %d regions (run 0: %d) — NOT idempotent", i, len(all[i]), len(ref))
|
||||
continue
|
||||
}
|
||||
strict := true
|
||||
for j := range ref {
|
||||
if ref[j].Label != all[i][j].Label {
|
||||
t.Errorf("run %d region %d: label %q != %q", i, j, all[i][j].Label, ref[j].Label)
|
||||
strict = false
|
||||
}
|
||||
if !coordClose(ref[j].X0, all[i][j].X0) || !coordClose(ref[j].Y0, all[i][j].Y0) ||
|
||||
!coordClose(ref[j].X1, all[i][j].X1) || !coordClose(ref[j].Y1, all[i][j].Y1) {
|
||||
t.Errorf("run %d region %d: coords differ beyond epsilon", i, j)
|
||||
strict = false
|
||||
}
|
||||
if !floatClose(ref[j].Confidence, all[i][j].Confidence, confEpsilon) {
|
||||
strict = false // confidence jitter is acceptable
|
||||
}
|
||||
}
|
||||
if strict {
|
||||
strictEqual++
|
||||
}
|
||||
}
|
||||
t.Logf("DLA: %d regions, %d/%d runs strictly equal", len(ref), strictEqual+1, len(all))
|
||||
}
|
||||
|
||||
func checkTSRIdempotent(t *testing.T, all [][]TSRCell) {
|
||||
t.Helper()
|
||||
ref := all[0]
|
||||
strictEqual := 0
|
||||
for i := 1; i < len(all); i++ {
|
||||
if len(all[i]) != len(ref) {
|
||||
t.Errorf("run %d: %d cells (run 0: %d) — NOT idempotent", i, len(all[i]), len(ref))
|
||||
continue
|
||||
}
|
||||
strict := true
|
||||
for j := range ref {
|
||||
if !coordClose(ref[j].X0, all[i][j].X0) || !coordClose(ref[j].Y0, all[i][j].Y0) ||
|
||||
!coordClose(ref[j].X1, all[i][j].X1) || !coordClose(ref[j].Y1, all[i][j].Y1) {
|
||||
t.Errorf("run %d cell %d: coords differ beyond epsilon", i, j)
|
||||
strict = false
|
||||
}
|
||||
}
|
||||
if strict {
|
||||
strictEqual++
|
||||
}
|
||||
}
|
||||
t.Logf("TSR: %d cells, %d/%d runs strictly equal", len(ref), strictEqual+1, len(all))
|
||||
}
|
||||
|
||||
func checkOCRDetectIdempotent(t *testing.T, all [][]OCRBox) {
|
||||
t.Helper()
|
||||
ref := all[0]
|
||||
strictEqual := 0
|
||||
for i := 1; i < len(all); i++ {
|
||||
if len(all[i]) != len(ref) {
|
||||
t.Errorf("run %d: %d boxes (run 0: %d) — NOT idempotent", i, len(all[i]), len(ref))
|
||||
continue
|
||||
}
|
||||
strict := true
|
||||
for j := range ref {
|
||||
if !coordClose(ref[j].X0, all[i][j].X0) || !coordClose(ref[j].Y0, all[i][j].Y0) {
|
||||
strict = false
|
||||
}
|
||||
}
|
||||
if strict {
|
||||
strictEqual++
|
||||
}
|
||||
}
|
||||
t.Logf("OCRDetect: %d boxes, %d/%d runs strictly equal", len(ref), strictEqual+1, len(all))
|
||||
}
|
||||
|
||||
func checkOCRRecognizeIdempotent(t *testing.T, all [][]OCRText) {
|
||||
t.Helper()
|
||||
ref := all[0]
|
||||
strictEqual := 0
|
||||
for i := 1; i < len(all); i++ {
|
||||
if len(all[i]) != len(ref) {
|
||||
t.Errorf("run %d: %d texts (run 0: %d) — NOT idempotent", i, len(all[i]), len(ref))
|
||||
continue
|
||||
}
|
||||
strict := true
|
||||
for j := range ref {
|
||||
if ref[j].Text != all[i][j].Text {
|
||||
t.Errorf("run %d text %d: %q != %q — NOT idempotent", i, j, all[i][j].Text, ref[j].Text)
|
||||
strict = false
|
||||
}
|
||||
if !floatClose(ref[j].Confidence, all[i][j].Confidence, confEpsilon) {
|
||||
strict = false
|
||||
}
|
||||
}
|
||||
if strict {
|
||||
strictEqual++
|
||||
}
|
||||
}
|
||||
t.Logf("OCRRecognize: %d texts, %d/%d runs strictly equal", len(ref), strictEqual+1, len(all))
|
||||
}
|
||||
|
||||
func coordClose(a, b float64) bool {
|
||||
d := a - b
|
||||
if d < 0 {
|
||||
d = -d
|
||||
}
|
||||
return d <= coordEpsilon
|
||||
}
|
||||
|
||||
// floatClose reports whether the absolute difference between a and b is at
// most eps. A NaN operand, or a negative eps, always yields false.
func floatClose(a, b, eps float64) bool {
	delta := a - b
	if delta >= 0 {
		return delta <= eps
	}
	return -delta <= eps
}
|
||||
|
||||
// ── Alignment Integration Tests ─────────────────────────────────────────
|
||||
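// Run with (cgo must be enabled, see the build constraint at the top of this file):
//   go test -v -tags=integration -count=1 -run 'TestIntegration_(TableAlign|GarbageLayout|MultiChunk|NoRegression|TableRotation|WordSpacing)' ./internal/deepdoc/parser/pdf/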
|
||||
|
||||
// TestIntegration_TableAlign verifies table text backfill, text-fragment
|
||||
// suppression inside table regions, and caption removal — the key alignment
|
||||
// fixes from the Python→Go migration.
|
||||
func TestIntegration_TableAlign(t *testing.T) {
|
||||
client := mustConnectDeepDoc(t)
|
||||
eng := mustOpenEngine(t, "18_table_caption.pdf")
|
||||
defer eng.Close()
|
||||
|
||||
cfg := DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
|
||||
// Assert 1: No caption sections remain (merged into parent or removed).
|
||||
for _, s := range result.Sections {
|
||||
if s.LayoutType == "table caption" || s.LayoutType == "figure caption" {
|
||||
t.Errorf("caption Section should be removed: layout=%s text=%q", s.LayoutType, s.Text)
|
||||
}
|
||||
}
|
||||
|
||||
// Assert 2: Table sections have TSR-structured text (not raw OCR fragments).
|
||||
var hasTable bool
|
||||
for _, s := range result.Sections {
|
||||
if s.LayoutType == "table" && s.TableItem != nil && len(s.TableItem.Rows) > 0 {
|
||||
hasTable = true
|
||||
// Structured text should contain tabs (\t) for column separation.
|
||||
if !strings.Contains(s.Text, "\t") {
|
||||
t.Logf("table Section.Text may not be structured: %q", s.Text[:min(80, len(s.Text))])
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
if !hasTable {
|
||||
t.Log("no table with TSR rows found — may need different PDF layout")
|
||||
}
|
||||
|
||||
t.Logf("Sections: %d, Tables: %d, Figures: %d",
|
||||
len(result.Sections), len(result.Tables), len(result.Figures))
|
||||
}
|
||||
|
||||
// TestIntegration_GarbageLayout verifies CID-garbled and garbage-layout
|
||||
// (header/footer/reference) boxes are popped from output.
|
||||
func TestIntegration_GarbageLayout(t *testing.T) {
|
||||
client := mustConnectDeepDoc(t)
|
||||
eng := mustOpenEngine(t, "17_garbage_layout.pdf")
|
||||
defer eng.Close()
|
||||
|
||||
cfg := DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
|
||||
// Assert: No CID-garbled text survives.
|
||||
for _, s := range result.Sections {
|
||||
if strings.Contains(s.Text, "(cid:") {
|
||||
t.Errorf("CID garbage should be popped: %q", s.Text)
|
||||
}
|
||||
}
|
||||
|
||||
// Assert: No header/footer/reference sections in output.
|
||||
for _, s := range result.Sections {
|
||||
if s.LayoutType == "header" || s.LayoutType == "footer" || s.LayoutType == "reference" {
|
||||
t.Logf("garbage layout %q survived with text %q — may be legitimate page decoration",
|
||||
s.LayoutType, s.Text[:min(60, len(s.Text))])
|
||||
}
|
||||
}
|
||||
|
||||
t.Logf("Sections: %d", len(result.Sections))
|
||||
}
|
||||
|
||||
// TestIntegration_MultiChunk verifies chunked processing for large documents.
|
||||
func TestIntegration_MultiChunk(t *testing.T) {
|
||||
client := mustConnectDeepDoc(t)
|
||||
eng := mustOpenEngine(t, "19_multipage_chunk.pdf")
|
||||
defer eng.Close()
|
||||
|
||||
cfg := DefaultParserConfig()
|
||||
cfg.ChunkSize = 10 // small chunks to force multi-chunk path
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
|
||||
// 52 pages with 10-page chunks → >= 6 chunks.
|
||||
if len(result.Sections) == 0 {
|
||||
t.Error("multi-chunk should produce sections")
|
||||
}
|
||||
|
||||
t.Logf("52 pages × chunkSize=10: %d sections, %d tables",
|
||||
len(result.Sections), len(result.Tables))
|
||||
}
|
||||
|
||||
// TestIntegration_NoRegression runs a few snapshot PDFs and checks basic
|
||||
// invariants — no panic, sections produced, no CID garbage.
|
||||
func TestIntegration_NoRegression(t *testing.T) {
|
||||
client := mustConnectDeepDoc(t)
|
||||
|
||||
for _, name := range []string{
|
||||
"01_english_simple.pdf",
|
||||
"02_chinese_simple.pdf",
|
||||
"06_table_content.pdf",
|
||||
"07_mixed_content.pdf",
|
||||
} {
|
||||
t.Run(name, func(t *testing.T) {
|
||||
eng := mustOpenEngine(t, name)
|
||||
defer eng.Close()
|
||||
cfg := DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
if len(result.Sections) == 0 {
|
||||
t.Error("expected at least 1 section")
|
||||
}
|
||||
for _, s := range result.Sections {
|
||||
if strings.Contains(s.Text, "(cid:") {
|
||||
t.Errorf("CID garbage in %s: %q", name, s.Text)
|
||||
}
|
||||
}
|
||||
t.Logf("%s: %d sections", name, len(result.Sections))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestIntegration_TableRotation verifies that evaluateTableOrientation
|
||||
// correctly detects rotation using region-count scoring.
|
||||
func TestIntegration_TableRotation(t *testing.T) {
|
||||
client := mustConnectDeepDoc(t)
|
||||
|
||||
t.Run("upright_table", func(t *testing.T) {
|
||||
eng := mustOpenEngine(t, "rotate_0.pdf")
|
||||
defer eng.Close()
|
||||
cfg := DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
if len(result.Sections) == 0 {
|
||||
t.Error("expected sections from upright table")
|
||||
}
|
||||
t.Logf("rotate_0: %d sections, %d tables", len(result.Sections), len(result.Tables))
|
||||
})
|
||||
|
||||
t.Run("rotated_90_table", func(t *testing.T) {
|
||||
eng := mustOpenEngine(t, "rotate_90.pdf")
|
||||
defer eng.Close()
|
||||
cfg := DefaultParserConfig()
|
||||
// DeepDoc DLA does not yet correctly annotate boxes on rotated
|
||||
// pages (regions and characters are in different coordinate
|
||||
// spaces post-rotation). Character extraction and rotation are
|
||||
// verified via the charsToBoxes path.
|
||||
cfg.SkipOCR = true
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
if len(result.Sections) == 0 {
|
||||
t.Error("expected sections from rotated table")
|
||||
}
|
||||
t.Logf("rotate_90: %d sections, %d tables", len(result.Sections), len(result.Tables))
|
||||
})
|
||||
}
|
||||
|
||||
// TestIntegration_WordSpacing verifies space insertion between ASCII word
|
||||
// characters with a visible gap (Python __img_ocr space insertion).
|
||||
func TestIntegration_WordSpacing(t *testing.T) {
|
||||
client := mustConnectDeepDoc(t)
|
||||
eng := mustOpenEngine(t, "01_english_simple.pdf")
|
||||
defer eng.Close()
|
||||
|
||||
cfg := DefaultParserConfig()
|
||||
p := NewParser(cfg, client)
|
||||
result, err := p.Parse(context.Background(), eng)
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
|
||||
// Assert: no "word1word2" concatenation — ASCII words should be
|
||||
// space-separated (either by embedded-char spacing or OCR gaps).
|
||||
for _, s := range result.Sections {
|
||||
run := 0
|
||||
for _, r := range s.Text {
|
||||
if r >= 'a' && r <= 'z' {
|
||||
run++
|
||||
if run > 15 {
|
||||
t.Logf("long lowercase run (no space): section text=%q",
|
||||
s.Text[:min(80, len(s.Text))])
|
||||
break
|
||||
}
|
||||
} else {
|
||||
run = 0
|
||||
}
|
||||
}
|
||||
}
|
||||
t.Logf("word spacing check: %d sections", len(result.Sections))
|
||||
}
|
||||
Reference in New Issue
Block a user