mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
### What problem does this PR solve?

HTTP API based on an ONNX model; port of pdf_parser.py to Go.

### Type of change

- [x] Refactoring
765 lines
22 KiB
Go
765 lines
22 KiB
Go
//go:build cgo && integration
|
||
|
||
package parser
|
||
|
||
import (
|
||
"bytes"
|
||
"context"
|
||
"encoding/base64"
|
||
"encoding/json"
|
||
"image"
|
||
_ "image/png"
|
||
"os"
|
||
"path/filepath"
|
||
"strings"
|
||
"testing"
|
||
)
|
||
|
||
// ── helpers ────────────────────────────────────────────────────────────────
|
||
|
||
// mustConnectDeepDoc returns a DeepDocClient; skips the test if unavailable.
|
||
func mustConnectDeepDoc(t *testing.T) *DeepDocClient {
|
||
t.Helper()
|
||
url := os.Getenv("DEEPDOC_URL")
|
||
if url == "" {
|
||
url = "http://localhost:9390"
|
||
}
|
||
client, err := NewDeepDocClient(url)
|
||
if err != nil {
|
||
t.Fatal(err)
|
||
}
|
||
if !client.Health() {
|
||
t.Fatalf("DeepDoc not available at %s", url)
|
||
}
|
||
return client
|
||
}
|
||
|
||
// mustOpenEngine opens a PDF from testdata/pdfs/ and returns a PDFEngine.
|
||
func mustOpenEngine(t *testing.T, name string) PDFEngine {
|
||
t.Helper()
|
||
pdfPath := filepath.Join("testdata", "pdfs", name)
|
||
data, err := os.ReadFile(pdfPath)
|
||
if err != nil {
|
||
t.Fatalf("read fixture %s: %v", name, err)
|
||
}
|
||
eng, err := NewEngine(data)
|
||
if err != nil {
|
||
t.Fatalf("open engine %s: %v", name, err)
|
||
}
|
||
return eng
|
||
}
|
||
|
||
// ── golden-file helpers ────────────────────────────────────────────────────
|
||
|
||
// Snapshot (golden-file) record types. Only the fields declared here are
// written to and compared against the golden JSON files.
type (
	// sectionGolden is the snapshot format for section output.
	sectionGolden struct {
		Text       string `json:"text"`        // copied from Section.Text
		LayoutType string `json:"layout_type"` // copied from Section.LayoutType
	}

	// tableGolden is the snapshot format for table output.
	tableGolden struct {
		Rows [][]string `json:"rows"` // copied from TableItem.Rows; indexed [row][cell]
	}
)
|
||
|
||
// goldenPath returns the path of the snapshot file called name, relative to
// the package directory: testdata/integration/<name>.
func goldenPath(name string) string {
	parts := []string{"testdata", "integration", name}
	return filepath.Join(parts...)
}
|
||
|
||
// readGolden loads the JSON document stored at path and decodes it into a
// []T. An unreadable file or JSON that does not decode into []T fails the
// test via t.Fatalf.
func readGolden[T any](t *testing.T, path string) []T {
	t.Helper()

	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		t.Fatalf("read golden %s: %v", path, readErr)
	}

	var items []T
	if decodeErr := json.Unmarshal(raw, &items); decodeErr != nil {
		t.Fatalf("parse golden %s: %v", path, decodeErr)
	}
	return items
}
|
||
|
||
// writeGolden serialises v as indented JSON to path, creating missing parent
// directories first and truncating any existing file. Every failure (mkdir,
// create, encode, close) fails the test via t.Fatalf.
func writeGolden(t *testing.T, path string, v any) {
	t.Helper()

	dir := filepath.Dir(path)
	if err := os.MkdirAll(dir, 0755); err != nil {
		t.Fatalf("mkdir %s: %v", dir, err)
	}

	f, err := os.Create(path)
	if err != nil {
		t.Fatalf("create golden %s: %v", path, err)
	}

	enc := json.NewEncoder(f)
	enc.SetIndent("", " ")
	encodeErr := enc.Encode(v)

	// Close explicitly rather than via defer so that an error surfaced at
	// close time is reported instead of silently leaving a truncated golden
	// file behind. The file is closed before any t.Fatalf below runs.
	closeErr := f.Close()

	if encodeErr != nil {
		t.Fatalf("write golden %s: %v", path, encodeErr)
	}
	if closeErr != nil {
		t.Fatalf("close golden %s: %v", path, closeErr)
	}
}
|
||
|
||
// updateGolden reports whether golden files should be rewritten instead of
// compared. It is true only when the UPDATE_GOLDEN environment variable is
// exactly "1".
func updateGolden() bool {
	switch os.Getenv("UPDATE_GOLDEN") {
	case "1":
		return true
	default:
		return false
	}
}
|
||
|
||
// sectionsToGolden converts []Section to the snapshot format.
|
||
func sectionsToGolden(sections []Section) []sectionGolden {
|
||
result := make([]sectionGolden, len(sections))
|
||
for i, s := range sections {
|
||
result[i] = sectionGolden{
|
||
Text: s.Text,
|
||
LayoutType: s.LayoutType,
|
||
}
|
||
}
|
||
return result
|
||
}
|
||
|
||
// tablesToGolden converts []TableItem to the snapshot format.
|
||
func tablesToGolden(tables []TableItem) []tableGolden {
|
||
result := make([]tableGolden, len(tables))
|
||
for i, t := range tables {
|
||
result[i] = tableGolden{Rows: t.Rows}
|
||
}
|
||
return result
|
||
}
|
||
|
||
// ── tests ──────────────────────────────────────────────────────────────────
|
||
|
||
// TestIntegration_SectionsText verifies section text output matches golden.
|
||
func TestIntegration_SectionsText(t *testing.T) {
|
||
client := mustConnectDeepDoc(t)
|
||
eng := mustOpenEngine(t, "01_english_simple.pdf")
|
||
defer eng.Close()
|
||
|
||
cfg := DefaultParserConfig()
|
||
p := NewParser(cfg, client)
|
||
result, err := p.Parse(context.Background(), eng)
|
||
if err != nil {
|
||
t.Fatalf("Parse: %v", err)
|
||
}
|
||
if len(result.Sections) == 0 {
|
||
t.Fatal("expected at least one section")
|
||
}
|
||
|
||
golden := goldenPath("01_english_simple.sections.json")
|
||
got := sectionsToGolden(result.Sections)
|
||
|
||
if updateGolden() {
|
||
writeGolden(t, golden, got)
|
||
t.Logf("golden written: %s (%d sections)", golden, len(got))
|
||
return
|
||
}
|
||
|
||
expected := readGolden[sectionGolden](t, golden)
|
||
if len(expected) != len(got) {
|
||
t.Errorf("section count mismatch: golden=%d got=%d", len(expected), len(got))
|
||
}
|
||
n := len(expected)
|
||
if len(got) < n {
|
||
n = len(got)
|
||
}
|
||
for i := 0; i < n; i++ {
|
||
if expected[i].Text != got[i].Text {
|
||
t.Errorf("section[%d] text mismatch:\n golden: %q\n got: %q", i, expected[i].Text, got[i].Text)
|
||
}
|
||
if expected[i].LayoutType != got[i].LayoutType {
|
||
t.Errorf("section[%d] layout_type mismatch: golden=%q got=%q",
|
||
i, expected[i].LayoutType, got[i].LayoutType)
|
||
}
|
||
}
|
||
}
|
||
|
||
// TestIntegration_SectionsCount verifies section count is stable.
|
||
func TestIntegration_SectionsCount(t *testing.T) {
|
||
client := mustConnectDeepDoc(t)
|
||
eng := mustOpenEngine(t, "01_english_simple.pdf")
|
||
defer eng.Close()
|
||
|
||
cfg := DefaultParserConfig()
|
||
p := NewParser(cfg, client)
|
||
result, err := p.Parse(context.Background(), eng)
|
||
if err != nil {
|
||
t.Fatalf("Parse: %v", err)
|
||
}
|
||
|
||
// Read back from golden to get expected count.
|
||
golden := goldenPath("01_english_simple.sections.json")
|
||
expected := readGolden[sectionGolden](t, golden)
|
||
|
||
if len(result.Sections) != len(expected) {
|
||
// Log section layout types to help debug divergence.
|
||
var types []string
|
||
for _, s := range result.Sections {
|
||
types = append(types, s.LayoutType)
|
||
}
|
||
t.Errorf("section count: golden=%d got=%d (types: %v)", len(expected), len(result.Sections), types)
|
||
}
|
||
}
|
||
|
||
// TestIntegration_TableStructure verifies table rows and cell text match golden.
|
||
func TestIntegration_TableStructure(t *testing.T) {
|
||
client := mustConnectDeepDoc(t)
|
||
eng := mustOpenEngine(t, "06_table_content.pdf")
|
||
defer eng.Close()
|
||
|
||
cfg := DefaultParserConfig()
|
||
p := NewParser(cfg, client)
|
||
result, err := p.Parse(context.Background(), eng)
|
||
if err != nil {
|
||
t.Fatalf("Parse: %v", err)
|
||
}
|
||
if len(result.Tables) == 0 {
|
||
t.Skip("DLA did not detect any tables in fixture — skipping table structure check")
|
||
}
|
||
|
||
golden := goldenPath("06_table_content.tables.json")
|
||
got := tablesToGolden(result.Tables)
|
||
|
||
if updateGolden() {
|
||
writeGolden(t, golden, got)
|
||
t.Logf("golden written: %s (%d tables)", golden, len(got))
|
||
return
|
||
}
|
||
|
||
expected := readGolden[tableGolden](t, golden)
|
||
if len(expected) != len(got) {
|
||
t.Errorf("table count mismatch: golden=%d got=%d", len(expected), len(got))
|
||
}
|
||
n := len(expected)
|
||
if len(got) < n {
|
||
n = len(got)
|
||
}
|
||
for i := 0; i < n; i++ {
|
||
if len(expected[i].Rows) != len(got[i].Rows) {
|
||
t.Errorf("table[%d] row count mismatch: golden=%d got=%d", i, len(expected[i].Rows), len(got[i].Rows))
|
||
continue
|
||
}
|
||
for ri := 0; ri < len(expected[i].Rows); ri++ {
|
||
if len(expected[i].Rows[ri]) != len(got[i].Rows[ri]) {
|
||
t.Errorf("table[%d] row[%d] cell count mismatch: golden=%d got=%d", i, ri, len(expected[i].Rows[ri]), len(got[i].Rows[ri]))
|
||
continue
|
||
}
|
||
for ci := 0; ci < len(expected[i].Rows[ri]); ci++ {
|
||
goldenCell := strings.TrimSpace(expected[i].Rows[ri][ci])
|
||
gotCell := strings.TrimSpace(got[i].Rows[ri][ci])
|
||
if goldenCell != gotCell {
|
||
t.Errorf("table[%d] row[%d] cell[%d] mismatch:\n golden: %q\n got: %q",
|
||
i, ri, ci, goldenCell, gotCell)
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// TestIntegration_TableImageB64 verifies table ImageB64 is valid base64 PNG.
|
||
func TestIntegration_TableImageB64(t *testing.T) {
|
||
client := mustConnectDeepDoc(t)
|
||
eng := mustOpenEngine(t, "06_table_content.pdf")
|
||
defer eng.Close()
|
||
|
||
cfg := DefaultParserConfig()
|
||
p := NewParser(cfg, client)
|
||
result, err := p.Parse(context.Background(), eng)
|
||
if err != nil {
|
||
t.Fatalf("Parse: %v", err)
|
||
}
|
||
if len(result.Tables) == 0 {
|
||
t.Skip("DLA did not detect any tables in fixture — skipping image check")
|
||
}
|
||
|
||
for i, tbl := range result.Tables {
|
||
if tbl.ImageB64 == "" {
|
||
t.Errorf("table[%d] ImageB64 is empty", i)
|
||
continue
|
||
}
|
||
// Verify base64 decodable.
|
||
raw, err := base64.StdEncoding.DecodeString(tbl.ImageB64)
|
||
if err != nil {
|
||
t.Errorf("table[%d] ImageB64: not valid base64: %v", i, err)
|
||
continue
|
||
}
|
||
// Verify it's a valid image.
|
||
img, _, err := image.Decode(bytes.NewReader(raw))
|
||
if err != nil {
|
||
t.Errorf("table[%d] ImageB64: not a valid image: %v", i, err)
|
||
continue
|
||
}
|
||
b := img.Bounds()
|
||
if b.Dx() <= 0 || b.Dy() <= 0 {
|
||
t.Errorf("table[%d] ImageB64: zero-size image %dx%d", i, b.Dx(), b.Dy())
|
||
}
|
||
}
|
||
}
|
||
|
||
// TestIntegration_LayoutTypes verifies DLA labels boxes with expected types.
|
||
func TestIntegration_LayoutTypes(t *testing.T) {
|
||
client := mustConnectDeepDoc(t)
|
||
eng := mustOpenEngine(t, "06_table_content.pdf")
|
||
defer eng.Close()
|
||
|
||
cfg := DefaultParserConfig()
|
||
p := NewParser(cfg, client)
|
||
result, err := p.Parse(context.Background(), eng)
|
||
if err != nil {
|
||
t.Fatalf("Parse: %v", err)
|
||
}
|
||
|
||
golden := goldenPath("06_table_content.layouts.json")
|
||
got := sectionsToGolden(result.Sections)
|
||
|
||
if updateGolden() {
|
||
writeGolden(t, golden, got)
|
||
t.Logf("golden written: %s (%d sections)", golden, len(got))
|
||
return
|
||
}
|
||
|
||
expected := readGolden[sectionGolden](t, golden)
|
||
if len(expected) != len(got) {
|
||
t.Errorf("section count mismatch: golden=%d got=%d", len(expected), len(got))
|
||
}
|
||
|
||
// Count layout types on both sides.
|
||
goldenTypes := map[string]int{}
|
||
gotTypes := map[string]int{}
|
||
for _, s := range expected {
|
||
goldenTypes[s.LayoutType]++
|
||
}
|
||
for _, s := range got {
|
||
gotTypes[s.LayoutType]++
|
||
}
|
||
for typ, gc := range goldenTypes {
|
||
if gotTypes[typ] != gc {
|
||
t.Errorf("LayoutType %q count mismatch: golden=%d got=%d", typ, gc, gotTypes[typ])
|
||
}
|
||
}
|
||
for typ, gc := range gotTypes {
|
||
if goldenTypes[typ] == 0 {
|
||
t.Errorf("LayoutType %q count mismatch: golden=0 got=%d", typ, gc)
|
||
}
|
||
}
|
||
}
|
||
|
||
// ── Idempotency tests ─────────────────────────────────────────────────
|
||
|
||
// TestIntegration_Idempotency verifies that DeepDoc APIs return consistent
|
||
// results when called multiple times with the same image. This validates
|
||
// that the ML inference is deterministic (or at least semantically stable).
|
||
func TestIntegration_Idempotency(t *testing.T) {
|
||
client := mustConnectDeepDoc(t)
|
||
|
||
// Render a fixture page as the stable input image.
|
||
eng := mustOpenEngine(t, "06_table_content.pdf")
|
||
defer eng.Close()
|
||
pageImg, err := eng.RenderPageImage(0, 216)
|
||
if err != nil {
|
||
t.Fatalf("render page: %v", err)
|
||
}
|
||
|
||
const N = 5
|
||
|
||
t.Run("DLA", func(t *testing.T) {
|
||
var all [][]DLARegion
|
||
for i := 0; i < N; i++ {
|
||
regions, err := client.DLA(context.Background(), pageImg)
|
||
if err != nil {
|
||
t.Fatalf("run %d: %v", i, err)
|
||
}
|
||
all = append(all, regions)
|
||
}
|
||
checkDLAIdempotent(t, all)
|
||
})
|
||
|
||
t.Run("TSR", func(t *testing.T) {
|
||
// Crop a table region from the page for TSR input.
|
||
// Use a fixed crop area (approximate table location in 06_table_content.pdf).
|
||
cropped := cropImageRect(pageImg, 50, 200, 550, 400)
|
||
var all [][]TSRCell
|
||
for i := 0; i < N; i++ {
|
||
cells, err := client.TSR(context.Background(), cropped)
|
||
if err != nil {
|
||
t.Fatalf("run %d: %v", i, err)
|
||
}
|
||
all = append(all, cells)
|
||
}
|
||
checkTSRIdempotent(t, all)
|
||
})
|
||
|
||
t.Run("OCRDetect", func(t *testing.T) {
|
||
var all [][]OCRBox
|
||
for i := 0; i < N; i++ {
|
||
boxes, err := client.OCRDetect(context.Background(), pageImg)
|
||
if err != nil {
|
||
t.Fatalf("run %d: %v", i, err)
|
||
}
|
||
all = append(all, boxes)
|
||
}
|
||
checkOCRDetectIdempotent(t, all)
|
||
})
|
||
|
||
t.Run("OCRRecognize", func(t *testing.T) {
|
||
cropped := cropImageRect(pageImg, 50, 100, 400, 130)
|
||
var all [][]OCRText
|
||
for i := 0; i < N; i++ {
|
||
texts, err := client.OCRRecognize(context.Background(), cropped)
|
||
if err != nil {
|
||
t.Fatalf("run %d: %v", i, err)
|
||
}
|
||
all = append(all, texts)
|
||
}
|
||
checkOCRRecognizeIdempotent(t, all)
|
||
})
|
||
}
|
||
|
||
// cropImageRect copies the pixels of img inside [x0,x1) × [y0,y1) into a new
// RGBA image whose origin is (0,0). The requested rectangle is first clipped
// to img.Bounds().
//
// If nothing remains after clipping — the rectangle lies entirely outside the
// image, or x1 <= x0 or y1 <= y0 — an empty (0×0) image is returned.
// Previously the negative width/height was passed to image.Rect, which swaps
// the corners and produced a blank, non-empty image with a negative origin.
func cropImageRect(img image.Image, x0, y0, x1, y1 int) image.Image {
	// Intersect clamps each edge to the source bounds and collapses to the
	// zero rectangle when the result would be empty.
	region := image.Rectangle{
		Min: image.Pt(x0, y0),
		Max: image.Pt(x1, y1),
	}.Intersect(img.Bounds())

	out := image.NewRGBA(image.Rect(0, 0, region.Dx(), region.Dy()))
	for y := region.Min.Y; y < region.Max.Y; y++ {
		for x := region.Min.X; x < region.Max.X; x++ {
			// Shift source coordinates so the crop starts at (0,0).
			out.Set(x-region.Min.X, y-region.Min.Y, img.At(x, y))
		}
	}
	return out
}
|
||
|
||
// Tolerances used when comparing repeated runs in the idempotency checks.
const (
	coordEpsilon = 1.0  // pixels; largest coordinate difference coordClose accepts
	confEpsilon  = 0.01 // tolerance passed to floatClose for confidence values
)
|
||
|
||
func checkDLAIdempotent(t *testing.T, all [][]DLARegion) {
|
||
t.Helper()
|
||
ref := all[0]
|
||
strictEqual := 0
|
||
for i := 1; i < len(all); i++ {
|
||
if len(all[i]) != len(ref) {
|
||
t.Errorf("run %d: %d regions (run 0: %d) — NOT idempotent", i, len(all[i]), len(ref))
|
||
continue
|
||
}
|
||
strict := true
|
||
for j := range ref {
|
||
if ref[j].Label != all[i][j].Label {
|
||
t.Errorf("run %d region %d: label %q != %q", i, j, all[i][j].Label, ref[j].Label)
|
||
strict = false
|
||
}
|
||
if !coordClose(ref[j].X0, all[i][j].X0) || !coordClose(ref[j].Y0, all[i][j].Y0) ||
|
||
!coordClose(ref[j].X1, all[i][j].X1) || !coordClose(ref[j].Y1, all[i][j].Y1) {
|
||
t.Errorf("run %d region %d: coords differ beyond epsilon", i, j)
|
||
strict = false
|
||
}
|
||
if !floatClose(ref[j].Confidence, all[i][j].Confidence, confEpsilon) {
|
||
strict = false // confidence jitter is acceptable
|
||
}
|
||
}
|
||
if strict {
|
||
strictEqual++
|
||
}
|
||
}
|
||
t.Logf("DLA: %d regions, %d/%d runs strictly equal", len(ref), strictEqual+1, len(all))
|
||
}
|
||
|
||
func checkTSRIdempotent(t *testing.T, all [][]TSRCell) {
|
||
t.Helper()
|
||
ref := all[0]
|
||
strictEqual := 0
|
||
for i := 1; i < len(all); i++ {
|
||
if len(all[i]) != len(ref) {
|
||
t.Errorf("run %d: %d cells (run 0: %d) — NOT idempotent", i, len(all[i]), len(ref))
|
||
continue
|
||
}
|
||
strict := true
|
||
for j := range ref {
|
||
if !coordClose(ref[j].X0, all[i][j].X0) || !coordClose(ref[j].Y0, all[i][j].Y0) ||
|
||
!coordClose(ref[j].X1, all[i][j].X1) || !coordClose(ref[j].Y1, all[i][j].Y1) {
|
||
t.Errorf("run %d cell %d: coords differ beyond epsilon", i, j)
|
||
strict = false
|
||
}
|
||
}
|
||
if strict {
|
||
strictEqual++
|
||
}
|
||
}
|
||
t.Logf("TSR: %d cells, %d/%d runs strictly equal", len(ref), strictEqual+1, len(all))
|
||
}
|
||
|
||
func checkOCRDetectIdempotent(t *testing.T, all [][]OCRBox) {
|
||
t.Helper()
|
||
ref := all[0]
|
||
strictEqual := 0
|
||
for i := 1; i < len(all); i++ {
|
||
if len(all[i]) != len(ref) {
|
||
t.Errorf("run %d: %d boxes (run 0: %d) — NOT idempotent", i, len(all[i]), len(ref))
|
||
continue
|
||
}
|
||
strict := true
|
||
for j := range ref {
|
||
if !coordClose(ref[j].X0, all[i][j].X0) || !coordClose(ref[j].Y0, all[i][j].Y0) {
|
||
strict = false
|
||
}
|
||
}
|
||
if strict {
|
||
strictEqual++
|
||
}
|
||
}
|
||
t.Logf("OCRDetect: %d boxes, %d/%d runs strictly equal", len(ref), strictEqual+1, len(all))
|
||
}
|
||
|
||
func checkOCRRecognizeIdempotent(t *testing.T, all [][]OCRText) {
|
||
t.Helper()
|
||
ref := all[0]
|
||
strictEqual := 0
|
||
for i := 1; i < len(all); i++ {
|
||
if len(all[i]) != len(ref) {
|
||
t.Errorf("run %d: %d texts (run 0: %d) — NOT idempotent", i, len(all[i]), len(ref))
|
||
continue
|
||
}
|
||
strict := true
|
||
for j := range ref {
|
||
if ref[j].Text != all[i][j].Text {
|
||
t.Errorf("run %d text %d: %q != %q — NOT idempotent", i, j, all[i][j].Text, ref[j].Text)
|
||
strict = false
|
||
}
|
||
if !floatClose(ref[j].Confidence, all[i][j].Confidence, confEpsilon) {
|
||
strict = false
|
||
}
|
||
}
|
||
if strict {
|
||
strictEqual++
|
||
}
|
||
}
|
||
t.Logf("OCRRecognize: %d texts, %d/%d runs strictly equal", len(ref), strictEqual+1, len(all))
|
||
}
|
||
|
||
func coordClose(a, b float64) bool {
|
||
d := a - b
|
||
if d < 0 {
|
||
d = -d
|
||
}
|
||
return d <= coordEpsilon
|
||
}
|
||
|
||
// floatClose reports whether the absolute difference between a and b is at
// most eps (inclusive).
func floatClose(a, b, eps float64) bool {
	if a < b {
		return b-a <= eps
	}
	return a-b <= eps
}
|
||
|
||
// ── Alignment Integration Tests ─────────────────────────────────────────
|
||
// Run with: go test -v -run 'TestIntegration_(TableAlign|GarbageLayout|MultiChunk)' -tags=integration -count=1 ./internal/parser/
|
||
|
||
// TestIntegration_TableAlign verifies table text backfill, text-fragment
|
||
// suppression inside table regions, and caption removal — the key alignment
|
||
// fixes from the Python→Go migration.
|
||
func TestIntegration_TableAlign(t *testing.T) {
|
||
client := mustConnectDeepDoc(t)
|
||
eng := mustOpenEngine(t, "18_table_caption.pdf")
|
||
defer eng.Close()
|
||
|
||
cfg := DefaultParserConfig()
|
||
p := NewParser(cfg, client)
|
||
result, err := p.Parse(context.Background(), eng)
|
||
if err != nil {
|
||
t.Fatalf("Parse: %v", err)
|
||
}
|
||
|
||
// Assert 1: No caption sections remain (merged into parent or removed).
|
||
for _, s := range result.Sections {
|
||
if s.LayoutType == "table caption" || s.LayoutType == "figure caption" {
|
||
t.Errorf("caption Section should be removed: layout=%s text=%q", s.LayoutType, s.Text)
|
||
}
|
||
}
|
||
|
||
// Assert 2: Table sections have TSR-structured text (not raw OCR fragments).
|
||
var hasTable bool
|
||
for _, s := range result.Sections {
|
||
if s.LayoutType == "table" && s.TableItem != nil && len(s.TableItem.Rows) > 0 {
|
||
hasTable = true
|
||
// Structured text should contain tabs (\t) for column separation.
|
||
if !strings.Contains(s.Text, "\t") {
|
||
t.Logf("table Section.Text may not be structured: %q", s.Text[:min(80, len(s.Text))])
|
||
}
|
||
break
|
||
}
|
||
}
|
||
if !hasTable {
|
||
t.Log("no table with TSR rows found — may need different PDF layout")
|
||
}
|
||
|
||
t.Logf("Sections: %d, Tables: %d, Figures: %d",
|
||
len(result.Sections), len(result.Tables), len(result.Figures))
|
||
}
|
||
|
||
// TestIntegration_GarbageLayout verifies CID-garbled and garbage-layout
|
||
// (header/footer/reference) boxes are popped from output.
|
||
func TestIntegration_GarbageLayout(t *testing.T) {
|
||
client := mustConnectDeepDoc(t)
|
||
eng := mustOpenEngine(t, "17_garbage_layout.pdf")
|
||
defer eng.Close()
|
||
|
||
cfg := DefaultParserConfig()
|
||
p := NewParser(cfg, client)
|
||
result, err := p.Parse(context.Background(), eng)
|
||
if err != nil {
|
||
t.Fatalf("Parse: %v", err)
|
||
}
|
||
|
||
// Assert: No CID-garbled text survives.
|
||
for _, s := range result.Sections {
|
||
if strings.Contains(s.Text, "(cid:") {
|
||
t.Errorf("CID garbage should be popped: %q", s.Text)
|
||
}
|
||
}
|
||
|
||
// Assert: No header/footer/reference sections in output.
|
||
for _, s := range result.Sections {
|
||
if s.LayoutType == "header" || s.LayoutType == "footer" || s.LayoutType == "reference" {
|
||
t.Logf("garbage layout %q survived with text %q — may be legitimate page decoration",
|
||
s.LayoutType, s.Text[:min(60, len(s.Text))])
|
||
}
|
||
}
|
||
|
||
t.Logf("Sections: %d", len(result.Sections))
|
||
}
|
||
|
||
// TestIntegration_MultiChunk verifies chunked processing for large documents.
|
||
func TestIntegration_MultiChunk(t *testing.T) {
|
||
client := mustConnectDeepDoc(t)
|
||
eng := mustOpenEngine(t, "19_multipage_chunk.pdf")
|
||
defer eng.Close()
|
||
|
||
cfg := DefaultParserConfig()
|
||
cfg.ChunkSize = 10 // small chunks to force multi-chunk path
|
||
p := NewParser(cfg, client)
|
||
result, err := p.Parse(context.Background(), eng)
|
||
if err != nil {
|
||
t.Fatalf("Parse: %v", err)
|
||
}
|
||
|
||
// 52 pages with 10-page chunks → >= 6 chunks.
|
||
if len(result.Sections) == 0 {
|
||
t.Error("multi-chunk should produce sections")
|
||
}
|
||
|
||
t.Logf("52 pages × chunkSize=10: %d sections, %d tables",
|
||
len(result.Sections), len(result.Tables))
|
||
}
|
||
|
||
// TestIntegration_NoRegression runs a few snapshot PDFs and checks basic
|
||
// invariants — no panic, sections produced, no CID garbage.
|
||
func TestIntegration_NoRegression(t *testing.T) {
|
||
client := mustConnectDeepDoc(t)
|
||
|
||
for _, name := range []string{
|
||
"01_english_simple.pdf",
|
||
"02_chinese_simple.pdf",
|
||
"06_table_content.pdf",
|
||
"07_mixed_content.pdf",
|
||
} {
|
||
t.Run(name, func(t *testing.T) {
|
||
eng := mustOpenEngine(t, name)
|
||
defer eng.Close()
|
||
cfg := DefaultParserConfig()
|
||
p := NewParser(cfg, client)
|
||
result, err := p.Parse(context.Background(), eng)
|
||
if err != nil {
|
||
t.Fatalf("Parse: %v", err)
|
||
}
|
||
if len(result.Sections) == 0 {
|
||
t.Error("expected at least 1 section")
|
||
}
|
||
for _, s := range result.Sections {
|
||
if strings.Contains(s.Text, "(cid:") {
|
||
t.Errorf("CID garbage in %s: %q", name, s.Text)
|
||
}
|
||
}
|
||
t.Logf("%s: %d sections", name, len(result.Sections))
|
||
})
|
||
}
|
||
}
|
||
|
||
// TestIntegration_TableRotation verifies that evaluateTableOrientation
|
||
// correctly detects rotation using region-count scoring.
|
||
func TestIntegration_TableRotation(t *testing.T) {
|
||
client := mustConnectDeepDoc(t)
|
||
|
||
t.Run("upright_table", func(t *testing.T) {
|
||
eng := mustOpenEngine(t, "rotate_0.pdf")
|
||
defer eng.Close()
|
||
cfg := DefaultParserConfig()
|
||
p := NewParser(cfg, client)
|
||
result, err := p.Parse(context.Background(), eng)
|
||
if err != nil {
|
||
t.Fatalf("Parse: %v", err)
|
||
}
|
||
if len(result.Sections) == 0 {
|
||
t.Error("expected sections from upright table")
|
||
}
|
||
t.Logf("rotate_0: %d sections, %d tables", len(result.Sections), len(result.Tables))
|
||
})
|
||
|
||
t.Run("rotated_90_table", func(t *testing.T) {
|
||
eng := mustOpenEngine(t, "rotate_90.pdf")
|
||
defer eng.Close()
|
||
cfg := DefaultParserConfig()
|
||
// DeepDoc DLA does not yet correctly annotate boxes on rotated
|
||
// pages (regions and characters are in different coordinate
|
||
// spaces post-rotation). Character extraction and rotation are
|
||
// verified via the charsToBoxes path.
|
||
cfg.SkipOCR = true
|
||
p := NewParser(cfg, client)
|
||
result, err := p.Parse(context.Background(), eng)
|
||
if err != nil {
|
||
t.Fatalf("Parse: %v", err)
|
||
}
|
||
if len(result.Sections) == 0 {
|
||
t.Error("expected sections from rotated table")
|
||
}
|
||
t.Logf("rotate_90: %d sections, %d tables", len(result.Sections), len(result.Tables))
|
||
})
|
||
}
|
||
|
||
// TestIntegration_WordSpacing verifies space insertion between ASCII word
|
||
// characters with a visible gap (Python __img_ocr space insertion).
|
||
func TestIntegration_WordSpacing(t *testing.T) {
|
||
client := mustConnectDeepDoc(t)
|
||
eng := mustOpenEngine(t, "01_english_simple.pdf")
|
||
defer eng.Close()
|
||
|
||
cfg := DefaultParserConfig()
|
||
p := NewParser(cfg, client)
|
||
result, err := p.Parse(context.Background(), eng)
|
||
if err != nil {
|
||
t.Fatalf("Parse: %v", err)
|
||
}
|
||
|
||
// Assert: no "word1word2" concatenation — ASCII words should be
|
||
// space-separated (either by embedded-char spacing or OCR gaps).
|
||
for _, s := range result.Sections {
|
||
run := 0
|
||
for _, r := range s.Text {
|
||
if r >= 'a' && r <= 'z' {
|
||
run++
|
||
if run > 15 {
|
||
t.Logf("long lowercase run (no space): section text=%q",
|
||
s.Text[:min(80, len(s.Text))])
|
||
break
|
||
}
|
||
} else {
|
||
run = 0
|
||
}
|
||
}
|
||
}
|
||
t.Logf("word spacing check: %d sections", len(result.Sections))
|
||
}
|