mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
### What problem does this PR solve? Http API based on onnx model. pdf_parser.py to golang ### Type of change - [x] Refactoring
193 lines
4.7 KiB
Go
193 lines
4.7 KiB
Go
//go:build cgo && manual
|
|
|
|
package parser
|
|
|
|
import (
|
|
"context"
|
|
"os"
|
|
"path/filepath"
|
|
"testing"
|
|
)
|
|
|
|
// TestTableRotation_Integration validates rotation detection with real DeepDoc.
|
|
//
|
|
// Prerequisites:
|
|
// - DeepDoc running at localhost:9390 (or set DEEPDOC_URL)
|
|
// - Test PDF: testdata/pdfs/table_rotation_test.pdf (generated by tools/generate_rotated_table_pdf.py)
|
|
//
|
|
// Run:
|
|
//
|
|
// CGO_CFLAGS="..." CGO_LDFLAGS="..." \
|
|
// go test -tags 'cgo,manual' -run TestTableRotation_Integration -v -count=1
|
|
func TestTableRotation_Integration(t *testing.T) {
|
|
pdfPath := filepath.Join("testdata", "pdfs", "table_rotation_test.pdf")
|
|
if _, err := os.Stat(pdfPath); os.IsNotExist(err) {
|
|
t.Skipf("test PDF not found: %s (run tools/generate_rotated_table_pdf.py first)", pdfPath)
|
|
}
|
|
|
|
baseURL := os.Getenv("DEEPDOC_URL")
|
|
if baseURL == "" {
|
|
baseURL = "http://localhost:9390"
|
|
}
|
|
dd, err := NewDeepDocClient(baseURL)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if !dd.Health() {
|
|
t.Fatalf("DeepDoc not available at %s", baseURL)
|
|
}
|
|
t.Logf("DeepDoc available at %s", baseURL)
|
|
|
|
// Open PDF
|
|
data, err := os.ReadFile(pdfPath)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
eng, err := NewEngine(data)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
defer eng.Close()
|
|
|
|
pageCount, _ := eng.PageCount()
|
|
t.Logf("PDF: %d pages", pageCount)
|
|
|
|
cfg := DefaultParserConfig()
|
|
cfg.ToPage = pageCount - 1
|
|
autoRotate := true
|
|
cfg.AutoRotateTables = &autoRotate
|
|
_ = NewParser(cfg, dd) // verify construction does not panic
|
|
|
|
for pg := 0; pg < pageCount; pg++ {
|
|
pageImg, err := renderPageToImage(eng, pg)
|
|
if err != nil {
|
|
t.Fatalf("render page %d: %v", pg, err)
|
|
}
|
|
|
|
regions, err := dd.DLA(context.Background(), pageImg)
|
|
if err != nil {
|
|
t.Fatalf("DLA page %d: %v", pg, err)
|
|
}
|
|
|
|
tableCount := 0
|
|
for _, r := range regions {
|
|
if r.Label != "table" {
|
|
continue
|
|
}
|
|
tableCount++
|
|
|
|
// Crop table region
|
|
cropped, err := cropImageRegion(pageImg, r)
|
|
if err != nil {
|
|
t.Errorf(" crop table %d: %v", tableCount, err)
|
|
continue
|
|
}
|
|
|
|
// Evaluate rotation
|
|
angle, _, scores := evaluateTableOrientation(context.Background(), cropped, dd)
|
|
t.Logf(" Page %d Table %d: %dx%d, bestAngle=%d°, scores: 0=%.3f 90=%.3f 180=%.3f 270=%.3f",
|
|
pg, tableCount, cropped.Bounds().Dx(), cropped.Bounds().Dy(),
|
|
angle,
|
|
scores[0], scores[90], scores[180], scores[270])
|
|
|
|
// Verify: page 0 should be ~0°, page 1 should be ~90°
|
|
if pg == 0 && angle != 0 {
|
|
t.Errorf("Page 0 normal table: expected 0°, got %d°", angle)
|
|
}
|
|
// Page 1 has the rotated table - expect 90° (or 270° depending on DLA bbox)
|
|
if pg == 1 {
|
|
t.Logf(" NOTE: Page 1 rotated table detected as %d° (expect 90 or 270)", angle)
|
|
|
|
// Verify TSR returns labels (6th element in bbox array).
|
|
testCells, tsrErr := dd.TSR(context.Background(), cropped)
|
|
if tsrErr == nil && len(testCells) > 0 {
|
|
hasLabel := false
|
|
for _, c := range testCells {
|
|
if c.Label != "" {
|
|
hasLabel = true
|
|
break
|
|
}
|
|
}
|
|
if !hasLabel {
|
|
t.Error("TSR returned cells without labels")
|
|
} else {
|
|
t.Logf(" TSR labels OK: %d cells", len(testCells))
|
|
}
|
|
}
|
|
}
|
|
}
|
|
t.Logf("Page %d: %d tables detected", pg, tableCount)
|
|
}
|
|
}
|
|
|
|
// TestTableRotation_Stability runs rotation detection on a sample real PDF
|
|
// and verifies the pipeline doesn't crash. Set BATCH_COUNT to limit.
|
|
func TestTableRotation_Stability(t *testing.T) {
|
|
baseURL := os.Getenv("DEEPDOC_URL")
|
|
if baseURL == "" {
|
|
baseURL = "http://localhost:9390"
|
|
}
|
|
dd, err := NewDeepDocClient(baseURL)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if !dd.Health() {
|
|
t.Fatalf("DeepDoc not available at %s", baseURL)
|
|
}
|
|
|
|
realDir := filepath.Join("testdata", "real_pdfs")
|
|
entries, err := os.ReadDir(realDir)
|
|
if err != nil {
|
|
t.Skipf("no real PDFs: %v", err)
|
|
}
|
|
|
|
count := 0
|
|
maxCount := 3 // sample size
|
|
for _, e := range entries {
|
|
if e.IsDir() || filepath.Ext(e.Name()) != ".pdf" {
|
|
continue
|
|
}
|
|
if count >= maxCount {
|
|
break
|
|
}
|
|
|
|
data, err := os.ReadFile(filepath.Join(realDir, e.Name()))
|
|
if err != nil {
|
|
continue
|
|
}
|
|
eng, err := NewEngine(data)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
pageImg, err := renderPageToImage(eng, 0)
|
|
eng.Close()
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
regions, _ := dd.DLA(context.Background(), pageImg)
|
|
tables := 0
|
|
rotated := 0
|
|
for _, r := range regions {
|
|
if r.Label != "table" {
|
|
continue
|
|
}
|
|
tables++
|
|
cropped, _ := cropImageRegion(pageImg, r)
|
|
if cropped == nil {
|
|
continue
|
|
}
|
|
angle, _, _ := evaluateTableOrientation(context.Background(), cropped, dd)
|
|
if angle != 0 {
|
|
rotated++
|
|
t.Logf(" %s: rotated table detected (angle=%d°)", e.Name(), angle)
|
|
}
|
|
}
|
|
t.Logf(" %s: %d tables, %d rotated", e.Name(), tables, rotated)
|
|
count++
|
|
}
|
|
|
|
t.Logf("Sampled %d real PDFs", count)
|
|
}
|