mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
### What problem does this PR solve? HTTP API based on an ONNX model; ports pdf_parser.py to Go. ### Type of change - [x] Refactoring
57 lines
1.2 KiB
Go
57 lines
1.2 KiB
Go
//go:build cgo
|
|
|
|
package pdfoxide
|
|
|
|
import (
|
|
"os"
|
|
"path/filepath"
|
|
"testing"
|
|
)
|
|
|
|
func TestPDFPlumber_Basic(t *testing.T) {
|
|
pdfDir := filepath.Join("..", "parser", "testdata", "pdfs")
|
|
path := filepath.Join(pdfDir, "01_english_simple.pdf")
|
|
data, err := os.ReadFile(path)
|
|
if err != nil {
|
|
t.Fatalf("read PDF: %v", err)
|
|
}
|
|
|
|
eng, err := NewEngine(data)
|
|
if err != nil {
|
|
t.Fatalf("NewEngine: %v", err)
|
|
}
|
|
defer eng.Close()
|
|
|
|
pc, _ := eng.PageCount()
|
|
t.Logf("Pages: %d", pc)
|
|
|
|
chars, err := eng.ExtractChars(0)
|
|
if err != nil {
|
|
t.Fatalf("ExtractChars: %v", err)
|
|
}
|
|
t.Logf("Page 0: %d chars extracted", len(chars))
|
|
if len(chars) == 0 {
|
|
t.Error("got 0 chars")
|
|
}
|
|
|
|
// Show first few chars
|
|
for i := 0; i < min(5, len(chars)); i++ {
|
|
t.Logf(" char[%d]: text=%q x0=%.1f x1=%.1f top=%.1f bottom=%.1f font=%q",
|
|
i, chars[i].Text, chars[i].X0, chars[i].X1, chars[i].Top, chars[i].Bottom, chars[i].FontName)
|
|
}
|
|
}
|
|
|
|
func BenchmarkPDFPlumber_ExtractChars(b *testing.B) {
|
|
pdfDir := filepath.Join("..", "parser", "testdata", "pdfs")
|
|
path := filepath.Join(pdfDir, "01_english_simple.pdf")
|
|
data, _ := os.ReadFile(path)
|
|
|
|
eng, _ := NewEngine(data)
|
|
defer eng.Close()
|
|
|
|
b.ResetTimer()
|
|
for i := 0; i < b.N; i++ {
|
|
eng.ExtractChars(0)
|
|
}
|
|
}
|