mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
### What problem does this PR solve? Http API based on onnx model. pdf_parser.py to golang ### Type of change - [x] Refactoring
164 lines
3.9 KiB
Go
164 lines
3.9 KiB
Go
//go:build cgo && manual
|
||
|
||
package parser
|
||
|
||
import (
|
||
"context"
|
||
"fmt"
|
||
"os"
|
||
"path/filepath"
|
||
"sort"
|
||
"strings"
|
||
"testing"
|
||
)
|
||
|
||
// mustConnectOssDeepDoc returns a DeepDocClient pointed at the OSS service.
|
||
func mustConnectOssDeepDoc(t *testing.T) *DeepDocClient {
|
||
t.Helper()
|
||
url := os.Getenv("OSSDEEPDOC_URL")
|
||
if url == "" {
|
||
url = "http://localhost:9390"
|
||
}
|
||
client, err := NewDeepDocClient(url)
|
||
if err != nil {
|
||
t.Fatal(err)
|
||
}
|
||
if !client.Health() {
|
||
t.Fatalf("OssDeepDoc not available at %s", url)
|
||
}
|
||
if client.ModelType() != ModelOSS {
|
||
t.Skipf("DeepDoc at %s is %q, not oss — skipping OSS-specific test", url, client.ModelType())
|
||
}
|
||
return client
|
||
}
|
||
|
||
// mustOpenEngine opens a PDF from testdata/pdfs/ and returns a PDFEngine.
|
||
func mustOpenEngine(t *testing.T, name string) PDFEngine {
|
||
t.Helper()
|
||
pdfPath := filepath.Join("testdata", "pdfs", name)
|
||
data, err := os.ReadFile(pdfPath)
|
||
if err != nil {
|
||
t.Fatalf("read fixture %s: %v", name, err)
|
||
}
|
||
eng, err := NewEngine(data)
|
||
if err != nil {
|
||
t.Fatalf("open engine %s: %v", name, err)
|
||
}
|
||
return eng
|
||
}
|
||
|
||
// TestScanAllPDFs iterates over all PDFs in testdata/pdfs/, parses each
|
||
// with OssDeepDoc TSR, and prints a summary. Run with:
|
||
//
|
||
// CGO_ENABLED=1 CGO_LDFLAGS="..." go test -tags=manual -run TestScanAllPDFs -v -count=1
|
||
func TestScanAllPDFs(t *testing.T) {
|
||
client := mustConnectOssDeepDoc(t)
|
||
|
||
pdfDir := filepath.Join("testdata", "pdfs")
|
||
entries, err := os.ReadDir(pdfDir)
|
||
if err != nil {
|
||
t.Fatalf("read pdf dir: %v", err)
|
||
}
|
||
|
||
var pdfs []string
|
||
for _, e := range entries {
|
||
if !e.IsDir() && strings.HasSuffix(strings.ToLower(e.Name()), ".pdf") {
|
||
pdfs = append(pdfs, e.Name())
|
||
}
|
||
}
|
||
sort.Strings(pdfs)
|
||
|
||
fmt.Printf("\n╔══════════════════════════════════════════════════════════════╗\n")
|
||
fmt.Printf("║ OssDeepDoc PDF Parse Report (%d PDFs) ║\n", len(pdfs))
|
||
fmt.Printf("╚══════════════════════════════════════════════════════════════╝\n")
|
||
|
||
for _, name := range pdfs {
|
||
fmt.Printf("\n── %s %s\n", name, strings.Repeat("─", maxint(1, 68-len(name))))
|
||
|
||
eng := mustOpenEngine(t, name)
|
||
cfg := DefaultParserConfig()
|
||
cfg.TableBuilder = NewOssDeepDocService(client)
|
||
p := NewParser(cfg, client)
|
||
result, err := p.Parse(context.Background(), eng)
|
||
eng.Close()
|
||
if err != nil {
|
||
fmt.Printf(" ❌ ERROR: %v\n", err)
|
||
continue
|
||
}
|
||
|
||
// Sections.
|
||
nSections := len(result.Sections)
|
||
layoutTypes := map[string]int{}
|
||
for _, s := range result.Sections {
|
||
lt := s.LayoutType
|
||
if lt == "" {
|
||
lt = "(empty)"
|
||
}
|
||
layoutTypes[lt]++
|
||
}
|
||
fmt.Printf(" Sections: %d [", nSections)
|
||
first := true
|
||
for lt, cnt := range layoutTypes {
|
||
if !first {
|
||
fmt.Print(", ")
|
||
}
|
||
fmt.Printf("%s:%d", lt, cnt)
|
||
first = false
|
||
}
|
||
fmt.Println("]")
|
||
|
||
// Tables.
|
||
nTables := len(result.Tables)
|
||
fmt.Printf(" Tables: %d\n", nTables)
|
||
for i, tbl := range result.Tables {
|
||
nr := len(tbl.Grid)
|
||
nc := 0
|
||
if nr > 0 {
|
||
nc = len(tbl.Grid[0])
|
||
}
|
||
sample := ""
|
||
for _, row := range tbl.Grid {
|
||
for _, cell := range row {
|
||
s := strings.TrimSpace(cell.Text)
|
||
if s != "" {
|
||
sample = s
|
||
goto found
|
||
}
|
||
}
|
||
}
|
||
found:
|
||
if len(sample) > 40 {
|
||
sample = sample[:40] + "..."
|
||
}
|
||
fmt.Printf(" [%d] %d×%d %q\n", i, nr, nc, sample)
|
||
}
|
||
|
||
// First text snippet.
|
||
textLen := 0
|
||
for _, s := range result.Sections {
|
||
txt := strings.TrimSpace(s.Text)
|
||
if txt == "" || s.LayoutType == "table" {
|
||
continue
|
||
}
|
||
if textLen == 0 {
|
||
if len(txt) > 80 {
|
||
txt = txt[:80] + "..."
|
||
}
|
||
fmt.Printf(" First text: %q\n", txt)
|
||
}
|
||
textLen += len(txt)
|
||
if textLen > 160 {
|
||
break
|
||
}
|
||
}
|
||
}
|
||
fmt.Println()
|
||
}
|
||
|
||
func maxint(a, b int) int {
|
||
if a > b {
|
||
return a
|
||
}
|
||
return b
|
||
}
|