Files
ragflow/internal/deepdoc/parser/pdf/scan_all_pdfs_test.go
Jack 98323e7910 Refactor: oss parser go refactor (#16391)
### What problem does this PR solve?

Package refactoring and PDF post-processing.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Claude <noreply@anthropic.com>
2026-06-29 18:46:41 +08:00

130 lines
3.1 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//go:build cgo && manual
package parser
import (
"context"
"fmt"
"os"
"path/filepath"
pdf "ragflow/internal/deepdoc/parser/pdf/type"
"sort"
"strings"
"testing"
)
// TestScanAllPDFs parses every PDF found in testdata/pdfs/ with the
// OssDeepDoc TSR table builder and prints a per-file summary to stdout:
// section counts grouped by layout type, the dimensions of each table with
// a sample cell, and the first non-table text snippet.
//
// A parse failure on one file is printed and the scan moves on to the next
// file; it does not fail the test. Only a failure to read the PDF directory
// aborts the run. Run with:
//
//	CGO_ENABLED=1 CGO_LDFLAGS="..." go test -tags=manual -run TestScanAllPDFs -v -count=1
func TestScanAllPDFs(t *testing.T) {
	// clip shortens s to at most limit runes, appending "..." when anything
	// was dropped. Cutting on rune boundaries (range yields the byte offset
	// of each rune start) keeps multi-byte UTF-8 text valid, so %q does not
	// print \x.. escapes for a character that was split in half.
	clip := func(s string, limit int) string {
		seen := 0
		for i := range s {
			if seen == limit {
				return s[:i] + "..."
			}
			seen++
		}
		return s
	}

	client := mustConnectInferenceClient(t)
	pdfDir := filepath.Join("testdata", "pdfs")
	entries, err := os.ReadDir(pdfDir)
	if err != nil {
		t.Fatalf("read pdf dir: %v", err)
	}

	// Collect regular files with a .pdf extension (case-insensitive).
	var pdfs []string
	for _, e := range entries {
		if !e.IsDir() && strings.HasSuffix(strings.ToLower(e.Name()), ".pdf") {
			pdfs = append(pdfs, e.Name())
		}
	}
	sort.Strings(pdfs)

	fmt.Printf("\n╔══════════════════════════════════════════════════════════════╗\n")
	fmt.Printf("║ OssDeepDoc PDF Parse Report (%d PDFs) ║\n", len(pdfs))
	fmt.Printf("╚══════════════════════════════════════════════════════════════╝\n")

	for _, name := range pdfs {
		fmt.Printf("\n── %s %s\n", name, strings.Repeat("─", maxint(1, 68-len(name))))

		eng := mustOpenEngine(t, name)
		cfg := pdf.DefaultParserConfig()
		cfg.TableBuilder = NewDeepDocTableBuildService(client)
		p := NewParser(cfg, client)
		result, err := p.Parse(context.Background(), eng)
		// Release the engine before inspecting the result so it is closed on
		// both the success and the error path.
		eng.Close()
		if err != nil {
			fmt.Printf(" ❌ ERROR: %v\n", err)
			continue
		}

		// Sections: count per layout type.
		layoutTypes := map[string]int{}
		for _, s := range result.Sections {
			lt := s.LayoutType
			if lt == "" {
				lt = "(empty)"
			}
			layoutTypes[lt]++
		}
		// Map iteration order is random; sort the keys so that two runs over
		// the same PDFs produce identical, diffable reports.
		kinds := make([]string, 0, len(layoutTypes))
		for lt := range layoutTypes {
			kinds = append(kinds, lt)
		}
		sort.Strings(kinds)
		parts := make([]string, 0, len(kinds))
		for _, lt := range kinds {
			parts = append(parts, fmt.Sprintf("%s:%d", lt, layoutTypes[lt]))
		}
		fmt.Printf(" Sections: %d [%s]\n", len(result.Sections), strings.Join(parts, ", "))

		// Tables: dimensions plus the first non-blank cell as a sample.
		fmt.Printf(" Tables: %d\n", len(result.Tables))
		for i, tbl := range result.Tables {
			nr := len(tbl.Grid)
			nc := 0
			if nr > 0 {
				// Column count is taken from the first row only.
				nc = len(tbl.Grid[0])
			}
			sample := ""
		search:
			for _, row := range tbl.Grid {
				for _, cell := range row {
					if s := strings.TrimSpace(cell.Text); s != "" {
						sample = s
						break search
					}
				}
			}
			fmt.Printf(" [%d] %d×%d %q\n", i, nr, nc, clip(sample, 40))
		}

		// First text snippet: the first section that is neither blank nor a
		// table. Only one snippet is ever reported, so stop at the first hit.
		for _, s := range result.Sections {
			txt := strings.TrimSpace(s.Text)
			if txt == "" || s.LayoutType == "table" {
				continue
			}
			fmt.Printf(" First text: %q\n", clip(txt, 80))
			break
		}
	}
	fmt.Println()
}
// maxint returns the larger of the two integers a and b.
func maxint(a, b int) int {
	larger := b
	if a > larger {
		larger = a
	}
	return larger
}