mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
### What problem does this PR solve? Http API based on onnx model. pdf_parser.py to golang ### Type of change - [x] Refactoring
215 lines
5.6 KiB
Go
215 lines
5.6 KiB
Go
//go:build cgo && manual
|
|
|
|
package parser
|
|
|
|
import (
|
|
"math"
|
|
"os"
|
|
"path/filepath"
|
|
"testing"
|
|
|
|
"ragflow/internal/deepdoc/parser/pdf/pdfoxide"
|
|
)
|
|
|
|
// ── Y-coordinate tests ──────────────────────────────────────────────────
|
|
|
|
// openTestingPDF opens a real PDF by name from testdata/real_pdfs/.
|
|
// Missing fixtures are skipped (soft) rather than failing — these tests
|
|
// require the "manual" build tag and rely on optional fixture files.
|
|
func openTestingPDF(t *testing.T, name string) (PDFEngine, *pdfoxide.Document) {
|
|
t.Helper()
|
|
dir := filepath.Join("testdata", "real_pdfs")
|
|
if _, err := os.Stat(filepath.Join(dir, name)); os.IsNotExist(err) {
|
|
t.Skipf("test PDF not found: %s", name)
|
|
}
|
|
return openPDF(t, dir, name)
|
|
}
|
|
|
|
// TestYCoord_SameLineCharsHaveEqualBottom checks that characters on the same
|
|
// PDF text line (same baseline) have identical Bottom values. Bottom =
|
|
// pageHeight - c.Y is derived from the screen-space baseline, which is the
|
|
// same for all chars on a line regardless of font size or descent.
|
|
func TestYCoord_SameLineCharsHaveEqualBottom(t *testing.T) {
|
|
eng, _ := openTestingPDF(t, "RAG分词召回分析.pdf")
|
|
|
|
chars, err := eng.ExtractChars(0)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if len(chars) == 0 {
|
|
t.Fatal("no chars")
|
|
}
|
|
|
|
lines := groupCharsToLines(chars, false)
|
|
for li, line := range lines {
|
|
if len(line) <= 1 {
|
|
continue
|
|
}
|
|
refBottom := line[0].Bottom
|
|
for _, c := range line[1:] {
|
|
if math.Abs(c.Bottom-refBottom) > 0.1 {
|
|
t.Errorf("line %d: char %q has Bottom=%.2f, expected ~%.2f (delta=%.2f)",
|
|
li, c.Text, c.Bottom, refBottom, c.Bottom-refBottom)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// TestYCoord_BottomEqualsTopPlusHeight checks the invariant bottom = top + height
|
|
// for every character.
|
|
func TestYCoord_BottomEqualsTopPlusHeight(t *testing.T) {
|
|
eng, _ := openTestingPDF(t, "RAG分词召回分析.pdf")
|
|
|
|
for pg := 0; pg < 1; pg++ {
|
|
chars, err := eng.ExtractChars(pg)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
for _, c := range chars {
|
|
h := c.Bottom - c.Top
|
|
expected := c.Top + h
|
|
delta := math.Abs(c.Bottom - expected)
|
|
if delta > 0.01 {
|
|
t.Errorf("char %q: Bottom=%.4f, Top=%.4f+Height=%.4f=%.4f, delta=%v",
|
|
c.Text, c.Bottom, c.Top, h, expected, delta)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// TestYCoord_XUnchanged verifies that X0/X1 are not affected by Y-axis
|
|
// coordinate transformations.
|
|
func TestYCoord_XUnchanged(t *testing.T) {
|
|
eng, doc := openTestingPDF(t, "RAG分词召回分析.pdf")
|
|
|
|
pipelineChars, err := eng.ExtractChars(0)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if len(pipelineChars) == 0 {
|
|
t.Fatal("no chars")
|
|
}
|
|
|
|
raw, err := doc.Inner.ExtractChars(0)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if len(raw) == 0 {
|
|
t.Fatal("no raw chars")
|
|
}
|
|
|
|
type xw struct {
|
|
x0, w float64
|
|
}
|
|
rawSet := make(map[xw]bool, len(raw))
|
|
for _, rc := range raw {
|
|
rawSet[xw{float64(rc.X), float64(rc.Width)}] = true
|
|
}
|
|
|
|
for _, c := range pipelineChars {
|
|
w := c.X1 - c.X0
|
|
if !rawSet[xw{c.X0, w}] {
|
|
t.Logf("pipeline char %q X0=%.1f W=%.1f not in raw set (may be deduped)",
|
|
c.Text, c.X0, w)
|
|
}
|
|
}
|
|
}
|
|
|
|
// TestYCoord_EmptyPageNoPanic ensures extracting chars from an empty page
|
|
// (out of range) returns an error, not panics.
|
|
func TestYCoord_EmptyPageNoPanic(t *testing.T) {
|
|
eng, _ := openTestingPDF(t, "RAG分词召回分析.pdf")
|
|
|
|
_, err := eng.ExtractChars(9999)
|
|
if err == nil {
|
|
t.Error("expected error for out-of-range page, got nil")
|
|
}
|
|
}
|
|
|
|
// TestYCoord_RenderedImageDimensionsMatchPage verifies that rendered page
|
|
// image dimensions are proportional to the page's CropBox.
|
|
func TestYCoord_RenderedImageDimensionsMatchPage(t *testing.T) {
|
|
eng, _ := openTestingPDF(t, "RAG分词召回分析.pdf")
|
|
|
|
img, err := eng.RenderPageImage(0, 72)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if img == nil {
|
|
t.Fatal("rendered image is nil")
|
|
}
|
|
b := img.Bounds()
|
|
if b.Dx() == 0 || b.Dy() == 0 {
|
|
t.Errorf("rendered image has 0 dimensions: %dx%d", b.Dx(), b.Dy())
|
|
}
|
|
}
|
|
|
|
// TestYCoord_MultiPageConsistency verifies that chars across pages all have
|
|
// valid Top values within page bounds.
|
|
func TestYCoord_MultiPageConsistency(t *testing.T) {
|
|
eng, _ := openTestingPDF(t, "20240815-华福证券-海光信息-688041.SH-中报略超预告中值_新增适配AI大模型通义千问_4页_467kb.pdf")
|
|
|
|
pageCount, err := eng.PageCount()
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if pageCount < 2 {
|
|
t.Skip("need multi-page PDF")
|
|
}
|
|
|
|
for pg := 0; pg < pageCount; pg++ {
|
|
chars, err := eng.ExtractChars(pg)
|
|
if err != nil {
|
|
t.Errorf("page %d: ExtractChars: %v", pg, err)
|
|
continue
|
|
}
|
|
if len(chars) == 0 {
|
|
continue
|
|
}
|
|
for _, c := range chars {
|
|
if c.Top < 0 {
|
|
t.Errorf("page %d char %q: Top=%.2f < 0", pg, c.Text, c.Top)
|
|
}
|
|
if c.Bottom <= c.Top {
|
|
t.Errorf("page %d char %q: Bottom=%.2f <= Top=%.2f", pg, c.Text, c.Bottom, c.Top)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// TestYCoord_CropBoxUsedNotMediaBox verifies that chars are positioned using
|
|
// CropBox height, not MediaBox.
|
|
func TestYCoord_CropBoxUsedNotMediaBox(t *testing.T) {
|
|
eng, doc := openTestingPDF(t, "RAG分词召回分析.pdf")
|
|
|
|
info, err := doc.Inner.PageInfo(0)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
if info.CropBox.Height <= 0 {
|
|
t.Skip("test PDF doesn't have CropBox")
|
|
}
|
|
|
|
chars, err := eng.ExtractChars(0)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if len(chars) == 0 {
|
|
t.Fatal("no chars")
|
|
}
|
|
|
|
mediaBoxH := float64(info.Height)
|
|
cropBoxH := float64(info.CropBox.Height)
|
|
|
|
if mediaBoxH == cropBoxH {
|
|
t.Skip("MediaBox == CropBox, no offset to test")
|
|
}
|
|
|
|
for _, c := range chars {
|
|
if c.Top >= cropBoxH {
|
|
t.Errorf("char %q Top=%.2f >= CropBox height %.2f", c.Text, c.Top, cropBoxH)
|
|
}
|
|
}
|
|
}
|