Files
ragflow/internal/deepdoc/parser/pdf/inference_client_integration_test.go
Jack 98323e7910 Refactor: oss parser go refactor (#16391)
### What problem does this PR solve?

Package refactor and PDF post-processing.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Claude <noreply@anthropic.com>
2026-06-29 18:46:41 +08:00

138 lines
3.6 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//go:build cgo && integration
package parser
import (
"context"
"strings"
"testing"
tbl "ragflow/internal/deepdoc/parser/pdf/table"
pdf "ragflow/internal/deepdoc/parser/pdf/type"
)
// TestIntegration_DeepDoc_TableStructure parses the 06_table_content.pdf
// fixture with the DeepDoc TableBuilder configured and checks that every row
// of every returned table holds at least one non-blank cell.
//
// The test is skipped, not failed, when the parse result contains no tables.
// Column counts and cell contents are not compared against expected values.
func TestIntegration_DeepDoc_TableStructure(t *testing.T) {
	client := mustConnectInferenceClient(t)
	eng := mustOpenEngine(t, "06_table_content.pdf")
	defer eng.Close()

	cfg := pdf.DefaultParserConfig()
	cfg.TableBuilder = tbl.NewDeepDocTableBuilder(client)
	p := NewParser(cfg, client)

	result, err := p.Parse(context.Background(), eng)
	if err != nil {
		t.Fatalf("Parse: %v", err)
	}
	if len(result.Tables) == 0 {
		t.Skip("DLA did not detect any tables in fixture")
	}
	t.Logf("OssDeepDoc produced %d tables", len(result.Tables))

	// The loop variable is intentionally not named tbl: that identifier is
	// the import alias of the table package and would be shadowed here.
	for i, table := range result.Tables {
		t.Logf("table[%d]: %d rows", i, len(table.Rows))
		for ri, row := range table.Rows {
			// A row passes as soon as one cell contains something other
			// than whitespace.
			hasContent := false
			for _, cell := range row {
				if strings.TrimSpace(cell) != "" {
					hasContent = true
					break
				}
			}
			if !hasContent {
				t.Errorf("table[%d] row[%d]: all cells empty", i, ri)
			}
		}
	}
}
// TestIntegration_DeepDoc_TableRows parses the 06_table_content.pdf fixture
// with the DeepDoc TableBuilder configured and checks, for every returned
// table, that it has at least one row and that each row holds at least one
// non-blank cell.
//
// The test is skipped, not failed, when the parse result contains no tables.
// The column count of the first row is logged for diagnostics only; rows are
// not required to share a width.
func TestIntegration_DeepDoc_TableRows(t *testing.T) {
	client := mustConnectInferenceClient(t)
	eng := mustOpenEngine(t, "06_table_content.pdf")
	defer eng.Close()

	cfg := pdf.DefaultParserConfig()
	cfg.TableBuilder = tbl.NewDeepDocTableBuilder(client)
	p := NewParser(cfg, client)

	result, err := p.Parse(context.Background(), eng)
	if err != nil {
		t.Fatalf("Parse: %v", err)
	}
	if len(result.Tables) == 0 {
		t.Skip("DLA did not detect any tables in fixture")
	}

	// The loop variable is intentionally not named tbl: that identifier is
	// the import alias of the table package and would be shadowed here.
	for i, table := range result.Tables {
		if len(table.Rows) == 0 {
			t.Errorf("table[%d]: no rows", i)
			continue
		}
		// Rows[0] is safe to index: the empty case was handled just above.
		t.Logf("table[%d]: %d rows × ~%d cols", i, len(table.Rows), len(table.Rows[0]))
		for ri, row := range table.Rows {
			// A row passes as soon as one cell contains something other
			// than whitespace.
			hasContent := false
			for _, cell := range row {
				if strings.TrimSpace(cell) != "" {
					hasContent = true
					break
				}
			}
			if !hasContent {
				t.Errorf("table[%d] row[%d]: all cells empty", i, ri)
			}
		}
	}
}
// TestIntegration_DeepDoc_Idempotency parses the 06_table_content.pdf fixture
// twice through the same inference client and checks that both runs return
// the same number of tables and, table by table, the same number of rows.
// Cell contents and column counts are not compared.
func TestIntegration_DeepDoc_Idempotency(t *testing.T) {
	inferClient := mustConnectInferenceClient(t)

	// run performs one complete parse with a freshly opened engine and a
	// freshly built parser; only the inference client is shared between runs.
	run := func() *pdf.ParseResult {
		engine := mustOpenEngine(t, "06_table_content.pdf")
		defer engine.Close()

		parserCfg := pdf.DefaultParserConfig()
		parserCfg.TableBuilder = tbl.NewDeepDocTableBuilder(inferClient)
		pp := NewParser(parserCfg, inferClient)

		parsed, err := pp.Parse(context.Background(), engine)
		if err != nil {
			t.Fatalf("Parse: %v", err)
		}
		return parsed
	}

	first := run()
	second := run()

	// Differing table counts make a per-table comparison meaningless, so
	// report the mismatch and stop.
	if n1, n2 := len(first.Tables), len(second.Tables); n1 != n2 {
		t.Errorf("table count mismatch: run1=%d run2=%d", n1, n2)
		return
	}

	for i := range first.Tables {
		rows1 := len(first.Tables[i].Rows)
		rows2 := len(second.Tables[i].Rows)
		if rows1 != rows2 {
			t.Errorf("table[%d] row count differs: run1=%d run2=%d", i,
				rows1, rows2)
		}
	}
}
// TestIntegration_DeepDoc_EmptyPage parses the 01_english_simple.pdf fixture
// with the DeepDoc TableBuilder configured and requires only that Parse
// returns without error; the parse result itself is not inspected.
//
// NOTE(review): the fixture is presumed to contain no tables, but the test
// does not assert that — confirm against the fixture.
func TestIntegration_DeepDoc_EmptyPage(t *testing.T) {
	inferClient := mustConnectInferenceClient(t)
	engine := mustOpenEngine(t, "01_english_simple.pdf")
	defer engine.Close()

	parserCfg := pdf.DefaultParserConfig()
	parserCfg.TableBuilder = tbl.NewDeepDocTableBuilder(inferClient)

	pp := NewParser(parserCfg, inferClient)
	if _, err := pp.Parse(context.Background(), engine); err != nil {
		t.Fatalf("Parse: %v", err)
	}
}