Files
ragflow/internal/deepdoc/parser/pdf/table_parity_test.go
Jack 304d9e02bb Refactor: migrate pdf_parser.py to golang (#16323)
### What problem does this PR solve?

HTTP API based on an ONNX model.
Migrate pdf_parser.py to Go.

### Type of change

- [x] Refactoring
2026-06-25 20:16:16 +08:00

97 lines
2.5 KiB
Go

//go:build cgo && manual
package parser
import (
"encoding/json"
"os"
"path/filepath"
"strings"
"testing"
)
// TestTableParityWithPythonBoxes feeds Python's pre-merge table boxes
// (already carrying R/C/H/SP annotations) through Go's constructTable and
// sanity-checks the HTML it produces.
//
// The test does not diff the result against Python's HTML. It only checks
// that the output is structurally plausible: a <table> tag, at least one
// row, at least one cell, and at least one non-empty cell in item.Rows.
// Since the input boxes come from Python, a failure here points at Go's
// table construction rather than at the engine layer (pdf_oxide vs
// pdfplumber).
//
// The test is skipped when the fixture directory cannot be read or holds
// no .json files, so a run without fixtures is never reported as a pass.
func TestTableParityWithPythonBoxes(t *testing.T) {
	boxesDir := filepath.Join("testdata", "output", "py", "noocr", "table_boxes")
	entries, err := os.ReadDir(boxesDir)
	if err != nil {
		t.Skipf("Python table_boxes not found — run dump_py_results.py first: %v", err)
	}

	// Counts the fixtures actually exercised; without it an empty directory
	// would run zero subtests and still pass.
	fixtures := 0
	for _, e := range entries {
		if e.IsDir() || !strings.HasSuffix(e.Name(), ".json") {
			continue
		}
		fixtures++
		name := strings.TrimSuffix(e.Name(), ".json")
		path := filepath.Join(boxesDir, e.Name())
		t.Run(name, func(t *testing.T) {
			data, err := os.ReadFile(path)
			if err != nil {
				t.Fatalf("reading %s: %v", path, err)
			}

			// No struct tags: encoding/json falls back to a case-insensitive
			// match between JSON keys and field names.
			// NOTE(review): a snake_case key such as "layout_type" would NOT
			// populate LayoutType — confirm the key spelling used by
			// dump_py_results.py.
			var pyBoxes []struct {
				X0, X1, Top, Bottom float64
				Text                string
				R, C, H, SP         int
				LayoutType          string
			}
			if err := json.Unmarshal(data, &pyBoxes); err != nil {
				t.Fatalf("parsing %s: %v", path, err)
			}

			// Convert the decoded records into Go TextBox values.
			boxes := make([]TextBox, len(pyBoxes))
			for i, b := range pyBoxes {
				boxes[i] = TextBox{
					X0: b.X0, X1: b.X1, Top: b.Top, Bottom: b.Bottom,
					Text: b.Text, R: b.R, C: b.C, H: b.H, SP: b.SP,
					LayoutType: b.LayoutType,
				}
			}

			// Run the boxes through Go's constructTable.
			item := &TableItem{}
			html := constructTable(nil, boxes, "", item)
			if html == "" {
				t.Error("constructTable returned empty HTML")
				return
			}
			if !strings.Contains(html, "<table>") {
				t.Error("HTML missing <table> tag")
			}

			// Verify the basic row/cell structure.
			// NOTE(review): these are exact-match counts, so opening tags that
			// carry attributes (e.g. a colspan) are not counted — confirm
			// whether constructTable ever emits such tags.
			trCount := strings.Count(html, "<tr>")
			tdCount := strings.Count(html, "<td>")
			thCount := strings.Count(html, "<th>")
			if trCount == 0 {
				t.Error("no <tr> rows found")
			}
			if tdCount == 0 && thCount == 0 {
				t.Error("no <td> or <th> cells found")
			}

			// Make sure at least one cell in item.Rows carries text.
			nonEmptyCells := 0
			for _, row := range item.Rows {
				for _, cell := range row {
					if strings.TrimSpace(cell) != "" {
						nonEmptyCells++
					}
				}
			}
			if nonEmptyCells == 0 {
				t.Errorf("all %d cells are empty — R/C path broken", tdCount+thCount)
			}

			t.Logf("%s: %d rows, %d cells (%d th), %d non-empty",
				name, trCount, tdCount+thCount, thCount, nonEmptyCells)
			t.Logf("HTML snippet: %.200s...", html)
		})
	}

	if fixtures == 0 {
		t.Skipf("no .json fixtures in %s — run dump_py_results.py first", boxesDir)
	}
}