mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
### What problem does this PR solve?

HTTP API based on an ONNX model. Port pdf_parser.py to Go.

### Type of change

- [x] Refactoring
97 lines
2.5 KiB
Go
97 lines
2.5 KiB
Go
//go:build cgo && manual
|
|
|
|
package parser
|
|
|
|
import (
|
|
"encoding/json"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"testing"
|
|
)
|
|
|
|
// TestTableParityWithPythonBoxes reads Python's pre-merge table boxes
|
|
// (with R/C annotations) and runs them through Go's constructTable.
|
|
// If Go produces the same HTML as Python, the pipeline is correct
|
|
// and differences are from the engine layer (pdf_oxide vs pdfplumber).
|
|
func TestTableParityWithPythonBoxes(t *testing.T) {
|
|
boxesDir := filepath.Join("testdata", "output", "py", "noocr", "table_boxes")
|
|
entries, err := os.ReadDir(boxesDir)
|
|
if err != nil {
|
|
t.Skipf("Python table_boxes not found — run dump_py_results.py first: %v", err)
|
|
}
|
|
|
|
for _, e := range entries {
|
|
if e.IsDir() || !strings.HasSuffix(e.Name(), ".json") {
|
|
continue
|
|
}
|
|
name := strings.TrimSuffix(e.Name(), ".json")
|
|
t.Run(name, func(t *testing.T) {
|
|
data, err := os.ReadFile(filepath.Join(boxesDir, e.Name()))
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
var pyBoxes []struct {
|
|
X0, X1, Top, Bottom float64
|
|
Text string
|
|
R, C, H, SP int
|
|
LayoutType string
|
|
}
|
|
if err := json.Unmarshal(data, &pyBoxes); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// Convert to Go TextBox
|
|
boxes := make([]TextBox, len(pyBoxes))
|
|
for i, b := range pyBoxes {
|
|
boxes[i] = TextBox{
|
|
X0: b.X0, X1: b.X1, Top: b.Top, Bottom: b.Bottom,
|
|
Text: b.Text, R: b.R, C: b.C, H: b.H, SP: b.SP,
|
|
LayoutType: b.LayoutType,
|
|
}
|
|
}
|
|
|
|
// Run through Go's constructTable
|
|
item := &TableItem{}
|
|
html := constructTable(nil, boxes, "", item)
|
|
|
|
if html == "" {
|
|
t.Error("constructTable returned empty HTML")
|
|
return
|
|
}
|
|
if !strings.Contains(html, "<table>") {
|
|
t.Error("HTML missing <table> tag")
|
|
}
|
|
|
|
// Verify structure
|
|
trCount := strings.Count(html, "<tr>")
|
|
tdCount := strings.Count(html, "<td>")
|
|
thCount := strings.Count(html, "<th>")
|
|
if trCount == 0 {
|
|
t.Error("no <tr> rows found")
|
|
}
|
|
if tdCount == 0 && thCount == 0 {
|
|
t.Error("no <td> or <th> cells found")
|
|
}
|
|
|
|
// Check no empty rows
|
|
nonEmptyCols := 0
|
|
for _, row := range item.Rows {
|
|
for _, cell := range row {
|
|
if strings.TrimSpace(cell) != "" {
|
|
nonEmptyCols++
|
|
}
|
|
}
|
|
}
|
|
if nonEmptyCols == 0 {
|
|
t.Errorf("all %d cells are empty — R/C path broken", tdCount+thCount)
|
|
}
|
|
|
|
t.Logf("%s: %d rows, %d cells (%d th), %d non-empty",
|
|
name, trCount, tdCount+thCount, thCount, nonEmptyCols)
|
|
t.Logf("HTML snippet: %.200s...", html)
|
|
})
|
|
}
|
|
}
|