mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
### What problem does this PR solve?

Provides an HTTP API backed by an ONNX model and ports pdf_parser.py to Go.

### Type of change

- [x] Refactoring
216 lines
6.4 KiB
Go
216 lines
6.4 KiB
Go
package parser
|
||
|
||
import (
|
||
"strings"
|
||
"testing"
|
||
)
|
||
|
||
func TestOssDeepDocService_GroupCells_Basic4x5(t *testing.T) {
|
||
b := &OssDeepDocService{}
|
||
|
||
cells := buildOSSCells(4, 5, 0, 0, 500, 200)
|
||
grid := b.GroupCells(cells)
|
||
|
||
if len(grid) != 4 {
|
||
t.Fatalf("expected 4 rows, got %d", len(grid))
|
||
}
|
||
for i, row := range grid {
|
||
if len(row) != 5 {
|
||
t.Fatalf("row %d: expected 5 cols, got %d", i, len(row))
|
||
}
|
||
}
|
||
}
|
||
|
||
func TestOssDeepDocService_GroupCells_Coords(t *testing.T) {
|
||
b := &OssDeepDocService{}
|
||
|
||
cells := buildOSSCells(2, 2, 0, 0, 200, 100)
|
||
grid := b.GroupCells(cells)
|
||
|
||
// grid[0][0] = row[0] × col[0]
|
||
if grid[0][0].X0 != 0 || grid[0][0].Y0 != 0 {
|
||
t.Errorf("grid[0][0] pos: got (%.0f,%.0f), want (0,0)", grid[0][0].X0, grid[0][0].Y0)
|
||
}
|
||
if grid[0][0].X1 != 100 || grid[0][0].Y1 != 50 {
|
||
t.Errorf("grid[0][0] size: got (%.0f,%.0f), want (100,50)", grid[0][0].X1, grid[0][0].Y1)
|
||
}
|
||
|
||
// grid[1][1] = row[1] × col[1]
|
||
if grid[1][1].X0 != 100 || grid[1][1].Y0 != 50 {
|
||
t.Errorf("grid[1][1] pos: got (%.0f,%.0f), want (100,50)", grid[1][1].X0, grid[1][1].Y0)
|
||
}
|
||
if grid[1][1].X1 != 200 || grid[1][1].Y1 != 100 {
|
||
t.Errorf("grid[1][1] size: got (%.0f,%.0f), want (200,100)", grid[1][1].X1, grid[1][1].Y1)
|
||
}
|
||
}
|
||
|
||
func TestOssDeepDocService_GroupCells_HeaderPropagation(t *testing.T) {
|
||
b := &OssDeepDocService{}
|
||
|
||
// 3 rows: header(Y=0-50) should map to row 0
|
||
cells := []TSRCell{
|
||
{X0: 0, Y0: 0, X1: 200, Y1: 150, Label: "table"},
|
||
{X0: 0, Y0: 0, X1: 200, Y1: 50, Label: "table row"},
|
||
{X0: 0, Y0: 50, X1: 200, Y1: 100, Label: "table row"},
|
||
{X0: 0, Y0: 100, X1: 200, Y1: 150, Label: "table row"},
|
||
{X0: 0, Y0: 0, X1: 100, Y1: 150, Label: "table column"},
|
||
{X0: 100, Y0: 0, X1: 200, Y1: 150, Label: "table column"},
|
||
{X0: 0, Y0: 0, X1: 200, Y1: 50, Label: "table column header"},
|
||
}
|
||
|
||
grid := b.GroupCells(cells)
|
||
if len(grid) != 3 {
|
||
t.Fatalf("expected 3 rows, got %d", len(grid))
|
||
}
|
||
|
||
// Row 0 should have header labels.
|
||
for c := range grid[0] {
|
||
if grid[0][c].Label != "table column header" {
|
||
t.Errorf("grid[0][%d].Label = %q, want 'table column header'", c, grid[0][c].Label)
|
||
}
|
||
}
|
||
|
||
// Row 1 should have empty labels (data rows).
|
||
for c := range grid[1] {
|
||
if grid[1][c].Label != "" {
|
||
t.Errorf("grid[1][%d].Label = %q, want empty", c, grid[1][c].Label)
|
||
}
|
||
}
|
||
}
|
||
|
||
func TestOssDeepDocService_GroupCells_SpanInjection(t *testing.T) {
|
||
b := &OssDeepDocService{}
|
||
|
||
// 2×3 table, spanning cell covers cols 0-1 in row 0
|
||
cells := []TSRCell{
|
||
{X0: 0, Y0: 0, X1: 300, Y1: 100, Label: "table"},
|
||
{X0: 0, Y0: 0, X1: 300, Y1: 50, Label: "table row"},
|
||
{X0: 0, Y0: 50, X1: 300, Y1: 100, Label: "table row"},
|
||
{X0: 0, Y0: 0, X1: 100, Y1: 100, Label: "table column"},
|
||
{X0: 100, Y0: 0, X1: 200, Y1: 100, Label: "table column"},
|
||
{X0: 200, Y0: 0, X1: 300, Y1: 100, Label: "table column"},
|
||
{X0: 0, Y0: 0, X1: 200, Y1: 50, Label: "table spanning cell"},
|
||
}
|
||
|
||
grid := b.GroupCells(cells)
|
||
if len(grid) != 2 || len(grid[0]) != 3 {
|
||
t.Fatalf("expected 2×3 grid, got %d×%d", len(grid), len(grid[0]))
|
||
}
|
||
|
||
// The spanning cell at [0,0] should have Label "table spanning cell"
|
||
// and its bbox should cover the full span (X=0-200).
|
||
spanCell := grid[0][0]
|
||
if !strings.Contains(strings.ToLower(spanCell.Label), "spanning") {
|
||
t.Errorf("grid[0][0].Label = %q, want label containing 'spanning'", spanCell.Label)
|
||
}
|
||
if spanCell.X0 != 0 || spanCell.X1 != 200 {
|
||
t.Errorf("grid[0][0] X range = (%.0f,%.0f), want (0,200)", spanCell.X0, spanCell.X1)
|
||
}
|
||
|
||
// grid[0][1] should be covered (bbox zeroed).
|
||
if !isZeroCell(grid[0][1]) {
|
||
t.Errorf("grid[0][1] should be covered (zero bbox), got (%.0f,%.0f,%.0f,%.0f)",
|
||
grid[0][1].X0, grid[0][1].Y0, grid[0][1].X1, grid[0][1].Y1)
|
||
}
|
||
|
||
// grid[0][2] should be normal (not covered by span).
|
||
if isZeroCell(grid[0][2]) {
|
||
t.Error("grid[0][2] should NOT be covered")
|
||
}
|
||
}
|
||
|
||
func TestOssDeepDocService_GroupCells_IrregularSize(t *testing.T) {
|
||
b := &OssDeepDocService{}
|
||
cells := buildOSSCells(3, 2, 0, 0, 200, 120)
|
||
grid := b.GroupCells(cells)
|
||
|
||
if len(grid) != 3 {
|
||
t.Fatalf("expected 3 rows, got %d", len(grid))
|
||
}
|
||
if len(grid[0]) != 2 {
|
||
t.Fatalf("expected 2 cols, got %d", len(grid[0]))
|
||
}
|
||
}
|
||
|
||
func TestOssDeepDocService_GroupCells_EmptyInput(t *testing.T) {
|
||
b := &OssDeepDocService{}
|
||
grid := b.GroupCells(nil)
|
||
if len(grid) != 0 {
|
||
t.Errorf("expected empty grid, got %d rows", len(grid))
|
||
}
|
||
}
|
||
|
||
func TestOssDeepDocService_GroupCells_NoRows(t *testing.T) {
|
||
b := &OssDeepDocService{}
|
||
// Only a "table" cell, no row cells.
|
||
cells := []TSRCell{
|
||
{X0: 0, Y0: 0, X1: 500, Y1: 200, Label: "table"},
|
||
}
|
||
grid := b.GroupCells(cells)
|
||
if len(grid) != 0 {
|
||
t.Errorf("expected empty grid without row cells, got %d rows", len(grid))
|
||
}
|
||
}
|
||
|
||
func TestOssDeepDocService_GroupCells_NoColumns(t *testing.T) {
|
||
b := &OssDeepDocService{}
|
||
// Table + rows but no column cells → each row gets 1 wide column.
|
||
cells := []TSRCell{
|
||
{X0: 0, Y0: 0, X1: 500, Y1: 100, Label: "table"},
|
||
{X0: 0, Y0: 0, X1: 500, Y1: 50, Label: "table row"},
|
||
{X0: 0, Y0: 50, X1: 500, Y1: 100, Label: "table row"},
|
||
}
|
||
grid := b.GroupCells(cells)
|
||
if len(grid) != 2 {
|
||
t.Fatalf("expected 2 rows, got %d", len(grid))
|
||
}
|
||
if len(grid[0]) != 1 {
|
||
t.Errorf("expected 1 col (default wide column), got %d", len(grid[0]))
|
||
}
|
||
}
|
||
|
||
// ── helpers ──────────────────────────────────────────────────────────
|
||
|
||
// buildOSSCells constructs a set of OSS-style structural cells for
|
||
// an R×C table with the given overall bounding box.
|
||
func buildOSSCells(rows, cols int, x0, y0, x1, y1 float64) []TSRCell {
|
||
rowH := (y1 - y0) / float64(rows)
|
||
colW := (x1 - x0) / float64(cols)
|
||
|
||
cells := []TSRCell{
|
||
{X0: x0, Y0: y0, X1: x1, Y1: y1, Label: "table"},
|
||
}
|
||
|
||
for r := 0; r < rows; r++ {
|
||
cells = append(cells, TSRCell{
|
||
X0: x0, Y0: y0 + float64(r)*rowH,
|
||
X1: x1, Y1: y0 + float64(r+1)*rowH,
|
||
Label: "table row",
|
||
})
|
||
}
|
||
for c := 0; c < cols; c++ {
|
||
cells = append(cells, TSRCell{
|
||
X0: x0 + float64(c)*colW, Y0: y0,
|
||
X1: x0 + float64(c+1)*colW, Y1: y1,
|
||
Label: "table column",
|
||
})
|
||
}
|
||
|
||
return cells
|
||
}
|
||
|
||
// isZeroCell reports whether a cell has its bbox zeroed (covered by a span).
|
||
func isZeroCell(c TSRCell) bool {
|
||
return c.X0 == 0 && c.Y0 == 0 && c.X1 == 0 && c.Y1 == 0
|
||
}
|
||
|
||
// hasLabel reports whether any cell in a row has a label containing substr.
|
||
func hasLabel(row []TSRCell, substr string) bool {
|
||
for _, c := range row {
|
||
if strings.Contains(strings.ToLower(c.Label), strings.ToLower(substr)) {
|
||
return true
|
||
}
|
||
}
|
||
return false
|
||
}
|