Files
ragflow/internal/deepdoc/parser/pdf/layout/boxes_sections_test.go
Jack 98323e7910 Refactor: oss parser go refactor (#16391)
### What problem does this PR solve?

Package refactor and PDF post-processing.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Claude <noreply@anthropic.com>
2026-06-29 18:46:41 +08:00

393 lines
12 KiB
Go

package layout
import (
"testing"
pdf "ragflow/internal/deepdoc/parser/pdf/type"
)
// TestBoxesToSections_CrossPagePositionTag checks that a box whose bottom edge
// lies past the end of its page is reported as spanning two pages, both in the
// rendered position tag and in the structured position.
func TestBoxesToSections_CrossPagePositionTag(t *testing.T) {
	// Page 0 is 267 PDF points tall (800px at zoom=3). The box bottom of 400
	// overshoots it by 133 points, which land on page 1.
	heights := map[int]float64{0: 267.0}
	input := []pdf.TextBox{
		{X0: 100, X1: 500, Top: 200, Bottom: 400, PageNumber: 0, Text: "跨页表格"},
	}

	out := BoxesToSections(input, heights)
	if n := len(out); n != 1 {
		t.Fatalf("expected 1 section, got %d", n)
	}
	sec := out[0]

	// Expected tag per the Python reference: @@1-2\t100.0\t500.0\t200.0\t133.0##.
	// Tags use 1-based page numbers, so pages 0 and 1 are written as "1-2".
	const wantTag = "@@1-2\t100.0\t500.0\t200.0\t133.0##"
	if sec.PositionTag != wantTag {
		t.Errorf("PositionTag: got %q, want '@@1-2\\t100.0\\t500.0\\t200.0\\t133.0##'", sec.PositionTag)
	}

	if n := len(sec.Positions); n != 1 {
		t.Fatalf("expected 1 pdf.Position, got %d", n)
	}
	pos := sec.Positions[0]

	// The structured position keeps zero-based page indices.
	pages := pos.PageNumbers
	if ok := len(pages) == 2 && pages[0] == 0 && pages[1] == 1; !ok {
		t.Errorf("PageNumbers: got %v, want [0, 1]", pages)
	}
	if !(pos.Top == 200 && pos.Bottom == 133) {
		t.Errorf("coords: top=%v (want 200), bottom=%v (want 133 = 400-267)", pos.Top, pos.Bottom)
	}
}
// TestBoxesToSections_SinglePageUnchanged verifies single-page boxes are
// unaffected by the cross-page handling: the tag names a single page and the
// structured position lists exactly that page.
func TestBoxesToSections_SinglePageUnchanged(t *testing.T) {
	boxes := []pdf.TextBox{
		{X0: 50, X1: 200, Top: 10, Bottom: 30, PageNumber: 0, Text: "普通文本"},
	}
	pageHeights := map[int]float64{0: 267.0}
	sections := BoxesToSections(boxes, pageHeights)
	if len(sections) != 1 {
		t.Fatalf("expected 1 section, got %d", len(sections))
	}
	s := sections[0]

	// Single page: tag should be @@1, not @@1-1.
	const wantTag = "@@1\t50.0\t200.0\t10.0\t30.0##"
	if s.PositionTag != wantTag {
		t.Errorf("single-page PositionTag: got %q, want %q", s.PositionTag, wantTag)
	}

	// Stop here when no position was produced; indexing Positions[0] below
	// would otherwise panic and abort the whole test binary.
	if len(s.Positions) != 1 {
		t.Fatalf("expected 1 pdf.Position, got %d", len(s.Positions))
	}
	// Check the page value as well as the count, matching the "want [0]" message.
	if pages := s.Positions[0].PageNumbers; len(pages) != 1 || pages[0] != 0 {
		t.Errorf("single-page PageNumbers: got %v, want [0]", pages)
	}
}
// TestResolvePageSpan_SinglePage checks that a box ending above the bottom of
// its page keeps both its page index and its bottom coordinate.
func TestResolvePageSpan_SinglePage(t *testing.T) {
	heights := map[int]float64{0: 267}
	gotPage, gotBottom := ResolvePageSpan(0, 30, heights)
	if gotPage == 0 && gotBottom == 30 {
		return
	}
	t.Errorf("got toPage=%d bottom=%v, want 0, 30", gotPage, gotBottom)
}
// TestResolvePageSpan_CrossPage checks that a bottom coordinate larger than the
// page height moves the end of the span to the next page, with the page height
// subtracted from the bottom.
func TestResolvePageSpan_CrossPage(t *testing.T) {
	// 400 is past the 267-unit height of page 0, leaving 133 on page 1.
	heights := map[int]float64{0: 267}
	endPage, rest := ResolvePageSpan(0, 400, heights)

	if endPage != 1 {
		t.Errorf("toPage = %d, want 1", endPage)
	}
	if rest != 133 {
		t.Errorf("bottom = %v, want 133 (400-267)", rest)
	}
}
// TestResolvePageSpan_MultiPage checks that the overflow is carried across
// more than one page, with each page's own height subtracted in turn.
func TestResolvePageSpan_MultiPage(t *testing.T) {
	// 600 - 267 (page 0) - 200 (page 1) = 133, which ends on page 2.
	pageHeights := map[int]float64{0: 267, 1: 200, 2: 200}
	lastPage, remainder := ResolvePageSpan(0, 600, pageHeights)

	if lastPage != 2 {
		t.Errorf("toPage = %d, want 2", lastPage)
	}
	if remainder != 133 {
		t.Errorf("bottom = %v, want 133 (600-267-200)", remainder)
	}
}
// TestResolvePageSpan_NilHeights checks that a nil height map leaves the page
// and the bottom coordinate untouched, even when the bottom is large.
func TestResolvePageSpan_NilHeights(t *testing.T) {
	gotPage, gotBottom := ResolvePageSpan(0, 400, nil)
	if unchanged := gotPage == 0 && gotBottom == 400; !unchanged {
		t.Errorf("got toPage=%d bottom=%v, want 0, 400 (nil=no cross-page)", gotPage, gotBottom)
	}
}
// TestResolvePageSpan_ZeroHeightGuard checks that zero-height pages end the
// span resolution instead of being walked through (which must not loop
// forever).
func TestResolvePageSpan_ZeroHeightGuard(t *testing.T) {
	// Heights: page 0 = 200, pages 1 and 2 = 0, page 3 = 300; box bottom = 500.
	// After page 0 there are 300 units left. Page 1 has zero height, so the
	// expected result stops there: toPage = 1 with bottom = 300.
	pageHeights := map[int]float64{0: 200, 1: 0, 2: 0, 3: 300}
	lastPage, remainder := ResolvePageSpan(0, 500, pageHeights)

	if lastPage != 1 {
		t.Errorf("toPage = %d, want 1 (stopped at first zero-height page)", lastPage)
	}
	if remainder != 300 {
		t.Errorf("bottom = %v, want 300 (500-200)", remainder)
	}
}
// TestResolvePageSpan_UnknownNextPage checks the result when the page after the
// starting one has no entry in the height map: the span is extended by exactly
// one page and no further.
func TestResolvePageSpan_UnknownNextPage(t *testing.T) {
	// Only page 0 is known; 500 - 267 = 233 is expected to be placed on page 1.
	pageHeights := map[int]float64{0: 267}
	lastPage, remainder := ResolvePageSpan(0, 500, pageHeights)

	if lastPage != 1 {
		t.Errorf("toPage = %d, want 1 (one fallback extension)", lastPage)
	}
	if remainder != 233 {
		t.Errorf("bottom = %v, want 233 (500-267)", remainder)
	}
}
// TestResolvePageSpan_NegativePh checks that a page with a negative height ends
// the span resolution, even though a valid page follows it in the map.
func TestResolvePageSpan_NegativePh(t *testing.T) {
	// 500 - 200 (page 0) = 300 remains when the negative-height page 1 is reached.
	pageHeights := map[int]float64{0: 200, 1: -10, 2: 200}
	lastPage, remainder := ResolvePageSpan(0, 500, pageHeights)

	if lastPage != 1 {
		t.Errorf("toPage = %d, want 1 (stopped at negative-height page)", lastPage)
	}
	if remainder != 300 {
		t.Errorf("bottom = %v, want 300 (500-200)", remainder)
	}
}
// TestNormalizeSectionPositions_ValidTag checks that a section carrying only a
// single-page position tag receives one structured position parsed from that
// tag, with tag page 1 mapped to page index 0.
func TestNormalizeSectionPositions_ValidTag(t *testing.T) {
	input := []pdf.Section{
		{Text: "test", PositionTag: "@@1\t50.0\t300.0\t200.0\t400.0##", Positions: nil},
	}
	NormalizeSectionPositions(input)

	parsed := input[0].Positions
	if len(parsed) != 1 {
		t.Fatalf("expected 1 Position, got %d", len(parsed))
	}
	pos := parsed[0]

	if pages := pos.PageNumbers; !(len(pages) == 1 && pages[0] == 0) {
		t.Errorf("PageNumbers: got %v, want [0]", pages)
	}

	// Tag field order is left, right, top, bottom.
	coordsMatch := pos.Left == 50.0 && pos.Right == 300.0 && pos.Top == 200.0 && pos.Bottom == 400.0
	if !coordsMatch {
		t.Errorf("coords: got (%.1f, %.1f, %.1f, %.1f), want (50.0, 300.0, 200.0, 400.0)",
			pos.Left, pos.Right, pos.Top, pos.Bottom)
	}
}
// TestNormalizeSectionPositions_EmptyTag checks that a section with neither a
// position tag nor positions is left without positions.
func TestNormalizeSectionPositions_EmptyTag(t *testing.T) {
	input := []pdf.Section{
		{Text: "test", PositionTag: "", Positions: nil},
	}
	NormalizeSectionPositions(input)

	if got := input[0].Positions; len(got) != 0 {
		t.Errorf("expected empty Positions, got %v", got)
	}
}
// TestNormalizeSectionPositions_AlreadyPopulated checks that positions already
// present on a section win over the position tag: the tag describes different
// coordinates, and those must not replace the preset ones.
func TestNormalizeSectionPositions_AlreadyPopulated(t *testing.T) {
	preset := []pdf.Position{{PageNumbers: []int{0}, Left: 10, Right: 20, Top: 30, Bottom: 40}}
	input := []pdf.Section{
		{Text: "test", PositionTag: "@@1\t50.0\t300.0\t200.0\t400.0##", Positions: preset},
	}
	NormalizeSectionPositions(input)

	kept := input[0].Positions
	if n := len(kept); n != 1 {
		t.Fatalf("expected 1 Position, got %d", n)
	}
	// The tag would yield left=50 and top=200; the preset values must survive.
	if first := kept[0]; first.Left != 10 || first.Top != 30 {
		t.Errorf("Positions were overwritten: got left=%.1f top=%.1f, want left=10 top=30", first.Left, first.Top)
	}
}
// TestNormalizeSectionPositions_MultiPageTag checks that a page-range tag
// ("@@1-2") is parsed into a position listing both pages as zero-based
// indices.
func TestNormalizeSectionPositions_MultiPageTag(t *testing.T) {
	sections := []pdf.Section{
		{Text: "test", PositionTag: "@@1-2\t50.0\t300.0\t200.0\t400.0##", Positions: nil},
	}
	NormalizeSectionPositions(sections)

	// Fail cleanly when the tag was not parsed; indexing Positions[0] on an
	// empty slice would panic and abort the whole test binary.
	if len(sections[0].Positions) != 1 {
		t.Fatalf("expected 1 Position, got %d", len(sections[0].Positions))
	}
	p := sections[0].Positions[0]
	if len(p.PageNumbers) != 2 || p.PageNumbers[0] != 0 || p.PageNumbers[1] != 1 {
		t.Errorf("PageNumbers: got %v, want [0, 1]", p.PageNumbers)
	}
}
// TestNormalizeSectionPositions_MixedSlice runs normalization over a slice that
// mixes the three cases: a section that already has positions, a section with
// only a tag, and a section with neither. Each must be handled independently.
func TestNormalizeSectionPositions_MixedSlice(t *testing.T) {
	existing := []pdf.Position{{PageNumbers: []int{2}, Left: 1, Right: 2, Top: 3, Bottom: 4}}
	sections := []pdf.Section{
		{Text: "has_positions", PositionTag: "@@9\t1.0\t2.0\t3.0\t4.0##", Positions: existing},
		{Text: "no_positions", PositionTag: "@@1\t50.0\t300.0\t200.0\t400.0##", Positions: nil},
		{Text: "no_tag", PositionTag: "", Positions: nil},
	}
	NormalizeSectionPositions(sections)

	// has_positions: existing preserved. The length checks come first so that
	// cleared Positions or PageNumbers report a failure instead of panicking.
	if kept := sections[0].Positions; len(kept) == 0 || len(kept[0].PageNumbers) == 0 || kept[0].PageNumbers[0] != 2 {
		t.Errorf("existing Positions were overwritten")
	}
	// no_positions: parsed from tag (tag page 1 maps to page index 0).
	parsed := sections[1].Positions
	if len(parsed) != 1 || len(parsed[0].PageNumbers) == 0 || parsed[0].PageNumbers[0] != 0 {
		t.Errorf("no_positions not normalized: %v", parsed)
	}
	// no_tag: left empty
	if len(sections[2].Positions) != 0 {
		t.Errorf("no_tag should remain empty: %v", sections[2].Positions)
	}
}
// TestNormalizeSectionPositions_NilInput passes nil to the normalizer. There is
// nothing to inspect afterwards; the test passes as long as the call returns.
func TestNormalizeSectionPositions_NilInput(t *testing.T) {
	// A panic here is the only way this test can fail.
	NormalizeSectionPositions(nil)
}
// TestNormalizeSectionPositions_EmptySlice checks that normalizing an empty,
// non-nil slice completes and that the slice is still empty afterwards.
func TestNormalizeSectionPositions_EmptySlice(t *testing.T) {
	empty := make([]pdf.Section, 0)
	NormalizeSectionPositions(empty)
	if len(empty) == 0 {
		return
	}
	t.Error("empty slice should remain empty")
}
// TestSectionsToMarkdown_Title checks that a title section is rendered as a
// "## " heading with a newline before and after it.
func TestSectionsToMarkdown_Title(t *testing.T) {
	const want = "\n## 标题\n"
	input := []pdf.Section{
		{LayoutType: pdf.LayoutTypeTitle, Text: "标题", Image: ""},
	}
	if got := SectionsToMarkdown(input); got != want {
		t.Errorf("got %q, want %q", got, want)
	}
}
// TestSectionsToMarkdown_FigureWithImage checks that a figure section with
// image data is rendered as an inline base64 PNG image, and that the figure's
// text does not appear in the output.
func TestSectionsToMarkdown_FigureWithImage(t *testing.T) {
	const want = "\n![Image](data:image/png;base64,abc123)"
	input := []pdf.Section{
		{LayoutType: pdf.LayoutTypeFigure, Text: "图1", Image: "abc123"},
	}
	if got := SectionsToMarkdown(input); got != want {
		t.Errorf("got %q, want %q", got, want)
	}
}
// TestSectionsToMarkdown_FigureWithoutImage checks that a figure section with
// no image data is rendered as its text followed by a newline.
func TestSectionsToMarkdown_FigureWithoutImage(t *testing.T) {
	const want = "图1\n"
	input := []pdf.Section{
		{LayoutType: pdf.LayoutTypeFigure, Text: "图1", Image: ""},
	}
	if got := SectionsToMarkdown(input); got != want {
		t.Errorf("got %q, want %q", got, want)
	}
}
// TestSectionsToMarkdown_Text checks that a plain text section is rendered as
// its text followed by a single newline.
func TestSectionsToMarkdown_Text(t *testing.T) {
	const want = "普通内容\n"
	input := []pdf.Section{
		{LayoutType: pdf.LayoutTypeText, Text: "普通内容", Image: ""},
	}
	if got := SectionsToMarkdown(input); got != want {
		t.Errorf("got %q, want %q", got, want)
	}
}
// TestSectionsToMarkdown_Table checks that a table section is rendered as its
// text followed by a single newline.
func TestSectionsToMarkdown_Table(t *testing.T) {
	const want = "表格内容\n"
	input := []pdf.Section{
		{LayoutType: pdf.LayoutTypeTable, Text: "表格内容", Image: ""},
	}
	if got := SectionsToMarkdown(input); got != want {
		t.Errorf("got %q, want %q", got, want)
	}
}
// TestSectionsToMarkdown_Mixed checks that sections of different layout types
// are rendered in input order and concatenated without extra separators.
func TestSectionsToMarkdown_Mixed(t *testing.T) {
	// Expected pieces, in order: heading, text line, inline image, table line.
	const want = "\n## 标题\n内容\n\n![Image](data:image/png;base64,img)表格\n"
	input := []pdf.Section{
		{LayoutType: pdf.LayoutTypeTitle, Text: "标题", Image: ""},
		{LayoutType: pdf.LayoutTypeText, Text: "内容", Image: ""},
		{LayoutType: pdf.LayoutTypeFigure, Text: "图", Image: "img"},
		{LayoutType: pdf.LayoutTypeTable, Text: "表格", Image: ""},
	}
	if got := SectionsToMarkdown(input); got != want {
		t.Errorf("got %q, want %q", got, want)
	}
}
// TestSectionsToMarkdown_EmptySlice checks that an empty, non-nil slice of
// sections renders to the empty string.
func TestSectionsToMarkdown_EmptySlice(t *testing.T) {
	if got := SectionsToMarkdown([]pdf.Section{}); got != "" {
		t.Errorf("got %q, want empty string", got)
	}
}
// TestSectionsToMarkdown_NilSlice checks that a nil slice of sections renders
// to the empty string.
func TestSectionsToMarkdown_NilSlice(t *testing.T) {
	if got := SectionsToMarkdown(nil); got != "" {
		t.Errorf("got %q, want empty string", got)
	}
}
// TestSectionsToJSON_SingleSection checks the map produced for one fully
// populated section: the scalar fields are copied under their JSON keys and
// the position is flattened into a [pages, left, right, top, bottom] entry
// under "_pdf_positions".
func TestSectionsToJSON_SingleSection(t *testing.T) {
	sections := []pdf.Section{
		{
			Text:       "测试内容",
			LayoutType: pdf.LayoutTypeText,
			DocTypeKwd: "text",
			Positions:  []pdf.Position{{PageNumbers: []int{0}, Left: 50, Right: 300, Top: 200, Bottom: 400}},
			Image:      "abc123",
		},
	}
	got := SectionsToJSON(sections)
	if len(got) != 1 {
		t.Fatalf("expected 1 item, got %d", len(got))
	}
	m := got[0]
	if m["text"] != "测试内容" {
		t.Errorf("text: got %v", m["text"])
	}
	if m["layout_type"] != "text" {
		t.Errorf("layout_type: got %v", m["layout_type"])
	}
	if m["doc_type_kwd"] != "text" {
		t.Errorf("doc_type_kwd: got %v", m["doc_type_kwd"])
	}
	if m["image"] != "abc123" {
		t.Errorf("image: got %v", m["image"])
	}
	positions, ok := m["_pdf_positions"].([][]any)
	if !ok || len(positions) != 1 {
		t.Fatalf("_pdf_positions not correct type/length: %T %v", m["_pdf_positions"], m["_pdf_positions"])
	}
	pos := positions[0]
	// Five fields are indexed below; a short entry would otherwise panic.
	if len(pos) < 5 {
		t.Fatalf("position entry too short: got %d fields, want at least 5: %v", len(pos), pos)
	}
	// Use the comma-ok form so a wrong element type is reported as a test
	// failure rather than a panic.
	pages, ok := pos[0].([]any)
	if !ok {
		t.Fatalf("pages not correct type: %T %v", pos[0], pos[0])
	}
	if len(pages) != 1 || pages[0] != 0 {
		t.Errorf("pages: got %v", pages)
	}
	if pos[1] != float64(50) || pos[2] != float64(300) || pos[3] != float64(200) || pos[4] != float64(400) {
		t.Errorf("coords: got %v", pos[1:])
	}
}
// TestSectionsToJSON_MultiPage checks that a position spanning two pages keeps
// both page indices, in order, in the first field of its "_pdf_positions"
// entry.
func TestSectionsToJSON_MultiPage(t *testing.T) {
	sections := []pdf.Section{
		{
			Text:      "跨页内容",
			Positions: []pdf.Position{{PageNumbers: []int{0, 1}, Left: 100, Right: 200, Top: 50, Bottom: 300}},
		},
	}
	got := SectionsToJSON(sections)
	// Each step below indexes or type-asserts into the result; every one is
	// checked first so a malformed result fails the test instead of panicking.
	if len(got) != 1 {
		t.Fatalf("expected 1 item, got %d", len(got))
	}
	positions, ok := got[0]["_pdf_positions"].([][]any)
	if !ok || len(positions) != 1 || len(positions[0]) == 0 {
		t.Fatalf("_pdf_positions not correct type/length: %T %v", got[0]["_pdf_positions"], got[0]["_pdf_positions"])
	}
	pages, ok := positions[0][0].([]any)
	if !ok {
		t.Fatalf("pages not correct type: %T %v", positions[0][0], positions[0][0])
	}
	if len(pages) != 2 || pages[0] != 0 || pages[1] != 1 {
		t.Errorf("multi-page pages: got %v, want [0, 1]", pages)
	}
}
// TestSectionsToJSON_EmptySlice checks that an empty, non-nil slice of sections
// converts to a result with no items.
func TestSectionsToJSON_EmptySlice(t *testing.T) {
	if n := len(SectionsToJSON([]pdf.Section{})); n != 0 {
		t.Errorf("got %d items, want 0", n)
	}
}
// TestSectionsToJSON_NilSlice checks that nil input still yields a non-nil
// result, and that the result has no items.
func TestSectionsToJSON_NilSlice(t *testing.T) {
	out := SectionsToJSON(nil)
	// Both conditions are reported independently.
	if out == nil {
		t.Error("expected non-nil slice, got nil")
	}
	if n := len(out); n != 0 {
		t.Errorf("got %d items, want 0", n)
	}
}
// TestCrossPageTableMerge verifies that mergeTablesAcrossPages merges
// two TableItems on consecutive pages with overlapping X positions.
// Python: _extract_table_figure merges cross-page tables by matching layoutno.
// Spanning cells should be annotated with colspan/rowspan in the HTML output.