")
+ totalTdTh := strings.Count(html, " 1.5 × median_height ≈ 15pt).
+ // Each figure text box → separate section in result.Sections.
+ // CollectFigures collects them into result.Figures but doesn't re-insert.
+
+ var figureSections []Section
+ for _, s := range result.Sections {
+ if s.LayoutType == "figure" {
+ figureSections = append(figureSections, s)
+ }
+ }
+
+ // Assert 1: Python expects exactly 1 consolidated figure section.
+ // Go currently produces 2 (one per unmerged text box) — this FAILS.
+ if len(figureSections) != 1 {
+ t.Errorf("FIGURE INSERTION BUG: expected 1 consolidated figure section (Python insert_table_figures), got %d. Go does not consolidate figure text boxes into a single block.", len(figureSections))
+ }
+
+ // Assert 2: The single figure section must contain BOTH text fragments.
+ if len(figureSections) == 1 {
+ combined := figureSections[0].Text
+ if !strings.Contains(combined, "架构图") || !strings.Contains(combined, "系统模块") {
+ t.Errorf("FIGURE INSERTION BUG: figure section text=%q should contain both fragments. Python merges all figure-region text.", combined)
+ }
+ }
+
+ t.Logf("figure sections in Sections: %d", len(figureSections))
+ t.Logf("result.Figures count: %d", len(result.Figures))
+ t.Logf("result.Sections total: %d", len(result.Sections))
+ for i, s := range result.Sections {
+ t.Logf(" section[%d] layout=%q text=%q", i, s.LayoutType, s.Text)
+ }
+}
+
+// =============================================================================
+// Issue 3: Multi-page table merging
+// Python's _extract_table_figure merges tables with same layoutno across
+// consecutive pages (gap ≤ 1 page, Y-dis ≤ 23× median height).
+// Go's extractTableAndReplace does NOT merge tables across pages.
+// =============================================================================
+
+ // TestExtractTableAndReplace_NoCrossPageMerge feeds extractTableAndReplace a
+ // table split across pages 0 and 1 (same layoutno) and requires exactly one
+ // HTML table in the output, matching the merged result described for Python.
+ // Any other count (0, 2, ...) is reported as a failure.
+ func TestExtractTableAndReplace_NoCrossPageMerge(t *testing.T) {
+ 	// Simulate a table spanning pages 0 and 1.
+ 	// Python would merge these because: same layoutno, consecutive pages,
+ 	// Y-distance ≤ 23× median_height.
+ 	boxes := []TextBox{
+ 		{X0: 10, X1: 200, Top: 500, Bottom: 530, Text: "续表内容", LayoutType: "table", PageNumber: 0, LayoutNo: "0"},
+ 		{X0: 10, X1: 200, Top: 50, Bottom: 80, Text: "表尾内容", LayoutType: "table", PageNumber: 1, LayoutNo: "0"},
+ 	}
+
+ 	// Two separate TableItems — one per page. Python would merge these
+ 	// before insert_table_figures.
+ 	tables := []TableItem{
+ 		{
+ 			Cells:     []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "Page0", Label: "table row"}},
+ 			Positions: []Position{{PageNumbers: []int{0}, Left: 0, Right: 300, Top: 500, Bottom: 530}},
+ 			Scale:     1.0,
+ 		},
+ 		{
+ 			Cells:     []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "Page1", Label: "table row"}},
+ 			Positions: []Position{{PageNumbers: []int{1}, Left: 0, Right: 300, Top: 50, Bottom: 80}},
+ 			Scale:     1.0,
+ 		},
+ 	}
+
+ 	result := extractTableAndReplace(boxes, tables)
+
+ 	// Count boxes whose text carries an HTML table. The needle must be
+ 	// non-empty: strings.Contains(s, "") is true for every string.
+ 	tableCount := 0
+ 	for _, b := range result {
+ 		if strings.Contains(b.Text, "<table") {
+ 			tableCount++
+ 		}
+ 	}
+ 	// Python would produce 1 merged table with cells from both pages.
+ 	if tableCount != 1 {
+ 		t.Errorf("CROSS-PAGE TABLE MERGE BUG: got %d HTML tables across pages, want 1. Python would merge same-layoutno tables on consecutive pages into 1 consolidated table.", tableCount)
+ 	}
+ 	t.Logf("table HTML boxes: %d (Python would merge into 1)", tableCount)
+ }
+
+// =============================================================================
+// Issue 3a: nomerge_lout_no — don't merge tables separated by captions
+// Python's _extract_table_figure tracks nomerge_lout_no: when a table box
+// is followed by a caption/title/reference, the table's key is added to
+// nomerge_lout_no. Later, cross-page merge skips tables in nomerge_lout_no.
+//
+// Example:
+// Page 0: table "0-table-3" → caption "表1:..." → table "0-table-4"
+// Page 1: table "1-table-3" (same layoutNo)
+// → Page 0's table-3 should NOT merge with Page 1's table-3,
+// because the caption on page 0 indicates the table ended.
+ // → Go models this with TableItem.NoMerge; mergeTablesAcrossPages must skip such tables.
+// =============================================================================
+
+ // TestMergeTablesAcrossPages_NomergeAfterCaption_Missing verifies that
+ // mergeTablesAcrossPages leaves consecutive-page tables separate when the
+ // first one is flagged NoMerge (the Go counterpart of Python's
+ // nomerge_lout_no set).
+ func TestMergeTablesAcrossPages_NomergeAfterCaption_Missing(t *testing.T) {
+ 	// Simulate: page 0 has table at top, followed by a caption,
+ 	// then another table. Page 1 has the same-layoutNo table continuing.
+ 	// In Python, page 0's first table goes into nomerge_lout_no because
+ 	// the next box is a caption → no cross-page merge for that table group.
+ 	tables := []TableItem{
+ 		{
+ 			Cells: []TSRCell{{Text: "Page0-first", Label: "table row"}},
+ 			Positions: []Position{{
+ 				PageNumbers: []int{0},
+ 				Left:        0, Right: 300,
+ 				Top: 0, Bottom: 50,
+ 			}},
+ 			NoMerge: true, // Set when caption follows this table on the page
+ 		},
+ 		{
+ 			Cells: []TSRCell{{Text: "Page1-cont", Label: "table row"}},
+ 			Positions: []Position{{
+ 				PageNumbers: []int{1},
+ 				Left:        0, Right: 300,
+ 				Top: 0, Bottom: 50,
+ 			}},
+ 		},
+ 	}
+
+ 	result := mergeTablesAcrossPages(tables, nil)
+
+ 	// Verify NoMerge prevents cross-page merging. Fatalf stops the test here
+ 	// so the success message below is never logged alongside a failure.
+ 	if len(result) != 2 {
+ 		t.Fatalf("NOMERGE BUG: expected 2 separate table groups, got %d.", len(result))
+ 	}
+ 	t.Log("NoMerge flag correctly prevents cross-page merge.")
+ }
+
+// =============================================================================
+// Issue 3b: insert position — min_rectangle_distance vs anchor
+// Python's insert_table_figures uses min_rectangle_distance to find the
+// spatially nearest text box and inserts the table/figure next to it.
+// Go's extractTableAndReplace uses the first replaced table box index as
+// the anchor (insert position).
+//
+// When the DLA table region extends beyond the anchor box's bottom and
+// overlaps a text box below the table, Python puts the table next to that
+// overlapping text box (distance=0); Go puts it at the anchor position.
+// =============================================================================
+
+ // TestExtractTableAndReplace_InsertionPosition_DistanceBug exposes that
+ // extractTableAndReplace uses the first table box as anchor, rather than
+ // finding the spatially nearest text box like Python. The test requires the
+ // table box to sit immediately after L0 in the output.
+ func TestExtractTableAndReplace_InsertionPosition_DistanceBug(t *testing.T) {
+ 	// Two text boxes above the table: L0 (left, near table) and R0 (right, far).
+ 	// Python: nearest to table is L0 (dx=0, dy=70). L0 bottom=30 < table top=100
+ 	// → insert AFTER L0. Result: [L0, table, R0, R1, L2].
+ 	// Go: anchor = first table box (index 2). Result: [L0, R0, table, R1, L2].
+ 	// The table is one position off.
+ 	boxes := []TextBox{
+ 		{X0: 10, X1: 100, Top: 10, Bottom: 30, Text: "L0", LayoutType: "text", PageNumber: 0},
+ 		{X0: 300, X1: 400, Top: 10, Bottom: 30, Text: "R0", LayoutType: "text", PageNumber: 0},
+ 		{X0: 10, X1: 100, Top: 100, Bottom: 130, Text: "table", LayoutType: "table", PageNumber: 0},
+ 		{X0: 300, X1: 400, Top: 100, Bottom: 130, Text: "R1", LayoutType: "text", PageNumber: 0},
+ 		{X0: 10, X1: 100, Top: 250, Bottom: 270, Text: "L2", LayoutType: "text", PageNumber: 0},
+ 	}
+
+ 	tables := []TableItem{{
+ 		Cells:      []TSRCell{{Text: "cell", Label: "table row"}},
+ 		Positions:  []Position{{Left: 10, Right: 100, Top: 100, Bottom: 130, PageNumbers: []int{0}}},
+ 		Scale:      1.0,
+ 		RegionLeft: 10, RegionRight: 100, RegionTop: 100, RegionBottom: 130,
+ 	}}
+
+ 	result := extractTableAndReplace(boxes, tables)
+
+ 	// Find L0 and table positions.
+ 	l0Idx, tableIdx := -1, -1
+ 	for i, b := range result {
+ 		if strings.TrimSpace(b.Text) == "L0" {
+ 			l0Idx = i
+ 		}
+ 		if b.LayoutType == "table" {
+ 			tableIdx = i
+ 		}
+ 	}
+
+ 	// Without this guard a missing L0 (-1) and a table at index 0 would
+ 	// satisfy tableIdx == l0Idx+1 and the test would pass by accident.
+ 	if l0Idx < 0 || tableIdx < 0 {
+ 		t.Fatalf("expected both L0 and a table box in result, got l0Idx=%d tableIdx=%d", l0Idx, tableIdx)
+ 	}
+
+ 	// BUG: table should immediately follow L0 (nearest neighbor, insert_after).
+ 	// Python: min_rectangle_distance → L0 nearest (dx=0, dy=70), L0 above table
+ 	// → insert_at+1 → table right after L0.
+ 	// Go: anchor = first table box index → table at original table box position.
+ 	if tableIdx != l0Idx+1 {
+ 		t.Errorf("INSERTION POSITION BUG: table (idx=%d) should immediately follow L0 (idx=%d). "+
+ 			"Python's min_rectangle_distance finds L0 as nearest text box and inserts table after it. "+
+ 			"Go anchors at first table box position (between R0 and R1).", tableIdx, l0Idx)
+ 	}
+ 	t.Logf("L0 at idx=%d, table at idx=%d", l0Idx, tableIdx)
+ 	t.Log("Fix: replace first-replaced-box anchor with min_rectangle_distance nearest-neighbor (Python pdf_parser.py:1608-1655).")
+ }
+
+// =============================================================================
+// Issue 4: page_cum_height coordinate system
+// Python tracks cumulative page image heights for cross-page position tags
+// and image cropping. Go uses per-page coordinates only.
+// =============================================================================
+
+ // TestBoxesToSections_PerPageCoordinates confirms position tags use
+ // page-relative coordinates. Python's _line_tag also produces local
+ // coordinates (subtracts page_cum_height). The page number differentiates
+ // pages; page_cum_height is an internal implementation detail.
+ //
+ // Two boxes with identical geometry on pages 0 and 1 must yield sections
+ // whose first Position has the same Top and Bottom.
+ func TestBoxesToSections_PerPageCoordinates(t *testing.T) {
+ 	boxes := []TextBox{
+ 		{X0: 10, X1: 100, Top: 40, Bottom: 60, Text: "Page 0 text", LayoutType: "text", PageNumber: 0},
+ 		{X0: 10, X1: 100, Top: 40, Bottom: 60, Text: "Page 1 text", LayoutType: "text", PageNumber: 1},
+ 	}
+ 	sections := boxesToSections(boxes, nil)
+ 	if len(sections) != 2 {
+ 		t.Fatalf("expected 2 sections, got %d", len(sections))
+ 	}
+ 	s0, s1 := sections[0], sections[1]
+ 	// Without positions there is nothing to compare; report a skip instead of
+ 	// a pass so the missing coverage is visible in test output.
+ 	if len(s0.Positions) == 0 || len(s1.Positions) == 0 {
+ 		t.Skipf("sections carry no positions (page0=%d, page1=%d); nothing to compare",
+ 			len(s0.Positions), len(s1.Positions))
+ 	}
+
+ 	p0, p1 := s0.Positions[0], s1.Positions[0]
+ 	// Both Python and Go use local (page-relative) coordinates.
+ 	// Python's _line_tag: top = bx["top"] - page_cum_height[pn-1]
+ 	// gives local coordinate. Same as Go.
+ 	if p0.Top != p1.Top || p0.Bottom != p1.Bottom {
+ 		t.Errorf("expected same local coords, got Top=(%.0f,%.0f) Bottom=(%.0f,%.0f)", p0.Top, p1.Top, p0.Bottom, p1.Bottom)
+ 	}
+ 	t.Logf("page 0: Page=%v Top=%.0f Bottom=%.0f", p0.PageNumbers, p0.Top, p0.Bottom)
+ 	t.Logf("page 1: Page=%v Top=%.0f Bottom=%.0f", p1.PageNumbers, p1.Top, p1.Bottom)
+ 	t.Log("OK: position tags use page-relative coordinates in both Go and Python.")
+ }
+
+// =============================================================================
+// Issue 6: cropSectionImage padding logic
+// Python's self.crop adds 120px context above first segment, 120px context
+// below last segment, 6px gap between pages, and overlay transparency.
+// Go has simpler crop logic.
+// =============================================================================
+
+ // TestCropSectionImage_PaddingVsPython documents that Go's cropSectionImage
+ // adds context padding differently from Python's self.crop. It only asserts
+ // that the cropped image is taller than the bare text region, i.e. that some
+ // padding was applied.
+ func TestCropSectionImage_PaddingVsPython(t *testing.T) {
+ 	// Create a page image and position tag for a small text region.
+ 	img := image.NewRGBA(image.Rect(0, 0, 300, 800)) // 300×800 page at zoom=3 → PDF 100×267
+ 	pageImages := map[int]image.Image{0: img}
+
+ 	// Position tag for a small text box near the top of the page.
+ 	posTag := FormatPositionTag(0, 50.0, 100.0, 10.0, 30.0)
+
+ 	result := cropSectionImage(posTag, pageImages, 3.0)
+
+ 	// Stop here on an empty result: decoding "" would otherwise surface as a
+ 	// misleading "failed to decode PNG" failure further down.
+ 	if result == "" {
+ 		t.Fatal("cropSectionImage returned empty string for valid position")
+ 	}
+ 	// Decode result to check image dimensions.
+ 	data, err := base64.StdEncoding.DecodeString(result)
+ 	if err != nil {
+ 		t.Fatalf("failed to decode base64: %v", err)
+ 	}
+ 	cropped, _, err := image.Decode(bytes.NewReader(data))
+ 	if err != nil {
+ 		t.Fatalf("failed to decode PNG: %v", err)
+ 	}
+ 	croppedH := cropped.Bounds().Dy()
+ 	// Original text region: Top=10, Bottom=30 → height=20 at PDF points.
+ 	// zoom=3 → 60px text height.
+ 	// Python adds 120px context above + 120px below + 6px gap → ~306px.
+ 	// Go adds contextPad=120 points above/below at PDF scale → with zoom=3: 360+60+360=780px.
+ 	// Python uses pixel-space padding (120px literally), Go uses PDF-point padding (120pt).
+ 	expectedMin := 60 // bare minimum: text region itself
+ 	if croppedH <= expectedMin {
+ 		t.Errorf("CROP PADDING BUG: cropped image height=%dpx, expected >%dpx with context padding. Python adds 120px above and below for context.", croppedH, expectedMin)
+ 	}
+ 	t.Logf("cropped image: %dx%d (text region 60px, expecting padding)", cropped.Bounds().Dx(), croppedH)
+ 	t.Log("NOTE: Python's self.crop adds 120px context padding in pixel space, multi-page stitching, and overlay transparency. Go's cropSectionImage uses PDF-point padding and simpler stitching.")
+ }
+
+// =============================================================================
+// Issue 7: Data-source filter missing
+// Python's _extract_table_figure pops table/figure boxes matching
+// r"(数据|资料|图表)*来源[:: ]" (pdf_parser.py:1040-1042, 1050-1052).
+// These boxes are discarded — not extracted, not inserted back.
+// Go has no equivalent filter in extractTableAndReplace or consolidateFigures.
+// =============================================================================
+
+ // dataSourcePattern is a Go translation of Python's
+ // r"(数据|资料|图表)*来源[:: ]" used with re.match (anchored at start).
+ // The character class accepts a full-width colon, a half-width colon, or a
+ // space after 来源.
+ var dataSourcePattern = `^(数据|资料|图表)*来源[:: ]`
+
+ // TestDataSourcePattern_RegexCoverage validates the Python regex behavior
+ // that should be adopted. Documents which strings match and which don't.
+ func TestDataSourcePattern_RegexCoverage(t *testing.T) {
+ 	tests := []struct {
+ 		text string
+ 		want bool // Python re.match truthiness
+ 	}{
+ 		// ── Matching patterns (should be filtered) ──
+ 		{"数据来源:国家统计局", true}, // 数据 + 来源 + fullwidth colon
+ 		{"资料来源: 某报告", true},  // 资料 + 来源 + halfwidth colon
+ 		{"图表来源:某数据库", true},  // 图表 + 来源 + fullwidth colon
+ 		{"来源:权威机构", true},    // zero prefix + 来源 + fullwidth colon
+ 		{"来源: 参考数据", true},   // zero prefix + 来源 + halfwidth colon
+ 		{"数据来源 说明", true},    // 数据 + 来源 + space
+
+ 		// ── Non-matching patterns (should NOT be filtered) ──
+ 		{"数据来源明细", false},            // 来源 followed by 明, not a colon or space
+ 		{"普通来源说明", false},            // doesn't start with keyword
+ 		{"数据", false},                // too short
+ 		{"来源", false},                // 来源 but no colon or space after
+ 		{"资料来源说明", false},            // 来源 followed by 说, not a colon or space
+ 		{"", false},                  // empty
+ 		{"TABLE 1: 数据来源统计", false}, // doesn't start with keyword
+ 	}
+
+ 	// Compile once; the pattern is loop-invariant.
+ 	re := regexp.MustCompile(dataSourcePattern)
+ 	for _, tt := range tests {
+ 		if matched := re.MatchString(tt.text); matched != tt.want {
+ 			t.Errorf("dataSourcePattern.MatchString(%q) = %v, want %v", tt.text, matched, tt.want)
+ 		}
+ 	}
+ 	t.Log("NOTE: Python re.match(r\"(数据|资料|图表)*来源[:: ]\", text) — anchored at start.")
+ 	t.Log("Go regexp.MatchString equivalent with ^ prefix.")
+ }
+
+ // TestExtractTableAndReplace_DataSourceFilter_Missing exposes that Go does NOT
+ // filter out table boxes whose text matches r"(数据|资料|图表)*来源[:: ]".
+ // Python's _extract_table_figure pops these boxes from self.boxes without
+ // adding them to the tables dict (pdf_parser.py:1040-1042).
+ func TestExtractTableAndReplace_DataSourceFilter_Missing(t *testing.T) {
+ 	// A table box with data-source text and a normal table box.
+ 	// Both overlap a TableItem position, so both would be replaced with HTML.
+ 	boxes := []TextBox{
+ 		{X0: 10, X1: 200, Top: 0, Bottom: 50, Text: "数据来源:国家统计局", LayoutType: "table", PageNumber: 0},
+ 		{X0: 10, X1: 200, Top: 60, Bottom: 80, Text: "表1:正常数据", LayoutType: "table", PageNumber: 0},
+ 	}
+
+ 	// Two TableItems — one per table box — so each would independently produce HTML.
+ 	tables := []TableItem{
+ 		{
+ 			Cells:     []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "来源", Label: "table row"}},
+ 			Positions: []Position{{Left: 0, Right: 300, Top: 0, Bottom: 50}},
+ 			Scale:     1.0,
+ 		},
+ 		{
+ 			Cells:     []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "正常", Label: "table row"}},
+ 			Positions: []Position{{Left: 0, Right: 300, Top: 60, Bottom: 80}},
+ 			Scale:     1.0,
+ 		},
+ 	}
+
+ 	result := extractTableAndReplace(boxes, tables)
+
+ 	// Python behavior: "数据来源:国家统计局" is popped from self.boxes,
+ 	// NOT added to tables dict, NOT replaced with HTML. Gone entirely.
+ 	// "表1:正常数据" is replaced with HTML as usual.
+ 	// Expected result: exactly 1 HTML table box for the normal table.
+ 	//
+ 	// BUG: Go replaces both boxes with HTML tables. The data-source box
+ 	// produces an HTML table with cell text "来源" — this should NOT exist.
+ 	htmlTableCount := 0
+ 	hasDataSourceTable := false
+ 	for _, b := range result {
+ 		// The needle must be non-empty; Contains(s, "") is always true.
+ 		if !strings.Contains(b.Text, "<table") {
+ 			continue
+ 		}
+ 		htmlTableCount++
+ 		// The data-source table's cell text "来源" ends up in the HTML.
+ 		// c.f. constructTable which uses TSRCell text, not box text.
+ 		if strings.Contains(b.Text, ">来源<") {
+ 			hasDataSourceTable = true
+ 		}
+ 	}
+ 	if htmlTableCount != 1 {
+ 		t.Errorf("DATA SOURCE FILTER BUG: expected 1 HTML table (normal only), got %d. Python pops data-source table box entirely in _extract_table_figure (pdf_parser.py:1040-1042). Go replaces it with an HTML table.", htmlTableCount)
+ 	}
+ 	if hasDataSourceTable {
+ 		t.Errorf("DATA SOURCE FILTER BUG: data-source table should NOT produce HTML output. Cell '来源' appears in HTML: Python discards these boxes, Go incorrectly constructs a table for them.")
+ 	}
+
+ 	t.Log("NOTE: Python filters table boxes matching r\"(数据|资料|图表)*来源[:: ]\" in _extract_table_figure.")
+ 	t.Log("Go's extractTableAndReplace has no equivalent filter — data-source boxes get replaced with HTML instead of being discarded.")
+ }
+
+ // TestExtractTableAndReplace_DataSourceVariants tests multiple variants of
+ // the data-source pattern that should all be filtered. Each variant runs as
+ // its own subtest with a single table box overlapping a single TableItem.
+ func TestExtractTableAndReplace_DataSourceVariants(t *testing.T) {
+ 	variants := []string{
+ 		"数据来源:国家统计局",
+ 		"资料来源: 某报告",
+ 		"图表来源:某数据库",
+ 		"来源:权威机构",
+ 		"来源: 参考数据",
+ 	}
+
+ 	for _, variant := range variants {
+ 		t.Run(variant, func(t *testing.T) {
+ 			boxes := []TextBox{
+ 				{X0: 10, X1: 200, Top: 0, Bottom: 50, Text: variant, LayoutType: "table", PageNumber: 0},
+ 			}
+
+ 			tables := []TableItem{{
+ 				Cells:     []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "A", Label: "table row"}},
+ 				Positions: []Position{{Left: 0, Right: 300, Top: 0, Bottom: 50}},
+ 				Scale:     1.0,
+ 			}}
+
+ 			result := extractTableAndReplace(boxes, tables)
+
+ 			// BUG: box with data-source text should be REMOVED entirely —
+ 			// zero HTML output. Python pops these boxes without replacement.
+ 			// The needle must be non-empty; Contains(s, "") matches any box.
+ 			for _, b := range result {
+ 				if strings.Contains(b.Text, "<table") {
+ 					t.Errorf("DATA SOURCE FILTER BUG: variant %q should be removed without HTML replacement. Python pops data-source table boxes entirely.", variant)
+ 				}
+ 			}
+ 		})
+ 	}
+ 	t.Log("NOTE: All variants of r\"(数据|资料|图表)*来源[:: ]\" should be filtered by extractTableAndReplace.")
+ }
+
+ // TestConsolidateFigures_DataSourceFilter_Missing checks consolidateFigures
+ // against Python's handling of figure boxes matching
+ // r"(数据|资料|图表)*来源[:: ]": _extract_table_figure pops them from
+ // self.boxes and never records them as figures (pdf_parser.py:1050-1052).
+ func TestConsolidateFigures_DataSourceFilter_Missing(t *testing.T) {
+ 	input := []TextBox{
+ 		{X0: 10, X1: 200, Top: 0, Bottom: 50, Text: "数据来源:某机构", LayoutType: "figure", PageNumber: 0, LayoutNo: "figure-0"},
+ 		{X0: 10, X1: 200, Top: 60, Bottom: 80, Text: "架构图", LayoutType: "figure", PageNumber: 0, LayoutNo: "figure-0"},
+ 	}
+
+ 	consolidated := consolidateFigures(input)
+
+ 	// Expected (Python): the data-source box is dropped, the ordinary figure
+ 	// box survives. One pass checks both conditions per output box.
+ 	sawNormalFigure := false
+ 	for _, box := range consolidated {
+ 		leaksSource := strings.Contains(box.Text, "数据来源") || strings.Contains(box.Text, "某机构")
+ 		if leaksSource {
+ 			t.Errorf("DATA SOURCE FIGURE FILTER BUG: '数据来源:某机构' figure box should be removed entirely. Python pops data-source figure boxes in _extract_table_figure (pdf_parser.py:1050-1052). Go still includes it.")
+ 		}
+ 		if strings.Contains(box.Text, "架构图") {
+ 			sawNormalFigure = true
+ 		}
+ 	}
+
+ 	// The ordinary figure text must not be lost by the filter.
+ 	if !sawNormalFigure {
+ 		t.Error("normal figure box '架构图' should still be present")
+ 	}
+
+ 	t.Log("NOTE: Python filters figure boxes matching r\"(数据|资料|图表)*来源[:: ]\" in _extract_table_figure.")
+ 	t.Log("Go's consolidateFigures has no equivalent filter.")
+ }
diff --git a/internal/deepdoc/parser/pdf/table_parity_test.go b/internal/deepdoc/parser/pdf/table_parity_test.go
new file mode 100644
index 0000000000..9fb6abe5c8
--- /dev/null
+++ b/internal/deepdoc/parser/pdf/table_parity_test.go
@@ -0,0 +1,96 @@
+//go:build cgo && manual
+
+package parser
+
+import (
+ "encoding/json"
+ "os"
+ "path/filepath"
+ "strings"
+ "testing"
+)
+
+ // TestTableParityWithPythonBoxes reads Python's pre-merge table boxes
+ // (with R/C annotations) and runs them through Go's constructTable.
+ // If Go produces the same HTML as Python, the pipeline is correct
+ // and differences are from the engine layer (pdf_oxide vs pdfplumber).
+ //
+ // The test is skipped when the table_boxes directory is absent. For each
+ // *.json file it checks that the HTML contains a table with at least one row
+ // and one cell, and that item.Rows holds at least one non-blank cell.
+ func TestTableParityWithPythonBoxes(t *testing.T) {
+ 	boxesDir := filepath.Join("testdata", "output", "py", "noocr", "table_boxes")
+ 	entries, err := os.ReadDir(boxesDir)
+ 	if err != nil {
+ 		t.Skipf("Python table_boxes not found — run dump_py_results.py first: %v", err)
+ 	}
+
+ 	for _, e := range entries {
+ 		if e.IsDir() || !strings.HasSuffix(e.Name(), ".json") {
+ 			continue
+ 		}
+ 		name := strings.TrimSuffix(e.Name(), ".json")
+ 		t.Run(name, func(t *testing.T) {
+ 			data, err := os.ReadFile(filepath.Join(boxesDir, e.Name()))
+ 			if err != nil {
+ 				t.Fatal(err)
+ 			}
+
+ 			var pyBoxes []struct {
+ 				X0, X1, Top, Bottom float64
+ 				Text                string
+ 				R, C, H, SP         int
+ 				LayoutType          string
+ 			}
+ 			if err := json.Unmarshal(data, &pyBoxes); err != nil {
+ 				t.Fatal(err)
+ 			}
+
+ 			// Convert to Go TextBox
+ 			boxes := make([]TextBox, len(pyBoxes))
+ 			for i, b := range pyBoxes {
+ 				boxes[i] = TextBox{
+ 					X0: b.X0, X1: b.X1, Top: b.Top, Bottom: b.Bottom,
+ 					Text: b.Text, R: b.R, C: b.C, H: b.H, SP: b.SP,
+ 					LayoutType: b.LayoutType,
+ 				}
+ 			}
+
+ 			// Run through Go's constructTable
+ 			item := &TableItem{}
+ 			html := constructTable(nil, boxes, "", item)
+
+ 			if html == "" {
+ 				t.Error("constructTable returned empty HTML")
+ 				return
+ 			}
+ 			if !strings.Contains(html, "<table") {
+ 				t.Error("HTML missing <table> tag")
+ 			}
+
+ 			// countTag counts opening tags with or without attributes while
+ 			// keeping "<th" from also matching "<thead".
+ 			countTag := func(tag string) int {
+ 				return strings.Count(html, "<"+tag+">") + strings.Count(html, "<"+tag+" ")
+ 			}
+
+ 			// Verify structure
+ 			trCount := countTag("tr")
+ 			tdCount := countTag("td")
+ 			thCount := countTag("th")
+ 			if trCount == 0 {
+ 				t.Error("no <tr> rows found")
+ 			}
+ 			if tdCount == 0 && thCount == 0 {
+ 				t.Error("no <td> or <th> cells found")
+ 			}
+
+ 			// Check that at least one cell carries text
+ 			nonEmptyCols := 0
+ 			for _, row := range item.Rows {
+ 				for _, cell := range row {
+ 					if strings.TrimSpace(cell) != "" {
+ 						nonEmptyCols++
+ 					}
+ 				}
+ 			}
+ 			if nonEmptyCols == 0 {
+ 				t.Errorf("all %d cells are empty — R/C path broken", tdCount+thCount)
+ 			}
+
+ 			t.Logf("%s: %d rows, %d cells (%d th), %d non-empty",
+ 				name, trCount, tdCount+thCount, thCount, nonEmptyCols)
+ 			t.Logf("HTML snippet: %.200s...", html)
+ 		})
+ 	}
+ }
diff --git a/internal/deepdoc/parser/pdf/table_rotate_integration_test.go b/internal/deepdoc/parser/pdf/table_rotate_integration_test.go
new file mode 100644
index 0000000000..a9c1b480ec
--- /dev/null
+++ b/internal/deepdoc/parser/pdf/table_rotate_integration_test.go
@@ -0,0 +1,192 @@
+//go:build cgo && manual
+
+package parser
+
+import (
+ "context"
+ "os"
+ "path/filepath"
+ "testing"
+)
+
+ // TestTableRotation_Integration validates rotation detection with real DeepDoc.
+ //
+ // Prerequisites:
+ //   - DeepDoc running at localhost:9390 (or set DEEPDOC_URL)
+ //   - Test PDF: testdata/pdfs/table_rotation_test.pdf (generated by tools/generate_rotated_table_pdf.py)
+ //
+ // Run:
+ //
+ //	CGO_CFLAGS="..." CGO_LDFLAGS="..." \
+ //	go test -tags 'cgo,manual' -run TestTableRotation_Integration -v -count=1
+ //
+ // The test is skipped when the PDF is missing and fails when DeepDoc is
+ // unreachable. Tables on page 0 must be detected as 0°; page 1 results are
+ // logged, and the TSR response for page 1 tables must carry labels.
+ func TestTableRotation_Integration(t *testing.T) {
+ 	pdfPath := filepath.Join("testdata", "pdfs", "table_rotation_test.pdf")
+ 	if _, err := os.Stat(pdfPath); os.IsNotExist(err) {
+ 		t.Skipf("test PDF not found: %s (run tools/generate_rotated_table_pdf.py first)", pdfPath)
+ 	}
+
+ 	baseURL := os.Getenv("DEEPDOC_URL")
+ 	if baseURL == "" {
+ 		baseURL = "http://localhost:9390"
+ 	}
+ 	dd, err := NewDeepDocClient(baseURL)
+ 	if err != nil {
+ 		t.Fatal(err)
+ 	}
+ 	if !dd.Health() {
+ 		t.Fatalf("DeepDoc not available at %s", baseURL)
+ 	}
+ 	t.Logf("DeepDoc available at %s", baseURL)
+
+ 	// Open PDF
+ 	data, err := os.ReadFile(pdfPath)
+ 	if err != nil {
+ 		t.Fatal(err)
+ 	}
+ 	eng, err := NewEngine(data)
+ 	if err != nil {
+ 		t.Fatal(err)
+ 	}
+ 	defer eng.Close()
+
+ 	// A failed page count would otherwise leave pageCount at 0 and let the
+ 	// test pass without examining any page.
+ 	pageCount, err := eng.PageCount()
+ 	if err != nil {
+ 		t.Fatalf("page count: %v", err)
+ 	}
+ 	t.Logf("PDF: %d pages", pageCount)
+
+ 	cfg := DefaultParserConfig()
+ 	cfg.ToPage = pageCount - 1
+ 	autoRotate := true
+ 	cfg.AutoRotateTables = &autoRotate
+ 	_ = NewParser(cfg, dd) // verify construction does not panic
+
+ 	ctx := context.Background()
+ 	for pg := 0; pg < pageCount; pg++ {
+ 		pageImg, err := renderPageToImage(eng, pg)
+ 		if err != nil {
+ 			t.Fatalf("render page %d: %v", pg, err)
+ 		}
+
+ 		regions, err := dd.DLA(ctx, pageImg)
+ 		if err != nil {
+ 			t.Fatalf("DLA page %d: %v", pg, err)
+ 		}
+
+ 		tableCount := 0
+ 		for _, r := range regions {
+ 			if r.Label != "table" {
+ 				continue
+ 			}
+ 			tableCount++
+
+ 			// Crop table region
+ 			cropped, err := cropImageRegion(pageImg, r)
+ 			if err != nil {
+ 				t.Errorf(" crop table %d: %v", tableCount, err)
+ 				continue
+ 			}
+
+ 			// Evaluate rotation
+ 			angle, _, scores := evaluateTableOrientation(ctx, cropped, dd)
+ 			t.Logf(" Page %d Table %d: %dx%d, bestAngle=%d°, scores: 0=%.3f 90=%.3f 180=%.3f 270=%.3f",
+ 				pg, tableCount, cropped.Bounds().Dx(), cropped.Bounds().Dy(),
+ 				angle,
+ 				scores[0], scores[90], scores[180], scores[270])
+
+ 			// Verify: page 0 should be ~0°, page 1 should be ~90°
+ 			if pg == 0 && angle != 0 {
+ 				t.Errorf("Page 0 normal table: expected 0°, got %d°", angle)
+ 			}
+ 			// Page 1 has the rotated table - expect 90° (or 270° depending on DLA bbox)
+ 			if pg != 1 {
+ 				continue
+ 			}
+ 			t.Logf(" NOTE: Page 1 rotated table detected as %d° (expect 90 or 270)", angle)
+
+ 			// Verify TSR returns labels (6th element in bbox array).
+ 			testCells, tsrErr := dd.TSR(ctx, cropped)
+ 			switch {
+ 			case tsrErr != nil:
+ 				// Surface the failure instead of silently skipping the label check.
+ 				t.Errorf(" TSR page %d table %d: %v", pg, tableCount, tsrErr)
+ 			case len(testCells) == 0:
+ 				t.Logf(" TSR returned no cells for page %d table %d; label check skipped", pg, tableCount)
+ 			default:
+ 				hasLabel := false
+ 				for _, c := range testCells {
+ 					if c.Label != "" {
+ 						hasLabel = true
+ 						break
+ 					}
+ 				}
+ 				if !hasLabel {
+ 					t.Error("TSR returned cells without labels")
+ 				} else {
+ 					t.Logf(" TSR labels OK: %d cells", len(testCells))
+ 				}
+ 			}
+ 		}
+ 		t.Logf("Page %d: %d tables detected", pg, tableCount)
+ 	}
+ }
+
+// TestTableRotation_Stability runs rotation detection on a sample real PDF
+// and verifies the pipeline doesn't crash. Set BATCH_COUNT to limit.
+func TestTableRotation_Stability(t *testing.T) {
+ baseURL := os.Getenv("DEEPDOC_URL")
+ if baseURL == "" {
+ baseURL = "http://localhost:9390"
+ }
+ dd, err := NewDeepDocClient(baseURL)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if !dd.Health() {
+ t.Fatalf("DeepDoc not available at %s", baseURL)
+ }
+
+ realDir := filepath.Join("testdata", "real_pdfs")
+ entries, err := os.ReadDir(realDir)
+ if err != nil {
+ t.Skipf("no real PDFs: %v", err)
+ }
+
+ count := 0
+ maxCount := 3 // sample size
+ for _, e := range entries {
+ if e.IsDir() || filepath.Ext(e.Name()) != ".pdf" {
+ continue
+ }
+ if count >= maxCount {
+ break
+ }
+
+ data, err := os.ReadFile(filepath.Join(realDir, e.Name()))
+ if err != nil {
+ continue
+ }
+ eng, err := NewEngine(data)
+ if err != nil {
+ continue
+ }
+
+ pageImg, err := renderPageToImage(eng, 0)
+ eng.Close()
+ if err != nil {
+ continue
+ }
+
+ regions, _ := dd.DLA(context.Background(), pageImg)
+ tables := 0
+ rotated := 0
+ for _, r := range regions {
+ if r.Label != "table" {
+ continue
+ }
+ tables++
+ cropped, _ := cropImageRegion(pageImg, r)
+ if cropped == nil {
+ continue
+ }
+ angle, _, _ := evaluateTableOrientation(context.Background(), cropped, dd)
+ if angle != 0 {
+ rotated++
+ t.Logf(" %s: rotated table detected (angle=%d°)", e.Name(), angle)
+ }
+ }
+ t.Logf(" %s: %d tables, %d rotated", e.Name(), tables, rotated)
+ count++
+ }
+
+ t.Logf("Sampled %d real PDFs", count)
+}
diff --git a/internal/deepdoc/parser/pdf/table_rotate_test.go b/internal/deepdoc/parser/pdf/table_rotate_test.go
new file mode 100644
index 0000000000..fc3796cfd8
--- /dev/null
+++ b/internal/deepdoc/parser/pdf/table_rotate_test.go
@@ -0,0 +1,238 @@
+package parser
+
+import (
+ "context"
+ "image"
+ "testing"
+)
+
+ // mockAngleResult is the scripted OCR outcome for one rotation angle: how
+ // many regions detection reports, the confidence attached to each recognized
+ // text, and an optional error returned instead. It is an alias, so composite
+ // literals written with the anonymous struct type remain assignable.
+ type mockAngleResult = struct {
+ 	regions int
+ 	avgConf float64
+ 	err     error
+ }
+
+ // mockRotationDoc is a DocAnalyzer test double whose OCR results depend on
+ // the rotation angle being probed. evaluateTableOrientation tries angles in
+ // the order 0°, 90°, 180°, 270°; every OCRDetect call advances callSeq, and
+ // callSeq in turn selects which angle's scripted result is served.
+ type mockRotationDoc struct {
+ 	// angles maps a rotation angle to its scripted result.
+ 	angles map[int]mockAngleResult
+ 	// callSeq counts OCRDetect calls made so far.
+ 	callSeq int
+ }
+
+var rotationOrder = []int{0, 90, 180, 270}
+
+ // DLA is a no-op stub; it returns no regions and no error.
+ func (m *mockRotationDoc) DLA(_ context.Context, _ image.Image) ([]DLARegion, error) {
+ 	return nil, nil
+ }
+
+ // TSR is a no-op stub; it returns no cells and no error.
+ func (m *mockRotationDoc) TSR(_ context.Context, _ image.Image) ([]TSRCell, error) {
+ 	return nil, nil
+ }
+
+ // OCR is a no-op stub; it returns empty text and no error.
+ func (m *mockRotationDoc) OCR(_ image.Image) (string, error) {
+ 	return "", nil
+ }
+
+ // Health always reports the mock as available.
+ func (m *mockRotationDoc) Health() bool {
+ 	return true
+ }
+
+ // ModelType always reports ModelSaas.
+ func (m *mockRotationDoc) ModelType() ModelType {
+ 	return ModelSaas
+ }
+
+ // currentAngle returns the angle the next OCRDetect call serves, cycling
+ // through rotationOrder as callSeq grows.
+ func (m *mockRotationDoc) currentAngle() int {
+ 	return rotationOrder[m.callSeq%len(rotationOrder)]
+ }
+
+ // OCRDetect serves the scripted detection result for the current angle and
+ // advances callSeq. Angles without an entry fall back to the 0° entry. A
+ // scripted error is returned as-is; zero regions yields (nil, nil).
+ // Otherwise it returns cfg.regions boxes, each 20px wide, spaced evenly
+ // across the image width and spanning the middle half of its height.
+ func (m *mockRotationDoc) OCRDetect(_ context.Context, img image.Image) ([]OCRBox, error) {
+ 	// Pick the angle first, then count this call; nothing below reads callSeq.
+ 	angle := m.currentAngle()
+ 	m.callSeq++
+
+ 	scripted, found := m.angles[angle]
+ 	if !found {
+ 		scripted = m.angles[0] // fallback to 0° config
+ 	}
+ 	switch {
+ 	case scripted.err != nil:
+ 		return nil, scripted.err
+ 	case scripted.regions == 0:
+ 		return nil, nil
+ 	}
+
+ 	bounds := img.Bounds()
+ 	width, height := bounds.Dx(), bounds.Dy()
+ 	stride := width / (scripted.regions + 1)
+ 	topY := float64(height / 4)
+ 	bottomY := float64(height * 3 / 4)
+
+ 	out := make([]OCRBox, scripted.regions)
+ 	for i := range out {
+ 		left := stride * (i + 1)
+ 		right := left + 20
+ 		out[i] = OCRBox{
+ 			X0: float64(left), Y0: topY,
+ 			X1: float64(right), Y1: topY,
+ 			X2: float64(right), Y2: bottomY,
+ 			X3: float64(left), Y3: bottomY,
+ 		}
+ 	}
+ 	return out, nil
+ }
+
+ // OCRRecognizeBatch runs OCRRecognize on each image in order and returns the
+ // per-image texts and errors in two slices of the same length as cropped.
+ func (m *mockRotationDoc) OCRRecognizeBatch(_ context.Context, cropped []image.Image) ([][]OCRText, []error) {
+ 	n := len(cropped)
+ 	texts, failures := make([][]OCRText, n), make([]error, n)
+ 	for idx := range cropped {
+ 		texts[idx], failures[idx] = m.OCRRecognize(context.Background(), cropped[idx])
+ 	}
+ 	return texts, failures
+ }
+
+ // OCRRecognize serves the scripted recognition result for the angle used by
+ // the most recent OCRDetect call. Angles without an entry fall back to the
+ // 0° entry. A scripted error is returned as-is; zero regions yields
+ // (nil, nil). Otherwise it returns cfg.regions texts, each "X" with
+ // confidence cfg.avgConf.
+ //
+ // If no OCRDetect call has happened yet (callSeq == 0) the 0° entry is used.
+ // Go's % keeps the sign of the dividend, so (0-1)%4 would be -1 and index
+ // rotationOrder out of range.
+ func (m *mockRotationDoc) OCRRecognize(_ context.Context, _ image.Image) ([]OCRText, error) {
+ 	idx := 0
+ 	if m.callSeq > 0 {
+ 		idx = (m.callSeq - 1) % len(rotationOrder) // angle from last Detect call
+ 	}
+ 	angle := rotationOrder[idx]
+ 	cfg, ok := m.angles[angle]
+ 	if !ok {
+ 		cfg = m.angles[0]
+ 	}
+ 	if cfg.err != nil {
+ 		return nil, cfg.err
+ 	}
+ 	if cfg.regions == 0 {
+ 		return nil, nil
+ 	}
+ 	texts := make([]OCRText, cfg.regions)
+ 	for i := range texts {
+ 		texts[i] = OCRText{Text: "X", Confidence: cfg.avgConf}
+ 	}
+ 	return texts, nil
+ }
+
+ // makeTestTableImage returns a blank 200×100 RGBA image used as the table
+ // crop in orientation tests.
+ func makeTestTableImage() image.Image {
+ 	bounds := image.Rectangle{Max: image.Point{X: 200, Y: 100}}
+ 	return image.NewRGBA(bounds)
+ }
+
+func TestEvaluateTableOrientation(t *testing.T) {
+ t.Run("normal table 0° wins", func(t *testing.T) {
+ doc := &mockRotationDoc{
+ angles: map[int]struct {
+ regions int
+ avgConf float64
+ err error
+ }{
+ 0: {regions: 10, avgConf: 0.9},
+ },
+ }
+ angle, _, scores := evaluateTableOrientation(context.Background(), makeTestTableImage(), doc)
+ if angle != 0 {
+ t.Errorf("expected 0°, got %d° (scores: %v)", angle, scores)
+ }
+ })
+
+ t.Run("90° rotated table wins", func(t *testing.T) {
+ doc := &mockRotationDoc{
+ angles: map[int]struct {
+ regions int
+ avgConf float64
+ err error
+ }{
+ 0: {regions: 2, avgConf: 0.2},
+ 90: {regions: 10, avgConf: 0.9},
+ 180: {regions: 2, avgConf: 0.2},
+ 270: {regions: 2, avgConf: 0.2},
+ },
+ }
+ angle, _, scores := evaluateTableOrientation(context.Background(), makeTestTableImage(), doc)
+ if angle != 90 {
+ t.Errorf("expected 90°, got %d° (scores: %v)", angle, scores)
+ }
+ })
+
+ t.Run("180° rotated table wins", func(t *testing.T) {
+ doc := &mockRotationDoc{
+ angles: map[int]struct {
+ regions int
+ avgConf float64
+ err error
+ }{
+ 0: {regions: 1, avgConf: 0.1},
+ 90: {regions: 1, avgConf: 0.1},
+ 180: {regions: 8, avgConf: 0.85},
+ 270: {regions: 1, avgConf: 0.1},
+ },
+ }
+ angle, _, scores := evaluateTableOrientation(context.Background(), makeTestTableImage(), doc)
+ if angle != 180 {
+ t.Errorf("expected 180°, got %d° (scores: %v)", angle, scores)
+ }
+ })
+
+ t.Run("270° rotated table wins", func(t *testing.T) {
+ doc := &mockRotationDoc{
+ angles: map[int]struct {
+ regions int
+ avgConf float64
+ err error
+ }{
+ 0: {regions: 1, avgConf: 0.1},
+ 90: {regions: 1, avgConf: 0.1},
+ 180: {regions: 1, avgConf: 0.1},
+ 270: {regions: 9, avgConf: 0.88},
+ },
+ }
+ angle, _, scores := evaluateTableOrientation(context.Background(), makeTestTableImage(), doc)
+ if angle != 270 {
+ t.Errorf("expected 270°, got %d° (scores: %v)", angle, scores)
+ }
+ })
+
+ t.Run("threshold protection — 0° keeps when diff too small", func(t *testing.T) {
+ // Region-count scoring: 8 vs 9 is too close (< 1.4×) → 0° wins.
+ doc := &mockRotationDoc{
+ angles: map[int]struct {
+ regions int
+ avgConf float64
+ err error
+ }{
+ 0: {regions: 8},
+ 90: {regions: 9},
+ },
+ }
+ angle, _, _ := evaluateTableOrientation(context.Background(), makeTestTableImage(), doc)
+ if angle != 0 {
+ t.Errorf("expected 0° (threshold protection), got %d°", angle)
+ }
+ })
+
+ t.Run("threshold pass — 90° wins when region count is clearly higher", func(t *testing.T) {
+ // 0° has few regions AND 90° has ≥1.4× more → 90° wins.
+ doc := &mockRotationDoc{
+ angles: map[int]struct {
+ regions int
+ avgConf float64
+ err error
+ }{
+ 0: {regions: 4},
+ 90: {regions: 10},
+ },
+ }
+ angle, _, _ := evaluateTableOrientation(context.Background(), makeTestTableImage(), doc)
+ if angle != 90 {
+ t.Errorf("expected 90° (threshold passed), got %d°", angle)
+ }
+ })
+
+ t.Run("all angles fail OCR → fallback 0°", func(t *testing.T) {
+ doc := &mockRotationDoc{
+ angles: map[int]struct {
+ regions int
+ avgConf float64
+ err error
+ }{
+ 0: {err: errMockOCR},
+ 90: {err: errMockOCR},
+ 180: {err: errMockOCR},
+ 270: {err: errMockOCR},
+ },
+ }
+ angle, img, scores := evaluateTableOrientation(context.Background(), makeTestTableImage(), doc)
+ if angle != 0 {
+ t.Errorf("expected 0° fallback, got %d°", angle)
+ }
+ if img == nil {
+ t.Error("expected non-nil fallback image")
+ }
+ for _, s := range scores {
+ if s != 0 {
+ t.Error("all scores should be 0 on OCR failure")
+ }
+ }
+ })
+}
+
+ // mockError is a minimal error type carrying a fixed message.
+ type mockError struct {
+ 	msg string
+ }
+
+ // Error returns the stored message.
+ func (e *mockError) Error() string {
+ 	return e.msg
+ }
+
+ // errMockOCR is the error the rotation mock returns to simulate OCR failure.
+ var errMockOCR = &mockError{msg: "mock OCR failure"}
diff --git a/internal/deepdoc/parser/pdf/table_section_test.go b/internal/deepdoc/parser/pdf/table_section_test.go
new file mode 100644
index 0000000000..38b28a8915
--- /dev/null
+++ b/internal/deepdoc/parser/pdf/table_section_test.go
@@ -0,0 +1,416 @@
+package parser
+
+import (
+ "context"
+ "image"
+ "strings"
+ "testing"
+)
+
+// TestTableSection_TextFromTSR verifies that table Sections carry
+// TSR-structured text (from TableItem.Rows) rather than raw char text.
+// Python _parse_loaded_window_into_bboxes runs _extract_table_figure
+// which pops table boxes and replaces them with consolidated table
+// entries. Go backfills Section.Text from TableItem.Rows after
+// linkTableSections.
+func TestTableSection_TextFromTSR(t *testing.T) {
+	eng := &mockEngine{
+		pageCount: 1,
+		renderW:   900, // 300pt at 3x = 900px (216 DPI)
+		renderH:   600,
+		chars: map[int][]TextChar{0: {
+			// PDF space (72 DPI): well inside DLA region
+			{X0: 50, X1: 70, Top: 40, Bottom: 55, Text: "姓"},
+			{X0: 80, X1: 100, Top: 40, Bottom: 55, Text: "名"},
+		}},
+	}
+	mock := &MockDocAnalyzer{
+		Healthy: true,
+		// DLA table region in pixel space (216 DPI).
+		// PDF space: x0=100/3≈33, y0=80/3≈27, x1=500/3≈167, y1=300/3≈100.
+		DLARegions: []DLARegion{
+			{X0: 100, Y0: 80, X1: 500, Y1: 300, Label: "table", Confidence: 0.9},
+		},
+		// TSR returns structured 2x2 cells with text.
+		// Pixel space (relative to cropped region).
+		TSRCells: []TSRCell{
+			{X0: 0, Y0: 0, X1: 200, Y1: 100, Text: "姓名", Label: "table column header"},
+			{X0: 200, Y0: 0, X1: 460, Y1: 100, Text: "年龄", Label: "table column header"},
+			{X0: 0, Y0: 100, X1: 200, Y1: 220, Text: "张三", Label: "table row"},
+			{X0: 200, Y0: 100, X1: 460, Y1: 220, Text: "25", Label: "table row"},
+		},
+	}
+	p := NewParser(DefaultParserConfig(), mock)
+
+	result, err := p.Parse(context.Background(), eng)
+	if err != nil {
+		t.Fatalf("Parse: %v", err)
+	}
+
+	// ── Assert 1: Tables exist (Cells are filled by constructTable later) ──
+	if len(result.Tables) == 0 {
+		t.Fatal("expected at least 1 TableItem")
+	}
+	tbl := result.Tables[0]
+	if len(tbl.Cells) == 0 {
+		t.Fatal("expected TSR cells in TableItem")
+	}
+
+	// ── Assert 2: A table section exists with HTML output ──
+	var tableSections []Section
+	for _, s := range result.Sections {
+		if s.LayoutType == "table" {
+			tableSections = append(tableSections, s)
+		}
+	}
+	if len(tableSections) == 0 {
+		t.Fatal("expected at least 1 section with LayoutType=='table'")
+	}
+	ts := tableSections[0]
+
+	// ── Assert 3: Section.Text is HTML table from constructTable ──
+	// The prefix must be a real tag: an empty prefix matches every string
+	// and would make this assertion vacuous.
+	if !strings.HasPrefix(ts.Text, "<table") {
+		t.Errorf("table Section.Text = %q, want HTML <table>", ts.Text)
+	}
+	// TSR cells have pre-filled text ("姓名", "年龄", "张三", "25") —
+	// fillCellTextFromBoxes preserves it since cells already have text.
+	if !strings.Contains(ts.Text, "姓名") || !strings.Contains(ts.Text, "年龄") {
+		t.Errorf("table HTML should contain cell text, got %q", ts.Text)
+	}
+}
+
+// TestEnrichWithDeepDoc_ImageOnlyPage checks that enrichWithDeepDoc still
+// yields a table for a page that has a rendered image but no text boxes.
+// Regression test for test.pdf (Go 0 tables, Py 1 table).
+func TestEnrichWithDeepDoc_ImageOnlyPage(t *testing.T) {
+	analyzer := &MockDocAnalyzer{
+		Healthy:    true,
+		DLARegions: []DLARegion{{X0: 54, Y0: 100, X1: 846, Y1: 500, Label: "table", Confidence: 0.95}},
+		TSRCells:   []TSRCell{{X0: 0, Y0: 0, X1: 200, Y1: 100, Text: "A", Label: "table row"}},
+	}
+	pr := NewParser(DefaultParserConfig(), analyzer)
+
+	// Page 0 contributes only an image; the box slice is empty but non-nil.
+	pageImages := map[int]image.Image{
+		0: image.NewRGBA(image.Rect(0, 0, 900, 600)),
+	}
+
+	tables := pr.enrichWithDeepDoc(context.Background(), nil, []TextBox{}, pageImages)
+	switch {
+	case len(tables) == 0:
+		t.Fatal("enrichWithDeepDoc: expected at least 1 table from DLA on page with image but no boxes, got 0")
+	case len(tables[0].Cells) == 0:
+		t.Fatal("enrichWithDeepDoc: expected TSR cells in table")
+	}
+}
+
+// TestMergeCaptions_Unit exercises mergeCaptions in isolation (no Parse
+// pipeline): one figure plus one figure caption must collapse into a
+// single figure section whose text includes the caption.
+func TestMergeCaptions_Unit(t *testing.T) {
+	figurePos := Position{PageNumbers: []int{0, 0}, Left: 40, Right: 60, Top: 30, Bottom: 45}
+	captionPos := Position{PageNumbers: []int{0, 0}, Left: 40, Right: 60, Top: 80, Bottom: 95}
+	sections := []Section{
+		{Text: "F", LayoutType: "figure", Positions: []Position{figurePos}},
+		{Text: "C", LayoutType: "figure caption", Positions: []Position{captionPos}},
+	}
+
+	merged := mergeCaptions(sections, CollectFigures(sections))
+
+	// The caption section is gone.
+	if n := len(merged); n != 1 {
+		t.Fatalf("expected 1 section after merge, got %d", n)
+	}
+	survivor := merged[0]
+	// Its text now lives on the figure.
+	if !strings.Contains(survivor.Text, "C") {
+		t.Errorf("expected figure Text to contain caption 'C', got %q", survivor.Text)
+	}
+	if survivor.LayoutType != "figure" {
+		t.Errorf("expected figure LayoutType, got %q", survivor.LayoutType)
+	}
+}
+
+// TestMergeCaptions_TableCaption exercises mergeCaptions in isolation for
+// a table followed by a table caption: only the table section may remain
+// and its text must include the caption.
+func TestMergeCaptions_TableCaption(t *testing.T) {
+	tablePos := Position{PageNumbers: []int{0, 0}, Left: 40, Right: 60, Top: 30, Bottom: 45}
+	captionPos := Position{PageNumbers: []int{0, 0}, Left: 40, Right: 60, Top: 80, Bottom: 95}
+	sections := []Section{
+		{Text: "T", LayoutType: "table", Positions: []Position{tablePos}},
+		{Text: "C", LayoutType: "table caption", Positions: []Position{captionPos}},
+	}
+
+	merged := mergeCaptions(sections, CollectFigures(sections))
+
+	if n := len(merged); n != 1 {
+		t.Fatalf("expected 1 section after merge, got %d", n)
+	}
+	if text := merged[0].Text; !strings.Contains(text, "C") {
+		t.Errorf("expected table Text to contain caption 'C', got %q", text)
+	}
+}
+
+// TestFigureCaption_MergedIntoFigure verifies that "figure caption" text
+// is merged into the nearest "figure" Section and the caption Section is
+// removed. Matches Python _extract_table_figure caption matching.
+//
+// It runs the full Parse pipeline against a one-page mock engine with two
+// chars and a mock analyzer returning a figure region and a caption region.
+// NOTE(review): the pixel→PDF conversions in the comments below assume a 3×
+// render scale — confirm against the parser's configured DPI.
+func TestFigureCaption_MergedIntoFigure(t *testing.T) {
+ eng := &mockEngine{
+ pageCount: 1,
+ renderW: 1800, renderH: 2400,
+ chars: map[int][]TextChar{0: {
+ // Figure text — overlaps DLA figure region (pixel Y=80-300 → PDF 27-100).
+ {X0: 40, X1: 60, Top: 30, Bottom: 45, Text: "F"},
+ // Caption text — overlaps DLA figure caption region (pixel Y=310-340 → PDF 103-113).
+ {X0: 40, X1: 60, Top: 104, Bottom: 112, Text: "C"},
+ }},
+ }
+ mock := &MockDocAnalyzer{
+ Healthy: true,
+ DLARegions: []DLARegion{
+ {X0: 100, Y0: 80, X1: 500, Y1: 300, Label: "figure", Confidence: 0.9},
+ // Caption is below the figure.
+ {X0: 100, Y0: 310, X1: 500, Y1: 340, Label: "figure caption", Confidence: 0.9},
+ },
+ }
+ p := NewParser(DefaultParserConfig(), mock)
+
+ result, err := p.Parse(context.Background(), eng)
+ if err != nil {
+ t.Fatalf("Parse: %v", err)
+ }
+
+ // Assert 1: figure caption Section removed.
+ for _, s := range result.Sections {
+ if s.LayoutType == "figure caption" {
+ t.Errorf("figure caption Section should be removed after mergeCaptions, got %q", s.Text)
+ }
+ }
+
+ // Assert 2: figure Section exists and has caption text appended.
+ // Index into the slice so fig points at the stored element, not a loop copy.
+ var fig *Section
+ for i := range result.Sections {
+ if result.Sections[i].LayoutType == "figure" {
+ fig = &result.Sections[i]
+ break
+ }
+ }
+ if fig == nil {
+ t.Fatal("expected a figure Section")
+ }
+ if !strings.Contains(fig.Text, "C") {
+ t.Errorf("figure Text should contain caption text 'C', got %q", fig.Text)
+ }
+
+ // Assert 3: figure is in result.Figures.
+ if len(result.Figures) == 0 {
+ t.Error("expected at least 1 entry in result.Figures")
+ }
+}
+
+// TestTableCaption_MergedIntoTable verifies that "table caption" text
+// is merged into the nearest table Section and the caption is removed.
+//
+// It runs the full Parse pipeline against a one-page mock engine with two
+// chars and a mock analyzer returning a table region, a caption region and
+// two TSR cells.
+// NOTE(review): the pixel→PDF conversions in the comments below assume a 3×
+// render scale — confirm against the parser's configured DPI.
+func TestTableCaption_MergedIntoTable(t *testing.T) {
+ eng := &mockEngine{
+ pageCount: 1,
+ renderW: 1800, renderH: 2400,
+ chars: map[int][]TextChar{0: {
+ // Table text — overlaps DLA table region (pixel Y=80-300 → PDF 27-100).
+ {X0: 40, X1: 60, Top: 30, Bottom: 45, Text: "T"},
+ // Caption text — overlaps DLA table caption region (pixel Y=310-340 → PDF 103-113).
+ {X0: 40, X1: 60, Top: 104, Bottom: 112, Text: "C"},
+ }},
+ }
+ mock := &MockDocAnalyzer{
+ Healthy: true,
+ DLARegions: []DLARegion{
+ {X0: 100, Y0: 80, X1: 500, Y1: 300, Label: "table", Confidence: 0.9},
+ {X0: 100, Y0: 310, X1: 500, Y1: 340, Label: "table caption", Confidence: 0.9},
+ },
+ TSRCells: []TSRCell{
+ {X0: 0, Y0: 0, X1: 200, Y1: 100, Text: "A", Label: "table row"},
+ {X0: 200, Y0: 0, X1: 460, Y1: 100, Text: "B", Label: "table row"},
+ },
+ }
+ p := NewParser(DefaultParserConfig(), mock)
+
+ result, err := p.Parse(context.Background(), eng)
+ if err != nil {
+ t.Fatalf("Parse: %v", err)
+ }
+
+ // Assert: table caption Section removed, text merged into table Section.
+ for _, s := range result.Sections {
+ if s.LayoutType == "table caption" {
+ t.Errorf("table caption Section should be removed, got %q", s.Text)
+ }
+ }
+ // Index into the slice so tbl points at the stored element, not a loop copy.
+ var tbl *Section
+ for i := range result.Sections {
+ if result.Sections[i].LayoutType == "table" {
+ tbl = &result.Sections[i]
+ break
+ }
+ }
+ if tbl == nil {
+ t.Fatal("expected a table Section")
+ }
+ if !strings.Contains(tbl.Text, "C") {
+ t.Errorf("table Text should contain caption text 'C', got %q", tbl.Text)
+ }
+}
+
+// TestTextSectionsInsideTableRegion_Suppressed verifies that Sections
+// whose positions fall inside a table region are suppressed even when
+// DLA labeled them as "text". Python _extract_table_figure pops ALL
+// boxes overlapping a table region, regardless of their DLA label.
+// This is the #1 cause of Go vs Python discrepancy on table-heavy PDFs.
+func TestTextSectionsInsideTableRegion_Suppressed(t *testing.T) {
+ eng := &mockEngine{
+ pageCount: 1,
+ renderW: 1800, renderH: 2400,
+ chars: map[int][]TextChar{0: {
+ // Box A: inside DLA table region, labeled as "text" by DLA.
+ {X0: 50, X1: 100, Top: 40, Bottom: 55, Text: "碎片文字"},
+ // Box B: inside DLA table region, same situation.
+ {X0: 120, X1: 160, Top: 40, Bottom: 55, Text: "垃圾"},
+ }},
+ }
+ // DLA returns a "table" region AND a "text" sub-region inside it.
+ // Real DLA often splits large table regions this way.
+ mock := &MockDocAnalyzer{
+ Healthy: true,
+ DLARegions: []DLARegion{
+ {X0: 100, Y0: 80, X1: 500, Y1: 300, Label: "table", Confidence: 0.9},
+ {X0: 120, Y0: 100, X1: 180, Y1: 140, Label: "text", Confidence: 0.8},
+ },
+ TSRCells: []TSRCell{
+ {X0: 0, Y0: 0, X1: 200, Y1: 100, Text: "姓名", Label: "table row"},
+ {X0: 200, Y0: 0, X1: 460, Y1: 100, Text: "年龄", Label: "table row"},
+ },
+ }
+ p := NewParser(DefaultParserConfig(), mock)
+
+ result, err := p.Parse(context.Background(), eng)
+ if err != nil {
+ t.Fatalf("Parse: %v", err)
+ }
+
+ // Assert 1: table Section exists with structured text.
+ var hasTable bool
+ for _, s := range result.Sections {
+ if s.LayoutType == "table" && s.Text != "" {
+ hasTable = true
+ break
+ }
+ }
+ if !hasTable {
+ t.Fatal("expected a table Section with structured text")
+ }
+
+ // Assert 2: NO "text" fragment sections remain — they were inside
+ // the table region and should be suppressed (Python pops them).
+ for _, s := range result.Sections {
+ if s.LayoutType != "table" && strings.Contains(s.Text, "碎片") {
+ t.Errorf("text fragment %q inside table region should be suppressed, got %q",
+ s.Text, s.LayoutType)
+ }
+ if s.LayoutType != "table" && strings.Contains(s.Text, "垃圾") {
+ t.Errorf("text fragment %q inside table region should be suppressed, got %q",
+ s.Text, s.LayoutType)
+ }
+ }
+ sectionCount := len(result.Sections)
+ if sectionCount > 3 {
+ t.Errorf("expected ≤3 sections (table + outside fragments), got %d", sectionCount)
+ }
+}
+
+// TestEmptyDoc_NoCrash verifies Parse handles edge cases gracefully:
+// it must return no error and no sections for a document without content.
+//
+// NOTE(review): mockEngine.PageCount reports 1 whenever pageCount <= 0, so
+// despite pageCount: 0 the engine presents a single page with no chars (and
+// a default-sized rendered image) rather than a zero-page document.
+func TestEmptyDoc_NoCrash(t *testing.T) {
+ eng := &mockEngine{pageCount: 0}
+ p := NewParser(DefaultParserConfig(), &MockDocAnalyzer{Healthy: true, Model: ModelSaas})
+ result, err := p.Parse(context.Background(), eng)
+ if err != nil {
+ t.Fatalf("Parse: %v", err)
+ }
+ if len(result.Sections) != 0 {
+ t.Errorf("expected 0 sections for empty doc, got %d", len(result.Sections))
+ }
+}
+
+// TestNilChars_Handled verifies zero-chars pages don't crash: Parse must
+// return without error for a one-page engine whose chars map is nil.
+// The section count is only logged, never asserted.
+func TestNilChars_Handled(t *testing.T) {
+ eng := &mockEngine{pageCount: 1, renderW: 200, renderH: 200}
+ p := NewParser(DefaultParserConfig(), &MockDocAnalyzer{Healthy: true, Model: ModelSaas})
+ result, err := p.Parse(context.Background(), eng)
+ if err != nil {
+ t.Fatalf("Parse: %v", err)
+ }
+ // Informational only: sections may legitimately appear here.
+ if len(result.Sections) != 0 && p.DeepDoc != nil {
+ t.Logf("nil chars + DeepDoc: sections=%d (may trigger OCR path)", len(result.Sections))
+ }
+}
+
+// TestMergeCaptions_EuclideanDistance is intended to cover caption matching
+// by squared Euclidean (center-to-center) distance rather than Y-only
+// distance.
+//
+// NOTE(review): the fixture holds a single figure and a single caption, so
+// it only proves that the caption is merged; it cannot tell a Euclidean
+// metric from a Y-only one. A discriminating case needs competing
+// candidates at different X offsets.
+func TestMergeCaptions_EuclideanDistance(t *testing.T) {
+ sections := []Section{
+ {Text: "F", LayoutType: "figure", Positions: []Position{
+ {PageNumbers: []int{0, 0}, Left: 0, Right: 100, Top: 0, Bottom: 50},
+ }},
+ // Caption directly below the figure: same horizontal span (dx=0),
+ // 20pt gap under the figure's bottom edge, 50pt center-to-center.
+ {Text: "close", LayoutType: "figure caption", Positions: []Position{
+ {PageNumbers: []int{0, 0}, Left: 0, Right: 100, Top: 70, Bottom: 80},
+ }},
+ }
+ figures := CollectFigures(sections)
+ result := mergeCaptions(sections, figures)
+ // Caption merged into figure — verified by figure Text containing caption.
+ if len(result) != 1 {
+ t.Fatalf("expected 1 section after merge, got %d", len(result))
+ }
+ if !strings.Contains(result[0].Text, "close") {
+ t.Errorf("figure Text should contain caption 'close', got %q", result[0].Text)
+ }
+}
+
+// mockEngine is a minimal PDFEngine stub for unit tests.
+// Zero-valued fields fall back to defaults in the methods that read them.
+type mockEngine struct {
+ // chars maps a page index to the chars ExtractChars returns for it.
+ chars map[int][]TextChar
+ // pageCount is the value PageCount reports; values <= 0 are reported as 1.
+ pageCount int
+ // renderW and renderH size the image produced by RenderPageImage;
+ // values <= 0 fall back to 100.
+ renderW int
+ renderH int
+}
+
+// ExtractChars returns the chars configured for page pg. Pages without an
+// entry (or a nil chars map) yield a nil slice; the error is always nil.
+func (m *mockEngine) ExtractChars(pg int) ([]TextChar, error) {
+ return m.chars[pg], nil
+}
+// RenderPage is a no-op stub: it ignores pg and dpi and always returns
+// (nil, nil). Use RenderPageImage to obtain a decoded image.
+func (m *mockEngine) RenderPage(pg int, dpi float64) ([]byte, error) {
+	return nil, nil
+}
+func (m *mockEngine) RenderPageImage(pg int, dpi float64) (image.Image, error) {
+ w, h := m.renderW, m.renderH
+ if w <= 0 {
+ w = 100
+ }
+ if h <= 0 {
+ h = 100
+ }
+ return image.NewRGBA(image.Rect(0, 0, w, h)), nil
+}
+// PageCount reports the configured page count, treating any non-positive
+// value as a single page. The error is always nil.
+func (m *mockEngine) PageCount() (int, error) {
+	if m.pageCount > 0 {
+		return m.pageCount, nil
+	}
+	return 1, nil
+}
+// RawData always returns nil; the mock holds no document bytes.
+func (m *mockEngine) RawData() []byte { return nil }
+// Close is a no-op that always returns nil.
+func (m *mockEngine) Close() error { return nil }
diff --git a/internal/deepdoc/parser/pdf/table_test.go b/internal/deepdoc/parser/pdf/table_test.go
new file mode 100644
index 0000000000..d7e9b55606
--- /dev/null
+++ b/internal/deepdoc/parser/pdf/table_test.go
@@ -0,0 +1,1862 @@
+package parser
+
+import (
+ "context"
+ "image"
+ "strings"
+ "testing"
+)
+
+// ---- groupTSRCellsToRows ----
+
+// TestGroupTSRCellsToRows_Empty checks that both a nil slice and an empty
+// slice produce a nil result.
+func TestGroupTSRCellsToRows_Empty(t *testing.T) {
+	fromNil := groupTSRCellsToRows(nil)
+	if fromNil != nil {
+		t.Errorf("nil input: expected nil, got %d rows", len(fromNil))
+	}
+
+	fromEmpty := groupTSRCellsToRows([]TSRCell{})
+	if fromEmpty != nil {
+		t.Errorf("empty input: expected nil, got %d rows", len(fromEmpty))
+	}
+}
+
+// TestGroupTSRCellsToRows_SingleCell checks that one cell yields exactly
+// one row holding that cell.
+func TestGroupTSRCellsToRows_SingleCell(t *testing.T) {
+	rows := groupTSRCellsToRows([]TSRCell{
+		{X0: 0, Y0: 0, X1: 10, Y1: 10, Text: "A"},
+	})
+	// Short-circuit evaluation keeps the indexing safe.
+	ok := len(rows) == 1 && len(rows[0]) == 1 && rows[0][0].Text == "A"
+	if !ok {
+		t.Errorf("single cell: expected [[A]], got %v", rows)
+	}
+}
+
+// TestGroupTSRCellsToRows_TwoRows checks that a 2x2 grid of cells is split
+// into two rows of two cells, each ordered left to right.
+func TestGroupTSRCellsToRows_TwoRows(t *testing.T) {
+	cells := []TSRCell{
+		{X0: 00, Y0: 0, X1: 10, Y1: 10, Text: "A1"},
+		{X0: 20, Y0: 0, X1: 30, Y1: 10, Text: "B1"},
+		{X0: 00, Y0: 30, X1: 10, Y1: 40, Text: "A2"},
+		{X0: 20, Y0: 30, X1: 30, Y1: 40, Text: "B2"},
+	}
+	rows := groupTSRCellsToRows(cells)
+	if len(rows) != 2 {
+		t.Fatalf("expected 2 rows, got %d", len(rows))
+	}
+	// Fatal rather than Error: the checks below index rows[i][1] and would
+	// panic on a short row instead of reporting a failure.
+	if len(rows[0]) != 2 || len(rows[1]) != 2 {
+		t.Fatalf("expected 2 cells per row, got %d/%d", len(rows[0]), len(rows[1]))
+	}
+	// Row 0 sorted by X0
+	if rows[0][0].Text != "A1" || rows[0][1].Text != "B1" {
+		t.Errorf("row 0 order wrong: %v", tsrCellTexts(rows[0]))
+	}
+	// Row 1 sorted by X0
+	if rows[1][0].Text != "A2" || rows[1][1].Text != "B2" {
+		t.Errorf("row 1 order wrong: %v", tsrCellTexts(rows[1]))
+	}
+}
+
+// TestGroupTSRCellsToRows_CloseRows checks that two vertically adjacent
+// cells separated by a 1-unit gap are still reported as two rows.
+func TestGroupTSRCellsToRows_CloseRows(t *testing.T) {
+ // Two rows with small Y gap — should still be separate rows
+ cells := []TSRCell{
+ {X0: 0, Y0: 0, X1: 10, Y1: 8, Text: "Row1"},
+ {X0: 0, Y0: 9, X1: 10, Y1: 17, Text: "Row2"},
+ }
+ rows := groupTSRCellsToRows(cells)
+ // Both cells are 8 high. Assuming the row threshold is half the median
+ // height (4) and rows are split on the Y0 delta (9 - 0 = 9 > 4), the cells
+ // land in different rows — TODO confirm against groupTSRCellsToRows.
+ if len(rows) != 2 {
+ t.Errorf("close rows: expected 2, got %d", len(rows))
+ }
+}
+
+// TestGroupTSRCellsToRows_VaryingHeights checks row grouping when cell
+// heights differ: A stands alone, B and C (identical Y range) share a row.
+func TestGroupTSRCellsToRows_VaryingHeights(t *testing.T) {
+ cells := []TSRCell{
+ {X0: 0, Y0: 0, X1: 10, Y1: 5, Text: "A"}, // height 5
+ {X0: 0, Y0: 50, X1: 10, Y1: 70, Text: "B"}, // height 20
+ {X0: 0, Y0: 50, X1: 10, Y1: 70, Text: "C"}, // height 20, same row as B
+ }
+ rows := groupTSRCellsToRows(cells)
+ // Heights sorted: 5, 20, 20 → median (index 1) = 20.
+ // Assuming threshold = median/2 = 10 (TODO confirm): B is at least 45
+ // below A (50-5 edge gap, 50-0 by Y0) → different row;
+ // C vs B: 50-50 = 0 ≤ 10 → same row.
+ if len(rows) != 2 {
+ t.Fatalf("varying heights: expected 2 rows, got %d", len(rows))
+ }
+ if len(rows[0]) != 1 || rows[0][0].Text != "A" {
+ t.Errorf("row 0: expected [A], got %v", tsrCellTexts(rows[0]))
+ }
+ if len(rows[1]) != 2 {
+ t.Errorf("row 1: expected 2 cells, got %v", tsrCellTexts(rows[1]))
+ }
+}
+
+// tsrCellTexts returns the Text of every cell, in order. It is used to keep
+// failure messages readable. The result is never nil.
+func tsrCellTexts(cells []TSRCell) []string {
+	texts := make([]string, 0, len(cells))
+	for idx := range cells {
+		texts = append(texts, cells[idx].Text)
+	}
+	return texts
+}
+
+// ---- boxOverlapsCell ----
+
+// TestBoxOverlapsCell_FullOverlap covers boxes that lie completely within
+// the cell: one identical to the cell and one strictly inside it.
+func TestBoxOverlapsCell_FullOverlap(t *testing.T) {
+	cell := TSRCell{X0: 0, Y0: 0, X1: 100, Y1: 50}
+
+	// Same rectangle as the cell → ≥85% of box area inside cell → match.
+	exact := TextBox{X0: 0, X1: 100, Top: 0, Bottom: 50, Text: "hello"}
+	if matched := boxOverlapsCell(cell, exact); !matched {
+		t.Error("full overlap should return true")
+	}
+
+	// Smaller box, still entirely inside → box→cell = 100% ≥ 85% → match.
+	inner := TextBox{X0: 10, X1: 90, Top: 10, Bottom: 40, Text: "partial"}
+	if matched := boxOverlapsCell(cell, inner); !matched {
+		t.Error("box entirely inside cell (100% of box) should match")
+	}
+}
+
+func TestBoxOverlapsCell_NoOverlap(t *testing.T) {
+ cell := TSRCell{X0: 0, Y0: 0, X1: 100, Y1: 50}
+ box := TextBox{X0: 200, X1: 300, Top: 10, Bottom: 40, Text: "away"}
+ if boxOverlapsCell(cell, box) {
+ t.Error("no X overlap should return false")
+ }
+}
+
+// TestBoxOverlapsCell_PartialOverlap pins the direction of the overlap
+// ratio: it is measured against the box area (≥85% of the box must lie
+// inside the cell), not against the cell area.
+func TestBoxOverlapsCell_PartialOverlap(t *testing.T) {
+	cell := TSRCell{X0: 0, Y0: 0, X1: 100, Y1: 50}
+
+	// Small box fully contained in the cell (100% of box area) → match.
+	contained := TextBox{X0: 0, X1: 30, Top: 0, Bottom: 25, Text: "small"}
+	if matched := boxOverlapsCell(cell, contained); !matched {
+		t.Error("box entirely inside cell should match")
+	}
+
+	// Box crossing the cell's right edge (< 85% of box inside) → no match.
+	straddling := TextBox{X0: 80, X1: 180, Top: 0, Bottom: 25, Text: "spill"}
+	if matched := boxOverlapsCell(cell, straddling); matched {
+		t.Error("box straddling boundary (<85% inside) should NOT match")
+	}
+}
+
+func TestBoxOverlapsCell_ZeroArea(t *testing.T) {
+ cell := TSRCell{X0: 0, Y0: 0, X1: 0, Y1: 50}
+ box := TextBox{X0: 0, X1: 10, Top: 0, Bottom: 10, Text: "x"}
+ if boxOverlapsCell(cell, box) {
+ t.Error("zero cell area should return false")
+ }
+}
+
+// ---- fillCellTextFromBoxes ----
+
+// TestFillCellTextFromBoxes_Simple checks the one-box-per-cell case: each
+// cell receives, in place, the text of the box with the same rectangle.
+func TestFillCellTextFromBoxes_Simple(t *testing.T) {
+	// Two side-by-side cells, initially without text.
+	cells := []TSRCell{
+		{X0: 0, Y0: 0, X1: 100, Y1: 50},
+		{X0: 100, Y0: 0, X1: 200, Y1: 50},
+	}
+	// One box per cell, each exactly covering its cell (>85%) → match.
+	boxes := []TextBox{
+		{X0: 0, X1: 100, Top: 0, Bottom: 50, Text: "cell1"},
+		{X0: 100, X1: 200, Top: 0, Bottom: 50, Text: "cell2"},
+	}
+
+	fillCellTextFromBoxes(cells, boxes)
+
+	if got := cells[0].Text; got != "cell1" {
+		t.Errorf("cell 0: got %q, want 'cell1'", got)
+	}
+	if got := cells[1].Text; got != "cell2" {
+		t.Errorf("cell 1: got %q, want 'cell2'", got)
+	}
+}
+
+// TestFillCellTextFromBoxes_MultipleBoxesPerCell checks that a cell
+// overlapped by two boxes ends up with non-empty text.
+//
+// NOTE(review): the comments describe the boxes as being concatenated with
+// a space, but the assertion only requires the text to be non-empty.
+func TestFillCellTextFromBoxes_MultipleBoxesPerCell(t *testing.T) {
+ // Both boxes lie entirely inside the cell and each spans most of it,
+ // so each is expected to match individually.
+ cells := []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 50}}
+ boxes := []TextBox{
+ {X0: 0, X1: 95, Top: 0, Bottom: 47, Text: "part1"},
+ {X0: 5, X1: 100, Top: 3, Bottom: 50, Text: "part2"},
+ }
+ fillCellTextFromBoxes(cells, boxes)
+ // Presumably both texts are joined with a space (TODO confirm); only
+ // non-emptiness is asserted here.
+ if cells[0].Text == "" {
+ t.Error("expected non-empty cell text")
+ }
+}
+
+// TestFillCellTextFromBoxes_EmptyBoxText checks that a whitespace-only box
+// inside the cell leaves the cell text empty.
+func TestFillCellTextFromBoxes_EmptyBoxText(t *testing.T) {
+	cells := []TSRCell{
+		{X0: 0, Y0: 0, X1: 100, Y1: 50},
+	}
+	blank := TextBox{X0: 5, X1: 95, Top: 5, Bottom: 45, Text: " "}
+
+	fillCellTextFromBoxes(cells, []TextBox{blank})
+
+	if got := cells[0].Text; got != "" {
+		t.Errorf("empty box text: got %q, want empty", got)
+	}
+}
+
+// TestFillCellTextFromBoxes_NoMatchingBox checks that a box far outside the
+// cell contributes no text.
+func TestFillCellTextFromBoxes_NoMatchingBox(t *testing.T) {
+	cells := []TSRCell{
+		{X0: 0, Y0: 0, X1: 100, Y1: 50},
+	}
+	remote := TextBox{X0: 500, X1: 600, Top: 500, Bottom: 550, Text: "far away"}
+
+	fillCellTextFromBoxes(cells, []TextBox{remote})
+
+	if got := cells[0].Text; got != "" {
+		t.Errorf("no match: got %q, want empty", got)
+	}
+}
+
+// ---- regionOverlapsBox ----
+
+func TestRegionOverlapsBox_StrongOverlap(t *testing.T) {
+ region := DLARegion{X0: 0, Y0: 0, X1: 216, Y1: 108} // DLA coords at 216 DPI
+ box := TextBox{X0: 0, X1: 100, Top: 0, Bottom: 50}
+ if !regionOverlapsBox(region, box, 3.0) {
+ t.Error("full overlap should match")
+ }
+}
+
+func TestRegionOverlapsBox_NoOverlap(t *testing.T) {
+ region := DLARegion{X0: 0, Y0: 0, X1: 216, Y1: 108}
+ box := TextBox{X0: 500, X1: 600, Top: 500, Bottom: 550}
+ if regionOverlapsBox(region, box, 3.0) {
+ t.Error("no overlap should return false")
+ }
+}
+
+// TestRegionOverlapsBox_WeakOverlap checks regionOverlapsBox on one
+// 100x100 box against regions covering 9%, 40% and 81% of it.
+func TestRegionOverlapsBox_WeakOverlap(t *testing.T) {
+ // Overlap at 9% → below 40% threshold → false.
+ region := DLARegion{X0: 0, Y0: 0, X1: 90, Y1: 90} // 30x30 at scale 3
+ box := TextBox{X0: 0, X1: 100, Top: 0, Bottom: 100} // overlap = 30*30 / (100*100) = 9%
+ if regionOverlapsBox(region, box, 3.0) {
+ t.Error("9% overlap should return false")
+ }
+ // Overlap ≥ 40% → should match (Python thr=0.4).
+ // box 100x100=10000 area; region 100x40=4000 → exactly 40%.
+ region2 := DLARegion{X0: 0, Y0: 0, X1: 300, Y1: 120, Label: "table"} // 100x40 at scale 3
+ if !regionOverlapsBox(region2, box, 3.0) {
+ t.Error("40% overlap should match (>= 0.4)")
+ }
+ // Region that covers most of the box → should match
+ region3 := DLARegion{X0: 0, Y0: 0, X1: 270, Y1: 270} // 90x90 at scale 3
+ if !regionOverlapsBox(region3, box, 3.0) {
+ t.Error("81% overlap should match")
+ }
+}
+
+// TestRegionOverlapsBox_ThresholdAt040 pins the boundary of the overlap
+// threshold: exactly 40% of the box must match, 39% must not.
+func TestRegionOverlapsBox_ThresholdAt040(t *testing.T) {
+ // Box area is 100*100 = 10000, so 40% is 4000.
+ // A 100x40 region (in box space) covers exactly that.
+ box := TextBox{X0: 0, X1: 100, Top: 0, Bottom: 100}
+ region := DLARegion{X0: 0, Y0: 0, X1: 300, Y1: 120, Label: "table"} // 100x40 at scale 3
+ if !regionOverlapsBox(region, box, 3.0) {
+ t.Error("exact 40% overlap should match (>= 0.4)")
+ }
+ // 39% overlap should NOT match
+ region2 := DLARegion{X0: 0, Y0: 0, X1: 300, Y1: 117, Label: "table"} // 100x39 at scale 3
+ if regionOverlapsBox(region2, box, 3.0) {
+ t.Error("39% overlap should NOT match")
+ }
+}
+
+// ---- annotateBoxLayouts ----
+
+// TestAnnotateBoxLayouts_SetsLabel checks that each box receives the label
+// of the DLA region that covers it.
+func TestAnnotateBoxLayouts_SetsLabel(t *testing.T) {
+	boxes := []TextBox{
+		{X0: 0, X1: 100, Top: 0, Bottom: 20},
+		{X0: 0, X1: 100, Top: 30, Bottom: 50},
+	}
+	regions := []DLARegion{
+		{X0: 0, Y0: 0, X1: 300, Y1: 60, Label: "title"},  // covers box 0 at scale 3
+		{X0: 0, Y0: 90, X1: 300, Y1: 150, Label: "text"}, // covers box 1 at scale 3
+	}
+	boxes = annotateBoxLayouts(boxes, regions, 3.0, 0)
+	// annotateBoxLayouts can drop boxes; fail cleanly instead of panicking
+	// on the indexing below.
+	if len(boxes) < 2 {
+		t.Fatalf("expected both boxes to survive annotation, got %d", len(boxes))
+	}
+	if boxes[0].LayoutType != "title" {
+		t.Errorf("box 0: got %q, want 'title'", boxes[0].LayoutType)
+	}
+	if boxes[1].LayoutType != "text" {
+		t.Errorf("box 1: got %q, want 'text'", boxes[1].LayoutType)
+	}
+}
+
+// TestAnnotateBoxLayouts_NoMatch checks that a box with no overlapping
+// region keeps an empty LayoutType.
+func TestAnnotateBoxLayouts_NoMatch(t *testing.T) {
+	// Region far away from the box — no overlap
+	boxes := []TextBox{
+		{X0: 0, X1: 100, Top: 0, Bottom: 20},
+	}
+	regions := []DLARegion{
+		{X0: 900, Y0: 900, X1: 1000, Y1: 1000, Label: "far"}, // completely outside
+	}
+	boxes = annotateBoxLayouts(boxes, regions, 3.0, 0)
+	// Guard the index: an unexpectedly dropped box must fail, not panic.
+	if len(boxes) == 0 {
+		t.Fatal("no match: box should be kept, got 0 boxes")
+	}
+	if boxes[0].LayoutType != "" {
+		t.Errorf("no match: expected empty, got %q", boxes[0].LayoutType)
+	}
+}
+
+// TestAnnotateBoxLayouts_EmptyRegions checks that nil and empty region
+// lists leave the box without a layout type.
+func TestAnnotateBoxLayouts_EmptyRegions(t *testing.T) {
+	boxes := []TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 20}}
+	boxes = annotateBoxLayouts(boxes, nil, 3.0, 0)
+	boxes = annotateBoxLayouts(boxes, []DLARegion{}, 3.0, 0)
+	// Guard the index: an unexpectedly dropped box must fail, not panic.
+	if len(boxes) == 0 {
+		t.Fatal("empty regions: box should be kept, got 0 boxes")
+	}
+	if boxes[0].LayoutType != "" {
+		t.Errorf("empty regions: got %q, want empty", boxes[0].LayoutType)
+	}
+}
+
+// TestAnnotateBoxLayouts_PriorityOverMaxArea checks that label priority
+// beats overlap size: a "table" region that meets the threshold wins over
+// a "text" region with larger overlap.
+func TestAnnotateBoxLayouts_PriorityOverMaxArea(t *testing.T) {
+	// "table" type checked before "text" in priority order.
+	// Even if "text" region has larger overlap, "table" wins if it meets threshold (≥40%).
+	boxes := []TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 50}}
+	regions := []DLARegion{
+		// text region: full coverage (100% overlap) — but lower priority
+		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "text"},
+		// table region: 45% overlap (45x50 out of 100x50) — higher priority, meets threshold
+		{X0: 0, Y0: 0, X1: 45 * 3, Y1: 50 * 3, Label: "table"},
+	}
+	boxes = annotateBoxLayouts(boxes, regions, 3.0, 0)
+	// Guard the index: an unexpectedly dropped box must fail, not panic.
+	if len(boxes) == 0 {
+		t.Fatal("priority: box should be kept, got 0 boxes")
+	}
+	if boxes[0].LayoutType != "table" {
+		t.Errorf("priority: 'table' should win over 'text' when both meet threshold, got %q", boxes[0].LayoutType)
+	}
+}
+
+// TestAnnotateBoxLayouts_OverlapThreshold checks that a region covering
+// less than 40% of the box does not label it.
+func TestAnnotateBoxLayouts_OverlapThreshold(t *testing.T) {
+	// Box area is 100*50 = 5000. The region is 30x30 at scale 3, so the
+	// intersection is 900 → 18% of the box, below the 0.4 threshold.
+	boxes := []TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 50}}
+	regions := []DLARegion{
+		{X0: 0, Y0: 0, X1: 30 * 3, Y1: 30 * 3, Label: "table"}, // covers 18% of box
+	}
+	boxes = annotateBoxLayouts(boxes, regions, 3.0, 0)
+	// Guard the index: an unexpectedly dropped box must fail, not panic.
+	if len(boxes) == 0 {
+		t.Fatal("threshold: box should be kept, got 0 boxes")
+	}
+	if boxes[0].LayoutType != "" {
+		t.Errorf("threshold: overlap < 40%% should not match, got %q", boxes[0].LayoutType)
+	}
+}
+
+// TestAnnotateBoxLayouts_CIDGarbage checks that a box whose text matches
+// the CID pattern is removed from the result (Python: bxs.pop(i)) while
+// the ordinary box is kept and labelled.
+func TestAnnotateBoxLayouts_CIDGarbage(t *testing.T) {
+	garbled := TextBox{X0: 0, X1: 100, Top: 0, Bottom: 20, Text: "(cid:123)"}
+	normal := TextBox{X0: 0, X1: 100, Top: 30, Bottom: 50, Text: "normal text"}
+	regions := []DLARegion{
+		{X0: 0, Y0: 0, X1: 300, Y1: 60, Label: "text", Confidence: 0.9},
+		{X0: 0, Y0: 90, X1: 300, Y1: 150, Label: "text", Confidence: 0.9},
+	}
+
+	kept := annotateBoxLayouts([]TextBox{garbled, normal}, regions, 3.0, 0)
+
+	// Only the ordinary box may remain.
+	if n := len(kept); n != 1 {
+		t.Fatalf("CID-garbled box should be popped, got %d boxes", n)
+	}
+	if layout := kept[0].LayoutType; layout != "text" {
+		t.Errorf("CID: remaining box should be 'text', got %q", layout)
+	}
+}
+
+// TestAnnotateBoxLayouts_LayoutNoFormat pins the LayoutNo format and checks
+// that boxes matched to the same region share the same value.
+func TestAnnotateBoxLayouts_LayoutNoFormat(t *testing.T) {
+	// layoutno uses Python format: "{type}-{per_type_index}" where per_type_index
+	// is the index of the matched DLA region within its type (not global).
+	// Two boxes overlapping the SAME text region share the same layoutno → VM can merge them.
+	boxes := []TextBox{
+		{X0: 0, X1: 100, Top: 0, Bottom: 20},
+		{X0: 0, X1: 100, Top: 30, Bottom: 50},
+	}
+	regions := []DLARegion{
+		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "text"}, // covers both boxes
+	}
+	boxes = annotateBoxLayouts(boxes, regions, 3.0, 0)
+	// Guard the indexes: an unexpectedly dropped box must fail, not panic.
+	if len(boxes) < 2 {
+		t.Fatalf("expected both boxes to survive annotation, got %d", len(boxes))
+	}
+	want := "text-0"
+	if boxes[0].LayoutNo != want {
+		t.Errorf("box 0 layoutno: got %q, want %q", boxes[0].LayoutNo, want)
+	}
+	if boxes[1].LayoutNo != want {
+		t.Errorf("box 1 layoutno should share same per-type index: got %q, want %q", boxes[1].LayoutNo, want)
+	}
+}
+
+// TestAnnotateBoxLayouts_LayoutNoDifferentRegions checks that boxes matched
+// to different regions of the same type get different LayoutNo values.
+func TestAnnotateBoxLayouts_LayoutNoDifferentRegions(t *testing.T) {
+	// Two boxes in different text regions → different layoutno.
+	boxes := []TextBox{
+		{X0: 0, X1: 100, Top: 0, Bottom: 20},
+		{X0: 0, X1: 100, Top: 100, Bottom: 120},
+	}
+	regions := []DLARegion{
+		{X0: 0, Y0: 0, X1: 300, Y1: 60, Label: "text"},    // per-type index 0
+		{X0: 0, Y0: 300, X1: 300, Y1: 360, Label: "text"}, // per-type index 1
+	}
+	boxes = annotateBoxLayouts(boxes, regions, 3.0, 0)
+	// Guard the indexes: an unexpectedly dropped box must fail, not panic.
+	if len(boxes) < 2 {
+		t.Fatalf("expected both boxes to survive annotation, got %d", len(boxes))
+	}
+	if boxes[0].LayoutNo != "text-0" {
+		t.Errorf("box 0: got %q, want 'text-0'", boxes[0].LayoutNo)
+	}
+	if boxes[1].LayoutNo != "text-1" {
+		t.Errorf("box 1: got %q, want 'text-1'", boxes[1].LayoutNo)
+	}
+}
+
+// TestAnnotateBoxLayouts_ConfidenceFilter verifies that DLA regions with
+// low confidence (< 0.4) for garbage layout types are excluded from matching.
+// Python: float(b["score"]) >= 0.4 filter in LayoutRecognizer.
+func TestAnnotateBoxLayouts_ConfidenceFilter(t *testing.T) {
+	boxes := []TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 50}}
+	// Low-confidence footer — should be filtered out.
+	regions := []DLARegion{
+		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "footer", Confidence: 0.2},
+		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "text", Confidence: 0.9},
+	}
+	boxes = annotateBoxLayouts(boxes, regions, 3.0, 0)
+	// Guard the index: if the footer were honoured the box could be
+	// dropped, and that must fail rather than panic.
+	if len(boxes) == 0 {
+		t.Fatal("low-confidence footer filtered → box should be kept, got 0 boxes")
+	}
+	// Footer region filtered (low confidence) → box matches "text" instead.
+	if boxes[0].LayoutType != "text" {
+		t.Errorf("low-confidence footer filtered → box should get 'text', got %q", boxes[0].LayoutType)
+	}
+}
+
+// TestAnnotateBoxLayouts_GarbageFooterRejected checks that a box matched by
+// a high-confidence footer region near the page bottom is removed.
+func TestAnnotateBoxLayouts_GarbageFooterRejected(t *testing.T) {
+ // The page image is 300px high, i.e. 100 in PDF space at scale 3, so the
+ // bottom-of-page cutoff is presumably 90 (90% of 100 — TODO confirm).
+ // The box (Top=280, Bottom=290 in PDF space) lies past that cutoff
+ // → real footer decoration → garbage → pop (Python: bxs.pop(i)).
+ // NOTE(review): the box also lies beyond the 100-high page itself.
+ boxes := []TextBox{{X0: 0, X1: 100, Top: 280, Bottom: 290}}
+ regions := []DLARegion{
+ {X0: 0, Y0: 840, X1: 300, Y1: 870, Label: "footer", Confidence: 0.9}, // y=280-290 after /3
+ }
+ boxes = annotateBoxLayouts(boxes, regions, 3.0, 300) // PDF height = 300/3 = 100
+ if len(boxes) != 0 {
+ t.Errorf("footer at bottom: should be popped as decoration, got %d boxes left", len(boxes))
+ }
+}
+
+// TestAnnotateBoxLayouts_HeaderRemovedAtTop checks that a box labelled
+// "header" at the very top of the page is removed as decoration.
+func TestAnnotateBoxLayouts_HeaderRemovedAtTop(t *testing.T) {
+	// 300px page image → PDF height 100. The box starts at y=5, which is
+	// below 10% of 100 → real header decoration → garbage → pop
+	// (Python: bxs.pop(i)).
+	topBox := TextBox{X0: 0, X1: 100, Top: 5, Bottom: 20}
+	header := DLARegion{X0: 0, Y0: 15, X1: 300, Y1: 60, Label: "header", Confidence: 0.9} // y=5-20 after /3
+
+	remaining := annotateBoxLayouts([]TextBox{topBox}, []DLARegion{header}, 3.0, 300)
+
+	if n := len(remaining); n != 0 {
+		t.Errorf("header at very top: should be popped as decoration, got %d boxes left", n)
+	}
+}
+
+// TestAnnotateBoxLayouts_HeaderKeptInMiddle checks that a "header" region
+// in the middle of the page is treated as a DLA false positive: the box is
+// kept and labelled "header".
+func TestAnnotateBoxLayouts_HeaderKeptInMiddle(t *testing.T) {
+	// Header in middle of page (y=50 in 300px page → PDF height 100 → 50 > 10)
+	// → DLA false positive → KEEP the text.
+	boxes := []TextBox{{X0: 0, X1: 100, Top: 50, Bottom: 70}}
+	regions := []DLARegion{
+		{X0: 0, Y0: 150, X1: 300, Y1: 210, Label: "header", Confidence: 0.9}, // y=50-70 after /3
+	}
+	boxes = annotateBoxLayouts(boxes, regions, 3.0, 300)
+	// A dropped box is exactly the regression this test guards against;
+	// report it instead of panicking on the index below.
+	if len(boxes) == 0 {
+		t.Fatal("header in middle of page: box should be kept, got 0 boxes")
+	}
+	if boxes[0].LayoutType != "header" {
+		t.Errorf("header in middle of page: DLA false positive, keep text, got %q", boxes[0].LayoutType)
+	}
+}
+
+// TestAnnotateBoxLayouts_FooterRemovedAtBottom checks that a box labelled
+// "footer" at the very bottom of the page is removed as decoration.
+func TestAnnotateBoxLayouts_FooterRemovedAtBottom(t *testing.T) {
+	// 300px page image → PDF height 100. The box starts at y=95, which is
+	// above 90% of 100 → real footer decoration → garbage → REMOVE.
+	bottomBox := TextBox{X0: 0, X1: 100, Top: 95, Bottom: 100}
+	footer := DLARegion{X0: 0, Y0: 285, X1: 300, Y1: 300, Label: "footer", Confidence: 0.9} // y=95-100 after /3
+
+	remaining := annotateBoxLayouts([]TextBox{bottomBox}, []DLARegion{footer}, 3.0, 300)
+
+	if n := len(remaining); n != 0 {
+		t.Errorf("footer at very bottom: should be popped as decoration, got %d boxes left", n)
+	}
+}
+
+// TestAnnotateBoxLayouts_FooterKeptInMiddle checks that a "footer" region
+// in the middle of the page is treated as a DLA false positive: the box is
+// kept and labelled "footer".
+func TestAnnotateBoxLayouts_FooterKeptInMiddle(t *testing.T) {
+	// Footer in middle of page (y=50 in 300px page → PDF height 100 → 50 < 90)
+	// → DLA false positive → KEEP the text.
+	boxes := []TextBox{{X0: 0, X1: 100, Top: 50, Bottom: 70}}
+	regions := []DLARegion{
+		{X0: 0, Y0: 150, X1: 300, Y1: 210, Label: "footer", Confidence: 0.9}, // y=50-70 after /3
+	}
+	boxes = annotateBoxLayouts(boxes, regions, 3.0, 300)
+	// A dropped box is exactly the regression this test guards against;
+	// report it instead of panicking on the index below.
+	if len(boxes) == 0 {
+		t.Fatal("footer in middle of page: box should be kept, got 0 boxes")
+	}
+	if boxes[0].LayoutType != "footer" {
+		t.Errorf("footer in middle of page: DLA false positive, keep text, got %q", boxes[0].LayoutType)
+	}
+}
+
+ // TestAnnotateBoxLayouts_ReferenceAlwaysGarbage checks that a box matched by
+ // a "reference" region is filtered out even when it sits mid-page.
+ func TestAnnotateBoxLayouts_ReferenceAlwaysGarbage(t *testing.T) {
+ 	// Reference type is always garbage regardless of position (no keep_feat).
+ 	ref := DLARegion{X0: 0, Y0: 150, X1: 300, Y1: 210, Label: "reference", Confidence: 0.9}
+ 	input := []TextBox{{X0: 0, X1: 100, Top: 50, Bottom: 70}}
+
+ 	remaining := annotateBoxLayouts(input, []DLARegion{ref}, 3.0, 300)
+ 	if len(remaining) == 0 {
+ 		return
+ 	}
+ 	t.Errorf("reference: should always be garbage-filtered, got %q", remaining[0].LayoutType)
+ }
+
+ // TestAnnotateBoxLayouts_NonGarbageTypeUnaffected checks that a box matched
+ // by a "text" region keeps its box and receives the "text" layout type.
+ func TestAnnotateBoxLayouts_NonGarbageTypeUnaffected(t *testing.T) {
+ 	// "text" type is NOT a garbage type — should always be assigned.
+ 	boxes := []TextBox{{X0: 0, X1: 100, Top: 200, Bottom: 220}}
+ 	regions := []DLARegion{
+ 		{X0: 0, Y0: 600, X1: 300, Y1: 660, Label: "text"},
+ 	}
+ 	boxes = annotateBoxLayouts(boxes, regions, 3.0, 300)
+ 	// Fail cleanly when the box was dropped: indexing an empty slice would
+ 	// panic and abort every other test in the binary.
+ 	if len(boxes) == 0 {
+ 		t.Fatal("non-garbage type: should be assigned, but the box was dropped")
+ 	}
+ 	if boxes[0].LayoutType != "text" {
+ 		t.Errorf("non-garbage type: should be assigned, got %q", boxes[0].LayoutType)
+ 	}
+ }
+
+ // TestAnnotateBoxLayouts_ZeroPageHeightDisablesGarbage checks that passing a
+ // page image height of 0 keeps a "header"-matched box and labels it.
+ func TestAnnotateBoxLayouts_ZeroPageHeightDisablesGarbage(t *testing.T) {
+ 	// pageImgHeight=0 → garbage check disabled → all types assigned.
+ 	boxes := []TextBox{{X0: 0, X1: 100, Top: 100, Bottom: 120}}
+ 	regions := []DLARegion{
+ 		{X0: 0, Y0: 300, X1: 300, Y1: 360, Label: "header", Confidence: 0.9},
+ 	}
+ 	boxes = annotateBoxLayouts(boxes, regions, 3.0, 0)
+ 	// Fail cleanly when the box was dropped: indexing an empty slice would
+ 	// panic and abort every other test in the binary.
+ 	if len(boxes) == 0 {
+ 		t.Fatal("zero page height: garbage check disabled, but the box was dropped")
+ 	}
+ 	if boxes[0].LayoutType != "header" {
+ 		t.Errorf("zero page height: garbage check disabled, got %q", boxes[0].LayoutType)
+ 	}
+ }
+
+ // TestAnnotateBoxLayouts_SyntheticFigure checks that figure regions which no
+ // text box overlaps come back as extra, text-less "figure" boxes
+ // (Python: dla_cli.py:187-195).
+ func TestAnnotateBoxLayouts_SyntheticFigure(t *testing.T) {
+ 	input := []TextBox{
+ 		{X0: 0, X1: 100, Top: 0, Bottom: 20, Text: "text box"},
+ 	}
+ 	// One text region that matches the box, plus two figure regions nothing overlaps.
+ 	regions := []DLARegion{
+ 		{X0: 0, Y0: 0, X1: 150, Y1: 60, Label: "text", Confidence: 0.9},
+ 		{X0: 300, Y0: 300, X1: 600, Y1: 600, Label: "figure", Confidence: 0.9},
+ 		{X0: 600, Y0: 0, X1: 900, Y1: 300, Label: "figure", Confidence: 0.9},
+ 	}
+
+ 	out := annotateBoxLayouts(input, regions, 3.0, 0)
+ 	// The original text box plus one synthetic box per figure region.
+ 	if got := len(out); got != 3 {
+ 		t.Fatalf("expected 3 boxes (1 original + 2 synthetic figures), got %d", got)
+ 	}
+
+ 	seen := map[string]bool{}
+ 	for _, box := range out {
+ 		if box.LayoutType != "figure" || box.Text != "" {
+ 			continue
+ 		}
+ 		switch box.LayoutNo {
+ 		case "figure-0":
+ 			seen["figure-0"] = true
+ 			// Region x=300-600 px divided by the 3.0 scale.
+ 			if box.X0 != 100 || box.X1 != 200 {
+ 				t.Errorf("synthetic figure-0: expected x0=100,x1=200 (300/3,600/3), got x0=%v,x1=%v", box.X0, box.X1)
+ 			}
+ 		case "figure-1":
+ 			seen["figure-1"] = true
+ 		}
+ 	}
+ 	if !seen["figure-0"] {
+ 		t.Error("missing synthetic figure-0 box")
+ 	}
+ 	if !seen["figure-1"] {
+ 		t.Error("missing synthetic figure-1 box")
+ 	}
+ }
+
+ // TestAnnotateBoxLayouts_EquationMappedToFigure checks that a box matched by
+ // an "equation" region is reported with LayoutType "figure" while its
+ // LayoutNo keeps the "equation" prefix (Python behavior).
+ func TestAnnotateBoxLayouts_EquationMappedToFigure(t *testing.T) {
+ 	equation := DLARegion{X0: 0, Y0: 0, X1: 300, Y1: 60, Label: "equation", Confidence: 0.9}
+ 	input := []TextBox{{X0: 0, X1: 100, Top: 0, Bottom: 20}}
+
+ 	out := annotateBoxLayouts(input, []DLARegion{equation}, 3.0, 0)
+ 	if n := len(out); n != 1 {
+ 		t.Fatalf("expected 1 box, got %d", n)
+ 	}
+
+ 	only := out[0]
+ 	if only.LayoutType != "figure" {
+ 		t.Errorf("equation → LayoutType: got %q, want 'figure'", only.LayoutType)
+ 	}
+ 	if only.LayoutNo != "equation-0" {
+ 		t.Errorf("equation → LayoutNo: got %q, want 'equation-0'", only.LayoutNo)
+ 	}
+ }
+
+ // TestAnnotateBoxLayouts_MixedTypesLayoutNo checks that LayoutNo numbering is
+ // kept per region type when text and figure regions appear together.
+ func TestAnnotateBoxLayouts_MixedTypesLayoutNo(t *testing.T) {
+ 	regions := []DLARegion{
+ 		{X0: 0, Y0: 0, X1: 150, Y1: 60, Label: "text", Confidence: 0.9},     // text-0
+ 		{X0: 0, Y0: 600, X1: 150, Y1: 660, Label: "text", Confidence: 0.9},  // text-1
+ 		{X0: 600, Y0: 0, X1: 900, Y1: 60, Label: "figure", Confidence: 0.9}, // figure-0 (PDF: x0=200, x1=300)
+ 	}
+ 	input := []TextBox{
+ 		{X0: 0, X1: 100, Top: 0, Bottom: 20},     // overlaps text region 0
+ 		{X0: 0, X1: 100, Top: 200, Bottom: 220},  // overlaps text region 1
+ 		{X0: 200, X1: 300, Top: 0, Bottom: 20},   // overlaps figure region 0 only
+ 	}
+
+ 	out := annotateBoxLayouts(input, regions, 3.0, 0)
+ 	if n := len(out); n != 3 {
+ 		t.Fatalf("expected 3 boxes, got %d", n)
+ 	}
+
+ 	// The text counter and the figure counter must advance independently.
+ 	if got := out[0].LayoutNo; got != "text-0" {
+ 		t.Errorf("box 0: got %q, want 'text-0'", got)
+ 	}
+ 	if got := out[1].LayoutNo; got != "text-1" {
+ 		t.Errorf("box 1: got %q, want 'text-1'", got)
+ 	}
+ 	if got := out[2].LayoutNo; got != "figure-0" {
+ 		t.Errorf("box 2: got %q, want 'figure-0' (independent from text counter)", got)
+ 	}
+ }
+
+// ---- Mock-integration: DLA→TSR pipeline with MockDeepDoc ----
+
+ // TestExtractTableBoxes_PriorityPreservesTable feeds one box that overlaps
+ // both a full-page "text" region and a smaller "table" region and expects at
+ // least one table item back, i.e. the "table" label must win.
+ func TestExtractTableBoxes_PriorityPreservesTable(t *testing.T) {
+ 	pageImg := image.NewRGBA(image.Rect(0, 0, 900, 900))
+ 	input := []TextBox{
+ 		{X0: 200, X1: 400, Top: 200, Bottom: 400, Text: "cell content"},
+ 	}
+
+ 	analyzer := &MockDocAnalyzer{
+ 		Healthy: true,
+ 		DLARegions: []DLARegion{
+ 			{X0: 0, Y0: 0, X1: 2700, Y1: 2700, Label: "text"},       // full-page, 3x scale
+ 			{X0: 300, Y0: 300, X1: 1500, Y1: 1500, Label: "table"},  // partial, 3x scale
+ 		},
+ 		TSRCells: []TSRCell{{X0: 200, Y0: 200, X1: 400, Y1: 400, Text: "cell1"}},
+ 	}
+ 	parser := NewParser(DefaultParserConfig(), analyzer)
+
+ 	tables := parser.extractTableBoxesFromImage(context.Background(), input, pageImg, 0, 0)
+ 	if len(tables) > 0 {
+ 		return
+ 	}
+ 	t.Error("priority: table should win over text, got 0 tables")
+ }
+
+ // TestExtractTableBoxes_OverlapBelowThresholdNoTable checks that a "table"
+ // region covering less than 40% of the box's area matches no box, so no
+ // table item is produced.
+ func TestExtractTableBoxes_OverlapBelowThresholdNoTable(t *testing.T) {
+ 	pageImg := image.NewRGBA(image.Rect(0, 0, 900, 900))
+ 	input := []TextBox{
+ 		{X0: 200, X1: 400, Top: 200, Bottom: 400, Text: "content"},
+ 	}
+
+ 	// The table region is 120x120 px and only clips one corner of the box.
+ 	// NOTE(review): at the 3x render scale the sibling tests describe that is
+ 	// 40x40 in PDF space (about 4% of the 200x200 box) — confirm the scale.
+ 	corner := DLARegion{X0: 600, Y0: 600, X1: 720, Y1: 720, Label: "table"}
+ 	analyzer := &MockDocAnalyzer{
+ 		Healthy:    true,
+ 		DLARegions: []DLARegion{corner},
+ 		TSRCells:   []TSRCell{},
+ 	}
+ 	parser := NewParser(DefaultParserConfig(), analyzer)
+
+ 	tables := parser.extractTableBoxesFromImage(context.Background(), input, pageImg, 0, 0)
+ 	if n := len(tables); n != 0 {
+ 		t.Errorf("threshold: overlap < 40%% should produce 0 tables, got %d", n)
+ 	}
+ }
+
+ // TestExtractTableBoxes_FooterGarbageNotTriggerTable checks that a page whose
+ // only DLA region is a bottom-of-page "footer" yields no table items.
+ func TestExtractTableBoxes_FooterGarbageNotTriggerTable(t *testing.T) {
+ 	pageImg := image.NewRGBA(image.Rect(0, 0, 900, 900)) // 900/3=300 PDF height
+ 	input := []TextBox{
+ 		{X0: 100, X1: 300, Top: 280, Bottom: 295, Text: "page 1"},
+ 	}
+
+ 	footer := DLARegion{X0: 300, Y0: 840, X1: 900, Y1: 885, Label: "footer", Confidence: 0.9} // y=280-295 in PDF
+ 	analyzer := &MockDocAnalyzer{
+ 		Healthy:    true,
+ 		DLARegions: []DLARegion{footer},
+ 	}
+ 	parser := NewParser(DefaultParserConfig(), analyzer)
+
+ 	// There is no table region at all, so nothing may come back as a table.
+ 	tables := parser.extractTableBoxesFromImage(context.Background(), input, pageImg, 0, 0)
+ 	if n := len(tables); n != 0 {
+ 		t.Errorf("footer garbage: should not produce tables, got %d", n)
+ 	}
+ }
+
+// ---- helpers ----
+
+ // TestCellTexts checks that tsrCellTexts returns the cells' Text values in
+ // their original order.
+ func TestCellTexts(t *testing.T) {
+ 	input := []TSRCell{{Text: "A"}, {Text: "B"}, {Text: "C"}}
+ 	if joined := strings.Join(tsrCellTexts(input), ","); joined != "A,B,C" {
+ 		t.Errorf("cellTexts: got %q, want 'A,B,C'", joined)
+ 	}
+ }
+
+// ── constructTable unit tests ─────────────────────────────────────────
+
+ // TestConstructTable_Simple3x2 builds a table from six pre-filled cells laid
+ // out as 3 columns × 2 rows and checks the resulting HTML structure.
+ //
+ // NOTE(review): the HTML tag literals in this test ("<table>", "<tr>", "<td")
+ // had been stripped from the text, leaving an unbalanced `if` and a fused
+ // tdCount statement. They are reconstructed here from the surviving
+ // fragments; confirm against the original commit.
+ func TestConstructTable_Simple3x2(t *testing.T) {
+ 	// 3 columns × 2 rows — cells pre-filled (simulating extractTableBoxesFromImage).
+ 	cells := []TSRCell{
+ 		{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "A", Label: "table row"},
+ 		{X0: 101, Y0: 0, X1: 200, Y1: 50, Text: "B", Label: "table row"},
+ 		{X0: 201, Y0: 0, X1: 300, Y1: 50, Text: "C", Label: "table row"},
+ 		{X0: 0, Y0: 51, X1: 100, Y1: 100, Text: "D", Label: "table row"},
+ 		{X0: 101, Y0: 51, X1: 200, Y1: 100, Text: "E", Label: "table row"},
+ 		{X0: 201, Y0: 51, X1: 300, Y1: 100, Text: "F", Label: "table row"},
+ 	}
+ 	boxes := []TextBox{}
+ 	html := constructTable(cells, boxes, "", nil)
+ 	if !strings.Contains(html, "<table>") {
+ 		t.Error("expected <table> tag")
+ 	}
+ 	if !strings.Contains(html, "A") || !strings.Contains(html, "B") || !strings.Contains(html, "C") {
+ 		t.Error("expected cell texts A, B, C in HTML")
+ 	}
+ 	// Should have 2 <tr> elements
+ 	trCount := strings.Count(html, "<tr>")
+ 	if trCount != 2 {
+ 		t.Errorf("expected 2 rows, got %d", trCount)
+ 	}
+ 	// 3 columns × 2 rows → 6 data cells.
+ 	tdCount := strings.Count(html, "<td")
+ 	if tdCount != 6 {
+ 		t.Errorf("expected 6 <td> cells, got %d", tdCount)
+ 	}
+ 	t.Logf("HTML:\n%s", html)
+ }
+
+ // TestConstructTable_EmptyCells checks that constructTable returns the empty
+ // string both for nil inputs and for empty, non-nil slices.
+ func TestConstructTable_EmptyCells(t *testing.T) {
+ 	if out := constructTable(nil, nil, "", nil); out != "" {
+ 		t.Errorf("expected empty string for empty cells, got %q", out)
+ 	}
+ 	if out := constructTable([]TSRCell{}, []TextBox{}, "", nil); out != "" {
+ 		t.Errorf("expected empty string for empty cells slice, got %q", out)
+ 	}
+ }
+
+ // TestConstructTable_NoMatchingBox builds a table from two cells, only the
+ // first of which carries text, with no text boxes supplied.
+ //
+ // NOTE(review): the tail of this test is corrupted. HTML tag literals appear
+ // to have been stripped from the text, which fused the cell-count assertion
+ // with what looks like a separate caption test ("表1:测试标题"). The code is
+ // left untouched; restore it from the original commit.
+ func TestConstructTable_NoMatchingBox(t *testing.T) {
+ // Cell has no overlapping text box → empty <td> (tag name presumed; it was stripped from this comment)
+ cells := []TSRCell{
+ {X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "Has text", Label: "table row"},
+ {X0: 101, Y0: 0, X1: 200, Y1: 50, Label: "table row"},
+ }
+ boxes := []TextBox{}
+ html := constructTable(cells, boxes, "", nil)
+ if !strings.Contains(html, "Has text") {
+ t.Error("expected first cell text")
+ }
+ // Should still have 2 <td> cells (tag name presumed; it was stripped from this comment)
+ if strings.Count(html, " | cells, got %d. HTML:\n%s", strings.Count(html, " | 表1:测试标题") {
+ t.Errorf("expected caption, got:\n%s", html)
+ }
+ t.Logf("HTML:\n%s", html)
+ }
+
+ // TestConstructTable_SingleRow builds a table from two cells that share one
+ // row.
+ //
+ // NOTE(review): this block is corrupted. HTML tag literals appear to have
+ // been stripped from the text, so the string arguments to strings.Count are
+ // no longer the original tags, and the body from the "Y-fallback" assertion
+ // onward seems to belong to other tests (it refers to a `tables` variable
+ // that is never declared here). The code is left untouched; restore it from
+ // the original commit.
+ func TestConstructTable_SingleRow(t *testing.T) {
+ cells := []TSRCell{
+ {X0: 0, Y0: 0, X1: 50, Y1: 40, Text: "Col1", Label: "table row"},
+ {X0: 51, Y0: 0, X1: 100, Y1: 40, Text: "Col2", Label: "table row"},
+ }
+ html := constructTable(cells, nil, "", nil)
+ if strings.Count(html, " | ") != 1 {
+ t.Errorf("expected 1 row, got %d", strings.Count(html, " "))
+ }
+ if strings.Count(html, "| ") != 2 {
+ t.Errorf("expected 2 rows from Y-fallback, got %d", strings.Count(html, " | "))
+ }
+ if strings.Count(html, "| ") {
+ t.Error("output should contain HTML table")
+ }
+
+ // Key assertion: constructTable backfills tables[0].Rows.
+ rows := tables[0].Rows
+ if len(rows) != 2 {
+ t.Fatalf("expected 2 rows, got %d", len(rows))
+ }
+ if rows[0][0] != "标职务" {
+ t.Errorf("row 0 col 0 = %q, want %q", rows[0][0], "标职务")
+ }
+ if rows[0][1] != "飞机" {
+ t.Errorf("row 0 col 1 = %q, want %q", rows[0][1], "飞机")
+ }
+ if rows[1][0] != "公司级领导" {
+ t.Errorf("row 1 col 0 = %q, want %q", rows[1][0], "公司级领导")
+ }
+ if rows[1][1] != "经济舱位" {
+ t.Errorf("row 1 col 1 = %q, want %q", rows[1][1], "经济舱位")
+ }
+ }
+
+ // TestConstructTable_FromBoxesRC builds HTML directly from boxes with R/C
+ // annotations, matching Python's construct_table. No cells needed for text.
+ //
+ // NOTE(review): the end of this block is corrupted. HTML tag literals appear
+ // to have been stripped from the text, and the "expected 3 rows" assertions
+ // (with "第一行".."第三行") look like the tail of a different test fused onto
+ // this one. The code is left untouched; restore it from the original commit.
+ func TestConstructTable_FromBoxesRC(t *testing.T) {
+ // Boxes with R (row) and C (col) annotations — like the output of
+ // annotateTableBoxes after layout cleanup.
+ boxes := []TextBox{
+ {X0: 50, X1: 150, Top: 100, Bottom: 130, Text: "姓名", R: 0, C: 0},
+ {X0: 155, X1: 255, Top: 100, Bottom: 130, Text: "年龄", R: 0, C: 1},
+ {X0: 50, X1: 150, Top: 135, Bottom: 165, Text: "张三", R: 1, C: 0},
+ {X0: 155, X1: 255, Top: 135, Bottom: 165, Text: "25", R: 1, C: 1},
+ }
+
+ // constructTable should build HTML directly from boxes by R/C grouping,
+ // ignoring cell text (matching Python's construct_table).
+ item := &TableItem{}
+ html := constructTable(nil, boxes, "", item)
+
+ if !strings.Contains(html, "姓名") || !strings.Contains(html, "张三") {
+ t.Errorf("HTML missing box text: %s", html)
+ }
+ // 2 rows, 2 cols
+ if strings.Count(html, " | ") != 2 {
+ t.Errorf("expected 2 rows, got %d. HTML: %s", strings.Count(html, " "), html)
+ }
+ if strings.Count(html, "| ") != 3 {
+ t.Errorf("expected 3 rows, got %d. HTML: %s", strings.Count(html, " | "), html)
+ }
+ if item.Rows[0][0] != "第一行" || item.Rows[1][0] != "第二行" || item.Rows[2][0] != "第三行" {
+ t.Errorf("wrong text: row0=%q row1=%q row2=%q", item.Rows[0][0], item.Rows[1][0], item.Rows[2][0])
+ }
+ }
+
+ // TestConstructTable_RCAfterMerge verifies that R/C annotations survive
+ // text merge. The merged box expands bounds but keeps the first box's R/C.
+ //
+ // NOTE(review): the row-count check originally read strings.Count(html, " "),
+ // which counts spaces; the "<tr>" literal appears to have been stripped from
+ // the text and is restored here. Confirm against the original commit.
+ func TestConstructTable_RCAfterMerge(t *testing.T) {
+ 	// Simulate two adjacent fragments merged into one box.
+ 	// The merged box keeps R/C from the first fragment.
+ 	postMerge := []TextBox{
+ 		{X0: 0, X1: 350, Top: 0, Bottom: 30, Text: "公司级领导人员(含公司董事、总监)", R: 0, C: 0},
+ 		{X0: 355, X1: 500, Top: 0, Bottom: 30, Text: "经济舱位", R: 0, C: 1},
+ 		{X0: 0, X1: 200, Top: 35, Bottom: 65, Text: "其他工作人员", R: 1, C: 0},
+ 		{X0: 355, X1: 500, Top: 35, Bottom: 65, Text: "经济舱位", R: 1, C: 1},
+ 	}
+ 	item := &TableItem{}
+ 	html := constructTable(nil, postMerge, "", item)
+ 	if !strings.Contains(html, "公司级领导") {
+ 		t.Errorf("missing merged text: %s", html)
+ 	}
+ 	if rowCount := strings.Count(html, "<tr>"); rowCount != 2 {
+ 		t.Errorf("expected 2 rows, got %d", rowCount)
+ 	}
+ 	// Guard the index below so a missing backfill fails this test instead of
+ 	// panicking and aborting the whole test binary.
+ 	if len(item.Rows) == 0 || len(item.Rows[0]) == 0 {
+ 		t.Fatalf("item.Rows was not backfilled: %v", item.Rows)
+ 	}
+ 	if item.Rows[0][0] != "公司级领导人员(含公司董事、总监)" {
+ 		t.Errorf("row 0 col 0 = %q", item.Rows[0][0])
+ 	}
+ }
+
+// TestGroupTSRCellsToRowsLabeled_DefaultTableLabel verifies that cells with
+// the real TSR default label "table" (class 0) are grouped correctly.
+// The current deepDocReRowHdr regex only matches ".* (row|header)" — it misses
+// the default "table" label, causing gatherTSR to return empty and forcing
+// a fallback to pure Y-based grouping (which loses R/C annotations).
+func TestGroupTSRCellsToRowsLabeled_DefaultTableLabel(t *testing.T) {
+ cells := []TSRCell{
+ {X0: 10, Y0: 0, X1: 100, Y1: 30, Label: "table"},
+ {X0: 101, Y0: 0, X1: 200, Y1: 30, Label: "table"},
+ {X0: 10, Y0: 35, X1: 100, Y1: 65, Label: "table"},
+ {X0: 101, Y0: 35, X1: 200, Y1: 65, Label: "table"},
+ }
+ rows := groupTSRCellsToRowsLabeled(cells)
+ if len(rows) != 2 {
+ t.Fatalf("label %q: expected 2 rows, got %d (BUG: deepDocReRowHdr does not match %q)", "table", len(rows), "table")
+ }
+ if len(rows[0]) != 2 || len(rows[1]) != 2 {
+ t.Errorf("expected 2 cols/row, got %d/%d", len(rows[0]), len(rows[1]))
+ }
+}
+
+ // TestGroupBoxesByRC_RDiffSplitsRows checks that groupBoxesByRC emits one row
+ // per distinct R value, even for boxes that share the same Y range
+ // (Python: R differs → new row).
+ func TestGroupBoxesByRC_RDiffSplitsRows(t *testing.T) {
+ 	// Six boxes on two visual lines, each carrying its own R value.
+ 	input := []TextBox{
+ 		{X0: 10, X1: 90, Top: 0, Bottom: 30, Text: "A", R: 0, C: 0},
+ 		{X0: 110, X1: 190, Top: 0, Bottom: 30, Text: "B", R: 1, C: 1},
+ 		{X0: 210, X1: 290, Top: 0, Bottom: 30, Text: "C", R: 2, C: 2},
+ 		{X0: 10, X1: 90, Top: 35, Bottom: 65, Text: "D", R: 3, C: 0},
+ 		{X0: 110, X1: 190, Top: 35, Bottom: 65, Text: "E", R: 4, C: 1},
+ 		{X0: 210, X1: 290, Top: 35, Bottom: 65, Text: "F", R: 5, C: 2},
+ 	}
+
+ 	// R=0..5 → six rows.
+ 	if n := len(groupBoxesByRC(input)); n != 6 {
+ 		t.Fatalf("expected 6 rows (R differs → split), got %d", n)
+ 	}
+ }
+
+ // TestGroupBoxesByRC_MergesCloseCols verifies that C compression works
+ // within each R group — merging different C values that are close in X.
+ func TestGroupBoxesByRC_MergesCloseCols(t *testing.T) {
+ 	// R=0 has C=0,1. R=1 has C=0,1. C compression → 2 cols each.
+ 	boxes := []TextBox{
+ 		{X0: 10, X1: 90, Top: 0, Bottom: 30, Text: "A", R: 0, C: 0},
+ 		{X0: 110, X1: 190, Top: 0, Bottom: 30, Text: "B", R: 0, C: 1},
+ 		{X0: 10, X1: 90, Top: 35, Bottom: 65, Text: "C", R: 1, C: 0},
+ 		{X0: 110, X1: 190, Top: 35, Bottom: 65, Text: "D", R: 1, C: 1},
+ 	}
+ 	rows := groupBoxesByRC(boxes)
+ 	if len(rows) != 2 {
+ 		t.Fatalf("expected 2 rows (R diff), got %d", len(rows))
+ 	}
+ 	// Fatal rather than Error: the checks below index column 1 of each row
+ 	// and would panic on a short row, aborting the whole test binary.
+ 	if len(rows[0]) != 2 || len(rows[1]) != 2 {
+ 		t.Fatalf("expected 2 cols/row, got %d/%d", len(rows[0]), len(rows[1]))
+ 	}
+ 	if rows[0][0].Text != "A" || rows[0][1].Text != "B" {
+ 		t.Errorf("row0 wrong: %q %q", rows[0][0].Text, rows[0][1].Text)
+ 	}
+ 	if rows[1][0].Text != "C" || rows[1][1].Text != "D" {
+ 		t.Errorf("row1 wrong: %q %q", rows[1][0].Text, rows[1][1].Text)
+ 	}
+ }
+
+ // TestGroupBoxesByRC_RDiffSplitsRow checks that boxes with different R values
+ // end up in separate rows even when their Y ranges coincide, and that the
+ // first two rows start with "A" and "B" respectively.
+ // Matches Python: R differs → new row unconditionally.
+ func TestGroupBoxesByRC_RDiffSplitsRow(t *testing.T) {
+ 	// A and B share Y=0-30 but carry R=0 and R=1; likewise C and D on Y=35-65.
+ 	input := []TextBox{
+ 		{X0: 10, X1: 90, Top: 0, Bottom: 30, Text: "A", R: 0, C: 0},
+ 		{X0: 110, X1: 190, Top: 0, Bottom: 30, Text: "B", R: 1, C: 1},
+ 		{X0: 10, X1: 90, Top: 35, Bottom: 65, Text: "C", R: 2, C: 0},
+ 		{X0: 110, X1: 190, Top: 35, Bottom: 65, Text: "D", R: 3, C: 1},
+ 	}
+
+ 	grid := groupBoxesByRC(input)
+ 	// Four distinct R values → four rows.
+ 	if n := len(grid); n != 4 {
+ 		t.Fatalf("expected 4 rows (R differs → split), got %d", n)
+ 	}
+ 	first, second := grid[0][0].Text, grid[1][0].Text
+ 	if first != "A" || second != "B" {
+ 		t.Errorf("row0/1 wrong: A=%q B=%q", first, second)
+ 	}
+ }
+
+ // TestFillCellTextFromBoxes_RCOnly verifies that box text goes to exactly
+ // one cell via R/C annotations, not multiple cells via spatial overlap.
+ // A box overlapping two cells should only fill the one matching its R/C.
+ //
+ // The R/C assignment is performed inline on the rows returned by
+ // groupTSRCellsToRowsLabeled; this test does not call fillCellTextFromBoxes.
+ func TestFillCellTextFromBoxes_RCOnly(t *testing.T) {
+ 	cells := []TSRCell{
+ 		{X0: 0, Y0: 0, X1: 100, Y1: 50, Label: "table"},
+ 		{X0: 90, Y0: 0, X1: 200, Y1: 50, Label: "table"},
+ 	}
+ 	// This box straddles cell 0 (X=0-100) and cell 1 (X=90-200).
+ 	// Spatial overlap: both match. R/C: should go to cell R=0, C=0 only.
+ 	boxes := []TextBox{
+ 		{X0: 80, X1: 120, Top: 0, Bottom: 50, Text: "TEXT", LayoutType: "table", R: 0, C: 0},
+ 	}
+ 	rows := groupTSRCellsToRowsLabeled(cells)
+ 	for _, b := range boxes {
+ 		// Named txt, not t, so the *testing.T parameter is not shadowed.
+ 		txt := strings.TrimSpace(b.Text)
+ 		if txt == "" {
+ 			continue
+ 		}
+ 		if b.R >= 0 && b.R < len(rows) && b.C >= 0 && b.C < len(rows[b.R]) {
+ 			rows[b.R][b.C].Text = txt
+ 		}
+ 	}
+ 	// Guard the indexing below so an unexpected grouping fails this test
+ 	// instead of panicking and aborting the whole test binary.
+ 	if len(rows) == 0 || len(rows[0]) < 2 {
+ 		t.Fatalf("expected one row with 2 cells, got %d rows: %v", len(rows), rows)
+ 	}
+ 	// Cell 0 should have text, cell 1 should NOT.
+ 	if rows[0][0].Text != "TEXT" {
+ 		t.Errorf("cell[0][0] = %q, want %q", rows[0][0].Text, "TEXT")
+ 	}
+ 	if rows[0][1].Text != "" {
+ 		t.Errorf("cell[0][1] = %q, should be empty (spatial overlap leak)", rows[0][1].Text)
+ 	}
+ }
+
+ // TestRowsToHTML_HeaderRows verifies that header rows use <th> instead of <td>
+ // (tag names presumed; they were stripped from this comment).
+ //
+ // NOTE(review): this block is corrupted. HTML tag literals appear to have
+ // been stripped from the text, fusing the header-row assertions with the
+ // body of a second test that compares spatial fill against R/C fill (it
+ // redeclares `cells`). The code is left untouched; restore it from the
+ // original commit.
+ func TestRowsToHTML_HeaderRows(t *testing.T) {
+ cells := []TSRCell{
+ {X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "Name", Label: "table column header"},
+ {X0: 101, Y0: 0, X1: 200, Y1: 30, Text: "Age", Label: "table column header"},
+ {X0: 0, Y0: 35, X1: 100, Y1: 65, Text: "John", Label: "table row"},
+ {X0: 101, Y0: 35, X1: 200, Y1: 65, Text: "30", Label: "table row"},
+ }
+ // constructTable should produce <th> for header row (tag name presumed).
+ item := &TableItem{}
+ html := constructTable(cells, nil, "", item)
+ // Header row should use <th>, data row <td> (tag names presumed).
+ if !strings.Contains(html, " | ") {
+ t.Errorf("expected | for header row. HTML: %s", html)
+ }
+ if strings.Count(html, " | cells, got %d. HTML: %s", strings.Count(html, " | cells (data row), got %d", strings.Count(html, " | 30% each — spatial fills ALL).
+ // With R/C, it belongs only to cell[1] (R=0, C=1).
+ cells := []TSRCell{
+ {X0: 0, Y0: 0, X1: 100, Y1: 30, Label: "table"},
+ {X0: 90, Y0: 0, X1: 200, Y1: 30, Label: "table"},
+ {X0: 180, Y0: 0, X1: 300, Y1: 30, Label: "table"},
+ }
+ boxes := []TextBox{
+ {X0: 30, X1: 270, Top: 0, Bottom: 30, Text: "TEXT", LayoutType: "table", R: 0, C: 1},
+ }
+
+ // Spatial fill: fills ALL overlapping cells —> duplication.
+ cellsCopy := make([]TSRCell, 3)
+ copy(cellsCopy, cells)
+ fillCellTextFromBoxes(cellsCopy, boxes)
+ spatialCount := 0
+ for _, c := range cellsCopy {
+ if c.Text != "" {
+ spatialCount++
+ }
+ }
+ if spatialCount <= 1 {
+ t.Errorf("spatial fill: expected >1 cells with text, got %d", spatialCount)
+ }
+ t.Logf("spatial fill: %d cells (WRONG — duplication)", spatialCount)
+
+ // R/C fill: only cell matching box.R/C gets text.
+ cellsRC := make([]TSRCell, 3)
+ copy(cellsRC, cells)
+ rows := groupTSRCellsToRowsLabeled(cellsRC)
+ for _, b := range boxes {
+ if b.R >= 0 && b.R < len(rows) && b.C >= 0 && b.C < len(rows[b.R]) {
+ rows[b.R][b.C].Text = strings.TrimSpace(b.Text)
+ }
+ }
+ rcCount := 0
+ for _, row := range rows {
+ for _, c := range row {
+ if c.Text == "TEXT" {
+ rcCount++
+ }
+ }
+ }
+ if rcCount != 1 {
+ t.Errorf("R/C fill: expected 1 cell with 'TEXT', got %d", rcCount)
+ }
+ }
+
+func TestIsCaptionBox(t *testing.T) {
+ tests := []struct {
+ text string
+ want bool
+ }{
+ {"表1:交通工具等级", true},
+ {"Table 1: Transport Levels", true},
+ {"图表 1: 测试", true},
+ {"公司领导班子成员、出差地", false}, // plain text, not caption
+ {"第十条到厂矿单位出差", false}, // normal paragraph
+ {"", false},
+ }
+ for _, tt := range tests {
+ if got := isCaptionBox(tt.text, ""); got != tt.want {
+ t.Errorf("isCaptionBox(%q) = %v, want %v", tt.text, got, tt.want)
+ }
+ }
+}
+
+ // TestFillCellTextFromBoxes_SkipsCaption checks that fillCellTextFromBoxes
+ // leaves a cell empty when the only box over it is caption-like text, while
+ // still filling the cell under an ordinary data box.
+ func TestFillCellTextFromBoxes_SkipsCaption(t *testing.T) {
+ 	grid := []TSRCell{
+ 		{X0: 0, Y0: 0, X1: 200, Y1: 30, Label: "table"},
+ 		{X0: 0, Y0: 35, X1: 200, Y1: 65, Label: "table"},
+ 	}
+ 	caption := TextBox{X0: 0, X1: 200, Top: 0, Bottom: 30, Text: "表1:交通工具等级"} // should be skipped
+ 	data := TextBox{X0: 0, X1: 200, Top: 35, Bottom: 65, Text: "数据行"}
+
+ 	fillCellTextFromBoxes(grid, []TextBox{caption, data})
+
+ 	if got := grid[0].Text; got != "" {
+ 		t.Errorf("caption leaked into cell 0: %q", got)
+ 	}
+ 	if got := grid[1].Text; got != "数据行" {
+ 		t.Errorf("data not in cell 1: %q", got)
+ 	}
+ }
+
+ // TestFillCellText_RCPreventsCrossCellLeak checks that text from a box
+ // annotated R=0, C=0 never shows up in the second cell, both when cells are
+ // filled spatially via fillCellTextFromBoxes and when they are filled by R/C
+ // index on the rows returned by groupTSRCellsToRowsLabeled.
+ func TestFillCellText_RCPreventsCrossCellLeak(t *testing.T) {
+ 	// The box spans Y=12-28: inside the first cell (Y=0-30) and clear of the
+ 	// second one (Y=35-65).
+ 	template := []TSRCell{
+ 		{X0: 0, Y0: 0, X1: 300, Y1: 30, Label: "table"},
+ 		{X0: 0, Y0: 35, X1: 300, Y1: 65, Label: "table"},
+ 	}
+ 	input := []TextBox{
+ 		{X0: 10, X1: 200, Top: 12, Bottom: 28, Text: "公司领导班子成员、出差地", R: 0, C: 0},
+ 	}
+
+ 	// Spatial fill on a private copy: the second cell must stay empty.
+ 	spatial := append([]TSRCell(nil), template...)
+ 	fillCellTextFromBoxes(spatial, input)
+ 	if leaked := spatial[1].Text; leaked != "" {
+ 		t.Errorf("spatial fill: caption leaked to cell[1]: %q", leaked)
+ 	}
+
+ 	// R/C fill on another private copy: write only into the addressed cell,
+ 	// and only when that cell is still empty.
+ 	byRC := append([]TSRCell(nil), template...)
+ 	grid := groupTSRCellsToRowsLabeled(byRC)
+ 	for _, box := range input {
+ 		if box.R < 0 || box.R >= len(grid) || box.C < 0 || box.C >= len(grid[box.R]) {
+ 			continue
+ 		}
+ 		if grid[box.R][box.C].Text != "" {
+ 			continue
+ 		}
+ 		grid[box.R][box.C].Text = strings.TrimSpace(box.Text)
+ 	}
+ 	if leaked := byRC[1].Text; leaked != "" {
+ 		t.Errorf("R/C fill: caption leaked to cell[1]: %q", leaked)
+ 	}
+ }
+
+ // TestGroupBoxesByRC_FallbackToYXWhenNoAnnotations checks that groupBoxesByRC
+ // still returns two rows when every box carries R=-1 and C=-1 (Python's case:
+ // the regex didn't match the "table" label), i.e. that it falls back to
+ // coordinate-based grouping.
+ func TestGroupBoxesByRC_FallbackToYXWhenNoAnnotations(t *testing.T) {
+ 	input := []TextBox{
+ 		{X0: 10, X1: 90, Top: 0, Bottom: 30, Text: "A", R: -1, C: -1},
+ 		{X0: 110, X1: 190, Top: 0, Bottom: 30, Text: "B", R: -1, C: -1},
+ 		{X0: 10, X1: 90, Top: 35, Bottom: 65, Text: "C", R: -1, C: -1},
+ 		{X0: 110, X1: 190, Top: 35, Bottom: 65, Text: "D", R: -1, C: -1},
+ 	}
+
+ 	// With R=-1 everywhere an R-indexed grid would have zero rows, so an
+ 	// empty result means the YX fallback is missing.
+ 	switch n := len(groupBoxesByRC(input)); {
+ 	case n == 0:
+ 		t.Fatal("groupBoxesByRC returned 0 rows when R=-1 — no YX fallback")
+ 	case n != 2:
+ 		t.Errorf("expected 2 rows (Y-split), got %d", n)
+ 	}
+ }
+
+ // TestRowsToHTML_Colspan groups four annotated boxes, computes spans with
+ // calSpans, renders them with rowsToHTML and expects a colspan attribute in
+ // the output.
+ func TestRowsToHTML_Colspan(t *testing.T) {
+ 	// The first box carries H/HLeft/HRight annotations reaching across both
+ 	// columns; its right-hand neighbour is empty and marked SP.
+ 	input := []TextBox{
+ 		{X0: 10, X1: 90, Top: 0, Bottom: 30, Text: "Name", R: 0, C: 0, H: 1, HLeft: 10, HRight: 190},
+ 		{X0: 110, X1: 190, Top: 0, Bottom: 30, Text: "", R: 0, C: 1, SP: 1},
+ 		{X0: 10, X1: 90, Top: 35, Bottom: 65, Text: "John", R: 1, C: 0},
+ 		{X0: 110, X1: 190, Top: 35, Bottom: 65, Text: "30", R: 1, C: 1},
+ 	}
+
+ 	grid := groupBoxesByRC(input)
+ 	spans, covered := calSpans(grid)
+ 	rendered := rowsToHTML(grid, "", nil, spans, covered)
+
+ 	if hasSpan := strings.Contains(rendered, "colspan"); !hasSpan {
+ 		t.Errorf("expected colspan attribute, got: %s", rendered)
+ 	}
+ 	t.Logf("HTML: %s", rendered)
+ }
+
+ // TestStripCaptionFromCells_ClearsCaptionPattern checks that
+ // stripCaptionFromCells blanks a cell whose text looks like a caption while
+ // leaving an ordinary data cell alone.
+ func TestStripCaptionFromCells_ClearsCaptionPattern(t *testing.T) {
+ 	grid := []TSRCell{
+ 		{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "表1:差旅费标准"},
+ 		{X0: 100, Y0: 0, X1: 200, Y1: 50, Text: ""},
+ 		{X0: 0, Y0: 60, X1: 100, Y1: 110, Text: "张三"},
+ 		{X0: 100, Y0: 60, X1: 200, Y1: 110, Text: "100"},
+ 	}
+
+ 	stripCaptionFromCells(grid)
+
+ 	if got := grid[0].Text; got != "" {
+ 		t.Errorf("caption cell should be cleared, got %q", got)
+ 	}
+ 	if got := grid[2].Text; got != "张三" {
+ 		t.Errorf("data cell should be preserved, got %q", got)
+ 	}
+ }
+
+ // TestStripCaptionFromCells_PreservesData checks that stripCaptionFromCells
+ // leaves every cell untouched when none of them holds caption-like text.
+ func TestStripCaptionFromCells_PreservesData(t *testing.T) {
+ 	grid := []TSRCell{
+ 		{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "姓名"},
+ 		{X0: 100, Y0: 0, X1: 200, Y1: 50, Text: "年龄"},
+ 		{X0: 0, Y0: 60, X1: 100, Y1: 110, Text: "张三"},
+ 		{X0: 100, Y0: 60, X1: 200, Y1: 110, Text: "25"},
+ 	}
+
+ 	// Snapshot the texts so they can be compared after the call.
+ 	before := make([]string, len(grid))
+ 	for i := range grid {
+ 		before[i] = grid[i].Text
+ 	}
+
+ 	stripCaptionFromCells(grid)
+
+ 	for i, cell := range grid {
+ 		if cell.Text == before[i] {
+ 			continue
+ 		}
+ 		t.Errorf("cell[%d] changed: %q -> %q", i, before[i], cell.Text)
+ 	}
+ }
+
+ // TestStripCaptionFromCells_Empty calls stripCaptionFromCells with an empty
+ // slice; the only requirement is that it does not panic.
+ func TestStripCaptionFromCells_Empty(t *testing.T) {
+ 	stripCaptionFromCells([]TSRCell{})
+ }
+
+ // TestConstructTable_StripsCaptionFromCells checks that the HTML produced by
+ // constructTable omits caption-like cell text but keeps ordinary cell text.
+ func TestConstructTable_StripsCaptionFromCells(t *testing.T) {
+ 	captionCell := TSRCell{X0: 0, Y0: 0, X1: 100, Y1: 50, Text: "表1:标题"}
+ 	dataCell := TSRCell{X0: 100, Y0: 0, X1: 200, Y1: 50, Text: "数据"}
+
+ 	rendered := constructTable([]TSRCell{captionCell, dataCell}, nil, "", nil)
+
+ 	// The caption prefix must be gone from the output ...
+ 	if captionPresent := strings.Contains(rendered, "表1"); captionPresent {
+ 		t.Errorf("caption text '表1:标题' should be stripped: %s", rendered)
+ 	}
+ 	// ... while the data cell's text must survive.
+ 	if dataPresent := strings.Contains(rendered, "数据"); !dataPresent {
+ 		t.Errorf("data text '数据' should be preserved: %s", rendered)
+ 	}
+ 	t.Logf("HTML: %s", rendered)
+ }
+
+ // TestCalSpans_NonSpanningCellsNotPolluted checks that a regular cell at
+ // [0,0] is not reported as spanning when the spanning cell next to it, at
+ // [0,1], reaches back to X0=0.
+ //
+ // Bug being guarded against (per the original author): calSpans derived
+ // column boundaries from ALL cells, spanning cells included. "部门开支汇总" at
+ // [0,1] with X0=0 pulled colLeft[1] to 0 instead of 101, which shifted the
+ // column centre and made "Q1" at [0,0] look like it covered two columns.
+ func TestCalSpans_NonSpanningCellsNotPolluted(t *testing.T) {
+ 	// Row 0: Q1 (regular), 部门开支汇总 (spanning), Q2 (regular). Row 1: two data cells.
+ 	header := []TSRCell{
+ 		{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "Q1", Label: "table row"},
+ 		{X0: 0, Y0: 0, X1: 200, Y1: 30, Text: "部门开支汇总", Label: "table spanning cell"},
+ 		{X0: 101, Y0: 0, X1: 200, Y1: 30, Text: "Q2", Label: "table row"},
+ 	}
+ 	data := []TSRCell{
+ 		{X0: 0, Y0: 35, X1: 100, Y1: 65, Text: "100", Label: "table row"},
+ 		{X0: 101, Y0: 35, X1: 200, Y1: 65, Text: "200", Label: "table row"},
+ 	}
+
+ 	spans, covered := calSpans([][]TSRCell{header, data})
+
+ 	q1, wide, q2 := [2]int{0, 0}, [2]int{0, 1}, [2]int{0, 2}
+
+ 	// Q1 covers X=0-100, its own column only, so it must not get a span entry.
+ 	if span, found := spans[q1]; found {
+ 		t.Errorf("Q1 at [0,0] should NOT have colspan, got %v. "+
+ 			"Spanning cell at [0,1] polluted column boundaries", span)
+ 	}
+
+ 	// The wide cell covers X=0-200 and must be reported with colspan 2.
+ 	span, found := spans[wide]
+ 	switch {
+ 	case !found:
+ 		t.Error("部门开支汇总 at [0,1] should have colspan=2 (covers X=0-200)")
+ 	case span[0] != 2:
+ 		t.Errorf("部门开支汇总 colspan = %d, want 2", span[0])
+ 	}
+
+ 	// Q2 lies inside X=0-200 and must be marked as covered.
+ 	if !covered[q2] {
+ 		t.Error("Q2 at [0,2] should be covered by spanning cell at [0,1]")
+ 	}
+
+ 	t.Logf("spans: %v, covered: %v", spans, covered)
+ }
+
+// ── coordinate space conversion helpers ─────────────────────────────────
+
+ // TestCellToPageSpace checks that cellToPageSpace adds the crop offset
+ // (15, 25) to the cell coordinates and divides by the scale 3.0, and that it
+ // carries Text and Label through unchanged.
+ func TestCellToPageSpace(t *testing.T) {
+ 	cell := TSRCell{X0: 100, Y0: 200, X1: 300, Y1: 400, Text: "hello", Label: "table"}
+ 	got := cellToPageSpace(cell, 15, 25, 3.0)
+
+ 	// near compares with a small absolute tolerance. Thirds are not exactly
+ 	// representable, so an exact == would break if the implementation
+ 	// multiplied by 1/scale instead of dividing by scale.
+ 	near := func(a, b float64) bool {
+ 		d := a - b
+ 		if d < 0 {
+ 			d = -d
+ 		}
+ 		return d < 1e-9
+ 	}
+
+ 	// (100+15)/3 = 38.33..., (200+25)/3 = 75, (300+15)/3 = 105, (400+25)/3 = 141.66...
+ 	if !near(float64(got.X0), 115.0/3) || !near(float64(got.Y0), 75) ||
+ 		!near(float64(got.X1), 105) || !near(float64(got.Y1), 425.0/3) {
+ 		t.Errorf("cellToPageSpace: got (%f,%f,%f,%f), want (38.33,75,105,141.67)", got.X0, got.Y0, got.X1, got.Y1)
+ 	}
+ 	if got.Text != "hello" || got.Label != "table" {
+ 		t.Error("cellToPageSpace should preserve Text and Label")
+ 	}
+ }
+
+ // TestCellAddOffset checks that cellAddOffset shifts all four coordinates by
+ // the given (15, 25) offset and keeps the cell's Text.
+ func TestCellAddOffset(t *testing.T) {
+ 	shifted := cellAddOffset(TSRCell{X0: 100, Y0: 200, X1: 300, Y1: 400, Text: "hello"}, 15, 25)
+
+ 	coordsOK := shifted.X0 == 115 && shifted.Y0 == 225 && shifted.X1 == 315 && shifted.Y1 == 425
+ 	if !coordsOK {
+ 		t.Errorf("cellAddOffset: got (%f,%f,%f,%f)", shifted.X0, shifted.Y0, shifted.X1, shifted.Y1)
+ 	}
+ 	if shifted.Text != "hello" {
+ 		t.Error("cellAddOffset should preserve Text")
+ 	}
+ }
+
+ // TestBoxToCropSpace checks the coordinates boxToCropSpace returns for a box
+ // at (50,100)-(200,300) with arguments 3.0, 10 and 20, and that Text is kept.
+ // The expected values equal coordinate*3 minus 10 (X) or minus 20 (Y).
+ func TestBoxToCropSpace(t *testing.T) {
+ 	cropped := boxToCropSpace(TextBox{X0: 50, X1: 200, Top: 100, Bottom: 300, Text: "text"}, 3.0, 10, 20)
+
+ 	coordsOK := cropped.X0 == 140 && cropped.Top == 280 && cropped.X1 == 590 && cropped.Bottom == 880
+ 	if !coordsOK {
+ 		t.Errorf("boxToCropSpace: got (%f,%f,%f,%f)", cropped.X0, cropped.Top, cropped.X1, cropped.Bottom)
+ 	}
+ 	if cropped.Text != "text" {
+ 		t.Error("boxToCropSpace should preserve Text")
+ 	}
+ }
+
+ // TestCopyBoxAnnotations fills every table-annotation field on a source box
+ // with a distinct value and checks that copyBoxAnnotations transfers each of
+ // them to an empty destination box.
+ func TestCopyBoxAnnotations(t *testing.T) {
+ 	from := &TextBox{
+ 		R: 1, C: 2, RTop: 10, RBott: 20,
+ 		H: 3, HTop: 30, HBott: 40,
+ 		HLeft: 50, HRight: 60, CLeft: 70, CRight: 80,
+ 		SP: 4,
+ 	}
+ 	to := &TextBox{}
+
+ 	copyBoxAnnotations(to, from)
+
+ 	rowColOK := to.R == 1 && to.C == 2 && to.RTop == 10 && to.RBott == 20
+ 	if !rowColOK {
+ 		t.Error("R/C fields not copied")
+ 	}
+ 	headerOK := to.H == 3 && to.HTop == 30 && to.HBott == 40
+ 	if !headerOK {
+ 		t.Error("H fields not copied")
+ 	}
+ 	extentOK := to.HLeft == 50 && to.HRight == 60 && to.CLeft == 70 && to.CRight == 80
+ 	if !extentOK {
+ 		t.Error("spanning fields not copied")
+ 	}
+ 	if to.SP != 4 {
+ 		t.Error("SP not copied")
+ 	}
+ }
+
+// TestAnnotateBoxLayouts_CompactionPreservesWriteBackMapping guards the
+// mapping between a per-page copy of the boxes and the global box slice when
+// annotateBoxLayouts drops some boxes (CID garbage, or garbage-layout matches
+// at non-edge positions).
+//
+// Failure mode being pinned: annotateBoxLayouts compacts survivors forward
+// inside the slice it is handed. enrichWithDeepDoc then walks len(indices)
+// positions and writes pageBoxes[i] back to boxes[indices[i]]; after
+// compaction pageBoxes[1] holds what used to be pageBoxes[2], so annotations
+// end up on the wrong global box.
+func TestAnnotateBoxLayouts_CompactionPreservesWriteBackMapping(t *testing.T) {
+	// Global (PDF-space) boxes B0, B1, B2 of one page.
+	global := []TextBox{
+		{X0: 0, X1: 100, Top: 0, Bottom: 50, Text: "will be dropped via reference match"},
+		{X0: 0, X1: 100, Top: 60, Bottom: 110, Text: "text box A"},
+		{X0: 110, X1: 200, Top: 60, Bottom: 110, Text: "text box B"},
+	}
+
+	// Value copies of the page's boxes, as enrichWithDeepDoc builds them
+	// from byPage[pg].
+	globalIdx := []int{0, 1, 2}
+	pageBoxes := make([]TextBox, 0, len(globalIdx))
+	for _, g := range globalIdx {
+		pageBoxes = append(pageBoxes, global[g])
+	}
+
+	// One "reference" region (a garbage type: matched boxes are dropped
+	// unless at the page edge) plus one "text" region per surviving box.
+	// With scale 1.0 the DLA pixel coordinates equal PDF points.
+	dla := []DLARegion{
+		{Label: "reference", Confidence: 0.9, X0: 0, Y0: 0, X1: 100, Y1: 50},
+		{Label: "text", Confidence: 0.9, X0: 0, Y0: 60, X1: 100, Y1: 110},
+		{Label: "text", Confidence: 0.9, X0: 110, Y0: 60, X1: 200, Y1: 110},
+	}
+	imgHeight := 200.0
+
+	// Function under test; only its effect on pageBoxes matters here.
+	_ = annotateBoxLayouts(pageBoxes, dla, 1.0, imgHeight)
+
+	// Replay the enrichWithDeepDoc write-back (table.go:52-58).
+	for pos, g := range globalIdx {
+		annotated := &pageBoxes[pos]
+		if annotated.LayoutType != "" {
+			global[g].LayoutType = annotated.LayoutType
+			global[g].LayoutNo = annotated.LayoutNo
+		}
+		copyBoxAnnotations(&global[g], annotated)
+	}
+
+	// B0 sat in the "reference" region away from the page edge, so it must
+	// not have picked up any layout.
+	if lt := global[0].LayoutType; lt != "" {
+		t.Errorf("B0 was dropped (reference region) but got LayoutType=%q from a shifted survivor",
+			lt)
+	}
+
+	// B1 belongs to the first text region.
+	if lt := global[1].LayoutType; lt != "text" {
+		t.Errorf("B1 LayoutType = %q, want text", lt)
+	}
+	if no := global[1].LayoutNo; no != "text-0" {
+		t.Errorf("B1 LayoutNo = %q, want text-0 (compaction shifted B2 into position 1)", no)
+	}
+
+	// B2 belongs to the second text region.
+	if lt := global[2].LayoutType; lt != "text" {
+		t.Errorf("B2 LayoutType = %q, want text", lt)
+	}
+	if no := global[2].LayoutNo; no != "text-1" {
+		t.Errorf("B2 LayoutNo = %q, want text-1 (stale element at position 2 after compaction)", no)
+	}
+}
+
+// ── matchTableRegions unit tests ─────────────────────────────────────
+
+// TestMatchTableRegions_SingleMatch expects a single match for the "table"
+// region, holding only box 0, while the "text" region is ignored.
+func TestMatchTableRegions_SingleMatch(t *testing.T) {
+	textBoxes := []TextBox{
+		{X0: 0, X1: 100, Top: 0, Bottom: 50},
+		{X0: 200, X1: 300, Top: 0, Bottom: 50},
+	}
+	dla := []DLARegion{
+		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "table"},  // covers box 0 at scale 3
+		{X0: 600, Y0: 0, X1: 900, Y1: 150, Label: "text"}, // non-table, ignored
+	}
+
+	got := matchTableRegions(textBoxes, dla, 3.0)
+	if n := len(got); n != 1 {
+		t.Fatalf("expected 1 match, got %d", n)
+	}
+	idx := got[0].boxIdx
+	if !(len(idx) == 1 && idx[0] == 0) {
+		t.Errorf("expected box 0 matched, got %v", idx)
+	}
+}
+
+// TestMatchTableRegions_NoTableLabel feeds only "text" and "figure" regions
+// and expects no matches at all.
+func TestMatchTableRegions_NoTableLabel(t *testing.T) {
+	textBoxes := []TextBox{
+		{X0: 0, X1: 100, Top: 0, Bottom: 50},
+	}
+	dla := []DLARegion{
+		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "text"},
+		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "figure"},
+	}
+
+	if n := len(matchTableRegions(textBoxes, dla, 3.0)); n != 0 {
+		t.Errorf("non-table labels: expected 0 matches, got %d", n)
+	}
+}
+
+// TestMatchTableRegions_MultipleBoxesSameTable expects both boxes to be
+// attached to the one table region that spans them.
+func TestMatchTableRegions_MultipleBoxesSameTable(t *testing.T) {
+	textBoxes := []TextBox{
+		{X0: 0, X1: 100, Top: 0, Bottom: 50},   // box 0
+		{X0: 110, X1: 210, Top: 0, Bottom: 50}, // box 1
+	}
+	dla := []DLARegion{
+		{X0: 0, Y0: 0, X1: 630, Y1: 150, Label: "table"}, // covers both boxes at scale 3
+	}
+
+	got := matchTableRegions(textBoxes, dla, 3.0)
+	if n := len(got); n != 1 {
+		t.Fatalf("expected 1 match, got %d", n)
+	}
+	if idx := got[0].boxIdx; len(idx) != 2 {
+		t.Errorf("expected 2 boxes matched, got %d: %v", len(idx), idx)
+	}
+}
+
+// TestMatchTableRegions_ImageOnlyPDF covers a page with no text boxes at all
+// (image-only PDF). Python processes every table DLA region regardless of
+// text-box overlap, so the table region must still be returned, with an empty
+// box list.
+func TestMatchTableRegions_ImageOnlyPDF(t *testing.T) {
+	var noBoxes []TextBox // deliberately nil
+	dla := []DLARegion{
+		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "table"},
+		{X0: 0, Y0: 0, X1: 300, Y1: 150, Label: "text"},
+	}
+
+	got := matchTableRegions(noBoxes, dla, 3.0)
+	if n := len(got); n != 1 {
+		t.Fatalf("image-only: expected 1 table match, got %d", n)
+	}
+	if n := len(got[0].boxIdx); n != 0 {
+		t.Errorf("image-only: expected empty boxIdx, got %d", n)
+	}
+}
+
+// TestMatchTableRegions_BelowThreshold uses a table region that covers only a
+// sliver of the box (under 40%) and expects it not to match.
+func TestMatchTableRegions_BelowThreshold(t *testing.T) {
+	textBoxes := []TextBox{
+		{X0: 0, X1: 100, Top: 0, Bottom: 100},
+	}
+	dla := []DLARegion{
+		{X0: 0, Y0: 0, X1: 90, Y1: 90, Label: "table"}, // 30x30 at scale 3 → 9% overlap
+	}
+
+	if n := len(matchTableRegions(textBoxes, dla, 3.0)); n != 0 {
+		t.Errorf("below threshold: expected 0 matches, got %d", n)
+	}
+}
+
+func TestCellSliceToPageSpace(t *testing.T) {
+ cells := []TSRCell{
+ {X0: 100, Y0: 200, X1: 300, Y1: 400},
+ {X0: 400, Y0: 200, X1: 600, Y1: 400},
+ }
+ got := cellSliceToPageSpace(cells, 15, 25, 3)
+ if len(got) != 2 {
+ t.Fatal("expected 2 cells")
+ }
+ if got[0].X0 != 38.333333333333336 || got[1].X0 != 138.33333333333334 {
+ t.Error("wrong conversion")
+ }
+}
+
+// MockTableBuilder is a TableBuilder stand-in for tests. Only GroupCells is
+// configurable, through GroupCellsFn; the other methods return fixed values.
+type MockTableBuilder struct {
+	GroupCellsFn func(cells []TSRCell) [][]TSRCell
+}
+
+// Name returns the fixed identifier "mock".
+func (m *MockTableBuilder) Name() string { return "mock" }
+
+// DetectCells ignores its arguments and reports no cells and no error.
+func (m *MockTableBuilder) DetectCells(_ context.Context, _ image.Image) ([]TSRCell, error) {
+	return nil, nil
+}
+
+// GroupCells forwards to GroupCellsFn, or returns nil when no function is set.
+func (m *MockTableBuilder) GroupCells(cells []TSRCell) [][]TSRCell {
+	if m.GroupCellsFn == nil {
+		return nil
+	}
+	return m.GroupCellsFn(cells)
+}
+
+// ── writeTableAnnotations unit tests ──────────────────────────────────
+
+// TestWriteTableAnnotations_WriteBack annotates boxes 0 and 2 (selected via
+// boxIdx) against two row cells and checks the resulting row/column indices,
+// while box 1, which is not selected, keeps its zero values.
+func TestWriteTableAnnotations_WriteBack(t *testing.T) {
+	boxes := []TextBox{
+		{X0: 10, X1: 100, Top: 10, Bottom: 30, Text: "A", LayoutType: "table"},
+		{X0: 110, X1: 200, Top: 10, Bottom: 30, Text: "B", LayoutType: "table"},
+		{X0: 10, X1: 100, Top: 35, Bottom: 55, Text: "C", LayoutType: "table"},
+	}
+	selected := []int{0, 2}
+	rowCells := []TSRCell{
+		{X0: 30, Y0: 30, X1: 300, Y1: 90, Label: "table row"},
+		{X0: 30, Y0: 110, X1: 300, Y1: 170, Label: "table row"},
+	}
+
+	// The mock puts each of the two cells into a group of its own.
+	builder := &MockTableBuilder{GroupCellsFn: func(in []TSRCell) [][]TSRCell {
+		return [][]TSRCell{{in[0]}, {in[1]}}
+	}}
+	writeTableAnnotations(boxes, selected, rowCells, 3.0, 0, 0, builder)
+
+	if r := boxes[0].R; r != 0 {
+		t.Errorf("box 0 R = %d, want 0", r)
+	}
+	if c := boxes[0].C; c != 0 {
+		t.Errorf("box 0 C = %d, want 0", c)
+	}
+	// Box 1 was not in boxIdx — should NOT be annotated
+	if !(boxes[1].R == 0 && boxes[1].C == 0) {
+		t.Errorf("box 1 should not be annotated: R=%d C=%d", boxes[1].R, boxes[1].C)
+	}
+	if r := boxes[2].R; r != 1 {
+		t.Errorf("box 2 R = %d, want 1", r)
+	}
+}
+
+// TestWriteTableAnnotations_ScaleDown annotates one box against one row cell
+// at scale 3 and checks that a non-zero RTop is written back.
+func TestWriteTableAnnotations_ScaleDown(t *testing.T) {
+	boxes := []TextBox{
+		{X0: 10, X1: 100, Top: 10, Bottom: 50, Text: "X", LayoutType: "table"},
+	}
+	rowCells := []TSRCell{
+		{X0: 30, Y0: 30, X1: 300, Y1: 150, Label: "table row"},
+	}
+
+	builder := &MockTableBuilder{GroupCellsFn: func(in []TSRCell) [][]TSRCell {
+		return [][]TSRCell{{in[0]}}
+	}}
+	writeTableAnnotations(boxes, []int{0}, rowCells, 3.0, 0, 0, builder)
+
+	// After scale-down, RTop / 3 should be in PDF space (~10); the test only
+	// requires that something non-zero was written.
+	if boxes[0].RTop == 0 {
+		t.Error("RTop should be non-zero after annotation")
+	}
+}
+
+// TestWriteTableAnnotations_EmptyCells passes a nil cell slice and a builder
+// that groups nothing: the call must not panic and must leave R and C at 0.
+func TestWriteTableAnnotations_EmptyCells(t *testing.T) {
+	boxes := []TextBox{{X0: 10, X1: 100, Top: 10, Bottom: 50, Text: "X", LayoutType: "table"}}
+	var noCells []TSRCell
+
+	builder := &MockTableBuilder{GroupCellsFn: func(_ []TSRCell) [][]TSRCell {
+		return nil
+	}}
+	writeTableAnnotations(boxes, []int{0}, noCells, 3.0, 0, 0, builder)
+
+	if r, c := boxes[0].R, boxes[0].C; !(r == 0 && c == 0) {
+		t.Errorf("empty cells: R=%d C=%d, want 0,0", r, c)
+	}
+}
+
+// ── markNoMergeTables unit tests ─────────────────────────────────────
+
+// TestMarkNoMergeTables_CaptionAfterTable expects a table whose next box is a
+// "table caption" to be flagged NoMerge.
+func TestMarkNoMergeTables_CaptionAfterTable(t *testing.T) {
+	pageBoxes := []TextBox{
+		{X0: 0, X1: 100, Top: 0, Bottom: 30, LayoutType: "table"},
+		{X0: 0, X1: 100, Top: 35, Bottom: 50, LayoutType: "table caption", Text: "表1:标题"},
+	}
+	items := []TableItem{
+		{Positions: []Position{{Left: 0, Right: 100, Top: 0, Bottom: 30}}},
+	}
+
+	markNoMergeTables(pageBoxes, items)
+
+	if flagged := items[0].NoMerge; !flagged {
+		t.Error("table followed by caption should be marked NoMerge")
+	}
+}
+
+// TestMarkNoMergeTables_TitleAfterTable expects a table whose next box is a
+// "title" to be flagged NoMerge.
+func TestMarkNoMergeTables_TitleAfterTable(t *testing.T) {
+	pageBoxes := []TextBox{
+		{X0: 0, X1: 100, Top: 0, Bottom: 30, LayoutType: "table"},
+		{X0: 0, X1: 100, Top: 35, Bottom: 50, LayoutType: "title"},
+	}
+	items := []TableItem{
+		{Positions: []Position{{Left: 0, Right: 100, Top: 0, Bottom: 30}}},
+	}
+
+	markNoMergeTables(pageBoxes, items)
+
+	if flagged := items[0].NoMerge; !flagged {
+		t.Error("table followed by title should be marked NoMerge")
+	}
+}
+
+// TestMarkNoMergeTables_NoCaptionAfter checks that neither a table followed
+// by plain text nor the last table on the page is flagged NoMerge.
+func TestMarkNoMergeTables_NoCaptionAfter(t *testing.T) {
+	pageBoxes := []TextBox{
+		{X0: 0, X1: 100, Top: 0, Bottom: 30, LayoutType: "table"},
+		{X0: 0, X1: 100, Top: 35, Bottom: 50, LayoutType: "text"},
+		{X0: 0, X1: 100, Top: 55, Bottom: 70, LayoutType: "table"},
+	}
+	items := []TableItem{
+		{Positions: []Position{{Left: 0, Right: 100, Top: 0, Bottom: 30}}},
+		{Positions: []Position{{Left: 0, Right: 100, Top: 55, Bottom: 70}}},
+	}
+
+	markNoMergeTables(pageBoxes, items)
+
+	first, last := items[0].NoMerge, items[1].NoMerge
+	if first {
+		t.Error("table followed by text should NOT be marked NoMerge")
+	}
+	if last {
+		t.Error("last table should NOT be marked NoMerge")
+	}
+}
+
+// TestMarkNoMergeTables_StaleLastTableTI pins the reset of the "last table"
+// tracker: a table box that overlaps no TableItem position must clear it,
+// otherwise a later title would flag a non-adjacent table.
+//
+//	box 0: "table", overlaps table[0] → lastTableTI = 0
+//	box 1: "table", overlaps nothing  → lastTableTI should reset to -1
+//	box 2: "title"                    → no adjacent table, so a no-op
+func TestMarkNoMergeTables_StaleLastTableTI(t *testing.T) {
+	pageBoxes := []TextBox{
+		{X0: 0, X1: 100, Top: 0, Bottom: 30, LayoutType: "table"},
+		{X0: 500, X1: 600, Top: 100, Bottom: 130, LayoutType: "table"}, // far away, no overlap
+		{X0: 0, X1: 100, Top: 140, Bottom: 160, LayoutType: "title"},
+	}
+	items := []TableItem{
+		{Positions: []Position{{Left: 0, Right: 100, Top: 0, Bottom: 30}}},  // table 0
+		{Positions: []Position{{Left: 0, Right: 100, Top: 35, Bottom: 65}}}, // table 1 — box 0 doesn't overlap this either
+	}
+
+	markNoMergeTables(pageBoxes, items)
+
+	// The title follows the non-matching table box, not table[0] itself, so
+	// table[0] has to stay unflagged.
+	if flagged := items[0].NoMerge; flagged {
+		t.Error("stale lastTableTI: table[0] incorrectly marked NoMerge — " +
+			"the non-overlapping table box (box 1) should have reset lastTableTI")
+	}
+}
+
+func TestMarkNoMergeTables_EmptyInputs(t *testing.T) {
+ // Should not panic with empty inputs.
+ markNoMergeTables(nil, nil)
+ markNoMergeTables([]TextBox{}, []TableItem{})
+}
diff --git a/internal/deepdoc/parser/pdf/text_dump_test.go b/internal/deepdoc/parser/pdf/text_dump_test.go
new file mode 100644
index 0000000000..a9610056a9
--- /dev/null
+++ b/internal/deepdoc/parser/pdf/text_dump_test.go
@@ -0,0 +1,89 @@
+//go:build cgo && manual
+
+package parser
+
+import (
+ "context"
+ "os"
+ "path/filepath"
+ "strings"
+ "testing"
+)
+
+// TestDumpTextOutput runs Parse on the PDFs under testdata/real_pdfs and
+// saves each document's section text, one section per line, to
+// testdata/output/go/noocr/text/{pdf}.txt. PDFs whose output file already
+// exists are skipped. Set the DUMP_COUNT env var to a positive integer to
+// process only the first N PDFs; a non-numeric value is reported and ignored.
+//
+// Per-PDF failures (read, engine, parse, write) are logged and do not fail
+// the test. The "chars" figures in the log are byte lengths.
+func TestDumpTextOutput(t *testing.T) {
+	pdfDir := filepath.Join("testdata", "real_pdfs")
+	outDir := filepath.Join("testdata", "output", "go", "noocr", "text")
+	if err := os.MkdirAll(outDir, 0755); err != nil {
+		t.Fatalf("create output dir %s: %v", outDir, err)
+	}
+
+	entries, err := os.ReadDir(pdfDir)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Keep only PDF files, so that DUMP_COUNT and the progress counter refer
+	// to PDFs rather than to raw directory entries.
+	var pdfs []string
+	for _, e := range entries {
+		if e.IsDir() || !strings.HasSuffix(strings.ToLower(e.Name()), ".pdf") {
+			continue
+		}
+		pdfs = append(pdfs, e.Name())
+	}
+
+	if raw := os.Getenv("DUMP_COUNT"); raw != "" {
+		limit, valid := 0, true
+		for _, ch := range raw {
+			if ch < '0' || ch > '9' {
+				valid = false
+				break
+			}
+			limit = limit*10 + int(ch-'0')
+			if limit > len(pdfs) {
+				// Already past the end; clamping also rules out overflow.
+				limit = len(pdfs)
+			}
+		}
+		switch {
+		case !valid:
+			t.Logf("ignoring invalid DUMP_COUNT=%q", raw)
+		case limit > 0 && limit < len(pdfs):
+			pdfs = pdfs[:limit]
+		}
+	}
+
+	count := len(pdfs)
+	totalChars := 0
+	for i, name := range pdfs {
+		outPath := filepath.Join(outDir, name+".txt")
+		if _, err := os.Stat(outPath); err == nil {
+			existing, err := os.ReadFile(outPath)
+			if err != nil {
+				t.Logf("[%d/%d] %s — SKIP (unreadable output: %v)", i+1, count, name, err)
+				continue
+			}
+			totalChars += len(existing)
+			t.Logf("[%d/%d] %s — SKIP (%d chars)", i+1, count, name, len(existing))
+			continue
+		}
+
+		data, err := os.ReadFile(filepath.Join(pdfDir, name))
+		if err != nil {
+			t.Logf("[%d/%d] %s — read error: %v", i+1, count, name, err)
+			continue
+		}
+
+		eng, err := NewEngine(data)
+		if err != nil {
+			t.Logf("[%d/%d] %s — engine error: %v", i+1, count, name, err)
+			continue
+		}
+
+		cfg := DefaultParserConfig()
+		p := NewParser(cfg, &MockDocAnalyzer{Healthy: true, Model: ModelSaas})
+		result, err := p.Parse(context.Background(), eng)
+		eng.Close()
+		if err != nil {
+			t.Logf("[%d/%d] %s — parse error: %v", i+1, count, name, err)
+			continue
+		}
+
+		var sb strings.Builder
+		for _, s := range result.Sections {
+			sb.WriteString(s.Text)
+			sb.WriteByte('\n')
+		}
+		text := sb.String()
+		if err := os.WriteFile(outPath, []byte(text), 0644); err != nil {
+			// Do not count text that never reached disk.
+			t.Logf("[%d/%d] %s — write error: %v", i+1, count, name, err)
+			continue
+		}
+
+		totalChars += len(text)
+		t.Logf("[%d/%d] %s — %d chars", i+1, count, name, len(text))
+	}
+
+	t.Logf("Done. %d chars total. Output: %s/", totalChars, outDir)
+}
diff --git a/internal/deepdoc/parser/pdf/tools/compare.go b/internal/deepdoc/parser/pdf/tools/compare.go
new file mode 100644
index 0000000000..652a7372f7
--- /dev/null
+++ b/internal/deepdoc/parser/pdf/tools/compare.go
@@ -0,0 +1,645 @@
+package tools
+
+import (
+ "encoding/csv"
+ "encoding/json"
+ "fmt"
+ "math"
+ "os"
+ "path/filepath"
+ "sort"
+ "strconv"
+ "strings"
+ "time"
+
+ "github.com/xuri/excelize/v2"
+ "golang.org/x/text/unicode/norm"
+)
+
+// Diff stores per-PDF comparison metrics between Go and Python output.
+//
+// The *DiffPct fields are absolute differences expressed as a percentage of
+// the Python value; CompareWithPython leaves them at 0 when the Python value
+// is not positive. The *Sim fields stay 0 when either text file could not be
+// read. Reports print the *Sim fields as 100-Sim, so they are presumably on a
+// 0-100 scale (confirm against CharSimilarity / SectionAlignedScore).
+type Diff struct {
+ File string // file name shared by the Go and Python results
+ PagesOk bool // Go and Python page counts are equal (only set when Python reports pages)
+ BoxesInitDiffPct float64 // BoxesInitial difference, % of Python
+ BoxesTMDiffPct float64 // text-merge box count difference, % of Python
+ BoxesVMDiffPct float64 // vertical-merge box count difference, % of Python
+ SectionsDiffPct float64 // section count difference, % of Python
+ TextLenDiffPct float64 // text length difference, % of Python
+ CharsDiffPct float64 // char count difference, % of Python
+ TablesDiff int // Go table count minus Python table count (signed)
+ CharSim float64 // CharSimilarity of the NFKC-normalized texts
+ LcsSim float64 // SectionAlignedScore of the NFKC-normalized texts
+ RawCharSim float64 // CharSim without NFKC normalization
+ RawLcsSim float64 // LcsSim without space stripping
+}
+
+// CompareWithPython compares Go results against the Python reference.
+//
+// For every Go result that has a Python result with the same File it builds a
+// Diff (stage-count differences plus text similarity read from
+// goTextDir/<file>.txt and pyTextDir/<file>.txt), logs one progress line per
+// PDF, then logs a table sorted by SectionsDiffPct and a per-metric summary
+// (median, mean, min, max, counts above 5% and 10%). Finally it writes an
+// xlsx report under testdata/output and, when the BATCH_CSV env var is set, a
+// CSV report to that path. Go results without a Python counterpart are
+// skipped. Nothing is returned; all output goes through log and the report
+// files.
+//
+// The summary statistics are computed on copies of the metric values, so the
+// reports keep the same row order as the logged table.
+func CompareWithPython(log TLogger, goResults []BatchResult, pyResults []PyResult, goTextDir, pyTextDir string) {
+	pyMap := make(map[string]PyResult, len(pyResults))
+	for _, pr := range pyResults {
+		pyMap[pr.File] = pr
+	}
+	goMap := make(map[string]BatchResult, len(goResults))
+	for _, r := range goResults {
+		goMap[r.File] = r
+	}
+
+	var diffs []Diff
+	matched, mismatched := 0, 0
+
+	for _, r := range goResults {
+		py, ok := pyMap[r.File]
+		if !ok {
+			continue
+		}
+		d := Diff{File: r.File, TablesDiff: r.TSTables - py.Tables}
+		if py.Pages > 0 {
+			d.PagesOk = r.Pages == py.Pages
+			if d.PagesOk {
+				matched++
+			} else {
+				mismatched++
+			}
+		}
+		// Percentage differences are relative to the Python value and are
+		// left at 0 when Python reports nothing to compare against.
+		if py.BoxesInitial > 0 {
+			d.BoxesInitDiffPct = math.Abs(float64(r.BoxesInitial-py.BoxesInitial)) / float64(py.BoxesInitial) * 100
+		}
+		if py.BoxesTextMerge > 0 {
+			d.BoxesTMDiffPct = math.Abs(float64(r.BoxesTextMerg-py.BoxesTextMerge)) / float64(py.BoxesTextMerge) * 100
+		}
+		if py.BoxesVertMerge > 0 {
+			d.BoxesVMDiffPct = math.Abs(float64(r.BoxesVertMerg-py.BoxesVertMerge)) / float64(py.BoxesVertMerge) * 100
+		}
+		if py.Sections > 0 {
+			d.SectionsDiffPct = math.Abs(float64(r.Sections-py.Sections)) / float64(py.Sections) * 100
+		}
+		if py.TextLen > 0 {
+			d.TextLenDiffPct = math.Abs(float64(r.TextLen-py.TextLen)) / float64(py.TextLen) * 100
+		}
+		if py.Chars > 0 {
+			d.CharsDiffPct = math.Abs(float64(r.Chars-py.Chars)) / float64(py.Chars) * 100
+		}
+
+		// Text similarity is only computed when both text dumps are readable;
+		// otherwise the similarity fields stay 0.
+		goTextPath := filepath.Join(goTextDir, r.File+".txt")
+		pyTextPath := filepath.Join(pyTextDir, r.File+".txt")
+		if goTxt, err := os.ReadFile(goTextPath); err == nil {
+			if pyTxt, err := os.ReadFile(pyTextPath); err == nil {
+				rawGo, rawPy := string(goTxt), string(pyTxt)
+				// NFKC normalisation folds compatibility forms, e.g.
+				// fullwidth punctuation to its halfwidth equivalent.
+				goStr := norm.NFKC.String(rawGo)
+				pyStr := norm.NFKC.String(rawPy)
+				d.CharSim = CharSimilarity(goStr, pyStr)
+				// Section-level LCS: align sections by position window,
+				// compute per-section LCS, bidirectional F1.
+				d.LcsSim = SectionAlignedScore(goStr, pyStr)
+				// Raw metrics without NFKC / space stripping.
+				d.RawCharSim = RawCharSimilarity(rawGo, rawPy)
+				d.RawLcsSim = SectionAlignedScore(rawGo, rawPy)
+			}
+		}
+		diffs = append(diffs, d)
+		log.Logf(" [%d/%d] %s CharDiff=D%.1f%% LcsDiff=D%.1f%% RawCharDiff=D%.1f%% RawLcsDiff=D%.1f%%",
+			len(diffs), len(goResults), r.File, 100-d.CharSim, 100-d.LcsSim, 100-d.RawCharSim, 100-d.RawLcsSim)
+	}
+
+	sort.Slice(diffs, func(i, j int) bool { return diffs[i].SectionsDiffPct < diffs[j].SectionsDiffPct })
+
+	log.Logf("\n=== Go vs Python (%d PDFs) ===", len(diffs))
+	log.Logf("Pages match: %d/%d", matched, matched+mismatched)
+	log.Logf("%-40s %-18s %-18s %s %s %s %s %s %s %s %s %s %s",
+		"file", "Go:init->tm->vm->sec", "Py:init->tm->vm->sec",
+		"Init%", "TM%", "VM%", "Sec%", "Txt%", "TabD", "CharDiff%", "LcsDiff%", "RawCharDiff%", "RawLcsDiff%")
+	log.Logf("%s", strings.Repeat("-", 168))
+
+	for _, d := range diffs {
+		py := pyMap[d.File]
+		gr := goMap[d.File]
+		goStages := fmt.Sprintf("%3d->%3d->%3d->%3d", gr.BoxesInitial, gr.BoxesTextMerg, gr.BoxesVertMerg, gr.Sections)
+		pyStages := fmt.Sprintf("%3d->%3d->%3d->%3d", py.BoxesInitial, py.BoxesTextMerge, py.BoxesVertMerge, py.Sections)
+		log.Logf("%-40s %-18s %-18s %4.0f%% %4.0f%% %4.0f%% %4.0f%% %4.0f%% %+4d %.0f%% %.0f%% %.0f%% %.0f%%",
+			d.File, goStages, pyStages,
+			d.BoxesInitDiffPct, d.BoxesTMDiffPct, d.BoxesVMDiffPct,
+			d.SectionsDiffPct, d.TextLenDiffPct, d.TablesDiff,
+			100-d.CharSim, 100-d.LcsSim,
+			100-d.RawCharSim, 100-d.RawLcsSim)
+	}
+
+	n := len(diffs)
+	if n == 0 {
+		return
+	}
+
+	type stats struct {
+		median, mean, max, min float64
+		over5, over10          int
+	}
+	computeStats := func(get func(Diff) float64) stats {
+		// Sort a private copy of the metric values: sorting diffs itself
+		// would silently change the row order of the reports written below.
+		vals := make([]float64, n)
+		for i, d := range diffs {
+			vals[i] = get(d)
+		}
+		sort.Float64s(vals)
+
+		s := stats{min: vals[0], max: vals[n-1]}
+		if n%2 == 0 {
+			s.median = (vals[n/2-1] + vals[n/2]) / 2
+		} else {
+			s.median = vals[n/2]
+		}
+		var sum float64
+		for _, v := range vals {
+			sum += v
+			if v > 5 {
+				s.over5++
+			}
+			if v > 10 {
+				s.over10++
+			}
+		}
+		s.mean = sum / float64(n)
+		return s
+	}
+
+	label := func(name string, s stats) string {
+		return fmt.Sprintf("%s Med=%.1f%% Mean=%.1f%% Min=%.0f%% Max=%.0f%% >5%%:%d >10%%:%d",
+			name, s.median, s.mean, s.min, s.max, s.over5, s.over10)
+	}
+
+	log.Logf("\nSummary (n=%d):", n)
+	log.Logf(" %s", label("BoxesInit ", computeStats(func(d Diff) float64 { return d.BoxesInitDiffPct })))
+	log.Logf(" %s", label("TextMerge", computeStats(func(d Diff) float64 { return d.BoxesTMDiffPct })))
+	log.Logf(" %s", label("VertMerge", computeStats(func(d Diff) float64 { return d.BoxesVMDiffPct })))
+	log.Logf(" %s", label("Sections ", computeStats(func(d Diff) float64 { return d.SectionsDiffPct })))
+	log.Logf(" %s", label("TextLen ", computeStats(func(d Diff) float64 { return d.TextLenDiffPct })))
+	log.Logf(" %s", label("CharDiff ", computeStats(func(d Diff) float64 { return 100 - d.CharSim })))
+	log.Logf(" %s", label("LcsDiff ", computeStats(func(d Diff) float64 { return 100 - d.LcsSim })))
+	log.Logf(" %s", label("RawCharDiff", computeStats(func(d Diff) float64 { return 100 - d.RawCharSim })))
+	log.Logf(" %s", label("RawLcsDiff ", computeStats(func(d Diff) float64 { return 100 - d.RawLcsSim })))
+
+	// Auto-generate xlsx report with timestamp. The mode is the name of the
+	// directory that contains goTextDir, e.g. "ocr".
+	mode := filepath.Base(filepath.Dir(goTextDir))
+	ts := time.Now().Format("20060102_1504")
+	xlsxDir := filepath.Join("testdata", "output")
+	xlsxPath := filepath.Join(xlsxDir, fmt.Sprintf("compare_%s_%s.xlsx", mode, ts))
+	if err := os.MkdirAll(xlsxDir, 0755); err != nil {
+		log.Logf("Excel write error: %v", err)
+	} else if err := WriteExcel(xlsxPath, diffs); err != nil {
+		log.Logf("Excel write error: %v", err)
+	} else {
+		log.Logf("Excel report: %s", xlsxPath)
+	}
+
+	// Also write CSV if BATCH_CSV env is set (backward compat).
+	if csvPath := os.Getenv("BATCH_CSV"); csvPath != "" {
+		if err := WriteCSV(csvPath, diffs); err != nil {
+			log.Logf("CSV write error: %v", err)
+		} else {
+			log.Logf("CSV written to %s", csvPath)
+		}
+	}
+}
+
+// WriteCSV writes comparison results to a CSV file using encoding/csv
+// for proper field escaping (filenames may contain commas/quotes).
+//
+// The file at path is created or truncated. The first row is a header; each
+// following row holds one Diff with percentages formatted to one decimal.
+// Similarity fields are written as 100-similarity. The returned error is the
+// first failure from creating, writing, flushing or closing the file.
+func WriteCSV(path string, diffs []Diff) (err error) {
+	f, err := os.Create(path)
+	if err != nil {
+		return err
+	}
+	defer func() {
+		// A failed Close can mean the data never reached disk, so report it
+		// unless an earlier error is already being returned.
+		if cerr := f.Close(); err == nil {
+			err = cerr
+		}
+	}()
+
+	w := csv.NewWriter(f)
+
+	if err := w.Write([]string{"file", "init%", "tm%", "vm%", "sec%", "txt%", "tabsD", "chrdiff%", "lcsdiff%", "rawChr%", "rawLcs%"}); err != nil {
+		return err
+	}
+
+	pct := func(v float64) string { return strconv.FormatFloat(v, 'f', 1, 64) }
+	for _, d := range diffs {
+		row := []string{
+			d.File,
+			pct(d.BoxesInitDiffPct),
+			pct(d.BoxesTMDiffPct),
+			pct(d.BoxesVMDiffPct),
+			pct(d.SectionsDiffPct),
+			pct(d.TextLenDiffPct),
+			strconv.Itoa(d.TablesDiff),
+			pct(100 - d.CharSim),
+			pct(100 - d.LcsSim),
+			pct(100 - d.RawCharSim),
+			pct(100 - d.RawLcsSim),
+		}
+		if err := w.Write(row); err != nil {
+			return err
+		}
+	}
+
+	// Flush reports nothing itself; any buffered write error surfaces here.
+	w.Flush()
+	return w.Error()
+}
+
+// WriteExcel writes comparison results to an xlsx file with formatting.
+func WriteExcel(path string, diffs []Diff) error {
+ f := excelize.NewFile()
+ defer f.Close()
+ sheet := "Comparison"
+ f.SetSheetName("Sheet1", sheet)
+
+ // Styles.
+ headerStyle, _ := f.NewStyle(&excelize.Style{
+ Font: &excelize.Font{Bold: true},
+ Fill: excelize.Fill{Type: "pattern", Pattern: 1, Color: []string{"D9E1F2"}},
+ Alignment: &excelize.Alignment{Horizontal: "center"},
+ })
+ greenStyle, _ := f.NewStyle(&excelize.Style{
+ Fill: excelize.Fill{Type: "pattern", Pattern: 1, Color: []string{"C6EFCE"}},
+ NumFmt: 2,
+ })
+ yellowStyle, _ := f.NewStyle(&excelize.Style{
+ Fill: excelize.Fill{Type: "pattern", Pattern: 1, Color: []string{"FFEB9C"}},
+ NumFmt: 2,
+ })
+ redStyle, _ := f.NewStyle(&excelize.Style{
+ Fill: excelize.Fill{Type: "pattern", Pattern: 1, Color: []string{"FFC7CE"}},
+ NumFmt: 2,
+ })
+
+ // Header row.
+ headers := []string{"File", "Init%", "TM%", "VM%", "Sec%", "Txt%", "TabsD", "ChrDiff%", "LcsDiff%"}
+ for i, h := range headers {
+ cell, _ := excelize.CoordinatesToCellName(i+1, 1)
+ f.SetCellValue(sheet, cell, h)
+ f.SetCellStyle(sheet, cell, cell, headerStyle)
+ }
+
+ // Data rows.
+ for row, d := range diffs {
+ r := row + 2 // 1-indexed, skip header
+ vals := []float64{
+ 0, // placeholder for file
+ d.BoxesInitDiffPct, d.BoxesTMDiffPct, d.BoxesVMDiffPct,
+ d.SectionsDiffPct, d.TextLenDiffPct, float64(d.TablesDiff),
+ 100 - d.CharSim, 100 - d.LcsSim,
+ }
+
+ // File name (column A).
+ f.SetCellValue(sheet, cellName(1, r), d.File)
+
+ // Numeric columns (B-I).
+ for col := 2; col <= 9; col++ {
+ cell := cellName(col, r)
+ v := vals[col-1]
+ f.SetCellValue(sheet, cell, v)
+ // Color: green <5, yellow 5-20, red >=20.
+ if col == 7 { // TabsD is a count, not percentage
+ continue
+ }
+ abs := math.Abs(v)
+ switch {
+ case abs < 5:
+ f.SetCellStyle(sheet, cell, cell, greenStyle)
+ case abs < 20:
+ f.SetCellStyle(sheet, cell, cell, yellowStyle)
+ default:
+ f.SetCellStyle(sheet, cell, cell, redStyle)
+ }
+ }
+ }
+
+ // Column widths.
+ f.SetColWidth(sheet, "A", "A", 45)
+ f.SetColWidth(sheet, "B", "I", 12)
+
+ // Freeze header row.
+ f.SetPanes(sheet, &excelize.Panes{
+ Freeze: true,
+ Split: false,
+ XSplit: 0,
+ YSplit: 1,
+ TopLeftCell: "A2",
+ ActivePane: "bottomLeft",
+ })
+
+ return f.SaveAs(path)
+}
+
+// cellName converts 1-based column/row numbers into a cell reference via
+// excelize.CoordinatesToCellName. The conversion error is discarded and the
+// returned string is passed through as is.
+func cellName(col, row int) string {
+	ref, _ := excelize.CoordinatesToCellName(col, row)
+	return ref
+}
+
+// CompareTablesWithPython compares the table JSON files under goTablesDir
+// with the same-named files under pyTablesDir, including per-cell text
+// comparison.
+//
+// Go files are a JSON array of tables ({"rows": [][]string}); Python files
+// are an object with a "results" array whose entries carry "rows" and
+// optionally "cells". For each file present on both sides it logs the table
+// counts, the total cell counts, the share of positionally aligned cells
+// whose trimmed text is equal, and a status. A summary follows. Files that
+// exist on one side only, or that cannot be read or parsed, are skipped
+// (the latter with a log line).
+func CompareTablesWithPython(log TLogger, goTablesDir, pyTablesDir string) {
+	goEntries, err := os.ReadDir(goTablesDir)
+	if err != nil {
+		log.Logf("Tables compare: no Go tables dir %s", goTablesDir)
+		return
+	}
+
+	type goTable struct {
+		Rows [][]string `json:"rows"`
+	}
+	type pyCell struct {
+		X0     float64 `json:"x0"`
+		X1     float64 `json:"x1"`
+		Top    float64 `json:"top"`
+		Bottom float64 `json:"bottom"`
+		Text   string  `json:"text"`
+		Page   int     `json:"page"`
+	}
+	type pyResult struct {
+		Cells []pyCell   `json:"cells"`
+		Page  int        `json:"page"`
+		Rows  [][]string `json:"rows"`
+	}
+	type pyFile struct {
+		Tables  int        `json:"tables"`
+		Results []pyResult `json:"results"`
+	}
+
+	const statusOK = "✅"
+
+	matched, tableDiffs, cellDiffs, textMismatches := 0, 0, 0, 0
+	totalCellsCompared, totalCellsMatched := 0, 0
+
+	log.Logf("\n=== Table Comparison (Go vs Python) ===")
+	log.Logf("%-40s %6s %6s %6s %6s %8s %s",
+		"file", "GoTbl", "PyTbl", "GoCel", "PyCel", "TxtMatch", "Result")
+	log.Logf("%s", strings.Repeat("-", 100))
+
+	for _, e := range goEntries {
+		if e.IsDir() || !strings.HasSuffix(e.Name(), ".json") {
+			continue
+		}
+
+		goPath := filepath.Join(goTablesDir, e.Name())
+		pyPath := filepath.Join(pyTablesDir, e.Name())
+		if !FileExists(pyPath) {
+			continue
+		}
+
+		// Read Go tables.
+		goData, err := os.ReadFile(goPath)
+		if err != nil {
+			log.Logf(" %s: Go read error: %v", e.Name(), err)
+			continue
+		}
+		var goTables []goTable
+		if err := json.Unmarshal(goData, &goTables); err != nil {
+			log.Logf(" %s: Go JSON parse error: %v", e.Name(), err)
+			continue
+		}
+
+		// Read Python tables.
+		pyData, err := os.ReadFile(pyPath)
+		if err != nil {
+			log.Logf(" %s: Py read error: %v", e.Name(), err)
+			continue
+		}
+		var pyF pyFile
+		if err := json.Unmarshal(pyData, &pyF); err != nil {
+			log.Logf(" %s: Py JSON parse error: %v", e.Name(), err)
+			continue
+		}
+
+		matched++
+
+		// Count cells. On the Python side the explicit cell list wins over
+		// the row grid when both are present.
+		goTotalCells := 0
+		for _, tbl := range goTables {
+			for _, row := range tbl.Rows {
+				goTotalCells += len(row)
+			}
+		}
+		pyTotalCells := 0
+		for _, r := range pyF.Results {
+			if len(r.Cells) > 0 {
+				pyTotalCells += len(r.Cells)
+				continue
+			}
+			for _, row := range r.Rows {
+				pyTotalCells += len(row)
+			}
+		}
+
+		// Cell-level text comparison (table by table, row by row, cell by
+		// cell), limited to the part both sides have in common.
+		cellsCompared, cellsMatched := 0, 0
+		nTables := min(len(goTables), len(pyF.Results))
+		for ti := 0; ti < nTables; ti++ {
+			goRows := goTables[ti].Rows
+			pyRows := pyF.Results[ti].Rows
+			nRows := min(len(goRows), len(pyRows))
+			for ri := 0; ri < nRows; ri++ {
+				nCols := min(len(goRows[ri]), len(pyRows[ri]))
+				for ci := 0; ci < nCols; ci++ {
+					cellsCompared++
+					if strings.TrimSpace(goRows[ri][ci]) == strings.TrimSpace(pyRows[ri][ci]) {
+						cellsMatched++
+					}
+				}
+			}
+		}
+
+		totalCellsCompared += cellsCompared
+		totalCellsMatched += cellsMatched
+
+		// Status: the most severe finding wins (tables > cells > text), but
+		// every kind of difference is counted once per file.
+		status := statusOK
+		if len(goTables) != len(pyF.Results) {
+			tableDiffs++
+			status = "❌ tables"
+		}
+		if goTotalCells != pyTotalCells {
+			cellDiffs++
+			if status == statusOK {
+				status = "⚠️ cells"
+			}
+		}
+		txtMatch := "-"
+		if cellsCompared > 0 {
+			pct := float64(cellsMatched) / float64(cellsCompared) * 100
+			txtMatch = fmt.Sprintf("%.0f%%", pct)
+			if cellsMatched < cellsCompared {
+				textMismatches++
+				if status == statusOK {
+					status = "⚠️ text"
+				}
+			}
+		}
+
+		name := strings.TrimSuffix(e.Name(), ".json")
+		log.Logf("%-40s %6d %6d %6d %6d %8s %s",
+			name, len(goTables), len(pyF.Results), goTotalCells, pyTotalCells, txtMatch, status)
+	}
+
+	if matched == 0 {
+		log.Logf("No matching table files found")
+		return
+	}
+
+	txtPct := 0.0
+	if totalCellsCompared > 0 {
+		txtPct = float64(totalCellsMatched) / float64(totalCellsCompared) * 100
+	}
+	log.Logf("\nTable Summary: %d PDFs, %d table diffs, %d cell diffs, %d text mismatches",
+		matched, tableDiffs, cellDiffs, textMismatches)
+	log.Logf("Cell text match: %d/%d (%.1f%%)", totalCellsMatched, totalCellsCompared, txtPct)
+}
+
+// ── DLA intermediate comparison ──────────────────────────────────────────
+
+// jsonDlaPage is one element of the top-level array in a DLA dump file: a
+// page number and the layout regions recorded for it.
+type jsonDlaPage struct {
+ Page int `json:"page"`
+ Regions []jsonDlaRegion `json:"regions"`
+}
+// jsonDlaRegion is one layout region. The kind of region is carried in
+// "label" by Go dumps and in "type" by Python dumps, so both are decoded and
+// callers should use whichever is non-empty (see dlaRegionIsTable).
+type jsonDlaRegion struct {
+ Label string `json:"label"` // Go uses "label"
+ Type string `json:"type"` // Python uses "type"
+ X0 float64 `json:"x0"`
+ Y0 float64 `json:"y0"`
+ X1 float64 `json:"x1"`
+ Y1 float64 `json:"y1"`
+}
+
+// CompareDLAWithPython compares per-page DLA layout regions.
+// Both dirs contain {pdf}.json files holding a JSON array of jsonDlaPage.
+//
+// For every .json file present in both directories it logs the page counts,
+// the total region counts, and the difference in table-region counts (Go
+// minus Python). Files that cannot be read or parsed are logged and skipped,
+// so a broken dump is not reported as a page with zero regions.
+func CompareDLAWithPython(log TLogger, goDLADir, pyDLADir string) {
+	// A missing or unreadable directory yields no entries, which ends in the
+	// "No matching DLA files found" message below.
+	goEntries, _ := os.ReadDir(goDLADir)
+	pyEntries, _ := os.ReadDir(pyDLADir)
+	pySet := make(map[string]bool, len(pyEntries))
+	for _, e := range pyEntries {
+		pySet[e.Name()] = true
+	}
+
+	// loadPages reads and decodes one DLA dump file.
+	loadPages := func(path string) ([]jsonDlaPage, error) {
+		data, err := os.ReadFile(path)
+		if err != nil {
+			return nil, err
+		}
+		var pages []jsonDlaPage
+		if err := json.Unmarshal(data, &pages); err != nil {
+			return nil, err
+		}
+		return pages, nil
+	}
+	// countRegions returns the number of regions and of table regions.
+	countRegions := func(pages []jsonDlaPage) (regions, tables int) {
+		for _, p := range pages {
+			regions += len(p.Regions)
+			for _, r := range p.Regions {
+				if dlaRegionIsTable(r) {
+					tables++
+				}
+			}
+		}
+		return regions, tables
+	}
+
+	matched := 0
+	log.Logf("\n=== DLA Comparison (Go vs Python) ===")
+	log.Logf("%-40s %6s %6s %6s %6s %6s",
+		"file", "GoPg", "PyPg", "GoReg", "PyReg", "TblReg")
+	log.Logf("%s", strings.Repeat("-", 80))
+
+	for _, e := range goEntries {
+		if !strings.HasSuffix(e.Name(), ".json") || !pySet[e.Name()] {
+			continue
+		}
+		goPages, err := loadPages(filepath.Join(goDLADir, e.Name()))
+		if err != nil {
+			log.Logf(" %s: Go DLA load error: %v", e.Name(), err)
+			continue
+		}
+		pyPages, err := loadPages(filepath.Join(pyDLADir, e.Name()))
+		if err != nil {
+			log.Logf(" %s: Py DLA load error: %v", e.Name(), err)
+			continue
+		}
+
+		matched++
+		goRegions, goTables := countRegions(goPages)
+		pyRegions, pyTables := countRegions(pyPages)
+
+		name := strings.TrimSuffix(e.Name(), ".json")
+		log.Logf("%-40s %6d %6d %6d %6d %6d",
+			name, len(goPages), len(pyPages), goRegions, pyRegions, goTables-pyTables)
+	}
+	if matched == 0 {
+		log.Logf("No matching DLA files found (go=%s py=%s)", goDLADir, pyDLADir)
+	}
+}
+
+// ── TSR raw intermediate comparison ──────────────────────────────────────
+
+// tsrRawCell is one raw TSR cell record as stored in the per-PDF TSR dump
+// files: the table it belongs to, its page, its label, its bounding box and
+// its text.
+//
+// Every coordinate is its own field with its own tag. Declaring two fields
+// on one line (X0, Y0 float64) gives both the same tag, and encoding/json
+// silently drops fields that share a JSON name.
+type tsrRawCell struct {
+	TableIndex int     `json:"table_index"`
+	Page       int     `json:"page"`
+	Label      string  `json:"label"`
+	X0         float64 `json:"x0"`
+	Y0         float64 `json:"y0"`
+	X1         float64 `json:"x1"`
+	Y1         float64 `json:"y1"`
+	Text       string  `json:"text"`
+}
+
+// CompareTSRRawWithPython compares raw TSR cells per table.
+// Both dirs contain {pdf}.json files holding a JSON array of tsrRawCell.
+//
+// For every .json file present in both directories the cells are grouped by
+// table index and compared position by position on their label. A table's
+// diff count is the number of differing labels plus the difference in cell
+// count; a table that exists on one side only contributes all of its cells.
+// Files that cannot be read or parsed are logged and skipped.
+func CompareTSRRawWithPython(log TLogger, goTSRDir, pyTSRDir string) {
+	// A missing or unreadable directory yields no entries, which ends in the
+	// "No matching TSR raw files found" message below.
+	goEntries, _ := os.ReadDir(goTSRDir)
+	pyEntries, _ := os.ReadDir(pyTSRDir)
+	pySet := make(map[string]bool, len(pyEntries))
+	for _, e := range pyEntries {
+		pySet[e.Name()] = true
+	}
+
+	// loadCells reads and decodes one TSR dump file.
+	loadCells := func(path string) ([]tsrRawCell, error) {
+		data, err := os.ReadFile(path)
+		if err != nil {
+			return nil, err
+		}
+		var cells []tsrRawCell
+		if err := json.Unmarshal(data, &cells); err != nil {
+			return nil, err
+		}
+		return cells, nil
+	}
+	// byTable groups cells by table index, keeping their file order.
+	byTable := func(cells []tsrRawCell) map[int][]tsrRawCell {
+		grouped := map[int][]tsrRawCell{}
+		for _, c := range cells {
+			grouped[c.TableIndex] = append(grouped[c.TableIndex], c)
+		}
+		return grouped
+	}
+
+	matched := 0
+	totalDiffs := 0
+	log.Logf("\n=== TSR Raw Comparison (Go vs Python) ===")
+	log.Logf("%-40s %6s %6s %8s %8s %6s",
+		"file", "GoTbl", "PyTbl", "GoCell", "PyCell", "LabelD")
+	log.Logf("%s", strings.Repeat("-", 85))
+
+	for _, e := range goEntries {
+		if !strings.HasSuffix(e.Name(), ".json") || !pySet[e.Name()] {
+			continue
+		}
+		goCells, err := loadCells(filepath.Join(goTSRDir, e.Name()))
+		if err != nil {
+			log.Logf(" %s: Go TSR load error: %v", e.Name(), err)
+			continue
+		}
+		pyCells, err := loadCells(filepath.Join(pyTSRDir, e.Name()))
+		if err != nil {
+			log.Logf(" %s: Py TSR load error: %v", e.Name(), err)
+			continue
+		}
+
+		goByTable := byTable(goCells)
+		pyByTable := byTable(pyCells)
+
+		matched++
+		labelDiffs := 0
+		for ti, goTab := range goByTable {
+			pyTab := pyByTable[ti] // nil when Python has no such table
+			n := min(len(goTab), len(pyTab))
+			for i := 0; i < n; i++ {
+				if goTab[i].Label != pyTab[i].Label {
+					labelDiffs++
+				}
+			}
+			labelDiffs += abs(len(goTab) - len(pyTab))
+		}
+		// Tables only Python produced are not reached by the loop above;
+		// count all of their cells, mirroring the Go-only case.
+		for ti, pyTab := range pyByTable {
+			if _, ok := goByTable[ti]; !ok {
+				labelDiffs += len(pyTab)
+			}
+		}
+		if labelDiffs > 0 {
+			totalDiffs++
+		}
+
+		name := strings.TrimSuffix(e.Name(), ".json")
+		log.Logf("%-40s %6d %6d %8d %8d %6d",
+			name, len(goByTable), len(pyByTable), len(goCells), len(pyCells), labelDiffs)
+	}
+	if matched == 0 {
+		log.Logf("No matching TSR raw files found (go=%s py=%s)", goTSRDir, pyTSRDir)
+	} else {
+		log.Logf("TSR Raw Summary: %d PDFs, %d with label diffs", matched, totalDiffs)
+	}
+}
+
+// dlaRegionIsTable reports whether the region is a table. Label is used when
+// it is non-empty; otherwise Type is consulted.
+func dlaRegionIsTable(r jsonDlaRegion) bool {
+	if r.Label != "" {
+		return r.Label == "table"
+	}
+	return r.Type == "table"
+}
+
+// abs returns x for non-negative x and -x otherwise.
+func abs(x int) int {
+	if x >= 0 {
+		return x
+	}
+	return -x
+}
diff --git a/internal/deepdoc/parser/pdf/tools/config.go b/internal/deepdoc/parser/pdf/tools/config.go
new file mode 100644
index 0000000000..a9796d3a18
--- /dev/null
+++ b/internal/deepdoc/parser/pdf/tools/config.go
@@ -0,0 +1,66 @@
+package tools
+
+import (
+ "fmt"
+ "os"
+ "path/filepath"
+ "strconv"
+ "time"
+)
+
+// Config holds the settings of the batch comparison tooling. LoadConfig
+// fills it from BATCH_* environment variables and fixed paths beneath
+// testdata/output.
+type Config struct {
+ Count int // from BATCH_COUNT; 0 when unset or not an integer
+ Single string // from BATCH_SINGLE
+ SkipOCR bool // DLA+TSR but no image OCR
+ CompareOnly bool // true only when BATCH_COMPARE_ONLY is exactly "1"
+ CompareFilter string // from BATCH_COMPARE_FILTER
+ CSVOutput string // from BATCH_COMPARE_CSV, else a timestamped default path
+ GoTextDir string // text output directory of the Go pipeline
+ PyTextDir string // text output directory of the Python pipeline
+ TablesDir string // tables output directory of the Go pipeline
+ GoSuffix string // Go output variant name ("ocr" as set by LoadConfig)
+}
+
+func LoadConfig() Config {
+ goVariant := "ocr"
+ pyVariant := "ocr"
+ td := filepath.Join("testdata")
+ return Config{
+ Count: envInt("BATCH_COUNT", 0),
+ Single: os.Getenv("BATCH_SINGLE"),
+ SkipOCR: os.Getenv("BATCH_SKIP_OCR") == "1",
+ CompareOnly: os.Getenv("BATCH_COMPARE_ONLY") == "1",
+ CompareFilter: os.Getenv("BATCH_COMPARE_FILTER"),
+ CSVOutput: envStr("BATCH_COMPARE_CSV", filepath.Join(td, "output", fmt.Sprintf("compare_%s.csv", time.Now().Format("20060102_150405")))),
+ GoTextDir: filepath.Join(td, "output", "go", goVariant, "text"),
+ PyTextDir: filepath.Join(td, "output", "py", pyVariant, "text"),
+ TablesDir: filepath.Join(td, "output", "go", goVariant, "tables"),
+ GoSuffix: goVariant,
+ }
+}
+
+// envInt reads the environment variable key as a base-10 integer.
+// It returns def when the variable is unset, empty, or not a valid integer.
+func envInt(key string, def int) int {
+	raw := os.Getenv(key)
+	if raw == "" {
+		return def
+	}
+	if parsed, err := strconv.Atoi(raw); err == nil {
+		return parsed
+	}
+	return def
+}
+
+// envStr returns the value of the environment variable key, or def when the
+// variable is unset or empty.
+func envStr(key, def string) string {
+	if val := os.Getenv(key); val != "" {
+		return val
+	}
+	return def
+}
+
+// FileExists reports whether os.Stat succeeds for path. Any Stat error, not
+// only "does not exist", yields false.
+func FileExists(path string) bool {
+	if _, err := os.Stat(path); err != nil {
+		return false
+	}
+	return true
+}
diff --git a/internal/deepdoc/parser/pdf/tools/metadata.go b/internal/deepdoc/parser/pdf/tools/metadata.go
new file mode 100644
index 0000000000..55d380ceb4
--- /dev/null
+++ b/internal/deepdoc/parser/pdf/tools/metadata.go
@@ -0,0 +1,90 @@
+package tools
+
+import (
+ "encoding/json"
+ "os"
+ "path/filepath"
+ "strings"
+ "unicode/utf8"
+)
+
+// ReadPythonTextMeta reads Python pipeline stage data from the *.txt files
+// directly inside pyTextDir.
+//
+// Each file holds the extracted text, optionally followed by a trailer that
+// starts at the last "\n#@meta" and carries a JSON object with the stage
+// counters. For every file one PyResult is produced:
+//   - File is the file name without the ".txt" suffix.
+//   - TextLen is the rune count of the text with the trailer excluded. The
+//     trailer is excluded whenever the marker is present, even if its JSON
+//     cannot be decoded, which matches ReadGoTextMeta and StripMeta.
+//   - Chars, BoxesInitial, BoxesTextMerge, BoxesVertMerge and Sections are
+//     copied from the trailer when it decodes; otherwise they stay zero.
+//
+// An error is returned only when the directory cannot be listed. Files that
+// cannot be read are skipped (best effort).
+func ReadPythonTextMeta(pyTextDir string) ([]PyResult, error) {
+	const metaMarker = "\n#@meta"
+
+	entries, err := os.ReadDir(pyTextDir)
+	if err != nil {
+		return nil, err
+	}
+	var results []PyResult
+	for _, e := range entries {
+		if !strings.HasSuffix(e.Name(), ".txt") {
+			continue
+		}
+		data, err := os.ReadFile(filepath.Join(pyTextDir, e.Name()))
+		if err != nil {
+			// Best effort: an unreadable file is left out of the results.
+			continue
+		}
+		py := PyResult{File: strings.TrimSuffix(e.Name(), ".txt"), TextLen: utf8.RuneCount(data)}
+		if idx := strings.LastIndex(string(data), metaMarker); idx >= 0 {
+			// Text only: the trailer never counts towards TextLen, whether
+			// or not its JSON payload is well-formed.
+			py.TextLen = utf8.RuneCount(data[:idx])
+			var meta struct {
+				Chars          int `json:"chars"`
+				BoxesInitial   int `json:"boxes_initial"`
+				BoxesTextMerge int `json:"boxes_text_merge"`
+				BoxesVertMerge int `json:"boxes_vertical_merge"`
+				Sections       int `json:"sections"`
+			}
+			if json.Unmarshal(data[idx+len(metaMarker):], &meta) == nil {
+				py.Chars = meta.Chars
+				py.BoxesInitial = meta.BoxesInitial
+				py.BoxesTextMerge = meta.BoxesTextMerge
+				py.BoxesVertMerge = meta.BoxesVertMerge
+				py.Sections = meta.Sections
+			}
+		}
+		results = append(results, py)
+	}
+	return results, nil
+}
+
+// ReadGoTextMeta reads Go pipeline stage data from the *.txt files directly
+// inside goTextDir.
+//
+// For every file one BatchResult is produced with File set to the name
+// without ".txt", Pages fixed at 1, and TextLen set to the rune count of the
+// text before the last "\n#@meta" marker (or of the whole file when there is
+// no marker). When the JSON after the marker decodes, its counters are copied
+// into the result. Unreadable files are skipped; only a failure to list the
+// directory is returned as an error.
+func ReadGoTextMeta(goTextDir string) ([]BatchResult, error) {
+	const metaMarker = "\n#@meta"
+
+	entries, err := os.ReadDir(goTextDir)
+	if err != nil {
+		return nil, err
+	}
+	var results []BatchResult
+	for _, entry := range entries {
+		name := entry.Name()
+		if !strings.HasSuffix(name, ".txt") {
+			continue
+		}
+		raw, readErr := os.ReadFile(filepath.Join(goTextDir, name))
+		if readErr != nil {
+			continue
+		}
+
+		at := strings.LastIndex(string(raw), metaMarker)
+		text := raw
+		if at >= 0 {
+			text = raw[:at] // text only, exclude #@meta
+		}
+		res := BatchResult{
+			File:    strings.TrimSuffix(name, ".txt"),
+			Pages:   1,
+			TextLen: utf8.RuneCount(text),
+		}
+		if at >= 0 {
+			var trailer struct {
+				Chars     int `json:"chars"`
+				Initial   int `json:"boxes_initial"`
+				TextMerge int `json:"boxes_text_merge"`
+				VertMerge int `json:"boxes_vertical_merge"`
+				Sections  int `json:"sections"`
+			}
+			if json.Unmarshal(raw[at+len(metaMarker):], &trailer) == nil {
+				res.Chars = trailer.Chars
+				res.BoxesInitial = trailer.Initial
+				res.BoxesTextMerg = trailer.TextMerge
+				res.BoxesVertMerg = trailer.VertMerge
+				res.Sections = trailer.Sections
+			}
+		}
+		results = append(results, res)
+	}
+	return results, nil
+}
diff --git a/internal/deepdoc/parser/pdf/tools/similarity.go b/internal/deepdoc/parser/pdf/tools/similarity.go
new file mode 100644
index 0000000000..9c271b4188
--- /dev/null
+++ b/internal/deepdoc/parser/pdf/tools/similarity.go
@@ -0,0 +1,277 @@
+package tools
+
+import (
+ "sort"
+ "strings"
+ "unicode"
+)
+
+// StripMeta returns s without its trailing metadata: everything from the
+// last "\n#@meta" onwards is removed. A string without that marker is
+// returned unchanged.
+func StripMeta(s string) string {
+	cut := strings.LastIndex(s, "\n#@meta")
+	if cut < 0 {
+		return s
+	}
+	return s[:cut]
+}
+
+// CharSimilarity scores how similar the character multisets of a and b are,
+// as a percentage. The #@meta trailer and all whitespace are ignored and
+// character order does not matter.
+//
+// The score is 2*common/(len(a)+len(b))*100, where common sums, per rune, the
+// smaller of its two occurrence counts. Two inputs without any non-space
+// characters score 100.
+func CharSimilarity(a, b string) float64 {
+	tally := func(text string) (map[rune]int, int) {
+		freq := make(map[rune]int)
+		total := 0
+		for _, r := range StripMeta(text) {
+			if unicode.IsSpace(r) {
+				continue
+			}
+			freq[r]++
+			total++
+		}
+		return freq, total
+	}
+
+	freqA, totalA := tally(a)
+	freqB, totalB := tally(b)
+	if totalA == 0 && totalB == 0 {
+		return 100
+	}
+
+	shared := 0
+	for r, n := range freqA {
+		// A rune absent from b has count 0 and so contributes nothing.
+		shared += min(n, freqB[r])
+	}
+	return float64(shared*2) / float64(totalA+totalB) * 100
+}
+
+// lcsRunes returns the length of the longest common subsequence of a and b.
+// It keeps two DP rows sized by the shorter input, so memory use is
+// proportional to min(len(a), len(b)).
+func lcsRunes(a, b []rune) int {
+	long, short := a, b
+	if len(long) < len(short) {
+		long, short = short, long
+	}
+	width := len(short)
+	above := make([]int, width+1)
+	row := make([]int, width+1)
+	for _, lr := range long {
+		for j, sr := range short {
+			if lr == sr {
+				row[j+1] = above[j] + 1
+			} else {
+				row[j+1] = max(row[j], above[j+1])
+			}
+		}
+		// The finished row becomes the reference for the next one; index 0
+		// is never written, so it stays 0 in both buffers.
+		above, row = row, above
+	}
+	return above[width]
+}
+
+// LcsSimilarity scores a against b by longest common subsequence, as a
+// percentage. The #@meta trailer and all whitespace are ignored.
+//
+// The score is LCS length divided by the longer input's rune count, times
+// 100. Two empty inputs score 100; exactly one empty input scores 0.
+func LcsSimilarity(a, b string) float64 {
+	runesA := nonSpaceRunes(StripMeta(a))
+	runesB := nonSpaceRunes(StripMeta(b))
+	switch {
+	case len(runesA) == 0 && len(runesB) == 0:
+		return 100
+	case len(runesA) == 0 || len(runesB) == 0:
+		return 0
+	}
+	longest := max(len(runesA), len(runesB))
+	return float64(lcsRunes(runesA, runesB)) / float64(longest) * 100
+}
+
+// RawCharSimilarity is CharSimilarity without space stripping: whitespace
+// runes count as characters. The #@meta trailer is still removed first.
+// Two empty inputs score 100.
+func RawCharSimilarity(a, b string) float64 {
+	tally := func(text string) (map[rune]int, int) {
+		freq := make(map[rune]int)
+		total := 0
+		for _, r := range StripMeta(text) {
+			freq[r]++
+			total++
+		}
+		return freq, total
+	}
+
+	freqA, totalA := tally(a)
+	freqB, totalB := tally(b)
+	if totalA == 0 && totalB == 0 {
+		return 100
+	}
+
+	shared := 0
+	for r, n := range freqA {
+		// A rune absent from b has count 0 and so contributes nothing.
+		shared += min(n, freqB[r])
+	}
+	return float64(shared*2) / float64(totalA+totalB) * 100
+}
+
+// RawLcsSimilarity is LcsSimilarity without space stripping: whitespace
+// takes part in the LCS comparison. The #@meta trailer is still removed
+// first. Two empty inputs score 100; exactly one empty input scores 0.
+func RawLcsSimilarity(a, b string) float64 {
+	runesA := []rune(StripMeta(a))
+	runesB := []rune(StripMeta(b))
+	switch {
+	case len(runesA) == 0 && len(runesB) == 0:
+		return 100
+	case len(runesA) == 0 || len(runesB) == 0:
+		return 0
+	}
+	longest := max(len(runesA), len(runesB))
+	return float64(lcsRunes(runesA, runesB)) / float64(longest) * 100
+}
+
+// SectionAlignedScore computes a two-phase LCS similarity between the Go and
+// Python text of one document and returns it as a percentage.
+//
+// Both inputs have their #@meta trailer removed, are trimmed, and are split
+// into sections on newlines. Two empty texts score 100; exactly one empty
+// text scores 0.
+//
+// Phase 1: one-to-one section matching. A Go section at index i is only
+// compared with Python sections at indexes i-alignWindow..i+alignWindow,
+// only when neither section is more than twice as long (in runes) as the
+// other, and only pairs whose CharSimilarity exceeds 30 become candidates.
+// Candidates are claimed greedily, highest similarity first; ties go to the
+// pair with the smallest positional offset, then the lowest indexes, so the
+// result does not depend on the sort algorithm. For each matched pair the
+// LCS ratio over non-space runes is computed.
+//
+// Phase 2: residual. All unmatched sections of each side are concatenated
+// into one string and compared once. This handles cases where one side
+// merges sections that the other side keeps separate. Residuals longer than
+// 5000 non-space runes use CharSimilarity instead of the quadratic LCS.
+//
+// The final score is a char-weighted average of matched and residual scores.
+func SectionAlignedScore(goText, pyText string) float64 {
+	// strings.Split never returns an empty slice (empty input yields [""]),
+	// so empty text is mapped to nil explicitly; otherwise the emptiness
+	// guards below could never fire.
+	split := func(s string) []string {
+		s = strings.TrimSpace(StripMeta(s))
+		if s == "" {
+			return nil
+		}
+		return strings.Split(s, "\n")
+	}
+	gs := split(goText)
+	ps := split(pyText)
+	if len(gs) == 0 && len(ps) == 0 {
+		return 100
+	}
+	if len(gs) == 0 || len(ps) == 0 {
+		return 0
+	}
+
+	// Phase 1: position-window greedy matching.
+	// Sections are ordered top-to-bottom by page position, so a global
+	// match beyond a small positional offset is extremely unlikely.
+	// Constrain candidates to ±window to avoid O(n×m) blow-up on large docs.
+	const alignWindow = 5
+	type candidate struct {
+		gi, pi int
+		sim    float64
+	}
+	// Precompute rune lengths for length-ratio gating.
+	glens := make([]int, len(gs))
+	plens := make([]int, len(ps))
+	for i, s := range gs {
+		glens[i] = len([]rune(s))
+	}
+	for i, s := range ps {
+		plens[i] = len([]rune(s))
+	}
+
+	candidates := make([]candidate, 0, len(gs)*(alignWindow*2+1))
+	for i, g := range gs {
+		lo := max(0, i-alignWindow)
+		hi := min(len(ps)-1, i+alignWindow)
+		for j := lo; j <= hi; j++ {
+			// Skip pairs with >2x length difference: a 500-char section
+			// matching a 30-char section produces near-zero LCS.
+			if glens[i] > plens[j]*2 || plens[j] > glens[i]*2 {
+				continue
+			}
+			if sim := CharSimilarity(g, ps[j]); sim > 30 {
+				candidates = append(candidates, candidate{i, j, sim})
+			}
+		}
+	}
+
+	// Sort descending by similarity (best matches first) with a total
+	// order on ties, because sort.Slice is not stable.
+	offset := func(c candidate) int {
+		d := c.gi - c.pi
+		if d < 0 {
+			d = -d
+		}
+		return d
+	}
+	sort.Slice(candidates, func(a, b int) bool {
+		ca, cb := candidates[a], candidates[b]
+		if ca.sim != cb.sim {
+			return ca.sim > cb.sim
+		}
+		if da, db := offset(ca), offset(cb); da != db {
+			return da < db
+		}
+		if ca.gi != cb.gi {
+			return ca.gi < cb.gi
+		}
+		return ca.pi < cb.pi
+	})
+
+	goUsed := make([]bool, len(gs))
+	pyUsed := make([]bool, len(ps))
+	matchedScore := 0.0
+	matchedChars := 0
+
+	for _, c := range candidates {
+		if goUsed[c.gi] || pyUsed[c.pi] {
+			continue
+		}
+		goUsed[c.gi] = true
+		pyUsed[c.pi] = true
+
+		// Compute LCS ratio for matched pair.
+		ra := nonSpaceRunes(gs[c.gi])
+		rb := nonSpaceRunes(ps[c.pi])
+		lcsScore := 0.0
+		if len(ra) > 0 && len(rb) > 0 {
+			lcsScore = float64(lcsRunes(ra, rb)) / float64(max(len(ra), len(rb))) * 100
+		} else if len(ra) == 0 && len(rb) == 0 {
+			lcsScore = 100
+		}
+		// Weight each pair by the longer side's non-space rune count.
+		chars := max(len(ra), len(rb))
+		matchedScore += lcsScore * float64(chars)
+		matchedChars += chars
+	}
+
+	// Phase 2: residual. Concatenate unmatched sections, compute LCS once.
+	var goRes, pyRes strings.Builder
+	for i, g := range gs {
+		if !goUsed[i] {
+			goRes.WriteString(g)
+			goRes.WriteByte(' ')
+		}
+	}
+	for j, p := range ps {
+		if !pyUsed[j] {
+			pyRes.WriteString(p)
+			pyRes.WriteByte(' ')
+		}
+	}
+
+	residualScore := 0.0
+	goResRunes := nonSpaceRunes(goRes.String())
+	pyResRunes := nonSpaceRunes(pyRes.String())
+	residualChars := max(len(goResRunes), len(pyResRunes))
+	if residualChars > 0 {
+		if len(goResRunes) > 5000 || len(pyResRunes) > 5000 {
+			// Residual too large for O(n²) LCS; fall back to CharSimilarity.
+			residualScore = CharSimilarity(goRes.String(), pyRes.String())
+		} else {
+			residualScore = float64(lcsRunes(goResRunes, pyResRunes)) / float64(residualChars) * 100
+		}
+	} else {
+		// Nothing left over on either side; the weight is zero anyway.
+		residualScore = 100
+	}
+
+	// Weighted average.
+	totalChars := matchedChars + residualChars
+	if totalChars == 0 {
+		return 100
+	}
+	return (matchedScore + residualScore*float64(residualChars)) / float64(totalChars)
+}
+
+// nonSpaceRunes returns the runes of s in order, leaving out every rune for
+// which unicode.IsSpace is true.
+func nonSpaceRunes(s string) []rune {
+	kept := make([]rune, 0, len(s))
+	for _, r := range s {
+		if unicode.IsSpace(r) {
+			continue
+		}
+		kept = append(kept, r)
+	}
+	return kept
+}
diff --git a/internal/deepdoc/parser/pdf/tools/types.go b/internal/deepdoc/parser/pdf/tools/types.go
new file mode 100644
index 0000000000..eb19cb894f
--- /dev/null
+++ b/internal/deepdoc/parser/pdf/tools/types.go
@@ -0,0 +1,70 @@
+package tools
+
+// BatchResult stores per-PDF pipeline stage output.
+//
+// ReadGoTextMeta builds these from the Go text dumps: File, Pages and TextLen
+// come from the file itself, the stage counters from its #@meta JSON trailer.
+type BatchResult struct {
+ File string `json:"file"` // file name without the ".txt" suffix
+ Pages int `json:"pages"` // ReadGoTextMeta always sets this to 1
+ Chars int `json:"chars"`
+ BoxesInitial int `json:"boxes_initial"`
+ BoxesTextMerg int `json:"boxes_text_merge"`
+ BoxesVertMerg int `json:"boxes_vertical_merge"`
+ Sections int `json:"sections"`
+ TSTables int `json:"tsr_tables,omitempty"`
+ TextLen int `json:"text_len"` // rune count of the text, #@meta trailer excluded
+ TimeS float64 `json:"time_s"`
+ Error string `json:"error,omitempty"`
+}
+
+// PyResult mirrors Python dump_py_results.py output.
+//
+// ReadPythonTextMeta also builds these from the Python text dumps, filling
+// File, TextLen and the stage counters found in the #@meta JSON trailer.
+type PyResult struct {
+ File string `json:"file"` // file name without the ".txt" suffix
+ Pages int `json:"pages"`
+ Chars int `json:"chars"`
+ BoxesInitial int `json:"boxes_initial"`
+ BoxesTextMerge int `json:"boxes_text_merge"`
+ BoxesVertMerge int `json:"boxes_vertical_merge"`
+ Sections int `json:"sections"`
+ Tables int `json:"tables"`
+ TextLen int `json:"text_len"` // rune count of the extracted text
+ IsEnglish *bool `json:"is_english"` // pointer, so an absent/null value stays distinguishable from false
+ TimeS float64 `json:"time_s"`
+ Error string `json:"error,omitempty"`
+}
+
+// TableItem stores per-table output.
+type TableItem struct {
+ ImageB64 string `json:"image_b64"` // NOTE(review): presumably a base64-encoded image of the table — confirm
+ Rows [][]string `json:"rows"` // cell text, one inner slice per row
+ Cells []TSRCell `json:"cells,omitempty"` // omitted from JSON when empty
+ Positions []Position `json:"positions"`
+}
+
+// TSRCell mirrors parser.TSRCell for serialization.
+//
+// Each coordinate is declared on its own line with its own JSON tag. A tag
+// written after a multi-name declaration (X0, Y0, X1, Y1 float64 `json:...`)
+// applies to all four fields, which gives them the same JSON name and makes
+// encoding/json silently drop every one of them.
+type TSRCell struct {
+	X0    float64 `json:"x0"` // left edge
+	Y0    float64 `json:"y0"` // top edge
+	X1    float64 `json:"x1"` // right edge
+	Y1    float64 `json:"y1"` // bottom edge
+	Text  string  `json:"text"`
+	Label string  `json:"label"`
+}
+
+// Position stores a bounding box.
+//
+// The fields carry no JSON tags, so encoding/json uses the Go field names
+// ("Left", "Right", "Top", "Bottom") as keys.
+type Position struct {
+ Left, Right, Top, Bottom float64
+}
+
+// RealPDFResult holds per-PDF stats for Go vs Python comparison.
+// Error is omitted from the JSON encoding when empty.
+type RealPDFResult struct {
+ File string `json:"file"`
+ Pages int `json:"pages"`
+ Chars int `json:"chars"`
+ Sections int `json:"sections"`
+ TextLen int `json:"text_len"`
+ Error string `json:"error,omitempty"`
+}
+
+// TLogger is a minimal interface for logging in comparison functions.
+//
+// The method set is a subset of the standard library's *testing.T and
+// *testing.B, so either can be passed directly.
+type TLogger interface {
+ Logf(format string, args ...any)
+ Errorf(format string, args ...any)
+ Fatalf(format string, args ...any)
+ Skipf(format string, args ...any)
+}
diff --git a/internal/deepdoc/parser/pdf/types.go b/internal/deepdoc/parser/pdf/types.go
new file mode 100644
index 0000000000..35169c0e85
--- /dev/null
+++ b/internal/deepdoc/parser/pdf/types.go
@@ -0,0 +1,320 @@
+// Package parser provides Go equivalents of RAGFlow's deepdoc/parser/pdf_parser.py
+// layout analysis and text extraction logic.
+//
+// Each exported function documents its corresponding Python original with
+// file:line references to pdf_parser.py.
+package parser
+
+import (
+ "context"
+ "image"
+)
+
+// PipelineMetrics records diagnostic counts at each pipeline stage.
+// Used for Go-vs-Python parity comparison and logging.
+//
+// NOTE(review): the per-field notes below are inferred from the field names;
+// the code that fills them in is not visible from this file — confirm.
+type PipelineMetrics struct {
+ BoxesInitial int // presumably the box count before any merging
+ BoxesTextMerge int // presumably the box count after the text-merge stage
+ BoxesVertMerge int // presumably the box count after the vertical-merge stage
+ BoxesFinal int // presumably the box count at the end of the pipeline
+ TablesCount int // presumably the number of tables found
+}
+
+// ParseResult encapsulates all outputs from a single Parse() call.
+// Parser itself is stateless and safe to reuse across documents.
+type ParseResult struct {
+ Sections []Section // text segments with layout type and position
+ Tables []TableItem // detected table/figure regions
+ PageImages map[int]image.Image // NOTE(review): presumably keyed by page index — confirm
+ Figures []Section // NOTE(review): presumably the figure-type sections (see CollectFigures) — confirm
+ Metrics PipelineMetrics // per-stage diagnostic counts
+
+ // Debug intermediates for DLA/TSR comparison with Python.
+ // Populated only during fresh Parse, not from cached results.
+ DLADebug []DLAPageRegions
+ TSRDebug []TSRRawCell
+}
+
+// DLAPageRegions holds DLA layout regions for one page.
+type DLAPageRegions struct {
+ Page int // page the regions belong to; NOTE(review): 0- or 1-based is not visible here — confirm
+ Regions []DLARegion // layout regions detected on that page
+}
+
+// TSRRawCell holds a raw TSR cell before row/column grouping.
+// It is serialized to JSON with the snake_case keys given in the tags.
+type TSRRawCell struct {
+ TableIndex int `json:"table_index"` // identifies the table the cell belongs to
+ Page int `json:"page"`
+ Label string `json:"label"`
+ X0 float64 `json:"x0"`
+ Y0 float64 `json:"y0"`
+ X1 float64 `json:"x1"`
+ Y1 float64 `json:"y1"`
+ Text string `json:"text"`
+}
+
+// TextChar represents a single character extracted from a PDF page.
+// Corresponds to pdfplumber page.chars dict elements in pdf_parser.py.
+//
+// Python equivalent:
+//
+// c = {"x0": 100.5, "x1": 108.2, "top": 200.0, "bottom": 212.0,
+// "text": "A", "fontname": "ABCDE+SimSun", "page_number": 3}
+//
+// Example:
+//
+// c := TextChar{X0: 100.5, X1: 108.2, Top: 200.0, Bottom: 212.0,
+// Text: "A", FontName: "ABCDE+SimSun", PageNumber: 3}
+type TextChar struct {
+ X0, X1 float64 // horizontal bounds in PDF points
+ Top, Bottom float64 // vertical bounds in PDF points
+ Text string // single character (or small text run)
+ FontName string // e.g. "ABCDE+SimSun"
+ FontSize float64
+ PageNumber int
+ LayoutType string // "text", "table", "figure", "equation"
+ LayoutNo string // layout identifier
+ ColID int // column ID assigned by _assign_column
+ R int // rotation/orientation marker
+}
+
+// Bounds returns the character's box in the order (X0, Top, X1, Bottom),
+// i.e. left, top, right, bottom.
+func (c TextChar) Bounds() (float64, float64, float64, float64) {
+ return c.X0, c.Top, c.X1, c.Bottom
+}
+
+// TextBox represents a rectangular region of text on a PDF page,
+// typically a line or paragraph fragment. Created by layout analysis
+// (e.g. _assign_column, _text_merge).
+//
+// Python equivalent:
+//
+// b = {"x0": 50.0, "x1": 550.0, "top": 100.0, "bottom": 112.0,
+// "text": "第三章 财务分析", "page_number": 3, "layout_type": "text"}
+type TextBox struct {
+ X0, X1 float64 // horizontal bounds
+ Top, Bottom float64 // vertical bounds
+ Text string
+ PageNumber int
+ LayoutType string // "text", "table", "figure", "equation"
+ LayoutNo string
+ ColID int
+ R int
+ // Post-TSR table annotation fields (Python: R/H/C/SP tags)
+ RTop, RBott float64 // row top/bottom
+ HTop, HBott float64 // header top/bottom
+ HLeft, HRight float64 // header left/right
+ H int // header index
+ C int // column index
+ CLeft, CRight float64 // column left/right
+ SP int // spanning cell index
+}
+
+// Bounds returns the box in the order (X0, Top, X1, Bottom),
+// i.e. left, top, right, bottom.
+func (b TextBox) Bounds() (float64, float64, float64, float64) {
+ return b.X0, b.Top, b.X1, b.Bottom
+}
+
+// Position represents a parsed position tag from @@...## format.
+//
+// Python: pdf_parser.py:1872 extract_positions()
+//
+// Format: @@{page_range}\t{left}\t{right}\t{top}\t{bottom}##
+// Example: "@@0-1\t50.0\t300.0\t200.0\t400.0##"
+//
+// Note the field order inside the tag: left and right come before top and
+// bottom.
+type Position struct {
+ PageNumbers []int // e.g. [0, 1] for cross-page content
+ Left float64 // second tag field
+ Right float64 // third tag field
+ Top float64 // fourth tag field
+ Bottom float64 // fifth tag field
+}
+
+// Section represents a text segment with its spatial position on a PDF page.
+// This is the primary output of layout analysis, consumed by NLP merge/split.
+//
+// Python equivalent: sections elements in naive.py::chunk()
+//
+// [(text_with_tags, position_tag_string), ...]
+type Section struct {
+ Text string // text content
+ PositionTag string // raw "@@...##" position tag; see Position for the tab-separated field layout
+ LayoutType string // "text", "table", "title", "figure", ...
+ Positions []Position // parsed from PositionTag
+ TableItem *TableItem // non-nil when this section is a table
+ Image string // base64-encoded PNG of the cropped region (Python: b["image"])
+}
+
+// CollectFigures filters sections down to those whose LayoutType equals
+// LayoutTypeFigure, preserving their order. The comparison is exact
+// (case-sensitive).
+//
+// A nil input yields nil; any non-nil input yields a non-nil slice, which is
+// empty when no figure is present.
+func CollectFigures(sections []Section) []Section {
+	if sections == nil {
+		return nil
+	}
+	figures := []Section{}
+	for i := range sections {
+		if sections[i].LayoutType != LayoutTypeFigure {
+			continue
+		}
+		figures = append(figures, sections[i])
+	}
+	return figures
+}
+
+// TableItem represents a detected table or figure region.
+//
+// Python equivalent: tables elements in naive.py::chunk()
+//
+// [((img, rows), positions), ...]
+type TableItem struct {
+ ImageB64 string // base64-encoded PNG of the table/figure region
+ // Rows is the legacy row-by-row text of the table.
+ //
+ // Deprecated: replaced by Cells; kept for batch output compat.
+ Rows [][]string
+ Cells []TSRCell // raw TSR cells in crop pixel space
+ Positions []Position // spatial positions (PDF points, pre-merge)
+ Scale float64 // zoom factor for coordinate conversion
+ CropOffX float64 // crop origin X in pixel space
+ CropOffY float64 // crop origin Y in pixel space
+ Caption string // caption text merged from adjacent caption box
+
+ // DLA table region boundaries in PDF point space (72 DPI).
+ // Matches Python's cropout using DLA layout region boundaries
+ // instead of text box anchor coordinates.
+ RegionLeft, RegionRight, RegionTop, RegionBottom float64
+
+ // NoMerge prevents cross-page merging for this table. Python's
+ // _extract_table_figure adds table keys to nomerge_lout_no when
+ // the next box is a caption/title/reference, indicating the table
+ // group ended and should not merge with its continuation.
+ NoMerge bool
+
+ // Grid is the row-column grid produced by TableBuilder.GroupCells.
+ // Consumed by constructTable Path 1 and annotateTableBoxes.
+ // Nil for tables without TSR cells (fallback paths use boxes instead).
+ Grid [][]TSRCell
+}
+
+// ParserConfig holds parser configuration.
+//
+// Python equivalent: kwargs merged with parser_config in task_executor.py
+//
+// DefaultParserConfig sets Zoom, ToPage and ChunkSize explicitly (3, -1 and
+// 50); every other field starts at its zero value.
+type ParserConfig struct {
+ Zoom float64 // zoom factor for page rendering, default 3
+ FromPage int // 0-based start page
+ ToPage int // 0-based end page (-1 = all)
+ TableContextSize int // tokens of surrounding context for tables
+ ImageContextSize int // tokens of surrounding context for images
+ AutoRotateTables *bool // enable auto table rotation detection
+ SeparateTablesFigs bool // separate tables and figures
+ SortByTop bool // true = Top-based sort (parity tests); false = Bottom (production)
+ ChunkSize int // pages per chunk (0 = default 50, matching Python batch_size)
+ SkipOCR bool // true = DLA+TSR only, no image OCR (matching Python SKIP_OCR=1)
+ MaxOCRConcurrency int // max concurrent OCR pages (0 = sequential); matches Python PARALLEL_DEVICES
+ TableBuilder TableBuilder // TSR model adapter; injected by caller via NewTableBuilderFor
+}
+
+// DefaultParserConfig returns a ParserConfig with sensible defaults.
+func DefaultParserConfig() ParserConfig {
+ return ParserConfig{
+ Zoom: 3,
+ FromPage: 0,
+ ToPage: -1,
+ ChunkSize: 50,
+ TableContextSize: 0,
+ ImageContextSize: 0,
+ SeparateTablesFigs: false,
+ }
+}
+
+// DetectGarbled returns true if a page's text is likely garbled due to
+// font encoding issues, indicating OCR is needed.
+//
+// This is a convenience wrapper around IsGarbledByFontEncoding, called with
+// a fixed second argument of 20.
+// NOTE(review): the meaning of that 20 is defined by IsGarbledByFontEncoding,
+// which is not visible from this file — confirm before changing it.
+//
+// Python: pdf_parser.py:264 _is_garbled_by_font_encoding()
+func DetectGarbled(chars []TextChar) bool {
+ return IsGarbledByFontEncoding(chars, 20)
+}
+
+// HasColor reports whether a character has visible color (as opposed to
+// invisible white-on-white text).
+//
+// Python: pdf_parser.py:190 _has_color()
+//
+// The Go implementation always answers true: all extracted chars are assumed
+// visible since the PDF engine handles rendering internally. The argument is
+// therefore unused.
+func HasColor(_ TextChar) bool {
+	return true
+}
+
+// ── DeepDoc interfaces (shared between cgo and non-cgo builds) ──────────
+
+// ModelType identifies the DeepDoc TSR model flavour.
+// DocAnalyzer.ModelType reports which flavour an analyzer uses.
+type ModelType string
+
+// Known TSR model flavours; they differ in the shape of the TSR output.
+const (
+ ModelSaas ModelType = "saas" // cpu DeepDoc — cell-level TSR output
+ ModelOSS ModelType = "oss" // oss DeepDoc — column/row line TSR output
+)
+
+// Layout type constants — used for LayoutType field comparisons across
+// the pipeline. Values match DLA label taxonomy.
+//
+// All values are lowercase and are compared exactly, so "Figure" does not
+// equal LayoutTypeFigure.
+const (
+ LayoutTypeText = "text"
+ LayoutTypeTable = "table"
+ LayoutTypeFigure = "figure"
+ LayoutTypeEquation = "equation"
+ LayoutTypeTitle = "title"
+ LayoutTypeReference = "reference"
+ LayoutTypeFooter = "footer"
+ LayoutTypeHeader = "header"
+
+ // Compound DLA labels (used in priority-ordered annotation matching).
+ DLALabelFigureCaption = "figure caption"
+ DLALabelTableCaption = "table caption"
+)
+
+// DocAnalyzer abstracts DeepDoc vision operations so the Parser can
+// work with either a live service or a test mock.
+// I/O methods accept a context for cancellation and deadline propagation.
+type DocAnalyzer interface {
+ // DLA runs layout analysis on a page image and returns its regions.
+ DLA(ctx context.Context, pageImage image.Image) ([]DLARegion, error)
+ // TSR runs table structure recognition on a cropped image.
+ TSR(ctx context.Context, cropped image.Image) ([]TSRCell, error)
+ // OCRDetect returns the text regions detected in a cropped image.
+ OCRDetect(ctx context.Context, cropped image.Image) ([]OCRBox, error)
+ // OCRRecognize returns the text recognized in a cropped image.
+ OCRRecognize(ctx context.Context, cropped image.Image) ([]OCRText, error)
+ // OCRRecognizeBatch recognizes several images in one call.
+ // NOTE(review): results and errors are presumably index-aligned with the
+ // input slice — confirm against the implementations.
+ OCRRecognizeBatch(ctx context.Context, cropped []image.Image) ([][]OCRText, []error)
+ // Health takes no context, unlike the I/O methods above.
+ Health() bool
+ // ModelType reports the TSR model flavour of this analyzer.
+ ModelType() ModelType
+}
+
+// OCRBox represents a detected text region from DeepDoc OCR detection.
+// DeepDoc /predict/ocr?operator=det returns:
+//
+// {"output": [[[[[x0,y0],[x1,y1],[x2,y2],[x3,y3]], ...]]]}
+//
+// The region is a quadrilateral given by four corner points (Xn, Yn).
+// NOTE(review): the corner order is not established in this file — confirm.
+type OCRBox struct {
+ X0, Y0, X1, Y1, X2, Y2, X3, Y3 float64
+}
+
+// OCRText represents recognized text with confidence from DeepDoc OCR rec.
+// DeepDoc /predict/ocr?operator=rec returns:
+//
+// {"output": [[[["text", confidence], ...]]]}
+type OCRText struct {
+ Text string // recognized text
+ Confidence float64 // confidence reported alongside the text
+}
+
+// DLARegion represents one detected layout region.
+type DLARegion struct {
+ X0, Y0, X1, Y1 float64 // two opposite corners of the region
+ Label string // layout label; see the LayoutType*/DLALabel* constants
+ Confidence float64
+}
+
+// Bounds returns the region's box in the order (X0, Y0, X1, Y1).
+func (r DLARegion) Bounds() (float64, float64, float64, float64) {
+ return r.X0, r.Y0, r.X1, r.Y1
+}
+
+// TSRCell represents one table cell from TSR.
+type TSRCell struct {
+ X0, Y0, X1, Y1 float64 // two opposite corners of the cell
+ Text string
+ Label string // "table", "table row", "table column", etc.
+}
+
+// Bounds returns the cell's box in the order (X0, Y0, X1, Y1).
+func (c TSRCell) Bounds() (float64, float64, float64, float64) {
+ return c.X0, c.Y0, c.X1, c.Y1
+}
diff --git a/internal/deepdoc/parser/pdf/types_test.go b/internal/deepdoc/parser/pdf/types_test.go
new file mode 100644
index 0000000000..7076f6a5bf
--- /dev/null
+++ b/internal/deepdoc/parser/pdf/types_test.go
@@ -0,0 +1,116 @@
+package parser
+
+import (
+ "testing"
+)
+
+// TestCollectFigures pins the contract of CollectFigures: only sections whose
+// LayoutType is exactly "figure" are returned, in input order and with their
+// fields intact; nil input gives nil, while non-nil input always gives a
+// non-nil slice.
+func TestCollectFigures(t *testing.T) {
+ // Figures are picked out of a mix of layout types, order preserved.
+ t.Run("mixed layout types", func(t *testing.T) {
+ sections := []Section{
+ {LayoutType: "figure", Text: "fig1", Image: "img1"},
+ {LayoutType: "text", Text: "text1"},
+ {LayoutType: "table", Text: "tbl1"},
+ {LayoutType: "figure", Text: "fig2", Image: "img2"},
+ {LayoutType: "title", Text: "title1"},
+ }
+ figures := CollectFigures(sections)
+ if len(figures) != 2 {
+ t.Fatalf("expected 2 figures, got %d", len(figures))
+ }
+ if figures[0].Text != "fig1" || figures[0].Image != "img1" {
+ t.Errorf("first figure: expected (fig1, img1), got (%s, %s)", figures[0].Text, figures[0].Image)
+ }
+ if figures[1].Text != "fig2" || figures[1].Image != "img2" {
+ t.Errorf("second figure: expected (fig2, img2), got (%s, %s)", figures[1].Text, figures[1].Image)
+ }
+ })
+
+ t.Run("no figures", func(t *testing.T) {
+ sections := []Section{
+ {LayoutType: "text", Text: "text1"},
+ {LayoutType: "table", Text: "tbl1"},
+ {LayoutType: "title", Text: "title1"},
+ }
+ figures := CollectFigures(sections)
+ if len(figures) != 0 {
+ t.Fatalf("expected 0 figures, got %d", len(figures))
+ }
+ })
+
+ // nil in, nil out.
+ t.Run("nil input", func(t *testing.T) {
+ figures := CollectFigures(nil)
+ if figures != nil {
+ t.Fatalf("expected nil for nil input, got %d elements", len(figures))
+ }
+ })
+
+ // An empty but non-nil input must not collapse to nil.
+ t.Run("empty input", func(t *testing.T) {
+ figures := CollectFigures([]Section{})
+ if figures == nil {
+ t.Fatal("expected empty slice (not nil) for empty input")
+ }
+ if len(figures) != 0 {
+ t.Fatalf("expected 0 figures, got %d", len(figures))
+ }
+ })
+
+ t.Run("all figures", func(t *testing.T) {
+ sections := []Section{
+ {LayoutType: "figure", Text: "fig1"},
+ {LayoutType: "figure", Text: "fig2"},
+ {LayoutType: "figure", Text: "fig3"},
+ }
+ figures := CollectFigures(sections)
+ if len(figures) != 3 {
+ t.Fatalf("expected 3 figures, got %d", len(figures))
+ }
+ })
+
+ t.Run("figure with empty image", func(t *testing.T) {
+ sections := []Section{
+ {LayoutType: "figure", Text: "fig1", Image: ""},
+ {LayoutType: "figure", Text: "fig2", Image: "img2"},
+ }
+ figures := CollectFigures(sections)
+ if len(figures) != 2 {
+ t.Fatalf("expected 2 figures, got %d", len(figures))
+ }
+ // Figure with empty image is still collected — downstream should handle.
+ if figures[0].Image != "" {
+ t.Errorf("first figure: expected empty Image, got %s", figures[0].Image)
+ }
+ })
+
+ t.Run("single section, figure", func(t *testing.T) {
+ figures := CollectFigures([]Section{
+ {LayoutType: "figure", Text: "only", Image: "img"},
+ })
+ if len(figures) != 1 {
+ t.Fatalf("expected 1 figure, got %d", len(figures))
+ }
+ })
+
+ t.Run("single section, not figure", func(t *testing.T) {
+ figures := CollectFigures([]Section{
+ {LayoutType: "text", Text: "only"},
+ })
+ if len(figures) != 0 {
+ t.Fatalf("expected 0 figures, got %d", len(figures))
+ }
+ })
+
+ // The layout type comparison is exact: other casings are not figures.
+ t.Run("case sensitive", func(t *testing.T) {
+ sections := []Section{
+ {LayoutType: "Figure", Text: "fig1"},
+ {LayoutType: "FIGURE", Text: "fig2"},
+ {LayoutType: "figure", Text: "fig3"},
+ }
+ figures := CollectFigures(sections)
+ if len(figures) != 1 {
+ t.Fatalf("only lowercase 'figure' should match, got %d", len(figures))
+ }
+ if figures[0].Text != "fig3" {
+ t.Errorf("expected fig3, got %s", figures[0].Text)
+ }
+ })
+}
diff --git a/internal/deepdoc/parser/pdf/ycoord_test.go b/internal/deepdoc/parser/pdf/ycoord_test.go
new file mode 100644
index 0000000000..7f9d6b5a4b
--- /dev/null
+++ b/internal/deepdoc/parser/pdf/ycoord_test.go
@@ -0,0 +1,214 @@
+//go:build cgo && manual
+
+package parser
+
+import (
+ "math"
+ "os"
+ "path/filepath"
+ "testing"
+
+ "ragflow/internal/deepdoc/parser/pdf/pdfoxide"
+)
+
+// ── Y-coordinate tests ──────────────────────────────────────────────────
+
+ // openTestingPDF loads the fixture called name from testdata/real_pdfs/ and
+ // returns whatever openPDF yields for it.
+ //
+ // When os.Stat reports that the fixture does not exist, the calling test is
+ // skipped rather than failed: the fixtures are optional and this file is only
+ // compiled under the "manual" build tag.
+ func openTestingPDF(t *testing.T, name string) (PDFEngine, *pdfoxide.Document) {
+ 	t.Helper()
+
+ 	fixtureDir := filepath.Join("testdata", "real_pdfs")
+ 	_, statErr := os.Stat(filepath.Join(fixtureDir, name))
+ 	if os.IsNotExist(statErr) {
+ 		t.Skipf("test PDF not found: %s", name)
+ 	}
+
+ 	// Any other Stat outcome (success or a different error) falls through to openPDF.
+ 	return openPDF(t, fixtureDir, name)
+ }
+
+// TestYCoord_SameLineCharsHaveEqualBottom checks that characters on the same
+// PDF text line (same baseline) have identical Bottom values. Bottom =
+// pageHeight - c.Y is derived from the screen-space baseline, which is the
+// same for all chars on a line regardless of font size or descent.
+func TestYCoord_SameLineCharsHaveEqualBottom(t *testing.T) {
+ eng, _ := openTestingPDF(t, "RAG分词召回分析.pdf")
+
+ chars, err := eng.ExtractChars(0)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if len(chars) == 0 {
+ t.Fatal("no chars")
+ }
+
+ lines := groupCharsToLines(chars, false)
+ for li, line := range lines {
+ if len(line) <= 1 {
+ continue
+ }
+ refBottom := line[0].Bottom
+ for _, c := range line[1:] {
+ if math.Abs(c.Bottom-refBottom) > 0.1 {
+ t.Errorf("line %d: char %q has Bottom=%.2f, expected ~%.2f (delta=%.2f)",
+ li, c.Text, c.Bottom, refBottom, c.Bottom-refBottom)
+ }
+ }
+ }
+}
+
+ // TestYCoord_BottomEqualsTopPlusHeight checks, for every character on page 0,
+ // that Bottom equals Top plus the character height within 0.01.
+ //
+ // The test fails when page 0 yields no characters, so it cannot pass
+ // vacuously, and it reports NaN deltas explicitly (a plain "delta > 0.01"
+ // comparison is false for NaN and would let broken coordinates through).
+ //
+ // NOTE(review): the height used here is derived as Bottom - Top, so for finite
+ // values the identity holds by construction up to rounding; in practice this
+ // test only catches NaN or infinite coordinates. If the char type exposes an
+ // independently computed height, compare against that instead — TODO confirm.
+ func TestYCoord_BottomEqualsTopPlusHeight(t *testing.T) {
+ 	eng, _ := openTestingPDF(t, "RAG分词召回分析.pdf")
+
+ 	chars, err := eng.ExtractChars(0)
+ 	if err != nil {
+ 		t.Fatal(err)
+ 	}
+ 	// Same guard as the sibling Y-coordinate tests: an empty page proves nothing.
+ 	if len(chars) == 0 {
+ 		t.Fatal("no chars")
+ 	}
+
+ 	for _, c := range chars {
+ 		h := c.Bottom - c.Top
+ 		expected := c.Top + h
+ 		delta := math.Abs(c.Bottom - expected)
+ 		// Infinite Top/Bottom produce Inf-Inf = NaN here, hence the IsNaN test.
+ 		if math.IsNaN(delta) || delta > 0.01 {
+ 			t.Errorf("char %q: Bottom=%.4f, Top=%.4f+Height=%.4f=%.4f, delta=%v",
+ 				c.Text, c.Bottom, c.Top, h, expected, delta)
+ 		}
+ 	}
+ }
+
+ // TestYCoord_XUnchanged compares the horizontal extent of the characters
+ // returned by eng.ExtractChars(0) with the raw characters returned by
+ // doc.Inner.ExtractChars(0). Each raw (X, Width) pair is recorded in a set and
+ // every pipeline character's (X0, X1-X0) pair is looked up in it using exact
+ // float equality.
+ //
+ // A pair that is missing from the raw set is only logged, never reported as a
+ // failure (the original note says such characters may have been deduped), so
+ // the test fails only when extraction errors out or returns no characters.
+ func TestYCoord_XUnchanged(t *testing.T) {
+ 	eng, doc := openTestingPDF(t, "RAG分词召回分析.pdf")
+
+ 	pipelineChars, err := eng.ExtractChars(0)
+ 	switch {
+ 	case err != nil:
+ 		t.Fatal(err)
+ 	case len(pipelineChars) == 0:
+ 		t.Fatal("no chars")
+ 	}
+
+ 	rawChars, err := doc.Inner.ExtractChars(0)
+ 	switch {
+ 	case err != nil:
+ 		t.Fatal(err)
+ 	case len(rawChars) == 0:
+ 		t.Fatal("no raw chars")
+ 	}
+
+ 	// span identifies a character by its left edge and width.
+ 	type span struct {
+ 		left, width float64
+ 	}
+
+ 	known := make(map[span]struct{}, len(rawChars))
+ 	for _, rc := range rawChars {
+ 		known[span{left: float64(rc.X), width: float64(rc.Width)}] = struct{}{}
+ 	}
+
+ 	for _, pc := range pipelineChars {
+ 		key := span{left: pc.X0, width: pc.X1 - pc.X0}
+ 		if _, found := known[key]; found {
+ 			continue
+ 		}
+ 		t.Logf("pipeline char %q X0=%.1f W=%.1f not in raw set (may be deduped)",
+ 			pc.Text, key.left, key.width)
+ 	}
+ }
+
+// TestYCoord_EmptyPageNoPanic ensures extracting chars from an empty page
+// (out of range) returns an error, not panics.
+func TestYCoord_EmptyPageNoPanic(t *testing.T) {
+ eng, _ := openTestingPDF(t, "RAG分词召回分析.pdf")
+
+ _, err := eng.ExtractChars(9999)
+ if err == nil {
+ t.Error("expected error for out-of-range page, got nil")
+ }
+}
+
+ // TestYCoord_RenderedImageDimensionsMatchPage renders page 0 via
+ // RenderPageImage(0, 72) and requires a non-nil image whose bounds have a
+ // non-zero width and height.
+ //
+ // NOTE(review): despite its name, the test does not compare the rendered size
+ // with the page's CropBox; only zero-sized output is rejected.
+ func TestYCoord_RenderedImageDimensionsMatchPage(t *testing.T) {
+ 	eng, _ := openTestingPDF(t, "RAG分词召回分析.pdf")
+
+ 	img, err := eng.RenderPageImage(0, 72)
+ 	switch {
+ 	case err != nil:
+ 		t.Fatal(err)
+ 	case img == nil:
+ 		t.Fatal("rendered image is nil")
+ 	}
+
+ 	bounds := img.Bounds()
+ 	width, height := bounds.Dx(), bounds.Dy()
+ 	if width == 0 || height == 0 {
+ 		t.Errorf("rendered image has 0 dimensions: %dx%d", width, height)
+ 	}
+ }
+
+ // TestYCoord_MultiPageConsistency walks every page of a multi-page fixture and
+ // reports any character whose Top is negative or whose Bottom is not strictly
+ // greater than its Top. The test is skipped when the document has fewer than
+ // two pages; a page whose extraction fails is reported and then skipped over.
+ //
+ // NOTE(review): no upper bound (page height) is checked here, only Top >= 0.
+ func TestYCoord_MultiPageConsistency(t *testing.T) {
+ 	eng, _ := openTestingPDF(t, "20240815-华福证券-海光信息-688041.SH-中报略超预告中值_新增适配AI大模型通义千问_4页_467kb.pdf")
+
+ 	pageCount, err := eng.PageCount()
+ 	if err != nil {
+ 		t.Fatal(err)
+ 	}
+ 	if pageCount < 2 {
+ 		t.Skip("need multi-page PDF")
+ 	}
+
+ 	for pg := 0; pg < pageCount; pg++ {
+ 		pageChars, extractErr := eng.ExtractChars(pg)
+ 		if extractErr != nil {
+ 			t.Errorf("page %d: ExtractChars: %v", pg, extractErr)
+ 			continue
+ 		}
+ 		// A page without characters simply contributes no iterations below.
+ 		for _, ch := range pageChars {
+ 			if ch.Top < 0 {
+ 				t.Errorf("page %d char %q: Top=%.2f < 0", pg, ch.Text, ch.Top)
+ 			}
+ 			if ch.Bottom <= ch.Top {
+ 				t.Errorf("page %d char %q: Bottom=%.2f <= Top=%.2f", pg, ch.Text, ch.Bottom, ch.Top)
+ 			}
+ 		}
+ 	}
+ }
+
+ // TestYCoord_CropBoxUsedNotMediaBox checks that no character on page 0 has a
+ // Top at or beyond the CropBox height reported by PageInfo.
+ //
+ // The test is skipped when the CropBox height is not positive, or when it
+ // equals info.Height (treated by the original author as the MediaBox height —
+ // TODO confirm), because then the two boxes cannot be told apart.
+ func TestYCoord_CropBoxUsedNotMediaBox(t *testing.T) {
+ 	eng, doc := openTestingPDF(t, "RAG分词召回分析.pdf")
+
+ 	pageInfo, err := doc.Inner.PageInfo(0)
+ 	if err != nil {
+ 		t.Fatal(err)
+ 	}
+ 	if pageInfo.CropBox.Height <= 0 {
+ 		t.Skip("test PDF doesn't have CropBox")
+ 	}
+
+ 	// Extraction is validated before the equal-height skip, as in the original.
+ 	pageChars, err := eng.ExtractChars(0)
+ 	switch {
+ 	case err != nil:
+ 		t.Fatal(err)
+ 	case len(pageChars) == 0:
+ 		t.Fatal("no chars")
+ 	}
+
+ 	fullHeight := float64(pageInfo.Height)
+ 	cropHeight := float64(pageInfo.CropBox.Height)
+ 	if fullHeight == cropHeight {
+ 		t.Skip("MediaBox == CropBox, no offset to test")
+ 	}
+
+ 	for _, ch := range pageChars {
+ 		if ch.Top < cropHeight {
+ 			continue
+ 		}
+ 		t.Errorf("char %q Top=%.2f >= CropBox height %.2f", ch.Text, ch.Top, cropHeight)
+ 	}
+ }
| | |