//go:build manual package parser import ( "bytes" "context" "encoding/base64" "image" "regexp" "strings" "testing" ) // ============================================================================= // Issue 1: Figure insertion strategy // Python's insert_table_figures(figs, "figure") inserts figure boxes back into // self.boxes. Go's extractTableAndReplace only handles LayoutType=="table", // leaving figure boxes in the list. This test documents the current behavior. // ============================================================================= // TestExtractTableAndReplace_IgnoresFigures documents that extractTableAndReplace // does NOT pop or replace figure boxes. In Python's _extract_table_figure, // figure boxes are popped and re-inserted via insert_table_figures with cropped // images. Go leaves them in the box list for downstream boxesToSections. func TestExtractTableAndReplace_IgnoresFigures(t *testing.T) { boxes := []TextBox{ {X0: 10, X1: 200, Top: 0, Bottom: 50, Text: "Figure text", LayoutType: "figure", PageNumber: 0}, {X0: 10, X1: 200, Top: 60, Bottom: 80, Text: "表1:标题", LayoutType: "table", PageNumber: 0}, } // Table with cells so extractTableAndReplace generates HTML. tables := []TableItem{{ Cells: []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "A", Label: "table row"}}, Positions: []Position{{Left: 0, Right: 300, Top: 0, Bottom: 100}}, Scale: 1.0, }} result := extractTableAndReplace(boxes, tables) // BUG: Figure box is still present — it was not popped or replaced. // Python's _extract_table_figure pops figure boxes and re-inserts them // via insert_table_figures with cropped images. hasFigure := false for _, b := range result { if b.LayoutType == "figure" { hasFigure = true // Figure text is still raw text, not a consolidated image+text block // like Python's insert_table_figures would produce. if b.Text != "Figure text" { t.Errorf("figure text should be unchanged, got %q", b.Text) } } } if !hasFigure { t.Error("BUG EXPOSED: extractTableAndReplace removed figure box (unexpected)") } t.Log("NOTE: Figure box remains in list as raw text. Python inserts figures back with cropped images via insert_table_figures. Go collects figures separately via CollectFigures without re-inserting.") } // TestBoxesToSections_FiguresNotReinserted documents that boxesToSections converts // figure boxes to sections but without the consolidated image that Python's // insert_table_figures would attach. func TestBoxesToSections_FiguresNotReinserted(t *testing.T) { // Simulate post-extractTableAndReplace boxes with figures still present. boxes := []TextBox{ {X0: 10, X1: 200, Top: 0, Bottom: 50, Text: "Some text", LayoutType: "text", PageNumber: 0}, {X0: 10, X1: 200, Top: 60, Bottom: 100, Text: "Figure description", LayoutType: "figure", PageNumber: 0}, } sections := boxesToSections(boxes, nil) figures := CollectFigures(sections) // BUG: figures are collected separately but NOT re-inserted into sections // after image processing. In Python, insert_table_figures(figs, "figure") // creates new boxes with layout_type="figure", image=cropped_img, and // inserts them at the nearest position among text boxes. if len(figures) != 1 { t.Fatalf("expected 1 figure, got %d", len(figures)) } if figures[0].LayoutType != "figure" { t.Errorf("expected LayoutType 'figure', got %q", figures[0].LayoutType) } // Figure image is empty at this stage (cropSectionImage runs later in pipeline). if figures[0].Image != "" { t.Log("figure has image (cropSectionImage already ran)") } else { t.Log("NOTE: Figure section has no Image yet. Python's cropout creates a consolidated cropped image for the entire figure region before insert_table_figures.") } t.Logf("Sections count: %d (figure present as raw text section)", len(sections)) t.Logf("Figures count: %d (collected separately, Python re-inserts them)", len(figures)) } // ============================================================================= // Issue 2a: blockType classification missing // Python's construct_table classifies each cell into 9 types (Dt/Nu/Ca/En/NE/ // Sg/Tx/Lx/Nr/Ot). The dominant type drives header detection: if max_type is // "Nu" (numeric), numeric cells don't count as headers. Go's headerSet only // checks TSR labels — no cell content type analysis. // ============================================================================= // TestConstructTable_HeaderDetection_NoBlockType documents that Go's header // detection is purely TSR-label-based. Python would use blockType to skip // numeric cells when the dominant type is "Nu". func TestConstructTable_HeaderDetection_NoBlockType(t *testing.T) { // A table where the "header" row has numeric content (like years, amounts). // With blockType: "2020","2021" → Nu, "100","200" → Nu — maxType=Nu. // block-type-aware detection skips Nu cells → 0 headers. // Falls back to TSR label-based detection → still gets 2 . cells := []TSRCell{ {X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "2020", Label: "table column header"}, {X0: 101, Y0: 0, X1: 200, Y1: 30, Text: "2021", Label: "table column header"}, {X0: 0, Y0: 35, X1: 100, Y1: 65, Text: "100", Label: "table row"}, {X0: 101, Y0: 35, X1: 200, Y1: 65, Text: "200", Label: "table row"}, } item := &TableItem{} html := constructTable(cells, nil, "", item) // FIX VERIFIED: headerSetWithBlockType computes block types (all "Nu"), // skips Nu headers when maxType=Nu, then falls back to TSR label detection. // Header row still gets because TSR labels contain "header". thCount := strings.Count(html, ", got %d. HTML: %s", thCount, html) } t.Log("FIX: blockType classification added. maxType=Nu skips Nu headers in primary pass.") t.Log("TSR label fallback still marks header rows with 'header' in label.") } // TestConstructTable_BlockType_DominantTypeMissing documents that Go has no // concept of a "dominant cell type" that Python uses for header detection. func TestConstructTable_BlockType_DominantTypeMissing(t *testing.T) { // Mixed table with numeric-dominant data, testing blockType header detection. // "年份"/"金额" → Tx (short text), "2020"/"1000"/etc → Nu. maxType=Nu. // Header cells are non-Nu → count as headers even under Nu-dominant logic. // FIX: blockType now classifies cells and drives header detection. cells := []TSRCell{ {X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "年份", Label: "table column header"}, {X0: 101, Y0: 0, X1: 200, Y1: 30, Text: "金额", Label: "table column header"}, {X0: 0, Y0: 35, X1: 100, Y1: 65, Text: "2020", Label: "table row"}, {X0: 101, Y0: 35, X1: 200, Y1: 65, Text: "1000", Label: "table row"}, {X0: 0, Y0: 70, X1: 100, Y1: 100, Text: "2021", Label: "table row"}, {X0: 101, Y0: 70, X1: 200, Y1: 100, Text: "2000", Label: "table row"}, {X0: 0, Y0: 105, X1: 100, Y1: 135, Text: "2022", Label: "table row"}, {X0: 101, Y0: 105, X1: 200, Y1: 135, Text: "3000", Label: "table row"}, } item := &TableItem{} html := constructTable(cells, nil, "", item) thCount := strings.Count(html, " for non-numeric headers under Nu-dominant table, got %d. HTML: %s", thCount, html) } t.Log("FIX: blockType classifies '年份'/'金额' as non-Nu headers, '2020'/'1000' as Nu data.") t.Logf("blockType('年份')=%q blockType('2020')=%q", blockType("年份"), blockType("2020")) } // TestConstructTable_BlockTypeChangesHeaderDetection verifies blockType // changes header detection for a table WITHOUT TSR header labels. // This is the case where pure label-based detection would fail. func TestConstructTable_BlockTypeChangesHeaderDetection(t *testing.T) { // Table with NO "header" labels — label-based detection gives 0 headers. // blockType: "姓名"/"年龄" → Tx, "张三"/"25" → Ot/En/? — maxType varies. // With Nu-dominant data, non-Nu top row cells count as possible headers. cells := []TSRCell{ {X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "姓名", Label: "table row"}, {X0: 101, Y0: 0, X1: 200, Y1: 30, Text: "年龄", Label: "table row"}, {X0: 0, Y0: 35, X1: 100, Y1: 65, Text: "张三", Label: "table row"}, {X0: 101, Y0: 35, X1: 200, Y1: 65, Text: "25", Label: "table row"}, {X0: 0, Y0: 70, X1: 100, Y1: 100, Text: "李四", Label: "table row"}, {X0: 101, Y0: 70, X1: 200, Y1: 100, Text: "30", Label: "table row"}, {X0: 0, Y0: 105, X1: 100, Y1: 135, Text: "王五", Label: "table row"}, {X0: 101, Y0: 105, X1: 200, Y1: 135, Text: "28", Label: "table row"}, } html := constructTable(cells, nil, "", &TableItem{Grid: groupTSRCellsToRowsLabeled(cells)}) // blockType analysis: // "姓名"(Tx), "年龄"(Tx), "张三"(Ot), "25"(Nu), "李四"(Ot), "30"(Nu), "王五"(Ot), "28"(Nu) // maxType could be Ot(3), Nu(3), or Tx(2). // Fallback catches the case where no headers detected by block-type path. t.Logf("HTML:\n%s", html) t.Log("FIX: blockType+fallback header detection works for tables without TSR header labels") } // ============================================================================= // Issue 2b: colspan/rowspan missing // Python's __cal_spans computes colspan/rowspan from spanning cells by // clustering column centers and row centers. Go's rowsToHTML produces // a flat grid with no spanning attributes. // ============================================================================= // TestRowsToHTML_NoColspanRowspan documents that rowsToHTML never produces // colspan or rowspan attributes, even for spanning cells. func TestRowsToHTML_NoColspanRowspan(t *testing.T) { // Two rows with a spanning cell in row 0. // In Python, a "table spanning cell" covering columns 0-1 would get colspan=2. rows := [][]TSRCell{ { {Text: "跨列标题", Label: "table spanning cell"}, {Text: "", Label: ""}, // padded cell }, { {Text: "数据A", Label: "table row"}, {Text: "数据B", Label: "table row"}, }, } html := rowsToHTML(rows, "", nil, nil, nil) // BUG: No colspan or rowspan attributes in output. if strings.Contains(html, "colspan") { t.Error("unexpected: colspan found in output (should not be present without __cal_spans)") } if strings.Contains(html, "rowspan") { t.Error("unexpected: rowspan found in output (should not be present without __cal_spans)") } // The spanning cell is rendered as a plain with text, and the padded // empty cell is also rendered as an empty . Python would merge them. tdCount := strings.Count(html, " cells (flat grid, spanning cell + padded empty cell both rendered)", tdCount) } else { t.Logf("Got %d cells. HTML:\n%s", tdCount, html) } t.Log("NOTE: Python's __cal_spans clusters column centers within spanning cells") t.Log("to compute colspan/rowspan. Go outputs a flat grid without spanning attributes.") } // TestConstructTable_SpannedTable_NoMerge documents the full constructTable // path with spanning cells — no colspan/rowspan in output. func TestConstructTable_SpannedTable_NoMerge(t *testing.T) { // Spanning cell at same Y as row cells so groupTSRCellsToRowsLabeled // puts them in the same row group. The spanning cell covers X=0-200 // (both columns); Python's __cal_spans would give it colspan=2. cells := []TSRCell{ // Row 0: a spanning cell that covers both columns + one regular cell. {X0: 0, Y0: 0, X1: 200, Y1: 30, Text: "部门开支汇总", Label: "table spanning cell"}, {X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "Q1", Label: "table row"}, {X0: 101, Y0: 0, X1: 200, Y1: 30, Text: "Q2", Label: "table row"}, // Row 1: data row {X0: 0, Y0: 35, X1: 100, Y1: 65, Text: "100", Label: "table row"}, {X0: 101, Y0: 35, X1: 200, Y1: 65, Text: "200", Label: "table row"}, } item := &TableItem{} html := constructTable(cells, nil, "", item) // Verify colspan IS now detected (calSpans aligned with Python's __cal_spans). if !strings.Contains(html, "colspan") { t.Error("expected colspan on spanning cell, calSpans should detect it") } // Verify the HTML structure — spanning cell exists WITH colspan. if !strings.Contains(html, "部门开支汇总") { t.Error("spanning cell text missing") } if !strings.Contains(html, "Q1") { t.Error("Q1 cell should still be present (covered by span)") } t.Logf("HTML:\n%s", html) } // ============================================================================= // Issue 2c: Single column/row cleanup missing // Python's construct_table removes orphan columns (only one non-empty cell) // when ≥4 rows, and orphan rows when ≥4 columns. Go has no such cleanup. // ============================================================================= // TestConstructTable_OrphanColumn_NotCleanedUp documents that Go does NOT // remove columns that have only one non-empty cell. func TestConstructTable_OrphanColumn_NotCleanedUp(t *testing.T) { // 4 rows × 3 columns. Column index 1 has only ONE non-empty cell. // Python would relocate/merge that orphan column. cells := []TSRCell{ {X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "姓名", Label: "table column header"}, {X0: 101, Y0: 0, X1: 200, Y1: 30, Text: "备注", Label: "table row"}, // orphan col {X0: 201, Y0: 0, X1: 300, Y1: 30, Text: "年龄", Label: "table column header"}, {X0: 0, Y0: 35, X1: 100, Y1: 65, Text: "张三", Label: "table row"}, {X0: 101, Y0: 35, X1: 200, Y1: 65, Text: "", Label: "table row"}, // col 1 empty {X0: 201, Y0: 35, X1: 300, Y1: 65, Text: "25", Label: "table row"}, {X0: 0, Y0: 70, X1: 100, Y1: 100, Text: "李四", Label: "table row"}, {X0: 101, Y0: 70, X1: 200, Y1: 100, Text: "", Label: "table row"}, // col 1 empty {X0: 201, Y0: 70, X1: 300, Y1: 100, Text: "30", Label: "table row"}, {X0: 0, Y0: 105, X1: 100, Y1: 135, Text: "王五", Label: "table row"}, {X0: 101, Y0: 105, X1: 200, Y1: 135, Text: "", Label: "table row"}, // col 1 empty {X0: 201, Y0: 105, X1: 300, Y1: 135, Text: "28", Label: "table row"}, } item := &TableItem{} html := constructTable(cells, nil, "", item) // BUG: All 4 rows have 3 cells each (orphan column preserved). // Python's construct_table pops single-cell columns when ≥4 rows. trCount := strings.Count(html, "") totalTdTh := strings.Count(html, " 1.5 × median_height ≈ 15pt). // Each figure text box → separate section in result.Sections. // CollectFigures collects them into result.Figures but doesn't re-insert. var figureSections []Section for _, s := range result.Sections { if s.LayoutType == "figure" { figureSections = append(figureSections, s) } } // Assert 1: Python expects exactly 1 consolidated figure section. // Go currently produces 2 (one per unmerged text box) — this FAILS. if len(figureSections) != 1 { t.Errorf("FIGURE INSERTION BUG: expected 1 consolidated figure section (Python insert_table_figures), got %d. Go does not consolidate figure text boxes into a single block.", len(figureSections)) } // Assert 2: The single figure section must contain BOTH text fragments. if len(figureSections) == 1 { combined := figureSections[0].Text if !strings.Contains(combined, "架构图") || !strings.Contains(combined, "系统模块") { t.Errorf("FIGURE INSERTION BUG: figure section text=%q should contain both fragments. Python merges all figure-region text.", combined) } } t.Logf("figure sections in Sections: %d", len(figureSections)) t.Logf("result.Figures count: %d", len(result.Figures)) t.Logf("result.Sections total: %d", len(result.Sections)) for i, s := range result.Sections { t.Logf(" section[%d] layout=%q text=%q", i, s.LayoutType, s.Text) } } // ============================================================================= // Issue 3: Multi-page table merging // Python's _extract_table_figure merges tables with same layoutno across // consecutive pages (gap ≤ 1 page, Y-dis ≤ 23× median height). // Go's extractTableAndReplace does NOT merge tables across pages. // ============================================================================= // TestExtractTableAndReplace_NoCrossPageMerge exposes that extractTableAndReplace // does not merge tables from consecutive pages even with the same layoutno. func TestExtractTableAndReplace_NoCrossPageMerge(t *testing.T) { // Simulate a table spanning pages 0 and 1. // Python would merge these because: same layoutno, consecutive pages, // Y-distance ≤ 23× median_height. boxes := []TextBox{ {X0: 10, X1: 200, Top: 500, Bottom: 530, Text: "续表内容", LayoutType: "table", PageNumber: 0, LayoutNo: "0"}, {X0: 10, X1: 200, Top: 50, Bottom: 80, Text: "表尾内容", LayoutType: "table", PageNumber: 1, LayoutNo: "0"}, } // Two separate TableItems — one per page. Python would merge these // before insert_table_figures. tables := []TableItem{ { Cells: []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "Page0", Label: "table row"}}, Positions: []Position{{PageNumbers: []int{0}, Left: 0, Right: 300, Top: 500, Bottom: 530}}, Scale: 1.0, }, { Cells: []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "Page1", Label: "table row"}}, Positions: []Position{{PageNumbers: []int{1}, Left: 0, Right: 300, Top: 50, Bottom: 80}}, Scale: 1.0, }, } result := extractTableAndReplace(boxes, tables) // Go produces 2 separate HTML table boxes (one per page). // Python would produce 1 merged table with cells from both pages. tableCount := 0 for _, b := range result { if strings.Contains(b.Text, "") { tableCount++ } } if tableCount == 2 { t.Errorf("CROSS-PAGE TABLE MERGE BUG: got %d separate HTML tables across pages. Python would merge same-layoutno tables on consecutive pages into 1 consolidated table.", tableCount) } t.Logf("table HTML boxes: %d (Python would merge into 1)", tableCount) } // ============================================================================= // Issue 3a: nomerge_lout_no — don't merge tables separated by captions // Python's _extract_table_figure tracks nomerge_lout_no: when a table box // is followed by a caption/title/reference, the table's key is added to // nomerge_lout_no. Later, cross-page merge skips tables in nomerge_lout_no. // // Example: // Page 0: table "0-table-3" → caption "表1:..." → table "0-table-4" // Page 1: table "1-table-3" (same layoutNo) // → Page 0's table-3 should NOT merge with Page 1's table-3, // because the caption on page 0 indicates the table ended. // → Go's mergeTablesAcrossPages has no nomerge_lout_no check. // ============================================================================= // TestMergeTablesAcrossPages_NomergeAfterCaption_Missing exposes that // mergeTablesAcrossPages unconditionally merges consecutive-page tables, // even when Python's nomerge_lout_no would prevent it. func TestMergeTablesAcrossPages_NomergeAfterCaption_Missing(t *testing.T) { // Simulate: page 0 has table at top, followed by a caption, // then another table. Page 1 has the same-layoutNo table continuing. // In Python, page 0's first table goes into nomerge_lout_no because // the next box is a caption → no cross-page merge for that table group. tables := []TableItem{ { Cells: []TSRCell{{Text: "Page0-first", Label: "table row"}}, Positions: []Position{{ PageNumbers: []int{0}, Left: 0, Right: 300, Top: 0, Bottom: 50, }}, NoMerge: true, // Set when caption follows this table on the page }, { Cells: []TSRCell{{Text: "Page1-cont", Label: "table row"}}, Positions: []Position{{ PageNumbers: []int{1}, Left: 0, Right: 300, Top: 0, Bottom: 50, }}, }, } result := mergeTablesAcrossPages(tables, nil) // Verify NoMerge prevents cross-page merging. if len(result) != 2 { t.Errorf("NOMERGE BUG: expected 2 separate table groups, got %d.", len(result)) } t.Log("NoMerge flag correctly prevents cross-page merge.") } // ============================================================================= // Issue 3b: insert position — min_rectangle_distance vs anchor // Python's insert_table_figures uses min_rectangle_distance to find the // spatially nearest text box and inserts the table/figure next to it. // Go's extractTableAndReplace uses the first replaced table box index as // the anchor (insert position). // // When the DLA table region extends beyond the anchor box's bottom and // overlaps a text box below the table, Python puts the table next to that // overlapping text box (distance=0); Go puts it at the anchor position. // ============================================================================= // TestExtractTableAndReplace_InsertionPosition_DistanceBug exposes that // extractTableAndReplace uses the first table box as anchor, rather than // finding the spatially nearest text box like Python. func TestExtractTableAndReplace_InsertionPosition_DistanceBug(t *testing.T) { // Two text boxes above the table: L0 (left, near table) and R0 (right, far). // Python: nearest to table is L0 (dx=0, dy=70). L0 bottom=30 < table top=100 // → insert AFTER L0. Result: [L0, table, R0, R1, L2]. // Go: anchor = first table box (L1 at index 2). Result: [L0, R0, table, R1, L2]. // The table is one position off. boxes := []TextBox{ {X0: 10, X1: 100, Top: 10, Bottom: 30, Text: "L0", LayoutType: "text", PageNumber: 0}, {X0: 300, X1: 400, Top: 10, Bottom: 30, Text: "R0", LayoutType: "text", PageNumber: 0}, {X0: 10, X1: 100, Top: 100, Bottom: 130, Text: "table", LayoutType: "table", PageNumber: 0}, {X0: 300, X1: 400, Top: 100, Bottom: 130, Text: "R1", LayoutType: "text", PageNumber: 0}, {X0: 10, X1: 100, Top: 250, Bottom: 270, Text: "L2", LayoutType: "text", PageNumber: 0}, } tables := []TableItem{{ Cells: []TSRCell{{Text: "cell", Label: "table row"}}, Positions: []Position{{Left: 10, Right: 100, Top: 100, Bottom: 130, PageNumbers: []int{0}}}, Scale: 1.0, RegionLeft: 10, RegionRight: 100, RegionTop: 100, RegionBottom: 130, }} result := extractTableAndReplace(boxes, tables) // Find L0 and table positions. l0Idx, tableIdx := -1, -1 for i, b := range result { if strings.TrimSpace(b.Text) == "L0" { l0Idx = i } if b.LayoutType == "table" { tableIdx = i } } // BUG: table should immediately follow L0 (nearest neighbor, insert_after). // Python: min_rectangle_distance → L0 nearest (dx=0, dy=70), L0 below table // → insert_at+1 → table right after L0. // Go: anchor = first table box index → table at original table box position. if tableIdx != l0Idx+1 { t.Errorf("INSERTION POSITION BUG: table (idx=%d) should immediately follow L0 (idx=%d). "+ "Python's min_rectangle_distance finds L0 as nearest text box and inserts table after it. "+ "Go anchors at first table box position (between R0 and R1).", tableIdx, l0Idx) } t.Logf("L0 at idx=%d, table at idx=%d", l0Idx, tableIdx) t.Log("Fix: replace first-replaced-box anchor with min_rectangle_distance nearest-neighbor (Python pdf_parser.py:1608-1655).") } // ============================================================================= // Issue 4: page_cum_height coordinate system // Python tracks cumulative page image heights for cross-page position tags // and image cropping. Go uses per-page coordinates only. // ============================================================================= // TestBoxesToSections_PerPageCoordinates confirms position tags use // page-relative coordinates. Python's _line_tag also produces local // coordinates (subtracts page_cum_height). The page number differentiates // pages; page_cum_height is an internal implementation detail. func TestBoxesToSections_PerPageCoordinates(t *testing.T) { boxes := []TextBox{ {X0: 10, X1: 100, Top: 40, Bottom: 60, Text: "Page 0 text", LayoutType: "text", PageNumber: 0}, {X0: 10, X1: 100, Top: 40, Bottom: 60, Text: "Page 1 text", LayoutType: "text", PageNumber: 1}, } sections := boxesToSections(boxes, nil) if len(sections) != 2 { t.Fatalf("expected 2 sections, got %d", len(sections)) } s0, s1 := sections[0], sections[1] if len(s0.Positions) > 0 && len(s1.Positions) > 0 { p0, p1 := s0.Positions[0], s1.Positions[0] // Both Python and Go use local (page-relative) coordinates. // Python's _line_tag: top = bx["top"] - page_cum_height[pn-1] // gives local coordinate. Same as Go. if p0.Top != p1.Top || p0.Bottom != p1.Bottom { t.Errorf("expected same local coords, got Top=(%.0f,%.0f) Bottom=(%.0f,%.0f)", p0.Top, p1.Top, p0.Bottom, p1.Bottom) } t.Logf("page 0: Page=%v Top=%.0f Bottom=%.0f", p0.PageNumbers, p0.Top, p0.Bottom) t.Logf("page 1: Page=%v Top=%.0f Bottom=%.0f", p1.PageNumbers, p1.Top, p1.Bottom) t.Log("OK: position tags use page-relative coordinates in both Go and Python.") } } // ============================================================================= // Issue 6: cropSectionImage padding logic // Python's self.crop adds 120px context above first segment, 120px context // below last segment, 6px gap between pages, and overlay transparency. // Go has simpler crop logic. // ============================================================================= // TestCropSectionImage_PaddingVsPython documents that Go's cropSectionImage // adds context padding differently from Python's self.crop. func TestCropSectionImage_PaddingVsPython(t *testing.T) { // Create a page image and position tag for a small text region. img := image.NewRGBA(image.Rect(0, 0, 300, 800)) // 300×800 page at zoom=3 → PDF 100×267 pageImages := map[int]image.Image{0: img} // Position tag for a small text box near the top of the page. posTag := FormatPositionTag(0, 50.0, 100.0, 10.0, 30.0) result := cropSectionImage(posTag, pageImages, 3.0) if result == "" { t.Error("cropSectionImage returned empty string for valid position") } // Decode result to check image dimensions. data, err := base64.StdEncoding.DecodeString(result) if err != nil { t.Fatalf("failed to decode base64: %v", err) } cropped, _, err := image.Decode(bytes.NewReader(data)) if err != nil { t.Fatalf("failed to decode PNG: %v", err) } croppedH := cropped.Bounds().Dy() // Original text region: Top=10, Bottom=30 → height=20 at PDF points. // zoom=3 → 60px text height. // Python adds 120px context above + 120px below + 6px gap → ~306px. // Go adds contextPad=120 points above/below at PDF scale → with zoom=3: 360+60+360=780px. // Python uses pixel-space padding (120px literally), Go uses PDF-point padding (120pt). expectedMin := 60 // bare minimum: text region itself if croppedH <= expectedMin { t.Errorf("CROP PADDING BUG: cropped image height=%dpx, expected >%dpx with context padding. Python adds 120px above and below for context.", croppedH, expectedMin) } t.Logf("cropped image: %dx%d (text region 60px, expecting padding)", cropped.Bounds().Dx(), croppedH) t.Log("NOTE: Python's self.crop adds 120px context padding in pixel space, multi-page stitching, and overlay transparency. Go's cropSectionImage uses PDF-point padding and simpler stitching.") } // ============================================================================= // Issue 7: Data-source filter missing // Python's _extract_table_figure pops table/figure boxes matching // r"(数据|资料|图表)*来源[:: ]" (pdf_parser.py:1040-1042, 1050-1052). // These boxes are discarded — not extracted, not inserted back. // Go has no equivalent filter in extractTableAndReplace or consolidateFigures. // ============================================================================= // dataSourcePattern is a Go translation of Python's // r"(数据|资料|图表)*来源[:: ]" used with re.match (anchored at start). var dataSourcePattern = `^(数据|资料|图表)*来源[:: ]` // TestDataSourcePattern_RegexCoverage validates the Python regex behavior // that should be adopted. Documents which strings match and which don't. func TestDataSourcePattern_RegexCoverage(t *testing.T) { tests := []struct { text string want bool // Python re.match truthiness }{ // ── Matching patterns (should be filtered) ── {"数据来源:国家统计局", true}, // 数据 + 来源 + fullwidth colon {"资料来源: 某报告", true}, // 资料 + 来源 + halfwidth colon {"图表来源:某数据库", true}, // 图表 + 来源 + fullwidth colon {"来源:权威机构", true}, // zero prefix + 来源 + fullwidth colon {"来源: 参考数据", true}, // zero prefix + 来源 + halfwidth colon {"数据来源 说明", true}, // 数据 + 来源 + space // ── Non-matching patterns (should NOT be filtered) ── {"数据来源明细", false}, // 来源 followed by 明, not ::space {"普通来源说明", false}, // doesn't start with keyword {"数据", false}, // too short {"来源", false}, // 来源 but no ::space after {"资料来源说明", false}, // 来源 followed by 说, not ::space {"", false}, // empty {"TABLE 1: 数据来源统计", false}, // doesn't start with keyword } for _, tt := range tests { matched := regexp.MustCompile(dataSourcePattern).MatchString(tt.text) if matched != tt.want { t.Errorf("dataSourcePattern.MatchString(%q) = %v, want %v", tt.text, matched, tt.want) } } t.Log("NOTE: Python re.match(r\"(数据|资料|图表)*来源[:: ]\", text) — anchored at start.") t.Log("Go regexp.MatchString equivalent with ^ prefix.") } // TestExtractTableAndReplace_DataSourceFilter_Missing exposes that Go does NOT // filter out table boxes whose text matches r"(数据|资料|图表)*来源[:: ]". // Python's _extract_table_figure pops these boxes from self.boxes without // adding them to the tables dict (pdf_parser.py:1040-1042). func TestExtractTableAndReplace_DataSourceFilter_Missing(t *testing.T) { // A table box with data-source text and a normal table box. // Both overlap a TableItem position, so both would be replaced with HTML. boxes := []TextBox{ {X0: 10, X1: 200, Top: 0, Bottom: 50, Text: "数据来源:国家统计局", LayoutType: "table", PageNumber: 0}, {X0: 10, X1: 200, Top: 60, Bottom: 80, Text: "表1:正常数据", LayoutType: "table", PageNumber: 0}, } // Two TableItems — one per table box — so each would independently produce HTML. tables := []TableItem{ { Cells: []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "来源", Label: "table row"}}, Positions: []Position{{Left: 0, Right: 300, Top: 0, Bottom: 50}}, Scale: 1.0, }, { Cells: []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "正常", Label: "table row"}}, Positions: []Position{{Left: 0, Right: 300, Top: 60, Bottom: 80}}, Scale: 1.0, }, } result := extractTableAndReplace(boxes, tables) // Python behavior: "数据来源:国家统计局" is popped from self.boxes, // NOT added to tables dict, NOT replaced with HTML. Gone entirely. // "表1:正常数据" is replaced with HTML as usual. // Expected result: exactly 1 HTML table box for the normal table. // // BUG: Go replaces both boxes with HTML tables. The data-source box // produces an HTML table with cell text "来源" — this should NOT exist. htmlTableCount := 0 hasDataSourceTable := false for _, b := range result { if strings.Contains(b.Text, "
") { htmlTableCount++ // The data-source table's cell text "来源" ends up in the HTML. // c.f. constructTable which uses TSRCell text, not box text. if strings.Contains(b.Text, ">来源<") { hasDataSourceTable = true } } } if htmlTableCount != 1 { t.Errorf("DATA SOURCE FILTER BUG: expected 1 HTML table (normal only), got %d. Python pops data-source table box entirely in _extract_table_figure (pdf_parser.py:1040-1042). Go replaces it with an HTML table.", htmlTableCount) } if hasDataSourceTable { t.Errorf("DATA SOURCE FILTER BUG: data-source table should NOT produce HTML output. Cell '来源' appears in HTML: Python discards these boxes, Go incorrectly constructs a table for them.") } t.Log("NOTE: Python filters table boxes matching r\"(数据|资料|图表)*来源[:: ]\" in _extract_table_figure.") t.Log("Go's extractTableAndReplace has no equivalent filter — data-source boxes get replaced with HTML instead of being discarded.") } // TestExtractTableAndReplace_DataSourceVariants tests multiple variants of // the data-source pattern that should all be filtered. func TestExtractTableAndReplace_DataSourceVariants(t *testing.T) { variants := []string{ "数据来源:国家统计局", "资料来源: 某报告", "图表来源:某数据库", "来源:权威机构", "来源: 参考数据", } for _, variant := range variants { t.Run(variant, func(t *testing.T) { boxes := []TextBox{ {X0: 10, X1: 200, Top: 0, Bottom: 50, Text: variant, LayoutType: "table", PageNumber: 0}, } tables := []TableItem{{ Cells: []TSRCell{{X0: 0, Y0: 0, X1: 100, Y1: 30, Text: "A", Label: "table row"}}, Positions: []Position{{Left: 0, Right: 300, Top: 0, Bottom: 50}}, Scale: 1.0, }} result := extractTableAndReplace(boxes, tables) // BUG: box with data-source text should be REMOVED entirely — // zero HTML output. Python pops these boxes without replacement. for _, b := range result { if strings.Contains(b.Text, "
") { t.Errorf("DATA SOURCE FILTER BUG: variant %q should be removed without HTML replacement. Python pops data-source table boxes entirely.", variant) } } }) } t.Log("NOTE: All variants of r\"(数据|资料|图表)*来源[:: ]\" should be filtered by extractTableAndReplace.") } // TestConsolidateFigures_DataSourceFilter_Missing exposes that Go does NOT // filter out figure boxes whose text matches r"(数据|资料|图表)*来源[:: ]". // Python's _extract_table_figure pops these boxes from self.boxes without // adding them to the figures dict (pdf_parser.py:1050-1052). func TestConsolidateFigures_DataSourceFilter_Missing(t *testing.T) { boxes := []TextBox{ {X0: 10, X1: 200, Top: 0, Bottom: 50, Text: "数据来源:某机构", LayoutType: "figure", PageNumber: 0, LayoutNo: "figure-0"}, {X0: 10, X1: 200, Top: 60, Bottom: 80, Text: "架构图", LayoutType: "figure", PageNumber: 0, LayoutNo: "figure-0"}, } result := consolidateFigures(boxes) // Python behavior: "数据来源:某机构" is popped from self.boxes, // NOT added to figures dict → gone entirely. // "架构图" is extracted normally. // Expected result: exactly 1 figure box with "架构图" text only. for _, b := range result { if strings.Contains(b.Text, "数据来源") || strings.Contains(b.Text, "某机构") { t.Errorf("DATA SOURCE FIGURE FILTER BUG: '数据来源:某机构' figure box should be removed entirely. Python pops data-source figure boxes in _extract_table_figure (pdf_parser.py:1050-1052). Go still includes it.") } } // Verify the normal figure box IS still present. foundFigure := false for _, b := range result { if strings.Contains(b.Text, "架构图") { foundFigure = true } } if !foundFigure { t.Error("normal figure box '架构图' should still be present") } t.Log("NOTE: Python filters figure boxes matching r\"(数据|资料|图表)*来源[:: ]\" in _extract_table_figure.") t.Log("Go's consolidateFigures has no equivalent filter.") }