package table import ( "fmt" "math" "regexp" "sort" "strings" pdf "ragflow/internal/deepdoc/parser/pdf/type" ) // ── construct table ───────────────────────────────────────────────────── // MergeTablesAcrossPages merges TableItems on consecutive pages with // overlapping X and close Y proximity. Matches Python's // _extract_table_figure table merge (pdf_parser.py:1061-1080). func MergeTablesAcrossPages(tables []pdf.TableItem, medianHeights map[int]float64) []pdf.TableItem { if len(tables) <= 1 { return tables } // Sort by position for deterministic adjacency. type indexed struct { idx int pg int top float64 } var items []indexed for i, tbl := range tables { if len(tbl.Positions) == 0 { continue } p := tbl.Positions[0] pg := 0 if len(p.PageNumbers) > 0 { pg = p.PageNumbers[0] } items = append(items, indexed{i, pg, p.Top}) } sort.Slice(items, func(a, b int) bool { if items[a].pg != items[b].pg { return items[a].pg < items[b].pg } return items[a].top < items[b].top }) merged := make([]bool, len(tables)) var result []pdf.TableItem for _, it := range items { if merged[it.idx] { continue } anchor := tables[it.idx] merged[it.idx] = true // Python nomerge_lout_no: tables whose box is followed by a // caption/title/reference should not be merged cross-page. if anchor.NoMerge { result = append(result, anchor) continue } anchorPg := it.pg anchorBott := anchor.Positions[0].Bottom // Look for consecutive-page continuations. for _, jt := range items { if merged[jt.idx] || jt.pg <= anchorPg { continue } // Python nomerge_lout_no: skip continuation candidates // tagged as no-merge. if tables[jt.idx].NoMerge { continue } if jt.pg-anchorPg > 1 { break // pages must be consecutive } if len(tables[jt.idx].Positions) == 0 { continue } bp := tables[jt.idx].Positions[0] bpg := 0 if len(bp.PageNumbers) > 0 { bpg = bp.PageNumbers[0] } if bpg != anchorPg+1 { continue } // Check X overlap. ap := anchor.Positions[0] if ap.Right < bp.Left || bp.Right < ap.Left { continue } // Check Y proximity: page 1 table top should be close below // page 0 table bottom. Python: y_dis ≤ mh * 23. mh := 10.0 if medianHeights != nil { if h, ok := medianHeights[anchorPg]; ok && h > 0 { mh = h } } yDis := (bp.Top + bp.Bottom - anchorBott - ap.Bottom) / 2 if yDis > mh*23 { continue } // Merge: combine cells and positions. anchor.Cells = append(anchor.Cells, tables[jt.idx].Cells...) anchor.Positions = append(anchor.Positions, tables[jt.idx].Positions...) if tables[jt.idx].Caption != "" { if anchor.Caption != "" { anchor.Caption += " " } anchor.Caption += tables[jt.idx].Caption } merged[jt.idx] = true anchorPg = bpg anchorBott = bp.Bottom } result = append(result, anchor) } return result } // constructTable produces an HTML table string from TSR cells and text boxes. // Both cells and boxes must be in the same coordinate space (crop pixel space). // Fills item.Rows so downstream consumers don't need to re-group cells. // // Python equivalent: TableStructureRecognizer.construct_table() // stripCaptionFromCells clears caption-like text from TSR cells. // This catches captions that fillCellTextFromBoxes missed (e.g. text // that doesn't match isCaptionBox patterns like "公司差旅费管理办法"). // Only clears cells whose text matches caption patterns or that contain // only number+separator text (pure "1. ", "一、" etc. without data). func StripCaptionFromCells(cells []pdf.TSRCell) { for i := range cells { t := strings.TrimSpace(cells[i].Text) if t == "" { continue } // Clear cells that match caption patterns (e.g. "表1", "Table 1"). if IsCaptionBox(t, "") { cells[i].Text = "" } } // Second pass: if the first row (lowest Y) has all-numeric/numbering text // (e.g. "1", "1.", "一"), it's likely a caption numbering line — clear it. // But don't clear actual numeric data cells. // This pass is intentionally conservative — only clears clearly-non-data text. } func ConstructTable(cells []pdf.TSRCell, boxes []pdf.TextBox, caption string, item *pdf.TableItem) string { // Strip caption-like text from cells (defense-in-depth: fillCellTextFromBoxes // may include caption text that doesn't match isCaptionBox patterns). StripCaptionFromCells(cells) // Use the pre-computed grid from pdf.TableBuilder.GroupCells. // Falls back to cell-level grouping only when called directly by // tests without a pre-computed Grid (production always sets it). var rows [][]pdf.TSRCell if item != nil { rows = item.Grid } if rows == nil && len(cells) > 0 && HasAnyText(cells) { rows = GroupTSRCellsToRows(cells) } if len(rows) > 0 && HasText(rows) { hdrs := HeaderSetWithBlockType(rows) if item != nil { item.Rows = RowsToStrings(rows) } rows = CleanupOrphanColumns(rows) spanInfo, covered := CalSpans(rows) return RowsToHTML(rows, caption, hdrs, spanInfo, covered) } // Fallback: boxes with R/C annotations. if len(boxes) > 0 && BoxesHaveAnnotations(boxes) { rows := GroupBoxesByRC(boxes) if HasText(rows) { if item != nil { item.Rows = RowsToStrings(rows) } spanInfo, covered := CalSpans(rows) return RowsToHTML(rows, caption, BoxHeaderSet(rows, boxes), spanInfo, covered) } } // Test-only: Y/X coordinate grouping (matching Python construct_table). // Used by table_parity_test.go to verify pipeline with Python boxes. if len(boxes) > 0 && !BoxesHaveAnnotations(boxes) { rows := GroupBoxesByYX(boxes) if HasText(rows) { if item != nil { item.Rows = RowsToStrings(rows) } spanInfo, covered := CalSpans(rows) return RowsToHTML(rows, caption, BoxHeaderSet(rows, boxes), spanInfo, covered) } } return "" } // boxHeaderSet returns rows that contain boxes with H annotations. func BoxHeaderSet(rows [][]pdf.TSRCell, boxes []pdf.TextBox) map[int]bool { hdrs := make(map[int]bool) for _, b := range boxes { if b.H > 0 && b.R >= 0 && b.R < len(rows) { hdrs[b.R] = true } } return hdrs } func HasAnyText(cells []pdf.TSRCell) bool { for _, c := range cells { if strings.TrimSpace(c.Text) != "" { return true } } return false } // groupBoxesByRC groups text boxes into a cell grid by R/C annotations. // Matches Python's construct_table: sort by R, merge nearby rows by Y proximity, // sort by C within each row, merge nearby columns by X proximity. func GroupBoxesByRC(boxes []pdf.TextBox) [][]pdf.TSRCell { if len(boxes) == 0 { return nil } // If no real R/C annotations (maxR <= 0), fall back to YX coordinate // grouping — matching Python's construct_table when all R=-1. maxR := 0 for _, b := range boxes { if b.R > maxR { maxR = b.R } } if maxR <= 0 { return GroupBoxesByYX(boxes) } // Sort by R index first (Python: sort_R_firstly), then Y, then X. sort.Slice(boxes, func(i, j int) bool { if boxes[i].R != boxes[j].R { return boxes[i].R < boxes[j].R } if boxes[i].Top != boxes[j].Top { return boxes[i].Top < boxes[j].Top } return boxes[i].X0 < boxes[j].X0 }) // Compress R indices: Python's sort_R_firstly grouping. // R differs → always a new row. Same R + Y gap → also new row. rowMap := make(map[int]int) // original R → compressed row index compressed := 0 rowMap[boxes[0].R] = 0 lastR := boxes[0].R btm := boxes[0].Bottom for i := 1; i < len(boxes); i++ { // Python: b["R"] != last_R → new row. // Same R → always same row (Python doesn't check Y for same R). if boxes[i].R != lastR { compressed++ rowMap[boxes[i].R] = compressed lastR = boxes[i].R btm = boxes[i].Bottom } else { // Same R → same physical row. rowMap[boxes[i].R] = compressed btm = (btm + boxes[i].Bottom) / 2.0 } } // Collect boxes per row, sort by C within each row. type rb struct { row, col int txt string x0, y0, x1, y1 float64 label string } cmap := make(map[int]map[int]*rb) // row → col → entry maxCols := make(map[int]int) for _, b := range boxes { t := strings.TrimSpace(b.Text) // Keep boxes with SP/H annotations even if text is empty — // their coordinates are needed for colspan/rowspan calculation. if t == "" && b.H <= 0 && b.SP <= 0 { continue } r := rowMap[b.R] c := b.C if cmap[r] == nil { cmap[r] = make(map[int]*rb) } x0, y0, x1, y1, label := cellPosFromBox(b) if v, ok := cmap[r][c]; ok { v.txt += " " + t // Merge spanning coordinates (use widest extent). if b.H > 0 || b.SP > 0 { v.label = cellLabelFromBox(b) if v.x0 > x0 { v.x0 = x0 } if v.y0 > y0 { v.y0 = y0 } if v.x1 < x1 { v.x1 = x1 } if v.y1 < y1 { v.y1 = y1 } } } else { cmap[r][c] = &rb{r, c, t, x0, y0, x1, y1, label} } if c > maxCols[r] { maxCols[r] = c } } // Compress C indices per row: sort boxes by X0 within the row, // group disjoint X ranges into separate columns. This is equivalent // to Python's sort_C_firstly but uses X0 ordering instead of C labels. cCompressed := make(map[int]map[int]int) // row → (original C → compressed col) cMaxCol := make(map[int]int) for ri := 0; ri <= compressed; ri++ { rowEntries := cmap[ri] if rowEntries == nil { continue } // Collect all boxes in this row, sorted by X0. type rowBox struct { c, idx int x0, x1 float64 txt string } var rowBoxes []rowBox for i, b := range boxes { if rowMap[b.R] == ri && (strings.TrimSpace(b.Text) != "" || b.H > 0 || b.SP > 0) { rowBoxes = append(rowBoxes, rowBox{c: b.C, idx: i, x0: b.X0, x1: b.X1, txt: b.Text}) } } sort.Slice(rowBoxes, func(i, j int) bool { return rowBoxes[i].x0 < rowBoxes[j].x0 }) // Assign compressed column by X-order (disjoint X → new col). cMap := make(map[int]int) // original C → compressed col right := 0.0 for _, rb := range rowBoxes { if len(cMap) == 0 || rb.x0 >= right { cc := len(cMap) cMap[rb.c] = cc right = rb.x1 } else { // Overlapping X → merge into last column. cMap[rb.c] = len(cMap) - 1 if rb.x1 > right { right = rb.x1 } } } cCompressed[ri] = cMap cMaxCol[ri] = len(cMap) - 1 } // Build grid. rows := make([][]pdf.TSRCell, compressed+1) for ri := 0; ri <= compressed; ri++ { maxC := cMaxCol[ri] rows[ri] = make([]pdf.TSRCell, maxC+1) for ci, v := range cmap[ri] { cci := cCompressed[ri][ci] if cci <= maxC { rows[ri][cci].Text = v.txt rows[ri][cci].X0 = v.x0 rows[ri][cci].Y0 = v.y0 rows[ri][cci].X1 = v.x1 rows[ri][cci].Y1 = v.y1 rows[ri][cci].Label = v.label } } } return rows } // cellPosFromBox returns the position coordinates and label for a cell // derived from a text box. Header cells use HLeft/HRight/HTop/HBott // for spanning-aware positions; regular cells use the box's own bounds. func cellPosFromBox(b pdf.TextBox) (x0, y0, x1, y1 float64, label string) { x0, y0, x1, y1 = b.X0, b.Top, b.X1, b.Bottom if b.H > 0 { label = "table header" if b.HLeft != 0 || b.HRight != 0 { if b.HLeft != 0 { x0 = b.HLeft } if b.HRight != 0 { x1 = b.HRight } } if b.HTop != 0 { y0 = b.HTop } if b.HBott != 0 { y1 = b.HBott } } else if b.SP > 0 { label = "table spanning cell" } return } // cellLabelFromBox returns the TSR label for a box based on H/SP annotations. // Used when merging multiple boxes into one cell — preserves the spanning label. func cellLabelFromBox(b pdf.TextBox) string { if b.H > 0 { return "table header" } if b.SP > 0 { return "table spanning cell" } return "" } // groupBoxesByYX groups boxes into a cell grid by Y/X coordinates, // matching Python's construct_table which uses sort_R_firstly and // sort_C_firstly when R/C annotations are absent. // This is test-only — used by table_parity_test.go to verify pipeline // parity with Python boxes that lack R/C annotations. func GroupBoxesByYX(boxes []pdf.TextBox) [][]pdf.TSRCell { if len(boxes) == 0 { return nil } // Sort by (page, top, x0) — same as Python sort_R_firstly with R=-1. sort.Slice(boxes, func(i, j int) bool { if boxes[i].PageNumber != boxes[j].PageNumber { return boxes[i].PageNumber < boxes[j].PageNumber } if boxes[i].Top != boxes[j].Top { return boxes[i].Top < boxes[j].Top } return boxes[i].X0 < boxes[j].X0 }) // Group into rows by Y proximity (Python's row grouping). type rowGroup struct { boxes []pdf.TextBox top, btm float64 } var rowGroups []rowGroup rowGroups = append(rowGroups, rowGroup{boxes: []pdf.TextBox{boxes[0]}, top: boxes[0].Top, btm: boxes[0].Bottom}) for i := 1; i < len(boxes); i++ { prev := &rowGroups[len(rowGroups)-1] // Python: same row if top < prev.btm (Y overlaps) and same page. if boxes[i].PageNumber == prev.boxes[0].PageNumber && boxes[i].Top < prev.btm { prev.boxes = append(prev.boxes, boxes[i]) if boxes[i].Top < prev.top { prev.top = boxes[i].Top } if boxes[i].Bottom > prev.btm { prev.btm = boxes[i].Bottom } } else { rowGroups = append(rowGroups, rowGroup{boxes: []pdf.TextBox{boxes[i]}, top: boxes[i].Top, btm: boxes[i].Bottom}) } } // Within each row, group into columns by X proximity. rows := make([][]pdf.TSRCell, len(rowGroups)) for ri, rg := range rowGroups { // Sort by X0. sort.Slice(rg.boxes, func(i, j int) bool { return rg.boxes[i].X0 < rg.boxes[j].X0 }) // Group by X overlap. var cols []struct { boxes []pdf.TextBox x1 float64 } cols = append(cols, struct { boxes []pdf.TextBox x1 float64 }{boxes: []pdf.TextBox{rg.boxes[0]}, x1: rg.boxes[0].X1}) for i := 1; i < len(rg.boxes); i++ { prev := &cols[len(cols)-1] if rg.boxes[i].X0 < prev.x1 { prev.boxes = append(prev.boxes, rg.boxes[i]) if rg.boxes[i].X1 > prev.x1 { prev.x1 = rg.boxes[i].X1 } } else { cols = append(cols, struct { boxes []pdf.TextBox x1 float64 }{boxes: []pdf.TextBox{rg.boxes[i]}, x1: rg.boxes[i].X1}) } } rows[ri] = make([]pdf.TSRCell, len(cols)) for ci, col := range cols { var sb strings.Builder for _, b := range col.boxes { t := strings.TrimSpace(b.Text) if t == "" { continue } if sb.Len() > 0 { sb.WriteByte(' ') } sb.WriteString(t) } rows[ri][ci].Text = sb.String() } } return rows } func BoxesHaveAnnotations(boxes []pdf.TextBox) bool { maxR, maxC := 0, 0 for _, b := range boxes { if b.R > maxR { maxR = b.R } if b.C > maxC { maxC = b.C } } // True if at least 2 rows or 2 cols (R/C are 0-based, so maxR>0 means ≥2 rows). return maxR > 0 || maxC > 0 } func HasText(rows [][]pdf.TSRCell) bool { for _, row := range rows { for _, c := range row { if strings.TrimSpace(c.Text) != "" { return true } } } return false } func RowsToStrings(rows [][]pdf.TSRCell) [][]string { out := make([][]string, len(rows)) for ri, row := range rows { out[ri] = make([]string, len(row)) for ci, c := range row { out[ri][ci] = c.Text } } return out } // fillCellTextFromAnnotations fills cell text from text boxes using R/C labels. // This matches Python's construct_table which assigns boxes to cells by their // R (row) and C (col) annotations rather than spatial overlap. func FillCellTextFromAnnotations(rows [][]pdf.TSRCell, boxes []pdf.TextBox) { // Build R→(C→text) map: row index → (col index → text). rBoxes := make(map[int]map[int][]string) for _, b := range boxes { if b.Text == "" { continue } if rBoxes[b.R] == nil { rBoxes[b.R] = make(map[int][]string) } rBoxes[b.R][b.C] = append(rBoxes[b.R][b.C], b.Text) } // Fill each cell from the matching R/C position. for ri, row := range rows { colMap := rBoxes[ri] if colMap == nil { continue } // Build sorted column list for positional matching. type colEntry struct { c int texts []string } var cols []colEntry for c, texts := range colMap { cols = append(cols, colEntry{c, texts}) } sort.Slice(cols, func(i, j int) bool { return cols[i].c < cols[j].c }) for ci, col := range cols { if ci < len(row) { row[ci].Text = strings.TrimSpace(strings.Join(col.texts, " ")) } } } } // dataSourceRe matches table/figure boxes that should be discarded as // data-source attribution lines rather than extracted content. // // Python: pdf_parser.py:1040-1042, 1050-1052 // // re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]) var dataSourceRe = regexp.MustCompile(`^(数据|资料|图表)*来源[:: ]`) // isDataSourceBox returns true if the box text matches the data-source // discard pattern (Python's _extract_table_figure data-source filter). func isDataSourceBox(text string) bool { return dataSourceRe.MatchString(text) } // tableRegionBox returns a pdf.TextBox for a table replacement, using DLA region // boundaries when available (Region* set), falling back to anchor box coordinates. // Python's insert_table_figures uses DLA layout region boundaries; the fallback // handles test TableItems or bare engines without DLA. func tableRegionBox(tbl *pdf.TableItem, ref *pdf.TextBox, html string) pdf.TextBox { pg := 0 if len(tbl.Positions) > 0 && len(tbl.Positions[0].PageNumbers) > 0 { pg = tbl.Positions[0].PageNumbers[0] } // Use DLA region boundaries when set. if tbl.RegionLeft != 0 || tbl.RegionRight != 0 || tbl.RegionTop != 0 || tbl.RegionBottom != 0 { return pdf.TextBox{ X0: tbl.RegionLeft, X1: tbl.RegionRight, Top: tbl.RegionTop, Bottom: tbl.RegionBottom, Text: html, PageNumber: pg, LayoutType: pdf.LayoutTypeTable, } } // Fallback: use anchor box coordinates. x0, x1, top, bot := ref.X0, ref.X1, ref.Top, ref.Bottom return pdf.TextBox{ X0: x0, X1: x1, Top: top, Bottom: bot, Text: html, PageNumber: pg, LayoutType: pdf.LayoutTypeTable, } } // minRectangleDistance computes the Euclidean distance between two rectangles. // Returns 0 when rectangles overlap. Matches Python's min_rectangle_distance // in insert_table_figures (pdf_parser.py:1609-1626). func minRectangleDistance(left1, right1, top1, bottom1, left2, right2, top2, bottom2 float64) float64 { if right1 >= left2 && right2 >= left1 && bottom1 >= top2 && bottom2 >= top1 { return 0 } var dx, dy float64 if right1 < left2 { dx = left2 - right1 } else if right2 < left1 { dx = left1 - right2 } if bottom1 < top2 { dy = top2 - bottom1 } else if bottom2 < top1 { dy = top1 - bottom2 } return math.Sqrt(dx*dx + dy*dy) } func RowsToHTML(rows [][]pdf.TSRCell, caption string, headerRows map[int]bool, spanInfo map[[2]int][2]int, covered map[[2]int]bool) string { var b strings.Builder b.WriteString("") if caption != "" { b.WriteString("") } for ri, row := range rows { b.WriteString("") for ci, cell := range row { if covered[[2]int{ri, ci}] { continue } tag := "td" if headerRows[ri] { tag = "th" } b.WriteString("<") b.WriteString(tag) sp := "" if s, ok := spanInfo[[2]int{ri, ci}]; ok { if s[0] > 1 { sp = fmt.Sprintf("colspan=%d", s[0]) } if s[1] > 1 { if sp != "" { sp += " " } sp += fmt.Sprintf("rowspan=%d", s[1]) } } if sp != "" { b.WriteString(" ") b.WriteString(sp) } b.WriteString(" >") b.WriteString(cell.Text) b.WriteString("") } b.WriteString("") } b.WriteString("
") b.WriteString(caption) b.WriteString("
") return b.String() } // ── Span computation (Python: __cal_spans) ── // calSpans computes colspan and rowspan for spanning cells in the grid. // Returns spanInfo (row,col → colspan,rowspan) and covered (cells hidden by spans). // Matches Python's __cal_spans (table_structure_recognizer.py:535). // flattenGrid flattens a 2D grid into a 1D slice for fillCellTextFromBoxes. func FlattenGrid(grid [][]pdf.TSRCell) []pdf.TSRCell { n := 0 for _, row := range grid { n += len(row) } flat := make([]pdf.TSRCell, 0, n) for _, row := range grid { flat = append(flat, row...) } return flat } func CalSpans(rows [][]pdf.TSRCell) (map[[2]int][2]int, map[[2]int]bool) { spanInfo := make(map[[2]int][2]int) covered := make(map[[2]int]bool) if len(rows) == 0 || len(rows[0]) == 0 { return spanInfo, covered } // Compute column center positions. nCols := len(rows[0]) colLeft := make([]float64, nCols) colRight := make([]float64, nCols) for j := 0; j < nCols; j++ { colLeft[j] = 1e9 colRight[j] = -1e9 } nRows := len(rows) rowTop := make([]float64, nRows) rowBott := make([]float64, nRows) for i := 0; i < nRows; i++ { rowTop[i] = 1e9 rowBott[i] = -1e9 } for i, row := range rows { for j, cell := range row { if j >= nCols { continue } // Exclude spanning cells from column/row boundary calculations. // Use label-based detection (O(1), no dependency on column midpoints). if strings.Contains(cell.Label, "spanning") { continue } if cell.X0 < colLeft[j] { colLeft[j] = cell.X0 } if cell.X1 > colRight[j] { colRight[j] = cell.X1 } if cell.Y0 < rowTop[i] { rowTop[i] = cell.Y0 } if cell.Y1 > rowBott[i] { rowBott[i] = cell.Y1 } } } // For each spanning cell, compute how many cols/rows it covers. for i, row := range rows { for j, cell := range row { if j >= nCols || covered[[2]int{i, j}] { continue } // Skip cells without position data (they can't span). if cell.X0 == 0 && cell.X1 == 0 && cell.Y0 == 0 && cell.Y1 == 0 { continue } cs, rs := 1, 1 // Count columns whose center is inside this cell's X range. for k := j + 1; k < nCols; k++ { // Skip columns with no non-spanning cells (initial values unchanged). if colLeft[k] == 1e9 && colRight[k] == -1e9 { continue } colCenter := (colLeft[k] + colRight[k]) / 2 if colCenter >= cell.X0 && colCenter <= cell.X1 { cs++ } } // Count rows whose center is inside this cell's Y range. for k := i + 1; k < nRows; k++ { // Skip rows with no non-spanning cells. if rowTop[k] == 1e9 && rowBott[k] == -1e9 { continue } rowCenter := (rowTop[k] + rowBott[k]) / 2 if rowCenter >= cell.Y0 && rowCenter <= cell.Y1 { rs++ } } if cs > 1 || rs > 1 { spanInfo[[2]int{i, j}] = [2]int{cs, rs} // Mark covered cells. for ri := i; ri < i+rs && ri < nRows; ri++ { for cj := j; cj < j+cs && cj < nCols; cj++ { if ri != i || cj != j { covered[[2]int{ri, cj}] = true } } } } } } return spanInfo, covered } // ── Orphan column/row cleanup (Python: construct_table lines 256-368) ── // cleanupOrphanColumns removes columns that have only a single non-empty cell // when there are ≥4 rows. Matches Python's construct_table column cleanup. func CleanupOrphanColumns(rows [][]pdf.TSRCell) [][]pdf.TSRCell { if len(rows) < 4 || len(rows) == 0 { return rows } nCols := len(rows[0]) j := 0 colLoop: for j < nCols { e, ii := 0, 0 for i := range rows { if j < len(rows[i]) && strings.TrimSpace(rows[i][j].Text) != "" { e++ ii = i } if e > 1 { j++ continue colLoop } } // Column j has only one non-empty cell at row ii. // Check if adjacent columns have text for this row. f := (j > 0 && j-1 < len(rows[ii]) && strings.TrimSpace(rows[ii][j-1].Text) != "") || j == 0 ff := (j+1 < len(rows[ii]) && strings.TrimSpace(rows[ii][j+1].Text) != "") || j+1 >= len(rows[ii]) if f && ff { // Both adjacent columns are ok for merging — but this means // there's text on both sides, keep column. j++ continue } // Determine which side to merge into. left := 1e9 right := 1e9 if j > 0 && !f { for i := range rows { if j-1 < len(rows[i]) && strings.TrimSpace(rows[i][j-1].Text) != "" { // Distance from orphan cell to left neighbor. if d := rows[ii][j].X0 - rows[i][j-1].X1; d < left { left = d } } } } if j+1 < nCols && !ff { for i := range rows { if j+1 < len(rows[i]) && strings.TrimSpace(rows[i][j+1].Text) != "" { if d := rows[i][j+1].X0 - rows[ii][j].X1; d < right { right = d } } } } if left < right && j > 0 { // Merge into left column. for i := range rows { if j-1 < len(rows[i]) && j < len(rows[i]) { if rows[i][j-1].Text == "" { rows[i][j-1].Text = rows[i][j].Text } else if rows[i][j].Text != "" { rows[i][j-1].Text += " " + rows[i][j].Text } } } } else if j+1 < nCols { // Merge into right column. for i := range rows { if j < len(rows[i]) && j+1 < len(rows[i]) { if rows[i][j+1].Text == "" { rows[i][j+1].Text = rows[i][j].Text } else if rows[i][j].Text != "" { rows[i][j+1].Text = rows[i][j].Text + " " + rows[i][j+1].Text } } } } // Remove column j. for i := range rows { if j < len(rows[i]) { rows[i] = append(rows[i][:j], rows[i][j+1:]...) } } nCols-- // Don't increment j — the next column shifted into position j. } return rows }