package parser

import (
	"context"
	"encoding/base64"
	"fmt"
	"image"
	"log/slog"
	"math"
	"regexp"
	"sort"
	"strings"
)

// enrichWithDeepDoc runs DLA+TSR via p.DeepDoc and returns detected tables.
// pageImages optionally provides pre-rendered page images to avoid re-rendering.
//
// It returns nil when p.DeepDoc.Health() reports false. Pages are processed in
// ascending page-number order; if ctx is cancelled between pages, the tables
// collected so far are returned. Layout annotations are written back into the
// caller's boxes slice in place.
func (p *Parser) enrichWithDeepDoc(ctx context.Context, engine PDFEngine, boxes []TextBox, pageImages map[int]image.Image) []TableItem {
	if !p.DeepDoc.Health() {
		return nil
	}

	// Group boxes by page for annotation write-back.
	byPage := make(map[int][]int)
	for i, b := range boxes {
		byPage[b.PageNumber] = append(byPage[b.PageNumber], i)
	}

	// Collect all pages that have images (from pageImages) or boxes.
	// This matches Python's __images__ which processes every page regardless
	// of embedded chars — image-only PDFs still get DLA+TSR.
	allPages := make(map[int]bool)
	for pg := range pageImages {
		allPages[pg] = true
	}
	for pg := range byPage {
		allPages[pg] = true
	}
	// Map iteration order is random; sort so pages are visited deterministically.
	pageKeys := make([]int, 0, len(allPages))
	for pg := range allPages {
		pageKeys = append(pageKeys, pg)
	}
	sort.Ints(pageKeys)

	var tableItems []TableItem
	for _, pg := range pageKeys {
		if err := ctx.Err(); err != nil {
			return tableItems
		}
		// Work on a per-page copy; indices[i] maps pageBoxes[i] back to boxes.
		indices := byPage[pg]
		pageBoxes := make([]TextBox, len(indices))
		for i, idx := range indices {
			pageBoxes[i] = boxes[idx]
		}
		// len(tableItems) is the running table index used for debug records.
		tables := p.extractTableBoxes(ctx, pageBoxes, engine, pg, pageImages, len(tableItems))
		tableItems = append(tableItems, tables...)

		// Write back DLA and TSR annotations (R/C/H/SP) to the original boxes.
		//
		// NOTE(review): extractTableBoxesFromImage reassigns its local slice to
		// the result of annotateBoxLayouts, which is a freshly allocated copy
		// whenever DLA returns at least one region. writeTableAnnotations then
		// writes R/C/H/SP into that copy, so pageBoxes here appears to carry
		// only LayoutType/LayoutNo. Confirm whether the TSR annotations are
		// expected to reach the original boxes through this loop.
		for i, idx := range indices {
			if pageBoxes[i].LayoutType != "" {
				boxes[idx].LayoutType = pageBoxes[i].LayoutType
				boxes[idx].LayoutNo = pageBoxes[i].LayoutNo
			}
			copyBoxAnnotations(&boxes[idx], &pageBoxes[i])
		}
	}
	return tableItems
}

// extractTableBoxes resolves the image for pageNum — taken from pageImages when
// present, otherwise rendered through engine — and delegates to
// extractTableBoxesFromImage. A render failure is logged and yields nil.
func (p *Parser) extractTableBoxes(ctx context.Context, boxes []TextBox, engine PDFEngine, pageNum int, pageImages map[int]image.Image, tableBaseIdx int) []TableItem {
	pageImg, ok := pageImages[pageNum]
	if !ok {
		var err error
		pageImg, err = renderPageToImage(engine, pageNum)
		if err != nil {
			slog.Warn("render page for DeepDoc failed", "page", pageNum, "err", err)
			return nil
		}
	}
	return p.extractTableBoxesFromImage(ctx, boxes, pageImg, pageNum, tableBaseIdx)
}

// extractTableBoxesFromImage runs DLA on pageImg, annotates boxes with layout
// types, and builds one TableItem per matched table region (crop → optional
// rotation → TSR → cell text fill → grid).
//
// boxes are the text boxes of this page; their LayoutType/LayoutNo are set in
// place by annotateBoxLayouts. tableBaseIdx is the number of tables already
// produced for earlier pages and is only used to number debug records.
// A DLA failure is logged and yields nil; a TSR failure is logged and the table
// is still emitted without cells. DLA regions and raw TSR cells are appended to
// p.debugDLA and p.debugTSR.
func (p *Parser) extractTableBoxesFromImage(ctx context.Context, boxes []TextBox, pageImg image.Image, pageNum int, tableBaseIdx int) []TableItem {
	regions, err := p.DeepDoc.DLA(ctx, pageImg)
	if err != nil {
		slog.Warn("DLA failed", "page", pageNum, "err", err)
		return nil
	}
	// Collect DLA debug intermediates.
	p.debugDLA = append(p.debugDLA, DLAPageRegions{Page: pageNum, Regions: regions})

	// Annotate boxes with DLA layout types (title, text, figure, table, ...).
	// From here on `boxes` is the slice returned by annotateBoxLayouts: dropped
	// boxes are removed and synthetic figure boxes may be appended, so indices
	// below refer to this slice, not to the caller's.
	scale := dlaScale
	boxes = annotateBoxLayouts(boxes, regions, scale, float64(pageImg.Bounds().Dy()))

	tableMatches := matchTableRegions(boxes, regions, scale)
	var items []TableItem
	for _, tm := range tableMatches {
		cropped, cropErr := cropImageRegion(pageImg, tm.region)
		if cropErr != nil {
			// DLA returned an invalid region (e.g. x1 < x0). Python
			// PIL.Image.crop() raises ValueError here; we skip this
			// table instead of passing a full-page image to TSR.
			continue
		}

		// Rotation detection (Python: _evaluate_table_orientation).
		// If rotated, TSR and OCR use the rotated image; cell coords
		// are mapped back to original crop space for box matching.
		autoRotate := p.Config.AutoRotateTables != nil && *p.Config.AutoRotateTables
		bestAngle := 0
		origW, origH := cropped.Bounds().Dx(), cropped.Bounds().Dy()
		tsrImg := cropped
		if autoRotate {
			// NOTE(review): the third return value is discarded. If it is an
			// error and `rotated` can be nil on failure, tsrImg would be nil
			// below — confirm against evaluateTableOrientation.
			angle, rotated, _ := evaluateTableOrientation(ctx, cropped, p.DeepDoc)
			bestAngle = angle
			tsrImg = rotated
		}

		// The stored image is always the unrotated crop. An encode failure is
		// only logged; the table is still emitted with an empty ImageB64.
		imgB64, encErr := encodeImageToBase64PNG(cropped)
		if encErr != nil {
			slog.Warn("table PNG encode failed", "page", pageNum, "err", encErr)
		}

		var cells []TSRCell
		var tsrErr error
		cells, tsrErr = p.tableBuilder.DetectCells(ctx, tsrImg)
		if tsrErr != nil {
			slog.Warn("TSR failed", "page", pageNum, "err", tsrErr)
		}

		// Collect TSR raw cells for debug comparison.
		if tsrErr == nil {
			for _, c := range cells {
				p.debugTSR = append(p.debugTSR, TSRRawCell{
					TableIndex: tableBaseIdx + len(items),
					Page:       pageNum,
					Label:      c.Label,
					X0:         c.X0, Y0: c.Y0, X1: c.X1, Y1: c.Y1,
					Text: c.Text,
				})
			}
		}

		// Python margin: w*0.03, h*0.03 (_table_transformer_job:374-376).
		// cropOffX/cropOffY repeat the margin arithmetic of cropImageRegion to
		// obtain the crop origin in page-image pixels (without its int
		// truncation).
		w := tm.region.X1 - tm.region.X0
		h := tm.region.Y1 - tm.region.Y0
		marginX := w * 0.03
		marginY := h * 0.03
		cropOffX := math.Max(0, tm.region.X0-marginX)
		cropOffY := math.Max(0, tm.region.Y0-marginY)

		var boxInCrop []TextBox
		if tsrErr == nil && len(cells) > 0 {
			if bestAngle != 0 {
				// OCR on rotated image before mapping cells back.
				// Cells are in rotated-pixel space; OCR works best
				// on upright text. After mapping, cells move to
				// original crop space where boxInCrop lives.
				if !p.Config.SkipOCR {
					ocrTableCells(ctx, cells, tsrImg, p.DeepDoc)
				}
				for i := range cells {
					cells[i].X0, cells[i].Y0 = mapRotatedPointToOriginal(cells[i].X0, cells[i].Y0, bestAngle, origW, origH)
					cells[i].X1, cells[i].Y1 = mapRotatedPointToOriginal(cells[i].X1, cells[i].Y1, bestAngle, origW, origH)
				}
			}
			// Fill cell text from pre-merge boxes, skipping caption boxes
			// (text entirely above the first TSR cell row).
			// 1e9 is a sentinel meaning "no cell with Y0 >= 0 seen yet".
			firstCellTop := 1e9
			for _, c := range cells {
				if c.Y0 >= 0 && c.Y0 < firstCellTop {
					firstCellTop = c.Y0
				}
			}
			if firstCellTop == 1e9 {
				firstCellTop = cells[0].Y0 // fallback if all cells have Y0 < 0
			}
			boxInCrop = make([]TextBox, 0, len(tm.boxIdx))
			for _, idx := range tm.boxIdx {
				b := boxes[idx]
				// b.Bottom is in PDF space; scale and shift it into crop space.
				if b.Bottom*scale-cropOffY < firstCellTop {
					continue // caption box above first TSR cell
				}
				boxInCrop = append(boxInCrop, boxToCropSpace(b, scale, cropOffX, cropOffY))
			}
		}

		// One Position per matched box, in PDF space. Empty when the region
		// matched no boxes (image-only page).
		var positions []Position
		for _, idx := range tm.boxIdx {
			b := boxes[idx]
			positions = append(positions, Position{
				PageNumbers: []int{pageNum},
				Left:        b.X0, Right: b.X1, Top: b.Top, Bottom: b.Bottom,
			})
		}

		// Pre-compute grid from raw TSR cells (without crop offset).
		// Stored in TableItem for constructTable; annotateTableBoxes
		// recomputes with offset cells for spatial matching precision.
		var grid [][]TSRCell
		if len(cells) > 0 {
			grid = p.tableBuilder.GroupCells(cells)
			// Fill cell text from boxes in crop space. Works for both
			// SaasDeepDoc (cells rearranged) and OssDeepDoc (cross-product creates new cells).
			if len(grid) > 0 {
				// The text is copied from `flat` back into `grid` by position.
				// This assumes flattenGrid returns copies in row-major order —
				// TODO confirm against flattenGrid.
				flat := flattenGrid(grid)
				fillCellTextFromBoxes(flat, boxInCrop)
				idx := 0
				for ri := range grid {
					for ci := range grid[ri] {
						grid[ri][ci].Text = flat[idx].Text
						idx++
					}
				}
				// Unrotated tables are OCR'd here on the grid cells; rotated
				// tables were already OCR'd above, before coordinate mapping.
				if bestAngle == 0 && !p.Config.SkipOCR {
					ocrTableCells(ctx, flat, tsrImg, p.DeepDoc)
					idx = 0
					for ri := range grid {
						for ci := range grid[ri] {
							grid[ri][ci].Text = flat[idx].Text
							idx++
						}
					}
				}
			}
		}

		items = append(items, TableItem{
			ImageB64:  imgB64,
			Cells:     cells,
			Grid:      grid,
			Positions: positions,
			Scale:     scale,
			CropOffX:  cropOffX,
			CropOffY:  cropOffY,
			// DLA region in PDF point space (Python's cropout uses layout region boundaries).
			RegionLeft:   tm.region.X0 / scale,
			RegionRight:  tm.region.X1 / scale,
			RegionTop:    tm.region.Y0 / scale,
			RegionBottom: tm.region.Y1 / scale,
		})

		writeTableAnnotations(boxes, tm.boxIdx, cells, scale, cropOffX, cropOffY, p.tableBuilder)
	}
	return items
}

// tableMatch pairs a DLA table region with the indices of boxes that overlap it.
type tableMatch struct {
	region DLARegion
	boxIdx []int
}

// ── cell row grouping ──────────────────────────────────────────────────

// ── region matching ────────────────────────────────────────────────────

// regionOverlapsBox reports whether at least 40% of box's area lies inside
// region. region is given in DLA pixel space and is divided by scale to bring
// it into the box's coordinate space before intersecting. Boxes with
// non-positive area never overlap.
func regionOverlapsBox(region DLARegion, box TextBox, scale float64) bool {
	rx0 := region.X0 / scale
	ry0 := region.Y0 / scale
	rx1 := region.X1 / scale
	ry1 := region.Y1 / scale
	scaledR := DLARegion{X0: rx0, Y0: ry0, X1: rx1, Y1: ry1}
	inter := OverlapInter(&scaledR, &box)
	boxArea := Area(&box)
	if boxArea <= 0 {
		return false
	}
	return inter/boxArea >= 0.4 // matches Python thr=0.4
}

// matchTableRegions pairs DLA table regions with boxes that overlap them.
// Each table region is matched if at least one box overlaps it (>40% of box
// area) or if there are no boxes at all (image-only PDF), matching Python's
// _table_transformer_job which processes every table DLA region.
//
// NOTE(review): the caller passes the slice returned by annotateBoxLayouts,
// which may contain synthetic figure boxes. On an image-only page with an
// unmatched figure/equation region, len(boxes) is then non-zero and table
// regions without overlapping boxes are skipped — confirm this is intended.
func matchTableRegions(boxes []TextBox, regions []DLARegion, scale float64) []tableMatch {
	var matches []tableMatch
	for _, r := range regions {
		if r.Label != LayoutTypeTable {
			continue
		}
		var matched []int
		for i, b := range boxes {
			if regionOverlapsBox(r, b, scale) {
				matched = append(matched, i)
			}
		}
		if len(matched) > 0 || len(boxes) == 0 {
			matches = append(matches, tableMatch{region: r, boxIdx: matched})
		}
	}
	return matches
}

// writeTableAnnotations annotates boxes at boxIdx with table cell grid
// information (R/C/H/SP). Cells are offset by cropOff, grouped into a grid,
// and annotation fields are scaled back to PDF space for each box.
func writeTableAnnotations(boxes []TextBox, boxIdx []int, cells []TSRCell, scale, cropOffX, cropOffY float64, tb TableBuilder) {
	// Shift cells by the crop origin so they share a coordinate space with the
	// scaled boxes built below.
	tableCells := make([]TSRCell, len(cells))
	for k := range cells {
		tableCells[k] = cellAddOffset(cells[k], cropOffX, cropOffY)
	}
	// Scratch copies of the table's boxes, multiplied by scale. Only geometry,
	// LayoutType and Text are carried over.
	tblBoxes := make([]TextBox, len(boxIdx))
	for k, idx := range boxIdx {
		b := boxes[idx]
		tblBoxes[k] = TextBox{
			X0: b.X0 * scale, X1: b.X1 * scale,
			Top: b.Top * scale, Bottom: b.Bottom * scale,
			LayoutType: b.LayoutType,
			Text:       b.Text,
		}
	}
	annotGrid := tb.GroupCells(tableCells)
	annotateTableBoxes(tblBoxes, annotGrid)
	// Write back per-box annotations scaled to PDF space.
	// R/H/C/SP are copied as-is; the extent fields are divided by scale.
	for k, idx := range boxIdx {
		bp := &tblBoxes[k]
		boxes[idx].R = bp.R
		boxes[idx].RTop = bp.RTop / scale
		boxes[idx].RBott = bp.RBott / scale
		boxes[idx].H = bp.H
		boxes[idx].HTop = bp.HTop / scale
		boxes[idx].HBott = bp.HBott / scale
		boxes[idx].HLeft = bp.HLeft / scale
		boxes[idx].HRight = bp.HRight / scale
		boxes[idx].C = bp.C
		boxes[idx].CLeft = bp.CLeft / scale
		boxes[idx].CRight = bp.CRight / scale
		boxes[idx].SP = bp.SP
	}
}

// ── image helpers ──────────────────────────────────────────────────────

// cropImageRegion crops a DLARegion from an image with a 3% margin
// (matching Python's _table_transformer_job: w*0.03, h*0.03).
//
// The margin is 3% of the region's width/height on each side, the result is
// clamped to the image's width and height, and the bounds are truncated to
// integers. An error is returned when the clamped rectangle is empty or
// inverted.
func cropImageRegion(img image.Image, r DLARegion) (image.Image, error) {
	w := r.X1 - r.X0
	h := r.Y1 - r.Y0
	marginX := w * 0.03
	marginY := h * 0.03
	maxX := float64(img.Bounds().Dx())
	maxY := float64(img.Bounds().Dy())
	x0 := int(math.Max(0, r.X0-marginX))
	y0 := int(math.Max(0, r.Y0-marginY))
	x1 := int(math.Min(maxX, r.X1+marginX))
	y1 := int(math.Min(maxY, r.Y1+marginY))
	// Python PIL.Image.crop() raises ValueError when right < left or
	// bottom < top. We return an error instead of silently falling back
	// to the full-page image — the caller skips this table gracefully.
	if x0 >= x1 || y0 >= y1 {
		return nil, fmt.Errorf("crop: invalid region x0=%d y0=%d x1=%d y1=%d (DLA raw: %.1f,%.1f,%.1f,%.1f)", x0, y0, x1, y1, r.X0, r.Y0, r.X1, r.Y1)
	}
	cropped := fastCrop(img, x0, y0, x1, y1)
	return cropped, nil
}

// annotateBoxLayouts sets LayoutType and LayoutNo on each box, matching
// Python's LayoutRecognizer.__call__ which assigns layout types in priority
// order (footer→header→…→equation) with an overlap threshold of 40% of the
// box's area.
//
// Python: _layouts_rec (pdf_parser.py:827) → LayoutRecognizer.__call__ →
//
//	for lt in priority_order: findLayout(lt)
//
// Each findLayout(ty): for each unannotated box, find the DLA region of
// type ty with max overlap ≥ 0.4 × box_area. First type to match wins.
//
// regions are in DLA pixel space and are divided by scale to reach the boxes'
// coordinate space; pageImgHeight is the page image height in the same pixel
// space (0 disables the garbage-layout drop).
//
// LayoutType/LayoutNo are written in place on the input elements. The returned
// slice is a new one whenever regions is non-empty: boxes whose text matches
// cidPattern (e.g. "(cid:123)") and boxes matched to a garbage layout that
// garbageKeepFeat does not keep are left out (Python pops them), and a
// synthetic figure box is appended for every usable figure/equation region that
// matched no box. With no regions the input slice is returned unchanged.
func annotateBoxLayouts(boxes []TextBox, regions []DLARegion, scale float64, pageImgHeight float64) []TextBox {
	if len(regions) == 0 {
		return boxes
	}

	// Scale all regions to PDF space once.
	type scaledRegion struct {
		x0, y0, x1, y1 float64
		label          string
	}
	scaled := make([]scaledRegion, len(regions))
	for i, r := range regions {
		scaled[i] = scaledRegion{
			x0: r.X0 / scale, y0: r.Y0 / scale,
			x1: r.X1 / scale, y1: r.Y1 / scale,
			label: r.Label,
		}
	}

	// DLA confidence filter: a region is usable when its confidence is at
	// least 0.4 or its label is not a garbage layout type. Only
	// footer/header/reference regions can therefore be rejected for low
	// confidence.
	regionOK := make([]bool, len(regions))
	for i, r := range regions {
		regionOK[i] = r.Confidence >= 0.4 || !isGarbageLayoutType(r.Label)
	}

	// Pre-compute per-type index for each region (Python: matched index within
	// filtered layouts_of_type list). "text" regions get 0,1,2... independent
	// of "figure" regions.
	typeIndex := make([]int, len(regions))
	typeCounters := make(map[string]int)
	for j, r := range scaled {
		if regionOK[j] {
			typeIndex[j] = typeCounters[r.label]
			typeCounters[r.label]++
		}
	}

	// Track visited regions (Python: layout["visited"] = True).
	visited := make([]bool, len(regions))
	// Marks for Python-style pop removal.
	dropped := make([]bool, len(boxes))

	// Priority order matching Python's findLayout loop.
	priorityOrder := []string{
		LayoutTypeFooter, LayoutTypeHeader, LayoutTypeReference,
		DLALabelFigureCaption, DLALabelTableCaption,
		LayoutTypeTitle, LayoutTypeTable, LayoutTypeText,
		LayoutTypeFigure, LayoutTypeEquation,
	}

	for _, ty := range priorityOrder {
		for i := range boxes {
			// Already typed by a higher-priority pass, or already removed.
			if boxes[i].LayoutType != "" || dropped[i] {
				continue
			}
			// CID garbage: pop the box entirely (Python: bxs.pop(i)).
			if cidPattern.MatchString(boxes[i].Text) {
				dropped[i] = true
				continue
			}
			boxArea := (boxes[i].X1 - boxes[i].X0) * (boxes[i].Bottom - boxes[i].Top)
			if boxArea <= 0 {
				continue
			}

			// Find the usable region of type ty covering the largest fraction
			// of this box.
			bestOverlap := 0.0
			bestJ := -1
			for j, r := range scaled {
				if r.label != ty || !regionOK[j] {
					continue
				}
				ix0 := math.Max(r.x0, boxes[i].X0)
				iy0 := math.Max(r.y0, boxes[i].Top)
				ix1 := math.Min(r.x1, boxes[i].X1)
				iy1 := math.Min(r.y1, boxes[i].Bottom)
				if ix0 < ix1 && iy0 < iy1 {
					ov := (ix1 - ix0) * (iy1 - iy0) / boxArea
					if ov > bestOverlap {
						bestOverlap = ov
						bestJ = j
					}
				}
			}

			if bestJ >= 0 && bestOverlap >= 0.4 {
				// Garbage layout that garbageKeepFeat does not keep → pop
				// (Python: bxs.pop(i)). The region is not marked visited in
				// that case. pageImgHeight is converted to box space first.
				if isGarbageLayoutType(ty) && pageImgHeight > 0 && !garbageKeepFeat(ty, boxes[i], pageImgHeight/scale) {
					dropped[i] = true
					continue
				}
				visited[bestJ] = true
				// Python: equation mapped to "figure" for layout_type
				if ty == LayoutTypeEquation {
					boxes[i].LayoutType = LayoutTypeFigure
				} else {
					boxes[i].LayoutType = ty
				}
				// Python: f"{layout_type}-{matched}" where matched is per-type index
				boxes[i].LayoutNo = fmt.Sprintf("%s-%d", ty, typeIndex[bestJ])
			}
		}
	}

	// Compact: remove popped boxes into a new backing array (Python
	// bxs.pop). Allocating a fresh slice is deliberate: annotations were
	// set in-place on the input elements, and callers (enrichWithDeepDoc)
	// rely on positional stability of the original slice for their
	// write-back loop. Reusing the input backing array would shift
	// survivors forward and break that index mapping.
	survivors := 0
	for i := range boxes {
		if !dropped[i] {
			survivors++
		}
	}
	compacted := make([]TextBox, 0, survivors)
	for i := range boxes {
		if !dropped[i] {
			compacted = append(compacted, boxes[i])
		}
	}
	boxes = compacted

	// Synthetic figure boxes for unmatched figure/equation regions (Python:
	// dla_cli.py:187-195). Use a fresh per-type counter for synthetic boxes.
	// The counter starts at 0 and is independent of typeIndex.
	synthIdx := 0
	for j, r := range scaled {
		if !regionOK[j] || visited[j] {
			continue
		}
		if r.label != LayoutTypeFigure && r.label != LayoutTypeEquation {
			continue
		}
		boxes = append(boxes, TextBox{
			X0: r.x0, X1: r.x1, Top: r.y0, Bottom: r.y1,
			Text:       "",
			LayoutType: LayoutTypeFigure,
			LayoutNo:   fmt.Sprintf("figure-%d", synthIdx),
		})
		synthIdx++
	}

	return boxes
}

// garbageLayoutTypes matches Python's self.garbage_layouts.
var garbageLayoutTypes = map[string]bool{
	LayoutTypeFooter: true, LayoutTypeHeader: true, LayoutTypeReference: true,
}

// isGarbageLayoutType reports whether ty is one of garbageLayoutTypes.
func isGarbageLayoutType(ty string) bool {
	return garbageLayoutTypes[ty]
}

// garbageKeepFeat matches Python's keep_feats in LayoutRecognizer.__call__ and
// reports whether a box matched to a garbage layout type should be kept
// instead of dropped: a "footer" box whose bottom lies above 90% of the page
// height, or a "header" box whose top lies below 10% of it, is kept. Every
// other garbage match — a footer reaching into the bottom 10%, a header
// starting in the top 10%, and any "reference" — returns false and is dropped
// by the caller. pageImgHeight must be in the same coordinate space as box.
func garbageKeepFeat(ty string, box TextBox, pageImgHeight float64) bool { switch ty { case LayoutTypeFooter: return box.Bottom < pageImgHeight*0.9 case LayoutTypeHeader: return box.Top > pageImgHeight*0.1 } return false } func encodeImageToBase64PNG(img image.Image) (string, error) { data, err := encodePNG(img) if err != nil { return "", err } return base64.StdEncoding.EncodeToString(data), nil } // ── construct table ───────────────────────────────────────────────────── // mergeTablesAcrossPages merges TableItems on consecutive pages with // overlapping X and close Y proximity. Matches Python's // _extract_table_figure table merge (pdf_parser.py:1061-1080). func mergeTablesAcrossPages(tables []TableItem, medianHeights map[int]float64) []TableItem { if len(tables) <= 1 { return tables } // Sort by position for deterministic adjacency. type indexed struct { idx int pg int top float64 } var items []indexed for i, tbl := range tables { if len(tbl.Positions) == 0 { continue } p := tbl.Positions[0] pg := 0 if len(p.PageNumbers) > 0 { pg = p.PageNumbers[0] } items = append(items, indexed{i, pg, p.Top}) } sort.Slice(items, func(a, b int) bool { if items[a].pg != items[b].pg { return items[a].pg < items[b].pg } return items[a].top < items[b].top }) merged := make([]bool, len(tables)) var result []TableItem for _, it := range items { if merged[it.idx] { continue } anchor := tables[it.idx] merged[it.idx] = true // Python nomerge_lout_no: tables whose box is followed by a // caption/title/reference should not be merged cross-page. if anchor.NoMerge { result = append(result, anchor) continue } anchorPg := it.pg anchorBott := anchor.Positions[0].Bottom // Look for consecutive-page continuations. for _, jt := range items { if merged[jt.idx] || jt.pg <= anchorPg { continue } // Python nomerge_lout_no: skip continuation candidates // tagged as no-merge. 
if tables[jt.idx].NoMerge { continue } if jt.pg-anchorPg > 1 { break // pages must be consecutive } if len(tables[jt.idx].Positions) == 0 { continue } bp := tables[jt.idx].Positions[0] bpg := 0 if len(bp.PageNumbers) > 0 { bpg = bp.PageNumbers[0] } if bpg != anchorPg+1 { continue } // Check X overlap. ap := anchor.Positions[0] if ap.Right < bp.Left || bp.Right < ap.Left { continue } // Check Y proximity: page 1 table top should be close below // page 0 table bottom. Python: y_dis ≤ mh * 23. mh := 10.0 if medianHeights != nil { if h, ok := medianHeights[anchorPg]; ok && h > 0 { mh = h } } yDis := (bp.Top + bp.Bottom - anchorBott - ap.Bottom) / 2 if yDis > mh*23 { continue } // Merge: combine cells and positions. anchor.Cells = append(anchor.Cells, tables[jt.idx].Cells...) anchor.Positions = append(anchor.Positions, tables[jt.idx].Positions...) if tables[jt.idx].Caption != "" { if anchor.Caption != "" { anchor.Caption += " " } anchor.Caption += tables[jt.idx].Caption } merged[jt.idx] = true anchorPg = bpg anchorBott = bp.Bottom } result = append(result, anchor) } return result } // constructTable produces an HTML table string from TSR cells and text boxes. // Both cells and boxes must be in the same coordinate space (crop pixel space). // Fills item.Rows so downstream consumers don't need to re-group cells. // // Python equivalent: TableStructureRecognizer.construct_table() // stripCaptionFromCells clears caption-like text from TSR cells. // This catches captions that fillCellTextFromBoxes missed (e.g. text // that doesn't match isCaptionBox patterns like "公司差旅费管理办法"). // Only clears cells whose text matches caption patterns or that contain // only number+separator text (pure "1. ", "一、" etc. without data). func stripCaptionFromCells(cells []TSRCell) { for i := range cells { t := strings.TrimSpace(cells[i].Text) if t == "" { continue } // Clear cells that match caption patterns (e.g. "表1", "Table 1"). 
if isCaptionBox(t, "") { cells[i].Text = "" } } // Second pass: if the first row (lowest Y) has all-numeric/numbering text // (e.g. "1", "1.", "一"), it's likely a caption numbering line — clear it. // But don't clear actual numeric data cells. // This pass is intentionally conservative — only clears clearly-non-data text. } func constructTable(cells []TSRCell, boxes []TextBox, caption string, item *TableItem) string { // Strip caption-like text from cells (defense-in-depth: fillCellTextFromBoxes // may include caption text that doesn't match isCaptionBox patterns). stripCaptionFromCells(cells) // Use the pre-computed grid from TableBuilder.GroupCells. // Falls back to cell-level grouping only when called directly by // tests without a pre-computed Grid (production always sets it). var rows [][]TSRCell if item != nil { rows = item.Grid } if rows == nil && len(cells) > 0 && hasAnyText(cells) { rows = groupTSRCellsToRowsLabeled(cells) } if len(rows) > 0 && hasText(rows) { hdrs := headerSetWithBlockType(rows) if item != nil { item.Rows = rowsToStrings(rows) } rows = cleanupOrphanColumns(rows) spanInfo, covered := calSpans(rows) return rowsToHTML(rows, caption, hdrs, spanInfo, covered) } // Fallback: boxes with R/C annotations. if len(boxes) > 0 && boxesHaveAnnotations(boxes) { rows := groupBoxesByRC(boxes) if hasText(rows) { if item != nil { item.Rows = rowsToStrings(rows) } spanInfo, covered := calSpans(rows) return rowsToHTML(rows, caption, boxHeaderSet(rows, boxes), spanInfo, covered) } } // Test-only: Y/X coordinate grouping (matching Python construct_table). // Used by table_parity_test.go to verify pipeline with Python boxes. 
if len(boxes) > 0 && !boxesHaveAnnotations(boxes) { rows := groupBoxesByYX(boxes) if hasText(rows) { if item != nil { item.Rows = rowsToStrings(rows) } spanInfo, covered := calSpans(rows) return rowsToHTML(rows, caption, boxHeaderSet(rows, boxes), spanInfo, covered) } } return "" } // boxHeaderSet returns rows that contain boxes with H annotations. func boxHeaderSet(rows [][]TSRCell, boxes []TextBox) map[int]bool { hdrs := make(map[int]bool) for _, b := range boxes { if b.H > 0 && b.R >= 0 && b.R < len(rows) { hdrs[b.R] = true } } return hdrs } func hasAnyText(cells []TSRCell) bool { for _, c := range cells { if strings.TrimSpace(c.Text) != "" { return true } } return false } // groupBoxesByRC groups text boxes into a cell grid by R/C annotations. // Matches Python's construct_table: sort by R, merge nearby rows by Y proximity, // sort by C within each row, merge nearby columns by X proximity. func groupBoxesByRC(boxes []TextBox) [][]TSRCell { if len(boxes) == 0 { return nil } // If no real R/C annotations (maxR <= 0), fall back to YX coordinate // grouping — matching Python's construct_table when all R=-1. maxR := 0 for _, b := range boxes { if b.R > maxR { maxR = b.R } } if maxR <= 0 { return groupBoxesByYX(boxes) } // Sort by R index first (Python: sort_R_firstly), then Y, then X. sort.Slice(boxes, func(i, j int) bool { if boxes[i].R != boxes[j].R { return boxes[i].R < boxes[j].R } if boxes[i].Top != boxes[j].Top { return boxes[i].Top < boxes[j].Top } return boxes[i].X0 < boxes[j].X0 }) // Compress R indices: Python's sort_R_firstly grouping. // R differs → always a new row. Same R + Y gap → also new row. rowMap := make(map[int]int) // original R → compressed row index compressed := 0 rowMap[boxes[0].R] = 0 lastR := boxes[0].R btm := boxes[0].Bottom for i := 1; i < len(boxes); i++ { // Python: b["R"] != last_R → new row. // Same R → always same row (Python doesn't check Y for same R). 
if boxes[i].R != lastR { compressed++ rowMap[boxes[i].R] = compressed lastR = boxes[i].R btm = boxes[i].Bottom } else { // Same R → same physical row. rowMap[boxes[i].R] = compressed btm = (btm + boxes[i].Bottom) / 2.0 } } // Collect boxes per row, sort by C within each row. type rb struct { row, col int txt string x0, y0, x1, y1 float64 label string } cmap := make(map[int]map[int]*rb) // row → col → entry maxCols := make(map[int]int) for _, b := range boxes { t := strings.TrimSpace(b.Text) // Keep boxes with SP/H annotations even if text is empty — // their coordinates are needed for colspan/rowspan calculation. if t == "" && b.H <= 0 && b.SP <= 0 { continue } r := rowMap[b.R] c := b.C if cmap[r] == nil { cmap[r] = make(map[int]*rb) } x0, y0, x1, y1, label := cellPosFromBox(b) if v, ok := cmap[r][c]; ok { v.txt += " " + t // Merge spanning coordinates (use widest extent). if b.H > 0 || b.SP > 0 { v.label = cellLabelFromBox(b) if v.x0 > x0 { v.x0 = x0 } if v.y0 > y0 { v.y0 = y0 } if v.x1 < x1 { v.x1 = x1 } if v.y1 < y1 { v.y1 = y1 } } } else { cmap[r][c] = &rb{r, c, t, x0, y0, x1, y1, label} } if c > maxCols[r] { maxCols[r] = c } } // Compress C indices per row: sort boxes by X0 within the row, // group disjoint X ranges into separate columns. This is equivalent // to Python's sort_C_firstly but uses X0 ordering instead of C labels. cCompressed := make(map[int]map[int]int) // row → (original C → compressed col) cMaxCol := make(map[int]int) for ri := 0; ri <= compressed; ri++ { rowEntries := cmap[ri] if rowEntries == nil { continue } // Collect all boxes in this row, sorted by X0. 
type rowBox struct { c, idx int x0, x1 float64 txt string } var rowBoxes []rowBox for i, b := range boxes { if rowMap[b.R] == ri && (strings.TrimSpace(b.Text) != "" || b.H > 0 || b.SP > 0) { rowBoxes = append(rowBoxes, rowBox{c: b.C, idx: i, x0: b.X0, x1: b.X1, txt: b.Text}) } } sort.Slice(rowBoxes, func(i, j int) bool { return rowBoxes[i].x0 < rowBoxes[j].x0 }) // Assign compressed column by X-order (disjoint X → new col). cMap := make(map[int]int) // original C → compressed col right := 0.0 for _, rb := range rowBoxes { if len(cMap) == 0 || rb.x0 >= right { cc := len(cMap) cMap[rb.c] = cc right = rb.x1 } else { // Overlapping X → merge into last column. cMap[rb.c] = len(cMap) - 1 if rb.x1 > right { right = rb.x1 } } } cCompressed[ri] = cMap cMaxCol[ri] = len(cMap) - 1 } // Build grid. rows := make([][]TSRCell, compressed+1) for ri := 0; ri <= compressed; ri++ { maxC := cMaxCol[ri] rows[ri] = make([]TSRCell, maxC+1) for ci, v := range cmap[ri] { cci := cCompressed[ri][ci] if cci <= maxC { rows[ri][cci].Text = v.txt rows[ri][cci].X0 = v.x0 rows[ri][cci].Y0 = v.y0 rows[ri][cci].X1 = v.x1 rows[ri][cci].Y1 = v.y1 rows[ri][cci].Label = v.label } } } return rows } // cellPosFromBox returns the position coordinates and label for a cell // derived from a text box. Header cells use HLeft/HRight/HTop/HBott // for spanning-aware positions; regular cells use the box's own bounds. func cellPosFromBox(b TextBox) (x0, y0, x1, y1 float64, label string) { x0, y0, x1, y1 = b.X0, b.Top, b.X1, b.Bottom if b.H > 0 { label = "table header" if b.HLeft != 0 || b.HRight != 0 { if b.HLeft != 0 { x0 = b.HLeft } if b.HRight != 0 { x1 = b.HRight } } if b.HTop != 0 { y0 = b.HTop } if b.HBott != 0 { y1 = b.HBott } } else if b.SP > 0 { label = "table spanning cell" } return } // cellLabelFromBox returns the TSR label for a box based on H/SP annotations. // Used when merging multiple boxes into one cell — preserves the spanning label. 
func cellLabelFromBox(b TextBox) string { if b.H > 0 { return "table header" } if b.SP > 0 { return "table spanning cell" } return "" } // groupBoxesByYX groups boxes into a cell grid by Y/X coordinates, // matching Python's construct_table which uses sort_R_firstly and // sort_C_firstly when R/C annotations are absent. // This is test-only — used by table_parity_test.go to verify pipeline // parity with Python boxes that lack R/C annotations. func groupBoxesByYX(boxes []TextBox) [][]TSRCell { if len(boxes) == 0 { return nil } // Sort by (page, top, x0) — same as Python sort_R_firstly with R=-1. sort.Slice(boxes, func(i, j int) bool { if boxes[i].PageNumber != boxes[j].PageNumber { return boxes[i].PageNumber < boxes[j].PageNumber } if boxes[i].Top != boxes[j].Top { return boxes[i].Top < boxes[j].Top } return boxes[i].X0 < boxes[j].X0 }) // Group into rows by Y proximity (Python's row grouping). type rowGroup struct { boxes []TextBox top, btm float64 } var rowGroups []rowGroup rowGroups = append(rowGroups, rowGroup{boxes: []TextBox{boxes[0]}, top: boxes[0].Top, btm: boxes[0].Bottom}) for i := 1; i < len(boxes); i++ { prev := &rowGroups[len(rowGroups)-1] // Python: same row if top < prev.btm (Y overlaps) and same page. if boxes[i].PageNumber == prev.boxes[0].PageNumber && boxes[i].Top < prev.btm { prev.boxes = append(prev.boxes, boxes[i]) if boxes[i].Top < prev.top { prev.top = boxes[i].Top } if boxes[i].Bottom > prev.btm { prev.btm = boxes[i].Bottom } } else { rowGroups = append(rowGroups, rowGroup{boxes: []TextBox{boxes[i]}, top: boxes[i].Top, btm: boxes[i].Bottom}) } } // Within each row, group into columns by X proximity. rows := make([][]TSRCell, len(rowGroups)) for ri, rg := range rowGroups { // Sort by X0. sort.Slice(rg.boxes, func(i, j int) bool { return rg.boxes[i].X0 < rg.boxes[j].X0 }) // Group by X overlap. 
var cols []struct { boxes []TextBox x1 float64 } cols = append(cols, struct { boxes []TextBox x1 float64 }{boxes: []TextBox{rg.boxes[0]}, x1: rg.boxes[0].X1}) for i := 1; i < len(rg.boxes); i++ { prev := &cols[len(cols)-1] if rg.boxes[i].X0 < prev.x1 { prev.boxes = append(prev.boxes, rg.boxes[i]) if rg.boxes[i].X1 > prev.x1 { prev.x1 = rg.boxes[i].X1 } } else { cols = append(cols, struct { boxes []TextBox x1 float64 }{boxes: []TextBox{rg.boxes[i]}, x1: rg.boxes[i].X1}) } } rows[ri] = make([]TSRCell, len(cols)) for ci, col := range cols { var sb strings.Builder for _, b := range col.boxes { t := strings.TrimSpace(b.Text) if t == "" { continue } if sb.Len() > 0 { sb.WriteByte(' ') } sb.WriteString(t) } rows[ri][ci].Text = sb.String() } } return rows } func boxesHaveAnnotations(boxes []TextBox) bool { maxR, maxC := 0, 0 for _, b := range boxes { if b.R > maxR { maxR = b.R } if b.C > maxC { maxC = b.C } } // True if at least 2 rows or 2 cols (R/C are 0-based, so maxR>0 means ≥2 rows). return maxR > 0 || maxC > 0 } func hasText(rows [][]TSRCell) bool { for _, row := range rows { for _, c := range row { if strings.TrimSpace(c.Text) != "" { return true } } } return false } func rowsToStrings(rows [][]TSRCell) [][]string { out := make([][]string, len(rows)) for ri, row := range rows { out[ri] = make([]string, len(row)) for ci, c := range row { out[ri][ci] = c.Text } } return out } // fillCellTextFromAnnotations fills cell text from text boxes using R/C labels. // This matches Python's construct_table which assigns boxes to cells by their // R (row) and C (col) annotations rather than spatial overlap. func fillCellTextFromAnnotations(rows [][]TSRCell, boxes []TextBox) { // Build R→(C→text) map: row index → (col index → text). 
rBoxes := make(map[int]map[int][]string) for _, b := range boxes { if b.Text == "" { continue } if rBoxes[b.R] == nil { rBoxes[b.R] = make(map[int][]string) } rBoxes[b.R][b.C] = append(rBoxes[b.R][b.C], b.Text) } // Fill each cell from the matching R/C position. for ri, row := range rows { colMap := rBoxes[ri] if colMap == nil { continue } // Build sorted column list for positional matching. type colEntry struct { c int texts []string } var cols []colEntry for c, texts := range colMap { cols = append(cols, colEntry{c, texts}) } sort.Slice(cols, func(i, j int) bool { return cols[i].c < cols[j].c }) for ci, col := range cols { if ci < len(row) { row[ci].Text = strings.TrimSpace(strings.Join(col.texts, " ")) } } } } // dataSourceRe matches table/figure boxes that should be discarded as // data-source attribution lines rather than extracted content. // // Python: pdf_parser.py:1040-1042, 1050-1052 // // re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]) var dataSourceRe = regexp.MustCompile(`^(数据|资料|图表)*来源[:: ]`) // isDataSourceBox returns true if the box text matches the data-source // discard pattern (Python's _extract_table_figure data-source filter). func isDataSourceBox(text string) bool { return dataSourceRe.MatchString(text) } // tableRegionBox returns a TextBox for a table replacement, using DLA region // boundaries when available (Region* set), falling back to anchor box coordinates. // Python's insert_table_figures uses DLA layout region boundaries; the fallback // handles test TableItems or bare engines without DLA. func tableRegionBox(tbl *TableItem, ref *TextBox, html string) TextBox { pg := 0 if len(tbl.Positions) > 0 && len(tbl.Positions[0].PageNumbers) > 0 { pg = tbl.Positions[0].PageNumbers[0] } // Use DLA region boundaries when set. 
if tbl.RegionLeft != 0 || tbl.RegionRight != 0 || tbl.RegionTop != 0 || tbl.RegionBottom != 0 { return TextBox{ X0: tbl.RegionLeft, X1: tbl.RegionRight, Top: tbl.RegionTop, Bottom: tbl.RegionBottom, Text: html, PageNumber: pg, LayoutType: LayoutTypeTable, } } // Fallback: use anchor box coordinates. x0, x1, top, bot := ref.X0, ref.X1, ref.Top, ref.Bottom return TextBox{ X0: x0, X1: x1, Top: top, Bottom: bot, Text: html, PageNumber: pg, LayoutType: LayoutTypeTable, } } // minRectangleDistance computes the Euclidean distance between two rectangles. // Returns 0 when rectangles overlap. Matches Python's min_rectangle_distance // in insert_table_figures (pdf_parser.py:1609-1626). func minRectangleDistance(left1, right1, top1, bottom1, left2, right2, top2, bottom2 float64) float64 { if right1 >= left2 && right2 >= left1 && bottom1 >= top2 && bottom2 >= top1 { return 0 } var dx, dy float64 if right1 < left2 { dx = left2 - right1 } else if right2 < left1 { dx = left1 - right2 } if bottom1 < top2 { dy = top2 - bottom1 } else if bottom2 < top1 { dy = top1 - bottom2 } return math.Sqrt(dx*dx + dy*dy) } // extractTableAndReplace pops table boxes and replaces them with consolidated // HTML boxes (one per table). This matches Python's _extract_table_figure which // pops all boxes inside a table DLA region and inserts a single HTML box. // // Table boxes whose text matches the data-source discard pattern // (r"(数据|资料|图表)*来源[:: ]") are removed entirely without replacement — // matching Python's _extract_table_figure discard behavior. // markNoMergeTables traverses boxes in page order. When a caption, title, or // reference immediately follows a table, the preceding table is marked NoMerge // to prevent cross-page merge. Matches Python's nomerge_lout_no. 
func markNoMergeTables(boxes []TextBox, tables []TableItem) { var lastTableTI int = -1 for i := range boxes { lt := boxes[i].LayoutType if lt == LayoutTypeTable { matched := false for ti := range tables { for _, tp := range tables[ti].Positions { if boxOverlapsPosition(boxes[i], tp) { lastTableTI = ti matched = true break } } } if !matched { lastTableTI = -1 } continue } if lastTableTI >= 0 && (lt == LayoutTypeTitle || lt == DLALabelTableCaption || lt == DLALabelFigureCaption || lt == LayoutTypeReference || isCaptionBox(boxes[i].Text, lt)) { tables[lastTableTI].NoMerge = true } } } // boxes must be post-TextMerge + post-VerticalMerge. TableItem.Cells are in // crop pixel space; boxes are in PDF point space — conversion via Scale/CropOff. // replacement pairs a table index with the box index it replaces. type replacement struct { tableIdx int boxIdx int } // buildReplacements scans for data-source-attribution boxes to remove and maps // each table to overlapping table-layout boxes, producing the replacement list. func buildReplacements(boxes []TextBox, tables []TableItem) (map[int]bool, []replacement) { removeSet := make(map[int]bool) for i := range boxes { if boxes[i].LayoutType == LayoutTypeTable && isDataSourceBox(boxes[i].Text) { removeSet[i] = true } } var reps []replacement for ti := range tables { for i := range boxes { if boxes[i].LayoutType != LayoutTypeTable || removeSet[i] { continue } for _, tp := range tables[ti].Positions { if boxOverlapsPosition(boxes[i], tp) { reps = append(reps, replacement{tableIdx: ti, boxIdx: i}) break } } } } return removeSet, reps } func extractTableAndReplace(boxes []TextBox, tables []TableItem) []TextBox { if len(tables) == 0 { return boxes } // Pre-merge nomerge detection: match Python's nomerge_lout_no. // Traverse boxes in page order. When a caption/title/reference is // found, mark the preceding table group as NoMerge, preventing // cross-page merge when a caption ends a table group. 
// Python: if is_caption(c) or layout_type in ["table caption", "title", // "figure caption", "reference"]: nomerge_lout_no.append(lst_lout_no) markNoMergeTables(boxes, tables) // Merge same-layoutno tables across consecutive pages (Python _extract_table_figure). tables = mergeTablesAcrossPages(tables, nil) // Pre-scan: mark data-source-attribution table boxes for removal. // Python: if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]): // self.boxes.pop(i); continue — box discarded, no HTML replacement. removeSet, replacements := buildReplacements(boxes, tables) // Image-only PDFs (0 boxes) may have tables with cells but no // overlapping LayoutType=="table" boxes — generate HTML directly. if len(replacements) == 0 && len(boxes) == 0 { var out []TextBox for ti := range tables { if len(tables[ti].Cells) == 0 { continue } s := tables[ti].Scale pageGlobalCells := cellSliceToPageSpace(tables[ti].Cells, tables[ti].CropOffX, tables[ti].CropOffY, s) var tableBoxes []TextBox html := constructTable(pageGlobalCells, tableBoxes, tables[ti].Caption, &tables[ti]) if html != "" { out = append(out, TextBox{ Text: html, LayoutType: "table", PageNumber: 0, }) } } return out } if len(replacements) == 0 { // No HTML replacements, but data-source boxes still need removal. if len(removeSet) == 0 { return boxes } out := make([]TextBox, 0, len(boxes)-len(removeSet)) for i, b := range boxes { if !removeSet[i] { out = append(out, b) } } return out } // Distance-based anchor selection (Python's min_rectangle_distance). // Find the spatially nearest non-table text box for each table and // use that as the anchor, matching insert_table_figures behavior. 
replacedByTable := make(map[int]int) for ti := range tables { if len(tables[ti].Cells) == 0 { continue } tbl := &tables[ti] tblLeft, tblRight := tbl.RegionLeft, tbl.RegionRight tblTop, tblBottom := tbl.RegionTop, tbl.RegionBottom tblPg := 0 if len(tbl.Positions) > 0 { p := tbl.Positions[0] if len(p.PageNumbers) > 0 { tblPg = p.PageNumbers[0] } if tblLeft == 0 && tblRight == 0 && tblTop == 0 && tblBottom == 0 { tblLeft, tblRight = p.Left, p.Right tblTop, tblBottom = p.Top, p.Bottom } } bestDist := math.MaxFloat64 bestIdx := -1 for i, b := range boxes { if b.LayoutType == LayoutTypeTable || b.LayoutType == LayoutTypeFigure { continue } if b.PageNumber != tblPg { continue } dist := minRectangleDistance( b.X0, b.X1, b.Top, b.Bottom, tblLeft, tblRight, tblTop, tblBottom, ) if dist < bestDist { bestDist = dist bestIdx = i } } if bestIdx >= 0 { if boxes[bestIdx].Bottom < tblTop { bestIdx++ } replacedByTable[ti] = bestIdx } else { for _, r := range replacements { if r.tableIdx == ti { if _, ok := replacedByTable[ti]; !ok || r.boxIdx < replacedByTable[ti] { replacedByTable[ti] = r.boxIdx } } } } } for _, r := range replacements { removeSet[r.boxIdx] = true } // Build HTML for each table using post-merge boxes converted to crop space. htmlByTable := make(map[int]string) for ti := range tables { if len(tables[ti].Cells) == 0 { continue } // Convert TSR cells from crop-pixel space to page-global 72 DPI, // matching Python's coordinate space. Text boxes are already in // page-global 72 DPI (from ocrMergeChars), so no conversion needed. s := tables[ti].Scale pageGlobalCells := cellSliceToPageSpace(tables[ti].Cells, tables[ti].CropOffX, tables[ti].CropOffY, s) // Collect only table-labelled boxes (Python: filters by layout_type). 
var tableBoxes []TextBox for i := range boxes { if boxes[i].LayoutType != LayoutTypeTable { continue } for _, tp := range tables[ti].Positions { if boxOverlapsPosition(boxes[i], tp) { tableBoxes = append(tableBoxes, boxes[i]) break } } } slog.Debug("extractTableAndReplace constructTable", "table", ti, "cells", len(pageGlobalCells), "boxes", len(tableBoxes)) htmlByTable[ti] = constructTable(pageGlobalCells, tableBoxes, tables[ti].Caption, &tables[ti]) } // Sort anchors by position for stable insertion. anchorList := make([]struct{ ti, pos int }, 0, len(replacedByTable)) for ti, pos := range replacedByTable { anchorList = append(anchorList, struct{ ti, pos int }{ti, pos}) } sort.Slice(anchorList, func(i, j int) bool { return anchorList[i].pos < anchorList[j].pos }) out := make([]TextBox, 0, len(boxes)-len(removeSet)+len(replacedByTable)) anchorIdx := 0 for i, b := range boxes { // Insert any HTML boxes whose anchor position is before or at i. for anchorIdx < len(anchorList) && anchorList[anchorIdx].pos <= i { ti := anchorList[anchorIdx].ti html := htmlByTable[ti] if html != "" { tbl := &tables[ti] out = append(out, tableRegionBox(tbl, &b, html)) } anchorIdx++ } if !removeSet[i] { out = append(out, b) } } // Remaining anchors after last box. for anchorIdx < len(anchorList) { ti := anchorList[anchorIdx].ti html := htmlByTable[ti] if html != "" { tbl := &tables[ti] last := &boxes[len(boxes)-1] out = append(out, tableRegionBox(tbl, last, html)) } anchorIdx++ } return out } // consolidateFigures merges figure boxes that share the same LayoutNo // (i.e., belong to the same DLA figure region) into a single TextBox. // Matches Python's _extract_table_figure + insert_table_figures which pops // individual figure boxes and re-inserts one consolidated figure block // per DLA region with combined text. 
// // Figure boxes whose text matches the data-source discard pattern // (r"(数据|资料|图表)*来源[:: ]") are removed entirely — matching Python's // _extract_table_figure discard behavior (pdf_parser.py:1050-1052). func consolidateFigures(boxes []TextBox) []TextBox { // Pre-scan: mark data-source-attribution figure boxes for removal. // Python: if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]): // self.boxes.pop(i); continue — box discarded. removeSet := make(map[int]bool) for i, b := range boxes { if b.LayoutType == LayoutTypeFigure && isDataSourceBox(b.Text) { removeSet[i] = true } } // Group figure boxes by (page, layoutno). type figKey struct { page int ln string } groups := make(map[figKey][]int) for i, b := range boxes { if b.LayoutType != LayoutTypeFigure || removeSet[i] { continue } key := figKey{b.PageNumber, b.LayoutNo} groups[key] = append(groups[key], i) } if len(groups) == 0 { // Still need to filter out data-source figure boxes. if len(removeSet) == 0 { return boxes } out := make([]TextBox, 0, len(boxes)-len(removeSet)) for i, b := range boxes { if !removeSet[i] { out = append(out, b) } } return out } // Collect indices to remove (all group members except the first). for _, indices := range groups { if len(indices) <= 1 { continue } // Merge into the first box of the group. anchor := indices[0] for _, idx := range indices[1:] { b := boxes[idx] boxes[anchor].Text += "\n" + b.Text boxes[anchor].X0 = math.Min(boxes[anchor].X0, b.X0) boxes[anchor].X1 = math.Max(boxes[anchor].X1, b.X1) boxes[anchor].Top = math.Min(boxes[anchor].Top, b.Top) boxes[anchor].Bottom = math.Max(boxes[anchor].Bottom, b.Bottom) removeSet[idx] = true } } if len(removeSet) == 0 { return boxes } out := make([]TextBox, 0, len(boxes)-len(removeSet)) for i, b := range boxes { if !removeSet[i] { out = append(out, b) } } return out } // boxOverlapsPosition checks if a TextBox overlaps a Position with margin. 
func boxOverlapsPosition(box TextBox, pos Position) bool { const margin = 2.0 return box.X0 <= pos.Right+margin && box.X1 >= pos.Left-margin && box.Top <= pos.Bottom+margin && box.Bottom >= pos.Top-margin } // ── coordinate space conversion helpers ────────────────────────────── // cellToPageSpace converts from crop-pixel space to page-global 72-DPI space. func cellToPageSpace(c TSRCell, cropOffX, cropOffY, scale float64) TSRCell { return TSRCell{ X0: (c.X0 + cropOffX) / scale, Y0: (c.Y0 + cropOffY) / scale, X1: (c.X1 + cropOffX) / scale, Y1: (c.Y1 + cropOffY) / scale, Text: c.Text, Label: c.Label, } } // cellAddOffset applies a crop offset to cell coordinates (stays in pixel space). func cellAddOffset(c TSRCell, offX, offY float64) TSRCell { return TSRCell{ X0: c.X0 + offX, Y0: c.Y0 + offY, X1: c.X1 + offX, Y1: c.Y1 + offY, Text: c.Text, Label: c.Label, } } // cellSliceToPageSpace converts a slice of cells from crop-pixel to page DPI space. func cellSliceToPageSpace(cells []TSRCell, cropOffX, cropOffY, scale float64) []TSRCell { out := make([]TSRCell, len(cells)) for i, c := range cells { out[i] = cellToPageSpace(c, cropOffX, cropOffY, scale) } return out } // boxToCropSpace converts a TextBox from PDF-point space to crop-pixel space. func boxToCropSpace(b TextBox, scale, cropOffX, cropOffY float64) TextBox { return TextBox{ X0: b.X0*scale - cropOffX, X1: b.X1*scale - cropOffX, Top: b.Top*scale - cropOffY, Bottom: b.Bottom*scale - cropOffY, Text: b.Text, } } // copyBoxAnnotations copies the DLA/TSR annotation fields from src to dst. func copyBoxAnnotations(dst, src *TextBox) { dst.R = src.R dst.C = src.C dst.RTop = src.RTop dst.RBott = src.RBott dst.H = src.H dst.HTop = src.HTop dst.HBott = src.HBott dst.HLeft = src.HLeft dst.HRight = src.HRight dst.CLeft = src.CLeft dst.CRight = src.CRight dst.SP = src.SP } // rowsToHTML converts grouped TSR cell rows to an HTML table string. 
// spanInfo maps (row,col) → (colspan, rowspan) for spanning cells; // covered marks cells hidden by a span. Both may be nil. func rowsToHTML(rows [][]TSRCell, caption string, headerRows map[int]bool, spanInfo map[[2]int][2]int, covered map[[2]int]bool) string { var b strings.Builder b.WriteString("