Files
ragflow/internal/deepdoc/parser/pdf/parser.go
Jack 304d9e02bb Refactor: migrate pdf_parser.py to golang (#16323)
### What problem does this PR solve?

Http API based on onnx model.
pdf_parser.py to golang

### Type of change

- [x] Refactoring
2026-06-25 20:16:16 +08:00

1069 lines
33 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package parser
import (
"context"
"errors"
"fmt"
"image"
"log/slog"
"math"
"math/rand/v2"
"regexp"
"sort"
"strings"
"sync"
)
// Rendering resolution shared by the DeepDoc DLA/OCR stages.
const (
	// dlaDPI is the DPI at which page images are rendered for DeepDoc DLA/OCR.
	dlaDPI = 216
	// dlaScale converts PDF points (72 per inch) into DLA image pixels.
	dlaScale = dlaDPI / 72.0
)
// Parser is the main PDF text/layout extraction pipeline.
// It corresponds to RAGFlowPdfParser in pdf_parser.py.
//
// Configuration (Config, DeepDoc, SampleChars, tableBuilder) is not
// modified by the methods in this file, so one Parser can be reused for
// several documents. The debug buffers below ARE mutated while parsing
// (they are copied into the result and reset to nil), so the Parser is
// not entirely stateless.
// NOTE(review): because of those buffers, concurrent Parse calls on a
// single Parser look unsafe — confirm before sharing across goroutines.
type Parser struct {
// Config holds the pipeline options. This file reads FromPage, ToPage,
// ChunkSize, MaxOCRConcurrency, SkipOCR, SortByTop, Zoom and TableBuilder.
Config ParserConfig
// DeepDoc is the required document layout / OCR / table recognition
// service. Set at construction time by NewParser.
DeepDoc DocAnalyzer
// SampleChars samples up to n chars from a page for English detection.
// When nil, detectEnglish falls back to defaultSampleChars (random
// sampling). Inject a deterministic sampler for reproducible tests.
SampleChars SampleFunc
// tableBuilder is the TSR model adapter. NewParser takes it from
// Config.TableBuilder when that is non-nil and otherwise builds one
// with NewTableBuilderFor(DeepDoc).
tableBuilder TableBuilder
// debugDLA and debugTSR collect intermediates for comparison with Python.
// processPages copies them into ParseResult.DLADebug / TSRDebug and then
// resets both fields to nil.
debugDLA []DLAPageRegions
debugTSR []TSRRawCell
}
// PDFEngine abstracts page extraction capabilities.
// Calling code provides the implementation (pdfplumber-rs, etc.).
// All page numbers passed to the engine by this file are 0-indexed.
type PDFEngine interface {
// ExtractChars returns all characters on a page with position data.
// pageNum is 0-indexed. Parse calls it once per page during its prescan;
// an error there is logged and the page is treated as having no
// embedded characters.
ExtractChars(pageNum int) ([]TextChar, error)
// RenderPage renders a page to PNG bytes at the given DPI.
// NOTE(review): not called anywhere in this file — presumably used by
// other callers; confirm.
RenderPage(pageNum int, dpi float64) ([]byte, error)
// RenderPageImage renders a page as image.Image at the given DPI.
// Used by DeepDoc DLA/TSR/OCR which need width/height metadata.
// retryZoom calls it directly, both at the retry DPI and at dlaDPI.
RenderPageImage(pageNum int, dpi float64) (image.Image, error)
// RawData returns the original PDF bytes, used by the pdfium
// rendering path. Must return the full, unmodified PDF content.
RawData() []byte
// PageCount returns the total number of pages. Parse uses it to clamp
// the configured page range.
PageCount() (int, error)
// Close releases resources held by the engine.
// The methods in this file never call Close.
Close() error
}
// Tokenizer provides text tokenization matching rag_tokenizer.
// Used by MergeSameBullet to detect Chinese characters.
// NOTE(review): MergeSameBullet is not visible in this part of the file;
// the usage note above is carried over unverified.
type Tokenizer interface {
// Tag returns the part-of-speech tag for a single token.
Tag(token string) string // POS tag
}
// SampleFunc samples up to n characters from a page's chars,
// returning them concatenated as a single string.
//
// detectEnglish calls the sampler once per non-empty page with n = 100
// and scans the returned string for a run of ASCII characters, so the
// order of the returned characters matters.
// The default implementation (defaultSampleChars) uses random sampling.
// Tests can inject a deterministic sampler through Parser.SampleChars.
type SampleFunc func(chars []TextChar, n int) string
// NewParser builds a Parser bound to the given DeepDoc service.
//
// The table builder is taken from cfg.TableBuilder when the caller
// supplied one; otherwise it is derived from doc via NewTableBuilderFor.
// SampleChars is left nil, so English detection uses its default sampler.
func NewParser(cfg ParserConfig, doc DocAnalyzer) *Parser {
	parser := &Parser{Config: cfg, DeepDoc: doc}
	if cfg.TableBuilder != nil {
		parser.tableBuilder = cfg.TableBuilder
	} else {
		parser.tableBuilder = NewTableBuilderFor(doc)
	}
	return parser
}
// Parse runs the full PDF extraction pipeline: chars → boxes →
// column assignment → text merge → vertical merge → sections.
//
// The page range is Config.FromPage..Config.ToPage (0-indexed, inclusive).
// A negative FromPage is clamped to 0; a negative or out-of-range ToPage
// means "through the last page". An empty range yields an empty result.
//
// Before any rendering or OCR, every page in range is prescanned with
// engine.ExtractChars to make two document-wide decisions: whether the
// document is English and whether its text looks like scan noise. Pages
// whose extraction fails are logged and treated as having no chars.
//
// For documents larger than Config.ChunkSize pages (default 50, matching
// Python's batch_size), pages are processed in chunks to bound memory
// usage and the per-chunk results are concatenated, including the DLA/TSR
// debug intermediates and the summed metrics.
//
// Returns a ParseResult containing sections, tables, page images, figures,
// and pipeline stage metrics. An error is returned when the page count
// cannot be read, when ctx is cancelled, or when a chunk fails.
func (p *Parser) Parse(ctx context.Context, engine PDFEngine) (*ParseResult, error) {
	// Normalize page range
	pageCount, err := engine.PageCount()
	if err != nil {
		return nil, fmt.Errorf("page count: %w", err)
	}
	toPage := p.Config.ToPage
	if toPage < 0 || toPage >= pageCount {
		toPage = pageCount - 1
	}
	// Page numbers are 0-indexed: a negative start would be handed to the
	// engine as-is and would inflate totalPages, so clamp it.
	fromPage := max(p.Config.FromPage, 0)
	if toPage < fromPage {
		return &ParseResult{PageImages: make(map[int]image.Image)}, nil
	}
	totalPages := toPage - fromPage + 1
	chunkSize := p.Config.ChunkSize
	if chunkSize <= 0 {
		chunkSize = 50 // default, matching Python's batch_size
	}

	// ── Prescan: lightweight char extraction for language/noise detection ──
	// No rendering, no OCR — just raw chars for global decisions.
	prescanChars := make(map[int][]TextChar, totalPages)
	prescanMedianH := make(map[int]float64, totalPages)
	prescanMedianW := make(map[int]float64, totalPages)
	for pg := fromPage; pg <= toPage; pg++ {
		if err := ctx.Err(); err != nil {
			return nil, fmt.Errorf("cancelled during prescan at page %d: %w", pg, err)
		}
		chars, extractErr := engine.ExtractChars(pg)
		if extractErr != nil {
			slog.Warn("prescan: ExtractChars failed", "page", pg, "err", extractErr)
			chars = nil // skip broken pages (matching old behavior)
		}
		prescanChars[pg] = chars
		prescanMedianH[pg] = MedianCharHeight(chars)
		prescanMedianW[pg] = MedianCharWidth(chars)
	}
	isEnglish := detectEnglish(prescanChars, totalPages, p.SampleChars)
	scanNoise := isScanNoise(fullTextFromChars(prescanChars))

	// ── Small document: process all at once (no chunking overhead) ──
	if totalPages <= chunkSize {
		return p.processPages(ctx, engine, fromPage, toPage,
			prescanChars, prescanMedianH, prescanMedianW, isEnglish, scanNoise)
	}

	// ── Large document: process in chunks to bound memory ──
	slog.Info("chunked processing", "pages", totalPages, "chunkSize", chunkSize)
	result := &ParseResult{PageImages: make(map[int]image.Image)}
	for start := fromPage; start <= toPage; start += chunkSize {
		if err := ctx.Err(); err != nil {
			return nil, fmt.Errorf("cancelled at chunk starting page %d: %w", start, err)
		}
		end := min(start+chunkSize-1, toPage)

		// Slice prescan data for this chunk.
		chunkChars := make(map[int][]TextChar, end-start+1)
		chunkMH := make(map[int]float64, end-start+1)
		chunkMW := make(map[int]float64, end-start+1)
		for pg := start; pg <= end; pg++ {
			chunkChars[pg] = prescanChars[pg]
			chunkMH[pg] = prescanMedianH[pg]
			chunkMW[pg] = prescanMedianW[pg]
		}
		chunk, err := p.processPages(ctx, engine, start, end,
			chunkChars, chunkMH, chunkMW, isEnglish, scanNoise)
		if err != nil {
			return nil, err
		}

		// Merge chunk results.
		result.Sections = append(result.Sections, chunk.Sections...)
		result.Tables = append(result.Tables, chunk.Tables...)
		result.Figures = append(result.Figures, chunk.Figures...)
		for pg, img := range chunk.PageImages {
			result.PageImages[pg] = img
		}
		// processPages hands the debug intermediates to the chunk result;
		// carry them over so chunked documents do not silently lose them.
		result.DLADebug = append(result.DLADebug, chunk.DLADebug...)
		result.TSRDebug = append(result.TSRDebug, chunk.TSRDebug...)
		result.Metrics.BoxesInitial += chunk.Metrics.BoxesInitial
		result.Metrics.BoxesTextMerge += chunk.Metrics.BoxesTextMerge
		result.Metrics.BoxesVertMerge += chunk.Metrics.BoxesVertMerge
		result.Metrics.BoxesFinal += chunk.Metrics.BoxesFinal
		result.Metrics.TablesCount += chunk.Metrics.TablesCount
	}
	return result, nil
}
// extractPages turns the pages [fromPage, toPage] into text boxes.
//
// Each page takes one of two routes:
//
//   - Fast path (embedded chars present and not garbled): handled inline
//     on the calling goroutine. The page is rendered, and unless
//     Config.SkipOCR is set or rendering failed, ocrMergeChars combines
//     OCR with the embedded chars; when that yields nil the boxes come
//     from charsToBoxes instead.
//   - OCR path (no chars, or garbled chars): handled in a goroutine.
//     At most Config.MaxOCRConcurrency pages run at once (values <= 0
//     mean one at a time). The page is rendered and passed to
//     ocrDetectAndRecognize; the first rune of every OCR box is appended
//     to the page's chars. If that produced nothing, ocrMergeChars and
//     finally charsToBoxes are tried on whatever chars exist.
//
// Side effects: rendered images are stored in pageImages, and for pages
// where OCR was used medianHeights / medianWidths are recomputed from the
// page's chars.
// NOTE(review): chars synthesized on the OCR path carry only Text and
// PageNumber (no geometry), unlike retryScanNoise which also sets
// Top/Bottom — confirm the recomputed medians are meaningful.
//
// Returns the boxes in page order, the chars per page, whether any page
// used OCR, and the joined per-page errors. Partial results are returned
// even when some pages fail — callers should inspect the error for
// diagnostics but may still use the returned boxes and chars.
func (p *Parser) extractPages(ctx context.Context, engine PDFEngine,
	fromPage, toPage int,
	prescanChars map[int][]TextChar,
	medianHeights, medianWidths map[int]float64,
	pageImages map[int]image.Image,
) ([]TextBox, map[int][]TextChar, bool, error) {
	var boxes []TextBox
	pageChars := make(map[int][]TextChar)
	ocrUsedAny := false

	// pr is the outcome for one page; goroutines write only their own slot.
	type pr struct {
		pg       int
		ocrBoxes []TextBox
		chars    []TextChar
		ocrUsed  bool
		pageImg  image.Image
		err      error
	}
	pageCount := toPage - fromPage + 1
	results := make([]pr, pageCount)

	// Semaphore size: <= 0 → sequential; > 0 → bounded parallelism.
	// (Named maxConcurrent so the builtin cap is not shadowed.)
	maxConcurrent := p.Config.MaxOCRConcurrency
	if maxConcurrent <= 0 {
		maxConcurrent = 1
	}
	sem := make(chan struct{}, maxConcurrent)
	var wg sync.WaitGroup

	for i := 0; i < pageCount; i++ {
		pg := fromPage + i
		chars := prescanChars[pg]

		// Fast path: pages with embedded chars → sequential inline (no HTTP OCR).
		if len(chars) > 0 && !isGarbledPage(chars) {
			pageImg, renderErr := renderPageToImage(engine, pg)
			if renderErr == nil && pageImg != nil {
				pageImages[pg] = pageImg
			}
			var ocrBoxes []TextBox
			ocrUsed := false
			if !p.Config.SkipOCR && renderErr == nil && pageImg != nil {
				ocrBoxes = ocrMergeChars(ctx, pageImg, chars, p.DeepDoc, pg)
				if ocrBoxes == nil {
					ocrBoxes = charsToBoxes(chars, pg, p.Config.SortByTop)
				} else {
					ocrUsed = true
					ocrUsedAny = true
				}
			} else {
				ocrBoxes = charsToBoxes(chars, pg, p.Config.SortByTop)
			}
			results[i] = pr{pg: pg, ocrBoxes: ocrBoxes, chars: chars, ocrUsed: ocrUsed}
			continue
		}

		// OCR path: render + detect + recognize (potentially parallel).
		wg.Add(1)
		go func(i, pg int, chars []TextChar) {
			defer wg.Done()
			// Wait for a slot, but give up if the context ends first.
			select {
			case <-ctx.Done():
				results[i] = pr{pg: pg, err: ctx.Err()}
				return
			case sem <- struct{}{}:
			}
			defer func() { <-sem }()

			pageImg, err := renderPageToImage(engine, pg)
			if err != nil {
				results[i] = pr{pg: pg, err: err}
				return
			}
			// Check if context was cancelled during render.
			if err := ctx.Err(); err != nil {
				results[i] = pr{pg: pg, err: err}
				return
			}

			var ocrBoxes []TextBox
			ocrUsed := false
			if !p.Config.SkipOCR {
				label := "scan page"
				if len(chars) > 0 {
					label = "garbled page"
				}
				ocrBoxes = ocrDetectAndRecognize(ctx, pageImg, p.DeepDoc, pg, label)
				if ocrBoxes != nil {
					// Record the first rune of each OCR box as a char.
					for j := range ocrBoxes {
						for _, r := range ocrBoxes[j].Text {
							chars = append(chars, TextChar{Text: string(r), PageNumber: pg})
							break
						}
					}
					ocrUsed = true
				}
			}
			// Merged OCR path for pages with both embedded and OCR chars.
			if !ocrUsed && len(chars) > 0 && !p.Config.SkipOCR {
				ocrBoxes = ocrMergeChars(ctx, pageImg, chars, p.DeepDoc, pg)
				if ocrBoxes != nil {
					ocrUsed = true
				}
			}
			if !ocrUsed {
				if len(chars) > 0 {
					ocrBoxes = charsToBoxes(chars, pg, p.Config.SortByTop)
				}
			}
			results[i] = pr{pg: pg, ocrBoxes: ocrBoxes, chars: chars, ocrUsed: ocrUsed, pageImg: pageImg}
		}(i, pg, chars)
	}
	wg.Wait()

	// Merge results in page order.
	var errs []error
	for i := 0; i < pageCount; i++ {
		r := results[i]
		if r.err != nil {
			slog.Warn("page OCR failed", "page", r.pg, "err", r.err)
			errs = append(errs, fmt.Errorf("page %d: %w", r.pg, r.err))
			continue
		}
		if r.ocrUsed {
			boxes = append(boxes, r.ocrBoxes...)
			ocrUsedAny = true
		} else if len(r.ocrBoxes) > 0 {
			boxes = append(boxes, r.ocrBoxes...)
		}
		if r.pageImg != nil {
			pageImages[r.pg] = r.pageImg
		}
		pageChars[r.pg] = r.chars
		if r.ocrUsed {
			medianHeights[r.pg] = MedianCharHeight(r.chars)
			medianWidths[r.pg] = MedianCharWidth(r.chars)
		}
	}
	return boxes, pageChars, ocrUsedAny, errors.Join(errs...)
}
// retryScanNoise discards the boxes extracted so far and re-runs plain
// detect+recognize OCR on every page in [fromPage, toPage]. It is used
// when the prescan classified the document text as scan noise.
//
// For each page the cached image in pageImages is reused, or the page is
// rendered and cached when no image is present. Pages that fail to render
// or for which OCR returns nil are logged and skipped; they contribute no
// boxes and keep their previous pageChars / median entries. For every
// other page, pageChars, medianHeights and medianWidths are overwritten
// from chars made of the first rune of each OCR box.
//
// The incoming ocrUsedAny value is ignored; the third result is always
// true. The returned map is the pageChars argument itself.
func (p *Parser) retryScanNoise(ctx context.Context, engine PDFEngine,
	fromPage, toPage int,
	pageImages map[int]image.Image,
	pageChars map[int][]TextChar,
	medianHeights, medianWidths map[int]float64,
	ocrUsedAny bool,
) ([]TextBox, map[int][]TextChar, bool) {
	slog.Warn("scan noise: OCR retry", "from", fromPage, "to", toPage)

	var retried []TextBox
	for page := fromPage; page <= toPage; page++ {
		pageImg := pageImages[page]
		if pageImg == nil {
			rendered, renderErr := renderPageToImage(engine, page)
			if renderErr != nil {
				slog.Warn("scan noise: page render failed", "page", page, "err", renderErr)
				continue
			}
			pageImg = rendered
			pageImages[page] = rendered
		}

		found := ocrDetectAndRecognize(ctx, pageImg, p.DeepDoc, page, "scan page")
		if found == nil {
			slog.Warn("scan noise: page OCR empty", "page", page)
			continue
		}
		retried = append(retried, found...)

		// One synthetic char per OCR box: its first rune, with the box's
		// vertical extent.
		var firstRunes []TextChar
		for i := range found {
			for _, r := range found[i].Text {
				firstRunes = append(firstRunes, TextChar{
					Text:       string(r),
					Top:        found[i].Top,
					Bottom:     found[i].Bottom,
					PageNumber: page,
				})
				break
			}
		}
		pageChars[page] = firstRunes
		medianHeights[page] = MedianCharHeight(firstRunes)
		medianWidths[page] = MedianCharWidth(firstRunes)
	}

	slog.Debug("scan noise OCR retry complete", "pages", toPage-fromPage+1, "boxes", len(retried))
	return retried, pageChars, true
}
// retryZoom is the fallback used when the initial extraction produced no
// boxes: every page is rendered again at Config.Zoom * dlaScale (DPI =
// zoom * 72) and passed through detect+recognize OCR. Matches Python's
// __images__ retry.
//
// Resulting box coordinates are divided by newZoom / Config.Zoom so they
// land back in Config.Zoom space. The image kept in pageImages is the
// dlaDPI rendering whenever the retry DPI differs from dlaDPI and that
// second render succeeds (downstream DLA/TSR assumes dlaDPI); otherwise
// the high-resolution image is kept. OCR always sees the high-resolution
// image.
//
// Pages that fail to render are logged and skipped; pages whose OCR
// returns nil are skipped silently. The new boxes are appended to boxes,
// and the returned flag is true once any page produced OCR boxes.
func (p *Parser) retryZoom(ctx context.Context, engine PDFEngine,
	fromPage, toPage int,
	pageImages map[int]image.Image,
	boxes []TextBox, ocrUsedAny bool,
) ([]TextBox, bool) {
	baseZoom := p.Config.Zoom
	boostedZoom := baseZoom * dlaScale
	boostedDPI := boostedZoom * 72
	shrink := boostedZoom / baseZoom
	slog.Info("zoom retry: re-rendering", "oldZoom", baseZoom, "newZoom", boostedZoom)

	for page := fromPage; page <= toPage; page++ {
		hiRes, err := engine.RenderPageImage(page, boostedDPI)
		if err != nil {
			slog.Warn("zoom retry: render failed", "page", page, "err", err)
			continue
		}

		// Downstream DLA/TSR assumes dlaDPI, so prefer a standard
		// resolution rendering for the stored page image.
		stored := hiRes
		if boostedDPI != dlaDPI {
			if standard, stdErr := engine.RenderPageImage(page, dlaDPI); stdErr == nil {
				stored = standard
			}
		}
		pageImages[page] = stored

		found := ocrDetectAndRecognize(ctx, hiRes, p.DeepDoc, page, "zoom retry")
		if found == nil {
			continue
		}
		for i := range found {
			b := &found[i]
			b.X0 /= shrink
			b.X1 /= shrink
			b.Top /= shrink
			b.Bottom /= shrink
		}
		boxes = append(boxes, found...)
		ocrUsedAny = true
	}
	return boxes, ocrUsedAny
}
// buildLayout runs the DLA → TSR → Column → TextMerge → VM → Section
// pipeline and populates result.Metrics, result.Tables, result.Sections,
// and result.Figures. Matches Python's _parse_loaded_window_into_bboxes
// order.
//
// Steps, in order:
//  1. enrichWithDeepDoc produces the tables from the boxes and page images.
//  2. AssignColumn and TextMerge, then a page/vertical sort.
//  3. When any page used OCR, the English decision is recomputed from
//     pageChars, since the prescan had no chars for OCR-only pages.
//  4. NaiveVerticalMerge, table replacement and figure consolidation.
//  5. Boxes become sections, figures are collected, captions are merged.
//
// Page heights handed to boxesToSections are image height / Config.Zoom;
// pages without a usable image simply have no entry.
//
// The context is checked after the DeepDoc step and after the vertical
// merge; a cancelled context is returned as the error, leaving result
// partially populated.
func (p *Parser) buildLayout(ctx context.Context,
	result *ParseResult, engine PDFEngine,
	boxes []TextBox, pageChars map[int][]TextChar,
	medianHeights, medianWidths map[int]float64,
	fromPage, toPage int, ocrUsedAny bool, isEnglish bool,
) error {
	result.Metrics.BoxesInitial = len(boxes)
	result.Tables = p.enrichWithDeepDoc(ctx, engine, boxes, result.PageImages)
	result.Metrics.TablesCount = len(result.Tables)
	if err := ctx.Err(); err != nil {
		return err
	}

	boxes = AssignColumn(boxes, p.Config.Zoom)
	boxes = TextMerge(boxes, medianHeights, p.Config.Zoom)
	result.Metrics.BoxesTextMerge = len(boxes)
	sortByPageThenY(boxes, p.Config.SortByTop)

	if ocrUsedAny {
		isEnglish = detectEnglish(pageChars, toPage-fromPage+1, p.SampleChars)
	}
	boxes = NaiveVerticalMerge(boxes, medianHeights, medianWidths, isEnglish)
	result.Metrics.BoxesVertMerge = len(boxes)
	if err := ctx.Err(); err != nil {
		return err
	}

	boxes = extractTableAndReplace(boxes, result.Tables)
	boxes = consolidateFigures(boxes)

	pageHeights := make(map[int]float64, len(result.PageImages))
	for pg, img := range result.PageImages {
		// retryZoom stores render results without a nil check; calling
		// Bounds on a nil image would panic, so skip such entries.
		if img == nil {
			continue
		}
		pageHeights[pg] = float64(img.Bounds().Dy()) / p.Config.Zoom
	}
	result.Sections = boxesToSections(boxes, pageHeights)
	result.Metrics.BoxesFinal = len(result.Sections)
	result.Figures = CollectFigures(result.Sections)
	result.Sections = mergeCaptions(result.Sections, result.Figures)
	return nil
}
// processPages runs the full pipeline on pages [fromPage, toPage].
// prescanChars provides pre-extracted chars (avoids double extraction);
// medianHeights / medianWidths are the matching per-page medians and may
// be updated in place by the extraction stages.
//
// Stages:
//  1. extractPages — per-page OCR / char merge. Per-page failures are
//     logged and the remaining pages are still used.
//  2. retryScanNoise — only when isScanNoiseDoc is set.
//  3. retryZoom — only when there are still no boxes, Config.Zoom < 9 and
//     OCR is not disabled.
//  4. buildLayout — layout analysis, merging and section building.
//  5. Section images — table sections take the image of the table whose
//     region matches their position, figure sections try a DLA-based
//     crop, everything else is cropped from its position tag.
//
// The DLA/TSR debug buffers are handed to the result and always cleared
// from the Parser on return, including the "no boxes" and error paths.
//
// Returns an error when ctx is cancelled during extraction or when
// buildLayout fails. A page range that yields no boxes returns an
// otherwise empty result and a nil error.
func (p *Parser) processPages(ctx context.Context, engine PDFEngine,
	fromPage, toPage int,
	prescanChars map[int][]TextChar,
	medianHeights, medianWidths map[int]float64,
	isEnglish, isScanNoiseDoc bool,
) (*ParseResult, error) {
	result := &ParseResult{PageImages: make(map[int]image.Image)}
	// Whatever happens below, do not let this run's debug intermediates
	// leak into the next call.
	defer func() {
		p.debugDLA = nil
		p.debugTSR = nil
	}()

	// 1. OCR extraction — per-page detect + recognize + char merge.
	boxes, pageChars, ocrUsedAny, ocrErr := p.extractPages(ctx, engine,
		fromPage, toPage, prescanChars,
		medianHeights, medianWidths, result.PageImages)
	if ocrErr != nil {
		slog.Warn("extractPages: some pages failed OCR", "err", ocrErr)
	}
	// A cancelled context makes every pending page fail; without this
	// check that would surface as an empty result with a nil error.
	if err := ctx.Err(); err != nil {
		return nil, fmt.Errorf("extract pages %d-%d: %w", fromPage, toPage, err)
	}

	// 2. Scan noise retry — re-OCR all pages when prescan detects scan noise.
	if isScanNoiseDoc {
		boxes, pageChars, ocrUsedAny = p.retryScanNoise(ctx, engine,
			fromPage, toPage, result.PageImages,
			pageChars, medianHeights, medianWidths, ocrUsedAny)
	}

	// 3. Zoom retry — re-render at higher resolution if OCR produced zero boxes.
	if len(boxes) == 0 && p.Config.Zoom < 9 && !p.Config.SkipOCR {
		boxes, ocrUsedAny = p.retryZoom(ctx, engine, fromPage, toPage,
			result.PageImages, boxes, ocrUsedAny)
	}
	if len(boxes) == 0 {
		result.DLADebug = p.debugDLA
		result.TSRDebug = p.debugTSR
		return result, nil
	}

	// 4. Layout pipeline — DLA → TSR → Column → TextMerge → VM → Sections.
	if err := p.buildLayout(ctx, result, engine, boxes, pageChars,
		medianHeights, medianWidths, fromPage, toPage, ocrUsedAny, isEnglish); err != nil {
		return nil, fmt.Errorf("buildLayout: %w", err)
	}

	// 5. Attach an image to every section.
	if len(result.PageImages) > 0 {
		// Build lookup: "page_left_right_top_bottom" of a table region →
		// that table's base64 image.
		tableImgByRegion := make(map[string]string, len(result.Tables))
		for _, tbl := range result.Tables {
			if tbl.ImageB64 == "" {
				continue
			}
			pg := 0
			if len(tbl.Positions) > 0 && len(tbl.Positions[0].PageNumbers) > 0 {
				pg = tbl.Positions[0].PageNumbers[0]
			}
			key := fmt.Sprintf("%d_%.1f_%.1f_%.1f_%.1f",
				pg, tbl.RegionLeft, tbl.RegionRight, tbl.RegionTop, tbl.RegionBottom)
			tableImgByRegion[key] = tbl.ImageB64
		}

		for i := range result.Sections {
			// Table sections: reuse the table image when the section's
			// position matches a table region (to one decimal place).
			if result.Sections[i].LayoutType == LayoutTypeTable && len(result.Sections[i].Positions) > 0 {
				pos := result.Sections[i].Positions[0]
				pg := 0
				if len(pos.PageNumbers) > 0 {
					pg = pos.PageNumbers[0]
				}
				key := fmt.Sprintf("%d_%.1f_%.1f_%.1f_%.1f",
					pg, pos.Left, pos.Right, pos.Top, pos.Bottom)
				if img, ok := tableImgByRegion[key]; ok {
					result.Sections[i].Image = img
					continue
				}
			}
			// Try DLA-aware cropping for figure sections (matching Python's
			// cropout which uses DLA region boundaries instead of text boxes).
			if result.Sections[i].LayoutType == LayoutTypeFigure && len(result.Sections[i].Positions) > 0 {
				if dlaImg := cropSectionByDLA(result.Sections[i], p.debugDLA, result.PageImages); dlaImg != "" {
					result.Sections[i].Image = dlaImg
					continue
				}
			}
			// Everything else (and any fallthrough above) is cropped
			// from its position tag.
			img := cropSectionImage(result.Sections[i].PositionTag, result.PageImages, p.Config.Zoom)
			result.Sections[i].Image = img
			if img == "" && result.Sections[i].Text != "" {
				tag := result.Sections[i].PositionTag
				slog.Warn("cropSectionImage empty for non-empty section",
					"section", i, "posTag", tag[:min(80, len(tag))])
			}
		}
	}

	// Collect DLA/TSR debug intermediates if available; the deferred
	// function above resets them on the Parser.
	result.DLADebug = p.debugDLA
	result.TSRDebug = p.debugTSR
	return result, nil
}
// isASCIIPrintable reports whether r belongs to the character class used
// by Python's is_english regex:
//
//	[ a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]
//
// That is: space, ASCII letters and digits, and a fixed set of
// punctuation. Despite the name this is not "all printable ASCII" —
// for example '+', '=', '~', '{' and '\' are rejected — and the set
// contains one non-ASCII rune, the cedilla U+00B8, because the Python
// pattern does.
func isASCIIPrintable(r rune) bool {
	switch {
	case r == ' ':
		return true
	case 'a' <= r && r <= 'z', 'A' <= r && r <= 'Z', '0' <= r && r <= '9':
		return true
	}
	// Remaining symbols from the Python regex; \u00b8 is the cedilla.
	return strings.ContainsRune(",/\u00b8;:'[]()!@#$%^&*\"?<>._-", r)
}
// defaultSampleChars is the default SampleFunc: it picks min(n, len(chars))
// chars at random and returns their texts concatenated in the random
// order. It returns "" when n <= 0 or chars is empty.
//
// It stands in for Python's
// random.choices([c["text"] for c in page_chars], k=min(100, len(page_chars))).
// Note the difference: the full index list is shuffled and a prefix taken,
// so no char is picked twice, whereas random.choices samples with
// replacement.
func defaultSampleChars(chars []TextChar, n int) string {
	total := len(chars)
	if n <= 0 || total == 0 {
		return ""
	}

	// Shuffle the identity permutation, then read off a prefix.
	order := make([]int, total)
	for idx := range order {
		order[idx] = idx
	}
	rand.Shuffle(total, func(a, b int) {
		order[a], order[b] = order[b], order[a]
	})

	var sampled strings.Builder
	for _, idx := range order[:min(n, total)] {
		sampled.WriteString(chars[idx].Text)
	}
	return sampled.String()
}
// fullTextFromChars concatenates the text of every char on every page,
// visiting pages in ascending page-number order. The result feeds scan
// noise detection.
//
// Pages are sorted explicitly because Go randomizes map iteration order;
// without it the same document could produce differently ordered text
// (and therefore a different noise verdict) from one run to the next.
func fullTextFromChars(pageChars map[int][]TextChar) string {
	pages := make([]int, 0, len(pageChars))
	for pg := range pageChars {
		pages = append(pages, pg)
	}
	sort.Ints(pages)

	var sb strings.Builder
	for _, pg := range pages {
		for _, c := range pageChars[pg] {
			sb.WriteString(c.Text)
		}
	}
	return sb.String()
}
// detectEnglish decides whether a PDF is primarily English by a per-page
// majority vote, matching Python's is_english logic in __images__
// (pdf_parser.py:1519-1526).
//
// For every page that has chars, up to 100 char texts are drawn with the
// sampler (defaultSampleChars when sample is nil) and joined into one
// string. The page votes "English" when that string contains a run of at
// least 30 consecutive runes accepted by isASCIIPrintable.
//
// The result is true only when the number of "English" pages is strictly
// greater than totalPages/2 (integer division). totalPages is the
// denominator (len(self.page_images) in Python) and counts image-only
// pages with zero chars, so empty pages dilute the majority. It returns
// false outright when totalPages is 0 or pageChars is empty.
func detectEnglish(pageChars map[int][]TextChar, totalPages int, sample SampleFunc) bool {
	if totalPages == 0 || len(pageChars) == 0 {
		return false
	}
	sampler := sample
	if sampler == nil {
		sampler = defaultSampleChars
	}

	const (
		sampleSize = 100
		minRun     = 30
	)
	// hasLongRun reports whether text contains minRun accepted runes in a row.
	hasLongRun := func(text string) bool {
		streak := 0
		for _, r := range text {
			if !isASCIIPrintable(r) {
				streak = 0
				continue
			}
			streak++
			if streak >= minRun {
				return true
			}
		}
		return false
	}

	votes := 0
	for _, chars := range pageChars {
		if len(chars) == 0 {
			continue
		}
		if hasLongRun(sampler(chars, sampleSize)) {
			votes++
		}
	}
	return votes > totalPages/2
}
// charsToBoxes builds the initial text boxes for one page from its raw
// characters.
//
// The chars are grouped into lines (groupCharsToLines), each line is cut
// at large horizontal gaps (splitLineByXGap), and every resulting piece
// becomes one TextBox stamped with pageNum.
//
// The gap threshold is page-wide (pageXGapThreshold) as long as it is at
// most 100. Above 100 — which includes the "too few gaps" sentinel — each
// line uses its own perLineXGapThreshold instead.
//
// Returns nil for an empty input.
//
// Python: pdf_parser.__images__ producing self.boxes
func charsToBoxes(chars []TextChar, pageNum int, sortByTop bool) []TextBox {
	if len(chars) == 0 {
		return nil
	}
	lines := groupCharsToLines(chars, sortByTop)

	pageThreshold := pageXGapThreshold(lines)
	usePerLine := pageThreshold > 100

	out := make([]TextBox, 0, len(lines))
	for _, line := range lines {
		gapLimit := pageThreshold
		if usePerLine {
			gapLimit = perLineXGapThreshold(line)
		}
		for _, segment := range splitLineByXGap(line, gapLimit) {
			tb := lineToTextBox(segment)
			tb.PageNumber = pageNum
			out = append(out, tb)
		}
	}
	return out
}
// perLineXGapThreshold derives a column-split threshold from a single
// line of chars; charsToBoxes uses it when the page-wide threshold is not
// usable.
//
// The threshold is 2.5 × the median horizontal gap between neighbouring
// chars, with the median floored at 6 (so the result is never below 15).
// Lines with fewer than two chars have no gaps and get 1e9, which means
// "never split".
func perLineXGapThreshold(chars []TextChar) float64 {
	const neverSplit = 1e9
	if len(chars) < 2 {
		return neverSplit
	}

	gaps := make([]float64, len(chars)-1)
	for i := range gaps {
		gaps[i] = chars[i+1].X0 - chars[i].X1
	}
	sort.Float64s(gaps)

	median := gaps[len(gaps)/2]
	if median < 6 {
		return 6 * 2.5
	}
	return median * 2.5
}
// pageXGapThreshold derives one column-split threshold for a whole page
// from the horizontal gaps between neighbouring chars on every line.
//
// With at least 10 gaps it returns the value at index len*95/100 of the
// sorted gaps (the 95th percentile), floored at 30: only the largest
// gaps are taken to be column boundaries rather than word spacing.
//
// With fewer than 10 gaps the percentile is unreliable and 1e9 is
// returned. Callers treat any value above 100 as the signal to fall back
// to perLineXGapThreshold.
func pageXGapThreshold(lines [][]TextChar) float64 {
	gapCount := 0
	for _, line := range lines {
		if len(line) > 1 {
			gapCount += len(line) - 1
		}
	}
	if gapCount < 10 {
		return 1e9 // too few gaps for a reliable p95 → per-line fallback
	}

	gaps := make([]float64, 0, gapCount)
	for _, line := range lines {
		for i := 1; i < len(line); i++ {
			gaps = append(gaps, line[i].X0-line[i-1].X1)
		}
	}
	sort.Float64s(gaps)

	percentile95 := gaps[len(gaps)*95/100]
	if percentile95 < 30 {
		return 30 // floor: column gaps are ≥30pt in practice
	}
	return percentile95
}
// splitLineByXGap cuts one line of chars into pieces at every position
// where the horizontal gap to the previous char is at least threshold
// (a column boundary).
//
// The comparison is >= on purpose: the threshold may itself be the page's
// 95th-percentile gap, and a gap exactly that wide counts as a column
// gap, not a word gap.
//
// The pieces are sub-slices of chars (no copying). Input with fewer than
// two chars comes back as a single piece, even when it is empty.
func splitLineByXGap(chars []TextChar, threshold float64) [][]TextChar {
	if len(chars) < 2 {
		return [][]TextChar{chars}
	}

	var pieces [][]TextChar
	begin := 0
	for prev := range chars[1:] {
		next := prev + 1
		if chars[next].X0-chars[prev].X1 >= threshold {
			pieces = append(pieces, chars[begin:next])
			begin = next
		}
	}
	return append(pieces, chars[begin:])
}
// resolvePageSpan works out on which page a box ends, and at what bottom
// coordinate on that page, when the box may run past the end of the page
// it starts on.
//
// bottom is measured from the top of page pageNum. While it exceeds the
// current page's height, that height is subtracted and the span moves to
// the next page. If the next page's height is unknown or not positive,
// the span still advances onto that page once and then stops (Python:
// _line_tag while-loop break path), so the loop always terminates.
//
// The inputs are returned unchanged when pageHeights is nil, when the
// starting page has no positive height, or when the box fits on its
// starting page.
func resolvePageSpan(pageNum int, bottom float64, pageHeights map[int]float64) (toPage int, newBottom float64) {
	if pageHeights == nil {
		return pageNum, bottom
	}
	height, known := pageHeights[pageNum]
	if !known || height <= 0 || bottom <= height {
		return pageNum, bottom
	}

	lastPage, overflow := pageNum, bottom
	for overflow > height {
		overflow -= height
		lastPage++
		next, ok := pageHeights[lastPage]
		if !ok || next <= 0 {
			break
		}
		height = next
	}
	return lastPage, overflow
}
// boxesToSections turns layout boxes into sections carrying a position
// tag. Boxes whose text is empty after trimming whitespace are dropped;
// the section text is the trimmed text.
//
// pageHeights gives the PDF-point height of each page (image height /
// zoom). resolvePageSpan uses it to detect boxes that run past the end of
// their page (Python's _line_tag while-loop): those get a page-range
// position tag and list every page they cover, and their Bottom is the
// coordinate on the last page. All other boxes get a single-page tag.
//
// Python equivalent: output consumed by naive.py::chunk()
func boxesToSections(boxes []TextBox, pageHeights map[int]float64) []Section {
	out := make([]Section, 0, len(boxes))
	for i := range boxes {
		box := &boxes[i]
		text := strings.TrimSpace(box.Text)
		if text == "" {
			continue
		}

		first := box.PageNumber
		last, bottom := resolvePageSpan(first, box.Bottom, pageHeights)

		var tag string
		if last == first {
			tag = FormatPositionTag(first, box.X0, box.X1, box.Top, bottom)
		} else {
			tag = FormatPositionTagRange(first, last, box.X0, box.X1, box.Top, bottom)
		}
		pages := make([]int, 0, last-first+1)
		for pg := first; pg <= last; pg++ {
			pages = append(pages, pg)
		}

		out = append(out, Section{
			Text:        text,
			PositionTag: tag,
			LayoutType:  box.LayoutType,
			Positions: []Position{{
				PageNumbers: pages,
				Left:        box.X0,
				Right:       box.X1,
				Top:         box.Top,
				Bottom:      bottom,
			}},
		})
	}
	return out
}
// mergeCaptions folds caption sections into the figure or table they
// describe and drops the caption sections from the output. Matches Python
// _extract_table_figure caption matching (pdf_parser.py:1196-1232).
//
// A section is a caption when captionKind returns a non-empty kind for
// it; per the original notes this includes captions that DLA mislabeled
// as "text" (Python's is_caption(text) pattern matching). Its parent is
// chosen by findNearestParent. How the caption text is merged:
//
//   - parent has no text: the caption becomes the parent's text;
//   - parent is a table: the caption is put in front of the table text
//     (matching Python's caption->construct_table);
//   - otherwise: the caption is appended after a single space.
//
// Captions are removed even when no parent was found. Parents are updated
// in place in sections; the returned slice is a new one without captions.
func mergeCaptions(sections []Section, figures []Section) []Section {
	isCaption := make(map[int]bool, 4)
	for i, s := range sections {
		kind := captionKind(s)
		if kind == "" {
			continue
		}
		isCaption[i] = true

		parent := findNearestParent(i, s, sections, figures, kind)
		if parent < 0 {
			continue
		}
		dst := &sections[parent]
		switch {
		case dst.Text == "":
			dst.Text = s.Text
		case dst.LayoutType == LayoutTypeTable:
			dst.Text = s.Text + dst.Text
		default:
			dst.Text += " " + s.Text
		}
	}

	// Keep everything that was not identified as a caption, in order.
	kept := make([]Section, 0, len(sections)-len(isCaption))
	for i, s := range sections {
		if !isCaption[i] {
			kept = append(kept, s)
		}
	}
	return kept
}
// findNearestParent picks the section a caption belongs to and returns
// its index in sections, or -1 when there is no suitable parent.
//
// captionType is "table" or "figure" (from captionKind):
//
//   - figure: the nearest entry of figures is located, then mapped back
//     to the figure section in sections that has exactly the same
//     coordinates;
//   - table: the nearest section whose LayoutType is LayoutTypeTable is
//     used. Only table sections compete, so an unrelated text box that
//     happens to sit closer to the caption cannot block the match.
//
// "Nearest" is the squared Euclidean distance between the centre points
// of the first position of each section (Python
// _extract_table_figure:1196). Page numbers are not taken into account.
// Candidates at a squared distance of 40000 or more (centres 200 or more
// units apart) are rejected. Sections without positions never match, and
// captionIdx itself is never returned.
func findNearestParent(captionIdx int, caption Section, sections []Section, figures []Section, captionType string) int {
	// Compared against a SQUARED distance: 40000 = 200².
	const maxCaptionGap = 40000.0

	if len(caption.Positions) == 0 {
		return -1
	}
	cp := caption.Positions[0]
	capX := (cp.Left + cp.Right) / 2
	capY := (cp.Top + cp.Bottom) / 2

	// nearest returns the index and squared centre distance of the
	// closest eligible target, or -1 when there is none.
	nearest := func(targets []Section, eligible func(i int, s Section) bool) (int, float64) {
		bestIdx, bestDist := -1, 1e9
		for i, t := range targets {
			if len(t.Positions) == 0 || !eligible(i, t) {
				continue
			}
			tp := t.Positions[0]
			dx := (tp.Left+tp.Right)/2 - capX
			dy := (tp.Top+tp.Bottom)/2 - capY
			if dist := dx*dx + dy*dy; dist < bestDist {
				bestIdx, bestDist = i, dist
			}
		}
		return bestIdx, bestDist
	}

	if captionType == LayoutTypeFigure && len(figures) > 0 {
		// figures never contains the caption itself, so all are eligible.
		idx, dist := nearest(figures, func(int, Section) bool { return true })
		if idx >= 0 && dist < maxCaptionGap {
			// Match by position coordinates, not PositionTag strings.
			fp := figures[idx].Positions[0]
			for i, s := range sections {
				if s.LayoutType != LayoutTypeFigure || len(s.Positions) == 0 {
					continue
				}
				sp := s.Positions[0]
				if sp.Left == fp.Left && sp.Right == fp.Right &&
					sp.Top == fp.Top && sp.Bottom == fp.Bottom {
					return i
				}
			}
		}
	}

	if captionType == LayoutTypeTable {
		idx, dist := nearest(sections, func(i int, s Section) bool {
			return i != captionIdx && s.LayoutType == LayoutTypeTable
		})
		if idx >= 0 && dist < maxCaptionGap {
			return idx
		}
	}
	return -1
}
// sortByPageThenY orders boxes in place into reading order: by page
// number first, then by a vertical coordinate, then by left edge (X0).
// The vertical coordinate is Top when sortByTop is set and Bottom
// otherwise. The sort is not stable for boxes that tie on all three keys.
func sortByPageThenY(boxes []TextBox, sortByTop bool) {
	vertical := func(b *TextBox) float64 {
		if sortByTop {
			return b.Top
		}
		return b.Bottom
	}
	sort.Slice(boxes, func(i, j int) bool {
		a, b := &boxes[i], &boxes[j]
		if a.PageNumber != b.PageNumber {
			return a.PageNumber < b.PageNumber
		}
		if ya, yb := vertical(a), vertical(b); ya != yb {
			return ya < yb
		}
		return a.X0 < b.X0
	})
}
// ---- internal helpers ----
// groupCharsToLines splits a page's chars into horizontal text lines.
//
// The chars are first sorted IN PLACE (the caller's slice is reordered)
// by a vertical key — Top when sortByTop is set, Bottom otherwise — and
// then by X0. The sort is stable. Chars whose key is NaN go after all
// others, and ties (including NaN versus NaN) fall back to X0.
//
// The sorted chars are then walked once: a char joins the current line
// when verticalOverlap says it is level with the line's last char, and
// starts a new line otherwise. Each line is a freshly allocated slice.
//
// Returns nil for empty input.
func groupCharsToLines(chars []TextChar, sortByTop bool) [][]TextChar {
	if len(chars) == 0 {
		return nil
	}

	vertical := func(c TextChar) float64 {
		if sortByTop {
			return c.Top
		}
		return c.Bottom
	}
	sort.SliceStable(chars, func(i, j int) bool {
		ki, kj := vertical(chars[i]), vertical(chars[j])
		iNaN, jNaN := math.IsNaN(ki), math.IsNaN(kj)
		switch {
		case iNaN != jNaN:
			return jNaN // non-NaN before NaN
		case !iNaN && ki != kj:
			return ki < kj
		default:
			return chars[i].X0 < chars[j].X0
		}
	})

	var lines [][]TextChar
	current := []TextChar{chars[0]}
	for _, c := range chars[1:] {
		if verticalOverlap(current[len(current)-1], c) {
			current = append(current, c)
			continue
		}
		lines = append(lines, current)
		current = []TextChar{c}
	}
	return append(lines, current)
}
// verticalOverlap reports whether two chars sit on the same text line.
//
// Only the Top coordinates are compared: the chars are level when their
// tops differ by less than half the height of the taller one. When
// neither char has a positive height, a height of 1 is assumed.
func verticalOverlap(a, b TextChar) bool {
	tallest := math.Max(CharHeight(a), CharHeight(b))
	if tallest <= 0 {
		tallest = 1.0
	}
	tolerance := tallest * 0.5
	return math.Abs(a.Top-b.Top) < tolerance
}
// asciiWordPattern matches strings made up entirely of ASCII word
// characters (digits, letters and , . : ; ! %).
// Python: pdf_parser.py:1528 re.match(r"[0-9a-zA-Z,.:;!%]+", ...)
// Python's re.match only anchors at the start; the original notes argue
// that the stricter full-string match used here is equivalent in practice
// because each TextChar.Text is a single rune, so the tested pair is at
// most two characters long.
var asciiWordPattern = regexp.MustCompile(`^[0-9a-zA-Z,.:;!%]+$`)

// lineToTextBox collapses one line of chars into a single TextBox.
//
// The box is the bounding rectangle of all chars and its text is the
// chars' texts in order. A single space is inserted between two
// neighbouring chars when both are non-blank, both consist of ASCII word
// characters, and the horizontal gap between them is at least half the
// width of the narrower one (Python: pdf_parser.py:1524-1532 __img_ocr
// space insertion). LayoutType and LayoutNo are taken from the last char
// that has them set. An empty line yields the zero TextBox.
func lineToTextBox(chars []TextChar) TextBox {
	if len(chars) == 0 {
		return TextBox{}
	}

	// wantsSpace reports whether a visible word gap separates prev and cur.
	wantsSpace := func(prev, cur TextChar) bool {
		left := strings.TrimSpace(prev.Text)
		right := strings.TrimSpace(cur.Text)
		if left == "" || right == "" {
			return false
		}
		gap := cur.X0 - prev.X1
		narrower := math.Min(cur.X1-cur.X0, prev.X1-prev.X0)
		return gap >= narrower/2 && asciiWordPattern.MatchString(left+right)
	}

	first := chars[0]
	box := TextBox{X0: first.X0, X1: first.X1, Top: first.Top, Bottom: first.Bottom}

	var text strings.Builder
	for i, c := range chars {
		if i > 0 && wantsSpace(chars[i-1], c) {
			text.WriteString(" ")
		}
		text.WriteString(c.Text)

		box.X0 = math.Min(box.X0, c.X0)
		box.X1 = math.Max(box.X1, c.X1)
		box.Top = math.Min(box.Top, c.Top)
		box.Bottom = math.Max(box.Bottom, c.Bottom)

		if c.LayoutType != "" {
			box.LayoutType = c.LayoutType
		}
		if c.LayoutNo != "" {
			box.LayoutNo = c.LayoutNo
		}
	}
	box.Text = text.String()
	return box
}