Files
ragflow/internal/deepdoc/parser/pdf/parser_ocr.go
Jack 304d9e02bb Refactor: migrate pdf_parser.py to golang (#16323)
### What problem does this PR solve?

HTTP API based on an ONNX model.
Migrate pdf_parser.py to Go.

### Type of change

- [x] Refactoring
2026-06-25 20:16:16 +08:00

584 lines
16 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package parser
import (
"context"
"fmt"
"image"
"log/slog"
"math"
"sort"
"strings"
"unicode"
)
// isGarbledPage reports whether the embedded text of a page looks garbled.
//
// Pages with fewer than 20 chars are never flagged. Otherwise the Text of
// every char is concatenated and the page is garbled when any one of the
// following holds:
//   - IsGarbledText flags the page text (threshold 0.3),
//   - IsGarbledByFontEncoding flags the chars (minimum 20 chars),
//   - isScanNoise finds no word-like run in the page text.
//
// pdfOxideUnmappedGarbled is not consulted here: it used to be applied only
// in conjunction with isScanNoise, and isScanNoise alone already marks the
// page as garbled, so that branch could never change the verdict and only
// cost extra passes over the text.
func isGarbledPage(chars []TextChar) bool {
	if len(chars) < 20 {
		return false
	}

	// Build the full-page text once; every text-based detector reuses it.
	var fullText strings.Builder
	for _, c := range chars {
		fullText.WriteString(c.Text)
	}
	text := fullText.String()

	if IsGarbledText(text, 0.3) {
		return true
	}
	if IsGarbledByFontEncoding(chars, 20) {
		return true
	}
	return isScanNoise(text)
}
// isScanNoise reports whether text looks like the noise glyphs extracted
// from a scanned page rather than real text.
//
// The text is scanned once while tracking the longest run of each kind of
// character. ASCII space, tab, CR and LF end every run and are not counted
// as characters. The verdict is:
//   - false when fewer than 30 non-space characters were seen (too little
//     evidence to decide),
//   - false when at least 10% of the non-space characters are ASCII digits
//     (tables, dates, page numbers),
//   - false when any one indicator of real text is present:
//     a run of ≥4 lowercase letters (Unicode category Ll, e.g. "text"),
//     a run of ≥2 CJK characters (see isCJK), or
//     a run of ≥4 non-ASCII letters (Arabic U+0600–U+06FF,
//     Thai U+0E00–U+0E7F, Cyrillic U+0400–U+04FF, etc.),
//   - true otherwise.
//
// ASCII uppercase letters never extend a run, so uppercase-only fragments
// such as "RASB" do not count as real text.
func isScanNoise(text string) bool {
	var (
		seen, digits                    int
		lower, cjk, foreign             int // current run lengths
		bestLower, bestCJK, bestForeign int // longest runs so far
	)

	// track extends the run when hit is true and resets it otherwise,
	// remembering the longest run observed.
	track := func(hit bool, run, best *int) {
		if !hit {
			*run = 0
			return
		}
		*run++
		if *run > *best {
			*best = *run
		}
	}

	for _, r := range text {
		switch r {
		case ' ', '\t', '\n', '\r':
			// Whitespace breaks every run and is excluded from the totals.
			lower, cjk, foreign = 0, 0, 0
			continue
		}
		seen++
		if '0' <= r && r <= '9' {
			digits++
		}
		track(unicode.Is(unicode.Ll, r), &lower, &bestLower)
		track(isCJK(r), &cjk, &bestCJK)
		// ASCII letters are excluded on purpose so that uppercase Latin
		// fragments do not qualify as a letter run.
		track(unicode.IsLetter(r) && r > unicode.MaxASCII, &foreign, &bestForeign)
	}

	// Need enough characters to make a meaningful decision.
	if seen < 30 {
		return false
	}
	// A digit-dense page is treated as real content.
	if float64(digits)/float64(seen) >= 0.10 {
		return false
	}
	hasRealText := bestLower >= 4 || bestCJK >= 2 || bestForeign >= 4
	return !hasRealText
}

// isCJK reports whether r belongs to the Han, Hiragana, Katakana or Hangul
// script (Hangul covers both syllables and Jamo).
func isCJK(r rune) bool {
	return unicode.In(r, unicode.Han, unicode.Hiragana, unicode.Katakana, unicode.Hangul)
}
// pdfOxideUnmappedGarbled reports whether text shows signs of the '#'
// (U+0023) placeholder that pdf_oxide emits for glyphs it cannot map.
//
// ASCII space, tab, CR and LF are ignored entirely: they are not counted
// and they do not interrupt a run of '#'. The text is flagged when either
// condition holds:
//   - it contains at least one run of three or more '#', or
//   - it has at least 40 non-space characters and '#' accounts for at
//     least 3% of them.
func pdfOxideUnmappedGarbled(text string) bool {
	var nonSpace, hashes, run int
	for _, r := range text {
		switch r {
		case ' ', '\t', '\n', '\r':
			// Skipped: whitespace neither counts nor resets the run.
		case '#':
			nonSpace++
			hashes++
			run++
			if run >= 3 {
				// One triple cluster is enough; no need to read further.
				return true
			}
		default:
			nonSpace++
			run = 0
		}
	}
	// The density check is only meaningful with enough characters.
	if nonSpace < 40 {
		return false
	}
	return float64(hashes)/float64(nonSpace) >= 0.03
}
// ocrDetectAndRecognize runs OCR detection on pageImg and then recognition
// on every detected region, returning one TextBox per non-blank recognized
// text.
//
// Each detect quad is reduced to its axis-aligned bounding box, truncated to
// integer pixels; that box is both the crop handed to OCRRecognize and the
// geometry of the resulting TextBox, so returned coordinates are in the
// pixel space of pageImg. Regions that collapse to an empty box are skipped.
//
// logLabel prefixes the warning messages so callers can be told apart in the
// log ("scan page", "garbled page"). A detection error is logged and yields
// nil; a recognition error is logged and only that region is skipped.
func ocrDetectAndRecognize(ctx context.Context, pageImg image.Image, doc DocAnalyzer, pageNum int, logLabel string) []TextBox {
	regions, detectErr := doc.OCRDetect(ctx, pageImg)
	if detectErr != nil {
		slog.Warn(logLabel+" OCR detect failed", "page", pageNum, "err", detectErr)
		return nil
	}
	if len(regions) == 0 {
		return nil
	}

	var recognized []TextBox
	for _, region := range regions {
		left := int(math.Min(math.Min(region.X0, region.X1), math.Min(region.X2, region.X3)))
		top := int(math.Min(math.Min(region.Y0, region.Y1), math.Min(region.Y2, region.Y3)))
		right := int(math.Max(math.Max(region.X0, region.X1), math.Max(region.X2, region.X3)))
		bottom := int(math.Max(math.Max(region.Y0, region.Y1), math.Max(region.Y2, region.Y3)))
		if left >= right || top >= bottom {
			continue
		}

		lines, recErr := doc.OCRRecognize(ctx, fastCrop(pageImg, left, top, right, bottom))
		if recErr != nil {
			slog.Warn(logLabel+" OCR recognize failed", "page", pageNum, "err", recErr)
			continue
		}
		for _, line := range lines {
			if strings.TrimSpace(line.Text) == "" {
				continue
			}
			recognized = append(recognized, TextBox{
				X0:         float64(left),
				X1:         float64(right),
				Top:        float64(top),
				Bottom:     float64(bottom),
				Text:       line.Text,
				PageNumber: pageNum,
			})
		}
	}
	return recognized
}
// ocrMergeChars runs full-page OCR detection on a page that has embedded
// chars, assigns each char to the detect region it overlaps most, and runs
// OCR recognition only for regions whose embedded text is missing or garbled.
// Matches Python's __ocr: detect → match chars to boxes → use char text
// for boxes with embedded chars → OCR recognize only empty/garbled boxes.
//
// Detect coordinates are in the pixel space of pageImg and are divided by
// dlaScale so that they line up with the coordinates of chars. The returned
// boxes use those scaled coordinates, keep the fuzzy top-down order
// established below, and never carry empty Text.
//
// The result is nil when detection fails (the error is logged) or finds no
// regions. A recognition failure is logged and drops only the affected box.
func ocrMergeChars(ctx context.Context, pageImg image.Image, chars []TextChar, doc DocAnalyzer, pageNum int) []TextBox {
	detectBoxes, err := doc.OCRDetect(ctx, pageImg)
	if err != nil {
		// Surface the failure instead of silently returning no text,
		// consistent with ocrDetectAndRecognize.
		slog.Warn("ocr merge: detect failed", "page", pageNum, "err", err)
		return nil
	}
	if len(detectBoxes) == 0 {
		return nil
	}
	slog.Debug("ocrMergeChars detect", "page", pageNum, "boxes", len(detectBoxes))

	// Detect boxes are in pixel space (216 DPI). Scale to PDF space (72 DPI)
	// so coordinates match embedded chars.
	scale := dlaScale // 3.0 per the original note; defined elsewhere
	imgBounds := pageImg.Bounds()
	imgW := float64(imgBounds.Dx()) / scale
	imgH := float64(imgBounds.Dy()) / scale

	// Step 1: reduce each detect quad to an axis-aligned box in PDF space,
	// clamped to the page; quads that collapse to nothing are dropped.
	type detectBox struct {
		box            TextBox
		x0, y0, x1, y1 float64 // PDF-space bounds
	}
	boxes := make([]detectBox, 0, len(detectBoxes))
	for _, b := range detectBoxes {
		x0 := min(b.X0, b.X1, b.X2, b.X3) / scale
		y0 := min(b.Y0, b.Y1, b.Y2, b.Y3) / scale
		x1 := max(b.X0, b.X1, b.X2, b.X3) / scale
		y1 := max(b.Y0, b.Y1, b.Y2, b.Y3) / scale
		if x0 < 0 {
			x0 = 0
		}
		if y0 < 0 {
			y0 = 0
		}
		if x1 > imgW {
			x1 = imgW
		}
		if y1 > imgH {
			y1 = imgH
		}
		if x0 >= x1 || y0 >= y1 {
			continue
		}
		boxes = append(boxes, detectBox{box: TextBox{
			X0: x0, X1: x1, Top: y0, Bottom: y1, PageNumber: pageNum,
		}, x0: x0, y0: y0, x1: x1, y1: y1})
	}

	// Sort detect boxes top-down (fuzzy Y-group), matching Python's
	// Recognizer.sort_Y_firstly with threshold = median box height / 3.
	// Boxes whose tops differ by less than the threshold count as the same
	// row and are ordered left to right.
	if len(boxes) > 1 {
		boxHeights := make([]float64, len(boxes))
		for i := range boxes {
			boxHeights[i] = boxes[i].y1 - boxes[i].y0
		}
		sort.Float64s(boxHeights)
		threshold := boxHeights[len(boxHeights)/2] / 3
		sort.Slice(boxes, func(a, b int) bool {
			if math.Abs(boxes[a].y0-boxes[b].y0) < threshold {
				return boxes[a].x0 < boxes[b].x0
			}
			return boxes[a].y0 < boxes[b].y0
		})
	}

	// Step 2: match each char to the best overlapping detect box
	// (char perspective), matching Python's find_overlapped.
	boxChars := make([][]TextChar, len(boxes))
	for _, c := range chars {
		bestIdx := -1
		bestOverlap := 1e-6 // Python: thr=1e-6
		for i := range boxes {
			overlap := charBoxOverlapRatio(c, boxes[i].x0, boxes[i].x1, boxes[i].y0, boxes[i].y1)
			// ">=" means that among equally overlapping boxes the last
			// one in sort order wins.
			if overlap >= bestOverlap {
				bestOverlap = overlap
				bestIdx = i
			}
		}
		if bestIdx < 0 {
			continue
		}
		// Height gating, matching Python: skip when height differs >70%,
		// except space chars which are always kept.
		ch := c.Bottom - c.Top
		if ch <= 0 {
			ch = 1
		}
		bh := boxes[bestIdx].y1 - boxes[bestIdx].y0
		if math.Abs(ch-bh)/math.Max(ch, bh) >= 0.7 && c.Text != " " {
			continue
		}
		boxChars[bestIdx] = append(boxChars[bestIdx], c)
	}

	// Step 3: assemble text for each box. result stays index-aligned with
	// boxes so the OCR pass below can write back by box index.
	var result []TextBox
	var needOCR []int
	for i := range boxes {
		tb := boxes[i].box
		tb.Text = ""
		if len(boxChars[i]) > 0 {
			// Sort chars by reading order, matching Python's sort_Y_firstly.
			// Fuzzy Y-group: chars within median char height are "same line",
			// sorted by X; different lines sorted by Y.
			sortCharsYFirstly(boxChars[i], medianCharHeight(boxChars[i]))
			// Use lineToTextBox for correct space insertion + garbled detection.
			// lineToTextBox inserts ASCII word spaces at visible gaps —
			// matching Python's __img_ocr + __ocr char logic.
			lineBox := lineToTextBox(boxChars[i])
			tb.Text = lineBox.Text
			// Strategy 1: If majority of chars are garbled (PUA), clear text → OCR.
			var garbledCnt, totalCnt int
			for _, c := range boxChars[i] {
				for _, r := range c.Text {
					totalCnt++
					if IsGarbledChar(string(r)) {
						garbledCnt++
					}
				}
			}
			if totalCnt > 0 && float64(garbledCnt)/float64(totalCnt) >= 0.5 {
				tb.Text = ""
			}
			// Strategy 2: font-encoding garbled (subset fonts, min 5 chars).
			if tb.Text != "" && IsGarbledByFontEncoding(boxChars[i], 5) {
				tb.Text = ""
			}
		}
		// Boxes without usable embedded text are queued for the batch OCR
		// pass in step 4.
		if tb.Text == "" {
			needOCR = append(needOCR, i)
		}
		result = append(result, tb)
	}

	// Step 4: batch OCR recognize boxes without embedded chars (or garbled).
	if len(needOCR) > 0 {
		cropped := make([]image.Image, len(needOCR))
		for j, idx := range needOCR {
			// Crops are taken from the page image, so scale back to pixels.
			cropped[j] = fastCrop(pageImg,
				int(boxes[idx].x0*scale), int(boxes[idx].y0*scale),
				int(boxes[idx].x1*scale), int(boxes[idx].y1*scale))
		}
		allTexts, allErrs := doc.OCRRecognizeBatch(ctx, cropped)
		for j, idx := range needOCR {
			if j < len(allErrs) && allErrs[j] != nil {
				slog.Warn("ocr merge: recognize failed", "page", pageNum, "err", allErrs[j])
				continue
			}
			if j >= len(allTexts) {
				// The batch returned fewer results than crops; leave the
				// box empty so it is filtered out below instead of
				// panicking on an out-of-range index.
				continue
			}
			var ocrParts []string
			for _, t := range allTexts[j] {
				if strings.TrimSpace(t.Text) != "" {
					ocrParts = append(ocrParts, t.Text)
				}
			}
			result[idx].Text = strings.TrimSpace(strings.Join(ocrParts, " "))
		}
	}

	// Filter out boxes with no text (in place, preserving order).
	filtered := result[:0]
	for _, tb := range result {
		if tb.Text != "" {
			filtered = append(filtered, tb)
		}
	}
	result = filtered
	slog.Debug("ocrMergeChars result", "page", pageNum, "boxes", len(result))
	return result
}
// medianCharHeight returns the median of Bottom-Top over chars, or 0 when
// chars is empty. For an even number of chars the upper of the two middle
// heights is returned. The input slice is left untouched.
//
// NOTE(review): the previous comment said this mirrors Python's
// np.mean([c["height"]]); this function computes a median, not a mean —
// confirm that the difference is intended.
func medianCharHeight(chars []TextChar) float64 {
	n := len(chars)
	if n == 0 {
		return 0
	}
	heights := make([]float64, 0, n)
	for i := range chars {
		heights = append(heights, chars[i].Bottom-chars[i].Top)
	}
	sort.Float64s(heights)
	return heights[n/2]
}
// sortCharsYFirstly sorts chars in place into reading order, matching
// Python's Recognizer.sort_Y_firstly (recognizer.py:26-33):
//
//   - two chars whose Top values differ by less than threshold are treated
//     as being on the same line and are ordered by X0;
//   - otherwise the char with the smaller Top comes first.
//
// "Same line" is decided pairwise, so for a chain of chars whose Tops are
// each within threshold of the next the comparison is not transitive; the
// resulting order then depends on the sort's internal comparisons. The sort
// is not stable.
func sortCharsYFirstly(chars []TextChar, threshold float64) {
	before := func(i, j int) bool {
		dy := chars[i].Top - chars[j].Top
		if sameLine := math.Abs(dy) < threshold; !sameLine {
			return dy < 0
		}
		return chars[i].X0 < chars[j].X0
	}
	sort.Slice(chars, before)
}
// charBoxOverlapRatio returns how much of char c is covered by the box
// [x0,x1]×[y0,y1], as overlap divided by the char's own area (the char's
// perspective). Matches Python's Recognizer.overlapped_area(char, box,
// ratio=True).
//
// Note the parameter order: both x bounds of the box come first, then both
// y bounds. A non-positive char width or height is replaced by 1 when
// computing the char area, while the overlap itself is still computed by
// rectOverlapInter from the char's raw rectangle (presumably as the
// intersection area — the helper is defined elsewhere).
func charBoxOverlapRatio(c TextChar, x0, x1, y0, y1 float64) float64 {
	width, height := c.X1-c.X0, c.Bottom-c.Top
	if width <= 0 {
		width = 1
	}
	if height <= 0 {
		height = 1
	}
	area := width * height
	// Still reachable if the product underflows to zero; avoids dividing
	// by zero.
	if area <= 0 {
		return 0
	}
	return rectOverlapInter(c.X0, c.Top, c.X1, c.Bottom, x0, y0, x1, y1) / area
}
// ocrTableCells fills the Text of every empty cell in cells by running OCR
// recognition on the cell's region of tableImg. Cells are modified in place;
// cells that already have text are left alone.
//
// Cell coordinates are clamped to [0, width] × [0, height] of tableImg and
// truncated to integer pixels; cells that collapse to an empty region are
// skipped. Recognized parts that are blank after trimming are dropped (the
// same filter the page-level OCR helpers use), the rest are joined with a
// single space and the result is trimmed.
//
// The function is a no-op when doc or tableImg is nil or cells is empty.
// A recognition error is logged and only that cell is skipped.
func ocrTableCells(ctx context.Context, cells []TSRCell, tableImg image.Image, doc DocAnalyzer) {
	if doc == nil || tableImg == nil || len(cells) == 0 {
		return
	}

	// The image size does not change while iterating; read it once.
	bounds := tableImg.Bounds()
	imgW := float64(bounds.Dx())
	imgH := float64(bounds.Dy())

	for i := range cells {
		if cells[i].Text != "" {
			continue
		}
		x0 := int(math.Max(0, cells[i].X0))
		y0 := int(math.Max(0, cells[i].Y0))
		x1 := int(math.Min(imgW, cells[i].X1))
		y1 := int(math.Min(imgH, cells[i].Y1))
		if x0 >= x1 || y0 >= y1 {
			continue
		}

		cropped := fastCrop(tableImg, x0, y0, x1, y1)
		texts, err := doc.OCRRecognize(ctx, cropped)
		if err != nil {
			slog.Warn("table cell OCR failed", "err", err)
			continue
		}

		var parts []string
		for _, t := range texts {
			// Whitespace-only parts would otherwise leave runs of spaces
			// inside the joined cell text.
			if strings.TrimSpace(t.Text) != "" {
				parts = append(parts, t.Text)
			}
		}
		cells[i].Text = strings.TrimSpace(strings.Join(parts, " "))
	}
}
// evaluateTableOrientation tests 4 rotation angles (0/90/180/270) and picks
// the orientation in which OCR detection finds the most text regions.
//
// For every angle the image is rotated with rotateImageCW, OCRDetect is run,
// and the angle is scored as
//
//	regions * (1 + 0.06*areaRatio)
//
// where regions is the number of non-degenerate detect boxes and areaRatio
// is the summed box area divided by the image area. Region count is the
// primary signal; the area term only separates angles with equal counts.
// No per-region recognition is performed, which saves one recognize call
// per region and angle.
//
// Returns bestAngle (0/90/180/270), the image rotated by that angle (the
// original image for 0), and scores, which maps every angle to its score.
// An angle scores 0 when rotation fails, detection fails, or no usable
// region is found.
//
// Absolute threshold: when 0° has a positive score, a non-0° angle wins only
// if its score exceeds 1.4× the 0° score AND the 0° score is below 6.
// This prevents false rotation when the table is roughly upright. Ties are
// resolved in favour of the angle tried first (0, 90, 180, 270).
//
// Python: pdf_parser.py:314 _evaluate_table_orientation()
func evaluateTableOrientation(ctx context.Context, tableImg image.Image, doc DocAnalyzer) (bestAngle int, bestImg image.Image, scores map[int]float64) {
	angles := [...]int{0, 90, 180, 270}

	scores = make(map[int]float64, len(angles))
	bestScore := float64(-1)
	bestAngle = 0
	bestImg = tableImg

	for _, angle := range angles {
		rotated := tableImg
		if angle != 0 {
			rotated = rotateImageCW(tableImg, angle)
			if rotated == nil {
				slog.Warn("table rotate failed", "angle", angle)
				// Record the failure like the other failure paths so
				// that scores has an entry for every angle.
				scores[angle] = 0
				continue
			}
		}

		detectBoxes, err := doc.OCRDetect(ctx, rotated)
		if err != nil || len(detectBoxes) == 0 {
			scores[angle] = 0
			continue
		}

		// Score by detect-region count (primary) + area (tiebreaker).
		imageArea := float64(rotated.Bounds().Dx() * rotated.Bounds().Dy())
		totalRegions := 0
		var totalArea float64
		for _, box := range detectBoxes {
			x0 := math.Min(box.X0, math.Min(box.X1, math.Min(box.X2, box.X3)))
			y0 := math.Min(box.Y0, math.Min(box.Y1, math.Min(box.Y2, box.Y3)))
			x1 := math.Max(box.X0, math.Max(box.X1, math.Max(box.X2, box.X3)))
			y1 := math.Max(box.Y0, math.Max(box.Y1, math.Max(box.Y2, box.Y3)))
			if x0 >= x1 || y0 >= y1 {
				continue
			}
			totalRegions++
			totalArea += (x1 - x0) * (y1 - y0)
		}
		if totalRegions == 0 {
			scores[angle] = 0
			continue
		}

		// A zero-area image would turn the ratio into +Inf and make this
		// angle win unconditionally; treat it as "no area bonus" instead.
		areaRatio := 0.0
		if imageArea > 0 {
			areaRatio = totalArea / imageArea
		}
		combined := float64(totalRegions) * (1 + 0.06*areaRatio)
		scores[angle] = combined
		slog.Debug("table orientation",
			"angle", angle,
			"regions", totalRegions,
			"area_ratio", fmt.Sprintf("%.4f", areaRatio),
			"combined", fmt.Sprintf("%.2f", combined))

		// Strict ">" keeps the earlier angle on ties, so 0° is preferred.
		if combined > bestScore {
			bestScore = combined
			bestAngle = angle
			bestImg = rotated
		}
	}

	// Absolute threshold: only accept non-0° if its score is clearly
	// higher (> 1.4×) AND 0° has few regions (score < 6). When 0° scored
	// nothing at all, any angle with detected text is accepted as is.
	score0 := scores[0]
	if bestAngle != 0 && score0 > 0 {
		if !(bestScore > score0*1.4 && score0 < 6.0) {
			bestAngle = 0
			bestImg = tableImg
			bestScore = score0
		}
	}

	slog.Debug("best table orientation",
		"angle", bestAngle,
		"score", fmt.Sprintf("%.4f", bestScore))
	return bestAngle, bestImg, scores
}