// ragflow/internal/deepdoc/parser/pdf/parser_ocr.go

package parser

import (
	"context"
	"image"
	"log/slog"
	"math"
	lyt "ragflow/internal/deepdoc/parser/pdf/layout"
	pdf "ragflow/internal/deepdoc/parser/pdf/type"
	util "ragflow/internal/deepdoc/parser/pdf/util"
	"sort"
	"strings"
)

// ocrDetectAndRecognize runs OCR text detection over pageImg and then
// recognizes every detected region on its own.
//
// Each detected quadrilateral is reduced to its axis-aligned bounding box
// (corner coordinates truncated to integers), cropped out of pageImg with
// util.FastCrop and passed to doc.OCRRecognize. Every recognized fragment
// whose text is not blank becomes one pdf.TextBox carrying the crop
// rectangle and pageNum, so several fragments from the same region share
// identical coordinates.
//
// logLabel is used as the prefix of the warning messages. A detect failure
// is logged and ends the call with nil; a recognize failure is logged and
// only skips that region. nil is also returned when nothing is detected.
func ocrDetectAndRecognize(ctx context.Context, pageImg image.Image, doc pdf.DocAnalyzer, pageNum int, logLabel string) []pdf.TextBox {
	quads, err := doc.OCRDetect(ctx, pageImg)
	if err != nil {
		slog.Warn(logLabel+" OCR detect failed", "page", pageNum, "err", err)
		return nil
	}
	if len(quads) == 0 {
		return nil
	}

	var out []pdf.TextBox
	for _, q := range quads {
		// Axis-aligned bounding box of the four corners.
		left := int(min(q.X0, q.X1, q.X2, q.X3))
		top := int(min(q.Y0, q.Y1, q.Y2, q.Y3))
		right := int(max(q.X0, q.X1, q.X2, q.X3))
		bottom := int(max(q.Y0, q.Y1, q.Y2, q.Y3))
		if right <= left || bottom <= top {
			// Degenerate region: nothing to crop.
			continue
		}

		fragments, recErr := doc.OCRRecognize(ctx, util.FastCrop(pageImg, left, top, right, bottom))
		if recErr != nil {
			slog.Warn(logLabel+" OCR recognize failed", "page", pageNum, "err", recErr)
			continue
		}

		for _, frag := range fragments {
			if strings.TrimSpace(frag.Text) == "" {
				continue
			}
			out = append(out, pdf.TextBox{
				X0:         float64(left),
				X1:         float64(right),
				Top:        float64(top),
				Bottom:     float64(bottom),
				Text:       frag.Text,
				PageNumber: pageNum,
			})
		}
	}
	return out
}

// ocrMergeChars runs full-page text detection on a page that has embedded
// chars, merges the chars into the detected regions, and OCRs any region
// left without usable text.
// Matches Python's __ocr: detect → match chars to boxes → use char text
// for boxes with embedded chars → OCR recognize only empty/garbled boxes.
//
// Detect boxes are divided by pdf.DlaScale so they share the coordinate
// space of chars; the returned boxes use those scaled coordinates and carry
// pageNum. Boxes that end up with no text are dropped from the result.
//
// The result is nil when detection fails (logged as a warning) or detects
// nothing. A recognize failure for a single box is logged and leaves that
// box without text, so it is filtered out.
func ocrMergeChars(ctx context.Context, pageImg image.Image, chars []pdf.TextChar, doc pdf.DocAnalyzer, pageNum int) []pdf.TextBox {
	detectBoxes, err := doc.OCRDetect(ctx, pageImg)
	if err != nil {
		// Surface the failure; returning nil alone would be
		// indistinguishable from "nothing detected".
		slog.Warn("ocr merge: detect failed", "page", pageNum, "err", err)
		return nil
	}
	if len(detectBoxes) == 0 {
		return nil
	}
	slog.Debug("ocrMergeChars detect", "page", pageNum, "boxes", len(detectBoxes))

	// Detect boxes are in pixel space (216 DPI).  Scale to PDF space (72 DPI)
	// so coordinates match embedded chars.
	scale := pdf.DlaScale // 3.0
	imgBounds := pageImg.Bounds()
	imgW := float64(imgBounds.Dx()) / scale
	imgH := float64(imgBounds.Dy()) / scale

	// Step 1: convert every detect quadrilateral into an axis-aligned box in
	// PDF space, clamped to the page, dropping boxes that become empty.
	type detectBox struct {
		box            pdf.TextBox
		x0, y0, x1, y1 float64 // PDF-space bounds
	}
	boxes := make([]detectBox, 0, len(detectBoxes))
	for _, b := range detectBoxes {
		x0 := min(b.X0, b.X1, b.X2, b.X3) / scale
		y0 := min(b.Y0, b.Y1, b.Y2, b.Y3) / scale
		x1 := max(b.X0, b.X1, b.X2, b.X3) / scale
		y1 := max(b.Y0, b.Y1, b.Y2, b.Y3) / scale
		if x0 < 0 {
			x0 = 0
		}
		if y0 < 0 {
			y0 = 0
		}
		if x1 > imgW {
			x1 = imgW
		}
		if y1 > imgH {
			y1 = imgH
		}
		if x0 >= x1 || y0 >= y1 {
			continue
		}
		boxes = append(boxes, detectBox{box: pdf.TextBox{
			X0: x0, X1: x1, Top: y0, Bottom: y1, PageNumber: pageNum,
		}, x0: x0, y0: y0, x1: x1, y1: y1})
	}

	// Sort detect boxes top-down (fuzzy Y-group), matching Python's
	// Recognizer.sort_Y_firstly with threshold = median box height / 3.
	if len(boxes) > 1 {
		boxHeights := make([]float64, len(boxes))
		for i := range boxes {
			boxHeights[i] = boxes[i].y1 - boxes[i].y0
		}
		sort.Float64s(boxHeights)
		threshold := boxHeights[len(boxHeights)/2] / 3
		sort.Slice(boxes, func(a, b int) bool {
			if math.Abs(boxes[a].y0-boxes[b].y0) < threshold {
				return boxes[a].x0 < boxes[b].x0
			}
			return boxes[a].y0 < boxes[b].y0
		})
	}

	// Step 2: match each char to the best overlapping detect box
	// (char perspective), matching Python's find_overlapped.
	boxChars := make([][]pdf.TextChar, len(boxes))
	for _, c := range chars {
		bestIdx := -1
		bestOverlap := 1e-6 // Python: thr=1e-6
		for i := range boxes {
			overlap := charBoxOverlapRatio(c, boxes[i].x0, boxes[i].x1, boxes[i].y0, boxes[i].y1)
			if overlap >= bestOverlap {
				bestOverlap = overlap
				bestIdx = i
			}
		}
		if bestIdx < 0 {
			// The char overlaps no detect box; it is not used.
			continue
		}
		// Height gating, matching Python: skip when height differs >70%,
		// except space chars which are always kept.
		ch := c.Bottom - c.Top
		if ch <= 0 {
			ch = 1
		}
		bh := boxes[bestIdx].y1 - boxes[bestIdx].y0
		if math.Abs(ch-bh)/math.Max(ch, bh) >= 0.7 && c.Text != " " {
			continue
		}
		boxChars[bestIdx] = append(boxChars[bestIdx], c)
	}

	// Step 3: assemble text for each box.  result[i] always corresponds to
	// boxes[i]; indices of boxes still lacking text are collected in needOCR.
	var result []pdf.TextBox
	var needOCR []int
	for i := range boxes {
		tb := boxes[i].box
		tb.Text = ""

		if len(boxChars[i]) > 0 {
			// Sort chars by reading order, matching Python's sort_Y_firstly.
			// Fuzzy Y-group: chars within median char height are "same line",
			// sorted by X; different lines sorted by Y.
			sortCharsYFirstly(boxChars[i], util.MedianCharHeight(boxChars[i]))
			// Use lineToTextBox for correct space insertion + garbled detection.
			// lineToTextBox inserts ASCII word spaces at visible gaps —
			// matching Python's __img_ocr + __ocr char logic.
			lineBox := lyt.LineToTextBox(boxChars[i])
			tb.Text = lineBox.Text

			// Strategy 1: If majority of chars are garbled (PUA), clear text → OCR.
			var garbledCnt, totalCnt int
			for _, c := range boxChars[i] {
				for _, r := range c.Text {
					totalCnt++
					if util.IsGarbledChar(string(r)) {
						garbledCnt++
					}
				}
			}
			if totalCnt > 0 && float64(garbledCnt)/float64(totalCnt) >= 0.5 {
				tb.Text = ""
			}
			// Strategy 2: font-encoding garbled (subset fonts, min 5 chars).
			if tb.Text != "" && util.IsGarbledByFontEncoding(boxChars[i], 5) {
				tb.Text = ""
			}
		}

		// Boxes without embedded chars (or with garbled chars) go to OCR.
		if tb.Text == "" {
			needOCR = append(needOCR, i)
		}
		result = append(result, tb)
	}

	// Step 4: batch OCR recognize the boxes collected above.
	if len(needOCR) > 0 {
		cropped := make([]image.Image, len(needOCR))
		for j, idx := range needOCR {
			// Back to pixel space for cropping from pageImg.
			cropped[j] = util.FastCrop(pageImg,
				int(boxes[idx].x0*scale), int(boxes[idx].y0*scale),
				int(boxes[idx].x1*scale), int(boxes[idx].y1*scale))
		}
		allTexts, allErrs := doc.OCRRecognizeBatch(ctx, cropped)
		// Defensive: a result shorter than the request must not panic.
		// Boxes without a matching entry keep their empty text.
		if len(allTexts) != len(needOCR) || len(allErrs) != len(needOCR) {
			slog.Warn("ocr merge: recognize batch size mismatch", "page", pageNum,
				"want", len(needOCR), "texts", len(allTexts), "errs", len(allErrs))
		}
		for j, idx := range needOCR {
			if j < len(allErrs) && allErrs[j] != nil {
				slog.Warn("ocr merge: recognize failed", "page", pageNum, "err", allErrs[j])
				continue
			}
			if j >= len(allTexts) {
				continue
			}
			var ocrParts []string
			for _, t := range allTexts[j] {
				if strings.TrimSpace(t.Text) != "" {
					ocrParts = append(ocrParts, t.Text)
				}
			}
			result[idx].Text = strings.TrimSpace(strings.Join(ocrParts, " "))
		}
	}

	// Filter out boxes with no text (in place, reusing result's storage).
	filtered := result[:0]
	for _, tb := range result {
		if tb.Text != "" {
			filtered = append(filtered, tb)
		}
	}
	result = filtered
	slog.Debug("ocrMergeChars result", "page", pageNum, "boxes", len(result))
	return result
}

// sortCharsYFirstly reorders chars in place into reading order: first by
// Top, then by X0 for chars considered to be on the same line.
// Mirrors Python's Recognizer.sort_Y_firstly (recognizer.py:26-33):
//
//	|Top(a) - Top(b)| <  threshold → same line → the smaller X0 comes first.
//	otherwise                      → the smaller Top comes first.
//
// The sort is done with sort.Slice, so it is not stable.
func sortCharsYFirstly(chars []pdf.TextChar, threshold float64) {
	less := func(i, j int) bool {
		dy := chars[i].Top - chars[j].Top
		sameLine := math.Abs(dy) < threshold
		if !sameLine {
			return dy < 0
		}
		return chars[i].X0 < chars[j].X0
	}
	sort.Slice(chars, less)
}

// charBoxOverlapRatio reports how much of char c is covered by the box
// (x0, y0)-(x1, y1), as intersection_area / char_area.
// Counterpart of Python's Recognizer.overlapped_area(char, box, ratio=True).
//
// A non-positive char width or height is replaced by 1 when computing the
// char area, so the division is well defined; the intersection itself is
// still computed by util.RectOverlapInter from the char's real coordinates.
// 0 is returned when the resulting char area is not positive.
func charBoxOverlapRatio(c pdf.TextChar, x0, x1, y0, y1 float64) float64 {
	// span is the extent hi-lo, or 1 when that extent is zero or negative.
	span := func(lo, hi float64) float64 {
		d := hi - lo
		if d <= 0 {
			return 1
		}
		return d
	}

	area := span(c.X0, c.X1) * span(c.Top, c.Bottom)
	if area <= 0 {
		return 0
	}
	return util.RectOverlapInter(c.X0, c.Top, c.X1, c.Bottom, x0, y0, x1, y1) / area
}

// ocrTableCells fills empty TSR cells via OCR recognition.
//
// For every cell whose Text is empty, the cell rectangle is clamped to
// [0, width] x [0, height] of tableImg, cropped with util.FastCrop and passed
// to doc.OCRRecognize. The non-blank fragments are joined with single spaces
// and stored in the cell's Text. cells is modified in place; cells that
// already have text are left untouched.
//
// The call is a no-op when doc or tableImg is nil or cells is empty. A
// recognize failure is logged and leaves that cell empty.
//
// NOTE(review): the clamp uses Bounds().Dx()/Dy() as the upper limits, so it
// presumably expects cell coordinates in tableImg pixels with the image
// origin at (0, 0) — confirm against the caller.
func ocrTableCells(ctx context.Context, cells []pdf.TSRCell, tableImg image.Image, doc pdf.DocAnalyzer) {
	if doc == nil || tableImg == nil || len(cells) == 0 {
		return
	}

	// The image size does not change between cells; read it once.
	bounds := tableImg.Bounds()
	imgW := float64(bounds.Dx())
	imgH := float64(bounds.Dy())

	for i := range cells {
		cell := &cells[i]
		if cell.Text != "" {
			continue
		}
		x0 := int(max(0, cell.X0))
		y0 := int(max(0, cell.Y0))
		x1 := int(min(imgW, cell.X1))
		y1 := int(min(imgH, cell.Y1))
		if x0 >= x1 || y0 >= y1 {
			// Nothing left of the cell inside the image.
			continue
		}

		texts, err := doc.OCRRecognize(ctx, util.FastCrop(tableImg, x0, y0, x1, y1))
		if err != nil {
			slog.Warn("table cell OCR failed", "err", err)
			continue
		}

		parts := make([]string, 0, len(texts))
		for _, t := range texts {
			// Skip whitespace-only fragments as well as empty ones, the same
			// filter ocrMergeChars applies; joining them would leave runs of
			// spaces inside the cell text.
			if strings.TrimSpace(t.Text) != "" {
				parts = append(parts, t.Text)
			}
		}
		cell.Text = strings.TrimSpace(strings.Join(parts, " "))
	}
}