// ragflow/internal/deepdoc/parser/pdf/garbled.go

package parser

import (
	"regexp"
	"strings"
	"unicode"
)

// cidPattern matches pdfminer's CID placeholder, e.g. "(cid:123)".
//
// Optional whitespace is accepted on either side of the colon and after the
// digits, so "(cid : 123 )" matches as well. The pattern is not anchored, so
// it finds a placeholder anywhere inside the text it is applied to.
//
// Python: pdf_parser.py:198 _CID_PATTERN
var cidPattern = regexp.MustCompile(`\(cid\s*:\s*\d+\s*\)`)

// subsetFontPattern recognises the tag that prefixes a subset font name,
// such as "ABCDEF+" in "ABCDEF+Arial": two to six characters drawn from
// A-Z and 0-9, anchored at the start of the name and terminated by '+'.
//
// Python: pdf_parser.py:261 _has_subset_font_prefix()
var subsetFontPattern = regexp.MustCompile(`^[A-Z0-9]{2,6}\+`)

// HasSubsetFontPrefix reports whether fontname starts with a PDF subset
// font tag (see subsetFontPattern). Only the beginning of the name is
// examined; a '+' further into the name does not count.
//
// Example:
//
//	HasSubsetFontPrefix("DY1+ZLQDm1-1") → true
//	HasSubsetFontPrefix("SimSun")       → false
//	HasSubsetFontPrefix("")             → false
//
// Python: pdf_parser.py:253 _has_subset_font_prefix()
func HasSubsetFontPrefix(fontname string) bool {
	// An empty name can never carry a tag, so skip the matcher for it.
	return fontname != "" && subsetFontPattern.MatchString(fontname)
}

// IsGarbledChar reports whether a single character is garbled, i.e. it is a
// code point that typically results from an unmappable PDF font encoding.
//
// ch is expected to hold exactly one character. Only its first rune is
// classified; any further runes are ignored. An empty string is never
// garbled. Bytes that are not valid UTF-8 decode to U+FFFD and are therefore
// reported as garbled.
//
// A character is garbled if it falls into:
//   - Private Use Areas (PUA): U+E000-U+F8FF, U+F0000-U+FFFFF, U+100000-U+10FFFF
//   - Replacement character U+FFFD
//   - C0 control characters below U+0020 (except tab, newline, carriage return)
//   - C1 control range U+0080-U+009F
//   - Unicode categories "Cn" (unassigned) or "Cs" (surrogate), per catOf
//
// Python: pdf_parser.py:201 _is_garbled_char()
//
// Example:
//
//	IsGarbledChar("\uE000") → true  (PUA)
//	IsGarbledChar("A")      → false
//	IsGarbledChar("\uFFFD") → true  (replacement char)
//	IsGarbledChar("\t")     → false (allowed control char)
//	IsGarbledChar("")       → false
func IsGarbledChar(ch string) bool {
	if ch == "" {
		return false
	}
	// Every rule below is applied to the same leading rune. Comparing the
	// whole string against "\t"/"\n"/"\r" (as was done previously) flagged
	// inputs such as "\r\n" as garbled, and scanning later runes for the
	// category rule only made multi-rune results inconsistent.
	r := []rune(ch)[0]

	switch {
	case r >= 0xE000 && r <= 0xF8FF,
		r >= 0xF0000 && r <= 0xFFFFF,
		r >= 0x100000 && r <= 0x10FFFF:
		// Private Use Areas (BMP and planes 15/16).
		return true
	case r == 0xFFFD:
		// Replacement character.
		return true
	case r < 0x20:
		// C0 controls are garbled unless they are ordinary whitespace.
		return r != '\t' && r != '\n' && r != '\r'
	case r >= 0x80 && r <= 0x9F:
		// C1 control range.
		return true
	}

	// Fall back to the Unicode general category: unassigned or surrogate.
	cat := catOf(r)
	return cat == "Cn" || cat == "Cs"
}

// IsGarbledText reports whether text looks garbled.
//
// Leading and trailing whitespace is stripped first; text that is empty
// afterwards is never garbled. Text containing a CID placeholder such as
// "(cid:123)" is garbled regardless of threshold. Otherwise every
// non-whitespace rune is classified with IsGarbledChar and the text is
// garbled when the garbled fraction is greater than or equal to threshold.
//
// Python: pdf_parser.py:229 _is_garbled_text()
//
// Example:
//
//	IsGarbledText("正常文本", 0.5)         → false
//	IsGarbledText("\uE000\uE001", 0.5) → true
//	IsGarbledText("(cid:123)", 0.5)    → true
//	IsGarbledText("", 0.5)             → false
func IsGarbledText(text string, threshold float64) bool {
	body := strings.TrimSpace(text)
	if len(body) == 0 {
		return false
	}
	// One CID placeholder is enough to condemn the whole string.
	if cidPattern.MatchString(body) {
		return true
	}

	var counted, bad int
	for _, r := range body {
		switch {
		case unicode.IsSpace(r):
			// Whitespace takes no part in the ratio.
		case IsGarbledChar(string(r)):
			counted++
			bad++
		default:
			counted++
		}
	}
	if counted == 0 {
		return false
	}
	ratio := float64(bad) / float64(counted)
	return ratio >= threshold
}

// IsGarbledByFontEncoding reports whether a set of characters looks garbled
// because of broken font encoding mappings.
//
// Entries whose Text is empty after trimming whitespace are skipped. Each
// remaining entry is classified by the first rune of its trimmed Text. The
// result is true only when all of the following hold:
//   - at least minChars entries were supplied and at least minChars remain
//     after skipping whitespace-only entries,
//   - at least 30% of the remaining entries use a subset font
//     (see HasSubsetFontPrefix),
//   - fewer than 5% are CJK / Hangul / Kana / fullwidth characters,
//   - more than 40% are ASCII punctuation or symbols.
//
// Python: pdf_parser.py:264 _is_garbled_by_font_encoding()
//
// Example:
//
//	chars := []TextChar{
//	  {Text: "!", FontName: "DY1+SimSun"},
//	  {Text: "#", FontName: "DY1+SimSun"},
//	  // ... mostly ASCII punctuation with subset font prefix
//	}
//	IsGarbledByFontEncoding(chars, 20) → true  // OCR needed!
func IsGarbledByFontEncoding(chars []TextChar, minChars int) bool {
	// Too few entries to draw any conclusion.
	if len(chars) < minChars {
		return false
	}

	var (
		nonSpace   int // entries with non-whitespace text
		fromSubset int // ... of which the font name carries a subset tag
		cjk        int // ... of which the first rune is CJK-like
		punct      int // ... of which the first rune is ASCII punctuation/symbol
	)

	for i := range chars {
		glyph := strings.TrimSpace(chars[i].Text)
		if len(glyph) == 0 {
			continue
		}
		nonSpace++

		if HasSubsetFontPrefix(chars[i].FontName) {
			fromSubset++
		}

		// Classification looks at the leading rune only.
		first := []rune(glyph)[0]
		switch {
		case first >= 0x2E80 && first <= 0x9FFF, // CJK radicals through unified ideographs
			first >= 0xF900 && first <= 0xFAFF, // CJK compatibility ideographs
			first >= 0x20000 && first <= 0x2FA1F, // CJK extension B and beyond
			first >= 0xAC00 && first <= 0xD7AF, // Hangul syllables
			first >= 0x3040 && first <= 0x30FF, // Hiragana, Katakana
			first >= 0xFF00 && first <= 0xFF5E: // fullwidth forms
			cjk++
		case first >= 0x21 && first <= 0x2F, // !"#$%&'()*+,-./
			first >= 0x3A && first <= 0x40, // :;<=>?@
			first >= 0x5B && first <= 0x60, // [\]^_`
			first >= 0x7B && first <= 0x7E: // {|}~
			punct++
		}
	}

	// Whitespace-only entries may have pushed the sample below the minimum.
	if nonSpace < minChars {
		return false
	}

	denom := float64(nonSpace)
	if float64(fromSubset)/denom < 0.3 {
		return false
	}

	lowCJK := float64(cjk)/denom < 0.05
	highPunct := float64(punct)/denom > 0.4
	return lowCJK && highPunct
}

// catOf gives a coarse Unicode general category for r, distinguishing only
// the two values its caller cares about:
//   - "Cs" for surrogate code points (U+D800-U+DFFF),
//   - "Cn" for code points that belong to no recognised category
//     (unassigned code points and noncharacters),
//   - ""   for everything else.
//
// Control characters, including the C1 range U+0080-U+009F, yield "" rather
// than "Cn". This mirrors Python's unicodedata.category(), which reports
// "Cc" for controls and "Cn" only for truly unassigned code points.
func catOf(r rune) string {
	switch {
	case r >= 0xD800 && r <= 0xDFFF:
		// Surrogate halves.
		return "Cs"
	case r <= 0x20:
		// Space, C0 controls and anything below are never "Cn".
		return ""
	case r >= 0x80 && r <= 0x9F:
		// C1 controls are "Cc" in Python, not "Cn".
		return ""
	case unicode.IsPrint(r),
		unicode.IsSpace(r),
		unicode.IsControl(r),
		unicode.In(r, unicode.Cf, unicode.Co):
		// Letters, marks, numbers, punctuation, symbols, separators,
		// controls, format characters and private-use code points.
		return ""
	}
	// Nothing claimed the rune, so it is unassigned.
	return "Cn"
}