mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-03 09:11:59 +08:00
### What problem does this PR solve? Package refactor and PDF post process. ### Type of change - [x] Refactoring --------- Co-authored-by: Claude <noreply@anthropic.com>
398 lines
10 KiB
Go
398 lines
10 KiB
Go
package util
|
||
|
||
import (
|
||
"regexp"
|
||
"strings"
|
||
"unicode"
|
||
|
||
pdf "ragflow/internal/deepdoc/parser/pdf/type"
|
||
)
|
||
|
||
// Precompiled patterns shared by the garbled-text detectors in this file.
var (
	// CIDPattern finds pdfminer's CID placeholder, e.g. "(cid:123)".
	// Whitespace is tolerated around the colon and before the closing
	// parenthesis. The pattern is unanchored, so it matches anywhere in
	// the input.
	//
	// Python: pdf_parser.py:198 _CID_PATTERN
	CIDPattern = regexp.MustCompile(`\(cid\s*:\s*\d+\s*\)`)

	// subsetFontPattern finds a PDF subset-font tag at the very start of a
	// font name: two to six uppercase letters or digits followed by '+',
	// e.g. "ABCDEF+".
	//
	// Python: pdf_parser.py:261 _has_subset_font_prefix()
	subsetFontPattern = regexp.MustCompile(`^[A-Z0-9]{2,6}\+`)
)
|
||
|
||
// HasSubsetFontPrefix checks if a font name has a PDF subset prefix.
|
||
//
|
||
// Example:
|
||
//
|
||
// HasSubsetFontPrefix("DY1+ZLQDm1-1") → true
|
||
// HasSubsetFontPrefix("SimSun") → false
|
||
// HasSubsetFontPrefix("") → false
|
||
//
|
||
// Python: pdf_parser.py:253 _has_subset_font_prefix()
|
||
func HasSubsetFontPrefix(fontname string) bool {
|
||
if fontname == "" {
|
||
return false
|
||
}
|
||
return subsetFontPattern.MatchString(fontname)
|
||
}
|
||
|
||
// IsGarbledChar checks if a single character is garbled (unmappable from PDF font encoding).
|
||
//
|
||
// A character is garbled if it falls into:
|
||
// - Private Use Areas (PUA): U+E000-U+F8FF, U+F0000-U+FFFFF, U+100000-U+10FFFF
|
||
// - Replacement character U+FFFD
|
||
// - Control characters (except tab, newline, carriage return)
|
||
// - C1 control range U+0080-U+009F
|
||
// - Unicode categories "Cn" (unassigned) or "Cs" (surrogate)
|
||
//
|
||
// Python: pdf_parser.py:201 _is_garbled_char()
|
||
//
|
||
// Example:
|
||
//
|
||
// IsGarbledChar("") → true (PUA)
|
||
// IsGarbledChar("A") → false
|
||
// IsGarbledChar("<22>") → true (replacement char)
|
||
// IsGarbledChar("") → false
|
||
func IsGarbledChar(ch string) bool {
|
||
if ch == "" {
|
||
return false
|
||
}
|
||
// Always use the actual rune value (handles multi-byte UTF-8 correctly)
|
||
runes := []rune(ch)
|
||
cp := int(runes[0])
|
||
|
||
// Private Use Area
|
||
if (cp >= 0xE000 && cp <= 0xF8FF) ||
|
||
(cp >= 0xF0000 && cp <= 0xFFFFF) ||
|
||
(cp >= 0x100000 && cp <= 0x10FFFF) {
|
||
return true
|
||
}
|
||
// Replacement character
|
||
if cp == 0xFFFD {
|
||
return true
|
||
}
|
||
// Control characters (except \t \n \r)
|
||
if cp < 0x20 && ch != "\t" && ch != "\n" && ch != "\r" {
|
||
return true
|
||
}
|
||
// C1 control range
|
||
if cp >= 0x80 && cp <= 0x9F {
|
||
return true
|
||
}
|
||
|
||
// Check Unicode category for each rune
|
||
for _, r := range ch {
|
||
cat := catOf(rune(r))
|
||
if cat == "Cn" || cat == "Cs" {
|
||
return true
|
||
}
|
||
}
|
||
return false
|
||
}
|
||
|
||
// IsGarbledText checks if a text string contains too many garbled characters.
|
||
// Also detects CID placeholder patterns like "(cid:123)".
|
||
//
|
||
// Python: pdf_parser.py:229 _is_garbled_text()
|
||
//
|
||
// Example:
|
||
//
|
||
// IsGarbledText("正常文本", 0.5) → false
|
||
// IsGarbledText("", 0.5) → true
|
||
// IsGarbledText("(cid:123)", 0.5) → true
|
||
// IsGarbledText("", 0.5) → false
|
||
func IsGarbledText(text string, threshold float64) bool {
|
||
trimmed := strings.TrimSpace(text)
|
||
if trimmed == "" {
|
||
return false
|
||
}
|
||
if CIDPattern.MatchString(trimmed) {
|
||
return true
|
||
}
|
||
|
||
garbledCount := 0
|
||
total := 0
|
||
for _, r := range trimmed {
|
||
if unicode.IsSpace(r) {
|
||
continue
|
||
}
|
||
total++
|
||
if IsGarbledChar(string(r)) {
|
||
garbledCount++
|
||
}
|
||
}
|
||
if total == 0 {
|
||
return false
|
||
}
|
||
return float64(garbledCount)/float64(total) >= threshold
|
||
}
|
||
|
||
// IsGarbledByFontEncoding detects if a page's text is garbled due to
|
||
// broken font encoding mappings.
|
||
//
|
||
// Detection: if ≥30% of characters come from subset fonts AND
|
||
// <5% are CJK/Hangul/Kana AND >40% are ASCII punctuation/symbols,
|
||
// the page is likely garbled.
|
||
//
|
||
// Python: pdf_parser.py:264 _is_garbled_by_font_encoding()
|
||
//
|
||
// Example:
|
||
//
|
||
// chars := []pdf.TextChar{
|
||
// {Text: "!", FontName: "DY1+SimSun"},
|
||
// {Text: "#", FontName: "DY1+SimSun"},
|
||
// // ... mostly ASCII punctuation with subset font prefix
|
||
// }
|
||
// IsGarbledByFontEncoding(chars, 20) → true // OCR needed!
|
||
func IsGarbledByFontEncoding(chars []pdf.TextChar, minChars int) bool {
|
||
if len(chars) < minChars {
|
||
return false
|
||
}
|
||
|
||
subsetFontCount := 0
|
||
totalNonSpace := 0
|
||
asciiPunctSym := 0
|
||
cjkLike := 0
|
||
|
||
for _, c := range chars {
|
||
text := strings.TrimSpace(c.Text)
|
||
if text == "" {
|
||
continue
|
||
}
|
||
totalNonSpace++
|
||
|
||
if HasSubsetFontPrefix(c.FontName) {
|
||
subsetFontCount++
|
||
}
|
||
|
||
// Always use the rune value
|
||
runes := []rune(text)
|
||
cp := int(runes[0])
|
||
|
||
// CJK Unified Ideographs, CJK Compatibility, CJK Extension B
|
||
// Hangul syllables, Hiragana, Katakana
|
||
// Fullwidth forms (U+FF00-U+FF5E): legitimate CJK typographic characters
|
||
if (cp >= 0x2E80 && cp <= 0x9FFF) ||
|
||
(cp >= 0xF900 && cp <= 0xFAFF) ||
|
||
(cp >= 0x20000 && cp <= 0x2FA1F) ||
|
||
(cp >= 0xAC00 && cp <= 0xD7AF) ||
|
||
(cp >= 0x3040 && cp <= 0x30FF) ||
|
||
(cp >= 0xFF00 && cp <= 0xFF5E) {
|
||
cjkLike++
|
||
} else if (cp >= 0x21 && cp <= 0x2F) || // !"#$%&'()*+,-./
|
||
(cp >= 0x3A && cp <= 0x40) || // :;<=>?@
|
||
(cp >= 0x5B && cp <= 0x60) || // [\]^_`
|
||
(cp >= 0x7B && cp <= 0x7E) { // {|}~
|
||
asciiPunctSym++
|
||
}
|
||
}
|
||
|
||
if totalNonSpace < minChars {
|
||
return false
|
||
}
|
||
|
||
subsetRatio := float64(subsetFontCount) / float64(totalNonSpace)
|
||
if subsetRatio < 0.3 {
|
||
return false
|
||
}
|
||
|
||
cjkRatio := float64(cjkLike) / float64(totalNonSpace)
|
||
punctRatio := float64(asciiPunctSym) / float64(totalNonSpace)
|
||
|
||
return cjkRatio < 0.05 && punctRatio > 0.4
|
||
}
|
||
|
||
// catOf gives a coarse Unicode category for r: "Cs" for surrogate code
// points, "Cn" for code points that belong to no recognised category
// (unassigned), and "" for everything else.
//
// It mirrors the part of Python's unicodedata.category() that the garbled
// character check relies on: control characters are "Cc" there, not "Cn", so
// they yield "" here.
func catOf(r rune) string {
	switch {
	case r >= 0xD800 && r <= 0xDFFF:
		// Surrogate range.
		return "Cs"
	case r <= 0x20:
		// Space and everything below it is never reported as unassigned.
		return ""
	case r >= 0x80 && r <= 0x9F:
		// C1 controls: Python returns "Cc", not "Cn".
		return ""
	case unicode.IsPrint(r),
		unicode.IsSpace(r),
		unicode.IsControl(r),
		unicode.Is(unicode.Cf, r),
		unicode.Is(unicode.Co, r):
		// Printable, whitespace, control, format or private-use: assigned.
		return ""
	}
	// Not in any recognised category: treat as unassigned.
	return "Cn"
}
|
||
|
||
// IsGarbledPage returns true if a page is garbled by PUA ratio, font encoding,
|
||
// pdf_oxide unmapped glyphs, or scan noise (no real words).
|
||
func IsGarbledPage(chars []pdf.TextChar) bool {
|
||
if len(chars) < 20 {
|
||
return false
|
||
}
|
||
// Build full-page text for detection (all O(n) single pass).
|
||
var fullText strings.Builder
|
||
for _, c := range chars {
|
||
fullText.WriteString(c.Text)
|
||
}
|
||
text := fullText.String()
|
||
if IsGarbledText(text, 0.3) {
|
||
return true
|
||
}
|
||
if PdfOxideUnmappedGarbled(text) && IsScanNoise(text) {
|
||
return true
|
||
}
|
||
if IsGarbledByFontEncoding(chars, 20) {
|
||
return true
|
||
}
|
||
if IsScanNoise(text) {
|
||
return true
|
||
}
|
||
return false
|
||
}
|
||
|
||
// IsScanNoise detects scanned pages where pdf_oxide extracts noise glyphs
|
||
// instead of real text. Real text in any language contains word-like runs
|
||
// of consecutive letters (L category). Scan noise consists of random ASCII
|
||
// symbols with at most 2-letter fragments.
|
||
//
|
||
// Three indicators of real (non-noise) text, any one is sufficient:
|
||
// - ≥4 consecutive lowercase Latin letters (e.g. "the", "and")
|
||
// - ≥2 consecutive CJK characters (Han, Hiragana, Katakana, Hangul)
|
||
// - ≥4 consecutive non-ASCII letters (Arabic, Thai, Cyrillic, etc.)
|
||
//
|
||
// Pure-uppercase fragments like "RASB" are common in pdf_oxide noise but
|
||
// never appear as standalone words in real text without lowercase context.
|
||
func IsScanNoise(text string) bool {
|
||
nonSpace := 0
|
||
digitCount := 0
|
||
lowerRun := 0
|
||
maxLowerRun := 0
|
||
cjkRun := 0
|
||
maxCJKRun := 0
|
||
nonASCIILetterRun := 0
|
||
maxNonASCIILetterRun := 0
|
||
|
||
for _, r := range text {
|
||
if r == ' ' || r == '\t' || r == '\n' || r == '\r' {
|
||
lowerRun = 0
|
||
cjkRun = 0
|
||
nonASCIILetterRun = 0
|
||
continue
|
||
}
|
||
nonSpace++
|
||
|
||
// Digit density: real content (tables, dates) has digits;
|
||
// pdf_oxide noise (unmapped glyphs) never produces digits.
|
||
if r >= '0' && r <= '9' {
|
||
digitCount++
|
||
}
|
||
|
||
// Lowercase Latin (Ll)
|
||
if unicode.Is(unicode.Ll, r) {
|
||
lowerRun++
|
||
if lowerRun > maxLowerRun {
|
||
maxLowerRun = lowerRun
|
||
}
|
||
} else {
|
||
lowerRun = 0
|
||
}
|
||
|
||
// CJK: Han, Hiragana, Katakana, Hangul Syllables & Jamo
|
||
if pdf.IsCJK(r) {
|
||
cjkRun++
|
||
if cjkRun > maxCJKRun {
|
||
maxCJKRun = cjkRun
|
||
}
|
||
} else {
|
||
cjkRun = 0
|
||
}
|
||
|
||
// Non-ASCII letter (Arabic U+0600–U+06FF, Thai U+0E00–U+0E7F,
|
||
// Cyrillic U+0400–U+04FF, etc.). Excludes ASCII so uppercase
|
||
// Latin fragments like "RASB" don't count.
|
||
if unicode.IsLetter(r) && r > unicode.MaxASCII {
|
||
nonASCIILetterRun++
|
||
if nonASCIILetterRun > maxNonASCIILetterRun {
|
||
maxNonASCIILetterRun = nonASCIILetterRun
|
||
}
|
||
} else {
|
||
nonASCIILetterRun = 0
|
||
}
|
||
}
|
||
|
||
// Need enough characters to make a meaningful decision.
|
||
if nonSpace < 30 {
|
||
return false
|
||
}
|
||
|
||
// Digit density: pdf_oxide never substitutes digits for unmapped
|
||
// glyphs. Real content (tables, dates, page numbers) has ≥10%
|
||
// digits; noise consists of random ASCII punctuation.
|
||
if float64(digitCount)/float64(nonSpace) >= 0.10 {
|
||
return false
|
||
}
|
||
|
||
// Real text in any script — any one indicator is sufficient.
|
||
isNoise := maxLowerRun < 4 && maxCJKRun < 2 && maxNonASCIILetterRun < 4
|
||
|
||
return isNoise
|
||
}
|
||
|
||
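// NOTE(review): stale comment — no isCJK function follows it. IsScanNoise
// calls pdf.IsCJK instead, which presumably covers Han ideographs, Hiragana,
// Katakana, Hangul syllables and Hangul Jamo; confirm against that package.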
|
||
|
||
// PdfOxideUnmappedGarbled reports whether text contains pdf_oxide's '#'
// placeholder glyphs. pdf_oxide uses '#' (U+0023) for every glyph it cannot
// map, so consecutive unmapped glyphs form "##", "###", "####" sequences.
//
// ASCII space, tab, newline and carriage return are ignored entirely: they
// are not counted and they do not interrupt a run of '#'.
//
// Either condition is sufficient:
//   - at least one run of three or more consecutive '#'
//   - at least 40 non-space characters, of which '#' makes up ≥ 3%
func PdfOxideUnmappedGarbled(text string) bool {
	var hashes, nonSpace, streak int

	for _, r := range text {
		switch r {
		case ' ', '\t', '\n', '\r':
			// Skipped: neither counted nor treated as a run breaker.
		case '#':
			nonSpace++
			hashes++
			streak++
			if streak == 3 {
				// A triple run settles the answer on its own.
				return true
			}
		default:
			nonSpace++
			streak = 0
		}
	}

	// The density check is only meaningful with enough characters.
	return nonSpace >= 40 && float64(hashes)/float64(nonSpace) >= 0.03
}
|
||
|
||
// ocrDetectAndRecognize runs OCR detection + recognition and returns
|
||
// recognized pdf.TextBox results. logLabel distinguishes callers in log output
|
||
// ("scan page", "garbled page").
|