Files
ragflow/internal/deepdoc/parser/pdf/garbled.go
Jack 304d9e02bb Refactor: migrate pdf_parser.py to golang (#16323)
### What problem does this PR solve?

HTTP API based on an ONNX model.
pdf_parser.py to golang

### Type of change

- [x] Refactoring
2026-06-25 20:16:16 +08:00

227 lines
5.9 KiB
Go
Raw Blame History

package parser
import (
"regexp"
"strings"
"unicode"
)
// cidPattern matches pdfminer's CID placeholder like "(cid:123)".
//
// The expression is unanchored, so MatchString reports true when a
// placeholder occurs anywhere in the input. Optional whitespace is accepted
// between "cid" and the colon, between the colon and the digits, and between
// the digits and the closing parenthesis. Matching is case-sensitive.
//
// Python: pdf_parser.py:198 _CID_PATTERN
var cidPattern = regexp.MustCompile(`\(cid\s*:\s*\d+\s*\)`)
// subsetFontPattern recognizes the tag that prefixes the name of a PDF subset
// font, such as the "ABCDEF+" in "ABCDEF+Helvetica". It is anchored at the
// start of the name and accepts 2 to 6 uppercase ASCII letters or digits
// followed by a literal '+'.
//
// Python: pdf_parser.py:261 _has_subset_font_prefix()
var subsetFontPattern = regexp.MustCompile(`^[A-Z0-9]{2,6}\+`)

// HasSubsetFontPrefix reports whether fontname starts with a PDF subset-font
// tag as described by subsetFontPattern. The empty string never qualifies.
//
// Example:
//
//	HasSubsetFontPrefix("DY1+ZLQDm1-1") → true
//	HasSubsetFontPrefix("SimSun")       → false
//	HasSubsetFontPrefix("")             → false
//
// Python: pdf_parser.py:253 _has_subset_font_prefix()
func HasSubsetFontPrefix(fontname string) bool {
	return len(fontname) > 0 && subsetFontPattern.MatchString(fontname)
}
// IsGarbledChar reports whether a single character is garbled, i.e. it is a
// code point that normally only shows up when a PDF font's encoding could not
// be mapped to real text.
//
// ch is expected to hold exactly one character. Only its first rune is
// classified; any trailing runes are ignored. The empty string is never
// garbled. Bytes that are not valid UTF-8 decode to U+FFFD and are therefore
// reported as garbled.
//
// A character is garbled if it falls into:
//   - Private Use Areas (PUA): U+E000-U+F8FF, U+F0000-U+FFFFF, U+100000-U+10FFFF
//   - Replacement character U+FFFD
//   - C0 control characters below U+0020, except tab, newline, carriage return
//   - C1 control range U+0080-U+009F
//   - Unicode categories "Cn" (unassigned) or "Cs" (surrogate), as reported
//     by catOf
//
// Python: pdf_parser.py:201 _is_garbled_char()
//
// Example:
//
//	IsGarbledChar("\uE000") → true  (PUA)
//	IsGarbledChar("A")      → false
//	IsGarbledChar("\uFFFD") → true  (replacement char)
//	IsGarbledChar("\t")     → false (allowed control char)
//	IsGarbledChar("")       → false
func IsGarbledChar(ch string) bool {
	if ch == "" {
		return false
	}

	// Decode once and base every decision on this one rune, so that all rules
	// agree on which character is being classified.
	cp := []rune(ch)[0]

	switch {
	case cp >= 0xE000 && cp <= 0xF8FF,
		cp >= 0xF0000 && cp <= 0xFFFFF,
		cp >= 0x100000 && cp <= 0x10FFFF:
		// Private Use Areas (BMP and planes 15/16).
		return true
	case cp == 0xFFFD:
		// Replacement character.
		return true
	case cp < 0x20:
		// C0 controls are garbled unless they are ordinary text whitespace.
		// Compare the rune, not the whole string, so that a longer input
		// beginning with a tab is not misreported.
		return cp != '\t' && cp != '\n' && cp != '\r'
	case cp >= 0x80 && cp <= 0x9F:
		// C1 controls.
		return true
	}

	// Everything else is decided by its Unicode general category.
	cat := catOf(cp)
	return cat == "Cn" || cat == "Cs"
}
// IsGarbledText reports whether text looks garbled.
//
// Leading and trailing whitespace is ignored. Text that is empty after
// trimming is never garbled. Text containing a CID placeholder such as
// "(cid:123)" is always garbled. Otherwise the function counts the
// non-whitespace characters and returns true when the fraction of them that
// IsGarbledChar flags is at least threshold.
//
// Python: pdf_parser.py:229 _is_garbled_text()
//
// Example:
//
//	IsGarbledText("正常文本", 0.5)       → false
//	IsGarbledText("\uE000\uE001", 0.5) → true
//	IsGarbledText("(cid:123)", 0.5)    → true
//	IsGarbledText("", 0.5)             → false
func IsGarbledText(text string, threshold float64) bool {
	body := strings.TrimSpace(text)
	switch {
	case len(body) == 0:
		return false
	case cidPattern.MatchString(body):
		return true
	}

	var seen, bad int
	for _, r := range body {
		// Whitespace inside the text does not count towards the ratio.
		if !unicode.IsSpace(r) {
			seen++
			if IsGarbledChar(string(r)) {
				bad++
			}
		}
	}

	if seen == 0 {
		return false
	}
	ratio := float64(bad) / float64(seen)
	return ratio >= threshold
}
// IsGarbledByFontEncoding reports whether the characters of a page look like
// the output of a broken font-encoding map.
//
// Entries whose Text is empty or whitespace-only are skipped. Each remaining
// entry is classified by the first rune of its trimmed Text. The page is
// considered garbled when all of the following hold:
//   - at least minChars entries were supplied and at least minChars of them
//     are non-blank,
//   - at least 30% of the non-blank entries use a subset font
//     (see HasSubsetFontPrefix),
//   - fewer than 5% are CJK-like (CJK ideographs, Hangul, Kana, fullwidth
//     forms),
//   - more than 40% are ASCII punctuation or symbols.
//
// Python: pdf_parser.py:264 _is_garbled_by_font_encoding()
//
// Example:
//
//	chars := []TextChar{
//		{Text: "!", FontName: "DY1+SimSun"},
//		{Text: "#", FontName: "DY1+SimSun"},
//		// ... mostly ASCII punctuation with subset font prefix
//	}
//	IsGarbledByFontEncoding(chars, 20) → true // OCR needed!
func IsGarbledByFontEncoding(chars []TextChar, minChars int) bool {
	if len(chars) < minChars {
		return false
	}

	var nonBlank, fromSubset, punct, cjk int
	for i := range chars {
		glyph := strings.TrimSpace(chars[i].Text)
		if glyph == "" {
			continue
		}
		nonBlank++

		if HasSubsetFontPrefix(chars[i].FontName) {
			fromSubset++
		}

		// Classify by the leading rune of the trimmed text.
		lead := []rune(glyph)[0]
		switch {
		case lead >= 0x2E80 && lead <= 0x9FFF, // CJK radicals through unified ideographs
			lead >= 0xF900 && lead <= 0xFAFF,   // CJK compatibility ideographs
			lead >= 0x20000 && lead <= 0x2FA1F, // supplementary ideographic plane
			lead >= 0xAC00 && lead <= 0xD7AF,   // Hangul syllables
			lead >= 0x3040 && lead <= 0x30FF,   // Hiragana, Katakana
			lead >= 0xFF00 && lead <= 0xFF5E:   // fullwidth forms
			cjk++
		case lead >= 0x21 && lead <= 0x2F, // !"#$%&'()*+,-./
			lead >= 0x3A && lead <= 0x40, // :;<=>?@
			lead >= 0x5B && lead <= 0x60, // [\]^_`
			lead >= 0x7B && lead <= 0x7E: // {|}~
			punct++
		}
	}

	if nonBlank < minChars {
		return false
	}

	denom := float64(nonBlank)
	if float64(fromSubset)/denom < 0.3 {
		return false
	}
	cjkShare := float64(cjk) / denom
	punctShare := float64(punct) / denom
	return cjkShare < 0.05 && punctShare > 0.4
}
// catOf gives a coarse Unicode general category for r: "Cs" for surrogate
// code points, "Cn" for code points that belong to no assigned category, and
// "" for everything else.
//
// The intent is to mirror Python's unicodedata.category(), which yields "Cc"
// for control characters and reserves "Cn" for truly unassigned code points.
// Values at or below U+0020 are never reported as "Cn".
func catOf(r rune) string {
	switch {
	case r >= 0xD800 && r <= 0xDFFF:
		// Surrogate halves.
		return "Cs"
	case r <= 0x20, r >= 0x80 && r <= 0x9F:
		// ASCII controls/space and C1 controls: Python calls these "Cc"
		// (or "Zs" for the space), never "Cn".
		return ""
	case unicode.IsPrint(r),
		unicode.IsSpace(r),
		unicode.IsControl(r),
		unicode.In(r, unicode.Cf, unicode.Co):
		// Member of some assigned category (letters, marks, numbers,
		// punctuation, symbols, separators, controls, format, private use).
		return ""
	}
	// Not covered by any recognized category: unassigned.
	return "Cn"
}