mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
### What problem does this PR solve? HTTP API based on an ONNX model; ports pdf_parser.py to Go. ### Type of change - [x] Refactoring
227 lines
5.9 KiB
Go
227 lines
5.9 KiB
Go
package parser
|
||
|
||
import (
|
||
"regexp"
|
||
"strings"
|
||
"unicode"
|
||
)
|
||
|
||
// Precompiled patterns used by the garbled-text heuristics in this file.
var (
	// cidPattern finds pdfminer's CID placeholder, e.g. "(cid:123)".
	// Optional whitespace is accepted around the colon and after the digits.
	//
	// Python: pdf_parser.py:198 _CID_PATTERN
	cidPattern = regexp.MustCompile(`\(cid\s*:\s*\d+\s*\)`)

	// subsetFontPattern finds a PDF subset-font tag at the very start of a
	// font name: two to six characters drawn from A-Z and 0-9, immediately
	// followed by '+', e.g. "ABCDEF+".
	//
	// Python: pdf_parser.py:261 _has_subset_font_prefix()
	subsetFontPattern = regexp.MustCompile(`^[A-Z0-9]{2,6}\+`)
)
|
||
|
||
// HasSubsetFontPrefix checks if a font name has a PDF subset prefix.
|
||
//
|
||
// Example:
|
||
//
|
||
// HasSubsetFontPrefix("DY1+ZLQDm1-1") → true
|
||
// HasSubsetFontPrefix("SimSun") → false
|
||
// HasSubsetFontPrefix("") → false
|
||
//
|
||
// Python: pdf_parser.py:253 _has_subset_font_prefix()
|
||
func HasSubsetFontPrefix(fontname string) bool {
|
||
if fontname == "" {
|
||
return false
|
||
}
|
||
return subsetFontPattern.MatchString(fontname)
|
||
}
|
||
|
||
// IsGarbledChar checks if a single character is garbled (unmappable from PDF font encoding).
|
||
//
|
||
// A character is garbled if it falls into:
|
||
// - Private Use Areas (PUA): U+E000-U+F8FF, U+F0000-U+FFFFF, U+100000-U+10FFFF
|
||
// - Replacement character U+FFFD
|
||
// - Control characters (except tab, newline, carriage return)
|
||
// - C1 control range U+0080-U+009F
|
||
// - Unicode categories "Cn" (unassigned) or "Cs" (surrogate)
|
||
//
|
||
// Python: pdf_parser.py:201 _is_garbled_char()
|
||
//
|
||
// Example:
|
||
//
|
||
// IsGarbledChar("") → true (PUA)
|
||
// IsGarbledChar("A") → false
|
||
// IsGarbledChar("<22>") → true (replacement char)
|
||
// IsGarbledChar("") → false
|
||
func IsGarbledChar(ch string) bool {
|
||
if ch == "" {
|
||
return false
|
||
}
|
||
// Always use the actual rune value (handles multi-byte UTF-8 correctly)
|
||
runes := []rune(ch)
|
||
cp := int(runes[0])
|
||
|
||
// Private Use Area
|
||
if (cp >= 0xE000 && cp <= 0xF8FF) ||
|
||
(cp >= 0xF0000 && cp <= 0xFFFFF) ||
|
||
(cp >= 0x100000 && cp <= 0x10FFFF) {
|
||
return true
|
||
}
|
||
// Replacement character
|
||
if cp == 0xFFFD {
|
||
return true
|
||
}
|
||
// Control characters (except \t \n \r)
|
||
if cp < 0x20 && ch != "\t" && ch != "\n" && ch != "\r" {
|
||
return true
|
||
}
|
||
// C1 control range
|
||
if cp >= 0x80 && cp <= 0x9F {
|
||
return true
|
||
}
|
||
|
||
// Check Unicode category for each rune
|
||
for _, r := range ch {
|
||
cat := catOf(rune(r))
|
||
if cat == "Cn" || cat == "Cs" {
|
||
return true
|
||
}
|
||
}
|
||
return false
|
||
}
|
||
|
||
// IsGarbledText checks if a text string contains too many garbled characters.
|
||
// Also detects CID placeholder patterns like "(cid:123)".
|
||
//
|
||
// Python: pdf_parser.py:229 _is_garbled_text()
|
||
//
|
||
// Example:
|
||
//
|
||
// IsGarbledText("正常文本", 0.5) → false
|
||
// IsGarbledText("", 0.5) → true
|
||
// IsGarbledText("(cid:123)", 0.5) → true
|
||
// IsGarbledText("", 0.5) → false
|
||
func IsGarbledText(text string, threshold float64) bool {
|
||
trimmed := strings.TrimSpace(text)
|
||
if trimmed == "" {
|
||
return false
|
||
}
|
||
if cidPattern.MatchString(trimmed) {
|
||
return true
|
||
}
|
||
|
||
garbledCount := 0
|
||
total := 0
|
||
for _, r := range trimmed {
|
||
if unicode.IsSpace(r) {
|
||
continue
|
||
}
|
||
total++
|
||
if IsGarbledChar(string(r)) {
|
||
garbledCount++
|
||
}
|
||
}
|
||
if total == 0 {
|
||
return false
|
||
}
|
||
return float64(garbledCount)/float64(total) >= threshold
|
||
}
|
||
|
||
// IsGarbledByFontEncoding detects if a page's text is garbled due to
|
||
// broken font encoding mappings.
|
||
//
|
||
// Detection: if ≥30% of characters come from subset fonts AND
|
||
// <5% are CJK/Hangul/Kana AND >40% are ASCII punctuation/symbols,
|
||
// the page is likely garbled.
|
||
//
|
||
// Python: pdf_parser.py:264 _is_garbled_by_font_encoding()
|
||
//
|
||
// Example:
|
||
//
|
||
// chars := []TextChar{
|
||
// {Text: "!", FontName: "DY1+SimSun"},
|
||
// {Text: "#", FontName: "DY1+SimSun"},
|
||
// // ... mostly ASCII punctuation with subset font prefix
|
||
// }
|
||
// IsGarbledByFontEncoding(chars, 20) → true // OCR needed!
|
||
func IsGarbledByFontEncoding(chars []TextChar, minChars int) bool {
|
||
if len(chars) < minChars {
|
||
return false
|
||
}
|
||
|
||
subsetFontCount := 0
|
||
totalNonSpace := 0
|
||
asciiPunctSym := 0
|
||
cjkLike := 0
|
||
|
||
for _, c := range chars {
|
||
text := strings.TrimSpace(c.Text)
|
||
if text == "" {
|
||
continue
|
||
}
|
||
totalNonSpace++
|
||
|
||
if HasSubsetFontPrefix(c.FontName) {
|
||
subsetFontCount++
|
||
}
|
||
|
||
// Always use the rune value
|
||
runes := []rune(text)
|
||
cp := int(runes[0])
|
||
|
||
// CJK Unified Ideographs, CJK Compatibility, CJK Extension B
|
||
// Hangul syllables, Hiragana, Katakana
|
||
// Fullwidth forms (U+FF00-U+FF5E): legitimate CJK typographic characters
|
||
if (cp >= 0x2E80 && cp <= 0x9FFF) ||
|
||
(cp >= 0xF900 && cp <= 0xFAFF) ||
|
||
(cp >= 0x20000 && cp <= 0x2FA1F) ||
|
||
(cp >= 0xAC00 && cp <= 0xD7AF) ||
|
||
(cp >= 0x3040 && cp <= 0x30FF) ||
|
||
(cp >= 0xFF00 && cp <= 0xFF5E) {
|
||
cjkLike++
|
||
} else if (cp >= 0x21 && cp <= 0x2F) || // !"#$%&'()*+,-./
|
||
(cp >= 0x3A && cp <= 0x40) || // :;<=>?@
|
||
(cp >= 0x5B && cp <= 0x60) || // [\]^_`
|
||
(cp >= 0x7B && cp <= 0x7E) { // {|}~
|
||
asciiPunctSym++
|
||
}
|
||
}
|
||
|
||
if totalNonSpace < minChars {
|
||
return false
|
||
}
|
||
|
||
subsetRatio := float64(subsetFontCount) / float64(totalNonSpace)
|
||
if subsetRatio < 0.3 {
|
||
return false
|
||
}
|
||
|
||
cjkRatio := float64(cjkLike) / float64(totalNonSpace)
|
||
punctRatio := float64(asciiPunctSym) / float64(totalNonSpace)
|
||
|
||
return cjkRatio < 0.05 && punctRatio > 0.4
|
||
}
|
||
|
||
// catOf gives a coarse Unicode general category for r: "Cs" for surrogate
// code points, "Cn" for code points that belong to no recognised category
// (unassigned), and "" for everything else.
//
// This mirrors the only distinction the callers need from Python's
// unicodedata.category(), which reports "Cc" for control characters
// (including the C1 range U+0080-U+009F) and "Cn" only for truly unassigned
// code points.
func catOf(r rune) string {
	switch {
	case r >= 0xD800 && r <= 0xDFFF:
		return "Cs" // surrogate
	case r <= 0x20:
		// Space and everything below it is never reported as unassigned.
		return ""
	case r >= 0x80 && r <= 0x9F:
		// C1 controls are "Cc" in Python, not "Cn".
		return ""
	case unicode.IsPrint(r), unicode.IsSpace(r), unicode.IsControl(r):
		return ""
	case unicode.Is(unicode.Cf, r), unicode.Is(unicode.Co, r):
		// Format and private-use code points are assigned categories.
		return ""
	}
	// Not covered by any of the categories above: treat as unassigned.
	return "Cn"
}
|