mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-03 17:21:59 +08:00
Fix: PDF page count detection for compressed PDFs (#16487)
This commit is contained in:
@@ -41,6 +41,7 @@ import (
|
||||
|
||||
"ragflow/internal/common"
|
||||
"ragflow/internal/dao"
|
||||
"ragflow/internal/deepdoc/parser/pdf/pdfoxide"
|
||||
"ragflow/internal/engine"
|
||||
"ragflow/internal/engine/redis"
|
||||
enginetypes "ragflow/internal/engine/types"
|
||||
@@ -1617,6 +1618,10 @@ func documentParseTaskRanges(doc *entity.Document, bucket, objectName string) ([
|
||||
}
|
||||
}
|
||||
if len(ranges) == 0 {
|
||||
// pages == 0 means page count detection failed (e.g. compressed
|
||||
// PDF where both the regex scan and the pdfoxide fallback failed). Fall back
|
||||
// to maximumTaskPageNumber so the Python parser processes all
|
||||
// pages via slicing (Python gracefully caps at actual page count).
|
||||
ranges = append(ranges, documentParsePageRange{from: 0, to: maximumTaskPageNumber})
|
||||
}
|
||||
return ranges, nil
|
||||
@@ -1754,7 +1759,20 @@ func documentEstimatePDFPageCount(binary []byte) int64 {
|
||||
if len(binary) == 0 {
|
||||
return 0
|
||||
}
|
||||
return int64(len(documentPDFPagePattern.FindAll(binary, -1)))
|
||||
// Fast path: regex works for uncompressed PDFs.
|
||||
count := int64(len(documentPDFPagePattern.FindAll(binary, -1)))
|
||||
if count > 0 {
|
||||
return count
|
||||
}
|
||||
// Fallback for compressed PDFs where /Type /Page is inside a
|
||||
// compressed object stream: use pdf_oxide to get the real page count.
|
||||
if doc, err := pdfoxide.OpenBytes(binary); err == nil {
|
||||
defer doc.Close()
|
||||
if pages, err := doc.PageCount(); err == nil {
|
||||
return int64(pages)
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func documentEstimateTableRowCount(name string, binary []byte) int {
|
||||
|
||||
Reference in New Issue
Block a user