Fix: PDF page count detection for compressed PDFs (#16487)

This commit is contained in:
euvre
2026-07-02 19:08:49 +08:00
committed by GitHub
parent f7e39a09dc
commit fc9116578c

View File

@@ -41,6 +41,7 @@ import (
"ragflow/internal/common"
"ragflow/internal/dao"
"ragflow/internal/deepdoc/parser/pdf/pdfoxide"
"ragflow/internal/engine"
"ragflow/internal/engine/redis"
enginetypes "ragflow/internal/engine/types"
@@ -1617,6 +1618,10 @@ func documentParseTaskRanges(doc *entity.Document, bucket, objectName string) ([
}
}
if len(ranges) == 0 {
// pages == 0 means page count detection failed (e.g. a compressed
// PDF where both the regex fast path and the pdfoxide fallback
// failed). Fall back to maximumTaskPageNumber so the Python parser
// processes all pages via slicing (Python gracefully caps at the
// actual page count).
ranges = append(ranges, documentParsePageRange{from: 0, to: maximumTaskPageNumber})
}
return ranges, nil
@@ -1754,7 +1759,20 @@ func documentEstimatePDFPageCount(binary []byte) int64 {
if len(binary) == 0 {
return 0
}
return int64(len(documentPDFPagePattern.FindAll(binary, -1)))
// Fast path: regex works for uncompressed PDFs.
count := int64(len(documentPDFPagePattern.FindAll(binary, -1)))
if count > 0 {
return count
}
// Fallback for compressed PDFs where /Type /Page is inside a
// compressed object stream: use pdfoxide to get the real page count.
if doc, err := pdfoxide.OpenBytes(binary); err == nil {
defer doc.Close()
if pages, err := doc.PageCount(); err == nil {
return int64(pages)
}
}
return 0
}
func documentEstimateTableRowCount(name string, binary []byte) int {