From fc9116578c4633eb4627e83c9f589ddc1ceaae92 Mon Sep 17 00:00:00 2001
From: euvre <93761161+euvre@users.noreply.github.com>
Date: Thu, 2 Jul 2026 19:08:49 +0800
Subject: [PATCH] Fix: PDF page count detection for compressed PDFs (#16487)

---
 internal/service/document.go | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/internal/service/document.go b/internal/service/document.go
index 314016e1aa..75ea1df6d2 100644
--- a/internal/service/document.go
+++ b/internal/service/document.go
@@ -41,6 +41,7 @@ import (
 
 	"ragflow/internal/common"
 	"ragflow/internal/dao"
+	"ragflow/internal/deepdoc/parser/pdf/pdfoxide"
 	"ragflow/internal/engine"
 	"ragflow/internal/engine/redis"
 	enginetypes "ragflow/internal/engine/types"
@@ -1617,6 +1618,10 @@ func documentParseTaskRanges(doc *entity.Document, bucket, objectName string) ([
 		}
 	}
 	if len(ranges) == 0 {
+		// pages == 0 means page count detection failed (e.g. compressed
+		// PDF where both regex and pdfoxide fallbacks failed). Fall back
+		// to maximumTaskPageNumber so the Python parser processes all
+		// pages via slicing (Python gracefully caps at actual page count).
 		ranges = append(ranges, documentParsePageRange{from: 0, to: maximumTaskPageNumber})
 	}
 	return ranges, nil
@@ -1754,7 +1759,20 @@ func documentEstimatePDFPageCount(binary []byte) int64 {
 	if len(binary) == 0 {
 		return 0
 	}
-	return int64(len(documentPDFPagePattern.FindAll(binary, -1)))
+	// Fast path: regex works for uncompressed PDFs.
+	count := int64(len(documentPDFPagePattern.FindAll(binary, -1)))
+	if count > 0 {
+		return count
+	}
+	// Fallback for compressed PDFs where /Type /Page is inside a
+	// compressed object stream: use pdf_oxide to get the real page count.
+	if doc, err := pdfoxide.OpenBytes(binary); err == nil {
+		defer doc.Close()
+		if pages, err := doc.PageCount(); err == nil {
+			return int64(pages)
+		}
+	}
+	return 0
 }
 
 func documentEstimateTableRowCount(name string, binary []byte) int {