From 32d31284cc740fd13531ea67647a384e654ea8de Mon Sep 17 00:00:00 2001 From: guptas6est Date: Mon, 9 Mar 2026 04:06:00 +0000 Subject: [PATCH] Fix: upgrade pypdf to 6.7.5 and migrate from deprecated pypdf2 to fix CVE-2026-28804 and CVE-2023-36464 (#13454) ### What problem does this PR solve? This PR addresses security vulnerabilities in PDF processing dependencies identified by Trivy security scan: 1. CVE-2026-28804 (MEDIUM): pypdf 6.7.4 vulnerable to inefficient decoding of ASCIIHexDecode streams 2. CVE-2023-36464 (MEDIUM): pypdf2 3.0.1 susceptible to infinite loop when parsing malformed comments Since pypdf2 is deprecated with no available fixes, this PR migrates all pypdf2 usage to the actively maintained pypdf library (version 6.7.5), which resolves both vulnerabilities. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- pyproject.toml | 3 +-- rag/app/presentation.py | 2 +- rag/utils/file_utils.py | 2 +- uv.lock | 19 ++++--------------- 4 files changed, 7 insertions(+), 19 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0665a1c536..73006ac28f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,8 +81,7 @@ dependencies = [ "pyobvector==0.2.22", "pyodbc>=5.2.0,<6.0.0", "pypandoc>=1.16", - "pypdf>=6.6.2", - "pypdf2>=3.0.1,<4.0.0", + "pypdf>=6.7.5", "python-calamine>=0.4.0", "python-docx>=1.1.2,<2.0.0", "python-pptx>=1.0.2,<2.0.0", diff --git a/rag/app/presentation.py b/rag/app/presentation.py index 909fd61a30..390955041a 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -20,7 +20,7 @@ import re from collections import defaultdict from io import BytesIO -from PyPDF2 import PdfReader as pdf2_read +from pypdf import PdfReader as pdf2_read from deepdoc.parser import PdfParser, PlainParser from deepdoc.parser.ppt_parser import RAGFlowPptParser diff --git a/rag/utils/file_utils.py b/rag/utils/file_utils.py index 8d19079b76..c9ec50a36a 100644 --- a/rag/utils/file_utils.py +++ b/rag/utils/file_utils.py @@ -21,7 +21,7 @@ import requests from requests.exceptions import Timeout, RequestException from io import BytesIO from typing import List, Union, Tuple, Optional, Dict -import PyPDF2 +import pypdf as PyPDF2 from docx import Document import olefile diff --git a/uv.lock b/uv.lock index 0b1423a014..3432723677 100644 --- a/uv.lock +++ b/uv.lock @@ -5760,20 +5760,11 @@ wheels = [ [[package]] name = "pypdf" -version = "6.7.4" +version = "6.7.5" source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } -sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/09/dc/f52deef12797ad58b88e4663f097a343f53b9361338aef6573f135ac302f/pypdf-6.7.4.tar.gz", hash = "sha256:9edd1cd47938bb35ec87795f61225fd58a07cfaf0c5699018ae1a47d6f8ab0e3", size = 5304821, upload-time = "2026-02-27T10:44:39.395Z" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f6/52/37cc0aa9e9d1bf7729a737a0d83f8b3f851c8eb137373d9f71eafb0a3405/pypdf-6.7.5.tar.gz", hash = "sha256:40bb2e2e872078655f12b9b89e2f900888bb505e88a82150b64f9f34fa25651d", size = 5304278, upload-time = "2026-03-02T09:05:21.464Z" } wheels = [ - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c1/be/cded021305f5c81b47265b8c5292b99388615a4391c21ff00fd538d34a56/pypdf-6.7.4-py3-none-any.whl", hash = "sha256:527d6da23274a6c70a9cb59d1986d93946ba8e36a6bc17f3f7cce86331492dda", size = 331496, upload-time = "2026-02-27T10:44:37.527Z" }, -] - -[[package]] -name = "pypdf2" -version = "3.0.1" -source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } -sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9f/bb/18dc3062d37db6c491392007dfd1a7f524bb95886eb956569ac38a23a784/PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440", size = 227419, upload-time = "2022-12-31T10:36:13.13Z" } -wheels = [ - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8e/5e/c86a5643653825d3c913719e788e41386bee415c2b87b4f955432f2de6b2/pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928", size = 232572, upload-time = "2022-12-31T10:36:10.327Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/05/89/336673efd0a88956562658aba4f0bbef7cb92a6fbcbcaf94926dbc82b408/pypdf-6.7.5-py3-none-any.whl", hash = "sha256:07ba7f1d6e6d9aa2a17f5452e320a84718d4ce863367f7ede2fd72280349ab13", size = 331421, upload-time = "2026-03-02T09:05:19.722Z" }, ] [[package]] @@ -6323,7 +6314,6 @@ dependencies = [ { name = "pyodbc" }, { name = "pypandoc" }, { name = "pypdf" }, - { name = "pypdf2" }, { name = "python-calamine" }, { name = "python-docx" }, { name = "python-gitlab" }, @@ -6462,8 +6452,7 @@ requires-dist = [ { name = "pyobvector", specifier = "==0.2.22" }, { name = "pyodbc", specifier = ">=5.2.0,<6.0.0" }, { name = "pypandoc", specifier = ">=1.16" }, - { name = "pypdf", specifier = ">=6.6.2" }, - { name = "pypdf2", specifier = ">=3.0.1,<4.0.0" }, + { name = "pypdf", specifier = ">=6.7.5" }, { name = "python-calamine", specifier = ">=0.4.0" }, { name = "python-docx", specifier = ">=1.1.2,<2.0.0" }, { name = "python-gitlab", specifier = ">=7.0.0" },