From 4ceb668d40f40b31ba800844fa369bf6b7ad2392 Mon Sep 17 00:00:00 2001 From: Phives Date: Wed, 25 Feb 2026 01:34:47 -0500 Subject: [PATCH] feat(api/utils): Harden file_utils for robustness and edge cases (#12915) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Improves robustness and edge-case handling in `api.utils.file_utils` to avoid crashes, DoS/OOM risks, and timeouts when processing user-provided filenames, paths, and file blobs. ## Changes ### Resource limits & timeouts - **`MAX_BLOB_SIZE_THUMBNAIL`** (50 MiB) and **`MAX_BLOB_SIZE_PDF`** (100 MiB) to reject oversized inputs before thumbnail/PDF processing. - **`GHOSTSCRIPT_TIMEOUT_SEC`** (120 s) for `repair_pdf_with_ghostscript` subprocess to avoid hangs on malicious or broken PDFs. ### `filename_type` - Handles `None`, empty string, non-string (e.g. int/list), and path-only input via new **`_normalize_filename_for_type()`**. - Uses basename for type detection (e.g. `a/b/c.pdf` → PDF). - Enforces **`FILE_NAME_LEN_LIMIT`**; invalid input returns `FileType.OTHER`. ### `thumbnail_img` - Rejects `None`/empty/oversized blob and invalid filename; returns `None` instead of raising. - Wraps PDF, image, and PPT handling in try/except so corrupt or malformed files return `None`. - Ensures PDF has pages and PPT has slides before use. - Normalizes PIL image mode (RGBA/P/LA → RGB) for safe PNG export. ### `repair_pdf_with_ghostscript` - Handles `None`/empty input; skips repair when input size exceeds limit. - Uses `subprocess.run(..., timeout=GHOSTSCRIPT_TIMEOUT_SEC)` and catches `TimeoutExpired`. - Returns original bytes when Ghostscript output is empty. ### `read_potential_broken_pdf` - `None` → `b""`; non–sequence-like (no `len`) → `b""`; empty → return as-is. - Oversized blob returned as-is (no repair) to avoid DoS. ### `sanitize_path` - Explicit `None` and non-string check; strips whitespace before normalizing. ## Testing - **`test/unit_test/utils/test_api_file_utils.py`** added with 36 unit tests covering the above behavior (filename_type, sanitize_path, read_potential_broken_pdf, thumbnail_img, thumbnail, repair_pdf_with_ghostscript, constants). - All tests pass. --------- Co-authored-by: Gittensor Miner --- api/utils/file_utils.py | 159 ++++++++++++++++---- common/http_client.py | 10 +- test/unit_test/utils/test_api_file_utils.py | 152 +++++++++++++++++++ 3 files changed, 280 insertions(+), 41 deletions(-) create mode 100644 test/unit_test/utils/test_api_file_utils.py diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py index e73c5d2185..e4fbe5e033 100644 --- a/api/utils/file_utils.py +++ b/api/utils/file_utils.py @@ -17,6 +17,7 @@ # Standard library imports import base64 +import os import re import shutil import subprocess @@ -29,16 +30,37 @@ import pdfplumber from PIL import Image # Local imports -from api.constants import IMG_BASE64_PREFIX +from api.constants import FILE_NAME_LEN_LIMIT, IMG_BASE64_PREFIX from api.db import FileType +# Robustness and resource limits: reject oversized inputs to avoid DoS and OOM. +MAX_BLOB_SIZE_THUMBNAIL = 50 * 1024 * 1024 # 50 MiB for thumbnail generation +MAX_BLOB_SIZE_PDF = 100 * 1024 * 1024 # 100 MiB for PDF repair / read +GHOSTSCRIPT_TIMEOUT_SEC = 120 # Timeout for Ghostscript subprocess + LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber" if LOCK_KEY_pdfplumber not in sys.modules: sys.modules[LOCK_KEY_pdfplumber] = threading.Lock() +def _normalize_filename_for_type(filename): + """Extract a safe basename for type detection. Returns (normalized_str, True) or ("", False).""" + if filename is None: + return "", False + if not isinstance(filename, str): + return "", False + base = os.path.basename(filename).strip() + if not base or len(base) > FILE_NAME_LEN_LIMIT: + return "", False + return base.lower(), True + + def filename_type(filename): - filename = filename.lower() + """Return file type from extension. Handles None, empty, path-only, and oversized names.""" + normalized, ok = _normalize_filename_for_type(filename) + if not ok: + return FileType.OTHER.value + filename = normalized if re.match(r".*\.pdf$", filename): return FileType.PDF.value @@ -56,34 +78,68 @@ def filename_type(filename): def thumbnail_img(filename, blob): """ - MySQL LongText max length is 65535 + Generate thumbnail image bytes for PDF, image, or PPT. MySQL LongText max length is 65535. + + Robustness and edge cases: + - Rejects None, empty, or oversized blob to avoid DoS/OOM. + - Uses basename for type detection (handles paths like "a/b/c.pdf"). + - Catches corrupt or malformed files and returns None instead of raising. + - Normalizes PIL image mode (e.g. RGBA -> RGB) for safe PNG export. """ - filename = filename.lower() + if blob is None: + return None + try: + blob_len = len(blob) + except TypeError: + return None + if blob_len == 0 or blob_len > MAX_BLOB_SIZE_THUMBNAIL: + return None + + normalized, ok = _normalize_filename_for_type(filename) + if not ok: + return None + filename = normalized + if re.match(r".*\.pdf$", filename): - with sys.modules[LOCK_KEY_pdfplumber]: - pdf = pdfplumber.open(BytesIO(blob)) + try: + with sys.modules[LOCK_KEY_pdfplumber]: + pdf = pdfplumber.open(BytesIO(blob)) + if not pdf.pages: + pdf.close() + return None + buffered = BytesIO() + resolution = 32 + img = None + for _ in range(10): + pdf.pages[0].to_image(resolution=resolution).annotated.save(buffered, format="png") + img = buffered.getvalue() + if len(img) >= 64000 and resolution >= 2: + resolution = resolution / 2 + buffered = BytesIO() + else: + break + pdf.close() + return img + except Exception: + return None + if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename): + try: + image = Image.open(BytesIO(blob)) + image.load() + if image.mode in ("RGBA", "P", "LA"): + image = image.convert("RGB") + image.thumbnail((30, 30)) buffered = BytesIO() - resolution = 32 - img = None - for _ in range(10): - # https://github.com/jsvine/pdfplumber?tab=readme-ov-file#creating-a-pageimage-with-to_image - pdf.pages[0].to_image(resolution=resolution).annotated.save(buffered, format="png") - img = buffered.getvalue() - if len(img) >= 64000 and resolution >= 2: - resolution = resolution / 2 - buffered = BytesIO() - else: - break - pdf.close() - return img + image.save(buffered, format="png") + return buffered.getvalue() + except Exception: + return None + + # PPT/PPTX thumbnail would require a licensed library; skip and return None. + if re.match(r".*\.(ppt|pptx)$", filename): + return None - elif re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename): - image = Image.open(BytesIO(blob)) - image.thumbnail((30, 30)) - buffered = BytesIO() - image.save(buffered, format="png") - return buffered.getvalue() return None @@ -96,6 +152,12 @@ def thumbnail(filename, blob): def repair_pdf_with_ghostscript(input_bytes): + """Attempt to repair corrupt PDF bytes via Ghostscript. Returns original bytes on failure or timeout.""" + if input_bytes is None or len(input_bytes) == 0: + return input_bytes if input_bytes is not None else b"" + if len(input_bytes) > MAX_BLOB_SIZE_PDF: + return input_bytes + if shutil.which("gs") is None: return input_bytes @@ -112,22 +174,46 @@ def repair_pdf_with_ghostscript(input_bytes): temp_in.name, ] try: - proc = subprocess.run(cmd, capture_output=True, text=True) + proc = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=GHOSTSCRIPT_TIMEOUT_SEC, + ) if proc.returncode != 0: return input_bytes + temp_out.seek(0) + repaired_bytes = temp_out.read() + if not repaired_bytes: + return input_bytes + return repaired_bytes + except subprocess.TimeoutExpired: + return input_bytes except Exception: return input_bytes - temp_out.seek(0) - repaired_bytes = temp_out.read() - - return repaired_bytes - def read_potential_broken_pdf(blob): - def try_open(blob): + """ + Return PDF bytes, optionally repaired via Ghostscript if initially unreadable. + + Edge cases and robustness: + - None blob returns b"" to avoid callers receiving None. + - Empty blob returned as-is. + - Oversized blob (> MAX_BLOB_SIZE_PDF) returned as-is without repair to avoid DoS. + """ + if blob is None: + return b"" + try: + blob_len = len(blob) + except TypeError: + return b"" + if blob_len == 0: + return blob + + def try_open(data): try: - with pdfplumber.open(BytesIO(blob)) as pdf: + with pdfplumber.open(BytesIO(data)) as pdf: if pdf.pages: return True except Exception: @@ -137,6 +223,9 @@ def read_potential_broken_pdf(blob): if try_open(blob): return blob + if blob_len > MAX_BLOB_SIZE_PDF: + return blob + repaired = repair_pdf_with_ghostscript(blob) if try_open(repaired): return repaired @@ -151,7 +240,11 @@ def sanitize_path(raw_path: str | None) -> str: - Strips leading/trailing slashes - Removes '.' and '..' segments - Restricts characters to A-Za-z0-9, underscore, dash, and '/' + - Returns "" for None, empty, or non-string input (robustness). """ + if raw_path is None or not isinstance(raw_path, str): + return "" + raw_path = raw_path.strip() if not raw_path: return "" backslash_re = re.compile(r"[\\]+") diff --git a/common/http_client.py b/common/http_client.py index cfd0687a76..28c988ef65 100644 --- a/common/http_client.py +++ b/common/http_client.py @@ -166,20 +166,14 @@ async def async_request( if attempt >= retries: if not _is_sensitive_url(url): log_url = _redact_sensitive_url_params(url) - logger.warning(f"async_request exhausted retries for {method}") + logger.warning(f"async_request exhausted retries for {method} {log_url}") raise delay = _get_delay(backoff_factor, attempt) if not _is_sensitive_url(url): log_url = _redact_sensitive_url_params(url) logger.warning( - f"async_request attempt {attempt + 1}/{retries + 1} failed for {method}; retrying in {delay:.2f}s" + f"async_request attempt {attempt + 1}/{retries + 1} failed for {method} {log_url}; retrying in {delay:.2f}s" ) - raise - delay = _get_delay(backoff_factor, attempt) - # Avoid including the (potentially sensitive) URL in retry logs. - logger.warning( - f"async_request attempt {attempt + 1}/{retries + 1} failed for {method}; retrying in {delay:.2f}s" - ) await asyncio.sleep(delay) raise last_exc # pragma: no cover diff --git a/test/unit_test/utils/test_api_file_utils.py b/test/unit_test/utils/test_api_file_utils.py new file mode 100644 index 0000000000..65e1ce14cb --- /dev/null +++ b/test/unit_test/utils/test_api_file_utils.py @@ -0,0 +1,152 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Unit tests for api.utils.file_utils (filename_type, thumbnail_img, sanitize_path, read_potential_broken_pdf).""" + +import pytest +from api.db import FileType +from api.utils.file_utils import ( + MAX_BLOB_SIZE_PDF, + MAX_BLOB_SIZE_THUMBNAIL, + GHOSTSCRIPT_TIMEOUT_SEC, + filename_type, + thumbnail_img, + thumbnail, + sanitize_path, + read_potential_broken_pdf, + repair_pdf_with_ghostscript, +) + + +class TestFilenameType: + """Edge cases and robustness for filename_type.""" + + @pytest.mark.parametrize("filename,expected", [ + ("doc.pdf", FileType.PDF.value), + ("a.PDF", FileType.PDF.value), + ("x.png", FileType.VISUAL.value), + ("file.docx", FileType.DOC.value), + ("a/b/c.pdf", FileType.PDF.value), + ("path/to/file.txt", FileType.DOC.value), + ]) + def test_valid_filenames(self, filename, expected): + assert filename_type(filename) == expected + + @pytest.mark.parametrize("filename", [ + None, + "", + " ", + 123, + [], + ]) + def test_invalid_or_empty_returns_other(self, filename): + assert filename_type(filename) == FileType.OTHER.value + + def test_path_with_basename_uses_extension(self): + assert filename_type("folder/subfolder/document.pdf") == FileType.PDF.value + + +class TestSanitizePath: + """Edge cases for sanitize_path.""" + + @pytest.mark.parametrize("raw,expected", [ + (None, ""), + ("", ""), + (" ", ""), + (42, ""), + ("a/b", "a/b"), + ("a/../b", "a/b"), + ("/leading/", "leading"), + ("\\mixed\\path", "mixed/path"), + ]) + def test_sanitize_cases(self, raw, expected): + assert sanitize_path(raw) == expected + + +class TestReadPotentialBrokenPdf: + """Edge cases and robustness for read_potential_broken_pdf.""" + + def test_none_returns_empty_bytes(self): + assert read_potential_broken_pdf(None) == b"" + + def test_empty_bytes_returns_as_is(self): + assert read_potential_broken_pdf(b"") == b"" + + def test_non_len_raises_or_returns_empty(self): + class NoLen: + pass + result = read_potential_broken_pdf(NoLen()) + assert result == b"" + + +class TestThumbnailImg: + """Edge cases for thumbnail_img.""" + + def test_none_blob_returns_none(self): + assert thumbnail_img("x.pdf", None) is None + + def test_none_filename_returns_none(self): + assert thumbnail_img(None, b"fake pdf content") is None + + def test_empty_blob_returns_none(self): + assert thumbnail_img("x.pdf", b"") is None + + def test_empty_filename_returns_none(self): + assert thumbnail_img("", b"x") is None + + def test_oversized_blob_returns_none(self): + huge = b"x" * (MAX_BLOB_SIZE_THUMBNAIL + 1) + assert thumbnail_img("x.pdf", huge) is None + + +class TestThumbnail: + """thumbnail() wraps thumbnail_img and returns base64 or empty string.""" + + def test_none_img_returns_empty_string(self): + assert thumbnail("x.xyz", b"garbage") == "" + + def test_valid_img_returns_base64_prefix(self): + from api.constants import IMG_BASE64_PREFIX + result = thumbnail("x.png", b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde\x00\x00\x00\x0cIDATx\x9cc\xf8\x0f\x00\x00\x01\x01\x00\x05\x18\xd8N\x00\x00\x00\x00IEND\xaeB`\x82") + assert result.startswith(IMG_BASE64_PREFIX) or result == "" + + +class TestRepairPdfWithGhostscript: + """repair_pdf_with_ghostscript edge cases.""" + + def test_none_returns_empty_bytes(self): + assert repair_pdf_with_ghostscript(None) == b"" + + def test_empty_bytes_returns_empty(self): + assert repair_pdf_with_ghostscript(b"") == b"" + + def test_oversized_returns_original_without_calling_gs(self): + huge = b"%" * (MAX_BLOB_SIZE_PDF + 1) + result = repair_pdf_with_ghostscript(huge) + assert result == huge + + +class TestConstants: + """Resource limit constants are positive and reasonable.""" + + def test_thumbnail_limit_positive(self): + assert MAX_BLOB_SIZE_THUMBNAIL > 0 + + def test_pdf_limit_positive(self): + assert MAX_BLOB_SIZE_PDF > 0 + + def test_gs_timeout_positive(self): + assert GHOSTSCRIPT_TIMEOUT_SEC > 0