From 72b89304c1bdb2e376e4f35c643d5d6db523f136 Mon Sep 17 00:00:00 2001 From: Yongteng Lei Date: Wed, 25 Feb 2026 09:47:39 +0800 Subject: [PATCH] Fix: LFI vulnerability in document parsing API (#13196) ### What problem does this PR solve? Fix LFI vulnerability in document parsing API. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- api/apps/document_app.py | 20 ++++++++++++++++++-- deepdoc/parser/utils.py | 2 +- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/api/apps/document_app.py b/api/apps/document_app.py index b45c3c0a7e..504eae9eb7 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -17,7 +17,7 @@ import json import os.path import pathlib import re -from pathlib import Path +from pathlib import Path, PurePosixPath, PureWindowsPath from quart import request, make_response from api.apps import current_user, login_required from api.common.check_team_permission import check_kb_team_permission @@ -50,6 +50,18 @@ from rag.nlp import search, rag_tokenizer from common import settings +def _is_safe_download_filename(name: str) -> bool: + if not name or name in {".", ".."}: + return False + if "\x00" in name or len(name) > 255: + return False + if name != PurePosixPath(name).name: + return False + if name != PureWindowsPath(name).name: + return False + return True + + @manager.route("/upload", methods=["POST"]) # noqa: F821 @login_required @validate_request("kb_id") @@ -874,7 +886,11 @@ async def parse(): r = re.search(r"filename=\"([^\"]+)\"", str(res_headers)) if not r or not r.group(1): return get_json_result(data=False, message="Can't not identify downloaded file", code=RetCode.ARGUMENT_ERROR) - f = File(r.group(1), os.path.join(download_path, r.group(1))) + filename = r.group(1).strip() + if not _is_safe_download_filename(filename): + return get_json_result(data=False, message="Invalid downloaded filename", code=RetCode.ARGUMENT_ERROR) + filepath = os.path.join(download_path, filename) + f = File(filename, filepath) txt = FileService.parse_docs([f], current_user.id) return get_json_result(data=txt) diff --git a/deepdoc/parser/utils.py b/deepdoc/parser/utils.py index 85a3554955..528e21faa9 100644 --- a/deepdoc/parser/utils.py +++ b/deepdoc/parser/utils.py @@ -19,7 +19,7 @@ from rag.nlp import find_codec def get_text(fnm: str, binary=None) -> str: txt = "" - if binary: + if binary is not None: encoding = find_codec(binary) txt = binary.decode(encoding, errors="ignore") else: