From 72b89304c1bdb2e376e4f35c643d5d6db523f136 Mon Sep 17 00:00:00 2001
From: Yongteng Lei <yongtengrey@outlook.com>
Date: Wed, 25 Feb 2026 09:47:39 +0800
Subject: [PATCH] Fix: LFI vulnerability in document parsing API (#13196)

### What problem does this PR solve?

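Fix a local file inclusion (LFI) vulnerability in the document parsing API. The filename extracted from the download response headers was joined onto the download directory without validation, so a name containing path separators, `..`, or a drive prefix could resolve to a path outside that directory. The filename is now rejected unless it is a plain file name. `get_text` also now treats any non-`None` `binary` argument, including empty bytes, as the content to decode.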

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 api/apps/document_app.py | 20 ++++++++++++++++++--
 deepdoc/parser/utils.py  |  2 +-
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/api/apps/document_app.py b/api/apps/document_app.py
index b45c3c0a7e..504eae9eb7 100644
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -17,7 +17,7 @@ import json
 import os.path
 import pathlib
 import re
-from pathlib import Path
+from pathlib import Path, PurePosixPath, PureWindowsPath
 from quart import request, make_response
 from api.apps import current_user, login_required
 from api.common.check_team_permission import check_kb_team_permission
@@ -50,6 +50,18 @@ from rag.nlp import search, rag_tokenizer
 from common import settings
 
 
+def _is_safe_download_filename(name: str) -> bool:  # True only for a bare file name: no directory or drive part
+    if not name or name in {".", ".."}:  # ".." equals its own .name, so the checks below would not catch it
+        return False
+    if "\x00" in name or len(name) > 255:  # NUL character, or more than 255 characters
+        return False
+    if name != PurePosixPath(name).name:  # has a "/" separator, i.e. is more than a final POSIX component
+        return False
+    if name != PureWindowsPath(name).name:  # has a "\" or "/" separator or a drive prefix such as "C:"
+        return False
+    return True
+
+
 @manager.route("/upload", methods=["POST"])  # noqa: F821
 @login_required
 @validate_request("kb_id")
@@ -874,7 +886,11 @@ async def parse():
         r = re.search(r"filename=\"([^\"]+)\"", str(res_headers))
         if not r or not r.group(1):
             return get_json_result(data=False, message="Can't not identify downloaded file", code=RetCode.ARGUMENT_ERROR)
-        f = File(r.group(1), os.path.join(download_path, r.group(1)))
+        filename = r.group(1).strip()
+        if not _is_safe_download_filename(filename):
+            return get_json_result(data=False, message="Invalid downloaded filename", code=RetCode.ARGUMENT_ERROR)
+        filepath = os.path.join(download_path, filename)
+        f = File(filename, filepath)
         txt = FileService.parse_docs([f], current_user.id)
         return get_json_result(data=txt)
 
diff --git a/deepdoc/parser/utils.py b/deepdoc/parser/utils.py
index 85a3554955..528e21faa9 100644
--- a/deepdoc/parser/utils.py
+++ b/deepdoc/parser/utils.py
@@ -19,7 +19,7 @@ from rag.nlp import find_codec
 
 def get_text(fnm: str, binary=None) -> str:
     txt = ""
-    if binary:
+    if binary is not None:
         encoding = find_codec(binary)
         txt = binary.decode(encoding, errors="ignore")
     else: