From c93ec0a1f379e2402f970d2c32c88d490fed32e8 Mon Sep 17 00:00:00 2001 From: Daniil Sivak Date: Wed, 15 Apr 2026 13:43:53 +0300 Subject: [PATCH] Fix: reject empty/space-only content in update_chunk API (#14082) Closes #6541 ### What problem does this PR solve? Add content validation to `update_chunk` (SDK and non-SDK) to reject empty or whitespace-only content before it reaches the embedding model. **Before:** Calling `update_chunk` with empty or whitespace-only content (like `""`, `" "`, `"\n"`) bypassed validation and was sent directly to the embedding model, which returned an error. This was the same bug previously fixed for `add_chunk` in #6390, but `update_chunk` was missed. **After:** Empty/whitespace-only content is caught by validation and returns an error: `` `content` is required `` (SDK) or `` `content_with_weight` is required `` (non-SDK) ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- api/apps/chunk_app.py | 4 +++- api/apps/sdk/doc.py | 10 ++++++---- common/string_utils.py | 4 ++++ .../test_add_chunk.py | 2 +- .../test_update_chunk.py | 14 ++------------ .../test_update_chunk.py | 12 ++---------- .../test_chunk_app/test_chunk_routes_unit.py | 1 + .../test_chunk_app/test_update_chunk.py | 4 ++-- 8 files changed, 21 insertions(+), 30 deletions(-) diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py index 41307c98c6..e6ceb66e69 100644 --- a/api/apps/chunk_app.py +++ b/api/apps/chunk_app.py @@ -43,7 +43,7 @@ from rag.app.qa import beAdoc, rmPrefix from rag.app.tag import label_question from rag.nlp import rag_tokenizer, search from rag.prompts.generator import cross_languages, keyword_extraction -from common.string_utils import remove_redundant_spaces +from common.string_utils import is_content_empty, remove_redundant_spaces from common.constants import RetCode, LLMType, ParserType, PAGERANK_FLD from common import settings from api.apps import login_required, current_user @@ -140,6 +140,8 @@ async def set(): raise TypeError("expected string or bytes-like object") if 
isinstance(content_with_weight, bytes): content_with_weight = content_with_weight.decode("utf-8", errors="ignore") + if is_content_empty(content_with_weight): + return get_data_error_result(message="`content_with_weight` is required") d = { "id": req["chunk_id"], "content_with_weight": content_with_weight} diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py index e11a73ea9d..244c6f2292 100644 --- a/api/apps/sdk/doc.py +++ b/api/apps/sdk/doc.py @@ -38,7 +38,7 @@ from common import settings from common.constants import FileSource, LLMType, ParserType, RetCode, TaskStatus from common.metadata_utils import convert_conditions, meta_filter from common.misc_utils import thread_pool_exec -from common.string_utils import remove_redundant_spaces +from common.string_utils import is_content_empty, remove_redundant_spaces from common.tag_feature_utils import validate_tag_features from rag.app.qa import beAdoc, rmPrefix from rag.app.tag import label_question @@ -933,7 +933,7 @@ async def add_chunk(tenant_id, dataset_id, document_id): return get_error_data_result(message=f"You don't own the document {document_id}.") doc = doc[0] req = await get_request_json() - if not str(req.get("content", "")).strip(): + if is_content_empty(req.get("content")): return get_error_data_result(message="`content` is required") if "important_keywords" in req: if not isinstance(req["important_keywords"], list): @@ -1176,8 +1176,10 @@ async def update_chunk(tenant_id, dataset_id, document_id, chunk_id): return get_error_data_result(message=f"You don't own the document {document_id}.") doc = doc[0] req = await get_request_json() - if "content" in req and req["content"] is not None: - content = req["content"] + content = req.get("content") + if content is not None: + if is_content_empty(content): + return get_error_data_result(message="`content` is required") else: content = chunk.get("content_with_weight", "") d = {"id": chunk_id, "content_with_weight": content} diff --git a/common/string_utils.py 
b/common/string_utils.py index 5af008933a..ba8371311b 100644 --- a/common/string_utils.py +++ b/common/string_utils.py @@ -71,3 +71,7 @@ def clean_markdown_block(text): # Return text with surrounding whitespace removed return text.strip() + + +def is_content_empty(content: str) -> bool: + return content is None or not str(content).strip() diff --git a/test/testcases/test_http_api/test_chunk_management_within_dataset/test_add_chunk.py b/test/testcases/test_http_api/test_chunk_management_within_dataset/test_add_chunk.py index 34ef239204..d175409075 100644 --- a/test/testcases/test_http_api/test_chunk_management_within_dataset/test_add_chunk.py +++ b/test/testcases/test_http_api/test_chunk_management_within_dataset/test_add_chunk.py @@ -58,7 +58,7 @@ class TestAddChunk: @pytest.mark.parametrize( "payload, expected_code, expected_message", [ - ({"content": None}, 100, """TypeError("unsupported operand type(s) for +: \'NoneType\' and \'str\'")"""), + ({"content": None}, 102, "`content` is required"), ({"content": ""}, 102, "`content` is required"), pytest.param( {"content": 1}, diff --git a/test/testcases/test_http_api/test_chunk_management_within_dataset/test_update_chunk.py b/test/testcases/test_http_api/test_chunk_management_within_dataset/test_update_chunk.py index 96f70a7bcb..cb5420f302 100644 --- a/test/testcases/test_http_api/test_chunk_management_within_dataset/test_update_chunk.py +++ b/test/testcases/test_http_api/test_chunk_management_within_dataset/test_update_chunk.py @@ -48,12 +48,7 @@ class TestUpdatedChunk: "payload, expected_code, expected_message", [ pytest.param({"content": None}, 0, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="issues/6509")), - pytest.param( - {"content": ""}, - 100, - """APIRequestFailedError(\'Error code: 400, with error text {"error":{"code":"1213","message":"未正常接收到prompt参数。"}}\')""", - marks=pytest.mark.skip(reason="issues/6541"), - ), + ({"content": ""}, 102, "`content` is required"), pytest.param( 
{"content": 1}, 100, @@ -61,12 +56,7 @@ class TestUpdatedChunk: marks=pytest.mark.skip, ), ({"content": "update chunk"}, 0, ""), - pytest.param( - {"content": " "}, - 100, - """APIRequestFailedError(\'Error code: 400, with error text {"error":{"code":"1213","message":"未正常接收到prompt参数。"}}\')""", - marks=pytest.mark.skip(reason="issues/6541"), - ), + ({"content": " "}, 102, "`content` is required"), ({"content": "\n!?。;!?\"'"}, 0, ""), ], ) diff --git a/test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_update_chunk.py b/test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_update_chunk.py index fda87745c5..4f4debffab 100644 --- a/test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_update_chunk.py +++ b/test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_update_chunk.py @@ -26,22 +26,14 @@ class TestUpdatedChunk: "payload, expected_message", [ ({"content": None}, ""), - pytest.param( - {"content": ""}, - """APIRequestFailedError(\'Error code: 400, with error text {"error":{"code":"1213","message":"未正常接收到prompt参数。"}}\')""", - marks=pytest.mark.skip(reason="issues/6541"), - ), + ({"content": ""}, "`content` is required"), pytest.param( {"content": 1}, "TypeError('expected string or bytes-like object')", marks=pytest.mark.skip, ), ({"content": "update chunk"}, ""), - pytest.param( - {"content": " "}, - """APIRequestFailedError(\'Error code: 400, with error text {"error":{"code":"1213","message":"未正常接收到prompt参数。"}}\')""", - marks=pytest.mark.skip(reason="issues/6541"), - ), + ({"content": " "}, "`content` is required"), ({"content": "\n!?。;!?\"'"}, ""), ], ) diff --git a/test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py b/test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py index 6fd72427b9..3f5ab6b11d 100644 --- a/test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py +++ b/test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py @@ -220,6 
+220,7 @@ def _load_chunk_module(monkeypatch): string_utils_mod = ModuleType("common.string_utils") string_utils_mod.remove_redundant_spaces = lambda text: " ".join(str(text).split()) + string_utils_mod.is_content_empty = lambda content: content is None or not str(content).strip() monkeypatch.setitem(sys.modules, "common.string_utils", string_utils_mod) metadata_utils_mod = ModuleType("common.metadata_utils") diff --git a/test/testcases/test_web_api/test_chunk_app/test_update_chunk.py b/test/testcases/test_web_api/test_chunk_app/test_update_chunk.py index 1b2ea37101..a78c135e2f 100644 --- a/test/testcases/test_web_api/test_chunk_app/test_update_chunk.py +++ b/test/testcases/test_web_api/test_chunk_app/test_update_chunk.py @@ -46,10 +46,10 @@ class TestUpdateChunk: "payload, expected_code, expected_message", [ ({"content_with_weight": None}, 100, "TypeError('expected string or bytes-like object')"), - ({"content_with_weight": ""}, 100, """Exception('Error: 413 - {"error":"Input validation error: `inputs` cannot be empty","error_type":"Validation"}')"""), + ({"content_with_weight": ""}, 102, "`content_with_weight` is required"), ({"content_with_weight": 1}, 100, "TypeError('expected string or bytes-like object')"), ({"content_with_weight": "update chunk"}, 0, ""), - ({"content_with_weight": " "}, 0, ""), + ({"content_with_weight": " "}, 102, "`content_with_weight` is required"), ({"content_with_weight": "\n!?。;!?\"'"}, 0, ""), ], )