From 38cefd88e2c75407101c8d73d9e9e2522cd14824 Mon Sep 17 00:00:00 2001 From: Ea001 <42531682+EaEa0001@users.noreply.github.com> Date: Wed, 15 Apr 2026 16:31:11 +0800 Subject: [PATCH] Fix tag_feas code injection in retrieval ranking (#13923) ## Summary - remove eval-based parsing from retrieval rank feature scoring - validate `tag_feas` at write time in chunk APIs and SDK routes - add regression tests for safe parsing and malicious payload rejection ## Details `tag_feas` is intended to be structured rank-feature data, but the retrieval ranking path was evaluating stored values as Python expressions. This change treats `tag_feas` strictly as data. ### What changed - replace `eval()` in `rag/nlp/search.py` with safe parsing via `json.loads()` and optional `ast.literal_eval()` compatibility for legacy Python-dict strings - strictly filter parsed values down to `dict[str, finite number]` - reject invalid `tag_feas` payloads at write time in web chunk routes and SDK document chunk routes - add focused regression tests to prove executable strings are ignored and invalid payloads are rejected ## Validation - `python -m pytest test/unit_test/common/test_tag_feature_utils.py test/unit_test/rag/test_rank_feature_scores.py -q` --------- Co-authored-by: unknown Co-authored-by: Yingfeng Zhang --- api/apps/chunk_app.py | 11 ++- api/apps/sdk/doc.py | 11 ++- common/tag_feature_utils.py | 85 ++++++++++++++++ rag/nlp/search.py | 9 +- .../test_chunk_app/test_chunk_routes_unit.py | 20 +++- .../test_chunk_app/test_create_chunk.py | 2 +- .../common/test_tag_feature_utils.py | 32 ++++++ .../unit_test/rag/test_rank_feature_scores.py | 97 +++++++++++++++++++ 8 files changed, 259 insertions(+), 8 deletions(-) create mode 100644 common/tag_feature_utils.py create mode 100644 test/unit_test/common/test_tag_feature_utils.py create mode 100644 test/unit_test/rag/test_rank_feature_scores.py diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py index 5e06d872a6..41307c98c6 100644 --- a/api/apps/chunk_app.py +++ b/api/apps/chunk_app.py @@ -38,6 +38,7 @@ from api.utils.api_utils import ( get_request_json, ) from common.misc_utils import thread_pool_exec +from common.tag_feature_utils import validate_tag_features from rag.app.qa import beAdoc, rmPrefix from rag.app.tag import label_question from rag.nlp import rag_tokenizer, search @@ -161,7 +162,10 @@ async def set(): return get_data_error_result(message="`tag_kwd` must be a list of strings") d["tag_kwd"] = req["tag_kwd"] if "tag_feas" in req: - d["tag_feas"] = req["tag_feas"] + try: + d["tag_feas"] = validate_tag_features(req["tag_feas"]) + except ValueError as exc: + return get_data_error_result(message=f"`tag_feas` {exc}") if "available_int" in req: d["available_int"] = req["available_int"] @@ -328,7 +332,10 @@ async def create(): return get_data_error_result(message="`tag_kwd` must be a list of strings") d["tag_kwd"] = req["tag_kwd"] if "tag_feas" in req: - d["tag_feas"] = req["tag_feas"] + try: + d["tag_feas"] = validate_tag_features(req["tag_feas"]) + except ValueError as exc: + return get_data_error_result(message=f"`tag_feas` {exc}") image_base64 = req.get("image_base64", None) try: diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py index 726cbab97c..e11a73ea9d 100644 --- a/api/apps/sdk/doc.py +++ b/api/apps/sdk/doc.py @@ -39,6 +39,7 @@ from common.constants import FileSource, LLMType, ParserType, RetCode, TaskStatu from common.metadata_utils import convert_conditions, meta_filter from common.misc_utils import thread_pool_exec from common.string_utils import remove_redundant_spaces +from common.tag_feature_utils import validate_tag_features from rag.app.qa import beAdoc, rmPrefix from rag.app.tag import label_question from rag.nlp import rag_tokenizer, search @@ -963,7 +964,10 @@ async def add_chunk(tenant_id, dataset_id, document_id): return get_error_data_result("`tag_kwd` must be a list of strings") d["tag_kwd"] = req["tag_kwd"] if "tag_feas" in req: - d["tag_feas"] = req["tag_feas"] + try: + d["tag_feas"] = validate_tag_features(req["tag_feas"]) + except ValueError as exc: + return get_error_data_result(f"`tag_feas` {exc}") import base64 image_base64 = req.get("image_base64", None) @@ -1202,7 +1206,10 @@ async def update_chunk(tenant_id, dataset_id, document_id, chunk_id): return get_error_data_result("`tag_kwd` must be a list of strings") d["tag_kwd"] = req["tag_kwd"] if "tag_feas" in req: - d["tag_feas"] = req["tag_feas"] + try: + d["tag_feas"] = validate_tag_features(req["tag_feas"]) + except ValueError as exc: + return get_error_data_result(f"`tag_feas` {exc}") tenant_embd_id = DocumentService.get_tenant_embd_id(document_id) if tenant_embd_id: model_config = get_model_config_by_id(tenant_embd_id) diff --git a/common/tag_feature_utils.py b/common/tag_feature_utils.py new file mode 100644 index 0000000000..27488c2d5e --- /dev/null +++ b/common/tag_feature_utils.py @@ -0,0 +1,85 @@ +# +# Copyright 2026 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import ast +import json +import math + + +def parse_tag_features(raw, *, allow_json_string=True, allow_python_literal=False): + if raw is None: + return {} + + parsed = raw + if isinstance(raw, str): + raw = raw.strip() + if not raw: + return {} + parsed = None + if allow_json_string: + try: + parsed = json.loads(raw) + except Exception: + parsed = None + if parsed is None and allow_python_literal: + try: + parsed = ast.literal_eval(raw) + except Exception: + parsed = None + if parsed is None: + return {} + elif not isinstance(raw, dict): + return {} + + if not isinstance(parsed, dict): + return {} + + cleaned = {} + for key, value in parsed.items(): + if not isinstance(key, str): + continue + key = key.strip() + if not key: + continue + if isinstance(value, bool): + continue + if isinstance(value, (int, float)) and math.isfinite(float(value)): + cleaned[key] = float(value) + return cleaned + + +def validate_tag_features(raw): + if raw is None: + return None + + if not isinstance(raw, dict): + raise ValueError("must be an object mapping string tags to finite numeric scores") + + cleaned = {} + for key, value in raw.items(): + if not isinstance(key, str): + raise ValueError("keys must be strings") + key = key.strip() + if not key: + raise ValueError("keys must be non-empty strings") + if isinstance(value, bool) or not isinstance(value, (int, float)): + raise ValueError("values must be finite numbers") + numeric = float(value) + if not math.isfinite(numeric): + raise ValueError("values must be finite numbers") + cleaned[key] = numeric + + return cleaned diff --git a/rag/nlp/search.py b/rag/nlp/search.py index 8d187766ee..7ad19fe7c4 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -26,6 +26,7 @@ from common.doc_store.doc_store_base import MatchDenseExpr, FusionExpr, OrderByE from common.string_utils import remove_redundant_spaces from common.float_utils import get_float from common.constants import PAGERANK_FLD, TAG_FLD +from common.tag_feature_utils import parse_tag_features from common import settings from common.misc_utils import thread_pool_exec @@ -279,12 +280,18 @@ class Dealer: return np.array([0 for _ in range(len(search_res.ids))]) + pageranks q_denor = np.sqrt(np.sum([s * s for t, s in query_rfea.items() if t != PAGERANK_FLD])) + if q_denor == 0: + return np.array([0 for _ in range(len(search_res.ids))]) + pageranks for i in search_res.ids: nor, denor = 0, 0 if not search_res.field[i].get(TAG_FLD): rank_fea.append(0) continue - for t, sc in eval(search_res.field[i].get(TAG_FLD, "{}")).items(): + tag_feas = parse_tag_features(search_res.field[i].get(TAG_FLD), allow_json_string=True, allow_python_literal=True) + if not tag_feas: + rank_fea.append(0) + continue + for t, sc in tag_feas.items(): if t in query_rfea: nor += query_rfea[t] * sc denor += sc * sc diff --git a/test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py b/test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py index 970d571b50..6fd72427b9 100644 --- a/test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py +++ b/test/testcases/test_web_api/test_chunk_app/test_chunk_routes_unit.py @@ -584,6 +584,14 @@ def test_set_chunk_bytes_qa_image_and_guard_matrix_unit(monkeypatch): "get_by_id", lambda _doc_id: (True, _DummyDoc(doc_id="doc-1", parser_id=module.ParserType.NAIVE)), ) + _set_request_json( + monkeypatch, + module, + {"doc_id": "doc-1", "chunk_id": "chunk-1", "content_with_weight": "abc", "tag_feas": [0.1]}, + ) + res = _run(module.set()) + assert "`tag_feas` must be an object mapping string tags to finite numeric scores" in res["message"], res + _set_request_json( monkeypatch, module, @@ -594,7 +602,7 @@ def test_set_chunk_bytes_qa_image_and_guard_matrix_unit(monkeypatch): "important_kwd": ["important"], "question_kwd": ["question"], "tag_kwd": ["tag"], - "tag_feas": [0.1], + "tag_feas": {"tag": 0.1}, "available_int": 0, }, ) @@ -762,6 +770,14 @@ def test_create_chunk_guards_pagerank_and_success_unit(monkeypatch): assert res["message"] == "Knowledgebase not found!", res monkeypatch.setattr(module.KnowledgebaseService, "get_by_id", lambda _kb_id: (True, SimpleNamespace(pagerank=0.8))) + _set_request_json( + monkeypatch, + module, + {"doc_id": "doc-1", "content_with_weight": "chunk", "tag_feas": [0.2]}, + ) + res = _run(module.create()) + assert "`tag_feas` must be an object mapping string tags to finite numeric scores" in res["message"], res + _set_request_json( monkeypatch, module, @@ -770,7 +786,7 @@ def test_create_chunk_guards_pagerank_and_success_unit(monkeypatch): "content_with_weight": "chunk", "important_kwd": ["i1"], "question_kwd": ["q1"], - "tag_feas": [0.2], + "tag_feas": {"tag": 0.2}, }, ) res = _run(module.create()) diff --git a/test/testcases/test_web_api/test_chunk_app/test_create_chunk.py b/test/testcases/test_web_api/test_chunk_app/test_create_chunk.py index ca7f696198..3293dee3ec 100644 --- a/test/testcases/test_web_api/test_chunk_app/test_create_chunk.py +++ b/test/testcases/test_web_api/test_chunk_app/test_create_chunk.py @@ -166,7 +166,7 @@ class TestAddChunk: payload = { "doc_id": doc_id, "content_with_weight": "chunk with tags", - "tag_feas": [0.1, 0.2], + "tag_feas": {"tag1": 0.1, "tag2": 0.2}, "important_kwd": ["tag"], "question_kwd": ["question"], } diff --git a/test/unit_test/common/test_tag_feature_utils.py b/test/unit_test/common/test_tag_feature_utils.py new file mode 100644 index 0000000000..ee4e935451 --- /dev/null +++ b/test/unit_test/common/test_tag_feature_utils.py @@ -0,0 +1,32 @@ +import pytest + +from common.tag_feature_utils import parse_tag_features, validate_tag_features + + +def test_validate_tag_features_accepts_numeric_dict(): + assert validate_tag_features({"apple": 1, "banana": 2.5}) == { + "apple": 1.0, + "banana": 2.5, + } + + +def test_validate_tag_features_rejects_string_payload(): + with pytest.raises(ValueError, match="object mapping string tags"): + validate_tag_features('{"apple": 1.0}') + + +def test_validate_tag_features_rejects_non_finite_or_non_numeric_values(): + with pytest.raises(ValueError, match="finite numbers"): + validate_tag_features({"apple": float("inf")}) + + with pytest.raises(ValueError, match="finite numbers"): + validate_tag_features({"apple": "1.0"}) + + +def test_parse_tag_features_supports_legacy_python_literal_strings(): + assert parse_tag_features("{'apple': 2.0}", allow_python_literal=True) == {"apple": 2.0} + + +def test_parse_tag_features_ignores_executable_strings(): + payload = '{"apple": (__import__("time").sleep(1) or 1.0)}' + assert parse_tag_features(payload, allow_python_literal=True) == {} diff --git a/test/unit_test/rag/test_rank_feature_scores.py b/test/unit_test/rag/test_rank_feature_scores.py new file mode 100644 index 0000000000..b25f1b2971 --- /dev/null +++ b/test/unit_test/rag/test_rank_feature_scores.py @@ -0,0 +1,97 @@ +import sys +import types + +import numpy as np + +from common.constants import PAGERANK_FLD, TAG_FLD + + +class _DummyTokenizer: + def tag(self, *args, **kwargs): + return [] + + def freq(self, *args, **kwargs): + return 0 + + def _tradi2simp(self, text): + return text + + def _strQ2B(self, text): + return text + + +fake_infinity = types.ModuleType("infinity") +fake_infinity_tokenizer = types.ModuleType("infinity.rag_tokenizer") +fake_infinity_tokenizer.RagTokenizer = _DummyTokenizer +fake_infinity_tokenizer.is_chinese = lambda text: False +fake_infinity_tokenizer.is_number = lambda text: False +fake_infinity_tokenizer.is_alphabet = lambda text: True +fake_infinity_tokenizer.naive_qie = lambda text: text.split() +fake_infinity.rag_tokenizer = fake_infinity_tokenizer +sys.modules.setdefault("infinity", fake_infinity) +sys.modules.setdefault("infinity.rag_tokenizer", fake_infinity_tokenizer) + +fake_query = types.ModuleType("rag.nlp.query") + + +class _DummyFulltextQueryer: + pass + + +fake_query.FulltextQueryer = _DummyFulltextQueryer +sys.modules.setdefault("rag.nlp.query", fake_query) + +fake_settings = types.ModuleType("common.settings") +sys.modules.setdefault("common.settings", fake_settings) + +from rag.nlp.search import Dealer + + +def _make_search_res(tag_feas): + return Dealer.SearchResult( + total=1, + ids=["c1"], + field={"c1": {TAG_FLD: tag_feas, PAGERANK_FLD: 0}}, + ) + + +def test_rank_feature_scores_parses_python_dict_string(): + dealer = Dealer.__new__(Dealer) + sres = _make_search_res("{'apple': 2.0}") + scores = dealer._rank_feature_scores({"apple": 1.0}, sres) + assert np.isclose(scores[0], 10.0) + + +def test_rank_feature_scores_parses_json_string(): + dealer = Dealer.__new__(Dealer) + sres = _make_search_res('{"apple": 2.0}') + scores = dealer._rank_feature_scores({"apple": 1.0}, sres) + assert np.isclose(scores[0], 10.0) + + +def test_rank_feature_scores_handles_dict_value(): + dealer = Dealer.__new__(Dealer) + sres = _make_search_res({"apple": 2.0}) + scores = dealer._rank_feature_scores({"apple": 1.0}, sres) + assert np.isclose(scores[0], 10.0) + + +def test_rank_feature_scores_ignores_invalid_tag_feas_string(): + dealer = Dealer.__new__(Dealer) + sres = _make_search_res("not a dict") + scores = dealer._rank_feature_scores({"apple": 1.0}, sres) + assert np.isclose(scores[0], 0.0) + + +def test_rank_feature_scores_ignores_executable_tag_feas_string(): + dealer = Dealer.__new__(Dealer) + sres = _make_search_res('{"apple": (__import__("time").sleep(1) or 1.0)}') + scores = dealer._rank_feature_scores({"apple": 1.0}, sres) + assert np.isclose(scores[0], 0.0) + + +def test_rank_feature_scores_returns_pagerank_when_no_tag_feature(): + dealer = Dealer.__new__(Dealer) + sres = _make_search_res("{'apple': 2.0}") + scores = dealer._rank_feature_scores({PAGERANK_FLD: 10}, sres) + assert np.isclose(scores[0], 0.0)