fix(llm): strip non-generation keys from gen_conf for LiteLLM providers (#15427) (#15432)

### What problem does this PR solve?

Fixes #15427.

All LiteLLM-routed chats fail with:

- Anthropic: `litellm.BadRequestError: AnthropicException -
{"type":"invalid_request_error","message":"model_type: Extra inputs are
not permitted"}`
- OpenAI: `litellm.BadRequestError: OpenAIException - Unknown parameter:
'model_type'`

This is a regression from v0.25.4.

#### Root cause

A chat assistant's `llm_setting` is forwarded to the model as
`gen_conf`. `llm_setting` can legitimately carry RAGFlow-internal
metadata such as `model_type` (the chat REST APIs in
`api/apps/restful_apis/` read it back out of `llm_setting`), so that key
ends up inside `gen_conf`.

`Base._clean_conf` (OpenAI-compatible providers) already **whitelists**
the keys it forwards, so direct-OpenAI providers were unaffected.
`LiteLLMBase._clean_conf` only dropped `max_tokens` and passed
everything else straight through to `litellm.acompletion`, which
forwarded `model_type` to the upstream provider — and Anthropic / OpenAI
reject it. Because both Claude and GPT route through LiteLLM, every chat
broke.

#### Fix

- Extract the allowed-key set into a shared `ALLOWED_GEN_CONF_KEYS`
constant and reuse it in `Base._clean_conf`.
- Apply the same whitelist in `LiteLLMBase._clean_conf`, plus the
LiteLLM-specific reasoning params (`thinking`, `reasoning_effort`,
`extra_body`) that the model-family policies inject for reasoning
models.

This covers all four LiteLLM completion paths (`async_chat`,
`async_chat_streamly`, `async_chat_with_tools`,
`async_chat_streamly_with_tools`), since they all route through
`_clean_conf`.

#### Tests

Adds `test/unit_test/rag/llm/test_clean_conf_whitelist.py` covering both
backends: `model_type` (and other stray keys) are dropped, genuine
generation params and `thinking` survive, `max_tokens` is removed, and
the whitelist invariants hold.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Added test cases
This commit is contained in:
nickmopen
2026-06-02 05:04:11 +03:00
committed by GitHub
parent eaa19bdb02
commit bebf6ed244
3 changed files with 175 additions and 25 deletions

View File

@@ -61,6 +61,47 @@ ERROR_PREFIX = "**ERROR**"
LENGTH_NOTIFICATION_CN = "······\n由于大模型的上下文窗口大小限制,回答已经被大模型截断。"
LENGTH_NOTIFICATION_EN = "...\nThe answer is truncated by your chosen LLM due to its limitation on context length."
# Generation parameters that are safe to forward to the underlying completion
# call. `gen_conf` originates from a chat assistant's `llm_setting`, which can
# also carry RAGFlow-internal metadata (e.g. `model_type`). Anything outside
# this set is dropped so providers don't reject the request with errors like
# "Extra inputs are not permitted" / "Unknown parameter: 'model_type'" (#15427).
ALLOWED_GEN_CONF_KEYS = frozenset(
{
"temperature",
"max_completion_tokens",
"top_p",
"stream",
"stream_options",
"stop",
"n",
"presence_penalty",
"frequency_penalty",
"functions",
"function_call",
"logit_bias",
"user",
"response_format",
"seed",
"tools",
"tool_choice",
"logprobs",
"top_logprobs",
"extra_headers",
}
)
# LiteLLM additionally understands reasoning-control parameters that the
# model-family policies may inject into `gen_conf` (e.g. `thinking` for
# Anthropic / Kimi reasoning models, `reasoning_effort` for OpenAI o-series).
LITELLM_ALLOWED_GEN_CONF_KEYS = ALLOWED_GEN_CONF_KEYS | frozenset(
{
"thinking",
"reasoning_effort",
"extra_body",
}
)
def _apply_model_family_policies(
model_name: str,
@@ -159,30 +200,7 @@ class Base(ABC):
if "max_tokens" in gen_conf:
del gen_conf["max_tokens"]
allowed_conf = {
"temperature",
"max_completion_tokens",
"top_p",
"stream",
"stream_options",
"stop",
"n",
"presence_penalty",
"frequency_penalty",
"functions",
"function_call",
"logit_bias",
"user",
"response_format",
"seed",
"tools",
"tool_choice",
"logprobs",
"top_logprobs",
"extra_headers",
}
gen_conf = {k: v for k, v in gen_conf.items() if k in allowed_conf}
gen_conf = {k: v for k, v in gen_conf.items() if k in ALLOWED_GEN_CONF_KEYS}
return gen_conf
async def _async_chat_streamly(self, history, gen_conf, **kwargs):
@@ -1395,6 +1413,7 @@ class LiteLLMBase(ABC):
)
gen_conf.pop("max_tokens", None)
gen_conf = {k: v for k, v in gen_conf.items() if k in LITELLM_ALLOWED_GEN_CONF_KEYS}
return gen_conf
def _need_reasoning_content_back(self) -> bool:

View File

@@ -47,4 +47,12 @@ sys.modules["common.connection_utils"].timeout = lambda *a, **kw: (lambda fn: fn
sys.modules["api.db.services.task_service"].has_canceled = lambda *_a, **_kw: False
sys.modules["rag.graphrag.general.leiden"].run = lambda *_a, **_kw: {}
sys.modules["rag.graphrag.general.leiden"].add_community_info2graph = lambda *_a, **_kw: None
sys.modules["rag.llm.chat_model"].Base = object
# Only stub ``Base`` when we actually mocked chat_model. This conftest mutates
# the global sys.modules at import time, and rag/graphrag/ is collected before
# rag/llm/. If an earlier test package already imported the real chat_model,
# unconditionally assigning ``Base = object`` clobbered the genuine class and
# leaked into the rag/llm unit tests that import it (AttributeError: no
# attribute '_clean_conf'). graphrag only uses ``Base`` as a type alias, so the
# real class works just as well when it is already loaded.
if isinstance(sys.modules["rag.llm.chat_model"], MagicMock):
sys.modules["rag.llm.chat_model"].Base = object

View File

@@ -0,0 +1,123 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Regression guard for issue #15427: LiteLLM-routed chats failed with
``model_type: Extra inputs are not permitted`` (Anthropic) /
``Unknown parameter: 'model_type'`` (OpenAI).
A chat assistant's ``llm_setting`` is forwarded to the provider as
``gen_conf``. ``llm_setting`` can legitimately carry RAGFlow-internal
metadata such as ``model_type`` (the new chat REST APIs read it back out).
``Base._clean_conf`` already whitelisted the keys it forwards, so OpenAI-
compatible providers were unaffected, but ``LiteLLMBase._clean_conf`` only
dropped ``max_tokens`` and passed everything else straight through to
``litellm.acompletion`` — which forwarded ``model_type`` to the upstream
provider and got rejected.
These tests pin the whitelisting behaviour for both backends so the leak
cannot reappear.
"""
import pytest
from rag.llm.chat_model import (
ALLOWED_GEN_CONF_KEYS,
LITELLM_ALLOWED_GEN_CONF_KEYS,
Base,
LiteLLMBase,
)
class _ConcreteBase(Base):
"""Concrete subclass so we can build an instance without touching the
real OpenAI client constructor."""
class _ConcreteLiteLLM(LiteLLMBase):
"""Concrete subclass for the same reason on the LiteLLM path."""
def _make_base(model_name="gpt-4o"):
inst = _ConcreteBase.__new__(_ConcreteBase)
inst.model_name = model_name
return inst
def _make_litellm(model_name="gpt-4o", provider=""):
inst = _ConcreteLiteLLM.__new__(_ConcreteLiteLLM)
inst.model_name = model_name
inst.provider = provider
return inst
# --------------------------------------------------------------------------- #
# The actual bug: model_type must never reach the provider.
# --------------------------------------------------------------------------- #
def test_litellm_drops_model_type():
cleaned = _make_litellm()._clean_conf({"temperature": 0.5, "model_type": "chat"})
assert "model_type" not in cleaned
assert cleaned["temperature"] == 0.5
def test_base_drops_model_type():
cleaned = _make_base()._clean_conf({"temperature": 0.5, "model_type": "chat"})
assert "model_type" not in cleaned
assert cleaned["temperature"] == 0.5
@pytest.mark.parametrize("stray_key", ["model_type", "llm_id", "parameter", "icon", "foo"])
def test_litellm_drops_arbitrary_internal_keys(stray_key):
cleaned = _make_litellm()._clean_conf({stray_key: "x", "top_p": 0.9})
assert stray_key not in cleaned
assert cleaned["top_p"] == 0.9
# --------------------------------------------------------------------------- #
# The fix must not over-filter: genuine generation params still pass through.
# --------------------------------------------------------------------------- #
def test_litellm_preserves_known_generation_params():
gen_conf = {
"temperature": 0.7,
"top_p": 0.95,
"presence_penalty": 0.1,
"frequency_penalty": 0.2,
}
cleaned = _make_litellm()._clean_conf(dict(gen_conf))
assert cleaned == gen_conf
def test_litellm_preserves_thinking_param():
"""``thinking`` is injected by the model-family policy for reasoning
models and must survive the whitelist (it is a valid LiteLLM param)."""
cleaned = _make_litellm()._clean_conf({"thinking": {"type": "enabled"}, "temperature": 1.0})
assert cleaned["thinking"] == {"type": "enabled"}
def test_max_tokens_is_dropped_on_both_backends():
assert "max_tokens" not in _make_litellm()._clean_conf({"max_tokens": 100, "temperature": 0.3})
assert "max_tokens" not in _make_base()._clean_conf({"max_tokens": 100, "temperature": 0.3})
# --------------------------------------------------------------------------- #
# Whitelist invariants.
# --------------------------------------------------------------------------- #
def test_litellm_whitelist_is_superset_of_base():
assert ALLOWED_GEN_CONF_KEYS <= LITELLM_ALLOWED_GEN_CONF_KEYS
def test_model_type_not_whitelisted_anywhere():
assert "model_type" not in ALLOWED_GEN_CONF_KEYS
assert "model_type" not in LITELLM_ALLOWED_GEN_CONF_KEYS