Files
ragflow/test/unit_test/rag/llm/test_rerank_normalization.py
cleanjunc 38f9ea5fec fix(rerank): normalize reranker scores onto a single scale before hybrid blend (#15429)
### What problem does this PR solve?

Closes #15428

The hybrid score in `rag/nlp/search.py` (`rerank_by_model`) blends
reranker similarity with token similarity on a fixed `[0, 1]` scale:

```python
return tkweight * np.array(tksim) + vtweight * vtsim + rank_fea  # tkweight=0.3, vtweight=0.7
```

The reranker implementations did not agree on that scale. Only three of
roughly 17 providers normalized their output, and `NvidiaRerank`
returned raw, unbounded logits. Weighted at `0.7`, a negative logit
could push a genuinely relevant chunk below pure keyword matches, and
its magnitude swamped `tksim`, which lives in `[0, 1]`. The practical
effect was that the same query produced differently scaled scores
depending on the configured reranker, and logit based providers degraded
retrieval quality instead of improving it.

This PR enforces a single scoring contract in one place:

- `Base.similarity` is now the only public entry point. It
short-circuits empty input and guarantees a normalized result. Each
provider implements its raw scoring in `_compute_rank`, which removes
sixteen duplicated empty-input guards and the three scattered
normalization calls.
- Normalization is range-aware. Providers that already return calibrated
`[0, 1]` relevance scores (Cohere, Jina, Voyage, and others) keep their
absolute magnitudes, so `similarity_threshold` filtering and the
reported `vector_similarity` stay meaningful. Only out-of-range output
such as NVIDIA logits is min-max rescaled into `[0, 1]`.
- The twelve leftover `[DEBUG ...]` prints in `rerank_by_model`,
introduced in #14231, are removed. They ran on every retrieval, added
per-chunk overhead, and leaked queries, keywords, and document content
to stdout and logs.

A new regression suite in
`test/unit_test/rag/llm/test_rerank_normalization.py` covers logit
rescaling (positive, negative, and flat batches), preservation of
already calibrated scores, ordering, empty-input handling, and the
per-provider HTTP path. It also asserts that no provider overrides
`similarity()`, so the contract cannot silently drift.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-06-08 11:53:22 +08:00

173 lines
6.2 KiB
Python

#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Regression tests for the shared reranker score-normalization contract.
Every reranker must return scores on a single ``[0, 1]`` scale so that the
hybrid blend in ``rag/nlp/search.py`` (``tkweight * tksim + vtweight * vtsim``)
stays comparable across providers. Historically only 3 of ~17 providers
normalized, and NVIDIA returned raw, unbounded logits — which corrupted
retrieval ordering. The contract is now enforced once in ``Base.similarity``.
"""
from unittest.mock import MagicMock, patch
import numpy as np
import pytest
from rag.llm.rerank_model import (
Base,
JinaRerank,
NvidiaRerank,
)
pytestmark = pytest.mark.p1
def _mock_post(payload):
"""Patch ``requests.post`` so ``response.json()`` returns ``payload``."""
response = MagicMock()
response.raise_for_status.return_value = None
response.json.return_value = payload
return patch("rag.llm.rerank_model.requests.post", return_value=response)
class _RawRerank(Base):
    """Test double that replays a fixed array of raw scores from ``_compute_rank``."""

    def __init__(self, raw):
        # Only the scores are stored, coerced to a float ndarray;
        # ``Base.__init__`` is not invoked.
        scores = np.asarray(raw, dtype=float)
        self._raw = scores

    def _compute_rank(self, query, texts):
        """Return ``(stored_scores, 0)``; ``query`` and ``texts`` are ignored."""
        tokens = 0
        return self._raw, tokens
# --- The central guarantee: every provider's output lands in [0, 1] ----------
@pytest.mark.parametrize(
    "raw, expected",
    [
        # NVIDIA-style unbounded logits that include a negative -> rescaled.
        ([10.0, -3.0, 0.0], [1.0, 0.0, 3.0 / 13.0]),
        # Logits far above 1 -> rescaled.
        ([100.0, 50.0, 75.0], [1.0, 0.0, 0.5]),
        # Logits that are all negative -> rescaled.
        ([-1.0, -5.0, -3.0], [1.0, 0.0, 0.5]),
    ],
)
def test_out_of_range_scores_are_rescaled(raw, expected):
    """Raw scores outside ``[0, 1]`` come back min-max rescaled into ``[0, 1]``."""
    candidates = ["a", "b", "c"]
    scores, _ = _RawRerank(raw).similarity("q", candidates)
    assert np.allclose(scores, expected)
    # Bounds are checked one at a time so a failure names the violated side.
    assert scores.min() >= 0.0
    assert scores.max() <= 1.0
@pytest.mark.parametrize(
    "raw",
    [
        [0.9, 0.1, 0.5],  # relevance scores with some spread
        [0.8, 0.8, 0.8],  # identical yet valid scores -> must not be zeroed
        [1.0],  # lone calibrated candidate -> must not be zeroed
        [0.0, 1.0, 0.42],  # batch that already covers the whole range
    ],
)
def test_in_range_scores_are_preserved(raw):
    """Scores already inside ``[0, 1]`` pass through ``similarity`` unchanged."""
    # Providers that are already calibrated to [0,1] (Cohere/Jina/Voyage/...)
    # must keep absolute magnitudes: similarity_threshold and the reported
    # vector_similarity depend on them, and a degenerate batch must NOT be
    # collapsed to zero.
    candidates = ["x"] * len(raw)
    scores, _ = _RawRerank(raw).similarity("q", candidates)
    assert np.allclose(scores, raw)
def test_normalization_preserves_ordering():
    """Normalizing out-of-range scores must not change how candidates rank."""
    raw = [-5.0, 12.0, 3.0, -1.0]
    scores, _ = _RawRerank(raw).similarity("q", list("abcd"))
    expected_order = np.argsort(raw).tolist()
    assert np.argsort(scores).tolist() == expected_order
@pytest.mark.parametrize(
    "raw, expected",
    [
        # One out-of-range candidate: clamped, neither zeroed nor NaN.
        ([5.0], [1.0]),
        ([-3.0], [0.0]),
        # Out-of-range batch with zero spread: each element is clamped, the
        # batch is not collapsed.
        ([5.0, 5.0, 5.0], [1.0, 1.0, 1.0]),
        ([-2.0, -2.0, -2.0], [0.0, 0.0, 0.0]),
    ],
)
def test_spreadless_out_of_range_batch_is_clamped(raw, expected):
    """Out-of-range batches with no spread are clamped into ``[0, 1]`` without NaN."""
    candidates = ["x"] * len(raw)
    scores, _ = _RawRerank(raw).similarity("q", candidates)
    assert np.allclose(scores, expected)
    assert not np.any(np.isnan(scores))
# --- Empty input short-circuits before any backend call ----------------------
@pytest.mark.parametrize("query, texts", [("", ["a"]), ("q", []), ("", [])])
def test_empty_input_returns_zeros_without_backend(query, texts):
    """An empty query or candidate list yields zero scores without a backend call."""
    provider = _RawRerank([1.0])
    provider._compute_rank = MagicMock(side_effect=AssertionError("backend called"))

    rank, tokens = provider.similarity(query, texts)

    # Inspect the call record as well as relying on the raising side effect:
    # an ``AssertionError`` raised inside ``similarity`` could be swallowed by
    # a broad ``except Exception`` there and the test would still pass.
    provider._compute_rank.assert_not_called()
    assert tokens == 0
    assert rank.size == len(texts)
    assert rank.dtype == float
    # The test name promises zeros, so pin the values and not only the shape
    # and dtype (vacuously true for the empty-``texts`` cases).
    assert not rank.any()
# --- Per-provider: raw backend payloads come out normalized ------------------
def test_nvidia_logits_are_normalized():
    """NVIDIA returns raw logits, which the public ``similarity`` must normalize.

    Without central normalization, a negative logit weighted by vtweight=0.7
    would sink a relevant chunk below keyword matches.
    """
    nv = NvidiaRerank("key", "nvidia/rerank-qa-mistral-4b")
    payload = {
        "rankings": [
            {"index": 0, "logit": 8.0},
            {"index": 1, "logit": -4.0},
            {"index": 2, "logit": 1.0},
        ]
    }

    with _mock_post(payload):
        normalized, _ = nv.similarity("q", list("abc"))

    # The provider hook itself still hands back the untouched logits (there
    # is no per-provider normalization)...
    with _mock_post(payload):
        logits, _ = nv._compute_rank("q", list("abc"))
    assert logits.min() < 0  # genuinely unbounded/negative

    # ...while the public entry point rescales them.
    assert np.allclose(normalized, [1.0, 0.0, 5.0 / 12.0])
    assert normalized.min() >= 0.0
    assert normalized.max() <= 1.0
def test_calibrated_relevance_scores_are_preserved():
    """Jina relevance scores already in ``[0, 1]`` are returned verbatim."""
    # Had min-max been applied, these would have been stretched to
    # [1.0, 0.0, 0.5]; a calibrated provider must keep its own magnitudes.
    payload = {
        "results": [
            {"index": 0, "relevance_score": 0.8},
            {"index": 1, "relevance_score": 0.2},
            {"index": 2, "relevance_score": 0.5},
        ]
    }
    jina = JinaRerank("key", base_url="http://x/rerank")
    with _mock_post(payload):
        scores, _ = jina.similarity("q", list("abc"))
    assert np.allclose(scores, [0.8, 0.2, 0.5])
# --- Structural guarantee: providers override _compute_rank, not similarity --
def test_providers_share_single_similarity_entrypoint():
    """No ``Base`` subclass in ``rag.llm.rerank_model`` defines its own ``similarity``."""
    import inspect
    import rag.llm.rerank_model as rm

    # ``vars(cls)`` holds only what the class body itself defines, so
    # inherited ``similarity`` implementations are not counted as overrides.
    overrides = [
        cls.__name__
        for _, cls in inspect.getmembers(rm, inspect.isclass)
        if cls is not Base and issubclass(cls, Base) and "similarity" in vars(cls)
    ]
    assert overrides == [], f"providers must not override similarity(): {overrides}"