2024-05-29 16:50:02 +08:00
|
|
|
#
|
|
|
|
|
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
|
|
|
|
#
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
|
#
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
#
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
|
# limitations under the License.
|
|
|
|
|
#
|
2025-07-03 19:05:31 +08:00
|
|
|
import json
|
feat: add FuturMix as model provider (#14419)
## Summary
Add [FuturMix](https://futurmix.ai) as a new model provider. FuturMix is
an OpenAI-compatible unified AI gateway that provides access to 22+
models (GPT, Claude, Gemini, DeepSeek, and more) through a single API
endpoint and key.
- **API Base**: `https://futurmix.ai/v1` (OpenAI-compatible)
- **Supported capabilities**: Chat, Embedding, Image2Text, TTS,
Speech2Text, Rerank
### Changes
| File | Change |
|------|--------|
| `rag/llm/__init__.py` | Add `FuturMix` to `SupportedLiteLLMProvider`
enum, `FACTORY_DEFAULT_BASE_URL`, and `LITELLM_PROVIDER_PREFIX` |
| `rag/llm/chat_model.py` | Add `FuturMixChat(Base)` — follows
Astraflow/Avian pattern |
| `rag/llm/embedding_model.py` | Add `FuturMixEmbed(OpenAIEmbed)` —
follows Astraflow pattern |
| `rag/llm/cv_model.py` | Add `FuturMixCV(GptV4)` — follows
SILICONFLOW/OpenRouter pattern |
| `rag/llm/tts_model.py` | Add `FuturMixTTS(OpenAITTS)` — follows
CometAPI/DeerAPI pattern |
| `rag/llm/sequence2txt_model.py` | Add `FuturMixSeq2txt(GPTSeq2txt)` —
follows StepFun pattern |
| `rag/llm/rerank_model.py` | Add `FuturMixRerank(OpenAI_APIRerank)` |
| `conf/llm_factories.json` | Add factory config with 8 chat, 2
embedding, 1 image2text, 2 TTS, 1 speech2text models |
| `docs/guides/models/supported_models.mdx` | Add FuturMix to supported
models table |
### Models included
- **Chat**: claude-sonnet-4-20250514, claude-3.5-haiku, gpt-4o,
gpt-4o-mini, gemini-2.5-flash, gemini-2.0-flash, deepseek-chat,
deepseek-reasoner
- **Embedding**: text-embedding-3-small, text-embedding-3-large
- **Image2Text**: gpt-4o
- **TTS**: tts-1, tts-1-hd
- **Speech2Text**: whisper-1
## Test plan
- [ ] Verify FuturMix appears in the model provider list in RAGFlow UI
- [ ] Configure FuturMix with API key and test chat completion
- [ ] Test embedding model with document indexing
- [ ] Test image2text with a sample image
🤖 Generated with [Claude Code](https://claude.com/claude-code)
---------
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-30 10:59:37 +08:00
|
|
|
import logging
|
2025-07-03 19:05:31 +08:00
|
|
|
from abc import ABC
|
2024-10-09 19:37:11 +08:00
|
|
|
from urllib.parse import urljoin
|
2026-05-11 12:40:41 +08:00
|
|
|
from typing import Tuple, List
|
|
|
|
|
from http import HTTPStatus
|
2024-10-09 19:37:11 +08:00
|
|
|
|
2024-05-29 16:50:02 +08:00
|
|
|
import numpy as np
|
2025-07-03 19:05:31 +08:00
|
|
|
import requests
|
2025-01-15 14:15:58 +08:00
|
|
|
from yarl import URL
|
2024-09-24 19:22:01 +08:00
|
|
|
|
2025-11-03 20:25:02 +08:00
|
|
|
from common.log_utils import log_exception
|
2025-11-03 08:50:05 +08:00
|
|
|
from common.token_utils import num_tokens_from_string, truncate, total_token_count_from_response
|
2024-05-29 16:50:02 +08:00
|
|
|
|
|
|
|
|
class Base(ABC):
    """Shared scoring contract for every reranker backend in this module.

    Callers go through :meth:`similarity`; each backend supplies its raw
    scores by overriding :meth:`_compute_rank`.
    """

    def __init__(self, key, model_name, **kwargs):
        """Accept the common constructor arguments; the base class stores nothing."""
        pass

    def similarity(self, query: str, texts: List) -> Tuple[np.ndarray, int]:
        """Score ``texts`` against ``query`` and return ``(rank, token_count)``.

        This is the public entry point shared by the rerankers. It
        short-circuits empty input and passes the backend's raw output through
        :meth:`_normalize_rank`, so the returned scores lie in ``[0, 1]``:
        scores that are already inside that range are kept as they are, and
        only out-of-range output (e.g. raw logits) is rescaled or clamped.

        Args:
            query: The query string. An empty query yields all-zero scores.
            texts: Candidate texts. Empty or ``None`` yields an empty array.

        Returns:
            A float array with one score per text, and the token count
            reported by :meth:`_compute_rank` (``0`` on the empty-input path).

        Subclasses implement provider-specific scoring in :meth:`_compute_rank`
        and must not normalize themselves.
        """
        if not query or not texts:
            return np.zeros(len(texts) if texts else 0, dtype=float), 0

        rank, token_count = self._compute_rank(query, texts)
        rank = np.asarray(rank, dtype=float)
        if rank.size:
            # Only aggregate statistics are logged, never the query or texts.
            logging.debug(
                "Rerank %s scores before normalization: count=%d min=%.4f max=%.4f",
                self.__class__.__name__,
                rank.size,
                float(np.min(rank)),
                float(np.max(rank)),
            )
        return self._normalize_rank(rank), token_count

    def _compute_rank(self, query: str, texts: List) -> Tuple[np.ndarray, int]:
        """Provider-specific scoring. ``query`` and ``texts`` are non-empty."""
        raise NotImplementedError("Please implement _compute_rank method!")

    @staticmethod
    def _normalize_rank(rank: np.ndarray) -> np.ndarray:
        """Guarantee scores land in ``[0, 1]``.

        * An empty array is returned as is.
        * Non-finite entries are sanitized first, because a single NaN would
          otherwise make every comparison below False and turn the whole
          batch into NaN: NaN and ``-inf`` become the lowest finite score of
          the batch, ``+inf`` becomes the highest. If no entry is finite,
          ``+inf`` maps to ``1.0`` and everything else to ``0.0``.
        * A batch already inside ``[0, 1]`` is returned unchanged, so
          calibrated scores keep their absolute magnitude.
        * An out-of-range batch with a usable spread is min-max mapped onto
          ``[0, 1]``.
        * An out-of-range batch whose spread is below ``1e-3`` (including a
          single candidate) carries no relative signal and is clamped
          instead, so a lone high score is not zeroed by the rescale.

        Args:
            rank: Raw scores as a NumPy array.

        Returns:
            An array of the same shape with every value in ``[0, 1]``. The
            input array is not modified.
        """
        if rank.size == 0:
            return rank

        finite_mask = np.isfinite(rank)
        if not finite_mask.all():
            if finite_mask.any():
                floor = float(np.min(rank[finite_mask]))
                ceiling = float(np.max(rank[finite_mask]))
            else:
                floor, ceiling = 0.0, 1.0
            # nan_to_num copies by default, so the caller's array is untouched.
            rank = np.nan_to_num(rank, nan=floor, posinf=ceiling, neginf=floor)

        min_rank = float(np.min(rank))
        max_rank = float(np.max(rank))
        if min_rank >= 0.0 and max_rank <= 1.0:
            return rank

        span = max_rank - min_rank
        if span < 1e-3:
            return np.clip(rank, 0.0, 1.0)
        return (rank - min_rank) / span
|
2026-01-12 11:07:11 +08:00
|
|
|
|
2024-05-29 16:50:02 +08:00
|
|
|
|
|
|
|
|
class JinaRerank(Base):
    """Reranker backed by an HTTP rerank endpoint (Jina by default)."""

    _FACTORY_NAME = "Jina"

    def __init__(self, key, model_name="jina-reranker-v2-base-multilingual", base_url="https://api.jina.ai/v1/rerank"):
        """Store the endpoint, bearer-token headers and model name.

        Args:
            key: API key sent as a ``Bearer`` token.
            model_name: Model identifier placed in each request body.
            base_url: Rerank endpoint; a falsy value falls back to the
                default Jina URL.
        """
        self.model_name = model_name
        self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
        self.base_url = base_url or "https://api.jina.ai/v1/rerank"

    def _compute_rank(self, query: str, texts: List) -> Tuple[np.ndarray, int]:
        """POST the query and truncated documents, return raw scores and token usage.

        Returns:
            ``(scores, token_count)`` where ``scores[i]`` is the
            ``relevance_score`` reported for document index ``i`` (``0.0`` for
            indices missing from the response) and ``token_count`` comes from
            ``total_token_count_from_response``.

        Raises:
            requests.HTTPError: On a non-success HTTP status.
        """
        clipped = [truncate(doc, 8196) for doc in texts]
        payload = {"model": self.model_name, "query": query, "documents": clipped, "top_n": len(clipped)}

        reply = requests.post(self.base_url, headers=self.headers, json=payload, timeout=30)
        reply.raise_for_status()
        body = reply.json()

        scores = np.zeros(len(clipped), dtype=float)
        try:
            for item in body.get("results", []):
                scores[item["index"]] = item["relevance_score"]
        except Exception as err:
            # A malformed response body is handed to the project's log helper.
            log_exception(err, body)

        token_total = total_token_count_from_response(body)
        return scores, token_total
|
2024-05-29 16:50:02 +08:00
|
|
|
|
|
|
|
|
|
2024-07-11 18:37:41 +08:00
|
|
|
class XInferenceRerank(Base):
    """Reranker that talks to an Xinference ``/v1/rerank`` HTTP endpoint."""

    _FACTORY_NAME = "Xinference"

    def __init__(self, key="x", model_name="", base_url=""):
        """Resolve the rerank endpoint and build the request headers.

        Args:
            key: Optional API key. The placeholder ``"x"`` (or any falsy
                value) means no ``Authorization`` header is sent.
            model_name: Model identifier placed in each request body.
            base_url: Server URL. Unless it already contains both ``/v1`` and
                ``/rerank``, it is re-pointed at ``/v1/rerank`` on the same
                host.
        """
        if "/v1" not in base_url or "/rerank" not in base_url:
            # The leading slash makes urljoin replace any existing path.
            base_url = urljoin(base_url, "/v1/rerank")
        self.model_name = model_name
        self.base_url = base_url
        self.headers = {"Content-Type": "application/json", "accept": "application/json"}
        if key and key != "x":
            self.headers["Authorization"] = f"Bearer {key}"

    def _compute_rank(self, query: str, texts: List) -> Tuple[np.ndarray, int]:
        """POST the query and truncated documents, return raw scores and a token count.

        The documents sent are the truncated ones, so the payload matches the
        texts the token count is computed from (previously the truncation was
        computed but the untruncated texts were sent).

        Returns:
            ``(scores, token_count)`` where ``scores[i]`` is the
            ``relevance_score`` reported for document index ``i`` (``0.0`` for
            indices missing from the response) and ``token_count`` is the sum
            of ``num_tokens_from_string`` over the truncated documents.

        Raises:
            requests.HTTPError: On a non-success HTTP status.
        """
        documents = [truncate(t, 4096) for t in texts]
        token_count = sum(num_tokens_from_string(doc) for doc in documents)

        data = {"model": self.model_name, "query": query, "return_documents": "true", "return_len": "true", "documents": documents}
        response = requests.post(self.base_url, headers=self.headers, json=data, timeout=30)
        response.raise_for_status()
        res = response.json()

        rank = np.zeros(len(documents), dtype=float)
        try:
            for d in res.get("results", []):
                rank[d["index"]] = d["relevance_score"]
        except Exception as _e:
            # A malformed response body is handed to the project's log helper.
            log_exception(_e, res)
        return rank, token_count
|
2024-07-19 15:50:28 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class LocalAIRerank(Base):
    """Reranker that talks to a LocalAI ``/rerank`` HTTP endpoint."""

    _FACTORY_NAME = "LocalAI"

    def __init__(self, key, model_name, base_url):
        """Resolve the rerank endpoint, headers and model name.

        Args:
            key: API key sent as a ``Bearer`` token.
            model_name: Model identifier; only the part before the first
                ``"___"`` is kept.
            base_url: Server URL. If it does not contain ``/rerank`` it is
                re-pointed at ``/rerank`` on the same host.
        """
        has_endpoint = base_url.find("/rerank") != -1
        # The leading slash makes urljoin replace any existing path.
        self.base_url = base_url if has_endpoint else urljoin(base_url, "/rerank")
        self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
        self.model_name = model_name.split("___")[0]

    def _compute_rank(self, query: str, texts: List) -> Tuple[np.ndarray, int]:
        """POST the query and truncated documents, return raw scores and a token count.

        Returns:
            ``(scores, token_count)`` where ``scores[i]`` is the
            ``relevance_score`` reported for document index ``i`` (``0.0`` for
            indices missing from the response) and ``token_count`` is the sum
            of ``num_tokens_from_string`` over the truncated documents.

        Raises:
            requests.HTTPError: On a non-success HTTP status.
        """
        clipped = [truncate(doc, 500) for doc in texts]
        payload = {
            "model": self.model_name,
            "query": query,
            "documents": clipped,
            "top_n": len(clipped),
        }
        token_total = sum(num_tokens_from_string(doc) for doc in clipped)

        reply = requests.post(self.base_url, headers=self.headers, json=payload, timeout=30)
        reply.raise_for_status()
        body = reply.json()

        scores = np.zeros(len(clipped), dtype=float)
        try:
            for item in body.get("results", []):
                scores[item["index"]] = item["relevance_score"]
        except Exception as err:
            # A malformed response body is handed to the project's log helper.
            log_exception(err, body)
        return scores, token_total
|
2024-07-23 10:43:09 +08:00
|
|
|
|
2025-03-06 10:44:04 +08:00
|
|
|
|
2024-07-23 10:43:09 +08:00
|
|
|
class NvidiaRerank(Base):
    """Reranker that talks to an NVIDIA retrieval reranking HTTP endpoint.

    The endpoint returns a ``logit`` per passage; these raw values are
    returned as they are from :meth:`_compute_rank`.
    """

    _FACTORY_NAME = "NVIDIA"

    def __init__(self, key, model_name, base_url="https://ai.api.nvidia.com/v1/retrieval/nvidia/"):
        """Resolve the model-specific endpoint and build the request headers.

        Args:
            key: API key sent as a ``Bearer`` token.
            model_name: Model identifier. Two names get special treatment:
                ``nvidia/nv-rerankqa-mistral-4b-v3`` uses its own endpoint
                path, and ``nvidia/rerank-qa-mistral-4b`` is sent under the
                name ``nv-rerank-qa-mistral-4b:1``.
            base_url: Base retrieval URL; a falsy value falls back to the
                default NVIDIA URL.
        """
        if not base_url:
            base_url = "https://ai.api.nvidia.com/v1/retrieval/nvidia/"
        self.model_name = model_name

        # Default endpoint, so that models other than the two handled below no
        # longer leave ``base_url`` unset (which raised AttributeError on the
        # first request). When ``base_url`` already ends in ".../reranking",
        # urljoin leaves it unchanged.
        # NOTE(review): assumes other models are served by the generic
        # "reranking" path or by a full endpoint passed as base_url — confirm.
        self.base_url = urljoin(base_url, "reranking")

        if self.model_name == "nvidia/nv-rerankqa-mistral-4b-v3":
            self.base_url = urljoin(base_url, "nv-rerankqa-mistral-4b-v3/reranking")
        elif self.model_name == "nvidia/rerank-qa-mistral-4b":
            # Uses the generic endpoint set above, under a different model name.
            self.model_name = "nv-rerank-qa-mistral-4b:1"

        self.headers = {
            "accept": "application/json",
            "Content-Type": "application/json",
            "Authorization": f"Bearer {key}",
        }

    def _compute_rank(self, query: str, texts: List) -> Tuple[np.ndarray, int]:
        """POST the query and passages, return raw logits and a token count.

        Returns:
            ``(scores, token_count)`` where ``scores[i]`` is the ``logit``
            reported for passage index ``i`` (``0.0`` for indices missing from
            the response) and ``token_count`` is the sum of
            ``num_tokens_from_string`` over the query and all texts.

        Raises:
            requests.HTTPError: On a non-success HTTP status.
        """
        token_count = num_tokens_from_string(query) + sum(num_tokens_from_string(t) for t in texts)
        data = {
            "model": self.model_name,
            "query": {"text": query},
            "passages": [{"text": text} for text in texts],
            "truncate": "END",
            "top_n": len(texts),
        }
        response = requests.post(self.base_url, headers=self.headers, json=data, timeout=30)
        response.raise_for_status()
        res = response.json()

        rank = np.zeros(len(texts), dtype=float)
        try:
            for d in res.get("rankings", []):
                rank[d["index"]] = d["logit"]
        except Exception as _e:
            # A malformed response body is handed to the project's log helper.
            log_exception(_e, res)
        return rank, token_count
|
2024-07-24 12:46:43 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class LmStudioRerank(Base):
    """Placeholder reranker for the "LM-Studio" factory name; scoring is not implemented."""

    _FACTORY_NAME = "LM-Studio"

    def __init__(self, key, model_name, base_url, **kwargs):
        """Accept the common constructor arguments and store none of them."""

    def _compute_rank(self, query: str, texts: List) -> Tuple[np.ndarray, int]:
        """Always raise ``NotImplementedError``: no scoring backend exists for this class."""
        raise NotImplementedError("The LmStudioRerank has not been implemented")
|
2024-08-06 16:20:21 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class OpenAI_APIRerank(Base):
    """Reranker that talks to a ``/rerank`` HTTP endpoint with bearer-token auth."""

    _FACTORY_NAME = "OpenAI-API-Compatible"

    def __init__(self, key, model_name, base_url):
        """Resolve the rerank endpoint and prepare the request headers.

        Args:
            key: API key, sent as a ``Bearer`` token.
            model_name: Model identifier; only the part before the first
                ``"___"`` is kept.
            base_url: Service URL. ``None`` is treated as an empty string.
                If it does not already contain ``"/rerank"``, a ``rerank``
                path segment is appended. Trailing slashes are dropped in
                both cases.
        """
        endpoint = (base_url or "").strip().rstrip("/")
        if "/rerank" not in endpoint:
            endpoint = urljoin(f"{endpoint}/", "rerank").rstrip("/")
        self.base_url = endpoint
        self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
        self.model_name = model_name.split("___")[0]

    def _compute_rank(self, query: str, texts: List) -> Tuple[np.ndarray, int]:
        """Score ``texts`` against ``query`` using the configured endpoint.

        Every passage is first shortened with ``truncate(t, 500)``; both the
        request and the token estimate use the shortened passages. Each item
        of the response's ``results`` list contributes its
        ``relevance_score`` at the position given by ``index``; passages the
        response does not mention keep ``0.0``.

        Returns:
            ``(scores, token_count)`` where ``token_count`` is the local
            ``num_tokens_from_string`` total over the shortened passages
            (the query is not counted).

        Raises:
            requests.HTTPError: if the endpoint answers with an error status.
        """
        # The fixed cap of 500 is a conservative limit; there is no
        # configuration parameter for it.
        clipped = [truncate(doc, 500) for doc in texts]
        payload = {
            "model": self.model_name,
            "query": query,
            "documents": clipped,
            "top_n": len(clipped),
        }
        used_tokens = sum(num_tokens_from_string(doc) for doc in clipped)

        reply = requests.post(self.base_url, headers=self.headers, json=payload, timeout=30)
        reply.raise_for_status()
        body = reply.json()

        scores = np.zeros(len(clipped), dtype=float)
        try:
            for item in body.get("results", []):
                scores[item["index"]] = item["relevance_score"]
        except Exception as _e:
            # Malformed items are handed to log_exception with the parsed body.
            log_exception(_e, body)
        return scores, used_tokens
|
2024-08-07 18:40:51 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class CoHereRerank(Base):
    """Reranker backed by the ``cohere`` SDK client."""

    # This class is listed under two factory names.
    _FACTORY_NAME = ["Cohere", "VLLM"]

    def __init__(self, key, model_name, base_url=None):
        """Create the SDK client.

        Args:
            key: API key passed to ``cohere.Client``.
            model_name: Model identifier; only the part before the first
                ``"___"`` is kept.
            base_url: Optional endpoint override. ``None``, empty and
                whitespace-only values leave the SDK default in place;
                otherwise the value is passed on with surrounding
                whitespace removed.
        """
        # Imported lazily so the SDK is only needed when this class is used.
        from cohere import Client

        client_kwargs = {"api_key": key, "timeout": 30.0}
        # Strip before forwarding: the value that is validated must be the
        # value that is used, otherwise a padded URL reaches the SDK verbatim.
        endpoint = base_url.strip() if base_url else ""
        if endpoint:
            client_kwargs["base_url"] = endpoint
        self.client = Client(**client_kwargs)
        self.model_name = model_name.split("___")[0]

    def _compute_rank(self, query: str, texts: List) -> Tuple[np.ndarray, int]:
        """Score ``texts`` against ``query`` with ``self.client.rerank``.

        Each element of ``res.results`` contributes its ``relevance_score``
        at the position given by its ``index``; passages not present in the
        result keep ``0.0``.

        Returns:
            ``(scores, token_count)`` where ``token_count`` is estimated
            locally with ``num_tokens_from_string`` over the query and all
            passages.
        """
        token_count = num_tokens_from_string(query) + sum(num_tokens_from_string(t) for t in texts)
        res = self.client.rerank(
            model=self.model_name,
            query=query,
            documents=texts,
            top_n=len(texts),
            return_documents=False,
        )
        rank = np.zeros(len(texts), dtype=float)
        try:
            for d in res.results:
                rank[d.index] = d.relevance_score
        except Exception as _e:
            # Unexpected result shapes are handed to log_exception with the
            # SDK response.
            log_exception(_e, res)
        return rank, token_count
|
2024-08-12 10:15:21 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class TogetherAIRerank(Base):
    """Placeholder reranker for the "TogetherAI" factory name; scoring is not implemented."""

    _FACTORY_NAME = "TogetherAI"

    def __init__(self, key, model_name, base_url, **kwargs):
        """Accept the common constructor arguments and store none of them."""

    def _compute_rank(self, query: str, texts: List) -> Tuple[np.ndarray, int]:
        """Always raise ``NotImplementedError``: no scoring backend exists for this class."""
        raise NotImplementedError("The api has not been implemented")
|
2024-08-13 16:09:10 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class SILICONFLOWRerank(Base):
    """Reranker that posts to a SiliconFlow-style ``/rerank`` HTTP endpoint."""

    _FACTORY_NAME = "SILICONFLOW"

    def __init__(self, key, model_name, base_url="https://api.siliconflow.cn/v1/rerank"):
        """Resolve the endpoint URL and prepare the request headers.

        Args:
            key: API key, sent as a ``Bearer`` token.
            model_name: Model identifier, stored unchanged.
            base_url: Service URL. ``None``, empty and whitespace-only
                values fall back to the default endpoint. A URL that does
                not contain ``"/rerank"`` gets a ``rerank`` path segment
                appended.
        """
        endpoint = (base_url or "").strip() or "https://api.siliconflow.cn/v1/rerank"
        if "/rerank" not in endpoint:
            endpoint = urljoin(f"{endpoint.rstrip('/')}/", "rerank").rstrip("/")

        self.model_name = model_name
        self.base_url = endpoint
        self.headers = {
            "accept": "application/json",
            "content-type": "application/json",
            "authorization": f"Bearer {key}",
        }

    def _compute_rank(self, query: str, texts: List) -> Tuple[np.ndarray, int]:
        """Score ``texts`` against ``query`` using the configured endpoint.

        Each item of the response's ``results`` list contributes its
        ``relevance_score`` at the position given by ``index``; passages the
        response does not mention keep ``0.0``.

        Returns:
            ``(scores, token_count)`` where ``token_count`` comes from
            ``total_token_count_from_response`` applied to the parsed
            response body.

        Raises:
            requests.HTTPError: if the endpoint answers with an error status.
        """
        request_body = {
            "model": self.model_name,
            "query": query,
            "documents": texts,
            "top_n": len(texts),
            "return_documents": False,
            "max_chunks_per_doc": 1024,
            "overlap_tokens": 80,
        }
        response = requests.post(self.base_url, json=request_body, headers=self.headers, timeout=30)
        response.raise_for_status()
        parsed = response.json()

        scores = np.zeros(len(texts), dtype=float)
        try:
            for item in parsed.get("results", []):
                scores[item["index"]] = item["relevance_score"]
        except Exception as _e:
            # The raw HTTP response object (not the parsed body) is passed on.
            log_exception(_e, response)
        return scores, total_token_count_from_response(parsed)
|
2024-08-22 16:45:15 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class BaiduYiyanRerank(Base):
    """Reranker backed by ``qianfan.resources.Reranker``."""

    _FACTORY_NAME = "BaiduYiyan"

    def __init__(self, key, model_name, base_url=None):
        """Create the qianfan client from a JSON-encoded credential pair.

        Args:
            key: JSON string; its ``yiyan_ak`` and ``yiyan_sk`` entries are
                used as access key and secret key (each defaults to ``""``
                when absent).
            model_name: Model identifier, stored unchanged.
            base_url: Accepted but not used by this constructor.
        """
        # Imported lazily so the SDK is only needed when this class is used.
        from qianfan.resources import Reranker

        credentials = json.loads(key)
        self.client = Reranker(
            ak=credentials.get("yiyan_ak", ""),
            sk=credentials.get("yiyan_sk", ""),
            request_timeout=30,
        )
        self.model_name = model_name

    def _compute_rank(self, query: str, texts: List) -> Tuple[np.ndarray, int]:
        """Score ``texts`` against ``query`` with ``self.client.do``.

        The ``body`` of the SDK response is read; each item of its
        ``results`` list contributes its ``relevance_score`` at the position
        given by ``index``. Passages not mentioned keep ``0.0``.

        Returns:
            ``(scores, token_count)`` where ``token_count`` comes from
            ``total_token_count_from_response`` applied to the response body.
        """
        reply = self.client.do(
            model=self.model_name,
            query=query,
            documents=texts,
            top_n=len(texts),
        )
        body = reply.body

        scores = np.zeros(len(texts), dtype=float)
        try:
            for item in body.get("results", []):
                scores[item["index"]] = item["relevance_score"]
        except Exception as _e:
            # Malformed items are handed to log_exception with the body.
            log_exception(_e, body)
        return scores, total_token_count_from_response(body)
|
2024-08-29 16:14:49 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class VoyageRerank(Base):
    """Reranker backed by the ``voyageai`` SDK client."""

    _FACTORY_NAME = "Voyage AI"

    def __init__(self, key, model_name, base_url=None):
        """Create the SDK client.

        Args:
            key: API key passed to ``voyageai.Client``.
            model_name: Model identifier, stored unchanged.
            base_url: Accepted but not used by this constructor.
        """
        # Imported lazily so the SDK is only needed when this class is used.
        import voyageai

        self.client = voyageai.Client(api_key=key, timeout=30.0)
        self.model_name = model_name

    def _compute_rank(self, query: str, texts: List) -> Tuple[np.ndarray, int]:
        """Score ``texts`` against ``query`` with ``self.client.rerank``.

        Each element of ``res.results`` contributes its ``relevance_score``
        at the position given by its ``index``; passages not present in the
        result keep ``0.0``.

        Returns:
            ``(scores, token_count)`` where ``token_count`` is the
            ``total_tokens`` attribute of the SDK response.
        """
        scores = np.zeros(len(texts), dtype=float)

        outcome = self.client.rerank(
            query=query,
            documents=texts,
            model=self.model_name,
            top_k=len(texts),
        )
        try:
            for hit in outcome.results:
                scores[hit.index] = hit.relevance_score
        except Exception as _e:
            # Unexpected result shapes are handed to log_exception with the
            # SDK response.
            log_exception(_e, outcome)
        return scores, outcome.total_tokens
|
2024-10-21 12:11:08 +08:00
|
|
|
|
2024-11-06 18:47:53 +08:00
|
|
|
|
2024-10-21 12:11:08 +08:00
|
|
|
class QWenRerank(Base):
|
2025-07-03 19:05:31 +08:00
|
|
|
_FACTORY_NAME = "Tongyi-Qianwen"
|
|
|
|
|
|
2026-04-28 20:17:34 +08:00
|
|
|
def __init__(self, key, model_name="gte-rerank", **kwargs):
|
2024-10-21 12:11:08 +08:00
|
|
|
import dashscope
|
|
|
|
|
self.api_key = key
|
|
|
|
|
self.model_name = dashscope.TextReRank.Models.gte_rerank if model_name is None else model_name
|
2026-05-11 12:40:41 +08:00
|
|
|
# Remove invalid global timeout, use official SDK per-request timeout parameter
|
|
|
|
|
self.request_timeout = 30.0
|
2024-10-21 12:11:08 +08:00
|
|
|
|
fix(rerank): normalize reranker scores onto a single scale before hybrid blend (#15429)
### What problem does this PR solve?
Closes #15428
The hybrid score in `rag/nlp/search.py` (`rerank_by_model`) blends
reranker similarity with token similarity on a fixed `[0, 1]` scale:
```python
return tkweight * np.array(tksim) + vtweight * vtsim + rank_fea # tkweight=0.3, vtweight=0.7
```
The reranker implementations did not agree on that scale. Only three of
roughly 17 providers normalized their output, and `NvidiaRerank`
returned raw, unbounded logits. Weighted at `0.7`, a negative logit
could push a genuinely relevant chunk below pure keyword matches, and
its magnitude swamped `tksim`, which lives in `[0, 1]`. The practical
effect was that the same query produced differently scaled scores
depending on the configured reranker, and logit based providers degraded
retrieval quality instead of improving it.
This PR enforces a single scoring contract in one place:
- `Base.similarity` is now the only public entry point. It
short-circuits empty input and guarantees a normalized result. Each
provider implements its raw scoring in `_compute_rank`, which removes
sixteen duplicated empty input guards and the three scattered
normalization calls.
- Normalization is range aware. Providers that already return calibrated
`[0, 1]` relevance scores (Cohere, Jina, Voyage, and others) keep their
absolute magnitudes, so `similarity_threshold` filtering and the
reported `vector_similarity` stay meaningful. Only out-of-range output
such as NVIDIA logits is min-max rescaled into `[0, 1]`.
- The twelve leftover `[DEBUG ...]` prints in `rerank_by_model`,
introduced in #14231, are removed. They ran on every retrieval, added
per chunk overhead, and leaked queries, keywords, and document content
to stdout and logs.
A new regression suite in
`test/unit_test/rag/llm/test_rerank_normalization.py` covers logit
rescaling (positive, negative, and flat batches), preservation of
already calibrated scores, ordering, empty input handling, and the per
provider HTTP path. It also asserts that no provider overrides
`similarity()`, so the contract cannot silently drift.
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
2026-06-08 06:53:22 +03:00
|
|
|
def _compute_rank(self, query: str, texts: List) -> Tuple[np.ndarray, int]:
    """Score ``texts`` against ``query`` through DashScope's ``TextReRank`` API.

    Args:
        query: Query string sent to the reranker.
        texts: Candidate documents; the result array has one slot per entry.

    Returns:
        ``(rank, token_count)``. ``rank[i]`` is the relevance score the
        response reports for index ``i`` and stays ``0.0`` for indexes the
        response does not mention. ``token_count`` is whatever
        ``total_token_count_from_response`` extracts from the response.

    Raises:
        ValueError: If the response status is not ``HTTPStatus.OK``.
    """
    import dashscope

    # Arguments shared by both model families, including the official
    # ``request_timeout`` parameter.
    call_args = {
        "api_key": self.api_key,
        "model": self.model_name,
        "query": query,
        "documents": texts,
        "top_n": len(texts),
        "request_timeout": self.request_timeout,
    }
    if not self.model_name.startswith("qwen3-rerank"):
        # Only models outside the "qwen3-rerank" family are sent this flag.
        call_args["return_documents"] = False
    resp = dashscope.TextReRank.call(**call_args)

    rank = np.zeros(len(texts), dtype=float)

    if resp.status_code != HTTPStatus.OK:
        # Pick the most informative error body available: the "text" entry,
        # then a JSON dump of the response, then its plain string form.
        error_body = None
        try:
            if isinstance(resp, dict) and "text" in resp:
                error_body = resp["text"]
        except Exception:
            error_body = None
        if not error_body:
            try:
                error_body = json.dumps(dict(resp), ensure_ascii=False)
            except Exception:
                error_body = str(resp)
        raise ValueError(f"Error calling QWenRerank model {self.model_name}: {resp.status_code} - {error_body}")

    try:
        for item in resp.output.results:
            rank[item.index] = item.relevance_score
    except Exception as _e:
        # Malformed payloads are handed to the shared helper together with
        # the raw response.
        log_exception(_e, resp)
    return rank, total_token_count_from_response(resp)
|
2025-01-15 14:15:58 +08:00
|
|
|
|
2025-03-06 10:44:04 +08:00
|
|
|
|
2025-10-23 23:02:27 +08:00
|
|
|
class HuggingfaceRerank(Base):
    """Reranker client that posts batches of texts to an HTTP ``/rerank`` endpoint."""

    _FACTORY_NAME = "HuggingFace"

    @staticmethod
    def post(query: str, texts: list, url: str = "http://127.0.0.1"):
        """Request scores for ``texts`` in batches of 8 and return them as an array.

        Args:
            query: Query string sent with every batch.
            texts: Documents to score.
            url: Server address. A missing scheme defaults to ``http://`` and
                ``/rerank`` is appended unless the URL already ends with it.
                A falsy value falls back to ``http://127.0.0.1``.

        Returns:
            ``np.ndarray`` of floats with one entry per text, ``0.0`` for any
            index the server did not report.

        Raises:
            Exception: Whatever the first failing batch raises (connection
                error, HTTP error status, malformed payload). Remaining
                batches are not sent because their results would be
                discarded anyway.
        """
        batch_size = 8
        scores = np.zeros(len(texts), dtype=float)

        # Build the endpoint defensively so that a configured URL that already
        # contains the path does not end up as ".../rerank/rerank".
        base_url = (url or "http://127.0.0.1").rstrip("/")
        if not base_url.startswith(("http://", "https://")):
            base_url = f"http://{base_url}"
        endpoint = base_url if base_url.endswith("/rerank") else f"{base_url}/rerank"

        for start in range(0, len(texts), batch_size):
            # The timeout keeps an unresponsive server from blocking forever.
            res = requests.post(
                endpoint,
                headers={"Content-Type": "application/json"},
                json={"query": query, "texts": texts[start:start + batch_size], "raw_scores": False, "truncate": True},
                timeout=30,
            )
            res.raise_for_status()
            for o in res.json():
                # Indexes in the reply are relative to the batch.
                scores[o["index"] + start] = o["score"]

        return scores

    def __init__(self, key, model_name="BAAI/bge-reranker-v2-m3", base_url="http://127.0.0.1"):
        """Store the model name (text before the first ``___``) and server URL.

        ``key`` is accepted for signature compatibility and is not used here.
        A falsy ``base_url`` falls back to ``http://127.0.0.1``.
        """
        self.model_name = model_name.split("___")[0]
        self.base_url = base_url or "http://127.0.0.1"

    def _compute_rank(self, query: str, texts: List) -> tuple[np.ndarray, int]:
        """Return ``(scores, token_count)`` for ``texts``.

        ``token_count`` is the sum of ``num_tokens_from_string`` over the
        texts; scores come from :meth:`post`.
        """
        token_count = sum(num_tokens_from_string(t) for t in texts)
        return HuggingfaceRerank.post(query, texts, self.base_url), token_count
|
|
|
|
|
|
|
|
|
|
|
2025-01-15 14:15:58 +08:00
|
|
|
class GPUStackRerank(Base):
    """Reranker client that posts to ``<base_url>/v1/rerank`` with a bearer token."""

    _FACTORY_NAME = "GPUStack"

    def __init__(self, key, model_name, base_url):
        """Build the endpoint URL and request headers.

        Raises:
            ValueError: If ``base_url`` is falsy.
        """
        if not base_url:
            raise ValueError("url cannot be None")

        self.headers = {
            "accept": "application/json",
            "content-type": "application/json",
            "authorization": f"Bearer {key}",
        }
        self.base_url = str(URL(base_url) / "v1" / "rerank")
        self.model_name = model_name

    def _compute_rank(self, query: str, texts: List) -> Tuple[np.ndarray, int]:
        """Return ``(rank, token_count)`` for ``texts`` scored against ``query``.

        ``rank[i]`` is the ``relevance_score`` the response reports for index
        ``i`` (``0.0`` when absent). ``token_count`` is computed locally with
        ``num_tokens_from_string``.

        Raises:
            ValueError: Wrapping any ``requests.exceptions.RequestException``.
        """
        body = dict(
            model=self.model_name,
            query=query,
            documents=texts,
            top_n=len(texts),
        )

        try:
            reply = requests.post(self.base_url, json=body, headers=self.headers, timeout=30)
            reply.raise_for_status()
            parsed = reply.json()

            scores = np.zeros(len(texts), dtype=float)
            n_tokens = 0
            for text in texts:
                n_tokens += num_tokens_from_string(text)

            try:
                for entry in parsed.get("results", []):
                    scores[entry["index"]] = entry["relevance_score"]
            except Exception as _e:
                # Malformed entries are passed to the shared helper with the
                # raw HTTP response.
                log_exception(_e, reply)

            return (scores, n_tokens)

        except requests.exceptions.RequestException as e:
            raise ValueError(f"Error calling GPUStackRerank model {self.model_name}: {str(e)}") from e
|
2025-01-15 14:15:58 +08:00
|
|
|
|
2025-06-13 15:42:17 +08:00
|
|
|
|
|
|
|
|
class NovitaRerank(JinaRerank):
    """``JinaRerank`` subclass registered as "NovitaAI" with its own default endpoint."""

    _FACTORY_NAME = "NovitaAI"

    def __init__(self, key, model_name, base_url="https://api.novita.ai/v3/openai/rerank"):
        """Delegate to ``JinaRerank``, substituting the default URL for a falsy ``base_url``."""
        endpoint = base_url if base_url else "https://api.novita.ai/v3/openai/rerank"
        super().__init__(key, model_name, endpoint)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class GiteeRerank(JinaRerank):
    """``JinaRerank`` subclass registered as "GiteeAI" with its own default endpoint."""

    _FACTORY_NAME = "GiteeAI"

    def __init__(self, key, model_name, base_url="https://ai.gitee.com/v1/rerank"):
        """Delegate to ``JinaRerank``, substituting the default URL for a falsy ``base_url``."""
        endpoint = base_url if base_url else "https://ai.gitee.com/v1/rerank"
        super().__init__(key, model_name, endpoint)
|
2025-07-31 14:48:30 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class Ai302Rerank(Base):
    """Reranker client for the "302.AI" factory's HTTP rerank endpoint."""

    _FACTORY_NAME = "302.AI"

    def __init__(self, key, model_name, base_url="https://api.302.ai/v1/rerank"):
        """Store the model name, endpoint (default when ``base_url`` is falsy) and auth headers."""
        self.model_name = model_name
        self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
        self.base_url = base_url if base_url else "https://api.302.ai/v1/rerank"

    def _compute_rank(self, query: str, texts: List) -> Tuple[np.ndarray, int]:
        """Return ``(rank, token_count)`` for ``texts`` scored against ``query``.

        Each text is first passed through ``truncate(t, 500)``. ``rank[i]`` is
        the ``relevance_score`` reported for index ``i`` (``0.0`` when absent);
        ``token_count`` comes from ``total_token_count_from_response``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        docs = [truncate(t, 500) for t in texts]
        payload = {"model": self.model_name, "query": query, "documents": docs, "top_n": len(docs)}

        response = requests.post(self.base_url, headers=self.headers, json=payload, timeout=30)
        response.raise_for_status()
        body = response.json()

        scores = np.zeros(len(docs), dtype=float)
        try:
            for entry in body.get("results", []):
                scores[entry["index"]] = entry["relevance_score"]
        except Exception as _e:
            # Malformed entries are passed to the shared helper with the
            # decoded payload.
            log_exception(_e, body)
        return scores, total_token_count_from_response(body)
|
2025-11-17 19:47:46 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class JiekouAIRerank(JinaRerank):
    """``JinaRerank`` subclass registered as "Jiekou.AI" with its own default endpoint."""

    _FACTORY_NAME = "Jiekou.AI"

    def __init__(self, key, model_name, base_url="https://api.jiekou.ai/openai/v1/rerank"):
        """Delegate to ``JinaRerank``, substituting the default URL for a falsy ``base_url``."""
        endpoint = base_url if base_url else "https://api.jiekou.ai/openai/v1/rerank"
        super().__init__(key, model_name, endpoint)
|
2026-03-06 02:37:27 +01:00
|
|
|
|
feat: add FuturMix as model provider (#14419)
## Summary
Add [FuturMix](https://futurmix.ai) as a new model provider. FuturMix is
an OpenAI-compatible unified AI gateway that provides access to 22+
models (GPT, Claude, Gemini, DeepSeek, and more) through a single API
endpoint and key.
- **API Base**: `https://futurmix.ai/v1` (OpenAI-compatible)
- **Supported capabilities**: Chat, Embedding, Image2Text, TTS,
Speech2Text, Rerank
### Changes
| File | Change |
|------|--------|
| `rag/llm/__init__.py` | Add `FuturMix` to `SupportedLiteLLMProvider`
enum, `FACTORY_DEFAULT_BASE_URL`, and `LITELLM_PROVIDER_PREFIX` |
| `rag/llm/chat_model.py` | Add `FuturMixChat(Base)` — follows
Astraflow/Avian pattern |
| `rag/llm/embedding_model.py` | Add `FuturMixEmbed(OpenAIEmbed)` —
follows Astraflow pattern |
| `rag/llm/cv_model.py` | Add `FuturMixCV(GptV4)` — follows
SILICONFLOW/OpenRouter pattern |
| `rag/llm/tts_model.py` | Add `FuturMixTTS(OpenAITTS)` — follows
CometAPI/DeerAPI pattern |
| `rag/llm/sequence2txt_model.py` | Add `FuturMixSeq2txt(GPTSeq2txt)` —
follows StepFun pattern |
| `rag/llm/rerank_model.py` | Add `FuturMixRerank(OpenAI_APIRerank)` |
| `conf/llm_factories.json` | Add factory config with 8 chat, 2
embedding, 1 image2text, 2 TTS, 1 speech2text models |
| `docs/guides/models/supported_models.mdx` | Add FuturMix to supported
models table |
### Models included
- **Chat**: claude-sonnet-4-20250514, claude-3.5-haiku, gpt-4o,
gpt-4o-mini, gemini-2.5-flash, gemini-2.0-flash, deepseek-chat,
deepseek-reasoner
- **Embedding**: text-embedding-3-small, text-embedding-3-large
- **Image2Text**: gpt-4o
- **TTS**: tts-1, tts-1-hd
- **Speech2Text**: whisper-1
## Test plan
- [ ] Verify FuturMix appears in the model provider list in RAGFlow UI
- [ ] Configure FuturMix with API key and test chat completion
- [ ] Test embedding model with document indexing
- [ ] Test image2text with a sample image
🤖 Generated with [Claude Code](https://claude.com/claude-code)
---------
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-30 10:59:37 +08:00
|
|
|
|
|
|
|
|
class FuturMixRerank(OpenAI_APIRerank):
    """``OpenAI_APIRerank`` subclass registered as "FuturMix" with its own default endpoint."""

    _FACTORY_NAME = "FuturMix"

    def __init__(self, key, model_name, base_url="https://futurmix.ai/v1/rerank"):
        """Delegate to the parent with the default URL for a falsy ``base_url``, then log the model."""
        endpoint = base_url if base_url else "https://futurmix.ai/v1/rerank"
        super().__init__(key, model_name, endpoint)
        logging.info("[FuturMix] Rerank initialized with model %s", model_name)
|
|
|
|
|
|
|
|
|
|
|
2026-03-06 02:37:27 +01:00
|
|
|
class RAGconRerank(Base):
    """Reranker client for the "RAGcon" factory; posts to ``<base_url>/rerank``."""

    _FACTORY_NAME = "RAGcon"

    def __init__(self, key, model_name, base_url=None, **kwargs):
        """Store credentials, base URL and request headers.

        Args:
            key: API key sent as a bearer token.
            model_name: Model identifier placed in each request body.
            base_url: API root; ``https://connect.ragcon.com/v1`` when falsy.
            **kwargs: Accepted and ignored.
        """
        if not base_url:
            base_url = "https://connect.ragcon.com/v1"

        self._api_key = key
        self._base_url = base_url

        self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
        self.model_name = model_name

    def _compute_rank(self, query: str, texts: List) -> Tuple[np.ndarray, int]:
        """Return ``(rank, token_count)`` for ``texts`` scored against ``query``.

        Each text is first passed through ``truncate(t, 500)``. ``rank[i]`` is
        the ``relevance_score`` reported for index ``i`` (``0.0`` when absent);
        ``token_count`` is computed locally over the truncated texts with
        ``num_tokens_from_string``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        texts = [truncate(t, 500) for t in texts]
        data = {
            "model": self.model_name,
            "query": query,
            "documents": texts,
            "top_n": len(texts),
        }
        token_count = sum(num_tokens_from_string(t) for t in texts)

        # Strip a trailing slash so a base URL configured as ".../v1/" does
        # not produce ".../v1//rerank".
        endpoint = f"{self._base_url.rstrip('/')}/rerank"
        response = requests.post(endpoint, headers=self.headers, json=data, timeout=30)
        response.raise_for_status()
        res = response.json()

        rank = np.zeros(len(texts), dtype=float)
        try:
            for d in res.get("results", []):
                rank[d["index"]] = d["relevance_score"]
        except Exception as _e:
            # Malformed entries are passed to the shared helper with the
            # decoded payload.
            log_exception(_e, res)
        return rank, token_count
|
Feat: Add New API model provider for OpenAI-compatible gateways (#15991)
## Summary
Add support for **"New API"** as a model provider, enabling connection
to [New API](https://github.com/QuantumNous/new-api) /
[one-api](https://github.com/songquanpeng/one-api) compatible gateways
that aggregate multiple LLM backends behind a unified OpenAI-compatible
`/v1` endpoint.
### Features
- **All model types**: Chat, Embedding, Rerank, Image2Text, TTS,
Speech2Text
- **List Models discovery**: `NewAPI(OpenAIAPICompatible)` class in
`model_meta.py` queries the gateway's `/v1/models` to auto-discover
available models via the native `GET /api/v1/providers/<name>/models`
endpoint
- **Model parameter editing**: Pencil icon on each discovered model row
to edit `model_type`, `max_tokens`, and `features` (e.g. tool call
support) before submitting
- **Custom model addition**: "Add Custom Model" button at the bottom of
the List Models dropdown for models not returned by the API
- **Gear icon settings**: Enabled the Settings gear button on provider
instances to manage models on existing instances (viewMode)
- **viewMode credential passthrough**: Fixed List Models in viewMode —
merges `initialValues` credentials when `api_key`/`base_url` fields are
hidden by `hideWhenInstanceExists`
### Changes
**Backend** (8 files):
- `rag/llm/chat_model.py` — `NewAPIChat(Base)` class
- `rag/llm/embedding_model.py` — `NewAPIEmbed(OpenAIEmbed)` class (no
auto `/v1` append)
- `rag/llm/rerank_model.py` — `NewAPIRerank(Base)` class (uses `/rerank`
endpoint)
- `rag/llm/cv_model.py` — `NewAPICv(GptV4)` class
- `rag/llm/tts_model.py` — `NewAPITTS(OpenAITTS)` class
- `rag/llm/sequence2txt_model.py` — `NewAPISeq2txt(GPTSeq2txt)` class
- `rag/llm/model_meta.py` — `NewAPI(OpenAIAPICompatible)` class for List
Models discovery
- `conf/llm_factories.json` — New API factory entry with all model type
tags
**Frontend** (8 files + 1 new SVG):
- `web/src/assets/svg/llm/new-api.svg` — New API logo icon
- `web/src/constants/llm.ts` — `LLMFactory.NewAPI` enum + `IconMap`
entry
- `web/src/components/svg-icon.tsx` — `NewAPI` added to `svgIcons`
-
`web/src/pages/user-setting/setting-model/modal/provider-modal/field-config/local-llm-configs.ts`
— New API `buildLocalConfig`
-
`web/src/pages/user-setting/setting-model/modal/provider-modal/constants.ts`
— `LIST_MODEL_PROVIDERS` includes NewAPI
- `web/src/pages/user-setting/setting-model/components/used-model.tsx` —
Enable Settings gear button
-
`web/src/pages/user-setting/setting-model/modal/provider-modal/hooks/use-list-models-picker.ts`
— viewMode credential merge + model editing state/handlers
-
`web/src/pages/user-setting/setting-model/modal/provider-modal/hooks/use-list-models-options.tsx`
— Pencil edit icon per model row
-
`web/src/pages/user-setting/setting-model/modal/provider-modal/index.tsx`
— `AddCustomModelDialog` import + edit dialog rendering
**Note on Go implementation**: A Go model driver (`NewAPIModel`
delegating to `OpenAIModel`) has been prepared but is deferred until the
Go runtime is enabled in a future release (current v0.26.0 images use
`API_PROXY_SCHEME=python` and do not compile Go binaries). Will submit
as a follow-up PR.
## Related
- Depends on: #15996 (provider instance API improvements — server-side
credential lookup, idempotent `add_model`, security fixes — required for
viewMode gear icon and batch model submission)
## Test plan
- [ ] Add New API provider with api_key and base_url pointing to an
OpenAI-compatible gateway
- [ ] Click "List Models" — should discover and display available models
from `/v1/models`
- [ ] Click pencil icon on a model — should open edit dialog to change
model_type, max_tokens, features
- [ ] Select multiple models and click OK — should add all selected
models
- [ ] Click gear icon on the added instance — should open viewMode with
List Models working
- [ ] In viewMode, select new models including pre-existing ones, click
OK — should succeed (requires #15996)
- [ ] Verify all model types work: create a Chat assistant, Embedding
KB, Rerank setting
🤖 Generated with [Claude Code](https://claude.com/claude-code)
---------
Co-authored-by: Tim Wang <wanghualoong@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-26 18:47:20 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class NewAPIRerank(Base):
    """Reranker client for the "New API" factory's HTTP rerank endpoint."""

    _FACTORY_NAME = "New API"

    def __init__(self, key, model_name, base_url):
        """Resolve the rerank endpoint and prepare request headers.

        Args:
            key: API key sent as a bearer token.
            model_name: Model identifier; only the text before the first
                ``___`` is kept.
            base_url: Gateway URL. If it already contains ``/rerank`` it is
                used as given (minus trailing slashes); otherwise ``rerank``
                is appended as the final path segment.
        """
        normalized_base_url = (base_url or "").strip()
        if "/rerank" in normalized_base_url:
            self.base_url = normalized_base_url.rstrip("/")
        else:
            self.base_url = urljoin(f"{normalized_base_url.rstrip('/')}/", "rerank").rstrip("/")
        self.headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {key}",
        }
        self.model_name = model_name.split("___")[0]

    def _compute_rank(self, query: str, texts: list) -> Tuple[np.ndarray, int]:
        """Return ``(rank, token_count)`` for ``texts`` scored against ``query``.

        Each text is first passed through ``truncate(t, 500)``. ``rank[i]`` is
        the ``relevance_score`` reported for index ``i`` (``0.0`` when absent);
        ``token_count`` is computed locally over the truncated texts with
        ``num_tokens_from_string``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the server does not answer within 30 seconds.
        """
        texts = [truncate(t, 500) for t in texts]
        data = {
            "model": self.model_name,
            "query": query,
            "documents": texts,
            "top_n": len(texts),
        }
        token_count = sum(num_tokens_from_string(t) for t in texts)

        # Without a timeout an unresponsive gateway would block the caller
        # indefinitely. raise_for_status() stops an HTTP error body from being
        # parsed as if it were a result set.
        response = requests.post(self.base_url, headers=self.headers, json=data, timeout=30)
        response.raise_for_status()
        res = response.json()

        rank = np.zeros(len(texts), dtype=float)
        try:
            for d in res["results"]:
                rank[d["index"]] = d["relevance_score"]
        except Exception as _e:
            # A missing "results" key or malformed entry is passed to the
            # shared helper with the decoded payload.
            log_exception(_e, res)
        return rank, token_count
|