fix(opensearch): keep the BM25 leg in hybrid search (#15760)

### What problem does this PR solve?

Fixes the OpenSearch side of #10747: hybrid search drops the keyword
(BM25) leg and
ends up doing plain vector search.

When a search has both a text and a vector leg, `OSConnection.search()`
throws the text
query away:

    del q["query"]
    q["query"] = {"knn": knn_query}

The text clause only survives as a filter inside the knn query, so it
narrows the candidate set but does not contribute to scoring. As a result,
hybrid search on OpenSearch behaves like plain vector search, unlike the
Elasticsearch backend.

What I changed:

- when both legs are present, send a real hybrid query
`{"hybrid": {"queries": [bm25, {"knn": ...}]}}` and let a
normalization-processor
  search pipeline score and combine the two legs
- only the actual filters (kb_id, available_int, ...) go in the knn
filter, not the
  text must clause
- create the pipeline on startup if it's missing, so there's no separate
  provisioning step. The pipeline name can be set via the
  `OS_HYBRID_PIPELINE` environment variable or `hybrid_search_pipeline`
  under `os:` in service_conf.yaml; the weights are set via
  `hybrid_search_weights` under `os:`. Defaults are
  `ragflow_hybrid_pipeline` and `[0.5, 0.5]`.
- normalization-processor needs OpenSearch 2.10+. On older clusters, or
  when the pipeline can't be created, log a warning and fall back to
  vector-only instead of pointing at a pipeline that doesn't exist.

This is only the hybrid-search fix; `create_doc_meta_idx` is already on
main.

Testing (there's no OpenSearch path in CI): added a unit test
(`test/unit_test/rag/utils/test_opensearch_hybrid_search.py`, no
services needed) that
checks the query built in each case — hybrid + pipeline param for
text+vector, plain knn
for vector-only, plain bool for text-only, the knn filter never carrying
the text
query_string, and the vector-only fallback when the pipeline isn't
available. Also ran
it against a real OpenSearch 2.19.1 container with a doc that matches
the keyword but
sits outside the knn top-k: pure knn returns `['D1','D2','D5']` (keyword
doc missing),
the hybrid query returns `['A','D1','D2','D5']` (keyword doc present).

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Signed-off-by: Danut Matei <matei.danut.dm@gmail.com>
This commit is contained in:
Danut Matei
2026-06-08 11:17:47 +03:00
committed by GitHub
parent 8f4809d1b5
commit e2b0da9eea
4 changed files with 309 additions and 4 deletions

View File

@@ -99,6 +99,62 @@ class OSConnection(DocStoreConnection):
with open(fp_mapping, "r") as f:
self.mapping = json.load(f)
logger.info(f"OpenSearch {settings.OS['hosts']} is healthy.")
self._init_hybrid_search()
# Minimum OpenSearch (major, minor) version for hybrid search. The
# normalization-processor (needed to merge the BM25 and KNN scores) only
# exists on OpenSearch 2.10+; _init_hybrid_search() compares the parsed
# server version against this tuple and disables hybrid search below it.
HYBRID_MIN_VERSION = (2, 10)
def _init_hybrid_search(self):
    """Set up the search pipeline that hybrid (BM25 + knn) queries need.

    A {"hybrid": {...}} query is scored by a normalization-processor that has
    to live on a search pipeline, otherwise OpenSearch rejects the query. The
    pipeline is written once at startup with PUT _search/pipeline so there's
    no extra setup step to run.

    Attributes set:
        self._hybrid_pipeline: pipeline name, resolved from the
            OS_HYBRID_PIPELINE environment variable, then
            settings.OS["hybrid_search_pipeline"], then the default
            "ragflow_hybrid_pipeline".
        self.hybrid_search_enabled: True when the pipeline was written, or
            when writing failed but a pipeline of that name carrying a
            normalization-processor already exists. False otherwise
            (OpenSearch < 2.10, or the pipeline is neither creatable nor
            present); search() then keeps doing vector-only.

    Pipeline request failures are logged as warnings rather than raised.
    """
    self.hybrid_search_enabled = False
    self._hybrid_pipeline = os.environ.get("OS_HYBRID_PIPELINE") \
        or settings.OS.get("hybrid_search_pipeline") or "ragflow_hybrid_pipeline"

    # Only major.minor matters; anything unparsable is treated as "too old".
    version_number = self.info.get("version", {}).get("number", "")
    try:
        version = tuple(int(p) for p in version_number.split(".")[:2])
    except (ValueError, AttributeError):
        version = (0, 0)
    if version < self.HYBRID_MIN_VERSION:
        logger.warning(f"OpenSearch {version_number or 'unknown'} does not support the "
                       f"normalization-processor (requires >= {self.HYBRID_MIN_VERSION[0]}."
                       f"{self.HYBRID_MIN_VERSION[1]}); hybrid search is disabled and "
                       f"queries fall back to vector-only.")
        return

    # The hybrid query has exactly two legs (BM25, knn) and the
    # normalization-processor expects one weight per leg. A missing, null or
    # wrongly-shaped setting is replaced by the default instead of being sent
    # to the cluster as-is.
    default_weights = [0.5, 0.5]
    configured_weights = settings.OS.get("hybrid_search_weights")
    weights = default_weights if configured_weights is None else configured_weights
    weights_ok = (
        isinstance(weights, (list, tuple))
        and len(weights) == 2
        and all(isinstance(w, (int, float)) and not isinstance(w, bool) for w in weights)
    )
    if not weights_ok:
        logger.warning(f"Invalid 'hybrid_search_weights' {configured_weights!r}: expected two "
                       f"numbers (BM25 weight, KNN weight); using {default_weights} instead.")
        weights = default_weights
    weights = list(weights)

    pipeline_path = f"/_search/pipeline/{self._hybrid_pipeline}"
    pipeline_body = {
        "description": "RAGFlow hybrid search normalization pipeline (BM25 + KNN).",
        "phase_results_processors": [
            {"normalization-processor": {
                "normalization": {"technique": "min_max"},
                "combination": {"technique": "arithmetic_mean",
                                "parameters": {"weights": weights}}}}
        ],
    }
    try:
        self.os.transport.perform_request("PUT", pipeline_path, body=pipeline_body)
    except Exception as put_error:
        # Writing a pipeline needs a cluster-level privilege that locked-down
        # or managed clusters often withhold. If an administrator has already
        # provisioned a normalization pipeline under this name, it can still
        # be used for searching, so look for it before giving up.
        try:
            existing = self.os.transport.perform_request("GET", pipeline_path)
        except Exception:
            existing = None
        definition = existing.get(self._hybrid_pipeline) if isinstance(existing, dict) else None
        processors = definition.get("phase_results_processors") if isinstance(definition, dict) else None
        has_normalization = any(
            isinstance(p, dict) and "normalization-processor" in p
            for p in (processors or [])
        )
        if has_normalization:
            self.hybrid_search_enabled = True
            logger.warning(f"Could not create or update OpenSearch search pipeline "
                           f"'{self._hybrid_pipeline}'; using the existing pipeline as-is "
                           f"(its weights may differ from the configured {weights}).",
                           exc_info=put_error)
            return
        logger.warning(f"Could not create OpenSearch search pipeline '{self._hybrid_pipeline}'; "
                       f"hybrid search is disabled and queries fall back to vector-only. "
                       f"Creating a search pipeline needs the "
                       f"'cluster:admin/search/pipeline/put' privilege (relevant on "
                       f"locked-down or managed OpenSearch).", exc_info=put_error)
        return

    self.hybrid_search_enabled = True
    logger.info(f"OpenSearch hybrid search enabled via pipeline "
                f"'{self._hybrid_pipeline}' (weights {weights}).")
"""
Database operations
@@ -276,6 +332,7 @@ class OSConnection(DocStoreConnection):
Refers to https://github.com/opensearch-project/opensearch-py/blob/main/guides/dsl.md
"""
use_knn = False
use_text = False
if isinstance(index_names, str):
index_names = index_names.split(",")
assert isinstance(index_names, list) and len(index_names) > 0
@@ -313,6 +370,7 @@ class OSConnection(DocStoreConnection):
knn_query = {}
for m in match_expressions:
if isinstance(m, MatchTextExpr):
use_text = True
minimum_should_match = m.extra_options.get("minimum_should_match", 0.0)
if isinstance(minimum_should_match, float):
minimum_should_match = str(int(minimum_should_match * 100)) + "%"
@@ -336,7 +394,13 @@ class OSConnection(DocStoreConnection):
knn_query[vector_column_name] = {}
knn_query[vector_column_name]["vector"] = list(m.embedding_data)
knn_query[vector_column_name]["k"] = m.topn
knn_query[vector_column_name]["filter"] = bqry.to_dict()
# The knn filter holds only the structural filters (kb_id,
# available_int, ...). The text query is deliberately kept out of it:
# it's scored as its own leg in the hybrid query below, not used to
# pre-filter knn candidates.
bool_inner = bqry.to_dict().get("bool", {})
if bool_inner.get("filter"):
knn_query[vector_column_name]["filter"] = {"bool": {"filter": bool_inner["filter"]}}
knn_query[vector_column_name]["boost"] = similarity
if bqry and rank_feature:
@@ -372,9 +436,22 @@ class OSConnection(DocStoreConnection):
q = s.to_dict()
logger.debug(f"OSConnection.search {str(index_names)} query: " + json.dumps(q))
hybrid_search = use_knn and use_text and getattr(self, "hybrid_search_enabled", False)
if use_knn:
del q["query"]
q["query"] = {"knn": knn_query}
if hybrid_search:
# both legs + a pipeline available: send a real hybrid query so the
# keyword (BM25) and vector (knn) legs are scored separately and
# merged by the pipeline.
keyword_query = q.get("query")
q["query"] = {"hybrid": {"queries": [keyword_query, {"knn": knn_query}]}}
else:
# vector-only, or no pipeline available: fall back to a plain knn query.
del q["query"]
q["query"] = {"knn": knn_query}
search_kwargs = {}
if hybrid_search:
search_kwargs["params"] = {"search_pipeline": self._hybrid_pipeline}
for i in range(ATTEMPT_TIME):
try:
@@ -383,7 +460,8 @@ class OSConnection(DocStoreConnection):
timeout=600,
# search_type="dfs_query_then_fetch",
track_total_hits=True,
_source=True)
_source=True,
**search_kwargs)
if str(res.get("timed_out", "")).lower() == "true":
raise Exception("OpenSearch Timeout.")
logger.debug(f"OSConnection.search {str(index_names)} res: " + str(res))