mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
fix(opensearch): keep the BM25 leg in hybrid search (#15760)
### What problem does this PR solve?

Fixes the OpenSearch side of #10747: hybrid search drops the keyword (BM25) leg and ends up doing plain vector search.

When a search has both a text and a vector leg, `OSConnection.search()` throws the text query away:

```python
del q["query"]
q["query"] = {"knn": knn_query}
```

The text clause only stays on as a filter inside the knn query, so it narrows the candidate set but doesn't count towards scoring. As a result, hybrid search on OpenSearch behaves like plain vector search, unlike the Elasticsearch backend.

What I changed:

- When both legs are present, send a real hybrid query `{"hybrid": {"queries": [bm25, {"knn": ...}]}}` and let a normalization-processor search pipeline score and combine the two legs.
- Only the actual filters (kb_id, available_int, ...) go in the knn filter, not the text must clause.
- Create the pipeline on startup if it's missing, so there's no separate provisioning step. The pipeline name and weights can be set under `os:` in service_conf.yaml; the name can also be overridden via the `OS_HYBRID_PIPELINE` environment variable. Defaults are `ragflow_hybrid_pipeline` and `[0.5, 0.5]`.
- normalization-processor needs OpenSearch 2.10+. On older clusters, or when the pipeline can't be created, log a warning and fall back to vector-only instead of pointing at a pipeline that doesn't exist.

This is only the hybrid-search fix; `create_doc_meta_idx` is already on main.

Testing (there's no OpenSearch path in CI): added a unit test (`test/unit_test/rag/utils/test_opensearch_hybrid_search.py`, no services needed) that checks the query built in each case — hybrid + pipeline param for text+vector, plain knn for vector-only, plain bool for text-only, the knn filter never carrying the text query_string, and the vector-only fallback when the pipeline isn't available.

Also ran it against a real OpenSearch 2.19.1 container with a doc that matches the keyword but sits outside the knn top-k: pure knn returns `['D1','D2','D5']` (keyword doc missing), the hybrid query returns `['A','D1','D2','D5']` (keyword doc present).

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Signed-off-by: Danut Matei <matei.danut.dm@gmail.com>
This commit is contained in:
@@ -99,6 +99,62 @@ class OSConnection(DocStoreConnection):
|
||||
with open(fp_mapping, "r") as f:
|
||||
self.mapping = json.load(f)
|
||||
logger.info(f"OpenSearch {settings.OS['hosts']} is healthy.")
|
||||
self._init_hybrid_search()
|
||||
|
||||
# Minimum OpenSearch (major, minor) version for which hybrid search is enabled.
# The normalization-processor (needed to merge the BM25 and KNN scores) only
# exists on OpenSearch 2.10+; _init_hybrid_search compares the (major, minor)
# tuple parsed from the cluster's reported version against this value.
HYBRID_MIN_VERSION = (2, 10)
|
||||
|
||||
def _init_hybrid_search(self):
    """Create the hybrid-search pipeline if it isn't there yet.

    A {"hybrid": {...}} query is scored by a normalization-processor that has
    to live on a search pipeline, otherwise OpenSearch rejects the query. We
    create it once at startup (PUT _search/pipeline is idempotent) so there's
    no extra setup step to run.

    Configuration:
      * pipeline name: the OS_HYBRID_PIPELINE environment variable, then
        settings.OS["hybrid_search_pipeline"], then "ragflow_hybrid_pipeline".
      * weights: settings.OS["hybrid_search_weights"], default [0.5, 0.5].
        They must be exactly two numbers (one per query leg: BM25, KNN);
        anything else is replaced by the default with a warning.

    Sets self.hybrid_search_enabled and self._hybrid_pipeline. If the
    pipeline can't be created (OpenSearch < 2.10, an unreadable version, or
    no permission to manage pipelines) we log a warning, leave it off, and
    search() keeps doing vector-only. This method never raises for those
    cases, so a hybrid-search problem cannot block startup.
    """
    self.hybrid_search_enabled = False
    self._hybrid_pipeline = (
        os.environ.get("OS_HYBRID_PIPELINE")
        or settings.OS.get("hybrid_search_pipeline")
        or "ragflow_hybrid_pipeline"
    )

    # Read *and* parse the version under one guard: a malformed info payload
    # (e.g. "version": null) must end in the vector-only fallback, not in an
    # AttributeError during startup.
    version_number = ""
    try:
        version_number = self.info.get("version", {}).get("number", "")
        version = tuple(int(p) for p in version_number.split(".")[:2])
    except (ValueError, AttributeError):
        version = (0, 0)
    if version < self.HYBRID_MIN_VERSION:
        logger.warning(f"OpenSearch {version_number or 'unknown'} does not support the "
                       f"normalization-processor (requires >= {self.HYBRID_MIN_VERSION[0]}."
                       f"{self.HYBRID_MIN_VERSION[1]}); hybrid search is disabled and "
                       f"queries fall back to vector-only.")
        return

    # The hybrid query built by search() always has exactly two legs, so the
    # pipeline needs exactly two weights. A mismatched list is not something
    # we want to discover only when the first hybrid query fails, so it is
    # validated here instead of being sent to the cluster as-is.
    default_weights = [0.5, 0.5]
    weights = settings.OS.get("hybrid_search_weights")
    if weights is None:
        # Key missing, or present but empty in the YAML.
        weights = default_weights
    elif (isinstance(weights, (list, tuple))
          and len(weights) == 2
          and all(isinstance(w, (int, float)) and not isinstance(w, bool) for w in weights)):
        weights = list(weights)
    else:
        logger.warning(f"Invalid hybrid_search_weights {weights!r}: expected two numbers "
                       f"(BM25 weight, KNN weight); using {default_weights} instead.")
        weights = default_weights

    pipeline_body = {
        "description": "RAGFlow hybrid search normalization pipeline (BM25 + KNN).",
        "phase_results_processors": [
            {"normalization-processor": {
                "normalization": {"technique": "min_max"},
                "combination": {"technique": "arithmetic_mean",
                                "parameters": {"weights": weights}}}}
        ],
    }
    try:
        self.os.transport.perform_request(
            "PUT", f"/_search/pipeline/{self._hybrid_pipeline}", body=pipeline_body)
    except Exception:
        # Deliberately broad: whatever goes wrong here (old plugin set,
        # missing privilege, transport error) the right reaction is the same
        # best-effort fallback, with the traceback kept in the log.
        logger.warning(f"Could not create OpenSearch search pipeline '{self._hybrid_pipeline}'; "
                       f"hybrid search is disabled and queries fall back to vector-only. "
                       f"Creating a search pipeline needs the "
                       f"'cluster:admin/search/pipeline/put' privilege (relevant on "
                       f"locked-down or managed OpenSearch).", exc_info=True)
    else:
        self.hybrid_search_enabled = True
        logger.info(f"OpenSearch hybrid search enabled via pipeline "
                    f"'{self._hybrid_pipeline}' (weights {weights}).")
|
||||
|
||||
"""
|
||||
Database operations
|
||||
@@ -276,6 +332,7 @@ class OSConnection(DocStoreConnection):
|
||||
Refers to https://github.com/opensearch-project/opensearch-py/blob/main/guides/dsl.md
|
||||
"""
|
||||
use_knn = False
|
||||
use_text = False
|
||||
if isinstance(index_names, str):
|
||||
index_names = index_names.split(",")
|
||||
assert isinstance(index_names, list) and len(index_names) > 0
|
||||
@@ -313,6 +370,7 @@ class OSConnection(DocStoreConnection):
|
||||
knn_query = {}
|
||||
for m in match_expressions:
|
||||
if isinstance(m, MatchTextExpr):
|
||||
use_text = True
|
||||
minimum_should_match = m.extra_options.get("minimum_should_match", 0.0)
|
||||
if isinstance(minimum_should_match, float):
|
||||
minimum_should_match = str(int(minimum_should_match * 100)) + "%"
|
||||
@@ -336,7 +394,13 @@ class OSConnection(DocStoreConnection):
|
||||
knn_query[vector_column_name] = {}
|
||||
knn_query[vector_column_name]["vector"] = list(m.embedding_data)
|
||||
knn_query[vector_column_name]["k"] = m.topn
|
||||
knn_query[vector_column_name]["filter"] = bqry.to_dict()
|
||||
# The knn filter holds only the structural filters (kb_id,
|
||||
# available_int, ...). The text query is deliberately kept out of it:
|
||||
# it's scored as its own leg in the hybrid query below, not used to
|
||||
# pre-filter knn candidates.
|
||||
bool_inner = bqry.to_dict().get("bool", {})
|
||||
if bool_inner.get("filter"):
|
||||
knn_query[vector_column_name]["filter"] = {"bool": {"filter": bool_inner["filter"]}}
|
||||
knn_query[vector_column_name]["boost"] = similarity
|
||||
|
||||
if bqry and rank_feature:
|
||||
@@ -372,9 +436,22 @@ class OSConnection(DocStoreConnection):
|
||||
q = s.to_dict()
|
||||
logger.debug(f"OSConnection.search {str(index_names)} query: " + json.dumps(q))
|
||||
|
||||
hybrid_search = use_knn and use_text and getattr(self, "hybrid_search_enabled", False)
|
||||
if use_knn:
|
||||
del q["query"]
|
||||
q["query"] = {"knn": knn_query}
|
||||
if hybrid_search:
|
||||
# both legs + a pipeline available: send a real hybrid query so the
|
||||
# keyword (BM25) and vector (knn) legs are scored separately and
|
||||
# merged by the pipeline.
|
||||
keyword_query = q.get("query")
|
||||
q["query"] = {"hybrid": {"queries": [keyword_query, {"knn": knn_query}]}}
|
||||
else:
|
||||
# vector-only, or no pipeline available: fall back to a plain knn query.
|
||||
del q["query"]
|
||||
q["query"] = {"knn": knn_query}
|
||||
|
||||
search_kwargs = {}
|
||||
if hybrid_search:
|
||||
search_kwargs["params"] = {"search_pipeline": self._hybrid_pipeline}
|
||||
|
||||
for i in range(ATTEMPT_TIME):
|
||||
try:
|
||||
@@ -383,7 +460,8 @@ class OSConnection(DocStoreConnection):
|
||||
timeout=600,
|
||||
# search_type="dfs_query_then_fetch",
|
||||
track_total_hits=True,
|
||||
_source=True)
|
||||
_source=True,
|
||||
**search_kwargs)
|
||||
if str(res.get("timed_out", "")).lower() == "true":
|
||||
raise Exception("OpenSearch Timeout.")
|
||||
logger.debug(f"OSConnection.search {str(index_names)} res: " + str(res))
|
||||
|
||||
Reference in New Issue
Block a user