mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
### What problem does this PR solve?

Fixes the OpenSearch side of #10747: hybrid search drops the keyword (BM25) leg and ends up doing plain vector search.

When a search has both a text and a vector leg, `OSConnection.search()` throws the text query away:

```python
del q["query"]
q["query"] = {"knn": knn_query}
```

The text clause only stays on as a filter inside the knn query, so it narrows the candidate set but doesn't count towards scoring. So hybrid search on OpenSearch behaves like plain vector search, unlike the Elasticsearch backend.

What I changed:

- when both legs are present, send a real hybrid query `{"hybrid": {"queries": [bm25, {"knn": ...}]}}` and let a normalization-processor search pipeline score and combine the two legs
- only the actual filters (kb_id, available_int, ...) go in the knn filter, not the text must clause
- create the pipeline on startup if it's missing, so there's no separate provisioning step. name and weights can be set under `os:` in service_conf.yaml, or via `OS_HYBRID_PIPELINE`; defaults are `ragflow_hybrid_pipeline` and `[0.5, 0.5]`
- normalization-processor needs OpenSearch 2.10+. on older clusters, or when the pipeline can't be created, log a warning and fall back to vector-only instead of pointing at a pipeline that doesn't exist

This is only the hybrid-search fix; `create_doc_meta_idx` is already on main.

Testing (there's no OpenSearch path in CI): added a unit test (`test/unit_test/rag/utils/test_opensearch_hybrid_search.py`, no services needed) that checks the query built in each case — hybrid + pipeline param for text+vector, plain knn for vector-only, plain bool for text-only, the knn filter never carrying the text query_string, and the vector-only fallback when the pipeline isn't available.

Also ran it against a real OpenSearch 2.19.1 container with a doc that matches the keyword but sits outside the knn top-k: pure knn returns `['D1','D2','D5']` (keyword doc missing), the hybrid query returns `['A','D1','D2','D5']` (keyword doc present).

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Signed-off-by: Danut Matei <matei.danut.dm@gmail.com>
177 lines
5.6 KiB
Plaintext
177 lines
5.6 KiB
Plaintext
# ragflow endpoint settings. Placeholders in this file use the shell-style
# VAR:-default form; host reads RAGFLOW_HOST and falls back to 0.0.0.0.
ragflow:
  host: ${RAGFLOW_HOST:-0.0.0.0}
  http_port: 9380
# admin endpoint settings. host reads the same RAGFLOW_HOST variable as the
# ragflow section (fallback 0.0.0.0); only the port differs.
admin:
  host: ${RAGFLOW_HOST:-0.0.0.0}
  http_port: 9381
# MySQL connection settings. Database name, credentials, host and port come
# from the MYSQL_* environment variables, each with a fallback value.
mysql:
  name: '${MYSQL_DBNAME:-rag_flow}'
  user: '${MYSQL_USER:-root}'
  password: '${MYSQL_PASSWORD:-infini_rag_flow}'
  host: '${MYSQL_HOST:-mysql}'
  # Left unquoted so the expanded value is parsed as an integer.
  port: ${MYSQL_PORT:-3306}
  max_connections: 900
  stale_timeout: 300
  # Unquoted for the same reason as port. 1073741824 = 2^30
  # (1 GiB, assuming the unit is bytes - TODO confirm).
  max_allowed_packet: ${MYSQL_MAX_PACKET:-1073741824}
# MinIO object storage settings. bucket and prefix_path fall back to an empty
# string when their variables are unset.
minio:
  user: '${MINIO_USER:-rag_flow}'
  password: '${MINIO_PASSWORD:-infini_rag_flow}'
  # Host name comes from MINIO_HOST; the port 9000 is fixed in this template.
  host: '${MINIO_HOST:-minio}:9000'
  bucket: '${MINIO_BUCKET:-}'
  prefix_path: '${MINIO_PREFIX_PATH:-}'
  # optional: set to true for HTTPS (SSL/TLS). Used by MinIO client and health check.
  # secure: ${MINIO_SECURE:-false}
  # optional: set to false to allow self-signed certificates (e.g. in development).
  # verify: ${MINIO_VERIFY:-true}
# Elasticsearch connection settings. The host name comes from ES_HOST; the
# scheme (http) and port (9200) are fixed in this template.
es:
  hosts: 'http://${ES_HOST:-es01}:9200'
  username: '${ES_USER:-elastic}'
  password: '${ELASTIC_PASSWORD:-infini_rag_flow}'
# OpenSearch connection settings. The host name comes from OS_HOST; the
# scheme (http) and port (9201) are fixed in this template.
os:
  hosts: 'http://${OS_HOST:-opensearch01}:9201'
  username: '${OS_USER:-admin}'
  password: '${OPENSEARCH_PASSWORD:-infini_rag_flow_OS_01}'
  # Optional hybrid (BM25 + KNN) search tuning. The connector self-provisions the
  # normalization search pipeline on start-up (requires OpenSearch >= 2.10).
  # Per the change description, the values shown below are the defaults, the
  # pipeline name can also be supplied through OS_HYBRID_PIPELINE, and search
  # falls back to vector-only (with a warning) when the pipeline is unavailable.
  # hybrid_search_pipeline: 'ragflow_hybrid_pipeline'
  # hybrid_search_weights: [0.5, 0.5]  # [text/BM25 leg, vector/KNN leg]
# Infinity connection settings. The host name comes from INFINITY_HOST; the
# port 23817 is fixed in this template.
infinity:
  uri: '${INFINITY_HOST:-infinity}:23817'
  postgres_port: 5432
  db_name: 'default_db'
# OceanBase connection settings. The connection parameters live in the nested
# config mapping and are read from the OCEANBASE_* environment variables.
oceanbase:
  scheme: 'oceanbase'  # set 'mysql' to create connection using mysql config
  config:
    db_name: '${OCEANBASE_DOC_DBNAME:-test}'
    user: '${OCEANBASE_USER:-root@ragflow}'
    password: '${OCEANBASE_PASSWORD:-infini_rag_flow}'
    host: '${OCEANBASE_HOST:-oceanbase}'
    # Left unquoted so the expanded value is parsed as an integer.
    port: ${OCEANBASE_PORT:-2881}
# SeekDB connection settings. Same layout as the oceanbase section, with the
# connection parameters read from the SEEKDB_* environment variables.
seekdb:
  scheme: 'oceanbase'  # SeekDB is the lite version of OceanBase
  config:
    db_name: '${SEEKDB_DOC_DBNAME:-ragflow_doc}'
    user: '${SEEKDB_USER:-root}'
    password: '${SEEKDB_PASSWORD:-infini_rag_flow}'
    host: '${SEEKDB_HOST:-seekdb}'
    # Left unquoted so the expanded value is parsed as an integer.
    port: ${SEEKDB_PORT:-2881}
# Redis connection settings. username falls back to an empty string; the port
# 6379 is fixed in this template and appended to the host name.
redis:
  db: 1
  username: '${REDIS_USERNAME:-}'
  password: '${REDIS_PASSWORD:-infini_rag_flow}'
  host: '${REDIS_HOST:-redis}:6379'
# Default model settings. Only the embedding model is configured here, pointing
# at an HTTP endpoint on port 80 of the host named by TEI_HOST.
user_default_llm:
  default_models:
    embedding_model:
      # Placeholder value; replace it if the endpoint requires a real key.
      api_key: 'xxx'
      # NOTE(review): TEI_HOST has no :-default fallback, unlike the other host
      # entries in this file, so it presumably must be set in the environment;
      # otherwise the URL would expand without a host name - confirm.
      base_url: 'http://${TEI_HOST}:80'
# postgres:
#   name: '${POSTGRES_DBNAME:-rag_flow}'
#   user: '${POSTGRES_USER:-rag_flow}'
#   password: '${POSTGRES_PASSWORD:-infini_rag_flow}'
#   host: '${POSTGRES_HOST:-postgres}'
#   port: 5432
#   max_connections: 100
#   stale_timeout: 30
# s3:
#   access_key: 'access_key'
#   secret_key: 'secret_key'
#   region: 'region'
#   endpoint_url: 'endpoint_url'
#   bucket: 'bucket'
#   prefix_path: 'prefix_path'
#   signature_version: 'v4'
#   addressing_style: 'path'
# oss:
#   access_key: '${ACCESS_KEY}'
#   secret_key: '${SECRET_KEY}'
#   endpoint_url: '${ENDPOINT}'
#   region: '${REGION}'
#   bucket: '${BUCKET}'
#   prefix_path: '${OSS_PREFIX_PATH}'
#   signature_version: 's3'
#   addressing_style: 'virtual'
# Two alternative azure setups follow (SAS token or service principal).
# Uncomment only one of them: a mapping must not contain the azure key twice.
# azure:
#   auth_type: 'sas'
#   container_url: 'container_url'
#   sas_token: 'sas_token'
# azure:
#   auth_type: 'spn'
#   account_url: 'account_url'
#   client_id: 'client_id'
#   secret: 'secret'
#   tenant_id: 'tenant_id'
#   container_name: 'container_name'
#   cloud: 'public'  # Azure cloud: 'public', 'china', 'government', or 'germany'
# By default, the OSS object storage uses the MySQL configuration above.
# To switch to another object storage service, uncomment and configure the
# following parameters.
# opendal:
#   scheme: 'mysql'  # Storage type, such as s3, oss, azure, etc.
#   config:
#     oss_table: 'opendal_storage'
# Fuller user_default_llm example. If you enable it, merge it with the active
# user_default_llm section instead of declaring the key a second time.
# user_default_llm:
#   factory: 'BAAI'
#   api_key: 'backup'
#   base_url: 'backup_base_url'
#   default_models:
#     chat_model:
#       name: 'qwen2.5-7b-instruct'
#       factory: 'xxxx'
#       api_key: 'xxxx'
#       base_url: 'https://api.xx.com'
#     embedding_model:
#       name: 'bge-m3'
#     rerank_model: 'bge-reranker-v2'
#     asr_model:
#       model: 'whisper-large-v3'  # alias of name
#     image2text_model: ''
# oauth:
#   oauth2:
#     display_name: "OAuth2"
#     client_id: "your_client_id"
#     client_secret: "your_client_secret"
#     authorization_url: "https://your-oauth-provider.com/oauth/authorize"
#     token_url: "https://your-oauth-provider.com/oauth/token"
#     userinfo_url: "https://your-oauth-provider.com/oauth/userinfo"
#     redirect_uri: "https://your-app.com/v1/user/oauth/callback/oauth2"
#   oidc:
#     display_name: "OIDC"
#     client_id: "your_client_id"
#     client_secret: "your_client_secret"
#     issuer: "https://your-oauth-provider.com/oidc"
#     scope: "openid email profile"
#     redirect_uri: "https://your-app.com/v1/user/oauth/callback/oidc"
#   github:
#     type: "github"
#     icon: "github"
#     display_name: "Github"
#     client_id: "your_client_id"
#     client_secret: "your_client_secret"
#     redirect_uri: "https://your-app.com/v1/user/oauth/callback/github"
# authentication:
#   client:
#     switch: false
#     http_app_key:
#     http_secret_key:
#   site:
#     switch: false
# disable_password_login: false
|
|
# permission:
#   switch: false
#   component: false
#   dataset: false
# smtp:
#   mail_server: ""
#   mail_port: 465
#   mail_use_ssl: true
#   mail_use_tls: false
#   mail_username: ""
#   mail_password: ""
#   mail_default_sender:
#     - "RAGFlow"  # display name
#     - ""  # sender email address
#   mail_frontend_url: "https://your-frontend.example.com"
# tcadp_config:
#   secret_id: '${TENCENT_SECRET_ID}'
#   secret_key: '${TENCENT_SECRET_KEY}'
#   region: '${TENCENT_REGION}'
|