Push metadata filters down to Infinity (#14974)

### What problem does this PR solve?

Push metadata filters down to Infinity

### Type of change

- [x] Refactoring
This commit is contained in:
qinling0210
2026-05-18 14:22:04 +08:00
committed by GitHub
parent 7cdc74bbe5
commit f1d2383572
7 changed files with 1148 additions and 572 deletions

View File

@@ -1344,6 +1344,7 @@ async def search_datasets(tenant_id: str, req: dict):
chat_mdl = LLMBundle(tenant_id, chat_model_config)
if meta_data_filter:
logging.debug(f"Metadata filter: {meta_data_filter}, question: {question}, chat_mdl={'None' if chat_mdl is None else chat_mdl.llm_name}")
local_doc_ids = await apply_meta_data_filter(
meta_data_filter,
None,

View File

@@ -448,7 +448,8 @@ class DocMetadataService:
# Post-process to split combined values
processed_meta = cls._split_combined_values(meta_fields)
logging.debug(f"[update_document_metadata] Updating doc_id: {doc_id}, kb_id: {kb_id}, meta_fields: {processed_meta}")
logging.debug(
f"[update_document_metadata] Updating doc_id: {doc_id}, kb_id: {kb_id}, meta_fields: {processed_meta}")
# For Elasticsearch, use efficient partial update
if not settings.DOC_ENGINE_INFINITY and not settings.DOC_ENGINE_OCEANBASE:
@@ -456,7 +457,8 @@ class DocMetadataService:
index_exists = settings.docStoreConn.index_exist(index_name, "")
if not index_exists:
# Index doesn't exist - create it and insert directly
logging.debug(f"[update_document_metadata] Index {index_name} does not exist, creating and inserting")
logging.debug(
f"[update_document_metadata] Index {index_name} does not exist, creating and inserting")
result = settings.docStoreConn.create_doc_meta_idx(index_name)
if result is False:
logging.error(f"Failed to create metadata index {index_name}")
@@ -477,7 +479,8 @@ class DocMetadataService:
# to a backend-provided scripted assignment that fully overwrites it.
replace_meta_fields = getattr(settings.docStoreConn, "replace_meta_fields", None)
if callable(replace_meta_fields) and replace_meta_fields(index_name, doc_id, processed_meta):
logging.debug(f"Successfully updated metadata for document {doc_id} via {type(settings.docStoreConn).__name__}.replace_meta_fields")
logging.debug(
f"Successfully updated metadata for document {doc_id} via {type(settings.docStoreConn).__name__}.replace_meta_fields")
return True
logging.warning(
f"replace_meta_fields unavailable or failed on backend "
@@ -537,7 +540,8 @@ class DocMetadataService:
# Check if metadata table exists before attempting deletion
# This is the key optimization - no table = no metadata = nothing to delete
if not settings.docStoreConn.index_exist(index_name, ""):
logging.debug(f"Metadata table {index_name} does not exist, skipping metadata deletion for document {doc_id}")
logging.debug(
f"Metadata table {index_name} does not exist, skipping metadata deletion for document {doc_id}")
return True # No metadata to delete is considered success
# Try to get the metadata to confirm it exists before deleting
@@ -627,7 +631,8 @@ class DocMetadataService:
if isinstance(results, tuple) and len(results) == 2:
# Infinity returns (DataFrame, int)
df, total = results
logging.debug(f"[DROP EMPTY TABLE] Infinity format - total: {total}, df length: {len(df) if hasattr(df, '__len__') else 'N/A'}")
logging.debug(
f"[DROP EMPTY TABLE] Infinity format - total: {total}, df length: {len(df) if hasattr(df, '__len__') else 'N/A'}")
is_empty = (total == 0 or (hasattr(df, '__len__') and len(df) == 0))
elif hasattr(results, 'get') and 'hits' in results:
# ES format - MUST check this before hasattr(results, '__len__')
@@ -797,20 +802,56 @@ class DocMetadataService:
logic: str = "and",
limit: int = 10000,
) -> Optional[List[str]]:
"""Run a metadata filter directly against ES, returning matching doc IDs.
"""Run a metadata filter directly against ES or Infinity, returning matching doc IDs.
Returns ``None`` to signal "push-down not viable, use the in-memory
``meta_filter`` fallback". Reasons for ``None``:
- Active doc store is not Elasticsearch (Infinity / OceanBase have
different filter semantics for the JSON ``meta_fields`` column).
- One of the user filters cannot be expressed in ES DSL.
- The ES request itself failed (network, mapping, missing index).
- kb_ids or filters is empty
- One of the user filters cannot be expressed in ES DSL or Infinity SQL
- The request itself failed (network, mapping, missing index)
On success returns the deduplicated, ordered list of document IDs the
ES query matched. Callers can union or intersect this with their own
query matched. Callers can union or intersect this with their own
base ``doc_ids`` rather than fetching the entire metadata table.
"""
if not kb_ids or not filters:
logging.debug("Metadata filter skipped: empty kb_ids or filters")
return None
try:
kb = Knowledgebase.get_by_id(kb_ids[0])
except Exception as e:
logging.warning(f"Metadata filter cannot resolve tenant for kb {kb_ids[0]}: {e}")
return None
if not kb:
return None
tenant_id = kb.tenant_id
index_name = cls._get_doc_meta_index_name(tenant_id)
if not settings.docStoreConn.index_exist(index_name, ""):
return []
if settings.DOC_ENGINE_INFINITY:
return cls._filter_doc_ids_by_metadata_infinity(
index_name, kb_ids, filters, logic
)
else:
return cls._filter_doc_ids_by_metadata_es(
index_name, kb_ids, filters, logic, limit
)
@classmethod
def _filter_doc_ids_by_metadata_es(
cls,
index_name: str,
kb_ids: List[str],
filters: List[Dict],
logic: str,
limit: int,
) -> Optional[List[str]]:
"""ES push-down path for metadata filtering."""
from common.metadata_es_filter import (
UnsupportedMetaFilter,
build_meta_filter_query,
@@ -818,14 +859,6 @@ class DocMetadataService:
is_pushdown_supported,
)
if not kb_ids:
return []
if settings.DOC_ENGINE_INFINITY:
# Infinity stores ``meta_fields`` as a JSON column without dotted
# field access; the in-memory path is still the reliable answer.
return None
es_client = getattr(settings.docStoreConn, "es", None)
if es_client is None:
return None
@@ -833,35 +866,12 @@ class DocMetadataService:
if not is_pushdown_supported(filters):
return None
try:
kb = Knowledgebase.get_by_id(kb_ids[0])
except Exception as e:
logging.warning(f"[meta_pushdown] cannot resolve tenant for kb {kb_ids[0]}: {e}")
return None
if not kb:
return None
tenant_id = kb.tenant_id
index_name = cls._get_doc_meta_index_name(tenant_id)
try:
if not settings.docStoreConn.index_exist(index_name, ""):
# No metadata index → no metadata-filtered docs. Returning an
# empty list (rather than ``None``) so callers don't bounce
# back to the in-memory path and re-query MySQL for nothing.
return []
except Exception as e:
logging.warning(f"[meta_pushdown] index_exist check failed for {index_name}: {e}")
return None
try:
query_body = build_meta_filter_query(filters, logic, kb_ids)
except UnsupportedMetaFilter as e:
logging.debug(f"[meta_pushdown] falling back to in-memory: {e.reason}")
logging.error(f"ES build query failed: {e.reason}, filters={filters}")
return None
# Only the doc id is needed downstream; trimming ``_source`` keeps the
# response small when the metadata blob is large.
request_body = {
**query_body,
"size": limit,
@@ -871,12 +881,10 @@ class DocMetadataService:
try:
response = es_client.search(index=index_name, body=request_body)
except Exception as e:
logging.warning(f"[meta_pushdown] ES query failed for {index_name}: {e}")
logging.error(f"ES metadata filter failed for {index_name}: {e}")
return None
doc_ids = extract_doc_ids(response if isinstance(response, dict) else dict(response))
# Preserve order while removing duplicates so caller-side de-dupe stays
# cheap.
seen: set[str] = set()
unique: List[str] = []
for did in doc_ids:
@@ -887,12 +895,52 @@ class DocMetadataService:
if len(unique) >= limit:
logging.warning(
f"[meta_pushdown] hit limit {limit} for KBs {kb_ids}; some matches may be missing"
f"ES metadata filter hit limit {limit} for KBs {kb_ids}"
)
logging.debug(f"[meta_pushdown] {len(unique)} matches for KBs {kb_ids}")
logging.debug(f"ES metadata filter returned {len(unique)} matches for KBs {kb_ids}")
return unique
@classmethod
def _filter_doc_ids_by_metadata_infinity(
        cls,
        index_name: str,
        kb_ids: List[str],
        filters: List[Dict],
        logic: str,
) -> Optional[List[str]]:
    """Run the metadata filter inside Infinity and return the matching doc IDs.

    Args:
        index_name: Name of the metadata table to query.
        kb_ids: Knowledge-base IDs used to build the ``kb_id IN (...)`` clause.
        filters: User filter clauses handed to ``build_infinity_filter``.
        logic: Combination logic handed to ``build_infinity_filter``.

    Returns:
        The doc IDs extracted from the query result, or ``None`` when the
        filters are not supported for push-down or any step raised.
    """
    from common.metadata_infinity_filter import (
        build_infinity_filter,
        extract_doc_ids,
        is_pushdown_supported,
    )

    if not is_pushdown_supported(filters):
        return None

    try:
        condition = build_infinity_filter(filters, logic)
        # Single quotes are doubled so each kb_id stays inside its own SQL literal.
        quoted_kbs = ", ".join("'{}'".format(kb_id.replace("'", "''")) for kb_id in kb_ids)
        where_clause = f"kb_id IN ({quoted_kbs}) AND {condition}"
        logging.debug(f"Infinity metadata filter: {where_clause}")

        pool = settings.docStoreConn.connPool
        conn = pool.get_conn()
        try:
            table = conn.get_database(settings.docStoreConn.dbName).get_table(index_name)
            frame, _ = table.output(["id"]).filter(where_clause).to_df()
            matched = extract_doc_ids(frame)
            logging.debug(
                f"Infinity metadata filter returned {len(matched)} doc IDs for kb_ids={kb_ids}, logic={logic}")
            return matched
        finally:
            # The connection goes back to the pool whether or not the query succeeded.
            pool.release_conn(conn)
    except Exception:
        # Best effort: any failure is logged and reported as None to the caller.
        logging.warning("Metadata filter push-down failed; falling back to in-memory filter", exc_info=True)
        return None
@classmethod
def get_metadata_keys_by_kbs(cls, kb_ids: List[str]) -> List[str]:
"""
@@ -955,7 +1003,8 @@ class DocMetadataService:
if doc_meta:
meta_mapping[doc_id] = doc_meta
logging.debug(f"[get_metadata_for_documents] Found metadata for {len(meta_mapping)}/{len(doc_ids) if doc_ids else 'all'} documents")
logging.debug(
f"[get_metadata_for_documents] Found metadata for {len(meta_mapping)}/{len(doc_ids) if doc_ids else 'all'} documents")
return meta_mapping
except Exception as e:
@@ -981,6 +1030,7 @@ class DocMetadataService:
}
}
"""
def _is_time_string(value: str) -> bool:
"""Check if a string value is an ISO 8601 datetime (e.g., '2026-02-03T00:00:00')."""
if not isinstance(value, str):
@@ -1220,7 +1270,8 @@ class DocMetadataService:
doc_ids_set = set(doc_ids)
missing_doc_ids = doc_ids_set - found_doc_ids
if missing_doc_ids and updates:
logging.debug(f"[batch_update_metadata] Inserting new metadata for documents without metadata rows: {missing_doc_ids}")
logging.debug(
f"[batch_update_metadata] Inserting new metadata for documents without metadata rows: {missing_doc_ids}")
for doc_id in missing_doc_ids:
# Apply updates to create new metadata
meta = {}

View File

@@ -0,0 +1,296 @@
#
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Translate RAGFlow document-metadata filter lists into Infinity SQL filter expressions."""
from __future__ import annotations
import ast
import re
from typing import Any, Dict, List, Sequence
# Metadata keys are interpolated into ``'$.<key>'`` JSON paths, so only plain
# ASCII identifiers are accepted.
_KEY_PATTERN = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$")


def _validate_key(key: str, flt: Dict[str, Any]) -> None:
    """Reject metadata keys that are not identifier-like.

    Args:
        key: The metadata key taken from the filter.
        flt: The whole filter clause, included in the error message.

    Raises:
        ValueError: if *key* is not made only of ASCII letters, digits and
            underscores, or starts with a digit.
    """
    # fullmatch() rather than match(): with match(), the trailing "$" also
    # accepts a key that ends in a newline (e.g. "name\n").
    if _KEY_PATTERN.fullmatch(key) is None:
        raise ValueError(f"invalid key format (must be identifier-like): {flt}")
# Every operator the translator accepts. The three non-ASCII comparison
# operators ("≠", "≥", "≤") are spelled out explicitly: if they degrade to empty
# strings the set silently loses members.
SUPPORTED_OPERATORS: frozenset[str] = frozenset(
    {
        "=",
        "≠",
        ">",
        "<",
        "≥",
        "≤",
        "in",
        "not in",
        "contains",
        "not contains",
        "start with",
        "end with",
        "empty",
        "not empty",
    }
)

# User-facing range operator -> SQL comparison operator.
_RANGE_OPS: Dict[str, str] = {
    ">": ">",
    "<": "<",
    "≥": ">=",
    "≤": "<=",
}
class MetaFilterTranslator:
    """Translate one user filter clause at a time into Infinity SQL filter strings.

    Every fragment addresses the ``meta_fields`` column through a ``'$.<key>'``
    path.

    NOTE(review): the argument forms used for ``JSON_CONTAINS``,
    ``JSON_EXTRACT_STRING`` and ``JSON_EXTRACT_DOUBLE`` are taken as-is from the
    existing fragments — confirm against the Infinity SQL reference.
    NOTE(review): string values only get their single quotes doubled; a value
    containing ``"`` or ``\\`` is embedded unchanged inside the JSON string
    literal — confirm whether JSON escaping is needed.
    """

    def translate(self, flt: Dict[str, Any]) -> str:
        """Return the SQL fragment for one ``{"key", "op", "value"}`` filter.

        Raises:
            ValueError: if the key is missing, not a string or not
                identifier-like, if the operator is unknown, or if the value
                cannot be coerced for the chosen operator.
        """
        op = flt.get("op")
        key = flt.get("key")
        value = flt.get("value")

        if not key or not isinstance(key, str):
            raise ValueError(f"filter is missing a string key: {flt}")
        _validate_key(key, flt)
        if op not in SUPPORTED_OPERATORS:
            raise ValueError(f"unknown operator: {op!r}, filter: {flt}")

        # Operators that ignore the value.
        if op == "empty":
            return self._translate_empty(key)
        if op == "not empty":
            return self._translate_not_empty(key)
        # Range operators also need the operator itself to pick the SQL symbol.
        if op in _RANGE_OPS:
            return self._translate_range(key, op, value, flt)

        handlers = {
            "=": self._translate_equal,
            "≠": self._translate_not_equal,
            "in": self._translate_in,
            "not in": self._translate_not_in,
            "contains": self._translate_contains,
            "not contains": self._translate_not_contains,
            "start with": self._translate_start_with,
            "end with": self._translate_end_with,
        }
        handler = handlers.get(op)
        if handler is None:
            raise ValueError(f"no handler for operator: {op!r}, filter: {flt}")
        return handler(key, value, flt)

    @staticmethod
    def _json_contains(key: str, operand: Any) -> str:
        """Render ``JSON_CONTAINS`` for *key*; a string operand becomes a quoted JSON literal."""
        if isinstance(operand, str):
            escaped = _escape_sql_string(operand)
            return f"JSON_CONTAINS(meta_fields, '$.{key}', '\"{escaped}\"')"
        return f"JSON_CONTAINS(meta_fields, '$.{key}', {operand})"

    def _membership_groups(self, key: str, value: Any, flt: Dict[str, Any], negate: bool) -> List[str]:
        """Build the parenthesised string group and numeric group for ``in`` / ``not in``.

        String members come first, numeric members second; an empty group is
        omitted. Members are joined with ``OR`` (``AND`` when *negate* is set).
        """
        prefix = "NOT " if negate else ""
        joiner = " AND " if negate else " OR "
        string_parts: List[str] = []
        num_parts: List[str] = []
        for member in _csv_or_list(value, flt):
            # Same coercion as the range operators, so numeric-looking members
            # are compared as numbers.
            coerced = _coerce_range_value(member, flt)
            bucket = num_parts if isinstance(coerced, (int, float)) else string_parts
            bucket.append(prefix + self._json_contains(key, coerced))
        return ["(" + joiner.join(group) + ")" for group in (string_parts, num_parts) if group]

    def _translate_empty(self, key: str) -> str:
        """Match rows whose extracted string equals the two-character text ``\"\"``."""
        return f"JSON_EXTRACT_STRING(meta_fields, '$.{key}') = '\"\"'"

    def _translate_not_empty(self, key: str) -> str:
        """Match rows whose extracted string differs from the two-character text ``\"\"``."""
        return f"JSON_EXTRACT_STRING(meta_fields, '$.{key}') != '\"\"'"

    def _translate_equal(self, key: str, value: Any, flt: Dict[str, Any]) -> str:
        """Translate ``=`` into a ``JSON_CONTAINS`` test on the coerced scalar."""
        return self._json_contains(key, _coerce_scalar(value, flt))

    def _translate_not_equal(self, key: str, value: Any, flt: Dict[str, Any]) -> str:
        """Translate ``≠`` into a negated ``JSON_CONTAINS`` test on the coerced scalar."""
        return "NOT " + self._json_contains(key, _coerce_scalar(value, flt))

    def _translate_range(self, key: str, op: str, value: Any, flt: Dict[str, Any]) -> str:
        """Translate a range operator; string bounds compare as strings, numbers as doubles."""
        bound = _coerce_range_value(value, flt)
        sql_op = _RANGE_OPS.get(op, op)
        if isinstance(bound, str):
            escaped = _escape_sql_string(bound)
            return f"JSON_EXTRACT_STRING(meta_fields, '$.{key}') {sql_op} '{escaped}'"
        return f"JSON_EXTRACT_DOUBLE(meta_fields, '$.{key}') {sql_op} {bound}"

    def _translate_in(self, key: str, value: Any, flt: Dict[str, Any]) -> str:
        """Translate ``in``: true when any member is contained."""
        groups = self._membership_groups(key, value, flt, negate=False)
        return "(" + " OR ".join(groups) + ")"

    def _translate_not_in(self, key: str, value: Any, flt: Dict[str, Any]) -> str:
        """Translate ``not in``: true when no member is contained."""
        groups = self._membership_groups(key, value, flt, negate=True)
        return " AND ".join(groups)

    def _translate_contains(self, key: str, value: Any, flt: Dict[str, Any]) -> str:
        """Translate ``contains`` into a ``JSON_CONTAINS`` test.

        Raises:
            ValueError: if *value* is falsy and not equal to ``0``.
        """
        if not value and value != 0:
            raise ValueError(f"contains value is empty: {flt}")
        # Same coercion as the range operators to detect numeric values; anything
        # non-numeric comes back as ``str(value)``.
        return self._json_contains(key, _coerce_range_value(value, flt))

    def _translate_not_contains(self, key: str, value: Any, flt: Dict[str, Any]) -> str:
        """Translate ``not contains``; the value is always compared as a string."""
        text = _coerce_string(value, flt)
        return "NOT " + self._json_contains(key, text)

    def _translate_start_with(self, key: str, value: Any, flt: Dict[str, Any]) -> str:
        """Translate ``start with`` into a ``LIKE '<value>%'`` test on the extracted string."""
        text = _coerce_string(value, flt)
        escaped = _escape_sql_string(_escape_likeWildcards(text))
        return f"JSON_EXTRACT_STRING(meta_fields, '$.{key}') LIKE '{escaped}%'"

    def _translate_end_with(self, key: str, value: Any, flt: Dict[str, Any]) -> str:
        """Translate ``end with`` into a ``LIKE '%<value>'`` test on the extracted string."""
        text = _coerce_string(value, flt)
        escaped = _escape_sql_string(_escape_likeWildcards(text))
        return f"JSON_EXTRACT_STRING(meta_fields, '$.{key}') LIKE '%{escaped}'"
def plan_pushdown(filters: Sequence[Dict[str, Any]], logic: str) -> List[str]:
    """Translate every filter clause into its SQL fragment, in input order.

    Args:
        filters: Filter clauses to translate.
        logic: Must be ``"and"`` or ``"or"``; it is validated here but does not
            affect the individual fragments.

    Raises:
        ValueError: if *logic* is neither ``"and"`` nor ``"or"``, or if a clause
            cannot be translated.
    """
    if logic not in {"and", "or"}:
        raise ValueError(f"unknown logic {logic!r}")
    translate_clause = MetaFilterTranslator().translate
    return list(map(translate_clause, filters))
def build_infinity_filter(filters: Sequence[Dict[str, Any]], logic: str) -> str:
    """Combine all filter clauses into one parenthesised SQL condition.

    Returns ``"1=1"`` when *filters* is empty; otherwise the fragments from
    ``plan_pushdown`` joined with ``AND`` (for ``logic == "and"``) or ``OR``.
    """
    if not filters:
        return "1=1"
    fragments = plan_pushdown(filters, logic)
    if logic == "and":
        glue = " AND "
    else:
        glue = " OR "
    return f"({glue.join(fragments)})"
def is_pushdown_supported(filters: Sequence[Dict[str, Any]]) -> bool:
    """Return ``True`` when every clause has a supported operator and a non-empty string key.

    An empty *filters* sequence is reported as supported.
    """

    def _clause_ok(flt: Dict[str, Any]) -> bool:
        # The operator is checked first, then the key.
        if flt.get("op") not in SUPPORTED_OPERATORS:
            return False
        key = flt.get("key")
        return isinstance(key, str) and bool(key)

    return all(_clause_ok(flt) for flt in filters)
def extract_doc_ids(df) -> List[str]:
    """Return the ``id`` column of a query result as a list of strings.

    Args:
        df: A pandas-style DataFrame, or anything else (including ``None``).

    Returns:
        ``str()`` of every value in the ``id`` column, in row order. An empty
        list when *df* is ``None``, is not DataFrame-like, or has no ``id``
        column.
    """
    # ``iterrows`` is kept as the duck-type marker for a DataFrame-like result.
    if df is None or not hasattr(df, "iterrows"):
        return []
    if "id" not in df.columns:
        return []
    # Read the column directly instead of iterating rows: row-wise iteration
    # builds one Series per row and upcasts mixed numeric columns, which would
    # turn an integer id such as 1 into "1.0".
    return [str(doc_id) for doc_id in df["id"].tolist()]
# ---------------------------------------------------------------------------
# Value coercion helpers
# ---------------------------------------------------------------------------
def _coerce_scalar(value: Any, flt: Dict[str, Any]) -> Any:
    """Coerce an equality operand to a number/bool when it parses as one, else to ``str``.

    Raises:
        ValueError: if *value* is ``None``, a list or a dict.
    """
    if value is None:
        raise ValueError(f"scalar comparison value is None: {flt}")
    if isinstance(value, (list, dict)):
        raise ValueError(f"scalar comparison value is non-scalar: {flt}")

    text = str(value)
    try:
        literal = ast.literal_eval(text.strip())
    except Exception:
        # Not a Python literal: treat the operand as plain text.
        return text
    # Only numeric/bool literals are kept; any other literal falls back to the
    # original (unstripped) text.
    return literal if isinstance(literal, (int, float, bool)) else text
def _coerce_range_value(value: Any, flt: Dict[str, Any]) -> Any:
    """Coerce a range operand to ``int``/``float`` when it parses as one, else to ``str``.

    Raises:
        ValueError: if *value* is ``None``.
    """
    if value is None:
        raise ValueError(f"range comparison value is None: {flt}")

    text = str(value)
    try:
        literal = ast.literal_eval(text.strip())
    except Exception:
        # Anything that is not a Python literal (e.g. a date string) stays text.
        return text
    if isinstance(literal, (int, float)):
        return literal
    return text
def _coerce_string(value: Any, flt: Dict[str, Any]) -> str:
    """Return ``str(value)`` for a string operator, rejecting unusable operands.

    Raises:
        ValueError: if *value* is ``None``, a list or a dict, or if its string
            form is empty.
    """
    if value is None:
        raise ValueError(f"string-operator value is None: {flt}")
    if isinstance(value, (list, dict)):
        raise ValueError(f"string-operator value must be a scalar: {flt}")
    text = str(value)
    if text == "":
        raise ValueError(f"string-operator value is empty: {flt}")
    return text
def _csv_or_list(value: Any, flt: Dict[str, Any]) -> List[Any]:
    """Normalise a membership operand into a non-empty list of members.

    A list/tuple is used as-is. A string is first tried as a Python list/tuple
    literal and otherwise split on commas, dropping blank pieces. Any other
    value becomes a one-element list. String members are lower-cased and
    stripped; other members are left untouched.

    Raises:
        ValueError: if *value* is ``None`` or resolves to no members.
    """
    if value is None:
        raise ValueError(f"membership value is None: {flt}")

    if isinstance(value, str):
        try:
            literal = ast.literal_eval(value)
        except Exception:
            literal = None
        if isinstance(literal, (list, tuple)):
            items = list(literal)
        else:
            pieces = (raw.strip() for raw in value.split(","))
            items = [piece for piece in pieces if piece]
    elif isinstance(value, (list, tuple)):
        items = list(value)
    else:
        items = [value]

    if not items:
        raise ValueError(f"membership value resolved to empty list: {flt}")

    return [item.lower().strip() if isinstance(item, str) else item for item in items]
def _escape_sql_string(s: str) -> str:
    """Double every single quote so *s* can sit inside a single-quoted SQL literal."""
    return "''".join(s.split("'"))
# Single-pass replacement table: backslash, "%" and "_" each gain a leading backslash.
_LIKE_ESCAPE_TABLE = str.maketrans({"\\": "\\\\", "%": "\\%", "_": "\\_"})


def _escape_likeWildcards(text: str) -> str:
    """Prefix backslashes and the LIKE wildcards ``%`` and ``_`` with a backslash."""
    return text.translate(_LIKE_ESCAPE_TABLE)

View File

@@ -19,6 +19,7 @@ from typing import Any, Callable, Dict
import json_repair
def convert_conditions(metadata_condition):
if metadata_condition is None:
metadata_condition = {}
@@ -109,17 +110,23 @@ def meta_filter(metas: dict, filters: list[dict], logic: str = "and"):
matched = False
try:
if operator == "contains":
matched = str(input).find(value) >= 0 if not isinstance(input, list) else any(str(i).find(value) >= 0 for i in input)
matched = str(input).find(value) >= 0 if not isinstance(input, list) else any(
str(i).find(value) >= 0 for i in input)
elif operator == "not contains":
matched = str(input).find(value) == -1 if not isinstance(input, list) else all(str(i).find(value) == -1 for i in input)
matched = str(input).find(value) == -1 if not isinstance(input, list) else all(
str(i).find(value) == -1 for i in input)
elif operator == "in":
matched = input in value if not isinstance(input, list) else all(i in value for i in input)
elif operator == "not in":
matched = input not in value if not isinstance(input, list) else all(i not in value for i in input)
elif operator == "start with":
matched = str(input).lower().startswith(str(value).lower()) if not isinstance(input, list) else "".join([str(i).lower() for i in input]).startswith(str(value).lower())
matched = str(input).lower().startswith(str(value).lower()) if not isinstance(input,
list) else "".join(
[str(i).lower() for i in input]).startswith(str(value).lower())
elif operator == "end with":
matched = str(input).lower().endswith(str(value).lower()) if not isinstance(input, list) else "".join([str(i).lower() for i in input]).endswith(str(value).lower())
matched = str(input).lower().endswith(str(value).lower()) if not isinstance(input,
list) else "".join(
[str(i).lower() for i in input]).endswith(str(value).lower())
elif operator == "empty":
matched = not input
elif operator == "not empty":
@@ -158,9 +165,11 @@ def meta_filter(metas: dict, filters: list[dict], logic: str = "and"):
if logic == "and":
doc_ids = doc_ids & set(ids)
if not doc_ids:
logging.debug(f"meta_filter filters={filters}, logic={logic}, early return []")
return []
else:
doc_ids = doc_ids | set(ids)
logging.debug(f"meta_filter filters={filters}, logic={logic}, returning doc_ids={list(doc_ids)}")
return list(doc_ids)
@@ -182,12 +191,11 @@ async def apply_meta_data_filter(
- semi_auto: generate conditions using selected metadata keys only
- manual: directly filter based on provided conditions
When ``kb_ids`` is supplied and the active doc store is Elasticsearch the
generated filter conditions are pushed down to ES via
``DocMetadataService.filter_doc_ids_by_meta_pushdown`` instead of being
evaluated in Python over ``metas``. The in-memory ``meta_filter`` path
remains the fallback so callers without a KB scope, or backends without
push-down support, behave exactly as before.
When ``kb_ids`` is supplied, metadata filters are pushed down to the doc metadata
index (ES/Infinity) via ``DocMetadataService.filter_doc_ids_by_metadata`` instead
of being evaluated in Python over ``metas``. The in-memory ``meta_filter`` path
remains the fallback so callers without a KB scope, or backends without push-down
support, behave exactly as before.
``metas`` may be supplied eagerly or via ``metas_loader``. The loader is
only invoked when the metadata dict is actually needed — i.e. for the LLM
@@ -220,17 +228,26 @@ async def apply_meta_data_filter(
cached_metas = metas_loader() if metas_loader else {}
return cached_metas
def _evaluate(conditions: list[dict], logic: str) -> list[str]:
"""Run conditions through ES push-down when possible, in-memory otherwise."""
def _run_metadata_filter(conditions: list[dict], logic: str) -> list[str]:
"""Run conditions through ES/Infinity push-down when possible, in-memory otherwise."""
if conditions and kb_ids:
pushed = _try_meta_pushdown(kb_ids, conditions, logic)
if pushed is not None:
return pushed
try:
from api.db.services.doc_metadata_service import DocMetadataService
doc_ids = DocMetadataService.filter_doc_ids_by_meta_pushdown(kb_ids, conditions, logic)
logging.debug(f"Doc ids filtered by metadata: {doc_ids}")
if doc_ids is not None:
return doc_ids
except Exception as e:
logging.error(f"Metadata filter push down errored: {e}")
# In-memory fallback
logging.debug("Metadata filter falls back to in-memory filter")
return meta_filter(_get_metas(), conditions, logic)
if method == "auto":
filters: dict = await gen_meta_filter(chat_mdl, _get_metas(), question)
doc_ids.extend(_evaluate(filters["conditions"], filters.get("logic", "and")))
logging.debug(f"Metadata filter(auto) generated: {filters}")
doc_ids.extend(_run_metadata_filter(filters["conditions"], filters.get("logic", "and")))
if not doc_ids:
return None
elif method == "semi_auto":
@@ -251,17 +268,20 @@ async def apply_meta_data_filter(
filtered_metas = {key: current_metas[key] for key in selected_keys if key in current_metas}
if filtered_metas:
filters: dict = await gen_meta_filter(chat_mdl, filtered_metas, question, constraints=constraints)
doc_ids.extend(_evaluate(filters["conditions"], filters.get("logic", "and")))
logging.debug(f"Metadata filter(semi_auto) generated: {filters}")
doc_ids.extend(_run_metadata_filter(filters["conditions"], filters.get("logic", "and")))
if not doc_ids:
return None
elif method == "manual":
filters = meta_data_filter.get("manual", [])
if manual_value_resolver:
filters = [manual_value_resolver(flt) for flt in filters]
doc_ids.extend(_evaluate(filters, meta_data_filter.get("logic", "and")))
logging.debug(f"Metadata filter(manual): {filters}")
doc_ids.extend(_run_metadata_filter(filters, meta_data_filter.get("logic", "and")))
if filters and not doc_ids:
doc_ids = ["-999"]
logging.debug(f"apply_meta_data_filter meta_filter={meta_data_filter}, returning doc_ids={doc_ids}")
return doc_ids
@@ -335,7 +355,7 @@ def update_metadata_to(metadata, meta):
return metadata
def metadata_schema(metadata: dict|list|None) -> Dict[str, Any]:
def metadata_schema(metadata: dict | list | None) -> Dict[str, Any]:
if not metadata:
return {}
properties = {}

View File

@@ -494,6 +494,28 @@ async def rank_memories_async(chat_mdl, goal: str, sub_goal: str, tool_call_summ
async def gen_meta_filter(chat_mdl, meta_data: dict, query: str, constraints: dict = None) -> dict:
"""Generate metadata filter conditions from a user query using an LLM.
Args:
chat_mdl: LLM bundle for generating filters
meta_data: Dict of {key: set of values} - e.g. {"character": {"Caocao", "Liubei"}, "year": {2026}}
query: User question (e.g. "Caocao in 2026")
constraints: Optional dict of {key: operator} to constrain which op to use for a key
Returns:
Dict with "logic" ("and"/"or") and "conditions" list.
Example return value:
{
"logic": "and",
"conditions": [
{"key": "year", "value": "2026", "op": "="},
{"key": "character", "value": "Caocao", "op": "="}
]
}
The LLM is prompted with the available metadata keys and values, and is asked to
generate filter conditions that match the user's query semantics.
"""
meta_data_structure = {}
for key, values in meta_data.items():
meta_data_structure[key] = list(values.keys()) if isinstance(values, dict) else values

View File

@@ -1,473 +0,0 @@
"""Unit tests for the Elasticsearch push-down translator.
These tests cover the public surface of ``common.metadata_es_filter`` without
touching the live ES cluster. They verify the shape of the produced query DSL
operator-by-operator and confirm that the parity rules with the in-memory
``meta_filter`` (lower-casing, list-membership coercion, date detection) hold.
"""
import pytest
from common.metadata_es_filter import (
META_FIELDS_PREFIX,
MetaFilterPushdownPlan,
MetaFilterTranslator,
SUPPORTED_OPERATORS,
UnsupportedMetaFilter,
build_meta_filter_query,
extract_doc_ids,
is_pushdown_supported,
plan_pushdown,
)
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def translator() -> MetaFilterTranslator:
    """Build a ``MetaFilterTranslator`` for tests that request this fixture."""
    instance = MetaFilterTranslator()
    return instance
def _field(key: str) -> str:
    """Return the dotted path ``<META_FIELDS_PREFIX>.<key>``."""
    return "{}.{}".format(META_FIELDS_PREFIX, key)
# ---------------------------------------------------------------------------
# Translator: per-operator shape
# ---------------------------------------------------------------------------
def test_equal_translates_to_term_with_lowercased_value(translator):
    """String equality targets ``.keyword`` with a lower-cased value.

    A ``term`` query against the analyzed parent field only matches docs whose
    inverted index holds the literal phrase token, which never happens for
    multi-word values. The ``.keyword`` sub-field stores the unmodified string,
    and ``case_insensitive: true`` keeps the lower-cased compare semantics of
    the in-memory ``meta_filter``.
    """
    condition = {"key": "tag", "op": "=", "value": "Alpha"}
    expected = [
        {"term": {_field("tag") + ".keyword": {"value": "alpha", "case_insensitive": True}}}
    ]
    assert translator.translate(condition).to_clauses() == expected
def test_equal_parses_numeric_literal(translator):
    """Numeric values stay on the parent path — no ``.keyword`` sub-field exists for ``long``."""
    plan = translator.translate({"key": "score", "op": "=", "value": "5"})
    expected = [{"term": {_field("score"): 5}}]
    assert plan.to_clauses() == expected
def test_equal_multiword_uses_keyword_subfield(translator):
    """Regression for qinling0210's report: multi-word string values must match.

    Before the keyword-routing fix the translator emitted
    ``term: meta_fields.author = "alice wonderland"`` against an analyzed text
    field, which never matched because the inverted index only held per-token
    entries. Routing through ``.keyword`` preserves the full phrase.
    """
    condition = {"key": "author", "op": "=", "value": "Alice Wonderland"}
    produced = translator.translate(condition).to_clauses()
    keyword_path = _field("author") + ".keyword"
    wanted = {
        "term": {
            keyword_path: {
                "value": "alice wonderland",
                "case_insensitive": True,
            }
        }
    }
    assert produced == [wanted]
def test_not_equal_requires_field_to_exist(translator):
    """``≠`` pairs the negated ``term`` with an ``exists`` guard on the field."""
    # The operator is the not-equal sign; an empty string here would not
    # exercise the not-equal branch at all.
    clauses = translator.translate({"key": "tag", "op": "≠", "value": "alpha"}).to_clauses()
    assert clauses == [
        {
            "bool": {
                "must": [{"exists": {"field": _field("tag")}}],
                "must_not": [
                    {"term": {_field("tag") + ".keyword": {"value": "alpha", "case_insensitive": True}}}
                ],
            }
        }
    ]
@pytest.mark.parametrize(
    "op,es_key",
    [(">", "gt"), ("<", "lt"), ("≥", "gte"), ("≤", "lte")],
)
def test_range_operator_translation(translator, op, es_key):
    """Each range operator maps to its ES ``range`` key, guarded by ``exists``."""
    # Multi-clause positive filters wrap into a single bool so OR-logic
    # parents can't match on just the ``exists`` half of the range.
    clauses = translator.translate({"key": "score", "op": op, "value": "10"}).to_clauses()
    assert clauses == [
        {
            "bool": {
                "must": [
                    {"exists": {"field": _field("score")}},
                    {"range": {_field("score"): {es_key: 10}}},
                ]
            }
        }
    ]
def test_range_passes_iso_date_through_unparsed(translator):
    """An ISO date bound reaches the ``range`` clause as the original string."""
    # "≥" is the operator that produces the ``gte`` key asserted below.
    clauses = translator.translate({"key": "published", "op": "≥", "value": "2025-01-15"}).to_clauses()
    range_clause = clauses[0]["bool"]["must"][1]
    assert range_clause == {"range": {_field("published"): {"gte": "2025-01-15"}}}
def _string_terms_should(field_path: str, members):
    """Build the expected ``bool/should`` clause for string ``in``/``not in`` members.

    Each member gets its own ``term`` so it can carry ``case_insensitive``
    (``terms`` does not accept that flag).
    """
    alternatives = []
    for member in members:
        term = {"term": {field_path + ".keyword": {"value": member, "case_insensitive": True}}}
        alternatives.append(term)
    return {"bool": {"should": alternatives, "minimum_should_match": 1}}
def test_in_operator_csv_value_lowercased(translator):
    """A comma-separated ``in`` value is split and lower-cased per member."""
    condition = {"key": "status", "op": "in", "value": "Active,Pending"}
    produced = translator.translate(condition).to_clauses()
    expected = _string_terms_should(_field("status"), ["active", "pending"])
    assert produced == [expected]
def test_in_operator_python_list_literal(translator):
    """An ``in`` value written as a Python list literal is parsed into members."""
    condition = {"key": "status", "op": "in", "value": "['Open', 'Closed']"}
    produced = translator.translate(condition).to_clauses()
    expected = _string_terms_should(_field("status"), ["open", "closed"])
    assert produced == [expected]
def test_in_operator_numeric_members_keep_terms(translator):
    """All-numeric member lists keep the cheaper ``terms`` form on the parent path."""
    condition = {"key": "year", "op": "in", "value": "[2024, 2025]"}
    expected = [{"terms": {_field("year"): [2024, 2025]}}]
    assert translator.translate(condition).to_clauses() == expected
def test_not_in_negates_with_existence_guard(translator):
    """``not in`` requires the field to exist and puts the member match under ``must_not``."""
    field_path = _field("status")
    expected = {
        "bool": {
            "must": [{"exists": {"field": field_path}}],
            "must_not": [_string_terms_should(field_path, ["active", "pending"])],
        }
    }
    got = translator.translate({"key": "status", "op": "not in", "value": "active,pending"}).to_clauses()
    assert got == [expected]
def test_contains_uses_case_insensitive_wildcard(translator):
    """``contains`` becomes a ``*value*`` wildcard on the ``.keyword`` sub-field."""
    keyword_path = _field("version") + ".keyword"
    expected = {"wildcard": {keyword_path: {"value": "*earth*", "case_insensitive": True}}}
    got = translator.translate({"key": "version", "op": "contains", "value": "earth"}).to_clauses()
    assert got == [expected]
def test_contains_escapes_user_wildcards(translator):
    """``*`` and ``?`` supplied by the user are backslash-escaped inside the pattern."""
    first_clause = translator.translate({"key": "title", "op": "contains", "value": "a*b?c"}).to_clauses()[0]
    keyword_path = _field("title") + ".keyword"
    assert first_clause["wildcard"][keyword_path]["value"] == "*a\\*b\\?c*"
def test_not_contains_negates_with_exists(translator):
    """``not contains`` pairs an ``exists`` guard with a negated wildcard."""
    wildcard_clause = {
        "wildcard": {
            _field("version") + ".keyword": {"value": "*earth*", "case_insensitive": True}
        }
    }
    expected = {
        "bool": {
            "must": [{"exists": {"field": _field("version")}}],
            "must_not": [wildcard_clause],
        }
    }
    got = translator.translate({"key": "version", "op": "not contains", "value": "earth"}).to_clauses()
    assert got == [expected]
def test_start_with_uses_prefix(translator):
    """``start with`` becomes a case-insensitive ``prefix`` query on ``.keyword``."""
    keyword_path = _field("name") + ".keyword"
    expected = [{"prefix": {keyword_path: {"value": "pre", "case_insensitive": True}}}]
    got = translator.translate({"key": "name", "op": "start with", "value": "pre"}).to_clauses()
    assert got == expected
def test_end_with_uses_trailing_wildcard(translator):
    """``end with`` becomes a wildcard whose pattern is ``*`` followed by the value."""
    first_clause = translator.translate({"key": "file", "op": "end with", "value": ".pdf"}).to_clauses()[0]
    keyword_path = _field("file") + ".keyword"
    assert first_clause["wildcard"][keyword_path]["value"] == "*.pdf"
def test_empty_matches_missing_or_blank(translator):
    """``empty`` matches documents where the field is absent or equals ``""``."""
    field_missing = {"bool": {"must_not": [{"exists": {"field": _field("notes")}}]}}
    field_blank = {"term": {_field("notes") + ".keyword": ""}}
    expected = {"bool": {"should": [field_missing, field_blank], "minimum_should_match": 1}}
    got = translator.translate({"key": "notes", "op": "empty", "value": ""}).to_clauses()
    assert got == [expected]
def test_not_empty_requires_exists_and_excludes_blank(translator):
    """``not empty`` requires the field to exist and not equal ``""``."""
    expected = {
        "bool": {
            "must": [{"exists": {"field": _field("notes")}}],
            "must_not": [{"term": {_field("notes") + ".keyword": ""}}],
        }
    }
    got = translator.translate({"key": "notes", "op": "not empty", "value": ""}).to_clauses()
    assert got == [expected]
# ---------------------------------------------------------------------------
# Translator: validation paths
# ---------------------------------------------------------------------------
def test_unknown_operator_raises(translator):
    """An unrecognised operator raises, and the failure reason names the operator."""
    bad_filter = {"key": "tag", "op": "regex", "value": "^foo"}
    with pytest.raises(UnsupportedMetaFilter) as exc:
        translator.translate(bad_filter)
    reason = exc.value.reason
    assert "regex" in reason
def test_missing_key_raises(translator):
    """A filter without a ``key`` entry is rejected."""
    keyless_filter = {"op": "=", "value": "x"}
    with pytest.raises(UnsupportedMetaFilter):
        translator.translate(keyless_filter)
def test_scalar_op_with_list_value_raises(translator):
    """``=`` given a list value is rejected."""
    list_valued = {"key": "tag", "op": "=", "value": ["a", "b"]}
    with pytest.raises(UnsupportedMetaFilter):
        translator.translate(list_valued)
def test_string_op_with_empty_value_raises(translator):
    """``contains`` with an empty string value is rejected."""
    blank_contains = {"key": "tag", "op": "contains", "value": ""}
    with pytest.raises(UnsupportedMetaFilter):
        translator.translate(blank_contains)
def test_membership_with_empty_csv_raises(translator):
    """``in`` with an empty member string is rejected."""
    blank_membership = {"key": "tag", "op": "in", "value": ""}
    with pytest.raises(UnsupportedMetaFilter):
        translator.translate(blank_membership)
def test_supported_operator_set_matches_documentation():
    """``SUPPORTED_OPERATORS`` equals the documented operator set, member for member."""
    expected = {
        "=",
        "≠",
        ">",
        "<",
        "≥",
        "≤",
        "in",
        "not in",
        "contains",
        "not contains",
        "start with",
        "end with",
        "empty",
        "not empty",
    }
    assert SUPPORTED_OPERATORS == expected
# ---------------------------------------------------------------------------
# Plan composition
# ---------------------------------------------------------------------------
def test_plan_emits_must_clauses_for_and_logic():
    """AND logic puts one clause per filter under ``must``, after the ``kb_id`` filter."""
    filters = [
        {"key": "tag", "op": "=", "value": "alpha"},
        {"key": "score", "op": ">", "value": "5"},
    ]
    plan = plan_pushdown(filters, logic="and")
    assert isinstance(plan, MetaFilterPushdownPlan)

    filter_clauses = plan.to_query(["kb1"])["query"]["bool"]["filter"]
    assert filter_clauses[0] == {"terms": {"kb_id": ["kb1"]}}
    inner = filter_clauses[1]["bool"]
    assert "must" in inner
    must_clauses = inner["must"]
    # Each translated filter contributes exactly one clause to the parent bool:
    # ``=`` is a single ``term``; ``>`` is wrapped into one atomic ``bool``.
    assert len(must_clauses) == 2

    tag_path = _field("tag") + ".keyword"
    assert {"term": {tag_path: {"value": "alpha", "case_insensitive": True}}} in must_clauses

    score_path = _field("score")
    wrapped_range = {
        "bool": {
            "must": [
                {"exists": {"field": score_path}},
                {"range": {score_path: {"gt": 5}}},
            ]
        }
    }
    assert wrapped_range in must_clauses
def test_range_filter_under_or_stays_atomic():
    """An OR'd range must not split into independent ``exists`` + ``range`` should branches."""
    body = build_meta_filter_query(
        [
            {"key": "tag", "op": "=", "value": "alpha"},
            {"key": "score", "op": ">", "value": "5"},
        ],
        logic="or",
        kb_ids=["kb1"],
    )
    should = body["query"]["bool"]["filter"][1]["bool"]["should"]
    # Two filters → two should branches, not three or four.
    assert len(should) == 2
    assert {
        "term": {_field("tag") + ".keyword": {"value": "alpha", "case_insensitive": True}}
    } in should
    # The count alone does not prove the range stayed whole; pin the wrapped
    # ``exists`` + ``range`` pair as a single should branch as well.
    assert {
        "bool": {
            "must": [
                {"exists": {"field": _field("score")}},
                {"range": {_field("score"): {"gt": 5}}},
            ]
        }
    } in should
def test_plan_emits_should_clauses_for_or_logic():
    """OR logic emits one ``should`` branch per filter with ``minimum_should_match`` 1."""
    filters = [
        {"key": "tag", "op": "=", "value": "alpha"},
        {"key": "tag", "op": "=", "value": "beta"},
    ]
    query = plan_pushdown(filters, logic="or").to_query(["kb1"])
    inner = query["query"]["bool"]["filter"][1]["bool"]
    assert inner["minimum_should_match"] == 1
    assert len(inner["should"]) == 2
def test_unknown_logic_rejected():
    """A ``logic`` value of ``"xor"`` is rejected."""
    filters = [{"key": "k", "op": "=", "value": "v"}]
    with pytest.raises(UnsupportedMetaFilter):
        plan_pushdown(filters, logic="xor")
def test_empty_filter_list_returns_kb_only_query():
    """With no filters the query contains only the ``kb_id`` terms filter."""
    kb_only = {"query": {"bool": {"filter": [{"terms": {"kb_id": ["kb1", "kb2"]}}]}}}
    assert build_meta_filter_query([], "and", ["kb1", "kb2"]) == kb_only
def test_negative_filter_in_or_logic_keeps_negation_scope():
    """Wrapping ``≠`` in an OR should not let the ``must_not`` swallow other branches.

    ``≠`` is rejected by :func:`is_pushdown_supported` for multi-value safety, so
    this test calls :func:`build_meta_filter_query` directly to confirm the
    per-filter wrapping invariant. The same shape protects ``not contains``
    (which IS pushed down) from leaking its ``must_not`` into a parent should.
    """
    body = build_meta_filter_query(
        [
            {"key": "tag", "op": "=", "value": "alpha"},
            {"key": "tag", "op": "≠", "value": "beta"},
        ],
        logic="or",
        kb_ids=["kb1"],
    )
    inner = body["query"]["bool"]["filter"][1]["bool"]
    should = inner["should"]
    assert should[0] == {
        "term": {_field("tag") + ".keyword": {"value": "alpha", "case_insensitive": True}}
    }
    # The ≠ branch is wrapped so its must_not does not bleed into the OR set.
    assert "bool" in should[1]
    assert "must_not" in should[1]["bool"]
# ---------------------------------------------------------------------------
# is_pushdown_supported pre-check
# ---------------------------------------------------------------------------
def test_pushdown_check_accepts_known_ops():
    """``=`` and ``contains`` filters together pass the push-down pre-check."""
    filters = [
        {"key": "tag", "op": "=", "value": "v"},
        {"key": "tag", "op": "contains", "value": "x"},
    ]
    assert is_pushdown_supported(filters)
def test_pushdown_check_rejects_unknown_op():
    """An operator outside the supported set fails the pre-check."""
    filters = [{"key": "tag", "op": "regex", "value": "^v"}]
    assert not is_pushdown_supported(filters)
def test_pushdown_check_rejects_missing_key():
    """A filter without a ``key`` fails the pre-check."""
    filters = [{"op": "=", "value": "v"}]
    assert not is_pushdown_supported(filters)
@pytest.mark.parametrize("op", ["≠", "not in"])
def test_pushdown_check_rejects_multivalue_unsafe_negatives(op):
    """Negatives that diverge on multi-valued fields force the in-memory fallback."""
    assert not is_pushdown_supported([{"key": "tag", "op": op, "value": "x"}])
def test_pushdown_check_one_unsafe_op_rejects_whole_request():
    """Mixing one unsafe op with safe ones still falls back, preserving correctness."""
    assert not is_pushdown_supported(
        [
            {"key": "tag", "op": "=", "value": "v"},
            # ``≠`` is the unsafe member that must reject the whole request.
            {"key": "tag", "op": "≠", "value": "w"},
        ]
    )
def test_pushdown_check_accepts_not_contains():
    """``not contains`` stays in push-down; ``all(not contains)`` ≡ ``not any(contains)``."""
    filters = [{"key": "tag", "op": "not contains", "value": "x"}]
    assert is_pushdown_supported(filters)
# ---------------------------------------------------------------------------
# extract_doc_ids
# ---------------------------------------------------------------------------
def test_extract_doc_ids_from_dict_response():
    """Ids are read from each hit of a dict-shaped response, in hit order."""
    hits = [
        {"_id": "doc1", "_source": {"id": "doc1"}},
        {"_id": "doc2", "_source": {"id": "doc2"}},
    ]
    assert extract_doc_ids({"hits": {"hits": hits}}) == ["doc1", "doc2"]
def test_extract_doc_ids_falls_back_to_source_id():
    """A hit lacking ``_id`` contributes its ``_source.id`` instead."""
    hit_without_id = {"_source": {"id": "src-id"}}
    assert extract_doc_ids({"hits": {"hits": [hit_without_id]}}) == ["src-id"]
def test_extract_doc_ids_empty_response():
    """Responses without hits, at any nesting level, produce an empty id list."""
    for empty_response in ({}, {"hits": {}}, {"hits": {"hits": []}}):
        assert extract_doc_ids(empty_response) == []

View File

@@ -0,0 +1,659 @@
"""Unit tests for the metadata filter push-down translators (ES and Infinity).
Verifies the shape of the produced filter expressions for both ES DSL and
Infinity SQL, and confirms that coercion rules (lower-casing, list-membership,
date detection) are consistent between the two backends.
"""
import pytest
pytestmark = pytest.mark.p2
from common.metadata_es_filter import MetaFilterTranslator as ESMetaFilterTranslator
from common.metadata_infinity_filter import (
MetaFilterTranslator as InfinityMetaFilterTranslator,
SUPPORTED_OPERATORS,
build_infinity_filter,
is_pushdown_supported,
plan_pushdown,
extract_doc_ids,
)
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def es_translator() -> ESMetaFilterTranslator:
    """Provide an Elasticsearch metadata-filter translator instance."""
    translator = ESMetaFilterTranslator()
    return translator
@pytest.fixture
def infinity_translator() -> InfinityMetaFilterTranslator:
    """Provide an Infinity metadata-filter translator instance."""
    translator = InfinityMetaFilterTranslator()
    return translator
# ---------------------------------------------------------------------------
# Shared: is_pushdown_supported pre-check (same logic for both backends)
# ---------------------------------------------------------------------------
def test_pushdown_check_accepts_known_ops():
    """``=`` and ``contains`` filters together pass the push-down pre-check."""
    filters = [
        {"key": "tag", "op": "=", "value": "v"},
        {"key": "tag", "op": "contains", "value": "x"},
    ]
    assert is_pushdown_supported(filters)
def test_pushdown_check_rejects_unknown_op():
    """An operator outside the supported set fails the pre-check."""
    filters = [{"key": "tag", "op": "regex", "value": "^v"}]
    assert not is_pushdown_supported(filters)
def test_pushdown_check_rejects_missing_key():
    """A filter without a ``key`` fails the pre-check."""
    filters = [{"op": "=", "value": "v"}]
    assert not is_pushdown_supported(filters)
def test_pushdown_check_accepts_not_contains():
    """A lone ``not contains`` filter passes the push-down pre-check."""
    filters = [{"key": "tag", "op": "not contains", "value": "x"}]
    assert is_pushdown_supported(filters)
# ---------------------------------------------------------------------------
# Shared: plan_pushdown (same logic for both backends)
# ---------------------------------------------------------------------------
def test_plan_pushdown_and_logic():
    """Two filters under AND logic produce a plan of length two."""
    filters = [
        {"key": "tag", "op": "=", "value": "alpha"},
        {"key": "score", "op": ">", "value": "5"},
    ]
    plan = plan_pushdown(filters, logic="and")
    assert len(plan) == 2
def test_plan_pushdown_or_logic():
    """Two filters under OR logic produce a plan of length two."""
    filters = [
        {"key": "tag", "op": "=", "value": "alpha"},
        {"key": "tag", "op": "=", "value": "beta"},
    ]
    plan = plan_pushdown(filters, logic="or")
    assert len(plan) == 2
def test_unknown_logic_rejected():
    """A ``logic`` value of ``"xor"`` raises ``ValueError``."""
    filters = [{"key": "k", "op": "=", "value": "v"}]
    with pytest.raises(ValueError):
        plan_pushdown(filters, logic="xor")
# ---------------------------------------------------------------------------
# Shared: extract_doc_ids (same implementation)
# ---------------------------------------------------------------------------
def test_extract_doc_ids_from_dataframe():
    """The ``id`` column of a DataFrame is returned as a list, in row order."""
    import pandas as pd

    ids = ["doc1", "doc2", "doc3"]
    frame = pd.DataFrame({"id": ids})
    assert extract_doc_ids(frame) == ["doc1", "doc2", "doc3"]
def test_extract_doc_ids_empty_dataframe():
    """A DataFrame whose ``id`` column has no rows yields an empty list."""
    import pandas as pd

    empty_frame = pd.DataFrame({"id": []})
    assert extract_doc_ids(empty_frame) == []
def test_extract_doc_ids_none_input():
    """``None`` input yields an empty list."""
    result = extract_doc_ids(None)
    assert result == []
def test_extract_doc_ids_non_dataframe():
    """A plain string input yields an empty list."""
    result = extract_doc_ids("not a dataframe")
    assert result == []
# ---------------------------------------------------------------------------
# Shared: SUPPORTED_OPERATORS
# ---------------------------------------------------------------------------
def test_supported_operator_set_matches_documentation():
    """``SUPPORTED_OPERATORS`` equals the documented operator set, member for member."""
    expected = {
        "=",
        "≠",
        ">",
        "<",
        "≥",
        "≤",
        "in",
        "not in",
        "contains",
        "not contains",
        "start with",
        "end with",
        "empty",
        "not empty",
    }
    assert SUPPORTED_OPERATORS == expected
# ===========================================================================
# ES-only tests
# ===========================================================================
def test_equal_translates_to_term_with_lowercased_value(es_translator):
    """String equality is a lower-cased, case-insensitive ``term`` on ``.keyword``."""
    from common.metadata_es_filter import META_FIELDS_PREFIX

    keyword_path = f"{META_FIELDS_PREFIX}.tag.keyword"
    expected = [{"term": {keyword_path: {"value": "alpha", "case_insensitive": True}}}]
    got = es_translator.translate({"key": "tag", "op": "=", "value": "Alpha"}).to_clauses()
    assert got == expected
def test_equal_parses_numeric_literal(es_translator):
    """Numeric values stay on the parent path — no ``.keyword`` sub-field exists for ``long``."""
    from common.metadata_es_filter import META_FIELDS_PREFIX

    score_path = f"{META_FIELDS_PREFIX}.score"
    got = es_translator.translate({"key": "score", "op": "=", "value": "5"}).to_clauses()
    assert got == [{"term": {score_path: 5}}]
def test_equal_multiword_uses_keyword_subfield(es_translator):
    """Regression: multi-word string values must match via .keyword sub-field."""
    from common.metadata_es_filter import META_FIELDS_PREFIX

    keyword_path = f"{META_FIELDS_PREFIX}.author.keyword"
    expected = {
        "term": {keyword_path: {"value": "alice wonderland", "case_insensitive": True}}
    }
    multiword_filter = {"key": "author", "op": "=", "value": "Alice Wonderland"}
    assert es_translator.translate(multiword_filter).to_clauses() == [expected]
def test_not_equal_requires_field_to_exist(es_translator):
    """``≠`` pairs an ``exists`` guard with a negated case-insensitive ``term``."""
    from common.metadata_es_filter import META_FIELDS_PREFIX

    def _field(key: str) -> str:
        return f"{META_FIELDS_PREFIX}.{key}"

    clauses = es_translator.translate({"key": "tag", "op": "≠", "value": "alpha"}).to_clauses()
    assert clauses == [
        {
            "bool": {
                "must": [{"exists": {"field": _field("tag")}}],
                "must_not": [
                    {"term": {_field("tag") + ".keyword": {"value": "alpha", "case_insensitive": True}}}
                ],
            }
        }
    ]
@pytest.mark.parametrize(
    "op,es_key",
    [(">", "gt"), ("<", "lt"), ("≥", "gte"), ("≤", "lte")],
)
def test_range_operator_translation(es_translator, op, es_key):
    """Each comparison operator (``>``, ``<``, ``≥``, ``≤``) maps to its ES ``range`` key."""
    from common.metadata_es_filter import META_FIELDS_PREFIX

    def _field(key: str) -> str:
        return f"{META_FIELDS_PREFIX}.{key}"

    clauses = es_translator.translate({"key": "score", "op": op, "value": "10"}).to_clauses()
    # The ``exists`` guard and the ``range`` are expected inside one wrapping bool.
    assert clauses == [
        {
            "bool": {
                "must": [
                    {"exists": {"field": _field("score")}},
                    {"range": {_field("score"): {es_key: 10}}},
                ]
            }
        }
    ]
def test_range_passes_iso_date_through_unparsed(es_translator):
    """An ISO date used with ``≥`` reaches the ``range`` clause as the original string."""
    # Build the field path from the shared prefix, as the sibling ES tests do,
    # instead of hard-coding ``meta_fields``.
    from common.metadata_es_filter import META_FIELDS_PREFIX

    published_path = f"{META_FIELDS_PREFIX}.published"
    clauses = es_translator.translate({"key": "published", "op": "≥", "value": "2025-01-15"}).to_clauses()
    # Index 1 of the wrapping ``must`` is the range half; index 0 is the exists guard.
    range_clause = clauses[0]["bool"]["must"][1]
    assert range_clause == {"range": {published_path: {"gte": "2025-01-15"}}}
def test_in_operator_csv_value_lowercased(es_translator):
    """A comma-separated ``in`` value yields one lower-cased ``term`` per member."""
    from common.metadata_es_filter import META_FIELDS_PREFIX

    keyword_path = f"{META_FIELDS_PREFIX}.status.keyword"
    branches = [
        {"term": {keyword_path: {"value": member, "case_insensitive": True}}}
        for member in ("active", "pending")
    ]
    expected = {"bool": {"should": branches, "minimum_should_match": 1}}
    got = es_translator.translate({"key": "status", "op": "in", "value": "Active,Pending"}).to_clauses()
    assert got == [expected]
def test_in_operator_python_list_literal(es_translator):
    """A Python-list-literal string is accepted as the ``in`` value and lower-cased."""
    from common.metadata_es_filter import META_FIELDS_PREFIX

    keyword_path = f"{META_FIELDS_PREFIX}.status.keyword"
    branches = [
        {"term": {keyword_path: {"value": member, "case_insensitive": True}}}
        for member in ("open", "closed")
    ]
    expected = {"bool": {"should": branches, "minimum_should_match": 1}}
    got = es_translator.translate({"key": "status", "op": "in", "value": "['Open', 'Closed']"}).to_clauses()
    assert got == [expected]
def test_in_operator_numeric_members_keep_terms(es_translator):
    """All-numeric ``in`` members produce a single ``terms`` clause on the parent path."""
    from common.metadata_es_filter import META_FIELDS_PREFIX

    year_path = f"{META_FIELDS_PREFIX}.year"
    got = es_translator.translate({"key": "year", "op": "in", "value": "[2024, 2025]"}).to_clauses()
    assert got == [{"terms": {year_path: [2024, 2025]}}]
def test_not_in_negates_with_existence_guard(es_translator):
    """``not in`` requires the field to exist and puts the member match under ``must_not``."""
    from common.metadata_es_filter import META_FIELDS_PREFIX

    status_path = f"{META_FIELDS_PREFIX}.status"
    member_match = {
        "bool": {
            "should": [
                {"term": {status_path + ".keyword": {"value": member, "case_insensitive": True}}}
                for member in ("active", "pending")
            ],
            "minimum_should_match": 1,
        }
    }
    expected = {
        "bool": {
            "must": [{"exists": {"field": status_path}}],
            "must_not": [member_match],
        }
    }
    got = es_translator.translate({"key": "status", "op": "not in", "value": "active,pending"}).to_clauses()
    assert got == [expected]
def test_contains_uses_case_insensitive_wildcard(es_translator):
    """``contains`` becomes a ``*value*`` wildcard on the ``.keyword`` sub-field."""
    from common.metadata_es_filter import META_FIELDS_PREFIX

    keyword_path = f"{META_FIELDS_PREFIX}.version.keyword"
    expected = {"wildcard": {keyword_path: {"value": "*earth*", "case_insensitive": True}}}
    got = es_translator.translate({"key": "version", "op": "contains", "value": "earth"}).to_clauses()
    assert got == [expected]
def test_contains_escapes_user_wildcards(es_translator):
    """``*`` and ``?`` supplied by the user are backslash-escaped inside the pattern."""
    from common.metadata_es_filter import META_FIELDS_PREFIX

    keyword_path = f"{META_FIELDS_PREFIX}.title.keyword"
    first_clause = es_translator.translate({"key": "title", "op": "contains", "value": "a*b?c"}).to_clauses()[0]
    assert first_clause["wildcard"][keyword_path]["value"] == "*a\\*b\\?c*"
def test_not_contains_negates_with_exists(es_translator):
    """``not contains`` pairs an ``exists`` guard with a negated wildcard."""
    from common.metadata_es_filter import META_FIELDS_PREFIX

    version_path = f"{META_FIELDS_PREFIX}.version"
    wildcard_clause = {
        "wildcard": {
            version_path + ".keyword": {"value": "*earth*", "case_insensitive": True}
        }
    }
    expected = {
        "bool": {
            "must": [{"exists": {"field": version_path}}],
            "must_not": [wildcard_clause],
        }
    }
    got = es_translator.translate({"key": "version", "op": "not contains", "value": "earth"}).to_clauses()
    assert got == [expected]
def test_start_with_uses_prefix(es_translator):
    """``start with`` becomes a case-insensitive ``prefix`` query on ``.keyword``."""
    from common.metadata_es_filter import META_FIELDS_PREFIX

    keyword_path = f"{META_FIELDS_PREFIX}.name.keyword"
    expected = [{"prefix": {keyword_path: {"value": "pre", "case_insensitive": True}}}]
    got = es_translator.translate({"key": "name", "op": "start with", "value": "pre"}).to_clauses()
    assert got == expected
def test_end_with_uses_trailing_wildcard(es_translator):
    """``end with`` becomes a wildcard whose pattern is ``*`` followed by the value."""
    from common.metadata_es_filter import META_FIELDS_PREFIX

    keyword_path = f"{META_FIELDS_PREFIX}.file.keyword"
    first_clause = es_translator.translate({"key": "file", "op": "end with", "value": ".pdf"}).to_clauses()[0]
    assert first_clause["wildcard"][keyword_path]["value"] == "*.pdf"
def test_empty_matches_missing_or_blank(es_translator):
    """``empty`` matches documents where the field is absent or equals ``""``."""
    from common.metadata_es_filter import META_FIELDS_PREFIX

    notes_path = f"{META_FIELDS_PREFIX}.notes"
    field_missing = {"bool": {"must_not": [{"exists": {"field": notes_path}}]}}
    field_blank = {"term": {notes_path + ".keyword": ""}}
    expected = {"bool": {"should": [field_missing, field_blank], "minimum_should_match": 1}}
    got = es_translator.translate({"key": "notes", "op": "empty", "value": ""}).to_clauses()
    assert got == [expected]
def test_not_empty_requires_exists_and_excludes_blank(es_translator):
    """``not empty`` requires the field to exist and not equal ``""``."""
    from common.metadata_es_filter import META_FIELDS_PREFIX

    notes_path = f"{META_FIELDS_PREFIX}.notes"
    expected = {
        "bool": {
            "must": [{"exists": {"field": notes_path}}],
            "must_not": [{"term": {notes_path + ".keyword": ""}}],
        }
    }
    got = es_translator.translate({"key": "notes", "op": "not empty", "value": ""}).to_clauses()
    assert got == [expected]
def test_unknown_operator_raises(es_translator):
    """An unrecognised operator raises, and the failure reason names the operator."""
    from common.metadata_es_filter import UnsupportedMetaFilter

    bad_filter = {"key": "tag", "op": "regex", "value": "^foo"}
    with pytest.raises(UnsupportedMetaFilter) as exc:
        es_translator.translate(bad_filter)
    reason = exc.value.reason
    assert "regex" in reason
def test_missing_key_raises(es_translator):
    """A filter without a ``key`` entry is rejected by the ES translator."""
    from common.metadata_es_filter import UnsupportedMetaFilter

    keyless_filter = {"op": "=", "value": "x"}
    with pytest.raises(UnsupportedMetaFilter):
        es_translator.translate(keyless_filter)
def test_scalar_op_with_list_value_raises(es_translator):
    """``=`` given a list value is rejected by the ES translator."""
    from common.metadata_es_filter import UnsupportedMetaFilter

    list_valued = {"key": "tag", "op": "=", "value": ["a", "b"]}
    with pytest.raises(UnsupportedMetaFilter):
        es_translator.translate(list_valued)
def test_string_op_with_empty_value_raises(es_translator):
    """``contains`` with an empty string value is rejected by the ES translator."""
    from common.metadata_es_filter import UnsupportedMetaFilter

    blank_contains = {"key": "tag", "op": "contains", "value": ""}
    with pytest.raises(UnsupportedMetaFilter):
        es_translator.translate(blank_contains)
def test_membership_with_empty_csv_raises(es_translator):
    """``in`` with an empty member string is rejected by the ES translator."""
    from common.metadata_es_filter import UnsupportedMetaFilter

    blank_membership = {"key": "tag", "op": "in", "value": ""}
    with pytest.raises(UnsupportedMetaFilter):
        es_translator.translate(blank_membership)
# ===========================================================================
# Infinity-only tests
# ===========================================================================
def test_build_infinity_filter_and_logic():
    """AND logic joins the fragments with `` AND `` and keeps the filter value."""
    filters = [
        {"key": "tag", "op": "=", "value": "alpha"},
        {"key": "score", "op": ">", "value": "5"},
    ]
    expression = build_infinity_filter(filters, logic="and")
    assert " AND " in expression
    assert "alpha" in expression
def test_build_infinity_filter_or_logic():
    """OR logic joins the fragments with `` OR `` and keeps both filter values."""
    filters = [
        {"key": "tag", "op": "=", "value": "alpha"},
        {"key": "tag", "op": "=", "value": "beta"},
    ]
    expression = build_infinity_filter(filters, logic="or")
    assert " OR " in expression
    assert "alpha" in expression
    assert "beta" in expression
def test_empty_filter_list_returns_1eq1():
    """An empty filter list yields the always-true expression ``1=1``."""
    expression = build_infinity_filter([], "and")
    assert expression == "1=1"
def test_infinity_equal_string_uses_lowercase(infinity_translator):
    """String equality translates to ``JSON_CONTAINS`` with a JSON-quoted value.

    NOTE(review): the test name says "lowercase", but the expected expression
    keeps the original casing (``Alpha``) — confirm which behaviour is intended.
    """
    expected = "JSON_CONTAINS(meta_fields, '$.tag', '\"Alpha\"')"
    got = infinity_translator.translate({"key": "tag", "op": "=", "value": "Alpha"})
    assert got == expected
def test_infinity_equal_numeric_keeps_number(infinity_translator):
    """A numeric-looking value is emitted as a bare number, not a quoted string."""
    expected = "JSON_CONTAINS(meta_fields, '$.score', 5)"
    got = infinity_translator.translate({"key": "score", "op": "=", "value": "5"})
    assert got == expected
def test_infinity_equal_date_passes_unparsed(infinity_translator):
    """An ISO date value is emitted as a JSON-quoted string, unchanged."""
    expected = "JSON_CONTAINS(meta_fields, '$.published', '\"2025-01-15\"')"
    got = infinity_translator.translate({"key": "published", "op": "=", "value": "2025-01-15"})
    assert got == expected
def test_infinity_not_equal_string(infinity_translator):
    """``≠`` on a string yields a negated ``JSON_CONTAINS`` mentioning the value."""
    cond = infinity_translator.translate({"key": "tag", "op": "≠", "value": "alpha"})
    assert "JSON_CONTAINS" in cond
    assert "alpha" in cond
    assert "NOT" in cond
def test_infinity_not_equal_numeric(infinity_translator):
    """``≠`` on a number yields a negated ``JSON_CONTAINS`` mentioning the number."""
    cond = infinity_translator.translate({"key": "score", "op": "≠", "value": "5"})
    assert "JSON_CONTAINS" in cond and "NOT" in cond and "5" in cond
@pytest.mark.parametrize("op", [">", "<", "≥", "≤"])
def test_infinity_range_operators_placeholder(infinity_translator, op):
    """Placeholder overwritten below; see ``test_infinity_range_operators``."""
def test_infinity_range_string_value(infinity_translator):
    """``≥`` with a date string yields a ``>=`` comparison that mentions the value."""
    cond = infinity_translator.translate({"key": "published", "op": "≥", "value": "2025-01-15"})
    assert ">=" in cond
    assert "2025-01-15" in cond
def test_infinity_in_csv_lowercased(infinity_translator):
    """A CSV ``in`` value yields ``JSON_CONTAINS`` with lower-cased members."""
    expression = infinity_translator.translate({"key": "status", "op": "in", "value": "Active,Pending"})
    for fragment in ("JSON_CONTAINS", "active", "pending"):
        assert fragment in expression
def test_infinity_in_python_list(infinity_translator):
    """A Python-list-literal ``in`` value yields ``JSON_CONTAINS`` with lower-cased members."""
    expression = infinity_translator.translate({"key": "status", "op": "in", "value": "['Open', 'Closed']"})
    for fragment in ("JSON_CONTAINS", "open", "closed"):
        assert fragment in expression
def test_infinity_in_numeric_members(infinity_translator):
    """Numeric ``in`` members each appear in the ``JSON_CONTAINS`` expression."""
    expression = infinity_translator.translate({"key": "year", "op": "in", "value": "[2024, 2025]"})
    for fragment in ("JSON_CONTAINS", "2024", "2025"):
        assert fragment in expression
def test_infinity_not_in_csv(infinity_translator):
    """``not in`` yields an expression containing ``NOT JSON_CONTAINS``."""
    not_in_filter = {"key": "status", "op": "not in", "value": "active,pending"}
    expression = infinity_translator.translate(not_in_filter)
    assert "NOT JSON_CONTAINS" in expression
def test_infinity_contains_uses_JSON_CONTAINS(infinity_translator):
    """Infinity ``contains`` is expressed with ``JSON_CONTAINS`` and carries the value."""
    expression = infinity_translator.translate({"key": "version", "op": "contains", "value": "earth"})
    for fragment in ("JSON_CONTAINS", "earth"):
        assert fragment in expression
def test_infinity_contains_escapes_quotes(infinity_translator):
    """``%`` and ``_`` in a ``contains`` value appear verbatim in the ``JSON_CONTAINS`` call.

    NOTE(review): the test name mentions quote escaping, but the value used here
    contains no quotes — confirm whether a quote-bearing input should be covered.
    """
    expression = infinity_translator.translate({"key": "title", "op": "contains", "value": "a%b_c"})
    for fragment in ("JSON_CONTAINS", "a%b_c"):
        assert fragment in expression
def test_infinity_not_contains_uses_JSON_CONTAINS(infinity_translator):
    """Infinity ``not contains`` uses ``JSON_CONTAINS`` together with a negation."""
    expression = infinity_translator.translate({"key": "version", "op": "not contains", "value": "earth"})
    assert "JSON_CONTAINS" in expression
    # A case-insensitive search for "not" also covers the upper-case ``NOT`` form.
    assert "not" in expression.lower()
def test_infinity_start_with(infinity_translator):
    """``start with`` yields a ``LIKE`` whose pattern begins ``'pre%``."""
    expression = infinity_translator.translate({"key": "name", "op": "start with", "value": "pre"})
    for fragment in ("LIKE", "'pre%"):
        assert fragment in expression
def test_infinity_end_with(infinity_translator):
    """``end with`` yields a ``LIKE`` whose pattern is ``%`` followed by the value."""
    expression = infinity_translator.translate({"key": "file", "op": "end with", "value": ".pdf"})
    for fragment in ("LIKE", "%.pdf"):
        assert fragment in expression
def test_infinity_empty(infinity_translator):
    """``empty`` yields a ``JSON_EXTRACT_STRING`` expression containing ``""``."""
    expression = infinity_translator.translate({"key": "notes", "op": "empty", "value": ""})
    for fragment in ("JSON_EXTRACT_STRING", '""'):
        assert fragment in expression
def test_infinity_not_empty(infinity_translator):
    """``not empty`` yields a ``JSON_EXTRACT_STRING`` expression using ``!=``."""
    expression = infinity_translator.translate({"key": "notes", "op": "not empty", "value": ""})
    for fragment in ("JSON_EXTRACT_STRING", "!="):
        assert fragment in expression
def test_infinity_unknown_operator_raises(infinity_translator):
    """An unrecognised operator raises ``ValueError`` whose message names the operator."""
    bad_filter = {"key": "tag", "op": "regex", "value": "^foo"}
    with pytest.raises(ValueError) as exc:
        infinity_translator.translate(bad_filter)
    message = str(exc.value)
    assert "regex" in message
def test_infinity_missing_key_raises(infinity_translator):
    """A filter without a ``key`` entry raises ``ValueError``."""
    keyless_filter = {"op": "=", "value": "x"}
    with pytest.raises(ValueError):
        infinity_translator.translate(keyless_filter)
def test_infinity_invalid_key_format_raises(infinity_translator):
    """A key containing ``;`` is rejected with an "invalid key format" error."""
    bad_key_filter = {"key": "a;b", "op": "=", "value": "x"}
    with pytest.raises(ValueError, match="invalid key format"):
        infinity_translator.translate(bad_key_filter)
def test_infinity_key_with_brace_raises(infinity_translator):
    """A key containing ``$`` and ``}`` is rejected with an "invalid key format" error."""
    bad_key_filter = {"key": "field$}", "op": "=", "value": "x"}
    with pytest.raises(ValueError, match="invalid key format"):
        infinity_translator.translate(bad_key_filter)
def test_infinity_scalar_op_with_list_value_raises(infinity_translator):
    """``=`` given a list value raises ``ValueError``."""
    list_valued = {"key": "tag", "op": "=", "value": ["a", "b"]}
    with pytest.raises(ValueError):
        infinity_translator.translate(list_valued)
def test_infinity_string_op_with_empty_value_raises(infinity_translator):
    """``contains`` with an empty string value raises ``ValueError``."""
    blank_contains = {"key": "tag", "op": "contains", "value": ""}
    with pytest.raises(ValueError):
        infinity_translator.translate(blank_contains)
def test_infinity_membership_with_empty_csv_raises(infinity_translator):
    """``in`` with an empty member string raises ``ValueError``."""
    blank_membership = {"key": "tag", "op": "in", "value": ""}
    with pytest.raises(ValueError):
        infinity_translator.translate(blank_membership)