2025-12-12 17:12:38 +08:00
|
|
|
#
|
|
|
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
|
|
|
#
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
|
#
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
#
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
|
# limitations under the License.
|
|
|
|
|
#
|
2025-12-30 12:56:48 +09:00
|
|
|
import ast
|
2025-12-17 16:50:36 +08:00
|
|
|
import logging
|
|
|
|
|
from typing import Any, Callable, Dict
|
|
|
|
|
|
|
|
|
|
import json_repair
|
2025-12-12 17:12:38 +08:00
|
|
|
|
2026-05-18 14:22:04 +08:00
|
|
|
|
2025-12-12 17:12:38 +08:00
|
|
|
def convert_conditions(metadata_condition):
    """Translate an API ``metadata_condition`` payload into internal filter dicts.

    Args:
        metadata_condition: Dict carrying a ``"conditions"`` list, or ``None``.
            Each condition must provide ``"name"``, ``"comparison_operator"``
            and ``"value"``.

    Returns:
        A list of ``{"op", "key", "value"}`` dicts. Operator aliases
        (``is``, ``not is``, ``>=``, ``<=``, ``!=``) are rewritten to the
        symbols ``=``, ``≠``, ``≥``, ``≤``; any other operator is passed
        through unchanged. A missing/empty payload or a missing/null
        ``"conditions"`` entry yields ``[]``.

    Raises:
        KeyError: If a condition lacks one of the three required keys.
    """
    op_mapping = {
        "is": "=",
        "not is": "≠",
        ">=": "≥",
        "<=": "≤",
        "!=": "≠",
    }
    # ``or []`` also covers an explicit ``"conditions": None`` (JSON null),
    # which ``.get("conditions", [])`` would hand back as None and crash on.
    conditions = (metadata_condition or {}).get("conditions") or []
    return [
        {
            "op": op_mapping.get(cond["comparison_operator"], cond["comparison_operator"]),
            "key": cond["name"],
            "value": cond["value"],
        }
        for cond in conditions
    ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def meta_filter(metas: dict, filters: list[dict], logic: str = "and") -> list:
    """Evaluate metadata filters in memory and return the matching doc ids.

    Args:
        metas: Mapping of metadata key -> {metadata value -> iterable of doc ids}.
        filters: Filter dicts, each with ``"key"``, ``"op"`` and ``"value"``.
        logic: ``"and"`` intersects the per-filter results; any other value
            unions them.

    Returns:
        A de-duplicated list of doc ids (order unspecified). A filter whose key
        is absent from ``metas`` contributes no ids. Under ``"and"`` logic the
        function returns ``[]`` as soon as the running intersection is empty.
    """
    comparison_ops = ("=", "≠", ">", "<", "≥", "≤")

    def normalize_string_values(value):
        """Lower-case a string, or the string items of a list; pass through anything else."""
        if isinstance(value, str):
            return value.lower()
        if isinstance(value, list):
            return [item.lower() if isinstance(item, str) else item for item in value]
        return value

    def is_iso_date(text):
        """Return True when ``text`` has the strict ``YYYY-MM-DD`` shape (10 chars)."""
        return (
            len(text) == 10
            and text[4] == "-"
            and text[7] == "-"
            and text[:4].isdigit()
            and text[5:7].isdigit()
            and text[8:10].isdigit()
        )

    def filter_out(v2docs, operator, value):
        """Collect the doc ids of every metadata value that satisfies ``operator``/``value``."""
        ids = []
        for raw, docids in v2docs.items():
            # Work on per-record copies. Rebinding ``value`` itself would leak
            # one record's coercion (e.g. "5" -> 5) into the next record and
            # make the outcome depend on dict iteration order.
            lhs, rhs = raw, value

            if operator in comparison_ops:
                rhs_str = str(rhs).strip()
                if is_iso_date(rhs_str):
                    # Date query: only records stored as YYYY-MM-DD can match;
                    # zero-padded ISO dates order correctly as plain strings.
                    lhs_str = str(lhs).strip()
                    if not is_iso_date(lhs_str):
                        continue
                    lhs, rhs = lhs_str, rhs_str
                else:
                    # Best-effort literal coercion (numbers, etc.). The query
                    # value is only coerced when the record value parsed, and a
                    # parse failure simply leaves the remaining operands as-is.
                    try:
                        if isinstance(lhs, list):
                            lhs = lhs[0]
                        lhs = ast.literal_eval(lhs)
                        rhs = ast.literal_eval(rhs)
                    except Exception:
                        pass

                    # String comparisons are case-insensitive.
                    if isinstance(lhs, str):
                        lhs = lhs.lower()
                    if isinstance(rhs, str):
                        rhs = rhs.lower()
            else:
                # Non-comparison operators only need case normalisation.
                lhs = normalize_string_values(lhs)
                rhs = normalize_string_values(rhs)

            matched = False
            try:
                if operator == "contains":
                    if isinstance(lhs, list):
                        matched = any(str(i).find(rhs) >= 0 for i in lhs)
                    else:
                        matched = str(lhs).find(rhs) >= 0
                elif operator == "not contains":
                    if isinstance(lhs, list):
                        matched = all(str(i).find(rhs) == -1 for i in lhs)
                    else:
                        matched = str(lhs).find(rhs) == -1
                elif operator == "in":
                    matched = all(i in rhs for i in lhs) if isinstance(lhs, list) else lhs in rhs
                elif operator == "not in":
                    matched = all(i not in rhs for i in lhs) if isinstance(lhs, list) else lhs not in rhs
                elif operator == "start with":
                    if isinstance(lhs, list):
                        haystack = "".join(str(i).lower() for i in lhs)
                    else:
                        haystack = str(lhs).lower()
                    matched = haystack.startswith(str(rhs).lower())
                elif operator == "end with":
                    if isinstance(lhs, list):
                        haystack = "".join(str(i).lower() for i in lhs)
                    else:
                        haystack = str(lhs).lower()
                    matched = haystack.endswith(str(rhs).lower())
                elif operator == "empty":
                    matched = not lhs
                elif operator == "not empty":
                    matched = bool(lhs)
                elif operator == "=":
                    matched = lhs == rhs
                elif operator == "≠":
                    matched = lhs != rhs
                elif operator == ">":
                    matched = lhs > rhs
                elif operator == "<":
                    matched = lhs < rhs
                elif operator == "≥":
                    matched = lhs >= rhs
                elif operator == "≤":
                    matched = lhs <= rhs
            except Exception as exc:
                # Incomparable operands (e.g. int vs str) count as "no match";
                # keep the best-effort behaviour but leave a trace for debugging.
                logging.debug("meta_filter: operator %r failed on %r vs %r: %s", operator, lhs, rhs, exc)

            if matched:
                ids.extend(docids)
        return ids

    doc_ids = None
    for f in filters:
        key = f["key"]
        # A key that is absent from metas is an explicit "no match", so that
        # AND logic still narrows the result instead of skipping the filter.
        ids = filter_out(metas[key], f["op"], f["value"]) if key in metas else []

        if doc_ids is None:
            doc_ids = set(ids)
        elif logic == "and":
            doc_ids &= set(ids)
            if not doc_ids:
                logging.debug(f"meta_filter filters={filters}, logic={logic}, early return []")
                return []
        else:
            doc_ids |= set(ids)
    return list(doc_ids or [])
|
2025-12-12 17:12:38 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
async def apply_meta_data_filter(
|
2026-05-18 14:22:04 +08:00
|
|
|
meta_data_filter: dict | None,
|
|
|
|
|
metas: dict | None = None,
|
|
|
|
|
question: str = "",
|
|
|
|
|
chat_mdl: Any = None,
|
|
|
|
|
base_doc_ids: list[str] | None = None,
|
|
|
|
|
manual_value_resolver: Callable[[dict], dict] | None = None,
|
|
|
|
|
kb_ids: list[str] | None = None,
|
|
|
|
|
metas_loader: Callable[[], dict] | None = None,
|
2025-12-12 17:12:38 +08:00
|
|
|
) -> list[str] | None:
|
|
|
|
|
"""
|
|
|
|
|
Apply metadata filtering rules and return the filtered doc_ids.
|
|
|
|
|
|
|
|
|
|
meta_data_filter supports three modes:
|
|
|
|
|
- auto: generate filter conditions via LLM (gen_meta_filter)
|
|
|
|
|
- semi_auto: generate conditions using selected metadata keys only
|
|
|
|
|
- manual: directly filter based on provided conditions
|
|
|
|
|
|
2026-05-18 14:22:04 +08:00
|
|
|
When ``kb_ids`` is supplied, metadata filters are pushed down to the doc metadata
|
|
|
|
|
index (ES/Infinity) via ``DocMetadataService.filter_doc_ids_by_metadata`` instead
|
|
|
|
|
of being evaluated in Python over ``metas``. The in-memory ``meta_filter`` path
|
|
|
|
|
remains the fallback so callers without a KB scope, or backends without push-down
|
|
|
|
|
support, behave exactly as before.
|
Perf: push metadata filters down to Elasticsearch (#14576)
### What problem does this PR solve?
Fixes #14412.
`common.metadata_utils.meta_filter` evaluates user-defined metadata
conditions in Python after `DocMetadataService.get_flatted_meta_by_kbs`
loads the entire `meta_fields` table into memory. Past a few thousand
documents per knowledge base this becomes a memory bottleneck and a
wasted ES round-trip — every filter request currently fetches up to
10000 metadata rows even when the resulting `doc_ids` list is tiny.
This PR adds an ES push-down path that translates the same filter
language into a `bool` query and returns just the matching document IDs.
**Changes**
- `common/metadata_es_filter.py` *(new)*: pure-Python translator from
the RAGflow filter list to ES DSL. Covers every operator the in-memory
path supports (`=`, `≠`, `>`, `<`, `≥`, `≤`, `in`, `not in`, `contains`,
`not contains`, `start with`, `end with`, `empty`, `not empty`) with
`case_insensitive: true` on `prefix` and `wildcard` for parity with the
existing lower-cased Python comparisons. User wildcard metacharacters
are escaped before being injected into `wildcard` patterns. Negative
operators (`≠`, `not in`, `not contains`, ranges) are wrapped with an
`exists` guard so they do not accidentally match documents missing the
key, matching the legacy `if k not in metas` behaviour.
- `api/db/services/doc_metadata_service.py`: new
`DocMetadataService.filter_doc_ids_by_meta_pushdown(kb_ids, filters,
logic)` that returns the doc IDs ES matched, or `None` to signal the
caller should fall back to the in-memory path. Returns `None` when the
active doc store is Infinity (`meta_fields` is a JSON column, not a
dotted-object mapping), when any filter cannot be expressed in DSL
(`UnsupportedMetaFilter`), or when the ES request or metadata index
lookup errors.
- `common/metadata_utils.py`: `apply_meta_data_filter` accepts an
optional `kb_ids` argument. When supplied, conditions go through
push-down first via a new `_try_meta_pushdown` helper; on `None` the
function falls back to the original `meta_filter` call. Default
behaviour is unchanged for callers that don't pass `kb_ids`.
- Updated all four callers (`agent/tools/retrieval.py`,
`api/db/services/dialog_service.py` ×2,
`api/apps/services/dataset_api_service.py`, `api/apps/sdk/session.py`)
to forward `kb_ids` so the push-down path is exercised in production.
- `test/unit_test/common/test_metadata_es_filter.py` *(new)*: 35 unit
tests covering every operator's DSL shape, value coercion
(`ast.literal_eval`, lowercasing, ISO-date pass-through), wildcard
escaping, OR-logic wrapping that protects negative clauses, and the
doc-ID extractor.
**Behaviour preserved**
- The in-memory `meta_filter` is untouched and still services every
fallback case (Infinity backend, unknown operators, ES outages).
- The eligibility / credibility / issue-multiplier semantics described
in the LLM-driven `auto` and `semi_auto` modes still hand the LLM the
full in-memory `metas` dict to choose conditions from. Only the
*evaluation* of those generated conditions is pushed down.
- Existing tests in
`test/unit_test/common/test_metadata_filter_operators.py` continue to
pass (14/14).
**Test plan**
- `pytest test/unit_test/common/test_metadata_es_filter.py` — 35 passed.
- `pytest test/unit_test/common/test_metadata_filter_operators.py` — 14
passed.
- `ruff check` clean on every modified file.
- Reviewer please validate the ES query shapes against a live cluster —
particularly `case_insensitive` on `wildcard` and `prefix` (requires ES
7.10+) and the `exists` + `must_not` pairing for `≠`.
**Notes**
- The first cut caps each push-down request at 10000 results, matching
the existing `get_flatted_meta_by_kbs` limit, and logs a warning when
the cap is hit. A `search_after` follow-up would let us drop the cap
entirely once the push-down path is validated.
- Operator parity with the in-memory path is exact for the canonical
unicode operators (`≥`, `≤`, `≠`) used internally; the ASCII aliases
(`>=`, `<=`, `!=`) are normalised by `convert_conditions` before they
reach the translator.
### Type of change
- [x] Performance Improvement
---------
Co-authored-by: sxxtony <sxxtony@users.noreply.github.com>
2026-05-07 16:23:43 +03:00
|
|
|
|
|
|
|
|
``metas`` may be supplied eagerly or via ``metas_loader``. The loader is
|
|
|
|
|
only invoked when the metadata dict is actually needed — i.e. for the LLM
|
|
|
|
|
context in ``auto`` / ``semi_auto`` modes, or as the in-memory fallback
|
|
|
|
|
when push-down can't service a request. ``manual`` mode that lands on the
|
|
|
|
|
push-down path therefore skips the expensive
|
|
|
|
|
``get_flatted_meta_by_kbs`` round-trip entirely.
|
|
|
|
|
|
2025-12-12 17:12:38 +08:00
|
|
|
Returns:
|
|
|
|
|
list of doc_ids, ["-999"] when manual filters yield no result, or None
|
|
|
|
|
when auto/semi_auto filters return empty.
|
|
|
|
|
"""
|
2026-05-18 14:22:04 +08:00
|
|
|
from rag.prompts.generator import gen_meta_filter # move from the top of the file to avoid circular import
|
2026-01-29 02:59:48 +01:00
|
|
|
|
2025-12-12 17:12:38 +08:00
|
|
|
doc_ids = list(base_doc_ids) if base_doc_ids else []
|
|
|
|
|
|
|
|
|
|
if not meta_data_filter:
|
|
|
|
|
return doc_ids
|
|
|
|
|
|
|
|
|
|
method = meta_data_filter.get("method")
|
|
|
|
|
|
Perf: push metadata filters down to Elasticsearch (#14576)
### What problem does this PR solve?
Fixes #14412.
`common.metadata_utils.meta_filter` evaluates user-defined metadata
conditions in Python after `DocMetadataService.get_flatted_meta_by_kbs`
loads the entire `meta_fields` table into memory. Past a few thousand
documents per knowledge base this becomes a memory bottleneck and a
wasted ES round-trip — every filter request currently fetches up to
10000 metadata rows even when the resulting `doc_ids` list is tiny.
This PR adds an ES push-down path that translates the same filter
language into a `bool` query and returns just the matching document IDs.
**Changes**
- `common/metadata_es_filter.py` *(new)*: pure-Python translator from
the RAGflow filter list to ES DSL. Covers every operator the in-memory
path supports (`=`, `≠`, `>`, `<`, `≥`, `≤`, `in`, `not in`, `contains`,
`not contains`, `start with`, `end with`, `empty`, `not empty`) with
`case_insensitive: true` on `prefix` and `wildcard` for parity with the
existing lower-cased Python comparisons. User wildcard metacharacters
are escaped before being injected into `wildcard` patterns. Negative
operators (`≠`, `not in`, `not contains`, ranges) are wrapped with an
`exists` guard so they do not accidentally match documents missing the
key, matching the legacy `if k not in metas` behaviour.
- `api/db/services/doc_metadata_service.py`: new
`DocMetadataService.filter_doc_ids_by_meta_pushdown(kb_ids, filters,
logic)` that returns the doc IDs ES matched, or `None` to signal the
caller should fall back to the in-memory path. Returns `None` when the
active doc store is Infinity (`meta_fields` is a JSON column, not a
dotted-object mapping), when any filter cannot be expressed in DSL
(`UnsupportedMetaFilter`), or when the ES request or metadata index
lookup errors.
- `common/metadata_utils.py`: `apply_meta_data_filter` accepts an
optional `kb_ids` argument. When supplied, conditions go through
push-down first via a new `_try_meta_pushdown` helper; on `None` the
function falls back to the original `meta_filter` call. Default
behaviour is unchanged for callers that don't pass `kb_ids`.
- Updated all four callers (`agent/tools/retrieval.py`,
`api/db/services/dialog_service.py` ×2,
`api/apps/services/dataset_api_service.py`, `api/apps/sdk/session.py`)
to forward `kb_ids` so the push-down path is exercised in production.
- `test/unit_test/common/test_metadata_es_filter.py` *(new)*: 35 unit
tests covering every operator's DSL shape, value coercion
(`ast.literal_eval`, lowercasing, ISO-date pass-through), wildcard
escaping, OR-logic wrapping that protects negative clauses, and the
doc-ID extractor.
**Behaviour preserved**
- The in-memory `meta_filter` is untouched and still services every
fallback case (Infinity backend, unknown operators, ES outages).
- The eligibility / credibility / issue-multiplier semantics described
in the LLM-driven `auto` and `semi_auto` modes still hand the LLM the
full in-memory `metas` dict to choose conditions from. Only the
*evaluation* of those generated conditions is pushed down.
- Existing tests in
`test/unit_test/common/test_metadata_filter_operators.py` continue to
pass (14/14).
**Test plan**
- `pytest test/unit_test/common/test_metadata_es_filter.py` — 35 passed.
- `pytest test/unit_test/common/test_metadata_filter_operators.py` — 14
passed.
- `ruff check` clean on every modified file.
- Reviewer please validate the ES query shapes against a live cluster —
particularly `case_insensitive` on `wildcard` and `prefix` (requires ES
7.10+) and the `exists` + `must_not` pairing for `≠`.
**Notes**
- The first cut caps each push-down request at 10000 results, matching
the existing `get_flatted_meta_by_kbs` limit, and logs a warning when
the cap is hit. A `search_after` follow-up would let us drop the cap
entirely once the push-down path is validated.
- Operator parity with the in-memory path is exact for the canonical
unicode operators (`≥`, `≤`, `≠`) used internally; the ASCII aliases
(`>=`, `<=`, `!=`) are normalised by `convert_conditions` before they
reach the translator.
### Type of change
- [x] Performance Improvement
---------
Co-authored-by: sxxtony <sxxtony@users.noreply.github.com>
2026-05-07 16:23:43 +03:00
|
|
|
# Memoised metadata loader. ``_get_metas`` materialises the dict at most
|
|
|
|
|
# once per call; downstream branches that never reach an in-memory eval
|
|
|
|
|
# leave the loader untouched.
|
|
|
|
|
cached_metas: dict | None = metas
|
|
|
|
|
|
|
|
|
|
def _get_metas() -> dict:
|
|
|
|
|
nonlocal cached_metas
|
|
|
|
|
if cached_metas is None:
|
|
|
|
|
cached_metas = metas_loader() if metas_loader else {}
|
|
|
|
|
return cached_metas
|
|
|
|
|
|
2026-05-18 14:22:04 +08:00
|
|
|
def _run_metadata_filter(conditions: list[dict], logic: str) -> list[str]:
    """Run conditions through ES/Infinity push-down when possible, in-memory otherwise.

    Args:
        conditions: Filter conditions to evaluate.
        logic: How the conditions are combined; forwarded unchanged to
            whichever evaluation path ends up handling the request.

    Returns:
        The value returned by
        ``DocMetadataService.filter_doc_ids_by_meta_pushdown`` when it is not
        ``None``; otherwise the result of ``meta_filter`` applied to the
        lazily loaded metadata from ``_get_metas()``.
    """
    # Push-down is only attempted when there is something to filter on and
    # the caller supplied knowledge-base ids.
    if conditions and kb_ids:
        try:
            # Function-scope import kept from the original code
            # (presumably avoids an import cycle with the api package — TODO confirm).
            from api.db.services.doc_metadata_service import DocMetadataService

            pushed_down_ids = DocMetadataService.filter_doc_ids_by_meta_pushdown(kb_ids, conditions, logic)
            logging.debug(f"Doc ids filtered by metadata: {pushed_down_ids}")
            # ``None`` is the signal to fall back; an empty list is a valid answer.
            if pushed_down_ids is not None:
                return pushed_down_ids
        except Exception as e:
            # Best-effort: a push-down failure must not break filtering, so we
            # degrade to the in-memory path. ``logging.exception`` records the
            # traceback (still at ERROR level) so the failure can be diagnosed.
            logging.exception(f"Metadata filter push down errored: {e}")

    # In-memory fallback
    logging.debug("Metadata filter falls back to in-memory filter")
    return meta_filter(_get_metas(), conditions, logic)
|
|
|
|
|
|
2025-12-12 17:12:38 +08:00
|
|
|
if method == "auto":
|
Perf: push metadata filters down to Elasticsearch (#14576)
### What problem does this PR solve?
Fixes #14412.
`common.metadata_utils.meta_filter` evaluates user-defined metadata
conditions in Python after `DocMetadataService.get_flatted_meta_by_kbs`
loads the entire `meta_fields` table into memory. Past a few thousand
documents per knowledge base this becomes a memory bottleneck and a
wasted ES round-trip — every filter request currently fetches up to
10000 metadata rows even when the resulting `doc_ids` list is tiny.
This PR adds an ES push-down path that translates the same filter
language into a `bool` query and returns just the matching document IDs.
**Changes**
- `common/metadata_es_filter.py` *(new)*: pure-Python translator from
the RAGFlow filter list to ES DSL. Covers every operator the in-memory
path supports (`=`, `≠`, `>`, `<`, `≥`, `≤`, `in`, `not in`, `contains`,
`not contains`, `start with`, `end with`, `empty`, `not empty`) with
`case_insensitive: true` on `prefix` and `wildcard` for parity with the
existing lower-cased Python comparisons. User wildcard metacharacters
are escaped before being injected into `wildcard` patterns. Negative
operators (`≠`, `not in`, `not contains`, ranges) are wrapped with an
`exists` guard so they do not accidentally match documents missing the
key, matching the legacy `if k not in metas` behaviour.
- `api/db/services/doc_metadata_service.py`: new
`DocMetadataService.filter_doc_ids_by_meta_pushdown(kb_ids, filters,
logic)` that returns the doc IDs ES matched, or `None` to signal the
caller should fall back to the in-memory path. Returns `None` when the
active doc store is Infinity (`meta_fields` is a JSON column, not a
dotted-object mapping), when any filter cannot be expressed in DSL
(`UnsupportedMetaFilter`), or when the ES request or metadata index
lookup errors.
- `common/metadata_utils.py`: `apply_meta_data_filter` accepts an
optional `kb_ids` argument. When supplied, conditions go through
push-down first via a new `_try_meta_pushdown` helper; on `None` the
function falls back to the original `meta_filter` call. Default
behaviour is unchanged for callers that don't pass `kb_ids`.
- Updated all four callers (`agent/tools/retrieval.py`,
`api/db/services/dialog_service.py` ×2,
`api/apps/services/dataset_api_service.py`, `api/apps/sdk/session.py`)
to forward `kb_ids` so the push-down path is exercised in production.
- `test/unit_test/common/test_metadata_es_filter.py` *(new)*: 35 unit
tests covering every operator's DSL shape, value coercion
(`ast.literal_eval`, lowercasing, ISO-date pass-through), wildcard
escaping, OR-logic wrapping that protects negative clauses, and the
doc-ID extractor.
**Behaviour preserved**
- The in-memory `meta_filter` is untouched and still services every
fallback case (Infinity backend, unknown operators, ES outages).
- The eligibility / credibility / issue-multiplier semantics described
in the LLM-driven `auto` and `semi_auto` modes still hand the LLM the
full in-memory `metas` dict to choose conditions from. Only the
*evaluation* of those generated conditions is pushed down.
- Existing tests in
`test/unit_test/common/test_metadata_filter_operators.py` continue to
pass (14/14).
**Test plan**
- `pytest test/unit_test/common/test_metadata_es_filter.py` — 35 passed.
- `pytest test/unit_test/common/test_metadata_filter_operators.py` — 14
passed.
- `ruff check` clean on every modified file.
- Reviewer please validate the ES query shapes against a live cluster —
particularly `case_insensitive` on `wildcard` and `prefix` (requires ES
7.10+) and the `exists` + `must_not` pairing for `≠`.
**Notes**
- The first cut caps each push-down request at 10000 results, matching
the existing `get_flatted_meta_by_kbs` limit, and logs a warning when
the cap is hit. A `search_after` follow-up would let us drop the cap
entirely once the push-down path is validated.
- Operator parity with the in-memory path is exact for the canonical
unicode operators (`≥`, `≤`, `≠`) used internally; the ASCII aliases
(`>=`, `<=`, `!=`) are normalised by `convert_conditions` before they
reach the translator.
### Type of change
- [x] Performance Improvement
---------
Co-authored-by: sxxtony <sxxtony@users.noreply.github.com>
2026-05-07 16:23:43 +03:00
|
|
|
filters: dict = await gen_meta_filter(chat_mdl, _get_metas(), question)
|
2026-05-18 14:22:04 +08:00
|
|
|
logging.debug(f"Metadata filter(auto) generated: {filters}")
|
|
|
|
|
doc_ids.extend(_run_metadata_filter(filters["conditions"], filters.get("logic", "and")))
|
2025-12-12 17:12:38 +08:00
|
|
|
if not doc_ids:
|
|
|
|
|
return None
|
|
|
|
|
elif method == "semi_auto":
|
Support operator constraints in semi-automatic metadata filtering (#12956)
### What problem does this PR solve?
#### Summary
This PR enhances the Semi-automatic metadata filtering mode by allowing
users to explicitly pre-define operators (e.g., contains, =, >, etc.)
for selected metadata keys. While the LLM still dynamically extracts the
filter value from the user's query, it is now strictly constrained to
use the operator specified in the UI configuration.
Using this feature is optional. By default, the operator selection is set
to "automatic", so the LLM chooses the operator itself (the current
behaviour).
#### Rationale & Use Case
This enhancement was driven by a concrete challenge I encountered while
working with technical documentation.
In my specific use case, I was trying to filter for software versions
within a technical manual. In this dataset, a single document chunk
often applies to multiple software versions. These versions are stored
as a combined string within the metadata for each chunk.
When using the standard semi-automatic filter, the LLM would
inconsistently choose between the contains and equals operators. When it
chose equals, it would exclude every chunk that applied to more than one
version, even if the version I was searching for was clearly included in
that metadata string. This led to incomplete and frustrating retrieval
results.
By extending the semi-automatic filter to allow pre-defining the
operator for a specific key, I was able to force the use of contains for
the version field. This change immediately led to significantly improved
and more reliable results in my case.
I believe this functionality will be equally useful for others dealing
with "tagged" or multi-value metadata where the relationship between the
query and the field is known, but the specific value needs to remain
dynamic.
#### Key Changes
##### Backend & Core Logic
- `common/metadata_utils.py`: Updated apply_meta_data_filter to support
a mixed data structure for semi_auto (handling both legacy string arrays
and the new object-based format {"key": "...", "op": "..."}).
- `rag/prompts/generator.py`: Extended gen_meta_filter to accept and
pass operator constraints to the LLM.
- `rag/prompts/meta_filter.md`: Updated the system prompt to instruct
the LLM to strictly respect provided operator constraints.
##### Frontend
- `web/src/components/metadata-filter/metadata-semi-auto-fields.tsx`:
Enhanced the UI to include an operator dropdown for each selected
metadata key, utilizing existing operator constants.
- `web/src/components/metadata-filter/index.tsx`: Updated the validation
schema to accommodate the new state structure.
#### Test Plan
- Backward Compatibility: Verified that existing semi-auto filters
stored as simple strings still function correctly.
- Prompt Verification: Confirmed that constraints are correctly rendered
in the LLM system prompt when specified.
- Added unit tests as
`test/unit_test/common/test_apply_semi_auto_meta_data_filter.py`
- Manual End-to-End:
- Configured a "Semi-automatic" filter for a "Version" key with the
"contains" operator.
- Asked a version-specific query.
- Result
<img width="1173" height="704" alt="Screenshot 2026-02-02 145359"
src="https://github.com/user-attachments/assets/510a6a61-a231-4dc2-a7fe-cdfc07219132"
/>
### Type of change
- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
---------
Co-authored-by: Philipp Heyken Soares <philipp.heyken-soares@am.ai>
2026-02-03 04:11:34 +01:00
|
|
|
selected_keys = []
|
|
|
|
|
constraints = {}
|
|
|
|
|
for item in meta_data_filter.get("semi_auto", []):
|
|
|
|
|
if isinstance(item, str):
|
|
|
|
|
selected_keys.append(item)
|
|
|
|
|
elif isinstance(item, dict):
|
|
|
|
|
key = item.get("key")
|
|
|
|
|
op = item.get("op")
|
|
|
|
|
selected_keys.append(key)
|
|
|
|
|
if op:
|
|
|
|
|
constraints[key] = op
|
|
|
|
|
|
2025-12-12 17:12:38 +08:00
|
|
|
if selected_keys:
|
Perf: push metadata filters down to Elasticsearch (#14576)
### What problem does this PR solve?
Fixes #14412.
`common.metadata_utils.meta_filter` evaluates user-defined metadata
conditions in Python after `DocMetadataService.get_flatted_meta_by_kbs`
loads the entire `meta_fields` table into memory. Past a few thousand
documents per knowledge base this becomes a memory bottleneck and a
wasted ES round-trip — every filter request currently fetches up to
10000 metadata rows even when the resulting `doc_ids` list is tiny.
This PR adds an ES push-down path that translates the same filter
language into a `bool` query and returns just the matching document IDs.
**Changes**
- `common/metadata_es_filter.py` *(new)*: pure-Python translator from
the RAGFlow filter list to ES DSL. Covers every operator the in-memory
path supports (`=`, `≠`, `>`, `<`, `≥`, `≤`, `in`, `not in`, `contains`,
`not contains`, `start with`, `end with`, `empty`, `not empty`) with
`case_insensitive: true` on `prefix` and `wildcard` for parity with the
existing lower-cased Python comparisons. User wildcard metacharacters
are escaped before being injected into `wildcard` patterns. Negative
operators (`≠`, `not in`, `not contains`, ranges) are wrapped with an
`exists` guard so they do not accidentally match documents missing the
key, matching the legacy `if k not in metas` behaviour.
- `api/db/services/doc_metadata_service.py`: new
`DocMetadataService.filter_doc_ids_by_meta_pushdown(kb_ids, filters,
logic)` that returns the doc IDs ES matched, or `None` to signal the
caller should fall back to the in-memory path. Returns `None` when the
active doc store is Infinity (`meta_fields` is a JSON column, not a
dotted-object mapping), when any filter cannot be expressed in DSL
(`UnsupportedMetaFilter`), or when the ES request or metadata index
lookup errors.
- `common/metadata_utils.py`: `apply_meta_data_filter` accepts an
optional `kb_ids` argument. When supplied, conditions go through
push-down first via a new `_try_meta_pushdown` helper; on `None` the
function falls back to the original `meta_filter` call. Default
behaviour is unchanged for callers that don't pass `kb_ids`.
- Updated all four callers (`agent/tools/retrieval.py`,
`api/db/services/dialog_service.py` ×2,
`api/apps/services/dataset_api_service.py`, `api/apps/sdk/session.py`)
to forward `kb_ids` so the push-down path is exercised in production.
- `test/unit_test/common/test_metadata_es_filter.py` *(new)*: 35 unit
tests covering every operator's DSL shape, value coercion
(`ast.literal_eval`, lowercasing, ISO-date pass-through), wildcard
escaping, OR-logic wrapping that protects negative clauses, and the
doc-ID extractor.
**Behaviour preserved**
- The in-memory `meta_filter` is untouched and still services every
fallback case (Infinity backend, unknown operators, ES outages).
- The eligibility / credibility / issue-multiplier semantics described
in the LLM-driven `auto` and `semi_auto` modes still hand the LLM the
full in-memory `metas` dict to choose conditions from. Only the
*evaluation* of those generated conditions is pushed down.
- Existing tests in
`test/unit_test/common/test_metadata_filter_operators.py` continue to
pass (14/14).
**Test plan**
- `pytest test/unit_test/common/test_metadata_es_filter.py` — 35 passed.
- `pytest test/unit_test/common/test_metadata_filter_operators.py` — 14
passed.
- `ruff check` clean on every modified file.
- Reviewer please validate the ES query shapes against a live cluster —
particularly `case_insensitive` on `wildcard` and `prefix` (requires ES
7.10+) and the `exists` + `must_not` pairing for `≠`.
**Notes**
- The first cut caps each push-down request at 10000 results, matching
the existing `get_flatted_meta_by_kbs` limit, and logs a warning when
the cap is hit. A `search_after` follow-up would let us drop the cap
entirely once the push-down path is validated.
- Operator parity with the in-memory path is exact for the canonical
unicode operators (`≥`, `≤`, `≠`) used internally; the ASCII aliases
(`>=`, `<=`, `!=`) are normalised by `convert_conditions` before they
reach the translator.
### Type of change
- [x] Performance Improvement
---------
Co-authored-by: sxxtony <sxxtony@users.noreply.github.com>
2026-05-07 16:23:43 +03:00
|
|
|
current_metas = _get_metas()
|
|
|
|
|
filtered_metas = {key: current_metas[key] for key in selected_keys if key in current_metas}
|
2025-12-12 17:12:38 +08:00
|
|
|
if filtered_metas:
|
Support operator constraints in semi-automatic metadata filtering (#12956)
### What problem does this PR solve?
#### Summary
This PR enhances the Semi-automatic metadata filtering mode by allowing
users to explicitly pre-define operators (e.g., contains, =, >, etc.)
for selected metadata keys. While the LLM still dynamically extracts the
filter value from the user's query, it is now strictly constrained to
use the operator specified in the UI configuration.
Using this feature is optional. By default, the operator selection is set
to "automatic", so the LLM chooses the operator itself (the current
behaviour).
#### Rationale & Use Case
This enhancement was driven by a concrete challenge I encountered while
working with technical documentation.
In my specific use case, I was trying to filter for software versions
within a technical manual. In this dataset, a single document chunk
often applies to multiple software versions. These versions are stored
as a combined string within the metadata for each chunk.
When using the standard semi-automatic filter, the LLM would
inconsistently choose between the contains and equals operators. When it
chose equals, it would exclude every chunk that applied to more than one
version, even if the version I was searching for was clearly included in
that metadata string. This led to incomplete and frustrating retrieval
results.
By extending the semi-automatic filter to allow pre-defining the
operator for a specific key, I was able to force the use of contains for
the version field. This change immediately led to significantly improved
and more reliable results in my case.
I believe this functionality will be equally useful for others dealing
with "tagged" or multi-value metadata where the relationship between the
query and the field is known, but the specific value needs to remain
dynamic.
#### Key Changes
##### Backend & Core Logic
- `common/metadata_utils.py`: Updated apply_meta_data_filter to support
a mixed data structure for semi_auto (handling both legacy string arrays
and the new object-based format {"key": "...", "op": "..."}).
- `rag/prompts/generator.py`: Extended gen_meta_filter to accept and
pass operator constraints to the LLM.
- `rag/prompts/meta_filter.md`: Updated the system prompt to instruct
the LLM to strictly respect provided operator constraints.
##### Frontend
- `web/src/components/metadata-filter/metadata-semi-auto-fields.tsx`:
Enhanced the UI to include an operator dropdown for each selected
metadata key, utilizing existing operator constants.
- `web/src/components/metadata-filter/index.tsx`: Updated the validation
schema to accommodate the new state structure.
#### Test Plan
- Backward Compatibility: Verified that existing semi-auto filters
stored as simple strings still function correctly.
- Prompt Verification: Confirmed that constraints are correctly rendered
in the LLM system prompt when specified.
- Added unit tests as
`test/unit_test/common/test_apply_semi_auto_meta_data_filter.py`
- Manual End-to-End:
- Configured a "Semi-automatic" filter for a "Version" key with the
"contains" operator.
- Asked a version-specific query.
- Result
<img width="1173" height="704" alt="Screenshot 2026-02-02 145359"
src="https://github.com/user-attachments/assets/510a6a61-a231-4dc2-a7fe-cdfc07219132"
/>
### Type of change
- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
---------
Co-authored-by: Philipp Heyken Soares <philipp.heyken-soares@am.ai>
2026-02-03 04:11:34 +01:00
|
|
|
filters: dict = await gen_meta_filter(chat_mdl, filtered_metas, question, constraints=constraints)
|
2026-05-18 14:22:04 +08:00
|
|
|
logging.debug(f"Metadata filter(semi_auto) generated: {filters}")
|
|
|
|
|
doc_ids.extend(_run_metadata_filter(filters["conditions"], filters.get("logic", "and")))
|
2025-12-12 17:12:38 +08:00
|
|
|
if not doc_ids:
|
|
|
|
|
return None
|
|
|
|
|
elif method == "manual":
|
|
|
|
|
filters = meta_data_filter.get("manual", [])
|
|
|
|
|
if manual_value_resolver:
|
|
|
|
|
filters = [manual_value_resolver(flt) for flt in filters]
|
2026-05-18 14:22:04 +08:00
|
|
|
logging.debug(f"Metadata filter(manual): {filters}")
|
|
|
|
|
doc_ids.extend(_run_metadata_filter(filters, meta_data_filter.get("logic", "and")))
|
2025-12-12 17:12:38 +08:00
|
|
|
if filters and not doc_ids:
|
|
|
|
|
doc_ids = ["-999"]
|
|
|
|
|
|
2026-05-18 14:22:04 +08:00
|
|
|
logging.debug(f"apply_meta_data_filter meta_filter={meta_data_filter}, returning doc_ids={doc_ids}")
|
2025-12-12 17:12:38 +08:00
|
|
|
return doc_ids
|
2025-12-17 16:50:36 +08:00
|
|
|
|
|
|
|
|
|
Perf: push metadata filters down to Elasticsearch (#14576)
### What problem does this PR solve?
Fixes #14412.
`common.metadata_utils.meta_filter` evaluates user-defined metadata
conditions in Python after `DocMetadataService.get_flatted_meta_by_kbs`
loads the entire `meta_fields` table into memory. Past a few thousand
documents per knowledge base this becomes a memory bottleneck and a
wasted ES round-trip — every filter request currently fetches up to
10000 metadata rows even when the resulting `doc_ids` list is tiny.
This PR adds an ES push-down path that translates the same filter
language into a `bool` query and returns just the matching document IDs.
**Changes**
- `common/metadata_es_filter.py` *(new)*: pure-Python translator from
the RAGFlow filter list to ES DSL. Covers every operator the in-memory
path supports (`=`, `≠`, `>`, `<`, `≥`, `≤`, `in`, `not in`, `contains`,
`not contains`, `start with`, `end with`, `empty`, `not empty`) with
`case_insensitive: true` on `prefix` and `wildcard` for parity with the
existing lower-cased Python comparisons. User wildcard metacharacters
are escaped before being injected into `wildcard` patterns. Negative
operators (`≠`, `not in`, `not contains`, ranges) are wrapped with an
`exists` guard so they do not accidentally match documents missing the
key, matching the legacy `if k not in metas` behaviour.
- `api/db/services/doc_metadata_service.py`: new
`DocMetadataService.filter_doc_ids_by_meta_pushdown(kb_ids, filters,
logic)` that returns the doc IDs ES matched, or `None` to signal the
caller should fall back to the in-memory path. Returns `None` when the
active doc store is Infinity (`meta_fields` is a JSON column, not a
dotted-object mapping), when any filter cannot be expressed in DSL
(`UnsupportedMetaFilter`), or when the ES request or metadata index
lookup errors.
- `common/metadata_utils.py`: `apply_meta_data_filter` accepts an
optional `kb_ids` argument. When supplied, conditions go through
push-down first via a new `_try_meta_pushdown` helper; on `None` the
function falls back to the original `meta_filter` call. Default
behaviour is unchanged for callers that don't pass `kb_ids`.
- Updated all four callers (`agent/tools/retrieval.py`,
`api/db/services/dialog_service.py` ×2,
`api/apps/services/dataset_api_service.py`, `api/apps/sdk/session.py`)
to forward `kb_ids` so the push-down path is exercised in production.
- `test/unit_test/common/test_metadata_es_filter.py` *(new)*: 35 unit
tests covering every operator's DSL shape, value coercion
(`ast.literal_eval`, lowercasing, ISO-date pass-through), wildcard
escaping, OR-logic wrapping that protects negative clauses, and the
doc-ID extractor.
**Behaviour preserved**
- The in-memory `meta_filter` is untouched and still services every
fallback case (Infinity backend, unknown operators, ES outages).
- The eligibility / credibility / issue-multiplier semantics described
in the LLM-driven `auto` and `semi_auto` modes still hand the LLM the
full in-memory `metas` dict to choose conditions from. Only the
*evaluation* of those generated conditions is pushed down.
- Existing tests in
`test/unit_test/common/test_metadata_filter_operators.py` continue to
pass (14/14).
**Test plan**
- `pytest test/unit_test/common/test_metadata_es_filter.py` — 35 passed.
- `pytest test/unit_test/common/test_metadata_filter_operators.py` — 14
passed.
- `ruff check` clean on every modified file.
- Reviewer please validate the ES query shapes against a live cluster —
particularly `case_insensitive` on `wildcard` and `prefix` (requires ES
7.10+) and the `exists` + `must_not` pairing for `≠`.
**Notes**
- The first cut caps each push-down request at 10000 results, matching
the existing `get_flatted_meta_by_kbs` limit, and logs a warning when
the cap is hit. A `search_after` follow-up would let us drop the cap
entirely once the push-down path is validated.
- Operator parity with the in-memory path is exact for the canonical
unicode operators (`≥`, `≤`, `≠`) used internally; the ASCII aliases
(`>=`, `<=`, `!=`) are normalised by `convert_conditions` before they
reach the translator.
### Type of change
- [x] Performance Improvement
---------
Co-authored-by: sxxtony <sxxtony@users.noreply.github.com>
2026-05-07 16:23:43 +03:00
|
|
|
def _try_meta_pushdown(
|
2026-05-18 14:22:04 +08:00
|
|
|
kb_ids: list[str],
|
|
|
|
|
conditions: list[dict],
|
|
|
|
|
logic: str,
|
Perf: push metadata filters down to Elasticsearch (#14576)
### What problem does this PR solve?
Fixes #14412.
`common.metadata_utils.meta_filter` evaluates user-defined metadata
conditions in Python after `DocMetadataService.get_flatted_meta_by_kbs`
loads the entire `meta_fields` table into memory. Past a few thousand
documents per knowledge base this becomes a memory bottleneck and a
wasted ES round-trip — every filter request currently fetches up to
10000 metadata rows even when the resulting `doc_ids` list is tiny.
This PR adds an ES push-down path that translates the same filter
language into a `bool` query and returns just the matching document IDs.
**Changes**
- `common/metadata_es_filter.py` *(new)*: pure-Python translator from
the RAGflow filter list to ES DSL. Covers every operator the in-memory
path supports (`=`, `≠`, `>`, `<`, `≥`, `≤`, `in`, `not in`, `contains`,
`not contains`, `start with`, `end with`, `empty`, `not empty`) with
`case_insensitive: true` on `prefix` and `wildcard` for parity with the
existing lower-cased Python comparisons. User wildcard metacharacters
are escaped before being injected into `wildcard` patterns. Negative
operators (`≠`, `not in`, `not contains`, ranges) are wrapped with an
`exists` guard so they do not accidentally match documents missing the
key, matching the legacy `if k not in metas` behaviour.
- `api/db/services/doc_metadata_service.py`: new
`DocMetadataService.filter_doc_ids_by_meta_pushdown(kb_ids, filters,
logic)` that returns the doc IDs ES matched, or `None` to signal the
caller should fall back to the in-memory path. Returns `None` when the
active doc store is Infinity (`meta_fields` is a JSON column, not a
dotted-object mapping), when any filter cannot be expressed in DSL
(`UnsupportedMetaFilter`), or when the ES request or metadata index
lookup errors.
- `common/metadata_utils.py`: `apply_meta_data_filter` accepts an
optional `kb_ids` argument. When supplied, conditions go through
push-down first via a new `_try_meta_pushdown` helper; on `None` the
function falls back to the original `meta_filter` call. Default
behaviour is unchanged for callers that don't pass `kb_ids`.
- Updated all four callers (`agent/tools/retrieval.py`,
`api/db/services/dialog_service.py` ×2,
`api/apps/services/dataset_api_service.py`, `api/apps/sdk/session.py`)
to forward `kb_ids` so the push-down path is exercised in production.
- `test/unit_test/common/test_metadata_es_filter.py` *(new)*: 35 unit
tests covering every operator's DSL shape, value coercion
(`ast.literal_eval`, lowercasing, ISO-date pass-through), wildcard
escaping, OR-logic wrapping that protects negative clauses, and the
doc-ID extractor.
**Behaviour preserved**
- The in-memory `meta_filter` is untouched and still services every
fallback case (Infinity backend, unknown operators, ES outages).
- The eligibility / credibility / issue-multiplier semantics described
in the LLM-driven `auto` and `semi_auto` modes still hand the LLM the
full in-memory `metas` dict to choose conditions from. Only the
*evaluation* of those generated conditions is pushed down.
- Existing tests in
`test/unit_test/common/test_metadata_filter_operators.py` continue to
pass (14/14).
**Test plan**
- `pytest test/unit_test/common/test_metadata_es_filter.py` — 35 passed.
- `pytest test/unit_test/common/test_metadata_filter_operators.py` — 14
passed.
- `ruff check` clean on every modified file.
- Reviewer please validate the ES query shapes against a live cluster —
particularly `case_insensitive` on `wildcard` and `prefix` (requires ES
7.10+) and the `exists` + `must_not` pairing for `≠`.
**Notes**
- The first cut caps each push-down request at 10000 results, matching
the existing `get_flatted_meta_by_kbs` limit, and logs a warning when
the cap is hit. A `search_after` follow-up would let us drop the cap
entirely once the push-down path is validated.
- Operator parity with the in-memory path is exact for the canonical
unicode operators (`≥`, `≤`, `≠`) used internally; the ASCII aliases
(`>=`, `<=`, `!=`) are normalised by `convert_conditions` before they
reach the translator.
### Type of change
- [x] Performance Improvement
---------
Co-authored-by: sxxtony <sxxtony@users.noreply.github.com>
2026-05-07 16:23:43 +03:00
|
|
|
) -> list[str] | None:
|
|
|
|
|
"""Attempt the ES push-down path; return ``None`` to fall back in-memory.
|
|
|
|
|
|
|
|
|
|
Lazy-imports ``DocMetadataService`` so this module stays usable in
|
|
|
|
|
environments where the API/db layer hasn't been wired up (e.g. unit tests
|
|
|
|
|
that exercise ``meta_filter`` directly).
|
|
|
|
|
"""
|
|
|
|
|
try:
|
|
|
|
|
from api.db.services.doc_metadata_service import DocMetadataService
|
|
|
|
|
except Exception as e:
|
|
|
|
|
logging.debug(f"[apply_meta_data_filter] push-down disabled, import failed: {e}")
|
|
|
|
|
return None
|
|
|
|
|
try:
|
|
|
|
|
return DocMetadataService.filter_doc_ids_by_meta_pushdown(kb_ids, conditions, logic)
|
|
|
|
|
except Exception as e:
|
|
|
|
|
logging.warning(f"[apply_meta_data_filter] push-down errored, falling back: {e}")
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
2025-12-24 09:32:55 +08:00
|
|
|
def dedupe_list(values: list) -> list:
    """Return a new list with repeated items removed, first occurrence wins.

    Two items count as duplicates when their ``str()`` representations are
    equal. This lets unhashable items (lists, dicts) be de-duplicated, and it
    also means values such as ``1`` and ``"1"`` collapse into one entry.
    The original order of the surviving items is kept.
    """
    first_by_repr = {}
    for entry in values:
        # setdefault keeps the earliest item seen for each string form;
        # dict insertion order preserves the input ordering.
        first_by_repr.setdefault(str(entry), entry)
    return list(first_by_repr.values())
|
|
|
|
|
|
|
|
|
|
|
2025-12-17 16:50:36 +08:00
|
|
|
def update_metadata_to(metadata, meta):
    """Merge ``meta`` into ``metadata`` in place and return ``metadata``.

    ``meta`` may be a dict or a string; a string is parsed with
    ``json_repair.loads``. If parsing raises, an error is logged and
    ``metadata`` is returned unchanged. The same happens, without logging,
    when ``meta`` is falsy or does not end up being a dict.

    Per-key rules:
      * List values keep only their string elements; a list left empty is
        skipped, otherwise it is de-duplicated.
      * Values that are neither a list nor a string are skipped.
      * If the key already holds a list, the new value is appended or
        extended onto it and the result is de-duplicated.
      * Otherwise the new value is stored, replacing any existing value.
    """
    if not meta:
        return metadata
    if isinstance(meta, str):
        try:
            meta = json_repair.loads(meta)
        except Exception:
            logging.error("Meta data format error.")
            return metadata
    if not isinstance(meta, dict):
        return metadata

    for field, incoming in meta.items():
        if isinstance(incoming, list):
            strings_only = [entry for entry in incoming if isinstance(entry, str)]
            if not strings_only:
                continue
            incoming = dedupe_list(strings_only)
        elif not isinstance(incoming, str):
            continue

        current = metadata[field] if field in metadata else None
        if not isinstance(current, list):
            # New key, or an existing non-list value: store the new value.
            metadata[field] = incoming
            continue

        # Existing list: grow it in place, then store a de-duplicated copy.
        if isinstance(incoming, list):
            current.extend(incoming)
        else:
            current.append(incoming)
        metadata[field] = dedupe_list(current)

    return metadata
|
|
|
|
|
|
|
|
|
|
|
2026-05-18 14:22:04 +08:00
|
|
|
def metadata_schema(metadata: dict | list | None) -> Dict[str, Any]:
|
2025-12-17 16:50:36 +08:00
|
|
|
if not metadata:
|
|
|
|
|
return {}
|
|
|
|
|
properties = {}
|
|
|
|
|
|
|
|
|
|
for item in metadata:
|
|
|
|
|
key = item.get("key")
|
|
|
|
|
if not key:
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
prop_schema = {
|
|
|
|
|
"description": item.get("description", "")
|
|
|
|
|
}
|
|
|
|
|
if "enum" in item and item["enum"]:
|
|
|
|
|
prop_schema["enum"] = item["enum"]
|
|
|
|
|
prop_schema["type"] = "string"
|
|
|
|
|
|
|
|
|
|
properties[key] = prop_schema
|
|
|
|
|
|
|
|
|
|
json_schema = {
|
|
|
|
|
"type": "object",
|
|
|
|
|
"properties": properties,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
json_schema["additionalProperties"] = False
|
2025-12-24 09:32:55 +08:00
|
|
|
return json_schema
|
2026-01-22 15:34:08 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def _is_json_schema(obj: dict) -> bool:
|
|
|
|
|
if not isinstance(obj, dict):
|
|
|
|
|
return False
|
|
|
|
|
if "$schema" in obj:
|
|
|
|
|
return True
|
|
|
|
|
return obj.get("type") == "object" and isinstance(obj.get("properties"), dict)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _is_metadata_list(obj: list) -> bool:
|
|
|
|
|
if not isinstance(obj, list) or not obj:
|
|
|
|
|
return False
|
|
|
|
|
for item in obj:
|
|
|
|
|
if not isinstance(item, dict):
|
|
|
|
|
return False
|
|
|
|
|
key = item.get("key")
|
|
|
|
|
if not isinstance(key, str) or not key:
|
|
|
|
|
return False
|
2026-06-09 19:10:48 +08:00
|
|
|
if "enum" in item and item["enum"] is not None and not isinstance(item["enum"], list):
|
2026-01-22 15:34:08 +08:00
|
|
|
return False
|
2026-06-09 19:10:48 +08:00
|
|
|
if "description" in item and item["description"] is not None and not isinstance(item["description"], str):
|
2026-01-22 15:34:08 +08:00
|
|
|
return False
|
2026-06-09 19:10:48 +08:00
|
|
|
if "descriptions" in item and item["descriptions"] is not None and not isinstance(item["descriptions"], str):
|
2026-01-22 15:34:08 +08:00
|
|
|
return False
|
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def turn2jsonschema(obj: dict | list) -> Dict[str, Any]:
|
|
|
|
|
if isinstance(obj, dict) and _is_json_schema(obj):
|
|
|
|
|
return obj
|
|
|
|
|
if isinstance(obj, list) and _is_metadata_list(obj):
|
|
|
|
|
normalized = []
|
|
|
|
|
for item in obj:
|
2026-06-09 19:10:48 +08:00
|
|
|
description = item.get("description") or item.get("descriptions") or ""
|
2026-01-22 15:34:08 +08:00
|
|
|
normalized_item = {
|
|
|
|
|
"key": item.get("key"),
|
|
|
|
|
"description": description,
|
|
|
|
|
}
|
2026-06-09 19:10:48 +08:00
|
|
|
if "enum" in item and item["enum"] is not None:
|
2026-01-22 15:34:08 +08:00
|
|
|
normalized_item["enum"] = item["enum"]
|
|
|
|
|
normalized.append(normalized_item)
|
|
|
|
|
return metadata_schema(normalized)
|
|
|
|
|
return {}
|