mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
## Summary

This PR fixes case-sensitivity regressions introduced in #15656 and consolidates the metadata filtering pipeline by removing the duplicate `applySingleCondition` adapter layer.

### Bug fixes

1. **contains / not contains**: restored case-insensitive matching (was lost when `applySingleCondition` was replaced by `common.MetaFilter.matchValue` which lacked `strings.ToLower`)
2. **not in**: restored case-insensitive matching (was lost for same reason; uses `strings.EqualFold`)
3. **!= with date filter values**: non-date metadata values now correctly match the `≠` operator (a non-date value IS not equal to any date, but was returning false)

### Architecture

4. **Removed `applySingleCondition`** (65 lines) — the inline switch was a duplicate of `common.MetaFilter` logic. `ApplyMetaFilter` now converts conditions and delegates to `common.MetaFilter` once per filter set, eliminating ~25 lines of duplicate AND/OR merge logic.
5. **Added `filterSet`** — O(n+m) hash-map fast path for `in`/`not in` operators, replacing the O(n*m) linear scan in `matchValue`.
6. **Exported `NormalizeOperator`** from `common` for consistent operator alias handling.

### Cleanup

7. Removed 18 lines of dead code (`matchValue`'s `in`/`not in` branches already bypassed by `filterOut` delegation)
8. Fixed orphaned godoc comment for `convertOperator`
9. Fixed incorrect `filterSet` doc comment (claimed "matching EqualFold" but used `strings.ToLower`)
10. Completed `convertToMetaCondition` operator normalization documentation

### Testing

- 60 tests (24 service + 36 common), all passing
- New tests: `==`, `≠`, `>`, `<`, `≥`, `≤`, `empty`, `not empty` through `ApplyMetaFilter`
- New tests: `<`, `≤`, `≠` through `MetaFilter`; `not-in-empty-list` through `filterSet`
- All 18 `MetaFilter` tests pass; all 10 `filterSet` unit tests pass

---------

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
367 lines
9.3 KiB
Go
367 lines
9.3 KiB
Go
//
|
||
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
||
//
|
||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||
// you may not use this file except in compliance with the License.
|
||
// You may obtain a copy of the License at
|
||
//
|
||
// http://www.apache.org/licenses/LICENSE-2.0
|
||
//
|
||
// Unless required by applicable law or agreed to in writing, software
|
||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
// See the License for the specific language governing permissions and
|
||
// limitations under the License.
|
||
//
|
||
|
||
package common
|
||
|
||
import (
|
||
"strconv"
|
||
"strings"
|
||
)
|
||
|
||
// MetaCondition is one parsed filter condition: Operator applied to the
// metadata field named Key, compared against Value.
type MetaCondition struct {
	// Operator is the comparison to perform, one of "=", "≠", ">", "<", "≥",
	// "≤", "contains", "not contains", "in", "not in", "start with",
	// "end with", "empty", "not empty".
	Operator string

	// Key is the metadata field name.
	Key string

	// Value is the comparison value.
	Value interface{}
}
|
||
|
||
// MetaValueDocs indexes, for a single metadata field, each distinct value of
// that field to the IDs of the documents carrying that value.
//
// For example: {"Zhang San": ["doc1", "doc2"], "Li Si": ["doc3"]}
type MetaValueDocs map[string][]string
|
||
|
||
// MetaData maps a metadata field name to its value→documents mapping.
|
||
// Example: {"author": {"Zhang San": ["doc1"]}, "year": {"2024": ["doc1", "doc2"]}}
|
||
type MetaData map[string]MetaValueDocs
|
||
|
||
// MetaFilterInput groups filter conditions with their logic operator.
|
||
type MetaFilterInput struct {
|
||
Conditions []MetaCondition
|
||
Logic string // "and" | "or"
|
||
}
|
||
|
||
// operatorMapping rewrites operator aliases (Python-style spellings such as
// "==" or "not is") to the internal symbols used by the matching code.
// convertOperator returns any operator that has no entry here unchanged.
var operatorMapping = map[string]string{
	// Equality.
	"==": "=",
	"is": "=",
	// Inequality.
	"!=":     "≠",
	"not is": "≠",
	// Inclusive ordering.
	">=": "≥",
	"<=": "≤",
}
|
||
|
||
// ParseAndConvert converts raw API conditions into MetaFilterInput.
|
||
// Equivalent to Python: meta_filter(metas, convert_conditions(cond), cond.get("logic"))
|
||
func ParseAndConvert(metadataCondition map[string]interface{}) *MetaFilterInput {
|
||
if metadataCondition == nil {
|
||
return nil
|
||
}
|
||
|
||
logic, _ := metadataCondition["logic"].(string)
|
||
if logic == "" {
|
||
logic = "and"
|
||
}
|
||
|
||
rawConditions, ok := metadataCondition["conditions"].([]interface{})
|
||
if !ok || len(rawConditions) == 0 {
|
||
return nil
|
||
}
|
||
|
||
var conditions []MetaCondition
|
||
for _, raw := range rawConditions {
|
||
cond, ok := raw.(map[string]interface{})
|
||
if !ok {
|
||
continue
|
||
}
|
||
name, _ := cond["name"].(string)
|
||
if name == "" {
|
||
continue
|
||
}
|
||
op, _ := cond["comparison_operator"].(string)
|
||
op = convertOperator(op)
|
||
conditions = append(conditions, MetaCondition{
|
||
Operator: op,
|
||
Key: name,
|
||
Value: cond["value"],
|
||
})
|
||
}
|
||
|
||
if len(conditions) == 0 {
|
||
return nil
|
||
}
|
||
|
||
return &MetaFilterInput{
|
||
Conditions: conditions,
|
||
Logic: logic,
|
||
}
|
||
}
|
||
|
||
// convertOperator translates operator aliases to their canonical form.
|
||
|
||
func convertOperator(op string) string {
|
||
if mapped, exists := operatorMapping[op]; exists {
|
||
return mapped
|
||
}
|
||
return op
|
||
}
|
||
|
||
// NormalizeOperator is the exported equivalent of convertOperator.
|
||
func NormalizeOperator(op string) string { return convertOperator(op) }
|
||
|
||
// MetaFilter applies filter conditions against metadata and returns matching doc IDs.
|
||
// Python equivalent: common/metadata_utils.py::meta_filter()
|
||
func MetaFilter(metas MetaData, input *MetaFilterInput) []string {
|
||
if input == nil || len(input.Conditions) == 0 {
|
||
return nil
|
||
}
|
||
|
||
logic := input.Logic
|
||
if logic == "" {
|
||
logic = "and"
|
||
}
|
||
|
||
var docIDs *map[string]struct{}
|
||
|
||
for _, f := range input.Conditions {
|
||
v2docs, ok := metas[f.Key]
|
||
if !ok {
|
||
if logic == "and" {
|
||
return []string{}
|
||
}
|
||
continue
|
||
}
|
||
|
||
matched := filterOut(v2docs, f.Operator, f.Value)
|
||
|
||
if docIDs == nil {
|
||
s := make(map[string]struct{}, len(matched))
|
||
for _, id := range matched {
|
||
s[id] = struct{}{}
|
||
}
|
||
docIDs = &s
|
||
} else {
|
||
if logic == "and" {
|
||
s := make(map[string]struct{})
|
||
for _, id := range matched {
|
||
if _, exists := (*docIDs)[id]; exists {
|
||
s[id] = struct{}{}
|
||
}
|
||
}
|
||
docIDs = &s
|
||
if len(*docIDs) == 0 {
|
||
return []string{}
|
||
}
|
||
} else {
|
||
for _, id := range matched {
|
||
(*docIDs)[id] = struct{}{}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
if docIDs == nil {
|
||
return []string{}
|
||
}
|
||
result := make([]string, 0, len(*docIDs))
|
||
for id := range *docIDs {
|
||
result = append(result, id)
|
||
}
|
||
return result
|
||
}
|
||
|
||
// filterOut returns matching doc IDs for a single (value → matchedDocs) map and operator.
|
||
// For "in" and "not in", it delegates to filterSet for O(n+m) hash-map-based filtering;
|
||
// all other operators use matchValue for per-element predicate evaluation.
|
||
func filterOut(v2docs MetaValueDocs, operator string, value interface{}) []string {
|
||
if operator == "in" || operator == "not in" {
|
||
return filterSet(v2docs, operator, value)
|
||
}
|
||
var ids []string
|
||
for input, docids := range v2docs {
|
||
if matchValue(input, operator, value) {
|
||
ids = append(ids, docids...)
|
||
}
|
||
}
|
||
return ids
|
||
}
|
||
|
||
// filterSet handles "in" and "not in" operators using O(1) hash map lookups.
|
||
//
|
||
// Instead of the O(n×m) linear scan that matchValue performs for these operators
|
||
// (n = distinct metadata values, m = filter list size), filterSet builds a lookup
|
||
// map from the filter value list once (O(m)) then tests each metadata entry in
|
||
// O(1) time (O(n)), yielding O(n+m) overall.
|
||
//
|
||
// Case sensitivity follows the same contract as matchValue:
|
||
// - "in": case-sensitive (exact match via toString(item) == input)
|
||
// - "not in": case-insensitive (strings.ToLower on both sides)
|
||
//
|
||
// When value is not a []interface{} (should not happen in normal call paths),
|
||
// filterSet returns nil — no metadata values match "in", and for "not in" it
|
||
// defensively returns nil as well (rather than returning all entries, which could
|
||
// silently bypass a misconfigured filter).
|
||
func filterSet(v2docs MetaValueDocs, operator string, value interface{}) []string {
|
||
list, ok := value.([]interface{})
|
||
if !ok {
|
||
return nil
|
||
}
|
||
|
||
if operator == "not in" {
|
||
// Build case-insensitive exclusion set.
|
||
lookup := make(map[string]bool, len(list))
|
||
for _, item := range list {
|
||
lookup[strings.ToLower(toString(item))] = true
|
||
}
|
||
var ids []string
|
||
for input, docids := range v2docs {
|
||
if !lookup[strings.ToLower(input)] {
|
||
ids = append(ids, docids...)
|
||
}
|
||
}
|
||
return ids
|
||
}
|
||
|
||
// "in": build case-sensitive inclusion set.
|
||
lookup := make(map[string]bool, len(list))
|
||
for _, item := range list {
|
||
lookup[toString(item)] = true
|
||
}
|
||
var ids []string
|
||
for input, docids := range v2docs {
|
||
if lookup[input] {
|
||
ids = append(ids, docids...)
|
||
}
|
||
}
|
||
return ids
|
||
}
|
||
|
||
// matchValue checks if a single metadata value matches the operator+value.
|
||
func matchValue(input string, operator string, value interface{}) bool {
|
||
switch operator {
|
||
case "empty":
|
||
return input == ""
|
||
case "not empty":
|
||
return input != ""
|
||
}
|
||
|
||
valStr := toString(value)
|
||
|
||
switch operator {
|
||
case "contains":
|
||
return strings.Contains(strings.ToLower(input), strings.ToLower(valStr))
|
||
case "not contains":
|
||
return !strings.Contains(strings.ToLower(input), strings.ToLower(valStr))
|
||
case "start with":
|
||
return strings.HasPrefix(strings.ToLower(input), strings.ToLower(valStr))
|
||
case "end with":
|
||
return strings.HasSuffix(strings.ToLower(input), strings.ToLower(valStr))
|
||
|
||
// "in" and "not in" are intentionally omitted from matchValue.
|
||
// filterOut (line 177) intercepts these operators and delegates
|
||
// them to filterSet for O(n+m) hash-map-based filtering, so they
|
||
// never reach this function through normal call paths.
|
||
}
|
||
|
||
// Comparison operators: =, ≠, >, <, ≥, ≤
|
||
return compareValues(input, valStr, operator)
|
||
}
|
||
|
||
// compareValues handles numeric/date/string comparison.
|
||
func compareValues(a, b, operator string) bool {
|
||
// If filter value (b) is a date, only compare if data (a) is also a date.
|
||
// Non-date values should not be compared against date filters (matching Python behavior).
|
||
if isDate(b) {
|
||
if !isDate(a) {
|
||
return operator == "≠"
|
||
}
|
||
return compareString(a, b, operator)
|
||
}
|
||
|
||
// Try numeric comparison
|
||
af, errA := strconv.ParseFloat(a, 64)
|
||
bf, errB := strconv.ParseFloat(b, 64)
|
||
if errA == nil && errB == nil {
|
||
return compareFloat(af, bf, operator)
|
||
}
|
||
|
||
// Fall back to case-insensitive string comparison
|
||
return compareString(strings.ToLower(a), strings.ToLower(b), operator)
|
||
}
|
||
|
||
// compareFloat applies the comparison operator ("=", "≠", ">", "<", "≥", "≤")
// to the numbers a and b. Any other operator yields false.
func compareFloat(a, b float64, operator string) bool {
	var holds bool
	switch operator {
	case "=":
		holds = a == b
	case "≠":
		holds = a != b
	case ">":
		holds = a > b
	case "<":
		holds = a < b
	case "≥":
		holds = a >= b
	case "≤":
		holds = a <= b
	}
	return holds
}
|
||
|
||
// compareString applies the comparison operator ("=", "≠", ">", "<", "≥", "≤")
// to the strings a and b using Go's built-in byte-wise string ordering. Any
// other operator yields false.
func compareString(a, b string, operator string) bool {
	if operator == "=" {
		return a == b
	}
	if operator == "≠" {
		return a != b
	}
	if operator == ">" {
		return a > b
	}
	if operator == "<" {
		return a < b
	}
	if operator == "≥" {
		return a >= b
	}
	if operator == "≤" {
		return a <= b
	}
	return false
}
|
||
|
||
// isDate reports whether s has the shape YYYY-MM-DD: exactly ten bytes, with
// '-' at offsets 4 and 7 and ASCII digits everywhere else. Only the format is
// checked; the month and day are not validated as calendar values.
func isDate(s string) bool {
	if len(s) != 10 {
		return false
	}
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch i {
		case 4, 7:
			if c != '-' {
				return false
			}
		default:
			if c < '0' || c > '9' {
				return false
			}
		}
	}
	return true
}
|
||
|
||
// toString converts a filter value to the string form used for comparison.
//
//   - nil becomes "".
//   - strings are returned unchanged.
//   - booleans become "true" or "false".
//   - floats are formatted in plain decimal notation with the fewest digits
//     that round-trip (no exponent), so 2024.0 becomes "2024".
//   - signed and unsigned integers of every width are formatted in base 10.
//     Without these cases an integer value supplied by Go code (as opposed to
//     a JSON-decoded float64) would silently turn into "" and never compare
//     equal to its metadata counterpart.
//
// Any other type (slices, maps, structs, ...) yields "".
func toString(v interface{}) string {
	switch s := v.(type) {
	case nil:
		return ""
	case string:
		return s
	case bool:
		return strconv.FormatBool(s)
	case float64:
		return strconv.FormatFloat(s, 'f', -1, 64)
	case float32:
		// bitSize 32 keeps the shortest representation of the float32 value.
		return strconv.FormatFloat(float64(s), 'f', -1, 32)
	case int:
		return strconv.Itoa(s)
	case int8:
		return strconv.FormatInt(int64(s), 10)
	case int16:
		return strconv.FormatInt(int64(s), 10)
	case int32:
		return strconv.FormatInt(int64(s), 10)
	case int64:
		return strconv.FormatInt(s, 10)
	case uint:
		return strconv.FormatUint(uint64(s), 10)
	case uint8:
		return strconv.FormatUint(uint64(s), 10)
	case uint16:
		return strconv.FormatUint(uint64(s), 10)
	case uint32:
		return strconv.FormatUint(uint64(s), 10)
	case uint64:
		return strconv.FormatUint(s, 10)
	default:
		return ""
	}
}
|