Files
ragflow/internal/common/metadata_utils.go
Jack f6ff862a24 fix: restore case-insensitive contains/not contains/not in and consolidate metadata filter pipeline (#15686)
## Summary

This PR fixes case-sensitivity regressions introduced in #15656 and
consolidates the metadata filtering pipeline by removing the duplicate
`applySingleCondition` adapter layer.

### Bug fixes
1. **contains / not contains**: restored case-insensitive matching (was
lost when `applySingleCondition` was replaced by
`common.MetaFilter.matchValue` which lacked `strings.ToLower`)
2. **not in**: restored case-insensitive matching (was lost for same
reason; uses `strings.EqualFold`)
3. **!= with date filter values**: non-date metadata values now
correctly match the `≠` operator (a non-date value IS not equal to any
date, but was returning false)

### Architecture
4. **Removed `applySingleCondition`** (65 lines) — the inline switch was
a duplicate of `common.MetaFilter` logic. `ApplyMetaFilter` now converts
conditions and delegates to `common.MetaFilter` once per filter set,
eliminating ~25 lines of duplicate AND/OR merge logic.
5. **Added `filterSet`** — O(n+m) hash-map fast path for `in`/`not in`
operators, replacing the O(n*m) linear scan in `matchValue`.
6. **Exported `NormalizeOperator`** from `common` for consistent
operator alias handling.

### Cleanup
7. Removed 18 lines of dead code (`matchValue`'s `in`/`not in` branches
already bypassed by `filterOut` delegation)
8. Fixed orphaned godoc comment for `convertOperator`
9. Fixed incorrect `filterSet` doc comment (claimed "matching EqualFold"
but used `strings.ToLower`)
10. Completed `convertToMetaCondition` operator normalization
documentation

### Testing
- 60 tests (24 service + 36 common), all passing
- New tests: `==`, `≠`, `>`, `<`, `≥`, `≤`, `empty`, `not empty` through
`ApplyMetaFilter`
- New tests: `<`, `≤`, `≠` through `MetaFilter`; `not-in-empty-list`
through `filterSet`
- All 18 `MetaFilter` tests pass; all 10 `filterSet` unit tests pass

---------

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-05 12:47:55 +08:00

367 lines
9.3 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package common
import (
"strconv"
"strings"
)
// MetaCondition represents a single parsed filter condition.
//
// Operator holds the canonical operator symbol; ParseAndConvert rewrites
// aliases such as "==" or ">=" through convertOperator before storing it.
// Key names the metadata field the condition applies to.
// Value is the raw comparison value. It is ignored by "empty"/"not empty",
// and for "in"/"not in" filterSet only accepts a []interface{} (any other
// type yields no matches).
type MetaCondition struct {
Operator string // "=", "≠", ">", "<", "≥", "≤", "contains", "not contains", "in", "not in", "start with", "end with", "empty", "not empty"
Key string // metadata field name
Value interface{} // comparison value
}
// MetaValueDocs maps a metadata field value to the document IDs that have that value.
// Values are keyed by their string form; filterOut and filterSet compare
// these keys against the filter value.
// Example: {"Zhang San": ["doc1", "doc2"], "Li Si": ["doc3"]}
type MetaValueDocs map[string][]string
// MetaData maps a metadata field name to its value→documents mapping.
// MetaFilter looks up each condition's Key in this map; a Key that is absent
// is treated as "no documents" for that condition.
// Example: {"author": {"Zhang San": ["doc1"]}, "year": {"2024": ["doc1", "doc2"]}}
type MetaData map[string]MetaValueDocs
// MetaFilterInput groups filter conditions with their logic operator.
//
// Logic selects how MetaFilter combines the per-condition results: "and"
// intersects them, and an empty string is treated as "and". MetaFilter
// compares Logic against "and" only, so any other value (including "or")
// results in a union.
type MetaFilterInput struct {
Conditions []MetaCondition
Logic string // "and" | "or"
}
// operatorMapping lists the operator aliases accepted in raw conditions and
// the canonical symbol each alias is rewritten to. Operators that are not a
// key of this map are left untouched by convertOperator.
var operatorMapping = map[string]string{
	// Equality aliases.
	"==": "=",
	"is": "=",

	// Inequality aliases.
	"!=":     "≠",
	"not is": "≠",

	// ASCII spellings of the ordered comparisons.
	"<=": "≤",
	">=": "≥",
}
// ParseAndConvert turns a raw, JSON-decoded metadata condition object into a
// MetaFilterInput suitable for MetaFilter.
//
// The input is expected to carry:
//   - "logic": optional string; missing, empty, or non-string means "and".
//   - "conditions": a list of objects with "name", "comparison_operator"
//     and "value" entries.
//
// List entries that are not objects, or whose "name" is missing, empty, or
// not a string, are skipped. Operator aliases are rewritten to their
// canonical symbol via convertOperator; "value" is stored as-is.
//
// It returns nil when metadataCondition is nil, when "conditions" is absent,
// not a list, or empty, or when no entry survives the skipping rules above.
func ParseAndConvert(metadataCondition map[string]interface{}) *MetaFilterInput {
	if metadataCondition == nil {
		return nil
	}

	entries, isList := metadataCondition["conditions"].([]interface{})
	if !isList || len(entries) == 0 {
		return nil
	}

	var parsed []MetaCondition
	for _, entry := range entries {
		fields, isObject := entry.(map[string]interface{})
		if !isObject {
			continue
		}
		key, _ := fields["name"].(string)
		if key == "" {
			continue
		}
		// A missing or non-string operator becomes "", which has no alias.
		rawOp, _ := fields["comparison_operator"].(string)
		parsed = append(parsed, MetaCondition{
			Key:      key,
			Operator: convertOperator(rawOp),
			Value:    fields["value"],
		})
	}
	if len(parsed) == 0 {
		return nil
	}

	mode, _ := metadataCondition["logic"].(string)
	if mode == "" {
		mode = "and"
	}
	return &MetaFilterInput{Conditions: parsed, Logic: mode}
}
// convertOperator returns the canonical symbol for op when op is a known
// alias in operatorMapping, and op itself otherwise.
func convertOperator(op string) string {
	canonical, isAlias := operatorMapping[op]
	if !isAlias {
		return op
	}
	return canonical
}
// NormalizeOperator is the exported form of convertOperator: it maps an
// operator alias (for example "==" or ">=") to its canonical symbol and
// returns any other operator unchanged.
func NormalizeOperator(op string) string {
	return convertOperator(op)
}
// MetaFilter evaluates the conditions in input against metas and returns the
// IDs of the documents that satisfy them.
//
// Conditions are combined according to input.Logic: "and" (also used when
// Logic is empty) intersects the per-condition matches, while any other value
// unions them. A condition whose Key is not present in metas matches nothing:
// under "and" the overall result is immediately empty, otherwise the
// condition is skipped.
//
// The result is nil when input is nil or has no conditions, and a non-nil
// (possibly empty) slice in every other case. IDs are deduplicated; their
// order is unspecified because they are collected from a map.
func MetaFilter(metas MetaData, input *MetaFilterInput) []string {
	if input == nil || len(input.Conditions) == 0 {
		return nil
	}
	requireAll := input.Logic == "" || input.Logic == "and"

	// selected stays nil until the first condition with a known key has been
	// evaluated; after that it is always a non-nil set.
	var selected map[string]struct{}
	for _, cond := range input.Conditions {
		valueDocs, known := metas[cond.Key]
		if !known {
			if requireAll {
				return []string{}
			}
			continue
		}
		hits := filterOut(valueDocs, cond.Operator, cond.Value)

		switch {
		case selected == nil:
			// First evaluated condition seeds the set.
			selected = make(map[string]struct{}, len(hits))
			for _, id := range hits {
				selected[id] = struct{}{}
			}
		case requireAll:
			// Keep only the IDs present in both the running set and the hits.
			common := make(map[string]struct{})
			for _, id := range hits {
				if _, kept := selected[id]; kept {
					common[id] = struct{}{}
				}
			}
			if len(common) == 0 {
				return []string{}
			}
			selected = common
		default:
			for _, id := range hits {
				selected[id] = struct{}{}
			}
		}
	}

	if selected == nil {
		return []string{}
	}
	out := make([]string, 0, len(selected))
	for id := range selected {
		out = append(out, id)
	}
	return out
}
// filterOut collects the document IDs of every metadata value in v2docs that
// satisfies operator against value.
//
// The list operators "in" and "not in" are handed to filterSet, which builds
// a lookup set once instead of rescanning the list per metadata value. Every
// other operator is evaluated one metadata value at a time via matchValue.
// The result is nil when nothing matches and may contain duplicate IDs if a
// document is listed under several matching values.
func filterOut(v2docs MetaValueDocs, operator string, value interface{}) []string {
	switch operator {
	case "in", "not in":
		return filterSet(v2docs, operator, value)
	}

	var matched []string
	for metaValue, docs := range v2docs {
		if !matchValue(metaValue, operator, value) {
			continue
		}
		matched = append(matched, docs...)
	}
	return matched
}
// filterSet evaluates the list operators "in" and "not in" with a hash set.
//
// The filter list (m items) is loaded into a set once, then each of the n
// distinct metadata values is tested with a single lookup, for O(n+m) work
// overall rather than scanning the list for every metadata value.
//
// Case sensitivity:
//   - "in": case-sensitive; a metadata value matches only if it is exactly
//     equal to toString(item) for some list item.
//   - "not in": case-insensitive; both sides are lower-cased with
//     strings.ToLower before comparison.
//
// Any operator other than "not in" is treated as "in". If value is not a
// []interface{}, filterSet returns nil for both operators; for "not in" this
// is a deliberate choice so that a malformed filter excludes everything
// instead of silently letting every document through.
func filterSet(v2docs MetaValueDocs, operator string, value interface{}) []string {
	items, isList := value.([]interface{})
	if !isList {
		return nil
	}

	exclude := operator == "not in"
	// canon maps a string to the form used as set key for this operator.
	canon := func(s string) string {
		if exclude {
			return strings.ToLower(s)
		}
		return s
	}

	members := make(map[string]struct{}, len(items))
	for _, item := range items {
		members[canon(toString(item))] = struct{}{}
	}

	var ids []string
	for metaValue, docs := range v2docs {
		_, listed := members[canon(metaValue)]
		// "in" keeps listed values, "not in" keeps unlisted ones.
		if listed != exclude {
			ids = append(ids, docs...)
		}
	}
	return ids
}
// matchValue reports whether one metadata value (input) satisfies operator
// with respect to the filter value.
//
//   - "empty" / "not empty" look only at input and ignore value.
//   - "contains", "not contains", "start with" and "end with" compare the
//     lower-cased input against the lower-cased string form of value.
//   - Everything else is passed to compareValues, which implements
//     =, ≠, >, <, ≥ and ≤.
//
// "in" and "not in" are deliberately not handled here: filterOut routes them
// to filterSet before matchValue is ever called.
func matchValue(input string, operator string, value interface{}) bool {
	// Presence checks do not need the filter value at all.
	if operator == "empty" {
		return input == ""
	}
	if operator == "not empty" {
		return input != ""
	}

	target := toString(value)
	switch operator {
	case "contains", "not contains":
		found := strings.Contains(strings.ToLower(input), strings.ToLower(target))
		// Flip the result for the negated operator.
		return found == (operator == "contains")
	case "start with":
		return strings.HasPrefix(strings.ToLower(input), strings.ToLower(target))
	case "end with":
		return strings.HasSuffix(strings.ToLower(input), strings.ToLower(target))
	default:
		// Comparison operators: =, ≠, >, <, ≥, ≤
		return compareValues(input, target, operator)
	}
}
// compareValues evaluates a comparison operator (=, ≠, >, <, ≥, ≤) between a
// metadata value a and a filter value b, both given as strings.
//
// The comparison mode is chosen in this order:
//  1. b is a YYYY-MM-DD date: if a is also a date the two are compared as
//     strings, which orders this format chronologically. If a is not a date,
//     the only operator that holds is "≠".
//  2. Both a and b parse as real numbers: they are compared numerically.
//  3. Otherwise: case-insensitive string comparison.
//
// strconv.ParseFloat also accepts spellings of NaN ("nan", "NaN", ...). NaN
// never compares equal to anything, itself included, so sending such values
// down the numeric path would make identical strings fail "=" and satisfy
// "≠". Values that parse to NaN are therefore compared as strings.
//
// Operators outside the six listed above yield false.
func compareValues(a, b, operator string) bool {
	if isDate(b) {
		if !isDate(a) {
			// A non-date value can never equal or be ordered against a date.
			return operator == "≠"
		}
		return compareString(a, b, operator)
	}

	af, errA := strconv.ParseFloat(a, 64)
	bf, errB := strconv.ParseFloat(b, 64)
	// x == x is false only when x is NaN.
	bothNumeric := errA == nil && errB == nil && af == af && bf == bf
	if bothNumeric {
		return compareFloat(af, bf, operator)
	}

	// Fall back to case-insensitive string comparison.
	return compareString(strings.ToLower(a), strings.ToLower(b), operator)
}
// compareFloat applies the comparison named by operator (=, ≠, >, <, ≥, ≤)
// to two float64 values using Go's built-in float comparisons. Any other
// operator yields false.
func compareFloat(a, b float64, operator string) bool {
	holds := false
	switch operator {
	case "=":
		holds = a == b
	case "≠":
		holds = a != b
	case "<":
		holds = a < b
	case "≤":
		holds = a <= b
	case ">":
		holds = a > b
	case "≥":
		holds = a >= b
	}
	return holds
}
// compareString applies the comparison named by operator (=, ≠, >, <, ≥, ≤)
// to two strings using Go's byte-wise string ordering. Any other operator
// yields false. No case folding happens here; callers that want a
// case-insensitive comparison must normalise both arguments first.
func compareString(a, b string, operator string) bool {
	// Strings are totally ordered, so every operator can be expressed in
	// terms of "equal" and "less".
	equal, less := a == b, a < b
	switch operator {
	case "=":
		return equal
	case "≠":
		return !equal
	case "<":
		return less
	case "≤":
		return less || equal
	case ">":
		return !less && !equal
	case "≥":
		return !less
	default:
		return false
	}
}
// isDate reports whether s has the exact shape YYYY-MM-DD: ten bytes, with
// '-' at offsets 4 and 7 and ASCII digits everywhere else. Only the layout is
// verified; month and day are not range-checked, so "2024-13-99" is accepted.
func isDate(s string) bool {
	// 'd' marks a digit position, '-' a literal separator.
	const layout = "dddd-dd-dd"
	if len(s) != len(layout) {
		return false
	}
	for i := 0; i < len(layout); i++ {
		c := s[i]
		if layout[i] == '-' {
			if c != '-' {
				return false
			}
			continue
		}
		if c < '0' || c > '9' {
			return false
		}
	}
	return true
}
// toString renders a filter value as the string that is compared against
// metadata values.
//
// Supported inputs:
//   - nil: the empty string.
//   - string: returned unchanged.
//   - bool: "true" or "false".
//   - float64 / float32: plain decimal notation with the fewest digits that
//     round-trip, never an exponent (2.0 becomes "2").
//   - every signed and unsigned integer type: base-10 digits.
//
// Any other type yields the empty string.
//
// Integers are handled explicitly because values built in Go code (rather
// than decoded from JSON, which produces float64) would otherwise collapse to
// "" and be compared as if the filter value were empty.
func toString(v interface{}) string {
	switch s := v.(type) {
	case nil:
		return ""
	case string:
		return s
	case bool:
		return strconv.FormatBool(s)
	case float64:
		return strconv.FormatFloat(s, 'f', -1, 64)
	case float32:
		return strconv.FormatFloat(float64(s), 'f', -1, 32)
	case int:
		return strconv.Itoa(s)
	case int8:
		return strconv.FormatInt(int64(s), 10)
	case int16:
		return strconv.FormatInt(int64(s), 10)
	case int32:
		return strconv.FormatInt(int64(s), 10)
	case int64:
		return strconv.FormatInt(s, 10)
	case uint:
		return strconv.FormatUint(uint64(s), 10)
	case uint8:
		return strconv.FormatUint(uint64(s), 10)
	case uint16:
		return strconv.FormatUint(uint64(s), 10)
	case uint32:
		return strconv.FormatUint(uint64(s), 10)
	case uint64:
		return strconv.FormatUint(s, 10)
	default:
		return ""
	}
}