diff --git a/internal/common/metadata_utils.go b/internal/common/metadata_utils.go new file mode 100644 index 0000000000..a44b25b75c --- /dev/null +++ b/internal/common/metadata_utils.go @@ -0,0 +1,319 @@ +// +// Copyright 2026 The InfiniFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package common + +import ( + "strconv" + "strings" +) + +// MetaCondition represents a single parsed filter condition. +type MetaCondition struct { + Operator string // "=", "≠", ">", "<", "≥", "≤", "contains", "not contains", "in", "not in", "start with", "end with", "empty", "not empty" + Key string // metadata field name + Value interface{} // comparison value; a list for "in" / "not in", ignored by "empty" / "not empty" +} + +// MetaValueDocs maps a metadata field value to the document IDs that have that value. +// Example: {"Zhang San": ["doc1", "doc2"], "Li Si": ["doc3"]} +type MetaValueDocs map[string][]string + +// MetaData maps a metadata field name to its value→documents mapping. +// Example: {"author": {"Zhang San": ["doc1"]}, "year": {"2024": ["doc1", "doc2"]}} +type MetaData map[string]MetaValueDocs + +// MetaFilterInput groups filter conditions with their logic operator. +type MetaFilterInput struct { + Conditions []MetaCondition // conditions combined according to Logic + Logic string // "and" | "or"; MetaFilter treats "" as "and" and any other value as "or" +} + +// operatorMapping translates Python-style operators to internal symbols. 
+var operatorMapping = map[string]string{ + "is": "=", + "not is": "≠", + ">=": "≥", + "<=": "≤", + "!=": "≠", +} + +// ParseAndConvert converts raw API conditions into MetaFilterInput. +// Equivalent to Python: meta_filter(metas, convert_conditions(cond), cond.get("logic")) +func ParseAndConvert(metadataCondition map[string]interface{}) *MetaFilterInput { + if metadataCondition == nil { + return nil + } + + logic, _ := metadataCondition["logic"].(string) + if logic == "" { + logic = "and" + } + + rawConditions, ok := metadataCondition["conditions"].([]interface{}) + if !ok || len(rawConditions) == 0 { + return nil + } + + var conditions []MetaCondition + for _, raw := range rawConditions { + cond, ok := raw.(map[string]interface{}) + if !ok { + continue + } + name, _ := cond["name"].(string) + if name == "" { + continue + } + op, _ := cond["comparison_operator"].(string) + op = convertOperator(op) + conditions = append(conditions, MetaCondition{ + Operator: op, + Key: name, + Value: cond["value"], + }) + } + + if len(conditions) == 0 { + return nil + } + + return &MetaFilterInput{ + Conditions: conditions, + Logic: logic, + } +} + +// convertOperator translates Python-style operator to internal symbol. +func convertOperator(op string) string { + if mapped, exists := operatorMapping[op]; exists { + return mapped + } + return op +} + +// MetaFilter applies filter conditions against metadata and returns matching doc IDs. 
+// Python equivalent: common/metadata_utils.py::meta_filter() +func MetaFilter(metas MetaData, input *MetaFilterInput) []string { + if input == nil || len(input.Conditions) == 0 { + return nil + } + + logic := input.Logic + if logic == "" { + logic = "and" + } + + var docIDs *map[string]struct{} + + for _, f := range input.Conditions { + v2docs, ok := metas[f.Key] + if !ok { + if logic == "and" { + return []string{} + } + continue + } + + matched := filterOut(v2docs, f.Operator, f.Value) + + if docIDs == nil { + s := make(map[string]struct{}, len(matched)) + for _, id := range matched { + s[id] = struct{}{} + } + docIDs = &s + } else { + if logic == "and" { + s := make(map[string]struct{}) + for _, id := range matched { + if _, exists := (*docIDs)[id]; exists { + s[id] = struct{}{} + } + } + docIDs = &s + if len(*docIDs) == 0 { + return []string{} + } + } else { + for _, id := range matched { + (*docIDs)[id] = struct{}{} + } + } + } + } + + if docIDs == nil { + return []string{} + } + result := make([]string, 0, len(*docIDs)) + for id := range *docIDs { + result = append(result, id) + } + return result +} + +// filterOut returns matching doc IDs for a single (value → matchedDocs) map and operator. +func filterOut(v2docs MetaValueDocs, operator string, value interface{}) []string { + var ids []string + for input, docids := range v2docs { + if matchValue(input, operator, value) { + ids = append(ids, docids...) + } + } + return ids +} + +// matchValue checks if a single metadata value matches the operator+value. 
//
// "empty" and "not empty" look only at input. "in" and "not in" test exact
// membership of input in a list value. "contains", "not contains",
// "start with" and "end with" compare case-insensitively against the string
// form of value. Every other operator is delegated to compareValues.
func matchValue(input string, operator string, value interface{}) bool {
	switch operator {
	case "empty":
		return input == ""
	case "not empty":
		return input != ""
	case "in":
		return metaValueInList(input, value)
	case "not in":
		return !metaValueInList(input, value)
	}

	valStr := toString(value)

	switch operator {
	case "contains":
		return strings.Contains(strings.ToLower(input), strings.ToLower(valStr))
	case "not contains":
		return !strings.Contains(strings.ToLower(input), strings.ToLower(valStr))
	case "start with":
		return strings.HasPrefix(strings.ToLower(input), strings.ToLower(valStr))
	case "end with":
		return strings.HasSuffix(strings.ToLower(input), strings.ToLower(valStr))
	}

	// Comparison operators: =, ≠, >, <, ≥, ≤
	return compareValues(input, valStr, operator)
}

// metaValueInList reports whether input equals the string form of any element
// of value. value may be a []interface{} or a []string; any other type is
// treated as an empty list.
func metaValueInList(input string, value interface{}) bool {
	switch list := value.(type) {
	case []interface{}:
		for _, item := range list {
			if toString(item) == input {
				return true
			}
		}
	case []string:
		for _, item := range list {
			if item == input {
				return true
			}
		}
	}
	return false
}

// compareValues handles numeric/date/string comparison.
//
// a is the stored metadata value and b the filter value. When b is a
// YYYY-MM-DD date, a must be one too and the two are ordered as strings,
// which is chronological for that fixed-width format. Otherwise both are
// compared numerically when both parse as finite numbers, and as lower-cased
// strings when either does not.
func compareValues(a, b, operator string) bool {
	// If filter value (b) is a date, only compare if data (a) is also a date.
	// Non-date values should not be compared against date filters (matching Python behavior).
	if isDate(b) {
		if !isDate(a) {
			return false
		}
		return compareString(a, b, operator)
	}

	// Try numeric comparison
	af, okA := parseFiniteFloat(a)
	bf, okB := parseFiniteFloat(b)
	if okA && okB {
		return compareFloat(af, bf, operator)
	}

	// Fall back to case-insensitive string comparison
	return compareString(strings.ToLower(a), strings.ToLower(b), operator)
}

// parseFiniteFloat parses s as a float64 and reports whether it is a finite
// number. strconv.ParseFloat also accepts "NaN", "Inf" and "Infinity" in any
// letter case; those are rejected here so that text such as "Nan" is compared
// as a string instead of as a NaN that is unequal to itself.
func parseFiniteFloat(s string) (float64, bool) {
	f, err := strconv.ParseFloat(s, 64)
	// f*0 is 0 for every finite f and NaN for ±Inf and NaN.
	if err != nil || f*0 != 0 {
		return 0, false
	}
	return f, true
}

// compareFloat applies operator to two numbers; unknown operators yield false.
func compareFloat(a, b float64, operator string) bool {
	switch operator {
	case "=":
		return a == b
	case "≠":
		return a != b
	case ">":
		return a > b
	case "<":
		return a < b
	case "≥":
		return a >= b
	case "≤":
		return a <= b
	}
	return false
}

// compareString applies operator to two strings using byte-wise ordering;
// unknown operators yield false.
func compareString(a, b string, operator string) bool {
	switch operator {
	case "=":
		return a == b
	case "≠":
		return a != b
	case ">":
		return a > b
	case "<":
		return a < b
	case "≥":
		return a >= b
	case "≤":
		return a <= b
	}
	return false
}

// isDate checks if a string is in YYYY-MM-DD format.
// Only the shape is checked; month and day ranges are not validated.
func isDate(s string) bool {
	if len(s) != 10 {
		return false
	}
	for i := 0; i < len(s); i++ {
		if i == 4 || i == 7 {
			if s[i] != '-' {
				return false
			}
			continue
		}
		if s[i] < '0' || s[i] > '9' {
			return false
		}
	}
	return true
}

// toString converts a value to string for comparison.
//
// Strings are returned unchanged, floats are formatted without an exponent
// using the shortest exact representation, integers in base 10, and booleans
// as "true"/"false". nil and every other type yield "".
func toString(v interface{}) string {
	switch s := v.(type) {
	case nil:
		return ""
	case string:
		return s
	case float64:
		return strconv.FormatFloat(s, 'f', -1, 64)
	case float32:
		return strconv.FormatFloat(float64(s), 'f', -1, 32)
	case int:
		return strconv.FormatInt(int64(s), 10)
	case int8:
		return strconv.FormatInt(int64(s), 10)
	case int16:
		return strconv.FormatInt(int64(s), 10)
	case int32:
		return strconv.FormatInt(int64(s), 10)
	case int64:
		return strconv.FormatInt(s, 10)
	case uint:
		return strconv.FormatUint(uint64(s), 10)
	case uint8:
		return strconv.FormatUint(uint64(s), 10)
	case uint16:
		return strconv.FormatUint(uint64(s), 10)
	case uint32:
		return strconv.FormatUint(uint64(s), 10)
	case uint64:
		return strconv.FormatUint(s, 10)
	case bool:
		return strconv.FormatBool(s)
	default:
		return ""
	}
}

//
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package common + +import ( + "testing" +) + +func TestParseAndConvert_OperatorMapping(t *testing.T) { + input := map[string]interface{}{ + "conditions": []interface{}{ + map[string]interface{}{"name": "author", "comparison_operator": "is", "value": "Zhang San"}, + map[string]interface{}{"name": "date", "comparison_operator": ">=", "value": "2024-01-01"}, + }, + } + result := ParseAndConvert(input) + if result == nil { + t.Fatal("expected non-nil result") + } + if result.Logic != "and" { + t.Errorf("expected logic 'and', got '%s'", result.Logic) + } + if len(result.Conditions) != 2 { + t.Fatalf("expected 2 conditions, got %d", len(result.Conditions)) + } + if result.Conditions[0].Operator != "=" { + t.Errorf("expected '=', got '%s'", result.Conditions[0].Operator) + } + if result.Conditions[1].Operator != "≥" { + t.Errorf("expected '≥', got '%s'", result.Conditions[1].Operator) + } +} + +func TestParseAndConvert_WithLogic(t *testing.T) { + input := map[string]interface{}{ + "conditions": []interface{}{ + map[string]interface{}{"name": "author", "comparison_operator": "is", "value": "Zhang San"}, + }, + "logic": "or", + } + result := ParseAndConvert(input) + if result == nil { + t.Fatal("expected non-nil result") + } + if result.Logic != "or" { + t.Errorf("expected logic 'or', got '%s'", result.Logic) + } +} + +func TestParseAndConvert_NilInput(t *testing.T) { + result := ParseAndConvert(nil) + if result 
!= nil { + t.Errorf("expected nil, got %v", result) + } +} + +func TestParseAndConvert_EmptyConditions(t *testing.T) { + result := ParseAndConvert(map[string]interface{}{}) + if result != nil { + t.Errorf("expected nil, got %v", result) + } +} + +func TestParseAndConvert_NoName(t *testing.T) { + input := map[string]interface{}{ + "conditions": []interface{}{ + map[string]interface{}{"comparison_operator": "is", "value": "x"}, + }, + } + result := ParseAndConvert(input) + if result != nil { + t.Errorf("expected nil for empty name, got %v", result) + } +} + +func TestConvertOperator(t *testing.T) { + tests := []struct { + input string + expected string + }{ + {"is", "="}, + {"not is", "≠"}, + {">=", "≥"}, + {"<=", "≤"}, + {"!=", "≠"}, + {"contains", "contains"}, + {"start with", "start with"}, + } + for _, tt := range tests { + got := convertOperator(tt.input) + if got != tt.expected { + t.Errorf("convertOperator(%q) = %q, want %q", tt.input, got, tt.expected) + } + } +} + +func TestMetaFilter_Equals(t *testing.T) { + metas := MetaData{ + "author": {"Zhang San": {"doc1", "doc2"}, "Li Si": {"doc3"}}, + } + input := &MetaFilterInput{ + Conditions: []MetaCondition{{Operator: "=", Key: "author", Value: "Zhang San"}}, + } + result := MetaFilter(metas, input) + if len(result) != 2 { + t.Errorf("expected 2 docs, got %d: %v", len(result), result) + } +} + +func TestMetaFilter_NumberEquals(t *testing.T) { + metas := MetaData{ + "year": {"2024": {"doc1"}, "2025": {"doc2"}}, + } + input := &MetaFilterInput{ + Conditions: []MetaCondition{{Operator: "=", Key: "year", Value: float64(2024)}}, + } + result := MetaFilter(metas, input) + if len(result) != 1 || result[0] != "doc1" { + t.Errorf("expected [doc1], got %v", result) + } +} + +func TestMetaFilter_GreaterThan(t *testing.T) { + metas := MetaData{ + "score": {"85": {"doc1"}, "92": {"doc2"}, "70": {"doc3"}}, + } + input := &MetaFilterInput{ + Conditions: []MetaCondition{{Operator: ">", Key: "score", Value: "80"}}, + } + result 
:= MetaFilter(metas, input) + if len(result) != 2 { + t.Errorf("expected 2 docs, got %d: %v", len(result), result) + } +} + +func TestMetaFilter_GreaterThanOrEqual(t *testing.T) { + metas := MetaData{ + "score": {"85": {"doc1"}, "80": {"doc2"}, "70": {"doc3"}}, + } + input := &MetaFilterInput{ + Conditions: []MetaCondition{{Operator: "≥", Key: "score", Value: "80"}}, + } + result := MetaFilter(metas, input) + if len(result) != 2 { + t.Errorf("expected 2 docs, got %d: %v", len(result), result) + } +} + +func TestMetaFilter_DateComparison(t *testing.T) { + metas := MetaData{ + "date": {"2024-06-01": {"doc1"}, "2024-07-15": {"doc2"}, "2024-05-01": {"doc3"}}, + } + input := &MetaFilterInput{ + Conditions: []MetaCondition{{Operator: ">", Key: "date", Value: "2024-06-01"}}, + } + result := MetaFilter(metas, input) + if len(result) != 1 || result[0] != "doc2" { + t.Errorf("expected [doc2], got %v", result) + } +} + +func TestMetaFilter_DateVsNonDate(t *testing.T) { + metas := MetaData{ + "date_field": {"xyz": {"doc1"}, "2024-06-01": {"doc2"}}, + } + input := &MetaFilterInput{ + Conditions: []MetaCondition{{Operator: ">", Key: "date_field", Value: "2024-01-01"}}, + } + result := MetaFilter(metas, input) + if len(result) != 1 || result[0] != "doc2" { + t.Errorf("expected only date-match [doc2], got %v", result) + } +} + +func TestMetaFilter_Contains(t *testing.T) { + metas := MetaData{ + "title": {"report 2024": {"doc1"}, "summary": {"doc2"}}, + } + input := &MetaFilterInput{ + Conditions: []MetaCondition{{Operator: "contains", Key: "title", Value: "report"}}, + } + result := MetaFilter(metas, input) + if len(result) != 1 || result[0] != "doc1" { + t.Errorf("expected [doc1], got %v", result) + } +} + +func TestMetaFilter_NotContains(t *testing.T) { + metas := MetaData{ + "title": {"report 2024": {"doc1"}, "summary": {"doc2"}}, + } + input := &MetaFilterInput{ + Conditions: []MetaCondition{{Operator: "not contains", Key: "title", Value: "report"}}, + } + result := 
MetaFilter(metas, input) + if len(result) != 1 || result[0] != "doc2" { + t.Errorf("expected [doc2], got %v", result) + } +} + +func TestMetaFilter_StartWith(t *testing.T) { + metas := MetaData{ + "code": {"ABC-123": {"doc1"}, "XYZ-456": {"doc2"}}, + } + input := &MetaFilterInput{ + Conditions: []MetaCondition{{Operator: "start with", Key: "code", Value: "abc"}}, + } + result := MetaFilter(metas, input) + if len(result) != 1 || result[0] != "doc1" { + t.Errorf("expected [doc1], got %v", result) + } +} + +func TestMetaFilter_EndWith(t *testing.T) { + metas := MetaData{ + "code": {"ABC-123": {"doc1"}, "ABC-456": {"doc2"}}, + } + input := &MetaFilterInput{ + Conditions: []MetaCondition{{Operator: "end with", Key: "code", Value: "123"}}, + } + result := MetaFilter(metas, input) + if len(result) != 1 || result[0] != "doc1" { + t.Errorf("expected [doc1], got %v", result) + } +} + +func TestMetaFilter_Empty(t *testing.T) { + metas := MetaData{ + "field": {"": {"doc1"}, "value": {"doc2"}}, + } + input := &MetaFilterInput{ + Conditions: []MetaCondition{{Operator: "empty", Key: "field", Value: nil}}, + } + result := MetaFilter(metas, input) + if len(result) != 1 || result[0] != "doc1" { + t.Errorf("expected [doc1], got %v", result) + } +} + +func TestMetaFilter_NotEmpty(t *testing.T) { + metas := MetaData{ + "field": {"": {"doc1"}, "value": {"doc2"}}, + } + input := &MetaFilterInput{ + Conditions: []MetaCondition{{Operator: "not empty", Key: "field", Value: nil}}, + } + result := MetaFilter(metas, input) + if len(result) != 1 || result[0] != "doc2" { + t.Errorf("expected [doc2], got %v", result) + } +} + +func TestMetaFilter_AndLogic(t *testing.T) { + metas := MetaData{ + "author": {"Zhang San": {"doc1", "doc2"}, "Li Si": {"doc3"}}, + "year": {"2024": {"doc1"}, "2025": {"doc2", "doc3"}}, + } + input := &MetaFilterInput{ + Conditions: []MetaCondition{ + {Operator: "=", Key: "author", Value: "Zhang San"}, + {Operator: "=", Key: "year", Value: "2024"}, + }, + Logic: "and", + } 
+ result := MetaFilter(metas, input) + if len(result) != 1 || result[0] != "doc1" { + t.Errorf("expected [doc1], got %v", result) + } +} + +func TestMetaFilter_OrLogic(t *testing.T) { + metas := MetaData{ + "author": {"Zhang San": {"doc1"}, "Li Si": {"doc2"}}, + } + input := &MetaFilterInput{ + Conditions: []MetaCondition{ + {Operator: "=", Key: "author", Value: "Zhang San"}, + {Operator: "=", Key: "author", Value: "Li Si"}, + }, + Logic: "or", + } + result := MetaFilter(metas, input) + if len(result) != 2 { + t.Errorf("expected 2 docs, got %d: %v", len(result), result) + } +} + +func TestMetaFilter_KeyNotFound_And(t *testing.T) { + metas := MetaData{"author": {"Zhang San": {"doc1"}}} + input := &MetaFilterInput{ + Conditions: []MetaCondition{{Operator: "=", Key: "nonexistent", Value: "x"}}, + } + result := MetaFilter(metas, input) + if len(result) != 0 { + t.Errorf("expected empty, got %v", result) + } +} + +func TestMetaFilter_KeyNotFound_Or(t *testing.T) { + metas := MetaData{"author": {"Zhang San": {"doc1"}}} + input := &MetaFilterInput{ + Conditions: []MetaCondition{{Operator: "=", Key: "nonexistent", Value: "x"}}, + Logic: "or", + } + result := MetaFilter(metas, input) + if len(result) != 0 { + t.Errorf("expected empty, got %v", result) + } +} + +func TestMetaFilter_NilInput(t *testing.T) { + result := MetaFilter(nil, nil) + if result != nil { + t.Errorf("expected nil, got %v", result) + } +} + +func TestMetaFilter_EmptyInput(t *testing.T) { + metas := MetaData{"author": {"Zhang San": {"doc1"}}} + result := MetaFilter(metas, &MetaFilterInput{}) + if result != nil { + t.Errorf("expected nil, got %v", result) + } +}