feat: migrate meta_filter and convert_conditions to Go (#15648)

## Summary

Migrate the metadata filtering utilities `meta_filter` and
`convert_conditions` from `common/metadata_utils.py` to Go as pure
functions with zero external dependencies.

These functions are used by `dify/retrieval`, `openai/chat/completions`,
`document_api`, and `chunk_api` for filtering documents by metadata
conditions.

### Changes

- **New**: `internal/common/metadata_utils.go` — `ParseAndConvert()` (the Go
counterpart of `convert_conditions`) and `MetaFilter()` with full operator support
- **New**: `internal/common/metadata_utils_test.go` — 24 test functions
covering operator mapping, most operators, `and` / `or` logic, and edge cases

### Supported Operators

`=`, `≠`, `>`, `<`, `≥`, `≤`, `contains`, `not contains`, `in`, `not
in`, `start with`, `end with`, `empty`, `not empty`

### Design

- Numeric comparison via `strconv.ParseFloat`
- Date comparison via YYYY-MM-DD format detection
- Case-insensitive string comparison fallback
- `and` / `or` logic support for multiple conditions
- Zero external dependencies — pure functions only
This commit is contained in:
Jack
2026-06-04 20:14:27 +08:00
committed by GitHub
parent e627f5d8c5
commit 461c190c49
2 changed files with 657 additions and 0 deletions

View File

@@ -0,0 +1,319 @@
//
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package common
import (
	"math"
	"strconv"
	"strings"
)
// MetaCondition is one parsed metadata filter condition.
type MetaCondition struct {
	// Operator is the internal operator symbol or keyword: "=", "≠", ">", "<",
	// "≥", "≤", "contains", "not contains", "in", "not in", "start with",
	// "end with", "empty" or "not empty".
	Operator string
	// Key is the name of the metadata field the condition applies to.
	Key string
	// Value is the value the field is compared against.
	Value interface{}
}
type (
	// MetaValueDocs maps one value of a metadata field to the IDs of the
	// documents carrying that value.
	// Example: {"Zhang San": ["doc1", "doc2"], "Li Si": ["doc3"]}
	MetaValueDocs map[string][]string

	// MetaData maps a metadata field name to that field's value→documents index.
	// Example: {"author": {"Zhang San": ["doc1"]}, "year": {"2024": ["doc1", "doc2"]}}
	MetaData map[string]MetaValueDocs
)
// MetaFilterInput bundles the conditions of one filter request with the logic
// used to combine their results.
type MetaFilterInput struct {
	// Conditions are the individual conditions to evaluate.
	Conditions []MetaCondition
	// Logic is "and" or "or".
	Logic string
}
// operatorMapping lists the operator spellings that are rewritten to an
// internal symbol. Operators absent from this table are used unchanged.
var operatorMapping = map[string]string{
	// Equality and inequality.
	"is":     "=",
	"not is": "≠",
	"!=":     "≠",

	// Inclusive ordering.
	">=": "≥",
	"<=": "≤",
}
// ParseAndConvert turns a raw metadata condition object into a MetaFilterInput.
//
// It reads two keys from metadataCondition:
//   - "conditions": a list of objects with "name", "comparison_operator" and
//     "value" entries. Items that are not objects, or whose "name" is missing,
//     empty or not a string, are skipped. The operator is passed through
//     convertOperator; the value is stored as-is.
//   - "logic": used verbatim when it is a non-empty string, otherwise "and".
//
// It returns nil when metadataCondition is nil, when "conditions" is missing,
// empty or not a list, or when no usable condition remains.
func ParseAndConvert(metadataCondition map[string]interface{}) *MetaFilterInput {
	if metadataCondition == nil {
		return nil
	}

	items, isList := metadataCondition["conditions"].([]interface{})
	if !isList || len(items) == 0 {
		return nil
	}

	var parsed []MetaCondition
	for _, item := range items {
		fields, isObject := item.(map[string]interface{})
		if !isObject {
			continue
		}
		key, _ := fields["name"].(string)
		if key == "" {
			continue
		}
		rawOp, _ := fields["comparison_operator"].(string)
		parsed = append(parsed, MetaCondition{
			Operator: convertOperator(rawOp),
			Key:      key,
			Value:    fields["value"],
		})
	}
	if len(parsed) == 0 {
		return nil
	}

	combine := "and"
	if given, _ := metadataCondition["logic"].(string); given != "" {
		combine = given
	}
	return &MetaFilterInput{Conditions: parsed, Logic: combine}
}
// convertOperator returns the internal symbol registered for op in
// operatorMapping, or op itself when no entry exists.
func convertOperator(op string) string {
	symbol, known := operatorMapping[op]
	if !known {
		return op
	}
	return symbol
}
// MetaFilter evaluates input.Conditions against metas and returns the IDs of
// the matching documents, without duplicates and in no particular order.
//
// An empty input.Logic means "and"; any value other than "and" combines the
// per-condition results as a union. With "and" logic, a condition whose key is
// absent from metas, or an intersection that becomes empty, ends the
// evaluation with an empty result. With union logic, conditions on absent keys
// are skipped.
//
// It returns nil when input is nil or has no conditions, and a non-nil
// (possibly empty) slice otherwise.
func MetaFilter(metas MetaData, input *MetaFilterInput) []string {
	if input == nil || len(input.Conditions) == 0 {
		return nil
	}
	useAnd := input.Logic == "" || input.Logic == "and"

	// selected stays nil until the first condition has been evaluated, which
	// distinguishes "nothing evaluated yet" from "evaluated, nothing matched".
	var selected map[string]struct{}
	for _, cond := range input.Conditions {
		valueDocs, found := metas[cond.Key]
		if !found {
			if useAnd {
				return []string{}
			}
			continue
		}

		hits := filterOut(valueDocs, cond.Operator, cond.Value)
		switch {
		case selected == nil:
			selected = make(map[string]struct{}, len(hits))
			for _, id := range hits {
				selected[id] = struct{}{}
			}
		case useAnd:
			kept := make(map[string]struct{})
			for _, id := range hits {
				if _, already := selected[id]; already {
					kept[id] = struct{}{}
				}
			}
			if len(kept) == 0 {
				return []string{}
			}
			selected = kept
		default:
			for _, id := range hits {
				selected[id] = struct{}{}
			}
		}
	}

	result := make([]string, 0, len(selected))
	for id := range selected {
		result = append(result, id)
	}
	return result
}
// filterOut collects the document IDs of every entry in v2docs whose value
// satisfies operator and value according to matchValue. IDs are not
// de-duplicated and the result is nil when nothing matches.
func filterOut(v2docs MetaValueDocs, operator string, value interface{}) []string {
	var matched []string
	for fieldValue, docs := range v2docs {
		if !matchValue(fieldValue, operator, value) {
			continue
		}
		matched = append(matched, docs...)
	}
	return matched
}
// matchValue reports whether a single stored metadata value (input) satisfies
// operator against the filter value.
//
//   - "empty" / "not empty" look only at input; value is ignored.
//   - "in" / "not in" compare input exactly (case-sensitively) with the string
//     form of each element of value, which must be a []interface{} or a
//     []string. Any other value behaves like an empty list, so "in" is false
//     and "not in" is true.
//   - "contains", "not contains", "start with" and "end with" match the string
//     form of value case-insensitively.
//   - Every other operator is delegated to compareValues.
func matchValue(input string, operator string, value interface{}) bool {
	switch operator {
	case "empty":
		return input == ""
	case "not empty":
		return input != ""
	case "in":
		return valueInList(input, value)
	case "not in":
		return !valueInList(input, value)
	}

	valStr := toString(value)
	switch operator {
	case "contains":
		// Fold case so substring matching agrees with the prefix/suffix
		// operators below and with the string fallback of "=".
		return strings.Contains(strings.ToLower(input), strings.ToLower(valStr))
	case "not contains":
		return !strings.Contains(strings.ToLower(input), strings.ToLower(valStr))
	case "start with":
		return strings.HasPrefix(strings.ToLower(input), strings.ToLower(valStr))
	case "end with":
		return strings.HasSuffix(strings.ToLower(input), strings.ToLower(valStr))
	}

	// Comparison operators: =, ≠, >, <, ≥, ≤
	return compareValues(input, valStr, operator)
}

// valueInList reports whether input equals the string form of any element of
// list. list may be a []interface{} or a []string; any other type never matches.
func valueInList(input string, list interface{}) bool {
	switch items := list.(type) {
	case []interface{}:
		for _, item := range items {
			if toString(item) == input {
				return true
			}
		}
	case []string:
		for _, item := range items {
			if item == input {
				return true
			}
		}
	}
	return false
}
// compareValues evaluates a comparison operator ("=", "≠", ">", "<", "≥", "≤")
// between a stored metadata value a and a filter value b.
//
// The comparison mode is chosen in this order:
//  1. b has the YYYY-MM-DD shape: a must have it too (otherwise the result is
//     false) and the two are compared as plain strings.
//  2. Both a and b parse as finite numbers: numeric comparison.
//  3. Otherwise: case-insensitive string comparison.
func compareValues(a, b, operator string) bool {
	// A date filter only ever matches date-shaped data.
	if isDate(b) {
		if !isDate(a) {
			return false
		}
		return compareString(a, b, operator)
	}

	af, okA := parseFiniteFloat(a)
	bf, okB := parseFiniteFloat(b)
	if okA && okB {
		return compareFloat(af, bf, operator)
	}

	return compareString(strings.ToLower(a), strings.ToLower(b), operator)
}

// parseFiniteFloat parses s as a float64 and reports whether the result is a
// finite number.
//
// strconv.ParseFloat also accepts the words "nan", "inf" and "infinity" in any
// letter case. Treating those as numbers would make "nan" = "nan" false and
// "inf" = "infinity" true, so they are rejected here and end up in the string
// comparison instead.
func parseFiniteFloat(s string) (float64, bool) {
	f, err := strconv.ParseFloat(s, 64)
	if err != nil || math.IsNaN(f) || math.IsInf(f, 0) {
		return 0, false
	}
	return f, true
}
// compareFloat applies operator ("=", "≠", ">", "<", "≥", "≤") to a and b.
// Any other operator yields false.
func compareFloat(a, b float64, operator string) bool {
	matched := false
	switch operator {
	case "=", "≠":
		// "≠" is exactly the negation of "=".
		matched = (a == b) == (operator == "=")
	case ">":
		matched = a > b
	case "≥":
		matched = a >= b
	case "<":
		matched = a < b
	case "≤":
		matched = a <= b
	}
	return matched
}
// compareString applies operator ("=", "≠", ">", "<", "≥", "≤") to a and b
// using Go's byte-wise string ordering. Any other operator yields false.
func compareString(a, b string, operator string) bool {
	// order is negative, zero or positive as a sorts before, equal to or after b.
	order := 0
	if a < b {
		order = -1
	} else if a > b {
		order = 1
	}

	switch operator {
	case "=":
		return order == 0
	case "≠":
		return order != 0
	case ">":
		return order > 0
	case "<":
		return order < 0
	case "≥":
		return order >= 0
	case "≤":
		return order <= 0
	default:
		return false
	}
}
// isDate reports whether s has the shape YYYY-MM-DD: ten bytes, ASCII digits
// everywhere except for '-' at positions 4 and 7. Only the shape is checked;
// the month and day ranges are not validated.
func isDate(s string) bool {
	// 'd' marks a digit position, '-' a literal dash.
	const shape = "dddd-dd-dd"
	if len(s) != len(shape) {
		return false
	}
	for i := 0; i < len(shape); i++ {
		c := s[i]
		if shape[i] == '-' {
			if c != '-' {
				return false
			}
			continue
		}
		if c < '0' || c > '9' {
			return false
		}
	}
	return true
}
// toString renders a filter value as the string used for comparisons.
//
//   - nil becomes "".
//   - Strings are returned unchanged.
//   - Booleans become "true" or "false".
//   - Floats are written in plain decimal notation with the fewest digits that
//     round-trip, so float64(2024) becomes "2024".
//   - Signed and unsigned integers are written in base 10.
//   - Values with a String() string method (for example json.Number) use it.
//   - Every other type (slices, maps, structs, ...) becomes "".
func toString(v interface{}) string {
	switch s := v.(type) {
	case nil:
		return ""
	case string:
		return s
	case bool:
		return strconv.FormatBool(s)
	case float64:
		return strconv.FormatFloat(s, 'f', -1, 64)
	case float32:
		return strconv.FormatFloat(float64(s), 'f', -1, 32)
	case int:
		return strconv.Itoa(s)
	case int8:
		return strconv.FormatInt(int64(s), 10)
	case int16:
		return strconv.FormatInt(int64(s), 10)
	case int32:
		return strconv.FormatInt(int64(s), 10)
	case int64:
		return strconv.FormatInt(s, 10)
	case uint:
		return strconv.FormatUint(uint64(s), 10)
	case uint8:
		return strconv.FormatUint(uint64(s), 10)
	case uint16:
		return strconv.FormatUint(uint64(s), 10)
	case uint32:
		return strconv.FormatUint(uint64(s), 10)
	case uint64:
		return strconv.FormatUint(s, 10)
	case interface{ String() string }:
		return s.String()
	default:
		return ""
	}
}

View File

@@ -0,0 +1,338 @@
//
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package common
import (
"testing"
)
// TestParseAndConvert_OperatorMapping checks that textual operators are
// rewritten to their symbols and that the logic defaults to "and".
func TestParseAndConvert_OperatorMapping(t *testing.T) {
	parsed := ParseAndConvert(map[string]interface{}{
		"conditions": []interface{}{
			map[string]interface{}{"name": "author", "comparison_operator": "is", "value": "Zhang San"},
			map[string]interface{}{"name": "date", "comparison_operator": ">=", "value": "2024-01-01"},
		},
	})
	if parsed == nil {
		t.Fatal("expected non-nil result")
	}
	if parsed.Logic != "and" {
		t.Errorf("expected logic 'and', got '%s'", parsed.Logic)
	}
	if n := len(parsed.Conditions); n != 2 {
		t.Fatalf("expected 2 conditions, got %d", n)
	}

	first, second := parsed.Conditions[0], parsed.Conditions[1]
	if first.Operator != "=" {
		t.Errorf("expected '=', got '%s'", first.Operator)
	}
	if second.Operator != "≥" {
		t.Errorf("expected '≥', got '%s'", second.Operator)
	}
}
// TestParseAndConvert_WithLogic checks that an explicit "logic" entry is
// carried through to the result.
func TestParseAndConvert_WithLogic(t *testing.T) {
	condition := map[string]interface{}{"name": "author", "comparison_operator": "is", "value": "Zhang San"}
	parsed := ParseAndConvert(map[string]interface{}{
		"logic":      "or",
		"conditions": []interface{}{condition},
	})
	if parsed == nil {
		t.Fatal("expected non-nil result")
	}
	if got := parsed.Logic; got != "or" {
		t.Errorf("expected logic 'or', got '%s'", got)
	}
}
// TestParseAndConvert_NilInput checks that a nil condition map yields nil.
func TestParseAndConvert_NilInput(t *testing.T) {
	if got := ParseAndConvert(nil); got != nil {
		t.Errorf("expected nil, got %v", got)
	}
}
// TestParseAndConvert_EmptyConditions checks that nil is returned both when
// the "conditions" key is absent and when it holds an empty list.
func TestParseAndConvert_EmptyConditions(t *testing.T) {
	inputs := []map[string]interface{}{
		{},                              // no "conditions" key at all
		{"conditions": []interface{}{}}, // key present, list empty
	}
	for _, in := range inputs {
		if got := ParseAndConvert(in); got != nil {
			t.Errorf("ParseAndConvert(%v): expected nil, got %v", in, got)
		}
	}
}
// TestParseAndConvert_NoName checks that a condition without a "name" is
// dropped, leaving nothing to return.
func TestParseAndConvert_NoName(t *testing.T) {
	unnamed := map[string]interface{}{"comparison_operator": "is", "value": "x"}
	got := ParseAndConvert(map[string]interface{}{
		"conditions": []interface{}{unnamed},
	})
	if got != nil {
		t.Errorf("expected nil for empty name, got %v", got)
	}
}
// TestConvertOperator checks the mapped spellings and that operators without
// a mapping are returned unchanged.
func TestConvertOperator(t *testing.T) {
	// Each pair is {input, expected}.
	cases := [][2]string{
		{"is", "="},
		{"not is", "≠"},
		{">=", "≥"},
		{"<=", "≤"},
		{"!=", "≠"},
		{"contains", "contains"},
		{"start with", "start with"},
	}
	for _, pair := range cases {
		in, want := pair[0], pair[1]
		if got := convertOperator(in); got != want {
			t.Errorf("convertOperator(%q) = %q, want %q", in, got, want)
		}
	}
}
// TestMetaFilter_Equals checks that "=" returns exactly the documents stored
// under the matching value, in any order.
func TestMetaFilter_Equals(t *testing.T) {
	metas := MetaData{
		"author": {"Zhang San": {"doc1", "doc2"}, "Li Si": {"doc3"}},
	}
	input := &MetaFilterInput{
		Conditions: []MetaCondition{{Operator: "=", Key: "author", Value: "Zhang San"}},
	}
	result := MetaFilter(metas, input)

	// Check the IDs themselves, not only how many came back.
	got := make(map[string]bool, len(result))
	for _, id := range result {
		got[id] = true
	}
	if len(result) != 2 || !got["doc1"] || !got["doc2"] {
		t.Errorf("expected [doc1 doc2] in any order, got %v", result)
	}
}
// TestMetaFilter_NumberEquals checks that a float64 filter value matches the
// numerically equal stored string.
func TestMetaFilter_NumberEquals(t *testing.T) {
	metas := MetaData{"year": {"2024": {"doc1"}, "2025": {"doc2"}}}
	cond := MetaCondition{Operator: "=", Key: "year", Value: float64(2024)}

	got := MetaFilter(metas, &MetaFilterInput{Conditions: []MetaCondition{cond}})
	if !(len(got) == 1 && got[0] == "doc1") {
		t.Errorf("expected [doc1], got %v", got)
	}
}
// TestMetaFilter_GreaterThan checks that ">" compares numerically and returns
// exactly the documents whose score exceeds the threshold.
func TestMetaFilter_GreaterThan(t *testing.T) {
	metas := MetaData{
		"score": {"85": {"doc1"}, "92": {"doc2"}, "70": {"doc3"}},
	}
	input := &MetaFilterInput{
		Conditions: []MetaCondition{{Operator: ">", Key: "score", Value: "80"}},
	}
	result := MetaFilter(metas, input)

	// Check the IDs themselves, not only how many came back.
	got := make(map[string]bool, len(result))
	for _, id := range result {
		got[id] = true
	}
	if len(result) != 2 || !got["doc1"] || !got["doc2"] {
		t.Errorf("expected [doc1 doc2] in any order, got %v", result)
	}
}
// TestMetaFilter_GreaterThanOrEqual checks that "≥" includes the boundary
// value and returns exactly the expected documents.
func TestMetaFilter_GreaterThanOrEqual(t *testing.T) {
	metas := MetaData{
		"score": {"85": {"doc1"}, "80": {"doc2"}, "70": {"doc3"}},
	}
	input := &MetaFilterInput{
		Conditions: []MetaCondition{{Operator: "≥", Key: "score", Value: "80"}},
	}
	result := MetaFilter(metas, input)

	// Check the IDs themselves, not only how many came back.
	got := make(map[string]bool, len(result))
	for _, id := range result {
		got[id] = true
	}
	if len(result) != 2 || !got["doc1"] || !got["doc2"] {
		t.Errorf("expected [doc1 doc2] in any order, got %v", result)
	}
}
func TestMetaFilter_DateComparison(t *testing.T) {
metas := MetaData{
"date": {"2024-06-01": {"doc1"}, "2024-07-15": {"doc2"}, "2024-05-01": {"doc3"}},
}
input := &MetaFilterInput{
Conditions: []MetaCondition{{Operator: ">", Key: "date", Value: "2024-06-01"}},
}
result := MetaFilter(metas, input)
if len(result) != 1 || result[0] != "doc2" {
t.Errorf("expected [doc2], got %v", result)
}
}
func TestMetaFilter_DateVsNonDate(t *testing.T) {
metas := MetaData{
"date_field": {"xyz": {"doc1"}, "2024-06-01": {"doc2"}},
}
input := &MetaFilterInput{
Conditions: []MetaCondition{{Operator: ">", Key: "date_field", Value: "2024-01-01"}},
}
result := MetaFilter(metas, input)
if len(result) != 1 || result[0] != "doc2" {
t.Errorf("expected only date-match [doc2], got %v", result)
}
}
func TestMetaFilter_Contains(t *testing.T) {
metas := MetaData{
"title": {"report 2024": {"doc1"}, "summary": {"doc2"}},
}
input := &MetaFilterInput{
Conditions: []MetaCondition{{Operator: "contains", Key: "title", Value: "report"}},
}
result := MetaFilter(metas, input)
if len(result) != 1 || result[0] != "doc1" {
t.Errorf("expected [doc1], got %v", result)
}
}
func TestMetaFilter_NotContains(t *testing.T) {
metas := MetaData{
"title": {"report 2024": {"doc1"}, "summary": {"doc2"}},
}
input := &MetaFilterInput{
Conditions: []MetaCondition{{Operator: "not contains", Key: "title", Value: "report"}},
}
result := MetaFilter(metas, input)
if len(result) != 1 || result[0] != "doc2" {
t.Errorf("expected [doc2], got %v", result)
}
}
func TestMetaFilter_StartWith(t *testing.T) {
metas := MetaData{
"code": {"ABC-123": {"doc1"}, "XYZ-456": {"doc2"}},
}
input := &MetaFilterInput{
Conditions: []MetaCondition{{Operator: "start with", Key: "code", Value: "abc"}},
}
result := MetaFilter(metas, input)
if len(result) != 1 || result[0] != "doc1" {
t.Errorf("expected [doc1], got %v", result)
}
}
func TestMetaFilter_EndWith(t *testing.T) {
metas := MetaData{
"code": {"ABC-123": {"doc1"}, "ABC-456": {"doc2"}},
}
input := &MetaFilterInput{
Conditions: []MetaCondition{{Operator: "end with", Key: "code", Value: "123"}},
}
result := MetaFilter(metas, input)
if len(result) != 1 || result[0] != "doc1" {
t.Errorf("expected [doc1], got %v", result)
}
}
func TestMetaFilter_Empty(t *testing.T) {
metas := MetaData{
"field": {"": {"doc1"}, "value": {"doc2"}},
}
input := &MetaFilterInput{
Conditions: []MetaCondition{{Operator: "empty", Key: "field", Value: nil}},
}
result := MetaFilter(metas, input)
if len(result) != 1 || result[0] != "doc1" {
t.Errorf("expected [doc1], got %v", result)
}
}
func TestMetaFilter_NotEmpty(t *testing.T) {
metas := MetaData{
"field": {"": {"doc1"}, "value": {"doc2"}},
}
input := &MetaFilterInput{
Conditions: []MetaCondition{{Operator: "not empty", Key: "field", Value: nil}},
}
result := MetaFilter(metas, input)
if len(result) != 1 || result[0] != "doc2" {
t.Errorf("expected [doc2], got %v", result)
}
}
// TestMetaFilter_AndLogic checks that "and" keeps only the documents matched
// by every condition.
func TestMetaFilter_AndLogic(t *testing.T) {
	metas := MetaData{
		"author": {"Zhang San": {"doc1", "doc2"}, "Li Si": {"doc3"}},
		"year":   {"2024": {"doc1"}, "2025": {"doc2", "doc3"}},
	}
	byAuthor := MetaCondition{Operator: "=", Key: "author", Value: "Zhang San"}
	byYear := MetaCondition{Operator: "=", Key: "year", Value: "2024"}

	got := MetaFilter(metas, &MetaFilterInput{
		Logic:      "and",
		Conditions: []MetaCondition{byAuthor, byYear},
	})
	if !(len(got) == 1 && got[0] == "doc1") {
		t.Errorf("expected [doc1], got %v", got)
	}
}
// TestMetaFilter_OrLogic checks that "or" returns the union of the documents
// matched by each condition.
func TestMetaFilter_OrLogic(t *testing.T) {
	metas := MetaData{
		"author": {"Zhang San": {"doc1"}, "Li Si": {"doc2"}},
	}
	input := &MetaFilterInput{
		Conditions: []MetaCondition{
			{Operator: "=", Key: "author", Value: "Zhang San"},
			{Operator: "=", Key: "author", Value: "Li Si"},
		},
		Logic: "or",
	}
	result := MetaFilter(metas, input)

	// Check the IDs themselves, not only how many came back.
	got := make(map[string]bool, len(result))
	for _, id := range result {
		got[id] = true
	}
	if len(result) != 2 || !got["doc1"] || !got["doc2"] {
		t.Errorf("expected [doc1 doc2] in any order, got %v", result)
	}
}
// TestMetaFilter_KeyNotFound_And checks that, with the default logic, a
// condition on an unknown key yields no documents.
func TestMetaFilter_KeyNotFound_And(t *testing.T) {
	metas := MetaData{"author": {"Zhang San": {"doc1"}}}
	cond := MetaCondition{Operator: "=", Key: "nonexistent", Value: "x"}

	got := MetaFilter(metas, &MetaFilterInput{Conditions: []MetaCondition{cond}})
	if len(got) > 0 {
		t.Errorf("expected empty, got %v", got)
	}
}
// TestMetaFilter_KeyNotFound_Or checks that, with "or" logic, a lone condition
// on an unknown key yields no documents.
func TestMetaFilter_KeyNotFound_Or(t *testing.T) {
	metas := MetaData{"author": {"Zhang San": {"doc1"}}}
	cond := MetaCondition{Operator: "=", Key: "nonexistent", Value: "x"}

	got := MetaFilter(metas, &MetaFilterInput{
		Logic:      "or",
		Conditions: []MetaCondition{cond},
	})
	if len(got) > 0 {
		t.Errorf("expected empty, got %v", got)
	}
}
// TestMetaFilter_NilInput checks that a nil filter input yields a nil result.
func TestMetaFilter_NilInput(t *testing.T) {
	if got := MetaFilter(nil, nil); got != nil {
		t.Errorf("expected nil, got %v", got)
	}
}
func TestMetaFilter_EmptyInput(t *testing.T) {
metas := MetaData{"author": {"Zhang San": {"doc1"}}}
result := MetaFilter(metas, &MetaFilterInput{})
if result != nil {
t.Errorf("expected nil, got %v", result)
}
}