mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-05 10:58:34 +08:00
## Summary
Both ES and Infinity engines now respect `SearchRequest.SelectFields`,
allowing callers to specify output columns for KG
entity/relation/community queries instead of the default chunk columns.
### Changes
- **`internal/engine/elasticsearch/chunk.go`**: Added `SelectFields`
override after default `outputColumns`
- **`internal/engine/infinity/chunk.go`**: Added `SelectFields` override
after default `outputColumns`
- **`internal/engine/elasticsearch/kg_test.go`** (new): Integration test
(skipped unless `ES_TEST=1`)
### Usage
```go
result, err := docEngine.Search(ctx, &types.SearchRequest{
KbIDs: kbIDs,
SelectFields: []string{entity_kwd, entity_type_kwd, rank_flt, n_hop_with_weight},
Filter: map[string]interface{}{knowledge_graph_kwd: entity},
})
```
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2056 lines
57 KiB
Go
2056 lines
57 KiB
Go
//
|
|
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
|
|
package elasticsearch
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"slices"
|
|
"sort"
|
|
"strings"
|
|
|
|
"github.com/elastic/go-elasticsearch/v8/esapi"
|
|
"ragflow/internal/common"
|
|
"ragflow/internal/engine/types"
|
|
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
// CreateChunkStore creates an index
|
|
func (e *elasticsearchEngine) CreateChunkStore(ctx context.Context, baseName, datasetID string, vectorSize int, parserID string) error {
|
|
if baseName == "" {
|
|
return fmt.Errorf("index name cannot be empty")
|
|
}
|
|
|
|
// Check if index already exists (matches Python create_idx behavior)
|
|
exists, err := e.indexExists(ctx, baseName)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to check index existence: %w", err)
|
|
}
|
|
if exists {
|
|
common.Info("Index already exists, skipping creation", zap.String("index_name", baseName))
|
|
return nil
|
|
}
|
|
|
|
// Load mapping based on index type
|
|
var mapping map[string]interface{}
|
|
if datasetID == "skill" {
|
|
// Load skill-specific mapping
|
|
skillMapping, err := loadSkillMapping()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to load skill mapping: %w", err)
|
|
}
|
|
mapping = skillMapping
|
|
} else {
|
|
// Default mapping for dataset
|
|
mapping = map[string]interface{}{
|
|
"settings": map[string]interface{}{
|
|
"number_of_shards": 1,
|
|
"number_of_replicas": 0,
|
|
},
|
|
}
|
|
}
|
|
|
|
// Prepare request body
|
|
var body io.Reader
|
|
if mapping != nil {
|
|
data, err := json.Marshal(mapping)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to marshal mapping: %w", err)
|
|
}
|
|
body = bytes.NewReader(data)
|
|
}
|
|
|
|
// Create index
|
|
req := esapi.IndicesCreateRequest{
|
|
Index: baseName,
|
|
Body: body,
|
|
}
|
|
|
|
res, err := req.Do(ctx, e.client)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create index: %w", err)
|
|
}
|
|
defer res.Body.Close()
|
|
|
|
if res.IsError() {
|
|
bodyBytes, _ := io.ReadAll(res.Body)
|
|
reason := extractErrorReason(bodyBytes)
|
|
if reason != "" {
|
|
return fmt.Errorf("elasticsearch error: %s", reason)
|
|
}
|
|
return fmt.Errorf("elasticsearch returned error: %s, body: %s", res.Status(), string(bodyBytes))
|
|
}
|
|
|
|
// Parse response
|
|
var result map[string]interface{}
|
|
if err := json.NewDecoder(res.Body).Decode(&result); err != nil {
|
|
return fmt.Errorf("failed to parse response: %w", err)
|
|
}
|
|
|
|
acknowledged, ok := result["acknowledged"].(bool)
|
|
if !ok || !acknowledged {
|
|
return fmt.Errorf("index creation not acknowledged")
|
|
}
|
|
|
|
common.Info("Successfully created Elasticsearch index", zap.String("index_name", baseName))
|
|
return nil
|
|
}
|
|
|
|
// InsertChunks inserts chunks into a chunk index
|
|
// If a chunk with the same id + doc_id + kb_id already exists, it will be updated with the new value
|
|
func (e *elasticsearchEngine) InsertChunks(ctx context.Context, chunks []map[string]interface{}, baseName string, datasetID string) ([]string, error) {
|
|
common.Info("ElasticsearchConnection.InsertChunks called", zap.String("index_name", baseName), zap.Int("chunkCount", len(chunks)))
|
|
|
|
if len(chunks) == 0 {
|
|
return []string{}, nil
|
|
}
|
|
|
|
if baseName == "" {
|
|
return nil, fmt.Errorf("index name cannot be empty")
|
|
}
|
|
|
|
// Build bulk request body with index operations (upsert behavior: insert if not exists, update if exists)
|
|
var buf bytes.Buffer
|
|
for _, doc := range chunks {
|
|
docID, _ := doc["doc_id"].(string)
|
|
chunkID, _ := doc["id"].(string)
|
|
if docID == "" || chunkID == "" {
|
|
common.Warn("Skipping chunk without doc_id or id")
|
|
continue
|
|
}
|
|
|
|
compositeID := fmt.Sprintf("%s_%s_%s", docID, datasetID, chunkID)
|
|
|
|
// Action line: use json.Marshal to properly escape string values
|
|
action, err := json.Marshal(map[string]interface{}{
|
|
"index": map[string]interface{}{
|
|
"_index": baseName,
|
|
"_id": compositeID,
|
|
},
|
|
})
|
|
if err != nil {
|
|
common.Error("Failed to marshal bulk action", err)
|
|
return nil, fmt.Errorf("failed to marshal bulk action: %w", err)
|
|
}
|
|
buf.Write(action)
|
|
buf.WriteByte('\n')
|
|
|
|
// Document line: work with a copy to avoid mutating the original
|
|
docCopy := copyFields(doc)
|
|
docCopy["kb_id"] = datasetID
|
|
if err := json.NewEncoder(&buf).Encode(docCopy); err != nil {
|
|
return nil, fmt.Errorf("failed to encode document: %w", err)
|
|
}
|
|
}
|
|
|
|
// Execute bulk request with refresh="wait_for" (matches Python behavior)
|
|
req := esapi.BulkRequest{
|
|
Body: bytes.NewReader(buf.Bytes()),
|
|
Refresh: "wait_for",
|
|
}
|
|
|
|
res, err := req.Do(ctx, e.client)
|
|
if err != nil {
|
|
common.Error("Failed to execute bulk request", err)
|
|
return nil, fmt.Errorf("failed to execute bulk request: %w", err)
|
|
}
|
|
defer res.Body.Close()
|
|
|
|
if res.IsError() {
|
|
bodyBytes, _ := io.ReadAll(res.Body)
|
|
common.Sugar.Errorw("Elasticsearch bulk request returned error", "status", res.Status(), "body", string(bodyBytes))
|
|
return nil, fmt.Errorf("elasticsearch bulk request returned error: %s, body: %s", res.Status(), string(bodyBytes))
|
|
}
|
|
|
|
// Parse bulk response
|
|
var bulkResponse map[string]interface{}
|
|
if err := json.NewDecoder(res.Body).Decode(&bulkResponse); err != nil {
|
|
common.Error("Failed to parse bulk response", err)
|
|
return nil, fmt.Errorf("failed to parse bulk response: %w", err)
|
|
}
|
|
|
|
// Check for errors in bulk response
|
|
if errors, ok := bulkResponse["errors"].(bool); ok && errors {
|
|
common.Warn("Bulk request had some errors")
|
|
// Could iterate through items to find specific errors if needed
|
|
}
|
|
|
|
common.Info("ElasticsearchConnection.InsertChunks result", zap.String("index_name", baseName), zap.Int("count", len(chunks)))
|
|
return []string{}, nil
|
|
}
|
|
|
|
// UpdateChunks updates chunks by condition
|
|
func (e *elasticsearchEngine) UpdateChunks(ctx context.Context, condition map[string]interface{}, newValue map[string]interface{}, baseName string, datasetID string) error {
|
|
fullIndexName := baseName
|
|
common.Info("ElasticsearchConnection.UpdateChunks called", zap.String("index_name", fullIndexName), zap.Any("condition", condition), zap.Any("new_value", newValue))
|
|
|
|
if fullIndexName == "" {
|
|
return fmt.Errorf("index name cannot be empty")
|
|
}
|
|
|
|
// Check if index exists
|
|
exists, err := e.indexExists(ctx, fullIndexName)
|
|
if err != nil {
|
|
common.Error("Failed to check index existence", err)
|
|
return fmt.Errorf("failed to check index existence: %w", err)
|
|
}
|
|
if !exists {
|
|
return fmt.Errorf("index '%s' does not exist", fullIndexName)
|
|
}
|
|
|
|
// Add kb_id to condition
|
|
condition["kb_id"] = datasetID
|
|
|
|
// Case 1: Single document update (when condition["id"] is a string)
|
|
if chunkID, ok := condition["id"].(string); ok {
|
|
return e.updateSingleChunk(ctx, fullIndexName, chunkID, newValue)
|
|
}
|
|
|
|
// Case 2: Multi-document update via UpdateByQuery
|
|
return e.updateChunksByQuery(ctx, fullIndexName, condition, newValue)
|
|
}
|
|
|
|
// updateSingleChunk handles single document update (matches Python lines 350-398)
|
|
func (e *elasticsearchEngine) updateSingleChunk(ctx context.Context, indexName, chunkID string, newValue map[string]interface{}) error {
|
|
common.Debug("ElasticsearchConnection.updateSingleChunk called", zap.String("indexName", indexName), zap.String("chunkID", chunkID))
|
|
|
|
// First find the document by id field to get the actual _id
|
|
searchReq := map[string]interface{}{
|
|
"query": map[string]interface{}{
|
|
"term": map[string]interface{}{"id": chunkID},
|
|
},
|
|
}
|
|
|
|
body, err := json.Marshal(searchReq)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to marshal search request: %w", err)
|
|
}
|
|
|
|
res, err := e.client.Search(
|
|
e.client.Search.WithContext(ctx),
|
|
e.client.Search.WithIndex(indexName),
|
|
e.client.Search.WithBody(bytes.NewReader(body)),
|
|
)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to search for chunk: %w", err)
|
|
}
|
|
defer res.Body.Close()
|
|
|
|
if res.IsError() {
|
|
return fmt.Errorf("failed to search for chunk: %s", res.Status())
|
|
}
|
|
|
|
var searchResult map[string]interface{}
|
|
if err := json.NewDecoder(res.Body).Decode(&searchResult); err != nil {
|
|
return fmt.Errorf("failed to parse search response: %w", err)
|
|
}
|
|
|
|
hits, ok := searchResult["hits"].(map[string]interface{})
|
|
if !ok {
|
|
return fmt.Errorf("elasticsearch update error: 404 Not Found")
|
|
}
|
|
|
|
hitList, ok := hits["hits"].([]interface{})
|
|
if !ok || len(hitList) == 0 {
|
|
return fmt.Errorf("elasticsearch update error: 404 Not Found")
|
|
}
|
|
|
|
firstHit, ok := hitList[0].(map[string]interface{})
|
|
if !ok {
|
|
return fmt.Errorf("elasticsearch update error: 404 Not Found")
|
|
}
|
|
|
|
actualID, ok := firstHit["_id"].(string)
|
|
if !ok {
|
|
return fmt.Errorf("elasticsearch update error: 404 Not Found")
|
|
}
|
|
|
|
doc := copyFields(newValue)
|
|
delete(doc, "id")
|
|
|
|
removeValue, _ := doc["remove"]
|
|
delete(doc, "remove")
|
|
removeField, _ := removeValue.(string)
|
|
removeDict, _ := removeValue.(map[string]interface{})
|
|
|
|
// Remove *_feas fields
|
|
var feasFields []string
|
|
for k := range doc {
|
|
if strings.HasSuffix(k, "feas") {
|
|
feasFields = append(feasFields, k)
|
|
}
|
|
}
|
|
for _, k := range feasFields {
|
|
scriptBody := map[string]interface{}{
|
|
"script": map[string]interface{}{
|
|
"source": fmt.Sprintf("ctx._source.remove(\"%s\");", k),
|
|
},
|
|
}
|
|
body, _ := json.Marshal(scriptBody)
|
|
req := esapi.UpdateRequest{
|
|
Index: indexName,
|
|
DocumentID: actualID,
|
|
Body: bytes.NewReader(body),
|
|
}
|
|
res, err := req.Do(ctx, e.client)
|
|
if err != nil {
|
|
common.Warn("Failed to remove feas field", zap.String("field", k), zap.Error(err))
|
|
} else {
|
|
res.Body.Close()
|
|
}
|
|
}
|
|
|
|
// Remove specific field if removeField is set
|
|
if removeField != "" {
|
|
scriptBody := map[string]interface{}{
|
|
"script": map[string]interface{}{
|
|
"source": fmt.Sprintf("ctx._source.remove('%s');", removeField),
|
|
},
|
|
}
|
|
body, _ := json.Marshal(scriptBody)
|
|
req := esapi.UpdateRequest{
|
|
Index: indexName,
|
|
DocumentID: actualID,
|
|
Body: bytes.NewReader(body),
|
|
}
|
|
res, err := req.Do(ctx, e.client)
|
|
if err != nil {
|
|
common.Warn("Failed to remove field", zap.String("field", removeField), zap.Error(err))
|
|
} else {
|
|
res.Body.Close()
|
|
}
|
|
}
|
|
|
|
// Remove specific values from array fields (removeDict)
|
|
if removeDict != nil {
|
|
scripts := []string{}
|
|
params := make(map[string]interface{})
|
|
for kk, vv := range removeDict {
|
|
scripts = append(scripts,
|
|
fmt.Sprintf("if (ctx._source.containsKey('%s') && ctx._source.%s != null) { int i = ctx._source.%s.indexOf(params.p_%s); if (i >= 0) { ctx._source.%s.remove(i); }}",
|
|
kk, kk, kk, kk, kk))
|
|
params[fmt.Sprintf("p_%s", kk)] = vv
|
|
}
|
|
if scripts != nil {
|
|
scriptBody := map[string]interface{}{
|
|
"script": map[string]interface{}{
|
|
"source": strings.Join(scripts, ""),
|
|
"params": params,
|
|
},
|
|
}
|
|
body, _ := json.Marshal(scriptBody)
|
|
req := esapi.UpdateRequest{
|
|
Index: indexName,
|
|
DocumentID: actualID,
|
|
Body: bytes.NewReader(body),
|
|
}
|
|
res, err := req.Do(ctx, e.client)
|
|
if err != nil {
|
|
common.Warn("Failed to remove dict fields", zap.Error(err))
|
|
} else {
|
|
res.Body.Close()
|
|
}
|
|
}
|
|
}
|
|
|
|
// Update document fields if any remain
|
|
if len(doc) > 0 {
|
|
updateBody := map[string]interface{}{"doc": doc}
|
|
body, _ := json.Marshal(updateBody)
|
|
req := esapi.UpdateRequest{
|
|
Index: indexName,
|
|
DocumentID: actualID,
|
|
Body: bytes.NewReader(body),
|
|
}
|
|
res, err := req.Do(ctx, e.client)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to update document: %w", err)
|
|
}
|
|
defer res.Body.Close()
|
|
if res.IsError() {
|
|
return fmt.Errorf("elasticsearch update error: %s", res.Status())
|
|
}
|
|
}
|
|
|
|
common.Debug("ElasticsearchConnection.updateSingleChunk completed", zap.String("indexName", indexName), zap.String("chunkID", chunkID))
|
|
return nil
|
|
}
|
|
|
|
// updateChunksByQuery handles multi-document update
|
|
func (e *elasticsearchEngine) updateChunksByQuery(ctx context.Context, indexName string, condition map[string]interface{}, newValue map[string]interface{}) error {
|
|
common.Debug("ElasticsearchConnection.updateChunksByQuery called", zap.String("indexName", indexName))
|
|
|
|
// Build bool query from condition
|
|
var mustClauses []map[string]interface{}
|
|
for k, v := range condition {
|
|
if k == "exists" {
|
|
mustClauses = append(mustClauses, map[string]interface{}{
|
|
"exists": map[string]interface{}{"field": v},
|
|
})
|
|
continue
|
|
}
|
|
if v == nil || v == "" {
|
|
continue
|
|
}
|
|
if listVal, ok := v.([]interface{}); ok {
|
|
mustClauses = append(mustClauses, map[string]interface{}{
|
|
"terms": map[string]interface{}{k: listVal},
|
|
})
|
|
} else if _, ok := v.(string); ok {
|
|
mustClauses = append(mustClauses, map[string]interface{}{
|
|
"term": map[string]interface{}{k: v},
|
|
})
|
|
} else if _, ok := v.(int); ok {
|
|
mustClauses = append(mustClauses, map[string]interface{}{
|
|
"term": map[string]interface{}{k: v},
|
|
})
|
|
}
|
|
}
|
|
|
|
boolQuery := map[string]interface{}{
|
|
"bool": map[string]interface{}{
|
|
"filter": mustClauses,
|
|
},
|
|
}
|
|
|
|
// Build painless scripts from newValue
|
|
var scripts []string
|
|
params := make(map[string]interface{})
|
|
|
|
for k, v := range newValue {
|
|
if k == "remove" {
|
|
if removeStr, ok := v.(string); ok {
|
|
scripts = append(scripts, fmt.Sprintf("ctx._source.remove('%s');", removeStr))
|
|
continue
|
|
}
|
|
if removeDict, ok := v.(map[string]interface{}); ok {
|
|
for kk, vv := range removeDict {
|
|
scripts = append(scripts,
|
|
fmt.Sprintf("if (ctx._source.containsKey('%s') && ctx._source.%s != null) { int i = ctx._source.%s.indexOf(params.p_%s); if (i >= 0) { ctx._source.%s.remove(i); }}",
|
|
kk, kk, kk, kk, kk))
|
|
params[fmt.Sprintf("p_%s", kk)] = vv
|
|
}
|
|
}
|
|
continue
|
|
}
|
|
if k == "add" {
|
|
if addDict, ok := v.(map[string]interface{}); ok {
|
|
for kk, vv := range addDict {
|
|
vvStr, ok := vv.(string)
|
|
if ok {
|
|
vvStr = strings.TrimSpace(vvStr)
|
|
scripts = append(scripts, fmt.Sprintf("ctx._source.%s.add(params.pp_%s);", kk, kk))
|
|
params[fmt.Sprintf("pp_%s", kk)] = vvStr
|
|
}
|
|
}
|
|
}
|
|
continue
|
|
}
|
|
if (k == "" || v == nil) && k != "available_int" {
|
|
continue
|
|
}
|
|
|
|
switch val := v.(type) {
|
|
case string:
|
|
// Sanitize: replace ' \n \r with space
|
|
sanitized := sanitizeString(val)
|
|
params[fmt.Sprintf("pp_%s", k)] = sanitized
|
|
scripts = append(scripts, fmt.Sprintf("ctx._source.%s=params.pp_%s;", k, k))
|
|
case int, float64:
|
|
scripts = append(scripts, fmt.Sprintf("ctx._source.%s=%v;", k, val))
|
|
case []interface{}:
|
|
params[fmt.Sprintf("pp_%s", k)] = val
|
|
scripts = append(scripts, fmt.Sprintf("ctx._source.%s=params.pp_%s;", k, k))
|
|
}
|
|
}
|
|
|
|
scriptSource := strings.Join(scripts, "")
|
|
|
|
// Build update by query body
|
|
updateBody := map[string]interface{}{
|
|
"query": boolQuery,
|
|
"script": map[string]interface{}{
|
|
"source": scriptSource,
|
|
"params": params,
|
|
},
|
|
}
|
|
|
|
bodyBytes, err := json.Marshal(updateBody)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to marshal update body: %w", err)
|
|
}
|
|
|
|
// Execute update by query with refresh=true, slices=5, conflicts="proceed"
|
|
refreshTrue := true
|
|
req := esapi.UpdateByQueryRequest{
|
|
Index: []string{indexName},
|
|
Body: bytes.NewReader(bodyBytes),
|
|
Refresh: &refreshTrue,
|
|
Slices: 5,
|
|
Conflicts: "proceed",
|
|
}
|
|
|
|
res, err := req.Do(ctx, e.client)
|
|
if err != nil {
|
|
common.Error("Failed to execute update by query", err)
|
|
return fmt.Errorf("failed to execute update by query: %w", err)
|
|
}
|
|
defer res.Body.Close()
|
|
|
|
if res.IsError() {
|
|
bodyBytes, _ := io.ReadAll(res.Body)
|
|
return fmt.Errorf("elasticsearch update by query error: %s, body: %s", res.Status(), string(bodyBytes))
|
|
}
|
|
|
|
common.Debug("ElasticsearchConnection.updateChunksByQuery completed", zap.String("indexName", indexName))
|
|
return nil
|
|
}
|
|
|
|
// sanitizeString returns s with every single quote, line feed and carriage
// return turned into a space, and with leading and trailing whitespace removed.
func sanitizeString(s string) string {
	cleaner := strings.NewReplacer("'", " ", "\n", " ", "\r", " ")
	return strings.TrimSpace(cleaner.Replace(s))
}
|
|
|
|
// copyFields returns a new map holding the same key/value pairs as m.
// The copy is shallow: nested values are shared with m. The result is never
// nil, even when m is nil, so callers may write to it directly.
func copyFields(m map[string]interface{}) map[string]interface{} {
	clone := make(map[string]interface{}, len(m))
	for key := range m {
		clone[key] = m[key]
	}
	return clone
}
|
|
|
|
// DeleteChunks deletes chunks from a dataset index by condition
|
|
func (e *elasticsearchEngine) DeleteChunks(ctx context.Context, condition map[string]interface{}, indexName string, datasetID string) (int64, error) {
|
|
// For ES, index name is just indexName (e.g., "ragflow_{tenantID}"), not indexName_datasetID
|
|
fullIndexName := indexName
|
|
common.Info("Deleting chunks from Elasticsearch index", zap.String("index_name", fullIndexName), zap.Any("condition", condition))
|
|
|
|
// Check if index exists
|
|
exists, err := e.indexExists(ctx, fullIndexName)
|
|
if err != nil {
|
|
return 0, fmt.Errorf("failed to check index existence: %w", err)
|
|
}
|
|
if !exists {
|
|
common.Warn(fmt.Sprintf("Index %s does not exist, skipping delete", fullIndexName))
|
|
return 0, nil
|
|
}
|
|
|
|
// Build bool query from condition
|
|
var mustClauses []map[string]interface{}
|
|
var filterClauses []map[string]interface{}
|
|
var mustNotClauses []map[string]interface{}
|
|
|
|
// Handle chunk IDs - use terms query on "id" field instead of ids query on _id
|
|
if idVal, ok := condition["id"]; ok && idVal != nil {
|
|
switch v := idVal.(type) {
|
|
case []interface{}:
|
|
ids := make([]string, 0, len(v))
|
|
for _, id := range v {
|
|
if s, ok := id.(string); ok {
|
|
ids = append(ids, s)
|
|
}
|
|
}
|
|
if len(ids) > 0 {
|
|
mustClauses = append(mustClauses, map[string]interface{}{
|
|
"terms": map[string]interface{}{"id": ids},
|
|
})
|
|
}
|
|
case string:
|
|
mustClauses = append(mustClauses, map[string]interface{}{
|
|
"term": map[string]interface{}{"id": v},
|
|
})
|
|
}
|
|
}
|
|
|
|
// Handle kb_id - add as term filter
|
|
if kbID, ok := condition["kb_id"].(string); ok && kbID != "" {
|
|
filterClauses = append(filterClauses, map[string]interface{}{
|
|
"term": map[string]interface{}{"kb_id": kbID},
|
|
})
|
|
}
|
|
|
|
// Add all other conditions as filters/must/must_not
|
|
for k, v := range condition {
|
|
if k == "id" || k == "kb_id" {
|
|
continue // Already handled above
|
|
}
|
|
if k == "exists" {
|
|
filterClauses = append(filterClauses, map[string]interface{}{
|
|
"exists": map[string]interface{}{"field": v},
|
|
})
|
|
} else if k == "must_not" {
|
|
if m, ok := v.(map[string]interface{}); ok {
|
|
for kk, vv := range m {
|
|
if kk == "exists" {
|
|
mustNotClauses = append(mustNotClauses, map[string]interface{}{
|
|
"exists": map[string]interface{}{"field": vv},
|
|
})
|
|
}
|
|
}
|
|
}
|
|
} else if v != nil {
|
|
if listVal, ok := v.([]interface{}); ok {
|
|
mustClauses = append(mustClauses, map[string]interface{}{
|
|
"terms": map[string]interface{}{k: listVal},
|
|
})
|
|
} else if _, ok := v.(string); ok {
|
|
mustClauses = append(mustClauses, map[string]interface{}{
|
|
"term": map[string]interface{}{k: v},
|
|
})
|
|
} else if _, ok := v.(int); ok {
|
|
mustClauses = append(mustClauses, map[string]interface{}{
|
|
"term": map[string]interface{}{k: v},
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
// Build the query
|
|
var qry map[string]interface{}
|
|
if len(filterClauses) == 0 && len(mustClauses) == 0 && len(mustNotClauses) == 0 {
|
|
qry = map[string]interface{}{"match_all": map[string]interface{}{}}
|
|
} else {
|
|
boolMap := map[string]interface{}{}
|
|
if len(filterClauses) > 0 {
|
|
boolMap["filter"] = filterClauses
|
|
}
|
|
if len(mustClauses) > 0 {
|
|
boolMap["must"] = mustClauses
|
|
}
|
|
if len(mustNotClauses) > 0 {
|
|
boolMap["must_not"] = mustNotClauses
|
|
}
|
|
qry = map[string]interface{}{"bool": boolMap}
|
|
}
|
|
|
|
// Build delete by query body
|
|
deleteBody := map[string]interface{}{
|
|
"query": qry,
|
|
}
|
|
|
|
bodyBytes, err := json.Marshal(deleteBody)
|
|
if err != nil {
|
|
return 0, fmt.Errorf("failed to marshal delete body: %w", err)
|
|
}
|
|
|
|
// Execute delete by query with refresh=true
|
|
refreshTrue := true
|
|
req := esapi.DeleteByQueryRequest{
|
|
Index: []string{fullIndexName},
|
|
Body: bytes.NewReader(bodyBytes),
|
|
Refresh: &refreshTrue,
|
|
}
|
|
|
|
res, err := req.Do(ctx, e.client)
|
|
if err != nil {
|
|
common.Error("Failed to execute delete by query", err)
|
|
if strings.Contains(err.Error(), "not_found") {
|
|
return 0, nil
|
|
}
|
|
return 0, fmt.Errorf("failed to execute delete by query: %w", err)
|
|
}
|
|
defer res.Body.Close()
|
|
|
|
if res.IsError() {
|
|
bodyBytes, _ := io.ReadAll(res.Body)
|
|
errStr := string(bodyBytes)
|
|
if strings.Contains(errStr, "not_found") {
|
|
return 0, nil
|
|
}
|
|
common.Sugar.Errorw("Elasticsearch delete by query returned error", "status", res.Status())
|
|
return 0, fmt.Errorf("elasticsearch delete by query returned error: %s", res.Status())
|
|
}
|
|
|
|
// Parse response
|
|
var result map[string]interface{}
|
|
if err := json.NewDecoder(res.Body).Decode(&result); err != nil {
|
|
common.Error("Failed to parse delete response", err)
|
|
return 0, fmt.Errorf("failed to parse delete response: %w", err)
|
|
}
|
|
|
|
deleted := int64(0)
|
|
if d, ok := result["deleted"].(float64); ok {
|
|
deleted = int64(d)
|
|
}
|
|
|
|
common.Info("Successfully deleted chunks", zap.String("index_name", fullIndexName), zap.Int64("deleted_count", deleted))
|
|
return deleted, nil
|
|
}
|
|
|
|
// SearchResponse is the subset of an Elasticsearch search reply that is
// decoded from JSON: the hit list with its total count, plus any aggregations.
type SearchResponse struct {
	// Hits mirrors the top-level "hits" object of the reply.
	Hits struct {
		// Total mirrors "hits.total"; Value is decoded from its "value" field.
		Total struct {
			Value int64 `json:"value"`
		} `json:"total"`
		// Hits mirrors "hits.hits": one entry per returned document, with its
		// "_id", "_score" and "_source".
		Hits []struct {
			ID     string                 `json:"_id"`
			Score  float64                `json:"_score"`
			Source map[string]interface{} `json:"_source"`
		} `json:"hits"`
	} `json:"hits"`
	// Aggregations mirrors the top-level "aggregations" object, left untyped.
	Aggregations map[string]interface{} `json:"aggregations"`
}
|
|
|
|
// Search executes search with unified types.SearchRequest
|
|
func (e *elasticsearchEngine) Search(ctx context.Context, req *types.SearchRequest) (*types.SearchResult, error) {
|
|
return e.searchUnified(ctx, req)
|
|
}
|
|
|
|
// searchUnified handles the unified types.SearchRequest
|
|
// Matches the behavior of Infinity's Search() method
|
|
func (e *elasticsearchEngine) searchUnified(ctx context.Context, req *types.SearchRequest) (*types.SearchResult, error) {
|
|
common.Debug("Search in Elasticsearch started", zap.Any("indexNames", req.IndexNames))
|
|
|
|
if len(req.IndexNames) == 0 {
|
|
return nil, fmt.Errorf("index names cannot be empty")
|
|
}
|
|
|
|
// Get retrieval parameters with defaults
|
|
pageSize := req.Limit
|
|
if pageSize <= 0 {
|
|
pageSize = 30
|
|
}
|
|
|
|
offset := req.Offset
|
|
if offset < 0 {
|
|
offset = 0
|
|
}
|
|
|
|
isMetadataTable := false
|
|
isSkillIndex := false
|
|
for _, idx := range req.IndexNames {
|
|
if strings.HasPrefix(idx, "ragflow_doc_meta_") {
|
|
isMetadataTable = true
|
|
break
|
|
}
|
|
if strings.HasPrefix(idx, "skill_") {
|
|
isSkillIndex = true
|
|
break
|
|
}
|
|
}
|
|
|
|
var outputColumns []string
|
|
if isMetadataTable {
|
|
outputColumns = []string{"id", "kb_id", "meta_fields"}
|
|
} else if isSkillIndex {
|
|
outputColumns = []string{
|
|
"skill_id", "space_id", "folder_id", "name", "tags", "description", "content",
|
|
"version", "status", "create_time", "update_time",
|
|
}
|
|
} else {
|
|
outputColumns = []string{
|
|
"id", "doc_id", "kb_id", "content_ltks", "content_with_weight",
|
|
"title_tks", "docnm_kwd", "img_id", "available_int", "important_kwd",
|
|
"position_int", "page_num_int", "top_int", "chunk_order_int",
|
|
"create_timestamp_flt", "knowledge_graph_kwd", "question_kwd", "question_tks",
|
|
"doc_type_kwd", "mom_id", "tag_kwd", "pagerank_fea", "tag_feas",
|
|
}
|
|
}
|
|
|
|
// Allow caller to override output columns (used by KG search, etc.)
|
|
if len(req.SelectFields) > 0 {
|
|
outputColumns = req.SelectFields
|
|
}
|
|
|
|
hasTextMatch := false
|
|
hasVectorMatch := false
|
|
var matchText *types.MatchTextExpr
|
|
var matchDense *types.MatchDenseExpr
|
|
if req.MatchExprs != nil && len(req.MatchExprs) > 0 {
|
|
for _, expr := range req.MatchExprs {
|
|
if expr == nil {
|
|
continue
|
|
}
|
|
switch e := expr.(type) {
|
|
case string:
|
|
if e != "" {
|
|
hasTextMatch = true
|
|
matchText = &types.MatchTextExpr{
|
|
MatchingText: e,
|
|
TopN: pageSize,
|
|
}
|
|
}
|
|
case *types.MatchTextExpr:
|
|
if e.MatchingText != "" {
|
|
hasTextMatch = true
|
|
matchText = e
|
|
}
|
|
case *types.MatchDenseExpr:
|
|
if len(e.EmbeddingData) > 0 {
|
|
hasVectorMatch = true
|
|
matchDense = e
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Extract FusionExpr if present (used for hybrid search fusion)
|
|
var fusionExpr *types.FusionExpr
|
|
if len(req.MatchExprs) > 2 {
|
|
if fe, ok := req.MatchExprs[2].(*types.FusionExpr); ok {
|
|
fusionExpr = fe
|
|
}
|
|
}
|
|
_ = fusionExpr // TODO: implement fusion for ES hybrid search
|
|
|
|
if hasTextMatch || hasVectorMatch {
|
|
if !isSkillIndex {
|
|
if !slices.Contains(outputColumns, common.PAGERANK_FLD) {
|
|
outputColumns = append(outputColumns, common.PAGERANK_FLD)
|
|
}
|
|
if !slices.Contains(outputColumns, common.TAG_FLD) {
|
|
outputColumns = append(outputColumns, common.TAG_FLD)
|
|
}
|
|
}
|
|
}
|
|
|
|
if hasVectorMatch && matchDense != nil && matchDense.VectorColumnName != "" {
|
|
outputColumns = append(outputColumns, matchDense.VectorColumnName)
|
|
}
|
|
|
|
// Build filter string
|
|
var filterParts []string
|
|
|
|
if !isMetadataTable && (hasTextMatch || hasVectorMatch) {
|
|
if req.Filter != nil {
|
|
if availInt, ok := req.Filter["available_int"]; ok {
|
|
filterParts = append(filterParts, fmt.Sprintf("available_int=%v", availInt))
|
|
} else if status, ok := req.Filter["status"]; ok {
|
|
filterParts = append(filterParts, fmt.Sprintf("status='%s'", status))
|
|
} else {
|
|
if isSkillIndex {
|
|
filterParts = append(filterParts, "status='1'")
|
|
} else {
|
|
filterParts = append(filterParts, "available_int=1")
|
|
}
|
|
}
|
|
} else {
|
|
if isSkillIndex {
|
|
filterParts = append(filterParts, "status='1'")
|
|
} else {
|
|
filterParts = append(filterParts, "available_int=1")
|
|
}
|
|
}
|
|
}
|
|
|
|
// Build filter string from req.Filter
|
|
if req.Filter != nil {
|
|
filterCopy := req.Filter
|
|
if !isMetadataTable {
|
|
filterCopy = make(map[string]interface{})
|
|
for k, v := range req.Filter {
|
|
if k != "kb_id" {
|
|
filterCopy[k] = v
|
|
}
|
|
}
|
|
}
|
|
|
|
condStr := equivalentConditionToStr(filterCopy)
|
|
if condStr != "" {
|
|
filterParts = append(filterParts, condStr)
|
|
}
|
|
}
|
|
filterStr := strings.Join(filterParts, " AND ")
|
|
|
|
orderBy := req.OrderBy
|
|
_ = orderBy // TODO: implement rank feature for ES
|
|
|
|
var allResults []map[string]interface{}
|
|
totalHits := int64(0)
|
|
|
|
for _, indexName := range req.IndexNames {
|
|
var indexNames []string
|
|
if strings.HasPrefix(indexName, "ragflow_doc_meta_") {
|
|
indexNames = []string{indexName}
|
|
} else {
|
|
indexNames = []string{indexName}
|
|
}
|
|
|
|
for _, fullIndexName := range indexNames {
|
|
// Build search query body
|
|
queryBody := make(map[string]interface{})
|
|
|
|
// Determine text fields for the query (used indirectly via buildESKeywordQuery)
|
|
if matchText != nil && len(matchText.Fields) > 0 {
|
|
// Use matchText.Fields for text matching
|
|
} else if isSkillIndex {
|
|
// Use skill-specific fields in buildSkillKeywordQuery
|
|
} else {
|
|
// Use default fields in buildESKeywordQuery
|
|
}
|
|
|
|
var vectorFieldName string
|
|
if !hasVectorMatch || matchDense == nil {
|
|
// Keyword-only search (no vector match)
|
|
queryBody["query"] = map[string]interface{}{
|
|
"match_all": map[string]interface{}{},
|
|
}
|
|
if hasTextMatch && matchText != nil {
|
|
if isSkillIndex {
|
|
queryBody["query"] = buildSkillKeywordQuery(matchText.MatchingText, nil, 1.0)
|
|
} else {
|
|
queryBody["query"] = buildESKeywordQuery(matchText.MatchingText, nil, 1.0)
|
|
}
|
|
// Add filter if present
|
|
if filterStr != "" {
|
|
if boolQuery, ok := queryBody["query"].(map[string]interface{}); ok {
|
|
if boolMap, ok := boolQuery["bool"].(map[string]interface{}); ok {
|
|
filterClauses := buildFilterClausesFromStr(filterStr)
|
|
if existingFilter, ok := boolMap["filter"].([]map[string]interface{}); ok {
|
|
boolMap["filter"] = append(existingFilter, filterClauses...)
|
|
} else {
|
|
boolMap["filter"] = filterClauses
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
// Hybrid search: keyword + vector
|
|
textWeight := 0.7 // default: vector weight = 0.3
|
|
vectorWeight := 0.3
|
|
if matchDense.ExtraOptions != nil {
|
|
if vw, ok := matchDense.ExtraOptions["text_weight"].(float64); ok {
|
|
textWeight = vw
|
|
}
|
|
if vw, ok := matchDense.ExtraOptions["vector_weight"].(float64); ok {
|
|
vectorWeight = vw
|
|
}
|
|
}
|
|
|
|
// Build boolean query for text match and filters
|
|
var boolQuery map[string]interface{}
|
|
matchingText := ""
|
|
if matchText != nil {
|
|
matchingText = matchText.MatchingText
|
|
}
|
|
if isSkillIndex {
|
|
boolQuery = buildSkillKeywordQuery(matchingText, nil, textWeight)
|
|
} else {
|
|
boolQuery = buildESKeywordQuery(matchingText, nil, textWeight)
|
|
}
|
|
|
|
// Add filter to bool query
|
|
if filterStr != "" {
|
|
if boolMap, ok := boolQuery["bool"].(map[string]interface{}); ok {
|
|
filterClauses := buildFilterClausesFromStr(filterStr)
|
|
if existingFilter, ok := boolMap["filter"].([]map[string]interface{}); ok {
|
|
boolMap["filter"] = append(existingFilter, filterClauses...)
|
|
} else {
|
|
boolMap["filter"] = filterClauses
|
|
}
|
|
}
|
|
}
|
|
|
|
// Build kNN query
|
|
vectorData := matchDense.EmbeddingData
|
|
vectorFieldName = matchDense.VectorColumnName
|
|
k := matchDense.TopN
|
|
if k <= 0 {
|
|
k = req.Limit
|
|
}
|
|
if k <= 0 {
|
|
k = 1024
|
|
}
|
|
numCandidates := k * 2
|
|
|
|
similarity := 0.0
|
|
if matchDense.ExtraOptions != nil {
|
|
if sim, ok := matchDense.ExtraOptions["similarity"].(float64); ok {
|
|
similarity = sim
|
|
}
|
|
}
|
|
|
|
knnQuery := map[string]interface{}{
|
|
"field": vectorFieldName,
|
|
"query_vector": vectorData,
|
|
"k": k,
|
|
"num_candidates": numCandidates,
|
|
"similarity": similarity,
|
|
"boost": vectorWeight,
|
|
}
|
|
|
|
queryBody["knn"] = knnQuery
|
|
queryBody["query"] = boolQuery
|
|
|
|
// Add vector column to output columns
|
|
if vectorFieldName != "" {
|
|
outputColumns = append(outputColumns, vectorFieldName)
|
|
}
|
|
}
|
|
|
|
queryBody["size"] = pageSize
|
|
queryBody["from"] = offset
|
|
|
|
// Add sorting if specified
|
|
if orderBy != nil && len(orderBy.Fields) > 0 {
|
|
sort := parseOrderByExpr(orderBy)
|
|
if len(sort) > 0 {
|
|
queryBody["sort"] = sort
|
|
}
|
|
}
|
|
|
|
// Serialize query
|
|
var buf bytes.Buffer
|
|
if err := json.NewEncoder(&buf).Encode(queryBody); err != nil {
|
|
return nil, fmt.Errorf("error encoding query: %w", err)
|
|
}
|
|
|
|
// Log search details
|
|
common.Debug("Elasticsearch searching index", zap.String("index", fullIndexName))
|
|
common.Debug("Elasticsearch DSL", zap.Any("dsl", queryBody))
|
|
|
|
// Build search request
|
|
reqES := esapi.SearchRequest{
|
|
Index: []string{fullIndexName},
|
|
Body: &buf,
|
|
}
|
|
|
|
// Execute search
|
|
res, err := reqES.Do(ctx, e.client)
|
|
if err != nil {
|
|
common.Warn("Elasticsearch query failed", zap.String("index", fullIndexName), zap.Error(err))
|
|
continue
|
|
}
|
|
|
|
if res.IsError() {
|
|
bodyBytes, err := io.ReadAll(res.Body)
|
|
res.Body.Close()
|
|
if err != nil {
|
|
common.Error("Elasticsearch failed to read error response body", err)
|
|
} else {
|
|
common.Warn("Elasticsearch error response", zap.String("index", fullIndexName), zap.String("body", string(bodyBytes)))
|
|
}
|
|
continue
|
|
}
|
|
|
|
// Parse response
|
|
var esResp SearchResponse
|
|
if err := json.NewDecoder(res.Body).Decode(&esResp); err != nil {
|
|
res.Body.Close()
|
|
common.Warn("Elasticsearch failed to parse response", zap.String("index", fullIndexName), zap.Error(err))
|
|
continue
|
|
}
|
|
|
|
res.Body.Close()
|
|
|
|
// Convert to unified response
|
|
searchChunks := convertESResponse(&esResp, vectorFieldName)
|
|
totalHits += esResp.Hits.Total.Value
|
|
|
|
// Apply field name mapping and row_id handling
|
|
if !isSkillIndex {
|
|
GetFields(searchChunks, nil)
|
|
}
|
|
|
|
allResults = append(allResults, searchChunks...)
|
|
}
|
|
}
|
|
|
|
// Calculate scores and sort
|
|
if hasTextMatch || hasVectorMatch {
|
|
scoreColumn := "_score"
|
|
if hasTextMatch && hasVectorMatch {
|
|
scoreColumn = "SCORE"
|
|
}
|
|
|
|
pagerankField := common.PAGERANK_FLD
|
|
if isSkillIndex {
|
|
pagerankField = ""
|
|
}
|
|
|
|
allResults = calculateScores(allResults, scoreColumn, pagerankField)
|
|
allResults = sortByScore(allResults, len(allResults))
|
|
}
|
|
|
|
// Limit results
|
|
if len(allResults) > pageSize {
|
|
allResults = allResults[:pageSize]
|
|
}
|
|
|
|
common.Debug("Search in Elasticsearch completed", zap.Int("returnedRows", len(allResults)), zap.Int64("totalHits", totalHits))
|
|
|
|
return &types.SearchResult{
|
|
Chunks: allResults,
|
|
Total: totalHits,
|
|
}, nil
|
|
}
|
|
|
|
// buildFilterClausesFromStr wraps a filter expression in a one-element list
// of ES filter clauses, using a query_string clause whose "query" is the
// expression itself. An empty expression produces nil.
func buildFilterClausesFromStr(filterStr string) []map[string]interface{} {
	if len(filterStr) == 0 {
		return nil
	}

	clause := map[string]interface{}{
		"query_string": map[string]interface{}{"query": filterStr},
	}
	return []map[string]interface{}{clause}
}
|
|
|
|
// GetChunk gets a chunk by ID using ES search API
|
|
// _id in ES is composite: {doc_id}_{kb_id}_{chunk_id}
|
|
func (e *elasticsearchEngine) GetChunk(ctx context.Context, baseName, chunkID string, datasetIDs []string) (interface{}, error) {
|
|
// Try search by doc_id field (which is stored in the document)
|
|
for _, datasetID := range datasetIDs {
|
|
searchReq := map[string]interface{}{
|
|
"query": map[string]interface{}{
|
|
"bool": map[string]interface{}{
|
|
"must": []map[string]interface{}{
|
|
{"term": map[string]interface{}{"id": chunkID}},
|
|
{"term": map[string]interface{}{"kb_id": datasetID}},
|
|
},
|
|
},
|
|
},
|
|
}
|
|
|
|
body, err := json.Marshal(searchReq)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to marshal search request: %w", err)
|
|
}
|
|
|
|
res, err := e.client.Search(
|
|
e.client.Search.WithContext(ctx),
|
|
e.client.Search.WithIndex(baseName),
|
|
e.client.Search.WithBody(bytes.NewReader(body)),
|
|
)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to search for chunk: %w", err)
|
|
}
|
|
|
|
if res.IsError() {
|
|
res.Body.Close()
|
|
return nil, fmt.Errorf("failed to search for chunk: %s", res.Status())
|
|
}
|
|
|
|
var searchResult map[string]interface{}
|
|
if err := json.NewDecoder(res.Body).Decode(&searchResult); err != nil {
|
|
res.Body.Close()
|
|
return nil, fmt.Errorf("failed to parse search response: %w", err)
|
|
}
|
|
res.Body.Close()
|
|
|
|
hits, ok := searchResult["hits"].(map[string]interface{})
|
|
if !ok {
|
|
continue
|
|
}
|
|
|
|
hitList, ok := hits["hits"].([]interface{})
|
|
if !ok || len(hitList) == 0 {
|
|
continue
|
|
}
|
|
|
|
firstHit, ok := hitList[0].(map[string]interface{})
|
|
if !ok {
|
|
continue
|
|
}
|
|
|
|
source, ok := firstHit["_source"].(map[string]interface{})
|
|
if !ok {
|
|
continue
|
|
}
|
|
|
|
common.Info("GetChunk found hit", zap.String("baseName", baseName), zap.String("chunkID", chunkID))
|
|
source["id"] = chunkID
|
|
return source, nil
|
|
}
|
|
|
|
common.Info("GetChunk no hits found", zap.String("baseName", baseName), zap.String("chunkID", chunkID))
|
|
return nil, nil
|
|
}
|
|
|
|
// GetFields is not implemented for Elasticsearch
|
|
func (e *elasticsearchEngine) GetFields(chunks []map[string]interface{}, fields []string) map[string]map[string]interface{} {
|
|
common.Warn("GetFields not implemented for Elasticsearch")
|
|
return nil
|
|
}
|
|
|
|
// GetAggregation is not implemented for Elasticsearch
|
|
func (e *elasticsearchEngine) GetAggregation(chunks []map[string]interface{}, fieldName string) []map[string]interface{} {
|
|
common.Warn("GetAggregation not implemented for Elasticsearch")
|
|
return nil
|
|
}
|
|
|
|
// GetHighlight is not implemented for Elasticsearch
|
|
func (e *elasticsearchEngine) GetHighlight(chunks []map[string]interface{}, keywords []string, fieldName string) map[string]string {
|
|
common.Warn("GetHighlight not implemented for Elasticsearch")
|
|
return nil
|
|
}
|
|
|
|
// DropChunkStore deletes a chunk index
|
|
func (e *elasticsearchEngine) DropChunkStore(ctx context.Context, baseName, datasetID string) error {
|
|
return e.dropIndex(ctx, baseName)
|
|
}
|
|
|
|
// ChunkStoreExists checks if a chunk index exists
|
|
func (e *elasticsearchEngine) ChunkStoreExists(ctx context.Context, baseName, datasetID string) (bool, error) {
|
|
return e.indexExists(ctx, baseName)
|
|
}
|
|
|
|
// buildQueryFromCondition builds an ES query from condition map
|
|
func (e *elasticsearchEngine) buildQueryFromCondition(condition map[string]interface{}) map[string]interface{} {
|
|
if len(condition) == 0 {
|
|
return nil
|
|
}
|
|
|
|
var clauses []map[string]interface{}
|
|
|
|
for k, v := range condition {
|
|
if v == nil {
|
|
continue
|
|
}
|
|
|
|
switch k {
|
|
case "kb_id":
|
|
// Handle kb_id as terms query
|
|
if listVal, ok := v.([]interface{}); ok {
|
|
clauses = append(clauses, map[string]interface{}{
|
|
"terms": map[string]interface{}{k: listVal},
|
|
})
|
|
} else {
|
|
clauses = append(clauses, map[string]interface{}{
|
|
"term": map[string]interface{}{k: v},
|
|
})
|
|
}
|
|
case "id":
|
|
// Handle id as terms or term query
|
|
if listVal, ok := v.([]interface{}); ok {
|
|
clauses = append(clauses, map[string]interface{}{
|
|
"terms": map[string]interface{}{k: listVal},
|
|
})
|
|
} else {
|
|
clauses = append(clauses, map[string]interface{}{
|
|
"term": map[string]interface{}{k: v},
|
|
})
|
|
}
|
|
case "available_int":
|
|
// Handle available_int as term query
|
|
clauses = append(clauses, map[string]interface{}{
|
|
"term": map[string]interface{}{k: v},
|
|
})
|
|
default:
|
|
// Default: treat as term query
|
|
clauses = append(clauses, map[string]interface{}{
|
|
"term": map[string]interface{}{k: v},
|
|
})
|
|
}
|
|
}
|
|
|
|
if len(clauses) == 0 {
|
|
return nil
|
|
}
|
|
|
|
if len(clauses) == 1 {
|
|
return clauses[0]
|
|
}
|
|
|
|
return map[string]interface{}{
|
|
"bool": map[string]interface{}{
|
|
"must": clauses,
|
|
},
|
|
}
|
|
}
|
|
|
|
// buildRemoveOperations builds ES script operations for remove
|
|
func (e *elasticsearchEngine) buildRemoveOperations(removeData map[string]interface{}, query map[string]interface{}, indexName string) []map[string]interface{} {
|
|
// For ES, we handle removals differently - they are typically done via separate update operations
|
|
// This is a simplified implementation
|
|
return nil
|
|
}
|
|
|
|
// needsScriptUpdate checks if the update requires a script (more complex operations)
|
|
func (e *elasticsearchEngine) needsScriptUpdate(newValue map[string]interface{}) bool {
|
|
// Check if any values contain operations that need scripts
|
|
return false
|
|
}
|
|
|
|
// buildUpdateScript builds an ES script for updates
|
|
func (e *elasticsearchEngine) buildUpdateScript(newValue map[string]interface{}, removeOperations []map[string]interface{}) map[string]interface{} {
|
|
script := map[string]interface{}{
|
|
"source": "ctx._source.putAll(params.doc)",
|
|
"params": map[string]interface{}{
|
|
"doc": newValue,
|
|
},
|
|
}
|
|
return script
|
|
}
|
|
|
|
// buildMetadataQueryFromCondition builds an ES query for metadata index
|
|
func (e *elasticsearchEngine) buildMetadataQueryFromCondition(condition map[string]interface{}) map[string]interface{} {
|
|
if len(condition) == 0 {
|
|
return nil
|
|
}
|
|
|
|
var clauses []map[string]interface{}
|
|
|
|
for k, v := range condition {
|
|
if v == nil {
|
|
continue
|
|
}
|
|
|
|
switch k {
|
|
case "kb_id":
|
|
if listVal, ok := v.([]interface{}); ok {
|
|
clauses = append(clauses, map[string]interface{}{
|
|
"terms": map[string]interface{}{k: listVal},
|
|
})
|
|
} else {
|
|
clauses = append(clauses, map[string]interface{}{
|
|
"term": map[string]interface{}{k: v},
|
|
})
|
|
}
|
|
case "id":
|
|
if listVal, ok := v.([]interface{}); ok {
|
|
clauses = append(clauses, map[string]interface{}{
|
|
"terms": map[string]interface{}{k: listVal},
|
|
})
|
|
} else {
|
|
clauses = append(clauses, map[string]interface{}{
|
|
"term": map[string]interface{}{k: v},
|
|
})
|
|
}
|
|
default:
|
|
clauses = append(clauses, map[string]interface{}{
|
|
"term": map[string]interface{}{k: v},
|
|
})
|
|
}
|
|
}
|
|
|
|
if len(clauses) == 0 {
|
|
return nil
|
|
}
|
|
|
|
if len(clauses) == 1 {
|
|
return clauses[0]
|
|
}
|
|
|
|
return map[string]interface{}{
|
|
"bool": map[string]interface{}{
|
|
"must": clauses,
|
|
},
|
|
}
|
|
}
|
|
|
|
// loadSkillMapping loads the skill index mapping from config file
|
|
func loadSkillMapping() (map[string]interface{}, error) {
|
|
// Try multiple possible locations for the mapping file
|
|
possiblePaths := []string{
|
|
"conf/skill_es_mapping.json",
|
|
"../conf/skill_es_mapping.json",
|
|
"/app/conf/skill_es_mapping.json",
|
|
}
|
|
|
|
var data []byte
|
|
var err error
|
|
for _, path := range possiblePaths {
|
|
data, err = os.ReadFile(path)
|
|
if err == nil {
|
|
break
|
|
}
|
|
}
|
|
|
|
if err != nil {
|
|
// Fallback to default skill mapping if file not found
|
|
return getDefaultSkillMapping(), nil
|
|
}
|
|
|
|
var mapping map[string]interface{}
|
|
if err := json.Unmarshal(data, &mapping); err != nil {
|
|
return nil, fmt.Errorf("failed to parse skill mapping: %w", err)
|
|
}
|
|
|
|
return mapping, nil
|
|
}
|
|
|
|
// getDefaultSkillMapping builds the built-in skill index definition: index
// settings (1 shard, 0 replicas, 1000ms refresh) plus a non-dynamic mapping
// with stored keyword/long fields, stored-but-unindexed raw text fields,
// whitespace-analyzed token fields and cosine dense_vector fields for the
// supported embedding sizes.
func getDefaultSkillMapping() map[string]interface{} {
	type field = map[string]interface{}

	// stored describes a stored field of the given ES type.
	stored := func(fieldType string) field {
		return field{"type": fieldType, "store": true}
	}
	// rawText describes a stored text field that is not indexed.
	rawText := func() field {
		return field{"type": "text", "index": false, "store": true}
	}
	// tokenized describes a stored text field using the whitespace analyzer.
	tokenized := func() field {
		return field{"type": "text", "analyzer": "whitespace", "store": true}
	}
	// denseVector describes an indexed cosine vector field of the given size.
	denseVector := func(dims int) field {
		return field{"type": "dense_vector", "dims": dims, "index": true, "similarity": "cosine"}
	}

	properties := field{
		"skill_id":        stored("keyword"),
		"name":            rawText(),
		"name_tks":        tokenized(),
		"tags":            rawText(),
		"tags_tks":        tokenized(),
		"description":     rawText(),
		"description_tks": tokenized(),
		"content":         rawText(),
		"content_tks":     tokenized(),
		"version":         stored("keyword"),
		"status":          stored("keyword"),
		"create_time":     stored("long"),
		"update_time":     stored("long"),
	}
	for _, dims := range []int{3072, 2560, 1536, 1024, 768, 512, 256} {
		properties[fmt.Sprintf("q_%d_vec", dims)] = denseVector(dims)
	}

	return field{
		"settings": field{
			"index": field{
				"number_of_shards":   1,
				"number_of_replicas": 0,
				"refresh_interval":   "1000ms",
			},
		},
		"mappings": field{
			"dynamic":    false,
			"properties": properties,
		},
	}
}
|
|
|
|
// calculatePagination derives the (offset, limit) pair used for a search page.
//
// Non-positive inputs fall back to page 1, size 30 and topK 1024. The limit is
// the largest multiple of size not exceeding 64, but at least 30 and at least
// size, and finally capped at topK. The offset is (page-1) times that limit.
func calculatePagination(page, size, topK int) (int, int) {
	page = max(page, 1)
	if size <= 0 {
		size = 30
	}
	if topK <= 0 {
		topK = 1024
	}

	limit := min(max(30, size*(64/size), size), topK)

	// Offsets are never reported as negative.
	offset := max((page-1)*limit, 0)
	return offset, limit
}
|
|
|
|
// buildFilterClauses builds the ES filter clauses for a dataset-scoped search.
// Reference: rag/utils/es_conn.py L60-L78.
//
// A "terms" clause on kb_id is added when datasetIDs is non-empty. The
// availability clause is always added:
//   - available == 0: range available_int < 1
//   - available != 0: bool/must_not of that same range (NOT available_int < 1)
func buildFilterClauses(datasetIDs []string, available int) []map[string]interface{} {
	var filters []map[string]interface{}
	if len(datasetIDs) > 0 {
		filters = append(filters, map[string]interface{}{
			"terms": map[string]interface{}{"kb_id": datasetIDs},
		})
	}

	// available_int < 1
	unavailable := map[string]interface{}{
		"range": map[string]interface{}{
			"available_int": map[string]interface{}{"lt": 1},
		},
	}
	if available == 0 {
		return append(filters, unavailable)
	}

	// Negating the range keeps every document not matched by it.
	return append(filters, map[string]interface{}{
		"bool": map[string]interface{}{
			"must_not": []map[string]interface{}{unavailable},
		},
	})
}
|
|
|
|
// buildSkillFilterClauses returns the filter clauses for the skill index,
// which filters on its "status" field instead of "available_int": a single
// term clause requiring status to equal "1".
func buildSkillFilterClauses() []map[string]interface{} {
	activeOnly := map[string]interface{}{
		"term": map[string]interface{}{"status": "1"},
	}
	return []map[string]interface{}{activeOnly}
}
|
|
|
|
// buildFilterFromMap turns a generic field->value map into ES filter clauses:
// list values ([]string or []interface{}) become "terms" clauses and any
// other value becomes a "term" clause. An empty map yields a nil slice.
// Clause order follows map iteration order.
func buildFilterFromMap(filter map[string]interface{}) []map[string]interface{} {
	var filters []map[string]interface{}
	for field, value := range filter {
		clauseType := "term"
		switch value.(type) {
		case []string, []interface{}:
			clauseType = "terms"
		}
		filters = append(filters, map[string]interface{}{
			clauseType: map[string]interface{}{field: value},
		})
	}
	return filters
}
|
|
|
|
// buildESKeywordQuery builds the keyword (text) part of a chunk search as a
// bool query.
//
// When matchText is "*" or empty the must clause is match_all. Otherwise it is
// a best_fields query_string over the weighted chunk text fields with a 30%
// minimum_should_match, and boost is applied to that query_string clause.
// filterClauses is stored under "filter" exactly as given.
func buildESKeywordQuery(matchText string, filterClauses []map[string]interface{}, boost float64) map[string]interface{} {
	var must map[string]interface{}

	switch matchText {
	case "*", "":
		// Wildcard or missing text: match every document.
		must = map[string]interface{}{"match_all": map[string]interface{}{}}
	default:
		must = map[string]interface{}{
			"query_string": map[string]interface{}{
				"query":                matchText,
				"fields":               []string{"title_tks^10", "title_sm_tks^5", "important_kwd^30", "important_tks^20", "question_tks^20", "content_ltks^2", "content_sm_ltks"},
				"type":                 "best_fields",
				"minimum_should_match": "30%",
				"boost":                boost,
			},
		}
	}

	body := map[string]interface{}{
		"must":   must,
		"filter": filterClauses,
	}
	return map[string]interface{}{"bool": body}
}
|
|
|
|
// buildSkillKeywordQuery builds the keyword (text) part of a skill-index
// search as a bool query.
//
// When matchText is "*" or empty the must clause is match_all. Otherwise it is
// a best_fields query_string over the skill token fields (name_tks, tags_tks,
// description_tks, content_tks) with a 30% minimum_should_match, and boost is
// applied to that query_string clause. filterClauses is stored under "filter"
// exactly as given.
func buildSkillKeywordQuery(matchText string, filterClauses []map[string]interface{}, boost float64) map[string]interface{} {
	var must map[string]interface{}

	switch matchText {
	case "*", "":
		// Wildcard or missing text: match every document.
		must = map[string]interface{}{"match_all": map[string]interface{}{}}
	default:
		must = map[string]interface{}{
			"query_string": map[string]interface{}{
				"query":                matchText,
				"fields":               []string{"name_tks^10", "tags_tks^5", "description_tks^3", "content_tks^1"},
				"type":                 "best_fields",
				"minimum_should_match": "30%",
				"boost":                boost,
			},
		}
	}

	body := map[string]interface{}{
		"must":   must,
		"filter": filterClauses,
	}
	return map[string]interface{}{"bool": body}
}
|
|
|
|
// convertESResponse converts ES SearchResponse to unified chunks format
|
|
func convertESResponse(esResp *SearchResponse, vectorFieldName string) []map[string]interface{} {
|
|
if esResp == nil || esResp.Hits.Hits == nil {
|
|
return []map[string]interface{}{}
|
|
}
|
|
|
|
chunks := make([]map[string]interface{}, len(esResp.Hits.Hits))
|
|
for i, hit := range esResp.Hits.Hits {
|
|
chunks[i] = hit.Source
|
|
chunks[i]["_score"] = hit.Score
|
|
chunks[i]["_id"] = hit.ID
|
|
}
|
|
return chunks
|
|
}
|
|
|
|
// parseOrderByExpr parses the OrderBy expression into ES sort format
|
|
func parseOrderByExpr(orderBy *types.OrderByExpr) []map[string]interface{} {
|
|
if orderBy == nil || len(orderBy.Fields) == 0 {
|
|
return nil
|
|
}
|
|
|
|
var result []map[string]interface{}
|
|
for _, field := range orderBy.Fields {
|
|
direction := "asc"
|
|
if field.Type == types.SortDesc {
|
|
direction = "desc"
|
|
}
|
|
|
|
if field.Field == "_score" || field.Field == "score" {
|
|
result = append(result, map[string]interface{}{
|
|
"_score": direction,
|
|
})
|
|
} else {
|
|
result = append(result, map[string]interface{}{
|
|
field.Field: direction,
|
|
})
|
|
}
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// Helper query builder functions (legacy)
|
|
|
|
// BuildMatchTextQuery builds a multi_match query for text over fields. The
// "fuzziness" option is included only when fuzziness is non-empty.
func BuildMatchTextQuery(fields []string, text string, fuzziness string) map[string]interface{} {
	multiMatch := map[string]interface{}{
		"query":  text,
		"fields": fields,
	}
	if fuzziness != "" {
		multiMatch["fuzziness"] = fuzziness
	}
	return map[string]interface{}{"multi_match": multiMatch}
}
|
|
|
|
// BuildTermQuery builds a term query matching field against value.
func BuildTermQuery(field string, value interface{}) map[string]interface{} {
	term := map[string]interface{}{field: value}
	return map[string]interface{}{"term": term}
}
|
|
|
|
// BuildRangeQuery builds an inclusive range query on field. A non-nil from
// becomes the "gte" bound and a non-nil to becomes the "lte" bound; nil
// bounds are left out.
func BuildRangeQuery(field string, from, to interface{}) map[string]interface{} {
	bounds := map[string]interface{}{}
	if from != nil {
		bounds["gte"] = from
	}
	if to != nil {
		bounds["lte"] = to
	}
	return map[string]interface{}{
		"range": map[string]interface{}{field: bounds},
	}
}
|
|
|
|
// BuildBoolQuery returns an empty bool query: a map whose "bool" entry is an
// empty, writable section map.
func BuildBoolQuery() map[string]interface{} {
	sections := map[string]interface{}{}
	return map[string]interface{}{"bool": sections}
}
|
|
|
|
// AddMust appends clauses to the "must" section of a bool query in place.
//
// query is left untouched when it has no "bool" map. The existing section may
// be absent or nil (a new list is created), a []map[string]interface{}, a
// []interface{}, or a single clause object (which is wrapped into a list so
// the new clauses are not silently dropped). Any other shape is left alone.
func AddMust(query map[string]interface{}, clauses ...map[string]interface{}) {
	boolQuery, ok := query["bool"].(map[string]interface{})
	if !ok {
		return
	}

	switch existing := boolQuery["must"].(type) {
	case nil:
		boolQuery["must"] = append([]map[string]interface{}{}, clauses...)
	case []map[string]interface{}:
		boolQuery["must"] = append(existing, clauses...)
	case []interface{}:
		for _, clause := range clauses {
			existing = append(existing, clause)
		}
		boolQuery["must"] = existing
	case map[string]interface{}:
		// ES accepts a lone clause object; promote it to a list to extend it.
		boolQuery["must"] = append([]map[string]interface{}{existing}, clauses...)
	}
}
|
|
|
|
// AddShould appends clauses to the "should" section of a bool query in place.
//
// query is left untouched when it has no "bool" map. The existing section may
// be absent or nil (a new list is created), a []map[string]interface{}, a
// []interface{}, or a single clause object (which is wrapped into a list so
// the new clauses are not silently dropped). Any other shape is left alone.
func AddShould(query map[string]interface{}, clauses ...map[string]interface{}) {
	boolQuery, ok := query["bool"].(map[string]interface{})
	if !ok {
		return
	}

	switch existing := boolQuery["should"].(type) {
	case nil:
		boolQuery["should"] = append([]map[string]interface{}{}, clauses...)
	case []map[string]interface{}:
		boolQuery["should"] = append(existing, clauses...)
	case []interface{}:
		for _, clause := range clauses {
			existing = append(existing, clause)
		}
		boolQuery["should"] = existing
	case map[string]interface{}:
		// ES accepts a lone clause object; promote it to a list to extend it.
		boolQuery["should"] = append([]map[string]interface{}{existing}, clauses...)
	}
}
|
|
|
|
// AddFilter appends clauses to the "filter" section of a bool query in place.
//
// query is left untouched when it has no "bool" map. The existing section may
// be absent or nil (a new list is created), a []map[string]interface{}, a
// []interface{}, or a single clause object (which is wrapped into a list so
// the new clauses are not silently dropped). Any other shape is left alone.
func AddFilter(query map[string]interface{}, clauses ...map[string]interface{}) {
	boolQuery, ok := query["bool"].(map[string]interface{})
	if !ok {
		return
	}

	switch existing := boolQuery["filter"].(type) {
	case nil:
		boolQuery["filter"] = append([]map[string]interface{}{}, clauses...)
	case []map[string]interface{}:
		boolQuery["filter"] = append(existing, clauses...)
	case []interface{}:
		for _, clause := range clauses {
			existing = append(existing, clause)
		}
		boolQuery["filter"] = existing
	case map[string]interface{}:
		// ES accepts a lone clause object; promote it to a list to extend it.
		boolQuery["filter"] = append([]map[string]interface{}{existing}, clauses...)
	}
}
|
|
|
|
// AddMustNot appends clauses to the "must_not" section of a bool query in
// place.
//
// query is left untouched when it has no "bool" map. The existing section may
// be absent or nil (a new list is created), a []map[string]interface{}, a
// []interface{}, or a single clause object (which is wrapped into a list so
// the new clauses are not silently dropped). Any other shape is left alone.
func AddMustNot(query map[string]interface{}, clauses ...map[string]interface{}) {
	boolQuery, ok := query["bool"].(map[string]interface{})
	if !ok {
		return
	}

	switch existing := boolQuery["must_not"].(type) {
	case nil:
		boolQuery["must_not"] = append([]map[string]interface{}{}, clauses...)
	case []map[string]interface{}:
		boolQuery["must_not"] = append(existing, clauses...)
	case []interface{}:
		for _, clause := range clauses {
			existing = append(existing, clause)
		}
		boolQuery["must_not"] = existing
	case map[string]interface{}:
		// ES accepts a lone clause object; promote it to a list to extend it.
		boolQuery["must_not"] = append([]map[string]interface{}{existing}, clauses...)
	}
}
|
|
|
|
// GetDocIDs is not implemented for Elasticsearch
|
|
func (e *elasticsearchEngine) GetDocIDs(chunks []map[string]interface{}) []string {
|
|
common.Warn("GetDocIDs not implemented for Elasticsearch")
|
|
return nil
|
|
}
|
|
|
|
// equivalentConditionToStr renders a condition map as a filter string (used
// as an ES query_string), joining one term per field with " AND ".
//
// Rules per entry:
//   - the "_id" key, nil values and empty strings are skipped;
//   - empty lists are skipped as well (they used to leak into the output as
//     field:'[]');
//   - []interface{} and []string become "(f:'a' OR f:'b')"; non-string list
//     items are rendered unquoted;
//   - numeric values are rendered unquoted;
//   - strings are single-quoted with embedded single quotes backslash-escaped;
//   - anything else is rendered with %v inside single quotes.
//
// Fields are emitted in sorted key order so the result is deterministic.
// An empty string is returned when nothing is left to render.
func equivalentConditionToStr(condition map[string]interface{}) string {
	if len(condition) == 0 {
		return ""
	}

	keys := make([]string, 0, len(condition))
	for k := range condition {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	// quoted renders field:'value' with single quotes in value escaped.
	quoted := func(field, value string) string {
		return fmt.Sprintf("%s:'%s'", field, strings.ReplaceAll(value, "'", "\\'"))
	}
	// anyOf groups alternatives for one field.
	anyOf := func(items []string) string {
		return "(" + strings.Join(items, " OR ") + ")"
	}

	cond := make([]string, 0, len(keys))
	for _, k := range keys {
		if k == "_id" {
			continue
		}

		switch v := condition[k].(type) {
		case nil:
			continue
		case string:
			if v == "" {
				continue
			}
			cond = append(cond, quoted(k, v))
		case []interface{}:
			if len(v) == 0 {
				continue
			}
			items := make([]string, 0, len(v))
			for _, item := range v {
				if s, ok := item.(string); ok {
					items = append(items, quoted(k, s))
				} else {
					items = append(items, fmt.Sprintf("%s:%v", k, item))
				}
			}
			cond = append(cond, anyOf(items))
		case []string:
			if len(v) == 0 {
				continue
			}
			items := make([]string, 0, len(v))
			for _, item := range v {
				items = append(items, quoted(k, item))
			}
			cond = append(cond, anyOf(items))
		case int, int8, int16, int32, int64,
			uint, uint8, uint16, uint32, uint64,
			float32, float64:
			// Numbers are emitted without quotes.
			cond = append(cond, fmt.Sprintf("%s:%v", k, v))
		default:
			// Fallback: treat as string.
			cond = append(cond, fmt.Sprintf("%s:'%v'", k, v))
		}
	}

	return strings.Join(cond, " AND ")
}
|
|
|
|
// isNumericValue reports whether v holds one of Go's built-in signed integer,
// unsigned integer or floating-point types.
func isNumericValue(v interface{}) bool {
	switch v.(type) {
	case int, int8, int16, int32, int64,
		uint, uint8, uint16, uint32, uint64,
		float32, float64:
		return true
	default:
		return false
	}
}
|
|
|
|
// calculateScores calculates _score for chunks
|
|
func calculateScores(chunks []map[string]interface{}, scoreColumn, pagerankField string) []map[string]interface{} {
|
|
for i := range chunks {
|
|
score := 0.0
|
|
if scoreVal, ok := chunks[i][scoreColumn]; ok {
|
|
if f, ok := toFloat64(scoreVal); ok {
|
|
score += f
|
|
}
|
|
}
|
|
if pagerankField != "" {
|
|
if prVal, ok := chunks[i][pagerankField]; ok {
|
|
if f, ok := toFloat64(prVal); ok {
|
|
score += f
|
|
}
|
|
}
|
|
}
|
|
chunks[i]["_score"] = score
|
|
}
|
|
return chunks
|
|
}
|
|
|
|
// toFloat64 converts a numeric value to float64. It accepts every built-in
// signed integer, unsigned integer and floating-point type. The boolean
// result is false, with a zero value, for anything else, including nil and
// numeric strings.
func toFloat64(v interface{}) (float64, bool) {
	switch val := v.(type) {
	case float64:
		return val, true
	case float32:
		return float64(val), true
	case int:
		return float64(val), true
	case int8:
		return float64(val), true
	case int16:
		return float64(val), true
	case int32:
		return float64(val), true
	case int64:
		return float64(val), true
	case uint:
		return float64(val), true
	case uint8:
		return float64(val), true
	case uint16:
		return float64(val), true
	case uint32:
		return float64(val), true
	case uint64:
		return float64(val), true
	}
	return 0, false
}
|
|
|
|
// sortByScore sorts chunks by _score descending and limits
|
|
func sortByScore(chunks []map[string]interface{}, limit int) []map[string]interface{} {
|
|
if len(chunks) == 0 {
|
|
return chunks
|
|
}
|
|
|
|
// Sort by _score descending
|
|
sort.Slice(chunks, func(i, j int) bool {
|
|
scoreI := getChunkScore(chunks[i])
|
|
scoreJ := getChunkScore(chunks[j])
|
|
return scoreI > scoreJ
|
|
})
|
|
|
|
// Limit
|
|
if len(chunks) > limit && limit > 0 {
|
|
chunks = chunks[:limit]
|
|
}
|
|
|
|
return chunks
|
|
}
|
|
|
|
// getChunkScore returns a chunk's score: the first of "_score", "SCORE" and
// "SIMILARITY" that holds a float64. Values of any other type are ignored,
// and 0.0 is returned when none of the keys qualifies.
func getChunkScore(chunk map[string]interface{}) float64 {
	for _, key := range [...]string{"_score", "SCORE", "SIMILARITY"} {
		if score, ok := chunk[key].(float64); ok {
			return score
		}
	}
	return 0.0
}
|
|
|
|
// GetFields applies field-name mappings to chunks in place and returns a map
// keyed by chunk ID. This mirrors the Infinity GetFields function behavior.
//
// Mappings, applied when the source key holds a string:
//   - docnm              -> docnm_kwd, title_tks, title_sm_tks
//   - important_keywords -> important_kwd (split on ","), important_tks
//   - questions          -> question_kwd (split on "\n"), question_tks
//   - content            -> content_with_weight, content_ltks, content_sm_ltks
//   - authors            -> authors_tks, authors_sm_tks
//
// When important_keywords or questions is absent, the derived keys default to
// empty lists, but only if the chunk does not already carry them. Chunks that
// store important_kwd/question_kwd directly keep their values instead of
// having them overwritten with empty lists.
//
// The returned map contains one entry per chunk whose "id" is a string. Each
// entry holds every chunk field when fields is empty, otherwise only the
// listed fields. Nil chunks are skipped.
func GetFields(chunks []map[string]interface{}, fields []string) map[string]map[string]interface{} {
	result := make(map[string]map[string]interface{})
	if len(chunks) == 0 {
		return result
	}

	// Set of requested fields; empty means "all fields".
	fieldSet := make(map[string]struct{}, len(fields))
	for _, f := range fields {
		fieldSet[f] = struct{}{}
	}

	for _, chunk := range chunks {
		if chunk == nil {
			// Nothing to map, and writing to a nil map would panic.
			continue
		}

		// fanOut copies a string source value to each target key.
		fanOut := func(src string, targets ...string) {
			val, ok := chunk[src].(string)
			if !ok {
				return
			}
			for _, target := range targets {
				chunk[target] = val
			}
		}

		// splitInto derives a keyword list and a token field from a delimited
		// string. Without a source string, defaults are filled in only for
		// keys the chunk does not already have.
		splitInto := func(src, sep, kwdKey, tksKey string) {
			val, ok := chunk[src].(string)
			if !ok {
				if _, exists := chunk[kwdKey]; !exists {
					chunk[kwdKey] = []interface{}{}
				}
				if _, exists := chunk[tksKey]; !exists {
					chunk[tksKey] = []interface{}{}
				}
				return
			}
			if val == "" {
				chunk[kwdKey] = []interface{}{}
			} else {
				chunk[kwdKey] = strings.Split(val, sep)
			}
			chunk[tksKey] = val
		}

		fanOut("docnm", "docnm_kwd", "title_tks", "title_sm_tks")
		splitInto("important_keywords", ",", "important_kwd", "important_tks")
		splitInto("questions", "\n", "question_kwd", "question_tks")
		fanOut("content", "content_with_weight", "content_ltks", "content_sm_ltks")
		fanOut("authors", "authors_tks", "authors_sm_tks")

		// Build the result entry keyed by id.
		id, ok := chunk["id"].(string)
		if !ok {
			continue
		}
		fieldMap := make(map[string]interface{}, len(chunk))
		for field, value := range chunk {
			if _, wanted := fieldSet[field]; len(fieldSet) == 0 || wanted {
				fieldMap[field] = value
			}
		}
		result[id] = fieldMap
	}

	return result
}