mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-01 16:25:44 +08:00
### What problem does this PR solve? Implement OpenAI chat completions in GO POST /api/v1/openai/<chat_id>/chat/completions OpenAI chat cli: internal/development.md ### Type of change - [x] Refactoring
2290 lines
68 KiB
Go
2290 lines
68 KiB
Go
//
|
||
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
||
//
|
||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||
// you may not use this file except in compliance with the License.
|
||
// You may obtain a copy of the License at
|
||
//
|
||
// http://www.apache.org/licenses/LICENSE-2.0
|
||
//
|
||
// Unless required by applicable law or agreed to in writing, software
|
||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
// See the License for the specific language governing permissions and
|
||
// limitations under the License.
|
||
//
|
||
|
||
package infinity
|
||
|
||
import (
|
||
"context"
|
||
"encoding/json"
|
||
"fmt"
|
||
"os"
|
||
"path/filepath"
|
||
"ragflow/internal/common"
|
||
"ragflow/internal/engine/types"
|
||
"ragflow/internal/utility"
|
||
"regexp"
|
||
"slices"
|
||
"sort"
|
||
"strconv"
|
||
"strings"
|
||
|
||
infinity "github.com/infiniflow/infinity-go-sdk"
|
||
"go.uber.org/zap"
|
||
)
|
||
|
||
// ChinesePunctRegex matches a run of one or more list separators: ASCII and
// full-width commas and semicolons, the ideographic comma (、), and CR/LF.
// It is intended for splitting delimiter-separated text.
var ChinesePunctRegex = regexp.MustCompile(`[,，;；、\r\n]+`)
|
||
|
||
// CreateChunkStore creates a chunk table in Infinity
|
||
// baseName is the table name prefix (e.g., "ragflow_<tenant_id>")
|
||
// The full table name is built as "{baseName}_{datasetID}"
|
||
// For skill index (datasetID="skill"), tableName is just baseName and uses skill_infinity_mapping.json
|
||
func (e *infinityEngine) CreateChunkStore(ctx context.Context, baseName, datasetID string, vectorSize int, parserID string) error {
|
||
vecSize := vectorSize
|
||
|
||
// Determine table name and mapping file based on index type
|
||
var tableName string
|
||
var mappingFile string
|
||
|
||
tableName = buildChunkTableName(baseName, datasetID)
|
||
if datasetID == "skill" {
|
||
mappingFile = "skill_infinity_mapping.json"
|
||
common.Info("Creating skill index table", zap.String("tableName", tableName), zap.String("mappingFile", mappingFile))
|
||
} else {
|
||
mappingFile = e.mappingFileName
|
||
common.Info("Creating regular index table", zap.String("tableName", tableName), zap.String("mappingFile", mappingFile))
|
||
}
|
||
|
||
// Use configured schema
|
||
fpMapping := filepath.Join(utility.GetProjectRoot(), "conf", mappingFile)
|
||
|
||
schemaData, err := os.ReadFile(fpMapping)
|
||
if err != nil {
|
||
return fmt.Errorf("Failed to read mapping file: %w", err)
|
||
}
|
||
|
||
var schema orderedFields
|
||
if err := json.Unmarshal(schemaData, &schema); err != nil {
|
||
return fmt.Errorf("Failed to parse mapping file: %w", err)
|
||
}
|
||
|
||
// Get database
|
||
db, err := e.client.conn.GetDatabase(e.client.dbName)
|
||
if err != nil {
|
||
return fmt.Errorf("Failed to get database: %w", err)
|
||
}
|
||
|
||
// Determine vector column name
|
||
vectorColName := fmt.Sprintf("q_%d_vec", vecSize)
|
||
|
||
// Check if table already exists
|
||
exists, err := e.tableExists(ctx, tableName)
|
||
if err != nil {
|
||
return fmt.Errorf("Failed to check if table exists: %w", err)
|
||
}
|
||
|
||
var table *infinity.Table
|
||
if exists {
|
||
// Table exists, open it and check if vector column needs to be added
|
||
common.Info("Table already exists, checking for vector column", zap.String("tableName", tableName))
|
||
table, err = db.GetTable(tableName)
|
||
if err != nil {
|
||
return fmt.Errorf("Failed to open existing table %s: %w", tableName, err)
|
||
}
|
||
|
||
// Check if vector column exists (for embedding model changes)
|
||
colExists, err := e.columnExists(table, vectorColName)
|
||
if err != nil {
|
||
common.Warn("Failed to check column existence", zap.String("column", vectorColName), zap.Error(err))
|
||
}
|
||
|
||
// Add new vector column if it doesn't exist (handles embedding model change)
|
||
if !colExists {
|
||
common.Info("Adding new vector column for embedding model change", zap.String("column", vectorColName), zap.Int("size", vecSize))
|
||
addColSchema := infinity.TableSchema{
|
||
&infinity.ColumnDefinition{
|
||
Name: vectorColName,
|
||
DataType: fmt.Sprintf("vector,%d,float", vecSize),
|
||
},
|
||
}
|
||
if _, err := table.AddColumns(addColSchema); err != nil {
|
||
common.Error("Failed to add vector column "+vectorColName, err)
|
||
return fmt.Errorf("Failed to add vector column %s: %w", vectorColName, err)
|
||
}
|
||
common.Info("Successfully added vector column", zap.String("column", vectorColName))
|
||
}
|
||
} else {
|
||
// Table doesn't exist, create it with vector column in the initial schema
|
||
common.Info(fmt.Sprintf("Creating table with vector column: %s with dimension %d", vectorColName, vecSize))
|
||
|
||
// Build column definitions (preserving JSON order)
|
||
var columns infinity.TableSchema
|
||
for _, fieldName := range schema.Keys {
|
||
fieldInfo := schema.Fields[fieldName]
|
||
col := infinity.ColumnDefinition{
|
||
Name: fieldName,
|
||
DataType: fieldInfo.Type,
|
||
Default: fieldInfo.Default,
|
||
}
|
||
columns = append(columns, &col)
|
||
}
|
||
|
||
// Add vector column
|
||
columns = append(columns, &infinity.ColumnDefinition{
|
||
Name: vectorColName,
|
||
DataType: fmt.Sprintf("vector,%d,float", vecSize),
|
||
})
|
||
|
||
// Add chunk_data column for table parser
|
||
if parserID == "table" {
|
||
columns = append(columns, &infinity.ColumnDefinition{
|
||
Name: "chunk_data",
|
||
DataType: "json",
|
||
Default: "{}",
|
||
})
|
||
}
|
||
|
||
// Create table
|
||
table, err = db.CreateTable(tableName, columns, infinity.ConflictTypeIgnore)
|
||
if err != nil {
|
||
return fmt.Errorf("Failed to create table: %w", err)
|
||
}
|
||
common.Debug("Infinity created table", zap.String("tableName", tableName))
|
||
}
|
||
|
||
// Create HNSW index on vector column with unique name based on vector size
|
||
// Use unique index name to avoid conflict when embedding model changes
|
||
vectorIndexName := fmt.Sprintf("q_%d_vec_idx", vecSize)
|
||
_, err = table.CreateIndex(
|
||
vectorIndexName,
|
||
infinity.NewIndexInfo(vectorColName, infinity.IndexTypeHnsw, map[string]string{
|
||
"M": "16",
|
||
"ef_construction": "50",
|
||
"metric": "cosine",
|
||
"encode": "lvq",
|
||
}),
|
||
infinity.ConflictTypeIgnore,
|
||
"",
|
||
)
|
||
if err != nil {
|
||
return fmt.Errorf("Failed to create HNSW index %s: %w", vectorIndexName, err)
|
||
}
|
||
common.Info("Created vector index", zap.String("indexName", vectorIndexName), zap.String("column", vectorColName))
|
||
|
||
// Create full-text indexes for varchar fields with analyzers
|
||
for _, fieldName := range schema.Keys {
|
||
fieldInfo := schema.Fields[fieldName]
|
||
if fieldInfo.Type != "varchar" || fieldInfo.Analyzer == nil {
|
||
continue
|
||
}
|
||
|
||
analyzers := []string{}
|
||
switch a := fieldInfo.Analyzer.(type) {
|
||
case string:
|
||
analyzers = []string{a}
|
||
case []interface{}:
|
||
for _, v := range a {
|
||
if s, ok := v.(string); ok {
|
||
analyzers = append(analyzers, s)
|
||
}
|
||
}
|
||
}
|
||
|
||
for _, analyzer := range analyzers {
|
||
indexNameFt := fmt.Sprintf("ft_%s_%s",
|
||
regexp.MustCompile(`[^a-zA-Z0-9]`).ReplaceAllString(fieldName, "_"),
|
||
regexp.MustCompile(`[^a-zA-Z0-9]`).ReplaceAllString(analyzer, "_"),
|
||
)
|
||
_, err = table.CreateIndex(
|
||
indexNameFt,
|
||
infinity.NewIndexInfo(fieldName, infinity.IndexTypeFullText, map[string]string{"ANALYZER": analyzer}),
|
||
infinity.ConflictTypeIgnore,
|
||
"",
|
||
)
|
||
if err != nil {
|
||
return fmt.Errorf("Failed to create fulltext index %s: %w", indexNameFt, err)
|
||
}
|
||
}
|
||
}
|
||
|
||
// Create secondary indexes for fields with index_type
|
||
for _, fieldName := range schema.Keys {
|
||
fieldInfo := schema.Fields[fieldName]
|
||
if fieldInfo.IndexType == nil {
|
||
continue
|
||
}
|
||
|
||
indexTypeStr := ""
|
||
params := map[string]string{}
|
||
|
||
switch it := fieldInfo.IndexType.(type) {
|
||
case string:
|
||
indexTypeStr = it
|
||
case map[string]interface{}:
|
||
if t, ok := it["type"].(string); ok {
|
||
indexTypeStr = t
|
||
}
|
||
if card, ok := it["cardinality"].(string); ok {
|
||
params["cardinality"] = card
|
||
}
|
||
}
|
||
|
||
if indexTypeStr == "secondary" {
|
||
indexNameSec := fmt.Sprintf("sec_%s", fieldName)
|
||
_, err = table.CreateIndex(
|
||
indexNameSec,
|
||
infinity.NewIndexInfo(fieldName, infinity.IndexTypeSecondary, params),
|
||
infinity.ConflictTypeIgnore,
|
||
"",
|
||
)
|
||
if err != nil {
|
||
return fmt.Errorf("Failed to create secondary index %s: %w", indexNameSec, err)
|
||
}
|
||
}
|
||
}
|
||
|
||
return nil
|
||
}
|
||
|
||
// InsertChunks inserts documents into a dataset table
|
||
// Table name format: {baseName}_{datasetID}
|
||
// Auto-create the table if it doesn't exist
|
||
// Delete existing rows with matching IDs before insert
|
||
func (e *infinityEngine) InsertChunks(ctx context.Context, chunks []map[string]interface{}, baseName string, datasetID string) ([]string, error) {
|
||
tableName := buildChunkTableName(baseName, datasetID)
|
||
common.Info("InfinityConnection.InsertChunks called", zap.String("tableName", tableName), zap.Int("chunkCount", len(chunks)))
|
||
|
||
db, err := e.client.conn.GetDatabase(e.client.dbName)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("Failed to get database: %w", err)
|
||
}
|
||
|
||
table, err := db.GetTable(tableName)
|
||
if err != nil {
|
||
// Table doesn't exist, try to create it
|
||
errMsg := strings.ToLower(err.Error())
|
||
if !strings.Contains(errMsg, "not found") && !strings.Contains(errMsg, "doesn't exist") {
|
||
return nil, fmt.Errorf("Failed to get table %s: %w", tableName, err)
|
||
}
|
||
|
||
// Infer vector size from chunks
|
||
vectorSize := 0
|
||
vectorPattern := regexp.MustCompile(`q_(\d+)_vec`)
|
||
for _, chunk := range chunks {
|
||
for key := range chunk {
|
||
matches := vectorPattern.FindStringSubmatch(key)
|
||
if len(matches) >= 2 {
|
||
vectorSize, _ = strconv.Atoi(matches[1])
|
||
break
|
||
}
|
||
}
|
||
if vectorSize > 0 {
|
||
break
|
||
}
|
||
}
|
||
if vectorSize == 0 {
|
||
return nil, fmt.Errorf("cannot infer vector size from chunks")
|
||
}
|
||
|
||
// Determine parser_id from chunk structure
|
||
parserID := ""
|
||
if chunkData, ok := chunks[0]["chunk_data"].(map[string]interface{}); ok && chunkData != nil {
|
||
parserID = "table"
|
||
}
|
||
|
||
// Create table
|
||
if err := e.CreateChunkStore(ctx, baseName, datasetID, vectorSize, parserID); err != nil {
|
||
return nil, fmt.Errorf("Failed to create table: %w", err)
|
||
}
|
||
|
||
table, err = db.GetTable(tableName)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("Failed to get table after creation: %w", err)
|
||
}
|
||
}
|
||
|
||
// Get embedding columns and their sizes
|
||
var embeddingCols [][2]interface{}
|
||
colsResp, err := table.ShowColumns()
|
||
if err != nil {
|
||
return nil, fmt.Errorf("Failed to get columns: %w", err)
|
||
}
|
||
result, ok := colsResp.(*infinity.QueryResult)
|
||
if !ok {
|
||
return nil, fmt.Errorf("unexpected response type: %T", colsResp)
|
||
}
|
||
|
||
// ShowColumns returns a result set where Data contains arrays of column values
|
||
re := regexp.MustCompile(`Embedding\([a-z]+,(\d+)\)`)
|
||
if nameArr, ok := result.Data["name"]; ok {
|
||
if typeArr, ok := result.Data["type"]; ok {
|
||
for i := 0; i < len(nameArr); i++ {
|
||
colName, _ := nameArr[i].(string)
|
||
colType, _ := typeArr[i].(string)
|
||
matches := re.FindStringSubmatch(colType)
|
||
if len(matches) >= 2 {
|
||
size, _ := strconv.Atoi(matches[1])
|
||
embeddingCols = append(embeddingCols, [2]interface{}{colName, size})
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Transform chunks using helper function
|
||
insertChunks := make([]map[string]interface{}, len(chunks))
|
||
for i, chunk := range chunks {
|
||
insertChunks[i] = transformChunkFields(chunk, embeddingCols)
|
||
}
|
||
|
||
// Delete existing rows with matching IDs
|
||
if len(insertChunks) > 0 {
|
||
idList := make([]string, len(insertChunks))
|
||
for i, chunk := range insertChunks {
|
||
idList[i] = fmt.Sprintf("'%v'", chunk["id"])
|
||
}
|
||
filter := fmt.Sprintf("id IN (%s)", strings.Join(idList, ", "))
|
||
common.Debug(fmt.Sprintf("Deleting existing rows with filter: %s", filter))
|
||
delResp, delErr := table.Delete(filter)
|
||
if delErr != nil {
|
||
common.Warn(fmt.Sprintf("Failed to delete existing rows: %v", delErr))
|
||
} else {
|
||
common.Info(fmt.Sprintf("Deleted %d existing rows", delResp.DeletedRows))
|
||
}
|
||
}
|
||
|
||
// Insert chunks to dataset
|
||
_, err = table.Insert(insertChunks)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("Failed to insert chunks to dataset: %w", err)
|
||
}
|
||
|
||
common.Info("InfinityConnection.InsertChunks result", zap.String("tableName", tableName), zap.Int("count", len(insertChunks)))
|
||
return []string{}, nil
|
||
}
|
||
|
||
// UpdateChunks updates chunks in a dataset table
|
||
// Table name format: {baseName}_{datasetID}
|
||
func (e *infinityEngine) UpdateChunks(ctx context.Context, condition map[string]interface{}, newValue map[string]interface{}, baseName string, datasetID string) error {
|
||
tableName := buildChunkTableName(baseName, datasetID)
|
||
common.Info("InfinityConnection.UpdateChunks called", zap.String("tableName", tableName), zap.Any("condition", condition))
|
||
|
||
db, err := e.client.conn.GetDatabase(e.client.dbName)
|
||
if err != nil {
|
||
return fmt.Errorf("Failed to get database: %w", err)
|
||
}
|
||
|
||
table, err := db.GetTable(tableName)
|
||
if err != nil {
|
||
return fmt.Errorf("Failed to get table %s: %w", tableName, err)
|
||
}
|
||
|
||
// Get table columns
|
||
clmns := make(map[string]struct {
|
||
Type string
|
||
Default interface{}
|
||
})
|
||
colsResp, err := table.ShowColumns()
|
||
if err != nil {
|
||
return fmt.Errorf("Failed to get columns: %w", err)
|
||
}
|
||
result, ok := colsResp.(*infinity.QueryResult)
|
||
if ok {
|
||
if nameArr, ok := result.Data["name"]; ok {
|
||
if typeArr, ok := result.Data["type"]; ok {
|
||
if defArr, ok := result.Data["default"]; ok {
|
||
for i := 0; i < len(nameArr); i++ {
|
||
colName, _ := nameArr[i].(string)
|
||
colType, _ := typeArr[i].(string)
|
||
var colDefault interface{}
|
||
if i < len(defArr) {
|
||
colDefault = defArr[i]
|
||
}
|
||
clmns[colName] = struct {
|
||
Type string
|
||
Default interface{}
|
||
}{colType, colDefault}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Build filter string from condition
|
||
filter := buildFilterFromCondition(condition, clmns)
|
||
|
||
// Process remove operation first
|
||
removeValue := make(map[string]interface{})
|
||
if removeData, ok := newValue["remove"].(map[string]interface{}); ok {
|
||
removeValue = removeData
|
||
}
|
||
delete(newValue, "remove")
|
||
|
||
// Transform new_value fields using helper function (no embeddings needed for update)
|
||
transformed := transformChunkFields(newValue, nil)
|
||
for k, v := range transformed {
|
||
newValue[k] = v
|
||
}
|
||
|
||
// Remove original fields that were transformed (they're now in transformed with new names/types)
|
||
// Also remove intermediate token fields that shouldn't be stored in Infinity
|
||
// This must match Python's delete list in infinity_conn.py
|
||
for _, key := range []string{"docnm_kwd", "title_tks", "title_sm_tks", "important_kwd", "important_tks",
|
||
"content_with_weight", "content_ltks", "content_sm_ltks", "authors_tks", "authors_sm_tks",
|
||
"question_kwd", "question_tks"} {
|
||
delete(newValue, key)
|
||
}
|
||
|
||
// Handle remove operations if any
|
||
if len(removeValue) > 0 {
|
||
colToRemove := make([]string, 0, len(removeValue))
|
||
for k := range removeValue {
|
||
colToRemove = append(colToRemove, k)
|
||
}
|
||
colToRemove = append(colToRemove, "id")
|
||
|
||
// Query rows to be updated
|
||
queryResult, err := table.Output(colToRemove).Filter(filter).ToResult()
|
||
if err != nil {
|
||
common.Warn(fmt.Sprintf("Failed to query rows for remove operation: %v", err))
|
||
} else {
|
||
qr, ok := queryResult.(*infinity.QueryResult)
|
||
if ok && len(qr.Data) > 0 {
|
||
// Get the id column and columns to remove
|
||
idCol := qr.Data["id"]
|
||
removeOpt := make(map[string]map[string][]string) // column -> value -> [ids]
|
||
|
||
for colName, colData := range qr.Data {
|
||
if colName == "id" {
|
||
continue
|
||
}
|
||
removeVal := removeValue[colName]
|
||
for i, id := range idCol {
|
||
if i < len(colData) {
|
||
existingVal := colData[i]
|
||
if removeStr, ok := removeVal.(string); ok {
|
||
// Split existing value by ### and remove the target value
|
||
if existingStr, ok := existingVal.(string); ok {
|
||
parts := strings.Split(existingStr, "###")
|
||
var newParts []string
|
||
for _, p := range parts {
|
||
if p != removeStr {
|
||
newParts = append(newParts, p)
|
||
}
|
||
}
|
||
if len(newParts) != len(parts) {
|
||
idStr := fmt.Sprintf("'%s'", escapeFilterValue(fmt.Sprintf("%v", id)))
|
||
if removeOpt[colName] == nil {
|
||
removeOpt[colName] = make(map[string][]string)
|
||
}
|
||
removeOpt[colName][strings.Join(newParts, "###")] = append(removeOpt[colName][strings.Join(newParts, "###")], idStr)
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Execute remove updates
|
||
for colName, valueToIDs := range removeOpt {
|
||
for newVal, ids := range valueToIDs {
|
||
idFilter := filter + " AND id IN (" + strings.Join(ids, ", ") + ")"
|
||
common.Info(fmt.Sprintf("INFINITY remove update: table=%s, idFilter=%s, column=%s, newValue=%v", tableName, idFilter, colName, newVal))
|
||
_, err := table.Update(idFilter, map[string]interface{}{colName: newVal})
|
||
if err != nil {
|
||
common.Warn(fmt.Sprintf("Failed to remove value from column %s: %v", colName, err))
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Execute the main update
|
||
common.Info(fmt.Sprintf("INFINITY update: table=%s, filter=%s, newValue=%v", tableName, filter, newValue))
|
||
_, err = table.Update(filter, newValue)
|
||
if err != nil {
|
||
return fmt.Errorf("Failed to update chunks: %w", err)
|
||
}
|
||
|
||
common.Info("InfinityConnection.UpdateChunks completes", zap.String("tableName", tableName))
|
||
return nil
|
||
}
|
||
|
||
// DeleteChunks deletes chunks from a dataset table
|
||
// Table name format: {baseName}_{datasetID}
|
||
// condition specifies which chunks to delete
|
||
func (e *infinityEngine) DeleteChunks(ctx context.Context, condition map[string]interface{}, baseName string, datasetID string) (int64, error) {
|
||
tableName := buildChunkTableName(baseName, datasetID)
|
||
|
||
db, err := e.client.conn.GetDatabase(e.client.dbName)
|
||
if err != nil {
|
||
return 0, fmt.Errorf("failed to get database: %w", err)
|
||
}
|
||
|
||
table, err := db.GetTable(tableName)
|
||
if err != nil {
|
||
common.Warn(fmt.Sprintf("Table %s does not exist, skipping delete", tableName))
|
||
return 0, nil
|
||
}
|
||
|
||
// Get table columns for building filter
|
||
clmns := make(map[string]struct {
|
||
Type string
|
||
Default interface{}
|
||
})
|
||
colsResp, err := table.ShowColumns()
|
||
if err != nil {
|
||
return 0, fmt.Errorf("failed to get columns: %w", err)
|
||
}
|
||
result, ok := colsResp.(*infinity.QueryResult)
|
||
if ok {
|
||
if nameArr, ok := result.Data["name"]; ok {
|
||
if typeArr, ok := result.Data["type"]; ok {
|
||
if defArr, ok := result.Data["default"]; ok {
|
||
for i := 0; i < len(nameArr); i++ {
|
||
colName, _ := nameArr[i].(string)
|
||
colType, _ := typeArr[i].(string)
|
||
var colDefault interface{}
|
||
if i < len(defArr) {
|
||
colDefault = defArr[i]
|
||
}
|
||
clmns[colName] = struct {
|
||
Type string
|
||
Default interface{}
|
||
}{colType, colDefault}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Build filter from condition
|
||
filter := buildFilterFromCondition(condition, clmns)
|
||
|
||
delResp, err := table.Delete(filter)
|
||
if err != nil {
|
||
return 0, fmt.Errorf("failed to delete: %w", err)
|
||
}
|
||
|
||
return delResp.DeletedRows, nil
|
||
}
|
||
|
||
// Search searches the Infinity engine for matching chunks.
|
||
// It supports three matching types: MatchTextExpr (full-text), MatchDenseExpr (vector), and FusionExpr (combined).
|
||
// If no match expressions are provided, Search relies solely on filter (e.g., doc_id, available_int) to find results.
|
||
func (e *infinityEngine) Search(ctx context.Context, req *types.SearchRequest) (*types.SearchResult, error) {
|
||
types.LogSearchRequest("Infinity", req)
|
||
|
||
if len(req.IndexNames) == 0 {
|
||
return nil, fmt.Errorf("index names cannot be empty")
|
||
}
|
||
|
||
// Get retrieval parameters with defaults
|
||
pageSize := req.Limit
|
||
if pageSize <= 0 {
|
||
pageSize = 30
|
||
}
|
||
|
||
offset := req.Offset
|
||
if offset < 0 {
|
||
offset = 0
|
||
}
|
||
|
||
db, err := e.client.conn.GetDatabase(e.client.dbName)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("failed to get database: %w", err)
|
||
}
|
||
|
||
isSkillIndex := false
|
||
for _, idx := range req.IndexNames {
|
||
if strings.HasPrefix(idx, "skill_") {
|
||
isSkillIndex = true
|
||
break
|
||
}
|
||
}
|
||
|
||
var outputColumns []string
|
||
if isSkillIndex {
|
||
outputColumns = []string{
|
||
"skill_id", "space_id", "folder_id", "name", "tags", "description", "content",
|
||
"version", "status", "create_time", "update_time",
|
||
}
|
||
outputColumns = convertSelectFields(outputColumns, true)
|
||
} else {
|
||
outputColumns = []string{
|
||
"id", "doc_id", "kb_id", "content_ltks", "content_with_weight",
|
||
"title_tks", "docnm_kwd", "img_id", "available_int", "important_kwd",
|
||
"position_int", "page_num_int", "top_int", "chunk_order_int",
|
||
"create_timestamp_flt", "knowledge_graph_kwd", "question_kwd", "question_tks",
|
||
"doc_type_kwd", "mom_id", "tag_kwd", "pagerank_fea", "tag_feas",
|
||
}
|
||
outputColumns = convertSelectFields(outputColumns)
|
||
}
|
||
|
||
// Allow caller to override output columns (used by KG search, etc.)
|
||
if len(req.SelectFields) > 0 {
|
||
outputColumns = convertSelectFields(req.SelectFields)
|
||
}
|
||
|
||
hasTextMatch := false
|
||
hasVectorMatch := false
|
||
var matchText *types.MatchTextExpr
|
||
var matchDense *types.MatchDenseExpr
|
||
if req.MatchExprs != nil && len(req.MatchExprs) > 0 {
|
||
for _, expr := range req.MatchExprs {
|
||
if expr == nil {
|
||
continue
|
||
}
|
||
switch e := expr.(type) {
|
||
case string:
|
||
if e != "" {
|
||
hasTextMatch = true
|
||
matchText = &types.MatchTextExpr{
|
||
MatchingText: e,
|
||
TopN: pageSize,
|
||
}
|
||
}
|
||
case *types.MatchTextExpr:
|
||
if e.MatchingText != "" {
|
||
hasTextMatch = true
|
||
matchText = e
|
||
}
|
||
case *types.MatchDenseExpr:
|
||
if len(e.EmbeddingData) > 0 {
|
||
hasVectorMatch = true
|
||
matchDense = e
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
if hasTextMatch || hasVectorMatch {
|
||
if hasTextMatch {
|
||
outputColumns = append(outputColumns, "score()")
|
||
}
|
||
// similarity() is only allowed by Infinity when there is ONLY MATCH VECTOR.
|
||
// When both text and vector matches exist (hybrid search with Fusion),
|
||
// only score() is valid — Fusion produces a unified SCORE column.
|
||
if hasVectorMatch && !hasTextMatch {
|
||
outputColumns = append(outputColumns, "similarity()")
|
||
}
|
||
// Skill index does not have pagerank_fea and tag_feas columns
|
||
if !isSkillIndex {
|
||
if !slices.Contains(outputColumns, common.PAGERANK_FLD) {
|
||
outputColumns = append(outputColumns, common.PAGERANK_FLD)
|
||
}
|
||
if !slices.Contains(outputColumns, common.TAG_FLD) {
|
||
outputColumns = append(outputColumns, common.TAG_FLD)
|
||
}
|
||
}
|
||
}
|
||
|
||
if !slices.Contains(outputColumns, "row_id") && !slices.Contains(outputColumns, "row_id()") {
|
||
outputColumns = append(outputColumns, "row_id()")
|
||
}
|
||
|
||
// Strip score pseudo-columns when there's no match expression — Infinity
|
||
// rejects SCORE()/SCORE_FACTORS() without MATCH TEXT/TENSOR/Fusion with
|
||
// "InfinityException(3013)". This protects callers (e.g. the no-match
|
||
// fallback in retrieval.go) that reuse a SelectFields list containing
|
||
// "_score" across both matched and unmatched queries.
|
||
if !hasTextMatch && !hasVectorMatch {
|
||
filtered := outputColumns[:0]
|
||
for _, c := range outputColumns {
|
||
switch c {
|
||
case "_score", "SCORE", "score()", "similarity()":
|
||
continue
|
||
}
|
||
filtered = append(filtered, c)
|
||
}
|
||
outputColumns = filtered
|
||
}
|
||
|
||
outputColumns = convertSelectFields(outputColumns, isSkillIndex)
|
||
if hasVectorMatch && matchDense != nil && matchDense.VectorColumnName != "" {
|
||
outputColumns = append(outputColumns, matchDense.VectorColumnName)
|
||
}
|
||
|
||
var filterParts []string
|
||
|
||
if hasTextMatch || hasVectorMatch {
|
||
if req.Filter != nil {
|
||
if availInt, ok := req.Filter["available_int"]; ok {
|
||
filterParts = append(filterParts, fmt.Sprintf("available_int=%v", availInt))
|
||
} else if status, ok := req.Filter["status"]; ok {
|
||
filterParts = append(filterParts, fmt.Sprintf("status='%s'", status))
|
||
} else {
|
||
if isSkillIndex {
|
||
filterParts = append(filterParts, "status='1'")
|
||
} else {
|
||
filterParts = append(filterParts, "available_int=1")
|
||
}
|
||
}
|
||
} else {
|
||
if isSkillIndex {
|
||
filterParts = append(filterParts, "status='1'")
|
||
} else {
|
||
filterParts = append(filterParts, "available_int=1")
|
||
}
|
||
}
|
||
}
|
||
|
||
// Build filter string from req.Filter
|
||
if req.Filter != nil {
|
||
filterCopy := make(map[string]interface{})
|
||
for k, v := range req.Filter {
|
||
if k != "kb_id" {
|
||
filterCopy[k] = v
|
||
}
|
||
}
|
||
|
||
condStr := equivalentConditionToStr(filterCopy)
|
||
if condStr != "" {
|
||
filterParts = append(filterParts, condStr)
|
||
}
|
||
}
|
||
filterStr := strings.Join(filterParts, " AND ")
|
||
|
||
orderBy := req.OrderBy
|
||
var rankFeature map[string]float64
|
||
if req.RankFeature != nil {
|
||
rankFeature = req.RankFeature
|
||
}
|
||
|
||
var fusionExpr *types.FusionExpr
|
||
if len(req.MatchExprs) > 2 {
|
||
if fe, ok := req.MatchExprs[2].(*types.FusionExpr); ok {
|
||
fusionExpr = fe
|
||
}
|
||
}
|
||
|
||
var allResults []map[string]interface{}
|
||
totalHits := int64(0)
|
||
|
||
for _, indexName := range req.IndexNames {
|
||
var tableNames []string
|
||
if strings.HasPrefix(indexName, "ragflow_doc_meta_") {
|
||
tableNames = []string{indexName}
|
||
} else {
|
||
kbIDs := req.KbIDs
|
||
if len(kbIDs) == 0 {
|
||
kbIDs = []string{""}
|
||
}
|
||
for _, kbID := range kbIDs {
|
||
if kbID == "" {
|
||
tableNames = append(tableNames, indexName)
|
||
} else {
|
||
tableNames = append(tableNames, fmt.Sprintf("%s_%s", indexName, kbID))
|
||
}
|
||
}
|
||
}
|
||
|
||
// minMatch comes from matchText.ExtraOptions when set (Python parity).
|
||
// Mirrors rag/utils/infinity_conn.py which reads
|
||
// matchExpr.extra_options.get("minimum_should_match", 0.0) — for the
|
||
// English (non-Chinese) path, the Go Question() builder omits
|
||
// minimum_should_match, so the default is 0.0 to match Python's
|
||
// effective 0% threshold for English queries.
|
||
minMatch := 0.0
|
||
var questionText string
|
||
var vectorData []float64
|
||
textTopN := pageSize
|
||
var originalQuery string
|
||
if matchText != nil {
|
||
questionText = matchText.MatchingText
|
||
textTopN = int(matchText.TopN)
|
||
if matchText.ExtraOptions != nil {
|
||
if oq, ok := matchText.ExtraOptions["original_query"].(string); ok {
|
||
originalQuery = oq
|
||
}
|
||
if v, ok := matchText.ExtraOptions["minimum_should_match"]; ok {
|
||
switch x := v.(type) {
|
||
case float64:
|
||
minMatch = x
|
||
case int:
|
||
minMatch = float64(x)
|
||
case string:
|
||
s := strings.TrimSuffix(x, "%")
|
||
if pct, err := strconv.Atoi(s); err == nil {
|
||
minMatch = float64(pct) / 100
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
if matchDense != nil {
|
||
vectorData = matchDense.EmbeddingData
|
||
}
|
||
|
||
for _, tableName := range tableNames {
|
||
tbl, err := db.GetTable(tableName)
|
||
if err != nil {
|
||
continue
|
||
}
|
||
table := tbl.Output(outputColumns)
|
||
|
||
var textFields []string
|
||
if matchText != nil && len(matchText.Fields) > 0 {
|
||
textFields = matchText.Fields
|
||
} else if isSkillIndex {
|
||
textFields = []string{
|
||
"name^10",
|
||
"tags^5",
|
||
"description^3",
|
||
"content^1",
|
||
}
|
||
} else {
|
||
textFields = []string{
|
||
"title_tks^10",
|
||
"title_sm_tks^5",
|
||
"important_kwd^30",
|
||
"important_tks^20",
|
||
"question_tks^20",
|
||
"content_ltks^2",
|
||
"content_sm_ltks",
|
||
}
|
||
}
|
||
|
||
// Convert field names for Infinity
|
||
var convertedFields []string
|
||
for _, f := range textFields {
|
||
cf := convertMatchingField(f)
|
||
convertedFields = append(convertedFields, cf)
|
||
}
|
||
fields := strings.Join(convertedFields, ",")
|
||
|
||
hasTextMatch := questionText != ""
|
||
hasVectorMatch := len(vectorData) > 0
|
||
// Add text match if question is provided
|
||
if hasTextMatch {
|
||
extraOptions := map[string]string{
|
||
"minimum_should_match": fmt.Sprintf("%d%%", int(minMatch*100)),
|
||
}
|
||
|
||
if filterStr != "" {
|
||
extraOptions["filter"] = filterStr
|
||
}
|
||
|
||
if rankFeature != nil {
|
||
var rankFeaturesList []string
|
||
for featureName, weight := range rankFeature {
|
||
rankFeaturesList = append(rankFeaturesList, fmt.Sprintf("%s^%s^%.0f", common.TAG_FLD, featureName, weight))
|
||
}
|
||
if len(rankFeaturesList) > 0 {
|
||
extraOptions["rank_features"] = strings.Join(rankFeaturesList, ",")
|
||
}
|
||
}
|
||
|
||
if originalQuery != "" {
|
||
extraOptions["original_query"] = originalQuery
|
||
}
|
||
|
||
table = table.MatchText(fields, questionText, textTopN, extraOptions)
|
||
|
||
common.Debug(fmt.Sprintf(
|
||
"MatchTextExpr:\n"+
|
||
" fields=%s\n"+
|
||
" matching_text=%s\n"+
|
||
" topn=%d\n"+
|
||
" extra_options=%v",
|
||
fields, questionText, textTopN, extraOptions,
|
||
))
|
||
}
|
||
|
||
// Add vector match if provided
|
||
if hasVectorMatch {
|
||
vecFieldName := fmt.Sprintf("q_%d_vec", len(vectorData))
|
||
dataType := "float"
|
||
distanceType := "cosine"
|
||
|
||
if matchDense != nil {
|
||
if matchDense.VectorColumnName != "" {
|
||
vecFieldName = matchDense.VectorColumnName
|
||
}
|
||
if matchDense.EmbeddingDataType != "" {
|
||
dataType = matchDense.EmbeddingDataType
|
||
}
|
||
if matchDense.DistanceType != "" {
|
||
distanceType = matchDense.DistanceType
|
||
}
|
||
}
|
||
|
||
vectorTopN := pageSize
|
||
if matchDense != nil && matchDense.TopN > 0 {
|
||
vectorTopN = int(matchDense.TopN)
|
||
}
|
||
|
||
denseFilterStr := filterStr
|
||
if denseFilterStr == "" {
|
||
if isSkillIndex {
|
||
denseFilterStr = "status='1'"
|
||
} else {
|
||
denseFilterStr = "available_int=1"
|
||
}
|
||
}
|
||
|
||
if hasTextMatch {
|
||
fieldsStr := strings.Join(convertedFields, ",")
|
||
filterFulltext := fmt.Sprintf("filter_fulltext('%s', '%s')", fieldsStr, questionText)
|
||
denseFilterStr = fmt.Sprintf("(%s) AND %s", denseFilterStr, filterFulltext)
|
||
}
|
||
threshold := "0.0"
|
||
if matchDense != nil && matchDense.ExtraOptions != nil {
|
||
if sim, ok := matchDense.ExtraOptions["similarity"].(float64); ok {
|
||
threshold = fmt.Sprintf("%g", sim)
|
||
} else if s, ok := matchDense.ExtraOptions["threshold"].(string); ok {
|
||
threshold = s
|
||
}
|
||
}
|
||
extraOptions := map[string]string{
|
||
"threshold": threshold,
|
||
"filter": denseFilterStr,
|
||
}
|
||
|
||
common.Debug("MatchDense for hybrid search",
|
||
zap.String("fieldName", vecFieldName),
|
||
zap.String("distanceType", distanceType),
|
||
zap.Int("topN", vectorTopN),
|
||
zap.Bool("hasFusion", fusionExpr != nil))
|
||
|
||
table = table.MatchDense(vecFieldName, vectorData, dataType, distanceType, vectorTopN, extraOptions)
|
||
}
|
||
|
||
// Add fusion (for text + vector combination)
|
||
if hasTextMatch && hasVectorMatch && fusionExpr != nil {
|
||
fusionMethod := fusionExpr.Method
|
||
fusionTopK := fusionExpr.TopN
|
||
if fusionTopK == 0 {
|
||
fusionTopK = pageSize
|
||
}
|
||
fusionParams := map[string]interface{}{
|
||
"normalize": "atan",
|
||
}
|
||
if fusionExpr.FusionParams != nil {
|
||
for k, v := range fusionExpr.FusionParams {
|
||
fusionParams[k] = v
|
||
}
|
||
}
|
||
|
||
common.Debug("Applying Fusion for hybrid search",
|
||
zap.String("method", fusionMethod),
|
||
zap.Int("topN", fusionTopK),
|
||
zap.Any("params", fusionParams))
|
||
|
||
table = table.Fusion(fusionMethod, fusionTopK, fusionParams)
|
||
}
|
||
|
||
// Add order_by if provided
|
||
if orderBy != nil && len(orderBy.Fields) > 0 {
|
||
var sortFields [][2]interface{}
|
||
for _, orderField := range orderBy.Fields {
|
||
sortType := infinity.SortTypeAsc
|
||
if orderField.Type == types.SortDesc {
|
||
sortType = infinity.SortTypeDesc
|
||
}
|
||
sortFields = append(sortFields, [2]interface{}{orderField.Field, sortType})
|
||
}
|
||
table = table.Sort(sortFields)
|
||
}
|
||
|
||
// Add filter when there's no text/vector match (like metadata queries)
|
||
if !hasTextMatch && !hasVectorMatch && filterStr != "" {
|
||
common.Debug(fmt.Sprintf("Adding filter for no-match query: %s", filterStr))
|
||
table = table.Filter(filterStr)
|
||
}
|
||
|
||
// Set limit and offset
|
||
table = table.Limit(pageSize)
|
||
if offset > 0 {
|
||
table = table.Offset(offset)
|
||
}
|
||
|
||
// Request total_hits_count from Infinity
|
||
table = table.Option(map[string]interface{}{"total_hits_count": true})
|
||
|
||
// Execute query
|
||
df, err := table.ToDataFrame()
|
||
if err != nil {
|
||
common.Warn("Infinity query failed",
|
||
zap.String("tableName", tableName),
|
||
zap.Bool("hasTextMatch", hasTextMatch),
|
||
zap.Bool("hasVectorMatch", hasVectorMatch),
|
||
zap.Bool("hasFusion", fusionExpr != nil),
|
||
zap.Error(err))
|
||
continue
|
||
}
|
||
|
||
// Convert DataFrame to chunks format (column-oriented to row-oriented)
|
||
searchChunks := make([]map[string]interface{}, 0)
|
||
for colName, colData := range df.ColumnData {
|
||
for i, val := range colData {
|
||
for len(searchChunks) <= i {
|
||
searchChunks = append(searchChunks, make(map[string]interface{}))
|
||
}
|
||
searchChunks[i][colName] = val
|
||
}
|
||
}
|
||
|
||
// Apply field name mapping and row_id handling
|
||
// Skill index uses different schema
|
||
// so we skip the document-specific field mappings
|
||
if !isSkillIndex {
|
||
applyFieldMappings(searchChunks)
|
||
} else {
|
||
// For skill index, only handle ROW_ID -> row_id() mapping
|
||
for _, chunk := range searchChunks {
|
||
if val, ok := chunk["ROW_ID"]; ok {
|
||
chunk["row_id()"] = val
|
||
delete(chunk, "ROW_ID")
|
||
}
|
||
}
|
||
}
|
||
|
||
// Parse total_hits_count from ExtraInfo
|
||
var tableTotal int64
|
||
if df.ExtraInfo != "" {
|
||
var extraResult map[string]interface{}
|
||
if err := json.Unmarshal([]byte(df.ExtraInfo), &extraResult); err == nil {
|
||
if count, ok := extraResult["total_hits_count"].(float64); ok {
|
||
tableTotal = int64(count)
|
||
}
|
||
}
|
||
}
|
||
|
||
searchResult := &types.SearchResult{
|
||
Chunks: searchChunks,
|
||
Total: tableTotal,
|
||
}
|
||
|
||
allResults = append(allResults, searchResult.Chunks...)
|
||
totalHits += searchResult.Total
|
||
}
|
||
}
|
||
|
||
if hasTextMatch || hasVectorMatch {
|
||
scoreColumn := ""
|
||
if hasTextMatch && hasVectorMatch {
|
||
scoreColumn = "SCORE"
|
||
} else if hasTextMatch {
|
||
scoreColumn = "SCORE"
|
||
} else if hasVectorMatch {
|
||
scoreColumn = "SIMILARITY"
|
||
}
|
||
pagerankField := common.PAGERANK_FLD
|
||
if isSkillIndex {
|
||
pagerankField = "" // Skill index has no pagerank field
|
||
}
|
||
|
||
allResults = calculateScores(allResults, scoreColumn, pagerankField)
|
||
allResults = sortByScore(allResults, len(allResults))
|
||
}
|
||
|
||
if len(allResults) > pageSize {
|
||
allResults = allResults[:pageSize]
|
||
}
|
||
|
||
common.Debug("Search in Infinity completed", zap.Int("returnedRows", len(allResults)), zap.Int64("totalHits", totalHits))
|
||
|
||
return &types.SearchResult{
|
||
Chunks: allResults,
|
||
Total: totalHits,
|
||
}, nil
|
||
}
|
||
|
||
// GetChunk gets a chunk by ID
|
||
func (e *infinityEngine) GetChunk(ctx context.Context, tableName, chunkID string, datasetIDs []string) (interface{}, error) {
|
||
if e.client == nil || e.client.conn == nil {
|
||
return nil, fmt.Errorf("Infinity client not initialized")
|
||
}
|
||
|
||
common.Info("Infinity get chunk start",
|
||
zap.String("chunkID", chunkID),
|
||
zap.String("tableName", tableName),
|
||
zap.Strings("datasetIDs", datasetIDs))
|
||
|
||
// Build list of table names to search
|
||
tableNames := make([]string, 0, len(datasetIDs))
|
||
for _, datasetID := range datasetIDs {
|
||
tableNames = append(tableNames, fmt.Sprintf("%s_%s", tableName, datasetID))
|
||
}
|
||
|
||
// Try each table and collect results from all tables
|
||
db, err := e.client.conn.GetDatabase(e.client.dbName)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("failed to get database: %w", err)
|
||
}
|
||
|
||
// Collect chunks from all tables (same as Python's concat_dataframes)
|
||
allChunks := make(map[string]map[string]interface{})
|
||
|
||
for _, tblName := range tableNames {
|
||
table, err := db.GetTable(tblName)
|
||
if err != nil {
|
||
continue
|
||
}
|
||
|
||
// Query with filter for the specific chunk ID
|
||
filter := fmt.Sprintf("id = '%s'", chunkID)
|
||
result, err := table.Output([]string{"*"}).Filter(filter).ToResult()
|
||
if err != nil {
|
||
continue
|
||
}
|
||
|
||
qr, ok := result.(*infinity.QueryResult)
|
||
if !ok {
|
||
continue
|
||
}
|
||
|
||
if len(qr.Data) == 0 {
|
||
continue
|
||
}
|
||
|
||
// Convert to chunk format
|
||
chunks := make([]map[string]interface{}, 0)
|
||
for colName, colData := range qr.Data {
|
||
for i, val := range colData {
|
||
for len(chunks) <= i {
|
||
chunks = append(chunks, make(map[string]interface{}))
|
||
}
|
||
chunks[i][colName] = val
|
||
}
|
||
}
|
||
|
||
// Merge chunks into allChunks (by id), keeping first non-empty value
|
||
for _, chunk := range chunks {
|
||
if idVal, ok := chunk["id"].(string); ok {
|
||
if existing, exists := allChunks[idVal]; exists {
|
||
// Merge: keep first non-empty value for each field
|
||
for k, v := range chunk {
|
||
if _, has := existing[k]; !has || (utility.IsEmpty(existing[k]) && !utility.IsEmpty(v)) {
|
||
existing[k] = v
|
||
}
|
||
}
|
||
} else {
|
||
allChunks[idVal] = chunk
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Get the chunk by chunkID
|
||
chunk, found := allChunks[chunkID]
|
||
if !found {
|
||
return nil, nil
|
||
}
|
||
|
||
common.Debug("infinity get chunk", zap.String("chunkID", chunkID), zap.Any("tables", tableNames))
|
||
|
||
// Apply field mappings (same as in GetFields)
|
||
// docnm -> docnm_kwd, title_tks, title_sm_tks
|
||
if val, ok := chunk["docnm"].(string); ok {
|
||
chunk["docnm_kwd"] = val
|
||
chunk["title_tks"] = val
|
||
chunk["title_sm_tks"] = val
|
||
}
|
||
|
||
// content -> content_with_weight, content_ltks, content_sm_ltks
|
||
if val, ok := chunk["content"].(string); ok {
|
||
chunk["content_with_weight"] = val
|
||
chunk["content_ltks"] = val
|
||
chunk["content_sm_ltks"] = val
|
||
}
|
||
|
||
// important_keywords -> important_kwd (split by comma), important_tks
|
||
if val, ok := chunk["important_keywords"].(string); ok {
|
||
if val == "" {
|
||
chunk["important_kwd"] = []interface{}{}
|
||
} else {
|
||
parts := strings.Split(val, ",")
|
||
chunk["important_kwd"] = parts
|
||
}
|
||
chunk["important_tks"] = val
|
||
} else {
|
||
chunk["important_kwd"] = []interface{}{}
|
||
chunk["important_tks"] = []interface{}{}
|
||
}
|
||
|
||
// questions -> question_kwd (split by newline), question_tks
|
||
if val, ok := chunk["questions"].(string); ok {
|
||
if val == "" {
|
||
chunk["question_kwd"] = []interface{}{}
|
||
} else {
|
||
parts := strings.Split(val, "\n")
|
||
chunk["question_kwd"] = parts
|
||
}
|
||
chunk["question_tks"] = val
|
||
} else {
|
||
chunk["question_kwd"] = []interface{}{}
|
||
chunk["question_tks"] = []interface{}{}
|
||
}
|
||
|
||
if posVal, ok := chunk["position_int"].(string); ok {
|
||
chunk["position_int"] = utility.ConvertHexToPositionIntArray(posVal)
|
||
} else {
|
||
chunk["position_int"] = []interface{}{}
|
||
}
|
||
|
||
return chunk, nil
|
||
}
|
||
|
||
// applyFieldMappings applies field mappings to chunks (side-effect only).
|
||
// Used by Search() to mutate chunks with derived fields before returning.
|
||
func applyFieldMappings(chunks []map[string]interface{}) {
|
||
for _, chunk := range chunks {
|
||
// docnm -> docnm_kwd, title_tks, title_sm_tks
|
||
if val, ok := chunk["docnm"].(string); ok {
|
||
chunk["docnm_kwd"] = val
|
||
chunk["title_tks"] = val
|
||
chunk["title_sm_tks"] = val
|
||
}
|
||
|
||
// important_keywords -> important_kwd (split by comma/semicolon/Chinese punctuations), important_tks
|
||
if val, ok := chunk["important_keywords"].(string); ok {
|
||
if val == "" {
|
||
chunk["important_kwd"] = []interface{}{}
|
||
} else {
|
||
parts := ChinesePunctRegex.Split(val, -1)
|
||
chunk["important_kwd"] = parts
|
||
}
|
||
chunk["important_tks"] = val
|
||
} else {
|
||
chunk["important_kwd"] = []interface{}{}
|
||
chunk["important_tks"] = []interface{}{}
|
||
}
|
||
|
||
// questions -> question_kwd (split by newline), question_tks
|
||
if val, ok := chunk["questions"].(string); ok {
|
||
if val == "" {
|
||
chunk["question_kwd"] = []interface{}{}
|
||
} else {
|
||
parts := strings.Split(val, "\n")
|
||
chunk["question_kwd"] = parts
|
||
}
|
||
chunk["question_tks"] = val
|
||
} else {
|
||
chunk["question_kwd"] = []interface{}{}
|
||
chunk["question_tks"] = []interface{}{}
|
||
}
|
||
|
||
// content -> content_with_weight, content_ltks, content_sm_ltks
|
||
if val, ok := chunk["content"].(string); ok {
|
||
chunk["content_with_weight"] = val
|
||
chunk["content_ltks"] = val
|
||
chunk["content_sm_ltks"] = val
|
||
}
|
||
|
||
// authors -> authors_tks, authors_sm_tks
|
||
if val, ok := chunk["authors"].(string); ok {
|
||
chunk["authors_tks"] = val
|
||
chunk["authors_sm_tks"] = val
|
||
}
|
||
|
||
// position_int: convert from hex string to array format (grouped by 5)
|
||
if val, ok := chunk["position_int"].(string); ok {
|
||
chunk["position_int"] = utility.ConvertHexToPositionIntArray(val)
|
||
}
|
||
|
||
// Convert page_num_int and top_int from hex string to array
|
||
for _, colName := range []string{"page_num_int", "top_int"} {
|
||
if val, ok := chunk[colName].(string); ok && val != "" {
|
||
chunk[colName] = utility.ConvertHexToIntArray(val)
|
||
}
|
||
}
|
||
|
||
// Post-process: convert nil/empty values to empty slices for array-like fields
|
||
// and split _kwd fields by "###" (except knowledge_graph_kwd, docnm_kwd, important_kwd, question_kwd)
|
||
kwdNoSplit := map[string]bool{
|
||
"knowledge_graph_kwd": true, "docnm_kwd": true,
|
||
"important_kwd": true, "question_kwd": true,
|
||
}
|
||
arrayFields := []string{
|
||
"important_kwd", "important_tks", "question_tks",
|
||
"question_kwd", "authors_tks", "authors_sm_tks", "title_tks",
|
||
"title_sm_tks", "content_ltks", "content_sm_ltks", "tag_kwd",
|
||
}
|
||
for _, colName := range arrayFields {
|
||
val, ok := chunk[colName]
|
||
if !ok || val == nil || val == "" {
|
||
chunk[colName] = []interface{}{}
|
||
} else if !kwdNoSplit[colName] {
|
||
// Split by "###" for _kwd fields
|
||
if strVal, ok := val.(string); ok && strings.Contains(strVal, "###") {
|
||
parts := strings.Split(strVal, "###")
|
||
var filtered []interface{}
|
||
for _, p := range parts {
|
||
if p != "" {
|
||
filtered = append(filtered, p)
|
||
}
|
||
}
|
||
chunk[colName] = filtered
|
||
}
|
||
}
|
||
}
|
||
|
||
// Handle row_id mapping - Infinity returns "ROW_ID" but we use "row_id()"
|
||
if val, ok := chunk["ROW_ID"]; ok {
|
||
chunk["row_id()"] = val
|
||
delete(chunk, "ROW_ID")
|
||
}
|
||
}
|
||
}
|
||
|
||
// GetFields extracts the requested fields from Infinity search results
|
||
func (e *infinityEngine) GetFields(chunks []map[string]interface{}, fields []string) map[string]map[string]interface{} {
|
||
result := make(map[string]map[string]interface{})
|
||
|
||
// Python: if not fields, return {}
|
||
if len(fields) == 0 {
|
||
return result
|
||
}
|
||
|
||
if len(chunks) == 0 {
|
||
return result
|
||
}
|
||
|
||
// Build field set for lookup (Python lines 713-715)
|
||
fieldsAll := make(map[string]bool)
|
||
for _, f := range fields {
|
||
fieldsAll[f] = true
|
||
}
|
||
fieldsAll["id"] = true
|
||
|
||
// noneColumns is rebuilt per chunk inside the loop below. The
|
||
// per-chunk "missing → nil" map MUST be fresh for every iteration; if
|
||
// it's reused, the first chunk that contains a field removes it from
|
||
// the shared set, and later chunks missing that same field silently
|
||
// stop getting the nil placeholder, producing inconsistent shapes
|
||
// per document.
|
||
|
||
// Check if important_kwd is needed (for empty_count handling)
|
||
needImportantKwdEmptyCount := fieldsAll["important_kwd"]
|
||
|
||
for _, chunk := range chunks {
|
||
// Build column map for case-insensitive lookup (Python line 747)
|
||
columnMap := make(map[string]string)
|
||
for k := range chunk {
|
||
columnMap[strings.ToLower(k)] = k
|
||
}
|
||
|
||
// Apply field mappings first (to get derived fields)
|
||
// docnm -> docnm_kwd, title_tks, title_sm_tks (Python lines 716-719)
|
||
// Note: Python checks "docnm" in res.columns regardless of whether fields were requested
|
||
if val, ok := chunk["docnm"].(string); ok {
|
||
if fieldsAll["docnm_kwd"] {
|
||
chunk["docnm_kwd"] = val
|
||
}
|
||
if fieldsAll["title_tks"] {
|
||
chunk["title_tks"] = val
|
||
}
|
||
if fieldsAll["title_sm_tks"] {
|
||
chunk["title_sm_tks"] = val
|
||
}
|
||
}
|
||
|
||
// important_keywords -> important_kwd (split by comma), important_tks (Python lines 720-732)
|
||
// Python: v.split(",") if v else [] — empty string yields empty list
|
||
if fieldsAll["important_kwd"] || fieldsAll["important_tks"] {
|
||
if val, ok := chunk["important_keywords"].(string); ok && val != "" {
|
||
if fieldsAll["important_kwd"] {
|
||
if needImportantKwdEmptyCount {
|
||
// Check for important_kwd_empty_count (Python lines 722-728)
|
||
if emptyCountVal, hasEmptyCount := chunk["important_kwd_empty_count"]; hasEmptyCount {
|
||
tokens := strings.Split(val, ",")
|
||
var emptyCount int
|
||
switch v := emptyCountVal.(type) {
|
||
case float64:
|
||
emptyCount = int(v)
|
||
case int:
|
||
emptyCount = v
|
||
case string:
|
||
emptyCount, _ = strconv.Atoi(v)
|
||
}
|
||
kwdList := make([]interface{}, 0, len(tokens)+emptyCount)
|
||
for _, t := range tokens {
|
||
kwdList = append(kwdList, t)
|
||
}
|
||
for i := 0; i < emptyCount; i++ {
|
||
kwdList = append(kwdList, "")
|
||
}
|
||
chunk["important_kwd"] = kwdList
|
||
} else {
|
||
parts := strings.Split(val, ",")
|
||
kwdList := make([]interface{}, len(parts))
|
||
for i, p := range parts {
|
||
kwdList[i] = p
|
||
}
|
||
chunk["important_kwd"] = kwdList
|
||
}
|
||
} else {
|
||
parts := strings.Split(val, ",")
|
||
kwdList := make([]interface{}, len(parts))
|
||
for i, p := range parts {
|
||
kwdList[i] = p
|
||
}
|
||
chunk["important_kwd"] = kwdList
|
||
}
|
||
}
|
||
if fieldsAll["important_tks"] {
|
||
chunk["important_tks"] = val
|
||
}
|
||
} else {
|
||
if fieldsAll["important_kwd"] {
|
||
chunk["important_kwd"] = []interface{}{}
|
||
}
|
||
if fieldsAll["important_tks"] {
|
||
chunk["important_tks"] = []interface{}{}
|
||
}
|
||
}
|
||
}
|
||
|
||
// questions -> question_kwd (split by newline), question_tks (Python lines 733-737)
|
||
// Python: v.splitlines() — empty string yields empty list
|
||
if fieldsAll["question_kwd"] || fieldsAll["question_tks"] {
|
||
if val, ok := chunk["questions"].(string); ok && val != "" {
|
||
if fieldsAll["question_kwd"] {
|
||
parts := strings.Split(val, "\n")
|
||
qList := make([]interface{}, len(parts))
|
||
for i, p := range parts {
|
||
qList[i] = p
|
||
}
|
||
chunk["question_kwd"] = qList
|
||
}
|
||
if fieldsAll["question_tks"] {
|
||
chunk["question_tks"] = val
|
||
}
|
||
} else {
|
||
if fieldsAll["question_kwd"] {
|
||
chunk["question_kwd"] = []interface{}{}
|
||
}
|
||
if fieldsAll["question_tks"] {
|
||
chunk["question_tks"] = []interface{}{}
|
||
}
|
||
}
|
||
}
|
||
|
||
// content -> content_with_weight, content_ltks, content_sm_ltks (Python lines 738-741)
|
||
if fieldsAll["content_with_weight"] || fieldsAll["content_ltks"] || fieldsAll["content_sm_ltks"] {
|
||
if val, ok := chunk["content"].(string); ok {
|
||
if fieldsAll["content_with_weight"] {
|
||
chunk["content_with_weight"] = val
|
||
}
|
||
if fieldsAll["content_ltks"] {
|
||
chunk["content_ltks"] = val
|
||
}
|
||
if fieldsAll["content_sm_ltks"] {
|
||
chunk["content_sm_ltks"] = val
|
||
}
|
||
}
|
||
}
|
||
|
||
// authors -> authors_tks, authors_sm_tks (Python lines 742-745)
|
||
if fieldsAll["authors_tks"] || fieldsAll["authors_sm_tks"] {
|
||
if val, ok := chunk["authors"].(string); ok {
|
||
if fieldsAll["authors_tks"] {
|
||
chunk["authors_tks"] = val
|
||
}
|
||
if fieldsAll["authors_sm_tks"] {
|
||
chunk["authors_sm_tks"] = val
|
||
}
|
||
}
|
||
}
|
||
|
||
// Post-process fields matching Python lines 758-780
|
||
// This single loop processes all column transformations in Python order
|
||
kwdNoSplit := map[string]bool{
|
||
"knowledge_graph_kwd": true, "docnm_kwd": true,
|
||
"important_kwd": true, "question_kwd": true,
|
||
}
|
||
for field, val := range chunk {
|
||
fieldLower := strings.ToLower(field)
|
||
|
||
// field_keyword: split by "###" (Python lines 760-761)
|
||
needsSplit := false
|
||
if fieldLower == "source_id" {
|
||
needsSplit = true
|
||
} else if strings.HasSuffix(fieldLower, "_kwd") && !kwdNoSplit[fieldLower] {
|
||
needsSplit = true
|
||
}
|
||
if needsSplit {
|
||
if strVal, ok := val.(string); ok && strings.Contains(strVal, "###") {
|
||
parts := strings.Split(strVal, "###")
|
||
var filtered []interface{}
|
||
for _, p := range parts {
|
||
if p != "" {
|
||
filtered = append(filtered, p)
|
||
}
|
||
}
|
||
chunk[field] = filtered
|
||
}
|
||
continue
|
||
}
|
||
|
||
// _feas: JSON parse (Python lines 762-763)
|
||
if strings.HasSuffix(fieldLower, "_feas") {
|
||
if strVal, ok := val.(string); ok && strVal != "" {
|
||
var parsed interface{}
|
||
if err := json.Unmarshal([]byte(strVal), &parsed); err == nil {
|
||
chunk[field] = parsed
|
||
}
|
||
} else {
|
||
chunk[field] = map[string]interface{}{}
|
||
}
|
||
continue
|
||
}
|
||
|
||
// chunk_data: JSON parse (Python lines 764-766)
|
||
if fieldLower == "chunk_data" {
|
||
if strVal, ok := val.(string); ok && strVal != "" {
|
||
var parsed interface{}
|
||
if err := json.Unmarshal([]byte(strVal), &parsed); err == nil {
|
||
chunk[field] = parsed
|
||
}
|
||
} else if val == nil {
|
||
// Keep nil
|
||
}
|
||
continue
|
||
}
|
||
|
||
// position_int: hex decode with grouping by 5 (Python lines 767-776)
|
||
if fieldLower == "position_int" && fieldsAll[fieldLower] {
|
||
// If already converted to slice by applyFieldMappings, skip
|
||
if _, isSlice := val.([]interface{}); isSlice {
|
||
continue
|
||
}
|
||
// applyFieldMappings returns [][]int, check that too
|
||
if _, isIntSlice := val.([][]int); isIntSlice {
|
||
continue
|
||
}
|
||
if strVal, ok := val.(string); ok && strVal != "" {
|
||
chunk[field] = utility.ConvertHexToPositionIntArray(strVal)
|
||
} else {
|
||
chunk[field] = []interface{}{}
|
||
}
|
||
continue
|
||
}
|
||
|
||
// page_num_int, top_int: hex decode (Python lines 777-778)
|
||
if (fieldLower == "page_num_int" || fieldLower == "top_int") && fieldsAll[fieldLower] {
|
||
// If already converted to slice by applyFieldMappings, skip
|
||
if _, isSlice := val.([]interface{}); isSlice {
|
||
continue
|
||
}
|
||
// applyFieldMappings returns []int, check that too
|
||
if _, isIntSlice := val.([]int); isIntSlice {
|
||
continue
|
||
}
|
||
if strVal, ok := val.(string); ok && strVal != "" {
|
||
chunk[field] = utility.ConvertHexToIntArray(strVal)
|
||
} else {
|
||
chunk[field] = []interface{}{}
|
||
}
|
||
continue
|
||
}
|
||
}
|
||
|
||
// Handle row_id mapping (Python lines 748-750)
|
||
if fieldsAll["row_id()"] {
|
||
if lowerKey, ok := columnMap["row_id"]; ok {
|
||
chunk["row_id()"] = chunk[lowerKey]
|
||
}
|
||
}
|
||
|
||
// Delete base columns after mapping (Python lines 781-783)
|
||
for _, col := range []string{"docnm", "important_keywords", "questions", "content", "authors"} {
|
||
delete(chunk, col)
|
||
}
|
||
|
||
// Build result map keyed by id
|
||
if idVal, ok := chunk["id"].(string); ok {
|
||
fieldMap := make(map[string]interface{})
|
||
// Rebuild noneColumns for this chunk so that fields missing
|
||
// from THIS chunk get a nil placeholder. Reusing a set across
|
||
// chunks would let the first chunk's contents permanently
|
||
// remove keys, leaving later chunks with inconsistent shapes.
|
||
noneColumns := make(map[string]bool, len(fieldsAll))
|
||
for f := range fieldsAll {
|
||
noneColumns[strings.ToLower(f)] = true
|
||
}
|
||
for field, value := range chunk {
|
||
if fieldsAll[field] {
|
||
fieldMap[field] = value
|
||
delete(noneColumns, strings.ToLower(field))
|
||
}
|
||
}
|
||
// Set none_columns to None (Python lines 784-785)
|
||
for col := range noneColumns {
|
||
fieldMap[col] = nil
|
||
}
|
||
result[idVal] = fieldMap
|
||
}
|
||
}
|
||
|
||
return result
|
||
}
|
||
|
||
// GetAggregation aggregates chunk values by field name.
|
||
// Input: [{"docnm_kwd": "docA"}, {"docnm_kwd": "docA"}, {"docnm_kwd": "docB"}]
|
||
//
|
||
// GetAggregation(chunks, "docnm_kwd") returns:
|
||
//
|
||
// [{"key": "docA", "count": 2}, {"key": "docB", "count": 1}]
|
||
//
|
||
// For tag_kwd field, splits values by "###" separator.
|
||
// For other fields, uses comma separation.
|
||
func (e *infinityEngine) GetAggregation(chunks []map[string]interface{}, fieldName string) []map[string]interface{} {
|
||
if len(chunks) == 0 {
|
||
return []map[string]interface{}{}
|
||
}
|
||
|
||
// Check if field exists in first chunk
|
||
hasField := false
|
||
for _, chunk := range chunks {
|
||
if _, ok := chunk[fieldName]; ok {
|
||
hasField = true
|
||
break
|
||
}
|
||
}
|
||
if !hasField {
|
||
return []map[string]interface{}{}
|
||
}
|
||
|
||
// Count occurrences
|
||
tagCounts := make(map[string]int)
|
||
for _, chunk := range chunks {
|
||
value, ok := chunk[fieldName]
|
||
if !ok || value == nil {
|
||
continue
|
||
}
|
||
|
||
// Handle string value
|
||
if valueStr, ok := value.(string); ok {
|
||
if valueStr == "" {
|
||
continue
|
||
}
|
||
|
||
var tags []string
|
||
// Split by "###" for tag_kwd field
|
||
if fieldName == "tag_kwd" && strings.Contains(valueStr, "###") {
|
||
for _, tag := range strings.Split(valueStr, "###") {
|
||
tag = strings.TrimSpace(tag)
|
||
if tag != "" {
|
||
tags = append(tags, tag)
|
||
}
|
||
}
|
||
} else {
|
||
// Fallback to comma separation
|
||
for _, tag := range strings.Split(valueStr, ",") {
|
||
tag = strings.TrimSpace(tag)
|
||
if tag != "" {
|
||
tags = append(tags, tag)
|
||
}
|
||
}
|
||
}
|
||
|
||
for _, tag := range tags {
|
||
tagCounts[tag]++
|
||
}
|
||
continue
|
||
}
|
||
|
||
// Handle list value
|
||
if valueList, ok := value.([]interface{}); ok {
|
||
for _, item := range valueList {
|
||
if itemStr, ok := item.(string); ok {
|
||
tag := strings.TrimSpace(itemStr)
|
||
if tag != "" {
|
||
tagCounts[tag]++
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
if len(tagCounts) == 0 {
|
||
return []map[string]interface{}{}
|
||
}
|
||
|
||
// Convert to slice and sort by count descending
|
||
type tagCountPair struct {
|
||
tag string
|
||
count int
|
||
}
|
||
pairs := make([]tagCountPair, 0, len(tagCounts))
|
||
for tag, count := range tagCounts {
|
||
pairs = append(pairs, tagCountPair{tag, count})
|
||
}
|
||
sort.Slice(pairs, func(i, j int) bool {
|
||
return pairs[i].count > pairs[j].count
|
||
})
|
||
|
||
// Convert to []map[string]interface{} directly
|
||
result := make([]map[string]interface{}, len(pairs))
|
||
for i, p := range pairs {
|
||
result[i] = map[string]interface{}{"key": p.tag, "count": p.count}
|
||
}
|
||
|
||
return result
|
||
}
|
||
|
||
// GetChunkIDs extracts chunk IDs from Infinity search results.
|
||
func (e *infinityEngine) GetChunkIDs(chunks []map[string]interface{}) []string {
|
||
ids := make([]string, 0, len(chunks))
|
||
for _, chunk := range chunks {
|
||
if id, ok := chunk["id"].(string); ok {
|
||
ids = append(ids, id)
|
||
}
|
||
}
|
||
return ids
|
||
}
|
||
|
||
// GetHighlight generates highlighted text snippets for search results.
|
||
// Matches keywords in text and wraps them with <em> tags.
|
||
func (e *infinityEngine) GetHighlight(chunks []map[string]interface{}, keywords []string, fieldName string) map[string]string {
|
||
result := make(map[string]string)
|
||
if len(chunks) == 0 || len(keywords) == 0 {
|
||
return result
|
||
}
|
||
|
||
// For Infinity, scores are already returned in search results (_score column)
|
||
// So GetScores just extracts scores from chunks, mimicking Python's approach
|
||
return result
|
||
}
|
||
|
||
// KNNScores for Infinity - since Infinity normalizes scores during fusion,
|
||
// we just need to return a result structure that GetScores can parse.
|
||
// This matches Python's approach where Infinity doesn't use the two-pass KNN.
|
||
func (e *infinityEngine) KNNScores(ctx context.Context, chunks []map[string]interface{}, queryVector []float64, topK int) (map[string]interface{}, error) {
|
||
if len(chunks) == 0 {
|
||
return nil, nil
|
||
}
|
||
|
||
// Build a result structure that GetScores can parse
|
||
// For Infinity, scores are already in _score field from the first search
|
||
result := make(map[string]interface{})
|
||
hitList := make([]interface{}, 0, len(chunks))
|
||
for _, chunk := range chunks {
|
||
if id, ok := chunk["id"].(string); ok {
|
||
hit := map[string]interface{}{
|
||
"_id": id,
|
||
"_score": chunk["_score"],
|
||
}
|
||
hitList = append(hitList, hit)
|
||
}
|
||
}
|
||
result["hits"] = map[string]interface{}{
|
||
"hits": hitList,
|
||
}
|
||
return result, nil
|
||
}
|
||
|
||
// GetScores extracts similarity scores from KNN search result.
|
||
// For Infinity, it parses the result from KNNScores and extracts _score values.
|
||
func (e *infinityEngine) GetScores(knnResult map[string]interface{}) map[string]float64 {
|
||
scores := make(map[string]float64)
|
||
hits, ok := knnResult["hits"].(map[string]interface{})
|
||
if !ok {
|
||
return scores
|
||
}
|
||
hitList, ok := hits["hits"].([]interface{})
|
||
if !ok {
|
||
return scores
|
||
}
|
||
for _, h := range hitList {
|
||
hit, ok := h.(map[string]interface{})
|
||
if !ok {
|
||
continue
|
||
}
|
||
docID, ok := hit["_id"].(string)
|
||
if !ok {
|
||
continue
|
||
}
|
||
scoreVal := hit["_score"]
|
||
if scoreVal == nil {
|
||
scores[docID] = 0.0
|
||
continue
|
||
}
|
||
score, ok := scoreVal.(float64)
|
||
if !ok {
|
||
scores[docID] = 0.0
|
||
continue
|
||
}
|
||
scores[docID] = score
|
||
}
|
||
return scores
|
||
}
|
||
|
||
// convertSelectFields converts logical field names to the Infinity column
// names to select.
//
// Derived names (e.g. "content_with_weight", "title_tks") are mapped to their
// stored column ("content", "docnm"); other names pass through unchanged.
// Empty names and duplicates are dropped, keeping first-seen order. The ID
// column is prepended when it was not requested: "id" normally, "skill_id"
// when isSkillIndex[0] is true. When "important_kwd" is requested,
// "important_kwd_empty_count" is selected as well (once).
//
// Does NOT mutate the input slice — callers (e.g. retrieval.go) reuse the same
// SelectFields list both for Search() and GetFields(); mutating it would
// replace logical names like "content_with_weight" with their Infinity column
// names ("content"), breaking GetFields's field-presence checks.
func convertSelectFields(output []string, isSkillIndex ...bool) []string {
	fieldMapping := map[string]string{
		"docnm_kwd":           "docnm",
		"title_tks":           "docnm",
		"title_sm_tks":        "docnm",
		"important_kwd":       "important_keywords",
		"important_tks":       "important_keywords",
		"question_kwd":        "questions",
		"question_tks":        "questions",
		"content_with_weight": "content",
		"content_ltks":        "content",
		"content_sm_ltks":     "content",
		"authors_tks":         "authors",
		"authors_sm_tks":      "authors",
	}

	// For skill index, use skill_id instead of id
	idField := "id"
	if len(isSkillIndex) > 0 && isSkillIndex[0] {
		idField = "skill_id"
	}

	// Map and de-duplicate in one pass. `field` is a copy of the element, so
	// reassigning it never touches the caller's slice.
	seen := make(map[string]bool, len(output)+2)
	result := make([]string, 0, len(output)+2)
	needEmptyCount := false
	for _, field := range output {
		if field == "important_kwd" {
			needEmptyCount = true
		}
		if column, ok := fieldMapping[field]; ok {
			field = column
		}
		if field == "" || seen[field] {
			continue
		}
		seen[field] = true
		result = append(result, field)
	}

	if !seen[idField] {
		result = append([]string{idField}, result...)
		seen[idField] = true
	}

	// Skip the extra column when the caller already asked for it; otherwise
	// it would be selected twice.
	if needEmptyCount && !seen["important_kwd_empty_count"] {
		result = append(result, "important_kwd_empty_count")
	}

	return result
}
|
||
|
||
// convertMatchingField rewrites the field part of a "field" or "field^weight"
// expression into Infinity's "column@index_name" form, which is required when
// a column has multiple full-text indexes.
//
// Both document-index names (_tks/_kwd fields) and raw skill-index names are
// recognized. Unknown fields are returned unchanged, and everything after the
// first "^" is preserved as is.
func convertMatchingField(fieldWeightStr string) string {
	name, weight, hasWeight := strings.Cut(fieldWeightStr, "^")

	switch name {
	case "docnm_kwd", "title_tks":
		name = "docnm@ft_docnm_rag_coarse"
	case "title_sm_tks":
		name = "docnm@ft_docnm_rag_fine"
	case "important_kwd":
		name = "important_keywords@ft_important_keywords_rag_coarse"
	case "important_tks":
		name = "important_keywords@ft_important_keywords_rag_fine"
	case "question_kwd":
		name = "questions@ft_questions_rag_coarse"
	case "question_tks":
		name = "questions@ft_questions_rag_fine"
	case "content_with_weight", "content_ltks", "content":
		// "content" is the raw skill-index field name.
		name = "content@ft_content_rag_coarse"
	case "content_sm_ltks":
		name = "content@ft_content_rag_fine"
	case "authors_tks":
		name = "authors@ft_authors_rag_coarse"
	case "authors_sm_tks":
		name = "authors@ft_authors_rag_fine"
	case "tag_kwd":
		name = "tag_kwd@ft_tag_kwd_whitespace__"
	case "toc_kwd":
		name = "toc_kwd@ft_toc_kwd_whitespace__"
	// Skill index fields
	case "name":
		name = "name@ft_name_rag_coarse"
	case "tags":
		name = "tags@ft_tags_rag_coarse"
	case "description":
		name = "description@ft_description_rag_coarse"
	}

	if !hasWeight {
		return name
	}
	return name + "^" + weight
}
|
||
|
||
// escapeFilterValue makes s safe to embed in a single-quoted filter literal
// by doubling every single quote (' becomes '').
func escapeFilterValue(s string) string {
	segments := strings.Split(s, "'")
	return strings.Join(segments, "''")
}
|
||
|
||
// equivalentConditionToStr converts a condition map to an Infinity filter string
|
||
func equivalentConditionToStr(condition map[string]interface{}) string {
|
||
if len(condition) == 0 {
|
||
return ""
|
||
}
|
||
|
||
var cond []string
|
||
|
||
for k, v := range condition {
|
||
if k == "_id" || utility.IsEmpty(v) {
|
||
continue
|
||
}
|
||
|
||
// Handle must_not specially
|
||
if k == "must_not" {
|
||
if m, ok := v.(map[string]interface{}); ok {
|
||
for kk, vv := range m {
|
||
if kk == "exists" {
|
||
// For must_not exists, use !='' since we don't have table schema
|
||
cond = append(cond, fmt.Sprintf("NOT (%v!='')", vv))
|
||
}
|
||
}
|
||
}
|
||
continue
|
||
}
|
||
|
||
// Handle exists specially (without table schema, use string comparison)
|
||
if k == "exists" {
|
||
cond = append(cond, fmt.Sprintf("%v!=''", v))
|
||
continue
|
||
}
|
||
|
||
// Handle keyword fields (using full-text filter)
|
||
if fieldKeyword(k) {
|
||
// For keyword fields, values are always treated as strings for filter_fulltext
|
||
switch val := v.(type) {
|
||
case []string:
|
||
var inCond []string
|
||
for _, item := range val {
|
||
inCond = append(inCond, fmt.Sprintf("filter_fulltext('%s', '%s')",
|
||
convertMatchingField(k), escapeFilterValue(item)))
|
||
}
|
||
if len(inCond) > 0 {
|
||
cond = append(cond, "("+strings.Join(inCond, " or ")+")")
|
||
}
|
||
case []interface{}:
|
||
var inCond []string
|
||
for _, item := range val {
|
||
if s, ok := item.(string); ok {
|
||
inCond = append(inCond, fmt.Sprintf("filter_fulltext('%s', '%s')",
|
||
convertMatchingField(k), escapeFilterValue(s)))
|
||
} else {
|
||
inCond = append(inCond, fmt.Sprintf("filter_fulltext('%s', '%s')",
|
||
convertMatchingField(k), escapeFilterValue(fmt.Sprintf("%v", item))))
|
||
}
|
||
}
|
||
if len(inCond) > 0 {
|
||
cond = append(cond, "("+strings.Join(inCond, " or ")+")")
|
||
}
|
||
case string:
|
||
cond = append(cond, fmt.Sprintf("filter_fulltext('%s', '%s')",
|
||
convertMatchingField(k), escapeFilterValue(val)))
|
||
default:
|
||
cond = append(cond, fmt.Sprintf("filter_fulltext('%s', '%s')",
|
||
convertMatchingField(k), escapeFilterValue(fmt.Sprintf("%v", v))))
|
||
}
|
||
continue
|
||
}
|
||
|
||
// Handle list values (mixed types - strings get quotes, numbers don't)
|
||
if list, ok := v.([]interface{}); ok && len(list) > 0 {
|
||
var strItems, numItems []string
|
||
for _, item := range list {
|
||
if s, ok := item.(string); ok {
|
||
strItems = append(strItems, fmt.Sprintf("'%s'", escapeFilterValue(s)))
|
||
} else if n, ok := item.(int); ok {
|
||
numItems = append(numItems, strconv.Itoa(n))
|
||
} else if n, ok := item.(int64); ok {
|
||
numItems = append(numItems, strconv.FormatInt(n, 10))
|
||
} else if f, ok := item.(float64); ok {
|
||
numItems = append(numItems, strconv.FormatFloat(f, 'f', -1, 64))
|
||
} else if s, ok := item.(fmt.Stringer); ok {
|
||
strItems = append(strItems, fmt.Sprintf("'%s'", escapeFilterValue(s.String())))
|
||
} else {
|
||
strItems = append(strItems, fmt.Sprintf("'%s'", escapeFilterValue(fmt.Sprintf("%v", item))))
|
||
}
|
||
}
|
||
if len(strItems) > 0 {
|
||
if len(strItems) == 1 {
|
||
cond = append(cond, fmt.Sprintf("%s=%s", k, strItems[0]))
|
||
} else {
|
||
cond = append(cond, fmt.Sprintf("%s IN (%s)", k, strings.Join(strItems, ", ")))
|
||
}
|
||
}
|
||
if len(numItems) > 0 {
|
||
if len(numItems) == 1 {
|
||
cond = append(cond, fmt.Sprintf("%s=%s", k, numItems[0]))
|
||
} else {
|
||
cond = append(cond, fmt.Sprintf("%s IN (%s)", k, strings.Join(numItems, ", ")))
|
||
}
|
||
}
|
||
continue
|
||
}
|
||
|
||
if list, ok := v.([]string); ok && len(list) > 0 {
|
||
if len(list) == 1 {
|
||
cond = append(cond, fmt.Sprintf("%s='%s'", k, escapeFilterValue(list[0])))
|
||
} else {
|
||
var items []string
|
||
for _, item := range list {
|
||
items = append(items, fmt.Sprintf("'%s'", escapeFilterValue(item)))
|
||
}
|
||
cond = append(cond, fmt.Sprintf("%s IN (%s)", k, strings.Join(items, ", ")))
|
||
}
|
||
continue
|
||
}
|
||
|
||
if list, ok := v.([]int); ok && len(list) > 0 {
|
||
if len(list) == 1 {
|
||
cond = append(cond, fmt.Sprintf("%s=%d", k, list[0]))
|
||
} else {
|
||
var strs []string
|
||
for _, n := range list {
|
||
strs = append(strs, strconv.Itoa(n))
|
||
}
|
||
cond = append(cond, fmt.Sprintf("%s IN (%s)", k, strings.Join(strs, ", ")))
|
||
}
|
||
continue
|
||
}
|
||
|
||
// Handle numeric values (no quotes)
|
||
if utility.IsNumericValue(v) {
|
||
cond = append(cond, fmt.Sprintf("%s=%v", k, v))
|
||
continue
|
||
}
|
||
|
||
// Handle string values (with quotes and escaping)
|
||
if str, ok := v.(string); ok {
|
||
cond = append(cond, fmt.Sprintf("%s='%s'", k, escapeFilterValue(str)))
|
||
continue
|
||
}
|
||
|
||
// Fallback: treat as string
|
||
cond = append(cond, fmt.Sprintf("%s='%s'", k, escapeFilterValue(fmt.Sprintf("%v", v))))
|
||
}
|
||
|
||
if len(cond) == 0 {
|
||
return ""
|
||
}
|
||
return strings.Join(cond, " AND ")
|
||
}
|
||
|
||
// calculateScores calculates _score = score_column + pagerank
|
||
func calculateScores(chunks []map[string]interface{}, scoreColumn, pagerankField string) []map[string]interface{} {
|
||
for i := range chunks {
|
||
score := 0.0
|
||
if scoreVal, ok := chunks[i][scoreColumn]; ok {
|
||
if f, ok := utility.ToFloat64(scoreVal); ok {
|
||
score += f
|
||
}
|
||
}
|
||
if pagerankField != "" {
|
||
if prVal, ok := chunks[i][pagerankField]; ok {
|
||
if f, ok := utility.ToFloat64(prVal); ok {
|
||
score += f
|
||
}
|
||
}
|
||
}
|
||
chunks[i]["_score"] = score
|
||
}
|
||
return chunks
|
||
}
|
||
|
||
// sortByScore sorts by _score descending and limits
|
||
func sortByScore(chunks []map[string]interface{}, limit int) []map[string]interface{} {
|
||
if len(chunks) == 0 {
|
||
return chunks
|
||
}
|
||
|
||
// Sort by _score descending
|
||
sort.Slice(chunks, func(i, j int) bool {
|
||
scoreI := getChunkScore(chunks[i])
|
||
scoreJ := getChunkScore(chunks[j])
|
||
return scoreI > scoreJ
|
||
})
|
||
|
||
// Limit
|
||
if len(chunks) > limit && limit > 0 {
|
||
chunks = chunks[:limit]
|
||
}
|
||
|
||
return chunks
|
||
}
|
||
|
||
// getChunkScore extracts the ranking score from a chunk.
//
// The keys "_score", "SCORE" and "SIMILARITY" are consulted in that order and
// the first one that holds a numeric value is returned as a float64. Besides
// float64, the float32 and signed integer kinds are accepted, so a score that
// was not normalised to float64 beforehand is still used for ranking instead
// of silently counting as zero. Keys that are absent or hold a non-numeric
// value are skipped.
//
// Returns 0.0 when none of the keys holds a numeric value.
func getChunkScore(chunk map[string]interface{}) float64 {
	for _, key := range [...]string{"_score", "SCORE", "SIMILARITY"} {
		switch score := chunk[key].(type) {
		case float64:
			return score
		case float32:
			return float64(score)
		case int:
			return float64(score)
		case int32:
			return float64(score)
		case int64:
			return float64(score)
		}
	}
	return 0.0
}
|
||
|
||
// transformChunkFields converts chunk field names to Infinity format.
|
||
// Converts internal field names (like docnm_kwd) to Infinity column names (docnm).
|
||
// Also handles:
|
||
// - kb_id: extracts first element if it's a list
|
||
// - position_int, page_num_int, top_int: converts arrays to hex strings
|
||
// - tag_kwd: joins with ### separator
|
||
// - question_kwd: joins with newline separator
|
||
// - chunk_data: dict -> JSON string
|
||
// - Missing embeddings filled with zeros if embeddingCols provided
|
||
func transformChunkFields(chunk map[string]interface{}, embeddingCols [][2]interface{}) map[string]interface{} {
|
||
d := make(map[string]interface{})
|
||
|
||
for k, v := range chunk {
|
||
switch k {
|
||
case "docnm_kwd":
|
||
d["docnm"] = v
|
||
case "title_kwd":
|
||
if _, exists := chunk["docnm_kwd"]; !exists {
|
||
d["docnm"] = utility.ConvertToString(v)
|
||
}
|
||
case "title_sm_tks":
|
||
if _, exists := chunk["docnm_kwd"]; !exists {
|
||
d["docnm"] = utility.ConvertToString(v)
|
||
}
|
||
case "important_kwd":
|
||
if list, ok := v.([]interface{}); ok {
|
||
emptyCount := 0
|
||
tokens := make([]string, 0)
|
||
for _, item := range list {
|
||
if str, ok := item.(string); ok {
|
||
if str == "" {
|
||
emptyCount++
|
||
} else {
|
||
tokens = append(tokens, str)
|
||
}
|
||
}
|
||
}
|
||
d["important_keywords"] = strings.Join(tokens, ",")
|
||
d["important_kwd_empty_count"] = emptyCount
|
||
} else {
|
||
d["important_keywords"] = utility.ConvertToString(v)
|
||
}
|
||
case "important_tks":
|
||
if _, exists := chunk["important_kwd"]; !exists {
|
||
d["important_keywords"] = v
|
||
}
|
||
case "content_with_weight":
|
||
d["content"] = v
|
||
case "content_ltks":
|
||
if _, exists := chunk["content_with_weight"]; !exists {
|
||
d["content"] = v
|
||
}
|
||
case "content_sm_ltks":
|
||
if _, exists := chunk["content_with_weight"]; !exists {
|
||
d["content"] = v
|
||
}
|
||
case "authors_tks":
|
||
d["authors"] = v
|
||
case "authors_sm_tks":
|
||
if _, exists := chunk["authors_tks"]; !exists {
|
||
d["authors"] = v
|
||
}
|
||
case "question_kwd":
|
||
d["questions"] = strings.Join(utility.ConvertToStringSlice(v), "\n")
|
||
case "tag_kwd":
|
||
d["tag_kwd"] = strings.Join(utility.ConvertToStringSlice(v), "###")
|
||
case "question_tks":
|
||
if _, exists := chunk["question_kwd"]; !exists {
|
||
d["questions"] = utility.ConvertToString(v)
|
||
}
|
||
case "kb_id":
|
||
if list, ok := v.([]interface{}); ok && len(list) > 0 {
|
||
d["kb_id"] = list[0]
|
||
} else {
|
||
d["kb_id"] = v
|
||
}
|
||
case "position_int":
|
||
if list, ok := v.([]interface{}); ok {
|
||
d["position_int"] = utility.ConvertPositionIntArrayToHex(list)
|
||
} else {
|
||
d["position_int"] = v
|
||
}
|
||
case "page_num_int", "top_int":
|
||
if list, ok := v.([]interface{}); ok {
|
||
d[k] = utility.ConvertIntArrayToHex(list)
|
||
} else {
|
||
d[k] = v
|
||
}
|
||
case "chunk_data":
|
||
d["chunk_data"] = utility.ConvertMapToJSONString(v)
|
||
default:
|
||
// Check for *_feas fields
|
||
if strings.HasSuffix(k, "_feas") {
|
||
jsonBytes, _ := json.Marshal(v)
|
||
d[k] = string(jsonBytes)
|
||
} else if fieldKeyword(k) {
|
||
// keyword fields with list values -> ### joined
|
||
if list, ok := v.([]interface{}); ok {
|
||
d[k] = strings.Join(utility.ConvertToStringSlice(list), "###")
|
||
} else {
|
||
d[k] = v
|
||
}
|
||
} else {
|
||
d[k] = v
|
||
}
|
||
}
|
||
}
|
||
|
||
// Remove intermediate token fields
|
||
for _, key := range []string{"docnm_kwd", "title_tks", "title_sm_tks", "important_kwd", "important_tks",
|
||
"content_with_weight", "content_ltks", "content_sm_ltks", "authors_tks", "authors_sm_tks",
|
||
"question_kwd", "question_tks"} {
|
||
delete(d, key)
|
||
}
|
||
|
||
// Fill missing embedding columns with zeros if embedding info provided
|
||
for _, ec := range embeddingCols {
|
||
name, ok1 := ec[0].(string)
|
||
size, ok2 := ec[1].(int)
|
||
if !ok1 || !ok2 {
|
||
continue
|
||
}
|
||
if _, exists := d[name]; !exists {
|
||
zeros := make([]float64, size)
|
||
for i := range zeros {
|
||
zeros[i] = 0
|
||
}
|
||
d[name] = zeros
|
||
}
|
||
}
|
||
|
||
return d
|
||
}
|
||
|
||
// DropChunkStore drops a chunk table from Infinity
|
||
func (e *infinityEngine) DropChunkStore(ctx context.Context, baseName, datasetID string) error {
|
||
return e.dropTable(ctx, buildChunkTableName(baseName, datasetID))
|
||
}
|
||
|
||
// ChunkStoreExists checks if a chunk table exists in Infinity
|
||
func (e *infinityEngine) ChunkStoreExists(ctx context.Context, baseName, datasetID string) (bool, error) {
|
||
return e.tableExists(ctx, buildChunkTableName(baseName, datasetID))
|
||
}
|