Files
ragflow/internal/service/kg/scoring.go
Jack 87b8062df4 feat: implement POST /api/v1/searchbots/ask — streaming RAG with citations and think-tag processing (#15825)
Implements POST /api/v1/searchbots/ask in Go with streaming SSE,
citations, and think-tag processing. 23 files, 90+ unit tests.

---------

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 22:48:50 +08:00

261 lines
7.3 KiB
Go

//
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package kg
import (
"bytes"
"encoding/csv"
"encoding/json"
"fmt"
"sort"
"strings"
"ragflow/internal/service"
)
// AnalyzeNHopPaths breaks every N-hop path attached to the query entities
// into its consecutive edges and accumulates a score per edge.
//
// For the edge at zero-based position k of a path, the owning entity's
// Similarity divided by (2 + k) is added to the edge's Sim, so hops further
// along the path contribute less. When the neighbour carries a weight for
// position k, that weight overwrites the edge's PageRank.
// Python equivalent: rag/graphrag/search.py lines 172-187
func AnalyzeNHopPaths(entsFromQuery map[string]*KGEntity) map[Edge]EdgeScore {
	edgeScores := make(map[Edge]EdgeScore)
	for _, entity := range entsFromQuery {
		for _, neighbour := range entity.NhopEnts {
			// hop indexes the destination node; hop-1 is the edge position.
			for hop := 1; hop < len(neighbour.Path); hop++ {
				pos := hop - 1
				key := Edge{From: neighbour.Path[pos], To: neighbour.Path[hop]}

				score := edgeScores[key]
				score.Sim += entity.Similarity / (2.0 + float64(pos))
				if pos < len(neighbour.Weights) {
					score.PageRank = neighbour.Weights[pos]
				}
				edgeScores[key] = score
			}
		}
	}
	return edgeScores
}
// DoubleHitBoost multiplies Similarity by 2, in place, for every query
// entity whose name is also a key of entsFromTypes (i.e. it was found by
// both searches). Python equivalent: lines 194-198
func DoubleHitBoost(entsFromQuery map[string]*KGEntity, entsFromTypes map[string]struct{}) {
	for name, entity := range entsFromQuery {
		if _, hit := entsFromTypes[name]; !hit {
			continue
		}
		entity.Similarity *= 2
	}
}
// FuseRelationScores folds N-hop contributions and type-search hits into
// the relation scores held in relsFromText, mutating it in place.
//
// Existing relations have Sim multiplied by (1 + nhopSim + typeHits), where
// typeHits counts how many of the edge's endpoints appear in entsFromTypes.
// Every N-hop edge consumed this way is deleted from nhopPathes; the edges
// that remain are inserted into relsFromText as new relations.
// Python equivalent: lines 200-222
func FuseRelationScores(
	relsFromText map[Edge]*KGRelation,
	entsFromTypes map[string]struct{},
	nhopPathes map[Edge]EdgeScore,
) {
	// addTypeHits adds 1 to base for each endpoint of e found by type search.
	addTypeHits := func(base float64, e Edge) float64 {
		if _, hit := entsFromTypes[e.From]; hit {
			base += 1
		}
		if _, hit := entsFromTypes[e.To]; hit {
			base += 1
		}
		return base
	}

	// Pass 1: boost relations that were already known from text search.
	for edge, rel := range relsFromText {
		boost := 0.0
		if hop, found := nhopPathes[edge]; found {
			boost += hop.Sim
			// Consumed here so pass 2 only sees edges unknown to text search.
			delete(nhopPathes, edge)
		}
		boost = addTypeHits(boost, edge)
		rel.Sim *= boost + 1
	}

	// Pass 2: whatever is left was discovered only through N-hop traversal.
	for edge, hop := range nhopPathes {
		boost := addTypeHits(0.0, edge)
		relsFromText[edge] = &KGRelation{
			Sim:      hop.Sim * (boost + 1),
			PageRank: hop.PageRank,
		}
	}
}
// SortAndTrimEntities ranks entities by Similarity*PageRank in descending
// order and returns at most topN of them. A non-positive topN falls back to 6.
//
// Entities with equal scores are ordered by name. Go map iteration order is
// random and sort.Slice is not stable, so without this tie-break the set of
// entities surviving the trim could differ between identical calls.
// Nil map values are skipped. An empty input yields a nil slice.
// Python equivalent: lines 224-225
func SortAndTrimEntities(entsFromQuery map[string]*KGEntity, topN int) []ScoredEntity {
	if topN <= 0 {
		topN = 6
	}
	if len(entsFromQuery) == 0 {
		return nil
	}

	scored := make([]ScoredEntity, 0, len(entsFromQuery))
	for name, ent := range entsFromQuery {
		if ent == nil {
			continue
		}
		scored = append(scored, ScoredEntity{
			Entity:      name,
			Score:       ent.Similarity * ent.PageRank,
			Description: ent.Description,
		})
	}

	sort.Slice(scored, func(i, j int) bool {
		if scored[i].Score != scored[j].Score {
			return scored[i].Score > scored[j].Score
		}
		// Names are map keys and therefore unique, giving a total order.
		return scored[i].Entity < scored[j].Entity
	})

	if len(scored) > topN {
		scored = scored[:topN]
	}
	return scored
}
// SortAndTrimRelations ranks relations by Sim*PageRank in descending order
// and returns at most topN of them. A non-positive topN falls back to 6.
//
// Relations with equal scores are ordered by (From, To). Go map iteration
// order is random and sort.Slice is not stable, so without this tie-break
// the set of relations surviving the trim could differ between identical
// calls. Nil map values are skipped. An empty input yields a nil slice.
// Python equivalent: lines 226-227
func SortAndTrimRelations(relsFromText map[Edge]*KGRelation, topN int) []ScoredRelation {
	if topN <= 0 {
		topN = 6
	}
	if len(relsFromText) == 0 {
		return nil
	}

	scored := make([]ScoredRelation, 0, len(relsFromText))
	for edge, rel := range relsFromText {
		if rel == nil {
			continue
		}
		scored = append(scored, ScoredRelation{
			From:        edge.From,
			To:          edge.To,
			Score:       rel.Sim * rel.PageRank,
			Description: rel.Description,
		})
	}

	sort.Slice(scored, func(i, j int) bool {
		a, b := scored[i], scored[j]
		if a.Score != b.Score {
			return a.Score > b.Score
		}
		// (From, To) is the map key and therefore unique, giving a total order.
		if a.From != b.From {
			return a.From < b.From
		}
		return a.To < b.To
	})

	if len(scored) > topN {
		scored = scored[:topN]
	}
	return scored
}
// NumTokensFromString reports the token count of s as computed by
// service.NumTokensFromString; this package adds no logic of its own.
func NumTokensFromString(s string) int {
	count := service.NumTokensFromString(s)
	return count
}
// formatCSVLine renders fields as one CSV record terminated by a newline.
// Quoting of commas, double quotes and embedded newlines is delegated to
// encoding/csv, which plain fmt.Sprintf joining would get wrong.
// Intended to mirror Python's pd.DataFrame(...).to_csv() quoting.
func formatCSVLine(fields ...string) string {
	out := new(bytes.Buffer)
	// WriteAll writes the single record and flushes. The target is an
	// in-memory buffer and the default delimiter is used, so no error is
	// expected and the result is deliberately discarded.
	_ = csv.NewWriter(out).WriteAll([][]string{fields})
	return out.String()
}
// FilterChunksByScore returns the chunks whose score is >= threshold.
//
// The score is read from the "_score" key, falling back to "score" when
// "_score" is absent or not numeric. Values of type float64, float32, int,
// int32, int64 and json.Number are accepted; previously only float64 was
// recognised, so chunks scored with any other numeric type were silently
// treated as 0 and dropped. Chunks with no usable score count as 0.
//
// When threshold <= 0 or chunks is empty the input slice is returned as is;
// otherwise a new slice is built and the input is not modified.
// Pure function — no I/O, no external dependencies.
// Matches Python: _ent_info_from_ and _relation_info_from_ sim_thr filtering.
func FilterChunksByScore(chunks []map[string]interface{}, threshold float64) []map[string]interface{} {
	if threshold <= 0 || len(chunks) == 0 {
		return chunks
	}

	// asFloat converts the supported numeric representations to float64.
	// It is a closure so that no new package-level name is introduced.
	asFloat := func(v interface{}) (float64, bool) {
		switch n := v.(type) {
		case float64:
			return n, true
		case float32:
			return float64(n), true
		case int:
			return float64(n), true
		case int32:
			return float64(n), true
		case int64:
			return float64(n), true
		case json.Number:
			if f, err := n.Float64(); err == nil {
				return f, true
			}
		}
		return 0, false
	}

	kept := make([]map[string]interface{}, 0, len(chunks))
	for _, chunk := range chunks {
		score, ok := asFloat(chunk["_score"])
		if !ok {
			// Yields 0 when "score" is missing or not numeric either.
			score, _ = asFloat(chunk["score"])
		}
		if score >= threshold {
			kept = append(kept, chunk)
		}
	}
	return kept
}
// FormatEntitiesToCSV renders scored entities as an "---- Entities ----"
// section followed by a CSV header and one CSV row per entity, and returns
// the text together with the token budget left over.
//
// Rows are appended in input order until adding the next row would leave
// the budget at or below zero; only row tokens are charged against
// maxToken, the two header lines are not. Scores are formatted with two
// decimals and descriptions pass through extractDescription.
//
// If entities is empty, or not even the first row fits the budget, the
// result is "" with maxToken unchanged. Previously the second case produced
// a header-only section. The first named result was renamed from csv so it
// no longer shadows the imported encoding/csv package.
func FormatEntitiesToCSV(entities []ScoredEntity, maxToken int) (content string, remainingToken int) {
	if len(entities) == 0 {
		return "", maxToken
	}

	var rows strings.Builder
	for _, ent := range entities {
		desc := extractDescription(ent.Description)
		line := formatCSVLine(ent.Entity, fmt.Sprintf("%.2f", ent.Score), desc)
		tokens := NumTokensFromString(line)
		if maxToken-tokens <= 0 {
			break
		}
		rows.WriteString(line)
		maxToken -= tokens
	}

	// A section with headers but no data carries no information.
	if rows.Len() == 0 {
		return "", maxToken
	}
	return "---- Entities ----\n" + "Entity,Score,Description\n" + rows.String(), maxToken
}
// FormatRelationsToCSV renders scored relations as an "---- Relations ----"
// section followed by a CSV header and one CSV row per relation, and
// returns the text together with the token budget left over.
//
// Rows are appended in input order until adding the next row would leave
// the budget at or below zero; only row tokens are charged against
// maxToken, the two header lines are not. Scores are formatted with two
// decimals and descriptions pass through extractDescription.
//
// If relations is empty, or not even the first row fits the budget, the
// result is "" with maxToken unchanged. Previously the second case produced
// a header-only section. The first named result was renamed from csv so it
// no longer shadows the imported encoding/csv package.
func FormatRelationsToCSV(relations []ScoredRelation, maxToken int) (content string, remainingToken int) {
	if len(relations) == 0 {
		return "", maxToken
	}

	var rows strings.Builder
	for _, rel := range relations {
		desc := extractDescription(rel.Description)
		line := formatCSVLine(rel.From, rel.To, fmt.Sprintf("%.2f", rel.Score), desc)
		tokens := NumTokensFromString(line)
		if maxToken-tokens <= 0 {
			break
		}
		rows.WriteString(line)
		maxToken -= tokens
	}

	// A section with headers but no data carries no information.
	if rows.Len() == 0 {
		return "", maxToken
	}
	return "---- Relations ----\n" + "From Entity,To Entity,Score,Description\n" + rows.String(), maxToken
}
// BuildContent produces the knowledge-graph context string: the entity
// section followed directly by the relation section. Entities are formatted
// first against maxToken; relations only get whatever budget is left.
// Python equivalent: lines 267-291
func BuildContent(
	entities []ScoredEntity,
	relations []ScoredRelation,
	maxToken int,
) string {
	entitySection, budget := FormatEntitiesToCSV(entities, maxToken)
	relationSection, _ := FormatRelationsToCSV(relations, budget)

	var out strings.Builder
	out.WriteString(entitySection)
	out.WriteString(relationSection)
	return out.String()
}
// extractDescription returns the string stored under the "description" key
// when desc is a JSON object that has one. In every other case (empty
// input, invalid JSON, non-object JSON, key missing, or a non-string value)
// desc is returned unchanged.
// Python equivalent: json.loads(desc).get("description", "")
func extractDescription(desc string) string {
	if len(desc) == 0 {
		return desc
	}

	var parsed map[string]interface{}
	if json.Unmarshal([]byte(desc), &parsed) != nil {
		return desc
	}

	// A missing key yields a nil interface, which also fails the assertion.
	text, isString := parsed["description"].(string)
	if !isString {
		return desc
	}
	return text
}