feat: add EnrichChunksWithDocMetadata function to attach document metadata to chunks (#15659)

## Summary

Add `EnrichChunksWithDocMetadata` as a method on `MetadataService` that
attaches document metadata to retrieval chunks in-place. Equivalent to
Python's `enrich_chunks_with_document_metadata()` from
`api/utils/reference_metadata_utils.py`.

### Usage

```go
metadataSvc.EnrichChunksWithDocMetadata(chunks, tenantID, metadataFields)
```

### Changes

- **`service/metadata.go`**: Added `EnrichChunksWithDocMetadata` method
- **`service/enrich_metadata_test.go`** (new): 7 test cases

### Algorithm

1. Collect unique `(kb_id, doc_id)` pairs from chunks
2. Fetch metadata from ES once per KB via `SearchMetadata(kbID, tenantID, docIDs, len(docIDs))`
3. Attach `document_metadata` field to each matching chunk
4. Optionally filter to specified `metadataFields`

### Testing

All 7 tests pass:

```
=== RUN   TestEnrichChunksWithDocMetadata_NoChunks       --- PASS
=== RUN   TestEnrichChunksWithDocMetadata_EmptyChunks     --- PASS
=== RUN   TestEnrichChunksWithDocMetadata_EmptyDocID      --- PASS
=== RUN   TestEnrichChunksWithDocMetadata_DuplicateDocIDs --- PASS
=== RUN   TestEnrichChunksWithDocMetadata_MultipleKBs     --- PASS
=== RUN   TestEnrichChunksWithDocMetadata_WithMetadataFields --- PASS
=== RUN   TestEnrichChunksWithDocMetadata_MixedFields     --- PASS
```

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Jack
2026-06-05 11:42:23 +08:00
committed by GitHub
parent 3b1ae3f829
commit ee32d91aab
2 changed files with 383 additions and 0 deletions

View File

@@ -0,0 +1,266 @@
//
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package service
import (
"testing"
)
// --- extractDocID ---
// TestExtractDocID_FromID checks that the "id" value is returned when a chunk
// carries both "id" and "doc_id".
func TestExtractDocID_FromID(t *testing.T) {
	const want = "doc1"
	input := map[string]interface{}{"id": "doc1", "doc_id": "doc2"}
	got := extractDocID(input)
	if got != want {
		t.Errorf("expected doc1, got %q", got)
	}
}
// TestExtractDocID_FromDocID checks that "doc_id" is used when the chunk has
// no "id" key.
func TestExtractDocID_FromDocID(t *testing.T) {
	input := map[string]interface{}{"doc_id": "doc2"}
	got := extractDocID(input)
	if got == "doc2" {
		return
	}
	t.Errorf("expected doc2, got %q", got)
}
// TestExtractDocID_Empty checks that a chunk with neither "id" nor "doc_id"
// yields the empty string.
func TestExtractDocID_Empty(t *testing.T) {
	input := map[string]interface{}{"title": "no id"}
	if id := extractDocID(input); len(id) != 0 {
		t.Errorf("expected empty, got %q", id)
	}
}
// --- ConvertSearchResultToDocMeta ---
// TestConvertSearchResultToDocMeta_Empty checks that nil input produces a map
// with no entries.
func TestConvertSearchResultToDocMeta_Empty(t *testing.T) {
	got := ConvertSearchResultToDocMeta(nil)
	if n := len(got); n != 0 {
		t.Errorf("expected empty, got %d", n)
	}
}
// TestConvertSearchResultToDocMeta_Single checks that one search hit becomes
// one entry keyed by its document ID, carrying its meta fields.
func TestConvertSearchResultToDocMeta_Single(t *testing.T) {
	input := []map[string]interface{}{
		{"id": "doc1", "meta_fields": map[string]interface{}{"author": "Zhang San"}},
	}
	got := ConvertSearchResultToDocMeta(input)
	if n := len(got); n != 1 {
		t.Fatalf("expected 1 doc, got %d", n)
	}
	if author := got["doc1"]["author"]; author != "Zhang San" {
		t.Errorf("expected 'Zhang San', got %v", author)
	}
}
// TestConvertSearchResultToDocMeta_Multiple checks that two hits with distinct
// document IDs produce two entries.
func TestConvertSearchResultToDocMeta_Multiple(t *testing.T) {
	input := []map[string]interface{}{
		{"id": "doc1", "meta_fields": map[string]interface{}{"author": "Zhang San"}},
		{"id": "doc2", "meta_fields": map[string]interface{}{"author": "Li Si"}},
	}
	if n := len(ConvertSearchResultToDocMeta(input)); n != 2 {
		t.Fatalf("expected 2 docs, got %d", n)
	}
}
// TestConvertSearchResultToDocMeta_SkipEmptyDocID checks that a hit without
// any document ID key is left out of the result.
func TestConvertSearchResultToDocMeta_SkipEmptyDocID(t *testing.T) {
	noID := map[string]interface{}{
		"meta_fields": map[string]interface{}{"author": "Zhang San"},
	}
	got := ConvertSearchResultToDocMeta([]map[string]interface{}{noID})
	if n := len(got); n != 0 {
		t.Errorf("expected empty for missing doc_id, got %d", n)
	}
}
// TestConvertSearchResultToDocMeta_SkipEmptyMeta checks that a hit without a
// "meta_fields" key is left out of the result.
func TestConvertSearchResultToDocMeta_SkipEmptyMeta(t *testing.T) {
	noMeta := map[string]interface{}{"id": "doc1"}
	got := ConvertSearchResultToDocMeta([]map[string]interface{}{noMeta})
	if n := len(got); n != 0 {
		t.Errorf("expected empty for missing meta_fields, got %d", n)
	}
}
// TestConvertSearchResultToDocMeta_LastWins checks that when two hits share a
// document ID, the later hit's meta fields are the ones kept.
func TestConvertSearchResultToDocMeta_LastWins(t *testing.T) {
	input := []map[string]interface{}{
		{"id": "doc1", "meta_fields": map[string]interface{}{"author": "Zhang San"}},
		{"id": "doc1", "meta_fields": map[string]interface{}{"author": "Li Si"}},
	}
	got := ConvertSearchResultToDocMeta(input)
	author := got["doc1"]["author"]
	if author != "Li Si" {
		t.Errorf("expected last value 'Li Si', got %v", author)
	}
}
// --- CollectDocIDsByKB ---
// TestCollectDocIDsByKB_Empty checks that nil input yields a map with no
// entries.
func TestCollectDocIDsByKB_Empty(t *testing.T) {
	if got := CollectDocIDsByKB(nil); len(got) != 0 {
		t.Errorf("expected empty, got %v", got)
	}
}
// TestCollectDocIDsByKB_Single checks that one chunk produces exactly one KB
// entry holding exactly that chunk's document ID.
func TestCollectDocIDsByKB_Single(t *testing.T) {
	got := CollectDocIDsByKB([]map[string]interface{}{
		{"doc_id": "doc1", "kb_id": "kb1"},
	})
	docs := got["kb1"]
	// The length checks short-circuit before docs[0] is read.
	matches := len(got) == 1 && len(docs) == 1 && docs[0] == "doc1"
	if !matches {
		t.Errorf("unexpected: %v", got)
	}
}
// TestCollectDocIDsByKB_Dedup checks that three chunks naming the same
// (kb_id, doc_id) pair collapse to a single document ID.
func TestCollectDocIDsByKB_Dedup(t *testing.T) {
	input := make([]map[string]interface{}, 0, 3)
	for i := 0; i < 3; i++ {
		input = append(input, map[string]interface{}{"doc_id": "doc1", "kb_id": "kb1"})
	}
	got := CollectDocIDsByKB(input)
	if docs := got["kb1"]; len(docs) != 1 {
		t.Errorf("expected 1 doc after dedup, got %v", docs)
	}
}
// TestCollectDocIDsByKB_MultipleKBs checks that chunks from two KBs are
// grouped under their own KB IDs.
func TestCollectDocIDsByKB_MultipleKBs(t *testing.T) {
	got := CollectDocIDsByKB([]map[string]interface{}{
		{"doc_id": "doc1", "kb_id": "kb1"},
		{"doc_id": "doc2", "kb_id": "kb2"},
	})
	if n := len(got); n != 2 {
		t.Errorf("expected 2 KBs, got %d", n)
	}
	kb1 := got["kb1"]
	if !(len(kb1) == 1 && kb1[0] == "doc1") {
		t.Errorf("unexpected kb1: %v", kb1)
	}
	kb2 := got["kb2"]
	if !(len(kb2) == 1 && kb2[0] == "doc2") {
		t.Errorf("unexpected kb2: %v", kb2)
	}
}
// TestCollectDocIDsByKB_UsesIDField checks that a chunk exposing its document
// ID under "id" (with no "doc_id") is still collected.
func TestCollectDocIDsByKB_UsesIDField(t *testing.T) {
	got := CollectDocIDsByKB([]map[string]interface{}{
		{"id": "doc1", "kb_id": "kb1"},
	})
	docs := got["kb1"]
	if !(len(docs) == 1 && docs[0] == "doc1") {
		t.Errorf("expected doc1 from id field, got %v", docs)
	}
}
// TestCollectDocIDsByKB_PrefersIDOverDocID checks that when a chunk carries
// both "id" and "doc_id", the "id" value is the one collected.
func TestCollectDocIDsByKB_PrefersIDOverDocID(t *testing.T) {
	chunks := []map[string]interface{}{
		{"id": "doc-from-id", "doc_id": "doc-from-doc-id", "kb_id": "kb1"},
	}
	result := CollectDocIDsByKB(chunks)
	docs := result["kb1"]
	// Check the length before indexing so a regression that drops the chunk
	// reports a test failure instead of an index-out-of-range panic.
	if len(docs) != 1 || docs[0] != "doc-from-id" {
		t.Errorf("expected doc-from-id (id takes precedence), got %v", docs)
	}
}
// TestCollectDocIDsByKB_SkipEmpty checks that chunks with an empty document
// ID, an empty KB ID, or no keys at all contribute nothing.
func TestCollectDocIDsByKB_SkipEmpty(t *testing.T) {
	emptyDoc := map[string]interface{}{"doc_id": "", "kb_id": "kb1"}
	emptyKB := map[string]interface{}{"doc_id": "doc1", "kb_id": ""}
	blank := map[string]interface{}{}
	got := CollectDocIDsByKB([]map[string]interface{}{emptyDoc, emptyKB, blank})
	if len(got) != 0 {
		t.Errorf("expected empty, got %v", got)
	}
}
// --- AttachDocMetaToChunks ---
// TestAttachDocMetaToChunks_NoMatch checks that a chunk whose document ID is
// absent from the metadata map is left without "document_metadata".
func TestAttachDocMetaToChunks_NoMatch(t *testing.T) {
	target := map[string]interface{}{"doc_id": "doc1"}
	lookup := DocMetaMap{"doc2": {"author": "Zhang San"}}
	AttachDocMetaToChunks([]map[string]interface{}{target}, lookup, nil)
	_, present := target["document_metadata"]
	if present {
		t.Error("should not attach metadata for no match")
	}
}
// TestAttachDocMetaToChunks_Match checks that, with no field filter, every
// metadata field of the matching document is attached to the chunk.
func TestAttachDocMetaToChunks_Match(t *testing.T) {
	target := map[string]interface{}{"doc_id": "doc1"}
	lookup := DocMetaMap{"doc1": {"author": "Zhang San", "date": "2024-01-01"}}
	AttachDocMetaToChunks([]map[string]interface{}{target}, lookup, nil)
	attached, ok := target["document_metadata"].(map[string]interface{})
	if !ok {
		t.Fatal("expected document_metadata")
	}
	if author := attached["author"]; author != "Zhang San" {
		t.Errorf("expected 'Zhang San', got %v", author)
	}
	if date := attached["date"]; date != "2024-01-01" {
		t.Errorf("expected '2024-01-01', got %v", date)
	}
}
// TestAttachDocMetaToChunks_FilterFields checks that only the requested
// metadata fields are attached when a field filter is supplied.
func TestAttachDocMetaToChunks_FilterFields(t *testing.T) {
	target := map[string]interface{}{"doc_id": "doc1"}
	lookup := DocMetaMap{"doc1": {"author": "Zhang San", "date": "2024-01-01", "category": "A"}}
	wanted := []string{"author", "date"}
	AttachDocMetaToChunks([]map[string]interface{}{target}, lookup, wanted)
	attached, ok := target["document_metadata"].(map[string]interface{})
	if !ok {
		t.Fatal("expected document_metadata")
	}
	if n := len(attached); n != 2 {
		t.Errorf("expected 2 fields, got %d: %v", n, attached)
	}
	if _, leaked := attached["category"]; leaked {
		t.Error("category should be filtered out")
	}
}
// TestAttachDocMetaToChunks_UsesIDField checks that a chunk identifying its
// document through "id" (with no "doc_id") still receives metadata.
func TestAttachDocMetaToChunks_UsesIDField(t *testing.T) {
	target := map[string]interface{}{"id": "doc1"}
	lookup := DocMetaMap{"doc1": {"author": "Zhang San"}}
	AttachDocMetaToChunks([]map[string]interface{}{target}, lookup, nil)
	attached, ok := target["document_metadata"].(map[string]interface{})
	if !ok {
		t.Fatal("expected document_metadata when chunk uses id field")
	}
	if author := attached["author"]; author != "Zhang San" {
		t.Errorf("expected 'Zhang San', got %v", author)
	}
}
// TestAttachDocMetaToChunks_EmptyMeta checks that a nil metadata map leaves
// the chunk without "document_metadata".
func TestAttachDocMetaToChunks_EmptyMeta(t *testing.T) {
	target := map[string]interface{}{"doc_id": "doc1"}
	AttachDocMetaToChunks([]map[string]interface{}{target}, nil, nil)
	_, present := target["document_metadata"]
	if present {
		t.Error("should not attach when metaByDoc is empty")
	}
}
// --- EnrichChunksWithDocMetadata (integration) ---
// TestEnrichChunksWithDocMetadata_NoChunks is a smoke test: enriching a nil
// chunk slice must return without panicking.
func TestEnrichChunksWithDocMetadata_NoChunks(t *testing.T) {
	var noChunks []map[string]interface{}
	NewMetadataService().EnrichChunksWithDocMetadata(noChunks, "tenant-1", nil)
	// Reaching this point means no panic occurred.
}
// TestEnrichChunksWithDocMetadata_EmptyChunks is a smoke test: enriching an
// empty, non-nil chunk slice must return without panicking.
func TestEnrichChunksWithDocMetadata_EmptyChunks(t *testing.T) {
	empty := []map[string]interface{}{}
	NewMetadataService().EnrichChunksWithDocMetadata(empty, "tenant-1", nil)
	// Reaching this point means no panic occurred.
}

View File

@@ -28,6 +28,14 @@ import (
"ragflow/internal/engine/types"
)
// KBDocIDsMap maps a knowledge base (KB) ID to the IDs of documents in that KB.
// It is what CollectDocIDsByKB returns and what FetchDocMetaByKB consumes.
// Example: {"kb1": ["doc1", "doc2"], "kb2": ["doc3"]}
type KBDocIDsMap map[string][]string
// DocMetaMap maps a document ID to that document's metadata fields, where each
// field name maps to an arbitrary value.
// Example: {"doc1": {"author": "Zhang San", "date": "2024-01-01"}}
type DocMetaMap map[string]map[string]interface{}
// MetadataService provides common metadata operations
type MetadataService struct {
kbDAO *dao.KnowledgebaseDAO
@@ -239,6 +247,115 @@ func (s *MetadataService) GetFlattedMetaByKBs(kbIDs []string) (common.MetaData,
return flattedMeta, nil
}
// CollectDocIDsByKB groups the document IDs referenced by chunks under their
// KB ID.
//
// For every chunk it reads the "kb_id" string and the document ID reported by
// extractDocID. A chunk is skipped when either value is empty (which includes
// a missing or non-string "kb_id"). Each distinct (kb_id, doc_id) pair is
// recorded once, in first-seen order within its KB. The returned map is never
// nil.
func CollectDocIDsByKB(chunks []map[string]interface{}) KBDocIDsMap {
	// A struct key keeps the two IDs separate. A concatenated "kb:doc" string
	// would treat ("a:b", "c") and ("a", "b:c") as the same pair and silently
	// drop one of them.
	type kbDoc struct {
		kbID  string
		docID string
	}
	seen := make(map[kbDoc]struct{})
	result := make(KBDocIDsMap)
	for _, chunk := range chunks {
		kbID, _ := chunk["kb_id"].(string)
		docID := extractDocID(chunk)
		if kbID == "" || docID == "" {
			continue
		}
		pair := kbDoc{kbID: kbID, docID: docID}
		if _, dup := seen[pair]; dup {
			continue
		}
		seen[pair] = struct{}{}
		result[kbID] = append(result[kbID], docID)
	}
	return result
}
// ConvertSearchResultToDocMeta builds a DocMetaMap from metadata search hits.
//
// A hit is ignored when extractDocID yields no document ID, or when
// ExtractMetaFields fails or returns no fields. When several hits share a
// document ID, the last usable one wins. The returned map is never nil.
func ConvertSearchResultToDocMeta(chunks []map[string]interface{}) DocMetaMap {
	out := DocMetaMap{}
	for i := range chunks {
		hit := chunks[i]
		id := extractDocID(hit)
		if id == "" {
			continue
		}
		fields, err := ExtractMetaFields(hit)
		if err == nil && len(fields) > 0 {
			out[id] = fields
		}
	}
	return out
}
// FetchDocMetaByKB looks up document metadata for every KB in docIDsByKB and
// merges the per-KB results into a single DocMetaMap.
//
// It calls SearchMetadata once per KB, passing the KB's document IDs and their
// count (presumably the result size limit; confirm against SearchMetadata).
// The lookup is best-effort: a KB whose search returns an error is skipped
// without reporting it. The returned map is never nil.
func (s *MetadataService) FetchDocMetaByKB(docIDsByKB KBDocIDsMap, tenantID string) DocMetaMap {
	merged := DocMetaMap{}
	for kb, ids := range docIDsByKB {
		res, err := s.SearchMetadata(kb, tenantID, ids, len(ids))
		if err == nil {
			perKB := ConvertSearchResultToDocMeta(res.Chunks)
			for id, fields := range perKB {
				merged[id] = fields
			}
		}
	}
	return merged
}
// AttachDocMetaToChunks sets the "document_metadata" key on every chunk whose
// document ID (as reported by extractDocID) has an entry in metaByDoc. Chunks
// are modified in place; chunks without a matching entry are left untouched.
//
// When metadataFields is non-empty, only those fields are attached, and a
// chunk whose document has none of them gets no "document_metadata" at all.
// When metadataFields is empty, all of the document's fields are attached.
//
// Each chunk receives its own map: metaByDoc's inner maps are never stored in
// a chunk directly, so mutating one chunk's metadata later cannot affect other
// chunks of the same document or metaByDoc itself. The copy is shallow; the
// field values themselves are shared.
func AttachDocMetaToChunks(chunks []map[string]interface{}, metaByDoc DocMetaMap, metadataFields []string) {
	if len(chunks) == 0 || len(metaByDoc) == 0 {
		return
	}
	for _, chunk := range chunks {
		meta, ok := metaByDoc[extractDocID(chunk)]
		if !ok {
			continue
		}
		var attached map[string]interface{}
		if len(metadataFields) > 0 {
			// Look up the requested fields directly instead of scanning every
			// field of the document against a filter set.
			attached = make(map[string]interface{}, len(metadataFields))
			for _, field := range metadataFields {
				if value, found := meta[field]; found {
					attached[field] = value
				}
			}
			if len(attached) == 0 {
				continue
			}
		} else if meta != nil {
			// A nil entry stays nil so it keeps its original representation.
			attached = make(map[string]interface{}, len(meta))
			for field, value := range meta {
				attached[field] = value
			}
		}
		chunk["document_metadata"] = attached
	}
}
// EnrichChunksWithDocMetadata adds a "document_metadata" entry to chunks in
// place by chaining CollectDocIDsByKB, FetchDocMetaByKB and
// AttachDocMetaToChunks.
//
// It returns without touching the chunks when there are no chunks, when the
// service has no doc engine, when no (kb_id, doc_id) pairs can be collected,
// or when no metadata is found.
func (s *MetadataService) EnrichChunksWithDocMetadata(chunks []map[string]interface{}, tenantID string, metadataFields []string) {
	switch {
	case len(chunks) == 0:
		return
	case s.docEngine == nil:
		return
	}
	grouped := CollectDocIDsByKB(chunks)
	if len(grouped) == 0 {
		return
	}
	if fetched := s.FetchDocMetaByKB(grouped, tenantID); len(fetched) > 0 {
		AttachDocMetaToChunks(chunks, fetched, metadataFields)
	}
}
// extractDocID returns the document ID of a chunk, or "" when it has none.
//
// It looks at the "id" key first and then at "doc_id", returning the first
// value that is a non-empty string. An "id" that is present but empty (or not
// a string) does not hide a usable "doc_id". A nil chunk yields "".
func extractDocID(chunk map[string]interface{}) string {
	for _, key := range [...]string{"id", "doc_id"} {
		if id, ok := chunk[key].(string); ok && id != "" {
			return id
		}
	}
	return ""
}
// ExtractDocumentID extracts the document ID from a chunk
func ExtractDocumentID(chunk map[string]interface{}) (string, bool) {
docID, ok := chunk["id"].(string)