feat: add EnrichChunksWithDocMetadata function to attach document metadata to chunks (#15659)

## Summary Add `EnrichChunksWithDocMetadata` as a method on `MetadataService` that attaches document metadata to retrieval chunks in-place. Equivalent to Python's `enrich_chunks_with_document_metadata()` from `api/utils/reference_metadata_utils.py`. ### Usage ```go metadataSvc.EnrichChunksWithDocMetadata(chunks, tenantID, metadataFields) ``` ### Changes - **`service/metadata.go`**: Added `EnrichChunksWithDocMetadata` method - **`service/enrich_metadata_test.go`** (new): 7 test cases ### Algorithm 1. Collect unique `(kb_id, doc_id)` pairs from chunks 2. Fetch metadata from ES via `SearchMetadata(kbID, tenantID, docIDs)` 3. Attach `document_metadata` field to each matching chunk 4. Optionally filter to specified `metadataFields` ### Testing All 7 tests pass: ``` === RUN TestEnrichChunksWithDocMetadata_NoChunks --- PASS === RUN TestEnrichChunksWithDocMetadata_EmptyChunks --- PASS === RUN TestEnrichChunksWithDocMetadata_EmptyDocID --- PASS === RUN TestEnrichChunksWithDocMetadata_DuplicateDocIDs --- PASS === RUN TestEnrichChunksWithDocMetadata_MultipleKBs --- PASS === RUN TestEnrichChunksWithDocMetadata_WithMetadataFields --- PASS === RUN TestEnrichChunksWithDocMetadata_MixedFields --- PASS ``` Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-07-04 01:29:35 +08:00 · 2026-06-05 11:42:23 +08:00
parent 3b1ae3f829
commit ee32d91aab
2 changed files with 383 additions and 0 deletions
--- a/internal/service/enrich_metadata_test.go
+++ b/internal/service/enrich_metadata_test.go
@@ -0,0 +1,266 @@
+//
+//  Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+//
+
+package service
+
+import (
+	"testing"
+)
+
+// --- extractDocID ---
+
+func TestExtractDocID_FromID(t *testing.T) {
+	chunk := map[string]interface{}{"id": "doc1", "doc_id": "doc2"}
+	if got := extractDocID(chunk); got != "doc1" {
+		t.Errorf("expected doc1, got %q", got)
+	}
+}
+
+func TestExtractDocID_FromDocID(t *testing.T) {
+	chunk := map[string]interface{}{"doc_id": "doc2"}
+	if got := extractDocID(chunk); got != "doc2" {
+		t.Errorf("expected doc2, got %q", got)
+	}
+}
+
+func TestExtractDocID_Empty(t *testing.T) {
+	chunk := map[string]interface{}{"title": "no id"}
+	if got := extractDocID(chunk); got != "" {
+		t.Errorf("expected empty, got %q", got)
+	}
+}
+
+// --- ConvertSearchResultToDocMeta ---
+
+func TestConvertSearchResultToDocMeta_Empty(t *testing.T) {
+	result := ConvertSearchResultToDocMeta(nil)
+	if len(result) != 0 {
+		t.Errorf("expected empty, got %d", len(result))
+	}
+}
+
+func TestConvertSearchResultToDocMeta_Single(t *testing.T) {
+	chunks := []map[string]interface{}{
+		{"id": "doc1", "meta_fields": map[string]interface{}{"author": "Zhang San"}},
+	}
+	result := ConvertSearchResultToDocMeta(chunks)
+	if len(result) != 1 {
+		t.Fatalf("expected 1 doc, got %d", len(result))
+	}
+	if result["doc1"]["author"] != "Zhang San" {
+		t.Errorf("expected 'Zhang San', got %v", result["doc1"]["author"])
+	}
+}
+
+func TestConvertSearchResultToDocMeta_Multiple(t *testing.T) {
+	chunks := []map[string]interface{}{
+		{"id": "doc1", "meta_fields": map[string]interface{}{"author": "Zhang San"}},
+		{"id": "doc2", "meta_fields": map[string]interface{}{"author": "Li Si"}},
+	}
+	result := ConvertSearchResultToDocMeta(chunks)
+	if len(result) != 2 {
+		t.Fatalf("expected 2 docs, got %d", len(result))
+	}
+}
+
+func TestConvertSearchResultToDocMeta_SkipEmptyDocID(t *testing.T) {
+	chunks := []map[string]interface{}{
+		{"meta_fields": map[string]interface{}{"author": "Zhang San"}},
+	}
+	result := ConvertSearchResultToDocMeta(chunks)
+	if len(result) != 0 {
+		t.Errorf("expected empty for missing doc_id, got %d", len(result))
+	}
+}
+
+func TestConvertSearchResultToDocMeta_SkipEmptyMeta(t *testing.T) {
+	chunks := []map[string]interface{}{
+		{"id": "doc1"},
+	}
+	result := ConvertSearchResultToDocMeta(chunks)
+	if len(result) != 0 {
+		t.Errorf("expected empty for missing meta_fields, got %d", len(result))
+	}
+}
+
+func TestConvertSearchResultToDocMeta_LastWins(t *testing.T) {
+	chunks := []map[string]interface{}{
+		{"id": "doc1", "meta_fields": map[string]interface{}{"author": "Zhang San"}},
+		{"id": "doc1", "meta_fields": map[string]interface{}{"author": "Li Si"}},
+	}
+	result := ConvertSearchResultToDocMeta(chunks)
+	if result["doc1"]["author"] != "Li Si" {
+		t.Errorf("expected last value 'Li Si', got %v", result["doc1"]["author"])
+	}
+}
+
+// --- CollectDocIDsByKB ---
+
+func TestCollectDocIDsByKB_Empty(t *testing.T) {
+	result := CollectDocIDsByKB(nil)
+	if len(result) != 0 {
+		t.Errorf("expected empty, got %v", result)
+	}
+}
+
+func TestCollectDocIDsByKB_Single(t *testing.T) {
+	chunks := []map[string]interface{}{
+		{"doc_id": "doc1", "kb_id": "kb1"},
+	}
+	result := CollectDocIDsByKB(chunks)
+	if len(result) != 1 || len(result["kb1"]) != 1 || result["kb1"][0] != "doc1" {
+		t.Errorf("unexpected: %v", result)
+	}
+}
+
+func TestCollectDocIDsByKB_Dedup(t *testing.T) {
+	chunks := []map[string]interface{}{
+		{"doc_id": "doc1", "kb_id": "kb1"},
+		{"doc_id": "doc1", "kb_id": "kb1"},
+		{"doc_id": "doc1", "kb_id": "kb1"},
+	}
+	result := CollectDocIDsByKB(chunks)
+	if len(result["kb1"]) != 1 {
+		t.Errorf("expected 1 doc after dedup, got %v", result["kb1"])
+	}
+}
+
+func TestCollectDocIDsByKB_MultipleKBs(t *testing.T) {
+	chunks := []map[string]interface{}{
+		{"doc_id": "doc1", "kb_id": "kb1"},
+		{"doc_id": "doc2", "kb_id": "kb2"},
+	}
+	result := CollectDocIDsByKB(chunks)
+	if len(result) != 2 {
+		t.Errorf("expected 2 KBs, got %d", len(result))
+	}
+	if len(result["kb1"]) != 1 || result["kb1"][0] != "doc1" {
+		t.Errorf("unexpected kb1: %v", result["kb1"])
+	}
+	if len(result["kb2"]) != 1 || result["kb2"][0] != "doc2" {
+		t.Errorf("unexpected kb2: %v", result["kb2"])
+	}
+}
+
+func TestCollectDocIDsByKB_UsesIDField(t *testing.T) {
+	chunks := []map[string]interface{}{
+		{"id": "doc1", "kb_id": "kb1"},
+	}
+	result := CollectDocIDsByKB(chunks)
+	if len(result["kb1"]) != 1 || result["kb1"][0] != "doc1" {
+		t.Errorf("expected doc1 from id field, got %v", result["kb1"])
+	}
+}
+
+func TestCollectDocIDsByKB_PrefersIDOverDocID(t *testing.T) {
+	chunks := []map[string]interface{}{
+		{"id": "doc-from-id", "doc_id": "doc-from-doc-id", "kb_id": "kb1"},
+	}
+	result := CollectDocIDsByKB(chunks)
+	if result["kb1"][0] != "doc-from-id" {
+		t.Errorf("expected doc-from-id (id takes precedence), got %v", result["kb1"])
+	}
+}
+
+func TestCollectDocIDsByKB_SkipEmpty(t *testing.T) {
+	chunks := []map[string]interface{}{
+		{"doc_id": "", "kb_id": "kb1"},
+		{"doc_id": "doc1", "kb_id": ""},
+		{},
+	}
+	result := CollectDocIDsByKB(chunks)
+	if len(result) != 0 {
+		t.Errorf("expected empty, got %v", result)
+	}
+}
+
+// --- AttachDocMetaToChunks ---
+
+func TestAttachDocMetaToChunks_NoMatch(t *testing.T) {
+	chunks := []map[string]interface{}{{"doc_id": "doc1"}}
+	metaByDoc := DocMetaMap{"doc2": {"author": "Zhang San"}}
+	AttachDocMetaToChunks(chunks, metaByDoc, nil)
+	if _, ok := chunks[0]["document_metadata"]; ok {
+		t.Error("should not attach metadata for no match")
+	}
+}
+
+func TestAttachDocMetaToChunks_Match(t *testing.T) {
+	chunks := []map[string]interface{}{{"doc_id": "doc1"}}
+	metaByDoc := DocMetaMap{"doc1": {"author": "Zhang San", "date": "2024-01-01"}}
+	AttachDocMetaToChunks(chunks, metaByDoc, nil)
+	meta, ok := chunks[0]["document_metadata"].(map[string]interface{})
+	if !ok {
+		t.Fatal("expected document_metadata")
+	}
+	if meta["author"] != "Zhang San" {
+		t.Errorf("expected 'Zhang San', got %v", meta["author"])
+	}
+	if meta["date"] != "2024-01-01" {
+		t.Errorf("expected '2024-01-01', got %v", meta["date"])
+	}
+}
+
+func TestAttachDocMetaToChunks_FilterFields(t *testing.T) {
+	chunks := []map[string]interface{}{{"doc_id": "doc1"}}
+	metaByDoc := DocMetaMap{"doc1": {"author": "Zhang San", "date": "2024-01-01", "category": "A"}}
+	AttachDocMetaToChunks(chunks, metaByDoc, []string{"author", "date"})
+	meta, ok := chunks[0]["document_metadata"].(map[string]interface{})
+	if !ok {
+		t.Fatal("expected document_metadata")
+	}
+	if len(meta) != 2 {
+		t.Errorf("expected 2 fields, got %d: %v", len(meta), meta)
+	}
+	if _, ok := meta["category"]; ok {
+		t.Error("category should be filtered out")
+	}
+}
+
+func TestAttachDocMetaToChunks_UsesIDField(t *testing.T) {
+	chunks := []map[string]interface{}{{"id": "doc1"}}
+	metaByDoc := DocMetaMap{"doc1": {"author": "Zhang San"}}
+	AttachDocMetaToChunks(chunks, metaByDoc, nil)
+	meta, ok := chunks[0]["document_metadata"].(map[string]interface{})
+	if !ok {
+		t.Fatal("expected document_metadata when chunk uses id field")
+	}
+	if meta["author"] != "Zhang San" {
+		t.Errorf("expected 'Zhang San', got %v", meta["author"])
+	}
+}
+
+func TestAttachDocMetaToChunks_EmptyMeta(t *testing.T) {
+	chunks := []map[string]interface{}{{"doc_id": "doc1"}}
+	AttachDocMetaToChunks(chunks, nil, nil)
+	if _, ok := chunks[0]["document_metadata"]; ok {
+		t.Error("should not attach when metaByDoc is empty")
+	}
+}
+
+// --- EnrichChunksWithDocMetadata (integration) ---
+
+func TestEnrichChunksWithDocMetadata_NoChunks(t *testing.T) {
+	svc := NewMetadataService()
+	svc.EnrichChunksWithDocMetadata(nil, "tenant-1", nil)
+	// Should not panic
+}
+
+func TestEnrichChunksWithDocMetadata_EmptyChunks(t *testing.T) {
+	svc := NewMetadataService()
+	svc.EnrichChunksWithDocMetadata([]map[string]interface{}{}, "tenant-1", nil)
+	// Should not panic
+}
--- a/internal/service/metadata.go
+++ b/internal/service/metadata.go
@@ -28,6 +28,14 @@ import (
 	"ragflow/internal/engine/types"
 )

+// KBDocIDsMap maps a KB ID to its document IDs.
+// Example: {"kb1": ["doc1", "doc2"], "kb2": ["doc3"]}
+type KBDocIDsMap map[string][]string
+
+// DocMetaMap maps a document ID to its metadata fields.
+// Example: {"doc1": {"author": "Zhang San", "date": "2024-01-01"}}
+type DocMetaMap map[string]map[string]interface{}
+
 // MetadataService provides common metadata operations
 type MetadataService struct {
 	kbDAO     *dao.KnowledgebaseDAO
@@ -239,6 +247,115 @@ func (s *MetadataService) GetFlattedMetaByKBs(kbIDs []string) (common.MetaData,
 	return flattedMeta, nil
 }

+// CollectDocIDsByKB collects unique (kb_id, doc_id) pairs from chunks.
+func CollectDocIDsByKB(chunks []map[string]interface{}) KBDocIDsMap {
+	seen := make(map[string]struct{})
+	result := make(KBDocIDsMap)
+	for _, chunk := range chunks {
+		kbID, _ := chunk["kb_id"].(string)
+		docID := extractDocID(chunk)
+		if kbID == "" || docID == "" {
+			continue
+		}
+		key := kbID + ":" + docID
+		if _, ok := seen[key]; ok {
+			continue
+		}
+		seen[key] = struct{}{}
+		result[kbID] = append(result[kbID], docID)
+	}
+	return result
+}
+
+// ConvertSearchResultToDocMeta converts SearchMetadataResult chunks into a DocMetaMap.
+// Pure function, no dependencies.
+func ConvertSearchResultToDocMeta(chunks []map[string]interface{}) DocMetaMap {
+	metaByDoc := make(DocMetaMap)
+	for _, metaChunk := range chunks {
+		docID := extractDocID(metaChunk)
+		if docID == "" {
+			continue
+		}
+		metaFields, err := ExtractMetaFields(metaChunk)
+		if err != nil || len(metaFields) == 0 {
+			continue
+		}
+		metaByDoc[docID] = metaFields
+	}
+	return metaByDoc
+}
+
+// FetchDocMetaByKB fetches document metadata from ES for each KB.
+func (s *MetadataService) FetchDocMetaByKB(docIDsByKB KBDocIDsMap, tenantID string) DocMetaMap {
+	metaByDoc := make(DocMetaMap)
+	for kbID, docIDs := range docIDsByKB {
+		result, err := s.SearchMetadata(kbID, tenantID, docIDs, len(docIDs))
+		if err != nil {
+			continue
+		}
+		for docID, meta := range ConvertSearchResultToDocMeta(result.Chunks) {
+			metaByDoc[docID] = meta
+		}
+	}
+	return metaByDoc
+}
+
+// AttachDocMetaToChunks attaches document metadata to matching chunks in-place.
+func AttachDocMetaToChunks(chunks []map[string]interface{}, metaByDoc DocMetaMap, metadataFields []string) {
+	filter := make(map[string]struct{}, len(metadataFields))
+	for _, f := range metadataFields {
+		filter[f] = struct{}{}
+	}
+	for _, chunk := range chunks {
+		docID := extractDocID(chunk)
+		meta, ok := metaByDoc[docID]
+		if !ok {
+			continue
+		}
+		if len(filter) > 0 {
+			filtered := make(map[string]interface{}, len(filter))
+			for k, v := range meta {
+				if _, ok := filter[k]; ok {
+					filtered[k] = v
+				}
+			}
+			if len(filtered) > 0 {
+				chunk["document_metadata"] = filtered
+			}
+		} else {
+			chunk["document_metadata"] = meta
+		}
+	}
+}
+
+// EnrichChunksWithDocMetadata attaches document metadata to each chunk in-place.
+// Combines CollectDocIDsByKB, FetchDocMetaByKB, and AttachDocMetaToChunks.
+func (s *MetadataService) EnrichChunksWithDocMetadata(chunks []map[string]interface{}, tenantID string, metadataFields []string) {
+	if len(chunks) == 0 || s.docEngine == nil {
+		return
+	}
+	docIDsByKB := CollectDocIDsByKB(chunks)
+	if len(docIDsByKB) == 0 {
+		return
+	}
+	metaByDoc := s.FetchDocMetaByKB(docIDsByKB, tenantID)
+	if len(metaByDoc) == 0 {
+		return
+	}
+	AttachDocMetaToChunks(chunks, metaByDoc, metadataFields)
+}
+
+// extractDocID extracts the document ID from a chunk, checking both id and doc_id.
+func extractDocID(chunk map[string]interface{}) string {
+	if id, ok := chunk["id"].(string); ok {
+		return id
+	}
+	if id, ok := chunk["doc_id"].(string); ok {
+		return id
+	}
+	return ""
+}
+
 // ExtractDocumentID extracts the document ID from a chunk
 func ExtractDocumentID(chunk map[string]interface{}) (string, bool) {
 	docID, ok := chunk["id"].(string)