From 3ad3241ae06f414d2ccd2c92fda8c576bb96a96a Mon Sep 17 00:00:00 2001
From: yuch85 <yuch@live.com.sg>
Date: Mon, 27 Apr 2026 10:20:46 +0800
Subject: [PATCH] feat: persist RAPTOR layer metadata on summary chunks
 (#13286)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

RAPTOR's recursive clustering builds a `layers` list tracking
`(start_idx, end_idx)` boundaries per level, but currently discards this
information — only the flat `chunks` list is returned. This makes it
impossible to distinguish leaf-level summaries from top-level ones.

This PR:
- Returns `(chunks, layers)` tuple from `raptor.py`'s `__call__`
- Annotates each RAPTOR summary chunk with `raptor_layer_int` (1 = first
summary level, 2 = summary-of-summaries, etc.)
- Adds `raptor_layer_int` to `infinity_mapping.json` (Elasticsearch
handles it via existing `*_int` dynamic template)

### Why this matters

Downstream features need to know which RAPTOR layer a summary belongs
to:
- **Retrieving the top-level document summary** for entity extraction,
search snippets, or document comparison
- **Filtering by abstraction level** — users may want only high-level
summaries or only leaf-level cluster summaries
- **RAPTOR recall quality** — #10951 reports summaries not being
recalled for definition queries; layer metadata enables targeted
retrieval

### Changes

| File | Change | LOC |
|------|--------|-----|
| `rag/raptor.py` | Return `(chunks, layers)` tuple | ~3 |
| `rag/svr/task_executor.py` | Build `chunk_layer` mapping, set
`raptor_layer_int` | ~12 |
| `conf/infinity_mapping.json` | Add `raptor_layer_int` integer field |
~1 |

### Backward compatibility

- **Additive only** — no existing fields or behavior changed
- Existing RAPTOR chunks continue to work (they'll have
`raptor_layer_int = 0` by default)
- New RAPTOR chunks get layer metadata automatically

## Test plan

- [ ] Parse a document with RAPTOR enabled, verify `raptor_layer_int` is
set on indexed chunks
- [ ] Verify `raptor_layer_int` values increase with abstraction level
(layer 1 < layer 2 < ...)
- [ ] Verify existing RAPTOR deletion (`delete by raptor_kwd`) still
works
- [ ] Verify Infinity backend accepts the new field

Fixes #7488
Related: #4104, #11191, #10951

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: yuch85 <yuch85.1@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
---
 conf/infinity_mapping.json |  3 ++-
 rag/raptor.py              |  4 ++--
 rag/svr/task_executor.py   | 15 +++++++++++++--
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/conf/infinity_mapping.json b/conf/infinity_mapping.json
index 77d26dd960..5f7ed80f26 100644
--- a/conf/infinity_mapping.json
+++ b/conf/infinity_mapping.json
@@ -38,5 +38,6 @@
 	"removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
 	"doc_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
 	"toc_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
-	"raptor_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}
+	"raptor_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
+	"raptor_layer_int": {"type": "integer", "default": 0}
 }
diff --git a/rag/raptor.py b/rag/raptor.py
index 5d952dc428..e4017319b5 100644
--- a/rag/raptor.py
+++ b/rag/raptor.py
@@ -111,7 +111,7 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
 
     async def __call__(self, chunks, random_state, callback=None, task_id: str = ""):
         if len(chunks) <= 1:
-            return []
+            return [], []
         chunks = [(s, a) for s, a in chunks if s and a is not None and len(a) > 0]
         layers = [(0, len(chunks))]
         start, end = 0, len(chunks)
@@ -212,4 +212,4 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
             start = end
             end = len(chunks)
 
-        return chunks
+        return chunks, layers
diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py
index c81555c76e..5f8305176c 100644
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -843,7 +843,7 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
             max_errors=max_errors,
         )
         original_length = len(chunks)
-        chunks = await raptor(chunks, kb_parser_config["raptor"]["random_seed"], callback, row["id"])
+        chunks, layers = await raptor(chunks, kb_parser_config["raptor"]["random_seed"], callback, row["id"])
         effective_doc_name = row["name"] if did == fake_doc_id else doc_name_by_id.get(did, row["name"])
         doc = {
             "doc_id": did,
@@ -855,7 +855,17 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
         if row["pagerank"]:
             doc[PAGERANK_FLD] = int(row["pagerank"])
 
-        for content, vctr in chunks[original_length:]:
+        # Build index→layer mapping from RAPTOR layer boundaries.
+        # layers is [(start, end), ...] where layer 0 is the original chunks
+        # and layer 1+ are summary layers. We skip layer 0 (original chunks).
+        chunk_layer = {}
+        for layer_idx, (layer_start, layer_end) in enumerate(layers):
+            if layer_idx == 0:
+                continue  # layer 0 = original input chunks, not summaries
+            for ci in range(layer_start, layer_end):
+                chunk_layer[ci] = layer_idx
+
+        for idx, (content, vctr) in enumerate(chunks[original_length:], start=original_length):
             d = copy.deepcopy(doc)
             d["id"] = xxhash.xxh64((content + str(fake_doc_id)).encode("utf-8")).hexdigest()
             d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
@@ -864,6 +874,7 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
             d["content_with_weight"] = content
             d["content_ltks"] = rag_tokenizer.tokenize(content)
             d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
+            d["raptor_layer_int"] = chunk_layer.get(idx, 1)
             res.append(d)
             tk_count += num_tokens_from_string(content)