feat: persist RAPTOR layer metadata on summary chunks (#13286)

## Summary

RAPTOR's recursive clustering builds a `layers` list tracking `(start_idx, end_idx)` boundaries per level, but currently discards this information — only the flat `chunks` list is returned. This makes it impossible to distinguish leaf-level summaries from top-level ones.

This PR:

- Returns a `(chunks, layers)` tuple from `raptor.py`'s `__call__`
- Annotates each RAPTOR summary chunk with `raptor_layer_int` (1 = first summary level, 2 = summary-of-summaries, etc.)
- Adds `raptor_layer_int` to `infinity_mapping.json` (Elasticsearch handles it via the existing `*_int` dynamic template)

### Why this matters

Downstream features need to know which RAPTOR layer a summary belongs to:

- **Retrieving the top-level document summary** for entity extraction, search snippets, or document comparison
- **Filtering by abstraction level** — users may want only high-level summaries or only leaf-level cluster summaries
- **RAPTOR recall quality** — #10951 reports summaries not being recalled for definition queries; layer metadata enables targeted retrieval

### Changes

| File | Change | LOC |
|------|--------|-----|
| `rag/raptor.py` | Return `(chunks, layers)` tuple | ~3 |
| `rag/svr/task_executor.py` | Build `chunk_layer` mapping, set `raptor_layer_int` | ~12 |
| `conf/infinity_mapping.json` | Add `raptor_layer_int` integer field | ~1 |

### Backward compatibility

- **Additive only** — no existing fields or behavior changed (the one interface change is that `__call__` now returns a `(chunks, layers)` tuple instead of a list; the caller in `rag/svr/task_executor.py` is updated to unpack it)
- Existing RAPTOR chunks continue to work (they'll have `raptor_layer_int = 0` by default)
- New RAPTOR chunks get layer metadata automatically

## Test plan

- [ ] Parse a document with RAPTOR enabled, verify `raptor_layer_int` is set on indexed chunks
- [ ] Verify `raptor_layer_int` values increase with abstraction level (layer 1 < layer 2 < ...)
- [ ] Verify existing RAPTOR deletion (`delete by raptor_kwd`) still works
- [ ] Verify the Infinity backend accepts the new field

Fixes #7488

Related: #4104, #11191, #10951

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: yuch85 <yuch85.1@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
2026-06-29 23:41:12 +08:00 · 2026-04-27 10:20:46 +08:00
parent a9e5724b46
commit 3ad3241ae0
3 changed files with 17 additions and 5 deletions
--- a/conf/infinity_mapping.json
+++ b/conf/infinity_mapping.json
@@ -38,5 +38,6 @@
 	"removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
 	"doc_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
 	"toc_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
-	"raptor_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}
+	"raptor_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
+	"raptor_layer_int": {"type": "integer", "default": 0}
 }
--- a/rag/raptor.py
+++ b/rag/raptor.py
@@ -111,7 +111,7 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:

    async def __call__(self, chunks, random_state, callback=None, task_id: str = ""):
        if len(chunks) <= 1:
-            return []
+            return [], []
        chunks = [(s, a) for s, a in chunks if s and a is not None and len(a) > 0]
        layers = [(0, len(chunks))]
        start, end = 0, len(chunks)
@@ -212,4 +212,4 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
            start = end
            end = len(chunks)

-        return chunks
+        return chunks, layers
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -843,7 +843,7 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
            max_errors=max_errors,
        )
        original_length = len(chunks)
-        chunks = await raptor(chunks, kb_parser_config["raptor"]["random_seed"], callback, row["id"])
+        chunks, layers = await raptor(chunks, kb_parser_config["raptor"]["random_seed"], callback, row["id"])
        effective_doc_name = row["name"] if did == fake_doc_id else doc_name_by_id.get(did, row["name"])
        doc = {
            "doc_id": did,
@@ -855,7 +855,17 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
        if row["pagerank"]:
            doc[PAGERANK_FLD] = int(row["pagerank"])

-        for content, vctr in chunks[original_length:]:
+        # Build index→layer mapping from RAPTOR layer boundaries.
+        # layers is [(start, end), ...] where layer 0 is the original chunks
+        # and layer 1+ are summary layers. We skip layer 0 (original chunks).
+        chunk_layer = {}
+        for layer_idx, (layer_start, layer_end) in enumerate(layers):
+            if layer_idx == 0:
+                continue  # layer 0 = original input chunks, not summaries
+            for ci in range(layer_start, layer_end):
+                chunk_layer[ci] = layer_idx
+
+        for idx, (content, vctr) in enumerate(chunks[original_length:], start=original_length):
            d = copy.deepcopy(doc)
            d["id"] = xxhash.xxh64((content + str(fake_doc_id)).encode("utf-8")).hexdigest()
            d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
@@ -864,6 +874,7 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
            d["content_with_weight"] = content
            d["content_ltks"] = rag_tokenizer.tokenize(content)
            d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
+            d["raptor_layer_int"] = chunk_layer.get(idx, 1)
            res.append(d)
            tk_count += num_tokens_from_string(content)