From 3ad3241ae06f414d2ccd2c92fda8c576bb96a96a Mon Sep 17 00:00:00 2001 From: yuch85 Date: Mon, 27 Apr 2026 10:20:46 +0800 Subject: [PATCH] feat: persist RAPTOR layer metadata on summary chunks (#13286) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary RAPTOR's recursive clustering builds a `layers` list tracking `(start_idx, end_idx)` boundaries per level, but currently discards this information — only the flat `chunks` list is returned. This makes it impossible to distinguish leaf-level summaries from top-level ones. This PR: - Returns a `(chunks, layers)` tuple from `raptor.py`'s `__call__` - Annotates each RAPTOR summary chunk with `raptor_layer_int` (1 = first summary level, 2 = summary-of-summaries, etc.) - Adds `raptor_layer_int` to `infinity_mapping.json` (Elasticsearch handles it via existing `*_int` dynamic template) ### Why this matters Downstream features need to know which RAPTOR layer a summary belongs to: - **Retrieving the top-level document summary** for entity extraction, search snippets, or document comparison - **Filtering by abstraction level** — users may want only high-level summaries or only leaf-level cluster summaries - **RAPTOR recall quality** — #10951 reports summaries not being recalled for definition queries; layer metadata enables targeted retrieval ### Changes | File | Change | LOC | |------|--------|-----| | `rag/raptor.py` | Return `(chunks, layers)` tuple | ~3 | | `rag/svr/task_executor.py` | Build `chunk_layer` mapping, set `raptor_layer_int` | ~12 | | `conf/infinity_mapping.json` | Add `raptor_layer_int` integer field | ~1 | ### Backward compatibility - **Additive at the index level** — no existing fields are changed; the one interface change is that `raptor.py`'s `__call__` now returns a `(chunks, layers)` tuple instead of a bare list (and `[], []` instead of `[]` for trivial input), so every caller must unpack it (the `task_executor.py` call site is updated in this PR) - Existing RAPTOR chunks continue to work (they'll have `raptor_layer_int = 0` by default) - New RAPTOR chunks get layer metadata automatically ## Test plan - [ ] Parse a document with RAPTOR enabled, verify `raptor_layer_int` is set on indexed chunks - [ ] Verify 
`raptor_layer_int` values increase with abstraction level (layer 1 < layer 2 < ...) - [ ] Verify existing RAPTOR deletion (`delete by raptor_kwd`) still works - [ ] Verify Infinity backend accepts the new field Fixes #7488 Related: #4104, #11191, #10951 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: yuch85 Co-authored-by: Wang Qi --- conf/infinity_mapping.json | 3 ++- rag/raptor.py | 4 ++-- rag/svr/task_executor.py | 15 +++++++++++++-- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/conf/infinity_mapping.json b/conf/infinity_mapping.json index 77d26dd960..5f7ed80f26 100644 --- a/conf/infinity_mapping.json +++ b/conf/infinity_mapping.json @@ -38,5 +38,6 @@ "removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, "doc_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, "toc_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, - "raptor_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"} + "raptor_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, + "raptor_layer_int": {"type": "integer", "default": 0} } diff --git a/rag/raptor.py b/rag/raptor.py index 5d952dc428..e4017319b5 100644 --- a/rag/raptor.py +++ b/rag/raptor.py @@ -111,7 +111,7 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval: async def __call__(self, chunks, random_state, callback=None, task_id: str = ""): if len(chunks) <= 1: - return [] + return [], [] chunks = [(s, a) for s, a in chunks if s and a is not None and len(a) > 0] layers = [(0, len(chunks))] start, end = 0, len(chunks) @@ -212,4 +212,4 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval: start = end end = len(chunks) - return chunks + return chunks, layers diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index c81555c76e..5f8305176c 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -843,7 +843,7 @@ async def run_raptor_for_kb(row, 
kb_parser_config, chat_mdl, embd_mdl, vector_si max_errors=max_errors, ) original_length = len(chunks) - chunks = await raptor(chunks, kb_parser_config["raptor"]["random_seed"], callback, row["id"]) + chunks, layers = await raptor(chunks, kb_parser_config["raptor"]["random_seed"], callback, row["id"]) effective_doc_name = row["name"] if did == fake_doc_id else doc_name_by_id.get(did, row["name"]) doc = { "doc_id": did, @@ -855,7 +855,17 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si if row["pagerank"]: doc[PAGERANK_FLD] = int(row["pagerank"]) - for content, vctr in chunks[original_length:]: + # Build index→layer mapping from RAPTOR layer boundaries. + # layers is [(start, end), ...] where layer 0 is the original chunks + # and layer 1+ are summary layers. We skip layer 0 (original chunks). + chunk_layer = {} + for layer_idx, (layer_start, layer_end) in enumerate(layers): + if layer_idx == 0: + continue # layer 0 = original input chunks, not summaries + for ci in range(layer_start, layer_end): + chunk_layer[ci] = layer_idx + + for idx, (content, vctr) in enumerate(chunks[original_length:], start=original_length): d = copy.deepcopy(doc) d["id"] = xxhash.xxh64((content + str(fake_doc_id)).encode("utf-8")).hexdigest() d["create_time"] = str(datetime.now()).replace("T", " ")[:19] @@ -864,6 +874,7 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si d["content_with_weight"] = content d["content_ltks"] = rag_tokenizer.tokenize(content) d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) + d["raptor_layer_int"] = chunk_layer.get(idx, 1) res.append(d) tk_count += num_tokens_from_string(content)