feat: persist RAPTOR layer metadata on summary chunks (#13286)

## Summary

RAPTOR's recursive clustering builds a `layers` list tracking
`(start_idx, end_idx)` boundaries per level, but currently discards this
information — only the flat `chunks` list is returned. This makes it
impossible to distinguish leaf-level summaries from top-level ones.

This PR:
- Returns a `(chunks, layers)` tuple from `__call__` in `rag/raptor.py`
- Annotates each RAPTOR summary chunk with `raptor_layer_int` (1 = first
summary level, 2 = summary-of-summaries, etc.)
- Adds `raptor_layer_int` to `conf/infinity_mapping.json` (Elasticsearch
handles it via the existing `*_int` dynamic template)

### Why this matters

Downstream features need to know which RAPTOR layer a summary belongs
to:
- **Retrieving the top-level document summary** for entity extraction,
search snippets, or document comparison
- **Filtering by abstraction level** — users may want only high-level
summaries or only leaf-level cluster summaries
- **RAPTOR recall quality** — #10951 reports summaries not being
recalled for definition queries; layer metadata enables targeted
retrieval

### Changes

| File | Change | LOC |
|------|--------|-----|
| `rag/raptor.py` | Return `(chunks, layers)` tuple | ~3 |
| `rag/svr/task_executor.py` | Build `chunk_layer` mapping, set `raptor_layer_int` | ~12 |
| `conf/infinity_mapping.json` | Add `raptor_layer_int` integer field | ~1 |

### Backward compatibility

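- **Additive at the storage level** — no existing index fields are changed or removed. The one interface change is that `__call__` in `rag/raptor.py` now returns a `(chunks, layers)` tuple instead of a bare list (and `[], []` instead of `[]` for inputs of at most one chunk); the caller in `rag/svr/task_executor.py` is updated to unpack it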
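- Existing RAPTOR chunks continue to work (on Infinity they'll have
`raptor_layer_int = 0` from the column default; on Elasticsearch the field is presumably absent on already-indexed chunks until they are re-indexed — to be confirmed)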
- New RAPTOR chunks get layer metadata automatically

## Test plan

- [ ] Parse a document with RAPTOR enabled, verify `raptor_layer_int` is
set on indexed chunks
- [ ] Verify `raptor_layer_int` values increase with abstraction level
(layer 1 < layer 2 < ...)
- [ ] Verify existing RAPTOR deletion (`delete by raptor_kwd`) still
works
- [ ] Verify Infinity backend accepts the new field

Fixes #7488
Related: #4104, #11191, #10951

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: yuch85 <yuch85.1@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
This commit is contained in:
yuch85
2026-04-27 10:20:46 +08:00
committed by GitHub
parent a9e5724b46
commit 3ad3241ae0
3 changed files with 17 additions and 5 deletions

View File

@@ -38,5 +38,6 @@
"removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
"doc_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
"toc_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
"raptor_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}
"raptor_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
"raptor_layer_int": {"type": "integer", "default": 0}
}

View File

@@ -111,7 +111,7 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
async def __call__(self, chunks, random_state, callback=None, task_id: str = ""):
if len(chunks) <= 1:
return []
return [], []
chunks = [(s, a) for s, a in chunks if s and a is not None and len(a) > 0]
layers = [(0, len(chunks))]
start, end = 0, len(chunks)
@@ -212,4 +212,4 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
start = end
end = len(chunks)
return chunks
return chunks, layers

View File

@@ -843,7 +843,7 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
max_errors=max_errors,
)
original_length = len(chunks)
chunks = await raptor(chunks, kb_parser_config["raptor"]["random_seed"], callback, row["id"])
chunks, layers = await raptor(chunks, kb_parser_config["raptor"]["random_seed"], callback, row["id"])
effective_doc_name = row["name"] if did == fake_doc_id else doc_name_by_id.get(did, row["name"])
doc = {
"doc_id": did,
@@ -855,7 +855,17 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
if row["pagerank"]:
doc[PAGERANK_FLD] = int(row["pagerank"])
for content, vctr in chunks[original_length:]:
# Build index→layer mapping from RAPTOR layer boundaries.
# layers is [(start, end), ...] where layer 0 is the original chunks
# and layer 1+ are summary layers. We skip layer 0 (original chunks).
chunk_layer = {}
for layer_idx, (layer_start, layer_end) in enumerate(layers):
if layer_idx == 0:
continue # layer 0 = original input chunks, not summaries
for ci in range(layer_start, layer_end):
chunk_layer[ci] = layer_idx
for idx, (content, vctr) in enumerate(chunks[original_length:], start=original_length):
d = copy.deepcopy(doc)
d["id"] = xxhash.xxh64((content + str(fake_doc_id)).encode("utf-8")).hexdigest()
d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
@@ -864,6 +874,7 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
d["content_with_weight"] = content
d["content_ltks"] = rag_tokenizer.tokenize(content)
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
d["raptor_layer_int"] = chunk_layer.get(idx, 1)
res.append(d)
tk_count += num_tokens_from_string(content)