mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
feat: persist RAPTOR layer metadata on summary chunks (#13286)
## Summary

RAPTOR's recursive clustering builds a `layers` list tracking `(start_idx, end_idx)` boundaries per level, but currently discards this information — only the flat `chunks` list is returned. This makes it impossible to distinguish leaf-level summaries from top-level ones.

This PR:

- Returns a `(chunks, layers)` tuple from `raptor.py`'s `__call__`
- Annotates each RAPTOR summary chunk with `raptor_layer_int` (1 = first summary level, 2 = summary-of-summaries, etc.)
- Adds `raptor_layer_int` to `infinity_mapping.json` (Elasticsearch handles it via existing `*_int` dynamic template)

### Why this matters

Downstream features need to know which RAPTOR layer a summary belongs to:

- **Retrieving the top-level document summary** for entity extraction, search snippets, or document comparison
- **Filtering by abstraction level** — users may want only high-level summaries or only leaf-level cluster summaries
- **RAPTOR recall quality** — #10951 reports summaries not being recalled for definition queries; layer metadata enables targeted retrieval

### Changes

| File | Change | LOC |
|------|--------|-----|
| `rag/raptor.py` | Return `(chunks, layers)` tuple | ~3 |
| `rag/svr/task_executor.py` | Build `chunk_layer` mapping, set `raptor_layer_int` | ~12 |
| `conf/infinity_mapping.json` | Add `raptor_layer_int` integer field | ~1 |

### Backward compatibility

- **Additive only** — no existing fields or behavior changed
- Existing RAPTOR chunks continue to work (they'll have `raptor_layer_int = 0` by default)
- New RAPTOR chunks get layer metadata automatically

## Test plan

- [ ] Parse a document with RAPTOR enabled, verify `raptor_layer_int` is set on indexed chunks
- [ ] Verify `raptor_layer_int` values increase with abstraction level (layer 1 < layer 2 < ...)
- [ ] Verify existing RAPTOR deletion (`delete by raptor_kwd`) still works
- [ ] Verify Infinity backend accepts the new field

Fixes #7488

Related: #4104, #11191, #10951

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: yuch85 <yuch85.1@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
This commit is contained in:
@@ -38,5 +38,6 @@
|
||||
"removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
||||
"doc_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
||||
"toc_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
||||
"raptor_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}
|
||||
"raptor_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
||||
"raptor_layer_int": {"type": "integer", "default": 0}
|
||||
}
|
||||
|
||||
@@ -111,7 +111,7 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
|
||||
|
||||
async def __call__(self, chunks, random_state, callback=None, task_id: str = ""):
|
||||
if len(chunks) <= 1:
|
||||
return []
|
||||
return [], []
|
||||
chunks = [(s, a) for s, a in chunks if s and a is not None and len(a) > 0]
|
||||
layers = [(0, len(chunks))]
|
||||
start, end = 0, len(chunks)
|
||||
@@ -212,4 +212,4 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
|
||||
start = end
|
||||
end = len(chunks)
|
||||
|
||||
return chunks
|
||||
return chunks, layers
|
||||
|
||||
@@ -843,7 +843,7 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
|
||||
max_errors=max_errors,
|
||||
)
|
||||
original_length = len(chunks)
|
||||
chunks = await raptor(chunks, kb_parser_config["raptor"]["random_seed"], callback, row["id"])
|
||||
chunks, layers = await raptor(chunks, kb_parser_config["raptor"]["random_seed"], callback, row["id"])
|
||||
effective_doc_name = row["name"] if did == fake_doc_id else doc_name_by_id.get(did, row["name"])
|
||||
doc = {
|
||||
"doc_id": did,
|
||||
@@ -855,7 +855,17 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
|
||||
if row["pagerank"]:
|
||||
doc[PAGERANK_FLD] = int(row["pagerank"])
|
||||
|
||||
for content, vctr in chunks[original_length:]:
|
||||
# Build index→layer mapping from RAPTOR layer boundaries.
|
||||
# layers is [(start, end), ...] where layer 0 is the original chunks
|
||||
# and layer 1+ are summary layers. We skip layer 0 (original chunks).
|
||||
chunk_layer = {}
|
||||
for layer_idx, (layer_start, layer_end) in enumerate(layers):
|
||||
if layer_idx == 0:
|
||||
continue # layer 0 = original input chunks, not summaries
|
||||
for ci in range(layer_start, layer_end):
|
||||
chunk_layer[ci] = layer_idx
|
||||
|
||||
for idx, (content, vctr) in enumerate(chunks[original_length:], start=original_length):
|
||||
d = copy.deepcopy(doc)
|
||||
d["id"] = xxhash.xxh64((content + str(fake_doc_id)).encode("utf-8")).hexdigest()
|
||||
d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
|
||||
@@ -864,6 +874,7 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
|
||||
d["content_with_weight"] = content
|
||||
d["content_ltks"] = rag_tokenizer.tokenize(content)
|
||||
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
||||
d["raptor_layer_int"] = chunk_layer.get(idx, 1)
|
||||
res.append(d)
|
||||
tk_count += num_tokens_from_string(content)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user