2024-11-12 14:59:41 +08:00
|
|
|
{
|
|
|
|
|
"id": {"type": "varchar", "default": ""},
|
|
|
|
|
"doc_id": {"type": "varchar", "default": ""},
|
2026-02-02 13:22:29 +08:00
|
|
|
"kb_id": {"type": "varchar", "default": "", "index_type": {"type": "secondary", "cardinality": "low"}},
|
2025-12-09 09:34:01 +08:00
|
|
|
"mom_id": {"type": "varchar", "default": ""},
|
2026-03-27 13:06:18 +08:00
|
|
|
"mom": {"type": "varchar", "default": ""},
|
2024-11-12 14:59:41 +08:00
|
|
|
"create_time": {"type": "varchar", "default": ""},
|
|
|
|
|
"create_timestamp_flt": {"type": "float", "default": 0.0},
|
|
|
|
|
"img_id": {"type": "varchar", "default": ""},
|
2025-11-26 11:06:37 +08:00
|
|
|
"docnm": {"type": "varchar", "default": "", "analyzer": ["rag-coarse", "rag-fine"], "comment": "docnm_kwd, title_tks, title_sm_tks"},
|
2025-03-26 15:34:42 +08:00
|
|
|
"name_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
2025-03-05 09:35:40 +08:00
|
|
|
"tag_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
2026-01-16 20:03:52 +08:00
|
|
|
"important_kwd_empty_count": {"type": "integer", "default": 0},
|
2025-11-26 11:06:37 +08:00
|
|
|
"important_keywords": {"type": "varchar", "default": "", "analyzer": ["rag-coarse", "rag-fine"], "comment": "important_kwd, important_tks"},
|
|
|
|
|
"questions": {"type": "varchar", "default": "", "analyzer": ["rag-coarse", "rag-fine"], "comment": "question_kwd, question_tks"},
|
|
|
|
|
"content": {"type": "varchar", "default": "", "analyzer": ["rag-coarse", "rag-fine"], "comment": "content_with_weight, content_ltks, content_sm_ltks"},
|
|
|
|
|
"authors": {"type": "varchar", "default": "", "analyzer": ["rag-coarse", "rag-fine"], "comment": "authors_tks, authors_sm_tks"},
|
2024-12-10 16:32:58 +08:00
|
|
|
"page_num_int": {"type": "varchar", "default": ""},
|
|
|
|
|
"top_int": {"type": "varchar", "default": ""},
|
|
|
|
|
"position_int": {"type": "varchar", "default": ""},
|
2024-11-12 14:59:41 +08:00
|
|
|
"weight_int": {"type": "integer", "default": 0},
|
|
|
|
|
"weight_flt": {"type": "float", "default": 0.0},
|
2026-04-03 19:26:45 +08:00
|
|
|
"chunk_order_int": {"type": "integer", "default": 0},
|
2024-11-12 14:59:41 +08:00
|
|
|
"rank_int": {"type": "integer", "default": 0},
|
2025-01-22 19:43:14 +08:00
|
|
|
"rank_flt": {"type": "float", "default": 0.0},
|
2026-02-02 13:22:29 +08:00
|
|
|
"available_int": {"type": "integer", "default": 1, "index_type": {"type": "secondary", "cardinality": "low"}},
|
2025-03-28 22:05:40 +08:00
|
|
|
"knowledge_graph_kwd": {"type": "varchar", "default": ""},
|
2025-03-05 09:35:40 +08:00
|
|
|
"entities_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
2025-01-09 17:07:21 +08:00
|
|
|
"pagerank_fea": {"type": "integer", "default": 0},
|
2025-07-29 09:14:23 +08:00
|
|
|
"tag_feas": {"type": "varchar", "default": "", "analyzer": "rankfeatures"},
|
2025-03-26 15:34:42 +08:00
|
|
|
"from_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
|
|
|
|
"to_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
|
|
|
|
"entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
|
|
|
|
"entity_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
|
|
|
|
"source_id": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
2025-01-22 19:43:14 +08:00
|
|
|
"n_hop_with_weight": {"type": "varchar", "default": ""},
|
2025-12-23 09:35:52 +08:00
|
|
|
"mom_with_weight": {"type": "varchar", "default": ""},
|
2025-05-13 14:30:36 +08:00
|
|
|
"removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
2025-10-16 12:47:24 +08:00
|
|
|
"doc_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
2025-11-10 19:02:25 +08:00
|
|
|
"toc_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
feat: persist RAPTOR layer metadata on summary chunks (#13286)
## Summary
RAPTOR's recursive clustering builds a `layers` list tracking
`(start_idx, end_idx)` boundaries per level, but currently discards this
information — only the flat `chunks` list is returned. This makes it
impossible to distinguish leaf-level summaries from top-level ones.
This PR:
- Returns `(chunks, layers)` tuple from `raptor.py`'s `__call__`
- Annotates each RAPTOR summary chunk with `raptor_layer_int` (1 = first
summary level, 2 = summary-of-summaries, etc.)
- Adds `raptor_layer_int` to `infinity_mapping.json` (Elasticsearch
handles it via existing `*_int` dynamic template)
### Why this matters
Downstream features need to know which RAPTOR layer a summary belongs
to:
- **Retrieving the top-level document summary** for entity extraction,
search snippets, or document comparison
- **Filtering by abstraction level** — users may want only high-level
summaries or only leaf-level cluster summaries
- **RAPTOR recall quality** — #10951 reports summaries not being
recalled for definition queries; layer metadata enables targeted
retrieval
### Changes
| File | Change | LOC |
|------|--------|-----|
| `rag/raptor.py` | Return `(chunks, layers)` tuple | ~3 |
| `rag/svr/task_executor.py` | Build `chunk_layer` mapping, set `raptor_layer_int` | ~12 |
| `conf/infinity_mapping.json` | Add `raptor_layer_int` integer field | ~1 |
### Backward compatibility
- **Additive only** — no existing fields or behavior changed
- Existing RAPTOR chunks continue to work (they'll have
`raptor_layer_int = 0` by default)
- New RAPTOR chunks get layer metadata automatically
## Test plan
- [ ] Parse a document with RAPTOR enabled, verify `raptor_layer_int` is
set on indexed chunks
- [ ] Verify `raptor_layer_int` values increase with abstraction level
(layer 1 < layer 2 < ...)
- [ ] Verify existing RAPTOR deletion (`delete by raptor_kwd`) still
works
- [ ] Verify Infinity backend accepts the new field
Fixes #7488
Related: #4104, #11191, #10951
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-authored-by: yuch85 <yuch85.1@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
2026-04-27 10:20:46 +08:00
|
|
|
"raptor_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
|
2026-05-19 15:10:03 +05:30
|
|
|
"raptor_layer_int": {"type": "integer", "default": 0},
|
|
|
|
|
"extra": {"type": "varchar", "default": ""}
|
2024-11-12 14:59:41 +08:00
|
|
|
}
|