diff --git a/conf/infinity_mapping.json b/conf/infinity_mapping.json index 77d26dd960..5f7ed80f26 100644 --- a/conf/infinity_mapping.json +++ b/conf/infinity_mapping.json @@ -38,5 +38,6 @@ "removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, "doc_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, "toc_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, - "raptor_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"} + "raptor_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, + "raptor_layer_int": {"type": "integer", "default": 0} } diff --git a/rag/raptor.py b/rag/raptor.py index 5d952dc428..e4017319b5 100644 --- a/rag/raptor.py +++ b/rag/raptor.py @@ -111,7 +111,7 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval: async def __call__(self, chunks, random_state, callback=None, task_id: str = ""): if len(chunks) <= 1: - return [] + return [], [] chunks = [(s, a) for s, a in chunks if s and a is not None and len(a) > 0] layers = [(0, len(chunks))] start, end = 0, len(chunks) @@ -212,4 +212,4 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval: start = end end = len(chunks) - return chunks + return chunks, layers diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index c81555c76e..5f8305176c 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -843,7 +843,7 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si max_errors=max_errors, ) original_length = len(chunks) - chunks = await raptor(chunks, kb_parser_config["raptor"]["random_seed"], callback, row["id"]) + chunks, layers = await raptor(chunks, kb_parser_config["raptor"]["random_seed"], callback, row["id"]) effective_doc_name = row["name"] if did == fake_doc_id else doc_name_by_id.get(did, row["name"]) doc = { "doc_id": did, @@ -855,7 +855,17 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si if row["pagerank"]: doc[PAGERANK_FLD] = int(row["pagerank"]) - for content, vctr in chunks[original_length:]: + # Build index→layer mapping from RAPTOR layer boundaries. + # layers is [(start, end), ...] where layer 0 is the original chunks + # and layer 1+ are summary layers. We skip layer 0 (original chunks). + chunk_layer = {} + for layer_idx, (layer_start, layer_end) in enumerate(layers): + if layer_idx == 0: + continue # layer 0 = original input chunks, not summaries + for ci in range(layer_start, layer_end): + chunk_layer[ci] = layer_idx + + for idx, (content, vctr) in enumerate(chunks[original_length:], start=original_length): d = copy.deepcopy(doc) d["id"] = xxhash.xxh64((content + str(fake_doc_id)).encode("utf-8")).hexdigest() d["create_time"] = str(datetime.now()).replace("T", " ")[:19] @@ -864,6 +874,7 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si d["content_with_weight"] = content d["content_ltks"] = rag_tokenizer.tokenize(content) d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) + d["raptor_layer_int"] = chunk_layer.get(idx, 1) res.append(d) tk_count += num_tokens_from_string(content)