diff --git a/api/db/init_data/compilation_templates/session_essence.yaml b/api/db/init_data/compilation_templates/session_essence.yaml new file mode 100644 index 0000000000..dfe7bb6888 --- /dev/null +++ b/api/db/init_data/compilation_templates/session_essence.yaml @@ -0,0 +1,50 @@ +kind: session_essence +display_name: Session Essence — Cross-source entity synopses from conversations +config: + kind: knowledge_graph + entity: + description: >- + You are a robust entity and fact extractor for conversational data. + fields: + - type: person + description: A person mentioned or participating in the conversation. + rule: | + - Full name preferred (e.g., "Alice Zhang", not "A. Zhang"). + - Max length: 60 characters. + - type: org + description: Organization, company, team, or department. + rule: | + - Use the official name when possible (e.g., "Acme Corp"). + - Max length: 80 characters. + - type: topic + description: Discussed topic, theme, or subject area. + rule: | + - Use the phrasing from the conversation (e.g., "API migration"). + - Max length: 80 characters. + - type: fact + description: A verifiable factual statement extracted from the conversation. + rule: | + - Must be a single proposition (subject + predicate + object). + - Must be directly supported by the source text. + - Do not include opinions, speculation, or hypotheticals. + fact_fields: + - subject + - predicate + - object + - polarity + - confidence + relation: + description: >- + You are an expert in extracting semantic relations from conversations. + fields: + - type: mentions + description: One entity references or brings up another. + rule: | + - Direction from referrer to referred: (A mentions B). + synthesis: + enabled: true + compile_kwd: "essence" + example: | + Write a 3-5 sentence executive summary about the entity. + Synthesize what multiple sources collectively say. + Output ONLY the paragraph. No headers, no JSON, no preamble. 
diff --git a/api/db/init_data/compilation_templates/session_graph.yaml b/api/db/init_data/compilation_templates/session_graph.yaml new file mode 100644 index 0000000000..3bbdb1efb5 --- /dev/null +++ b/api/db/init_data/compilation_templates/session_graph.yaml @@ -0,0 +1,67 @@ +kind: session_graph +display_name: Session Graph — Knowledge graph from conversations +config: + kind: knowledge_graph + entity: + description: >- + You are a robust entity and fact extractor for conversational data. + fields: + - type: person + description: A person mentioned or participating in the conversation. + rule: | + - Full name preferred (e.g., "Alice Zhang", not "A. Zhang"). + - Max length: 60 characters. + - type: org + description: Organization, company, team, or department. + rule: | + - Use the official name when possible (e.g., "Acme Corp"). + - Max length: 80 characters. + - type: topic + description: Discussed topic, theme, or subject area. + rule: | + - Use the phrasing from the conversation (e.g., "API migration"). + - Max length: 80 characters. + - type: fact + description: A verifiable factual statement extracted from the conversation. + rule: | + - Must be a single proposition (subject + predicate + object). + - Must be directly supported by the source text. + - Do not include opinions, speculation, or hypotheticals. + fact_fields: + - subject + - predicate + - object + - polarity + - confidence + relation: + description: >- + You are an expert in extracting semantic relations from conversations. + fields: + - type: mentions + description: One entity references or brings up another. + rule: | + - Direction from referrer to referred: (A mentions B). + - type: decides + description: A decision or resolution made during the conversation. + rule: | + - Direction from decision-maker to decision: (A decides B). + - type: assigns + description: Task or responsibility assigned to someone. + rule: | + - Direction from assigner to assignee: (A assigns B to C). 
+ - type: references + description: Reference to a prior topic, document, or event. + rule: | + - Direction from current to referenced: (A references B). + - type: other + description: Any meaningful relation not covered by the above types. + rule: | + - Provide an explicit label in a "relation_label" field. + global_rules: '' + synthesis: + enabled: true + compile_kwd: "artifact_page" + example: | + - Build a wiki article covering the entity's role in the conversations. + - Include a summary paragraph and sections for key decisions and actions. + - Use [[entity_name]] for cross-references to related entities. diff --git a/rag/svr/task_executor_refactor/chunk_post_processor.py b/rag/svr/task_executor_refactor/chunk_post_processor.py index 6a809c8635..c071fc8ee8 100644 --- a/rag/svr/task_executor_refactor/chunk_post_processor.py +++ b/rag/svr/task_executor_refactor/chunk_post_processor.py @@ -913,6 +913,11 @@ async def run_document_structure_compile(handler, embedding_model: LLMBundle) -> ``handler._load_chunks_for_doc``) and fans each batch out to every configured non-artifact template, flushing accumulators through ``merge_compiled_structures`` at :data:`DOC_STRUCTURE_MERGE_MAX_DOCS`. + + After extract+merge, for each template with ``synthesis.enabled``, + runs ``wiki_plan_from_reduction`` + ``wiki_refine_from_plan`` to + generate synthesis output (wiki pages, essence paragraphs, etc.). + ``compile_kwd`` and the REFINE prompt (``example``) are read from the template's ``synthesis`` config. 
""" ctx = handler._task_context template_ids = _parser_config_compilation_template_ids(ctx.parser_config, ctx.tenant_id) @@ -1079,7 +1084,7 @@ async def run_document_structure_compile(handler, embedding_model: LLMBundle) -> progress_cb(msg=f" merge flush ({len(accumulators[template_id])} docs) for template ({idx + 1}/{total})") await _flush(template_id) - for idx, (template_id, _parser_cfg) in enumerate(active_templates): + for idx, (template_id, parser_cfg) in enumerate(active_templates): if ctx.has_canceled_func(ctx.id): raise TaskCanceledException(f"Task {ctx.id} was cancelled during document knowledge compilation") await _flush(template_id) @@ -1087,6 +1092,83 @@ async def run_document_structure_compile(handler, embedding_model: LLMBundle) -> ctx.recording_context.record(f"document_structure_compile:{template_id}", agg) progress_cb(msg=f"Document knowledge compilation done ({idx + 1}/{total}): {agg}") + # ── Synthesis phase ────────────────────────────────────────────── + # If the template has synthesis.enabled, run wiki PLAN+REFINE + # to generate output (wiki page, essence paragraph, etc.). 
+ synthesis_cfg = (parser_cfg or {}).get("synthesis") or {} + if synthesis_cfg.get("enabled"): + example = synthesis_cfg.get("example") + compile_kwd = synthesis_cfg.get("compile_kwd", "artifact_page") + plan_cfg = synthesis_cfg.get("plan") or {} + + # Reserved for future wiki_plan_from_reduction extension: + # entity_type_filter, mention_count_threshold, top_n + if plan_cfg: + logging.debug( + "synthesis: template %s plan config %r reserved for future use", + template_id, plan_cfg, + ) + + if ctx.has_canceled_func(ctx.id): + raise TaskCanceledException( + f"Task {ctx.id} was cancelled before synthesis PLAN" + ) + + if not example: + logging.warning( + "synthesis: template %s has synthesis.enabled but no example; skipping", + template_id, + ) + else: + try: + from rag.advanced_rag.knowlege_compile.wiki import ( + wiki_plan_from_reduction, + wiki_refine_from_plan, + ) + + progress_cb( + msg=f"Synthesis PLAN for template {template_id} (kind={compile_kwd}) ..." + ) + plan = await wiki_plan_from_reduction( + chat_mdl=chat_mdl_by_tid[template_id], + embd_mdl=embedding_model, + tenant_id=ctx.tenant_id, + kb_id=ctx.kb_id, + callback=progress_cb, + ) + if ctx.has_canceled_func(ctx.id): + raise TaskCanceledException( + f"Task {ctx.id} was cancelled after synthesis PLAN" + ) + + if not plan or not plan.get("pages"): + progress_cb( + msg=f"Synthesis: no pages planned for template {template_id}." + ) + else: + progress_cb( + msg=f"Synthesis REFINE for template {template_id} ({len(plan['pages'])} page(s)) ..." + ) + pages = await wiki_refine_from_plan( + chat_mdl=chat_mdl_by_tid[template_id], + embd_mdl=embedding_model, + tenant_id=ctx.tenant_id, + kb_id=ctx.kb_id, + callback=progress_cb, + example=example, + ) + # Overwrite compile_kwd on every output page so the + # synthesis type is tracked correctly in ES. + for p in pages or []: + p["compile_kwd"] = compile_kwd + progress_cb( + msg=f"Synthesis done: {len(pages or [])} {compile_kwd} page(s) written." 
+ ) + except Exception: + logging.exception( + "synthesis: failed for template %s", template_id, + ) + async def run_document_post_chunking_if_last( handler,