ragflow/api/apps/services/dataset_api_service.py

#
#  Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
import logging
import json
import os
import re

from api.db.joint_services.tenant_model_service import get_model_config_from_provider_instance
from common.constants import PAGERANK_FLD
from common import settings
from api.db.db_models import File
from api.db.services.document_service import DocumentService, queue_raptor_o_graphrag_tasks
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.connector_service import Connector2KbService
from api.db.services.task_service import GRAPH_RAPTOR_FAKE_DOC_ID, TaskService
from api.db.services.user_service import TenantService, UserService, UserTenantService
from common.constants import FileSource, StatusEnum
from api.utils.api_utils import deep_merge, get_parser_config, remap_dictionary_keys, verify_embedding_availability
from common.misc_utils import thread_pool_exec

_VALID_INDEX_TYPES = {"graph", "raptor", "mindmap", "artifact", "skill"}

_INDEX_TYPE_TO_TASK_TYPE = {
    "graph": "graphrag",
    "raptor": "raptor",
    "mindmap": "mindmap",
    "artifact": "artifact",
    "skill": "skill",
}

_INDEX_TYPE_TO_TASK_ID_FIELD = {
    "graph": "graphrag_task_id",
    "raptor": "raptor_task_id",
    "mindmap": "mindmap_task_id",
    "artifact": "artifact_task_id",
    "skill": "skill_task_id",
}

_INDEX_TYPE_TO_DISPLAY_NAME = {
    "graph": "Graph",
    "raptor": "RAPTOR",
    "mindmap": "Mindmap",
    "artifact": "Artifact",
    "skill": "Skill",
}


async def create_dataset(tenant_id: str, req: dict):
    """
    Create a new dataset.

    :param tenant_id: tenant ID
    :param req: dataset creation request
    :return: (success, result) or (success, error_message)
    """
    # Extract ext field for additional parameters
    ext_fields = req.pop("ext", {})

    # Map auto_metadata_config (if provided) into parser_config structure
    auto_meta = req.pop("auto_metadata_config", {})
    if auto_meta:
        parser_cfg = req.get("parser_config") or {}
        fields = []
        for f in auto_meta.get("fields", []):
            fields.append(
                {
                    "name": f.get("name", ""),
                    "type": f.get("type", ""),
                    "description": f.get("description"),
                    "examples": f.get("examples"),
                    "restrict_values": f.get("restrict_values", False),
                }
            )
        parser_cfg["metadata"] = fields
        parser_cfg["enable_metadata"] = auto_meta.get("enabled", True)
        req["parser_config"] = parser_cfg
    req.update(ext_fields)

    e, create_dict = KnowledgebaseService.create_with_name(name=req.pop("name", None), tenant_id=tenant_id, parser_id=req.pop("parser_id", None), **req)

    if not e:
        return False, create_dict

    # Insert embedding model(embd id)
    ok, t = TenantService.get_by_id(tenant_id)
    if not ok:
        return False, "Tenant not found"
    if not create_dict.get("embd_id"):
        create_dict["embd_id"] = t.embd_id
    else:
        ok, err = verify_embedding_availability(create_dict["embd_id"], tenant_id)
        if not ok:
            return False, err

    if not KnowledgebaseService.save(**create_dict):
        return False, "Failed to save dataset"
    ok, k = KnowledgebaseService.get_by_id(create_dict["id"])
    if not ok:
        return False, "Dataset created failed"
    response_data = remap_dictionary_keys(k.to_dict())
    return True, response_data


async def delete_datasets(tenant_id: str, ids: list = None, delete_all: bool = False):
    """
    Delete datasets.

    :param tenant_id: tenant ID
    :param ids: list of dataset IDs
    :param delete_all: whether to delete all datasets of the tenant (if ids is not provided)
    :return: (success, result) or (success, error_message)
    """
    kb_id_instance_pairs = []
    if not ids:
        if not delete_all:
            return True, {"success_count": 0}
        else:
            ids = [kb.id for kb in KnowledgebaseService.query(tenant_id=tenant_id)]

    error_kb_ids = []
    for kb_id in ids:
        kb = KnowledgebaseService.get_or_none(id=kb_id, tenant_id=tenant_id)
        if kb is None:
            error_kb_ids.append(kb_id)
            continue
        kb_id_instance_pairs.append((kb_id, kb))
    if len(error_kb_ids) > 0:
        return False, f"""User '{tenant_id}' lacks permission for datasets: '{", ".join(error_kb_ids)}'"""

    errors = []
    success_count = 0
    for kb_id, kb in kb_id_instance_pairs:
        for doc in DocumentService.query(kb_id=kb_id):
            if not DocumentService.remove_document(doc, tenant_id):
                errors.append(f"Remove document '{doc.id}' error for dataset '{kb_id}'")
                continue
            f2d = File2DocumentService.get_by_document_id(doc.id)
            if f2d:
                FileService.filter_delete(
                    [
                        File.source_type == FileSource.KNOWLEDGEBASE,
                        File.id == f2d[0].file_id,
                    ]
                )
            else:
                # Normal uploads create a File2Document row via FileService.add_file_from_kb.
                # A missing row usually means stale/partial data (e.g. link removed earlier,
                # failed post-insert file linkage, or legacy rows). Deletion still proceeds.
                logging.warning(
                    "delete_datasets: document %s in dataset %s has no File2Document row; skipping linked file delete",
                    doc.id,
                    kb_id,
                )
            File2DocumentService.delete_by_document_id(doc.id)
        FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kb.name])

        # Drop index for this dataset
        try:
            from rag.nlp import search

            idxnm = search.index_name(kb.tenant_id)
            settings.docStoreConn.delete_idx(idxnm, kb_id)
        except Exception as e:
            errors.append(f"Failed to drop index for dataset {kb_id}: {e}")

        if not KnowledgebaseService.delete_by_id(kb_id):
            errors.append(f"Delete dataset error for {kb_id}")
            continue
        success_count += 1

    if not errors:
        return True, {"success_count": success_count}

    error_message = f"Successfully deleted {success_count} datasets, {len(errors)} failed. Details: {'; '.join(errors)[:128]}..."
    if success_count == 0:
        return False, error_message

    return True, {"success_count": success_count, "errors": errors[:5]}


def get_dataset(dataset_id: str, tenant_id: str):
    """
    Get a single dataset.

    :param dataset_id: dataset ID
    :param tenant_id: tenant ID
    :return: (success, result) or (success, error_message)
    """
    if not dataset_id:
        return False, 'Lack of "Dataset ID"'

    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, f"User '{tenant_id}' lacks permission for dataset '{dataset_id}'"

    ok, kb = KnowledgebaseService.get_by_id(dataset_id)
    if not ok:
        return False, "Invalid Dataset ID"

    response_data = remap_dictionary_keys(kb.to_dict())
    response_data["size"] = DocumentService.get_total_size_by_kb_id(dataset_id)
    response_data["connectors"] = list(Connector2KbService.list_connectors(dataset_id))
    return True, response_data


def get_ingestion_summary(dataset_id: str, tenant_id: str):
    """
    Get ingestion summary for a dataset.

    :param dataset_id: dataset ID
    :param tenant_id: tenant ID
    :return: (success, result) or (success, error_message)
    """
    if not dataset_id:
        return False, 'Lack of "Dataset ID"'

    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, f"User '{tenant_id}' lacks permission for dataset '{dataset_id}'"

    ok, kb = KnowledgebaseService.get_by_id(dataset_id)
    if not ok:
        return False, "Invalid Dataset ID"

    status = DocumentService.get_parsing_status_by_kb_ids([dataset_id]).get(dataset_id, {})
    return True, {
        "doc_num": kb.doc_num,
        "chunk_num": kb.chunk_num,
        "token_num": kb.token_num,
        "status": status,
    }


async def update_dataset(tenant_id: str, dataset_id: str, req: dict):
    """
    Update a dataset.

    :param tenant_id: tenant ID
    :param dataset_id: dataset ID
    :param req: dataset update request
    :return: (success, result) or (success, error_message)
    """
    if not req:
        return False, "No properties were modified"

    kb = KnowledgebaseService.get_or_none(id=dataset_id, tenant_id=tenant_id)
    if kb is None:
        return False, f"User '{tenant_id}' lacks permission for dataset '{dataset_id}'"

    # Extract ext field for additional parameters
    ext_fields = req.pop("ext", {})

    # Map auto_metadata_config into parser_config if present
    auto_meta = req.pop("auto_metadata_config", {})
    if auto_meta:
        parser_cfg = req.get("parser_config") or {}
        fields = []
        for f in auto_meta.get("fields", []):
            fields.append(
                {
                    "name": f.get("name", ""),
                    "type": f.get("type", ""),
                    "description": f.get("description"),
                    "examples": f.get("examples"),
                    "restrict_values": f.get("restrict_values", False),
                }
            )
        parser_cfg["metadata"] = fields
        parser_cfg["enable_metadata"] = auto_meta.get("enabled", True)
        req["parser_config"] = parser_cfg

    # Merge ext fields with req
    req.update(ext_fields)

    # Extract connectors from request
    connectors = []
    if "connectors" in req:
        connectors = req["connectors"]
        del req["connectors"]

    if req.get("parser_config"):
        # Flatten parent_child config into children_delimiter for the execution layer
        pc = req["parser_config"].get("parent_child", {})
        if pc.get("use_parent_child"):
            req["parser_config"]["children_delimiter"] = pc.get("children_delimiter", "\n")
            req["parser_config"]["enable_children"] = pc.get("use_parent_child", True)
        else:
            req["parser_config"]["children_delimiter"] = ""
            req["parser_config"]["enable_children"] = False
            req["parser_config"]["parent_child"] = {}

        parser_config = req["parser_config"]
        req_ext_fields = parser_config.pop("ext", {})
        parser_config.update(req_ext_fields)
        req["parser_config"] = deep_merge(kb.parser_config, parser_config)

    if (chunk_method := req.get("parser_id")) and chunk_method != kb.parser_id:
        if not req.get("parser_config"):
            req["parser_config"] = get_parser_config(chunk_method, None)
    elif "parser_config" in req and not req["parser_config"]:
        del req["parser_config"]

    if kb.pipeline_id and req.get("parser_id") and not req.get("pipeline_id"):
        # shift to use parser_id, delete old pipeline_id
        req["pipeline_id"] = ""

    if "name" in req and req["name"].lower() != kb.name.lower():
        exists = KnowledgebaseService.get_or_none(name=req["name"], tenant_id=tenant_id, status=StatusEnum.VALID.value)
        if exists:
            return False, f"Dataset name '{req['name']}' already exists"

    if "embd_id" in req:
        if not req["embd_id"]:
            req["embd_id"] = kb.embd_id
        ok, err = verify_embedding_availability(req["embd_id"], tenant_id)
        if not ok:
            return False, err

    if "pagerank" in req and req["pagerank"] != kb.pagerank:
        if os.environ.get("DOC_ENGINE", "elasticsearch") == "infinity":
            return False, "'pagerank' can only be set when doc_engine is elasticsearch"

        if req["pagerank"] > 0:
            from rag.nlp import search

            settings.docStoreConn.update({"kb_id": kb.id}, {PAGERANK_FLD: req["pagerank"]}, search.index_name(kb.tenant_id), kb.id)
        else:
            # Elasticsearch requires PAGERANK_FLD be non-zero!
            from rag.nlp import search

            settings.docStoreConn.update({"exists": PAGERANK_FLD}, {"remove": PAGERANK_FLD}, search.index_name(kb.tenant_id), kb.id)
    if "parse_type" in req:
        del req["parse_type"]

    if not KnowledgebaseService.update_by_id(kb.id, req):
        return False, "Update dataset error.(Database error)"

    ok, k = KnowledgebaseService.get_by_id(kb.id)
    if not ok:
        return False, "Dataset updated failed"

    # Link connectors to the dataset
    errors = Connector2KbService.link_connectors(kb.id, [conn for conn in connectors], tenant_id)
    if errors:
        logging.error("Link KB errors: %s", errors)

    response_data = remap_dictionary_keys(k.to_dict())
    response_data["connectors"] = connectors
    return True, response_data


def list_datasets(tenant_id: str, args: dict):
    """
    List datasets.

    :param tenant_id: tenant ID
    :param args: query arguments
    :return: (success, result) or (success, error_message)
    """
    kb_id = args.get("id")
    name = args.get("name")
    page = int(args.get("page", 1))
    page_size = int(args.get("page_size", 30))
    ext_fields = args.get("ext", {})
    parser_id = ext_fields.get("parser_id")
    keywords = ext_fields.get("keywords", "")
    orderby = args.get("orderby", "create_time")
    desc_arg = args.get("desc", "true")
    if isinstance(desc_arg, str):
        desc = desc_arg.lower() != "false"
    elif isinstance(desc_arg, bool):
        desc = desc_arg
    else:
        # unknown type, default to True
        desc = True

    if kb_id:
        kbs = KnowledgebaseService.get_kb_by_id(kb_id, tenant_id)
        if not kbs:
            return False, f"User '{tenant_id}' lacks permission for dataset '{kb_id}'"
    if name:
        kbs = KnowledgebaseService.get_kb_by_name(name, tenant_id)
        if not kbs:
            return False, f"User '{tenant_id}' lacks permission for dataset '{name}'"
    if ext_fields.get("owner_ids", []):
        tenant_ids = ext_fields["owner_ids"]
    else:
        tenants = TenantService.get_joined_tenants_by_user_id(tenant_id)
        tenant_ids = [m["tenant_id"] for m in tenants]
    kbs, total = KnowledgebaseService.get_list(tenant_ids, tenant_id, page, page_size, orderby, desc, kb_id, name, keywords, parser_id)
    users = UserService.get_by_ids([m["tenant_id"] for m in kbs])
    user_map = {m.id: m.to_dict() for m in users}
    response_data_list = []
    for kb in kbs:
        user_dict = user_map.get(kb["tenant_id"], {})
        kb.update({"nickname": user_dict.get("nickname", ""), "tenant_avatar": user_dict.get("avatar", "")})
        response_data_list.append(remap_dictionary_keys(kb))
    return True, {"data": response_data_list, "total": total}


async def get_knowledge_graph(dataset_id: str, tenant_id: str):
    """
    Get knowledge graph for a dataset.

    :param dataset_id: dataset ID
    :param tenant_id: tenant ID
    :return: (success, result) or (success, error_message)
    """
    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."
    _, kb = KnowledgebaseService.get_by_id(dataset_id)

    req = {"kb_id": [dataset_id], "knowledge_graph_kwd": ["graph"]}

    obj = {"graph": {}, "mind_map": {}}
    from rag.nlp import search

    if not settings.docStoreConn.index_exist(search.index_name(kb.tenant_id), dataset_id):
        return True, obj
    sres = await settings.retriever.search(req, search.index_name(kb.tenant_id), [dataset_id])
    if not len(sres.ids):
        return True, obj

    for id in sres.ids[:1]:
        ty = sres.field[id]["knowledge_graph_kwd"]
        try:
            content_json = json.loads(sres.field[id]["content_with_weight"])
        except Exception:
            continue

        obj[ty] = content_json

    if "nodes" in obj["graph"]:
        obj["graph"]["nodes"] = sorted(obj["graph"]["nodes"], key=lambda x: x.get("pagerank", 0), reverse=True)[:256]
        if "edges" in obj["graph"]:
            node_id_set = {o["id"] for o in obj["graph"]["nodes"]}
            filtered_edges = [o for o in obj["graph"]["edges"] if o["source"] != o["target"] and o["source"] in node_id_set and o["target"] in node_id_set]
            obj["graph"]["edges"] = sorted(filtered_edges, key=lambda x: x.get("weight", 0), reverse=True)[:128]
    return True, obj


def delete_knowledge_graph(dataset_id: str, tenant_id: str):
    """
    Delete knowledge graph for a dataset.

    :param dataset_id: dataset ID
    :param tenant_id: tenant ID
    :return: (success, result) or (success, error_message)
    """
    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."
    _, kb = KnowledgebaseService.get_by_id(dataset_id)
    from rag.nlp import search
    from rag.graphrag.phase_markers import clear_phase_markers

    settings.docStoreConn.delete({"knowledge_graph_kwd": ["graph", "subgraph", "entity", "relation", "community_report"]}, search.index_name(kb.tenant_id), dataset_id)
    # Wiping the graph invalidates any phase-completion markers used to
    # short-circuit resolution / community detection on resume.
    clear_phase_markers(dataset_id)
    KnowledgebaseService.update_by_id(
        kb.id,
        {"graphrag_task_id": "", "graphrag_task_finish_at": None},
    )

    return True, True


def run_index(dataset_id: str, tenant_id: str, index_type: str):
    """
    Run an indexing task (graph/raptor/mindmap) for a dataset.

    :param dataset_id: dataset ID
    :param tenant_id: tenant ID
    :param index_type: one of "graph", "raptor", "mindmap"
    :return: (success, result) or (success, error_message)
    """
    if index_type not in _VALID_INDEX_TYPES:
        return False, f"Invalid index type '{index_type}'. Must be one of {sorted(_VALID_INDEX_TYPES)}"

    if not dataset_id:
        return False, 'Lack of "Dataset ID"'
    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."

    ok, kb = KnowledgebaseService.get_by_id(dataset_id)
    if not ok:
        return False, "Invalid Dataset ID"

    task_type = _INDEX_TYPE_TO_TASK_TYPE[index_type]
    task_id_field = _INDEX_TYPE_TO_TASK_ID_FIELD[index_type]
    display_name = _INDEX_TYPE_TO_DISPLAY_NAME[index_type]

    existing_task_id = getattr(kb, task_id_field, None)
    if existing_task_id:
        ok, task = TaskService.get_by_id(existing_task_id)
        if not ok:
            logging.warning(f"A valid {display_name} task id is expected for Dataset {dataset_id}")

        if task and task.progress not in [-1, 1]:
            return False, f"Task {existing_task_id} in progress with status {task.progress}. A {display_name} Task is already running."

    documents, _ = DocumentService.get_by_kb_id(
        kb_id=dataset_id,
        page_number=0,
        items_per_page=0,
        orderby="create_time",
        desc=False,
        keywords="",
        run_status=[],
        types=[],
        suffix=[],
    )
    if not documents:
        return False, f"No documents in Dataset {dataset_id}"

    sample_document = documents[0]
    document_ids = [document["id"] for document in documents]

    task_id = queue_raptor_o_graphrag_tasks(sample_doc=sample_document, ty=task_type, priority=0, fake_doc_id=GRAPH_RAPTOR_FAKE_DOC_ID, doc_ids=list(document_ids))

    if not KnowledgebaseService.update_by_id(kb.id, {task_id_field: task_id}):
        logging.warning(f"Cannot save {task_id_field} for Dataset {dataset_id}")

    return True, {"task_id": task_id}


def trace_index(dataset_id: str, tenant_id: str, index_type: str):
    """
    Trace an indexing task (graph/raptor/mindmap) for a dataset.

    :param dataset_id: dataset ID
    :param tenant_id: tenant ID
    :param index_type: one of "graph", "raptor", "mindmap"
    :return: (success, result) or (success, error_message)
    """
    if index_type not in _VALID_INDEX_TYPES:
        return False, f"Invalid index type '{index_type}'. Must be one of {sorted(_VALID_INDEX_TYPES)}"

    if not dataset_id:
        return False, 'Lack of "Dataset ID"'

    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."

    ok, kb = KnowledgebaseService.get_by_id(dataset_id)
    if not ok:
        return False, "Invalid Dataset ID"

    task_id_field = _INDEX_TYPE_TO_TASK_ID_FIELD[index_type]
    task_id = getattr(kb, task_id_field, None)
    if not task_id:
        return True, {}

    ok, task = TaskService.get_by_id(task_id)
    if not ok:
        return True, {}

    return True, task.to_dict()


def list_tags(dataset_id: str, tenant_id: str):
    """
    List tags for a dataset.

    :param dataset_id: dataset ID
    :param tenant_id: tenant ID
    :return: (success, result) or (success, error_message)
    """
    if not dataset_id:
        return False, 'Lack of "Dataset ID"'

    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."

    tenants = UserTenantService.get_tenants_by_user_id(tenant_id)
    tags = []
    for tenant in tenants:
        tags += settings.retriever.all_tags(tenant["tenant_id"], [dataset_id])
    return True, tags


def aggregate_tags(dataset_ids: list[str], tenant_id: str):
    """
    Aggregate tags across multiple datasets.

    :param dataset_ids: list of dataset IDs
    :param tenant_id: tenant ID
    :return: (success, result) or (success, error_message)
    """
    if not dataset_ids:
        return False, 'Lack of "dataset_ids"'

    for dataset_id in dataset_ids:
        if not KnowledgebaseService.accessible(dataset_id, tenant_id):
            return False, f"No authorization for dataset '{dataset_id}'"

    dataset_ids_by_tenant = {}
    for dataset_id in dataset_ids:
        ok, kb = KnowledgebaseService.get_by_id(dataset_id)
        if not ok:
            return False, f"Invalid Dataset ID '{dataset_id}'"
        dataset_ids_by_tenant.setdefault(kb.tenant_id, []).append(dataset_id)

    merged = {}
    for kb_tenant_id, kb_ids in dataset_ids_by_tenant.items():
        for tag, count in settings.retriever.all_tags(kb_tenant_id, kb_ids):
            merged[tag] = merged.get(tag, 0) + count

    return True, [{"value": tag, "count": count} for tag, count in merged.items()]


def get_flattened_metadata(dataset_ids: list[str], tenant_id: str):
    """
    Get flattened metadata for datasets.

    :param dataset_ids: list of dataset IDs
    :param tenant_id: tenant ID
    :return: (success, result) or (success, error_message)
    """
    if not dataset_ids:
        return False, 'Lack of "dataset_ids"'

    for dataset_id in dataset_ids:
        if not KnowledgebaseService.accessible(dataset_id, tenant_id):
            return False, f"No authorization for dataset '{dataset_id}'"

    from api.db.services.doc_metadata_service import DocMetadataService

    return True, DocMetadataService.get_flatted_meta_by_kbs(dataset_ids)


def get_auto_metadata(dataset_id: str, tenant_id: str):
    """
    Get auto-metadata configuration for a dataset.

    :param dataset_id: dataset ID
    :param tenant_id: tenant ID
    :return: (success, result) or (success, error_message)
    """
    kb = KnowledgebaseService.get_or_none(id=dataset_id, tenant_id=tenant_id)
    if kb is None:
        return False, f"User '{tenant_id}' lacks permission for dataset '{dataset_id}'"
    parser_cfg = kb.parser_config or {}
    return True, {"metadata": parser_cfg.get("metadata") or [], "built_in_metadata": parser_cfg.get("built_in_metadata") or []}


async def update_auto_metadata(dataset_id: str, tenant_id: str, cfg: dict):
    """
    Update auto-metadata configuration for a dataset.

    :param dataset_id: dataset ID
    :param tenant_id: tenant ID
    :param cfg: auto-metadata configuration
    :return: (success, result) or (success, error_message)
    """
    kb = KnowledgebaseService.get_or_none(id=dataset_id, tenant_id=tenant_id)
    if kb is None:
        return False, f"User '{tenant_id}' lacks permission for dataset '{dataset_id}'"

    parser_cfg = kb.parser_config or {}
    parser_cfg["metadata"] = cfg.get("metadata")
    parser_cfg["built_in_metadata"] = cfg.get("built_in_metadata")

    if not KnowledgebaseService.update_by_id(kb.id, {"parser_config": parser_cfg}):
        return False, "Update auto-metadata error.(Database error)"

    return True, cfg


def delete_tags(dataset_id: str, tenant_id: str, tags: list[str]):
    """
    Delete tags from a dataset.

    :param dataset_id: dataset ID
    :param tenant_id: tenant ID
    :param tags: list of tags to delete
    :return: (success, result) or (success, error_message)
    """
    if not dataset_id:
        return False, 'Lack of "Dataset ID"'

    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."

    ok, kb = KnowledgebaseService.get_by_id(dataset_id)
    if not ok:
        return False, "Invalid Dataset ID"

    from rag.nlp import search

    for t in tags:
        settings.docStoreConn.update({"tag_kwd": t, "kb_id": [dataset_id]}, {"remove": {"tag_kwd": t}}, search.index_name(kb.tenant_id), dataset_id)

    return True, {}


def list_ingestion_logs(
    dataset_id: str,
    tenant_id: str,
    page: int,
    page_size: int,
    orderby: str,
    desc: bool,
    operation_status: list = None,
    create_date_from: str = None,
    create_date_to: str = None,
    log_type: str = "dataset",
    keywords: str = None,
):
    """
    List ingestion logs for a dataset.

    :param dataset_id: dataset ID
    :param tenant_id: tenant ID
    :param page: page number
    :param page_size: items per page
    :param orderby: order by field
    :param desc: descending order
    :param operation_status: filter by operation status
    :param create_date_from: filter start date
    :param create_date_to: filter end date
    :param log_type: "dataset" or "file"
    :param keywords: search keywords for file logs
    :return: (success, result) or (success, error_message)
    """
    if not dataset_id:
        return False, 'Lack of "Dataset ID"'

    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."

    from api.db.services.pipeline_operation_log_service import PipelineOperationLogService

    allowed_log_types = {"dataset", "file"}
    if log_type not in allowed_log_types:
        logging.warning(
            "list_ingestion_logs invalid log_type: dataset_id=%s tenant_id=%s log_type=%s",
            dataset_id,
            tenant_id,
            log_type,
        )
        return False, 'Invalid "log_type", expected "dataset" or "file"'

    logging.info(
        "list_ingestion_logs: dataset_id=%s tenant_id=%s log_type=%s page=%s page_size=%s",
        dataset_id,
        tenant_id,
        log_type,
        page,
        page_size,
    )

    if log_type == "file":
        logs, total = PipelineOperationLogService.get_file_logs_by_kb_id(dataset_id, page, page_size, orderby, desc, keywords, operation_status or [], None, None, create_date_from, create_date_to)
    else:
        logs, total = PipelineOperationLogService.get_dataset_logs_by_kb_id(dataset_id, page, page_size, orderby, desc, operation_status or [], create_date_from, create_date_to, keywords)
    return True, {"total": total, "logs": logs}


def get_ingestion_log(dataset_id: str, tenant_id: str, log_id: str):
    """
    Get a single ingestion log.

    :param dataset_id: dataset ID
    :param tenant_id: tenant ID
    :param log_id: log ID
    :return: (success, result) or (success, error_message)
    """
    if not dataset_id:
        return False, 'Lack of "Dataset ID"'

    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."

    from api.db.services.pipeline_operation_log_service import PipelineOperationLogService

    # Return the full record (including `dsl`) so the front-end dataflow-result
    # page can render the pipeline timeline and chunks. The file-level field set
    # is a superset of the dataset-level fields, so it is valid for both
    # dataset-level (graph/raptor/mindmap) and per-file logs.
    fields = PipelineOperationLogService.get_file_logs_fields()
    log = PipelineOperationLogService.model.select(*fields).where((PipelineOperationLogService.model.id == log_id) & (PipelineOperationLogService.model.kb_id == dataset_id)).first()
    if not log:
        return False, "Log not found"

    result = log.to_dict()
    # Be explicit here: the dataflow-result page needs the full DSL payload to
    # rebuild the timeline and right-side parser view. Some serialization paths
    # can omit JSON fields from Peewee model dicts, so keep it attached here.
    result["dsl"] = log.dsl or {}
    return True, result


def delete_index(dataset_id: str, tenant_id: str, index_type: str, wipe: bool = True):
    """
    Delete an indexing task (graph/raptor/mindmap) for a dataset.

    :param dataset_id: dataset ID
    :param tenant_id: tenant ID
    :param index_type: one of "graph", "raptor", "mindmap"
    :param wipe: when True (default) the persisted artefacts (graph rows,
        raptor summaries) are removed from the doc store and any GraphRAG
        phase-completion markers are cleared.  Pass False to cancel the
        running task while keeping prior progress so it can be resumed.
    :return: (success, result) or (success, error_message)
    """
    if index_type not in _VALID_INDEX_TYPES:
        return False, f"Invalid index type '{index_type}'. Must be one of {sorted(_VALID_INDEX_TYPES)}"

    if not dataset_id:
        return False, 'Lack of "Dataset ID"'

    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."

    ok, kb = KnowledgebaseService.get_by_id(dataset_id)
    if not ok:
        return False, "Invalid Dataset ID"

    task_id_field = _INDEX_TYPE_TO_TASK_ID_FIELD[index_type]
    task_finish_at_field = f"{task_id_field.replace('_task_id', '_task_finish_at')}"
    task_id = getattr(kb, task_id_field, None)

    logging.info("delete_index: dataset=%s index_type=%s wipe=%s", dataset_id, index_type, wipe)

    if task_id:
        from rag.utils.redis_conn import REDIS_CONN

        try:
            REDIS_CONN.set(f"{task_id}-cancel", "x")
        except Exception as e:
            logging.exception(e)
        TaskService.delete_by_id(task_id)

    if wipe and index_type == "graph":
        from rag.nlp import search
        from rag.graphrag.phase_markers import clear_phase_markers

        settings.docStoreConn.delete({"knowledge_graph_kwd": ["graph", "subgraph", "entity", "relation", "community_report"]}, search.index_name(kb.tenant_id), dataset_id)
        # Wiping the graph invalidates any phase-completion markers used to
        # short-circuit resolution / community detection on resume.
        clear_phase_markers(dataset_id)
        logging.info("delete_index: cleared GraphRAG artefacts and phase markers for dataset=%s", dataset_id)
    elif wipe and index_type == "raptor":
        from rag.nlp import search

        settings.docStoreConn.delete({"raptor_kwd": ["raptor"]}, search.index_name(kb.tenant_id), dataset_id)
    elif wipe and index_type == "skill":
        from rag.nlp import search

        settings.docStoreConn.delete({"compile_kwd": ["skill", "skill_all"]}, search.index_name(kb.tenant_id), dataset_id)

    KnowledgebaseService.update_by_id(kb.id, {task_id_field: "", task_finish_at_field: None})
    return True, {}


def run_embedding(dataset_id: str, tenant_id: str):
    """
    Run embedding for all documents in a dataset.

    :param dataset_id: dataset ID
    :param tenant_id: tenant ID
    :return: (success, result) or (success, error_message)
    """
    if not dataset_id:
        return False, 'Lack of "Dataset ID"'

    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."

    ok, kb = KnowledgebaseService.get_by_id(dataset_id)
    if not ok:
        return False, "Invalid Dataset ID"

    documents, _ = DocumentService.get_by_kb_id(
        kb_id=dataset_id,
        page_number=0,
        items_per_page=0,
        orderby="create_time",
        desc=False,
        keywords="",
        run_status=[],
        types=[],
        suffix=[],
    )
    if not documents:
        return False, f"No documents in Dataset {dataset_id}"

    kb_table_num_map = {}
    for doc in documents:
        doc["tenant_id"] = tenant_id
        DocumentService.run(tenant_id, doc, kb_table_num_map)

    return True, {"scheduled_count": len(documents)}


def rename_tag(dataset_id: str, tenant_id: str, from_tag: str, to_tag: str):
    """
    Rename a tag in a dataset.

    :param dataset_id: dataset ID
    :param tenant_id: tenant ID
    :param from_tag: original tag name
    :param to_tag: new tag name
    :return: (success, result) or (success, error_message)
    """
    if not dataset_id:
        return False, 'Lack of "Dataset ID"'

    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."

    ok, kb = KnowledgebaseService.get_by_id(dataset_id)
    if not ok:
        return False, "Invalid Dataset ID"

    from rag.nlp import search

    settings.docStoreConn.update({"tag_kwd": from_tag, "kb_id": [dataset_id]}, {"remove": {"tag_kwd": from_tag.strip()}, "add": {"tag_kwd": to_tag}}, search.index_name(kb.tenant_id), dataset_id)

    return True, {"from": from_tag, "to": to_tag}


async def search(dataset_id: str, tenant_id: str, req: dict):
    """
    Search (retrieval test) within a dataset.

    :param dataset_id: dataset ID
    :param tenant_id: tenant ID
    :param req: search request
    :return: (success, result) or (success, error_message)
    """
    from api.db.joint_services.tenant_model_service import get_tenant_default_model_by_type
    from api.db.services.doc_metadata_service import DocMetadataService
    from api.db.services.llm_service import LLMBundle
    from api.db.services.search_service import SearchService
    from api.db.services.user_service import UserTenantService
    from common.constants import LLMType
    from common.metadata_utils import apply_meta_data_filter
    from rag.app.tag import label_question
    from rag.prompts.generator import cross_languages, keyword_extraction

    logging.debug(
        "search(dataset=%s, tenant=%s, question_len=%s)",
        dataset_id,
        tenant_id,
        len(req.get("question", "")),
    )

    page = int(req.get("page", 1))
    size = int(req.get("size", 30))
    question = req.get("question", "")
    doc_ids = req.get("doc_ids", [])
    use_kg = req.get("use_kg", False)
    similarity_threshold = float(req.get("similarity_threshold", 0.0))
    vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
    top = max(1, min(int(req.get("top_k", 1024)), 2048))
    langs = req.get("cross_languages", [])

    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        logging.warning("search access denied: dataset=%s tenant=%s", dataset_id, tenant_id)
        return False, "Only owner of dataset authorized for this operation."

    e, kb = KnowledgebaseService.get_by_id(dataset_id)
    if not e:
        logging.warning("search dataset not found: dataset=%s", dataset_id)
        return False, "Dataset not found!"

    if doc_ids is not None and not isinstance(doc_ids, list):
        return False, "`doc_ids` should be a list"
    local_doc_ids = list(doc_ids) if doc_ids else []

    meta_data_filter = {}
    search_id = req.get("search_id", "")
    search_config = {}
    chat_mdl = None
    if search_id:
        search_detail = SearchService.get_detail(search_id)
        if not search_detail:
            logging.warning("search config not found: search_id=%s", search_id)
            return False, "Invalid search_id"
        search_config = search_detail.get("search_config", {})
        meta_data_filter = search_config.get("meta_data_filter", {})
        similarity_threshold = float(search_config.get("similarity_threshold", similarity_threshold))
        vector_similarity_weight = float(search_config.get("vector_similarity_weight", vector_similarity_weight))
        top = max(1, min(int(search_config.get("top_k", top)), 2048))
        use_kg = search_config.get("use_kg", use_kg)
        langs = search_config.get("cross_languages", langs)
        logging.debug(
            "Dataset search loaded Search config: search_id=%s dataset_id=%s vector_similarity_weight=%s full_text_weight=%s similarity_threshold=%s top_k=%s",
            search_id,
            dataset_id,
            vector_similarity_weight,
            1 - vector_similarity_weight,
            similarity_threshold,
            top,
        )
        if meta_data_filter.get("method") in ["auto", "semi_auto"]:
            chat_id = search_config.get("chat_id", "")
            if chat_id:
                chat_model_config = get_model_config_from_provider_instance(tenant_id, LLMType.CHAT, search_config["chat_id"])
            else:
                chat_model_config = get_tenant_default_model_by_type(tenant_id, LLMType.CHAT)
            chat_mdl = LLMBundle(tenant_id, chat_model_config)
    else:
        meta_data_filter = req.get("meta_data_filter") or {}
        if meta_data_filter.get("method") in ["auto", "semi_auto"]:
            chat_model_config = get_tenant_default_model_by_type(tenant_id, LLMType.CHAT)
            chat_mdl = LLMBundle(tenant_id, chat_model_config)

    if meta_data_filter:
        local_doc_ids = await apply_meta_data_filter(
            meta_data_filter,
            None,
            question,
            chat_mdl,
            local_doc_ids,
            kb_ids=[dataset_id],
            metas_loader=lambda: DocMetadataService.get_flatted_meta_by_kbs([dataset_id]),
        )

    tenant_ids = []
    tenants = UserTenantService.query(user_id=tenant_id)
    for tenant in tenants:
        if KnowledgebaseService.query(tenant_id=tenant.tenant_id, id=dataset_id):
            tenant_ids.append(tenant.tenant_id)
            break
    else:
        return False, "Only owner of dataset authorized for this operation."

    _question = question
    if langs:
        _question = await cross_languages(kb.tenant_id, None, _question, langs)
    if kb.embd_id:
        embd_model_config = get_model_config_from_provider_instance(kb.tenant_id, LLMType.EMBEDDING, kb.embd_id)
    else:
        embd_model_config = get_tenant_default_model_by_type(kb.tenant_id, LLMType.EMBEDDING)
    embd_mdl = LLMBundle(kb.tenant_id, embd_model_config)

    rerank_mdl = None
    rerank_id = search_config.get("rerank_id") or req.get("rerank_id")
    if rerank_id:
        rerank_model_config = get_model_config_from_provider_instance(kb.tenant_id, LLMType.RERANK.value, rerank_id)
        rerank_mdl = LLMBundle(kb.tenant_id, rerank_model_config)

    if search_config.get("keyword", req.get("keyword", False)):
        default_chat_model_config = get_tenant_default_model_by_type(kb.tenant_id, LLMType.CHAT)
        chat_mdl = LLMBundle(kb.tenant_id, default_chat_model_config)
        _question += await keyword_extraction(chat_mdl, _question)

    labels = label_question(_question, [kb])
    ranks = await settings.retriever.retrieval(
        _question,
        embd_mdl,
        tenant_ids,
        [dataset_id],
        page,
        size,
        similarity_threshold,
        vector_similarity_weight,
        doc_ids=local_doc_ids,
        top=top,
        rerank_mdl=rerank_mdl,
        rank_feature=labels,
        trace_id=search_id,
    )

    if use_kg:
        try:
            default_chat_model_config = get_tenant_default_model_by_type(tenant_id, LLMType.CHAT)
            ck = await settings.kg_retriever.retrieval(_question, tenant_ids, [dataset_id], embd_mdl, LLMBundle(kb.tenant_id, default_chat_model_config))
            if ck["content_with_weight"]:
                ranks["chunks"].insert(0, ck)
        except Exception:
            logging.warning("search KG retrieval failed: dataset=%s tenant=%s", dataset_id, tenant_id, exc_info=True)
    ranks["chunks"] = settings.retriever.retrieval_by_children(ranks["chunks"], tenant_ids)
    ranks["total"] = len(ranks["chunks"])

    for c in ranks["chunks"]:
        c.pop("vector", None)
    ranks["labels"] = labels

    return True, ranks


def check_embedding(dataset_id: str, tenant_id: str, req: dict):
    """
    Check embedding model compatibility by sampling random chunks,
    re-embedding them with the new model, and computing cosine similarity.

    :param dataset_id: dataset ID
    :param tenant_id: tenant ID
    :param req: request body with embd_id
    :return: (success, result) or (success, error_message)
    """
    import random

    import numpy as np
    from common.constants import RetCode
    from common.doc_store.doc_store_base import OrderByExpr
    from rag.nlp import search

    from api.db.services.llm_service import LLMBundle
    from common.constants import LLMType

    def _guess_vec_field(src: dict):
        for k in src or {}:
            if k.endswith("_vec"):
                return k
        return None

    def _as_float_vec(v):
        if v is None:
            return []
        if isinstance(v, str):
            return [float(x) for x in v.split("\t") if x != ""]
        if isinstance(v, (list, tuple, np.ndarray)):
            return [float(x) for x in v]
        return []

    def _to_1d(x):
        a = np.asarray(x, dtype=np.float32)
        return a.reshape(-1)

    def _cos_sim(a, b, eps=1e-12):
        a = _to_1d(a)
        b = _to_1d(b)
        na = np.linalg.norm(a)
        nb = np.linalg.norm(b)
        if na < eps or nb < eps:
            return 0.0
        return float(np.dot(a, b) / (na * nb))

    def sample_random_chunks_with_vectors(
        docStoreConn,
        tenant_id: str,
        kb_id: str,
        n: int = 5,
        base_fields=("docnm_kwd", "doc_id", "content_with_weight", "page_num_int", "position_int", "top_int"),
    ):
        index_nm = search.index_name(tenant_id)

        res0 = docStoreConn.search(
            select_fields=[],
            highlight_fields=[],
            condition={"kb_id": kb_id, "available_int": 1},
            match_expressions=[],
            order_by=OrderByExpr(),
            offset=0,
            limit=1,
            index_names=index_nm,
            knowledgebase_ids=[kb_id],
        )
        total = docStoreConn.get_total(res0)
        if total <= 0:
            return []

        n = min(n, total)
        offsets = sorted(random.sample(range(min(total, 1000)), n))
        out = []

        for off in offsets:
            res1 = docStoreConn.search(
                select_fields=list(base_fields),
                highlight_fields=[],
                condition={"kb_id": kb_id, "available_int": 1},
                match_expressions=[],
                order_by=OrderByExpr(),
                offset=off,
                limit=1,
                index_names=index_nm,
                knowledgebase_ids=[kb_id],
            )
            ids = docStoreConn.get_doc_ids(res1)
            if not ids:
                continue

            cid = ids[0]
            full_doc = docStoreConn.get(cid, index_nm, [kb_id]) or {}
            vec_field = _guess_vec_field(full_doc)
            vec = _as_float_vec(full_doc.get(vec_field))

            out.append(
                {
                    "chunk_id": cid,
                    "kb_id": kb_id,
                    "doc_id": full_doc.get("doc_id"),
                    "doc_name": full_doc.get("docnm_kwd"),
                    "vector_field": vec_field,
                    "vector_dim": len(vec),
                    "vector": vec,
                    "page_num_int": full_doc.get("page_num_int"),
                    "position_int": full_doc.get("position_int"),
                    "top_int": full_doc.get("top_int"),
                    "content_with_weight": full_doc.get("content_with_weight") or "",
                    "question_kwd": full_doc.get("question_kwd") or [],
                }
            )
        return out

    def _clean(s: str):
        return re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", s or "").strip()

    if not dataset_id:
        return False, 'Lack of "Dataset ID"'

    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."

    ok, kb = KnowledgebaseService.get_by_id(dataset_id)
    if not ok:
        return False, "Invalid Dataset ID"

    embd_id = req.get("embd_id", "")
    if not embd_id:
        return False, "`embd_id` is required."

    logging.info("check_embedding: dataset=%s tenant=%s embd_id=%s", dataset_id, tenant_id, embd_id)

    ok, err = verify_embedding_availability(embd_id, tenant_id)
    if not ok:
        return False, err

    embd_model_config = get_model_config_from_provider_instance(kb.tenant_id, LLMType.EMBEDDING, embd_id)
    emb_mdl = LLMBundle(kb.tenant_id, embd_model_config)

    n = int(req.get("check_num", 5))
    samples = sample_random_chunks_with_vectors(settings.docStoreConn, tenant_id=kb.tenant_id, kb_id=dataset_id, n=n)
    logging.info("check_embedding: dataset=%s sampled=%d chunks", dataset_id, len(samples))

    results, eff_sims = [], []
    mode = "content_only"
    for ck in samples:
        title = ck.get("doc_name") or "Title"

        txt_in = "\n".join(ck.get("question_kwd") or []) or ck.get("content_with_weight") or ""
        txt_in = _clean(txt_in)
        if not txt_in:
            results.append({"chunk_id": ck["chunk_id"], "reason": "no_text"})
            continue

        if not ck.get("vector"):
            results.append({"chunk_id": ck["chunk_id"], "reason": "no_stored_vector"})
            continue

        try:
            v, _ = emb_mdl.encode([title, txt_in])
            assert len(v[1]) == len(ck["vector"]), f"The dimension ({len(v[1])}) of given embedding model is different from the original ({len(ck['vector'])})"
            sim_content = _cos_sim(v[1], ck["vector"])
            title_w = 0.1
            qv_mix = title_w * v[0] + (1 - title_w) * v[1]
            sim_mix = _cos_sim(qv_mix, ck["vector"])
            sim = sim_content
            mode = "content_only"
            if sim_mix > sim:
                sim = sim_mix
                mode = "title+content"
        except Exception as e:
            return False, f"Embedding failure. {e}"

        eff_sims.append(sim)
        results.append(
            {
                "chunk_id": ck["chunk_id"],
                "doc_id": ck["doc_id"],
                "doc_name": ck["doc_name"],
                "vector_field": ck["vector_field"],
                "vector_dim": ck["vector_dim"],
                "cos_sim": round(sim, 6),
            }
        )

    summary = {
        "kb_id": dataset_id,
        "model": embd_id,
        "sampled": len(samples),
        "valid": len(eff_sims),
        "avg_cos_sim": round(float(np.mean(eff_sims)) if eff_sims else 0.0, 6),
        "min_cos_sim": round(float(np.min(eff_sims)) if eff_sims else 0.0, 6),
        "max_cos_sim": round(float(np.max(eff_sims)) if eff_sims else 0.0, 6),
        "match_mode": mode,
    }

    data = {"summary": summary, "results": results}
    if not eff_sims:
        logging.warning("check_embedding: dataset=%s no comparable chunks", dataset_id)
        return False, "No embedded chunks are available to compare."
    if summary["avg_cos_sim"] >= 0.9:
        logging.info("check_embedding: dataset=%s compatible avg_cos_sim=%s valid=%d", dataset_id, summary["avg_cos_sim"], len(eff_sims))
        return True, data
    logging.warning("check_embedding: dataset=%s not_effective avg_cos_sim=%s valid=%d", dataset_id, summary["avg_cos_sim"], len(eff_sims))
    return "not_effective", {
        "code": RetCode.NOT_EFFECTIVE,
        "message": "Embedding model switch failed: the average similarity between old and new vectors is below 0.9, indicating incompatible vector spaces.",
        "data": data,
    }


async def search_datasets(tenant_id: str, req: dict):
    """
    Search (retrieval test) across multiple datasets.

    :param tenant_id: tenant ID
    :param req: search request containing dataset_ids and other params
    :return: (success, result) or (success, error_message)
    """
    from api.db.joint_services.tenant_model_service import get_tenant_default_model_by_type, split_model_name
    from api.db.services.doc_metadata_service import DocMetadataService
    from api.db.services.llm_service import LLMBundle
    from api.db.services.search_service import SearchService
    from api.db.services.user_service import UserTenantService
    from common.constants import LLMType
    from common.metadata_utils import apply_meta_data_filter
    from rag.app.tag import label_question
    from rag.prompts.generator import cross_languages, keyword_extraction

    kb_ids = req.get("dataset_ids", [])
    page = int(req.get("page", 1))
    size = int(req.get("size", 30))
    question = req.get("question", "")
    doc_ids = req.get("doc_ids", [])
    use_kg = req.get("use_kg", False)
    similarity_threshold = float(req.get("similarity_threshold", 0.0))
    vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
    top = max(1, min(int(req.get("top_k", 1024)), 2048))
    langs = req.get("cross_languages", [])

    logging.debug(
        "search_datasets(datasets=%s, tenant=%s, question_len=%s)",
        kb_ids,
        tenant_id,
        len(question),
    )

    # Access check for all datasets
    for kb_id in kb_ids:
        if not KnowledgebaseService.accessible(kb_id, tenant_id):
            logging.warning("search_datasets access denied: dataset=%s tenant=%s", kb_id, tenant_id)
            return False, f"Only owner of dataset {kb_id} authorized for this operation."

    kbs = KnowledgebaseService.get_by_ids(kb_ids)
    if not kbs:
        return False, "Datasets not found!"

    # All datasets must use the same embedding model
    embd_nms = list(set([split_model_name(kb.embd_id)[0] for kb in kbs]))
    if len(embd_nms) != 1:
        return False, "Datasets use different embedding models."

    if doc_ids is not None and not isinstance(doc_ids, list):
        return False, "`doc_ids` should be a list"
    local_doc_ids = list(doc_ids) if doc_ids else []

    meta_data_filter = {}
    search_id = req.get("search_id", "")
    search_config = {}
    chat_mdl = None
    if search_id:
        search_detail = SearchService.get_detail(search_id)
        if not search_detail:
            logging.warning("search config not found: search_id=%s", search_id)
            return False, "Invalid search_id"
        search_config = search_detail.get("search_config", {})
        meta_data_filter = search_config.get("meta_data_filter", {})
        similarity_threshold = float(search_config.get("similarity_threshold", similarity_threshold))
        vector_similarity_weight = float(search_config.get("vector_similarity_weight", vector_similarity_weight))
        top = max(1, min(int(search_config.get("top_k", top)), 2048))
        use_kg = search_config.get("use_kg", use_kg)
        langs = search_config.get("cross_languages", langs)
        logging.debug(
            "Dataset search loaded Search config: search_id=%s dataset_ids=%s vector_similarity_weight=%s full_text_weight=%s similarity_threshold=%s top_k=%s",
            search_id,
            kb_ids,
            vector_similarity_weight,
            1 - vector_similarity_weight,
            similarity_threshold,
            top,
        )
        if meta_data_filter.get("method") in ["auto", "semi_auto"]:
            chat_id = search_config.get("chat_id", "")
            if chat_id:
                chat_model_config = get_model_config_from_provider_instance(tenant_id, LLMType.CHAT, search_config["chat_id"])
            else:
                chat_model_config = get_tenant_default_model_by_type(tenant_id, LLMType.CHAT)
            chat_mdl = LLMBundle(tenant_id, chat_model_config)
    else:
        meta_data_filter = req.get("meta_data_filter") or {}
        if meta_data_filter.get("method") in ["auto", "semi_auto"]:
            chat_model_config = get_tenant_default_model_by_type(tenant_id, LLMType.CHAT)
            chat_mdl = LLMBundle(tenant_id, chat_model_config)

    if meta_data_filter:
        logging.debug("Metadata filter applied: %s, question length: %d, chat_mdl=%s", meta_data_filter, len(question), "None" if chat_mdl is None else "configured")
        local_doc_ids = await apply_meta_data_filter(
            meta_data_filter,
            None,
            question,
            chat_mdl,
            local_doc_ids,
            kb_ids=kb_ids,
            metas_loader=lambda: DocMetadataService.get_flatted_meta_by_kbs(kb_ids),
        )

    tenant_ids = []
    tenants = UserTenantService.query(user_id=tenant_id)
    for tenant in tenants:
        if any(KnowledgebaseService.query(tenant_id=tenant.tenant_id, id=kb_id) for kb_id in kb_ids):
            tenant_ids.append(tenant.tenant_id)
            break
    else:
        return False, "Only owner of datasets authorized for this operation."

    kb = kbs[0]
    _question = question
    if langs:
        _question = await cross_languages(kb.tenant_id, None, _question, langs)
    if kb.embd_id:
        embd_model_config = get_model_config_from_provider_instance(kb.tenant_id, LLMType.EMBEDDING, kb.embd_id)
    else:
        embd_model_config = get_tenant_default_model_by_type(kb.tenant_id, LLMType.EMBEDDING)
    embd_mdl = LLMBundle(kb.tenant_id, embd_model_config)

    rerank_mdl = None
    rerank_id = search_config.get("rerank_id") or req.get("rerank_id")
    if rerank_id:
        rerank_model_config = get_model_config_from_provider_instance(kb.tenant_id, LLMType.RERANK.value, rerank_id)
        rerank_mdl = LLMBundle(kb.tenant_id, rerank_model_config)

    if search_config.get("keyword", req.get("keyword", False)):
        default_chat_model_config = get_tenant_default_model_by_type(kb.tenant_id, LLMType.CHAT)
        chat_mdl = LLMBundle(kb.tenant_id, default_chat_model_config)
        _question += await keyword_extraction(chat_mdl, _question)

    labels = label_question(_question, kbs)
    ranks = await settings.retriever.retrieval(
        _question,
        embd_mdl,
        tenant_ids,
        kb_ids,
        page,
        size,
        similarity_threshold,
        vector_similarity_weight,
        doc_ids=local_doc_ids,
        top=top,
        rerank_mdl=rerank_mdl,
        rank_feature=labels,
        trace_id=search_id,
    )

    if use_kg:
        try:
            default_chat_model_config = get_tenant_default_model_by_type(tenant_id, LLMType.CHAT)
            ck = await settings.kg_retriever.retrieval(_question, tenant_ids, kb_ids, embd_mdl, LLMBundle(kb.tenant_id, default_chat_model_config))
            if ck["content_with_weight"]:
                ranks["chunks"].insert(0, ck)
        except Exception:
            logging.warning("search_datasets KG retrieval failed: datasets=%s tenant=%s", kb_ids, tenant_id, exc_info=True)
    ranks["chunks"] = settings.retriever.retrieval_by_children(ranks["chunks"], tenant_ids)
    ranks["total"] = len(ranks["chunks"])

    for c in ranks["chunks"]:
        c.pop("vector", None)
    ranks["labels"] = labels

    return True, ranks


# ---------------------------------------------------------------------------
# Artifact (knowledge compilation) page surface
#
# These three helpers power the dataset-level "Artifact" tab. They query rows
# with ``compile_kwd="artifact_page"`` written by TaskHandler's
# ``_persist_wiki_pages_to_es``. The schema fields they rely on are:
#   slug_kwd, title_kwd, page_type_kwd, content_with_weight,
#   entity_names_kwd, outlinks_kwd, related_kb_pages_kwd,
#   source_chunk_ids, source_doc_ids
# ---------------------------------------------------------------------------

_WIKI_COMPILE_KWD = "artifact_page"
_SKILL_COMPILE_KWD = "skill"
_SKILL_ALL_COMPILE_KWD = "skill_all"


def _compiled_index_or_none(tenant_id: str, kb_id: str):
    """Return (index_name, search_module) when the tenant index exists,
    else ``None``. Avoids 500s on brand-new tenants whose ES index hasn't
    been created yet."""
    from rag.nlp import search as _rag_search

    index_nm = _rag_search.index_name(tenant_id)
    if not settings.docStoreConn.index_exist(index_nm, kb_id):
        return None
    return index_nm, _rag_search


def _wiki_index_or_none(tenant_id: str, kb_id: str):
    return _compiled_index_or_none(tenant_id, kb_id)


def _skill_index_or_none(tenant_id: str, kb_id: str):
    return _compiled_index_or_none(tenant_id, kb_id)


async def has_any_wiki(dataset_id: str, tenant_id: str):
    """Fast existence probe for the sidebar tab visibility check.

    Returns ``(True, {"has": bool})`` on success or ``(False, str)`` on
    auth failure. Runs a ``limit=1`` search and reads only the total.
    """
    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."
    _, kb = KnowledgebaseService.get_by_id(dataset_id)

    pack = _wiki_index_or_none(kb.tenant_id, dataset_id)
    if pack is None:
        return True, {"has": False}
    index_nm, _ = pack

    from common.doc_store.doc_store_base import OrderByExpr

    try:
        res = settings.docStoreConn.search(
            select_fields=["id"],
            highlight_fields=[],
            condition={"compile_kwd": [_WIKI_COMPILE_KWD]},
            match_expressions=[],
            order_by=OrderByExpr(),
            offset=0,
            limit=1,
            index_names=index_nm,
            knowledgebase_ids=[dataset_id],
        )
    except Exception:
        logging.exception("has_any_wiki: docStore search failed for kb=%s", dataset_id)
        return True, {"has": False}

    total = settings.docStoreConn.get_total(res)
    return True, {"has": bool(total)}


async def list_wiki_pages(
    dataset_id: str,
    tenant_id: str,
    page: int = 1,
    page_size: int = 200,
    page_type: str | None = None,
):
    """List artifact pages for the left-hand 2-column list.

    Returns ``(True, {"total", "items": [{slug, title, page_type}, ...]})``.
    Ordering: ``page_type`` ascending, then ``title`` ascending — keeps
    pages of the same type grouped together visually.
    """
    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."
    _, kb = KnowledgebaseService.get_by_id(dataset_id)

    pack = _wiki_index_or_none(kb.tenant_id, dataset_id)
    if pack is None:
        return True, {"total": 0, "items": []}
    index_nm, _ = pack

    from common.doc_store.doc_store_base import OrderByExpr

    page = max(1, int(page or 1))
    page_size = max(1, min(int(page_size or 200), 1000))
    offset = (page - 1) * page_size

    condition: dict = {"compile_kwd": [_WIKI_COMPILE_KWD]}
    if page_type:
        condition["page_type_kwd"] = [page_type]

    order_by = OrderByExpr()
    try:
        # Most-connected pages first: outlinks_int = len(outlinks_kwd) is
        # written by the persistence layer for exactly this query.
        order_by.desc("outlinks_int").asc("title_kwd")
    except Exception:
        # OrderByExpr API differs across doc-store backends; degrade to
        # default order rather than 500.
        order_by = OrderByExpr()

    select_fields = [
        "id",
        "slug_kwd",
        "title_kwd",
        "page_type_kwd",
        "outlinks_int",
        "summary_with_weight",
    ]
    try:
        res = settings.docStoreConn.search(
            select_fields=select_fields,
            highlight_fields=[],
            condition=condition,
            match_expressions=[],
            order_by=order_by,
            offset=offset,
            limit=page_size,
            index_names=index_nm,
            knowledgebase_ids=[dataset_id],
        )
        field_map = settings.docStoreConn.get_fields(res, select_fields)
    except Exception:
        logging.exception("list_wiki_pages: docStore search failed for kb=%s", dataset_id)
        return True, {"total": 0, "items": []}

    total = settings.docStoreConn.get_total(res)
    items = []
    for row in (field_map or {}).values():
        slug = row.get("slug_kwd")
        if not isinstance(slug, str) or not slug:
            continue
        items.append(
            {
                "slug": slug,
                "title": row.get("title_kwd") or slug,
                "page_type": row.get("page_type_kwd") or "concept",
                "summary": row.get("summary_with_weight") or "",
            }
        )

    return True, {"total": int(total or 0), "items": items}


async def get_wiki_page(
    dataset_id: str,
    tenant_id: str,
    page_type: str,
    slug: str,
):
    """Fetch a single artifact page for the right-hand markdown viewer.

    ``slug`` is the tail after ``<page_type>/`` — i.e. the URL component
    that came from the markdown link ``artifact/<kb_id>/<page_type>/<slug>``.
    The stored ``slug_kwd`` is the full ``<page_type>/<slug>`` form, so we
    reconstruct it before the lookup.

    Returns ``(True, page_dict)`` or ``(True, None)`` when no row matches.
    """
    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."
    _, kb = KnowledgebaseService.get_by_id(dataset_id)

    pack = _wiki_index_or_none(kb.tenant_id, dataset_id)
    if pack is None:
        return True, None
    index_nm, _ = pack

    from common.doc_store.doc_store_base import OrderByExpr

    full_slug = f"{page_type}/{slug}" if "/" not in slug else slug
    select_fields = [
        "id",
        "slug_kwd",
        "title_kwd",
        "page_type_kwd",
        "content_with_weight",
        "summary_with_weight",
        "entity_names_kwd",
        "outlinks_kwd",
        "related_kb_pages_kwd",
        "source_chunk_ids",
        "source_doc_ids",
    ]
    try:
        res = settings.docStoreConn.search(
            select_fields=select_fields,
            highlight_fields=[],
            condition={
                "compile_kwd": [_WIKI_COMPILE_KWD],
                "page_type_kwd": [page_type],
                "slug_kwd": [full_slug],
            },
            match_expressions=[],
            order_by=OrderByExpr(),
            offset=0,
            limit=1,
            index_names=index_nm,
            knowledgebase_ids=[dataset_id],
        )
        field_map = settings.docStoreConn.get_fields(res, select_fields)
    except Exception:
        logging.exception(
            "get_wiki_page: search failed for kb=%s slug=%s",
            dataset_id,
            full_slug,
        )
        return True, None

    if not field_map:
        return True, None

    _, row = next(iter(field_map.items()))
    content_md = row.get("content_with_weight") or ""
    summary = row.get("summary_with_weight") or ""
    return True, {
        "slug": row.get("slug_kwd") or full_slug,
        "title": row.get("title_kwd") or full_slug,
        "page_type": row.get("page_type_kwd") or page_type,
        "content_md_rendered": content_md,
        "summary": summary,
        "entity_names": row.get("entity_names_kwd") or [],
        "outlinks": row.get("outlinks_kwd") or [],
        "related_kb_pages": row.get("related_kb_pages_kwd") or [],
        "source_chunk_ids": row.get("source_chunk_ids") or [],
        "source_doc_ids": row.get("source_doc_ids") or [],
    }


async def has_any_skill(dataset_id: str, tenant_id: str):
    """Fast existence probe for the dataset Skills sidebar entry."""
    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."
    _, kb = KnowledgebaseService.get_by_id(dataset_id)

    pack = _skill_index_or_none(kb.tenant_id, dataset_id)
    if pack is None:
        return True, {"has": False}
    index_nm, _ = pack

    from common.doc_store.doc_store_base import OrderByExpr

    try:
        res = settings.docStoreConn.search(
            select_fields=["id"],
            highlight_fields=[],
            condition={"compile_kwd": [_SKILL_ALL_COMPILE_KWD]},
            match_expressions=[],
            order_by=OrderByExpr(),
            offset=0,
            limit=1,
            index_names=index_nm,
            knowledgebase_ids=[dataset_id],
        )
    except Exception:
        logging.exception("has_any_skill: docStore search failed for kb=%s", dataset_id)
        return True, {"has": False}

    total = settings.docStoreConn.get_total(res)
    return True, {"has": bool(total)}


async def get_skill_tree(dataset_id: str, tenant_id: str):
    """Fetch the one-shot recursive skill tree for this dataset."""
    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."
    _, kb = KnowledgebaseService.get_by_id(dataset_id)

    pack = _skill_index_or_none(kb.tenant_id, dataset_id)
    if pack is None:
        return True, None
    index_nm, _ = pack

    from common.doc_store.doc_store_base import OrderByExpr

    select_fields = ["id", "kb_id", "doc_id", "compile_kwd", "skill_with_weight"]
    try:
        res = settings.docStoreConn.search(
            select_fields=select_fields,
            highlight_fields=[],
            condition={"compile_kwd": [_SKILL_ALL_COMPILE_KWD]},
            match_expressions=[],
            order_by=OrderByExpr(),
            offset=0,
            limit=1,
            index_names=index_nm,
            knowledgebase_ids=[dataset_id],
        )
        field_map = settings.docStoreConn.get_fields(res, select_fields)
    except Exception:
        logging.exception("get_skill_tree: docStore search failed for kb=%s", dataset_id)
        return True, None

    if not field_map:
        return True, None

    _, row = next(iter(field_map.items()))
    return True, {
        "id": row.get("id"),
        "kb_id": row.get("kb_id") or dataset_id,
        "doc_id": row.get("doc_id") or dataset_id,
        "compile_kwd": row.get("compile_kwd") or _SKILL_ALL_COMPILE_KWD,
        "skill_with_weight": json.loads(row.get("skill_with_weight")) or [],
    }


async def get_skill_page(dataset_id: str, tenant_id: str, skill_kwd: str):
    """Fetch the full markdown body for a single skill node."""
    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."
    _, kb = KnowledgebaseService.get_by_id(dataset_id)

    pack = _skill_index_or_none(kb.tenant_id, dataset_id)
    if pack is None:
        return True, None
    index_nm, _ = pack

    from common.doc_store.doc_store_base import OrderByExpr

    select_fields = [
        "id",
        "kb_id",
        "doc_id",
        "compile_kwd",
        "skill_kwd",
        "depth_int",
        "children_kwd",
        "source_doc_ids",
        "md_with_weight",
    ]
    try:
        res = settings.docStoreConn.search(
            select_fields=select_fields,
            highlight_fields=[],
            condition={
                "compile_kwd": [_SKILL_COMPILE_KWD],
                "skill_kwd": [skill_kwd],
            },
            match_expressions=[],
            order_by=OrderByExpr(),
            offset=0,
            limit=1,
            index_names=index_nm,
            knowledgebase_ids=[dataset_id],
        )
        field_map = settings.docStoreConn.get_fields(res, select_fields)
    except Exception:
        logging.exception(
            "get_skill_page: docStore search failed for kb=%s skill=%s",
            dataset_id,
            skill_kwd,
        )
        return True, None

    if not field_map:
        return True, None

    _, row = next(iter(field_map.items()))
    return True, {
        "id": row.get("id"),
        "kb_id": row.get("kb_id") or dataset_id,
        "doc_id": row.get("doc_id") or dataset_id,
        "compile_kwd": row.get("compile_kwd") or _SKILL_COMPILE_KWD,
        "skill_kwd": row.get("skill_kwd") or skill_kwd,
        "depth_int": row.get("depth_int") or 0,
        "children_kwd": row.get("children_kwd") or [],
        "source_doc_ids": row.get("source_doc_ids") or [],
        "md_with_weight": row.get("md_with_weight") or "",
    }


async def update_wiki_page(
    dataset_id: str,
    tenant_id: str,
    page_type: str,
    slug: str,
    content_md: str,
    *,
    user_id: str | None = None,
    title: str | None = None,
    comments: str | None = None,
):
    """Edit an artifact page in place from the canvas double-click dialog.

    Body must contain ``content_md`` — the (possibly edited) page markdown.
    We run it through ``_wiki_transform_links`` so any newly typed
    ``[[slug]]`` references upgrade to clickable artifact URLs (and pre-rendered
    links pass through unchanged — the transform is idempotent on already-
    rendered markdown). ``summary`` is re-derived from the new rendered text.
    ``outlinks_kwd`` is rebuilt from the link-transform pass.

    Per the v1 contract, only the page row is updated. The canvas
    ``artifact_page_graph`` / ``artifact_entity`` / ``artifact_relation``
    rows stay stale until the next full artifact compile.

    Side effect: when the rendered post-save markdown differs from the
    prior stored content, one ``artifact_commit`` row is recorded
    (git-style audit). No-op saves are silently skipped — empty diff,
    no row.

    Returns ``(True, page_dict)`` mirroring ``get_wiki_page``, or
    ``(True, None)`` when the row is missing, or
    ``(False, message)`` on authorization failure.
    """
    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."
    _, kb = KnowledgebaseService.get_by_id(dataset_id)

    pack = _wiki_index_or_none(kb.tenant_id, dataset_id)
    if pack is None:
        return True, None
    index_nm, _ = pack

    from rag.advanced_rag.knowlege_compile.wiki import (
        _wiki_transform_links,
        _wiki_extract_summary,
    )
    from api.db.services.file_commit_service import FileCommitService

    full_slug = f"{page_type}/{slug}" if "/" not in slug else slug

    # Capture the pre-edit rendered content + the row id. Both come from
    # the same search: the row id is the dict key returned by
    # docStoreConn.get_fields. We need the id specifically because the
    # generic non-id update path (ESConnection.update slow branch) routes
    # through a Painless script that scrubs newlines / single quotes /
    # backslash escapes from string values — which would collapse every
    # paragraph in the saved markdown to one line. Passing the row id in
    # ``condition`` selects the fast partial-update branch which preserves
    # the JSON value verbatim.
    from common.doc_store.doc_store_base import OrderByExpr

    row_id: str | None = None
    content_before = ""
    try:
        res = settings.docStoreConn.search(
            select_fields=["id", "content_with_weight"],
            highlight_fields=[],
            condition={
                "compile_kwd": [_WIKI_COMPILE_KWD],
                "page_type_kwd": [page_type],
                "slug_kwd": [full_slug],
            },
            match_expressions=[],
            order_by=OrderByExpr(),
            offset=0,
            limit=1,
            index_names=index_nm,
            knowledgebase_ids=[dataset_id],
        )
        field_map = settings.docStoreConn.get_fields(
            res,
            ["id", "content_with_weight"],
        )
        if field_map:
            row_id, row = next(iter(field_map.items()))
            content_before = row.get("content_with_weight") or ""
    except Exception:
        logging.exception(
            "update_wiki_page: lookup failed for kb=%s slug=%s",
            dataset_id,
            full_slug,
        )
    if not row_id:
        return True, None

    content_md = content_md or ""
    rendered, outlinks = _wiki_transform_links(content_md, dataset_id)
    summary = _wiki_extract_summary(rendered) or ""

    try:
        # id-keyed condition forces the partial-update fast path — no
        # newline scrubbing. See the comment above the lookup for the
        # full reasoning.
        ok = settings.docStoreConn.update(
            {"id": row_id},
            {
                "content_with_weight": rendered,
                "summary_with_weight": summary,
                "outlinks_kwd": list(outlinks),
            },
            index_nm,
            dataset_id,
        )
    except Exception:
        logging.exception(
            "update_wiki_page: docStore update failed for kb=%s slug=%s",
            dataset_id,
            full_slug,
        )
        return True, None

    if not ok:
        return True, None

    # Record a file_commit row on every real change. ``record_page_edit``
    # returns None for empty-diff saves, which we silently swallow.
    try:
        FileCommitService.record_page_edit(
            tenant_id=tenant_id,
            kb_id=dataset_id,
            page_type=page_type,
            slug=full_slug,
            content_before=content_before,
            content_after=rendered,
            title=title,
            comments=comments,
            user_id=user_id,
        )
    except Exception:
        logging.exception(
            "update_wiki_page: file_commit record failed for kb=%s slug=%s",
            dataset_id,
            full_slug,
        )

    # Re-read the row so the dialog gets the canonical post-update state.
    return await get_wiki_page(dataset_id, tenant_id, page_type, slug)


# ``list_wiki_commits`` / ``get_wiki_commit`` retired — the two
# ``/datasets/<id>/artifacts/.../commits`` REST endpoints now go through
# the generic file-commit routes (``/datasets/<id>/commits`` with an
# optional ``?slug=`` filter), backed by
# :meth:`FileCommitService.list_page_commits` and
# :meth:`FileCommitService.get_page_commit_detail`.


# All six row types the artifact pipeline writes. Listed in dependency
# order so partial failures of earlier deletes don't leave behind state
# that downstream phases would silently reuse. ``artifact_page_graph``
# is the materialized canvas graph derived from the refined pages —
# the dataset Artifact tab's graph view reads exactly this row.
_WIKI_COMPILE_KWDS = (
    "artifact_map_extract",
    "artifact_reduce_result",
    "artifact_compilation_plan",
    "artifact_page_draft",
    "artifact_page",
    "artifact_entity",
    "artifact_relation",
)

# Tunables for the incremental graph loader. See ``get_wiki_graph``.
_WIKI_GRAPH_ENTITY_KWD = "artifact_entity"
_WIKI_GRAPH_RELATION_KWD = "artifact_relation"
_WIKI_GRAPH_ENTITY_PAGE_SIZE = 32
_WIKI_GRAPH_MAX_LOADING_ENTITY = 128


def _wiki_entity_payload(row: dict) -> dict | None:
    """Project one ``artifact_entity`` ES row onto the canvas entity shape.

    The row stores the canvas payload pre-built as JSON in
    ``content_with_weight``; we parse it back and overlay the columns
    the writer set independently (weight_int, source_chunk_ids) so the
    frontend gets the authoritative numbers regardless of any
    JSON-vs-column drift.
    """
    raw = row.get("content_with_weight") or ""
    payload: dict = {}
    if isinstance(raw, str) and raw.strip():
        try:
            parsed = json.loads(raw)
            if isinstance(parsed, dict):
                payload = parsed
        except Exception:
            pass
    slug = payload.get("slug") or row.get("slug_kwd")
    if not isinstance(slug, str) or not slug:
        return None
    out = {
        "slug": slug,
        "name": payload.get("name") or slug,
        "aliases": list(payload.get("aliases") or []),
        "description": payload.get("description") or "",
        "type": payload.get("type") or "concept",
        "weight": int(row.get("weight_int") or payload.get("weight") or 0),
    }
    source_chunk_ids = row.get("source_chunk_ids") or []
    if isinstance(source_chunk_ids, list):
        out["source_chunk_ids"] = [c for c in source_chunk_ids if isinstance(c, str) and c]
    return out


def _wiki_relation_payload(row: dict) -> dict | None:
    raw = row.get("content_with_weight") or ""
    payload: dict = {}
    if isinstance(raw, str) and raw.strip():
        try:
            parsed = json.loads(raw)
            if isinstance(parsed, dict):
                payload = parsed
        except Exception:
            pass
    src = payload.get("from") or row.get("from_kwd")
    tgt = payload.get("to") or row.get("to_kwd")
    if not isinstance(src, str) or not src or not isinstance(tgt, str) or not tgt:
        return None
    return {"from": src, "to": tgt}


async def _wiki_search_entity_page(
    index_nm,
    dataset_id: str,
    offset: int,
    limit: int,
):
    """One page of artifact_entity rows, ordered by weight_int DESC."""
    from common.doc_store.doc_store_base import OrderByExpr

    order_by = OrderByExpr()
    try:
        order_by.desc("weight_int")
    except Exception:
        order_by = OrderByExpr()

    select_fields = [
        "id",
        "slug_kwd",
        "weight_int",
        "source_chunk_ids",
        "content_with_weight",
    ]
    res = await thread_pool_exec(
        settings.docStoreConn.search,
        select_fields,
        [],
        {"compile_kwd": [_WIKI_GRAPH_ENTITY_KWD]},
        [],
        order_by,
        offset,
        limit,
        index_nm,
        [dataset_id],
    )
    return settings.docStoreConn.get_fields(res, select_fields)


async def _wiki_search_entities_by_slugs(
    index_nm,
    dataset_id: str,
    slugs: list[str],
):
    """Fetch entity rows whose ``slug_kwd`` is in ``slugs``. Unordered."""
    if not slugs:
        return {}

    from common.doc_store.doc_store_base import OrderByExpr

    select_fields = [
        "id",
        "slug_kwd",
        "weight_int",
        "source_chunk_ids",
        "content_with_weight",
    ]
    res = await thread_pool_exec(
        settings.docStoreConn.search,
        select_fields,
        [],
        {
            "compile_kwd": [_WIKI_GRAPH_ENTITY_KWD],
            "slug_kwd": list(slugs),
        },
        [],
        OrderByExpr(),
        0,
        max(len(slugs), 1),
        index_nm,
        [dataset_id],
    )
    return settings.docStoreConn.get_fields(res, select_fields)


async def _wiki_search_relations_from(
    index_nm,
    dataset_id: str,
    from_slugs: list[str],
):
    """Fetch all relation rows with ``from_kwd`` in ``from_slugs``."""
    if not from_slugs:
        return {}

    from common.doc_store.doc_store_base import OrderByExpr

    select_fields = ["id", "from_kwd", "to_kwd", "content_with_weight"]
    # Generous upper bound: relations are short; bulk-pull all matching at
    # once rather than paging.
    res = await thread_pool_exec(
        settings.docStoreConn.search,
        select_fields,
        [],
        {
            "compile_kwd": [_WIKI_GRAPH_RELATION_KWD],
            "from_kwd": list(from_slugs),
        },
        [],
        OrderByExpr(),
        0,
        10000,
        index_nm,
        [dataset_id],
    )
    return settings.docStoreConn.get_fields(res, select_fields)


async def get_wiki_graph(
    dataset_id: str,
    tenant_id: str,
    node: str | None = None,
):
    """Load the canvas graph payload incrementally from per-row data.

    Two modes:

    * **Overview** (``node`` is None) — paginate ``artifact_entity`` rows
      ordered by ``weight_int DESC`` in pages of
      ``_WIKI_GRAPH_ENTITY_PAGE_SIZE``. For each page, append entities
      to a running set while the **cumulative** weight stays within
      ``_WIKI_GRAPH_MAX_LOADING_ENTITY``. Pull ``artifact_relation``
      rows whose ``from_kwd`` is in the just-added entities; pull the
      ``to`` targets that we haven't seen yet (they count toward the same
      cap). Stop once the cap is hit, or the page is empty, or no entry
      from the page fit under the budget.

    * **Click** (``node`` is a slug) — load the centre entity (always
      included), pull every ``artifact_relation`` with ``from_kwd=node``,
      then pull the ``to`` entities. Capped at
      ``_WIKI_GRAPH_MAX_LOADING_ENTITY`` for hub-node safety.

    Returns ``(True, {"entities": [...], "relations": [...]})`` shaped
    exactly as the frontend ``ForceGraph`` adapter consumes, or
    ``(False, message)`` on authorization failure.
    """
    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."
    _, kb = KnowledgebaseService.get_by_id(dataset_id)

    empty = {"entities": [], "relations": []}

    pack = _wiki_index_or_none(kb.tenant_id, dataset_id)
    if pack is None:
        return True, empty
    index_nm, _ = pack

    cap = _WIKI_GRAPH_MAX_LOADING_ENTITY
    page_size = _WIKI_GRAPH_ENTITY_PAGE_SIZE

    # ``entities`` preserves first-seen order so the canvas paints the
    # heaviest-weighted nodes first (or, in click mode, the centre node
    # first). The dict-keyed-by-slug structure also deduplicates the
    # "B is a to-target AND later a high-weight entity in its own right"
    # case cheaply.
    entities: dict[str, dict] = {}
    relations: list[dict] = []
    relation_keys: set[tuple[str, str]] = set()

    def _add_entity(payload: dict) -> bool:
        slug = payload.get("slug")
        if not isinstance(slug, str) or not slug or slug in entities:
            return False
        entities[slug] = payload
        return True

    def _add_relation(payload: dict) -> None:
        key = (payload["from"], payload["to"])
        if key in relation_keys:
            return
        relation_keys.add(key)
        relations.append(payload)

    # ---- Flow B — click expansion centred on ``node``. ----------------
    if isinstance(node, str) and node.strip():
        center_slug = node.strip()
        try:
            field_map = await _wiki_search_entities_by_slugs(
                index_nm,
                dataset_id,
                [center_slug],
            )
        except Exception:
            logging.exception(
                "get_wiki_graph: centre lookup failed kb=%s node=%s",
                dataset_id,
                center_slug,
            )
            return True, empty

        for row in (field_map or {}).values():
            payload = _wiki_entity_payload(row)
            if payload:
                _add_entity(payload)
                break

        if center_slug not in entities:
            # Caller pointed at a slug that doesn't exist; return empty
            # rather than a confusing partial graph.
            return True, empty

        # Outgoing edges from the centre, capped by MAX_LOADING_ENTITY.
        try:
            rel_map = await _wiki_search_relations_from(
                index_nm,
                dataset_id,
                [center_slug],
            )
        except Exception:
            logging.exception(
                "get_wiki_graph: relation lookup failed kb=%s node=%s",
                dataset_id,
                center_slug,
            )
            return True, {"entities": list(entities.values()), "relations": []}

        to_slugs: list[str] = []
        for row in (rel_map or {}).values():
            payload = _wiki_relation_payload(row)
            if payload is None:
                continue
            if payload["from"] != center_slug:
                continue
            # Hub-node cap: stop accepting more relations once the
            # to-target set would push us over the entity budget.
            if payload["to"] not in entities and len(entities) + len(to_slugs) >= cap:
                continue
            _add_relation(payload)
            if payload["to"] != center_slug and payload["to"] not in entities:
                if payload["to"] not in to_slugs:
                    to_slugs.append(payload["to"])

        if to_slugs:
            try:
                to_map = await _wiki_search_entities_by_slugs(
                    index_nm,
                    dataset_id,
                    to_slugs,
                )
            except Exception:
                logging.exception(
                    "get_wiki_graph: neighbour lookup failed kb=%s node=%s",
                    dataset_id,
                    center_slug,
                )
                to_map = {}
            for row in (to_map or {}).values():
                payload = _wiki_entity_payload(row)
                if payload and len(entities) < cap:
                    _add_entity(payload)

        return True, {
            "entities": list(entities.values()),
            "relations": relations,
        }

    # ---- Flow A — overview, top-weight paged with cumulative budget. ---
    cumulative_weight = 0
    page = 1
    while len(entities) < cap:
        offset = (page - 1) * page_size
        try:
            field_map = await _wiki_search_entity_page(
                index_nm,
                dataset_id,
                offset,
                page_size,
            )
        except Exception:
            logging.exception(
                "get_wiki_graph: entity page fetch failed kb=%s page=%d",
                dataset_id,
                page,
            )
            break
        if not field_map:
            break

        # Preserve weight_int DESC order from ES. Iteration over a dict
        # produced by get_fields keeps insertion order; ES returned them
        # sorted, so we can rely on that.
        page_rows = list(field_map.values())

        e_sub: list[dict] = []
        for row in page_rows:
            payload = _wiki_entity_payload(row)
            if payload is None:
                continue
            if payload["slug"] in entities:
                continue
            w = max(0, int(payload.get("weight") or 0))
            # Step 2: cumulative across the whole flow (per the spec).
            # Stop when adding this entry would push the budget over.
            # If even the first entity on a page can't fit, we exit the
            # outer loop below; this preserves the "least-weight first
            # excluded" semantics.
            # if cumulative_weight + w > cap and len(entities) + len(e_sub) > 0:
            #    break
            cumulative_weight += w
            e_sub.append(payload)
            if len(entities) + len(e_sub) >= cap:
                break

        if not e_sub:
            break

        for payload in e_sub:
            _add_entity(payload)

        # Step 3: relations originating in E_sub.
        sub_slugs = [p["slug"] for p in e_sub]
        try:
            rel_map = await _wiki_search_relations_from(
                index_nm,
                dataset_id,
                sub_slugs,
            )
        except Exception:
            logging.exception(
                "get_wiki_graph: relation page fetch failed kb=%s",
                dataset_id,
            )
            rel_map = {}

        missing_to: list[str] = []
        for row in (rel_map or {}).values():
            payload = _wiki_relation_payload(row)
            if payload is None:
                continue
            _add_relation(payload)
            if payload["to"] not in entities and payload["to"] not in missing_to:
                missing_to.append(payload["to"])

        # Step 4: hydrate the to-targets (they count toward the cap).
        if missing_to:
            try:
                to_map = await _wiki_search_entities_by_slugs(
                    index_nm,
                    dataset_id,
                    missing_to,
                )
            except Exception:
                logging.exception(
                    "get_wiki_graph: to-target hydrate failed kb=%s",
                    dataset_id,
                )
                to_map = {}
            for row in (to_map or {}).values():
                if len(entities) >= cap:
                    break
                payload = _wiki_entity_payload(row)
                if payload:
                    _add_entity(payload)

        # Step 5: page forward only if the cap allows another iteration.
        if len(entities) >= cap or len(page_rows) < page_size:
            break
        page += 1

    return True, {
        "entities": list(entities.values()),
        "relations": relations,
    }


async def clear_wiki(dataset_id: str, tenant_id: str):
    """Wipe every artifact-related row from ES for this KB.

    Touches all five ``compile_kwd`` row types the artifact pipeline writes
    (MAP extracts, REDUCE results, PLAN output, page drafts, and the
    searchable artifact_page rows). After this completes the next "Artifact"
    run starts from a clean slate — no resume cache to short-circuit MAP, no
    prior pages to reconcile against in PLAN.

    Returns ``(True, {"deleted": {kwd: count_or_True}})`` on success or
    ``(False, str)`` on auth failure.
    """
    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
        return False, "No authorization."
    _, kb = KnowledgebaseService.get_by_id(dataset_id)

    pack = _wiki_index_or_none(kb.tenant_id, dataset_id)
    if pack is None:
        return True, {"deleted": {}}
    index_nm, _ = pack

    deleted: dict[str, object] = {}
    for kwd in _WIKI_COMPILE_KWDS:
        try:
            res = settings.docStoreConn.delete(
                {"compile_kwd": kwd},
                index_nm,
                dataset_id,
            )
            # Different backends return different shapes (int count, dict,
            # bool). Surface whatever we got so the caller can log it.
            deleted[kwd] = res if res is not None else True
        except Exception:
            logging.exception(
                "clear_wiki: delete failed for kwd=%s kb=%s",
                kwd,
                dataset_id,
            )
            deleted[kwd] = False

    return True, {"deleted": deleted}