rag/svr/task_executor.py

#
#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
import argparse
import time

from rag.svr.task_executor_refactor.task_manager import TaskManager
from rag.svr.task_executor_refactor.recording_context import timed_with_recording, get_recording_context, RecordingContext, set_recording_context, NullRecordingContext

start_ts = time.time()

# LiteLLM fetches a model cost map from GitHub during import unless this is set.
# Parser pods should not block startup on external network access.
import os

os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "True")  # no internet, save about 10s

from common.misc_utils import thread_pool_exec

import asyncio
import socket

# from beartype import BeartypeConf
# from beartype.claw import beartype_all  # <-- you didn't sign up for this
# beartype_all(conf=BeartypeConf(violation_type=UserWarning))    # <-- emit warnings from all code
import random
import sys
import threading

from api.db import PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
from api.db.joint_services.memory_message_service import handle_save_to_memory_task
from common.connection_utils import timeout
from common.metadata_utils import turn2jsonschema, update_metadata_to
from rag.utils.base64_image import image2id
from rag.utils.raptor_utils import (
    collect_raptor_chunk_ids,
    collect_raptor_methods,
    get_raptor_clustering_method,
    get_raptor_tree_builder,
    get_skip_reason,
    make_raptor_summary_chunk_id,
    should_skip_raptor,
)
from common.log_utils import init_root_logger
from common.config_utils import show_configs
from rag.graphrag.utils import get_llm_cache, set_llm_cache, get_tags_from_cache, set_tags_to_cache
from rag.prompts.generator import keyword_extraction, question_proposal, content_tagging, run_toc_from_text, gen_metadata
import logging
import os
from datetime import datetime
import json
import xxhash
import copy
import re
from functools import partial
from multiprocessing.context import TimeoutError
from timeit import default_timer as timer
import signal
import exceptiongroup
import faulthandler
import numpy as np
from peewee import DoesNotExist
from common.constants import LLMType, ParserType, PipelineTaskType
from api.db.services.document_service import DocumentService
from api.db.services.doc_metadata_service import DocMetadataService
from api.db.services.llm_service import LLMBundle
from api.db.services.task_service import TaskService, has_canceled, CANVAS_DEBUG_DOC_ID, GRAPH_RAPTOR_FAKE_DOC_ID
from api.db.services.file2document_service import File2DocumentService
from api.db.joint_services.tenant_model_service import get_tenant_default_model_by_type, get_model_config_from_provider_instance
from common.versions import get_ragflow_version
from api.db.db_models import close_connection
from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, email, tag
from rag.nlp import search, rag_tokenizer, add_positions
from rag.advanced_rag.knowlege_compile.raptor import (
    RAPTOR_TREE_BUILDER,
)
from common.token_utils import num_tokens_from_string, truncate
from rag.utils.redis_conn import REDIS_CONN, RedisDistributedLock
from rag.graphrag.utils import chat_limiter
from common.signal_utils import start_tracemalloc_and_snapshot, stop_tracemalloc
from common.exceptions import TaskCanceledException
from rag.svr.task_executor_limiter import (
    task_limiter,
    chunk_limiter,
    embed_limiter,
    minio_limiter,
    kg_limiter,
)
from common import settings
from common.constants import PAGERANK_FLD, TAG_FLD, SVR_CONSUMER_GROUP_NAME
from rag.utils.table_es_metadata import (
    aggregate_table_doc_metadata,
    merge_table_parser_config_from_kb,
    table_parser_strip_doc_metadata_keys,
)
from rag.nlp import search as nlp_search

BATCH_SIZE = 64


FACTORY = {
    "general": naive,
    ParserType.NAIVE.value: naive,
    ParserType.PAPER.value: paper,
    ParserType.BOOK.value: book,
    ParserType.PRESENTATION.value: presentation,
    ParserType.MANUAL.value: manual,
    ParserType.LAWS.value: laws,
    ParserType.QA.value: qa,
    ParserType.TABLE.value: table,
    ParserType.RESUME.value: resume,
    ParserType.PICTURE.value: picture,
    ParserType.ONE.value: one,
    ParserType.AUDIO.value: audio,
    ParserType.EMAIL.value: email,
    ParserType.KG.value: naive,
    ParserType.TAG.value: tag,
}

TASK_TYPE_TO_PIPELINE_TASK_TYPE = {
    "dataflow": PipelineTaskType.PARSE,
    "raptor": PipelineTaskType.RAPTOR,
    "graphrag": PipelineTaskType.GRAPH_RAG,
    "mindmap": PipelineTaskType.MINDMAP,
    "memory": PipelineTaskType.MEMORY,
    "artifact": PipelineTaskType.ARTIFACT,
    "skill": PipelineTaskType.SKILL,
}

UNACKED_ITERATOR = None
# Task type and executor index (consistent with SAAS version)
TASK_TYPE = "common"
TE_IDX = "0"

BOOT_AT = datetime.now().astimezone().isoformat(timespec="milliseconds")
PENDING_TASKS = 0
LAG_TASKS = 0
DONE_TASKS = 0
FAILED_TASKS = 0

CURRENT_TASKS = {}

WORKER_HEARTBEAT_TIMEOUT = int(os.environ.get("WORKER_HEARTBEAT_TIMEOUT", "120"))
stop_event = threading.Event()


def signal_handler(sig, frame):
    logging.info("Received interrupt signal, shutting down...")
    stop_event.set()
    time.sleep(1)
    sys.exit(0)


def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing..."):
    try:
        if prog is not None and prog < 0:
            msg = "[ERROR]" + msg
        cancel = has_canceled(task_id)

        if cancel:
            msg += " [Canceled]"
            prog = -1

        if to_page > 0:
            if msg:
                if from_page < to_page:
                    msg = f"Page({from_page + 1}~{to_page + 1}): " + msg
        if msg:
            msg = datetime.now().strftime("%H:%M:%S") + " " + msg
        d = {"progress_msg": msg}
        if prog is not None:
            d["progress"] = prog

        TaskService.update_progress(task_id, d)

        close_connection()
        if cancel:
            raise TaskCanceledException(msg)
        logging.info(f"set_progress({task_id}), progress: {prog}, progress_msg: {msg}")
    except TaskCanceledException:
        raise
    except DoesNotExist:
        logging.warning(f"set_progress({task_id}) got exception DoesNotExist")
    except Exception as e:
        logging.exception(f"set_progress({task_id}), progress: {prog}, progress_msg: {msg}, got exception: {e}")


async def collect():
    global CONSUMER_NAME, DONE_TASKS, FAILED_TASKS
    global UNACKED_ITERATOR

    svr_queue_names = settings.get_svr_queue_names(TASK_TYPE)

    redis_msg = None
    try:
        if not UNACKED_ITERATOR:
            UNACKED_ITERATOR = REDIS_CONN.get_unacked_iterator(svr_queue_names, SVR_CONSUMER_GROUP_NAME, CONSUMER_NAME)
        try:
            redis_msg = next(UNACKED_ITERATOR)
        except StopIteration:
            for svr_queue_name in svr_queue_names:
                redis_msg = REDIS_CONN.queue_consumer(svr_queue_name, SVR_CONSUMER_GROUP_NAME, CONSUMER_NAME)
                if redis_msg:
                    break
    except Exception as e:
        logging.exception(f"collect got exception: {e}")
        return None, None

    if not redis_msg:
        return None, None
    msg = redis_msg.get_message()
    if not msg:
        logging.error(f"collect got empty message of {redis_msg.get_msg_id()}")
        redis_msg.ack()
        return None, None

    canceled = False
    if msg.get("doc_id", "") in [GRAPH_RAPTOR_FAKE_DOC_ID, CANVAS_DEBUG_DOC_ID]:
        task = msg
        if task["task_type"] in PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES:
            task = TaskService.get_task(msg["id"], msg["doc_ids"])
            if task:
                task["doc_id"] = msg["doc_id"]
                task["doc_ids"] = msg.get("doc_ids", []) or []
    elif msg.get("task_type") == PipelineTaskType.MEMORY.lower():
        _, task_obj = TaskService.get_by_id(msg["id"])
        task = task_obj.to_dict()
    else:
        task = TaskService.get_task(msg["id"])

    if task:
        canceled = has_canceled(task["id"])
    if not task or canceled:
        state = "is unknown" if not task else "has been cancelled"
        FAILED_TASKS += 1
        logging.warning(f"collect task {msg['id']} {state}")
        redis_msg.ack()
        return None, None

    task_type = msg.get("task_type", "")
    task["task_type"] = task_type
    # Per-doc fan-out task types (today: doc-scoped raptor) carry their
    # participating doc id list on the Redis message but not on the DB
    # row. The KB-scoped branch above already does this for FAKE doc
    # tasks; mirror here so ``ctx.doc_ids`` is populated for the
    # per-doc path too.
    if "doc_ids" in msg and not task.get("doc_ids"):
        task["doc_ids"] = msg.get("doc_ids", []) or []
    if task_type[:8] == "dataflow":
        task["tenant_id"] = msg["tenant_id"]
        task["dataflow_id"] = msg["dataflow_id"]
        task["kb_id"] = msg.get("kb_id", "")
    if task_type[:6] == "memory":
        task["memory_id"] = msg["memory_id"]
        if msg.get("tenant_id"):
            task["tenant_id"] = msg["tenant_id"]
        task["source_id"] = msg["source_id"]
        task["message_dict"] = msg["message_dict"]
    return redis_msg, task


async def get_storage_binary(bucket, name):
    return await thread_pool_exec(settings.STORAGE_IMPL.get, bucket, name)


@timed_with_recording
@timeout(60 * 80, 1)
async def build_chunks(task, progress_callback):
    if task["size"] > settings.DOC_MAXIMUM_SIZE:
        set_progress(task["id"], prog=-1, msg="File size exceeds( <= %dMb )" % (int(settings.DOC_MAXIMUM_SIZE / 1024 / 1024)))
        get_recording_context().record("file_size_exceeded", True)
        return []
    get_recording_context().record("file_size_exceeded", False)
    get_recording_context().record("parser_id", task["parser_id"])

    chunker = FACTORY[task["parser_id"].lower()]
    try:
        st = timer()
        bucket, name = File2DocumentService.get_storage_address(doc_id=task["doc_id"])
        binary = await get_storage_binary(bucket, name)
        if binary is None:
            raise FileNotFoundError(f"File not found: storage returned no content for {bucket}/{name}.")
        logging.info("From minio({}) {}/{}".format(timer() - st, task["location"], task["name"]))
    except TimeoutError:
        progress_callback(-1, "Internal server error: Fetch file from minio timeout. Could you try it again.")
        logging.exception("Minio {}/{} got timeout: Fetch file from minio timeout.".format(task["location"], task["name"]))
        raise
    except Exception as e:
        if re.search("(No such file|not found)", str(e)):
            progress_callback(-1, "Can not find file <%s> from minio. Could you try it again?" % task["name"])
        else:
            progress_callback(-1, "Get file from minio: %s" % str(e).replace("'", ""))
        logging.exception("Chunking {}/{} got exception".format(task["location"], task["name"]))
        raise

    # Table parser column roles / mode are stored on the dataset (KB) parser_config;
    # chunk tasks carry document-level parser_config only — merge KB keys so manual roles apply.
    parser_config_for_chunk = merge_table_parser_config_from_kb(task)
    if task.get("parser_id", "").lower() == "table" and task.get("kb_parser_config"):
        logging.debug(
            "[TASK_EXECUTOR_DEBUG] table parser: merged KB keys into parser_config for chunk; "
            f"mode={parser_config_for_chunk.get('table_column_mode')}, "
            f"roles_keys={list((parser_config_for_chunk.get('table_column_roles') or {}).keys())}"
        )

    # Record chunk configuration for comparison
    from common.float_utils import normalize_overlapped_percent

    chunk_config = {
        "parser_id": task["parser_id"],
        "chunk_token_num": parser_config_for_chunk.get("chunk_token_num", 128),
        "overlapped_percent": normalize_overlapped_percent(parser_config_for_chunk.get("overlapped_percent", 0)),
        "delimiter": parser_config_for_chunk.get("delimiter", "\n!?。；！？"),
        "from_page": task["from_page"],
        "to_page": task["to_page"],
        "language": task["language"],
        "layout_recognizer": parser_config_for_chunk.get("layout_recognizer"),
    }
    get_recording_context().record("chunk_config", chunk_config)
    get_recording_context().record("parser_config_after_merge", parser_config_for_chunk)

    try:
        async with chunk_limiter:
            task_language = task.get("language") or "Chinese"
            cks = await thread_pool_exec(
                chunker.chunk,
                task["name"],
                binary=binary,
                from_page=task["from_page"],
                to_page=task["to_page"],
                lang=task_language,
                callback=progress_callback,
                kb_id=task["kb_id"],
                parser_config=parser_config_for_chunk,
                tenant_id=task["tenant_id"],
            )
        logging.info("Chunking({}) {}/{} done".format(timer() - st, task["location"], task["name"]))
    except TaskCanceledException:
        raise
    except Exception as e:
        progress_callback(-1, "Internal server error while chunking: %s" % str(e).replace("'", ""))
        logging.exception("Chunking {}/{} got exception".format(task["location"], task["name"]))
        raise

    # Record raw chunks for comparison
    get_recording_context().record("raw_chunks", cks)

    # Extract and persist PDF outline if the parser attached it.
    outline_data = cks[0].get("__outline__") if cks else None
    get_recording_context().record("outline_data", outline_data)

    if cks and cks[0].get("__outline__"):
        outline = cks[0].pop("__outline__")
        try:
            ret = DocMetadataService.update_document_metadata(task["doc_id"], update_metadata_to({"outline": outline}, DocMetadataService.get_document_metadata(task["doc_id"]) or {}))
            get_recording_context().save_func_return_value("DocMetadataService.update_document_metadata", ret)
            logging.info("Persisted PDF outline (%d entries) for doc %s", len(outline), task["doc_id"])
        except Exception as e:
            logging.warning("Failed to persist PDF outline for doc %s: %s", task["doc_id"], e)

    docs = []
    doc = {"doc_id": task["doc_id"], "kb_id": str(task["kb_id"])}
    if task["pagerank"]:
        doc[PAGERANK_FLD] = int(task["pagerank"])
    st = timer()

    @timeout(60)
    async def upload_to_minio(document, chunk):
        try:
            d = copy.deepcopy(document)
            d.update(chunk)
            d["id"] = xxhash.xxh64((chunk["content_with_weight"] + str(d["doc_id"])).encode("utf-8", "surrogatepass")).hexdigest()
            d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
            d["create_timestamp_flt"] = datetime.now().timestamp()

            if d.get("img_id"):
                docs.append(d)
                return

            if not d.get("image"):
                _ = d.pop("image", None)
                d["img_id"] = ""
                docs.append(d)
                return
            await image2id(d, partial(settings.STORAGE_IMPL.put, tenant_id=task["tenant_id"]), d["id"], task["kb_id"])
            docs.append(d)
        except Exception:
            logging.exception("Saving image of chunk {}/{}/{} got exception".format(task["location"], task["name"], d["id"]))
            raise

    tasks = []
    for ck in cks:
        tasks.append(asyncio.create_task(upload_to_minio(doc, ck)))
    try:
        await asyncio.gather(*tasks, return_exceptions=False)
    except Exception as e:
        logging.error(f"MINIO PUT({task['name']}) got exception: {e}")
        for t in tasks:
            t.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)
        raise

    el = timer() - st
    logging.info("MINIO PUT({}) cost {:.3f} s".format(task["name"], el))

    # Record docs after MinIO upload
    get_recording_context().record("docs_after_prep", docs)

    if task["parser_config"].get("auto_keywords", 0):
        st = timer()
        progress_callback(msg="Start to generate keywords for every chunk ...")
        chat_model_config = get_model_config_from_provider_instance(task["tenant_id"], LLMType.CHAT, task["llm_id"])
        chat_mdl = LLMBundle(task["tenant_id"], chat_model_config, lang=task["language"])

        async def doc_keyword_extraction(chat_mdl, d, topn):
            cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], "keywords", {"topn": topn})
            if not cached:
                if has_canceled(task["id"]):
                    progress_callback(-1, msg="Task has been canceled.")
                    return
                async with chat_limiter:
                    cached = await keyword_extraction(chat_mdl, d["content_with_weight"], topn)
                set_llm_cache(chat_mdl.llm_name, d["content_with_weight"], cached, "keywords", {"topn": topn})
            if cached:
                d["important_kwd"] = [k for k in re.split(r"[,，;；、\r\n]+", cached) if k.strip()]
                d["important_tks"] = rag_tokenizer.tokenize(" ".join(d["important_kwd"]))
            return

        tasks = []
        for d in docs:
            tasks.append(asyncio.create_task(doc_keyword_extraction(chat_mdl, d, task["parser_config"]["auto_keywords"])))
        try:
            await asyncio.gather(*tasks, return_exceptions=False)
        except Exception as e:
            logging.error("Error in doc_keyword_extraction: {}".format(e))
            for t in tasks:
                t.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)
            raise
        progress_callback(msg="Keywords generation {} chunks completed in {:.2f}s".format(len(docs), timer() - st))

    # Record keywords extraction count
    keywords = [d for d in docs if d.get("important_kwd")]
    get_recording_context().record("keywords_extracted", keywords)

    if task["parser_config"].get("auto_questions", 0):
        st = timer()
        progress_callback(msg="Start to generate questions for every chunk ...")
        chat_model_config = get_model_config_from_provider_instance(task["tenant_id"], LLMType.CHAT, task["llm_id"])
        chat_mdl = LLMBundle(task["tenant_id"], chat_model_config, lang=task["language"])

        async def doc_question_proposal(chat_mdl, d, topn):
            cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], "question", {"topn": topn})
            if not cached:
                if has_canceled(task["id"]):
                    progress_callback(-1, msg="Task has been canceled.")
                    return
                async with chat_limiter:
                    cached = await question_proposal(chat_mdl, d["content_with_weight"], topn)
                set_llm_cache(chat_mdl.llm_name, d["content_with_weight"], cached, "question", {"topn": topn})
            if cached:
                d["question_kwd"] = cached.split("\n")
                d["question_tks"] = rag_tokenizer.tokenize("\n".join(d["question_kwd"]))

        tasks = []
        for d in docs:
            tasks.append(asyncio.create_task(doc_question_proposal(chat_mdl, d, task["parser_config"]["auto_questions"])))
        try:
            await asyncio.gather(*tasks, return_exceptions=False)
        except Exception as e:
            logging.error("Error in doc_question_proposal", exc_info=e)
            for t in tasks:
                t.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)
            raise
        progress_callback(msg="Question generation {} chunks completed in {:.2f}s".format(len(docs), timer() - st))

    # Record question generation
    questions = [d for d in docs if d.get("question_kwd")]
    get_recording_context().record("questions_generated", questions)

    if task["parser_config"].get("enable_metadata", False) and (task["parser_config"].get("metadata") or task["parser_config"].get("built_in_metadata")):
        st = timer()
        progress_callback(msg="Start to generate meta-data for every chunk ...")
        chat_model_config = get_model_config_from_provider_instance(task["tenant_id"], LLMType.CHAT, task["llm_id"])
        chat_mdl = LLMBundle(task["tenant_id"], chat_model_config, lang=task["language"])

        async def gen_metadata_task(chat_mdl, d):
            metadata_conf = task["parser_config"].get("metadata", [])
            built_in_metadata = list(task["parser_config"].get("built_in_metadata") or [])
            if isinstance(metadata_conf, dict):
                if not isinstance(metadata_conf.get("properties"), dict):
                    metadata_conf = {"type": "object", "properties": {}}
                if built_in_metadata:
                    metadata_conf = {
                        **metadata_conf,
                        "properties": {
                            **metadata_conf.get("properties", {}),
                            **turn2jsonschema(built_in_metadata).get("properties", {}),
                        },
                    }
            elif isinstance(metadata_conf, list):
                metadata_conf = metadata_conf + built_in_metadata
            else:
                metadata_conf = built_in_metadata
            cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], "metadata", metadata_conf)
            if not cached:
                if has_canceled(task["id"]):
                    progress_callback(-1, msg="Task has been canceled.")
                    return
                async with chat_limiter:
                    cached = await gen_metadata(chat_mdl, turn2jsonschema(metadata_conf), d["content_with_weight"])
                set_llm_cache(chat_mdl.llm_name, d["content_with_weight"], cached, "metadata", metadata_conf)
            if cached:
                d["metadata_obj"] = cached

        tasks = []
        for d in docs:
            tasks.append(asyncio.create_task(gen_metadata_task(chat_mdl, d)))
        try:
            await asyncio.gather(*tasks, return_exceptions=False)
        except Exception as e:
            logging.error("Error in doc_question_proposal", exc_info=e)
            for t in tasks:
                t.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)
            raise
        metadata = {}
        for doc in docs:
            metadata = update_metadata_to(metadata, doc["metadata_obj"])
            del doc["metadata_obj"]
        if metadata:
            existing_meta = DocMetadataService.get_document_metadata(task["doc_id"])
            existing_meta = existing_meta if isinstance(existing_meta, dict) else {}
            metadata = update_metadata_to(metadata, existing_meta)
            ret = DocMetadataService.update_document_metadata(task["doc_id"], metadata)
            get_recording_context().save_func_return_value("DocMetadataService.update_document_metadata", ret)
        progress_callback(msg="Question generation {} chunks completed in {:.2f}s".format(len(docs), timer() - st))

    # Record metadata generation count
    metadata_list = [d for d in docs if d.get("metadata_obj")]
    get_recording_context().record("metadata_list_generated", metadata_list)

    if task["kb_parser_config"].get("tag_kb_ids", []):
        progress_callback(msg="Start to tag for every chunk ...")
        kb_ids = task["kb_parser_config"]["tag_kb_ids"]
        tenant_id = task["tenant_id"]
        topn_tags = task["kb_parser_config"].get("topn_tags", 3)
        S = 1000
        st = timer()
        examples = []
        all_tags = get_tags_from_cache(kb_ids)
        if not all_tags:
            all_tags = settings.retriever.all_tags_in_portion(tenant_id, kb_ids, S)
            set_tags_to_cache(kb_ids, all_tags)
        else:
            all_tags = json.loads(all_tags)
        chat_model_config = get_model_config_from_provider_instance(tenant_id, LLMType.CHAT, task["llm_id"])
        chat_mdl = LLMBundle(task["tenant_id"], chat_model_config, lang=task["language"])

        docs_to_tag = []
        for d in docs:
            task_canceled = has_canceled(task["id"])
            if task_canceled:
                progress_callback(-1, msg="Task has been canceled.")
                return None
            if settings.retriever.tag_content(tenant_id, kb_ids, d, all_tags, topn_tags=topn_tags, S=S) and len(d[TAG_FLD]) > 0:
                examples.append({"content": d["content_with_weight"], TAG_FLD: d[TAG_FLD]})
            else:
                docs_to_tag.append(d)

        async def doc_content_tagging(chat_mdl, d, topn_tags):
            cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], all_tags, {"topn": topn_tags})
            if not cached:
                if has_canceled(task["id"]):
                    progress_callback(-1, msg="Task has been canceled.")
                    return
                picked_examples = random.choices(examples, k=2) if len(examples) > 2 else examples
                if not picked_examples:
                    picked_examples.append({"content": "This is an example", TAG_FLD: {"example": 1}})
                async with chat_limiter:
                    cached = await content_tagging(
                        chat_mdl,
                        d["content_with_weight"],
                        all_tags,
                        picked_examples,
                        topn_tags,
                    )
                if cached:
                    cached = json.dumps(cached)
            if cached:
                set_llm_cache(chat_mdl.llm_name, d["content_with_weight"], cached, all_tags, {"topn": topn_tags})
                d[TAG_FLD] = json.loads(cached)

        tasks = []
        for d in docs_to_tag:
            tasks.append(asyncio.create_task(doc_content_tagging(chat_mdl, d, topn_tags)))
        try:
            await asyncio.gather(*tasks, return_exceptions=False)
        except Exception as e:
            logging.error("Error tagging docs: {}".format(e))
            for t in tasks:
                t.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)
            raise
        progress_callback(msg="Tagging {} chunks completed in {:.2f}s".format(len(docs), timer() - st))

    # Record tags applied
    tags_applied = [d for d in docs if d.get(TAG_FLD)]
    get_recording_context().record("tags_applied", tags_applied)

    # Record final chunks for comparison
    get_recording_context().record("final_chunks", docs)
    final_chunk_ids = [c.get("id") for c in docs if isinstance(c, dict) and "id" in c]
    get_recording_context().record("final_chunk_ids_count", len(final_chunk_ids))

    return docs


@timed_with_recording
def build_TOC(task, docs, progress_callback):
    progress_callback(msg="Start to generate table of content ...")
    chat_model_config = get_model_config_from_provider_instance(task["tenant_id"], LLMType.CHAT, task["llm_id"])
    chat_mdl = LLMBundle(task["tenant_id"], chat_model_config, lang=task["language"])
    docs = sorted(
        docs,
        key=lambda d: (
            d.get("page_num_int", 0)[0] if isinstance(d.get("page_num_int", 0), list) else d.get("page_num_int", 0),
            d.get("top_int", 0)[0] if isinstance(d.get("top_int", 0), list) else d.get("top_int", 0),
        ),
    )
    toc: list[dict] = asyncio.run(run_toc_from_text([d["content_with_weight"] for d in docs], chat_mdl, progress_callback))
    logging.info("------------ T O C -------------\n" + json.dumps(toc, ensure_ascii=False, indent="  "))
    for ii, item in enumerate(toc):
        try:
            chunk_val = item.pop("chunk_id", None)
            if chunk_val is None or str(chunk_val).strip() == "":
                logging.warning(f"Index {ii}: chunk_id is missing or empty. Skipping.")
                continue
            curr_idx = int(chunk_val)
            if curr_idx >= len(docs):
                logging.error(f"Index {ii}: chunk_id {curr_idx} exceeds docs length {len(docs)}.")
                continue
            item["ids"] = [docs[curr_idx]["id"]]
            if ii + 1 < len(toc):
                next_chunk_val = toc[ii + 1].get("chunk_id", "")
                if str(next_chunk_val).strip() != "":
                    next_idx = int(next_chunk_val)
                    for jj in range(curr_idx + 1, min(next_idx + 1, len(docs))):
                        item["ids"].append(docs[jj]["id"])
                else:
                    logging.warning(f"Index {ii + 1}: next chunk_id is empty, range fill skipped.")
        except (ValueError, TypeError) as e:
            logging.error(f"Index {ii}: Data conversion error - {e}")
        except Exception as e:
            logging.exception(f"Index {ii}: Unexpected error - {e}")

    if toc:
        d = copy.deepcopy(docs[-1])
        d["content_with_weight"] = json.dumps(toc, ensure_ascii=False)
        d["toc_kwd"] = "toc"
        d["available_int"] = 0
        d["page_num_int"] = [100000000]
        d["id"] = xxhash.xxh64((d["content_with_weight"] + str(d["doc_id"])).encode("utf-8", "surrogatepass")).hexdigest()
        return d
    return None


def init_kb(row, vector_size: int):
    idxnm = search.index_name(row["tenant_id"])
    parser_id = row.get("parser_id", None)
    return settings.docStoreConn.create_idx(idxnm, row.get("kb_id", ""), vector_size, parser_id)


@timed_with_recording
async def embedding(docs, mdl, parser_config=None, callback=None):
    if parser_config is None:
        parser_config = {}
    tts, cnts = [], []
    for d in docs:
        tts.append(d.get("docnm_kwd", "Title"))
        c = "\n".join(d.get("question_kwd", []))
        if not c:
            c = d["content_with_weight"]
        c = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", c)
        if not c.strip():
            logging.debug("embedding(): normalized whitespace-only chunk to placeholder 'None' (len=%d)", len(c))
            c = "None"
        cnts.append(c)

    tk_count = 0
    if len(tts) == len(cnts):
        vts, c = await thread_pool_exec(mdl.encode, tts[0:1])
        tts = np.tile(vts[0], (len(cnts), 1))
        tk_count += c

    @timeout(60)
    def batch_encode(txts):
        nonlocal mdl
        return mdl.encode([truncate(c, mdl.max_length - 10) for c in txts])

    cnts_batches = []
    for i in range(0, len(cnts), settings.EMBEDDING_BATCH_SIZE):
        async with embed_limiter:
            vts, c = await thread_pool_exec(batch_encode, cnts[i : i + settings.EMBEDDING_BATCH_SIZE])
        cnts_batches.append(vts)
        tk_count += c
        callback(prog=0.7 + 0.2 * (i + 1) / len(cnts), msg="")
    cnts = np.vstack(cnts_batches) if cnts_batches else np.array([])
    filename_embd_weight = parser_config.get("filename_embd_weight", 0.1)  # due to the db support none value
    if not filename_embd_weight:
        filename_embd_weight = 0.1
    title_w = float(filename_embd_weight)
    if tts.ndim == 2 and cnts.ndim == 2 and tts.shape == cnts.shape:
        vects = title_w * tts + (1 - title_w) * cnts
    else:
        vects = cnts

    assert len(vects) == len(docs)
    vector_size = 0
    for i, d in enumerate(docs):
        v = vects[i].tolist()
        vector_size = len(v)
        d["q_%d_vec" % len(v)] = v
    return tk_count, vector_size


@timed_with_recording
async def run_dataflow(task: dict):
    from api.db.services.canvas_service import UserCanvasService
    from rag.flow.pipeline import Pipeline

    task_start_ts = timer()
    dataflow_id = task["dataflow_id"]
    doc_id = task["doc_id"]
    task_id = task["id"]
    task_dataset_id = task["kb_id"]

    if task["task_type"] == "dataflow":
        e, cvs = UserCanvasService.get_by_id(dataflow_id)
        assert e, "User pipeline not found."
        dsl = cvs.dsl
    else:
        e, pipeline_log = PipelineOperationLogService.get_by_id(dataflow_id)
        assert e, "Pipeline log not found."
        dsl = pipeline_log.dsl
        dataflow_id = pipeline_log.pipeline_id
    pipeline = Pipeline(dsl, tenant_id=task["tenant_id"], doc_id=doc_id, task_id=task_id, flow_id=dataflow_id)
    chunks = await pipeline.run(file=task["file"]) if task.get("file") else await pipeline.run()
    if doc_id == CANVAS_DEBUG_DOC_ID:
        get_recording_context().record("dataflow_debug_result", "canvas_debug_mode")
        get_recording_context().record("dataflow_chunks", chunks)
        return

    if not chunks:
        get_recording_context().record("pipeline_output_count", 0)
        get_recording_context().record("pipeline_output_type", "empty")
        ret = PipelineOperationLogService.create(document_id=doc_id, pipeline_id=dataflow_id, task_type=PipelineTaskType.PARSE, dsl=str(pipeline))
        get_recording_context().save_func_return_value("PipelineOperationLogService.create", ret)
        return

    embedding_token_consumption = chunks.get("embedding_token_consumption", 0)
    # The output key may exist with an empty payload; check presence, not truthiness.
    if "chunks" in chunks:
        chunks = copy.deepcopy(chunks["chunks"])
        output_type = "chunks"
    elif "json" in chunks:
        chunks = copy.deepcopy(chunks["json"])
        output_type = "json"
    elif "markdown" in chunks:
        chunks = [{"text": [chunks["markdown"]]}] if chunks["markdown"] else []
        output_type = "markdown"
    elif "text" in chunks:
        chunks = [{"text": [chunks["text"]]}] if chunks["text"] else []
        output_type = "text"
    elif "html" in chunks:
        chunks = [{"text": [chunks["html"]]}] if chunks["html"] else []
        output_type = "html"
    else:
        chunks = []
        output_type = "empty"

    get_recording_context().record("pipeline_output_type", output_type)
    get_recording_context().record("pipeline_output_count", len(chunks))

    # An empty normalized payload means "nothing parsed", so stop before embedding/indexing.
    if not chunks:
        ret = PipelineOperationLogService.create(document_id=doc_id, pipeline_id=dataflow_id, task_type=PipelineTaskType.PARSE, dsl=str(pipeline))
        get_recording_context().save_func_return_value("PipelineOperationLogService.create", ret)
        return

    keys = [k for o in chunks for k in list(o.keys())]
    if not any([re.match(r"q_[0-9]+_vec", k) for k in keys]):
        try:
            set_progress(task_id, prog=0.82, msg="\n-------------------------------------\nStart to embedding...")
            e, kb = KnowledgebaseService.get_by_id(task["kb_id"])
            embedding_id = kb.embd_id
            embd_model_config = get_model_config_from_provider_instance(task["tenant_id"], LLMType.EMBEDDING, embedding_id)
            embedding_model = LLMBundle(task["tenant_id"], embd_model_config)

            @timeout(60)
            def batch_encode(txts):
                nonlocal embedding_model
                return embedding_model.encode([truncate(c, embedding_model.max_length - 10) for c in txts])

            vects_batches = []
            texts = [o.get("questions", o.get("summary", o["text"])) for o in chunks]
            delta = 0.20 / (len(texts) // settings.EMBEDDING_BATCH_SIZE + 1)
            prog = 0.8
            for i in range(0, len(texts), settings.EMBEDDING_BATCH_SIZE):
                async with embed_limiter:
                    vts, c = await thread_pool_exec(batch_encode, texts[i : i + settings.EMBEDDING_BATCH_SIZE])
                vects_batches.append(vts)
                embedding_token_consumption += c
                prog += delta
                if i % (len(texts) // settings.EMBEDDING_BATCH_SIZE / 100 + 1) == 1:
                    set_progress(task_id, prog=prog, msg=f"{i + 1} / {len(texts) // settings.EMBEDDING_BATCH_SIZE}")
            vects = np.vstack(vects_batches) if vects_batches else np.array([])
            get_recording_context().record("embedding_token_consumption", embedding_token_consumption)
            get_recording_context().record("vector_size", len(vects[0]) if len(vects) > 0 else 0)

            assert len(vects) == len(chunks)
            for i, ck in enumerate(chunks):
                v = vects[i].tolist()
                ck["q_%d_vec" % len(v)] = v
        except TaskCanceledException:
            raise
        except Exception as e:
            set_progress(task_id, prog=-1, msg=f"[ERROR]: {e}")
            ret = PipelineOperationLogService.create(document_id=doc_id, pipeline_id=dataflow_id, task_type=PipelineTaskType.PARSE, dsl=str(pipeline))
            get_recording_context().save_func_return_value("PipelineOperationLogService.create", ret)
            return

    metadata = {}
    for ck in chunks:
        ck["doc_id"] = doc_id
        ck["kb_id"] = [str(task["kb_id"])]
        ck["docnm_kwd"] = task["name"]
        ck["create_time"] = str(datetime.now()).replace("T", " ")[:19]
        ck["create_timestamp_flt"] = datetime.now().timestamp()
        if not ck.get("id"):
            ck["id"] = xxhash.xxh64((ck["text"] + str(ck["doc_id"])).encode("utf-8")).hexdigest()
        if "questions" in ck:
            if "question_tks" not in ck:
                ck["question_kwd"] = ck["questions"].split("\n")
                ck["question_tks"] = rag_tokenizer.tokenize(str(ck["questions"]))
            del ck["questions"]
        if "keywords" in ck:
            if "important_tks" not in ck:
                ck["important_kwd"] = [k for k in re.split(r"[,，;；、\r\n]+", ck["keywords"]) if k.strip()]
                ck["important_tks"] = rag_tokenizer.tokenize(str(ck["keywords"]))
            del ck["keywords"]
        if "summary" in ck:
            if "content_ltks" not in ck:
                ck["content_ltks"] = rag_tokenizer.tokenize(str(ck["summary"]))
                ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
            del ck["summary"]
        if "metadata" in ck:
            metadata = update_metadata_to(metadata, ck["metadata"])
            del ck["metadata"]
        if "content_with_weight" not in ck:
            ck["content_with_weight"] = ck["text"]
        del ck["text"]
        if "positions" in ck:
            add_positions(ck, ck["positions"])
            del ck["positions"]

    if metadata:
        existing_meta = DocMetadataService.get_document_metadata(doc_id)
        existing_meta = existing_meta if isinstance(existing_meta, dict) else {}
        metadata = update_metadata_to(metadata, existing_meta)
        get_recording_context().record("run_dataflow_metadata", metadata)
        ret = DocMetadataService.update_document_metadata(doc_id, metadata)
        get_recording_context().save_func_return_value("DocMetadataService.update_document_metadata", ret)

    start_ts = timer()
    set_progress(task_id, prog=0.82, msg="[DOC Engine]:\nStart to index...")
    e = await insert_chunks(task_id, task["tenant_id"], task["kb_id"], chunks, partial(set_progress, task_id, 0, 100000000))
    if not e:
        ret = PipelineOperationLogService.create(document_id=doc_id, pipeline_id=dataflow_id, task_type=PipelineTaskType.PARSE, dsl=str(pipeline))
        get_recording_context().save_func_return_value("PipelineOperationLogService.create", ret)
        return

    time_cost = timer() - start_ts
    task_time_cost = timer() - task_start_ts
    set_progress(task_id, prog=1.0, msg="Indexing done ({:.2f}s). Task done ({:.2f}s)".format(time_cost, task_time_cost))
    ret = DocumentService.increment_chunk_num(doc_id, task_dataset_id, embedding_token_consumption, len(chunks), task_time_cost)
    get_recording_context().save_func_return_value("DocumentService.increment_chunk_num", ret)
    logging.info("[Done], chunks({}), token({}), elapsed:{:.2f}".format(len(chunks), embedding_token_consumption, task_time_cost))
    get_recording_context().record("dataflow_chunks", chunks)
    ret = PipelineOperationLogService.create(document_id=doc_id, pipeline_id=dataflow_id, task_type=PipelineTaskType.PARSE, dsl=str(pipeline))
    get_recording_context().save_func_return_value("PipelineOperationLogService.create", ret)


RAPTOR_METHOD_SEARCH_LIMIT = 10000


async def get_raptor_chunk_field_map(doc_id: str, tenant_id: str, kb_id: str) -> dict:
    """Return stored RAPTOR marker fields for a document."""
    from common.doc_store.doc_store_base import OrderByExpr
    from rag.nlp import search as nlp_search

    async def search_fields(fields: list[str], condition: dict, order_by=None):
        """Search chunk fields in the current knowledge base."""
        res = await thread_pool_exec(settings.docStoreConn.search, fields, [], condition, [], order_by or OrderByExpr(), 0, RAPTOR_METHOD_SEARCH_LIMIT, nlp_search.index_name(tenant_id), [kb_id])
        return settings.docStoreConn.get_fields(res, fields)

    primary = await search_fields(["raptor_kwd", "extra"], {"doc_id": doc_id, "raptor_kwd": ["raptor"]})
    if collect_raptor_chunk_ids(primary):
        return primary

    try:
        return await search_fields(
            ["raptor_kwd", "extra"],
            {"doc_id": doc_id},
            OrderByExpr().desc("create_timestamp_flt"),
        )
    except Exception:
        logging.debug("RAPTOR fallback method lookup with extra field failed for doc %s", doc_id, exc_info=True)
        return primary


async def get_raptor_chunk_methods(doc_id: str, tenant_id: str, kb_id: str) -> set[str]:
    """Return the RAPTOR tree builders already stored for doc_id.

    Queries directly for raptor_kwd="raptor" rows so a non-RAPTOR leading
    chunk cannot produce a false-negative result. Legacy summary chunks that
    do not have method metadata are treated as the original RAPTOR builder.
    """
    try:
        field_map = await get_raptor_chunk_field_map(doc_id, tenant_id, kb_id)
        methods = collect_raptor_methods(field_map)
        if methods:
            logging.info(
                "Checkpoint hit: RAPTOR chunks for doc %s (tenant=%s kb=%s methods=%s) already exist",
                doc_id,
                tenant_id,
                kb_id,
                sorted(methods),
            )
        else:
            logging.info(
                "Checkpoint miss: no RAPTOR chunks for doc %s (tenant=%s kb=%s)",
                doc_id,
                tenant_id,
                kb_id,
            )
        return methods
    except Exception:
        logging.exception("Failed to check RAPTOR chunks for doc %s", doc_id)
        raise


async def has_raptor_chunks(doc_id: str, tenant_id: str, kb_id: str, tree_builder: str = RAPTOR_TREE_BUILDER) -> bool:
    """Return whether doc_id already has summaries for tree_builder."""
    methods = await get_raptor_chunk_methods(doc_id, tenant_id, kb_id)
    return tree_builder in methods


async def delete_raptor_chunks(doc_id: str, tenant_id: str, kb_id: str, keep_method: str | None = None):
    """Delete RAPTOR summaries for doc_id, optionally preserving one method."""
    if keep_method is None:
        logging.info(
            "delete_raptor_chunks: removing all RAPTOR summaries (doc=%s tenant=%s kb=%s)",
            doc_id,
            tenant_id,
            kb_id,
        )
        ret = await thread_pool_exec(
            settings.docStoreConn.delete,
            {"doc_id": doc_id, "raptor_kwd": ["raptor"]},
            nlp_search.index_name(tenant_id),
            kb_id,
        )
        get_recording_context().save_func_return_value("docStoreConn.delete", ret)
        return 0

    field_map = await get_raptor_chunk_field_map(doc_id, tenant_id, kb_id)
    chunk_ids = collect_raptor_chunk_ids(field_map, exclude_methods={keep_method})
    if not chunk_ids:
        logging.debug(
            "delete_raptor_chunks: no stale RAPTOR chunks to remove (doc=%s tenant=%s kb=%s keep=%s)",
            doc_id,
            tenant_id,
            kb_id,
            keep_method,
        )
        return 0

    logging.info(
        "delete_raptor_chunks: removing %d stale RAPTOR chunks (doc=%s tenant=%s kb=%s keep=%s)",
        len(chunk_ids),
        doc_id,
        tenant_id,
        kb_id,
        keep_method,
    )
    ret = await thread_pool_exec(
        settings.docStoreConn.delete,
        {"id": list(chunk_ids)},
        nlp_search.index_name(tenant_id),
        kb_id,
    )
    get_recording_context().save_func_return_value("docStoreConn.delete", ret)
    return len(chunk_ids)


@timeout(3600)
async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_size, callback=None, doc_ids=[]):
    """Generate RAPTOR summaries for selected documents in a knowledge base."""
    fake_doc_id = GRAPH_RAPTOR_FAKE_DOC_ID

    raptor_config = kb_parser_config.get("raptor", {})
    raptor_ext_config = raptor_config.get("ext") or {}
    tree_builder = get_raptor_tree_builder(raptor_config)
    clustering_method = get_raptor_clustering_method(raptor_config)
    vctr_nm = "q_%d_vec" % vector_size

    res = []
    tk_count = 0
    cleanup_raptor_chunks = []
    max_errors = int(os.environ.get("RAPTOR_MAX_ERRORS", 3))
    doc_info_by_id = {}
    for doc_id in set(doc_ids):
        ok, source_doc = DocumentService.get_by_id(doc_id)
        if not ok or not source_doc:
            continue
        doc_info_by_id[doc_id] = {
            "name": getattr(source_doc, "name", ""),
            "type": getattr(source_doc, "type", ""),
            "parser_id": getattr(source_doc, "parser_id", ""),
            "parser_config": getattr(source_doc, "parser_config", {}) or {},
        }

    def schedule_raptor_cleanup(doc_id: str, keep_method: str | None = None):
        """Queue stale RAPTOR summaries for deletion after successful insert."""
        cleanup_plan = (doc_id, keep_method)
        if cleanup_plan not in cleanup_raptor_chunks:
            cleanup_raptor_chunks.append(cleanup_plan)

    def skip_raptor_doc(doc_id: str) -> bool:
        """Return whether RAPTOR should be skipped for this source document."""
        doc_info = doc_info_by_id.get(doc_id, {})
        file_type = doc_info.get("type") or row.get("type", "")
        parser_id = doc_info.get("parser_id") or row.get("parser_id", "")
        parser_config = doc_info.get("parser_config") or row.get("parser_config", {})
        if should_skip_raptor(file_type, parser_id, parser_config, raptor_config):
            skip_reason = get_skip_reason(file_type, parser_id, parser_config)
            doc_name = doc_info.get("name") or doc_id
            logging.info("Skipping Raptor for document %s: %s", doc_name, skip_reason)
            callback(msg=f"[RAPTOR] doc:{doc_id} skipped: {skip_reason}")
            return True
        return False

    async def generate(chunks, did):
        """Run RAPTOR and append generated summary chunks for one doc id."""
        nonlocal tk_count, res
        logging.info("RAPTOR: using tree_builder=%s clustering_method=%s for doc %s", tree_builder, clustering_method, did)
        from rag.advanced_rag.knowlege_compile.raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor  # Lazy load, save around 8s

        raptor = Raptor(
            raptor_config.get("max_cluster", 64),
            chat_mdl,
            embd_mdl,
            raptor_config["prompt"],
            raptor_config["max_token"],
            raptor_config["threshold"],
            max_errors=max_errors,
            tree_builder=tree_builder,
            clustering_method=clustering_method,
            psi_exact_max_leaves=raptor_ext_config.get("psi_exact_max_leaves", 4096),
            psi_bucket_size=raptor_ext_config.get("psi_bucket_size", 1024),
        )
        original_length = len(chunks)
        chunks, layers = await raptor(chunks, kb_parser_config["raptor"]["random_seed"], callback, row["id"])
        effective_doc_name = row["name"] if did == fake_doc_id else doc_info_by_id.get(did, {}).get("name") or row["name"]
        doc = {
            "doc_id": did,
            "kb_id": [str(row["kb_id"])],
            "docnm_kwd": effective_doc_name,
            "title_tks": rag_tokenizer.tokenize(effective_doc_name),
            "raptor_kwd": "raptor",
            "extra": {"raptor_method": tree_builder},
        }
        if row["pagerank"]:
            doc[PAGERANK_FLD] = int(row["pagerank"])

        # Build index→layer mapping from RAPTOR layer boundaries.
        # layers is [(start, end), ...] where layer 0 is the original chunks
        # and layer 1+ are summary layers. We skip layer 0 (original chunks).
        chunk_layer = {}
        for layer_idx, (layer_start, layer_end) in enumerate(layers):
            if layer_idx == 0:
                continue  # layer 0 = original input chunks, not summaries
            for ci in range(layer_start, layer_end):
                chunk_layer[ci] = layer_idx

        for idx, (content, vctr) in enumerate(chunks[original_length:], start=original_length):
            d = copy.deepcopy(doc)
            d["id"] = make_raptor_summary_chunk_id(content, did)
            d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
            d["create_timestamp_flt"] = datetime.now().timestamp()
            d[vctr_nm] = vctr.tolist()
            d["content_with_weight"] = content
            d["content_ltks"] = rag_tokenizer.tokenize(content)
            d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
            d["raptor_layer_int"] = chunk_layer.get(idx, 1)
            res.append(d)
            tk_count += num_tokens_from_string(content)

    if raptor_config.get("scope", "file") == "file":
        dataset_methods = await get_raptor_chunk_methods(fake_doc_id, row["tenant_id"], row["kb_id"])
        remove_dataset_summaries = bool(dataset_methods)
        has_file_level_target = False
        if dataset_methods:
            callback(msg="[RAPTOR] will remove dataset-level summaries after file-level summaries are available.")

        for x, doc_id in enumerate(doc_ids):
            if skip_raptor_doc(doc_id):
                callback(prog=(x + 1.0) / len(doc_ids))
                continue
            # CHECKPOINT: skip docs that already have RAPTOR chunks in the doc store
            existing_methods = await get_raptor_chunk_methods(doc_id, row["tenant_id"], row["kb_id"])
            if tree_builder in existing_methods:
                has_file_level_target = True
                if existing_methods != {tree_builder}:
                    schedule_raptor_cleanup(doc_id, tree_builder)
                    callback(msg=f"[RAPTOR] doc:{doc_id} will remove old RAPTOR summaries after insert.")
                callback(msg=f"[RAPTOR] doc:{doc_id} already has {tree_builder} RAPTOR chunks, skipping.")
                callback(prog=(x + 1.0) / len(doc_ids))
                continue
            if existing_methods:
                callback(msg=f"[RAPTOR] doc:{doc_id} will migrate RAPTOR summaries to {tree_builder} after insert.")

            chunks = []
            skipped_chunks = 0
            for d in settings.retriever.chunk_list(doc_id, row["tenant_id"], [str(row["kb_id"])], fields=["content_with_weight", vctr_nm], sort_by_position=True):
                # Skip chunks that don't have the required vector field (may have been indexed with different embedding model)
                if vctr_nm not in d or d[vctr_nm] is None:
                    skipped_chunks += 1
                    logging.warning(f"RAPTOR: Chunk missing vector field '{vctr_nm}' in doc {doc_id}, skipping")
                    continue
                chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))

            if skipped_chunks > 0:
                callback(msg=f"[WARN] Skipped {skipped_chunks} chunks without vector field '{vctr_nm}' for doc {doc_id}. Consider re-parsing the document with the current embedding model.")

            if not chunks:
                logging.warning(f"RAPTOR: No valid chunks with vectors found for doc {doc_id}")
                callback(msg=f"[WARN] No valid chunks with vectors found for doc {doc_id}, skipping")
                continue

            before_generate = len(res)
            await generate(chunks, doc_id)
            if len(res) > before_generate:
                has_file_level_target = True
                if existing_methods:
                    schedule_raptor_cleanup(doc_id, tree_builder)
            callback(prog=(x + 1.0) / len(doc_ids))

        if remove_dataset_summaries:
            if has_file_level_target:
                schedule_raptor_cleanup(fake_doc_id)
            else:
                callback(msg="[RAPTOR] kept dataset-level summaries because no file-level summaries were built.")
    else:
        migrated_file_docs = 0
        file_cleanup_doc_ids = []
        skipped_doc_ids = set()
        for doc_id in set(doc_ids):
            if skip_raptor_doc(doc_id):
                skipped_doc_ids.add(doc_id)
                continue
            existing_methods = await get_raptor_chunk_methods(doc_id, row["tenant_id"], row["kb_id"])
            if existing_methods:
                file_cleanup_doc_ids.append(doc_id)
                migrated_file_docs += 1
        if migrated_file_docs:
            callback(msg=f"[RAPTOR] will remove file-level summaries for {migrated_file_docs} docs after dataset-level build succeeds.")

        existing_methods = await get_raptor_chunk_methods(fake_doc_id, row["tenant_id"], row["kb_id"])
        if tree_builder in existing_methods:
            if existing_methods != {tree_builder}:
                schedule_raptor_cleanup(fake_doc_id, tree_builder)
                callback(msg="[RAPTOR] will remove old dataset-level RAPTOR summaries after insert.")
            for doc_id in file_cleanup_doc_ids:
                schedule_raptor_cleanup(doc_id)
            callback(msg=f"[RAPTOR] dataset-level {tree_builder} summaries already exist, skipping.")
            return res, tk_count, cleanup_raptor_chunks
        migrate_dataset_summaries = bool(existing_methods)
        if migrate_dataset_summaries:
            callback(msg=f"[RAPTOR] will migrate dataset-level RAPTOR summaries to {tree_builder} after insert.")

        chunks = []
        skipped_chunks = 0
        for doc_id in doc_ids:
            if doc_id in skipped_doc_ids:
                continue
            for d in settings.retriever.chunk_list(doc_id, row["tenant_id"], [str(row["kb_id"])], fields=["content_with_weight", vctr_nm], sort_by_position=True):
                # Skip chunks that don't have the required vector field
                if vctr_nm not in d or d[vctr_nm] is None:
                    skipped_chunks += 1
                    logging.warning(f"RAPTOR: Chunk missing vector field '{vctr_nm}' in doc {doc_id}, skipping")
                    continue
                chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))

        if skipped_chunks > 0:
            callback(msg=f"[WARN] Skipped {skipped_chunks} chunks without vector field '{vctr_nm}'. Consider re-parsing documents with the current embedding model.")

        if not chunks:
            if skipped_doc_ids and len(skipped_doc_ids) == len(set(doc_ids)):
                callback(msg="[RAPTOR] all documents were skipped by RAPTOR auto-disable rules.")
                return res, tk_count, cleanup_raptor_chunks
            logging.error(f"RAPTOR: No valid chunks with vectors found in any document for kb {row['kb_id']}")
            callback(msg=f"[ERROR] No valid chunks with vectors found. Please ensure documents are parsed with the current embedding model (vector size: {vector_size}).")
            return res, tk_count, cleanup_raptor_chunks

        before_generate = len(res)
        await generate(chunks, fake_doc_id)
        if len(res) > before_generate:
            for doc_id in file_cleanup_doc_ids:
                schedule_raptor_cleanup(doc_id)
            if migrate_dataset_summaries:
                schedule_raptor_cleanup(fake_doc_id, tree_builder)

    return res, tk_count, cleanup_raptor_chunks


async def delete_image(kb_id, chunk_id):
    try:
        async with minio_limiter:
            settings.STORAGE_IMPL.delete(kb_id, chunk_id)
    except Exception:
        logging.exception(f"Deleting image of chunk {chunk_id} got exception")
        raise


@timed_with_recording
async def insert_chunks(task_id, task_tenant_id, task_dataset_id, chunks, progress_callback):
    """
    Insert chunks into document store (Elasticsearch OR Infinity).

    Args:
        task_id: Task identifier
        task_tenant_id: Tenant ID
        task_dataset_id: Dataset/knowledge base ID
        chunks: List of chunk dictionaries to insert
        progress_callback: Callback function for progress updates
    """
    mothers = []
    mother_ids = set([])
    for ck in chunks:
        mom = ck.get("mom") or ck.get("mom_with_weight") or ""
        if not mom:
            continue
        id = xxhash.xxh64(mom.encode("utf-8")).hexdigest()
        ck["mom_id"] = id
        if id in mother_ids:
            continue
        mother_ids.add(id)
        mom_ck = copy.deepcopy(ck)
        mom_ck["id"] = id
        mom_ck["content_with_weight"] = mom
        mom_ck["available_int"] = 0
        flds = list(mom_ck.keys())
        for fld in flds:
            if fld not in ["id", "content_with_weight", "doc_id", "docnm_kwd", "kb_id", "available_int", "position_int", "create_timestamp_flt", "page_num_int", "top_int"]:
                del mom_ck[fld]
        mothers.append(mom_ck)

    for b in range(0, len(mothers), settings.DOC_BULK_SIZE):
        ret = await thread_pool_exec(
            settings.docStoreConn.insert,
            mothers[b : b + settings.DOC_BULK_SIZE],
            search.index_name(task_tenant_id),
            task_dataset_id,
        )
        get_recording_context().save_func_return_value("docStoreConn.insert", ret)
        task_canceled = has_canceled(task_id)
        if task_canceled:
            progress_callback(-1, msg="Task has been canceled.")
            return False

    for b in range(0, len(chunks), settings.DOC_BULK_SIZE):
        doc_store_result = await thread_pool_exec(
            settings.docStoreConn.insert,
            chunks[b : b + settings.DOC_BULK_SIZE],
            search.index_name(task_tenant_id),
            task_dataset_id,
        )
        get_recording_context().save_func_return_value("docStoreConn.insert", doc_store_result)
        task_canceled = has_canceled(task_id)
        if task_canceled:
            # Roll back partial RAPTOR summary inserts so the next run is not
            # mistaken for a completed checkpoint by get_raptor_chunk_methods.
            raptor_ids_to_rollback = [c["id"] for c in chunks[: b + settings.DOC_BULK_SIZE] if c.get("raptor_kwd") == "raptor"]
            if raptor_ids_to_rollback:
                try:
                    ret = await thread_pool_exec(
                        settings.docStoreConn.delete,
                        {"id": raptor_ids_to_rollback},
                        search.index_name(task_tenant_id),
                        task_dataset_id,
                    )
                    get_recording_context().save_func_return_value("docStoreConn.delete", ret)
                    logging.info(
                        "insert_chunks: rolled back %d partial RAPTOR chunks after cancellation (task=%s)",
                        len(raptor_ids_to_rollback),
                        task_id,
                    )
                except Exception:
                    logging.exception(
                        "insert_chunks: failed to roll back partial RAPTOR chunks after cancellation (task=%s)",
                        task_id,
                    )
            progress_callback(-1, msg="Task has been canceled.")
            return False
        if b % 128 == 0:
            progress_callback(prog=0.8 + 0.1 * (b + 1) / len(chunks), msg="")
        if doc_store_result:
            error_message = f"Insert chunk error: {doc_store_result}, please check log file and Elasticsearch/Infinity status!"
            progress_callback(-1, msg=error_message)
            raise Exception(error_message)
        chunk_ids = [chunk["id"] for chunk in chunks[: b + settings.DOC_BULK_SIZE]]
        chunk_ids_str = " ".join(chunk_ids)
        try:
            TaskService.update_chunk_ids(task_id, chunk_ids_str)
            get_recording_context().save_func_return_value("TaskService.update_chunk_ids", None)
        except DoesNotExist:
            logging.warning(f"do_handle_task update_chunk_ids failed since task {task_id} is unknown.")
            doc_store_result = await thread_pool_exec(
                settings.docStoreConn.delete,
                {"id": chunk_ids},
                search.index_name(task_tenant_id),
                task_dataset_id,
            )
            get_recording_context().save_func_return_value("docStoreConn.delete", doc_store_result)
            tasks = []
            for chunk_id in chunk_ids:
                tasks.append(asyncio.create_task(delete_image(task_dataset_id, chunk_id)))
            try:
                await asyncio.gather(*tasks, return_exceptions=False)
            except Exception as e:
                logging.error(f"delete_image failed: {e}")
                for t in tasks:
                    t.cancel()
                await asyncio.gather(*tasks, return_exceptions=True)
                raise
            progress_callback(-1, msg=f"Chunk updates failed since task {task_id} is unknown.")
            return False
    return True


@timeout(60 * 60 * 3, 1)
async def do_handle_task(task):
    task_type = task.get("task_type", "")

    if task_type == "memory":
        result = await handle_save_to_memory_task(task)
        get_recording_context().save_func_return_value("handle_save_to_memory_task", result)
        return

    if task_type == "dataflow" and task.get("doc_id", "") == CANVAS_DEBUG_DOC_ID:
        await run_dataflow(task)
        return

    task_id = task["id"]
    task_from_page = task["from_page"]
    task_to_page = task["to_page"]
    task_tenant_id = task["tenant_id"]
    task_embedding_id = task["embd_id"]
    task_language = task.get("language") or "Chinese"
    if not task.get("language"):
        logging.warning("Task %s has no language set, falling back to Chinese", task_id)
    doc_task_llm_id = task["parser_config"].get("llm_id") or task["llm_id"]
    kb_task_llm_id = task["kb_parser_config"].get("llm_id") or task["llm_id"]
    task["llm_id"] = kb_task_llm_id
    task_dataset_id = task["kb_id"]
    task_doc_id = task["doc_id"]
    task_document_name = task["name"]
    task_parser_config = task["parser_config"]
    task_start_ts = timer()
    toc_thread = None
    raptor_cleanup_chunks = []

    # prepare the progress callback function
    progress_callback = partial(set_progress, task_id, task_from_page, task_to_page)

    task_canceled = has_canceled(task_id)
    if task_canceled:
        progress_callback(-1, msg="Task has been canceled.")
        return

    try:
        # bind embedding model
        if task_embedding_id:
            embd_model_config = get_model_config_from_provider_instance(task_tenant_id, LLMType.EMBEDDING, task_embedding_id)
        else:
            embd_model_config = get_tenant_default_model_by_type(task_tenant_id, LLMType.EMBEDDING)
        embedding_model = LLMBundle(task_tenant_id, embd_model_config, lang=task_language)
        vts, _ = embedding_model.encode(["ok"])
        vector_size = len(vts[0])
    except Exception as e:
        error_message = f"Fail to bind embedding model: {str(e)}"
        progress_callback(-1, msg=error_message)
        logging.exception(error_message)
        raise

    init_kb(task, vector_size)

    if task_type[: len("dataflow")] == "dataflow":
        await run_dataflow(task)
        return

    if task_type == "raptor":
        ok, kb = KnowledgebaseService.get_by_id(task_dataset_id)
        if not ok:
            progress_callback(prog=-1.0, msg="Cannot found valid dataset for RAPTOR task")
            return

        kb_parser_config = kb.parser_config
        if not kb_parser_config.get("raptor", {}).get("use_raptor", False):
            kb_parser_config.update(
                {
                    "raptor": {
                        "use_raptor": True,
                        "prompt": "Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n      {cluster_content}\nThe above is the content you need to summarize.",
                        "max_token": 256,
                        "threshold": 0.1,
                        "max_cluster": 64,
                        "random_seed": 0,
                        "scope": "file",
                        "clustering_method": "gmm",
                        "tree_builder": "raptor",
                    },
                }
            )
            update_result = KnowledgebaseService.update_by_id(kb.id, {"parser_config": kb_parser_config})
            get_recording_context().save_func_return_value("KnowledgebaseService.update_by_id", update_result)
            if not update_result:
                progress_callback(prog=-1.0, msg="Internal error: Invalid RAPTOR configuration")
                return

        # bind LLM for raptor
        chat_model_config = get_model_config_from_provider_instance(task_tenant_id, LLMType.CHAT, kb_task_llm_id)
        chat_model = LLMBundle(task_tenant_id, chat_model_config, lang=task_language)
        # run RAPTOR
        async with kg_limiter:
            chunks, token_count, raptor_cleanup_chunks = await run_raptor_for_kb(
                row=task,
                kb_parser_config=kb_parser_config,
                chat_mdl=chat_model,
                embd_mdl=embedding_model,
                vector_size=vector_size,
                callback=progress_callback,
                doc_ids=task.get("doc_ids", []),
            )
        get_recording_context().record("raptor_chunks", chunks)
        get_recording_context().record("raptor_token_count", token_count)
        if fake_doc_ids := task.get("doc_ids", []):
            task_doc_id = fake_doc_ids[0]  # use the first document ID to represent this task for logging purposes
    # Either using graphrag or Standard chunking methods
    elif task_type == "graphrag":
        ok, kb = KnowledgebaseService.get_by_id(task_dataset_id)
        if not ok:
            progress_callback(prog=-1.0, msg="Cannot found valid dataset for GraphRAG task")
            return

        kb_parser_config = kb.parser_config
        if not kb_parser_config.get("graphrag", {}).get("use_graphrag", False):
            kb_parser_config.update(
                {
                    "graphrag": {
                        "use_graphrag": True,
                        "entity_types": [
                            "organization",
                            "person",
                            "geo",
                            "event",
                            "category",
                        ],
                        "method": "light",
                        "batch_chunk_token_size": 4096,
                        "retry_attempts": 2,
                        "retry_backoff_seconds": 2.0,
                        "retry_backoff_max_seconds": 60.0,
                        "build_subgraph_timeout_per_chunk_seconds": 300,
                        "build_subgraph_min_timeout_seconds": 600,
                        "merge_timeout_seconds": 180,
                        "resolution_timeout_seconds": 1800,
                        "community_timeout_seconds": 1800,
                        "lock_acquire_timeout_seconds": 600,
                    }
                }
            )
            update_result = KnowledgebaseService.update_by_id(kb.id, {"parser_config": kb_parser_config})
            get_recording_context().save_func_return_value("KnowledgebaseService.update_by_id", update_result)
            if not update_result:
                progress_callback(prog=-1.0, msg="Internal error: Invalid GraphRAG configuration")
                return

        graphrag_conf = kb_parser_config.get("graphrag", {})
        start_ts = timer()
        chat_model_config = get_model_config_from_provider_instance(task_tenant_id, LLMType.CHAT, kb_task_llm_id)
        chat_model = LLMBundle(task_tenant_id, chat_model_config, lang=task_language)
        with_resolution = graphrag_conf.get("resolution", False)
        with_community = graphrag_conf.get("community", False)
        async with kg_limiter:
            # await run_graphrag(task, task_language, with_resolution, with_community, chat_model, embedding_model, progress_callback)
            from rag.graphrag.general.index import run_graphrag_for_kb  # Lazy load, save around 2s

            result = await run_graphrag_for_kb(
                row=task,
                doc_ids=task.get("doc_ids", []),
                language=task_language,
                kb_parser_config=kb_parser_config,
                chat_model=chat_model,
                embedding_model=embedding_model,
                callback=progress_callback,
                with_resolution=with_resolution,
                with_community=with_community,
            )
            logging.info(f"GraphRAG task result for task {task}:\n{result}")
        get_recording_context().record("graphrag_result", result)
        progress_callback(prog=1.0, msg="Knowledge Graph done ({:.2f}s)".format(timer() - start_ts))
        return
    elif task_type == "mindmap":
        progress_callback(1, "place holder")
        pass
        return
    elif task_type == "skill":
        progress_callback(-1, "Skill generation requires the refactored task executor (TE_RUN_MODE=0).")
        return
    else:
        # Standard chunking methods
        task["llm_id"] = doc_task_llm_id
        start_ts = timer()
        chunks = await build_chunks(task, progress_callback)
        get_recording_context().record("chunks", chunks)
        # Record chunk_ids_count for comparison
        chunk_ids = [c.get("id") for c in chunks if isinstance(c, dict) and "id" in c]
        get_recording_context().record("chunk_ids_count", len(chunk_ids))
        # Record chunks array for content comparison (first, middle, last, random)
        logging.info("Build document {}: {:.2f}s".format(task_document_name, timer() - start_ts))
        if not chunks:
            progress_callback(1.0, msg=f"No chunk built from {task_document_name}")
            return
        progress_callback(msg="Generate {} chunks".format(len(chunks)))
        start_ts = timer()
        try:
            token_count, vector_size = await embedding(chunks, embedding_model, task_parser_config, progress_callback)
        except TaskCanceledException:
            raise
        except Exception as e:
            error_message = "Generate embedding error:{}".format(str(e))
            progress_callback(-1, error_message)
            logging.exception(error_message)
            token_count = 0
            raise
        get_recording_context().record("token_count", token_count)
        get_recording_context().record("vector_size", vector_size)
        progress_message = "Embedding chunks ({:.2f}s)".format(timer() - start_ts)
        logging.info(progress_message)
        progress_callback(msg=progress_message)

        if task["parser_id"].lower() == "naive" and task["parser_config"].get("toc_extraction", False):
            toc_thread = asyncio.create_task(asyncio.to_thread(build_TOC, task, chunks, progress_callback))

    chunk_count = len(set([chunk["id"] for chunk in chunks]))
    start_ts = timer()

    async def _maybe_insert_chunks(_chunks):
        if has_canceled(task_id):
            progress_callback(-1, msg="Task has been canceled.")
            return False
        insert_result = await insert_chunks(task_id, task_tenant_id, task_dataset_id, _chunks, progress_callback)
        return bool(insert_result)

    try:
        if not await _maybe_insert_chunks(chunks):
            get_recording_context().record("insertion_result", "failed")
            return
        get_recording_context().record("insertion_result", "success")
        if has_canceled(task_id):
            progress_callback(-1, msg="Task has been canceled.")
            return

        if raptor_cleanup_chunks:
            cleaned_chunks = 0
            for cleanup_doc_id, keep_method in raptor_cleanup_chunks:
                ret = await delete_raptor_chunks(
                    cleanup_doc_id,
                    task_tenant_id,
                    task_dataset_id,
                    keep_method=keep_method,
                )
                cleaned_chunks += ret
                get_recording_context().save_func_return_value("delete_raptor_chunks", ret)

            if cleaned_chunks:
                progress_callback(msg=f"Cleaned up {cleaned_chunks} stale RAPTOR chunks.")

        logging.info("Indexing doc({}), page({}-{}), chunks({}), elapsed: {:.2f}".format(task_document_name, task_from_page, task_to_page, len(chunks), timer() - start_ts))

        ret = DocumentService.increment_chunk_num(task_doc_id, task_dataset_id, token_count, chunk_count, 0)
        get_recording_context().save_func_return_value("DocumentService.increment_chunk_num", ret)

        # Table parser: push metadata/both column values to document-level metadata for UI / chat filters
        if task.get("parser_id", "").lower() == "table":
            eff_pc = merge_table_parser_config_from_kb(task)
            logging.debug(f"[TABLE_META_DEBUG] table post-index: table_column_mode={eff_pc.get('table_column_mode')!r}")
            try:
                agg = aggregate_table_doc_metadata(chunks, task)
                logging.debug(f"[TABLE_META_DEBUG] aggregated metadata: {agg}")
                strip_keys = table_parser_strip_doc_metadata_keys(eff_pc)
                existing = DocMetadataService.get_document_metadata(task_doc_id)
                existing = existing if isinstance(existing, dict) else {}
                preserved = {k: v for k, v in existing.items() if k not in strip_keys}
                merged = update_metadata_to(dict(preserved), agg)
                logging.debug(
                    f"[TABLE_META_DEBUG] calling update_document_metadata for doc_id={task_doc_id}, "
                    f"meta_fields keys={list(merged.keys())}, "
                    f"table_strip_key_count={len(strip_keys)}, agg_keys={list(agg.keys())}"
                )
                try:
                    ret = DocMetadataService.update_document_metadata(task_doc_id, merged)
                    get_recording_context().save_func_return_value("DocMetadataService.update_document_metadata", ret)
                    logging.debug("[TABLE_META_DEBUG] update_document_metadata succeeded")
                except Exception as ue:
                    logging.error(
                        "update_document_metadata failed (table parser, doc_id=%s): %s",
                        task_doc_id,
                        ue,
                        exc_info=True,
                    )
            except Exception as e:
                logging.exception(
                    "Table parser document metadata aggregation failed (doc_id=%s): %s",
                    task_doc_id,
                    e,
                )

        progress_callback(msg="Indexing done ({:.2f}s).".format(timer() - start_ts))

        if toc_thread:
            d = await toc_thread
            if d:
                get_recording_context().record("toc_chunk", [d])
                if not await _maybe_insert_chunks([d]):
                    get_recording_context().record("toc_inserted", False)
                    return
                get_recording_context().record("toc_inserted", True)
                ret = DocumentService.increment_chunk_num(task_doc_id, task_dataset_id, 0, 1, 0)
                get_recording_context().save_func_return_value("DocumentService.increment_chunk_num", ret)

        if has_canceled(task_id):
            progress_callback(-1, msg="Task has been canceled.")
            return

        task_time_cost = timer() - task_start_ts
        get_recording_context().record("task_status", "completed")
        progress_callback(prog=1.0, msg="Task done ({:.2f}s)".format(task_time_cost))
        logging.info("Chunk doc({}), page({}-{}), chunks({}), token({}), elapsed:{:.2f}".format(task_document_name, task_from_page, task_to_page, len(chunks), token_count, task_time_cost))

    finally:
        if toc_thread is not None and not toc_thread.done():
            toc_thread.cancel()
        if has_canceled(task_id):
            try:
                exists = await thread_pool_exec(
                    settings.docStoreConn.index_exist,
                    search.index_name(task_tenant_id),
                    task_dataset_id,
                )
                if exists:
                    ret = await thread_pool_exec(
                        settings.docStoreConn.delete,
                        {"doc_id": task_doc_id},
                        search.index_name(task_tenant_id),
                        task_dataset_id,
                    )
                    get_recording_context().save_func_return_value("docStoreConn.delete", ret)
            except Exception as e:
                logging.exception(f"Remove doc({task_doc_id}) from docStore failed when task({task_id}) canceled, exception: {e}")


async def handle_task():
    global DONE_TASKS, FAILED_TASKS
    redis_msg, task = await collect()
    if not task:
        await asyncio.sleep(5)
        return

    task_type = task["task_type"]
    pipeline_task_type = TASK_TYPE_TO_PIPELINE_TASK_TYPE.get(task_type, PipelineTaskType.PARSE) or PipelineTaskType.PARSE
    task_id = task["id"]
    try:
        CURRENT_TASKS[task["id"]] = copy.deepcopy(task)
        run_mode = os.environ.get("TE_RUN_MODE", "0")
        logging.info(f"TE_RUN_MODE is {run_mode}")

        # Check if dry-run comparison is enabled via environment variable
        if run_mode == "1":  # dry run mode - compare
            set_recording_context(RecordingContext())
            await do_handle_task(task)  # original execution
            # dry run mode
            logging.info(f"-----dry run task:{task_id}, {task.get('name', '')}, doc id:{task.get('doc_id', '')}")
            await TaskManager.dry_run_task(task, get_recording_context(), chat_limiter, minio_limiter, chunk_limiter, embed_limiter, kg_limiter, set_progress, has_canceled)
        elif run_mode == "0":  # use refactor-ed version
            # switch to refactor-ed version
            logging.info(f"-----run refactor-ed task executor:{task_id}, {task.get('name', '')}, doc id:{task.get('doc_id', '')}")
            set_recording_context(NullRecordingContext())
            await TaskManager.run_refactored_task(task, chat_limiter, minio_limiter, chunk_limiter, embed_limiter, kg_limiter, set_progress, has_canceled)
        else:  # original version
            logging.info(f"-----run original task executor:{task_id}, {task.get('name', '')}, doc id:{task.get('doc_id', '')}")
            set_recording_context(NullRecordingContext())
            await do_handle_task(task)

        DONE_TASKS += 1
        CURRENT_TASKS.pop(task_id, None)
        logging.info(f"handle_task done for task {json.dumps(task)}")
    except TaskCanceledException as e:
        DONE_TASKS += 1
        CURRENT_TASKS.pop(task_id, None)
        logging.info(f"handle_task canceled for task {task_id}: {getattr(e, 'msg', str(e))}")
    except Exception as e:
        FAILED_TASKS += 1
        CURRENT_TASKS.pop(task_id, None)
        try:
            err_msg = str(e)
            while isinstance(e, exceptiongroup.ExceptionGroup):
                e = e.exceptions[0]
                err_msg += " -- " + str(e)
            set_progress(task_id, prog=-1, msg=f"[Exception]: {err_msg}")
        except Exception as e:
            logging.exception(f"[Exception]: {str(e)}")
            pass
        logging.exception(f"handle_task got exception for task {json.dumps(task)}")
    finally:
        if not task.get("dataflow_id", ""):
            referred_document_id = None
            if task_type in ["graphrag", "raptor", "mindmap", "artifact", "skill"]:
                # KB-level fan-out tasks store the participating doc list in
                # task["doc_ids"]; the first entry is used as a referent so
                # the pipeline operation log has something to anchor to.
                referred_document_id = (task.get("doc_ids") or [None])[0]
            ret = PipelineOperationLogService.record_pipeline_operation(
                document_id=task["doc_id"], pipeline_id="", task_type=pipeline_task_type, task_id=task_id, referred_document_id=referred_document_id
            )
            get_recording_context().save_func_return_value("PipelineOperationLogService.record_pipeline_operation", ret)

    redis_msg.ack()


async def get_server_ip() -> str:
    # get ip by udp
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
            s.connect(("8.8.8.8", 80))
            return s.getsockname()[0]
    except Exception as e:
        logging.error(str(e))
        return "Unknown"


async def report_status():
    """
    Periodically reports the executor's heartbeat
    """
    global PENDING_TASKS, LAG_TASKS, DONE_TASKS, FAILED_TASKS

    ip_address = await get_server_ip()
    pid = os.getpid()

    # Register the executor in Redis
    REDIS_CONN.sadd("TASKEXE", CONSUMER_NAME)
    redis_lock = RedisDistributedLock("clean_task_executor", lock_value=CONSUMER_NAME, timeout=60)

    while True:
        now = datetime.now()
        now_ts = now.timestamp()

        group_info = REDIS_CONN.queue_info(settings.get_svr_queue_name(0), SVR_CONSUMER_GROUP_NAME) or {}
        PENDING_TASKS = int(group_info.get("pending", 0))
        LAG_TASKS = int(group_info.get("lag", 0))

        current = copy.deepcopy(CURRENT_TASKS)
        heartbeat = json.dumps(
            {
                "ip_address": ip_address,
                "pid": pid,
                "name": CONSUMER_NAME,
                "now": now.astimezone().isoformat(timespec="milliseconds"),
                "boot_at": BOOT_AT,
                "pending": PENDING_TASKS,
                "lag": LAG_TASKS,
                "done": DONE_TASKS,
                "failed": FAILED_TASKS,
                "current": current,
            }
        )

        # Report heartbeat to Redis
        try:
            REDIS_CONN.zadd(CONSUMER_NAME, heartbeat, now_ts)
        except Exception as e:
            logging.warning(f"Failed to report heartbeat: {e}")
        else:
            logging.debug(f"{CONSUMER_NAME} reported heartbeat: {heartbeat}")
            pass

        # Clean up own expired heartbeat
        try:
            REDIS_CONN.zremrangebyscore(CONSUMER_NAME, 0, now_ts - 60 * 30)
        except Exception as e:
            logging.warning(f"Failed to clean heartbeat: {e}")

        # Clean other executors
        lock_acquired = False
        try:
            lock_acquired = redis_lock.acquire()
        except Exception as e:
            logging.warning(f"Failed to acquire Redis lock: {e}")
        if lock_acquired:
            try:
                task_executors = REDIS_CONN.smembers("TASKEXE") or set()
                for worker_name in task_executors:
                    if worker_name == CONSUMER_NAME:
                        continue
                    try:
                        last_heartbeat = REDIS_CONN.REDIS.zrevrange(worker_name, 0, 0, withscores=True)
                    except Exception as e:
                        logging.warning(f"Failed to read zset for {worker_name}: {e}")
                        continue

                    if not last_heartbeat or now_ts - last_heartbeat[0][1] > WORKER_HEARTBEAT_TIMEOUT:
                        logging.info(f"{worker_name} expired, removed")
                        REDIS_CONN.srem("TASKEXE", worker_name)
                        REDIS_CONN.delete(worker_name)
            except Exception as e:
                logging.warning(f"Failed to clean other executors: {e}")
            finally:
                redis_lock.release()
        await asyncio.sleep(30)


async def task_manager():
    try:
        await handle_task()
    finally:
        task_limiter.release()


async def main():
    # Stagger executor startup to prevent connection storm to Infinity
    # Extract worker number from CONSUMER_NAME (e.g., "task_executor_abc123_5" -> 5)
    try:
        worker_num = int(CONSUMER_NAME.rsplit("_", 1)[-1])
        # Add random delay: base delay + worker_num * 2.0s + random jitter
        # This spreads out connection attempts over several seconds
        startup_delay = worker_num * 2.0 + random.uniform(0, 0.5)
        if startup_delay > 0:
            logging.info(f"Staggering startup by {startup_delay:.2f}s to prevent connection storm")
            await asyncio.sleep(startup_delay)
    except (ValueError, IndexError):
        pass  # Non-standard consumer name, skip delay

    logging.info(r"""
    ____                      __  _
   /  _/___  ____ ____  _____/ /_(_)___  ____     ________  ______   _____  _____
   / // __ \/ __ `/ _ \/ ___/ __/ / __ \/ __ \   / ___/ _ \/ ___/ | / / _ \/ ___/
 _/ // / / / /_/ /  __(__  ) /_/ / /_/ / / / /  (__  )  __/ /   | |/ /  __/ /
/___/_/ /_/\__, /\___/____/\__/_/\____/_/ /_/  /____/\___/_/    |___/\___/_/
          /____/
    """)
    logging.info(f"RAGFlow ingestion version: {get_ragflow_version()}")
    show_configs()
    settings.init_settings()
    settings.check_and_install_torch()
    logging.info(f"default embedding config: {settings.EMBEDDING_CFG}")
    settings.print_rag_settings()
    if sys.platform != "win32":
        signal.signal(signal.SIGUSR1, start_tracemalloc_and_snapshot)
        signal.signal(signal.SIGUSR2, stop_tracemalloc)
    TRACE_MALLOC_ENABLED = int(os.environ.get("TRACE_MALLOC_ENABLED", "0"))
    if TRACE_MALLOC_ENABLED:
        start_tracemalloc_and_snapshot(None, None)

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    report_task = asyncio.create_task(report_status())
    tasks = []

    logging.info(f"RAGFlow ingestion is ready after {time.time() - start_ts}s initialization.")
    try:
        while not stop_event.is_set():
            await task_limiter.acquire()
            t = asyncio.create_task(task_manager())
            tasks.append(t)
    finally:
        for t in tasks:
            t.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)
        report_task.cancel()
        await asyncio.gather(report_task, return_exceptions=True)
    logging.error("BUG!!! You should not reach here!!!")


if __name__ == "__main__":
    # Parse command line arguments (consistent with SAAS version)
    parser = argparse.ArgumentParser(description="Task Executor")
    parser.add_argument("-i", "--index", type=str, default="0")
    parser.add_argument("-t", "--type", type=str, default="common", help="[common, graphrag, raptor, resume]")
    args = parser.parse_args()

    # Update global variables
    TASK_TYPE = args.type
    TE_IDX = args.index
    CONSUMER_NAME = f"task_executor_{TASK_TYPE}_{TE_IDX}"

    faulthandler.enable()
    init_root_logger(CONSUMER_NAME)
    try:
        asyncio.run(main())
    except Exception as e:
        logging.exception(f"Unhandled exception: {e}")
        sys.exit(1)
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
+								#
-												llm configuation refine and trievalTest API refine (#40)


											
										
										
											2024-01-19 19:51:57 +08:00
+								#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
+								#
 								#  Licensed under the Apache License, Version 2.0 (the "License");
 								#  you may not use this file except in compliance with the License.
 								#  You may obtain a copy of the License at
 								#
 								#      http://www.apache.org/licenses/LICENSE-2.0
 								#
 								#  Unless required by applicable law or agreed to in writing, software
 								#  distributed under the License is distributed on an "AS IS" BASIS,
 								#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								#  See the License for the specific language governing permissions and
 								#  limitations under the License.
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								import argparse
-												Add time cost when start servers (#12552)

### What problem does this PR solve?

- API server
- Ingestion server
- Data sync server
- Admin server

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2026-01-12 12:48:23 +08:00
+								import time
-												Refa: asyncio.to_thread to ThreadPoolExecutor to break thread limitat… (#12716)

### Type of change

- [x] Refactoring
											
										
										
											2026-01-20 13:29:37 +08:00
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								from rag.svr.task_executor_refactor.task_manager import TaskManager
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								from rag.svr.task_executor_refactor.recording_context import timed_with_recording, get_recording_context, RecordingContext, set_recording_context, NullRecordingContext
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
-												Speed up start time (#14833)

### What problem does this PR solve?

Speed up start time

### Type of change
- [x] Refactoring
											
										
										
											2026-05-12 17:00:45 +08:00
+								start_ts = time.time()
-												Refa: asyncio.to_thread to ThreadPoolExecutor to break thread limitat… (#12716)

### Type of change

- [x] Refactoring
											
										
										
											2026-01-20 13:29:37 +08:00
-												Speed up start time (#14833)

### What problem does this PR solve?

Speed up start time

### Type of change
- [x] Refactoring
											
										
										
											2026-05-12 17:00:45 +08:00
+								# LiteLLM fetches a model cost map from GitHub during import unless this is set.
 								# Parser pods should not block startup on external network access.
 								import os
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
-												Speed up start time (#14833)

### What problem does this PR solve?

Speed up start time

### Type of change
- [x] Refactoring
											
										
										
											2026-05-12 17:00:45 +08:00
+								os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "True")  # no internet, save about 10s
-												Refa: asyncio.to_thread to ThreadPoolExecutor to break thread limitat… (#12716)

### Type of change

- [x] Refactoring
											
										
										
											2026-01-20 13:29:37 +08:00
-												Speed up start time (#14833)

### What problem does this PR solve?

Speed up start time

### Type of change
- [x] Refactoring
											
										
										
											2026-05-12 17:00:45 +08:00
+								from common.misc_utils import thread_pool_exec
-												Add time cost when start servers (#12552)

### What problem does this PR solve?

- API server
- Ingestion server
- Data sync server
- Admin server

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2026-01-12 12:48:23 +08:00
-												Refa:replace trio with asyncio (#11831)

### What problem does this PR solve?

change:
replace trio with asyncio

### Type of change
- [x] Refactoring
											
										
										
											2025-12-09 19:23:14 +08:00
+								import asyncio
-												Feat/monitor task (#11116)

### What problem does this PR solve?

Show task executor.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-10 12:51:39 +08:00
+								import socket
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
-												Removed beartype (#3528)

### What problem does this PR solve?

The beartype configuration of
main(64f50992e0fc4dce73e79f8b951a02e31cb2d638) is:
```
from beartype import BeartypeConf
from beartype.claw import beartype_all  # <-- you didn't sign up for this
beartype_all(conf=BeartypeConf(violation_type=UserWarning))    # <-- emit warnings from all code
```

ragflow_server failed at a third-party package:

```
(ragflow-py3.10) zhichyu@iris:~/github.com/infiniflow/ragflow$ rm -rf logs/* && bash docker/launch_backend_service.sh 
Starting task_executor.py for task 0 (Attempt 1)
Starting ragflow_server.py (Attempt 1)
Traceback (most recent call last):
  File "/home/zhichyu/github.com/infiniflow/ragflow/api/ragflow_server.py", line 22, in <module>
    from api.utils.log_utils import initRootLogger
  File "/home/zhichyu/github.com/infiniflow/ragflow/api/utils/__init__.py", line 25, in <module>
    import requests
  File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/requests/__init__.py", line 43, in <module>
    import urllib3
  File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/urllib3/__init__.py", line 15, in <module>
    from ._base_connection import _TYPE_BODY
  File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/urllib3/_base_connection.py", line 5, in <module>
    from .util.connection import _TYPE_SOCKET_OPTIONS
  File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/urllib3/util/__init__.py", line 4, in <module>
    from .connection import is_connection_dropped
  File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/urllib3/util/connection.py", line 7, in <module>
    from .timeout import _DEFAULT_TIMEOUT, _TYPE_TIMEOUT
  File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/urllib3/util/timeout.py", line 20, in <module>
    _DEFAULT_TIMEOUT: Final[_TYPE_DEFAULT] = _TYPE_DEFAULT.token
NameError: name 'Final' is not defined
Traceback (most recent call last):
  File "/home/zhichyu/github.com/infiniflow/ragflow/rag/svr/task_executor.py", line 22, in <module>
    from api.utils.log_utils import initRootLogger
  File "/home/zhichyu/github.com/infiniflow/ragflow/api/utils/__init__.py", line 25, in <module>
    import requests
  File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/requests/__init__.py", line 43, in <module>
    import urllib3
  File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/urllib3/__init__.py", line 15, in <module>
    from ._base_connection import _TYPE_BODY
  File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/urllib3/_base_connection.py", line 5, in <module>
    from .util.connection import _TYPE_SOCKET_OPTIONS
  File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/urllib3/util/__init__.py", line 4, in <module>
    from .connection import is_connection_dropped
  File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/urllib3/util/connection.py", line 7, in <module>
    from .timeout import _DEFAULT_TIMEOUT, _TYPE_TIMEOUT
  File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/urllib3/util/timeout.py", line 20, in <module>
    _DEFAULT_TIMEOUT: Final[_TYPE_DEFAULT] = _TYPE_DEFAULT.token
NameError: name 'Final' is not defined
```

This third-package is out of our control. I have to remove beartype
entirely.


### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-11-20 17:43:16 +08:00
+								# from beartype import BeartypeConf
 								# from beartype.claw import beartype_all  # <-- you didn't sign up for this
 								# beartype_all(conf=BeartypeConf(violation_type=UserWarning))    # <-- emit warnings from all code
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								import random
-												Rework task executor heartbeat (#3430)

### What problem does this PR solve?

Rework task executor heartbeat, and print in console.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-15 14:43:55 +08:00
+								import sys
-												feat: Recover pending tasks while pod restart. (#7073)

### What problem does this PR solve?

If you deploy Ragflow using Kubernetes, the hostname will change during
a rolling update. This causes the consumer name of the task executor to
change, making it impossible to schedule tasks that were previously in a
pending state.
To address this, I introduced a recovery task that scans these pending
messages and re-publishes them, allowing the tasks to continue being
processed.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

---------

Co-authored-by: liuzhenghua-jk <liuzhenghua-jk@360shuke.com>
											
										
										
											2025-04-19 16:18:51 +08:00
+								import threading
-												Fix: possible memory leaks close #5277 (#5500)

### What problem does this PR solve?

close #5277 by make sure the file close

### Type of change

- [x] Performance Improvement

---------

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
											
										
										
											2025-03-03 10:26:45 +08:00
-												Fix: GraphRAG and RAPTOR tasks do not affect document status (#11194)

### What problem does this PR solve?

GraphRAG and RAPTOR tasks do not affect document status.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-12 12:03:41 +08:00
+								from api.db import PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								from api.db.services.knowledgebase_service import KnowledgebaseService
 								from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
-												Fix: use async task to save memory (#12308)

### What problem does this PR solve?

Use async task to save memory.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-30 11:41:38 +08:00
+								from api.db.joint_services.memory_message_service import handle_save_to_memory_task
-												Move 'timeout' to common folder (#10983)

### What problem does this PR solve?

As title.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-04 11:51:12 +08:00
+								from common.connection_utils import timeout
-												Feat: enhance metadata arranging. (#12745)

### What problem does this PR solve?
#11564

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-01-22 15:34:08 +08:00
+								from common.metadata_utils import turn2jsonschema, update_metadata_to
-												Move some vars to globals (#11017)

### What problem does this PR solve?

As title.

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-05 14:14:38 +08:00
+								from rag.utils.base64_image import image2id
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								from rag.utils.raptor_utils import (
 								    collect_raptor_chunk_ids,
 								    collect_raptor_methods,
 								    get_raptor_clustering_method,
 								    get_raptor_tree_builder,
 								    get_skip_reason,
 								    make_raptor_summary_chunk_id,
 								    should_skip_raptor,
 								)
-												Refactor log utils (#10973)

### What problem does this PR solve?

As title.

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-03 20:25:02 +08:00
+								from common.log_utils import init_root_logger
-												Introduce common/config_utils.py (#10968)

### What problem does this PR solve?

As title.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-03 17:25:06 +08:00
+								from common.config_utils import show_configs
-												Refa: Clean the folders. (#12890)

### Type of change

- [x] Refactoring
											
										
										
											2026-01-29 14:23:26 +08:00
+								from rag.graphrag.utils import get_llm_cache, set_llm_cache, get_tags_from_cache, set_tags_to_cache
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								from rag.prompts.generator import keyword_extraction, question_proposal, content_tagging, run_toc_from_text, gen_metadata
-												Fixed log not displaying (#3946)

### What problem does this PR solve?

Fixed log not displaying

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-12-10 09:36:59 +08:00
+								import logging
 								import os
-												Rework task executor heartbeat (#3430)

### What problem does this PR solve?

Rework task executor heartbeat, and print in console.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-15 14:43:55 +08:00
+								from datetime import datetime
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
+								import json
-												Replaced md5 with xxhash64 for chunk id (#4009)

### What problem does this PR solve?

Replaced md5 with xxhash64 for chunk id

### Type of change

- [x] Refactoring
											
										
										
											2024-12-12 17:47:39 +08:00
+								import xxhash
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
+								import copy
 								import re
-												Add task moduel, and pipline the task and every parser (#49)


											
										
										
											2024-01-31 19:57:45 +08:00
+								from functools import partial
-												fix #258 task_executor occupy cpu too much (#288)

### What problem does this PR solve?

Issue link:#285

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-04-10 10:11:22 +08:00
+								from multiprocessing.context import TimeoutError
-												add redis to accelerate access of minio (#482)

### What problem does this PR solve?

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-04-22 14:11:09 +08:00
+								from timeit import default_timer as timer
-												Optimize ocr (#5297)

### What problem does this PR solve?

Introduced OCR.recognize_batch

### Type of change

- [x] Performance Improvement
											
										
										
											2025-02-24 16:21:55 +08:00
+								import signal
-												Refactor graphrag to remove redis lock (#5828)

### What problem does this PR solve?

Refactor graphrag to remove redis lock

### Type of change

- [x] Refactoring
											
										
										
											2025-03-10 15:15:06 +08:00
+								import exceptiongroup
-												Optimize graphrag cache get entity (#6018)

### What problem does this PR solve?

Optimize graphrag cache get entity

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-13 14:37:59 +08:00
+								import faulthandler
-												handle nits in task_executor (#2637)

### What problem does this PR solve?

- fix typo
- fix string format
- format import

### Type of change

- [x] Refactoring
											
										
										
											2024-09-29 09:49:45 +08:00
+								import numpy as np
-												Try to reuse existing chunks (#3983)

### What problem does this PR solve?

Try to reuse existing chunks. Close #3793
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-12 16:38:03 +08:00
+								from peewee import DoesNotExist
-												Move some constants to common (#11004)

### What problem does this PR solve?

As title.

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-05 08:01:39 +08:00
+								from common.constants import LLMType, ParserType, PipelineTaskType
-												add dialog api (#33)


											
										
										
											2024-01-17 20:20:42 +08:00
+								from api.db.services.document_service import DocumentService
-												Put document metadata in ES/Infinity (#12826)

### What problem does this PR solve?

Put document metadata in ES/Infinity.

Index name of meta data: ragflow_doc_meta_{tenant_id}

### Type of change

- [x] Refactoring
											
										
										
											2026-01-28 13:29:34 +08:00
+								from api.db.services.doc_metadata_service import DocMetadataService
-												Add task moduel, and pipline the task and every parser (#49)


											
										
										
											2024-01-31 19:57:45 +08:00
+								from api.db.services.llm_service import LLMBundle
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								from api.db.services.task_service import TaskService, has_canceled, CANVAS_DEBUG_DOC_ID, GRAPH_RAPTOR_FAKE_DOC_ID
-												handle nits in task_executor (#2637)

### What problem does this PR solve?

- fix typo
- fix string format
- format import

### Type of change

- [x] Refactoring
											
										
										
											2024-09-29 09:49:45 +08:00
+								from api.db.services.file2document_service import File2DocumentService
-												Feat: tenant llm provider (#14595)

### What problem does this PR solve?

Python implementation of the Go-based model_provider API suite.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: bill <yibie_jingnian@163.com>
											
										
										
											2026-05-29 17:39:41 +08:00
+								from api.db.joint_services.tenant_model_service import get_tenant_default_model_by_type, get_model_config_from_provider_instance
-												Admin: add 'show version' (#11079)

### What problem does this PR solve?

```
admin> show version;
show_version
+-----------------------+
| version               |
+-----------------------+
| v0.21.0-241-gc6cf58d5 |
+-----------------------+
admin> \q
Goodbye!

```

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-06 19:24:46 +08:00
+								from common.versions import get_ragflow_version
-												handle nits in task_executor (#2637)

### What problem does this PR solve?

- fix typo
- fix string format
- format import

### Type of change

- [x] Refactoring
											
										
										
											2024-09-29 09:49:45 +08:00
+								from api.db.db_models import close_connection
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, email, tag
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								from rag.nlp import search, rag_tokenizer, add_positions
-												Feat: Add knowledge compilation workflows (#16515)

## Summary
- Add knowledge compilation template APIs, services, and builtin
template seed data
- Add advanced knowledge compile structure/artifact/RAPTOR workflow
support
- Update parsing, dataset/document APIs, and supporting services for
compilation workflows
											
										
										
											2026-07-02 23:22:07 +08:00
+								from rag.advanced_rag.knowlege_compile.raptor import (
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								    RAPTOR_TREE_BUILDER,
 								)
-												Move token related functions to common (#10942)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-03 08:50:05 +08:00
+								from common.token_utils import num_tokens_from_string, truncate
-												feat: Recover pending tasks while pod restart. (#7073)

### What problem does this PR solve?

If you deploy Ragflow using Kubernetes, the hostname will change during
a rolling update. This causes the consumer name of the task executor to
change, making it impossible to schedule tasks that were previously in a
pending state.
To address this, I introduced a recovery task that scans these pending
messages and re-publishes them, allowing the tasks to continue being
processed.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

---------

Co-authored-by: liuzhenghua-jk <liuzhenghua-jk@360shuke.com>
											
										
										
											2025-04-19 16:18:51 +08:00
+								from rag.utils.redis_conn import REDIS_CONN, RedisDistributedLock
-												Refa: Clean the folders. (#12890)

### Type of change

- [x] Refactoring
											
										
										
											2026-01-29 14:23:26 +08:00
+								from rag.graphrag.utils import chat_limiter
-												Fix and refactor imports (#11010)

### What problem does this PR solve?

1. Move EMBEDDING_CFG to common.globals
2. Fix error imports
3. Move signal handles to common/signal_utils.py

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-05 11:07:54 +08:00
+								from common.signal_utils import start_tracemalloc_and_snapshot, stop_tracemalloc
-												Feat: GraphRAG handle cancel gracefully (#11061)

### What problem does this PR solve?

 GraghRAG handle cancel gracefully. #10997.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-06 16:12:20 +08:00
+								from common.exceptions import TaskCanceledException
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								from rag.svr.task_executor_limiter import (
 								    task_limiter,
 								    chunk_limiter,
 								    embed_limiter,
 								    minio_limiter,
 								    kg_limiter,
 								)
-												Move api.settings to common.settings (#11036)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-06 09:36:38 +08:00
+								from common import settings
 								from common.constants import PAGERANK_FLD, TAG_FLD, SVR_CONSUMER_GROUP_NAME
-												Feature/table parser column roles (#13710)

### What problem does this PR solve?

The table file parser (CSV/Excel) currently treats all columns
identically — every column is both vectorized (embedded in chunk text)
and stored as filterable metadata. There's no way for users to control
which columns should be searchable by semantic meaning versus which
should only be filterable attributes.

For example, when ingesting a news articles CSV with columns like title,
content, country, category, source, etc., the embedding includes
metadata fields like country: Brazil and source: Reuters in the chunk
text, which dilutes the semantic quality of the embedding without adding
retrieval value.

The RDBMS connector (MySQL/PostgreSQL) already supports content_columns
/ metadata_columns, but this capability was missing for file-based table
ingestion.

This PR adds column-level control (vectorize / metadata / both) for the
table file parser, following RAGFlow's existing patterns.

Backward compatible: Datasets without table_column_roles or with
table_column_mode: auto behave exactly as before (all columns = both).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-05-11 07:06:04 +05:00
+								from rag.utils.table_es_metadata import (
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								    aggregate_table_doc_metadata,
-												Feature/table parser column roles (#13710)

### What problem does this PR solve?

The table file parser (CSV/Excel) currently treats all columns
identically — every column is both vectorized (embedded in chunk text)
and stored as filterable metadata. There's no way for users to control
which columns should be searchable by semantic meaning versus which
should only be filterable attributes.

For example, when ingesting a news articles CSV with columns like title,
content, country, category, source, etc., the embedding includes
metadata fields like country: Brazil and source: Reuters in the chunk
text, which dilutes the semantic quality of the embedding without adding
retrieval value.

The RDBMS connector (MySQL/PostgreSQL) already supports content_columns
/ metadata_columns, but this capability was missing for file-based table
ingestion.

This PR adds column-level control (vectorize / metadata / both) for the
table file parser, following RAGFlow's existing patterns.

Backward compatible: Datasets without table_column_roles or with
table_column_mode: auto behave exactly as before (all columns = both).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-05-11 07:06:04 +05:00
+								    merge_table_parser_config_from_kb,
 								    table_parser_strip_doc_metadata_keys,
 								)
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								from rag.nlp import search as nlp_search
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
 								BATCH_SIZE = 64
-												Feature/table parser column roles (#13710)

### What problem does this PR solve?

The table file parser (CSV/Excel) currently treats all columns
identically — every column is both vectorized (embedded in chunk text)
and stored as filterable metadata. There's no way for users to control
which columns should be searchable by semantic meaning versus which
should only be filterable attributes.

For example, when ingesting a news articles CSV with columns like title,
content, country, category, source, etc., the embedding includes
metadata fields like country: Brazil and source: Reuters in the chunk
text, which dilutes the semantic quality of the embedding without adding
retrieval value.

The RDBMS connector (MySQL/PostgreSQL) already supports content_columns
/ metadata_columns, but this capability was missing for file-based table
ingestion.

This PR adds column-level control (vectorize / metadata / both) for the
table file parser, following RAGFlow's existing patterns.

Backward compatible: Datasets without table_column_roles or with
table_column_mode: auto behave exactly as before (all columns = both).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-05-11 07:06:04 +05:00
-												Add task moduel, and pipline the task and every parser (#49)


											
										
										
											2024-01-31 19:57:45 +08:00
+								FACTORY = {
-												fix table desc bugs, add positions to chunks (#91)


											
										
										
											2024-03-04 14:42:26 +08:00
+								    "general": naive,
-												fix user login issue (#85)


											
										
										
											2024-02-29 14:03:07 +08:00
+								    ParserType.NAIVE.value: naive,
-												Add task moduel, and pipline the task and every parser (#49)


											
										
										
											2024-01-31 19:57:45 +08:00
+								    ParserType.PAPER.value: paper,
-												remove unused codes, seperate layout detection out as a new api. Add new rag methed 'table' (#55)


											
										
										
											2024-02-05 18:08:17 +08:00
+								    ParserType.BOOK.value: book,
-												Add task moduel, and pipline the task and every parser (#49)


											
										
										
											2024-01-31 19:57:45 +08:00
+								    ParserType.PRESENTATION.value: presentation,
 								    ParserType.MANUAL.value: manual,
 								    ParserType.LAWS.value: laws,
-												Add Q&A and Book, fix task running bugs (#50)


											
										
										
											2024-02-01 18:53:56 +08:00
+								    ParserType.QA.value: qa,
-												remove unused codes, seperate layout detection out as a new api. Add new rag methed 'table' (#55)


											
										
										
											2024-02-05 18:08:17 +08:00
+								    ParserType.TABLE.value: table,
-												refactor retieval_test, add SQl retrieval methods (#61)


											
										
										
											2024-02-08 17:01:01 +08:00
+								    ParserType.RESUME.value: resume,
-												init README of deepdoc, add picture processer. (#71)

* init README of deepdoc, add picture processer.

* add resume parsing
											
										
										
											2024-02-23 18:28:12 +08:00
+								    ParserType.PICTURE.value: picture,
-												Add 'One' chunk method (#137)


											
										
										
											2024-03-20 18:57:22 +08:00
+								    ParserType.ONE.value: one,
-												Add graphrag (#1793)

### What problem does this PR solve?

#1594

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-08-02 18:51:14 +08:00
+								    ParserType.AUDIO.value: audio,
-												add support for eml file parser (#1768)

### What problem does this PR solve?

add support for eml file parser
#1363

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
											
										
										
											2024-08-06 16:42:14 +08:00
+								    ParserType.EMAIL.value: email,
-												Light GraphRAG (#4585)

### What problem does this PR solve?

#4543

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-22 19:43:14 +08:00
+								    ParserType.KG.value: naive,
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								    ParserType.TAG.value: tag,
-												Add task moduel, and pipline the task and every parser (#49)


											
										
										
											2024-01-31 19:57:45 +08:00
+								}
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								TASK_TYPE_TO_PIPELINE_TASK_TYPE = {
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								    "dataflow": PipelineTaskType.PARSE,
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								    "raptor": PipelineTaskType.RAPTOR,
 								    "graphrag": PipelineTaskType.GRAPH_RAG,
 								    "mindmap": PipelineTaskType.MINDMAP,
-												Fix: use async task to save memory (#12308)

### What problem does this PR solve?

Use async task to save memory.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-30 11:41:38 +08:00
+								    "memory": PipelineTaskType.MEMORY,
-												Feat: Add knowledge compilation workflows (#16515)

## Summary
- Add knowledge compilation template APIs, services, and builtin
template seed data
- Add advanced knowledge compile structure/artifact/RAPTOR workflow
support
- Update parsing, dataset/document APIs, and supporting services for
compilation workflows
											
										
										
											2026-07-02 23:22:07 +08:00
+								    "artifact": PipelineTaskType.ARTIFACT,
 								    "skill": PipelineTaskType.SKILL,
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								}
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								UNACKED_ITERATOR = None
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								# Task type and executor index (consistent with SAAS version)
 								TASK_TYPE = "common"
 								TE_IDX = "0"
-												Fix executor name (#6080)

### What problem does this PR solve?

Fix executor name

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-03-14 14:13:47 +08:00
-												Added time to progress message (#4185)

### What problem does this PR solve?

Added time to progress message

### Type of change

- [x] Refactoring
											
										
										
											2024-12-23 17:25:55 +08:00
+								BOOT_AT = datetime.now().astimezone().isoformat(timespec="milliseconds")
-												Rework task executor heartbeat (#3430)

### What problem does this PR solve?

Rework task executor heartbeat, and print in console.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-15 14:43:55 +08:00
+								PENDING_TASKS = 0
-												handle_task catch all exception (#3441)

### What problem does this PR solve?

handle_task catch all exception
Report heartbeats

### Type of change

- [x] Refactoring
											
										
										
											2024-11-15 18:51:09 +08:00
+								LAG_TASKS = 0
-												Added current task into task executor's hearbeat (#3444)

### What problem does this PR solve?

Added current task into task executor's hearbeat

### Type of change

- [x] Refactoring
											
										
										
											2024-11-15 22:55:41 +08:00
+								DONE_TASKS = 0
 								FAILED_TASKS = 0
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								CURRENT_TASKS = {}
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								WORKER_HEARTBEAT_TIMEOUT = int(os.environ.get("WORKER_HEARTBEAT_TIMEOUT", "120"))
-												feat: Recover pending tasks while pod restart. (#7073)

### What problem does this PR solve?

If you deploy Ragflow using Kubernetes, the hostname will change during
a rolling update. This causes the consumer name of the task executor to
change, making it impossible to schedule tasks that were previously in a
pending state.
To address this, I introduced a recovery task that scans these pending
messages and re-publishes them, allowing the tasks to continue being
processed.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

---------

Co-authored-by: liuzhenghua-jk <liuzhenghua-jk@360shuke.com>
											
										
										
											2025-04-19 16:18:51 +08:00
+								stop_event = threading.Event()
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
-												feat: Recover pending tasks while pod restart. (#7073)

### What problem does this PR solve?

If you deploy Ragflow using Kubernetes, the hostname will change during
a rolling update. This causes the consumer name of the task executor to
change, making it impossible to schedule tasks that were previously in a
pending state.
To address this, I introduced a recovery task that scans these pending
messages and re-publishes them, allowing the tasks to continue being
processed.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

---------

Co-authored-by: liuzhenghua-jk <liuzhenghua-jk@360shuke.com>
											
										
										
											2025-04-19 16:18:51 +08:00
+								def signal_handler(sig, frame):
 								    logging.info("Received interrupt signal, shutting down...")
 								    stop_event.set()
 								    time.sleep(1)
 								    sys.exit(0)
-												Optimize ocr (#5297)

### What problem does this PR solve?

Introduced OCR.recognize_batch

### Type of change

- [x] Performance Improvement
											
										
										
											2025-02-24 16:21:55 +08:00
-												Feat: add OCR's muti-gpus and parallel processing support (#5972)

### What problem does this PR solve?

Add OCR's muti-gpus and parallel processing support

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

@yuzhichang I've tried to resolve the comments in #5697. OCR jobs can
now be done on both CPU and GPU. ( By the way, I've encountered a
“Generate embedding error” issue #5954 that might be due to my outdated
GPUs? idk. ) Please review it and give me suggestions.

GPU:

![gpu_ocr](https://github.com/user-attachments/assets/0ee2ecfb-a665-4e50-8bc7-15941b9cd80e)

![smi](https://github.com/user-attachments/assets/a2312f8c-cf24-443d-bf89-bec50503546d)

CPU:

![cpu_ocr](https://github.com/user-attachments/assets/1ba6bb0b-94df-41ea-be79-790096da4bf1)
											
										
										
											2025-03-17 11:58:40 +08:00
-												handle nits in task_executor (#2637)

### What problem does this PR solve?

- fix typo
- fix string format
- format import

### Type of change

- [x] Refactoring
											
										
										
											2024-09-29 09:49:45 +08:00
+								def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing..."):
-												Optimize graphrag cache get entity (#6018)

### What problem does this PR solve?

Optimize graphrag cache get entity

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-13 14:37:59 +08:00
+								    try:
 								        if prog is not None and prog < 0:
 								            msg = "[ERROR]" + msg
-												Pref: use redis to check if canceled. (#8853)

### What problem does this PR solve?

### Type of change

- [x] Performance Improvement
											
										
										
											2025-07-15 17:19:27 +08:00
+								        cancel = has_canceled(task_id)
-												Optimize graphrag cache get entity (#6018)

### What problem does this PR solve?

Optimize graphrag cache get entity

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-13 14:37:59 +08:00
 								        if cancel:
 								            msg += " [Canceled]"
 								            prog = -1
 								        if to_page > 0:
 								            if msg:
 								                if from_page < to_page:
 								                    msg = f"Page({from_page + 1}~{to_page + 1}): " + msg
-												change callback strategy, add timezone to docker (#96)


											
										
										
											2024-03-05 12:08:41 +08:00
+								        if msg:
-												Optimize graphrag cache get entity (#6018)

### What problem does this PR solve?

Optimize graphrag cache get entity

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-13 14:37:59 +08:00
+								            msg = datetime.now().strftime("%H:%M:%S") + " " + msg
 								        d = {"progress_msg": msg}
 								        if prog is not None:
 								            d["progress"] = prog
 								        TaskService.update_progress(task_id, d)
 								        close_connection()
 								        if cancel:
 								            raise TaskCanceledException(msg)
 								        logging.info(f"set_progress({task_id}), progress: {prog}, progress_msg: {msg}")
-												Fix: task cancel (#13034)

### What problem does this PR solve?

Fix: task cancel #11745 
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-02-06 14:48:24 +08:00
+								    except TaskCanceledException:
 								        raise
-												Optimize graphrag cache get entity (#6018)

### What problem does this PR solve?

Optimize graphrag cache get entity

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-13 14:37:59 +08:00
+								    except DoesNotExist:
 								        logging.warning(f"set_progress({task_id}) got exception DoesNotExist")
-												Refactor code (#12305)

### What problem does this PR solve?

as title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-30 11:09:18 +08:00
+								    except Exception as e:
 								        logging.exception(f"set_progress({task_id}), progress: {prog}, progress_msg: {msg}, got exception: {e}")
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
-												Perf: set timeout for building chunks. (#8940)

### What problem does this PR solve?


### Type of change

- [x] Performance Improvement
											
										
										
											2025-07-21 15:56:45 +08:00
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								async def collect():
 								    global CONSUMER_NAME, DONE_TASKS, FAILED_TASKS
 								    global UNACKED_ITERATOR
-												Refa: GraphRAG and explaining GraphRAG stalling behavior on large files (#8223)

### What problem does this PR solve?

This PR investigates the cause of #7957.

TL;DR: Incorrect similarity calculations lead to too many candidates.
Since candidate selection involves interaction with the LLM, this causes
significant delays in the program.

What this PR does:

1. **Fix similarity calculation**:
When processing a 64 pages government document, the corrected similarity
calculation reduces the number of candidates from over 100,000 to around
16,000. With a default batch size of 100 pairs per LLM call, this fix
reduces unnecessary LLM interactions from over 1,000 calls to around
160, a roughly 10x improvement.
2. **Add concurrency and timeout limits**: 
Up to 5 entity types are processed in "parallel", each with a 180-second
timeout. These limits may be configurable in future updates.
3. **Improve logging**:
The candidate resolution process now reports progress in real time.
4. **Mitigates potential concurrency risks**


### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
											
										
										
											2025-06-12 19:09:50 +08:00
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								    svr_queue_names = settings.get_svr_queue_names(TASK_TYPE)
-												Refactor code (#12305)

### What problem does this PR solve?

as title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-30 11:09:18 +08:00
+								    redis_msg = None
-												optimize srv broker and executor logic (#630)

### What problem does this PR solve?

Optimize task broker and executor for reduce memory usage and deployment
complexity.

### Type of change
- [x] Performance Improvement
- [x] Refactoring

### Change Log
- Enhance redis utils for message queue(use stream)
- Modify task broker logic via message queue (1.get parse event from
message queue 2.use ThreadPoolExecutor async executor )
- Modify the table column name of document and task (process_duation ->
process_duration maybe just a spelling mistake)
- Reformat some code style(just what i see)
- Add requirement_dev.txt for developer
- Add redis container on docker compose

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
											
										
										
											2024-05-07 11:43:33 +08:00
+								    try:
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								        if not UNACKED_ITERATOR:
-												Refa: revert to original task message collection logic (#8251)

### What problem does this PR solve?

Get rid of 'RedisDB.get_unacked_iterator queue rag_flow_svr_queue_1
doesn't exist'

----

Edit: revert to original message collection logic.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
											
										
										
											2025-06-13 16:38:53 +08:00
+								            UNACKED_ITERATOR = REDIS_CONN.get_unacked_iterator(svr_queue_names, SVR_CONSUMER_GROUP_NAME, CONSUMER_NAME)
 								        try:
 								            redis_msg = next(UNACKED_ITERATOR)
 								        except StopIteration:
-												Introduced task priority (#6118)

### What problem does this PR solve?

Introduced task priority

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-03-14 23:43:46 +08:00
+								            for svr_queue_name in svr_queue_names:
-												Refa: revert to original task message collection logic (#8251)

### What problem does this PR solve?

Get rid of 'RedisDB.get_unacked_iterator queue rag_flow_svr_queue_1
doesn't exist'

----

Edit: revert to original message collection logic.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
											
										
										
											2025-06-13 16:38:53 +08:00
+								                redis_msg = REDIS_CONN.queue_consumer(svr_queue_name, SVR_CONSUMER_GROUP_NAME, CONSUMER_NAME)
 								                if redis_msg:
 								                    break
-												Refactor code (#12305)

### What problem does this PR solve?

as title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-30 11:09:18 +08:00
+								    except Exception as e:
 								        logging.exception(f"collect got exception: {e}")
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								        return None, None
-												optimize srv broker and executor logic (#630)

### What problem does this PR solve?

Optimize task broker and executor for reduce memory usage and deployment
complexity.

### Type of change
- [x] Performance Improvement
- [x] Refactoring

### Change Log
- Enhance redis utils for message queue(use stream)
- Modify task broker logic via message queue (1.get parse event from
message queue 2.use ThreadPoolExecutor async executor )
- Modify the table column name of document and task (process_duation ->
process_duration maybe just a spelling mistake)
- Reformat some code style(just what i see)
- Add requirement_dev.txt for developer
- Add redis container on docker compose

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
											
										
										
											2024-05-07 11:43:33 +08:00
-												Introduced task priority (#6118)

### What problem does this PR solve?

Introduced task priority

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-03-14 23:43:46 +08:00
+								    if not redis_msg:
 								        return None, None
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								    msg = redis_msg.get_message()
-												make task resumable (#2132)

### What problem does this PR solve?

### Type of change


- [x] Performance Improvement
											
										
										
											2024-08-28 14:06:27 +08:00
+								    if not msg:
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								        logging.error(f"collect got empty message of {redis_msg.get_msg_id()}")
 								        redis_msg.ack()
 								        return None, None
-												optimize srv broker and executor logic (#630)

### What problem does this PR solve?

Optimize task broker and executor for reduce memory usage and deployment
complexity.

### Type of change
- [x] Performance Improvement
- [x] Refactoring

### Change Log
- Enhance redis utils for message queue(use stream)
- Modify task broker logic via message queue (1.get parse event from
message queue 2.use ThreadPoolExecutor async executor )
- Modify the table column name of document and task (process_duation ->
process_duration maybe just a spelling mistake)
- Reformat some code style(just what i see)
- Add requirement_dev.txt for developer
- Add redis container on docker compose

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
											
										
										
											2024-05-07 11:43:33 +08:00
-												Try to reuse existing chunks (#3983)

### What problem does this PR solve?

Try to reuse existing chunks. Close #3793
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-12 16:38:03 +08:00
+								    canceled = False
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								    if msg.get("doc_id", "") in [GRAPH_RAPTOR_FAKE_DOC_ID, CANVAS_DEBUG_DOC_ID]:
 								        task = msg
-												Fix: GraphRAG and RAPTOR tasks do not affect document status (#11194)

### What problem does this PR solve?

GraphRAG and RAPTOR tasks do not affect document status.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-12 12:03:41 +08:00
+								        if task["task_type"] in PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES:
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								            task = TaskService.get_task(msg["id"], msg["doc_ids"])
-												Feat: Support multiple data sources synchronizations (#10954)

### What problem does this PR solve?
#10953

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-03 19:59:18 +08:00
+								            if task:
 								                task["doc_id"] = msg["doc_id"]
 								                task["doc_ids"] = msg.get("doc_ids", []) or []
-												Fix: use async task to save memory (#12308)

### What problem does this PR solve?

Use async task to save memory.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-30 11:41:38 +08:00
+								    elif msg.get("task_type") == PipelineTaskType.MEMORY.lower():
 								        _, task_obj = TaskService.get_by_id(msg["id"])
 								        task = task_obj.to_dict()
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								    else:
 								        task = TaskService.get_task(msg["id"])
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								    if task:
-												Pref: use redis to check if canceled. (#8853)

### What problem does this PR solve?

### Type of change

- [x] Performance Improvement
											
										
										
											2025-07-15 17:19:27 +08:00
+								        canceled = has_canceled(task["id"])
-												Try to reuse existing chunks (#3983)

### What problem does this PR solve?

Try to reuse existing chunks. Close #3793
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-12 16:38:03 +08:00
+								    if not task or canceled:
 								        state = "is unknown" if not task else "has been cancelled"
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								        FAILED_TASKS += 1
 								        logging.warning(f"collect task {msg['id']} {state}")
 								        redis_msg.ack()
-												Refactored DocumentService.update_progress (#5642)

### What problem does this PR solve?

Refactored DocumentService.update_progress

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-03-05 14:48:03 +08:00
+								        return None, None
-												Feat: refine dataflow and initialize dataflow app (#9952)

### What problem does this PR solve?

Refine dataflow and initialize dataflow app.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-09-05 18:50:46 +08:00
 								    task_type = msg.get("task_type", "")
 								    task["task_type"] = task_type
-												Feat: Add knowledge compilation workflows (#16515)

## Summary
- Add knowledge compilation template APIs, services, and builtin
template seed data
- Add advanced knowledge compile structure/artifact/RAPTOR workflow
support
- Update parsing, dataset/document APIs, and supporting services for
compilation workflows
											
										
										
											2026-07-02 23:22:07 +08:00
+								    # Per-doc fan-out task types (today: doc-scoped raptor) carry their
 								    # participating doc id list on the Redis message but not on the DB
 								    # row. The KB-scoped branch above already does this for FAKE doc
 								    # tasks; mirror here so ``ctx.doc_ids`` is populated for the
 								    # per-doc path too.
 								    if "doc_ids" in msg and not task.get("doc_ids"):
 								        task["doc_ids"] = msg.get("doc_ids", []) or []
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								    if task_type[:8] == "dataflow":
 								        task["tenant_id"] = msg["tenant_id"]
 								        task["dataflow_id"] = msg["dataflow_id"]
-												Feat: refine dataflow and initialize dataflow app (#9952)

### What problem does this PR solve?

Refine dataflow and initialize dataflow app.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-09-05 18:50:46 +08:00
+								        task["kb_id"] = msg.get("kb_id", "")
-												Fix: use async task to save memory (#12308)

### What problem does this PR solve?

Use async task to save memory.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-30 11:41:38 +08:00
+								    if task_type[:6] == "memory":
 								        task["memory_id"] = msg["memory_id"]
-												fix: propagate memory tenant id in task collect (#15837)

### What problem does this PR solve?
Propagate `tenant_id` from memory task messages into task collection so
refactored task execution can build a valid context.

### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-09 17:47:48 +08:00
+								        if msg.get("tenant_id"):
 								            task["tenant_id"] = msg["tenant_id"]
-												Fix: use async task to save memory (#12308)

### What problem does this PR solve?

Use async task to save memory.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-30 11:41:38 +08:00
+								        task["source_id"] = msg["source_id"]
 								        task["message_dict"] = msg["message_dict"]
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								    return redis_msg, task
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
-												Support Ollama (#261)

### What problem does this PR solve?

Issue link:#221

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-04-08 19:20:57 +08:00
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								async def get_storage_binary(bucket, name):
-												Refa: asyncio.to_thread to ThreadPoolExecutor to break thread limitat… (#12716)

### Type of change

- [x] Refactoring
											
										
										
											2026-01-20 13:29:37 +08:00
+								    return await thread_pool_exec(settings.STORAGE_IMPL.get, bucket, name)
-												Support Ollama (#261)

### What problem does this PR solve?

Issue link:#221

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-04-08 19:20:57 +08:00
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								@timed_with_recording
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								@timeout(60 * 80, 1)
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								async def build_chunks(task, progress_callback):
-												Move api.settings to common.settings (#11036)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-06 09:36:38 +08:00
+								    if task["size"] > settings.DOC_MAXIMUM_SIZE:
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        set_progress(task["id"], prog=-1, msg="File size exceeds( <= %dMb )" % (int(settings.DOC_MAXIMUM_SIZE / 1024 / 1024)))
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        get_recording_context().record("file_size_exceeded", True)
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
+								        return []
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								    get_recording_context().record("file_size_exceeded", False)
 								    get_recording_context().record("parser_id", task["parser_id"])
-												add alot of api (#23)

* clean rust version project

* clean rust version project

* build python version rag-flow

* add alot of api
											
										
										
											2024-01-15 19:47:25 +08:00
-												Refactor parse progress (#3781)

### What problem does this PR solve?

Refactor parse file progress

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 22:28:00 +08:00
+								    chunker = FACTORY[task["parser_id"].lower()]
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
+								    try:
-												Support Ollama (#261)

### What problem does this PR solve?

Issue link:#221

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-04-08 19:20:57 +08:00
+								        st = timer()
-												Refactor parse progress (#3781)

### What problem does this PR solve?

Refactor parse file progress

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 22:28:00 +08:00
+								        bucket, name = File2DocumentService.get_storage_address(doc_id=task["doc_id"])
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								        binary = await get_storage_binary(bucket, name)
-												Fix: empty file with better message (#15232)

Fix: empty file with better message
											
										
										
											2026-05-26 12:28:53 +08:00
+								        if binary is None:
 								            raise FileNotFoundError(f"File not found: storage returned no content for {bucket}/{name}.")
-												Refactor parse progress (#3781)

### What problem does this PR solve?

Refactor parse file progress

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 22:28:00 +08:00
+								        logging.info("From minio({}) {}/{}".format(timer() - st, task["location"], task["name"]))
-												handle nits in task_executor (#2637)

### What problem does this PR solve?

- fix typo
- fix string format
- format import

### Type of change

- [x] Refactoring
											
										
										
											2024-09-29 09:49:45 +08:00
+								    except TimeoutError:
-												Refactor parse progress (#3781)

### What problem does this PR solve?

Refactor parse file progress

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 22:28:00 +08:00
+								        progress_callback(-1, "Internal server error: Fetch file from minio timeout. Could you try it again.")
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        logging.exception("Minio {}/{} got timeout: Fetch file from minio timeout.".format(task["location"], task["name"]))
-												handle_task catch all exception (#3441)

### What problem does this PR solve?

handle_task catch all exception
Report heartbeats

### Type of change

- [x] Refactoring
											
										
										
											2024-11-15 18:51:09 +08:00
+								        raise
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
+								    except Exception as e:
 								        if re.search("(No such file|not found)", str(e)):
-												Refactor parse progress (#3781)

### What problem does this PR solve?

Refactor parse file progress

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 22:28:00 +08:00
+								            progress_callback(-1, "Can not find file <%s> from minio. Could you try it again?" % task["name"])
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
+								        else:
-												Refactor parse progress (#3781)

### What problem does this PR solve?

Refactor parse file progress

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 22:28:00 +08:00
+								            progress_callback(-1, "Get file from minio: %s" % str(e).replace("'", ""))
 								        logging.exception("Chunking {}/{} got exception".format(task["location"], task["name"]))
-												handle_task catch all exception (#3441)

### What problem does this PR solve?

handle_task catch all exception
Report heartbeats

### Type of change

- [x] Refactoring
											
										
										
											2024-11-15 18:51:09 +08:00
+								        raise
-												add alot of api (#23)

* clean rust version project

* clean rust version project

* build python version rag-flow

* add alot of api
											
										
										
											2024-01-15 19:47:25 +08:00
-												Feature/table parser column roles (#13710)

### What problem does this PR solve?

The table file parser (CSV/Excel) currently treats all columns
identically — every column is both vectorized (embedded in chunk text)
and stored as filterable metadata. There's no way for users to control
which columns should be searchable by semantic meaning versus which
should only be filterable attributes.

For example, when ingesting a news articles CSV with columns like title,
content, country, category, source, etc., the embedding includes
metadata fields like country: Brazil and source: Reuters in the chunk
text, which dilutes the semantic quality of the embedding without adding
retrieval value.

The RDBMS connector (MySQL/PostgreSQL) already supports content_columns
/ metadata_columns, but this capability was missing for file-based table
ingestion.

This PR adds column-level control (vectorize / metadata / both) for the
table file parser, following RAGFlow's existing patterns.

Backward compatible: Datasets without table_column_roles or with
table_column_mode: auto behave exactly as before (all columns = both).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-05-11 07:06:04 +05:00
+								    # Table parser column roles / mode are stored on the dataset (KB) parser_config;
 								    # chunk tasks carry document-level parser_config only — merge KB keys so manual roles apply.
 								    parser_config_for_chunk = merge_table_parser_config_from_kb(task)
 								    if task.get("parser_id", "").lower() == "table" and task.get("kb_parser_config"):
 								        logging.debug(
 								            "[TASK_EXECUTOR_DEBUG] table parser: merged KB keys into parser_config for chunk; "
 								            f"mode={parser_config_for_chunk.get('table_column_mode')}, "
 								            f"roles_keys={list((parser_config_for_chunk.get('table_column_roles') or {}).keys())}"
 								        )
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								    # Record chunk configuration for comparison
 								    from common.float_utils import normalize_overlapped_percent
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								    chunk_config = {
 								        "parser_id": task["parser_id"],
 								        "chunk_token_num": parser_config_for_chunk.get("chunk_token_num", 128),
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        "overlapped_percent": normalize_overlapped_percent(parser_config_for_chunk.get("overlapped_percent", 0)),
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        "delimiter": parser_config_for_chunk.get("delimiter", "\n!?。；！？"),
 								        "from_page": task["from_page"],
 								        "to_page": task["to_page"],
 								        "language": task["language"],
 								        "layout_recognizer": parser_config_for_chunk.get("layout_recognizer"),
 								    }
 								    get_recording_context().record("chunk_config", chunk_config)
 								    get_recording_context().record("parser_config_after_merge", parser_config_for_chunk)
-												refine error log while chunking (#1937)

### What problem does this PR solve?



### Type of change

- [x] Refactoring
											
										
										
											2024-08-14 11:09:07 +08:00
+								    try:
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								        async with chunk_limiter:
-												Fix: guard missing task language (#15136)

### What problem does this PR solve?

guard missing task language

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-05-22 11:46:38 +08:00
+								            task_language = task.get("language") or "Chinese"
-												Refa: asyncio.to_thread to ThreadPoolExecutor to break thread limitat… (#12716)

### Type of change

- [x] Refactoring
											
										
										
											2026-01-20 13:29:37 +08:00
+								            cks = await thread_pool_exec(
-												Refa:replace trio with asyncio (#11831)

### What problem does this PR solve?

change:
replace trio with asyncio

### Type of change
- [x] Refactoring
											
										
										
											2025-12-09 19:23:14 +08:00
+								                chunker.chunk,
 								                task["name"],
 								                binary=binary,
 								                from_page=task["from_page"],
 								                to_page=task["to_page"],
-												Fix: guard missing task language (#15136)

### What problem does this PR solve?

guard missing task language

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-05-22 11:46:38 +08:00
+								                lang=task_language,
-												Refa:replace trio with asyncio (#11831)

### What problem does this PR solve?

change:
replace trio with asyncio

### Type of change
- [x] Refactoring
											
										
										
											2025-12-09 19:23:14 +08:00
+								                callback=progress_callback,
 								                kb_id=task["kb_id"],
-												Feature/table parser column roles (#13710)

### What problem does this PR solve?

The table file parser (CSV/Excel) currently treats all columns
identically — every column is both vectorized (embedded in chunk text)
and stored as filterable metadata. There's no way for users to control
which columns should be searchable by semantic meaning versus which
should only be filterable attributes.

For example, when ingesting a news articles CSV with columns like title,
content, country, category, source, etc., the embedding includes
metadata fields like country: Brazil and source: Reuters in the chunk
text, which dilutes the semantic quality of the embedding without adding
retrieval value.

The RDBMS connector (MySQL/PostgreSQL) already supports content_columns
/ metadata_columns, but this capability was missing for file-based table
ingestion.

This PR adds column-level control (vectorize / metadata / both) for the
table file parser, following RAGFlow's existing patterns.

Backward compatible: Datasets without table_column_roles or with
table_column_mode: auto behave exactly as before (all columns = both).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-05-11 07:06:04 +05:00
+								                parser_config=parser_config_for_chunk,
-												Refa:replace trio with asyncio (#11831)

### What problem does this PR solve?

change:
replace trio with asyncio

### Type of change
- [x] Refactoring
											
										
										
											2025-12-09 19:23:14 +08:00
+								                tenant_id=task["tenant_id"],
 								            )
-												Refactor parse progress (#3781)

### What problem does this PR solve?

Refactor parse file progress

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 22:28:00 +08:00
+								        logging.info("Chunking({}) {}/{} done".format(timer() - st, task["location"], task["name"]))
-												Try to reuse existing chunks (#3983)

### What problem does this PR solve?

Try to reuse existing chunks. Close #3793
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-12 16:38:03 +08:00
+								    except TaskCanceledException:
 								        raise
-												refine error log while chunking (#1937)

### What problem does this PR solve?



### Type of change

- [x] Refactoring
											
										
										
											2024-08-14 11:09:07 +08:00
+								    except Exception as e:
-												Refactor parse progress (#3781)

### What problem does this PR solve?

Refactor parse file progress

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 22:28:00 +08:00
+								        progress_callback(-1, "Internal server error while chunking: %s" % str(e).replace("'", ""))
 								        logging.exception("Chunking {}/{} got exception".format(task["location"], task["name"]))
-												handle_task catch all exception (#3441)

### What problem does this PR solve?

handle_task catch all exception
Report heartbeats

### Type of change

- [x] Refactoring
											
										
										
											2024-11-15 18:51:09 +08:00
+								        raise
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								    # Record raw chunks for comparison
 								    get_recording_context().record("raw_chunks", cks)
-												feat: persist PDF bookmark outline as document metadata (#13287)

## Summary

PDF files often contain a bookmark/outline tree (table of contents built
into the file by the authoring tool). RAGFlow's `pdf_parser.outlines`
already extracts these `(title, depth)` tuples via pypdf, but they are
used ephemerally during chunking (`manual` parser uses them for
hierarchy detection) and then discarded.

This PR persists the outline as `doc.meta_fields["outline"]` — a JSON
array of `{"title": str, "depth": int}` objects — so downstream features
can use the structural information.

### Why this matters

- **Complementary to `toc_extraction`** — the existing `toc_extraction`
feature uses LLM calls to generate a TOC and only works for the `naive`
parser. The raw PDF outline is free (already extracted by pypdf), works
for all parsers, and captures the author's original document structure.
- **Document navigation** — frontends can render a clickable TOC from
the outline
- **Entity extraction** — the outline provides a structural map for
identifying document sections and key topics
- **Search result context** — knowing which section a chunk belongs to
helps users evaluate relevance

### Changes

| File | Change | LOC |
|------|--------|-----|
| `rag/app/naive.py` | Attach `pdf_parser.outlines` as `__outline__` on
first chunk dict | ~7 |
| `rag/app/manual.py` | Same for the manual parser | ~5 |
| `rag/svr/task_executor.py` | Extract `__outline__`, persist via
`DocMetadataService.update_document_metadata()` | ~12 |

### Design decisions

- **Transient key pattern**: The outline is passed from parser →
task_executor via `__outline__` on the first chunk dict, then removed
before indexing. This follows the same pattern as `metadata_obj` for
LLM-generated metadata.
- **No schema changes**: Uses the existing `meta_fields` JSON column on
the document table.
- **Graceful degradation**: If a PDF has no outline (common for scanned
docs), nothing is stored. If persistence fails, it logs a warning and
continues — parsing is not interrupted.

### Backward compatibility

- **Fully backward compatible** — no existing fields, behavior, or
schemas changed
- PDFs without outlines are unaffected
- Existing `meta_fields` data is preserved (merged, not overwritten)

## Test plan

- [ ] Parse a PDF with bookmarks (e.g. any multi-chapter document),
verify `meta_fields["outline"]` is populated
- [ ] Parse a PDF without bookmarks, verify no errors and no outline key
in meta_fields
- [ ] Verify existing `meta_fields` data is preserved (not overwritten)
when outline is added
- [ ] Verify `manual` parser also persists outlines
- [ ] Verify outline JSON structure: `[{"title": "Chapter 1", "depth":
0}, ...]`

Related: #9921 (Deterministic Document Access Layer)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: yuch85 <yuch85.1@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
											
										
										
											2026-04-27 11:57:06 +08:00
+								    # Extract and persist PDF outline if the parser attached it.
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								    outline_data = cks[0].get("__outline__") if cks else None
 								    get_recording_context().record("outline_data", outline_data)
-												feat: persist PDF bookmark outline as document metadata (#13287)

## Summary

PDF files often contain a bookmark/outline tree (table of contents built
into the file by the authoring tool). RAGFlow's `pdf_parser.outlines`
already extracts these `(title, depth)` tuples via pypdf, but they are
used ephemerally during chunking (`manual` parser uses them for
hierarchy detection) and then discarded.

This PR persists the outline as `doc.meta_fields["outline"]` — a JSON
array of `{"title": str, "depth": int}` objects — so downstream features
can use the structural information.

### Why this matters

- **Complementary to `toc_extraction`** — the existing `toc_extraction`
feature uses LLM calls to generate a TOC and only works for the `naive`
parser. The raw PDF outline is free (already extracted by pypdf), works
for all parsers, and captures the author's original document structure.
- **Document navigation** — frontends can render a clickable TOC from
the outline
- **Entity extraction** — the outline provides a structural map for
identifying document sections and key topics
- **Search result context** — knowing which section a chunk belongs to
helps users evaluate relevance

### Changes

| File | Change | LOC |
|------|--------|-----|
| `rag/app/naive.py` | Attach `pdf_parser.outlines` as `__outline__` on
first chunk dict | ~7 |
| `rag/app/manual.py` | Same for the manual parser | ~5 |
| `rag/svr/task_executor.py` | Extract `__outline__`, persist via
`DocMetadataService.update_document_metadata()` | ~12 |

### Design decisions

- **Transient key pattern**: The outline is passed from parser →
task_executor via `__outline__` on the first chunk dict, then removed
before indexing. This follows the same pattern as `metadata_obj` for
LLM-generated metadata.
- **No schema changes**: Uses the existing `meta_fields` JSON column on
the document table.
- **Graceful degradation**: If a PDF has no outline (common for scanned
docs), nothing is stored. If persistence fails, it logs a warning and
continues — parsing is not interrupted.

### Backward compatibility

- **Fully backward compatible** — no existing fields, behavior, or
schemas changed
- PDFs without outlines are unaffected
- Existing `meta_fields` data is preserved (merged, not overwritten)

## Test plan

- [ ] Parse a PDF with bookmarks (e.g. any multi-chapter document),
verify `meta_fields["outline"]` is populated
- [ ] Parse a PDF without bookmarks, verify no errors and no outline key
in meta_fields
- [ ] Verify existing `meta_fields` data is preserved (not overwritten)
when outline is added
- [ ] Verify `manual` parser also persists outlines
- [ ] Verify outline JSON structure: `[{"title": "Chapter 1", "depth":
0}, ...]`

Related: #9921 (Deterministic Document Access Layer)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: yuch85 <yuch85.1@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
											
										
										
											2026-04-27 11:57:06 +08:00
+								    if cks and cks[0].get("__outline__"):
 								        outline = cks[0].pop("__outline__")
 								        try:
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            ret = DocMetadataService.update_document_metadata(task["doc_id"], update_metadata_to({"outline": outline}, DocMetadataService.get_document_metadata(task["doc_id"]) or {}))
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								            get_recording_context().save_func_return_value("DocMetadataService.update_document_metadata", ret)
-												feat: persist PDF bookmark outline as document metadata (#13287)

## Summary

PDF files often contain a bookmark/outline tree (table of contents built
into the file by the authoring tool). RAGFlow's `pdf_parser.outlines`
already extracts these `(title, depth)` tuples via pypdf, but they are
used ephemerally during chunking (`manual` parser uses them for
hierarchy detection) and then discarded.

This PR persists the outline as `doc.meta_fields["outline"]` — a JSON
array of `{"title": str, "depth": int}` objects — so downstream features
can use the structural information.

### Why this matters

- **Complementary to `toc_extraction`** — the existing `toc_extraction`
feature uses LLM calls to generate a TOC and only works for the `naive`
parser. The raw PDF outline is free (already extracted by pypdf), works
for all parsers, and captures the author's original document structure.
- **Document navigation** — frontends can render a clickable TOC from
the outline
- **Entity extraction** — the outline provides a structural map for
identifying document sections and key topics
- **Search result context** — knowing which section a chunk belongs to
helps users evaluate relevance

### Changes

| File | Change | LOC |
|------|--------|-----|
| `rag/app/naive.py` | Attach `pdf_parser.outlines` as `__outline__` on
first chunk dict | ~7 |
| `rag/app/manual.py` | Same for the manual parser | ~5 |
| `rag/svr/task_executor.py` | Extract `__outline__`, persist via
`DocMetadataService.update_document_metadata()` | ~12 |

### Design decisions

- **Transient key pattern**: The outline is passed from parser →
task_executor via `__outline__` on the first chunk dict, then removed
before indexing. This follows the same pattern as `metadata_obj` for
LLM-generated metadata.
- **No schema changes**: Uses the existing `meta_fields` JSON column on
the document table.
- **Graceful degradation**: If a PDF has no outline (common for scanned
docs), nothing is stored. If persistence fails, it logs a warning and
continues — parsing is not interrupted.

### Backward compatibility

- **Fully backward compatible** — no existing fields, behavior, or
schemas changed
- PDFs without outlines are unaffected
- Existing `meta_fields` data is preserved (merged, not overwritten)

## Test plan

- [ ] Parse a PDF with bookmarks (e.g. any multi-chapter document),
verify `meta_fields["outline"]` is populated
- [ ] Parse a PDF without bookmarks, verify no errors and no outline key
in meta_fields
- [ ] Verify existing `meta_fields` data is preserved (not overwritten)
when outline is added
- [ ] Verify `manual` parser also persists outlines
- [ ] Verify outline JSON structure: `[{"title": "Chapter 1", "depth":
0}, ...]`

Related: #9921 (Deterministic Document Access Layer)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: yuch85 <yuch85.1@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
											
										
										
											2026-04-27 11:57:06 +08:00
+								            logging.info("Persisted PDF outline (%d entries) for doc %s", len(outline), task["doc_id"])
 								        except Exception as e:
 								            logging.warning("Failed to persist PDF outline for doc %s: %s", task["doc_id"], e)
-												Add task moduel, and pipline the task and every parser (#49)


											
										
										
											2024-01-31 19:57:45 +08:00
+								    docs = []
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								    doc = {"doc_id": task["doc_id"], "kb_id": str(task["kb_id"])}
-												Fix errors detected by Ruff (#3918)

### What problem does this PR solve?

Fix errors detected by Ruff

### Type of change

- [x] Refactoring
											
										
										
											2024-12-08 14:21:12 +08:00
+								    if task["pagerank"]:
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								        doc[PAGERANK_FLD] = int(task["pagerank"])
-												perf: optimze figure parser (#7392)

### What problem does this PR solve?

When parsing documents containing images, the current code uses a
single-threaded approach to call the VL model, resulting in extremely
slow parsing speed (e.g., parsing a Word document with dozens of images
takes over 20 minutes).

By switching to a multithreaded approach to call the VL model, the
parsing speed can be improved to an acceptable level.

### Type of change

- [x] Performance Improvement

---------

Co-authored-by: liuzhenghua-jk <liuzhenghua-jk@360shuke.com>
											
										
										
											2025-05-06 14:39:45 +08:00
+								    st = timer()
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
-												Perf: Enhance timeout handling. (#8826)

### What problem does this PR solve?


### Type of change

- [x] Performance Improvement
											
										
										
											2025-07-15 09:36:45 +08:00
+								    @timeout(60)
-												perf: optimze figure parser (#7392)

### What problem does this PR solve?

When parsing documents containing images, the current code uses a
single-threaded approach to call the VL model, resulting in extremely
slow parsing speed (e.g., parsing a Word document with dozens of images
takes over 20 minutes).

By switching to a multithreaded approach to call the VL model, the
parsing speed can be improved to an acceptable level.

### Type of change

- [x] Performance Improvement

---------

Co-authored-by: liuzhenghua-jk <liuzhenghua-jk@360shuke.com>
											
										
										
											2025-05-06 14:39:45 +08:00
+								    async def upload_to_minio(document, chunk):
-												ignore when save image fail (#2178)

### What problem does this PR solve?


### Type of change
- [x] Performance Improvement
											
										
										
											2024-08-30 18:41:31 +08:00
+								        try:
-												Perf: reduce upload to minio limiter scope (#7878)

### What problem does this PR solve?
reduce upload_to_minio limter scope

### Type of change
- [x] Performance Improvement
											
										
										
											2025-05-27 17:49:37 +08:00
+								            d = copy.deepcopy(document)
 								            d.update(chunk)
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            d["id"] = xxhash.xxh64((chunk["content_with_weight"] + str(d["doc_id"])).encode("utf-8", "surrogatepass")).hexdigest()
-												Perf: reduce upload to minio limiter scope (#7878)

### What problem does this PR solve?
reduce upload_to_minio limter scope

### Type of change
- [x] Performance Improvement
											
										
										
											2025-05-27 17:49:37 +08:00
+								            d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
 								            d["create_timestamp_flt"] = datetime.now().timestamp()
-												Fix: image not displaying thumbnails when using pipeline (#12574)

### What problem does this PR solve?

Fix image not displaying thumbnails when using pipeline.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-01-13 12:54:13 +08:00
 								            if d.get("img_id"):
 								                docs.append(d)
 								                return
-												Perf: reduce upload to minio limiter scope (#7878)

### What problem does this PR solve?
reduce upload_to_minio limter scope

### Type of change
- [x] Performance Improvement
											
										
										
											2025-05-27 17:49:37 +08:00
+								            if not d.get("image"):
 								                _ = d.pop("image", None)
 								                d["img_id"] = ""
 								                docs.append(d)
 								                return
-												Move api.settings to common.settings (#11036)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-06 09:36:38 +08:00
+								            await image2id(d, partial(settings.STORAGE_IMPL.put, tenant_id=task["tenant_id"]), d["id"], task["kb_id"])
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								            docs.append(d)
-												Rework logging (#3358)

Unified all log files into one.

### What problem does this PR solve?

Unified all log files into one.

### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 17:35:13 +08:00
+								        except Exception:
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            logging.exception("Saving image of chunk {}/{}/{} got exception".format(task["location"], task["name"], d["id"]))
-												handle_task catch all exception (#3441)

### What problem does this PR solve?

handle_task catch all exception
Report heartbeats

### Type of change

- [x] Refactoring
											
										
										
											2024-11-15 18:51:09 +08:00
+								            raise
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
-												Refa:replace trio with asyncio (#11831)

### What problem does this PR solve?

change:
replace trio with asyncio

### Type of change
- [x] Refactoring
											
										
										
											2025-12-09 19:23:14 +08:00
+								    tasks = []
 								    for ck in cks:
 								        tasks.append(asyncio.create_task(upload_to_minio(doc, ck)))
 								    try:
 								        await asyncio.gather(*tasks, return_exceptions=False)
 								    except Exception as e:
 								        logging.error(f"MINIO PUT({task['name']}) got exception: {e}")
 								        for t in tasks:
 								            t.cancel()
 								        await asyncio.gather(*tasks, return_exceptions=True)
 								        raise
-												perf: optimze figure parser (#7392)

### What problem does this PR solve?

When parsing documents containing images, the current code uses a
single-threaded approach to call the VL model, resulting in extremely
slow parsing speed (e.g., parsing a Word document with dozens of images
takes over 20 minutes).

By switching to a multithreaded approach to call the VL model, the
parsing speed can be improved to an acceptable level.

### Type of change

- [x] Performance Improvement

---------

Co-authored-by: liuzhenghua-jk <liuzhenghua-jk@360shuke.com>
											
										
										
											2025-05-06 14:39:45 +08:00
 								    el = timer() - st
 								    logging.info("MINIO PUT({}) cost {:.3f} s".format(task["name"], el))
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								    # Record docs after MinIO upload
 								    get_recording_context().record("docs_after_prep", docs)
-												Refactor parse progress (#3781)

### What problem does this PR solve?

Refactor parse file progress

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 22:28:00 +08:00
+								    if task["parser_config"].get("auto_keywords", 0):
-												Updates on parsing progress, including more detailed time cost inform… (#3402)

### What problem does this PR solve?

#3401 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-11-14 16:28:10 +08:00
+								        st = timer()
-												Refactor parse progress (#3781)

### What problem does this PR solve?

Refactor parse file progress

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 22:28:00 +08:00
+								        progress_callback(msg="Start to generate keywords for every chunk ...")
-												Feat: tenant llm provider (#14595)

### What problem does this PR solve?

Python implementation of the Go-based model_provider API suite.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: bill <yibie_jingnian@163.com>
											
										
										
											2026-05-29 17:39:41 +08:00
+								        chat_model_config = get_model_config_from_provider_instance(task["tenant_id"], LLMType.CHAT, task["llm_id"])
-												Feat/tenant model (#13072)

### What problem does this PR solve?

Add id for table tenant_llm and apply in LLMBundle.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
Co-authored-by: Liu An <asiro@qq.com>
											
										
										
											2026-03-05 17:27:17 +08:00
+								        chat_mdl = LLMBundle(task["tenant_id"], chat_model_config, lang=task["language"])
-												Cache the result from llm for graphrag and raptor (#4051)

### What problem does this PR solve?

#4045

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-17 09:48:03 +08:00
-												Run keyword_extraction, question_proposal, content_tagging in thread pool (#5376)

### What problem does this PR solve?

Run keyword_extraction, question_proposal, content_tagging in threads

### Type of change

- [x] Performance Improvement
											
										
										
											2025-02-26 15:21:14 +08:00
+								        async def doc_keyword_extraction(chat_mdl, d, topn):
 								            cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], "keywords", {"topn": topn})
 								            if not cached:
-												Refine: image/table context. (#12336)

### What problem does this PR solve?

#12303

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-30 20:24:27 +08:00
+								                if has_canceled(task["id"]):
 								                    progress_callback(-1, msg="Task has been canceled.")
 								                    return
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								                async with chat_limiter:
-												Fix: tokenizer issue. (#11902)

#11786
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-11 17:38:17 +08:00
+								                    cached = await keyword_extraction(chat_mdl, d["content_with_weight"], topn)
-												Run keyword_extraction, question_proposal, content_tagging in thread pool (#5376)

### What problem does this PR solve?

Run keyword_extraction, question_proposal, content_tagging in threads

### Type of change

- [x] Performance Improvement
											
										
										
											2025-02-26 15:21:14 +08:00
+								                set_llm_cache(chat_mdl.llm_name, d["content_with_weight"], cached, "keywords", {"topn": topn})
 								            if cached:
-												fix(keyword_extraction): accept Chinese commas/semicolons/newlines as keyword delimiters (#14540)

## What
Widen the keyword delimiter in `rag/svr/task_executor.py`:
both `build_chunks` (LLM `keyword_extraction` cache parsing) and
`run_dataflow` (chunk-level `keywords` ingestion) now split on
`, ， ; ； 、 \r \n` instead of only ASCII comma.

## Why
`rag/prompts/keyword_prompt.md` instructs the LLM:

> The keywords are delimited by ENGLISH COMMA.

In practice, Chinese-leaning models (Qwen / Tongyi-Qianwen, GLM,
etc.) frequently ignore this instruction when the source content is
Chinese and emit Chinese commas (`，`) instead. Result:
`cached.split(",")` sees the full LLM output as a *single* keyword.

Repro: `auto_keywords>=4` + Chinese docs + `qwen-plus@Tongyi-Qianwen`.
We observed entries in `important_kwd` like
`"功能介绍，配置说明，参数详解，问题排查"` — one bucket instead of four.

## Impact
- Silent data-quality bug; no exception thrown.
- BM25 `important_kwd^30` boost effectively stops firing — the
  indexed term is the whole list, never matches user query tokens.
- Any downstream aggregating `important_kwd` (tagging, analytics,
  candidate-keyword review UIs) sees garbage.

## Compatibility
- Pure widening of the splitter; ASCII-comma-only outputs continue
  to work identically.
- No schema / API change.

## Test plan
Manually verified against `qwen-plus@Tongyi-Qianwen` with
`auto_keywords=10` on Chinese .txt files:

- Before: `important_kwd` contains one element per chunk that is the
  full LLM string with `，`-separated phrases inside.
- After: `important_kwd` contains N elements, one per phrase, as the
  LLM intended.
											
										
										
											2026-05-11 12:05:24 +08:00
+								                d["important_kwd"] = [k for k in re.split(r"[,，;；、\r\n]+", cached) if k.strip()]
-												Run keyword_extraction, question_proposal, content_tagging in thread pool (#5376)

### What problem does this PR solve?

Run keyword_extraction, question_proposal, content_tagging in threads

### Type of change

- [x] Performance Improvement
											
										
										
											2025-02-26 15:21:14 +08:00
+								                d["important_tks"] = rag_tokenizer.tokenize(" ".join(d["important_kwd"]))
 								            return
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
-												Refa:replace trio with asyncio (#11831)

### What problem does this PR solve?

change:
replace trio with asyncio

### Type of change
- [x] Refactoring
											
										
										
											2025-12-09 19:23:14 +08:00
+								        tasks = []
 								        for d in docs:
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            tasks.append(asyncio.create_task(doc_keyword_extraction(chat_mdl, d, task["parser_config"]["auto_keywords"])))
-												Refa:replace trio with asyncio (#11831)

### What problem does this PR solve?

change:
replace trio with asyncio

### Type of change
- [x] Refactoring
											
										
										
											2025-12-09 19:23:14 +08:00
+								        try:
 								            await asyncio.gather(*tasks, return_exceptions=False)
 								        except Exception as e:
 								            logging.error("Error in doc_keyword_extraction: {}".format(e))
 								            for t in tasks:
 								                t.cancel()
 								            await asyncio.gather(*tasks, return_exceptions=True)
 								            raise
-												Run keyword_extraction, question_proposal, content_tagging in thread pool (#5376)

### What problem does this PR solve?

Run keyword_extraction, question_proposal, content_tagging in threads

### Type of change

- [x] Performance Improvement
											
										
										
											2025-02-26 15:21:14 +08:00
+								        progress_callback(msg="Keywords generation {} chunks completed in {:.2f}s".format(len(docs), timer() - st))
-												refactor auto keywords and auto question (#2990)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
											
										
										
											2024-10-23 17:00:56 +08:00
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								    # Record keywords extraction count
 								    keywords = [d for d in docs if d.get("important_kwd")]
 								    get_recording_context().record("keywords_extracted", keywords)
-												Refactor parse progress (#3781)

### What problem does this PR solve?

Refactor parse file progress

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 22:28:00 +08:00
+								    if task["parser_config"].get("auto_questions", 0):
-												Updates on parsing progress, including more detailed time cost inform… (#3402)

### What problem does this PR solve?

#3401 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-11-14 16:28:10 +08:00
+								        st = timer()
-												Refactor parse progress (#3781)

### What problem does this PR solve?

Refactor parse file progress

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 22:28:00 +08:00
+								        progress_callback(msg="Start to generate questions for every chunk ...")
-												Feat: tenant llm provider (#14595)

### What problem does this PR solve?

Python implementation of the Go-based model_provider API suite.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: bill <yibie_jingnian@163.com>
											
										
										
											2026-05-29 17:39:41 +08:00
+								        chat_model_config = get_model_config_from_provider_instance(task["tenant_id"], LLMType.CHAT, task["llm_id"])
-												Feat/tenant model (#13072)

### What problem does this PR solve?

Add id for table tenant_llm and apply in LLMBundle.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
Co-authored-by: Liu An <asiro@qq.com>
											
										
										
											2026-03-05 17:27:17 +08:00
+								        chat_mdl = LLMBundle(task["tenant_id"], chat_model_config, lang=task["language"])
-												Cache the result from llm for graphrag and raptor (#4051)

### What problem does this PR solve?

#4045

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-17 09:48:03 +08:00
-												Run keyword_extraction, question_proposal, content_tagging in thread pool (#5376)

### What problem does this PR solve?

Run keyword_extraction, question_proposal, content_tagging in threads

### Type of change

- [x] Performance Improvement
											
										
										
											2025-02-26 15:21:14 +08:00
+								        async def doc_question_proposal(chat_mdl, d, topn):
 								            cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], "question", {"topn": topn})
 								            if not cached:
-												Refine: image/table context. (#12336)

### What problem does this PR solve?

#12303

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-30 20:24:27 +08:00
+								                if has_canceled(task["id"]):
 								                    progress_callback(-1, msg="Task has been canceled.")
 								                    return
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								                async with chat_limiter:
-												Fix: tokenizer issue. (#11902)

#11786
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-11 17:38:17 +08:00
+								                    cached = await question_proposal(chat_mdl, d["content_with_weight"], topn)
-												Run keyword_extraction, question_proposal, content_tagging in thread pool (#5376)

### What problem does this PR solve?

Run keyword_extraction, question_proposal, content_tagging in threads

### Type of change

- [x] Performance Improvement
											
										
										
											2025-02-26 15:21:14 +08:00
+								                set_llm_cache(chat_mdl.llm_name, d["content_with_weight"], cached, "question", {"topn": topn})
 								            if cached:
 								                d["question_kwd"] = cached.split("\n")
 								                d["question_tks"] = rag_tokenizer.tokenize("\n".join(d["question_kwd"]))
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
-												Refa:replace trio with asyncio (#11831)

### What problem does this PR solve?

change:
replace trio with asyncio

### Type of change
- [x] Refactoring
											
										
										
											2025-12-09 19:23:14 +08:00
+								        tasks = []
 								        for d in docs:
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            tasks.append(asyncio.create_task(doc_question_proposal(chat_mdl, d, task["parser_config"]["auto_questions"])))
-												Refa:replace trio with asyncio (#11831)

### What problem does this PR solve?

change:
replace trio with asyncio

### Type of change
- [x] Refactoring
											
										
										
											2025-12-09 19:23:14 +08:00
+								        try:
 								            await asyncio.gather(*tasks, return_exceptions=False)
 								        except Exception as e:
 								            logging.error("Error in doc_question_proposal", exc_info=e)
 								            for t in tasks:
 								                t.cancel()
 								            await asyncio.gather(*tasks, return_exceptions=True)
 								            raise
-												Run keyword_extraction, question_proposal, content_tagging in thread pool (#5376)

### What problem does this PR solve?

Run keyword_extraction, question_proposal, content_tagging in threads

### Type of change

- [x] Performance Improvement
											
										
										
											2025-02-26 15:21:14 +08:00
+								        progress_callback(msg="Question generation {} chunks completed in {:.2f}s".format(len(docs), timer() - st))
-												refactor auto keywords and auto question (#2990)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
											
										
										
											2024-10-23 17:00:56 +08:00
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								    # Record question generation
 								    questions = [d for d in docs if d.get("question_kwd")]
 								    get_recording_context().record("questions_generated", questions)
-												Fix: get metadata conf  (#14250)

### What problem does this PR solve?

Get metadata configuration from union of custom metadata and
built_in_metadata.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-04-21 17:22:42 +08:00
+								    if task["parser_config"].get("enable_metadata", False) and (task["parser_config"].get("metadata") or task["parser_config"].get("built_in_metadata")):
-												Fix: parent-children chunking method. (#11997)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-17 16:50:36 +08:00
+								        st = timer()
 								        progress_callback(msg="Start to generate meta-data for every chunk ...")
-												Feat: tenant llm provider (#14595)

### What problem does this PR solve?

Python implementation of the Go-based model_provider API suite.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: bill <yibie_jingnian@163.com>
											
										
										
											2026-05-29 17:39:41 +08:00
+								        chat_model_config = get_model_config_from_provider_instance(task["tenant_id"], LLMType.CHAT, task["llm_id"])
-												Feat/tenant model (#13072)

### What problem does this PR solve?

Add id for table tenant_llm and apply in LLMBundle.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
Co-authored-by: Liu An <asiro@qq.com>
											
										
										
											2026-03-05 17:27:17 +08:00
+								        chat_mdl = LLMBundle(task["tenant_id"], chat_model_config, lang=task["language"])
-												Fix: parent-children chunking method. (#11997)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-17 16:50:36 +08:00
 								        async def gen_metadata_task(chat_mdl, d):
-												Fix metadata parsing regression for upgraded v0.24 datasets (#14383)

### What problem does this PR solve?

This PR fixes issue #14371 where file parsing failed after upgrading
from v0.24.0 to v0.25.0, because metadata config could be a JSON Schema
object but was handled like a list and later caused `KeyError:
'properties'`.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-04-27 16:18:06 +08:00
+								            metadata_conf = task["parser_config"].get("metadata", [])
 								            built_in_metadata = list(task["parser_config"].get("built_in_metadata") or [])
 								            if isinstance(metadata_conf, dict):
 								                if not isinstance(metadata_conf.get("properties"), dict):
 								                    metadata_conf = {"type": "object", "properties": {}}
 								                if built_in_metadata:
 								                    metadata_conf = {
 								                        **metadata_conf,
 								                        "properties": {
 								                            **metadata_conf.get("properties", {}),
 								                            **turn2jsonschema(built_in_metadata).get("properties", {}),
 								                        },
 								                    }
 								            elif isinstance(metadata_conf, list):
 								                metadata_conf = metadata_conf + built_in_metadata
 								            else:
 								                metadata_conf = built_in_metadata
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], "metadata", metadata_conf)
-												Fix: parent-children chunking method. (#11997)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-17 16:50:36 +08:00
+								            if not cached:
-												Refine: image/table context. (#12336)

### What problem does this PR solve?

#12303

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-30 20:24:27 +08:00
+								                if has_canceled(task["id"]):
 								                    progress_callback(-1, msg="Task has been canceled.")
 								                    return
-												Fix: parent-children chunking method. (#11997)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-17 16:50:36 +08:00
+								                async with chat_limiter:
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								                    cached = await gen_metadata(chat_mdl, turn2jsonschema(metadata_conf), d["content_with_weight"])
 								                set_llm_cache(chat_mdl.llm_name, d["content_with_weight"], cached, "metadata", metadata_conf)
-												Fix: parent-children chunking method. (#11997)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-17 16:50:36 +08:00
+								            if cached:
 								                d["metadata_obj"] = cached
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
-												Fix: parent-children chunking method. (#11997)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-17 16:50:36 +08:00
+								        tasks = []
 								        for d in docs:
 								            tasks.append(asyncio.create_task(gen_metadata_task(chat_mdl, d)))
 								        try:
 								            await asyncio.gather(*tasks, return_exceptions=False)
 								        except Exception as e:
 								            logging.error("Error in doc_question_proposal", exc_info=e)
 								            for t in tasks:
 								                t.cancel()
 								            await asyncio.gather(*tasks, return_exceptions=True)
 								            raise
 								        metadata = {}
-												Fix: table tag on chunks. (#12126)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-24 09:32:19 +08:00
+								        for doc in docs:
 								            metadata = update_metadata_to(metadata, doc["metadata_obj"])
-												Fix: metadata_obj issue. (#12146)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-24 13:40:34 +08:00
+								            del doc["metadata_obj"]
-												Fix: parent-children chunking method. (#11997)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-17 16:50:36 +08:00
+								        if metadata:
-												Put document metadata in ES/Infinity (#12826)

### What problem does this PR solve?

Put document metadata in ES/Infinity.

Index name of meta data: ragflow_doc_meta_{tenant_id}

### Type of change

- [x] Refactoring
											
										
										
											2026-01-28 13:29:34 +08:00
+								            existing_meta = DocMetadataService.get_document_metadata(task["doc_id"])
 								            existing_meta = existing_meta if isinstance(existing_meta, dict) else {}
 								            metadata = update_metadata_to(metadata, existing_meta)
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								            ret = DocMetadataService.update_document_metadata(task["doc_id"], metadata)
 								            get_recording_context().save_func_return_value("DocMetadataService.update_document_metadata", ret)
-												Fix: parent-children chunking method. (#11997)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-17 16:50:36 +08:00
+								        progress_callback(msg="Question generation {} chunks completed in {:.2f}s".format(len(docs), timer() - st))
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								    # Record metadata generation count
 								    metadata_list = [d for d in docs if d.get("metadata_obj")]
 								    get_recording_context().record("metadata_list_generated", metadata_list)
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								    if task["kb_parser_config"].get("tag_kb_ids", []):
 								        progress_callback(msg="Start to tag for every chunk ...")
 								        kb_ids = task["kb_parser_config"]["tag_kb_ids"]
 								        tenant_id = task["tenant_id"]
 								        topn_tags = task["kb_parser_config"].get("topn_tags", 3)
 								        S = 1000
 								        st = timer()
 								        examples = []
 								        all_tags = get_tags_from_cache(kb_ids)
 								        if not all_tags:
-												Move api.settings to common.settings (#11036)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-06 09:36:38 +08:00
+								            all_tags = settings.retriever.all_tags_in_portion(tenant_id, kb_ids, S)
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								            set_tags_to_cache(kb_ids, all_tags)
 								        else:
 								            all_tags = json.loads(all_tags)
-												Feat: tenant llm provider (#14595)

### What problem does this PR solve?

Python implementation of the Go-based model_provider API suite.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: bill <yibie_jingnian@163.com>
											
										
										
											2026-05-29 17:39:41 +08:00
+								        chat_model_config = get_model_config_from_provider_instance(tenant_id, LLMType.CHAT, task["llm_id"])
-												Feat/tenant model (#13072)

### What problem does this PR solve?

Add id for table tenant_llm and apply in LLMBundle.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
Co-authored-by: Liu An <asiro@qq.com>
											
										
										
											2026-03-05 17:27:17 +08:00
+								        chat_mdl = LLMBundle(task["tenant_id"], chat_model_config, lang=task["language"])
-												Run keyword_extraction, question_proposal, content_tagging in thread pool (#5376)

### What problem does this PR solve?

Run keyword_extraction, question_proposal, content_tagging in threads

### Type of change

- [x] Performance Improvement
											
										
										
											2025-02-26 15:21:14 +08:00
 								        docs_to_tag = []
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								        for d in docs:
-												Pref: use redis to check if canceled. (#8853)

### What problem does this PR solve?

### Type of change

- [x] Performance Improvement
											
										
										
											2025-07-15 17:19:27 +08:00
+								            task_canceled = has_canceled(task["id"])
-												Fix: improve task cancel lag (#7765)

### What problem does this PR solve?

https://github.com/infiniflow/ragflow/issues/7761

but it may be difficult to achieve 0 delay (which need to pass the
cancel token to all parts)

Another solution is just 0 delay effect at UI.
And task will stop latter

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-05-22 09:28:08 +08:00
+								            if task_canceled:
 								                progress_callback(-1, msg="Task has been canceled.")
-												Refactor function name (#11210)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-12 19:00:15 +08:00
+								                return None
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            if settings.retriever.tag_content(tenant_id, kb_ids, d, all_tags, topn_tags=topn_tags, S=S) and len(d[TAG_FLD]) > 0:
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								                examples.append({"content": d["content_with_weight"], TAG_FLD: d[TAG_FLD]})
-												Run keyword_extraction, question_proposal, content_tagging in thread pool (#5376)

### What problem does this PR solve?

Run keyword_extraction, question_proposal, content_tagging in threads

### Type of change

- [x] Performance Improvement
											
										
										
											2025-02-26 15:21:14 +08:00
+								            else:
 								                docs_to_tag.append(d)
 								        async def doc_content_tagging(chat_mdl, d, topn_tags):
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								            cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], all_tags, {"topn": topn_tags})
 								            if not cached:
-												Refine: image/table context. (#12336)

### What problem does this PR solve?

#12303

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-30 20:24:27 +08:00
+								                if has_canceled(task["id"]):
 								                    progress_callback(-1, msg="Task has been canceled.")
 								                    return
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								                picked_examples = random.choices(examples, k=2) if len(examples) > 2 else examples
-												Fix: Add a basic example when the example of content_tagging is empty (#6276)

### What problem does this PR solve?

When using LLM for auto-tag, if there are no examples, the tag format
generated by LLM may be wrong. This will cause Elasticsearch insert
errors. Adding basic examples can avoid this problem.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-03-19 17:30:47 +08:00
+								                if not picked_examples:
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								                    picked_examples.append({"content": "This is an example", TAG_FLD: {"example": 1}})
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								                async with chat_limiter:
-												Fix: tokenizer issue. (#11902)

#11786
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-11 17:38:17 +08:00
+								                    cached = await content_tagging(
-												Refa:replace trio with asyncio (#11831)

### What problem does this PR solve?

change:
replace trio with asyncio

### Type of change
- [x] Refactoring
											
										
										
											2025-12-09 19:23:14 +08:00
+								                        chat_mdl,
 								                        d["content_with_weight"],
 								                        all_tags,
 								                        picked_examples,
 								                        topn_tags,
 								                    )
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								                if cached:
-												Rebuild graph when it's out of time. (#4607)

### What problem does this PR solve?

#4543

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
											
										
										
											2025-01-23 17:26:20 +08:00
+								                    cached = json.dumps(cached)
 								            if cached:
 								                set_llm_cache(chat_mdl.llm_name, d["content_with_weight"], cached, all_tags, {"topn": topn_tags})
 								                d[TAG_FLD] = json.loads(cached)
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
-												Refa:replace trio with asyncio (#11831)

### What problem does this PR solve?

change:
replace trio with asyncio

### Type of change
- [x] Refactoring
											
										
										
											2025-12-09 19:23:14 +08:00
+								        tasks = []
 								        for d in docs_to_tag:
 								            tasks.append(asyncio.create_task(doc_content_tagging(chat_mdl, d, topn_tags)))
 								        try:
 								            await asyncio.gather(*tasks, return_exceptions=False)
 								        except Exception as e:
 								            logging.error("Error tagging docs: {}".format(e))
 								            for t in tasks:
 								                t.cancel()
 								            await asyncio.gather(*tasks, return_exceptions=True)
 								            raise
-												Run keyword_extraction, question_proposal, content_tagging in thread pool (#5376)

### What problem does this PR solve?

Run keyword_extraction, question_proposal, content_tagging in threads

### Type of change

- [x] Performance Improvement
											
										
										
											2025-02-26 15:21:14 +08:00
+								        progress_callback(msg="Tagging {} chunks completed in {:.2f}s".format(len(docs), timer() - st))
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								    # Record tags applied
 								    tags_applied = [d for d in docs if d.get(TAG_FLD)]
 								    get_recording_context().record("tags_applied", tags_applied)
 								    # Record final chunks for comparison
 								    get_recording_context().record("final_chunks", docs)
 								    final_chunk_ids = [c.get("id") for c in docs if isinstance(c, dict) and "id" in c]
 								    get_recording_context().record("final_chunk_ids_count", len(final_chunk_ids))
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
+								    return docs
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								@timed_with_recording
-												Feat: let toc run asynchronizly... (#10513)

### What problem does this PR solve?

#10436 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-10-14 14:14:52 +08:00
+								def build_TOC(task, docs, progress_callback):
 								    progress_callback(msg="Start to generate table of content ...")
-												Feat: tenant llm provider (#14595)

### What problem does this PR solve?

Python implementation of the Go-based model_provider API suite.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: bill <yibie_jingnian@163.com>
											
										
										
											2026-05-29 17:39:41 +08:00
+								    chat_model_config = get_model_config_from_provider_instance(task["tenant_id"], LLMType.CHAT, task["llm_id"])
-												Feat/tenant model (#13072)

### What problem does this PR solve?

Add id for table tenant_llm and apply in LLMBundle.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
Co-authored-by: Liu An <asiro@qq.com>
											
										
										
											2026-03-05 17:27:17 +08:00
+								    chat_mdl = LLMBundle(task["tenant_id"], chat_model_config, lang=task["language"])
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								    docs = sorted(
 								        docs,
 								        key=lambda d: (
 								            d.get("page_num_int", 0)[0] if isinstance(d.get("page_num_int", 0), list) else d.get("page_num_int", 0),
 								            d.get("top_int", 0)[0] if isinstance(d.get("top_int", 0), list) else d.get("top_int", 0),
 								        ),
 								    )
 								    toc: list[dict] = asyncio.run(run_toc_from_text([d["content_with_weight"] for d in docs], chat_mdl, progress_callback))
 								    logging.info("------------ T O C -------------\n" + json.dumps(toc, ensure_ascii=False, indent="  "))
-												Refa: improve TOC building with better error handling (#12427)

### What problem does this PR solve?

Refactor TOC building logic to use enumerate instead of while loop, add
comprehensive error handling for missing/invalid chunk_id values, and
improve logging with more specific error messages. The changes make the
code more robust against malformed TOC data while maintaining the same
functionality for valid inputs.

### Type of change

- [x] Refactoring
											
										
										
											2026-01-05 10:02:42 +08:00
+								    for ii, item in enumerate(toc):
-												Feat: let toc run asynchronizly... (#10513)

### What problem does this PR solve?

#10436 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-10-14 14:14:52 +08:00
+								        try:
-												Refa: improve TOC building with better error handling (#12427)

### What problem does this PR solve?

Refactor TOC building logic to use enumerate instead of while loop, add
comprehensive error handling for missing/invalid chunk_id values, and
improve logging with more specific error messages. The changes make the
code more robust against malformed TOC data while maintaining the same
functionality for valid inputs.

### Type of change

- [x] Refactoring
											
										
										
											2026-01-05 10:02:42 +08:00
+								            chunk_val = item.pop("chunk_id", None)
 								            if chunk_val is None or str(chunk_val).strip() == "":
 								                logging.warning(f"Index {ii}: chunk_id is missing or empty. Skipping.")
 								                continue
 								            curr_idx = int(chunk_val)
 								            if curr_idx >= len(docs):
 								                logging.error(f"Index {ii}: chunk_id {curr_idx} exceeds docs length {len(docs)}.")
 								                continue
 								            item["ids"] = [docs[curr_idx]["id"]]
 								            if ii + 1 < len(toc):
 								                next_chunk_val = toc[ii + 1].get("chunk_id", "")
 								                if str(next_chunk_val).strip() != "":
 								                    next_idx = int(next_chunk_val)
 								                    for jj in range(curr_idx + 1, min(next_idx + 1, len(docs))):
 								                        item["ids"].append(docs[jj]["id"])
 								                else:
 								                    logging.warning(f"Index {ii + 1}: next chunk_id is empty, range fill skipped.")
 								        except (ValueError, TypeError) as e:
 								            logging.error(f"Index {ii}: Data conversion error - {e}")
-												Feat: let toc run asynchronizly... (#10513)

### What problem does this PR solve?

#10436 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-10-14 14:14:52 +08:00
+								        except Exception as e:
-												Refa: improve TOC building with better error handling (#12427)

### What problem does this PR solve?

Refactor TOC building logic to use enumerate instead of while loop, add
comprehensive error handling for missing/invalid chunk_id values, and
improve logging with more specific error messages. The changes make the
code more robust against malformed TOC data while maintaining the same
functionality for valid inputs.

### Type of change

- [x] Refactoring
											
										
										
											2026-01-05 10:02:42 +08:00
+								            logging.exception(f"Index {ii}: Unexpected error - {e}")
-												Feat: let toc run asynchronizly... (#10513)

### What problem does this PR solve?

#10436 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-10-14 14:14:52 +08:00
 								    if toc:
 								        d = copy.deepcopy(docs[-1])
 								        d["content_with_weight"] = json.dumps(toc, ensure_ascii=False)
 								        d["toc_kwd"] = "toc"
 								        d["available_int"] = 0
-												Fix: add toc_kwd field and update page_num_int type (#10596)

### What problem does this PR solve?

- Added new field 'toc_kwd' to infinity_mapping.json for table of
contents keyword support
- Changed page_num_int from integer to array type in task_executor.py to
handle multiple page numbers

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-10-16 12:47:24 +08:00
+								        d["page_num_int"] = [100000000]
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        d["id"] = xxhash.xxh64((d["content_with_weight"] + str(d["doc_id"])).encode("utf-8", "surrogatepass")).hexdigest()
-												Feat: let toc run asynchronizly... (#10513)

### What problem does this PR solve?

#10436 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-10-14 14:14:52 +08:00
+								        return d
-												Refactor function name (#11210)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-12 19:00:15 +08:00
+								    return None
-												Feat: let toc run asynchronizly... (#10513)

### What problem does this PR solve?

#10436 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-10-14 14:14:52 +08:00
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								def init_kb(row, vector_size: int):
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
+								    idxnm = search.index_name(row["tenant_id"])
-												Add dataset with table parser type for Infinity and answer question in chat using SQL (#12541)

### What problem does this PR solve?

1) Create  dataset using table parser for infinity
2) Answer questions in chat using SQL

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-01-19 19:35:14 +08:00
+								    parser_id = row.get("parser_id", None)
 								    return settings.docStoreConn.create_idx(idxnm, row.get("kb_id", ""), vector_size, parser_id)
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								@timed_with_recording
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								async def embedding(docs, mdl, parser_config=None, callback=None):
-												handle nits in task_executor (#2637)

### What problem does this PR solve?

- fix typo
- fix string format
- format import

### Type of change

- [x] Refactoring
											
										
										
											2024-09-29 09:49:45 +08:00
+								    if parser_config is None:
 								        parser_config = {}
-												Feat: Add question parameter to edit chunk modal (#3875)

### What problem does this PR solve?

Close #3873

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-05 14:51:19 +08:00
+								    tts, cnts = [], []
 								    for d in docs:
-												Support debug components. (#3994)

### What problem does this PR solve?

#3993

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-12-11 19:23:59 +08:00
+								        tts.append(d.get("docnm_kwd", "Title"))
-												Feat: Add question parameter to edit chunk modal (#3875)

### What problem does this PR solve?

Close #3873

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-05 14:51:19 +08:00
+								        c = "\n".join(d.get("question_kwd", []))
 								        if not c:
 								            c = d["content_with_weight"]
 								        c = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", c)
-												fix: guard whitespace-only chunks before embedding (#13938)

## Problem

When parsing DOCX files with many tables, DeepDOC generates chunks
containing only empty HTML table tags, such as:

```html
<table><tr><td></td></tr><tr><td></td></tr><tr><td></td></tr><tr><td></td></tr></table>
```

After the regex cleanup at `task_executor.py:584`, this becomes `" "`
(whitespace only).

The guard at line 585 (`if not c`) only catches empty strings `""`, but
whitespace strings are truthy in Python and pass through. When sent to
Zhipu `embedding-3` API, it rejects them with error 1213:
`未正常接收到prompt参数`.

## Root Cause

```python
c = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", c)
if not c:       # ← only catches "", not "   " / "\n" / "\t"
    c = "None"
```

Verified with Zhipu `embedding-3`:
| Input | Result |
|---|---|
| `""` | error 1213 |
| `" "` | error 1213 |
| `"\n"` | error 1213 |
| `"None"` | OK |

## Fix

```diff
- if not c:
+ if not c.strip():
      c = "None"
```

## Testing

Reproduced with a 678KB DOCX file (166 tables, 270 chunks). Chunk #89 is
the empty table above. After fix, `"None"` is sent instead and embedding
succeeds.

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
											
										
										
											2026-05-13 11:47:50 +08:00
+								        if not c.strip():
 								            logging.debug("embedding(): normalized whitespace-only chunk to placeholder 'None' (len=%d)", len(c))
-												Refactor (#4303)

### What problem does this PR solve?

### Type of change
- [x] Refactoring
											
										
										
											2024-12-31 14:31:31 +08:00
+								            c = "None"
-												Feat: Add question parameter to edit chunk modal (#3875)

### What problem does this PR solve?

Close #3873

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-05 14:51:19 +08:00
+								        cnts.append(c)
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
+								    tk_count = 0
-												Add Q&A and Book, fix task running bugs (#50)


											
										
										
											2024-02-01 18:53:56 +08:00
+								    if len(tts) == len(cnts):
-												Refa: asyncio.to_thread to ThreadPoolExecutor to break thread limitat… (#12716)

### Type of change

- [x] Refactoring
											
										
										
											2026-01-20 13:29:37 +08:00
+								        vts, c = await thread_pool_exec(mdl.encode, tts[0:1])
-												Feat:update check_embedding api (#11254)

### What problem does this PR solve?
pr: 
#10854
change:
update check_embedding api

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-13 18:48:25 +08:00
+								        tts = np.tile(vts[0], (len(cnts), 1))
-												Accelerate titles' embeddings. (#4492)

### What problem does this PR solve?


### Type of change

- [x] Performance Improvement
											
										
										
											2025-01-15 15:20:29 +08:00
+								        tk_count += c
-												Add Q&A and Book, fix task running bugs (#50)


											
										
										
											2024-02-01 18:53:56 +08:00
-												Feat: add meta data filter. (#9405)

### What problem does this PR solve?

#8531 
#7417 
#6761 
#6573
#6477

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-08-12 14:12:56 +08:00
+								    @timeout(60)
-												Fix: local variable issue. (#9255)

### What problem does this PR solve?

#9227

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-08-05 19:24:34 +08:00
+								    def batch_encode(txts):
 								        nonlocal mdl
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								        return mdl.encode([truncate(c, mdl.max_length - 10) for c in txts])
-												Fix: local variable issue. (#9255)

### What problem does this PR solve?

#9227

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-08-05 19:24:34 +08:00
-												perf: avoid O(n²) array growth in embedding accumulation (#14369)

### What problem does this PR solve?

Both tokenizer (`rag/flow/tokenizer/tokenizer.py`) and
`BuiltinEmbed.encode`
(`rag/llm/embedding_model.py`) currently accumulate embedding batches
via
`np.concatenate` inside the per-batch loop. `np.concatenate` allocates a
new
array and copies all existing data on every call, so accumulating N
batches
is O(N²) in both time and peak memory.

Replacing the incremental concatenate with a list-of-batches + a single
`np.vstack` at the end gives O(N) total work.

For tokenizer the title-vector broadcast `np.concatenate([vts[0]] * N)`
is
also replaced by `np.tile`, which does the same job with a single
contiguous
allocation instead of building a Python list of references.

This is purely a CPU/memory optimisation — output shape and dtype are
unchanged. Measured impact grows with document size:
  -   1k chunks (batch 512, 2 iters):    ~negligible
  -  10k chunks (20 iters):              ~10× speedup on this stage
  - 100k chunks (195 iters):             ~100× speedup, and peak RAM
drops from O(N) extra to near-zero

### Type of change

- [x] Performance Improvement

Co-authored-by: yoan sapienza <Yoan Sapienza yoan.sapienza@orange.fr Yoan Sapienza zappy@macbookpro.home>
											
										
										
											2026-04-30 05:00:10 +02:00
+								    cnts_batches = []
-												Move api.settings to common.settings (#11036)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-06 09:36:38 +08:00
+								    for i in range(0, len(cnts), settings.EMBEDDING_BATCH_SIZE):
-												Refa: remove temperature since some  LLMs fail to support. (#8981)

### What problem does this PR solve?


### Type of change

- [x] Refactoring
											
										
										
											2025-07-23 10:17:04 +08:00
+								        async with embed_limiter:
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            vts, c = await thread_pool_exec(batch_encode, cnts[i : i + settings.EMBEDDING_BATCH_SIZE])
-												perf: avoid O(n²) array growth in embedding accumulation (#14369)

### What problem does this PR solve?

Both tokenizer (`rag/flow/tokenizer/tokenizer.py`) and
`BuiltinEmbed.encode`
(`rag/llm/embedding_model.py`) currently accumulate embedding batches
via
`np.concatenate` inside the per-batch loop. `np.concatenate` allocates a
new
array and copies all existing data on every call, so accumulating N
batches
is O(N²) in both time and peak memory.

Replacing the incremental concatenate with a list-of-batches + a single
`np.vstack` at the end gives O(N) total work.

For tokenizer the title-vector broadcast `np.concatenate([vts[0]] * N)`
is
also replaced by `np.tile`, which does the same job with a single
contiguous
allocation instead of building a Python list of references.

This is purely a CPU/memory optimisation — output shape and dtype are
unchanged. Measured impact grows with document size:
  -   1k chunks (batch 512, 2 iters):    ~negligible
  -  10k chunks (20 iters):              ~10× speedup on this stage
  - 100k chunks (195 iters):             ~100× speedup, and peak RAM
drops from O(N) extra to near-zero

### Type of change

- [x] Performance Improvement

Co-authored-by: yoan sapienza <Yoan Sapienza yoan.sapienza@orange.fr Yoan Sapienza zappy@macbookpro.home>
											
										
										
											2026-04-30 05:00:10 +02:00
+								        cnts_batches.append(vts)
-												change callback strategy, add timezone to docker (#96)


											
										
										
											2024-03-05 12:08:41 +08:00
+								        tk_count += c
-												apply pep8 formalize (#155)


											
										
										
											2024-03-27 11:33:46 +08:00
+								        callback(prog=0.7 + 0.2 * (i + 1) / len(cnts), msg="")
-												perf: avoid O(n²) array growth in embedding accumulation (#14369)

### What problem does this PR solve?

Both tokenizer (`rag/flow/tokenizer/tokenizer.py`) and
`BuiltinEmbed.encode`
(`rag/llm/embedding_model.py`) currently accumulate embedding batches
via
`np.concatenate` inside the per-batch loop. `np.concatenate` allocates a
new
array and copies all existing data on every call, so accumulating N
batches
is O(N²) in both time and peak memory.

Replacing the incremental concatenate with a list-of-batches + a single
`np.vstack` at the end gives O(N) total work.

For tokenizer the title-vector broadcast `np.concatenate([vts[0]] * N)`
is
also replaced by `np.tile`, which does the same job with a single
contiguous
allocation instead of building a Python list of references.

This is purely a CPU/memory optimisation — output shape and dtype are
unchanged. Measured impact grows with document size:
  -   1k chunks (batch 512, 2 iters):    ~negligible
  -  10k chunks (20 iters):              ~10× speedup on this stage
  - 100k chunks (195 iters):             ~100× speedup, and peak RAM
drops from O(N) extra to near-zero

### Type of change

- [x] Performance Improvement

Co-authored-by: yoan sapienza <Yoan Sapienza yoan.sapienza@orange.fr Yoan Sapienza zappy@macbookpro.home>
											
										
										
											2026-04-30 05:00:10 +02:00
+								    cnts = np.vstack(cnts_batches) if cnts_batches else np.array([])
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								    filename_embd_weight = parser_config.get("filename_embd_weight", 0.1)  # due to the db support none value
-												Fix: The data set created by API call failed to parse after uploading the file. (#8657)

### What problem does this PR solve?

https://github.com/infiniflow/ragflow/issues/8656

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-07-04 12:41:28 +08:00
+								    if not filename_embd_weight:
 								        filename_embd_weight = 0.1
 								    title_w = float(filename_embd_weight)
-												Feat:update check_embedding api (#11254)

### What problem does this PR solve?
pr: 
#10854
change:
update check_embedding api

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-13 18:48:25 +08:00
+								    if tts.ndim == 2 and cnts.ndim == 2 and tts.shape == cnts.shape:
 								        vects = title_w * tts + (1 - title_w) * cnts
 								    else:
 								        vects = cnts
-												Add Q&A and Book, fix task running bugs (#50)


											
										
										
											2024-02-01 18:53:56 +08:00
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
+								    assert len(vects) == len(docs)
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								    vector_size = 0
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
+								    for i, d in enumerate(docs):
-												change licence (#28)

* add front end code

* change licence
											
										
										
											2024-01-17 09:39:50 +08:00
+								        v = vects[i].tolist()
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        vector_size = len(v)
-												Add task moduel, and pipline the task and every parser (#49)


											
										
										
											2024-01-31 19:57:45 +08:00
+								        d["q_%d_vec" % len(v)] = v
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								    return tk_count, vector_size
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								@timed_with_recording
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								async def run_dataflow(task: dict):
-												Feat: add mechanism to check cancellation in Agent (#10766)

### What problem does this PR solve?

Add mechanism to check cancellation in Agent.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-11 17:36:48 +08:00
+								    from api.db.services.canvas_service import UserCanvasService
 								    from rag.flow.pipeline import Pipeline
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								    task_start_ts = timer()
 								    dataflow_id = task["dataflow_id"]
 								    doc_id = task["doc_id"]
 								    task_id = task["id"]
 								    task_dataset_id = task["kb_id"]
 								    if task["task_type"] == "dataflow":
 								        e, cvs = UserCanvasService.get_by_id(dataflow_id)
 								        assert e, "User pipeline not found."
 								        dsl = cvs.dsl
 								    else:
 								        e, pipeline_log = PipelineOperationLogService.get_by_id(dataflow_id)
 								        assert e, "Pipeline log not found."
 								        dsl = pipeline_log.dsl
 								        dataflow_id = pipeline_log.pipeline_id
 								    pipeline = Pipeline(dsl, tenant_id=task["tenant_id"], doc_id=doc_id, task_id=task_id, flow_id=dataflow_id)
 								    chunks = await pipeline.run(file=task["file"]) if task.get("file") else await pipeline.run()
 								    if doc_id == CANVAS_DEBUG_DOC_ID:
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        get_recording_context().record("dataflow_debug_result", "canvas_debug_mode")
 								        get_recording_context().record("dataflow_chunks", chunks)
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								        return
 								    if not chunks:
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        get_recording_context().record("pipeline_output_count", 0)
 								        get_recording_context().record("pipeline_output_type", "empty")
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        ret = PipelineOperationLogService.create(document_id=doc_id, pipeline_id=dataflow_id, task_type=PipelineTaskType.PARSE, dsl=str(pipeline))
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        get_recording_context().save_func_return_value("PipelineOperationLogService.create", ret)
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								        return
 								    embedding_token_consumption = chunks.get("embedding_token_consumption", 0)
-												Fix: accept empty value as 0 chunk (#14220)

### What problem does this PR solve?

Fix: accept empty value as 0 chunk
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-04-20 12:53:47 +08:00
+								    # The output key may exist with an empty payload; check presence, not truthiness.
 								    if "chunks" in chunks:
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								        chunks = copy.deepcopy(chunks["chunks"])
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        output_type = "chunks"
-												Fix: accept empty value as 0 chunk (#14220)

### What problem does this PR solve?

Fix: accept empty value as 0 chunk
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-04-20 12:53:47 +08:00
+								    elif "json" in chunks:
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								        chunks = copy.deepcopy(chunks["json"])
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        output_type = "json"
-												Fix: accept empty value as 0 chunk (#14220)

### What problem does this PR solve?

Fix: accept empty value as 0 chunk
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-04-20 12:53:47 +08:00
+								    elif "markdown" in chunks:
 								        chunks = [{"text": [chunks["markdown"]]}] if chunks["markdown"] else []
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        output_type = "markdown"
-												Fix: accept empty value as 0 chunk (#14220)

### What problem does this PR solve?

Fix: accept empty value as 0 chunk
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-04-20 12:53:47 +08:00
+								    elif "text" in chunks:
 								        chunks = [{"text": [chunks["text"]]}] if chunks["text"] else []
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        output_type = "text"
-												Fix: accept empty value as 0 chunk (#14220)

### What problem does this PR solve?

Fix: accept empty value as 0 chunk
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-04-20 12:53:47 +08:00
+								    elif "html" in chunks:
 								        chunks = [{"text": [chunks["html"]]}] if chunks["html"] else []
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        output_type = "html"
-												Fix: accept empty value as 0 chunk (#14220)

### What problem does this PR solve?

Fix: accept empty value as 0 chunk
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-04-20 12:53:47 +08:00
+								    else:
 								        chunks = []
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        output_type = "empty"
 								    get_recording_context().record("pipeline_output_type", output_type)
 								    get_recording_context().record("pipeline_output_count", len(chunks))
-												Fix: accept empty value as 0 chunk (#14220)

### What problem does this PR solve?

Fix: accept empty value as 0 chunk
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-04-20 12:53:47 +08:00
 								    # An empty normalized payload means "nothing parsed", so stop before embedding/indexing.
 								    if not chunks:
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        ret = PipelineOperationLogService.create(document_id=doc_id, pipeline_id=dataflow_id, task_type=PipelineTaskType.PARSE, dsl=str(pipeline))
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        get_recording_context().save_func_return_value("PipelineOperationLogService.create", ret)
-												Fix: accept empty value as 0 chunk (#14220)

### What problem does this PR solve?

Fix: accept empty value as 0 chunk
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-04-20 12:53:47 +08:00
+								        return
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
 								    keys = [k for o in chunks for k in list(o.keys())]
 								    if not any([re.match(r"q_[0-9]+_vec", k) for k in keys]):
 								        try:
 								            set_progress(task_id, prog=0.82, msg="\n-------------------------------------\nStart to embedding...")
 								            e, kb = KnowledgebaseService.get_by_id(task["kb_id"])
 								            embedding_id = kb.embd_id
-												Feat: tenant llm provider (#14595)

### What problem does this PR solve?

Python implementation of the Go-based model_provider API suite.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: bill <yibie_jingnian@163.com>
											
										
										
											2026-05-29 17:39:41 +08:00
+								            embd_model_config = get_model_config_from_provider_instance(task["tenant_id"], LLMType.EMBEDDING, embedding_id)
-												Feat/tenant model (#13072)

### What problem does this PR solve?

Add id for table tenant_llm and apply in LLMBundle.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
Co-authored-by: Liu An <asiro@qq.com>
											
										
										
											2026-03-05 17:27:17 +08:00
+								            embedding_model = LLMBundle(task["tenant_id"], embd_model_config)
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								            @timeout(60)
 								            def batch_encode(txts):
 								                nonlocal embedding_model
 								                return embedding_model.encode([truncate(c, embedding_model.max_length - 10) for c in txts])
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
-												perf: avoid O(n²) array growth in embedding accumulation (#14369)

### What problem does this PR solve?

Both tokenizer (`rag/flow/tokenizer/tokenizer.py`) and
`BuiltinEmbed.encode`
(`rag/llm/embedding_model.py`) currently accumulate embedding batches
via
`np.concatenate` inside the per-batch loop. `np.concatenate` allocates a
new
array and copies all existing data on every call, so accumulating N
batches
is O(N²) in both time and peak memory.

Replacing the incremental concatenate with a list-of-batches + a single
`np.vstack` at the end gives O(N) total work.

For tokenizer the title-vector broadcast `np.concatenate([vts[0]] * N)`
is
also replaced by `np.tile`, which does the same job with a single
contiguous
allocation instead of building a Python list of references.

This is purely a CPU/memory optimisation — output shape and dtype are
unchanged. Measured impact grows with document size:
  -   1k chunks (batch 512, 2 iters):    ~negligible
  -  10k chunks (20 iters):              ~10× speedup on this stage
  - 100k chunks (195 iters):             ~100× speedup, and peak RAM
drops from O(N) extra to near-zero

### Type of change

- [x] Performance Improvement

Co-authored-by: yoan sapienza <Yoan Sapienza yoan.sapienza@orange.fr Yoan Sapienza zappy@macbookpro.home>
											
										
										
											2026-04-30 05:00:10 +02:00
+								            vects_batches = []
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								            texts = [o.get("questions", o.get("summary", o["text"])) for o in chunks]
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								            delta = 0.20 / (len(texts) // settings.EMBEDDING_BATCH_SIZE + 1)
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								            prog = 0.8
-												Move api.settings to common.settings (#11036)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-06 09:36:38 +08:00
+								            for i in range(0, len(texts), settings.EMBEDDING_BATCH_SIZE):
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								                async with embed_limiter:
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								                    vts, c = await thread_pool_exec(batch_encode, texts[i : i + settings.EMBEDDING_BATCH_SIZE])
-												perf: avoid O(n²) array growth in embedding accumulation (#14369)

### What problem does this PR solve?

Both tokenizer (`rag/flow/tokenizer/tokenizer.py`) and
`BuiltinEmbed.encode`
(`rag/llm/embedding_model.py`) currently accumulate embedding batches
via
`np.concatenate` inside the per-batch loop. `np.concatenate` allocates a
new
array and copies all existing data on every call, so accumulating N
batches
is O(N²) in both time and peak memory.

Replacing the incremental concatenate with a list-of-batches + a single
`np.vstack` at the end gives O(N) total work.

For tokenizer the title-vector broadcast `np.concatenate([vts[0]] * N)`
is
also replaced by `np.tile`, which does the same job with a single
contiguous
allocation instead of building a Python list of references.

This is purely a CPU/memory optimisation — output shape and dtype are
unchanged. Measured impact grows with document size:
  -   1k chunks (batch 512, 2 iters):    ~negligible
  -  10k chunks (20 iters):              ~10× speedup on this stage
  - 100k chunks (195 iters):             ~100× speedup, and peak RAM
drops from O(N) extra to near-zero

### Type of change

- [x] Performance Improvement

Co-authored-by: yoan sapienza <Yoan Sapienza yoan.sapienza@orange.fr Yoan Sapienza zappy@macbookpro.home>
											
										
										
											2026-04-30 05:00:10 +02:00
+								                vects_batches.append(vts)
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								                embedding_token_consumption += c
 								                prog += delta
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								                if i % (len(texts) // settings.EMBEDDING_BATCH_SIZE / 100 + 1) == 1:
 								                    set_progress(task_id, prog=prog, msg=f"{i + 1} / {len(texts) // settings.EMBEDDING_BATCH_SIZE}")
-												perf: avoid O(n²) array growth in embedding accumulation (#14369)

### What problem does this PR solve?

Both tokenizer (`rag/flow/tokenizer/tokenizer.py`) and
`BuiltinEmbed.encode`
(`rag/llm/embedding_model.py`) currently accumulate embedding batches
via
`np.concatenate` inside the per-batch loop. `np.concatenate` allocates a
new
array and copies all existing data on every call, so accumulating N
batches
is O(N²) in both time and peak memory.

Replacing the incremental concatenate with a list-of-batches + a single
`np.vstack` at the end gives O(N) total work.

For tokenizer the title-vector broadcast `np.concatenate([vts[0]] * N)`
is
also replaced by `np.tile`, which does the same job with a single
contiguous
allocation instead of building a Python list of references.

This is purely a CPU/memory optimisation — output shape and dtype are
unchanged. Measured impact grows with document size:
  -   1k chunks (batch 512, 2 iters):    ~negligible
  -  10k chunks (20 iters):              ~10× speedup on this stage
  - 100k chunks (195 iters):             ~100× speedup, and peak RAM
drops from O(N) extra to near-zero

### Type of change

- [x] Performance Improvement

Co-authored-by: yoan sapienza <Yoan Sapienza yoan.sapienza@orange.fr Yoan Sapienza zappy@macbookpro.home>
											
										
										
											2026-04-30 05:00:10 +02:00
+								            vects = np.vstack(vects_batches) if vects_batches else np.array([])
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								            get_recording_context().record("embedding_token_consumption", embedding_token_consumption)
 								            get_recording_context().record("vector_size", len(vects[0]) if len(vects) > 0 else 0)
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
 								            assert len(vects) == len(chunks)
 								            for i, ck in enumerate(chunks):
 								                v = vects[i].tolist()
 								                ck["q_%d_vec" % len(v)] = v
-												Fix: task cancel (#13034)

### What problem does this PR solve?

Fix: task cancel #11745 
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-02-06 14:48:24 +08:00
+								        except TaskCanceledException:
 								            raise
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								        except Exception as e:
 								            set_progress(task_id, prog=-1, msg=f"[ERROR]: {e}")
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            ret = PipelineOperationLogService.create(document_id=doc_id, pipeline_id=dataflow_id, task_type=PipelineTaskType.PARSE, dsl=str(pipeline))
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								            get_recording_context().save_func_return_value("PipelineOperationLogService.create", ret)
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								            return
 								    metadata = {}
 								    for ck in chunks:
 								        ck["doc_id"] = doc_id
 								        ck["kb_id"] = [str(task["kb_id"])]
 								        ck["docnm_kwd"] = task["name"]
 								        ck["create_time"] = str(datetime.now()).replace("T", " ")[:19]
 								        ck["create_timestamp_flt"] = datetime.now().timestamp()
-												Fix:toc in pipeline (#11785)

### What problem does this PR solve?
change:
Fix toc in pipeline
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-08 09:42:20 +08:00
+								        if not ck.get("id"):
 								            ck["id"] = xxhash.xxh64((ck["text"] + str(ck["doc_id"])).encode("utf-8")).hexdigest()
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								        if "questions" in ck:
 								            if "question_tks" not in ck:
 								                ck["question_kwd"] = ck["questions"].split("\n")
 								                ck["question_tks"] = rag_tokenizer.tokenize(str(ck["questions"]))
 								            del ck["questions"]
 								        if "keywords" in ck:
 								            if "important_tks" not in ck:
-												fix(keyword_extraction): accept Chinese commas/semicolons/newlines as keyword delimiters (#14540)

## What
Widen the keyword delimiter in `rag/svr/task_executor.py`:
both `build_chunks` (LLM `keyword_extraction` cache parsing) and
`run_dataflow` (chunk-level `keywords` ingestion) now split on
`, ， ; ； 、 \r \n` instead of only ASCII comma.

## Why
`rag/prompts/keyword_prompt.md` instructs the LLM:

> The keywords are delimited by ENGLISH COMMA.

In practice, Chinese-leaning models (Qwen / Tongyi-Qianwen, GLM,
etc.) frequently ignore this instruction when the source content is
Chinese and emit Chinese commas (`，`) instead. Result:
`cached.split(",")` sees the full LLM output as a *single* keyword.

Repro: `auto_keywords>=4` + Chinese docs + `qwen-plus@Tongyi-Qianwen`.
We observed entries in `important_kwd` like
`"功能介绍，配置说明，参数详解，问题排查"` — one bucket instead of four.

## Impact
- Silent data-quality bug; no exception thrown.
- BM25 `important_kwd^30` boost effectively stops firing — the
  indexed term is the whole list, never matches user query tokens.
- Any downstream aggregating `important_kwd` (tagging, analytics,
  candidate-keyword review UIs) sees garbage.

## Compatibility
- Pure widening of the splitter; ASCII-comma-only outputs continue
  to work identically.
- No schema / API change.

## Test plan
Manually verified against `qwen-plus@Tongyi-Qianwen` with
`auto_keywords=10` on Chinese .txt files:

- Before: `important_kwd` contains one element per chunk that is the
  full LLM string with `，`-separated phrases inside.
- After: `important_kwd` contains N elements, one per phrase, as the
  LLM intended.
											
										
										
											2026-05-11 12:05:24 +08:00
+								                ck["important_kwd"] = [k for k in re.split(r"[,，;；、\r\n]+", ck["keywords"]) if k.strip()]
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								                ck["important_tks"] = rag_tokenizer.tokenize(str(ck["keywords"]))
 								            del ck["keywords"]
 								        if "summary" in ck:
 								            if "content_ltks" not in ck:
 								                ck["content_ltks"] = rag_tokenizer.tokenize(str(ck["summary"]))
 								                ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
 								            del ck["summary"]
 								        if "metadata" in ck:
-												Fix: parent-children chunking method. (#11997)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-17 16:50:36 +08:00
+								            metadata = update_metadata_to(metadata, ck["metadata"])
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								            del ck["metadata"]
 								        if "content_with_weight" not in ck:
 								            ck["content_with_weight"] = ck["text"]
 								        del ck["text"]
 								        if "positions" in ck:
 								            add_positions(ck, ck["positions"])
 								            del ck["positions"]
 								    if metadata:
-												Put document metadata in ES/Infinity (#12826)

### What problem does this PR solve?

Put document metadata in ES/Infinity.

Index name of meta data: ragflow_doc_meta_{tenant_id}

### Type of change

- [x] Refactoring
											
										
										
											2026-01-28 13:29:34 +08:00
+								        existing_meta = DocMetadataService.get_document_metadata(doc_id)
 								        existing_meta = existing_meta if isinstance(existing_meta, dict) else {}
 								        metadata = update_metadata_to(metadata, existing_meta)
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        get_recording_context().record("run_dataflow_metadata", metadata)
 								        ret = DocMetadataService.update_document_metadata(doc_id, metadata)
 								        get_recording_context().save_func_return_value("DocMetadataService.update_document_metadata", ret)
-												Feat: refine dataflow and initialize dataflow app (#9952)

### What problem does this PR solve?

Refine dataflow and initialize dataflow app.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-09-05 18:50:46 +08:00
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								    start_ts = timer()
 								    set_progress(task_id, prog=0.82, msg="[DOC Engine]:\nStart to index...")
-												Add dataset with table parser type for Infinity and answer question in chat using SQL (#12541)

### What problem does this PR solve?

1) Create  dataset using table parser for infinity
2) Answer questions in chat using SQL

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-01-19 19:35:14 +08:00
+								    e = await insert_chunks(task_id, task["tenant_id"], task["kb_id"], chunks, partial(set_progress, task_id, 0, 100000000))
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								    if not e:
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        ret = PipelineOperationLogService.create(document_id=doc_id, pipeline_id=dataflow_id, task_type=PipelineTaskType.PARSE, dsl=str(pipeline))
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        get_recording_context().save_func_return_value("PipelineOperationLogService.create", ret)
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								        return
-												Feat: refine dataflow and initialize dataflow app (#9952)

### What problem does this PR solve?

Refine dataflow and initialize dataflow app.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-09-05 18:50:46 +08:00
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								    time_cost = timer() - start_ts
 								    task_time_cost = timer() - task_start_ts
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								    set_progress(task_id, prog=1.0, msg="Indexing done ({:.2f}s). Task done ({:.2f}s)".format(time_cost, task_time_cost))
 								    ret = DocumentService.increment_chunk_num(doc_id, task_dataset_id, embedding_token_consumption, len(chunks), task_time_cost)
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								    get_recording_context().save_func_return_value("DocumentService.increment_chunk_num", ret)
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								    logging.info("[Done], chunks({}), token({}), elapsed:{:.2f}".format(len(chunks), embedding_token_consumption, task_time_cost))
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								    get_recording_context().record("dataflow_chunks", chunks)
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								    ret = PipelineOperationLogService.create(document_id=doc_id, pipeline_id=dataflow_id, task_type=PipelineTaskType.PARSE, dsl=str(pipeline))
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								    get_recording_context().save_func_return_value("PipelineOperationLogService.create", ret)
-												Feat: refine dataflow and initialize dataflow app (#9952)

### What problem does this PR solve?

Refine dataflow and initialize dataflow app.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-09-05 18:50:46 +08:00
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								RAPTOR_METHOD_SEARCH_LIMIT = 10000
-												Refact: improve task resume mechanism for graphrag (#14096)

### What problem does this PR solve?

Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).

**Changes based on @yuzhichang's review:**

1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.

**Changes based on CodeRabbit review:**

3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.

### Changes

| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |

### How it works

- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.

### Testing

- 10 new unit tests — all passing
- Full existing suite: 617 passed

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
											
										
										
											2026-04-15 15:07:28 +05:30
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
 								async def get_raptor_chunk_field_map(doc_id: str, tenant_id: str, kb_id: str) -> dict:
 								    """Return stored RAPTOR marker fields for a document."""
-												Refact: improve task resume mechanism for graphrag (#14096)

### What problem does this PR solve?

Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).

**Changes based on @yuzhichang's review:**

1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.

**Changes based on CodeRabbit review:**

3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.

### Changes

| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |

### How it works

- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.

### Testing

- 10 new unit tests — all passing
- Full existing suite: 617 passed

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
											
										
										
											2026-04-15 15:07:28 +05:30
+								    from common.doc_store.doc_store_base import OrderByExpr
 								    from rag.nlp import search as nlp_search
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
 								    async def search_fields(fields: list[str], condition: dict, order_by=None):
 								        """Search chunk fields in the current knowledge base."""
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        res = await thread_pool_exec(settings.docStoreConn.search, fields, [], condition, [], order_by or OrderByExpr(), 0, RAPTOR_METHOD_SEARCH_LIMIT, nlp_search.index_name(tenant_id), [kb_id])
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								        return settings.docStoreConn.get_fields(res, fields)
 								    primary = await search_fields(["raptor_kwd", "extra"], {"doc_id": doc_id, "raptor_kwd": ["raptor"]})
 								    if collect_raptor_chunk_ids(primary):
 								        return primary
 								    try:
 								        return await search_fields(
 								            ["raptor_kwd", "extra"],
 								            {"doc_id": doc_id},
 								            OrderByExpr().desc("create_timestamp_flt"),
 								        )
 								    except Exception:
 								        logging.debug("RAPTOR fallback method lookup with extra field failed for doc %s", doc_id, exc_info=True)
 								        return primary
 								async def get_raptor_chunk_methods(doc_id: str, tenant_id: str, kb_id: str) -> set[str]:
 								    """Return the RAPTOR tree builders already stored for doc_id.
 								    Queries directly for raptor_kwd="raptor" rows so a non-RAPTOR leading
 								    chunk cannot produce a false-negative result. Legacy summary chunks that
 								    do not have method metadata are treated as the original RAPTOR builder.
 								    """
 								    try:
 								        field_map = await get_raptor_chunk_field_map(doc_id, tenant_id, kb_id)
 								        methods = collect_raptor_methods(field_map)
 								        if methods:
-												Refact: improve task resume mechanism for graphrag (#14096)

### What problem does this PR solve?

Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).

**Changes based on @yuzhichang's review:**

1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.

**Changes based on CodeRabbit review:**

3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.

### Changes

| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |

### How it works

- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.

### Testing

- 10 new unit tests — all passing
- Full existing suite: 617 passed

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
											
										
										
											2026-04-15 15:07:28 +05:30
+								            logging.info(
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								                "Checkpoint hit: RAPTOR chunks for doc %s (tenant=%s kb=%s methods=%s) already exist",
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								                doc_id,
 								                tenant_id,
 								                kb_id,
 								                sorted(methods),
-												Refact: improve task resume mechanism for graphrag (#14096)

### What problem does this PR solve?

Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).

**Changes based on @yuzhichang's review:**

1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.

**Changes based on CodeRabbit review:**

3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.

### Changes

| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |

### How it works

- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.

### Testing

- 10 new unit tests — all passing
- Full existing suite: 617 passed

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
											
										
										
											2026-04-15 15:07:28 +05:30
+								            )
 								        else:
 								            logging.info(
 								                "Checkpoint miss: no RAPTOR chunks for doc %s (tenant=%s kb=%s)",
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								                doc_id,
 								                tenant_id,
 								                kb_id,
-												Refact: improve task resume mechanism for graphrag (#14096)

### What problem does this PR solve?

Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).

**Changes based on @yuzhichang's review:**

1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.

**Changes based on CodeRabbit review:**

3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.

### Changes

| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |

### How it works

- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.

### Testing

- 10 new unit tests — all passing
- Full existing suite: 617 passed

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
											
										
										
											2026-04-15 15:07:28 +05:30
+								            )
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								        return methods
-												Refact: improve task resume mechanism for graphrag (#14096)

### What problem does this PR solve?

Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).

**Changes based on @yuzhichang's review:**

1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.

**Changes based on CodeRabbit review:**

3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.

### Changes

| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |

### How it works

- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.

### Testing

- 10 new unit tests — all passing
- Full existing suite: 617 passed

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
											
										
										
											2026-04-15 15:07:28 +05:30
+								    except Exception:
 								        logging.exception("Failed to check RAPTOR chunks for doc %s", doc_id)
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								        raise
 								async def has_raptor_chunks(doc_id: str, tenant_id: str, kb_id: str, tree_builder: str = RAPTOR_TREE_BUILDER) -> bool:
 								    """Return whether doc_id already has summaries for tree_builder."""
 								    methods = await get_raptor_chunk_methods(doc_id, tenant_id, kb_id)
 								    return tree_builder in methods
 								async def delete_raptor_chunks(doc_id: str, tenant_id: str, kb_id: str, keep_method: str | None = None):
 								    """Delete RAPTOR summaries for doc_id, optionally preserving one method."""
 								    if keep_method is None:
 								        logging.info(
 								            "delete_raptor_chunks: removing all RAPTOR summaries (doc=%s tenant=%s kb=%s)",
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            doc_id,
 								            tenant_id,
 								            kb_id,
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								        )
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        ret = await thread_pool_exec(
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								            settings.docStoreConn.delete,
 								            {"doc_id": doc_id, "raptor_kwd": ["raptor"]},
 								            nlp_search.index_name(tenant_id),
 								            kb_id,
 								        )
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        get_recording_context().save_func_return_value("docStoreConn.delete", ret)
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								        return 0
 								    field_map = await get_raptor_chunk_field_map(doc_id, tenant_id, kb_id)
 								    chunk_ids = collect_raptor_chunk_ids(field_map, exclude_methods={keep_method})
 								    if not chunk_ids:
 								        logging.debug(
 								            "delete_raptor_chunks: no stale RAPTOR chunks to remove (doc=%s tenant=%s kb=%s keep=%s)",
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            doc_id,
 								            tenant_id,
 								            kb_id,
 								            keep_method,
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								        )
 								        return 0
 								    logging.info(
 								        "delete_raptor_chunks: removing %d stale RAPTOR chunks (doc=%s tenant=%s kb=%s keep=%s)",
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        len(chunk_ids),
 								        doc_id,
 								        tenant_id,
 								        kb_id,
 								        keep_method,
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								    )
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								    ret = await thread_pool_exec(
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								        settings.docStoreConn.delete,
 								        {"id": list(chunk_ids)},
 								        nlp_search.index_name(tenant_id),
 								        kb_id,
 								    )
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								    get_recording_context().save_func_return_value("docStoreConn.delete", ret)
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								    return len(chunk_ids)
-												Refact: improve task resume mechanism for graphrag (#14096)

### What problem does this PR solve?

Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).

**Changes based on @yuzhichang's review:**

1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.

**Changes based on CodeRabbit review:**

3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.

### Changes

| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |

### How it works

- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.

### Testing

- 10 new unit tests — all passing
- Full existing suite: 617 passed

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
											
										
										
											2026-04-15 15:07:28 +05:30
-												Perf: Enhance timeout handling. (#8826)

### What problem does this PR solve?


### Type of change

- [x] Performance Improvement
											
										
										
											2025-07-15 09:36:45 +08:00
+								@timeout(3600)
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_size, callback=None, doc_ids=[]):
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								    """Generate RAPTOR summaries for selected documents in a knowledge base."""
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								    fake_doc_id = GRAPH_RAPTOR_FAKE_DOC_ID
 								    raptor_config = kb_parser_config.get("raptor", {})
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								    raptor_ext_config = raptor_config.get("ext") or {}
 								    tree_builder = get_raptor_tree_builder(raptor_config)
 								    clustering_method = get_raptor_clustering_method(raptor_config)
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								    vctr_nm = "q_%d_vec" % vector_size
-												Fix: waitForResponse component. (#11172)

### What problem does this PR solve?

#10056

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-11 16:58:47 +08:00
-												add raptor (#899)

### What problem does this PR solve?

#882 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-05-23 14:31:16 +08:00
+								    res = []
 								    tk_count = 0
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								    cleanup_raptor_chunks = []
-												Feat: add fault-tolerant mechanism to RAPTOR (#11206)

### What problem does this PR solve?

Add fault-tolerant mechanism to RAPTOR.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-13 18:48:07 +08:00
+								    max_errors = int(os.environ.get("RAPTOR_MAX_ERRORS", 3))
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								    doc_info_by_id = {}
-												Fix missmatch docnm_kwd in raptor chunks (#13451)

### What problem does this PR solve?

issue #13393 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-03-10 14:24:33 +08:00
+								    for doc_id in set(doc_ids):
 								        ok, source_doc = DocumentService.get_by_id(doc_id)
 								        if not ok or not source_doc:
 								            continue
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								        doc_info_by_id[doc_id] = {
 								            "name": getattr(source_doc, "name", ""),
 								            "type": getattr(source_doc, "type", ""),
 								            "parser_id": getattr(source_doc, "parser_id", ""),
 								            "parser_config": getattr(source_doc, "parser_config", {}) or {},
 								        }
 								    def schedule_raptor_cleanup(doc_id: str, keep_method: str | None = None):
 								        """Queue stale RAPTOR summaries for deletion after successful insert."""
 								        cleanup_plan = (doc_id, keep_method)
 								        if cleanup_plan not in cleanup_raptor_chunks:
 								            cleanup_raptor_chunks.append(cleanup_plan)
 								    def skip_raptor_doc(doc_id: str) -> bool:
 								        """Return whether RAPTOR should be skipped for this source document."""
 								        doc_info = doc_info_by_id.get(doc_id, {})
 								        file_type = doc_info.get("type") or row.get("type", "")
 								        parser_id = doc_info.get("parser_id") or row.get("parser_id", "")
 								        parser_config = doc_info.get("parser_config") or row.get("parser_config", {})
 								        if should_skip_raptor(file_type, parser_id, parser_config, raptor_config):
 								            skip_reason = get_skip_reason(file_type, parser_id, parser_config)
 								            doc_name = doc_info.get("name") or doc_id
 								            logging.info("Skipping Raptor for document %s: %s", doc_name, skip_reason)
 								            callback(msg=f"[RAPTOR] doc:{doc_id} skipped: {skip_reason}")
 								            return True
 								        return False
-												Feat: add fault-tolerant mechanism to RAPTOR (#11206)

### What problem does this PR solve?

Add fault-tolerant mechanism to RAPTOR.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-13 18:48:07 +08:00
-												Fix: ollama model list issue. (#11175)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-11 19:46:41 +08:00
+								    async def generate(chunks, did):
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								        """Run RAPTOR and append generated summary chunks for one doc id."""
-												Fix: waitForResponse component. (#11172)

### What problem does this PR solve?

#10056

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-11 16:58:47 +08:00
+								        nonlocal tk_count, res
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								        logging.info("RAPTOR: using tree_builder=%s clustering_method=%s for doc %s", tree_builder, clustering_method, did)
-												Feat: Add knowledge compilation workflows (#16515)

## Summary
- Add knowledge compilation template APIs, services, and builtin
template seed data
- Add advanced knowledge compile structure/artifact/RAPTOR workflow
support
- Update parsing, dataset/document APIs, and supporting services for
compilation workflows
											
										
										
											2026-07-02 23:22:07 +08:00
+								        from rag.advanced_rag.knowlege_compile.raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor  # Lazy load, save around 8s
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
-												Fix: waitForResponse component. (#11172)

### What problem does this PR solve?

#10056

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-11 16:58:47 +08:00
+								        raptor = Raptor(
 								            raptor_config.get("max_cluster", 64),
 								            chat_mdl,
 								            embd_mdl,
 								            raptor_config["prompt"],
 								            raptor_config["max_token"],
 								            raptor_config["threshold"],
-												Feat: add fault-tolerant mechanism to RAPTOR (#11206)

### What problem does this PR solve?

Add fault-tolerant mechanism to RAPTOR.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-13 18:48:07 +08:00
+								            max_errors=max_errors,
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								            tree_builder=tree_builder,
 								            clustering_method=clustering_method,
 								            psi_exact_max_leaves=raptor_ext_config.get("psi_exact_max_leaves", 4096),
 								            psi_bucket_size=raptor_ext_config.get("psi_bucket_size", 1024),
-												Fix: waitForResponse component. (#11172)

### What problem does this PR solve?

#10056

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-11 16:58:47 +08:00
+								        )
 								        original_length = len(chunks)
-												feat: persist RAPTOR layer metadata on summary chunks (#13286)

## Summary

RAPTOR's recursive clustering builds a `layers` list tracking
`(start_idx, end_idx)` boundaries per level, but currently discards this
information — only the flat `chunks` list is returned. This makes it
impossible to distinguish leaf-level summaries from top-level ones.

This PR:
- Returns `(chunks, layers)` tuple from `raptor.py`'s `__call__`
- Annotates each RAPTOR summary chunk with `raptor_layer_int` (1 = first
summary level, 2 = summary-of-summaries, etc.)
- Adds `raptor_layer_int` to `infinity_mapping.json` (Elasticsearch
handles it via existing `*_int` dynamic template)

### Why this matters

Downstream features need to know which RAPTOR layer a summary belongs
to:
- **Retrieving the top-level document summary** for entity extraction,
search snippets, or document comparison
- **Filtering by abstraction level** — users may want only high-level
summaries or only leaf-level cluster summaries
- **RAPTOR recall quality** — #10951 reports summaries not being
recalled for definition queries; layer metadata enables targeted
retrieval

### Changes

| File | Change | LOC |
|------|--------|-----|
| `rag/raptor.py` | Return `(chunks, layers)` tuple | ~3 |
| `rag/svr/task_executor.py` | Build `chunk_layer` mapping, set
`raptor_layer_int` | ~12 |
| `conf/infinity_mapping.json` | Add `raptor_layer_int` integer field |
~1 |

### Backward compatibility

- **Additive only** — no existing fields or behavior changed
- Existing RAPTOR chunks continue to work (they'll have
`raptor_layer_int = 0` by default)
- New RAPTOR chunks get layer metadata automatically

## Test plan

- [ ] Parse a document with RAPTOR enabled, verify `raptor_layer_int` is
set on indexed chunks
- [ ] Verify `raptor_layer_int` values increase with abstraction level
(layer 1 < layer 2 < ...)
- [ ] Verify existing RAPTOR deletion (`delete by raptor_kwd`) still
works
- [ ] Verify Infinity backend accepts the new field

Fixes #7488
Related: #4104, #11191, #10951

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: yuch85 <yuch85.1@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
											
										
										
											2026-04-27 10:20:46 +08:00
+								        chunks, layers = await raptor(chunks, kb_parser_config["raptor"]["random_seed"], callback, row["id"])
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								        effective_doc_name = row["name"] if did == fake_doc_id else doc_info_by_id.get(did, {}).get("name") or row["name"]
-												Fix: waitForResponse component. (#11172)

### What problem does this PR solve?

#10056

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-11 16:58:47 +08:00
+								        doc = {
-												Fix: ollama model list issue. (#11175)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-11 19:46:41 +08:00
+								            "doc_id": did,
-												Fix: waitForResponse component. (#11172)

### What problem does this PR solve?

#10056

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-11 16:58:47 +08:00
+								            "kb_id": [str(row["kb_id"])],
-												Fix missmatch docnm_kwd in raptor chunks (#13451)

### What problem does this PR solve?

issue #13393 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-03-10 14:24:33 +08:00
+								            "docnm_kwd": effective_doc_name,
 								            "title_tks": rag_tokenizer.tokenize(effective_doc_name),
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								            "raptor_kwd": "raptor",
 								            "extra": {"raptor_method": tree_builder},
-												Fix: waitForResponse component. (#11172)

### What problem does this PR solve?

#10056

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-11 16:58:47 +08:00
+								        }
 								        if row["pagerank"]:
 								            doc[PAGERANK_FLD] = int(row["pagerank"])
-												feat: persist RAPTOR layer metadata on summary chunks (#13286)

## Summary

RAPTOR's recursive clustering builds a `layers` list tracking
`(start_idx, end_idx)` boundaries per level, but currently discards this
information — only the flat `chunks` list is returned. This makes it
impossible to distinguish leaf-level summaries from top-level ones.

This PR:
- Returns `(chunks, layers)` tuple from `raptor.py`'s `__call__`
- Annotates each RAPTOR summary chunk with `raptor_layer_int` (1 = first
summary level, 2 = summary-of-summaries, etc.)
- Adds `raptor_layer_int` to `infinity_mapping.json` (Elasticsearch
handles it via existing `*_int` dynamic template)

### Why this matters

Downstream features need to know which RAPTOR layer a summary belongs
to:
- **Retrieving the top-level document summary** for entity extraction,
search snippets, or document comparison
- **Filtering by abstraction level** — users may want only high-level
summaries or only leaf-level cluster summaries
- **RAPTOR recall quality** — #10951 reports summaries not being
recalled for definition queries; layer metadata enables targeted
retrieval

### Changes

| File | Change | LOC |
|------|--------|-----|
| `rag/raptor.py` | Return `(chunks, layers)` tuple | ~3 |
| `rag/svr/task_executor.py` | Build `chunk_layer` mapping, set
`raptor_layer_int` | ~12 |
| `conf/infinity_mapping.json` | Add `raptor_layer_int` integer field |
~1 |

### Backward compatibility

- **Additive only** — no existing fields or behavior changed
- Existing RAPTOR chunks continue to work (they'll have
`raptor_layer_int = 0` by default)
- New RAPTOR chunks get layer metadata automatically

## Test plan

- [ ] Parse a document with RAPTOR enabled, verify `raptor_layer_int` is
set on indexed chunks
- [ ] Verify `raptor_layer_int` values increase with abstraction level
(layer 1 < layer 2 < ...)
- [ ] Verify existing RAPTOR deletion (`delete by raptor_kwd`) still
works
- [ ] Verify Infinity backend accepts the new field

Fixes #7488
Related: #4104, #11191, #10951

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: yuch85 <yuch85.1@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
											
										
										
											2026-04-27 10:20:46 +08:00
+								        # Build index→layer mapping from RAPTOR layer boundaries.
 								        # layers is [(start, end), ...] where layer 0 is the original chunks
 								        # and layer 1+ are summary layers. We skip layer 0 (original chunks).
 								        chunk_layer = {}
 								        for layer_idx, (layer_start, layer_end) in enumerate(layers):
 								            if layer_idx == 0:
 								                continue  # layer 0 = original input chunks, not summaries
 								            for ci in range(layer_start, layer_end):
 								                chunk_layer[ci] = layer_idx
 								        for idx, (content, vctr) in enumerate(chunks[original_length:], start=original_length):
-												Fix: waitForResponse component. (#11172)

### What problem does this PR solve?

#10056

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-11 16:58:47 +08:00
+								            d = copy.deepcopy(doc)
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								            d["id"] = make_raptor_summary_chunk_id(content, did)
-												Fix: waitForResponse component. (#11172)

### What problem does this PR solve?

#10056

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-11 16:58:47 +08:00
+								            d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
 								            d["create_timestamp_flt"] = datetime.now().timestamp()
 								            d[vctr_nm] = vctr.tolist()
 								            d["content_with_weight"] = content
 								            d["content_ltks"] = rag_tokenizer.tokenize(content)
 								            d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
-												feat: persist RAPTOR layer metadata on summary chunks (#13286)

## Summary

RAPTOR's recursive clustering builds a `layers` list tracking
`(start_idx, end_idx)` boundaries per level, but currently discards this
information — only the flat `chunks` list is returned. This makes it
impossible to distinguish leaf-level summaries from top-level ones.

This PR:
- Returns `(chunks, layers)` tuple from `raptor.py`'s `__call__`
- Annotates each RAPTOR summary chunk with `raptor_layer_int` (1 = first
summary level, 2 = summary-of-summaries, etc.)
- Adds `raptor_layer_int` to `infinity_mapping.json` (Elasticsearch
handles it via existing `*_int` dynamic template)

### Why this matters

Downstream features need to know which RAPTOR layer a summary belongs
to:
- **Retrieving the top-level document summary** for entity extraction,
search snippets, or document comparison
- **Filtering by abstraction level** — users may want only high-level
summaries or only leaf-level cluster summaries
- **RAPTOR recall quality** — #10951 reports summaries not being
recalled for definition queries; layer metadata enables targeted
retrieval

### Changes

| File | Change | LOC |
|------|--------|-----|
| `rag/raptor.py` | Return `(chunks, layers)` tuple | ~3 |
| `rag/svr/task_executor.py` | Build `chunk_layer` mapping, set
`raptor_layer_int` | ~12 |
| `conf/infinity_mapping.json` | Add `raptor_layer_int` integer field |
~1 |

### Backward compatibility

- **Additive only** — no existing fields or behavior changed
- Existing RAPTOR chunks continue to work (they'll have
`raptor_layer_int = 0` by default)
- New RAPTOR chunks get layer metadata automatically

## Test plan

- [ ] Parse a document with RAPTOR enabled, verify `raptor_layer_int` is
set on indexed chunks
- [ ] Verify `raptor_layer_int` values increase with abstraction level
(layer 1 < layer 2 < ...)
- [ ] Verify existing RAPTOR deletion (`delete by raptor_kwd`) still
works
- [ ] Verify Infinity backend accepts the new field

Fixes #7488
Related: #4104, #11191, #10951

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: yuch85 <yuch85.1@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
											
										
										
											2026-04-27 10:20:46 +08:00
+								            d["raptor_layer_int"] = chunk_layer.get(idx, 1)
-												Fix: waitForResponse component. (#11172)

### What problem does this PR solve?

#10056

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-11 16:58:47 +08:00
+								            res.append(d)
 								            tk_count += num_tokens_from_string(content)
 								    if raptor_config.get("scope", "file") == "file":
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								        dataset_methods = await get_raptor_chunk_methods(fake_doc_id, row["tenant_id"], row["kb_id"])
 								        remove_dataset_summaries = bool(dataset_methods)
 								        has_file_level_target = False
 								        if dataset_methods:
 								            callback(msg="[RAPTOR] will remove dataset-level summaries after file-level summaries are available.")
-												Fix: waitForResponse component. (#11172)

### What problem does this PR solve?

#10056

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-11 16:58:47 +08:00
+								        for x, doc_id in enumerate(doc_ids):
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								            if skip_raptor_doc(doc_id):
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								                callback(prog=(x + 1.0) / len(doc_ids))
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								                continue
-												Refact: improve task resume mechanism for graphrag (#14096)

### What problem does this PR solve?

Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).

**Changes based on @yuzhichang's review:**

1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.

**Changes based on CodeRabbit review:**

3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.

### Changes

| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |

### How it works

- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.

### Testing

- 10 new unit tests — all passing
- Full existing suite: 617 passed

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
											
										
										
											2026-04-15 15:07:28 +05:30
+								            # CHECKPOINT: skip docs that already have RAPTOR chunks in the doc store
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								            existing_methods = await get_raptor_chunk_methods(doc_id, row["tenant_id"], row["kb_id"])
 								            if tree_builder in existing_methods:
 								                has_file_level_target = True
 								                if existing_methods != {tree_builder}:
 								                    schedule_raptor_cleanup(doc_id, tree_builder)
 								                    callback(msg=f"[RAPTOR] doc:{doc_id} will remove old RAPTOR summaries after insert.")
 								                callback(msg=f"[RAPTOR] doc:{doc_id} already has {tree_builder} RAPTOR chunks, skipping.")
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								                callback(prog=(x + 1.0) / len(doc_ids))
-												Refact: improve task resume mechanism for graphrag (#14096)

### What problem does this PR solve?

Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).

**Changes based on @yuzhichang's review:**

1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.

**Changes based on CodeRabbit review:**

3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.

### Changes

| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |

### How it works

- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.

### Testing

- 10 new unit tests — all passing
- Full existing suite: 617 passed

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
											
										
										
											2026-04-15 15:07:28 +05:30
+								                continue
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								            if existing_methods:
 								                callback(msg=f"[RAPTOR] doc:{doc_id} will migrate RAPTOR summaries to {tree_builder} after insert.")
-												Refact: improve task resume mechanism for graphrag (#14096)

### What problem does this PR solve?

Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).

**Changes based on @yuzhichang's review:**

1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.

**Changes based on CodeRabbit review:**

3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.

### Changes

| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |

### How it works

- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.

### Testing

- 10 new unit tests — all passing
- Full existing suite: 617 passed

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
											
										
										
											2026-04-15 15:07:28 +05:30
-												Fix: waitForResponse component. (#11172)

### What problem does this PR solve?

#10056

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-11 16:58:47 +08:00
+								            chunks = []
-												fix(raptor): handle missing vector fields gracefully (#12713)

## Summary

This PR fixes a `KeyError` crash when running RAPTOR tasks on documents
that don't have the expected vector field.

## Related Issue

Fixes https://github.com/infiniflow/ragflow/issues/12675

## Problem

When running RAPTOR tasks, the code assumes all chunks have the vector
field `q_<size>_vec` (e.g., `q_1024_vec`). However, chunks may not have
this field if:
1. They were indexed with a **different embedding model** (different
vector size)
2. The embedding step **failed silently** during initial parsing
3. The document was parsed before the current embedding model was
configured

This caused a crash:
```
KeyError: 'q_1024_vec'
```

## Solution

Added defensive validation in `run_raptor_for_kb()`:

1. **Check for vector field existence** before accessing it
2. **Skip chunks** that don't have the required vector field instead of
crashing
3. **Log warnings** for skipped chunks with actionable guidance
4. **Provide informative error messages** suggesting users re-parse
documents with the current embedding model
5. **Handle both scopes** (`file` and `kb` modes)

## Changes

- `rag/svr/task_executor.py`: Added validation and error handling in
`run_raptor_for_kb()`

## Testing

1. Create a knowledge base with an embedding model
2. Parse documents
3. Change the embedding model to one with a different vector size
4. Run RAPTOR task
5. **Before**: Crashes with `KeyError`
6. **After**: Gracefully skips incompatible chunks with informative
warnings

---

<!-- Gittensor Contribution Tag: @GlobalStar117 -->

Co-authored-by: GlobalStar117 <GlobalStar117@users.noreply.github.com>
											
										
										
											2026-01-20 15:24:20 +11:00
+								            skipped_chunks = 0
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            for d in settings.retriever.chunk_list(doc_id, row["tenant_id"], [str(row["kb_id"])], fields=["content_with_weight", vctr_nm], sort_by_position=True):
-												fix(raptor): handle missing vector fields gracefully (#12713)

## Summary

This PR fixes a `KeyError` crash when running RAPTOR tasks on documents
that don't have the expected vector field.

## Related Issue

Fixes https://github.com/infiniflow/ragflow/issues/12675

## Problem

When running RAPTOR tasks, the code assumes all chunks have the vector
field `q_<size>_vec` (e.g., `q_1024_vec`). However, chunks may not have
this field if:
1. They were indexed with a **different embedding model** (different
vector size)
2. The embedding step **failed silently** during initial parsing
3. The document was parsed before the current embedding model was
configured

This caused a crash:
```
KeyError: 'q_1024_vec'
```

## Solution

Added defensive validation in `run_raptor_for_kb()`:

1. **Check for vector field existence** before accessing it
2. **Skip chunks** that don't have the required vector field instead of
crashing
3. **Log warnings** for skipped chunks with actionable guidance
4. **Provide informative error messages** suggesting users re-parse
documents with the current embedding model
5. **Handle both scopes** (`file` and `kb` modes)

## Changes

- `rag/svr/task_executor.py`: Added validation and error handling in
`run_raptor_for_kb()`

## Testing

1. Create a knowledge base with an embedding model
2. Parse documents
3. Change the embedding model to one with a different vector size
4. Run RAPTOR task
5. **Before**: Crashes with `KeyError`
6. **After**: Gracefully skips incompatible chunks with informative
warnings

---

<!-- Gittensor Contribution Tag: @GlobalStar117 -->

Co-authored-by: GlobalStar117 <GlobalStar117@users.noreply.github.com>
											
										
										
											2026-01-20 15:24:20 +11:00
+								                # Skip chunks that don't have the required vector field (may have been indexed with different embedding model)
 								                if vctr_nm not in d or d[vctr_nm] is None:
 								                    skipped_chunks += 1
 								                    logging.warning(f"RAPTOR: Chunk missing vector field '{vctr_nm}' in doc {doc_id}, skipping")
 								                    continue
-												Fix: waitForResponse component. (#11172)

### What problem does this PR solve?

#10056

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-11 16:58:47 +08:00
+								                chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))
-												Refact: improve task resume mechanism for graphrag (#14096)

### What problem does this PR solve?

Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).

**Changes based on @yuzhichang's review:**

1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.

**Changes based on CodeRabbit review:**

3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.

### Changes

| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |

### How it works

- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.

### Testing

- 10 new unit tests — all passing
- Full existing suite: 617 passed

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
											
										
										
											2026-04-15 15:07:28 +05:30
-												fix(raptor): handle missing vector fields gracefully (#12713)

## Summary

This PR fixes a `KeyError` crash when running RAPTOR tasks on documents
that don't have the expected vector field.

## Related Issue

Fixes https://github.com/infiniflow/ragflow/issues/12675

## Problem

When running RAPTOR tasks, the code assumes all chunks have the vector
field `q_<size>_vec` (e.g., `q_1024_vec`). However, chunks may not have
this field if:
1. They were indexed with a **different embedding model** (different
vector size)
2. The embedding step **failed silently** during initial parsing
3. The document was parsed before the current embedding model was
configured

This caused a crash:
```
KeyError: 'q_1024_vec'
```

## Solution

Added defensive validation in `run_raptor_for_kb()`:

1. **Check for vector field existence** before accessing it
2. **Skip chunks** that don't have the required vector field instead of
crashing
3. **Log warnings** for skipped chunks with actionable guidance
4. **Provide informative error messages** suggesting users re-parse
documents with the current embedding model
5. **Handle both scopes** (`file` and `kb` modes)

## Changes

- `rag/svr/task_executor.py`: Added validation and error handling in
`run_raptor_for_kb()`

## Testing

1. Create a knowledge base with an embedding model
2. Parse documents
3. Change the embedding model to one with a different vector size
4. Run RAPTOR task
5. **Before**: Crashes with `KeyError`
6. **After**: Gracefully skips incompatible chunks with informative
warnings

---

<!-- Gittensor Contribution Tag: @GlobalStar117 -->

Co-authored-by: GlobalStar117 <GlobalStar117@users.noreply.github.com>
											
										
										
											2026-01-20 15:24:20 +11:00
+								            if skipped_chunks > 0:
 								                callback(msg=f"[WARN] Skipped {skipped_chunks} chunks without vector field '{vctr_nm}' for doc {doc_id}. Consider re-parsing the document with the current embedding model.")
-												Refact: improve task resume mechanism for graphrag (#14096)

### What problem does this PR solve?

Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).

**Changes based on @yuzhichang's review:**

1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.

**Changes based on CodeRabbit review:**

3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.

### Changes

| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |

### How it works

- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.

### Testing

- 10 new unit tests — all passing
- Full existing suite: 617 passed

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
											
										
										
											2026-04-15 15:07:28 +05:30
-												fix(raptor): handle missing vector fields gracefully (#12713)

## Summary

This PR fixes a `KeyError` crash when running RAPTOR tasks on documents
that don't have the expected vector field.

## Related Issue

Fixes https://github.com/infiniflow/ragflow/issues/12675

## Problem

When running RAPTOR tasks, the code assumes all chunks have the vector
field `q_<size>_vec` (e.g., `q_1024_vec`). However, chunks may not have
this field if:
1. They were indexed with a **different embedding model** (different
vector size)
2. The embedding step **failed silently** during initial parsing
3. The document was parsed before the current embedding model was
configured

This caused a crash:
```
KeyError: 'q_1024_vec'
```

## Solution

Added defensive validation in `run_raptor_for_kb()`:

1. **Check for vector field existence** before accessing it
2. **Skip chunks** that don't have the required vector field instead of
crashing
3. **Log warnings** for skipped chunks with actionable guidance
4. **Provide informative error messages** suggesting users re-parse
documents with the current embedding model
5. **Handle both scopes** (`file` and `kb` modes)

## Changes

- `rag/svr/task_executor.py`: Added validation and error handling in
`run_raptor_for_kb()`

## Testing

1. Create a knowledge base with an embedding model
2. Parse documents
3. Change the embedding model to one with a different vector size
4. Run RAPTOR task
5. **Before**: Crashes with `KeyError`
6. **After**: Gracefully skips incompatible chunks with informative
warnings

---

<!-- Gittensor Contribution Tag: @GlobalStar117 -->

Co-authored-by: GlobalStar117 <GlobalStar117@users.noreply.github.com>
											
										
										
											2026-01-20 15:24:20 +11:00
+								            if not chunks:
 								                logging.warning(f"RAPTOR: No valid chunks with vectors found for doc {doc_id}")
 								                callback(msg=f"[WARN] No valid chunks with vectors found for doc {doc_id}, skipping")
 								                continue
-												Refact: improve task resume mechanism for graphrag (#14096)

### What problem does this PR solve?

Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).

**Changes based on @yuzhichang's review:**

1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.

**Changes based on CodeRabbit review:**

3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.

### Changes

| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |

### How it works

- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.

### Testing

- 10 new unit tests — all passing
- Full existing suite: 617 passed

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
											
										
										
											2026-04-15 15:07:28 +05:30
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								            before_generate = len(res)
-												Fix: ollama model list issue. (#11175)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-11 19:46:41 +08:00
+								            await generate(chunks, doc_id)
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								            if len(res) > before_generate:
 								                has_file_level_target = True
 								                if existing_methods:
 								                    schedule_raptor_cleanup(doc_id, tree_builder)
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            callback(prog=(x + 1.0) / len(doc_ids))
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
 								        if remove_dataset_summaries:
 								            if has_file_level_target:
 								                schedule_raptor_cleanup(fake_doc_id)
 								            else:
 								                callback(msg="[RAPTOR] kept dataset-level summaries because no file-level summaries were built.")
-												Fix: waitForResponse component. (#11172)

### What problem does this PR solve?

#10056

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-11 16:58:47 +08:00
+								    else:
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								        migrated_file_docs = 0
 								        file_cleanup_doc_ids = []
 								        skipped_doc_ids = set()
 								        for doc_id in set(doc_ids):
 								            if skip_raptor_doc(doc_id):
 								                skipped_doc_ids.add(doc_id)
 								                continue
 								            existing_methods = await get_raptor_chunk_methods(doc_id, row["tenant_id"], row["kb_id"])
 								            if existing_methods:
 								                file_cleanup_doc_ids.append(doc_id)
 								                migrated_file_docs += 1
 								        if migrated_file_docs:
 								            callback(msg=f"[RAPTOR] will remove file-level summaries for {migrated_file_docs} docs after dataset-level build succeeds.")
 								        existing_methods = await get_raptor_chunk_methods(fake_doc_id, row["tenant_id"], row["kb_id"])
 								        if tree_builder in existing_methods:
 								            if existing_methods != {tree_builder}:
 								                schedule_raptor_cleanup(fake_doc_id, tree_builder)
 								                callback(msg="[RAPTOR] will remove old dataset-level RAPTOR summaries after insert.")
 								            for doc_id in file_cleanup_doc_ids:
 								                schedule_raptor_cleanup(doc_id)
 								            callback(msg=f"[RAPTOR] dataset-level {tree_builder} summaries already exist, skipping.")
 								            return res, tk_count, cleanup_raptor_chunks
 								        migrate_dataset_summaries = bool(existing_methods)
 								        if migrate_dataset_summaries:
 								            callback(msg=f"[RAPTOR] will migrate dataset-level RAPTOR summaries to {tree_builder} after insert.")
-												Fix: waitForResponse component. (#11172)

### What problem does this PR solve?

#10056

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-11 16:58:47 +08:00
+								        chunks = []
-												fix(raptor): handle missing vector fields gracefully (#12713)

## Summary

This PR fixes a `KeyError` crash when running RAPTOR tasks on documents
that don't have the expected vector field.

## Related Issue

Fixes https://github.com/infiniflow/ragflow/issues/12675

## Problem

When running RAPTOR tasks, the code assumes all chunks have the vector
field `q_<size>_vec` (e.g., `q_1024_vec`). However, chunks may not have
this field if:
1. They were indexed with a **different embedding model** (different
vector size)
2. The embedding step **failed silently** during initial parsing
3. The document was parsed before the current embedding model was
configured

This caused a crash:
```
KeyError: 'q_1024_vec'
```

## Solution

Added defensive validation in `run_raptor_for_kb()`:

1. **Check for vector field existence** before accessing it
2. **Skip chunks** that don't have the required vector field instead of
crashing
3. **Log warnings** for skipped chunks with actionable guidance
4. **Provide informative error messages** suggesting users re-parse
documents with the current embedding model
5. **Handle both scopes** (`file` and `kb` modes)

## Changes

- `rag/svr/task_executor.py`: Added validation and error handling in
`run_raptor_for_kb()`

## Testing

1. Create a knowledge base with an embedding model
2. Parse documents
3. Change the embedding model to one with a different vector size
4. Run RAPTOR task
5. **Before**: Crashes with `KeyError`
6. **After**: Gracefully skips incompatible chunks with informative
warnings

---

<!-- Gittensor Contribution Tag: @GlobalStar117 -->

Co-authored-by: GlobalStar117 <GlobalStar117@users.noreply.github.com>
											
										
										
											2026-01-20 15:24:20 +11:00
+								        skipped_chunks = 0
-												Fix: waitForResponse component. (#11172)

### What problem does this PR solve?

#10056

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-11 16:58:47 +08:00
+								        for doc_id in doc_ids:
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								            if doc_id in skipped_doc_ids:
 								                continue
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            for d in settings.retriever.chunk_list(doc_id, row["tenant_id"], [str(row["kb_id"])], fields=["content_with_weight", vctr_nm], sort_by_position=True):
-												fix(raptor): handle missing vector fields gracefully (#12713)

## Summary

This PR fixes a `KeyError` crash when running RAPTOR tasks on documents
that don't have the expected vector field.

## Related Issue

Fixes https://github.com/infiniflow/ragflow/issues/12675

## Problem

When running RAPTOR tasks, the code assumes all chunks have the vector
field `q_<size>_vec` (e.g., `q_1024_vec`). However, chunks may not have
this field if:
1. They were indexed with a **different embedding model** (different
vector size)
2. The embedding step **failed silently** during initial parsing
3. The document was parsed before the current embedding model was
configured

This caused a crash:
```
KeyError: 'q_1024_vec'
```

## Solution

Added defensive validation in `run_raptor_for_kb()`:

1. **Check for vector field existence** before accessing it
2. **Skip chunks** that don't have the required vector field instead of
crashing
3. **Log warnings** for skipped chunks with actionable guidance
4. **Provide informative error messages** suggesting users re-parse
documents with the current embedding model
5. **Handle both scopes** (`file` and `kb` modes)

## Changes

- `rag/svr/task_executor.py`: Added validation and error handling in
`run_raptor_for_kb()`

## Testing

1. Create a knowledge base with an embedding model
2. Parse documents
3. Change the embedding model to one with a different vector size
4. Run RAPTOR task
5. **Before**: Crashes with `KeyError`
6. **After**: Gracefully skips incompatible chunks with informative
warnings

---

<!-- Gittensor Contribution Tag: @GlobalStar117 -->

Co-authored-by: GlobalStar117 <GlobalStar117@users.noreply.github.com>
											
										
										
											2026-01-20 15:24:20 +11:00
+								                # Skip chunks that don't have the required vector field
 								                if vctr_nm not in d or d[vctr_nm] is None:
 								                    skipped_chunks += 1
 								                    logging.warning(f"RAPTOR: Chunk missing vector field '{vctr_nm}' in doc {doc_id}, skipping")
 								                    continue
-												Fix: waitForResponse component. (#11172)

### What problem does this PR solve?

#10056

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-11 16:58:47 +08:00
+								                chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))
-												fix(raptor): handle missing vector fields gracefully (#12713)

## Summary

This PR fixes a `KeyError` crash when running RAPTOR tasks on documents
that don't have the expected vector field.

## Related Issue

Fixes https://github.com/infiniflow/ragflow/issues/12675

## Problem

When running RAPTOR tasks, the code assumes all chunks have the vector
field `q_<size>_vec` (e.g., `q_1024_vec`). However, chunks may not have
this field if:
1. They were indexed with a **different embedding model** (different
vector size)
2. The embedding step **failed silently** during initial parsing
3. The document was parsed before the current embedding model was
configured

This caused a crash:
```
KeyError: 'q_1024_vec'
```

## Solution

Added defensive validation in `run_raptor_for_kb()`:

1. **Check for vector field existence** before accessing it
2. **Skip chunks** that don't have the required vector field instead of
crashing
3. **Log warnings** for skipped chunks with actionable guidance
4. **Provide informative error messages** suggesting users re-parse
documents with the current embedding model
5. **Handle both scopes** (`file` and `kb` modes)

## Changes

- `rag/svr/task_executor.py`: Added validation and error handling in
`run_raptor_for_kb()`

## Testing

1. Create a knowledge base with an embedding model
2. Parse documents
3. Change the embedding model to one with a different vector size
4. Run RAPTOR task
5. **Before**: Crashes with `KeyError`
6. **After**: Gracefully skips incompatible chunks with informative
warnings

---

<!-- Gittensor Contribution Tag: @GlobalStar117 -->

Co-authored-by: GlobalStar117 <GlobalStar117@users.noreply.github.com>
											
										
										
											2026-01-20 15:24:20 +11:00
+								        if skipped_chunks > 0:
 								            callback(msg=f"[WARN] Skipped {skipped_chunks} chunks without vector field '{vctr_nm}'. Consider re-parsing documents with the current embedding model.")
 								        if not chunks:
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								            if skipped_doc_ids and len(skipped_doc_ids) == len(set(doc_ids)):
 								                callback(msg="[RAPTOR] all documents were skipped by RAPTOR auto-disable rules.")
 								                return res, tk_count, cleanup_raptor_chunks
-												fix(raptor): handle missing vector fields gracefully (#12713)

## Summary

This PR fixes a `KeyError` crash when running RAPTOR tasks on documents
that don't have the expected vector field.

## Related Issue

Fixes https://github.com/infiniflow/ragflow/issues/12675

## Problem

When running RAPTOR tasks, the code assumes all chunks have the vector
field `q_<size>_vec` (e.g., `q_1024_vec`). However, chunks may not have
this field if:
1. They were indexed with a **different embedding model** (different
vector size)
2. The embedding step **failed silently** during initial parsing
3. The document was parsed before the current embedding model was
configured

This caused a crash:
```
KeyError: 'q_1024_vec'
```

## Solution

Added defensive validation in `run_raptor_for_kb()`:

1. **Check for vector field existence** before accessing it
2. **Skip chunks** that don't have the required vector field instead of
crashing
3. **Log warnings** for skipped chunks with actionable guidance
4. **Provide informative error messages** suggesting users re-parse
documents with the current embedding model
5. **Handle both scopes** (`file` and `kb` modes)

## Changes

- `rag/svr/task_executor.py`: Added validation and error handling in
`run_raptor_for_kb()`

## Testing

1. Create a knowledge base with an embedding model
2. Parse documents
3. Change the embedding model to one with a different vector size
4. Run RAPTOR task
5. **Before**: Crashes with `KeyError`
6. **After**: Gracefully skips incompatible chunks with informative
warnings

---

<!-- Gittensor Contribution Tag: @GlobalStar117 -->

Co-authored-by: GlobalStar117 <GlobalStar117@users.noreply.github.com>
											
										
										
											2026-01-20 15:24:20 +11:00
+								            logging.error(f"RAPTOR: No valid chunks with vectors found in any document for kb {row['kb_id']}")
 								            callback(msg=f"[ERROR] No valid chunks with vectors found. Please ensure documents are parsed with the current embedding model (vector size: {vector_size}).")
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								            return res, tk_count, cleanup_raptor_chunks
-												fix(raptor): handle missing vector fields gracefully (#12713)

## Summary

This PR fixes a `KeyError` crash when running RAPTOR tasks on documents
that don't have the expected vector field.

## Related Issue

Fixes https://github.com/infiniflow/ragflow/issues/12675

## Problem

When running RAPTOR tasks, the code assumes all chunks have the vector
field `q_<size>_vec` (e.g., `q_1024_vec`). However, chunks may not have
this field if:
1. They were indexed with a **different embedding model** (different
vector size)
2. The embedding step **failed silently** during initial parsing
3. The document was parsed before the current embedding model was
configured

This caused a crash:
```
KeyError: 'q_1024_vec'
```

## Solution

Added defensive validation in `run_raptor_for_kb()`:

1. **Check for vector field existence** before accessing it
2. **Skip chunks** that don't have the required vector field instead of
crashing
3. **Log warnings** for skipped chunks with actionable guidance
4. **Provide informative error messages** suggesting users re-parse
documents with the current embedding model
5. **Handle both scopes** (`file` and `kb` modes)

## Changes

- `rag/svr/task_executor.py`: Added validation and error handling in
`run_raptor_for_kb()`

## Testing

1. Create a knowledge base with an embedding model
2. Parse documents
3. Change the embedding model to one with a different vector size
4. Run RAPTOR task
5. **Before**: Crashes with `KeyError`
6. **After**: Gracefully skips incompatible chunks with informative
warnings

---

<!-- Gittensor Contribution Tag: @GlobalStar117 -->

Co-authored-by: GlobalStar117 <GlobalStar117@users.noreply.github.com>
											
										
										
											2026-01-20 15:24:20 +11:00
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								        before_generate = len(res)
-												Fix: ollama model list issue. (#11175)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-11 19:46:41 +08:00
+								        await generate(chunks, fake_doc_id)
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								        if len(res) > before_generate:
 								            for doc_id in file_cleanup_doc_ids:
 								                schedule_raptor_cleanup(doc_id)
 								            if migrate_dataset_summaries:
 								                schedule_raptor_cleanup(fake_doc_id, tree_builder)
-												Fix: waitForResponse component. (#11172)

### What problem does this PR solve?

#10056

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-11 16:58:47 +08:00
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								    return res, tk_count, cleanup_raptor_chunks
-												Light GraphRAG (#4585)

### What problem does this PR solve?

#4543

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-22 19:43:14 +08:00
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								async def delete_image(kb_id, chunk_id):
 								    try:
 								        async with minio_limiter:
-												Move api.settings to common.settings (#11036)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-06 09:36:38 +08:00
+								            settings.STORAGE_IMPL.delete(kb_id, chunk_id)
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								    except Exception:
 								        logging.exception(f"Deleting image of chunk {chunk_id} got exception")
 								        raise
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								@timed_with_recording
-												Add dataset with table parser type for Infinity and answer question in chat using SQL (#12541)

### What problem does this PR solve?

1) Create  dataset using table parser for infinity
2) Answer questions in chat using SQL

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-01-19 19:35:14 +08:00
+								async def insert_chunks(task_id, task_tenant_id, task_dataset_id, chunks, progress_callback):
 								    """
 								    Insert chunks into document store (Elasticsearch OR Infinity).
 								    Args:
 								        task_id: Task identifier
 								        task_tenant_id: Tenant ID
 								        task_dataset_id: Dataset/knowledge base ID
 								        chunks: List of chunk dictionaries to insert
 								        progress_callback: Callback function for progress updates
 								    """
-												Feat: add child parent chunking method in backend. (#11598)

### What problem does this PR solve?

#7996

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-28 19:25:32 +08:00
+								    mothers = []
 								    mother_ids = set([])
 								    for ck in chunks:
 								        mom = ck.get("mom") or ck.get("mom_with_weight") or ""
 								        if not mom:
 								            continue
 								        id = xxhash.xxh64(mom.encode("utf-8")).hexdigest()
-												Fix: parent-child chunking method (#11810)

### What problem does this PR solve?

change:
parent-child chunking method

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-09 09:34:01 +08:00
+								        ck["mom_id"] = id
-												Feat: add child parent chunking method in backend. (#11598)

### What problem does this PR solve?

#7996

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-28 19:25:32 +08:00
+								        if id in mother_ids:
 								            continue
 								        mother_ids.add(id)
 								        mom_ck = copy.deepcopy(ck)
 								        mom_ck["id"] = id
 								        mom_ck["content_with_weight"] = mom
 								        mom_ck["available_int"] = 0
 								        flds = list(mom_ck.keys())
 								        for fld in flds:
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            if fld not in ["id", "content_with_weight", "doc_id", "docnm_kwd", "kb_id", "available_int", "position_int", "create_timestamp_flt", "page_num_int", "top_int"]:
-												Feat: add child parent chunking method in backend. (#11598)

### What problem does this PR solve?

#7996

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-28 19:25:32 +08:00
+								                del mom_ck[fld]
 								        mothers.append(mom_ck)
 								    for b in range(0, len(mothers), settings.DOC_BULK_SIZE):
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        ret = await thread_pool_exec(
 								            settings.docStoreConn.insert,
 								            mothers[b : b + settings.DOC_BULK_SIZE],
 								            search.index_name(task_tenant_id),
 								            task_dataset_id,
 								        )
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        get_recording_context().save_func_return_value("docStoreConn.insert", ret)
-												Feat: add child parent chunking method in backend. (#11598)

### What problem does this PR solve?

#7996

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-28 19:25:32 +08:00
+								        task_canceled = has_canceled(task_id)
 								        if task_canceled:
 								            progress_callback(-1, msg="Task has been canceled.")
 								            return False
-												Move api.settings to common.settings (#11036)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-06 09:36:38 +08:00
+								    for b in range(0, len(chunks), settings.DOC_BULK_SIZE):
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        doc_store_result = await thread_pool_exec(
 								            settings.docStoreConn.insert,
 								            chunks[b : b + settings.DOC_BULK_SIZE],
 								            search.index_name(task_tenant_id),
 								            task_dataset_id,
 								        )
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        get_recording_context().save_func_return_value("docStoreConn.insert", doc_store_result)
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								        task_canceled = has_canceled(task_id)
 								        if task_canceled:
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								            # Roll back partial RAPTOR summary inserts so the next run is not
 								            # mistaken for a completed checkpoint by get_raptor_chunk_methods.
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            raptor_ids_to_rollback = [c["id"] for c in chunks[: b + settings.DOC_BULK_SIZE] if c.get("raptor_kwd") == "raptor"]
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								            if raptor_ids_to_rollback:
 								                try:
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								                    ret = await thread_pool_exec(
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								                        settings.docStoreConn.delete,
 								                        {"id": raptor_ids_to_rollback},
 								                        search.index_name(task_tenant_id),
 								                        task_dataset_id,
 								                    )
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								                    get_recording_context().save_func_return_value("docStoreConn.delete", ret)
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								                    logging.info(
 								                        "insert_chunks: rolled back %d partial RAPTOR chunks after cancellation (task=%s)",
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								                        len(raptor_ids_to_rollback),
 								                        task_id,
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								                    )
 								                except Exception:
 								                    logging.exception(
 								                        "insert_chunks: failed to roll back partial RAPTOR chunks after cancellation (task=%s)",
 								                        task_id,
 								                    )
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								            progress_callback(-1, msg="Task has been canceled.")
-												Refactor function name (#11210)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-12 19:00:15 +08:00
+								            return False
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								        if b % 128 == 0:
 								            progress_callback(prog=0.8 + 0.1 * (b + 1) / len(chunks), msg="")
 								        if doc_store_result:
 								            error_message = f"Insert chunk error: {doc_store_result}, please check log file and Elasticsearch/Infinity status!"
 								            progress_callback(-1, msg=error_message)
 								            raise Exception(error_message)
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        chunk_ids = [chunk["id"] for chunk in chunks[: b + settings.DOC_BULK_SIZE]]
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								        chunk_ids_str = " ".join(chunk_ids)
 								        try:
 								            TaskService.update_chunk_ids(task_id, chunk_ids_str)
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								            get_recording_context().save_func_return_value("TaskService.update_chunk_ids", None)
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								        except DoesNotExist:
 								            logging.warning(f"do_handle_task update_chunk_ids failed since task {task_id} is unknown.")
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            doc_store_result = await thread_pool_exec(
 								                settings.docStoreConn.delete,
 								                {"id": chunk_ids},
 								                search.index_name(task_tenant_id),
 								                task_dataset_id,
 								            )
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								            get_recording_context().save_func_return_value("docStoreConn.delete", doc_store_result)
-												Refa:replace trio with asyncio (#11831)

### What problem does this PR solve?

change:
replace trio with asyncio

### Type of change
- [x] Refactoring
											
										
										
											2025-12-09 19:23:14 +08:00
+								            tasks = []
 								            for chunk_id in chunk_ids:
 								                tasks.append(asyncio.create_task(delete_image(task_dataset_id, chunk_id)))
 								            try:
 								                await asyncio.gather(*tasks, return_exceptions=False)
 								            except Exception as e:
 								                logging.error(f"delete_image failed: {e}")
 								                for t in tasks:
 								                    t.cancel()
 								                await asyncio.gather(*tasks, return_exceptions=True)
 								                raise
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								            progress_callback(-1, msg=f"Chunk updates failed since task {task_id} is unknown.")
-												Refactor function name (#11210)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-12 19:00:15 +08:00
+								            return False
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								    return True
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								@timeout(60 * 60 * 3, 1)
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								async def do_handle_task(task):
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								    task_type = task.get("task_type", "")
-												Fix: use async task to save memory (#12308)

### What problem does this PR solve?

Use async task to save memory.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-30 11:41:38 +08:00
+								    if task_type == "memory":
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        result = await handle_save_to_memory_task(task)
 								        get_recording_context().save_func_return_value("handle_save_to_memory_task", result)
-												Fix: use async task to save memory (#12308)

### What problem does this PR solve?

Use async task to save memory.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-30 11:41:38 +08:00
+								        return
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								    if task_type == "dataflow" and task.get("doc_id", "") == CANVAS_DEBUG_DOC_ID:
 								        await run_dataflow(task)
 								        return
-												Update file parsing progress info (#3780)

### What problem does this PR solve?

Refine the file parsing progress info

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 17:03:00 +08:00
+								    task_id = task["id"]
 								    task_from_page = task["from_page"]
 								    task_to_page = task["to_page"]
 								    task_tenant_id = task["tenant_id"]
 								    task_embedding_id = task["embd_id"]
-												Fix: guard missing task language (#15136)

### What problem does this PR solve?

guard missing task language

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-05-22 11:46:38 +08:00
+								    task_language = task.get("language") or "Chinese"
 								    if not task.get("language"):
 								        logging.warning("Task %s has no language set, falling back to Chinese", task_id)
-												Fix: correct llm_id for graphrag (#13032)

### What problem does this PR solve?

Fix: correct llm_id for graphrag #13030

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-02-06 14:05:32 +08:00
+								    doc_task_llm_id = task["parser_config"].get("llm_id") or task["llm_id"]
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								    kb_task_llm_id = task["kb_parser_config"].get("llm_id") or task["llm_id"]
 								    task["llm_id"] = kb_task_llm_id
-												Update file parsing progress info (#3780)

### What problem does this PR solve?

Refine the file parsing progress info

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 17:03:00 +08:00
+								    task_dataset_id = task["kb_id"]
 								    task_doc_id = task["doc_id"]
 								    task_document_name = task["name"]
 								    task_parser_config = task["parser_config"]
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								    task_start_ts = timer()
-												Feat: let toc run asynchronizly... (#10513)

### What problem does this PR solve?

#10436 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-10-14 14:14:52 +08:00
+								    toc_thread = None
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								    raptor_cleanup_chunks = []
-												Update file parsing progress info (#3780)

### What problem does this PR solve?

Refine the file parsing progress info

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 17:03:00 +08:00
 								    # prepare the progress callback function
 								    progress_callback = partial(set_progress, task_id, task_from_page, task_to_page)
-												Try to reuse existing chunks (#3983)

### What problem does this PR solve?

Try to reuse existing chunks. Close #3793
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-12 16:38:03 +08:00
-												Pref: use redis to check if canceled. (#8853)

### What problem does this PR solve?

### Type of change

- [x] Performance Improvement
											
										
										
											2025-07-15 17:19:27 +08:00
+								    task_canceled = has_canceled(task_id)
-												Try to reuse existing chunks (#3983)

### What problem does this PR solve?

Try to reuse existing chunks. Close #3793
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-12 16:38:03 +08:00
+								    if task_canceled:
 								        progress_callback(-1, msg="Task has been canceled.")
 								        return
-												handle_task catch all exception (#3441)

### What problem does this PR solve?

handle_task catch all exception
Report heartbeats

### Type of change

- [x] Refactoring
											
										
										
											2024-11-15 18:51:09 +08:00
+								    try:
-												Update file parsing progress info (#3780)

### What problem does this PR solve?

Refine the file parsing progress info

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 17:03:00 +08:00
+								        # bind embedding model
-												Feat/tenant model (#13072)

### What problem does this PR solve?

Add id for table tenant_llm and apply in LLMBundle.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
Co-authored-by: Liu An <asiro@qq.com>
											
										
										
											2026-03-05 17:27:17 +08:00
+								        if task_embedding_id:
-												Feat: tenant llm provider (#14595)

### What problem does this PR solve?

Python implementation of the Go-based model_provider API suite.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: bill <yibie_jingnian@163.com>
											
										
										
											2026-05-29 17:39:41 +08:00
+								            embd_model_config = get_model_config_from_provider_instance(task_tenant_id, LLMType.EMBEDDING, task_embedding_id)
-												Feat/tenant model (#13072)

### What problem does this PR solve?

Add id for table tenant_llm and apply in LLMBundle.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
Co-authored-by: Liu An <asiro@qq.com>
											
										
										
											2026-03-05 17:27:17 +08:00
+								        else:
 								            embd_model_config = get_tenant_default_model_by_type(task_tenant_id, LLMType.EMBEDDING)
 								        embedding_model = LLMBundle(task_tenant_id, embd_model_config, lang=task_language)
-												Refine error message while embedding model error, (#5490)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
											
										
										
											2025-02-28 17:52:38 +08:00
+								        vts, _ = embedding_model.encode(["ok"])
 								        vector_size = len(vts[0])
-												handle_task catch all exception (#3441)

### What problem does this PR solve?

handle_task catch all exception
Report heartbeats

### Type of change

- [x] Refactoring
											
										
										
											2024-11-15 18:51:09 +08:00
+								    except Exception as e:
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        error_message = f"Fail to bind embedding model: {str(e)}"
-												Refactor parse progress (#3781)

### What problem does this PR solve?

Refactor parse file progress

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 22:28:00 +08:00
+								        progress_callback(-1, msg=error_message)
 								        logging.exception(error_message)
-												handle_task catch all exception (#3441)

### What problem does this PR solve?

handle_task catch all exception
Report heartbeats

### Type of change

- [x] Refactoring
											
										
										
											2024-11-15 18:51:09 +08:00
+								        raise
-												Update file parsing progress info (#3780)

### What problem does this PR solve?

Refine the file parsing progress info

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 17:03:00 +08:00
-												Light GraphRAG (#4585)

### What problem does this PR solve?

#4543

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-22 19:43:14 +08:00
+								    init_kb(task, vector_size)
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								    if task_type[: len("dataflow")] == "dataflow":
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								        await run_dataflow(task)
-												Feat: refine dataflow and initialize dataflow app (#9952)

### What problem does this PR solve?

Refine dataflow and initialize dataflow app.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-09-05 18:50:46 +08:00
+								        return
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
 								    if task_type == "raptor":
 								        ok, kb = KnowledgebaseService.get_by_id(task_dataset_id)
 								        if not ok:
-												Change knowledge base to dataset (#11976)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-17 10:03:33 +08:00
+								            progress_callback(prog=-1.0, msg="Cannot found valid dataset for RAPTOR task")
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								            return
 								        kb_parser_config = kb.parser_config
 								        if not kb_parser_config.get("raptor", {}).get("use_raptor", False):
-												Fix: maintain backward compatibility for KB tasks (#10508)

### What problem does this PR solve?

Maintain backward compatibility for KB tasks

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-10-13 11:53:48 +08:00
+								            kb_parser_config.update(
 								                {
 								                    "raptor": {
 								                        "use_raptor": True,
 								                        "prompt": "Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n      {cluster_content}\nThe above is the content you need to summarize.",
 								                        "max_token": 256,
 								                        "threshold": 0.1,
 								                        "max_cluster": 64,
 								                        "random_seed": 0,
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								                        "scope": "file",
 								                        "clustering_method": "gmm",
 								                        "tree_builder": "raptor",
-												Fix: maintain backward compatibility for KB tasks (#10508)

### What problem does this PR solve?

Maintain backward compatibility for KB tasks

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-10-13 11:53:48 +08:00
+								                    },
 								                }
 								            )
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								            update_result = KnowledgebaseService.update_by_id(kb.id, {"parser_config": kb_parser_config})
 								            get_recording_context().save_func_return_value("KnowledgebaseService.update_by_id", update_result)
 								            if not update_result:
-												Fix: maintain backward compatibility for KB tasks (#10508)

### What problem does this PR solve?

Maintain backward compatibility for KB tasks

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-10-13 11:53:48 +08:00
+								                progress_callback(prog=-1.0, msg="Internal error: Invalid RAPTOR configuration")
 								                return
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								        # bind LLM for raptor
-												Feat: tenant llm provider (#14595)

### What problem does this PR solve?

Python implementation of the Go-based model_provider API suite.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: bill <yibie_jingnian@163.com>
											
										
										
											2026-05-29 17:39:41 +08:00
+								        chat_model_config = get_model_config_from_provider_instance(task_tenant_id, LLMType.CHAT, kb_task_llm_id)
-												Feat/tenant model (#13072)

### What problem does this PR solve?

Add id for table tenant_llm and apply in LLMBundle.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
Co-authored-by: Liu An <asiro@qq.com>
											
										
										
											2026-03-05 17:27:17 +08:00
+								        chat_model = LLMBundle(task_tenant_id, chat_model_config, lang=task_language)
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								        # run RAPTOR
-												Fix: raptor overloading (#7889)

### What problem does this PR solve?

#7840

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-05-27 17:41:35 +08:00
+								        async with kg_limiter:
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								            chunks, token_count, raptor_cleanup_chunks = await run_raptor_for_kb(
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								                row=task,
 								                kb_parser_config=kb_parser_config,
 								                chat_mdl=chat_model,
 								                embd_mdl=embedding_model,
 								                vector_size=vector_size,
 								                callback=progress_callback,
 								                doc_ids=task.get("doc_ids", []),
 								            )
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        get_recording_context().record("raptor_chunks", chunks)
 								        get_recording_context().record("raptor_token_count", token_count)
-												Feat: RAPTOR handle cancel gracefully (#11074)

### What problem does this PR solve?

RAPTOR handle cancel gracefully.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-06 17:18:03 +08:00
+								        if fake_doc_ids := task.get("doc_ids", []):
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								            task_doc_id = fake_doc_ids[0]  # use the first document ID to represent this task for logging purposes
-												Light GraphRAG (#4585)

### What problem does this PR solve?

#4543

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-22 19:43:14 +08:00
+								    # Either using graphrag or Standard chunking methods
-												Feat: refine dataflow and initialize dataflow app (#9952)

### What problem does this PR solve?

Refine dataflow and initialize dataflow app.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-09-05 18:50:46 +08:00
+								    elif task_type == "graphrag":
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								        ok, kb = KnowledgebaseService.get_by_id(task_dataset_id)
 								        if not ok:
-												Change knowledge base to dataset (#11976)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-17 10:03:33 +08:00
+								            progress_callback(prog=-1.0, msg="Cannot found valid dataset for GraphRAG task")
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								            return
 								        kb_parser_config = kb.parser_config
 								        if not kb_parser_config.get("graphrag", {}).get("use_graphrag", False):
-												Fix: maintain backward compatibility for KB tasks (#10508)

### What problem does this PR solve?

Maintain backward compatibility for KB tasks

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-10-13 11:53:48 +08:00
+								            kb_parser_config.update(
 								                {
 								                    "graphrag": {
 								                        "use_graphrag": True,
 								                        "entity_types": [
 								                            "organization",
 								                            "person",
 								                            "geo",
 								                            "event",
 								                            "category",
 								                        ],
 								                        "method": "light",
-												Refactor: enhance graphrag - part 2 (#14972)

### What problem does this PR solve?
1. expose batch_chunk_token_size for configuration
2. retrieve chunks when build subgraph for the doc, not retreive all
docs chunks at the begining
3. get all chunks for a document, used to be hard coded 10000
4. delete not used method run_graphrag

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring

Follow on: #14617
											
										
										
											2026-05-18 16:10:21 +08:00
+								                        "batch_chunk_token_size": 4096,
-												Refactor: enahnce retry and timeout (#14983)

### What problem does this PR solve?

1. Enhance retry and timeout, and adjust the default timeout
2. NER: spacy do not batch chunks
3. extract _has_cancel_and_exit
4. enhance log messages

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
											
										
										
											2026-05-22 13:16:39 +08:00
+								                        "retry_attempts": 2,
 								                        "retry_backoff_seconds": 2.0,
 								                        "retry_backoff_max_seconds": 60.0,
 								                        "build_subgraph_timeout_per_chunk_seconds": 300,
 								                        "build_subgraph_min_timeout_seconds": 600,
 								                        "merge_timeout_seconds": 180,
 								                        "resolution_timeout_seconds": 1800,
 								                        "community_timeout_seconds": 1800,
 								                        "lock_acquire_timeout_seconds": 600,
-												Fix: maintain backward compatibility for KB tasks (#10508)

### What problem does this PR solve?

Maintain backward compatibility for KB tasks

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-10-13 11:53:48 +08:00
+								                    }
 								                }
 								            )
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								            update_result = KnowledgebaseService.update_by_id(kb.id, {"parser_config": kb_parser_config})
 								            get_recording_context().save_func_return_value("KnowledgebaseService.update_by_id", update_result)
 								            if not update_result:
-												Fix: maintain backward compatibility for KB tasks (#10508)

### What problem does this PR solve?

Maintain backward compatibility for KB tasks

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-10-13 11:53:48 +08:00
+								                progress_callback(prog=-1.0, msg="Internal error: Invalid GraphRAG configuration")
 								                return
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								        graphrag_conf = kb_parser_config.get("graphrag", {})
-												Light GraphRAG (#4585)

### What problem does this PR solve?

#4543

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-22 19:43:14 +08:00
+								        start_ts = timer()
-												Feat: tenant llm provider (#14595)

### What problem does this PR solve?

Python implementation of the Go-based model_provider API suite.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: bill <yibie_jingnian@163.com>
											
										
										
											2026-05-29 17:39:41 +08:00
+								        chat_model_config = get_model_config_from_provider_instance(task_tenant_id, LLMType.CHAT, kb_task_llm_id)
-												Feat/tenant model (#13072)

### What problem does this PR solve?

Add id for table tenant_llm and apply in LLMBundle.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
Co-authored-by: Liu An <asiro@qq.com>
											
										
										
											2026-03-05 17:27:17 +08:00
+								        chat_model = LLMBundle(task_tenant_id, chat_model_config, lang=task_language)
-												Refactor graphrag to remove redis lock (#5828)

### What problem does this PR solve?

Refactor graphrag to remove redis lock

### Type of change

- [x] Refactoring
											
										
										
											2025-03-10 15:15:06 +08:00
+								        with_resolution = graphrag_conf.get("resolution", False)
 								        with_community = graphrag_conf.get("community", False)
-												Fix: task limiter issue. (#7873)

### What problem does this PR solve?

#7869

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-05-27 11:16:29 +08:00
+								        async with kg_limiter:
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								            # await run_graphrag(task, task_language, with_resolution, with_community, chat_model, embedding_model, progress_callback)
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            from rag.graphrag.general.index import run_graphrag_for_kb  # Lazy load, save around 2s
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								            result = await run_graphrag_for_kb(
 								                row=task,
 								                doc_ids=task.get("doc_ids", []),
 								                language=task_language,
 								                kb_parser_config=kb_parser_config,
 								                chat_model=chat_model,
 								                embedding_model=embedding_model,
 								                callback=progress_callback,
 								                with_resolution=with_resolution,
 								                with_community=with_community,
 								            )
 								            logging.info(f"GraphRAG task result for task {task}:\n{result}")
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        get_recording_context().record("graphrag_result", result)
-												Refactor graphrag to remove redis lock (#5828)

### What problem does this PR solve?

Refactor graphrag to remove redis lock

### Type of change

- [x] Refactoring
											
										
										
											2025-03-10 15:15:06 +08:00
+								        progress_callback(prog=1.0, msg="Knowledge Graph done ({:.2f}s)".format(timer() - start_ts))
-												Light GraphRAG (#4585)

### What problem does this PR solve?

#4543

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-22 19:43:14 +08:00
+								        return
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								    elif task_type == "mindmap":
 								        progress_callback(1, "place holder")
 								        pass
 								        return
-												Feat: Add knowledge compilation workflows (#16515)

## Summary
- Add knowledge compilation template APIs, services, and builtin
template seed data
- Add advanced knowledge compile structure/artifact/RAPTOR workflow
support
- Update parsing, dataset/document APIs, and supporting services for
compilation workflows
											
										
										
											2026-07-02 23:22:07 +08:00
+								    elif task_type == "skill":
 								        progress_callback(-1, "Skill generation requires the refactored task executor (TE_RUN_MODE=0).")
 								        return
-												handle_task catch all exception (#3441)

### What problem does this PR solve?

handle_task catch all exception
Report heartbeats

### Type of change

- [x] Refactoring
											
										
										
											2024-11-15 18:51:09 +08:00
+								    else:
-												Update file parsing progress info (#3780)

### What problem does this PR solve?

Refine the file parsing progress info

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 17:03:00 +08:00
+								        # Standard chunking methods
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        task["llm_id"] = doc_task_llm_id
-												Update file parsing progress info (#3780)

### What problem does this PR solve?

Refine the file parsing progress info

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 17:03:00 +08:00
+								        start_ts = timer()
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								        chunks = await build_chunks(task, progress_callback)
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        get_recording_context().record("chunks", chunks)
 								        # Record chunk_ids_count for comparison
 								        chunk_ids = [c.get("id") for c in chunks if isinstance(c, dict) and "id" in c]
 								        get_recording_context().record("chunk_ids_count", len(chunk_ids))
 								        # Record chunks array for content comparison (first, middle, last, random)
-												Update file parsing progress info (#3780)

### What problem does this PR solve?

Refine the file parsing progress info

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 17:03:00 +08:00
+								        logging.info("Build document {}: {:.2f}s".format(task_document_name, timer() - start_ts))
 								        if not chunks:
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            progress_callback(1.0, msg=f"No chunk built from {task_document_name}")
-												handle_task catch all exception (#3441)

### What problem does this PR solve?

handle_task catch all exception
Report heartbeats

### Type of change

- [x] Refactoring
											
										
										
											2024-11-15 18:51:09 +08:00
+								            return
-												Update file parsing progress info (#3780)

### What problem does this PR solve?

Refine the file parsing progress info

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 17:03:00 +08:00
+								        progress_callback(msg="Generate {} chunks".format(len(chunks)))
 								        start_ts = timer()
-												handle_task catch all exception (#3441)

### What problem does this PR solve?

handle_task catch all exception
Report heartbeats

### Type of change

- [x] Refactoring
											
										
										
											2024-11-15 18:51:09 +08:00
+								        try:
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								            token_count, vector_size = await embedding(chunks, embedding_model, task_parser_config, progress_callback)
-												Fix: task cancel (#13034)

### What problem does this PR solve?

Fix: task cancel #11745 
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-02-06 14:48:24 +08:00
+								        except TaskCanceledException:
 								            raise
-												handle_task catch all exception (#3441)

### What problem does this PR solve?

handle_task catch all exception
Report heartbeats

### Type of change

- [x] Refactoring
											
										
										
											2024-11-15 18:51:09 +08:00
+								        except Exception as e:
-												Refactor parse progress (#3781)

### What problem does this PR solve?

Refactor parse file progress

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 22:28:00 +08:00
+								            error_message = "Generate embedding error:{}".format(str(e))
 								            progress_callback(-1, error_message)
 								            logging.exception(error_message)
 								            token_count = 0
-												handle_task catch all exception (#3441)

### What problem does this PR solve?

handle_task catch all exception
Report heartbeats

### Type of change

- [x] Refactoring
											
										
										
											2024-11-15 18:51:09 +08:00
+								            raise
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        get_recording_context().record("token_count", token_count)
 								        get_recording_context().record("vector_size", vector_size)
-												Refactor parse progress (#3781)

### What problem does this PR solve?

Refactor parse file progress

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 22:28:00 +08:00
+								        progress_message = "Embedding chunks ({:.2f}s)".format(timer() - start_ts)
 								        logging.info(progress_message)
 								        progress_callback(msg=progress_message)
-												Feat: Add knowledge compilation workflows (#16515)

## Summary
- Add knowledge compilation template APIs, services, and builtin
template seed data
- Add advanced knowledge compile structure/artifact/RAPTOR workflow
support
- Update parsing, dataset/document APIs, and supporting services for
compilation workflows
											
										
										
											2026-07-02 23:22:07 +08:00
-												Feat: let toc run asynchronizly... (#10513)

### What problem does this PR solve?

#10436 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-10-14 14:14:52 +08:00
+								        if task["parser_id"].lower() == "naive" and task["parser_config"].get("toc_extraction", False):
-												Refactor: tidy up ThreadPoolExecutor lifecycle in file_service and task executor (#14668)

## Summary
- Wrap the `ThreadPoolExecutor` instances in `FileService.parse_docs`
and `FileService.get_files` with `with ... as exe:` blocks for
deterministic cleanup
- Replace the `concurrent.futures.ThreadPoolExecutor` in
`do_handle_task` with `asyncio.create_task(asyncio.to_thread(build_TOC,
...))`, preserving the existing parallelism with chunk insertion while
leveraging the surrounding async context
- Drop the now-unused `import concurrent` and the
`executor.shutdown(wait=False)` call in the `finally` block

Closes #14622.

No behavioral change, no public API change. Net diff: ~19 insertions /
25 deletions across two files.

## Test plan
- [ ] `uv run ruff check api/db/services/file_service.py
rag/svr/task_executor.py` passes
- [ ] Upload a multi-file batch through the chat/file endpoint and
confirm `FileService.parse_docs` still returns combined parsed text
- [ ] Trigger `FileService.get_files` via the chat reference flow with a
mix of image and non-image files; verify both `raw=True` and `raw=False`
paths return correctly
- [ ] Run a `naive`-parser document task with `toc_extraction: true` and
confirm the TOC chunk is generated and inserted exactly as before
- [ ] Run a `naive`-parser document task with `toc_extraction: false`
and confirm the path with `toc_thread = None` is unaffected
- [ ] Cancel a running task to exercise the `finally` block and confirm
cleanup still works without the executor shutdown call

---------

Co-authored-by: web-dev0521 <jasonpette1783@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
											
										
										
											2026-05-11 00:59:00 -04:00
+								            toc_thread = asyncio.create_task(asyncio.to_thread(build_TOC, task, chunks, progress_callback))
-												Try to reuse existing chunks (#3983)

### What problem does this PR solve?

Try to reuse existing chunks. Close #3793
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-12 16:38:03 +08:00
-												Update file parsing progress info (#3780)

### What problem does this PR solve?

Refine the file parsing progress info

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 17:03:00 +08:00
+								    chunk_count = len(set([chunk["id"] for chunk in chunks]))
 								    start_ts = timer()
-												Fix: fixed invalid save() arguments for slide thumbnails (#8851)

### What problem does this PR solve?

Fixed invalid save() arguments for slide thumbnails.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-07-15 17:19:45 +08:00
-												Add dataset with table parser type for Infinity and answer question in chat using SQL (#12541)

### What problem does this PR solve?

1) Create  dataset using table parser for infinity
2) Answer questions in chat using SQL

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-01-19 19:35:14 +08:00
+								    async def _maybe_insert_chunks(_chunks):
-												Fix: fix task cancel (#12093)

### What problem does this PR solve?

Fix: fix task cancel

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-23 09:38:25 +08:00
+								        if has_canceled(task_id):
-												Fix: task cancel (#13034)

### What problem does this PR solve?

Fix: task cancel #11745 
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-02-06 14:48:24 +08:00
+								            progress_callback(-1, msg="Task has been canceled.")
 								            return False
-												Add dataset with table parser type for Infinity and answer question in chat using SQL (#12541)

### What problem does this PR solve?

1) Create  dataset using table parser for infinity
2) Answer questions in chat using SQL

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-01-19 19:35:14 +08:00
+								        insert_result = await insert_chunks(task_id, task_tenant_id, task_dataset_id, _chunks, progress_callback)
-												Refactor code (#12305)

### What problem does this PR solve?

as title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-30 11:09:18 +08:00
+								        return bool(insert_result)
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
-												Fix: fix task cancel (#12093)

### What problem does this PR solve?

Fix: fix task cancel

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-23 09:38:25 +08:00
+								    try:
-												Add dataset with table parser type for Infinity and answer question in chat using SQL (#12541)

### What problem does this PR solve?

1) Create  dataset using table parser for infinity
2) Answer questions in chat using SQL

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-01-19 19:35:14 +08:00
+								        if not await _maybe_insert_chunks(chunks):
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								            get_recording_context().record("insertion_result", "failed")
-												Fix: fix task cancel (#12093)

### What problem does this PR solve?

Fix: fix task cancel

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-23 09:38:25 +08:00
+								            return
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        get_recording_context().record("insertion_result", "success")
-												Fix: task cancel (#13034)

### What problem does this PR solve?

Fix: task cancel #11745 
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-02-06 14:48:24 +08:00
+								        if has_canceled(task_id):
 								            progress_callback(-1, msg="Task has been canceled.")
 								            return
-												handle_task catch all exception (#3441)

### What problem does this PR solve?

handle_task catch all exception
Report heartbeats

### Type of change

- [x] Refactoring
											
										
										
											2024-11-15 18:51:09 +08:00
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								        if raptor_cleanup_chunks:
 								            cleaned_chunks = 0
 								            for cleanup_doc_id, keep_method in raptor_cleanup_chunks:
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								                ret = await delete_raptor_chunks(
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								                    cleanup_doc_id,
 								                    task_tenant_id,
 								                    task_dataset_id,
 								                    keep_method=keep_method,
 								                )
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								                cleaned_chunks += ret
 								                get_recording_context().save_func_return_value("delete_raptor_chunks", ret)
-												feat(raptor): add Psi tree builder with original-space ranking and safe migration (#14679)

### What problem does this PR solve?

Closes #14674.

This PR improves RAPTOR configuration and tree construction while
preserving the existing RAPTOR behavior as the default.

RAPTOR currently builds summary layers with the original UMAP + GMM
clustering path. This PR keeps that default path, and adds:

- A hidden backend tree-builder option:
  - `tree_builder="raptor"`: default, existing RAPTOR behavior.
- `tree_builder="psi"`: rank-aware Psi-style tree builder using original
embedding-space cosine ranking.
- A user-facing clustering method option for the default RAPTOR builder:
  - `clustering_method="gmm"`: existing default.
- `clustering_method="ahc"`: agglomerative hierarchical clustering path.
- A RAPTOR UI setting for `Clustering method` and `Max cluster`.

### What changed

#### Backend

- Added `tree_builder` support for RAPTOR/Psi.
- Added `clustering_method` support for GMM/AHC.
- Kept existing RAPTOR + GMM as the default.
- Added Psi tree building from original-space cosine similarity.
- Added bucketed Psi building controls for large inputs:
  - `raptor.ext.psi_exact_max_leaves`
  - `raptor.ext.psi_bucket_size`
- Added method-aware RAPTOR summary metadata using existing
`extra.raptor_method`.
- Avoided adding a dedicated DB schema field for experimental method
tracking.
- Added cleanup/migration logic to avoid mixing stale RAPTOR summary
trees.
- Added defensive checks for Psi tree construction and summary failures.

#### Frontend/UI

- Added `Clustering method` in RAPTOR settings with `GMM` and `AHC`.
- Added/kept `Max cluster` in RAPTOR settings.
- Enlarged max cluster UI limit to `1024`, matching backend validation.
- Kept AHC editable even when a RAPTOR task has already finished.
- Fixed the UI save payload so `clustering_method` and `tree_builder`
are serialized through `parser_config.raptor.ext`, avoiding backend
validation errors for extra top-level RAPTOR fields.

Example saved RAPTOR config:

```json
{
  "raptor": {
    "max_cluster": 317,
    "ext": {
      "clustering_method": "ahc",
      "tree_builder": "raptor"
    }
  }
}

Co-authored-by: CaptainTimon <CaptainTimon@users.noreply.github.com>
											
										
										
											2026-05-11 15:42:31 -10:00
+								            if cleaned_chunks:
 								                progress_callback(msg=f"Cleaned up {cleaned_chunks} stale RAPTOR chunks.")
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        logging.info("Indexing doc({}), page({}-{}), chunks({}), elapsed: {:.2f}".format(task_document_name, task_from_page, task_to_page, len(chunks), timer() - start_ts))
-												Refactor parse progress (#3781)

### What problem does this PR solve?

Refactor parse file progress

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-12-01 22:28:00 +08:00
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        ret = DocumentService.increment_chunk_num(task_doc_id, task_dataset_id, token_count, chunk_count, 0)
 								        get_recording_context().save_func_return_value("DocumentService.increment_chunk_num", ret)
-												Feat: let toc run asynchronizly... (#10513)

### What problem does this PR solve?

#10436 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-10-14 14:14:52 +08:00
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        # Table parser: push metadata/both column values to document-level metadata for UI / chat filters
-												Feature/table parser column roles (#13710)

### What problem does this PR solve?

The table file parser (CSV/Excel) currently treats all columns
identically — every column is both vectorized (embedded in chunk text)
and stored as filterable metadata. There's no way for users to control
which columns should be searchable by semantic meaning versus which
should only be filterable attributes.

For example, when ingesting a news articles CSV with columns like title,
content, country, category, source, etc., the embedding includes
metadata fields like country: Brazil and source: Reuters in the chunk
text, which dilutes the semantic quality of the embedding without adding
retrieval value.

The RDBMS connector (MySQL/PostgreSQL) already supports content_columns
/ metadata_columns, but this capability was missing for file-based table
ingestion.

This PR adds column-level control (vectorize / metadata / both) for the
table file parser, following RAGFlow's existing patterns.

Backward compatible: Datasets without table_column_roles or with
table_column_mode: auto behave exactly as before (all columns = both).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-05-11 07:06:04 +05:00
+								        if task.get("parser_id", "").lower() == "table":
 								            eff_pc = merge_table_parser_config_from_kb(task)
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            logging.debug(f"[TABLE_META_DEBUG] table post-index: table_column_mode={eff_pc.get('table_column_mode')!r}")
 								            try:
 								                agg = aggregate_table_doc_metadata(chunks, task)
 								                logging.debug(f"[TABLE_META_DEBUG] aggregated metadata: {agg}")
 								                strip_keys = table_parser_strip_doc_metadata_keys(eff_pc)
 								                existing = DocMetadataService.get_document_metadata(task_doc_id)
 								                existing = existing if isinstance(existing, dict) else {}
 								                preserved = {k: v for k, v in existing.items() if k not in strip_keys}
 								                merged = update_metadata_to(dict(preserved), agg)
 								                logging.debug(
 								                    f"[TABLE_META_DEBUG] calling update_document_metadata for doc_id={task_doc_id}, "
 								                    f"meta_fields keys={list(merged.keys())}, "
 								                    f"table_strip_key_count={len(strip_keys)}, agg_keys={list(agg.keys())}"
 								                )
-												Feature/table parser column roles (#13710)

### What problem does this PR solve?

The table file parser (CSV/Excel) currently treats all columns
identically — every column is both vectorized (embedded in chunk text)
and stored as filterable metadata. There's no way for users to control
which columns should be searchable by semantic meaning versus which
should only be filterable attributes.

For example, when ingesting a news articles CSV with columns like title,
content, country, category, source, etc., the embedding includes
metadata fields like country: Brazil and source: Reuters in the chunk
text, which dilutes the semantic quality of the embedding without adding
retrieval value.

The RDBMS connector (MySQL/PostgreSQL) already supports content_columns
/ metadata_columns, but this capability was missing for file-based table
ingestion.

This PR adds column-level control (vectorize / metadata / both) for the
table file parser, following RAGFlow's existing patterns.

Backward compatible: Datasets without table_column_roles or with
table_column_mode: auto behave exactly as before (all columns = both).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-05-11 07:06:04 +05:00
+								                try:
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								                    ret = DocMetadataService.update_document_metadata(task_doc_id, merged)
 								                    get_recording_context().save_func_return_value("DocMetadataService.update_document_metadata", ret)
 								                    logging.debug("[TABLE_META_DEBUG] update_document_metadata succeeded")
 								                except Exception as ue:
 								                    logging.error(
 								                        "update_document_metadata failed (table parser, doc_id=%s): %s",
-												Feature/table parser column roles (#13710)

### What problem does this PR solve?

The table file parser (CSV/Excel) currently treats all columns
identically — every column is both vectorized (embedded in chunk text)
and stored as filterable metadata. There's no way for users to control
which columns should be searchable by semantic meaning versus which
should only be filterable attributes.

For example, when ingesting a news articles CSV with columns like title,
content, country, category, source, etc., the embedding includes
metadata fields like country: Brazil and source: Reuters in the chunk
text, which dilutes the semantic quality of the embedding without adding
retrieval value.

The RDBMS connector (MySQL/PostgreSQL) already supports content_columns
/ metadata_columns, but this capability was missing for file-based table
ingestion.

This PR adds column-level control (vectorize / metadata / both) for the
table file parser, following RAGFlow's existing patterns.

Backward compatible: Datasets without table_column_roles or with
table_column_mode: auto behave exactly as before (all columns = both).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-05-11 07:06:04 +05:00
+								                        task_doc_id,
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								                        ue,
 								                        exc_info=True,
-												Feature/table parser column roles (#13710)

### What problem does this PR solve?

The table file parser (CSV/Excel) currently treats all columns
identically — every column is both vectorized (embedded in chunk text)
and stored as filterable metadata. There's no way for users to control
which columns should be searchable by semantic meaning versus which
should only be filterable attributes.

For example, when ingesting a news articles CSV with columns like title,
content, country, category, source, etc., the embedding includes
metadata fields like country: Brazil and source: Reuters in the chunk
text, which dilutes the semantic quality of the embedding without adding
retrieval value.

The RDBMS connector (MySQL/PostgreSQL) already supports content_columns
/ metadata_columns, but this capability was missing for file-based table
ingestion.

This PR adds column-level control (vectorize / metadata / both) for the
table file parser, following RAGFlow's existing patterns.

Backward compatible: Datasets without table_column_roles or with
table_column_mode: auto behave exactly as before (all columns = both).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-05-11 07:06:04 +05:00
+								                    )
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            except Exception as e:
 								                logging.exception(
 								                    "Table parser document metadata aggregation failed (doc_id=%s): %s",
 								                    task_doc_id,
 								                    e,
 								                )
-												Feature/table parser column roles (#13710)

### What problem does this PR solve?

The table file parser (CSV/Excel) currently treats all columns
identically — every column is both vectorized (embedded in chunk text)
and stored as filterable metadata. There's no way for users to control
which columns should be searchable by semantic meaning versus which
should only be filterable attributes.

For example, when ingesting a news articles CSV with columns like title,
content, country, category, source, etc., the embedding includes
metadata fields like country: Brazil and source: Reuters in the chunk
text, which dilutes the semantic quality of the embedding without adding
retrieval value.

The RDBMS connector (MySQL/PostgreSQL) already supports content_columns
/ metadata_columns, but this capability was missing for file-based table
ingestion.

This PR adds column-level control (vectorize / metadata / both) for the
table file parser, following RAGFlow's existing patterns.

Backward compatible: Datasets without table_column_roles or with
table_column_mode: auto behave exactly as before (all columns = both).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-05-11 07:06:04 +05:00
-												Fix: fix task cancel (#12093)

### What problem does this PR solve?

Fix: fix task cancel

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-23 09:38:25 +08:00
+								        progress_callback(msg="Indexing done ({:.2f}s).".format(timer() - start_ts))
-												handle_task catch all exception (#3441)

### What problem does this PR solve?

handle_task catch all exception
Report heartbeats

### Type of change

- [x] Refactoring
											
										
										
											2024-11-15 18:51:09 +08:00
-												Fix: fix task cancel (#12093)

### What problem does this PR solve?

Fix: fix task cancel

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-23 09:38:25 +08:00
+								        if toc_thread:
-												Refactor: tidy up ThreadPoolExecutor lifecycle in file_service and task executor (#14668)

## Summary
- Wrap the `ThreadPoolExecutor` instances in `FileService.parse_docs`
and `FileService.get_files` with `with ... as exe:` blocks for
deterministic cleanup
- Replace the `concurrent.futures.ThreadPoolExecutor` in
`do_handle_task` with `asyncio.create_task(asyncio.to_thread(build_TOC,
...))`, preserving the existing parallelism with chunk insertion while
leveraging the surrounding async context
- Drop the now-unused `import concurrent` and the
`executor.shutdown(wait=False)` call in the `finally` block

Closes #14622.

No behavioral change, no public API change. Net diff: ~19 insertions /
25 deletions across two files.

## Test plan
- [ ] `uv run ruff check api/db/services/file_service.py
rag/svr/task_executor.py` passes
- [ ] Upload a multi-file batch through the chat/file endpoint and
confirm `FileService.parse_docs` still returns combined parsed text
- [ ] Trigger `FileService.get_files` via the chat reference flow with a
mix of image and non-image files; verify both `raw=True` and `raw=False`
paths return correctly
- [ ] Run a `naive`-parser document task with `toc_extraction: true` and
confirm the TOC chunk is generated and inserted exactly as before
- [ ] Run a `naive`-parser document task with `toc_extraction: false`
and confirm the path with `toc_thread = None` is unaffected
- [ ] Cancel a running task to exercise the `finally` block and confirm
cleanup still works without the executor shutdown call

---------

Co-authored-by: web-dev0521 <jasonpette1783@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
											
										
										
											2026-05-11 00:59:00 -04:00
+								            d = await toc_thread
-												Fix: fix task cancel (#12093)

### What problem does this PR solve?

Fix: fix task cancel

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-23 09:38:25 +08:00
+								            if d:
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								                get_recording_context().record("toc_chunk", [d])
-												Add dataset with table parser type for Infinity and answer question in chat using SQL (#12541)

### What problem does this PR solve?

1) Create  dataset using table parser for infinity
2) Answer questions in chat using SQL

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-01-19 19:35:14 +08:00
+								                if not await _maybe_insert_chunks([d]):
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								                    get_recording_context().record("toc_inserted", False)
-												Fix: fix task cancel (#12093)

### What problem does this PR solve?

Fix: fix task cancel

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-23 09:38:25 +08:00
+								                    return
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								                get_recording_context().record("toc_inserted", True)
 								                ret = DocumentService.increment_chunk_num(task_doc_id, task_dataset_id, 0, 1, 0)
 								                get_recording_context().save_func_return_value("DocumentService.increment_chunk_num", ret)
-												Fix: fix task cancel (#12093)

### What problem does this PR solve?

Fix: fix task cancel

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-23 09:38:25 +08:00
 								        if has_canceled(task_id):
 								            progress_callback(-1, msg="Task has been canceled.")
 								            return
 								        task_time_cost = timer() - task_start_ts
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        get_recording_context().record("task_status", "completed")
-												Fix: fix task cancel (#12093)

### What problem does this PR solve?

Fix: fix task cancel

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-23 09:38:25 +08:00
+								        progress_callback(prog=1.0, msg="Task done ({:.2f}s)".format(task_time_cost))
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        logging.info("Chunk doc({}), page({}-{}), chunks({}), token({}), elapsed:{:.2f}".format(task_document_name, task_from_page, task_to_page, len(chunks), token_count, task_time_cost))
-												Fix: fix task cancel (#12093)

### What problem does this PR solve?

Fix: fix task cancel

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-23 09:38:25 +08:00
 								    finally:
-												Refactor: tidy up ThreadPoolExecutor lifecycle in file_service and task executor (#14668)

## Summary
- Wrap the `ThreadPoolExecutor` instances in `FileService.parse_docs`
and `FileService.get_files` with `with ... as exe:` blocks for
deterministic cleanup
- Replace the `concurrent.futures.ThreadPoolExecutor` in
`do_handle_task` with `asyncio.create_task(asyncio.to_thread(build_TOC,
...))`, preserving the existing parallelism with chunk insertion while
leveraging the surrounding async context
- Drop the now-unused `import concurrent` and the
`executor.shutdown(wait=False)` call in the `finally` block

Closes #14622.

No behavioral change, no public API change. Net diff: ~19 insertions /
25 deletions across two files.

## Test plan
- [ ] `uv run ruff check api/db/services/file_service.py
rag/svr/task_executor.py` passes
- [ ] Upload a multi-file batch through the chat/file endpoint and
confirm `FileService.parse_docs` still returns combined parsed text
- [ ] Trigger `FileService.get_files` via the chat reference flow with a
mix of image and non-image files; verify both `raw=True` and `raw=False`
paths return correctly
- [ ] Run a `naive`-parser document task with `toc_extraction: true` and
confirm the TOC chunk is generated and inserted exactly as before
- [ ] Run a `naive`-parser document task with `toc_extraction: false`
and confirm the path with `toc_thread = None` is unaffected
- [ ] Cancel a running task to exercise the `finally` block and confirm
cleanup still works without the executor shutdown call

---------

Co-authored-by: web-dev0521 <jasonpette1783@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
											
										
										
											2026-05-11 00:59:00 -04:00
+								        if toc_thread is not None and not toc_thread.done():
 								            toc_thread.cancel()
-												Fix: fix task cancel (#12093)

### What problem does this PR solve?

Fix: fix task cancel

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-23 09:38:25 +08:00
+								        if has_canceled(task_id):
 								            try:
-												Refa: asyncio.to_thread to ThreadPoolExecutor to break thread limitat… (#12716)

### Type of change

- [x] Refactoring
											
										
										
											2026-01-20 13:29:37 +08:00
+								                exists = await thread_pool_exec(
-												Feat: support context window for docx (#12455)

### What problem does this PR solve?

Feat: support context window for docx

#12303

Done:
- [x] naive.py
- [x] one.py

TODO:
- [ ] book.py
- [ ] manual.py

Fix: incorrect image position
Fix: incorrect chunk type tag

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-01-07 15:08:17 +08:00
+								                    settings.docStoreConn.index_exist,
-												Fix: fix task cancel (#12093)

### What problem does this PR solve?

Fix: fix task cancel

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-23 09:38:25 +08:00
+								                    search.index_name(task_tenant_id),
 								                    task_dataset_id,
 								                )
 								                if exists:
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								                    ret = await thread_pool_exec(
-												Fix: fix task cancel (#12093)

### What problem does this PR solve?

Fix: fix task cancel

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-23 09:38:25 +08:00
+								                        settings.docStoreConn.delete,
 								                        {"doc_id": task_doc_id},
 								                        search.index_name(task_tenant_id),
 								                        task_dataset_id,
 								                    )
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								                    get_recording_context().save_func_return_value("docStoreConn.delete", ret)
-												Refactor code (#12305)

### What problem does this PR solve?

as title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-30 11:09:18 +08:00
+								            except Exception as e:
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								                logging.exception(f"Remove doc({task_doc_id}) from docStore failed when task({task_id}) canceled, exception: {e}")
-												handle_task catch all exception (#3441)

### What problem does this PR solve?

handle_task catch all exception
Report heartbeats

### Type of change

- [x] Refactoring
											
										
										
											2024-11-15 18:51:09 +08:00
-												Feat: add mechanism to check cancellation in Agent (#10766)

### What problem does this PR solve?

Add mechanism to check cancellation in Agent.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-11 17:36:48 +08:00
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								async def handle_task():
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								    global DONE_TASKS, FAILED_TASKS
 								    redis_msg, task = await collect()
 								    if not task:
-												Refa:replace trio with asyncio (#11831)

### What problem does this PR solve?

change:
replace trio with asyncio

### Type of change
- [x] Refactoring
											
										
										
											2025-12-09 19:23:14 +08:00
+								        await asyncio.sleep(5)
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								        return
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
 								    task_type = task["task_type"]
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								    pipeline_task_type = TASK_TYPE_TO_PIPELINE_TASK_TYPE.get(task_type, PipelineTaskType.PARSE) or PipelineTaskType.PARSE
-												Refactor code (#12305)

### What problem does this PR solve?

as title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-30 11:09:18 +08:00
+								    task_id = task["id"]
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								    try:
 								        CURRENT_TASKS[task["id"]] = copy.deepcopy(task)
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								        run_mode = os.environ.get("TE_RUN_MODE", "0")
 								        logging.info(f"TE_RUN_MODE is {run_mode}")
 								        # Check if dry-run comparison is enabled via environment variable
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        if run_mode == "1":  # dry run mode - compare
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								            set_recording_context(RecordingContext())
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            await do_handle_task(task)  # original execution
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								            # dry run mode
 								            logging.info(f"-----dry run task:{task_id}, {task.get('name', '')}, doc id:{task.get('doc_id', '')}")
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            await TaskManager.dry_run_task(task, get_recording_context(), chat_limiter, minio_limiter, chunk_limiter, embed_limiter, kg_limiter, set_progress, has_canceled)
 								        elif run_mode == "0":  # use refactor-ed version
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								            # switch to refactor-ed version
 								            logging.info(f"-----run refactor-ed task executor:{task_id}, {task.get('name', '')}, doc id:{task.get('doc_id', '')}")
-												refactor: overhaul task executor with layered architecture and comprehensive test suite (#15471)

## Summary

Decomposes the monolithic `task_executor.py` (1945 lines) into a 6-layer
architecture with clear separation of concerns. The refactored code is
functionally equivalent to the original, verified through 400 passing
tests and a production-vs-dry-run comparison framework.

## Architecture

```
entry (task_manager)
  └─ orchestration (task_handler)
       ├─ services (chunk_service, embedding_service, dataflow_service, raptor_service, post_processor)
       │    └─ utilities (chunk_builder, chunk_post_processor, embedding_utils)
       └─ infrastructure (task_context, recording_context, interceptor)
```

Key design decisions:
- **TaskContext** — typed facade over raw task dict, injects rate
limiters + callbacks via composition
- **RecordingContext + Comparator** — enables side-by-side production vs
dry-run execution for safe migration
- **NullRecordingContext** — zero-allocation no-op for production, uses
`__slots__`
- **WriteOperationInterceptor** — FIFO replay of previous runs function
returns for comparison mode

## Migration Strategy

The original `handle_task()` in `task_executor.py` uses a 3-way switch
via `TE_RUN_MODE`:
- `TE_RUN_MODE=0` (default) → runs refactored code
- `TE_RUN_MODE=1` → runs both original + refactored, compares all
intermediate results
- `TE_RUN_MODE=2` → runs original code (fallback)

The comparison mode (`TE_RUN_MODE=1`) records ~40 intermediate values
(chunks, vectors, token counts, func return values) from the production
run and replays them during dry-run, then uses `ContextComparator` to
report mismatches.

## Functional Equivalence Fixes

All divergences between original and refactored code were identified and
fixed:
- Timeout decorators (handle/build_chunks/raptor/embedding)
- NullRecordingContext leak in finally block causing RuntimeError
- MinIO None-binary check with proper FileNotFoundError
- Dataflow dispatch after embedding binding + init_kb
- Memory task missing return after processing
- RAPTOR checkpoint progress reporting
- Tag cache (get_tags_from_cache/set_tags_to_cache) restoration
- dataflow_id correction in _load_dsl
- Language default Chinese, dead code guard removal
- embed_chunks made async with proper thread_pool_exec
- Full GraphRAG default configuration (10 parameters)
- Hardcoded q_768_vec fallback removal in RAPTOR

## Test Changes

- 20 new tests covering table parser manual mode, tag cache, embedding
edge cases, RAPTOR checkpoint, dataflow_id correction, storage binary
None, cancel cleanup, metadata=None boundary
- Unified `make_task_context`/`make_task_dict` factories eliminated 10+
duplicated helpers
- DataflowService tests migrated from internal method mocks to IO
boundary mocks (real orchestration code executes)
- Parametrized duplicate build_chunks post-processor tests
- 7 raptor tests modernized to @pytest.mark.asyncio
- Mock count per test reduced through boundary-level mocking strategy

**Test count: 400 passing, 0 warnings, 0 skips**

## Files Changed

| File | Change |
|------|--------|
| `rag/svr/task_executor.py` | +1 line (NullRecordingContext fix) |
| `rag/svr/task_executor_refactor/task_handler.py` | Orchestration
layer, 8 logic fixes |
| `rag/svr/task_executor_refactor/chunk_service.py` | +timeout +
None-check |
| `rag/svr/task_executor_refactor/embedding_service.py` | sync→async
rewrite |
| `rag/svr/task_executor_refactor/dataflow_service.py` | dataflow_id fix
+ timeout |
| `rag/svr/task_executor_refactor/raptor_service.py` | checkpoint fix +
assert |
| `rag/svr/task_executor_refactor/chunk_post_processor.py` | tag cache
restore |
| `rag/svr/task_executor_refactor/task_context.py` | language default
fix |
| `test/.../conftest.py` | +294 lines shared helpers |
| `test/.../*.py` | 15 test files refactored, 20 new tests |

---------

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
											
										
										
											2026-06-03 17:18:31 +08:00
+								            set_recording_context(NullRecordingContext())
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            await TaskManager.run_refactored_task(task, chat_limiter, minio_limiter, chunk_limiter, embed_limiter, kg_limiter, set_progress, has_canceled)
 								        else:  # original version
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								            logging.info(f"-----run original task executor:{task_id}, {task.get('name', '')}, doc id:{task.get('doc_id', '')}")
 								            set_recording_context(NullRecordingContext())
 								            await do_handle_task(task)
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								        DONE_TASKS += 1
-												Refactor code (#12305)

### What problem does this PR solve?

as title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-30 11:09:18 +08:00
+								        CURRENT_TASKS.pop(task_id, None)
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								        logging.info(f"handle_task done for task {json.dumps(task)}")
-												Fix: task cancel (#13034)

### What problem does this PR solve?

Fix: task cancel #11745 
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-02-06 14:48:24 +08:00
+								    except TaskCanceledException as e:
 								        DONE_TASKS += 1
 								        CURRENT_TASKS.pop(task_id, None)
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        logging.info(f"handle_task canceled for task {task_id}: {getattr(e, 'msg', str(e))}")
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								    except Exception as e:
 								        FAILED_TASKS += 1
-												Refactor code (#12305)

### What problem does this PR solve?

as title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-30 11:09:18 +08:00
+								        CURRENT_TASKS.pop(task_id, None)
-												handle_task catch all exception (#3441)

### What problem does this PR solve?

handle_task catch all exception
Report heartbeats

### Type of change

- [x] Refactoring
											
										
										
											2024-11-15 18:51:09 +08:00
+								        try:
-												Refactor graphrag to remove redis lock (#5828)

### What problem does this PR solve?

Refactor graphrag to remove redis lock

### Type of change

- [x] Refactoring
											
										
										
											2025-03-10 15:15:06 +08:00
+								            err_msg = str(e)
 								            while isinstance(e, exceptiongroup.ExceptionGroup):
 								                e = e.exceptions[0]
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								                err_msg += " -- " + str(e)
-												Refactor code (#12305)

### What problem does this PR solve?

as title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-30 11:09:18 +08:00
+								            set_progress(task_id, prog=-1, msg=f"[Exception]: {err_msg}")
 								        except Exception as e:
 								            logging.exception(f"[Exception]: {str(e)}")
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								            pass
 								        logging.exception(f"handle_task got exception for task {json.dumps(task)}")
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								    finally:
 								        if not task.get("dataflow_id", ""):
-												[Bug fix #14133] fix graph rag, raptor, mindmap log cannot show correctly in UI (#14136)

### What problem does this PR solve?
Fix #14133, knowledge graph, raptor, mindmap log cannot show correctly
in UI
<img width="1930" height="982" alt="Image"
src="https://github.com/user-attachments/assets/d2f8e6c1-d82d-4b00-a377-949aada545ca"
/>
After Fix:
<img width="2108" height="805" alt="image"
src="https://github.com/user-attachments/assets/b37426c1-83d3-4a32-a83c-9d340d69e0e6"
/>
<img width="2173" height="1067" alt="image"
src="https://github.com/user-attachments/assets/30105222-3310-43a0-9f83-1e320d05e413"
/>

### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-04-16 13:08:36 +08:00
+								            referred_document_id = None
-												Feat: Add knowledge compilation workflows (#16515)

## Summary
- Add knowledge compilation template APIs, services, and builtin
template seed data
- Add advanced knowledge compile structure/artifact/RAPTOR workflow
support
- Update parsing, dataset/document APIs, and supporting services for
compilation workflows
											
										
										
											2026-07-02 23:22:07 +08:00
+								            if task_type in ["graphrag", "raptor", "mindmap", "artifact", "skill"]:
 								                # KB-level fan-out tasks store the participating doc list in
 								                # task["doc_ids"]; the first entry is used as a referent so
 								                # the pipeline operation log has something to anchor to.
 								                referred_document_id = (task.get("doc_ids") or [None])[0]
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								            ret = PipelineOperationLogService.record_pipeline_operation(
 								                document_id=task["doc_id"], pipeline_id="", task_type=pipeline_task_type, task_id=task_id, referred_document_id=referred_document_id
 								            )
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								            get_recording_context().save_func_return_value("PipelineOperationLogService.record_pipeline_operation", ret)
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								    redis_msg.ack()
-												Feat/monitor task (#11116)

### What problem does this PR solve?

Show task executor.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-10 12:51:39 +08:00
+								async def get_server_ip() -> str:
 								    # get ip by udp
 								    try:
 								        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
 								            s.connect(("8.8.8.8", 80))
 								            return s.getsockname()[0]
 								    except Exception as e:
 								        logging.error(str(e))
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        return "Unknown"
-												Feat/monitor task (#11116)

### What problem does this PR solve?

Show task executor.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-11-10 12:51:39 +08:00
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								async def report_status():
-												Improve task executor heartbeat handling and cleanup (#12390)

Improve task executor heartbeat handling and cleanup.

### What problem does this PR solve?

- **Reduce lock contention during executor cleanup**: The cleanup lock
is acquired only when removing expired executors, not during regular
heartbeat reporting, reducing potential lock contention.

- **Optimize own heartbeat cleanup**: Each executor removes its own
expired heartbeat using `zremrangebyscore` instead of `zcount` +
`zpopmin`, reducing Redis operations and improving efficiency.

- **Improve cleanup of other executors' heartbeats**: Expired executors
are detected by checking their latest heartbeat, and stale entries are
removed safely.

- **Other improvements**: IP address and PID are captured once at
startup, and unnecessary global declarations are removed.

### Type of change

- [x] Performance Improvement

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
											
										
										
											2026-01-04 11:24:05 +08:00
+								    """
 								    Periodically reports the executor's heartbeat
 								    """
 								    global PENDING_TASKS, LAG_TASKS, DONE_TASKS, FAILED_TASKS
 								    ip_address = await get_server_ip()
 								    pid = os.getpid()
 								    # Register the executor in Redis
-												Rework task executor heartbeat (#3430)

### What problem does this PR solve?

Rework task executor heartbeat, and print in console.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-15 14:43:55 +08:00
+								    REDIS_CONN.sadd("TASKEXE", CONSUMER_NAME)
-												feat: Recover pending tasks while pod restart. (#7073)

### What problem does this PR solve?

If you deploy Ragflow using Kubernetes, the hostname will change during
a rolling update. This causes the consumer name of the task executor to
change, making it impossible to schedule tasks that were previously in a
pending state.
To address this, I introduced a recovery task that scans these pending
messages and re-publishes them, allowing the tasks to continue being
processed.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

---------

Co-authored-by: liuzhenghua-jk <liuzhenghua-jk@360shuke.com>
											
										
										
											2025-04-19 16:18:51 +08:00
+								    redis_lock = RedisDistributedLock("clean_task_executor", lock_value=CONSUMER_NAME, timeout=60)
-												Improve task executor heartbeat handling and cleanup (#12390)

Improve task executor heartbeat handling and cleanup.

### What problem does this PR solve?

- **Reduce lock contention during executor cleanup**: The cleanup lock
is acquired only when removing expired executors, not during regular
heartbeat reporting, reducing potential lock contention.

- **Optimize own heartbeat cleanup**: Each executor removes its own
expired heartbeat using `zremrangebyscore` instead of `zcount` +
`zpopmin`, reducing Redis operations and improving efficiency.

- **Improve cleanup of other executors' heartbeats**: Expired executors
are detected by checking their latest heartbeat, and stale entries are
removed safely.

- **Other improvements**: IP address and PID are captured once at
startup, and unnecessary global declarations are removed.

### Type of change

- [x] Performance Improvement

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
											
										
										
											2026-01-04 11:24:05 +08:00
-												add taskexecutor status check (#2038)

### What problem does this PR solve?


### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-08-21 17:48:00 +08:00
+								    while True:
-												Improve task executor heartbeat handling and cleanup (#12390)

Improve task executor heartbeat handling and cleanup.

### What problem does this PR solve?

- **Reduce lock contention during executor cleanup**: The cleanup lock
is acquired only when removing expired executors, not during regular
heartbeat reporting, reducing potential lock contention.

- **Optimize own heartbeat cleanup**: Each executor removes its own
expired heartbeat using `zremrangebyscore` instead of `zcount` +
`zpopmin`, reducing Redis operations and improving efficiency.

- **Improve cleanup of other executors' heartbeats**: Expired executors
are detected by checking their latest heartbeat, and stale entries are
removed safely.

- **Other improvements**: IP address and PID are captured once at
startup, and unnecessary global declarations are removed.

### Type of change

- [x] Performance Improvement

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
											
										
										
											2026-01-04 11:24:05 +08:00
+								        now = datetime.now()
 								        now_ts = now.timestamp()
 								        group_info = REDIS_CONN.queue_info(settings.get_svr_queue_name(0), SVR_CONSUMER_GROUP_NAME) or {}
 								        PENDING_TASKS = int(group_info.get("pending", 0))
 								        LAG_TASKS = int(group_info.get("lag", 0))
 								        current = copy.deepcopy(CURRENT_TASKS)
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								        heartbeat = json.dumps(
 								            {
 								                "ip_address": ip_address,
 								                "pid": pid,
 								                "name": CONSUMER_NAME,
 								                "now": now.astimezone().isoformat(timespec="milliseconds"),
 								                "boot_at": BOOT_AT,
 								                "pending": PENDING_TASKS,
 								                "lag": LAG_TASKS,
 								                "done": DONE_TASKS,
 								                "failed": FAILED_TASKS,
 								                "current": current,
 								            }
 								        )
-												Improve task executor heartbeat handling and cleanup (#12390)

Improve task executor heartbeat handling and cleanup.

### What problem does this PR solve?

- **Reduce lock contention during executor cleanup**: The cleanup lock
is acquired only when removing expired executors, not during regular
heartbeat reporting, reducing potential lock contention.

- **Optimize own heartbeat cleanup**: Each executor removes its own
expired heartbeat using `zremrangebyscore` instead of `zcount` +
`zpopmin`, reducing Redis operations and improving efficiency.

- **Improve cleanup of other executors' heartbeats**: Expired executors
are detected by checking their latest heartbeat, and stale entries are
removed safely.

- **Other improvements**: IP address and PID are captured once at
startup, and unnecessary global declarations are removed.

### Type of change

- [x] Performance Improvement

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
											
										
										
											2026-01-04 11:24:05 +08:00
 								        # Report heartbeat to Redis
-												add taskexecutor status check (#2038)

### What problem does this PR solve?


### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-08-21 17:48:00 +08:00
+								        try:
-												Improve task executor heartbeat handling and cleanup (#12390)

Improve task executor heartbeat handling and cleanup.

### What problem does this PR solve?

- **Reduce lock contention during executor cleanup**: The cleanup lock
is acquired only when removing expired executors, not during regular
heartbeat reporting, reducing potential lock contention.

- **Optimize own heartbeat cleanup**: Each executor removes its own
expired heartbeat using `zremrangebyscore` instead of `zcount` +
`zpopmin`, reducing Redis operations and improving efficiency.

- **Improve cleanup of other executors' heartbeats**: Expired executors
are detected by checking their latest heartbeat, and stale entries are
removed safely.

- **Other improvements**: IP address and PID are captured once at
startup, and unnecessary global declarations are removed.

### Type of change

- [x] Performance Improvement

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
											
										
										
											2026-01-04 11:24:05 +08:00
+								            REDIS_CONN.zadd(CONSUMER_NAME, heartbeat, now_ts)
 								        except Exception as e:
 								            logging.warning(f"Failed to report heartbeat: {e}")
 								        else:
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								            logging.debug(f"{CONSUMER_NAME} reported heartbeat: {heartbeat}")
 								            pass
-												Rework task executor heartbeat (#3430)

### What problem does this PR solve?

Rework task executor heartbeat, and print in console.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-15 14:43:55 +08:00
-												Improve task executor heartbeat handling and cleanup (#12390)

Improve task executor heartbeat handling and cleanup.

### What problem does this PR solve?

- **Reduce lock contention during executor cleanup**: The cleanup lock
is acquired only when removing expired executors, not during regular
heartbeat reporting, reducing potential lock contention.

- **Optimize own heartbeat cleanup**: Each executor removes its own
expired heartbeat using `zremrangebyscore` instead of `zcount` +
`zpopmin`, reducing Redis operations and improving efficiency.

- **Improve cleanup of other executors' heartbeats**: Expired executors
are detected by checking their latest heartbeat, and stale entries are
removed safely.

- **Other improvements**: IP address and PID are captured once at
startup, and unnecessary global declarations are removed.

### Type of change

- [x] Performance Improvement

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
											
										
										
											2026-01-04 11:24:05 +08:00
+								        # Clean up own expired heartbeat
 								        try:
 								            REDIS_CONN.zremrangebyscore(CONSUMER_NAME, 0, now_ts - 60 * 30)
 								        except Exception as e:
 								            logging.warning(f"Failed to clean heartbeat: {e}")
-												feat: Recover pending tasks while pod restart. (#7073)

### What problem does this PR solve?

If you deploy Ragflow using Kubernetes, the hostname will change during
a rolling update. This causes the consumer name of the task executor to
change, making it impossible to schedule tasks that were previously in a
pending state.
To address this, I introduced a recovery task that scans these pending
messages and re-publishes them, allowing the tasks to continue being
processed.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

---------

Co-authored-by: liuzhenghua-jk <liuzhenghua-jk@360shuke.com>
											
										
										
											2025-04-19 16:18:51 +08:00
-												Improve task executor heartbeat handling and cleanup (#12390)

Improve task executor heartbeat handling and cleanup.

### What problem does this PR solve?

- **Reduce lock contention during executor cleanup**: The cleanup lock
is acquired only when removing expired executors, not during regular
heartbeat reporting, reducing potential lock contention.

- **Optimize own heartbeat cleanup**: Each executor removes its own
expired heartbeat using `zremrangebyscore` instead of `zcount` +
`zpopmin`, reducing Redis operations and improving efficiency.

- **Improve cleanup of other executors' heartbeats**: Expired executors
are detected by checking their latest heartbeat, and stale entries are
removed safely.

- **Other improvements**: IP address and PID are captured once at
startup, and unnecessary global declarations are removed.

### Type of change

- [x] Performance Improvement

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
											
										
										
											2026-01-04 11:24:05 +08:00
+								        # Clean other executors
 								        lock_acquired = False
 								        try:
 								            lock_acquired = redis_lock.acquire()
-												Refactor code (#12305)

### What problem does this PR solve?

as title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-30 11:09:18 +08:00
+								        except Exception as e:
-												Improve task executor heartbeat handling and cleanup (#12390)

Improve task executor heartbeat handling and cleanup.

### What problem does this PR solve?

- **Reduce lock contention during executor cleanup**: The cleanup lock
is acquired only when removing expired executors, not during regular
heartbeat reporting, reducing potential lock contention.

- **Optimize own heartbeat cleanup**: Each executor removes its own
expired heartbeat using `zremrangebyscore` instead of `zcount` +
`zpopmin`, reducing Redis operations and improving efficiency.

- **Improve cleanup of other executors' heartbeats**: Expired executors
are detected by checking their latest heartbeat, and stale entries are
removed safely.

- **Other improvements**: IP address and PID are captured once at
startup, and unnecessary global declarations are removed.

### Type of change

- [x] Performance Improvement

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
											
										
										
											2026-01-04 11:24:05 +08:00
+								            logging.warning(f"Failed to acquire Redis lock: {e}")
 								        if lock_acquired:
 								            try:
 								                task_executors = REDIS_CONN.smembers("TASKEXE") or set()
 								                for worker_name in task_executors:
 								                    if worker_name == CONSUMER_NAME:
 								                        continue
 								                    try:
 								                        last_heartbeat = REDIS_CONN.REDIS.zrevrange(worker_name, 0, 0, withscores=True)
 								                    except Exception as e:
 								                        logging.warning(f"Failed to read zset for {worker_name}: {e}")
 								                        continue
 								                    if not last_heartbeat or now_ts - last_heartbeat[0][1] > WORKER_HEARTBEAT_TIMEOUT:
 								                        logging.info(f"{worker_name} expired, removed")
 								                        REDIS_CONN.srem("TASKEXE", worker_name)
 								                        REDIS_CONN.delete(worker_name)
 								            except Exception as e:
 								                logging.warning(f"Failed to clean other executors: {e}")
 								            finally:
 								                redis_lock.release()
-												Refa:replace trio with asyncio (#11831)

### What problem does this PR solve?

change:
replace trio with asyncio

### Type of change
- [x] Refactoring
											
										
										
											2025-12-09 19:23:14 +08:00
+								        await asyncio.sleep(30)
-												Fix: fixed invalid save() arguments for slide thumbnails (#8851)

### What problem does this PR solve?

Fixed invalid save() arguments for slide thumbnails.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-07-15 17:19:45 +08:00
-												fix: Fix the problem that concurrent execution limit in task executor fails and causes OOM (issue#7580) (#7700)

### What problem does this PR solve?

## Cause of the bug:
During the execution process, due to improper use of trio
CapacityLimiter, the configuration parameter MAX_CONCURRENT_TASKS is
invalid, causing the executor to take out a large number of tasks from
the Redis queue at one time.

This behavior will cause the task executor to occupy too much memory and
be killed by the OS when a large number of tasks exist at the same time.
As a result, all executing tasks are suspended.

## Fix:
Added the task_manager method to the entry of /rag/svr/task_executor.py
to make CapacityLimiter effective. Deleted the invalid async with
statement.

## Fix result:
After testing, the task executor execution meets expectations, that is:
concurrent execution of up to $MAX_CONCURRENT_TASKS tasks.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2025-05-19 10:25:56 +08:00
+								async def task_manager():
-												fix: single task executor getting all tasks from Redis queue (#7330)

### What problem does this PR solve?

Currently, as long as there are tasks in Redis, this loop will keep
getting the tasks. This will lead to a single task executor with many
tasks in the pending state. Then we need to wait for the pending tasks
to get them back in the queue.

In first place, if we set the `MAX_CONCURRENT_TASKS` to X, then only X
tasks should be picked from the queue, and others should be left in the
queue for other `task_executors` or be picked after 1 of the spots in
the current executor gets free. This PR ensures this behavior.

The additional changes were due to the Ruff linting in pre-commit. But I
believe these are expected to keep the coding style.

### Type of change

- [X] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2025-06-06 03:32:35 -03:00
+								    try:
-												fix: Fix the problem that concurrent execution limit in task executor fails and causes OOM (issue#7580) (#7700)

### What problem does this PR solve?

## Cause of the bug:
During the execution process, due to improper use of trio
CapacityLimiter, the configuration parameter MAX_CONCURRENT_TASKS is
invalid, causing the executor to take out a large number of tasks from
the Redis queue at one time.

This behavior will cause the task executor to occupy too much memory and
be killed by the OS when a large number of tasks exist at the same time.
As a result, all executing tasks are suspended.

## Fix:
Added the task_manager method to the entry of /rag/svr/task_executor.py
to make CapacityLimiter effective. Deleted the invalid async with
statement.

## Fix result:
After testing, the task executor execution meets expectations, that is:
concurrent execution of up to $MAX_CONCURRENT_TASKS tasks.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2025-05-19 10:25:56 +08:00
+								        await handle_task()
-												fix: single task executor getting all tasks from Redis queue (#7330)

### What problem does this PR solve?

Currently, as long as there are tasks in Redis, this loop will keep
getting the tasks. This will lead to a single task executor with many
tasks in the pending state. Then we need to wait for the pending tasks
to get them back in the queue.

In first place, if we set the `MAX_CONCURRENT_TASKS` to X, then only X
tasks should be picked from the queue, and others should be left in the
queue for other `task_executors` or be picked after 1 of the spots in
the current executor gets free. This PR ensures this behavior.

The additional changes were due to the Ruff linting in pre-commit. But I
believe these are expected to keep the coding style.

### Type of change

- [X] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2025-06-06 03:32:35 -03:00
+								    finally:
 								        task_limiter.release()
-												feat: Recover pending tasks while pod restart. (#7073)

### What problem does this PR solve?

If you deploy Ragflow using Kubernetes, the hostname will change during
a rolling update. This causes the consumer name of the task executor to
change, making it impossible to schedule tasks that were previously in a
pending state.
To address this, I introduced a recovery task that scans these pending
messages and re-publishes them, allowing the tasks to continue being
processed.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

---------

Co-authored-by: liuzhenghua-jk <liuzhenghua-jk@360shuke.com>
											
										
										
											2025-04-19 16:18:51 +08:00
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								async def main():
-												task executor issues (#12006)

### What problem does this PR solve?

**Fixes #8706** - `InfinityException: TOO_MANY_CONNECTIONS` when running
multiple task executor workers

### Problem Description

When running RAGFlow with 8-16 task executor workers, most workers fail
to start properly. Checking logs revealed that workers were
stuck/hanging during Infinity connection initialization - only 1-2
workers would successfully register in Redis while the rest remained
blocked.

### Root Cause

The Infinity SDK `ConnectionPool` pre-allocates all connections in
`__init__`. With the default `max_size=32` and multiple workers (e.g.,
16), this creates 16×32=512 connections immediately on startup,
exceeding Infinity's default 128 connection limit. Workers hang while
waiting for connections that can never be established.

### Changes

1. **Prevent Infinity connection storm** (`rag/utils/infinity_conn.py`,
`rag/svr/task_executor.py`)
- Reduced ConnectionPool `max_size` from 32 to 4 (sufficient since
operations are synchronous)
- Added staggered startup delay (2s per worker) to spread connection
initialization

2. **Handle None children_delimiter** (`rag/app/naive.py`)
   - Use `or ""` to handle explicitly set None values from parser config

3. **MinerU parser robustness** (`deepdoc/parser/mineru_parser.py`)
   - Use `.get()` for optional output fields that may be missing
- Fix DISCARDED block handling: change `pass` to `continue` to skip
discarded blocks entirely

### Why `max_size=4` is sufficient

| Workers | Pool Size | Total Connections | Infinity Limit |
|---------|-----------|-------------------|----------------|
| 16      | 32        | 512               | 128 ❌         |
| 16      | 4         | 64                | 128 ✅         |
| 32      | 4         | 128               | 128 ✅         |

- All RAGFlow operations are synchronous: `get_conn()` → operation →
`release_conn()`
- No parallel `docStoreConn` operations in the codebase
- Maximum 1-2 concurrent connections needed per worker; 4 provides
safety margin

### MinerU DISCARDED block bug

When MinerU returns blocks with `type: "discarded"` (headers, footers,
watermarks, page numbers, artifacts), the previous code used `pass`
which left the `section` variable undefined, causing:

- **UnboundLocalError** if DISCARDED is the first block
- **Duplicate content** if DISCARDED follows another block (stale value
from previous iteration)

**Root cause confirmed via MinerU source code:**

From
[`mineru/utils/enum_class.py`](https://github.com/opendatalab/MinerU/blob/main/mineru/utils/enum_class.py#L14):
```python
class BlockType:
    DISCARDED = 'discarded'
    # VLM 2.5+ also has: HEADER, FOOTER, PAGE_NUMBER, ASIDE_TEXT, PAGE_FOOTNOTE
```

Per [MinerU
documentation](https://opendatalab.github.io/MinerU/reference/output_files/),
discarded blocks contain content that should be filtered out for clean
text extraction.

**Fix:** Changed `pass` to `continue` to skip discarded blocks entirely.

### Testing

- Verified all 16 workers now register successfully in Redis
- All workers heartbeating correctly
- Document parsing works as expected
- MinerU parsing with DISCARDED blocks no longer crashes

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: user210 <user210@rt>
											
										
										
											2025-12-18 04:03:30 +02:00
+								    # Stagger executor startup to prevent connection storm to Infinity
 								    # Extract worker number from CONSUMER_NAME (e.g., "task_executor_abc123_5" -> 5)
 								    try:
 								        worker_num = int(CONSUMER_NAME.rsplit("_", 1)[-1])
 								        # Add random delay: base delay + worker_num * 2.0s + random jitter
 								        # This spreads out connection attempts over several seconds
 								        startup_delay = worker_num * 2.0 + random.uniform(0, 0.5)
 								        if startup_delay > 0:
 								            logging.info(f"Staggering startup by {startup_delay:.2f}s to prevent connection storm")
 								            await asyncio.sleep(startup_delay)
 								    except (ValueError, IndexError):
 								        pass  # Non-standard consumer name, skip delay
-												Update progress info and start welcome info (#3768)

### What problem does this PR solve?

_Briefly describe what this PR aims to solve. Include background context
that will help reviewers understand the purpose of the PR._

### Type of change

- [x] Refactoring

---------

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-11-30 18:48:06 +08:00
+								    logging.info(r"""
-												Refa: improve flow of GraphRAG and RAPTOR (#10709)

### What problem does this PR solve?

Improve flow of GraphRAG and RAPTOR.

### Type of change

- [x] Refactoring
											
										
										
											2025-10-22 09:29:20 +08:00
+								    ____                      __  _
-												Update message printout when start ingestion server (#10677)

### What problem does this PR solve?

```
     ____                                  __     _                                                                  
    /  _/   ____    ____ _  ___    _____  / /_   (_)  ____    ____           _____  ___    _____ _   __  ___    _____
    / /    / __ \  / __ `/ / _ \  / ___/ / __/  / /  / __ \  / __ \         / ___/ / _ \  / ___/| | / / / _ \  / ___/
 _/ /    / / / / / /_/ / /  __/ (__  ) / /_   / /  / /_/ / / / / /        (__  ) /  __/ / /    | |/ / /  __/ / /    
/___/   /_/ /_/  \__, /  \___/ /____/  \__/  /_/   \____/ /_/ /_/        /____/  \___/ /_/     |___/  \___/ /_/     
                /____/          
```

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-10-21 09:38:20 +08:00
+								   /  _/___  ____ ____  _____/ /_(_)___  ____     ________  ______   _____  _____
 								   / // __ \/ __ `/ _ \/ ___/ __/ / __ \/ __ \   / ___/ _ \/ ___/ | / / _ \/ ___/
-												Refa: improve flow of GraphRAG and RAPTOR (#10709)

### What problem does this PR solve?

Improve flow of GraphRAG and RAPTOR.

### Type of change

- [x] Refactoring
											
										
										
											2025-10-22 09:29:20 +08:00
+								 _/ // / / / /_/ /  __(__  ) /_/ / /_/ / / / /  (__  )  __/ /   | |/ /  __/ /
 								/___/_/ /_/\__, /\___/____/\__/_/\____/_/ /_/  /____/\___/_/    |___/\___/_/
 								          /____/
-												Update progress info and start welcome info (#3768)

### What problem does this PR solve?

_Briefly describe what this PR aims to solve. Include background context
that will help reviewers understand the purpose of the PR._

### Type of change

- [x] Refactoring

---------

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-11-30 18:48:06 +08:00
+								    """)
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								    logging.info(f"RAGFlow ingestion version: {get_ragflow_version()}")
-												Don't release full image (#10654)

### What problem does this PR solve?

Introduced gpu profile in .env
Added Dockerfile_tei
fix datrie
Removed LIGHTEN flag

### Type of change

- [x] Documentation Update
- [x] Refactoring
											
										
										
											2025-10-23 23:02:27 +08:00
+								    show_configs()
-												Added current task into task executor's hearbeat (#3444)

### What problem does this PR solve?

Added current task into task executor's hearbeat

### Type of change

- [x] Refactoring
											
										
										
											2024-11-15 22:55:41 +08:00
+								    settings.init_settings()
-												Move api.settings to common.settings (#11036)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-06 09:36:38 +08:00
+								    settings.check_and_install_torch()
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								    logging.info(f"default embedding config: {settings.EMBEDDING_CFG}")
-												Move api.settings to common.settings (#11036)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-06 09:36:38 +08:00
+								    settings.print_rag_settings()
-												Fix:signal.SIGUSR1 and signal.SIGUSR2 can't use in window. so don't bind signal.SIGUSR1 and signal.SIGUSR2 in the windows env (#5941)

### What problem does this PR solve?
Fix:signal.SIGUSR1 and signal.SIGUSR2 can't use in window. so don't bind
signal.SIGUSR1 and signal.SIGUSR2 in the windows env

### Type of change

- [✓ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

Co-authored-by: tangyu <1@1.com>
											
										
										
											2025-03-12 09:43:18 +08:00
+								    if sys.platform != "win32":
 								        signal.signal(signal.SIGUSR1, start_tracemalloc_and_snapshot)
 								        signal.signal(signal.SIGUSR2, stop_tracemalloc)
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								    TRACE_MALLOC_ENABLED = int(os.environ.get("TRACE_MALLOC_ENABLED", "0"))
-												Optimize ocr (#5297)

### What problem does this PR solve?

Introduced OCR.recognize_batch

### Type of change

- [x] Performance Improvement
											
										
										
											2025-02-24 16:21:55 +08:00
+								    if TRACE_MALLOC_ENABLED:
 								        start_tracemalloc_and_snapshot(None, None)
-												feat: Recover pending tasks while pod restart. (#7073)

### What problem does this PR solve?

If you deploy Ragflow using Kubernetes, the hostname will change during
a rolling update. This causes the consumer name of the task executor to
change, making it impossible to schedule tasks that were previously in a
pending state.
To address this, I introduced a recovery task that scans these pending
messages and re-publishes them, allowing the tasks to continue being
processed.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

---------

Co-authored-by: liuzhenghua-jk <liuzhenghua-jk@360shuke.com>
											
										
										
											2025-04-19 16:18:51 +08:00
+								    signal.signal(signal.SIGINT, signal_handler)
 								    signal.signal(signal.SIGTERM, signal_handler)
-												Refa:replace trio with asyncio (#11831)

### What problem does this PR solve?

change:
replace trio with asyncio

### Type of change
- [x] Refactoring
											
										
										
											2025-12-09 19:23:14 +08:00
+								    report_task = asyncio.create_task(report_status())
 								    tasks = []
-												Add time cost when start servers (#12552)

### What problem does this PR solve?

- API server
- Ingestion server
- Data sync server
- Admin server

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2026-01-12 12:48:23 +08:00
 								    logging.info(f"RAGFlow ingestion is ready after {time.time() - start_ts}s initialization.")
-												Refa:replace trio with asyncio (#11831)

### What problem does this PR solve?

change:
replace trio with asyncio

### Type of change
- [x] Refactoring
											
										
										
											2025-12-09 19:23:14 +08:00
+								    try:
-												feat: Recover pending tasks while pod restart. (#7073)

### What problem does this PR solve?

If you deploy Ragflow using Kubernetes, the hostname will change during
a rolling update. This causes the consumer name of the task executor to
change, making it impossible to schedule tasks that were previously in a
pending state.
To address this, I introduced a recovery task that scans these pending
messages and re-publishes them, allowing the tasks to continue being
processed.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

---------

Co-authored-by: liuzhenghua-jk <liuzhenghua-jk@360shuke.com>
											
										
										
											2025-04-19 16:18:51 +08:00
+								        while not stop_event.is_set():
-												fix: single task executor getting all tasks from Redis queue (#7330)

### What problem does this PR solve?

Currently, as long as there are tasks in Redis, this loop will keep
getting the tasks. This will lead to a single task executor with many
tasks in the pending state. Then we need to wait for the pending tasks
to get them back in the queue.

In first place, if we set the `MAX_CONCURRENT_TASKS` to X, then only X
tasks should be picked from the queue, and others should be left in the
queue for other `task_executors` or be picked after 1 of the spots in
the current executor gets free. This PR ensures this behavior.

The additional changes were due to the Ruff linting in pre-commit. But I
believe these are expected to keep the coding style.

### Type of change

- [X] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2025-06-06 03:32:35 -03:00
+								            await task_limiter.acquire()
-												Refa:replace trio with asyncio (#11831)

### What problem does this PR solve?

change:
replace trio with asyncio

### Type of change
- [x] Refactoring
											
										
										
											2025-12-09 19:23:14 +08:00
+								            t = asyncio.create_task(task_manager())
 								            tasks.append(t)
 								    finally:
 								        for t in tasks:
 								            t.cancel()
 								        await asyncio.gather(*tasks, return_exceptions=True)
 								        report_task.cancel()
 								        await asyncio.gather(report_task, return_exceptions=True)
-												Made task_executor async to speedup parsing (#5530)

### What problem does this PR solve?

Made task_executor async to speedup parsing

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-03 18:59:49 +08:00
+								    logging.error("BUG!!! You should not reach here!!!")
-												Update progress info and start welcome info (#3768)

### What problem does this PR solve?

_Briefly describe what this PR aims to solve. Include background context
that will help reviewers understand the purpose of the PR._

### Type of change

- [x] Refactoring

---------

Signed-off-by: jinhai <haijin.chn@gmail.com>
											
										
										
											2024-11-30 18:48:06 +08:00
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
-												handle_task catch all exception (#3441)

### What problem does this PR solve?

handle_task catch all exception
Report heartbeats

### Type of change

- [x] Refactoring
											
										
										
											2024-11-15 18:51:09 +08:00
+								if __name__ == "__main__":
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								    # Parse command line arguments (consistent with SAAS version)
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
+								    parser = argparse.ArgumentParser(description="Task Executor")
 								    parser.add_argument("-i", "--index", type=str, default="0")
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								    parser.add_argument("-t", "--type", type=str, default="common", help="[common, graphrag, raptor, resume]")
 								    args = parser.parse_args()
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
-												Refactor: Task Executor (#15154)

### What problem does this PR solve?

1. Break huge function into smaller pieces
2. Add unit test for the smaller pieces function
3. Layer-ed design
a. infra layer - task_context.py, recording_context.py,
write_operation_interceptor.py, ...
    b. service layer - *_service.py
    c. business layer - task_handler.py
4. Default behavior: use "refactor-ed version" - can switch to original
version by change env variable

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
- [x] Performance Improvement

---------

Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
											
										
										
											2026-05-27 21:54:17 +08:00
+								    # Update global variables
 								    TASK_TYPE = args.type
 								    TE_IDX = args.index
 								    CONSUMER_NAME = f"task_executor_{TASK_TYPE}_{TE_IDX}"
-												fix: support auto mode in table parser document metadata aggregation (#15780)

### What problem does this PR solve?

Table parser metadata aggregation previously only ran when
`table_column_mode` was set to `manual`. In auto mode (default), all
columns default to `"both"` role, meaning they should also be aggregated
into document-level metadata for UI/chat filters. Additionally, the task
snapshot could be stale — `table_column_names` are written to KB
`parser_config` during `chunk()` but the task may have been created
before that.

Changes:
- Renames `aggregate_table_manual_doc_metadata` →
`aggregate_table_doc_metadata`
- Supports both `"manual"` and `"auto"` `table_column_mode` (defaults to
`"auto"`)
- Reloads `table_column_names` from KB DB when missing from task
snapshot
- Removes the manual-only guard in `task_executor` and refactored
`post_processor`
- Updates all tests with new function name and adds auto mode test cases

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-06-08 04:08:23 -07:00
-												Optimize graphrag cache get entity (#6018)

### What problem does this PR solve?

Optimize graphrag cache get entity

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-13 14:37:59 +08:00
+								    faulthandler.enable()
-												Fix typo in code (#8327)

### What problem does this PR solve?

Fix typo in code

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-06-18 09:41:09 +08:00
+								    init_root_logger(CONSUMER_NAME)
-												Disable flask and quart debug (#14042)

### What problem does this PR solve?

Visit
`http://127.0.0.1:9381/?__debugger__=yes&cmd=resource&f=debugger.js`
will expose the flask code:
```
docReady(() => {
  if (!EVALEX_TRUSTED) {
    initPinBox();
  }
  // if we are in console mode, show the console.
  if (CONSOLE_MODE && EVALEX) {
    createInteractiveConsole();
  }

  const frames = document.querySelectorAll("div.traceback div.frame");
  if (EVALEX) {
    addConsoleIconToFrames(frames);
  }
  addEventListenersToElements(document.querySelectorAll("div.detail"), "click", () =>
    document.querySelector("div.traceback").scrollIntoView(false)
  );
  addToggleFrameTraceback(frames);
  addToggleTraceTypesOnClick(document.querySelectorAll("h2.traceback"));
  addInfoPrompt(document.querySelectorAll("span.nojavascript"));
  wrapPlainTraceback();
});

function addToggleFrameTraceback(frames) {
  frames.forEach((frame) => {
    frame.addEventListener("click", () => {
      frame.getElementsByTagName("pre")[0].parentElement.classList.toggle("expanded");
    });
  })
}

```

### Type of change

- [x] Other (please describe): Fix security risk
											
										
										
											2026-04-10 18:01:49 +08:00
+								    try:
 								        asyncio.run(main())
 								    except Exception as e:
 								        logging.exception(f"Unhandled exception: {e}")
 								        sys.exit(1)