2024-01-15 08:46:22 +08:00
#
2024-01-19 19:51:57 +08:00
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
2024-01-15 08:46:22 +08:00
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
2026-05-27 21:54:17 +08:00
import argparse
2026-01-12 12:48:23 +08:00
import time
2026-01-20 13:29:37 +08:00
2026-05-27 21:54:17 +08:00
from rag . svr . task_executor_refactor . task_manager import TaskManager
2026-06-08 04:08:23 -07:00
from rag . svr . task_executor_refactor . recording_context import timed_with_recording , get_recording_context , RecordingContext , set_recording_context , NullRecordingContext
2026-05-27 21:54:17 +08:00
2026-05-12 17:00:45 +08:00
start_ts = time . time ( )
2026-01-20 13:29:37 +08:00
2026-05-12 17:00:45 +08:00
# LiteLLM fetches a model cost map from GitHub during import unless this is set.
# Parser pods should not block startup on external network access.
import os
2026-06-08 04:08:23 -07:00
2026-05-12 17:00:45 +08:00
os . environ . setdefault ( " LITELLM_LOCAL_MODEL_COST_MAP " , " True " ) # no internet, save about 10s
2026-01-20 13:29:37 +08:00
2026-05-12 17:00:45 +08:00
from common . misc_utils import thread_pool_exec
2026-01-12 12:48:23 +08:00
2025-12-09 19:23:14 +08:00
import asyncio
2025-11-10 12:51:39 +08:00
import socket
2026-06-08 04:08:23 -07:00
Removed beartype (#3528)
### What problem does this PR solve?
The beartype configuration of
main(64f50992e0fc4dce73e79f8b951a02e31cb2d638) is:
```
from beartype import BeartypeConf
from beartype.claw import beartype_all # <-- you didn't sign up for this
beartype_all(conf=BeartypeConf(violation_type=UserWarning)) # <-- emit warnings from all code
```
ragflow_server failed at a third-party package:
```
(ragflow-py3.10) zhichyu@iris:~/github.com/infiniflow/ragflow$ rm -rf logs/* && bash docker/launch_backend_service.sh
Starting task_executor.py for task 0 (Attempt 1)
Starting ragflow_server.py (Attempt 1)
Traceback (most recent call last):
File "/home/zhichyu/github.com/infiniflow/ragflow/api/ragflow_server.py", line 22, in <module>
from api.utils.log_utils import initRootLogger
File "/home/zhichyu/github.com/infiniflow/ragflow/api/utils/__init__.py", line 25, in <module>
import requests
File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/requests/__init__.py", line 43, in <module>
import urllib3
File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/urllib3/__init__.py", line 15, in <module>
from ._base_connection import _TYPE_BODY
File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/urllib3/_base_connection.py", line 5, in <module>
from .util.connection import _TYPE_SOCKET_OPTIONS
File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/urllib3/util/__init__.py", line 4, in <module>
from .connection import is_connection_dropped
File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/urllib3/util/connection.py", line 7, in <module>
from .timeout import _DEFAULT_TIMEOUT, _TYPE_TIMEOUT
File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/urllib3/util/timeout.py", line 20, in <module>
_DEFAULT_TIMEOUT: Final[_TYPE_DEFAULT] = _TYPE_DEFAULT.token
NameError: name 'Final' is not defined
Traceback (most recent call last):
File "/home/zhichyu/github.com/infiniflow/ragflow/rag/svr/task_executor.py", line 22, in <module>
from api.utils.log_utils import initRootLogger
File "/home/zhichyu/github.com/infiniflow/ragflow/api/utils/__init__.py", line 25, in <module>
import requests
File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/requests/__init__.py", line 43, in <module>
import urllib3
File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/urllib3/__init__.py", line 15, in <module>
from ._base_connection import _TYPE_BODY
File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/urllib3/_base_connection.py", line 5, in <module>
from .util.connection import _TYPE_SOCKET_OPTIONS
File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/urllib3/util/__init__.py", line 4, in <module>
from .connection import is_connection_dropped
File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/urllib3/util/connection.py", line 7, in <module>
from .timeout import _DEFAULT_TIMEOUT, _TYPE_TIMEOUT
File "/home/zhichyu/github.com/infiniflow/ragflow/.venv/lib/python3.10/site-packages/urllib3/util/timeout.py", line 20, in <module>
_DEFAULT_TIMEOUT: Final[_TYPE_DEFAULT] = _TYPE_DEFAULT.token
NameError: name 'Final' is not defined
```
This third-party package is outside our control, so beartype has to be
removed entirely.
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
2024-11-20 17:43:16 +08:00
# from beartype import BeartypeConf
# from beartype.claw import beartype_all # <-- you didn't sign up for this
# beartype_all(conf=BeartypeConf(violation_type=UserWarning)) # <-- emit warnings from all code
2025-01-09 17:07:21 +08:00
import random
2024-11-15 14:43:55 +08:00
import sys
2025-04-19 16:18:51 +08:00
import threading
2025-03-03 10:26:45 +08:00
2025-11-12 12:03:41 +08:00
from api . db import PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES
2025-10-09 12:36:19 +08:00
from api . db . services . knowledgebase_service import KnowledgebaseService
from api . db . services . pipeline_operation_log_service import PipelineOperationLogService
2025-12-30 11:41:38 +08:00
from api . db . joint_services . memory_message_service import handle_save_to_memory_task
2025-11-04 11:51:12 +08:00
from common . connection_utils import timeout
2026-01-22 15:34:08 +08:00
from common . metadata_utils import turn2jsonschema , update_metadata_to
2025-11-05 14:14:38 +08:00
from rag . utils . base64_image import image2id
2026-05-11 15:42:31 -10:00
from rag . utils . raptor_utils import (
collect_raptor_chunk_ids ,
collect_raptor_methods ,
get_raptor_clustering_method ,
get_raptor_tree_builder ,
get_skip_reason ,
make_raptor_summary_chunk_id ,
should_skip_raptor ,
)
2025-11-03 20:25:02 +08:00
from common . log_utils import init_root_logger
2025-11-03 17:25:06 +08:00
from common . config_utils import show_configs
2026-01-29 14:23:26 +08:00
from rag . graphrag . utils import get_llm_cache , set_llm_cache , get_tags_from_cache , set_tags_to_cache
2026-06-08 04:08:23 -07:00
from rag . prompts . generator import keyword_extraction , question_proposal , content_tagging , run_toc_from_text , gen_metadata
2024-12-10 09:36:59 +08:00
import logging
import os
2024-11-15 14:43:55 +08:00
from datetime import datetime
2024-01-15 08:46:22 +08:00
import json
2024-12-12 17:47:39 +08:00
import xxhash
2024-01-15 08:46:22 +08:00
import copy
import re
2024-01-31 19:57:45 +08:00
from functools import partial
2024-04-10 10:11:22 +08:00
from multiprocessing . context import TimeoutError
2024-04-22 14:11:09 +08:00
from timeit import default_timer as timer
2025-02-24 16:21:55 +08:00
import signal
2025-03-10 15:15:06 +08:00
import exceptiongroup
2025-03-13 14:37:59 +08:00
import faulthandler
2024-09-29 09:49:45 +08:00
import numpy as np
2024-12-12 16:38:03 +08:00
from peewee import DoesNotExist
2025-11-05 08:01:39 +08:00
from common . constants import LLMType , ParserType , PipelineTaskType
2024-01-17 20:20:42 +08:00
from api . db . services . document_service import DocumentService
2026-01-28 13:29:34 +08:00
from api . db . services . doc_metadata_service import DocMetadataService
2024-01-31 19:57:45 +08:00
from api . db . services . llm_service import LLMBundle
2025-10-09 12:36:19 +08:00
from api . db . services . task_service import TaskService , has_canceled , CANVAS_DEBUG_DOC_ID , GRAPH_RAPTOR_FAKE_DOC_ID
2024-09-29 09:49:45 +08:00
from api . db . services . file2document_service import File2DocumentService
2026-05-29 17:39:41 +08:00
from api . db . joint_services . tenant_model_service import get_tenant_default_model_by_type , get_model_config_from_provider_instance
2025-11-06 19:24:46 +08:00
from common . versions import get_ragflow_version
2024-09-29 09:49:45 +08:00
from api . db . db_models import close_connection
2026-06-08 04:08:23 -07:00
from rag . app import laws , paper , presentation , manual , qa , table , book , resume , picture , naive , one , audio , email , tag
2025-10-09 12:36:19 +08:00
from rag . nlp import search , rag_tokenizer , add_positions
2026-05-11 15:42:31 -10:00
from rag . raptor import (
RAPTOR_TREE_BUILDER ,
)
2025-11-03 08:50:05 +08:00
from common . token_utils import num_tokens_from_string , truncate
2025-04-19 16:18:51 +08:00
from rag . utils . redis_conn import REDIS_CONN , RedisDistributedLock
2026-01-29 14:23:26 +08:00
from rag . graphrag . utils import chat_limiter
2025-11-05 11:07:54 +08:00
from common . signal_utils import start_tracemalloc_and_snapshot , stop_tracemalloc
2025-11-06 16:12:20 +08:00
from common . exceptions import TaskCanceledException
2026-05-27 21:54:17 +08:00
from rag . svr . task_executor_limiter import (
task_limiter ,
chunk_limiter ,
embed_limiter ,
minio_limiter ,
kg_limiter ,
)
2025-11-06 09:36:38 +08:00
from common import settings
from common . constants import PAGERANK_FLD , TAG_FLD , SVR_CONSUMER_GROUP_NAME
Feature/table parser column roles (#13710)
### What problem does this PR solve?
The table file parser (CSV/Excel) currently treats all columns
identically — every column is both vectorized (embedded in chunk text)
and stored as filterable metadata. There's no way for users to control
which columns should be searchable by semantic meaning versus which
should only be filterable attributes.
For example, when ingesting a news articles CSV with columns like title,
content, country, category, source, etc., the embedding includes
metadata fields like country: Brazil and source: Reuters in the chunk
text, which dilutes the semantic quality of the embedding without adding
retrieval value.
The RDBMS connector (MySQL/PostgreSQL) already supports content_columns
/ metadata_columns, but this capability was missing for file-based table
ingestion.
This PR adds column-level control (vectorize / metadata / both) for the
table file parser, following RAGFlow's existing patterns.
Backward compatible: Datasets without table_column_roles or with
table_column_mode: auto behave exactly as before (all columns = both).
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
2026-05-11 07:06:04 +05:00
from rag . utils . table_es_metadata import (
2026-06-08 04:08:23 -07:00
aggregate_table_doc_metadata ,
Feature/table parser column roles (#13710)
### What problem does this PR solve?
The table file parser (CSV/Excel) currently treats all columns
identically — every column is both vectorized (embedded in chunk text)
and stored as filterable metadata. There's no way for users to control
which columns should be searchable by semantic meaning versus which
should only be filterable attributes.
For example, when ingesting a news articles CSV with columns like title,
content, country, category, source, etc., the embedding includes
metadata fields like country: Brazil and source: Reuters in the chunk
text, which dilutes the semantic quality of the embedding without adding
retrieval value.
The RDBMS connector (MySQL/PostgreSQL) already supports content_columns
/ metadata_columns, but this capability was missing for file-based table
ingestion.
This PR adds column-level control (vectorize / metadata / both) for the
table file parser, following RAGFlow's existing patterns.
Backward compatible: Datasets without table_column_roles or with
table_column_mode: auto behave exactly as before (all columns = both).
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
2026-05-11 07:06:04 +05:00
merge_table_parser_config_from_kb ,
table_parser_strip_doc_metadata_keys ,
)
2026-05-27 21:54:17 +08:00
from rag . nlp import search as nlp_search
2024-01-15 08:46:22 +08:00
# Module-level batch size. NOTE(review): its consumers are outside this part of
# the file — confirm what is being batched before changing the value.
BATCH_SIZE = 64
Feature/table parser column roles (#13710)
### What problem does this PR solve?
The table file parser (CSV/Excel) currently treats all columns
identically — every column is both vectorized (embedded in chunk text)
and stored as filterable metadata. There's no way for users to control
which columns should be searchable by semantic meaning versus which
should only be filterable attributes.
For example, when ingesting a news articles CSV with columns like title,
content, country, category, source, etc., the embedding includes
metadata fields like country: Brazil and source: Reuters in the chunk
text, which dilutes the semantic quality of the embedding without adding
retrieval value.
The RDBMS connector (MySQL/PostgreSQL) already supports content_columns
/ metadata_columns, but this capability was missing for file-based table
ingestion.
This PR adds column-level control (vectorize / metadata / both) for the
table file parser, following RAGFlow's existing patterns.
Backward compatible: Datasets without table_column_roles or with
table_column_mode: auto behave exactly as before (all columns = both).
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
2026-05-11 07:06:04 +05:00
2024-01-31 19:57:45 +08:00
# Dispatch table from a lower-cased parser id to the `rag.app` module that
# handles it. `build_chunks` looks the module up with
# `FACTORY[task["parser_id"].lower()]` and then calls its `chunk` function.
# "general" and the KG parser type both reuse the `naive` chunker.
FACTORY = {
    "general": naive,
    ParserType.NAIVE.value: naive,
    ParserType.PAPER.value: paper,
    ParserType.BOOK.value: book,
    ParserType.PRESENTATION.value: presentation,
    ParserType.MANUAL.value: manual,
    ParserType.LAWS.value: laws,
    ParserType.QA.value: qa,
    ParserType.TABLE.value: table,
    ParserType.RESUME.value: resume,
    ParserType.PICTURE.value: picture,
    ParserType.ONE.value: one,
    ParserType.AUDIO.value: audio,
    ParserType.EMAIL.value: email,
    ParserType.KG.value: naive,
    ParserType.TAG.value: tag,
}
2025-10-09 12:36:19 +08:00
# Maps the `task_type` string carried by a queue message to the corresponding
# PipelineTaskType member.
TASK_TYPE_TO_PIPELINE_TASK_TYPE = {
    "dataflow": PipelineTaskType.PARSE,
    "raptor": PipelineTaskType.RAPTOR,
    "graphrag": PipelineTaskType.GRAPH_RAG,
    "mindmap": PipelineTaskType.MINDMAP,
    "memory": PipelineTaskType.MEMORY,
}
2025-03-03 18:59:49 +08:00
# Iterator over delivered-but-unacknowledged queue messages; created lazily by
# collect() through REDIS_CONN.get_unacked_iterator().
UNACKED_ITERATOR = None

# Task type and executor index (consistent with SAAS version)
TASK_TYPE = "common"
TE_IDX = "0"

# Timezone-aware ISO-8601 timestamp (millisecond precision) taken at import time.
BOOT_AT = datetime.now().astimezone().isoformat(timespec="milliseconds")

# Process-wide task counters. collect() increments FAILED_TASKS when a queued
# task is unknown or has been cancelled.
PENDING_TASKS = 0
LAG_TASKS = 0
DONE_TASKS = 0
FAILED_TASKS = 0
CURRENT_TASKS = {}

# Overridable through the environment; defaults to 120.
# NOTE(review): the unit is not visible here — presumably seconds, confirm.
WORKER_HEARTBEAT_TIMEOUT = int(os.environ.get("WORKER_HEARTBEAT_TIMEOUT", "120"))

# Set by signal_handler() to request shutdown.
stop_event = threading.Event()
2026-06-08 04:08:23 -07:00
2025-04-19 16:18:51 +08:00
def signal_handler(sig, frame):
    """Request shutdown and terminate the process with exit status 0.

    The signature matches what Python's `signal` module passes to a handler.

    Args:
        sig: Signal number (unused).
        frame: Interrupted stack frame (unused).

    Raises:
        SystemExit: Always, via ``sys.exit(0)``.
    """
    logging.info("Received interrupt signal, shutting down...")
    stop_event.set()
    # Short pause before exiting; presumably gives code waiting on stop_event
    # a chance to notice it — TODO confirm.
    time.sleep(1)
    sys.exit(0)
2025-02-24 16:21:55 +08:00
2025-03-17 11:58:40 +08:00
2024-09-29 09:49:45 +08:00
def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing..."):
    """Store a task's progress and message, and surface cancellation to the caller.

    The message is decorated before it is stored: an "[ERROR]" prefix when
    ``prog`` is negative, a " [Canceled]" suffix when the task is cancelled,
    a page-range prefix when a valid range is given, and an HH:MM:SS timestamp.

    Args:
        task_id: Id of the task to update.
        from_page: Start of the page range, used only for the message prefix.
        to_page: End of the page range; the prefix is added only when
            ``to_page > 0`` and ``from_page < to_page``.
        prog: New progress value, or None to leave the stored progress alone.
            A negative value marks the message as an error.
        msg: Human-readable progress message.

    Raises:
        TaskCanceledException: When ``has_canceled(task_id)`` is true. The
            cancelled state (progress -1) is written before raising.

    All other exceptions are logged and swallowed so that progress reporting
    never aborts the task itself.
    """
    try:
        if prog is not None and prog < 0:
            # `msg or ""`: a None message must not raise here, because the broad
            # handler below would swallow the TypeError and hide the error state.
            msg = "[ERROR]" + (msg or "")
        cancel = has_canceled(task_id)
        if cancel:
            msg = (msg or "") + " [Canceled]"
            prog = -1

        if to_page > 0 and msg and from_page < to_page:
            msg = f"Page({from_page + 1}~{to_page + 1}): " + msg
        if msg:
            msg = datetime.now().strftime("%H:%M:%S") + " " + msg

        d = {"progress_msg": msg}
        if prog is not None:
            d["progress"] = prog

        try:
            TaskService.update_progress(task_id, d)
        finally:
            # Release the DB connection even when the update itself fails.
            close_connection()

        if cancel:
            raise TaskCanceledException(msg)
        logging.info(f"set_progress({task_id}), progress: {prog}, progress_msg: {msg}")
    except TaskCanceledException:
        # Cancellation must reach the caller; everything else is best-effort.
        raise
    except DoesNotExist:
        logging.warning(f"set_progress({task_id}) got exception DoesNotExist")
    except Exception as e:
        logging.exception(f"set_progress({task_id}), progress: {prog}, progress_msg: {msg}, got exception: {e}")
2024-01-15 08:46:22 +08:00
2025-07-21 15:56:45 +08:00
2025-03-03 18:59:49 +08:00
async def collect():
    """Pull the next task message from Redis and resolve it to a task dict.

    Messages already delivered to this consumer but not yet acknowledged are
    drained first (through UNACKED_ITERATOR). Once that iterator is exhausted,
    each queue for TASK_TYPE is polled in order until one yields a message.

    Returns:
        ``(redis_msg, task)`` when a runnable task was found, otherwise
        ``(None, None)``. The latter covers: nothing queued, a Redis error,
        an empty message, an unknown task, or a cancelled task. In the last
        three cases the message is acknowledged so it is not redelivered.
    """
    global CONSUMER_NAME, DONE_TASKS, FAILED_TASKS
    global UNACKED_ITERATOR

    svr_queue_names = settings.get_svr_queue_names(TASK_TYPE)
    redis_msg = None
    try:
        if not UNACKED_ITERATOR:
            UNACKED_ITERATOR = REDIS_CONN.get_unacked_iterator(svr_queue_names, SVR_CONSUMER_GROUP_NAME, CONSUMER_NAME)
        try:
            redis_msg = next(UNACKED_ITERATOR)
        except StopIteration:
            # No pending (unacked) messages left: poll the queues for new work.
            for svr_queue_name in svr_queue_names:
                redis_msg = REDIS_CONN.queue_consumer(svr_queue_name, SVR_CONSUMER_GROUP_NAME, CONSUMER_NAME)
                if redis_msg:
                    break
    except Exception as e:
        logging.exception(f"collect got exception: {e}")
        return None, None

    if not redis_msg:
        return None, None
    msg = redis_msg.get_message()
    if not msg:
        logging.error(f"collect got empty message of {redis_msg.get_msg_id()}")
        redis_msg.ack()
        return None, None

    canceled = False
    if msg.get("doc_id", "") in [GRAPH_RAPTOR_FAKE_DOC_ID, CANVAS_DEBUG_DOC_ID]:
        # For these placeholder doc ids the message itself serves as the task,
        # except for the progress-freeze task types, which are loaded from the DB.
        task = msg
        if task["task_type"] in PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES:
            task = TaskService.get_task(msg["id"], msg["doc_ids"])
            if task:
                task["doc_id"] = msg["doc_id"]
                task["doc_ids"] = msg.get("doc_ids", []) or []
    elif msg.get("task_type") == PipelineTaskType.MEMORY.lower():
        _, task_obj = TaskService.get_by_id(msg["id"])
        # A missing row must take the "is unknown" path below (which acks the
        # message) rather than fail on `None.to_dict()` and leave it unacked.
        task = task_obj.to_dict() if task_obj is not None else None
    else:
        task = TaskService.get_task(msg["id"])

    if task:
        canceled = has_canceled(task["id"])
    if not task or canceled:
        state = "is unknown" if not task else "has been cancelled"
        FAILED_TASKS += 1
        logging.warning(f"collect task {msg['id']} {state}")
        redis_msg.ack()
        return None, None

    # Copy the routing fields that exist only on the queue message onto the task.
    task_type = msg.get("task_type", "")
    task["task_type"] = task_type
    if task_type.startswith("dataflow"):
        task["tenant_id"] = msg["tenant_id"]
        task["dataflow_id"] = msg["dataflow_id"]
        task["kb_id"] = msg.get("kb_id", "")
    if task_type.startswith("memory"):
        task["memory_id"] = msg["memory_id"]
        if msg.get("tenant_id"):
            task["tenant_id"] = msg["tenant_id"]
        task["source_id"] = msg["source_id"]
        task["message_dict"] = msg["message_dict"]
    return redis_msg, task
2024-01-15 08:46:22 +08:00
2024-04-08 19:20:57 +08:00
2025-03-03 18:59:49 +08:00
async def get_storage_binary(bucket, name):
    """Return whatever ``settings.STORAGE_IMPL.get(bucket, name)`` yields.

    The storage getter is not called directly; it is handed to
    ``thread_pool_exec`` and the result is awaited.

    Args:
        bucket: Storage bucket passed through to the storage backend.
        name: Object name passed through to the storage backend.
    """
    storage_get = settings.STORAGE_IMPL.get
    content = await thread_pool_exec(storage_get, bucket, name)
    return content
2024-04-08 19:20:57 +08:00
2024-01-15 08:46:22 +08:00
2026-05-27 21:54:17 +08:00
@timed_with_recording
2025-12-29 12:01:18 +08:00
@timeout ( 60 * 80 , 1 )
2025-03-03 18:59:49 +08:00
async def build_chunks ( task , progress_callback ) :
2025-11-06 09:36:38 +08:00
if task [ " size " ] > settings . DOC_MAXIMUM_SIZE :
2026-06-08 04:08:23 -07:00
set_progress ( task [ " id " ] , prog = - 1 , msg = " File size exceeds( <= %d Mb ) " % ( int ( settings . DOC_MAXIMUM_SIZE / 1024 / 1024 ) ) )
2026-05-27 21:54:17 +08:00
get_recording_context ( ) . record ( " file_size_exceeded " , True )
2024-01-15 08:46:22 +08:00
return [ ]
2026-05-27 21:54:17 +08:00
get_recording_context ( ) . record ( " file_size_exceeded " , False )
get_recording_context ( ) . record ( " parser_id " , task [ " parser_id " ] )
2024-01-15 19:47:25 +08:00
2024-12-01 22:28:00 +08:00
chunker = FACTORY [ task [ " parser_id " ] . lower ( ) ]
2024-01-15 08:46:22 +08:00
try :
2024-04-08 19:20:57 +08:00
st = timer ( )
2024-12-01 22:28:00 +08:00
bucket , name = File2DocumentService . get_storage_address ( doc_id = task [ " doc_id " ] )
2025-03-03 18:59:49 +08:00
binary = await get_storage_binary ( bucket , name )
2026-05-26 12:28:53 +08:00
if binary is None :
raise FileNotFoundError ( f " File not found: storage returned no content for { bucket } / { name } . " )
2024-12-01 22:28:00 +08:00
logging . info ( " From minio( {} ) {} / {} " . format ( timer ( ) - st , task [ " location " ] , task [ " name " ] ) )
2024-09-29 09:49:45 +08:00
except TimeoutError :
2024-12-01 22:28:00 +08:00
progress_callback ( - 1 , " Internal server error: Fetch file from minio timeout. Could you try it again. " )
2026-06-08 04:08:23 -07:00
logging . exception ( " Minio {} / {} got timeout: Fetch file from minio timeout. " . format ( task [ " location " ] , task [ " name " ] ) )
2024-11-15 18:51:09 +08:00
raise
2024-01-15 08:46:22 +08:00
except Exception as e :
if re . search ( " (No such file|not found) " , str ( e ) ) :
2024-12-01 22:28:00 +08:00
progress_callback ( - 1 , " Can not find file < %s > from minio. Could you try it again? " % task [ " name " ] )
2024-01-15 08:46:22 +08:00
else :
2024-12-01 22:28:00 +08:00
progress_callback ( - 1 , " Get file from minio: %s " % str ( e ) . replace ( " ' " , " " ) )
logging . exception ( " Chunking {} / {} got exception " . format ( task [ " location " ] , task [ " name " ] ) )
2024-11-15 18:51:09 +08:00
raise
2024-01-15 19:47:25 +08:00
Feature/table parser column roles (#13710)
### What problem does this PR solve?
The table file parser (CSV/Excel) currently treats all columns
identically — every column is both vectorized (embedded in chunk text)
and stored as filterable metadata. There's no way for users to control
which columns should be searchable by semantic meaning versus which
should only be filterable attributes.
For example, when ingesting a news articles CSV with columns like title,
content, country, category, source, etc., the embedding includes
metadata fields like country: Brazil and source: Reuters in the chunk
text, which dilutes the semantic quality of the embedding without adding
retrieval value.
The RDBMS connector (MySQL/PostgreSQL) already supports content_columns
/ metadata_columns, but this capability was missing for file-based table
ingestion.
This PR adds column-level control (vectorize / metadata / both) for the
table file parser, following RAGFlow's existing patterns.
Backward compatible: Datasets without table_column_roles or with
table_column_mode: auto behave exactly as before (all columns = both).
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
2026-05-11 07:06:04 +05:00
# Table parser column roles / mode are stored on the dataset (KB) parser_config;
# chunk tasks carry document-level parser_config only — merge KB keys so manual roles apply.
parser_config_for_chunk = merge_table_parser_config_from_kb ( task )
if task . get ( " parser_id " , " " ) . lower ( ) == " table " and task . get ( " kb_parser_config " ) :
logging . debug (
" [TASK_EXECUTOR_DEBUG] table parser: merged KB keys into parser_config for chunk; "
f " mode= { parser_config_for_chunk . get ( ' table_column_mode ' ) } , "
f " roles_keys= { list ( ( parser_config_for_chunk . get ( ' table_column_roles ' ) or { } ) . keys ( ) ) } "
)
2026-05-27 21:54:17 +08:00
# Record chunk configuration for comparison
from common . float_utils import normalize_overlapped_percent
2026-06-08 04:08:23 -07:00
2026-05-27 21:54:17 +08:00
chunk_config = {
" parser_id " : task [ " parser_id " ] ,
" chunk_token_num " : parser_config_for_chunk . get ( " chunk_token_num " , 128 ) ,
2026-06-08 04:08:23 -07:00
" overlapped_percent " : normalize_overlapped_percent ( parser_config_for_chunk . get ( " overlapped_percent " , 0 ) ) ,
2026-05-27 21:54:17 +08:00
" delimiter " : parser_config_for_chunk . get ( " delimiter " , " \n !?。;!? " ) ,
" from_page " : task [ " from_page " ] ,
" to_page " : task [ " to_page " ] ,
" language " : task [ " language " ] ,
" layout_recognizer " : parser_config_for_chunk . get ( " layout_recognizer " ) ,
}
get_recording_context ( ) . record ( " chunk_config " , chunk_config )
get_recording_context ( ) . record ( " parser_config_after_merge " , parser_config_for_chunk )
2024-08-14 11:09:07 +08:00
try :
2025-03-03 18:59:49 +08:00
async with chunk_limiter :
2026-05-22 11:46:38 +08:00
task_language = task . get ( " language " ) or " Chinese "
2026-01-20 13:29:37 +08:00
cks = await thread_pool_exec (
2025-12-09 19:23:14 +08:00
chunker . chunk ,
task [ " name " ] ,
binary = binary ,
from_page = task [ " from_page " ] ,
to_page = task [ " to_page " ] ,
2026-05-22 11:46:38 +08:00
lang = task_language ,
2025-12-09 19:23:14 +08:00
callback = progress_callback ,
kb_id = task [ " kb_id " ] ,
Feature/table parser column roles (#13710)
### What problem does this PR solve?
The table file parser (CSV/Excel) currently treats all columns
identically — every column is both vectorized (embedded in chunk text)
and stored as filterable metadata. There's no way for users to control
which columns should be searchable by semantic meaning versus which
should only be filterable attributes.
For example, when ingesting a news articles CSV with columns like title,
content, country, category, source, etc., the embedding includes
metadata fields like country: Brazil and source: Reuters in the chunk
text, which dilutes the semantic quality of the embedding without adding
retrieval value.
The RDBMS connector (MySQL/PostgreSQL) already supports content_columns
/ metadata_columns, but this capability was missing for file-based table
ingestion.
This PR adds column-level control (vectorize / metadata / both) for the
table file parser, following RAGFlow's existing patterns.
Backward compatible: Datasets without table_column_roles or with
table_column_mode: auto behave exactly as before (all columns = both).
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
2026-05-11 07:06:04 +05:00
parser_config = parser_config_for_chunk ,
2025-12-09 19:23:14 +08:00
tenant_id = task [ " tenant_id " ] ,
)
2024-12-01 22:28:00 +08:00
logging . info ( " Chunking( {} ) {} / {} done " . format ( timer ( ) - st , task [ " location " ] , task [ " name " ] ) )
2024-12-12 16:38:03 +08:00
except TaskCanceledException :
raise
2024-08-14 11:09:07 +08:00
except Exception as e :
2024-12-01 22:28:00 +08:00
progress_callback ( - 1 , " Internal server error while chunking: %s " % str ( e ) . replace ( " ' " , " " ) )
logging . exception ( " Chunking {} / {} got exception " . format ( task [ " location " ] , task [ " name " ] ) )
2024-11-15 18:51:09 +08:00
raise
2024-01-15 08:46:22 +08:00
2026-05-27 21:54:17 +08:00
# Record raw chunks for comparison
get_recording_context ( ) . record ( " raw_chunks " , cks )
feat: persist PDF bookmark outline as document metadata (#13287)
## Summary
PDF files often contain a bookmark/outline tree (table of contents built
into the file by the authoring tool). RAGFlow's `pdf_parser.outlines`
already extracts these `(title, depth)` tuples via pypdf, but they are
used ephemerally during chunking (`manual` parser uses them for
hierarchy detection) and then discarded.
This PR persists the outline as `doc.meta_fields["outline"]` — a JSON
array of `{"title": str, "depth": int}` objects — so downstream features
can use the structural information.
### Why this matters
- **Complementary to `toc_extraction`** — the existing `toc_extraction`
feature uses LLM calls to generate a TOC and only works for the `naive`
parser. The raw PDF outline is free (already extracted by pypdf), works
for all parsers, and captures the author's original document structure.
- **Document navigation** — frontends can render a clickable TOC from
the outline
- **Entity extraction** — the outline provides a structural map for
identifying document sections and key topics
- **Search result context** — knowing which section a chunk belongs to
helps users evaluate relevance
### Changes
| File | Change | LOC |
|------|--------|-----|
| `rag/app/naive.py` | Attach `pdf_parser.outlines` as `__outline__` on
first chunk dict | ~7 |
| `rag/app/manual.py` | Same for the manual parser | ~5 |
| `rag/svr/task_executor.py` | Extract `__outline__`, persist via
`DocMetadataService.update_document_metadata()` | ~12 |
### Design decisions
- **Transient key pattern**: The outline is passed from parser →
task_executor via `__outline__` on the first chunk dict, then removed
before indexing. This follows the same pattern as `metadata_obj` for
LLM-generated metadata.
- **No schema changes**: Uses the existing `meta_fields` JSON column on
the document table.
- **Graceful degradation**: If a PDF has no outline (common for scanned
docs), nothing is stored. If persistence fails, it logs a warning and
continues — parsing is not interrupted.
### Backward compatibility
- **Fully backward compatible** — no existing fields, behavior, or
schemas changed
- PDFs without outlines are unaffected
- Existing `meta_fields` data is preserved (merged, not overwritten)
## Test plan
- [ ] Parse a PDF with bookmarks (e.g. any multi-chapter document),
verify `meta_fields["outline"]` is populated
- [ ] Parse a PDF without bookmarks, verify no errors and no outline key
in meta_fields
- [ ] Verify existing `meta_fields` data is preserved (not overwritten)
when outline is added
- [ ] Verify `manual` parser also persists outlines
- [ ] Verify outline JSON structure: `[{"title": "Chapter 1", "depth":
0}, ...]`
Related: #9921 (Deterministic Document Access Layer)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-authored-by: yuch85 <yuch85.1@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
2026-04-27 11:57:06 +08:00
# Extract and persist PDF outline if the parser attached it.
2026-05-27 21:54:17 +08:00
outline_data = cks [ 0 ] . get ( " __outline__ " ) if cks else None
get_recording_context ( ) . record ( " outline_data " , outline_data )
feat: persist PDF bookmark outline as document metadata (#13287)
## Summary
PDF files often contain a bookmark/outline tree (table of contents built
into the file by the authoring tool). RAGFlow's `pdf_parser.outlines`
already extracts these `(title, depth)` tuples via pypdf, but they are
used ephemerally during chunking (`manual` parser uses them for
hierarchy detection) and then discarded.
This PR persists the outline as `doc.meta_fields["outline"]` — a JSON
array of `{"title": str, "depth": int}` objects — so downstream features
can use the structural information.
### Why this matters
- **Complementary to `toc_extraction`** — the existing `toc_extraction`
feature uses LLM calls to generate a TOC and only works for the `naive`
parser. The raw PDF outline is free (already extracted by pypdf), works
for all parsers, and captures the author's original document structure.
- **Document navigation** — frontends can render a clickable TOC from
the outline
- **Entity extraction** — the outline provides a structural map for
identifying document sections and key topics
- **Search result context** — knowing which section a chunk belongs to
helps users evaluate relevance
### Changes
| File | Change | LOC |
|------|--------|-----|
| `rag/app/naive.py` | Attach `pdf_parser.outlines` as `__outline__` on
first chunk dict | ~7 |
| `rag/app/manual.py` | Same for the manual parser | ~5 |
| `rag/svr/task_executor.py` | Extract `__outline__`, persist via
`DocMetadataService.update_document_metadata()` | ~12 |
### Design decisions
- **Transient key pattern**: The outline is passed from parser →
task_executor via `__outline__` on the first chunk dict, then removed
before indexing. This follows the same pattern as `metadata_obj` for
LLM-generated metadata.
- **No schema changes**: Uses the existing `meta_fields` JSON column on
the document table.
- **Graceful degradation**: If a PDF has no outline (common for scanned
docs), nothing is stored. If persistence fails, it logs a warning and
continues — parsing is not interrupted.
### Backward compatibility
- **Fully backward compatible** — no existing fields, behavior, or
schemas changed
- PDFs without outlines are unaffected
- Existing `meta_fields` data is preserved (merged, not overwritten)
## Test plan
- [ ] Parse a PDF with bookmarks (e.g. any multi-chapter document),
verify `meta_fields["outline"]` is populated
- [ ] Parse a PDF without bookmarks, verify no errors and no outline key
in meta_fields
- [ ] Verify existing `meta_fields` data is preserved (not overwritten)
when outline is added
- [ ] Verify `manual` parser also persists outlines
- [ ] Verify outline JSON structure: `[{"title": "Chapter 1", "depth":
0}, ...]`
Related: #9921 (Deterministic Document Access Layer)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-authored-by: yuch85 <yuch85.1@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
2026-04-27 11:57:06 +08:00
if cks and cks [ 0 ] . get ( " __outline__ " ) :
outline = cks [ 0 ] . pop ( " __outline__ " )
try :
2026-06-08 04:08:23 -07:00
ret = DocMetadataService . update_document_metadata ( task [ " doc_id " ] , update_metadata_to ( { " outline " : outline } , DocMetadataService . get_document_metadata ( task [ " doc_id " ] ) or { } ) )
2026-05-27 21:54:17 +08:00
get_recording_context ( ) . save_func_return_value ( " DocMetadataService.update_document_metadata " , ret )
feat: persist PDF bookmark outline as document metadata (#13287)
## Summary
PDF files often contain a bookmark/outline tree (table of contents built
into the file by the authoring tool). RAGFlow's `pdf_parser.outlines`
already extracts these `(title, depth)` tuples via pypdf, but they are
used ephemerally during chunking (`manual` parser uses them for
hierarchy detection) and then discarded.
This PR persists the outline as `doc.meta_fields["outline"]` — a JSON
array of `{"title": str, "depth": int}` objects — so downstream features
can use the structural information.
### Why this matters
- **Complementary to `toc_extraction`** — the existing `toc_extraction`
feature uses LLM calls to generate a TOC and only works for the `naive`
parser. The raw PDF outline is free (already extracted by pypdf), works
for all parsers, and captures the author's original document structure.
- **Document navigation** — frontends can render a clickable TOC from
the outline
- **Entity extraction** — the outline provides a structural map for
identifying document sections and key topics
- **Search result context** — knowing which section a chunk belongs to
helps users evaluate relevance
### Changes
| File | Change | LOC |
|------|--------|-----|
| `rag/app/naive.py` | Attach `pdf_parser.outlines` as `__outline__` on
first chunk dict | ~7 |
| `rag/app/manual.py` | Same for the manual parser | ~5 |
| `rag/svr/task_executor.py` | Extract `__outline__`, persist via
`DocMetadataService.update_document_metadata()` | ~12 |
### Design decisions
- **Transient key pattern**: The outline is passed from parser →
task_executor via `__outline__` on the first chunk dict, then removed
before indexing. This follows the same pattern as `metadata_obj` for
LLM-generated metadata.
- **No schema changes**: Uses the existing `meta_fields` JSON column on
the document table.
- **Graceful degradation**: If a PDF has no outline (common for scanned
docs), nothing is stored. If persistence fails, it logs a warning and
continues — parsing is not interrupted.
### Backward compatibility
- **Fully backward compatible** — no existing fields, behavior, or
schemas changed
- PDFs without outlines are unaffected
- Existing `meta_fields` data is preserved (merged, not overwritten)
## Test plan
- [ ] Parse a PDF with bookmarks (e.g. any multi-chapter document),
verify `meta_fields["outline"]` is populated
- [ ] Parse a PDF without bookmarks, verify no errors and no outline key
in meta_fields
- [ ] Verify existing `meta_fields` data is preserved (not overwritten)
when outline is added
- [ ] Verify `manual` parser also persists outlines
- [ ] Verify outline JSON structure: `[{"title": "Chapter 1", "depth":
0}, ...]`
Related: #9921 (Deterministic Document Access Layer)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-authored-by: yuch85 <yuch85.1@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
2026-04-27 11:57:06 +08:00
logging . info ( " Persisted PDF outline ( %d entries) for doc %s " , len ( outline ) , task [ " doc_id " ] )
except Exception as e :
logging . warning ( " Failed to persist PDF outline for doc %s : %s " , task [ " doc_id " ] , e )
2024-01-31 19:57:45 +08:00
docs = [ ]
2026-06-08 04:08:23 -07:00
doc = { " doc_id " : task [ " doc_id " ] , " kb_id " : str ( task [ " kb_id " ] ) }
2024-12-08 14:21:12 +08:00
if task [ " pagerank " ] :
2025-01-09 17:07:21 +08:00
doc [ PAGERANK_FLD ] = int ( task [ " pagerank " ] )
2025-05-06 14:39:45 +08:00
st = timer ( )
2024-01-15 08:46:22 +08:00
2025-07-15 09:36:45 +08:00
@timeout ( 60 )
2025-05-06 14:39:45 +08:00
async def upload_to_minio ( document , chunk ) :
2024-08-30 18:41:31 +08:00
try :
2025-05-27 17:49:37 +08:00
d = copy . deepcopy ( document )
d . update ( chunk )
2026-06-08 04:08:23 -07:00
d [ " id " ] = xxhash . xxh64 ( ( chunk [ " content_with_weight " ] + str ( d [ " doc_id " ] ) ) . encode ( " utf-8 " , " surrogatepass " ) ) . hexdigest ( )
2025-05-27 17:49:37 +08:00
d [ " create_time " ] = str ( datetime . now ( ) ) . replace ( " T " , " " ) [ : 19 ]
d [ " create_timestamp_flt " ] = datetime . now ( ) . timestamp ( )
2026-01-13 12:54:13 +08:00
if d . get ( " img_id " ) :
docs . append ( d )
return
2025-05-27 17:49:37 +08:00
if not d . get ( " image " ) :
_ = d . pop ( " image " , None )
d [ " img_id " ] = " "
docs . append ( d )
return
2025-11-06 09:36:38 +08:00
await image2id ( d , partial ( settings . STORAGE_IMPL . put , tenant_id = task [ " tenant_id " ] ) , d [ " id " ] , task [ " kb_id " ] )
2025-10-09 12:36:19 +08:00
docs . append ( d )
2024-11-12 17:35:13 +08:00
except Exception :
2026-06-08 04:08:23 -07:00
logging . exception ( " Saving image of chunk {} / {} / {} got exception " . format ( task [ " location " ] , task [ " name " ] , d [ " id " ] ) )
2024-11-15 18:51:09 +08:00
raise
2024-01-15 08:46:22 +08:00
2025-12-09 19:23:14 +08:00
tasks = [ ]
for ck in cks :
tasks . append ( asyncio . create_task ( upload_to_minio ( doc , ck ) ) )
try :
await asyncio . gather ( * tasks , return_exceptions = False )
except Exception as e :
logging . error ( f " MINIO PUT( { task [ ' name ' ] } ) got exception: { e } " )
for t in tasks :
t . cancel ( )
await asyncio . gather ( * tasks , return_exceptions = True )
raise
2025-05-06 14:39:45 +08:00
el = timer ( ) - st
logging . info ( " MINIO PUT( {} ) cost {:.3f} s " . format ( task [ " name " ] , el ) )
2024-01-15 08:46:22 +08:00
2026-05-27 21:54:17 +08:00
# Record docs after MinIO upload
get_recording_context ( ) . record ( " docs_after_prep " , docs )
2024-12-01 22:28:00 +08:00
if task [ " parser_config " ] . get ( " auto_keywords " , 0 ) :
2024-11-14 16:28:10 +08:00
st = timer ( )
2024-12-01 22:28:00 +08:00
progress_callback ( msg = " Start to generate keywords for every chunk ... " )
2026-05-29 17:39:41 +08:00
chat_model_config = get_model_config_from_provider_instance ( task [ " tenant_id " ] , LLMType . CHAT , task [ " llm_id " ] )
2026-03-05 17:27:17 +08:00
chat_mdl = LLMBundle ( task [ " tenant_id " ] , chat_model_config , lang = task [ " language " ] )
2024-12-17 09:48:03 +08:00
2025-02-26 15:21:14 +08:00
async def doc_keyword_extraction ( chat_mdl , d , topn ) :
cached = get_llm_cache ( chat_mdl . llm_name , d [ " content_with_weight " ] , " keywords " , { " topn " : topn } )
if not cached :
2025-12-30 20:24:27 +08:00
if has_canceled ( task [ " id " ] ) :
progress_callback ( - 1 , msg = " Task has been canceled. " )
return
2025-03-03 18:59:49 +08:00
async with chat_limiter :
2025-12-11 17:38:17 +08:00
cached = await keyword_extraction ( chat_mdl , d [ " content_with_weight " ] , topn )
2025-02-26 15:21:14 +08:00
set_llm_cache ( chat_mdl . llm_name , d [ " content_with_weight " ] , cached , " keywords " , { " topn " : topn } )
if cached :
fix(keyword_extraction): accept Chinese commas/semicolons/newlines as keyword delimiters (#14540)
## What
Widen the keyword delimiter in `rag/svr/task_executor.py`:
both `build_chunks` (LLM `keyword_extraction` cache parsing) and
`run_dataflow` (chunk-level `keywords` ingestion) now split on
`,` `,` `;` `;` `、` `\r` `\n` instead of only the ASCII comma.
## Why
`rag/prompts/keyword_prompt.md` instructs the LLM:
> The keywords are delimited by ENGLISH COMMA.
In practice, Chinese-leaning models (Qwen / Tongyi-Qianwen, GLM,
etc.) frequently ignore this instruction when the source content is
Chinese and emit Chinese commas (`,`) instead. Result:
`cached.split(",")` sees the full LLM output as a *single* keyword.
Repro: `auto_keywords>=4` + Chinese docs + `qwen-plus@Tongyi-Qianwen`.
We observed entries in `important_kwd` like
`"功能介绍,配置说明,参数详解,问题排查"` — one bucket instead of four.
## Impact
- Silent data-quality bug; no exception thrown.
- BM25 `important_kwd^30` boost effectively stops firing — the
indexed term is the whole list, never matches user query tokens.
- Any downstream aggregating `important_kwd` (tagging, analytics,
candidate-keyword review UIs) sees garbage.
## Compatibility
- Pure widening of the splitter; ASCII-comma-only outputs continue
to work identically.
- No schema / API change.
## Test plan
Manually verified against `qwen-plus@Tongyi-Qianwen` with
`auto_keywords=10` on Chinese .txt files:
- Before: `important_kwd` contains one element per chunk that is the
full LLM string with `,`-separated phrases inside.
- After: `important_kwd` contains N elements, one per phrase, as the
LLM intended.
2026-05-11 12:05:24 +08:00
d [ " important_kwd " ] = [ k for k in re . split ( r " [,, ;;、 \ r \ n]+ " , cached ) if k . strip ( ) ]
2025-02-26 15:21:14 +08:00
d [ " important_tks " ] = rag_tokenizer . tokenize ( " " . join ( d [ " important_kwd " ] ) )
return
2025-12-29 12:01:18 +08:00
2025-12-09 19:23:14 +08:00
tasks = [ ]
for d in docs :
2026-06-08 04:08:23 -07:00
tasks . append ( asyncio . create_task ( doc_keyword_extraction ( chat_mdl , d , task [ " parser_config " ] [ " auto_keywords " ] ) ) )
2025-12-09 19:23:14 +08:00
try :
await asyncio . gather ( * tasks , return_exceptions = False )
except Exception as e :
logging . error ( " Error in doc_keyword_extraction: {} " . format ( e ) )
for t in tasks :
t . cancel ( )
await asyncio . gather ( * tasks , return_exceptions = True )
raise
2025-02-26 15:21:14 +08:00
progress_callback ( msg = " Keywords generation {} chunks completed in {:.2f} s " . format ( len ( docs ) , timer ( ) - st ) )
2024-10-23 17:00:56 +08:00
2026-05-27 21:54:17 +08:00
# Record keywords extraction count
keywords = [ d for d in docs if d . get ( " important_kwd " ) ]
get_recording_context ( ) . record ( " keywords_extracted " , keywords )
2024-12-01 22:28:00 +08:00
if task [ " parser_config " ] . get ( " auto_questions " , 0 ) :
2024-11-14 16:28:10 +08:00
st = timer ( )
2024-12-01 22:28:00 +08:00
progress_callback ( msg = " Start to generate questions for every chunk ... " )
2026-05-29 17:39:41 +08:00
chat_model_config = get_model_config_from_provider_instance ( task [ " tenant_id " ] , LLMType . CHAT , task [ " llm_id " ] )
2026-03-05 17:27:17 +08:00
chat_mdl = LLMBundle ( task [ " tenant_id " ] , chat_model_config , lang = task [ " language " ] )
2024-12-17 09:48:03 +08:00
2025-02-26 15:21:14 +08:00
async def doc_question_proposal ( chat_mdl , d , topn ) :
cached = get_llm_cache ( chat_mdl . llm_name , d [ " content_with_weight " ] , " question " , { " topn " : topn } )
if not cached :
2025-12-30 20:24:27 +08:00
if has_canceled ( task [ " id " ] ) :
progress_callback ( - 1 , msg = " Task has been canceled. " )
return
2025-03-03 18:59:49 +08:00
async with chat_limiter :
2025-12-11 17:38:17 +08:00
cached = await question_proposal ( chat_mdl , d [ " content_with_weight " ] , topn )
2025-02-26 15:21:14 +08:00
set_llm_cache ( chat_mdl . llm_name , d [ " content_with_weight " ] , cached , " question " , { " topn " : topn } )
if cached :
d [ " question_kwd " ] = cached . split ( " \n " )
d [ " question_tks " ] = rag_tokenizer . tokenize ( " \n " . join ( d [ " question_kwd " ] ) )
2025-12-29 12:01:18 +08:00
2025-12-09 19:23:14 +08:00
tasks = [ ]
for d in docs :
2026-06-08 04:08:23 -07:00
tasks . append ( asyncio . create_task ( doc_question_proposal ( chat_mdl , d , task [ " parser_config " ] [ " auto_questions " ] ) ) )
2025-12-09 19:23:14 +08:00
try :
await asyncio . gather ( * tasks , return_exceptions = False )
except Exception as e :
logging . error ( " Error in doc_question_proposal " , exc_info = e )
for t in tasks :
t . cancel ( )
await asyncio . gather ( * tasks , return_exceptions = True )
raise
2025-02-26 15:21:14 +08:00
progress_callback ( msg = " Question generation {} chunks completed in {:.2f} s " . format ( len ( docs ) , timer ( ) - st ) )
2024-10-23 17:00:56 +08:00
2026-05-27 21:54:17 +08:00
# Record question generation
questions = [ d for d in docs if d . get ( " question_kwd " ) ]
get_recording_context ( ) . record ( " questions_generated " , questions )
2026-04-21 17:22:42 +08:00
if task [ " parser_config " ] . get ( " enable_metadata " , False ) and ( task [ " parser_config " ] . get ( " metadata " ) or task [ " parser_config " ] . get ( " built_in_metadata " ) ) :
2025-12-17 16:50:36 +08:00
st = timer ( )
progress_callback ( msg = " Start to generate meta-data for every chunk ... " )
2026-05-29 17:39:41 +08:00
chat_model_config = get_model_config_from_provider_instance ( task [ " tenant_id " ] , LLMType . CHAT , task [ " llm_id " ] )
2026-03-05 17:27:17 +08:00
chat_mdl = LLMBundle ( task [ " tenant_id " ] , chat_model_config , lang = task [ " language " ] )
2025-12-17 16:50:36 +08:00
async def gen_metadata_task ( chat_mdl , d ) :
2026-04-27 16:18:06 +08:00
metadata_conf = task [ " parser_config " ] . get ( " metadata " , [ ] )
built_in_metadata = list ( task [ " parser_config " ] . get ( " built_in_metadata " ) or [ ] )
if isinstance ( metadata_conf , dict ) :
if not isinstance ( metadata_conf . get ( " properties " ) , dict ) :
metadata_conf = { " type " : " object " , " properties " : { } }
if built_in_metadata :
metadata_conf = {
* * metadata_conf ,
" properties " : {
* * metadata_conf . get ( " properties " , { } ) ,
* * turn2jsonschema ( built_in_metadata ) . get ( " properties " , { } ) ,
} ,
}
elif isinstance ( metadata_conf , list ) :
metadata_conf = metadata_conf + built_in_metadata
else :
metadata_conf = built_in_metadata
2026-06-08 04:08:23 -07:00
cached = get_llm_cache ( chat_mdl . llm_name , d [ " content_with_weight " ] , " metadata " , metadata_conf )
2025-12-17 16:50:36 +08:00
if not cached :
2025-12-30 20:24:27 +08:00
if has_canceled ( task [ " id " ] ) :
progress_callback ( - 1 , msg = " Task has been canceled. " )
return
2025-12-17 16:50:36 +08:00
async with chat_limiter :
2026-06-08 04:08:23 -07:00
cached = await gen_metadata ( chat_mdl , turn2jsonschema ( metadata_conf ) , d [ " content_with_weight " ] )
set_llm_cache ( chat_mdl . llm_name , d [ " content_with_weight " ] , cached , " metadata " , metadata_conf )
2025-12-17 16:50:36 +08:00
if cached :
d [ " metadata_obj " ] = cached
2025-12-29 12:01:18 +08:00
2025-12-17 16:50:36 +08:00
tasks = [ ]
for d in docs :
tasks . append ( asyncio . create_task ( gen_metadata_task ( chat_mdl , d ) ) )
try :
await asyncio . gather ( * tasks , return_exceptions = False )
except Exception as e :
logging . error ( " Error in doc_question_proposal " , exc_info = e )
for t in tasks :
t . cancel ( )
await asyncio . gather ( * tasks , return_exceptions = True )
raise
metadata = { }
2025-12-24 09:32:19 +08:00
for doc in docs :
metadata = update_metadata_to ( metadata , doc [ " metadata_obj " ] )
2025-12-24 13:40:34 +08:00
del doc [ " metadata_obj " ]
2025-12-17 16:50:36 +08:00
if metadata :
2026-01-28 13:29:34 +08:00
existing_meta = DocMetadataService . get_document_metadata ( task [ " doc_id " ] )
existing_meta = existing_meta if isinstance ( existing_meta , dict ) else { }
metadata = update_metadata_to ( metadata , existing_meta )
2026-05-27 21:54:17 +08:00
ret = DocMetadataService . update_document_metadata ( task [ " doc_id " ] , metadata )
get_recording_context ( ) . save_func_return_value ( " DocMetadataService.update_document_metadata " , ret )
2025-12-17 16:50:36 +08:00
progress_callback ( msg = " Question generation {} chunks completed in {:.2f} s " . format ( len ( docs ) , timer ( ) - st ) )
2026-05-27 21:54:17 +08:00
# Record metadata generation count
metadata_list = [ d for d in docs if d . get ( " metadata_obj " ) ]
get_recording_context ( ) . record ( " metadata_list_generated " , metadata_list )
2025-01-09 17:07:21 +08:00
if task [ " kb_parser_config " ] . get ( " tag_kb_ids " , [ ] ) :
progress_callback ( msg = " Start to tag for every chunk ... " )
kb_ids = task [ " kb_parser_config " ] [ " tag_kb_ids " ]
tenant_id = task [ " tenant_id " ]
topn_tags = task [ " kb_parser_config " ] . get ( " topn_tags " , 3 )
S = 1000
st = timer ( )
examples = [ ]
all_tags = get_tags_from_cache ( kb_ids )
if not all_tags :
2025-11-06 09:36:38 +08:00
all_tags = settings . retriever . all_tags_in_portion ( tenant_id , kb_ids , S )
2025-01-09 17:07:21 +08:00
set_tags_to_cache ( kb_ids , all_tags )
else :
all_tags = json . loads ( all_tags )
2026-05-29 17:39:41 +08:00
chat_model_config = get_model_config_from_provider_instance ( tenant_id , LLMType . CHAT , task [ " llm_id " ] )
2026-03-05 17:27:17 +08:00
chat_mdl = LLMBundle ( task [ " tenant_id " ] , chat_model_config , lang = task [ " language " ] )
2025-02-26 15:21:14 +08:00
docs_to_tag = [ ]
2025-01-09 17:07:21 +08:00
for d in docs :
2025-07-15 17:19:27 +08:00
task_canceled = has_canceled ( task [ " id " ] )
2025-05-22 09:28:08 +08:00
if task_canceled :
progress_callback ( - 1 , msg = " Task has been canceled. " )
2025-11-12 19:00:15 +08:00
return None
2026-06-08 04:08:23 -07:00
if settings . retriever . tag_content ( tenant_id , kb_ids , d , all_tags , topn_tags = topn_tags , S = S ) and len ( d [ TAG_FLD ] ) > 0 :
2025-01-09 17:07:21 +08:00
examples . append ( { " content " : d [ " content_with_weight " ] , TAG_FLD : d [ TAG_FLD ] } )
2025-02-26 15:21:14 +08:00
else :
docs_to_tag . append ( d )
async def doc_content_tagging ( chat_mdl , d , topn_tags ) :
2025-01-09 17:07:21 +08:00
cached = get_llm_cache ( chat_mdl . llm_name , d [ " content_with_weight " ] , all_tags , { " topn " : topn_tags } )
if not cached :
2025-12-30 20:24:27 +08:00
if has_canceled ( task [ " id " ] ) :
progress_callback ( - 1 , msg = " Task has been canceled. " )
return
2025-12-29 12:01:18 +08:00
picked_examples = random . choices ( examples , k = 2 ) if len ( examples ) > 2 else examples
2025-03-19 17:30:47 +08:00
if not picked_examples :
2026-06-08 04:08:23 -07:00
picked_examples . append ( { " content " : " This is an example " , TAG_FLD : { " example " : 1 } } )
2025-03-03 18:59:49 +08:00
async with chat_limiter :
2025-12-11 17:38:17 +08:00
cached = await content_tagging (
2025-12-09 19:23:14 +08:00
chat_mdl ,
d [ " content_with_weight " ] ,
all_tags ,
picked_examples ,
topn_tags ,
)
2025-01-09 17:07:21 +08:00
if cached :
2025-01-23 17:26:20 +08:00
cached = json . dumps ( cached )
if cached :
set_llm_cache ( chat_mdl . llm_name , d [ " content_with_weight " ] , cached , all_tags , { " topn " : topn_tags } )
d [ TAG_FLD ] = json . loads ( cached )
2025-12-29 12:01:18 +08:00
2025-12-09 19:23:14 +08:00
tasks = [ ]
for d in docs_to_tag :
tasks . append ( asyncio . create_task ( doc_content_tagging ( chat_mdl , d , topn_tags ) ) )
try :
await asyncio . gather ( * tasks , return_exceptions = False )
except Exception as e :
logging . error ( " Error tagging docs: {} " . format ( e ) )
for t in tasks :
t . cancel ( )
await asyncio . gather ( * tasks , return_exceptions = True )
raise
2025-02-26 15:21:14 +08:00
progress_callback ( msg = " Tagging {} chunks completed in {:.2f} s " . format ( len ( docs ) , timer ( ) - st ) )
2025-01-09 17:07:21 +08:00
2026-05-27 21:54:17 +08:00
# Record tags applied
tags_applied = [ d for d in docs if d . get ( TAG_FLD ) ]
get_recording_context ( ) . record ( " tags_applied " , tags_applied )
# Record final chunks for comparison
get_recording_context ( ) . record ( " final_chunks " , docs )
final_chunk_ids = [ c . get ( " id " ) for c in docs if isinstance ( c , dict ) and " id " in c ]
get_recording_context ( ) . record ( " final_chunk_ids_count " , len ( final_chunk_ids ) )
2024-01-15 08:46:22 +08:00
return docs
2026-05-27 21:54:17 +08:00
@timed_with_recording
def build_TOC(task, docs, progress_callback):
    """Build a table-of-contents chunk from a document's chunks.

    The chunks are ordered by ``page_num_int`` / ``top_int``, their
    ``content_with_weight`` texts are handed to ``run_toc_from_text``, and each
    returned TOC entry has its ``chunk_id`` replaced by an ``ids`` list that
    holds the ids of the chunks the entry covers.

    Args:
        task: Task dict; ``tenant_id``, ``llm_id`` and ``language`` are read
            to build the chat model.
        docs: Chunk dicts. Each one is read for ``content_with_weight`` and
            ``id``; the last one (in sorted order) is also read for ``doc_id``.
        progress_callback: Called with ``msg=...`` here and forwarded to
            ``run_toc_from_text``.

    Returns:
        A deep copy of the last sorted chunk rewritten as the TOC chunk
        (JSON-encoded TOC as content, ``toc_kwd == "toc"``,
        ``available_int == 0``), or ``None`` when no TOC was produced or there
        is no chunk to base the TOC chunk on.
    """
    progress_callback(msg="Start to generate table of content ...")
    chat_model_config = get_model_config_from_provider_instance(task["tenant_id"], LLMType.CHAT, task["llm_id"])
    chat_mdl = LLMBundle(task["tenant_id"], chat_model_config, lang=task["language"])
    docs = sorted(
        docs,
        key=lambda d: (
            d.get("page_num_int", 0)[0] if isinstance(d.get("page_num_int", 0), list) else d.get("page_num_int", 0),
            d.get("top_int", 0)[0] if isinstance(d.get("top_int", 0), list) else d.get("top_int", 0),
        ),
    )
    toc: list[dict] = asyncio.run(run_toc_from_text([d["content_with_weight"] for d in docs], chat_mdl, progress_callback))
    logging.info("------------ T O C -------------\n" + json.dumps(toc, ensure_ascii=False, indent="  "))
    for ii, item in enumerate(toc):
        try:
            chunk_val = item.pop("chunk_id", None)
            if chunk_val is None or str(chunk_val).strip() == "":
                logging.warning(f"Index {ii}: chunk_id is missing or empty. Skipping.")
                continue
            curr_idx = int(chunk_val)
            if curr_idx < 0:
                # A negative value would silently index docs from the end and
                # attach an unrelated chunk to this TOC entry.
                logging.error(f"Index {ii}: chunk_id {curr_idx} is negative. Skipping.")
                continue
            if curr_idx >= len(docs):
                logging.error(f"Index {ii}: chunk_id {curr_idx} exceeds docs length {len(docs)}.")
                continue
            item["ids"] = [docs[curr_idx]["id"]]
            if ii + 1 < len(toc):
                # The next entry still carries its chunk_id (it is popped only
                # when that entry is processed), so it bounds the range fill.
                next_chunk_val = toc[ii + 1].get("chunk_id", "")
                if str(next_chunk_val).strip() != "":
                    next_idx = int(next_chunk_val)
                    for jj in range(curr_idx + 1, min(next_idx + 1, len(docs))):
                        item["ids"].append(docs[jj]["id"])
                else:
                    logging.warning(f"Index {ii + 1}: next chunk_id is empty, range fill skipped.")
        except (ValueError, TypeError) as e:
            logging.error(f"Index {ii}: Data conversion error - {e}")
        except Exception as e:
            logging.exception(f"Index {ii}: Unexpected error - {e}")
    if not toc:
        return None
    if not docs:
        # Nothing to clone the TOC chunk from; docs[-1] would raise IndexError.
        logging.warning("TOC was generated but there are no chunks to attach it to. Skipping TOC chunk.")
        return None
    d = copy.deepcopy(docs[-1])
    d["content_with_weight"] = json.dumps(toc, ensure_ascii=False)
    d["toc_kwd"] = "toc"
    d["available_int"] = 0
    d["page_num_int"] = [100000000]
    d["id"] = xxhash.xxh64((d["content_with_weight"] + str(d["doc_id"])).encode("utf-8", "surrogatepass")).hexdigest()
    return d
2025-10-14 14:14:52 +08:00
2024-11-12 14:59:41 +08:00
def init_kb(row, vector_size: int):
    """Create the document-store index for the knowledge base in ``row``.

    Args:
        row: Mapping with a required ``tenant_id`` and optional ``kb_id``
            (defaults to ``""``) and ``parser_id`` (defaults to ``None``).
        vector_size: Passed through to ``create_idx`` unchanged.

    Returns:
        Whatever ``settings.docStoreConn.create_idx`` returns.
    """
    index_name = search.index_name(row["tenant_id"])
    parser = row.get("parser_id", None)
    kb_id = row.get("kb_id", "")
    return settings.docStoreConn.create_idx(index_name, kb_id, vector_size, parser)
2024-01-15 08:46:22 +08:00
2026-05-27 21:54:17 +08:00
@timed_with_recording
async def embedding(docs, mdl, parser_config=None, callback=None):
    """Embed every chunk in ``docs`` and attach the vector to the chunk in place.

    For each chunk the text to embed is its ``question_kwd`` entries joined by
    newlines, falling back to ``content_with_weight``. Table-related HTML tags
    are replaced by a space, and text that is empty or whitespace-only after
    that cleanup is replaced by the placeholder ``"None"``. The title
    (``docnm_kwd``) of the first chunk is encoded once and reused for all
    chunks; the final vector is a weighted mix of title and content vectors.

    Args:
        docs: Sequence of chunk dicts. Each gets a new ``q_<dim>_vec`` key.
        mdl: Embedding model exposing ``encode(list[str]) -> (vectors, token_count)``
            and ``max_length``.
        parser_config: Optional mapping; ``filename_embd_weight`` sets the title
            weight. Missing or falsy values (including ``0``) fall back to ``0.1``.
        callback: Optional progress callable invoked as ``callback(prog=..., msg="")``
            after each batch. May be omitted.

    Returns:
        ``(token_count, vector_size)``. ``(0, 0)`` when ``docs`` is empty.
    """
    if parser_config is None:
        parser_config = {}
    if not docs:
        # Nothing to embed: skip the model round-trip, which would otherwise
        # index into an empty encode result.
        return 0, 0

    titles, contents = [], []
    for d in docs:
        titles.append(d.get("docnm_kwd", "Title"))
        c = "\n".join(d.get("question_kwd", []))
        if not c:
            c = d["content_with_weight"]
        c = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", c)
        if not c.strip():
            # Whitespace-only input is replaced by a non-empty placeholder before encoding.
            logging.debug("embedding(): normalized whitespace-only chunk to placeholder 'None' (len=%d)", len(c))
            c = "None"
        contents.append(c)

    tk_count = 0
    # Only the first title is encoded; it is tiled so every chunk shares it.
    title_vecs, used = await thread_pool_exec(mdl.encode, titles[0:1])
    title_mat = np.tile(title_vecs[0], (len(contents), 1))
    tk_count += used

    @timeout(60)
    def batch_encode(txts):
        nonlocal mdl
        return mdl.encode([truncate(t, mdl.max_length - 10) for t in txts])

    batch_size = settings.EMBEDDING_BATCH_SIZE
    batches = []
    for i in range(0, len(contents), batch_size):
        async with embed_limiter:
            vts, used = await thread_pool_exec(batch_encode, contents[i : i + batch_size])
        batches.append(vts)
        tk_count += used
        if callback is not None:
            callback(prog=0.7 + 0.2 * (i + 1) / len(contents), msg="")
    content_mat = np.vstack(batches) if batches else np.array([])

    # The stored config value may be None/0, so a falsy weight falls back to the default.
    title_w = float(parser_config.get("filename_embd_weight", 0.1) or 0.1)
    if title_mat.ndim == 2 and content_mat.ndim == 2 and title_mat.shape == content_mat.shape:
        vects = title_w * title_mat + (1 - title_w) * content_mat
    else:
        vects = content_mat

    assert len(vects) == len(docs)
    vector_size = 0
    for i, d in enumerate(docs):
        v = vects[i].tolist()
        vector_size = len(v)
        d["q_%d_vec" % len(v)] = v
    return tk_count, vector_size
2024-01-15 08:46:22 +08:00
2026-05-27 21:54:17 +08:00
@timed_with_recording
async def run_dataflow(task: dict):
    """Run a dataflow pipeline for one document, then embed and index its output.

    Steps, in order:
      1. Load the pipeline DSL (from the user canvas when ``task_type`` is
         ``"dataflow"``, otherwise from a stored pipeline operation log) and run it.
      2. Normalize the pipeline output (``chunks`` / ``json`` / ``markdown`` /
         ``text`` / ``html``) into a list of chunk dicts.
      3. Embed the chunks unless they already carry a ``q_<dim>_vec`` field.
      4. Fill in index fields on every chunk and merge chunk-level metadata
         into the document metadata.
      5. Insert the chunks and update the document's chunk/token counters.

    A pipeline operation log entry is written on every exit path except the
    canvas-debug early return.

    Args:
        task: Task description. Reads ``dataflow_id``, ``doc_id``, ``id``,
            ``kb_id``, ``task_type``, ``tenant_id``, ``name`` and optional ``file``.

    Returns:
        None.

    Raises:
        AssertionError: The canvas or pipeline log for ``dataflow_id`` was not found.
        TaskCanceledException: Re-raised unchanged from the embedding step.
    """
    from api.db.services.canvas_service import UserCanvasService
    from rag.flow.pipeline import Pipeline

    task_start_ts = timer()
    dataflow_id = task["dataflow_id"]
    doc_id = task["doc_id"]
    task_id = task["id"]
    task_dataset_id = task["kb_id"]
    if task["task_type"] == "dataflow":
        e, cvs = UserCanvasService.get_by_id(dataflow_id)
        assert e, "User pipeline not found."
        dsl = cvs.dsl
    else:
        e, pipeline_log = PipelineOperationLogService.get_by_id(dataflow_id)
        assert e, "Pipeline log not found."
        dsl = pipeline_log.dsl
        dataflow_id = pipeline_log.pipeline_id
    pipeline = Pipeline(dsl, tenant_id=task["tenant_id"], doc_id=doc_id, task_id=task_id, flow_id=dataflow_id)

    def log_pipeline_operation():
        """Persist the pipeline operation log entry and record the call's return value."""
        ret = PipelineOperationLogService.create(document_id=doc_id, pipeline_id=dataflow_id, task_type=PipelineTaskType.PARSE, dsl=str(pipeline))
        get_recording_context().save_func_return_value("PipelineOperationLogService.create", ret)

    chunks = await pipeline.run(file=task["file"]) if task.get("file") else await pipeline.run()
    if doc_id == CANVAS_DEBUG_DOC_ID:
        get_recording_context().record("dataflow_debug_result", "canvas_debug_mode")
        get_recording_context().record("dataflow_chunks", chunks)
        return
    if not chunks:
        get_recording_context().record("pipeline_output_count", 0)
        get_recording_context().record("pipeline_output_type", "empty")
        log_pipeline_operation()
        return

    embedding_token_consumption = chunks.get("embedding_token_consumption", 0)
    # The output key may exist with an empty payload; check presence, not truthiness.
    # NOTE(review): the markdown/text/html branches wrap the payload in a list under
    # "text", while the id hashing below concatenates ck["text"] with a str - confirm
    # which shape downstream code expects.
    if "chunks" in chunks:
        chunks = copy.deepcopy(chunks["chunks"])
        output_type = "chunks"
    elif "json" in chunks:
        chunks = copy.deepcopy(chunks["json"])
        output_type = "json"
    elif "markdown" in chunks:
        chunks = [{"text": [chunks["markdown"]]}] if chunks["markdown"] else []
        output_type = "markdown"
    elif "text" in chunks:
        chunks = [{"text": [chunks["text"]]}] if chunks["text"] else []
        output_type = "text"
    elif "html" in chunks:
        chunks = [{"text": [chunks["html"]]}] if chunks["html"] else []
        output_type = "html"
    else:
        chunks = []
        output_type = "empty"
    get_recording_context().record("pipeline_output_type", output_type)
    get_recording_context().record("pipeline_output_count", len(chunks))
    # An empty normalized payload means "nothing parsed", so stop before embedding/indexing.
    if not chunks:
        log_pipeline_operation()
        return

    already_embedded = any(re.match(r"q_[0-9]+_vec", k) for o in chunks for k in o.keys())
    if not already_embedded:
        try:
            set_progress(task_id, prog=0.82, msg="\n-------------------------------------\nStart to embedding...")
            e, kb = KnowledgebaseService.get_by_id(task["kb_id"])
            if not e:
                # Surface a readable error through the handler below instead of an
                # AttributeError on a missing knowledge base record.
                raise LookupError(f"Knowledge base {task['kb_id']} not found.")
            embedding_id = kb.embd_id
            embd_model_config = get_model_config_from_provider_instance(task["tenant_id"], LLMType.EMBEDDING, embedding_id)
            embedding_model = LLMBundle(task["tenant_id"], embd_model_config)

            @timeout(60)
            def batch_encode(txts):
                nonlocal embedding_model
                return embedding_model.encode([truncate(c, embedding_model.max_length - 10) for c in txts])

            vects_batches = []
            texts = [o.get("questions", o.get("summary", o["text"])) for o in chunks]
            delta = 0.20 / (len(texts) // settings.EMBEDDING_BATCH_SIZE + 1)
            prog = 0.8
            for i in range(0, len(texts), settings.EMBEDDING_BATCH_SIZE):
                async with embed_limiter:
                    vts, c = await thread_pool_exec(batch_encode, texts[i : i + settings.EMBEDDING_BATCH_SIZE])
                vects_batches.append(vts)
                embedding_token_consumption += c
                prog += delta
                # NOTE(review): `i` is an item offset (a multiple of the batch size) but is
                # compared against a float modulus derived from the batch count, so this
                # condition looks like it rarely holds - confirm the intended cadence.
                if i % (len(texts) // settings.EMBEDDING_BATCH_SIZE / 100 + 1) == 1:
                    set_progress(task_id, prog=prog, msg=f"{i + 1} / {len(texts) // settings.EMBEDDING_BATCH_SIZE}")
            vects = np.vstack(vects_batches) if vects_batches else np.array([])
            get_recording_context().record("embedding_token_consumption", embedding_token_consumption)
            get_recording_context().record("vector_size", len(vects[0]) if len(vects) > 0 else 0)
            assert len(vects) == len(chunks)
            for i, ck in enumerate(chunks):
                v = vects[i].tolist()
                ck["q_%d_vec" % len(v)] = v
        except TaskCanceledException:
            raise
        except Exception as e:
            set_progress(task_id, prog=-1, msg=f"[ERROR]: {e}")
            log_pipeline_operation()
            return

    metadata = {}
    for ck in chunks:
        now = datetime.now()  # one timestamp so both time fields agree
        ck["doc_id"] = doc_id
        ck["kb_id"] = [str(task["kb_id"])]
        ck["docnm_kwd"] = task["name"]
        ck["create_time"] = str(now).replace("T", " ")[:19]
        ck["create_timestamp_flt"] = now.timestamp()
        if not ck.get("id"):
            # "surrogatepass" lets lone surrogates hash instead of raising; the bytes
            # (and therefore the id) are unchanged for well-formed text.
            ck["id"] = xxhash.xxh64((ck["text"] + str(ck["doc_id"])).encode("utf-8", "surrogatepass")).hexdigest()
        if "questions" in ck:
            if "question_tks" not in ck:
                ck["question_kwd"] = ck["questions"].split("\n")
                ck["question_tks"] = rag_tokenizer.tokenize(str(ck["questions"]))
            del ck["questions"]
        if "keywords" in ck:
            if "important_tks" not in ck:
                # Split on ASCII and full-width commas/semicolons, the ideographic comma and newlines.
                ck["important_kwd"] = [k for k in re.split(r"[,,;;、\r\n]+", ck["keywords"]) if k.strip()]
                ck["important_tks"] = rag_tokenizer.tokenize(str(ck["keywords"]))
            del ck["keywords"]
        if "summary" in ck:
            if "content_ltks" not in ck:
                ck["content_ltks"] = rag_tokenizer.tokenize(str(ck["summary"]))
                ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
            del ck["summary"]
        if "metadata" in ck:
            metadata = update_metadata_to(metadata, ck["metadata"])
            del ck["metadata"]
        if "content_with_weight" not in ck:
            ck["content_with_weight"] = ck["text"]
        del ck["text"]
        if "positions" in ck:
            add_positions(ck, ck["positions"])
            del ck["positions"]

    if metadata:
        existing_meta = DocMetadataService.get_document_metadata(doc_id)
        existing_meta = existing_meta if isinstance(existing_meta, dict) else {}
        metadata = update_metadata_to(metadata, existing_meta)
        get_recording_context().record("run_dataflow_metadata", metadata)
        ret = DocMetadataService.update_document_metadata(doc_id, metadata)
        get_recording_context().save_func_return_value("DocMetadataService.update_document_metadata", ret)

    index_start_ts = timer()
    set_progress(task_id, prog=0.82, msg="[DOC Engine]:\nStart to index...")
    e = await insert_chunks(task_id, task["tenant_id"], task["kb_id"], chunks, partial(set_progress, task_id, 0, 100000000))
    if not e:
        log_pipeline_operation()
        return

    time_cost = timer() - index_start_ts
    task_time_cost = timer() - task_start_ts
    set_progress(task_id, prog=1.0, msg="Indexing done ({:.2f}s). Task done ({:.2f}s)".format(time_cost, task_time_cost))
    ret = DocumentService.increment_chunk_num(doc_id, task_dataset_id, embedding_token_consumption, len(chunks), task_time_cost)
    get_recording_context().save_func_return_value("DocumentService.increment_chunk_num", ret)
    logging.info("[Done], chunks({}), token({}), elapsed:{:.2f}".format(len(chunks), embedding_token_consumption, task_time_cost))
    get_recording_context().record("dataflow_chunks", chunks)
    log_pipeline_operation()
2025-09-05 18:50:46 +08:00
2026-06-08 04:08:23 -07:00
2026-05-11 15:42:31 -10:00
# Passed to the doc-store search used for RAPTOR marker lookups, in the argument
# slot right after the offset (presumably the maximum number of rows returned).
RAPTOR_METHOD_SEARCH_LIMIT = 10000
Refact: improve task resume mechanism for graphrag (#14096)
### What problem does this PR solve?
Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).
**Changes based on @yuzhichang's review:**
1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.
**Changes based on CodeRabbit review:**
3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.
### Changes
| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |
### How it works
- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.
### Testing
- 10 new unit tests — all passing
- Full existing suite: 617 passed
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
2026-04-15 15:07:28 +05:30
2026-05-11 15:42:31 -10:00
async def get_raptor_chunk_field_map(doc_id: str, tenant_id: str, kb_id: str) -> dict:
    """Fetch the ``raptor_kwd`` / ``extra`` fields stored for a document's chunks.

    The first query is restricted to rows tagged ``raptor_kwd == "raptor"``. When
    ``collect_raptor_chunk_ids`` finds nothing in that result, a second query over
    all rows of the document (newest ``create_timestamp_flt`` first) is tried; if
    that fallback raises, the failure is logged at debug level and the first
    result is returned instead.

    Args:
        doc_id: Document whose chunks are inspected.
        tenant_id: Tenant used to derive the index name.
        kb_id: Knowledge base the search is limited to.

    Returns:
        The mapping produced by ``settings.docStoreConn.get_fields``.
    """
    from common.doc_store.doc_store_base import OrderByExpr
    from rag.nlp import search as nlp_search

    async def _query(fields: list[str], condition: dict, order_by=None):
        """Run one doc-store search in the thread pool and extract ``fields``."""
        ordering = order_by or OrderByExpr()
        index_name = nlp_search.index_name(tenant_id)
        hits = await thread_pool_exec(
            settings.docStoreConn.search,
            fields,
            [],
            condition,
            [],
            ordering,
            0,
            RAPTOR_METHOD_SEARCH_LIMIT,
            index_name,
            [kb_id],
        )
        return settings.docStoreConn.get_fields(hits, fields)

    tagged_rows = await _query(["raptor_kwd", "extra"], {"doc_id": doc_id, "raptor_kwd": ["raptor"]})
    if collect_raptor_chunk_ids(tagged_rows):
        return tagged_rows

    try:
        newest_first = OrderByExpr().desc("create_timestamp_flt")
        return await _query(["raptor_kwd", "extra"], {"doc_id": doc_id}, newest_first)
    except Exception:
        logging.debug("RAPTOR fallback method lookup with extra field failed for doc %s", doc_id, exc_info=True)
        return tagged_rows
async def get_raptor_chunk_methods(doc_id: str, tenant_id: str, kb_id: str) -> set[str]:
    """Return the RAPTOR tree builders already stored for ``doc_id``.

    The stored marker fields are loaded with ``get_raptor_chunk_field_map`` and
    reduced to method names by ``collect_raptor_methods``; how legacy rows
    without method metadata are classified is decided by that helper. The
    outcome is logged as a checkpoint hit or miss.

    Raises:
        Exception: Any lookup failure is logged with its traceback and re-raised.
    """
    try:
        stored_fields = await get_raptor_chunk_field_map(doc_id, tenant_id, kb_id)
        found = collect_raptor_methods(stored_fields)
        if not found:
            logging.info(
                "Checkpoint miss: no RAPTOR chunks for doc %s (tenant=%s kb=%s)",
                doc_id,
                tenant_id,
                kb_id,
            )
            return found
        logging.info(
            "Checkpoint hit: RAPTOR chunks for doc %s (tenant=%s kb=%s methods=%s) already exist",
            doc_id,
            tenant_id,
            kb_id,
            sorted(found),
        )
        return found
    except Exception:
        logging.exception("Failed to check RAPTOR chunks for doc %s", doc_id)
        raise
async def has_raptor_chunks(doc_id: str, tenant_id: str, kb_id: str, tree_builder: str = RAPTOR_TREE_BUILDER) -> bool:
    """Tell whether ``tree_builder`` is among the RAPTOR methods stored for ``doc_id``."""
    stored_methods = await get_raptor_chunk_methods(doc_id, tenant_id, kb_id)
    return tree_builder in stored_methods
async def delete_raptor_chunks ( doc_id : str , tenant_id : str , kb_id : str , keep_method : str | None = None ) :
""" Delete RAPTOR summaries for doc_id, optionally preserving one method.

When keep_method is None, every chunk of doc_id tagged with the RAPTOR
keyword is deleted from the doc store and 0 is returned.

Otherwise the RAPTOR chunk field map of doc_id is loaded, the ids of all
chunks whose method is not keep_method are collected, and only those ids are
deleted. The number of collected ids is returned (0 when nothing is stale).

Both delete calls go through thread_pool_exec and their return value is
saved on the current recording context under "docStoreConn.delete".
"""
# Branch 1: no method to preserve -> wipe all RAPTOR summaries of this doc.
if keep_method is None :
logging . info (
" delete_raptor_chunks: removing all RAPTOR summaries (doc= %s tenant= %s kb= %s ) " ,
2026-06-08 04:08:23 -07:00
doc_id ,
tenant_id ,
kb_id ,
2026-05-11 15:42:31 -10:00
)
2026-05-27 21:54:17 +08:00
# Delete by condition (doc id + RAPTOR keyword) in the tenant's index.
# presumably thread_pool_exec keeps the blocking store call off the event loop; confirm in common.misc_utils
ret = await thread_pool_exec (
2026-05-11 15:42:31 -10:00
settings . docStoreConn . delete ,
{ " doc_id " : doc_id , " raptor_kwd " : [ " raptor " ] } ,
nlp_search . index_name ( tenant_id ) ,
kb_id ,
)
2026-05-27 21:54:17 +08:00
get_recording_context ( ) . save_func_return_value ( " docStoreConn.delete " , ret )
2026-05-11 15:42:31 -10:00
# NOTE(review): returns 0 here regardless of what the store reported in ret,
# while the selective branch below returns a count; confirm callers do not
# interpret this value as "number of chunks deleted".
return 0
# Branch 2: keep summaries built with keep_method, delete the rest.
field_map = await get_raptor_chunk_field_map ( doc_id , tenant_id , kb_id )
chunk_ids = collect_raptor_chunk_ids ( field_map , exclude_methods = { keep_method } )
# Nothing stale: log at debug level and skip the store round-trip entirely.
if not chunk_ids :
logging . debug (
" delete_raptor_chunks: no stale RAPTOR chunks to remove (doc= %s tenant= %s kb= %s keep= %s ) " ,
2026-06-08 04:08:23 -07:00
doc_id ,
tenant_id ,
kb_id ,
keep_method ,
2026-05-11 15:42:31 -10:00
)
return 0
logging . info (
" delete_raptor_chunks: removing %d stale RAPTOR chunks (doc= %s tenant= %s kb= %s keep= %s ) " ,
2026-06-08 04:08:23 -07:00
len ( chunk_ids ) ,
doc_id ,
tenant_id ,
kb_id ,
keep_method ,
2026-05-11 15:42:31 -10:00
)
2026-05-27 21:54:17 +08:00
# Delete by explicit chunk id; chunk_ids is materialised as a list for the condition.
ret = await thread_pool_exec (
2026-05-11 15:42:31 -10:00
settings . docStoreConn . delete ,
{ " id " : list ( chunk_ids ) } ,
nlp_search . index_name ( tenant_id ) ,
kb_id ,
)
2026-05-27 21:54:17 +08:00
get_recording_context ( ) . save_func_return_value ( " docStoreConn.delete " , ret )
2026-05-11 15:42:31 -10:00
# The returned count is the number of ids requested for deletion, not a value read from ret.
return len ( chunk_ids )
Refact: improve task resume mechanism for graphrag (#14096)
### What problem does this PR solve?
Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).
**Changes based on @yuzhichang's review:**
1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.
**Changes based on CodeRabbit review:**
3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.
### Changes
| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |
### How it works
- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.
### Testing
- 10 new unit tests — all passing
- Full existing suite: 617 passed
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
2026-04-15 15:07:28 +05:30
2025-07-15 09:36:45 +08:00
@timeout ( 3600 )
2025-10-09 12:36:19 +08:00
async def run_raptor_for_kb ( row , kb_parser_config , chat_mdl , embd_mdl , vector_size , callback = None , doc_ids = [ ] ) :
2026-05-11 15:42:31 -10:00
""" Generate RAPTOR summaries for selected documents in a knowledge base. """
2025-10-09 12:36:19 +08:00
fake_doc_id = GRAPH_RAPTOR_FAKE_DOC_ID
raptor_config = kb_parser_config . get ( " raptor " , { } )
2026-05-11 15:42:31 -10:00
raptor_ext_config = raptor_config . get ( " ext " ) or { }
tree_builder = get_raptor_tree_builder ( raptor_config )
clustering_method = get_raptor_clustering_method ( raptor_config )
2025-12-29 12:01:18 +08:00
vctr_nm = " q_ %d _vec " % vector_size
2025-11-11 16:58:47 +08:00
2024-05-23 14:31:16 +08:00
res = [ ]
tk_count = 0
2026-05-11 15:42:31 -10:00
cleanup_raptor_chunks = [ ]
2025-11-13 18:48:07 +08:00
max_errors = int ( os . environ . get ( " RAPTOR_MAX_ERRORS " , 3 ) )
2026-05-11 15:42:31 -10:00
doc_info_by_id = { }
2026-03-10 14:24:33 +08:00
for doc_id in set ( doc_ids ) :
ok , source_doc = DocumentService . get_by_id ( doc_id )
if not ok or not source_doc :
continue
2026-05-11 15:42:31 -10:00
doc_info_by_id [ doc_id ] = {
" name " : getattr ( source_doc , " name " , " " ) ,
" type " : getattr ( source_doc , " type " , " " ) ,
" parser_id " : getattr ( source_doc , " parser_id " , " " ) ,
" parser_config " : getattr ( source_doc , " parser_config " , { } ) or { } ,
}
def schedule_raptor_cleanup ( doc_id : str , keep_method : str | None = None ) :
""" Queue stale RAPTOR summaries for deletion after successful insert. """
cleanup_plan = ( doc_id , keep_method )
if cleanup_plan not in cleanup_raptor_chunks :
cleanup_raptor_chunks . append ( cleanup_plan )
def skip_raptor_doc ( doc_id : str ) - > bool :
""" Return whether RAPTOR should be skipped for this source document. """
doc_info = doc_info_by_id . get ( doc_id , { } )
file_type = doc_info . get ( " type " ) or row . get ( " type " , " " )
parser_id = doc_info . get ( " parser_id " ) or row . get ( " parser_id " , " " )
parser_config = doc_info . get ( " parser_config " ) or row . get ( " parser_config " , { } )
if should_skip_raptor ( file_type , parser_id , parser_config , raptor_config ) :
skip_reason = get_skip_reason ( file_type , parser_id , parser_config )
doc_name = doc_info . get ( " name " ) or doc_id
logging . info ( " Skipping Raptor for document %s : %s " , doc_name , skip_reason )
callback ( msg = f " [RAPTOR] doc: { doc_id } skipped: { skip_reason } " )
return True
return False
2025-11-13 18:48:07 +08:00
2025-11-11 19:46:41 +08:00
async def generate ( chunks , did ) :
2026-05-11 15:42:31 -10:00
""" Run RAPTOR and append generated summary chunks for one doc id. """
2025-11-11 16:58:47 +08:00
nonlocal tk_count , res
2026-05-11 15:42:31 -10:00
logging . info ( " RAPTOR: using tree_builder= %s clustering_method= %s for doc %s " , tree_builder , clustering_method , did )
2026-05-12 17:00:45 +08:00
from rag . raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor # Lazy load, save around 8s
2026-06-08 04:08:23 -07:00
2025-11-11 16:58:47 +08:00
raptor = Raptor (
raptor_config . get ( " max_cluster " , 64 ) ,
chat_mdl ,
embd_mdl ,
raptor_config [ " prompt " ] ,
raptor_config [ " max_token " ] ,
raptor_config [ " threshold " ] ,
2025-11-13 18:48:07 +08:00
max_errors = max_errors ,
2026-05-11 15:42:31 -10:00
tree_builder = tree_builder ,
clustering_method = clustering_method ,
psi_exact_max_leaves = raptor_ext_config . get ( " psi_exact_max_leaves " , 4096 ) ,
psi_bucket_size = raptor_ext_config . get ( " psi_bucket_size " , 1024 ) ,
2025-11-11 16:58:47 +08:00
)
original_length = len ( chunks )
feat: persist RAPTOR layer metadata on summary chunks (#13286)
## Summary
RAPTOR's recursive clustering builds a `layers` list tracking
`(start_idx, end_idx)` boundaries per level, but currently discards this
information — only the flat `chunks` list is returned. This makes it
impossible to distinguish leaf-level summaries from top-level ones.
This PR:
- Returns `(chunks, layers)` tuple from `raptor.py`'s `__call__`
- Annotates each RAPTOR summary chunk with `raptor_layer_int` (1 = first
summary level, 2 = summary-of-summaries, etc.)
- Adds `raptor_layer_int` to `infinity_mapping.json` (Elasticsearch
handles it via existing `*_int` dynamic template)
### Why this matters
Downstream features need to know which RAPTOR layer a summary belongs
to:
- **Retrieving the top-level document summary** for entity extraction,
search snippets, or document comparison
- **Filtering by abstraction level** — users may want only high-level
summaries or only leaf-level cluster summaries
- **RAPTOR recall quality** — #10951 reports summaries not being
recalled for definition queries; layer metadata enables targeted
retrieval
### Changes
| File | Change | LOC |
|------|--------|-----|
| `rag/raptor.py` | Return `(chunks, layers)` tuple | ~3 |
| `rag/svr/task_executor.py` | Build `chunk_layer` mapping, set
`raptor_layer_int` | ~12 |
| `conf/infinity_mapping.json` | Add `raptor_layer_int` integer field |
~1 |
### Backward compatibility
- **Additive only** — no existing fields or behavior changed
- Existing RAPTOR chunks continue to work (they'll have
`raptor_layer_int = 0` by default)
- New RAPTOR chunks get layer metadata automatically
## Test plan
- [ ] Parse a document with RAPTOR enabled, verify `raptor_layer_int` is
set on indexed chunks
- [ ] Verify `raptor_layer_int` values increase with abstraction level
(layer 1 < layer 2 < ...)
- [ ] Verify existing RAPTOR deletion (`delete by raptor_kwd`) still
works
- [ ] Verify Infinity backend accepts the new field
Fixes #7488
Related: #4104, #11191, #10951
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-authored-by: yuch85 <yuch85.1@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
2026-04-27 10:20:46 +08:00
chunks , layers = await raptor ( chunks , kb_parser_config [ " raptor " ] [ " random_seed " ] , callback , row [ " id " ] )
2026-05-11 15:42:31 -10:00
effective_doc_name = row [ " name " ] if did == fake_doc_id else doc_info_by_id . get ( did , { } ) . get ( " name " ) or row [ " name " ]
2025-11-11 16:58:47 +08:00
doc = {
2025-11-11 19:46:41 +08:00
" doc_id " : did ,
2025-11-11 16:58:47 +08:00
" kb_id " : [ str ( row [ " kb_id " ] ) ] ,
2026-03-10 14:24:33 +08:00
" docnm_kwd " : effective_doc_name ,
" title_tks " : rag_tokenizer . tokenize ( effective_doc_name ) ,
2026-05-11 15:42:31 -10:00
" raptor_kwd " : " raptor " ,
" extra " : { " raptor_method " : tree_builder } ,
2025-11-11 16:58:47 +08:00
}
if row [ " pagerank " ] :
doc [ PAGERANK_FLD ] = int ( row [ " pagerank " ] )
feat: persist RAPTOR layer metadata on summary chunks (#13286)
## Summary
RAPTOR's recursive clustering builds a `layers` list tracking
`(start_idx, end_idx)` boundaries per level, but currently discards this
information — only the flat `chunks` list is returned. This makes it
impossible to distinguish leaf-level summaries from top-level ones.
This PR:
- Returns `(chunks, layers)` tuple from `raptor.py`'s `__call__`
- Annotates each RAPTOR summary chunk with `raptor_layer_int` (1 = first
summary level, 2 = summary-of-summaries, etc.)
- Adds `raptor_layer_int` to `infinity_mapping.json` (Elasticsearch
handles it via existing `*_int` dynamic template)
### Why this matters
Downstream features need to know which RAPTOR layer a summary belongs
to:
- **Retrieving the top-level document summary** for entity extraction,
search snippets, or document comparison
- **Filtering by abstraction level** — users may want only high-level
summaries or only leaf-level cluster summaries
- **RAPTOR recall quality** — #10951 reports summaries not being
recalled for definition queries; layer metadata enables targeted
retrieval
### Changes
| File | Change | LOC |
|------|--------|-----|
| `rag/raptor.py` | Return `(chunks, layers)` tuple | ~3 |
| `rag/svr/task_executor.py` | Build `chunk_layer` mapping, set
`raptor_layer_int` | ~12 |
| `conf/infinity_mapping.json` | Add `raptor_layer_int` integer field |
~1 |
### Backward compatibility
- **Additive only** — no existing fields or behavior changed
- Existing RAPTOR chunks continue to work (they'll have
`raptor_layer_int = 0` by default)
- New RAPTOR chunks get layer metadata automatically
## Test plan
- [ ] Parse a document with RAPTOR enabled, verify `raptor_layer_int` is
set on indexed chunks
- [ ] Verify `raptor_layer_int` values increase with abstraction level
(layer 1 < layer 2 < ...)
- [ ] Verify existing RAPTOR deletion (`delete by raptor_kwd`) still
works
- [ ] Verify Infinity backend accepts the new field
Fixes #7488
Related: #4104, #11191, #10951
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-authored-by: yuch85 <yuch85.1@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
2026-04-27 10:20:46 +08:00
# Build index→layer mapping from RAPTOR layer boundaries.
# layers is [(start, end), ...] where layer 0 is the original chunks
# and layer 1+ are summary layers. We skip layer 0 (original chunks).
chunk_layer = { }
for layer_idx , ( layer_start , layer_end ) in enumerate ( layers ) :
if layer_idx == 0 :
continue # layer 0 = original input chunks, not summaries
for ci in range ( layer_start , layer_end ) :
chunk_layer [ ci ] = layer_idx
for idx , ( content , vctr ) in enumerate ( chunks [ original_length : ] , start = original_length ) :
2025-11-11 16:58:47 +08:00
d = copy . deepcopy ( doc )
2026-05-11 15:42:31 -10:00
d [ " id " ] = make_raptor_summary_chunk_id ( content , did )
2025-11-11 16:58:47 +08:00
d [ " create_time " ] = str ( datetime . now ( ) ) . replace ( " T " , " " ) [ : 19 ]
d [ " create_timestamp_flt " ] = datetime . now ( ) . timestamp ( )
d [ vctr_nm ] = vctr . tolist ( )
d [ " content_with_weight " ] = content
d [ " content_ltks " ] = rag_tokenizer . tokenize ( content )
d [ " content_sm_ltks " ] = rag_tokenizer . fine_grained_tokenize ( d [ " content_ltks " ] )
feat: persist RAPTOR layer metadata on summary chunks (#13286)
## Summary
RAPTOR's recursive clustering builds a `layers` list tracking
`(start_idx, end_idx)` boundaries per level, but currently discards this
information — only the flat `chunks` list is returned. This makes it
impossible to distinguish leaf-level summaries from top-level ones.
This PR:
- Returns `(chunks, layers)` tuple from `raptor.py`'s `__call__`
- Annotates each RAPTOR summary chunk with `raptor_layer_int` (1 = first
summary level, 2 = summary-of-summaries, etc.)
- Adds `raptor_layer_int` to `infinity_mapping.json` (Elasticsearch
handles it via existing `*_int` dynamic template)
### Why this matters
Downstream features need to know which RAPTOR layer a summary belongs
to:
- **Retrieving the top-level document summary** for entity extraction,
search snippets, or document comparison
- **Filtering by abstraction level** — users may want only high-level
summaries or only leaf-level cluster summaries
- **RAPTOR recall quality** — #10951 reports summaries not being
recalled for definition queries; layer metadata enables targeted
retrieval
### Changes
| File | Change | LOC |
|------|--------|-----|
| `rag/raptor.py` | Return `(chunks, layers)` tuple | ~3 |
| `rag/svr/task_executor.py` | Build `chunk_layer` mapping, set
`raptor_layer_int` | ~12 |
| `conf/infinity_mapping.json` | Add `raptor_layer_int` integer field |
~1 |
### Backward compatibility
- **Additive only** — no existing fields or behavior changed
- Existing RAPTOR chunks continue to work (they'll have
`raptor_layer_int = 0` by default)
- New RAPTOR chunks get layer metadata automatically
## Test plan
- [ ] Parse a document with RAPTOR enabled, verify `raptor_layer_int` is
set on indexed chunks
- [ ] Verify `raptor_layer_int` values increase with abstraction level
(layer 1 < layer 2 < ...)
- [ ] Verify existing RAPTOR deletion (`delete by raptor_kwd`) still
works
- [ ] Verify Infinity backend accepts the new field
Fixes #7488
Related: #4104, #11191, #10951
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-authored-by: yuch85 <yuch85.1@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
2026-04-27 10:20:46 +08:00
d [ " raptor_layer_int " ] = chunk_layer . get ( idx , 1 )
2025-11-11 16:58:47 +08:00
res . append ( d )
tk_count + = num_tokens_from_string ( content )
if raptor_config . get ( " scope " , " file " ) == " file " :
2026-05-11 15:42:31 -10:00
dataset_methods = await get_raptor_chunk_methods ( fake_doc_id , row [ " tenant_id " ] , row [ " kb_id " ] )
remove_dataset_summaries = bool ( dataset_methods )
has_file_level_target = False
if dataset_methods :
callback ( msg = " [RAPTOR] will remove dataset-level summaries after file-level summaries are available. " )
2025-11-11 16:58:47 +08:00
for x , doc_id in enumerate ( doc_ids ) :
2026-05-11 15:42:31 -10:00
if skip_raptor_doc ( doc_id ) :
2026-06-08 04:08:23 -07:00
callback ( prog = ( x + 1.0 ) / len ( doc_ids ) )
2026-05-11 15:42:31 -10:00
continue
Refact: improve task resume mechanism for graphrag (#14096)
### What problem does this PR solve?
Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).
**Changes based on @yuzhichang's review:**
1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.
**Changes based on CodeRabbit review:**
3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.
### Changes
| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |
### How it works
- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.
### Testing
- 10 new unit tests — all passing
- Full existing suite: 617 passed
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
2026-04-15 15:07:28 +05:30
# CHECKPOINT: skip docs that already have RAPTOR chunks built with the current tree_builder; chunks from other methods are scheduled for cleanup
2026-05-11 15:42:31 -10:00
existing_methods = await get_raptor_chunk_methods ( doc_id , row [ " tenant_id " ] , row [ " kb_id " ] )
if tree_builder in existing_methods :
has_file_level_target = True
if existing_methods != { tree_builder } :
schedule_raptor_cleanup ( doc_id , tree_builder )
callback ( msg = f " [RAPTOR] doc: { doc_id } will remove old RAPTOR summaries after insert. " )
callback ( msg = f " [RAPTOR] doc: { doc_id } already has { tree_builder } RAPTOR chunks, skipping. " )
2026-06-08 04:08:23 -07:00
callback ( prog = ( x + 1.0 ) / len ( doc_ids ) )
Refact: improve task resume mechanism for graphrag (#14096)
### What problem does this PR solve?
Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).
**Changes based on @yuzhichang's review:**
1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.
**Changes based on CodeRabbit review:**
3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.
### Changes
| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |
### How it works
- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.
### Testing
- 10 new unit tests — all passing
- Full existing suite: 617 passed
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
2026-04-15 15:07:28 +05:30
continue
2026-05-11 15:42:31 -10:00
if existing_methods :
callback ( msg = f " [RAPTOR] doc: { doc_id } will migrate RAPTOR summaries to { tree_builder } after insert. " )
Refact: improve task resume mechanism for graphrag (#14096)
### What problem does this PR solve?
Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).
**Changes based on @yuzhichang's review:**
1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.
**Changes based on CodeRabbit review:**
3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.
### Changes
| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |
### How it works
- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.
### Testing
- 10 new unit tests — all passing
- Full existing suite: 617 passed
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
2026-04-15 15:07:28 +05:30
2025-11-11 16:58:47 +08:00
chunks = [ ]
2026-01-20 15:24:20 +11:00
skipped_chunks = 0
2026-06-08 04:08:23 -07:00
for d in settings . retriever . chunk_list ( doc_id , row [ " tenant_id " ] , [ str ( row [ " kb_id " ] ) ] , fields = [ " content_with_weight " , vctr_nm ] , sort_by_position = True ) :
2026-01-20 15:24:20 +11:00
# Skip chunks that don't have the required vector field (may have been indexed with different embedding model)
if vctr_nm not in d or d [ vctr_nm ] is None :
skipped_chunks + = 1
logging . warning ( f " RAPTOR: Chunk missing vector field ' { vctr_nm } ' in doc { doc_id } , skipping " )
continue
2025-11-11 16:58:47 +08:00
chunks . append ( ( d [ " content_with_weight " ] , np . array ( d [ vctr_nm ] ) ) )
Refact: improve task resume mechanism for graphrag (#14096)
### What problem does this PR solve?
Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).
**Changes based on @yuzhichang's review:**
1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.
**Changes based on CodeRabbit review:**
3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.
### Changes
| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |
### How it works
- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.
### Testing
- 10 new unit tests — all passing
- Full existing suite: 617 passed
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
2026-04-15 15:07:28 +05:30
2026-01-20 15:24:20 +11:00
if skipped_chunks > 0 :
callback ( msg = f " [WARN] Skipped { skipped_chunks } chunks without vector field ' { vctr_nm } ' for doc { doc_id } . Consider re-parsing the document with the current embedding model. " )
Refact: improve task resume mechanism for graphrag (#14096)
### What problem does this PR solve?
Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).
**Changes based on @yuzhichang's review:**
1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.
**Changes based on CodeRabbit review:**
3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.
### Changes
| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |
### How it works
- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.
### Testing
- 10 new unit tests — all passing
- Full existing suite: 617 passed
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
2026-04-15 15:07:28 +05:30
2026-01-20 15:24:20 +11:00
if not chunks :
logging . warning ( f " RAPTOR: No valid chunks with vectors found for doc { doc_id } " )
callback ( msg = f " [WARN] No valid chunks with vectors found for doc { doc_id } , skipping " )
continue
Refact: improve task resume mechanism for graphrag (#14096)
### What problem does this PR solve?
Addresses review feedback on #14074 (Checkpoint mechanism for
long-running workflow jobs, issue #12494).
**Changes based on @yuzhichang's review:**
1. **Renamed `checkpoint_service.py` → `task_checkpoint.py`** as
suggested.
2. **Replaced Redis with direct docEngine queries** as suggested — the
subgraph already gets persisted to the doc store by
`generate_subgraph()`, so we just query for it instead of maintaining a
separate checkpoint in Redis. This is simpler, has no extra dependency,
and uses a single source of truth.
**Changes based on CodeRabbit review:**
3. **Fixed `source_id` query format mismatch** — subgraphs are stored
with `source_id: [doc_id]` (list), but the original query used
`source_id: doc_id` (string). Now follows the same pattern as
`does_graph_contains()` in `rag/graphrag/utils.py`: filter by
`knowledge_graph_kwd` only, then match `source_id` in Python. This
avoids ambiguity across Elasticsearch / Infinity / OceanBase backends.
### Changes
| File | Change |
|---|---|
| `api/db/services/task_checkpoint.py` (new) |
`load_subgraph_from_store()` and `has_raptor_chunks()` — docEngine-based
checkpoint queries |
| `rag/graphrag/general/index.py` | `build_one()` calls
`load_subgraph_from_store()` before running LLM extraction |
| `rag/svr/task_executor.py` | RAPTOR per-doc loop calls
`has_raptor_chunks()` before processing |
| `test/unit_test/rag/graphrag/test_checkpoint_resume.py` (new) | 10
unit tests covering subgraph loading, source_id filtering, edge cases |
### How it works
- **GraphRAG:** Before running expensive LLM entity/relation extraction
for a doc, checks the doc store for an existing subgraph (saved by a
previous interrupted run). If found, loads it directly and skips LLM
calls.
- **RAPTOR:** Before processing a doc, checks if RAPTOR chunks
(`raptor_kwd="raptor"`) already exist for it. If yes, skips.
### Testing
- 10 new unit tests — all passing
- Full existing suite: 617 passed
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
2026-04-15 15:07:28 +05:30
2026-05-11 15:42:31 -10:00
before_generate = len ( res )
2025-11-11 19:46:41 +08:00
await generate ( chunks , doc_id )
2026-05-11 15:42:31 -10:00
if len ( res ) > before_generate :
has_file_level_target = True
if existing_methods :
schedule_raptor_cleanup ( doc_id , tree_builder )
2026-06-08 04:08:23 -07:00
callback ( prog = ( x + 1.0 ) / len ( doc_ids ) )
2026-05-11 15:42:31 -10:00
if remove_dataset_summaries :
if has_file_level_target :
schedule_raptor_cleanup ( fake_doc_id )
else :
callback ( msg = " [RAPTOR] kept dataset-level summaries because no file-level summaries were built. " )
2025-11-11 16:58:47 +08:00
else :
2026-05-11 15:42:31 -10:00
migrated_file_docs = 0
file_cleanup_doc_ids = [ ]
skipped_doc_ids = set ( )
for doc_id in set ( doc_ids ) :
if skip_raptor_doc ( doc_id ) :
skipped_doc_ids . add ( doc_id )
continue
existing_methods = await get_raptor_chunk_methods ( doc_id , row [ " tenant_id " ] , row [ " kb_id " ] )
if existing_methods :
file_cleanup_doc_ids . append ( doc_id )
migrated_file_docs + = 1
if migrated_file_docs :
callback ( msg = f " [RAPTOR] will remove file-level summaries for { migrated_file_docs } docs after dataset-level build succeeds. " )
existing_methods = await get_raptor_chunk_methods ( fake_doc_id , row [ " tenant_id " ] , row [ " kb_id " ] )
if tree_builder in existing_methods :
if existing_methods != { tree_builder } :
schedule_raptor_cleanup ( fake_doc_id , tree_builder )
callback ( msg = " [RAPTOR] will remove old dataset-level RAPTOR summaries after insert. " )
for doc_id in file_cleanup_doc_ids :
schedule_raptor_cleanup ( doc_id )
callback ( msg = f " [RAPTOR] dataset-level { tree_builder } summaries already exist, skipping. " )
return res , tk_count , cleanup_raptor_chunks
migrate_dataset_summaries = bool ( existing_methods )
if migrate_dataset_summaries :
callback ( msg = f " [RAPTOR] will migrate dataset-level RAPTOR summaries to { tree_builder } after insert. " )
2025-11-11 16:58:47 +08:00
chunks = [ ]
2026-01-20 15:24:20 +11:00
skipped_chunks = 0
2025-11-11 16:58:47 +08:00
for doc_id in doc_ids :
2026-05-11 15:42:31 -10:00
if doc_id in skipped_doc_ids :
continue
2026-06-08 04:08:23 -07:00
for d in settings . retriever . chunk_list ( doc_id , row [ " tenant_id " ] , [ str ( row [ " kb_id " ] ) ] , fields = [ " content_with_weight " , vctr_nm ] , sort_by_position = True ) :
2026-01-20 15:24:20 +11:00
# Skip chunks that don't have the required vector field
if vctr_nm not in d or d [ vctr_nm ] is None :
skipped_chunks + = 1
logging . warning ( f " RAPTOR: Chunk missing vector field ' { vctr_nm } ' in doc { doc_id } , skipping " )
continue
2025-11-11 16:58:47 +08:00
chunks . append ( ( d [ " content_with_weight " ] , np . array ( d [ vctr_nm ] ) ) )
2026-01-20 15:24:20 +11:00
if skipped_chunks > 0 :
callback ( msg = f " [WARN] Skipped { skipped_chunks } chunks without vector field ' { vctr_nm } ' . Consider re-parsing documents with the current embedding model. " )
if not chunks :
2026-05-11 15:42:31 -10:00
if skipped_doc_ids and len ( skipped_doc_ids ) == len ( set ( doc_ids ) ) :
callback ( msg = " [RAPTOR] all documents were skipped by RAPTOR auto-disable rules. " )
return res , tk_count , cleanup_raptor_chunks
2026-01-20 15:24:20 +11:00
logging . error ( f " RAPTOR: No valid chunks with vectors found in any document for kb { row [ ' kb_id ' ] } " )
callback ( msg = f " [ERROR] No valid chunks with vectors found. Please ensure documents are parsed with the current embedding model (vector size: { vector_size } ). " )
2026-05-11 15:42:31 -10:00
return res , tk_count , cleanup_raptor_chunks
2026-01-20 15:24:20 +11:00
2026-05-11 15:42:31 -10:00
before_generate = len ( res )
2025-11-11 19:46:41 +08:00
await generate ( chunks , fake_doc_id )
2026-05-11 15:42:31 -10:00
if len ( res ) > before_generate :
for doc_id in file_cleanup_doc_ids :
schedule_raptor_cleanup ( doc_id )
if migrate_dataset_summaries :
schedule_raptor_cleanup ( fake_doc_id , tree_builder )
2025-11-11 16:58:47 +08:00
2026-05-11 15:42:31 -10:00
return res , tk_count , cleanup_raptor_chunks
2025-01-22 19:43:14 +08:00
2025-10-09 12:36:19 +08:00
async def delete_image(kb_id, chunk_id):
    """
    Delete the stored image that belongs to a chunk.

    The storage call is blocking, so it is dispatched through
    ``thread_pool_exec`` (like the document-store calls in this module)
    instead of running on the event loop; ``minio_limiter`` still bounds
    how many deletions are in flight at once.

    Args:
        kb_id: Dataset / knowledge base ID, passed as the first argument to the storage delete call
        chunk_id: Chunk identifier, passed as the second argument to the storage delete call

    Raises:
        Exception: Any error raised by the storage backend is logged and re-raised.
    """
    try:
        async with minio_limiter:
            await thread_pool_exec(settings.STORAGE_IMPL.delete, kb_id, chunk_id)
    except Exception:
        logging.exception(f"Deleting image of chunk {chunk_id} got exception")
        raise
2026-05-27 21:54:17 +08:00
# Fields a mother (parent) chunk keeps; every other field of the child chunk is dropped.
_MOTHER_CHUNK_FIELDS = frozenset(
    {
        "id",
        "content_with_weight",
        "doc_id",
        "docnm_kwd",
        "kb_id",
        "available_int",
        "position_int",
        "create_timestamp_flt",
        "page_num_int",
        "top_int",
    }
)


def _build_mother_chunks(chunks):
    """Tag each child chunk with ``mom_id`` and return one mother chunk per distinct ``mom`` text."""
    mothers = []
    seen_mother_ids = set()
    for ck in chunks:
        mom = ck.get("mom") or ck.get("mom_with_weight") or ""
        if not mom:
            continue
        mom_id = xxhash.xxh64(mom.encode("utf-8")).hexdigest()
        ck["mom_id"] = mom_id
        if mom_id in seen_mother_ids:
            continue
        seen_mother_ids.add(mom_id)
        # Copy only the whitelisted fields instead of deep-copying the whole
        # child chunk and deleting most of it afterwards.
        mom_ck = {k: copy.deepcopy(v) for k, v in ck.items() if k in _MOTHER_CHUNK_FIELDS}
        mom_ck["id"] = mom_id
        mom_ck["content_with_weight"] = mom
        mom_ck["available_int"] = 0
        mothers.append(mom_ck)
    return mothers


async def _rollback_partial_raptor_chunks(task_id, task_tenant_id, task_dataset_id, inserted_chunks):
    """Best-effort removal of RAPTOR summary chunks that were inserted before a cancellation."""
    raptor_ids_to_rollback = [c["id"] for c in inserted_chunks if c.get("raptor_kwd") == "raptor"]
    if not raptor_ids_to_rollback:
        return
    try:
        ret = await thread_pool_exec(
            settings.docStoreConn.delete,
            {"id": raptor_ids_to_rollback},
            search.index_name(task_tenant_id),
            task_dataset_id,
        )
        get_recording_context().save_func_return_value("docStoreConn.delete", ret)
        logging.info(
            "insert_chunks: rolled back %d partial RAPTOR chunks after cancellation (task=%s)",
            len(raptor_ids_to_rollback),
            task_id,
        )
    except Exception:
        # Rollback is best effort: the cancellation is still reported to the caller.
        logging.exception(
            "insert_chunks: failed to roll back partial RAPTOR chunks after cancellation (task=%s)",
            task_id,
        )


@timed_with_recording
async def insert_chunks(task_id, task_tenant_id, task_dataset_id, chunks, progress_callback):
    """
    Insert chunks into document store (Elasticsearch OR Infinity).

    Mother (parent) chunks derived from the ``mom`` / ``mom_with_weight``
    fields are inserted first, then the chunks themselves, both in batches of
    ``settings.DOC_BULK_SIZE``. Cancellation is checked after every batch.

    Args:
        task_id: Task identifier
        task_tenant_id: Tenant ID
        task_dataset_id: Dataset / knowledge base ID
        chunks: List of chunk dictionaries to insert (each gets a ``mom_id`` key when it has a mother text)
        progress_callback: Callback function for progress updates

    Returns:
        True when every batch was inserted; False when the task was canceled
        or the task record no longer exists.

    Raises:
        Exception: When the document store reports an error for a chunk batch,
            or when deleting chunk images fails during the unknown-task cleanup.
    """
    bulk_size = settings.DOC_BULK_SIZE

    mothers = _build_mother_chunks(chunks)
    for b in range(0, len(mothers), bulk_size):
        # NOTE(review): unlike the chunk batches below, the result of the
        # mother-chunk insert is recorded but not checked for errors — confirm
        # this is intentional.
        ret = await thread_pool_exec(
            settings.docStoreConn.insert,
            mothers[b : b + bulk_size],
            search.index_name(task_tenant_id),
            task_dataset_id,
        )
        get_recording_context().save_func_return_value("docStoreConn.insert", ret)
        if has_canceled(task_id):
            progress_callback(-1, msg="Task has been canceled.")
            return False

    for b in range(0, len(chunks), bulk_size):
        doc_store_result = await thread_pool_exec(
            settings.docStoreConn.insert,
            chunks[b : b + bulk_size],
            search.index_name(task_tenant_id),
            task_dataset_id,
        )
        get_recording_context().save_func_return_value("docStoreConn.insert", doc_store_result)

        if has_canceled(task_id):
            # Roll back partial RAPTOR summary inserts so the next run is not
            # mistaken for a completed checkpoint by get_raptor_chunk_methods.
            await _rollback_partial_raptor_chunks(task_id, task_tenant_id, task_dataset_id, chunks[: b + bulk_size])
            progress_callback(-1, msg="Task has been canceled.")
            return False

        if b % 128 == 0:
            progress_callback(prog=0.8 + 0.1 * (b + 1) / len(chunks), msg="")

        if doc_store_result:
            error_message = f"Insert chunk error: {doc_store_result}, please check log file and Elasticsearch/Infinity status!"
            progress_callback(-1, msg=error_message)
            raise Exception(error_message)

        # Everything inserted so far, not just the current batch.
        chunk_ids = [chunk["id"] for chunk in chunks[: b + bulk_size]]
        chunk_ids_str = " ".join(chunk_ids)
        try:
            TaskService.update_chunk_ids(task_id, chunk_ids_str)
            get_recording_context().save_func_return_value("TaskService.update_chunk_ids", None)
        except DoesNotExist:
            # The task row is gone: undo what has been written for it so far.
            logging.warning(f"do_handle_task update_chunk_ids failed since task {task_id} is unknown.")
            doc_store_result = await thread_pool_exec(
                settings.docStoreConn.delete,
                {"id": chunk_ids},
                search.index_name(task_tenant_id),
                task_dataset_id,
            )
            get_recording_context().save_func_return_value("docStoreConn.delete", doc_store_result)

            tasks = [asyncio.create_task(delete_image(task_dataset_id, chunk_id)) for chunk_id in chunk_ids]
            try:
                await asyncio.gather(*tasks, return_exceptions=False)
            except Exception as e:
                logging.error(f"delete_image failed: {e}")
                for t in tasks:
                    t.cancel()
                # Drain the cancelled tasks before propagating the failure.
                await asyncio.gather(*tasks, return_exceptions=True)
                raise
            progress_callback(-1, msg=f"Chunk updates failed since task {task_id} is unknown.")
            return False
    return True
2025-12-29 12:01:18 +08:00
def _update_table_doc_metadata(task, task_doc_id, chunks):
    """Push metadata aggregated from table-parser chunks to the document-level metadata (best effort)."""
    eff_pc = merge_table_parser_config_from_kb(task)
    logging.debug(f"[TABLE_META_DEBUG] table post-index: table_column_mode={eff_pc.get('table_column_mode')!r}")
    try:
        agg = aggregate_table_doc_metadata(chunks, task)
        logging.debug(f"[TABLE_META_DEBUG] aggregated metadata: {agg}")
        strip_keys = table_parser_strip_doc_metadata_keys(eff_pc)
        existing = DocMetadataService.get_document_metadata(task_doc_id)
        existing = existing if isinstance(existing, dict) else {}
        # Drop the keys listed in strip_keys, then merge in the fresh aggregate.
        preserved = {k: v for k, v in existing.items() if k not in strip_keys}
        merged = update_metadata_to(dict(preserved), agg)
        logging.debug(
            f"[TABLE_META_DEBUG] calling update_document_metadata for doc_id={task_doc_id}, "
            f"meta_fields keys={list(merged.keys())}, "
            f"table_strip_key_count={len(strip_keys)}, agg_keys={list(agg.keys())}"
        )
        try:
            ret = DocMetadataService.update_document_metadata(task_doc_id, merged)
            get_recording_context().save_func_return_value("DocMetadataService.update_document_metadata", ret)
            logging.debug("[TABLE_META_DEBUG] update_document_metadata succeeded")
        except Exception as ue:
            logging.error(
                "update_document_metadata failed (table parser, doc_id=%s): %s",
                task_doc_id,
                ue,
                exc_info=True,
            )
    except Exception as e:
        # Metadata is a nicety on top of indexing: log and keep the task alive.
        logging.exception(
            "Table parser document metadata aggregation failed (doc_id=%s): %s",
            task_doc_id,
            e,
        )


async def _remove_canceled_doc_chunks(task_id, task_tenant_id, task_dataset_id, task_doc_id):
    """Delete the chunks stored under ``task_doc_id`` after a cancellation; failures are only logged."""
    try:
        exists = await thread_pool_exec(
            settings.docStoreConn.index_exist,
            search.index_name(task_tenant_id),
            task_dataset_id,
        )
        if exists:
            ret = await thread_pool_exec(
                settings.docStoreConn.delete,
                {"doc_id": task_doc_id},
                search.index_name(task_tenant_id),
                task_dataset_id,
            )
            get_recording_context().save_func_return_value("docStoreConn.delete", ret)
    except Exception as e:
        logging.exception(f"Remove doc({task_doc_id}) from docStore failed when task({task_id}) canceled, exception: {e}")


@timeout(60 * 60 * 3, 1)
async def do_handle_task(task):
    """
    Execute one task end to end, dispatching on ``task["task_type"]``.

    - ``memory`` and ``dataflow`` tasks are delegated to their own handlers.
    - ``raptor`` builds summary chunks for the dataset and then indexes them.
    - ``graphrag`` builds the knowledge graph and returns.
    - ``mindmap`` is a placeholder.
    - Anything else is the standard pipeline: build chunks, embed, index,
      optionally update table metadata and insert a TOC chunk.

    Progress and failures are reported through ``set_progress``; when the task
    turns out to be canceled, the chunks stored under the task's document id
    are removed from the document store before returning.

    Args:
        task: Task dictionary (reads ``id``, ``task_type``, ``tenant_id``,
            ``kb_id``, ``doc_id``, ``name``, ``embd_id``, ``llm_id``,
            ``parser_config``, ``kb_parser_config``, page range, ...).
            ``task["llm_id"]`` is overwritten with the effective model id.

    Raises:
        Exception: Embedding-model binding, embedding or insertion errors are
            reported via the progress callback and re-raised.
    """
    task_type = task.get("task_type", "")

    if task_type == "memory":
        result = await handle_save_to_memory_task(task)
        get_recording_context().save_func_return_value("handle_save_to_memory_task", result)
        return

    if task_type == "dataflow" and task.get("doc_id", "") == CANVAS_DEBUG_DOC_ID:
        await run_dataflow(task)
        return

    task_id = task["id"]
    task_from_page = task["from_page"]
    task_to_page = task["to_page"]
    task_tenant_id = task["tenant_id"]
    task_embedding_id = task["embd_id"]
    task_language = task.get("language") or "Chinese"
    if not task.get("language"):
        logging.warning("Task %s has no language set, falling back to Chinese", task_id)
    # Both ids must be resolved before task["llm_id"] is overwritten below.
    doc_task_llm_id = task["parser_config"].get("llm_id") or task["llm_id"]
    kb_task_llm_id = task["kb_parser_config"].get("llm_id") or task["llm_id"]
    task["llm_id"] = kb_task_llm_id
    task_dataset_id = task["kb_id"]
    task_doc_id = task["doc_id"]
    task_document_name = task["name"]
    task_parser_config = task["parser_config"]
    task_start_ts = timer()
    toc_thread = None
    raptor_cleanup_chunks = []

    # prepare the progress callback function
    progress_callback = partial(set_progress, task_id, task_from_page, task_to_page)

    if has_canceled(task_id):
        progress_callback(-1, msg="Task has been canceled.")
        return

    try:
        # bind embedding model
        if task_embedding_id:
            embd_model_config = get_model_config_from_provider_instance(task_tenant_id, LLMType.EMBEDDING, task_embedding_id)
        else:
            embd_model_config = get_tenant_default_model_by_type(task_tenant_id, LLMType.EMBEDDING)
        embedding_model = LLMBundle(task_tenant_id, embd_model_config, lang=task_language)
        # Encode a probe string to learn the embedding dimension.
        vts, _ = embedding_model.encode(["ok"])
        vector_size = len(vts[0])
    except Exception as e:
        error_message = f"Fail to bind embedding model: {str(e)}"
        progress_callback(-1, msg=error_message)
        logging.exception(error_message)
        raise

    init_kb(task, vector_size)

    if task_type.startswith("dataflow"):
        await run_dataflow(task)
        return

    if task_type == "raptor":
        ok, kb = KnowledgebaseService.get_by_id(task_dataset_id)
        if not ok:
            progress_callback(prog=-1.0, msg="Cannot found valid dataset for RAPTOR task")
            return

        kb_parser_config = kb.parser_config
        if not kb_parser_config.get("raptor", {}).get("use_raptor", False):
            # RAPTOR was requested explicitly: persist a default configuration.
            kb_parser_config.update(
                {
                    "raptor": {
                        "use_raptor": True,
                        "prompt": "Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n      {cluster_content}\nThe above is the content you need to summarize.",
                        "max_token": 256,
                        "threshold": 0.1,
                        "max_cluster": 64,
                        "random_seed": 0,
                        "scope": "file",
                        "clustering_method": "gmm",
                        "tree_builder": "raptor",
                    },
                }
            )
            update_result = KnowledgebaseService.update_by_id(kb.id, {"parser_config": kb_parser_config})
            get_recording_context().save_func_return_value("KnowledgebaseService.update_by_id", update_result)
            if not update_result:
                progress_callback(prog=-1.0, msg="Internal error: Invalid RAPTOR configuration")
                return

        # bind LLM for raptor
        chat_model_config = get_model_config_from_provider_instance(task_tenant_id, LLMType.CHAT, kb_task_llm_id)
        chat_model = LLMBundle(task_tenant_id, chat_model_config, lang=task_language)

        # run RAPTOR
        async with kg_limiter:
            chunks, token_count, raptor_cleanup_chunks = await run_raptor_for_kb(
                row=task,
                kb_parser_config=kb_parser_config,
                chat_mdl=chat_model,
                embd_mdl=embedding_model,
                vector_size=vector_size,
                callback=progress_callback,
                doc_ids=task.get("doc_ids", []),
            )
        get_recording_context().record("raptor_chunks", chunks)
        get_recording_context().record("raptor_token_count", token_count)
        if fake_doc_ids := task.get("doc_ids", []):
            # use the first document ID to represent this task for logging purposes
            # NOTE(review): the chunk counters and the cancellation cleanup
            # below also use this id — confirm that is intended for RAPTOR.
            task_doc_id = fake_doc_ids[0]
    # Either using graphrag or Standard chunking methods
    elif task_type == "graphrag":
        ok, kb = KnowledgebaseService.get_by_id(task_dataset_id)
        if not ok:
            progress_callback(prog=-1.0, msg="Cannot found valid dataset for GraphRAG task")
            return

        kb_parser_config = kb.parser_config
        if not kb_parser_config.get("graphrag", {}).get("use_graphrag", False):
            # GraphRAG was requested explicitly: persist a default configuration.
            kb_parser_config.update(
                {
                    "graphrag": {
                        "use_graphrag": True,
                        "entity_types": [
                            "organization",
                            "person",
                            "geo",
                            "event",
                            "category",
                        ],
                        "method": "light",
                        "batch_chunk_token_size": 4096,
                        "retry_attempts": 2,
                        "retry_backoff_seconds": 2.0,
                        "retry_backoff_max_seconds": 60.0,
                        "build_subgraph_timeout_per_chunk_seconds": 300,
                        "build_subgraph_min_timeout_seconds": 600,
                        "merge_timeout_seconds": 180,
                        "resolution_timeout_seconds": 1800,
                        "community_timeout_seconds": 1800,
                        "lock_acquire_timeout_seconds": 600,
                    }
                }
            )
            update_result = KnowledgebaseService.update_by_id(kb.id, {"parser_config": kb_parser_config})
            get_recording_context().save_func_return_value("KnowledgebaseService.update_by_id", update_result)
            if not update_result:
                progress_callback(prog=-1.0, msg="Internal error: Invalid GraphRAG configuration")
                return

        graphrag_conf = kb_parser_config.get("graphrag", {})
        start_ts = timer()
        chat_model_config = get_model_config_from_provider_instance(task_tenant_id, LLMType.CHAT, kb_task_llm_id)
        chat_model = LLMBundle(task_tenant_id, chat_model_config, lang=task_language)
        with_resolution = graphrag_conf.get("resolution", False)
        with_community = graphrag_conf.get("community", False)
        async with kg_limiter:
            from rag.graphrag.general.index import run_graphrag_for_kb  # Lazy load, save around 2s

            result = await run_graphrag_for_kb(
                row=task,
                doc_ids=task.get("doc_ids", []),
                language=task_language,
                kb_parser_config=kb_parser_config,
                chat_model=chat_model,
                embedding_model=embedding_model,
                callback=progress_callback,
                with_resolution=with_resolution,
                with_community=with_community,
            )
            logging.info(f"GraphRAG task result for task {task}:\n{result}")
        get_recording_context().record("graphrag_result", result)
        progress_callback(prog=1.0, msg="Knowledge Graph done ({:.2f}s)".format(timer() - start_ts))
        return
    elif task_type == "mindmap":
        progress_callback(1, "place holder")
        return
    else:
        # Standard chunking methods
        task["llm_id"] = doc_task_llm_id
        start_ts = timer()
        chunks = await build_chunks(task, progress_callback)
        get_recording_context().record("chunks", chunks)
        # Record chunk_ids_count for comparison. Tolerate an empty/None result
        # here: the "no chunk" guard only runs a few lines further down.
        chunk_ids_count = sum(1 for c in (chunks or ()) if isinstance(c, dict) and "id" in c)
        get_recording_context().record("chunk_ids_count", chunk_ids_count)
        logging.info("Build document {}: {:.2f}s".format(task_document_name, timer() - start_ts))
        if not chunks:
            progress_callback(1.0, msg=f"No chunk built from {task_document_name}")
            return

        progress_callback(msg="Generate {} chunks".format(len(chunks)))
        start_ts = timer()
        try:
            token_count, vector_size = await embedding(chunks, embedding_model, task_parser_config, progress_callback)
        except TaskCanceledException:
            # Cancellation is not an embedding failure: propagate untouched.
            raise
        except Exception as e:
            error_message = "Generate embedding error: {}".format(str(e))
            progress_callback(-1, error_message)
            logging.exception(error_message)
            raise
        get_recording_context().record("token_count", token_count)
        get_recording_context().record("vector_size", vector_size)
        progress_message = "Embedding chunks ({:.2f}s)".format(timer() - start_ts)
        logging.info(progress_message)
        progress_callback(msg=progress_message)

        if task["parser_id"].lower() == "naive" and task["parser_config"].get("toc_extraction", False):
            # Build the table of contents in a worker thread while indexing runs.
            toc_thread = asyncio.create_task(asyncio.to_thread(build_TOC, task, chunks, progress_callback))

    chunk_count = len({chunk["id"] for chunk in chunks})
    start_ts = timer()

    async def _maybe_insert_chunks(_chunks):
        # Insert unless the task was canceled in the meantime.
        if has_canceled(task_id):
            progress_callback(-1, msg="Task has been canceled.")
            return False
        insert_result = await insert_chunks(task_id, task_tenant_id, task_dataset_id, _chunks, progress_callback)
        return bool(insert_result)

    try:
        if not await _maybe_insert_chunks(chunks):
            get_recording_context().record("insertion_result", "failed")
            return
        get_recording_context().record("insertion_result", "success")
        if has_canceled(task_id):
            progress_callback(-1, msg="Task has been canceled.")
            return

        if raptor_cleanup_chunks:
            # Stale RAPTOR summaries are removed only after the new ones are stored.
            cleaned_chunks = 0
            for cleanup_doc_id, keep_method in raptor_cleanup_chunks:
                ret = await delete_raptor_chunks(
                    cleanup_doc_id,
                    task_tenant_id,
                    task_dataset_id,
                    keep_method=keep_method,
                )
                cleaned_chunks += ret
                get_recording_context().save_func_return_value("delete_raptor_chunks", ret)
            if cleaned_chunks:
                progress_callback(msg=f"Cleaned up {cleaned_chunks} stale RAPTOR chunks.")

        logging.info("Indexing doc({}), page({}-{}), chunks({}), elapsed: {:.2f}".format(task_document_name, task_from_page, task_to_page, len(chunks), timer() - start_ts))

        ret = DocumentService.increment_chunk_num(task_doc_id, task_dataset_id, token_count, chunk_count, 0)
        get_recording_context().save_func_return_value("DocumentService.increment_chunk_num", ret)

        # Table parser: push metadata/both column values to document-level metadata for UI / chat filters
        if task.get("parser_id", "").lower() == "table":
            _update_table_doc_metadata(task, task_doc_id, chunks)

        progress_callback(msg="Indexing done ({:.2f}s).".format(timer() - start_ts))

        if toc_thread:
            d = await toc_thread
            if d:
                get_recording_context().record("toc_chunk", [d])
                if not await _maybe_insert_chunks([d]):
                    get_recording_context().record("toc_inserted", False)
                    return
                get_recording_context().record("toc_inserted", True)
                ret = DocumentService.increment_chunk_num(task_doc_id, task_dataset_id, 0, 1, 0)
                get_recording_context().save_func_return_value("DocumentService.increment_chunk_num", ret)

        if has_canceled(task_id):
            progress_callback(-1, msg="Task has been canceled.")
            return

        task_time_cost = timer() - task_start_ts
        get_recording_context().record("task_status", "completed")
        progress_callback(prog=1.0, msg="Task done ({:.2f}s)".format(task_time_cost))
        logging.info("Chunk doc({}), page({}-{}), chunks({}), token({}), elapsed: {:.2f}".format(task_document_name, task_from_page, task_to_page, len(chunks), token_count, task_time_cost))
    finally:
        # Never leave the TOC worker dangling, whatever path got us here.
        if toc_thread is not None and not toc_thread.done():
            toc_thread.cancel()
        if has_canceled(task_id):
            await _remove_canceled_doc_chunks(task_id, task_tenant_id, task_dataset_id, task_doc_id)
2024-11-15 18:51:09 +08:00
2025-11-11 17:36:48 +08:00
2025-12-29 12:01:18 +08:00
async def handle_task():
    """Fetch one queued task, execute it, record the outcome and ack the message.

    When ``collect()`` yields no task the coroutine sleeps for five seconds and
    returns. Otherwise the execution path is selected by the ``TE_RUN_MODE``
    environment variable (default ``"0"``):

    * ``"1"``: run the original executor under a ``RecordingContext``, then
      hand the recording to ``TaskManager.dry_run_task``.
    * ``"0"``: run ``TaskManager.run_refactored_task`` only.
    * anything else: run the original ``do_handle_task`` only.

    ``DONE_TASKS`` is incremented on success and on ``TaskCanceledException``;
    ``FAILED_TASKS`` is incremented on any other exception, which is logged and
    reported through ``set_progress`` rather than re-raised.
    """
    global DONE_TASKS, FAILED_TASKS
    redis_msg, task = await collect()
    if not task:
        await asyncio.sleep(5)
        return

    task_type = task["task_type"]
    pipeline_task_type = TASK_TYPE_TO_PIPELINE_TASK_TYPE.get(task_type, PipelineTaskType.PARSE) or PipelineTaskType.PARSE
    task_id = task["id"]
    try:
        # Snapshot of the task for the heartbeat published by report_status().
        CURRENT_TASKS[task["id"]] = copy.deepcopy(task)
        run_mode = os.environ.get("TE_RUN_MODE", "0")
        logging.info(f"TE_RUN_MODE is {run_mode}")

        # Shared pieces of the three branches below.
        task_desc = f"{task_id}, {task.get('name', '')}, doc id: {task.get('doc_id', '')}"
        runtime_deps = (chat_limiter, minio_limiter, chunk_limiter, embed_limiter, kg_limiter, set_progress, has_canceled)

        if run_mode == "1":
            # Comparison mode: original run is recorded, then replayed as a dry run.
            set_recording_context(RecordingContext())
            await do_handle_task(task)
            logging.info(f"-----dry run task: {task_desc}")
            await TaskManager.dry_run_task(task, get_recording_context(), *runtime_deps)
        elif run_mode == "0":
            # Refactored executor only.
            logging.info(f"-----run refactor-ed task executor: {task_desc}")
            set_recording_context(NullRecordingContext())
            await TaskManager.run_refactored_task(task, *runtime_deps)
        else:
            # Original executor only.
            logging.info(f"-----run original task executor: {task_desc}")
            set_recording_context(NullRecordingContext())
            await do_handle_task(task)

        # Kept inside the try so a failure while logging is counted like any
        # other task failure.
        DONE_TASKS += 1
        CURRENT_TASKS.pop(task_id, None)
        logging.info(f"handle_task done for task {json.dumps(task)}")
    except TaskCanceledException as e:
        # Cancellation is an expected outcome and is counted as done.
        DONE_TASKS += 1
        CURRENT_TASKS.pop(task_id, None)
        logging.info(f"handle_task canceled for task {task_id}: {getattr(e, 'msg', str(e))}")
    except Exception as e:
        FAILED_TASKS += 1
        CURRENT_TASKS.pop(task_id, None)
        try:
            # Follow the first member of each nested exception group and
            # append its text to the reported message.
            err_msg = str(e)
            nested = e
            while isinstance(nested, exceptiongroup.ExceptionGroup):
                nested = nested.exceptions[0]
                err_msg += " -- " + str(nested)
            set_progress(task_id, prog=-1, msg=f"[Exception]: {err_msg}")
        except Exception as report_err:
            # Reporting the failure is best effort.
            logging.exception(f"[Exception]: {str(report_err)}")
        logging.exception(f"handle_task got exception for task {json.dumps(task)}")
    finally:
        if not task.get("dataflow_id", ""):
            # Only these task types read their referred document from "doc_ids".
            uses_doc_ids = task_type in ["graphrag", "raptor", "mindmap"]
            referred_document_id = task["doc_ids"][0] if uses_doc_ids else None
            ret = PipelineOperationLogService.record_pipeline_operation(
                document_id=task["doc_id"],
                pipeline_id="",
                task_type=pipeline_task_type,
                task_id=task_id,
                referred_document_id=referred_document_id,
            )
            get_recording_context().save_func_return_value("PipelineOperationLogService.record_pipeline_operation", ret)

        redis_msg.ack()
2025-11-10 12:51:39 +08:00
async def get_server_ip() -> str:
    """Return the local IP address used for outbound traffic, or ``"Unknown"``.

    A UDP socket is connected toward ``8.8.8.8:80`` and the local address it
    ends up bound to is read back. Any failure is logged and yields
    ``"Unknown"`` instead of raising.
    """
    try:
        probe = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        with probe:
            probe.connect(("8.8.8.8", 80))
            local_addr = probe.getsockname()
            return local_addr[0]
    except Exception as e:
        logging.error(str(e))
    return "Unknown"
2025-11-10 12:51:39 +08:00
2025-03-03 18:59:49 +08:00
async def report_status():
    """Publish this executor's heartbeat to Redis every 30 seconds, forever.

    Each cycle:

    1. refreshes ``PENDING_TASKS`` / ``LAG_TASKS`` from the queue info,
    2. adds a JSON heartbeat to the sorted set named ``CONSUMER_NAME``,
    3. removes this executor's heartbeats older than 30 minutes,
    4. if the ``clean_task_executor`` lock is obtained, unregisters other
       executors whose latest heartbeat is missing or older than
       ``WORKER_HEARTBEAT_TIMEOUT``.

    Redis failures in steps 2-4 are logged as warnings and do not stop the loop.
    """
    global PENDING_TASKS, LAG_TASKS, DONE_TASKS, FAILED_TASKS
    ip_address = await get_server_ip()
    pid = os.getpid()
    # Register the executor in Redis.
    REDIS_CONN.sadd("TASKEXE", CONSUMER_NAME)
    redis_lock = RedisDistributedLock("clean_task_executor", lock_value=CONSUMER_NAME, timeout=60)

    while True:
        now = datetime.now()
        now_ts = now.timestamp()
        group_info = REDIS_CONN.queue_info(settings.get_svr_queue_name(0), SVR_CONSUMER_GROUP_NAME) or {}
        PENDING_TASKS = int(group_info.get("pending", 0))
        LAG_TASKS = int(group_info.get("lag", 0))
        # Deep copy so the serialized snapshot is detached from the live dict.
        current = copy.deepcopy(CURRENT_TASKS)
        payload = {
            "ip_address": ip_address,
            "pid": pid,
            "name": CONSUMER_NAME,
            "now": now.astimezone().isoformat(timespec="milliseconds"),
            "boot_at": BOOT_AT,
            "pending": PENDING_TASKS,
            "lag": LAG_TASKS,
            "done": DONE_TASKS,
            "failed": FAILED_TASKS,
            "current": current,
        }
        heartbeat = json.dumps(payload)

        # Report heartbeat to Redis.
        try:
            REDIS_CONN.zadd(CONSUMER_NAME, heartbeat, now_ts)
        except Exception as e:
            logging.warning(f"Failed to report heartbeat: {e}")
        else:
            logging.debug(f"{CONSUMER_NAME} reported heartbeat: {heartbeat}")

        # Clean up own expired heartbeats (older than 30 minutes).
        try:
            REDIS_CONN.zremrangebyscore(CONSUMER_NAME, 0, now_ts - 60 * 30)
        except Exception as e:
            logging.warning(f"Failed to clean heartbeat: {e}")

        # Clean other executors, but only while holding the lock.
        lock_acquired = False
        try:
            lock_acquired = redis_lock.acquire()
        except Exception as e:
            logging.warning(f"Failed to acquire Redis lock: {e}")

        if lock_acquired:
            try:
                task_executors = REDIS_CONN.smembers("TASKEXE") or set()
                for worker_name in task_executors:
                    if worker_name == CONSUMER_NAME:
                        continue
                    try:
                        last_heartbeat = REDIS_CONN.REDIS.zrevrange(worker_name, 0, 0, withscores=True)
                    except Exception as e:
                        logging.warning(f"Failed to read zset for {worker_name}: {e}")
                        continue
                    # No heartbeat at all, or the newest one is too old.
                    is_stale = not last_heartbeat or now_ts - last_heartbeat[0][1] > WORKER_HEARTBEAT_TIMEOUT
                    if not is_stale:
                        continue
                    logging.info(f"{worker_name} expired, removed")
                    REDIS_CONN.srem("TASKEXE", worker_name)
                    REDIS_CONN.delete(worker_name)
            except Exception as e:
                logging.warning(f"Failed to clean other executors: {e}")
            finally:
                redis_lock.release()

        await asyncio.sleep(30)
2025-07-15 17:19:45 +08:00
2025-05-19 10:25:56 +08:00
async def task_manager():
    """Run one ``handle_task()`` cycle and then release ``task_limiter``.

    ``main()`` acquires ``task_limiter`` before spawning this coroutine; the
    release sits in ``finally`` so it also happens when ``handle_task()``
    raises or this task is cancelled.
    """
    try:
        await handle_task()
    finally:
        # Always release what main() acquired for this cycle.
        task_limiter.release()
2025-04-19 16:18:51 +08:00
2025-03-03 18:59:49 +08:00
async def main():
    """Initialise the executor and dispatch tasks until ``stop_event`` is set.

    Steps, in order: an optional startup delay derived from the numeric suffix
    of ``CONSUMER_NAME``, banner and configuration logging, settings
    initialisation, signal handler registration, start of the heartbeat
    reporter, and finally the dispatch loop, which spawns one
    ``task_manager()`` task per acquired ``task_limiter`` permit.

    On exit from the loop (normal or via exception) every in-flight task and
    the heartbeat reporter are cancelled and awaited.
    """
    # Stagger executor startup to prevent connection storm to Infinity.
    # Extract worker number from CONSUMER_NAME (e.g., "task_executor_abc123_5" -> 5).
    try:
        worker_num = int(CONSUMER_NAME.rsplit("_", 1)[-1])
        # Delay: worker_num * 2.0s + random jitter, to spread out connection
        # attempts over several seconds.
        startup_delay = worker_num * 2.0 + random.uniform(0, 0.5)
        if startup_delay > 0:
            logging.info(f"Staggering startup by {startup_delay:.2f}s to prevent connection storm")
            await asyncio.sleep(startup_delay)
    except (ValueError, IndexError):
        pass  # Non-standard consumer name, skip delay

    logging.info(r"""
    ____                      __  _
   /  _/___  ____ ____  _____/ /_(_)___  ____     ________  ______   _____  _____
   / // __ \/ __ `/ _ \/ ___/ __/ / __ \/ __ \   / ___/ _ \/ ___/ | / / _ \/ ___/
 _/ // / / / /_/ /  __(__  ) /_/ / /_/ / / / /  (__  )  __/ /   | |/ /  __/ /
/___/_/ /_/\__, /\___/____/\__/_/\____/_/ /_/  /____/\___/_/    |___/\___/_/
          /____/
    """)
    logging.info(f"RAGFlow ingestion version: {get_ragflow_version()}")
    # NOTE(review): handle_task() switches on TE_RUN_MODE, not on this
    # variable -- confirm whether this log line is still meaningful.
    logging.info(f"ENABLE_DRY_RUN_COMPARISON: {os.environ.get('ENABLE_DRY_RUN_COMPARISON', '0')}")
    show_configs()
    settings.init_settings()
    settings.check_and_install_torch()
    logging.info(f"default embedding config: {settings.EMBEDDING_CFG}")
    settings.print_rag_settings()

    if sys.platform != "win32":
        # SIGUSR1 / SIGUSR2 are not available on Windows.
        signal.signal(signal.SIGUSR1, start_tracemalloc_and_snapshot)
        signal.signal(signal.SIGUSR2, stop_tracemalloc)
    TRACE_MALLOC_ENABLED = int(os.environ.get("TRACE_MALLOC_ENABLED", "0"))
    if TRACE_MALLOC_ENABLED:
        start_tracemalloc_and_snapshot(None, None)

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    report_task = asyncio.create_task(report_status())
    # Only tasks that are still running are kept here. Each task removes
    # itself when it finishes, so the collection does not grow with the number
    # of tasks processed over the lifetime of the process.
    in_flight = set()
    logging.info(f"RAGFlow ingestion is ready after {time.time() - start_ts}s initialization.")
    try:
        while not stop_event.is_set():
            await task_limiter.acquire()
            t = asyncio.create_task(task_manager())
            in_flight.add(t)
            t.add_done_callback(in_flight.discard)
    finally:
        # Snapshot first: done-callbacks mutate the set once control returns
        # to the event loop.
        pending = list(in_flight)
        for t in pending:
            t.cancel()
        await asyncio.gather(*pending, return_exceptions=True)
        report_task.cancel()
        await asyncio.gather(report_task, return_exceptions=True)
    logging.error("BUG!!! You should not reach here!!!")
2024-11-30 18:48:06 +08:00
2025-12-29 12:01:18 +08:00
2024-11-15 18:51:09 +08:00
if __name__ == "__main__":
    # Command line interface (consistent with SAAS version):
    #   -i/--index  executor index, used as the suffix of the consumer name
    #   -t/--type   executor type, used as the middle part of the consumer name
    parser = argparse.ArgumentParser(description="Task Executor")
    parser.add_argument(
        "-i",
        "--index",
        type=str,
        default="0",
    )
    parser.add_argument(
        "-t",
        "--type",
        type=str,
        default="common",
        help="[common, graphrag, raptor, resume]",
    )
    args = parser.parse_args()

    # Override the module-level identity before logging is initialised and the
    # event loop starts.
    TASK_TYPE, TE_IDX = args.type, args.index
    CONSUMER_NAME = f"task_executor_{TASK_TYPE}_{TE_IDX}"

    faulthandler.enable()
    init_root_logger(CONSUMER_NAME)
    try:
        asyncio.run(main())
    except Exception as e:
        # Log with traceback and exit with a non-zero status.
        logging.exception(f"Unhandled exception: {e}")
        sys.exit(1)