Files
ragflow/api/db/services/document_service.py

1110 lines
44 KiB
Python
Raw Normal View History

#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import random
from datetime import datetime
import xxhash
Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423) ### What problem does this PR solve? #9869 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: jinhai <haijin.chn@gmail.com> Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com> Co-authored-by: TeslaZY <TeslaZY@outlook.com> Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com> Co-authored-by: AB <aj@Ajays-MacBook-Air.local> Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com> Co-authored-by: He Wang <wanghechn@qq.com> Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com> Co-authored-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com> Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box> Co-authored-by: Stephen Hu <stephenhu@seismic.com> Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com> Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com> Co-authored-by: mxc <mxc@example.com> Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com> Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com> Co-authored-by: mcoder6425 <mcoder64@gmail.com> Co-authored-by: lemsn <lemsn@msn.com> Co-authored-by: lemsn <lemsn@126.com> Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com> Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com> Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
2025-10-09 12:36:19 +08:00
from peewee import fn, Case, JOIN
from api.constants import IMG_BASE64_PREFIX, FILE_NAME_LEN_LIMIT
from api.db import PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES, FileType, UserTenantRole, CanvasCategory
from api.db.db_models import DB, Document, Knowledgebase, Task, Tenant, UserTenant, File2Document, File, UserCanvas, User
from api.db.db_utils import bulk_insert_into_db
from api.db.services.common_service import CommonService, retry_deadlock_operation
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.doc_metadata_service import DocMetadataService
from common import settings
from common.constants import ParserType, StatusEnum, TaskStatus, SVR_CONSUMER_GROUP_NAME, MAXIMUM_TASK_PAGE_NUMBER
from common.doc_store.doc_store_base import OrderByExpr
from common.misc_utils import get_uuid
from common.time_utils import current_timestamp, get_format_time
from rag.nlp import search
from rag.utils.redis_conn import REDIS_CONN
class DocumentService(CommonService):
model = Document
@classmethod
def get_cls_model_fields(cls):
return [
cls.model.id,
cls.model.thumbnail,
cls.model.kb_id,
cls.model.parser_id,
Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423) ### What problem does this PR solve? #9869 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: jinhai <haijin.chn@gmail.com> Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com> Co-authored-by: TeslaZY <TeslaZY@outlook.com> Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com> Co-authored-by: AB <aj@Ajays-MacBook-Air.local> Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com> Co-authored-by: He Wang <wanghechn@qq.com> Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com> Co-authored-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com> Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box> Co-authored-by: Stephen Hu <stephenhu@seismic.com> Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com> Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com> Co-authored-by: mxc <mxc@example.com> Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com> Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com> Co-authored-by: mcoder6425 <mcoder64@gmail.com> Co-authored-by: lemsn <lemsn@msn.com> Co-authored-by: lemsn <lemsn@126.com> Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com> Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com> Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
2025-10-09 12:36:19 +08:00
cls.model.pipeline_id,
cls.model.parser_config,
cls.model.source_type,
cls.model.type,
cls.model.created_by,
cls.model.name,
cls.model.location,
cls.model.size,
cls.model.token_num,
cls.model.chunk_num,
cls.model.progress,
cls.model.progress_msg,
cls.model.process_begin_at,
cls.model.process_duration,
cls.model.suffix,
cls.model.run,
cls.model.status,
cls.model.create_time,
cls.model.create_date,
cls.model.update_time,
cls.model.update_date,
]
@classmethod
@DB.connection_context()
def get_list(cls, kb_id, page_number, items_per_page, orderby, desc, keywords, id, name, suffix=None, run=None, doc_ids=None):
fields = cls.get_cls_model_fields()
docs = (
cls.model.select(*[*fields, UserCanvas.title])
.join(File2Document, on=(File2Document.document_id == cls.model.id))
.join(File, on=(File.id == File2Document.file_id))
.join(UserCanvas, on=((cls.model.pipeline_id == UserCanvas.id) & (UserCanvas.canvas_category == CanvasCategory.DataFlow.value)), join_type=JOIN.LEFT_OUTER)
Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423) ### What problem does this PR solve? #9869 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: jinhai <haijin.chn@gmail.com> Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com> Co-authored-by: TeslaZY <TeslaZY@outlook.com> Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com> Co-authored-by: AB <aj@Ajays-MacBook-Air.local> Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com> Co-authored-by: He Wang <wanghechn@qq.com> Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com> Co-authored-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com> Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box> Co-authored-by: Stephen Hu <stephenhu@seismic.com> Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com> Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com> Co-authored-by: mxc <mxc@example.com> Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com> Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com> Co-authored-by: mcoder6425 <mcoder64@gmail.com> Co-authored-by: lemsn <lemsn@msn.com> Co-authored-by: lemsn <lemsn@126.com> Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com> Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com> Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
2025-10-09 12:36:19 +08:00
.where(cls.model.kb_id == kb_id)
)
if id:
docs = docs.where(cls.model.id == id)
if name:
docs = docs.where(cls.model.name == name)
if keywords:
docs = docs.where(fn.LOWER(cls.model.name).contains(keywords.lower()))
if doc_ids is not None:
docs = docs.where(cls.model.id.in_(doc_ids))
if suffix:
docs = docs.where(cls.model.suffix.in_(suffix))
if run:
docs = docs.where(cls.model.run.in_(run))
if desc:
docs = docs.order_by(cls.model.getter_by(orderby).desc())
else:
docs = docs.order_by(cls.model.getter_by(orderby).asc())
count = docs.count()
docs = docs.paginate(page_number, items_per_page)
docs_list = list(docs.dicts())
doc_ids_on_page = [doc["id"] for doc in docs_list]
metadata_map = DocMetadataService.get_metadata_for_documents(doc_ids_on_page, kb_id) if doc_ids_on_page else {}
for doc in docs_list:
doc["meta_fields"] = metadata_map.get(doc["id"], {})
return docs_list, count
@classmethod
@DB.connection_context()
def check_doc_health(cls, tenant_id: str, filename):
import os
MAX_FILE_NUM_PER_USER = int(os.environ.get("MAX_FILE_NUM_PER_USER", 0))
if 0 < MAX_FILE_NUM_PER_USER <= DocumentService.get_doc_count(tenant_id):
raise RuntimeError("Exceed the maximum file number of a free user!")
if len(filename.encode("utf-8")) > FILE_NAME_LEN_LIMIT:
raise RuntimeError("Exceed the maximum length of file name!")
return True
@classmethod
@DB.connection_context()
def get_by_kb_id(cls, kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types, suffix, name=None, doc_ids=None, return_empty_metadata=False):
fields = cls.get_cls_model_fields()
if keywords:
docs = (
cls.model.select(*[*fields, UserCanvas.title.alias("pipeline_name"), User.nickname])
.join(File2Document, on=(File2Document.document_id == cls.model.id))
.join(File, on=(File.id == File2Document.file_id))
.join(UserCanvas, on=(cls.model.pipeline_id == UserCanvas.id), join_type=JOIN.LEFT_OUTER)
.join(User, on=(cls.model.created_by == User.id), join_type=JOIN.LEFT_OUTER)
.where((cls.model.kb_id == kb_id), (fn.LOWER(cls.model.name).contains(keywords.lower())))
)
else:
docs = (
cls.model.select(*[*fields, UserCanvas.title.alias("pipeline_name"), User.nickname])
.join(File2Document, on=(File2Document.document_id == cls.model.id))
.join(UserCanvas, on=(cls.model.pipeline_id == UserCanvas.id), join_type=JOIN.LEFT_OUTER)
.join(File, on=(File.id == File2Document.file_id))
.join(User, on=(cls.model.created_by == User.id), join_type=JOIN.LEFT_OUTER)
Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423) ### What problem does this PR solve? #9869 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: jinhai <haijin.chn@gmail.com> Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com> Co-authored-by: TeslaZY <TeslaZY@outlook.com> Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com> Co-authored-by: AB <aj@Ajays-MacBook-Air.local> Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com> Co-authored-by: He Wang <wanghechn@qq.com> Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com> Co-authored-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com> Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box> Co-authored-by: Stephen Hu <stephenhu@seismic.com> Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com> Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com> Co-authored-by: mxc <mxc@example.com> Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com> Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com> Co-authored-by: mcoder6425 <mcoder64@gmail.com> Co-authored-by: lemsn <lemsn@msn.com> Co-authored-by: lemsn <lemsn@126.com> Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com> Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com> Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
2025-10-09 12:36:19 +08:00
.where(cls.model.kb_id == kb_id)
)
if doc_ids is not None:
docs = docs.where(cls.model.id.in_(doc_ids))
if run_status:
docs = docs.where(cls.model.run.in_(run_status))
if types:
docs = docs.where(cls.model.type.in_(types))
if suffix:
docs = docs.where(cls.model.suffix.in_(suffix))
if name:
docs = docs.where(cls.model.name == name)
if return_empty_metadata:
metadata_map = DocMetadataService.get_metadata_for_documents(None, kb_id)
doc_ids_with_metadata = set(metadata_map.keys())
if doc_ids_with_metadata:
docs = docs.where(cls.model.id.not_in(doc_ids_with_metadata))
count = docs.count()
if desc:
docs = docs.order_by(cls.model.getter_by(orderby).desc())
else:
docs = docs.order_by(cls.model.getter_by(orderby).asc())
if page_number and items_per_page:
docs = docs.paginate(page_number, items_per_page)
docs_list = list(docs.dicts())
if return_empty_metadata:
for doc in docs_list:
doc["meta_fields"] = {}
else:
doc_ids_on_page = [doc["id"] for doc in docs_list]
metadata_map = DocMetadataService.get_metadata_for_documents(doc_ids_on_page, kb_id) if doc_ids_on_page else {}
for doc in docs_list:
doc["meta_fields"] = metadata_map.get(doc["id"], {})
return docs_list, count
@classmethod
@DB.connection_context()
def get_filter_by_kb_id(cls, kb_id, keywords, run_status, types, suffix):
"""
returns:
{
"suffix": {
"ppt": 1,
"doxc": 2
},
"run_status": {
"1": 2,
"2": 2
}
"metadata": {
"key1": {
"key1_value1": 1,
"key1_value2": 2,
},
"key2": {
"key2_value1": 2,
"key2_value2": 1,
},
}
}, total
where "1" => RUNNING, "2" => CANCEL
"""
fields = cls.get_cls_model_fields()
if keywords:
query = (
cls.model.select(*fields)
.join(File2Document, on=(File2Document.document_id == cls.model.id))
.join(File, on=(File.id == File2Document.file_id))
.where((cls.model.kb_id == kb_id), (fn.LOWER(cls.model.name).contains(keywords.lower())))
)
else:
query = cls.model.select(*fields).join(File2Document, on=(File2Document.document_id == cls.model.id)).join(File, on=(File.id == File2Document.file_id)).where(cls.model.kb_id == kb_id)
if run_status:
query = query.where(cls.model.run.in_(run_status))
if types:
query = query.where(cls.model.type.in_(types))
if suffix:
query = query.where(cls.model.suffix.in_(suffix))
rows = query.select(cls.model.run, cls.model.suffix, cls.model.id)
total = rows.count()
suffix_counter = {}
run_status_counter = {}
metadata_counter = {}
empty_metadata_count = 0
doc_ids = [row.id for row in rows]
metadata = {}
if doc_ids:
try:
metadata = DocMetadataService.get_metadata_for_documents(doc_ids, kb_id)
except Exception as e:
logging.warning(f"Failed to fetch metadata from ES/Infinity: {e}")
for row in rows:
suffix_counter[row.suffix] = suffix_counter.get(row.suffix, 0) + 1
run_status_counter[str(row.run)] = run_status_counter.get(str(row.run), 0) + 1
meta_fields = metadata.get(row.id, {})
if not meta_fields:
empty_metadata_count += 1
continue
has_valid_meta = False
for key, value in meta_fields.items():
values = value if isinstance(value, list) else [value]
for vv in values:
if vv is None:
continue
if isinstance(vv, str) and not vv.strip():
continue
sv = str(vv)
if key not in metadata_counter:
metadata_counter[key] = {}
metadata_counter[key][sv] = metadata_counter[key].get(sv, 0) + 1
has_valid_meta = True
if not has_valid_meta:
empty_metadata_count += 1
metadata_counter["empty_metadata"] = {"true": empty_metadata_count}
return {
"suffix": suffix_counter,
"run_status": run_status_counter,
"metadata": metadata_counter,
}, total
@classmethod
@DB.connection_context()
def get_parsing_status_by_kb_ids(cls, kb_ids: list[str]) -> dict[str, dict[str, int]]:
"""Return aggregated document parsing status counts grouped by dataset (kb_id).
For each kb_id, counts documents in each run-status bucket:
- unstart_count (run == "0")
- running_count (run == "1")
- cancel_count (run == "2")
- done_count (run == "3")
- fail_count (run == "4")
Returns a dict keyed by kb_id, e.g.
{"kb-abc": {"unstart_count": 10, "running_count": 2, ...}, ...}
"""
if not kb_ids:
return {}
status_field_map = {
TaskStatus.UNSTART.value: "unstart_count",
TaskStatus.RUNNING.value: "running_count",
TaskStatus.CANCEL.value: "cancel_count",
TaskStatus.DONE.value: "done_count",
TaskStatus.FAIL.value: "fail_count",
}
empty_status = {v: 0 for v in status_field_map.values()}
result: dict[str, dict[str, int]] = {kb_id: dict(empty_status) for kb_id in kb_ids}
rows = (
cls.model.select(
cls.model.kb_id,
cls.model.run,
fn.COUNT(cls.model.id).alias("cnt"),
)
.where(cls.model.kb_id.in_(kb_ids))
.group_by(cls.model.kb_id, cls.model.run)
.dicts()
)
for row in rows:
kb_id = row["kb_id"]
run_val = str(row["run"])
field_name = status_field_map.get(run_val)
if field_name and kb_id in result:
result[kb_id][field_name] = int(row["cnt"])
return result
@classmethod
@DB.connection_context()
def count_by_kb_id(cls, kb_id, keywords, run_status, types):
if keywords:
docs = cls.model.select().where((cls.model.kb_id == kb_id), (fn.LOWER(cls.model.name).contains(keywords.lower())))
else:
docs = cls.model.select().where(cls.model.kb_id == kb_id)
if run_status:
docs = docs.where(cls.model.run.in_(run_status))
if types:
docs = docs.where(cls.model.type.in_(types))
count = docs.count()
return count
@classmethod
@DB.connection_context()
def get_total_size_by_kb_id(cls, kb_id, keywords="", run_status=[], types=[]):
query = cls.model.select(fn.COALESCE(fn.SUM(cls.model.size), 0)).where(cls.model.kb_id == kb_id)
if keywords:
query = query.where(fn.LOWER(cls.model.name).contains(keywords.lower()))
if run_status:
query = query.where(cls.model.run.in_(run_status))
if types:
query = query.where(cls.model.type.in_(types))
return int(query.scalar()) or 0
@classmethod
@DB.connection_context()
def get_all_doc_ids_by_kb_ids(cls, kb_ids):
fields = [cls.model.id, cls.model.kb_id]
docs = cls.model.select(*fields).where(cls.model.kb_id.in_(kb_ids))
docs.order_by(cls.model.create_time.asc())
# maybe cause slow query by deep paginate, optimize later
offset, limit = 0, 100
res = []
while True:
doc_batch = docs.offset(offset).limit(limit)
_temp = list(doc_batch.dicts())
if not _temp:
break
res.extend(_temp)
offset += limit
return res
@classmethod
@DB.connection_context()
def list_doc_headers_by_kb_and_source_type(cls, kb_id, source_type, page_size=500):
fields = [cls.model.id, cls.model.kb_id, cls.model.source_type, cls.model.name]
docs = cls.model.select(*fields).where(
cls.model.kb_id == kb_id,
cls.model.source_type == source_type,
).order_by(cls.model.create_time.asc())
offset = 0
res = []
while True:
doc_batch = docs.offset(offset).limit(page_size)
_temp = list(doc_batch.dicts())
if not _temp:
break
res.extend(_temp)
offset += page_size
return res
feat(connectors): ETag-based bypass for incremental S3 ingestion (#14628) (#14677) ### What problem does this PR solve? S3-family connector syncs currently re-download every in-window object just so we can compute `xxhash128(blob)` and compare against `Document.content_hash`. Anything that bumps `LastModified` without changing bytes (`aws s3 cp` touches, bucket re-encryption, etc.) pays full bandwidth and re-parses files that didn't actually change. #14628 covers the broader incremental-ingestion redesign; this PR is the first slice. The fix is a pre-listing short-circuit. `BlobStorageConnector` (S3 / R2 / GCS / OCI / S3-compat) now implements a new `FingerprintConnector` interface: `list_keys()` paginates `list_objects_v2` and yields `KeyRecord(key, fingerprint)` where `fingerprint = xxhash128(ETag)`. The orchestrator joins those against the connector's existing `{doc_id: content_hash}` map and only calls `get_value(key)` when the fingerprint differs. Unchanged keys are skipped entirely — no `GetObject`, no re-parse. No DDL. xxhash128(ETag) is 32 hex chars and reuses the existing `Document.content_hash` column per @yingfeng's suggestion; the connector decides at listing time whether to populate it. Local uploads and connectors that don't opt in fall through to the existing post-download `xxhash128(blob)` path with no behavior change. This is PR-1 of a 4-PR series — full design lives on #14628. Subsequent PRs extend tier 1 to local FS / WebDAV / Dropbox / Seafile / RDBMS (PR-2), wire up tier 2 cursor connectors with `SyncLogs.next_checkpoint` (PR-3), and unify deletion via `KeyRecord(deleted=True)` reconciliation (PR-4). Holding those back keeps this PR additive and reviewable on its own. #### Files touched - `common/data_source/models.py` — new `KeyRecord`; optional `fingerprint` on `Document` - `common/data_source/interfaces.py` — `IncrementalCapability` enum, `FingerprintConnector` ABC - `common/data_source/blob_connector.py` — `BlobStorageConnector` implements `FingerprintConnector`; per-object download factored into `_build_document_from_obj()` so `_yield_blob_objects`, `list_keys`, `get_value` all share it - `rag/svr/sync_data_source.py` — `_BlobLikeBase._fingerprint_filtered_generator` does the bypass loop; `_run_task_logic` plumbs `doc.fingerprint` into the upload dict - `api/db/services/document_service.py` — `list_id_content_hash_map_by_kb_and_source_type()` helper - `api/db/services/connector_service.py` + `file_service.py` — fingerprint flows through `duplicate_and_parse → upload_document` and lands in `content_hash` - `test/unit_test/common/test_blob_connector_fingerprint.py` — 14 tests covering ETag normalization (single-part, multipart, quoted, empty), `list_keys()` not calling `GetObject`, `get_value()` materializing with fingerprint, deterministic/stable fingerprints, and the bypass loop asserting `GetObject` is *not* called on a match #### Worth flagging for review Old `_BlobLikeBase._generate` called `poll_source(start, now)` with a `LastModified` window when `poll_range_start` was set. New code uses `_fingerprint_filtered_generator` (full bucket listing + fingerprint compare) outside of explicit `reindex=1`. Strictly better for unchanged-bucket cases since it skips `GetObject`, but it does mean every sync now does a full `list_objects_v2` paginate. Should still be cheap for most buckets — flagging in case anyone has a very large bucket where the time-window filter was meaningful. On migration: existing rows have `content_hash = xxhash128(blob)` from the old code. The first sync after this lands sees ETag-derived fingerprints that don't match, re-fetches every object once, and writes the new fingerprint. From the second sync onward the bypass works as expected. "Slow day one, fast every day after." A `fingerprint_backfill: trust` opt-out is sketched in the design doc but not in this PR. #### Test plan - [x] `uv run ruff check` — clean on all 8 touched files - [x] `uv run pytest test/unit_test/common/test_blob_connector_fingerprint.py -v` — 14 passed - [x] Broader unit-test suite — no regressions in anything I touched - [ ] Manual smoke against a real S3 bucket — configure a connector, run sync twice, expect the second sync to log `bypassed=N, fetched=0` and no `GetObject` calls in CloudTrail / bucket access logs - [ ] Manual smoke with `reindex=1` — confirm the full re-download path still works ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
2026-05-09 05:03:56 -07:00
@classmethod
@DB.connection_context()
def list_id_content_hash_map_by_kb_and_source_type(cls, kb_id, source_type, page_size=500):
"""Return {doc_id: content_hash} for the connector's existing docs.
Used by the fingerprint-bypass path to decide which keys can skip a
re-fetch -- if the connector's listing fingerprint equals content_hash,
the body hasn't changed since the last sync.
Ordered by create_time so LIMIT/OFFSET pagination is stable under
concurrent writes; without this, page boundaries can drop or duplicate
rows and the resulting map would silently miss entries.
"""
fields = [cls.model.id, cls.model.content_hash]
docs = cls.model.select(*fields).where(
cls.model.kb_id == kb_id,
cls.model.source_type == source_type,
).order_by(cls.model.create_time.asc())
offset = 0
result: dict[str, str] = {}
while True:
batch = list(docs.offset(offset).limit(page_size).dicts())
if not batch:
break
for row in batch:
result[row["id"]] = row.get("content_hash") or ""
offset += page_size
return result
@classmethod
@DB.connection_context()
def get_all_docs_by_creator_id(cls, creator_id):
fields = [cls.model.id, cls.model.kb_id, cls.model.token_num, cls.model.chunk_num, Knowledgebase.tenant_id]
docs = cls.model.select(*fields).join(Knowledgebase, on=(Knowledgebase.id == cls.model.kb_id)).where(cls.model.created_by == creator_id)
docs.order_by(cls.model.create_time.asc())
# maybe cause slow query by deep paginate, optimize later
offset, limit = 0, 100
res = []
while True:
doc_batch = docs.offset(offset).limit(limit)
_temp = list(doc_batch.dicts())
if not _temp:
break
res.extend(_temp)
offset += limit
return res
@classmethod
@DB.connection_context()
def insert(cls, doc):
if not cls.save(**doc):
raise RuntimeError("Database error (Document)!")
if not KnowledgebaseService.atomic_increase_doc_num_by_id(doc["kb_id"]):
raise RuntimeError("Database error (Knowledgebase)!")
return Document(**doc)
@classmethod
@DB.connection_context()
def remove_document(cls, doc, tenant_id):
Fix: Cancel tasks before document or datasets deletion to prevent queue blocking (#12799) ### What problem does this PR solve? When deleting the knowledge base, the records in the Document and Knowledgebase tables are immediately deleted But there are still a large number of pending task messages in the Redis queue (asynchronous queue) if you did not click on stopping tasks before deleting knowledge base. TaskService.get_task() uses a JOIN query to associate three tables (Task ← Document ← Knowledgebase) Since Document/Knowledgebase have been deleted, the JOIN returns an empty result, even though the Task records still exist task-executor considers the task does not exist ("collect task xxx is unknown"), can only skip and warn log:2026-01-23 16:43:21,716 WARNING 1190179 collect task 110fbf70f5bd11f0945a23b0930487df is unknown 2026-01-23 16:43:21,818 WARNING 1190179 collect task 11146bc4f5bd11f0945a23b0930487df is unknown 2026-01-23 16:43:21,918 WARNING 1190179 collect task 111c3336f5bd11f0945a23b0930487df is unknown 2026-01-23 16:43:22,021 WARNING 1190179 collect task 112471b8f5bd11f0945a23b0930487df is unknown 2026-01-23 16:43:26,719 WARNING 1190179 collect task 112e855ef5bd11f0945a23b0930487df is unknown 2026-01-23 16:43:26,734 WARNING 1190179 collect task 1134380af5bd11f0945a23b0930487df is unknown 2026-01-23 16:43:26,834 WARNING 1190179 collect task 1138cb2cf5bd11f0945a23b0930487df is unknown As a consequence, a large number of such tasks occupy the queue processing capacity, causing new tasks to queue and wait <img width="1910" height="947" alt="9a00f2e0-9112-4dbb-b357-7f66b8eb5acf" src="https://github.com/user-attachments/assets/0e1227c2-a2df-4ef3-ba8f-e04c3f6ef0e1" /> Solution Add logic to stop all ongoing tasks before deleting the knowledge base and Tasks ### Type of change - Bug Fix (non-breaking change which fixes an issue)
2026-01-26 10:45:59 +08:00
from api.db.services.task_service import TaskService, cancel_all_task_of
if not cls.delete_document_and_update_kb_counts(doc.id):
return True
chunk_index_name = search.index_name(tenant_id)
chunk_index_exists = settings.docStoreConn.index_exist(chunk_index_name, doc.kb_id)
# Cancel all running tasks first using preset function in task_service.py --- set cancel flag in Redis
Fix: Cancel tasks before document or datasets deletion to prevent queue blocking (#12799) ### What problem does this PR solve? When deleting the knowledge base, the records in the Document and Knowledgebase tables are immediately deleted But there are still a large number of pending task messages in the Redis queue (asynchronous queue) if you did not click on stopping tasks before deleting knowledge base. TaskService.get_task() uses a JOIN query to associate three tables (Task ← Document ← Knowledgebase) Since Document/Knowledgebase have been deleted, the JOIN returns an empty result, even though the Task records still exist task-executor considers the task does not exist ("collect task xxx is unknown"), can only skip and warn log:2026-01-23 16:43:21,716 WARNING 1190179 collect task 110fbf70f5bd11f0945a23b0930487df is unknown 2026-01-23 16:43:21,818 WARNING 1190179 collect task 11146bc4f5bd11f0945a23b0930487df is unknown 2026-01-23 16:43:21,918 WARNING 1190179 collect task 111c3336f5bd11f0945a23b0930487df is unknown 2026-01-23 16:43:22,021 WARNING 1190179 collect task 112471b8f5bd11f0945a23b0930487df is unknown 2026-01-23 16:43:26,719 WARNING 1190179 collect task 112e855ef5bd11f0945a23b0930487df is unknown 2026-01-23 16:43:26,734 WARNING 1190179 collect task 1134380af5bd11f0945a23b0930487df is unknown 2026-01-23 16:43:26,834 WARNING 1190179 collect task 1138cb2cf5bd11f0945a23b0930487df is unknown As a consequence, a large number of such tasks occupy the queue processing capacity, causing new tasks to queue and wait <img width="1910" height="947" alt="9a00f2e0-9112-4dbb-b357-7f66b8eb5acf" src="https://github.com/user-attachments/assets/0e1227c2-a2df-4ef3-ba8f-e04c3f6ef0e1" /> Solution Add logic to stop all ongoing tasks before deleting the knowledge base and Tasks ### Type of change - Bug Fix (non-breaking change which fixes an issue)
2026-01-26 10:45:59 +08:00
try:
cancel_all_task_of(doc.id)
logging.info(f"Cancelled all tasks for document {doc.id}")
except Exception as e:
logging.warning(f"Failed to cancel tasks for document {doc.id}: {e}")
# Delete tasks from database
try:
TaskService.filter_delete([Task.doc_id == doc.id])
except Exception as e:
logging.warning(f"Failed to delete tasks for document {doc.id}: {e}")
# Delete chunk images (non-critical, log and continue)
try:
if chunk_index_exists:
cls.delete_chunk_images(doc, tenant_id)
except Exception as e:
logging.warning(f"Failed to delete chunk images for document {doc.id}: {e}")
# Delete thumbnail (non-critical, log and continue)
try:
if doc.thumbnail and not doc.thumbnail.startswith(IMG_BASE64_PREFIX):
if settings.STORAGE_IMPL.obj_exist(doc.kb_id, doc.thumbnail):
settings.STORAGE_IMPL.rm(doc.kb_id, doc.thumbnail)
except Exception as e:
logging.warning(f"Failed to delete thumbnail for document {doc.id}: {e}")
# Delete chunks from doc store - this is critical, log errors
try:
settings.docStoreConn.delete({"doc_id": doc.id}, chunk_index_name, doc.kb_id)
except Exception as e:
logging.error(f"Failed to delete chunks from doc store for document {doc.id}: {e}")
# Delete document metadata (non-critical, log and continue)
try:
DocMetadataService.delete_document_metadata(doc.id, doc.kb_id, tenant_id)
except Exception as e:
logging.warning(f"Failed to delete metadata for document {doc.id}: {e}")
# Cleanup knowledge graph references (non-critical, log and continue)
try:
if chunk_index_exists:
graph_source = settings.docStoreConn.get_fields(
settings.docStoreConn.search(["source_id"], [], {"kb_id": doc.kb_id, "knowledge_graph_kwd": ["graph"]}, [], OrderByExpr(), 0, 1, chunk_index_name, [doc.kb_id]),
["source_id"],
)
if len(graph_source) > 0 and doc.id in list(graph_source.values())[0]["source_id"]:
settings.docStoreConn.update(
{"kb_id": doc.kb_id, "knowledge_graph_kwd": ["entity", "relation", "graph", "subgraph", "community_report"], "source_id": doc.id},
{"remove": {"source_id": doc.id}},
chunk_index_name,
doc.kb_id,
)
settings.docStoreConn.update({"kb_id": doc.kb_id, "knowledge_graph_kwd": ["graph"]}, {"removed_kwd": "Y"}, chunk_index_name, doc.kb_id)
settings.docStoreConn.delete(
{"kb_id": doc.kb_id, "knowledge_graph_kwd": ["entity", "relation", "graph", "subgraph", "community_report"], "must_not": {"exists": "source_id"}},
chunk_index_name,
doc.kb_id,
)
except Exception as e:
logging.warning(f"Failed to cleanup knowledge graph for document {doc.id}: {e}")
return True
@classmethod
@DB.connection_context()
def delete_chunk_images(cls, doc, tenant_id):
page = 0
page_size = 1000
while True:
chunks = settings.docStoreConn.search(["img_id"], [], {"doc_id": doc.id}, [], OrderByExpr(), page * page_size, page_size, search.index_name(tenant_id), [doc.kb_id])
chunk_ids = settings.docStoreConn.get_doc_ids(chunks)
if not chunk_ids:
break
for cid in chunk_ids:
if settings.STORAGE_IMPL.obj_exist(doc.kb_id, cid):
settings.STORAGE_IMPL.rm(doc.kb_id, cid)
page += 1
@classmethod
@DB.connection_context()
def get_newly_uploaded(cls):
fields = [
cls.model.id,
cls.model.kb_id,
cls.model.parser_id,
cls.model.parser_config,
cls.model.name,
cls.model.type,
cls.model.location,
cls.model.size,
Knowledgebase.tenant_id,
Tenant.embd_id,
Tenant.img2txt_id,
Tenant.asr_id,
cls.model.update_time,
]
docs = (
cls.model.select(*fields)
.join(Knowledgebase, on=(cls.model.kb_id == Knowledgebase.id))
.join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))
.where(
cls.model.status == StatusEnum.VALID.value,
~(cls.model.type == FileType.VIRTUAL.value),
cls.model.progress == 0,
cls.model.update_time >= current_timestamp() - 1000 * 600,
cls.model.run == TaskStatus.RUNNING.value,
)
.order_by(cls.model.update_time.asc())
)
return list(docs.dicts())
@classmethod
@DB.connection_context()
def get_unfinished_docs(cls):
fields = [cls.model.id, cls.model.process_begin_at, cls.model.parser_config, cls.model.progress_msg,
cls.model.run, cls.model.parser_id]
unfinished_task_query = Task.select(Task.doc_id).where(
(Task.progress >= 0) & (Task.progress < 1)
)
docs_with_non_failed_tasks = Task.select(Task.doc_id).where(Task.progress >= 0).distinct()
docs = cls.model.select(*fields).where(
cls.model.status == StatusEnum.VALID.value,
~(cls.model.type == FileType.VIRTUAL.value),
((cls.model.run.is_null(True)) | (cls.model.run != TaskStatus.CANCEL.value)),
(((cls.model.progress < 1) & (cls.model.progress > 0)) |
(cls.model.id.in_(unfinished_task_query)) |
((cls.model.progress == -1) & (cls.model.run == TaskStatus.FAIL.value) &
(cls.model.id.in_(docs_with_non_failed_tasks))))) # including GraphRAG/RAPTOR/Mindmap; re-sync failed docs
return list(docs.dicts())
@classmethod
@DB.connection_context()
def increment_chunk_num(cls, doc_id, kb_id, token_num, chunk_num, duration):
fix: atomic chunk/token counter updates for documents and knowledge b… (#14867) ### What problem does this PR solve? Fixes #14866. Previously, `DocumentService.increment_chunk_num` and `decrement_chunk_num` updated the `Document` row and its parent `Knowledgebase` row in two separate, non-transactional statements. If the second update failed (DB error, connection drop, etc.) after the first one succeeded, the document and knowledge base chunk/token counters would drift apart and stay inconsistent. There was also a behavioral asymmetry between the two methods: - `increment_chunk_num` only logged a warning when the document row was missing and returned a value that callers usually treated as success. - `decrement_chunk_num` raised `LookupError` in the same situation. This PR makes the counter updates atomic and aligns the missing-document behavior between the two methods: - Wrap the `Document` and `Knowledgebase` updates in `increment_chunk_num` / `decrement_chunk_num` inside a `DB.atomic()` block so both succeed or both roll back together. - Raise `LookupError` from `increment_chunk_num` when the target document no longer exists, matching `decrement_chunk_num`. - Update `reset_document_for_reparse` in `document_api_service.py` to catch the new `LookupError` and return a proper "Document not found!" API error instead of propagating the exception. No schema changes, no API contract changes for the success path; only the failure mode for a missing document during reparse is now a clean error response instead of an uncaught exception. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
2026-05-13 23:48:52 -07:00
"""Atomically add chunk/token counters on the document and its knowledge base."""
with DB.atomic():
num = (
cls.model.update(
token_num=cls.model.token_num + token_num,
chunk_num=cls.model.chunk_num + chunk_num,
process_duration=cls.model.process_duration + duration,
)
.where((cls.model.id == doc_id) & (cls.model.kb_id == kb_id))
.execute()
)
if num == 0:
logging.error(
"increment_chunk_num: no document matched doc_id=%s kb_id=%s "
"token_num=%s chunk_num=%s duration=%s",
doc_id,
kb_id,
token_num,
chunk_num,
duration,
)
raise LookupError("Document not found which is supposed to be there")
num = (
Knowledgebase.update(
token_num=Knowledgebase.token_num + token_num,
chunk_num=Knowledgebase.chunk_num + chunk_num,
)
.where(Knowledgebase.id == kb_id)
.execute()
)
if num == 0:
logging.error(
"increment_chunk_num: no knowledgebase matched kb_id=%s for doc_id=%s "
"token_num=%s chunk_num=%s duration=%s",
kb_id,
doc_id,
token_num,
chunk_num,
duration,
)
raise LookupError("Knowledgebase not found which is supposed to be there")
return num
@classmethod
@DB.connection_context()
def decrement_chunk_num(cls, doc_id, kb_id, token_num, chunk_num, duration):
fix: atomic chunk/token counter updates for documents and knowledge b… (#14867) ### What problem does this PR solve? Fixes #14866. Previously, `DocumentService.increment_chunk_num` and `decrement_chunk_num` updated the `Document` row and its parent `Knowledgebase` row in two separate, non-transactional statements. If the second update failed (DB error, connection drop, etc.) after the first one succeeded, the document and knowledge base chunk/token counters would drift apart and stay inconsistent. There was also a behavioral asymmetry between the two methods: - `increment_chunk_num` only logged a warning when the document row was missing and returned a value that callers usually treated as success. - `decrement_chunk_num` raised `LookupError` in the same situation. This PR makes the counter updates atomic and aligns the missing-document behavior between the two methods: - Wrap the `Document` and `Knowledgebase` updates in `increment_chunk_num` / `decrement_chunk_num` inside a `DB.atomic()` block so both succeed or both roll back together. - Raise `LookupError` from `increment_chunk_num` when the target document no longer exists, matching `decrement_chunk_num`. - Update `reset_document_for_reparse` in `document_api_service.py` to catch the new `LookupError` and return a proper "Document not found!" API error instead of propagating the exception. No schema changes, no API contract changes for the success path; only the failure mode for a missing document during reparse is now a clean error response instead of an uncaught exception. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
2026-05-13 23:48:52 -07:00
"""Atomically subtract chunk/token counters on the document and its knowledge base."""
with DB.atomic():
num = (
cls.model.update(
token_num=cls.model.token_num - token_num,
chunk_num=cls.model.chunk_num - chunk_num,
process_duration=cls.model.process_duration + duration,
)
.where((cls.model.id == doc_id) & (cls.model.kb_id == kb_id))
.execute()
)
if num == 0:
raise LookupError("Document not found which is supposed to be there")
num = (
Knowledgebase.update(
token_num=Knowledgebase.token_num - token_num,
chunk_num=Knowledgebase.chunk_num - chunk_num,
)
.where(Knowledgebase.id == kb_id)
.execute()
)
if num == 0:
logging.error(
"decrement_chunk_num: no knowledgebase matched kb_id=%s for doc_id=%s "
"token_num=%s chunk_num=%s duration=%s",
kb_id,
doc_id,
token_num,
chunk_num,
duration,
)
raise LookupError("Knowledgebase not found which is supposed to be there")
return num
@classmethod
@retry_deadlock_operation()
@DB.connection_context()
def delete_document_and_update_kb_counts(cls, doc_id) -> bool:
"""Atomically delete the document row and update KB counters.
Returns True if the document was deleted by this call, False if it was
already deleted by a concurrent request (idempotent).
"""
with DB.atomic():
doc = (
cls.model.select(
cls.model.id,
cls.model.kb_id,
cls.model.token_num,
cls.model.chunk_num,
)
.where(cls.model.id == doc_id)
.for_update()
.get_or_none()
)
if doc is None:
return False
deleted = cls.model.delete().where(cls.model.id == doc_id).execute()
if not deleted:
return False
Knowledgebase.update(
token_num=Knowledgebase.token_num - doc.token_num,
chunk_num=Knowledgebase.chunk_num - doc.chunk_num,
doc_num=Knowledgebase.doc_num - 1,
).where(Knowledgebase.id == doc.kb_id).execute()
return True
@classmethod
@DB.connection_context()
def clear_chunk_num(cls, doc_id):
"""Deprecated: use delete_document_and_update_kb_counts instead."""
doc = cls.model.get_by_id(doc_id)
assert doc, "Can't find document in database."
num = (
Knowledgebase.update(token_num=Knowledgebase.token_num - doc.token_num, chunk_num=Knowledgebase.chunk_num - doc.chunk_num, doc_num=Knowledgebase.doc_num - 1)
.where(Knowledgebase.id == doc.kb_id)
.execute()
)
return num
@classmethod
@DB.connection_context()
def clear_chunk_num_when_rerun(cls, doc_id):
doc = cls.model.get_by_id(doc_id)
assert doc, "Can't find document in database."
num = (
Knowledgebase.update(
token_num=Knowledgebase.token_num - doc.token_num,
chunk_num=Knowledgebase.chunk_num - doc.chunk_num,
)
.where(Knowledgebase.id == doc.kb_id)
.execute()
)
return num
@classmethod
@DB.connection_context()
def get_tenant_id(cls, doc_id):
docs = cls.model.select(Knowledgebase.tenant_id).join(Knowledgebase, on=(Knowledgebase.id == cls.model.kb_id)).where(cls.model.id == doc_id, Knowledgebase.status == StatusEnum.VALID.value)
docs = docs.dicts()
if not docs:
return None
return docs[0]["tenant_id"]
@classmethod
@DB.connection_context()
def get_knowledgebase_id(cls, doc_id):
docs = cls.model.select(cls.model.kb_id).where(cls.model.id == doc_id)
docs = docs.dicts()
if not docs:
return None
return docs[0]["kb_id"]
@classmethod
@DB.connection_context()
def get_tenant_id_by_name(cls, name):
docs = cls.model.select(Knowledgebase.tenant_id).join(Knowledgebase, on=(Knowledgebase.id == cls.model.kb_id)).where(cls.model.name == name, Knowledgebase.status == StatusEnum.VALID.value)
docs = docs.dicts()
if not docs:
return None
return docs[0]["tenant_id"]
@classmethod
@DB.connection_context()
def accessible(cls, doc_id, user_id):
Fix: private dataset authorization bypass in shared dataset access checks (#14645) ### Related issues Closes #14644 ### What problem does this PR solve? This PR fixes an authorization bug where datasets marked with `permission = me` could still be accessed by other members of the same tenant through APIs that relied on `KnowledgebaseService.accessible()` or `DocumentService.accessible()`. Before this change, those shared access helpers only checked tenant membership and did not enforce the dataset's permission mode. As a result, a non-owner who knew a private `dataset_id` could still reach downstream document and chunk operations even though the dataset was intended to be owner-only. This change updates the central access checks so that: - dataset owners always retain access - joined tenant members only get access when the dataset permission is `TEAM` - private datasets with `permission = me` remain inaccessible to non-owners - document-level access follows the same dataset permission rules The PR also adds regression coverage for private-vs-team dataset access behavior. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [ ] New Feature (non-breaking change which adds functionality) - [ ] Documentation Update - [ ] Refactoring - [ ] Performance Improvement - [ ] Other (please describe): ### Testing - Added `test/unit_test/api/db/services/test_dataset_access_permissions.py` - Attempted to run: `python -m pytest test\\unit_test\\api\\db\\services\\test_dataset_access_permissions.py -q` - Local execution in this workspace is currently blocked during test collection because the environment is missing the `strenum` dependency --------- Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: jony376 <jony376@gmail.com> Co-authored-by: Wang Qi <wangq8@outlook.com> Co-authored-by: d 🔹 <liusway405@gmail.com> Co-authored-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Magicbook1108 <newyorkupperbay@gmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: sxxtony <166789813+sxxtony@users.noreply.github.com> Co-authored-by: sxxtony <sxxtony@users.noreply.github.com> Co-authored-by: Baki Burak Öğün <63836730+bakiburakogun@users.noreply.github.com> Co-authored-by: bakiburakogun <bakiburakogun@users.noreply.github.com> Co-authored-by: Panda Dev <56657208+pandadev66@users.noreply.github.com> Co-authored-by: Haruko386 <tryeverypossible@163.com> Co-authored-by: D2758695161 <13510221939@163.com> Co-authored-by: Hunter <hunter@yitong.ai> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: buua436 <sz_buua@foxmail.com> Co-authored-by: web-dev0521 <jasonpette1783@gmail.com> Co-authored-by: Tim Wang <38489718+wanghualoong@users.noreply.github.com> Co-authored-by: wanghualoong <wanghualoong@gmail.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: qinling0210 <88864212+qinling0210@users.noreply.github.com> Co-authored-by: dale053 <star05223@outlook.com>
2026-05-08 22:30:14 -07:00
e, doc = cls.get_by_id(doc_id)
if not e:
return False
Fix: private dataset authorization bypass in shared dataset access checks (#14645) ### Related issues Closes #14644 ### What problem does this PR solve? This PR fixes an authorization bug where datasets marked with `permission = me` could still be accessed by other members of the same tenant through APIs that relied on `KnowledgebaseService.accessible()` or `DocumentService.accessible()`. Before this change, those shared access helpers only checked tenant membership and did not enforce the dataset's permission mode. As a result, a non-owner who knew a private `dataset_id` could still reach downstream document and chunk operations even though the dataset was intended to be owner-only. This change updates the central access checks so that: - dataset owners always retain access - joined tenant members only get access when the dataset permission is `TEAM` - private datasets with `permission = me` remain inaccessible to non-owners - document-level access follows the same dataset permission rules The PR also adds regression coverage for private-vs-team dataset access behavior. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [ ] New Feature (non-breaking change which adds functionality) - [ ] Documentation Update - [ ] Refactoring - [ ] Performance Improvement - [ ] Other (please describe): ### Testing - Added `test/unit_test/api/db/services/test_dataset_access_permissions.py` - Attempted to run: `python -m pytest test\\unit_test\\api\\db\\services\\test_dataset_access_permissions.py -q` - Local execution in this workspace is currently blocked during test collection because the environment is missing the `strenum` dependency --------- Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: jony376 <jony376@gmail.com> Co-authored-by: Wang Qi <wangq8@outlook.com> Co-authored-by: d 🔹 <liusway405@gmail.com> Co-authored-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Magicbook1108 <newyorkupperbay@gmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: sxxtony <166789813+sxxtony@users.noreply.github.com> Co-authored-by: sxxtony <sxxtony@users.noreply.github.com> Co-authored-by: Baki Burak Öğün <63836730+bakiburakogun@users.noreply.github.com> Co-authored-by: bakiburakogun <bakiburakogun@users.noreply.github.com> Co-authored-by: Panda Dev <56657208+pandadev66@users.noreply.github.com> Co-authored-by: Haruko386 <tryeverypossible@163.com> Co-authored-by: D2758695161 <13510221939@163.com> Co-authored-by: Hunter <hunter@yitong.ai> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: buua436 <sz_buua@foxmail.com> Co-authored-by: web-dev0521 <jasonpette1783@gmail.com> Co-authored-by: Tim Wang <38489718+wanghualoong@users.noreply.github.com> Co-authored-by: wanghualoong <wanghualoong@gmail.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: qinling0210 <88864212+qinling0210@users.noreply.github.com> Co-authored-by: dale053 <star05223@outlook.com>
2026-05-08 22:30:14 -07:00
return KnowledgebaseService.accessible(doc.kb_id, user_id)
@classmethod
@DB.connection_context()
def accessible4deletion(cls, doc_id, user_id):
docs = (
cls.model.select(cls.model.id)
.join(Knowledgebase, on=(Knowledgebase.id == cls.model.kb_id))
.join(UserTenant, on=((UserTenant.tenant_id == Knowledgebase.created_by) & (UserTenant.user_id == user_id)))
.where(cls.model.id == doc_id, UserTenant.status == StatusEnum.VALID.value, ((UserTenant.role == UserTenantRole.NORMAL) | (UserTenant.role == UserTenantRole.OWNER)))
.paginate(0, 1)
)
docs = docs.dicts()
if not docs:
return False
return True
@classmethod
@DB.connection_context()
def get_embd_id(cls, doc_id):
docs = cls.model.select(Knowledgebase.embd_id).join(Knowledgebase, on=(Knowledgebase.id == cls.model.kb_id)).where(cls.model.id == doc_id, Knowledgebase.status == StatusEnum.VALID.value)
docs = docs.dicts()
if not docs:
return None
return docs[0]["embd_id"]
@classmethod
@DB.connection_context()
def get_tenant_embd_id(cls, doc_id):
docs = (
cls.model.select(Knowledgebase.tenant_embd_id).join(Knowledgebase, on=(Knowledgebase.id == cls.model.kb_id)).where(cls.model.id == doc_id, Knowledgebase.status == StatusEnum.VALID.value)
)
docs = docs.dicts()
if not docs:
return None
return docs[0]["tenant_embd_id"]
@classmethod
@DB.connection_context()
def get_chunking_config(cls, doc_id):
configs = (
cls.model.select(
cls.model.id,
cls.model.kb_id,
cls.model.parser_id,
cls.model.parser_config,
fix: re-chunk documents when data source content is updated (#12918) Closes: #12889 ### What problem does this PR solve? When syncing external data sources (e.g., Jira, Confluence, Google Drive), updated documents were not being re-chunked. The raw content was correctly updated in blob storage, but the vector database retained stale chunks, causing search results to return outdated information. **Root cause:** The task digest used for chunk reuse optimization was calculated only from parser configuration fields (`parser_id`, `parser_config`, `kb_id`, etc.), without any content-dependent fields. When a document's content changed but the parser configuration remained the same, the system incorrectly reused old chunks instead of regenerating new ones. **Example scenario:** 1. User syncs a Jira issue: "Meeting scheduled for Monday" 2. User updates the Jira issue to: "Meeting rescheduled to Friday" 3. User triggers sync again 4. Raw content panel shows updated text ✓ 5. Chunk panel still shows old text "Monday" ✗ **Solution:** 1. Include `update_time` and `size` in the chunking config, so the task digest changes when document content is updated 2. Track updated documents separately in `upload_document()` and return them for processing 3. Process updated documents through the re-parsing pipeline to regenerate chunks [1.webm](https://github.com/user-attachments/assets/d21d4dcd-e189-4d39-8700-053bae0ca5a0) ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
2026-03-06 06:48:47 +02:00
cls.model.size,
cls.model.content_hash,
Knowledgebase.language,
Knowledgebase.embd_id,
Tenant.id.alias("tenant_id"),
Tenant.img2txt_id,
Tenant.asr_id,
Tenant.llm_id,
)
.join(Knowledgebase, on=(cls.model.kb_id == Knowledgebase.id))
.join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))
.where(cls.model.id == doc_id)
)
configs = configs.dicts()
if not configs:
return None
return configs[0]
@classmethod
@DB.connection_context()
def get_doc_id_by_doc_name(cls, doc_name):
fields = [cls.model.id]
doc_id = cls.model.select(*fields).where(cls.model.name == doc_name)
doc_id = doc_id.dicts()
if not doc_id:
return None
return doc_id[0]["id"]
@classmethod
@DB.connection_context()
def get_doc_ids_by_doc_names(cls, doc_names):
if not doc_names:
return []
query = cls.model.select(cls.model.id).where(cls.model.name.in_(doc_names))
return list(query.scalars().iterator())
@classmethod
@DB.connection_context()
def get_thumbnails(cls, docids):
fields = [cls.model.id, cls.model.kb_id, cls.model.thumbnail]
return list(cls.model.select(*fields).where(cls.model.id.in_(docids)).dicts())
@classmethod
@DB.connection_context()
def update_parser_config(cls, id, config):
if not config:
return
e, d = cls.get_by_id(id)
if not e:
raise LookupError(f"Document({id}) not found.")
def dfs_update(old, new):
for k, v in new.items():
if k not in old:
old[k] = v
continue
if isinstance(v, dict) and isinstance(old[k], dict):
dfs_update(old[k], v)
else:
old[k] = v
dfs_update(d.parser_config, config)
if not config.get("raptor") and d.parser_config.get("raptor"):
del d.parser_config["raptor"]
cls.update_by_id(id, {"parser_config": d.parser_config})
@classmethod
@DB.connection_context()
def get_doc_count(cls, tenant_id):
docs = cls.model.select(cls.model.id).join(Knowledgebase, on=(Knowledgebase.id == cls.model.kb_id)).where(Knowledgebase.tenant_id == tenant_id)
return len(docs)
@classmethod
@DB.connection_context()
def begin2parse(cls, doc_id, keep_progress=False):
info = {
"progress_msg": "Task is queued...",
"process_begin_at": get_format_time(),
}
if not keep_progress:
info["progress"] = random.random() * 1 / 100.0
info["run"] = TaskStatus.RUNNING.value
# keep the doc in DONE state when keep_progress=True for GraphRAG, RAPTOR and Mindmap tasks
cls.update_by_id(doc_id, info)
@classmethod
@DB.connection_context()
def update_progress(cls):
docs = cls.get_unfinished_docs()
Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423) ### What problem does this PR solve? #9869 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: jinhai <haijin.chn@gmail.com> Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com> Co-authored-by: TeslaZY <TeslaZY@outlook.com> Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com> Co-authored-by: AB <aj@Ajays-MacBook-Air.local> Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com> Co-authored-by: He Wang <wanghechn@qq.com> Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com> Co-authored-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com> Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box> Co-authored-by: Stephen Hu <stephenhu@seismic.com> Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com> Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com> Co-authored-by: mxc <mxc@example.com> Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com> Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com> Co-authored-by: mcoder6425 <mcoder64@gmail.com> Co-authored-by: lemsn <lemsn@msn.com> Co-authored-by: lemsn <lemsn@126.com> Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com> Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com> Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
2025-10-09 12:36:19 +08:00
cls._sync_progress(docs)
@classmethod
@DB.connection_context()
def update_progress_immediately(cls, docs: list[dict]):
Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423) ### What problem does this PR solve? #9869 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: jinhai <haijin.chn@gmail.com> Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com> Co-authored-by: TeslaZY <TeslaZY@outlook.com> Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com> Co-authored-by: AB <aj@Ajays-MacBook-Air.local> Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com> Co-authored-by: He Wang <wanghechn@qq.com> Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com> Co-authored-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com> Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box> Co-authored-by: Stephen Hu <stephenhu@seismic.com> Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com> Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com> Co-authored-by: mxc <mxc@example.com> Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com> Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com> Co-authored-by: mcoder6425 <mcoder64@gmail.com> Co-authored-by: lemsn <lemsn@msn.com> Co-authored-by: lemsn <lemsn@126.com> Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com> Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com> Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
2025-10-09 12:36:19 +08:00
if not docs:
return
cls._sync_progress(docs)
@classmethod
@DB.connection_context()
def _sync_progress(cls, docs: list[dict]):
from api.db.services.task_service import TaskService
for d in docs:
try:
tsks = TaskService.query(doc_id=d["id"], order_by=Task.create_time)
if not tsks:
continue
msg = []
prg = 0
finished = True
bad = 0
e, doc = DocumentService.get_by_id(d["id"])
status = doc.run # TaskStatus.RUNNING.value
if status == TaskStatus.CANCEL.value:
continue
doc_progress = doc.progress if doc and doc.progress else 0.0
special_task_running = False
priority = 0
for t in tsks:
task_type = (t.task_type or "").lower()
if task_type in PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES:
special_task_running = True
if 0 <= t.progress < 1:
finished = False
if t.progress == -1:
bad += 1
prg += t.progress if t.progress >= 0 else 0
if t.progress_msg.strip():
msg.append(t.progress_msg)
priority = max(priority, t.priority)
prg /= len(tsks)
if finished and bad:
prg = -1
status = TaskStatus.FAIL.value
elif finished:
Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423) ### What problem does this PR solve? #9869 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: jinhai <haijin.chn@gmail.com> Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com> Co-authored-by: TeslaZY <TeslaZY@outlook.com> Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com> Co-authored-by: AB <aj@Ajays-MacBook-Air.local> Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com> Co-authored-by: He Wang <wanghechn@qq.com> Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com> Co-authored-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com> Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box> Co-authored-by: Stephen Hu <stephenhu@seismic.com> Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com> Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com> Co-authored-by: mxc <mxc@example.com> Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com> Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com> Co-authored-by: mcoder6425 <mcoder64@gmail.com> Co-authored-by: lemsn <lemsn@msn.com> Co-authored-by: lemsn <lemsn@126.com> Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com> Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com> Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
2025-10-09 12:36:19 +08:00
prg = 1
status = TaskStatus.DONE.value
elif not finished:
status = TaskStatus.RUNNING.value
# only for special task and parsed docs and unfinished
freeze_progress = special_task_running and doc_progress >= 1 and not finished
msg = "\n".join(sorted(msg))
begin_at = d.get("process_begin_at")
if not begin_at:
begin_at = datetime.now()
# fallback
cls.update_by_id(d["id"], {"process_begin_at": begin_at})
info = {"process_duration": max(datetime.timestamp(datetime.now()) - begin_at.timestamp(), 0), "run": status}
if prg != 0 and not freeze_progress:
info["progress"] = prg
if msg:
info["progress_msg"] = msg
Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423) ### What problem does this PR solve? #9869 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: jinhai <haijin.chn@gmail.com> Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com> Co-authored-by: TeslaZY <TeslaZY@outlook.com> Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com> Co-authored-by: AB <aj@Ajays-MacBook-Air.local> Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com> Co-authored-by: He Wang <wanghechn@qq.com> Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com> Co-authored-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com> Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box> Co-authored-by: Stephen Hu <stephenhu@seismic.com> Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com> Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com> Co-authored-by: mxc <mxc@example.com> Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com> Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com> Co-authored-by: mcoder6425 <mcoder64@gmail.com> Co-authored-by: lemsn <lemsn@msn.com> Co-authored-by: lemsn <lemsn@126.com> Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com> Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com> Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
2025-10-09 12:36:19 +08:00
if msg.endswith("created task graphrag") or msg.endswith("created task raptor") or msg.endswith("created task mindmap"):
info["progress_msg"] += "\n%d tasks are ahead in the queue..." % get_queue_length(priority)
else:
info["progress_msg"] = "%d tasks are ahead in the queue..." % get_queue_length(priority)
info["update_time"] = current_timestamp()
info["update_date"] = get_format_time()
(cls.model.update(info).where((cls.model.id == d["id"]) & ((cls.model.run.is_null(True)) | (cls.model.run != TaskStatus.CANCEL.value))).execute())
except Exception as e:
if str(e).find("'0'") < 0:
logging.exception("fetch task exception")
@classmethod
@DB.connection_context()
def get_kb_doc_count(cls, kb_id):
return cls.model.select().where(cls.model.kb_id == kb_id).count()
@classmethod
@DB.connection_context()
def get_all_kb_doc_count(cls):
result = {}
rows = cls.model.select(cls.model.kb_id, fn.COUNT(cls.model.id).alias("count")).group_by(cls.model.kb_id)
for row in rows:
result[row.kb_id] = row.count
return result
@classmethod
@DB.connection_context()
def do_cancel(cls, doc_id):
try:
_, doc = DocumentService.get_by_id(doc_id)
return doc.run == TaskStatus.CANCEL.value or doc.progress < 0
except Exception:
pass
return False
@classmethod
@DB.connection_context()
def knowledgebase_basic_info(cls, kb_id: str) -> dict[str, int]:
# cancelled: run == "2"
cancelled = cls.model.select(fn.COUNT(1)).where((cls.model.kb_id == kb_id) & (cls.model.run == TaskStatus.CANCEL)).scalar()
downloaded = cls.model.select(fn.COUNT(1)).where(cls.model.kb_id == kb_id, cls.model.source_type != "local").scalar()
row = (
cls.model.select(
# finished: progress == 1
fn.COALESCE(fn.SUM(Case(None, [(cls.model.progress == 1, 1)], 0)), 0).alias("finished"),
# failed: progress == -1
fn.COALESCE(fn.SUM(Case(None, [(cls.model.progress == -1, 1)], 0)), 0).alias("failed"),
# processing: 0 <= progress < 1
fn.COALESCE(
fn.SUM(
Case(
None,
[
(((cls.model.progress == 0) | ((cls.model.progress > 0) & (cls.model.progress < 1))), 1),
],
0,
)
),
0,
).alias("processing"),
)
.where((cls.model.kb_id == kb_id) & ((cls.model.run.is_null(True)) | (cls.model.run != TaskStatus.CANCEL)))
.dicts()
.get()
)
return {"processing": int(row["processing"]), "finished": int(row["finished"]), "failed": int(row["failed"]), "cancelled": int(cancelled), "downloaded": int(downloaded)}
@classmethod
def run(cls, tenant_id: str, doc: dict, kb_table_num_map: dict):
from api.db.services.task_service import queue_dataflow, queue_tasks
from api.db.services.file2document_service import File2DocumentService
doc["tenant_id"] = tenant_id
doc_parser = doc.get("parser_id", ParserType.NAIVE)
if doc_parser == ParserType.TABLE:
kb_id = doc.get("kb_id")
if not kb_id:
return
if kb_id not in kb_table_num_map:
count = DocumentService.count_by_kb_id(kb_id=kb_id, keywords="", run_status=[TaskStatus.DONE], types=[])
kb_table_num_map[kb_id] = count
if kb_table_num_map[kb_id] <= 0:
KnowledgebaseService.delete_field_map(kb_id)
if doc.get("pipeline_id", ""):
queue_dataflow(tenant_id, flow_id=doc["pipeline_id"], task_id=get_uuid(), doc_id=doc["id"])
else:
bucket, name = File2DocumentService.get_storage_address(doc_id=doc["id"])
queue_tasks(doc, bucket, name, 0)
def queue_raptor_o_graphrag_tasks(sample_doc, ty, priority, fake_doc_id="", doc_ids=None):
Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423) ### What problem does this PR solve? #9869 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: jinhai <haijin.chn@gmail.com> Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com> Co-authored-by: TeslaZY <TeslaZY@outlook.com> Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com> Co-authored-by: AB <aj@Ajays-MacBook-Air.local> Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com> Co-authored-by: He Wang <wanghechn@qq.com> Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com> Co-authored-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com> Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box> Co-authored-by: Stephen Hu <stephenhu@seismic.com> Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com> Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com> Co-authored-by: mxc <mxc@example.com> Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com> Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com> Co-authored-by: mcoder6425 <mcoder64@gmail.com> Co-authored-by: lemsn <lemsn@msn.com> Co-authored-by: lemsn <lemsn@126.com> Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com> Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com> Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
2025-10-09 12:36:19 +08:00
"""
You can provide a fake_doc_id to bypass the restriction of tasks at the knowledgebase level.
Optionally, specify a list of doc_ids to determine which documents participate in the task.
"""
if doc_ids is None:
doc_ids = []
assert ty in ["graphrag", "raptor", "mindmap"], "type should be graphrag, raptor or mindmap"
chunking_config = DocumentService.get_chunking_config(sample_doc["id"])
hasher = xxhash.xxh64()
for field in sorted(chunking_config.keys()):
hasher.update(str(chunking_config[field]).encode("utf-8"))
def new_task():
return {
"id": get_uuid(),
"doc_id": fake_doc_id,
Fix: Remove hardcoded page limits causing parsing failures on large PDFs (>300 pages) (#14382) ### What problem does this PR solve? Fixes #14196 ## Problem When using DeepDOC to parse large PDFs (over 1000 pages), the parser silently truncated processing at 300 pages due to a hardcoded default `page_to=299` in `RAGFlowPdfParser.__images__()`. This caused: - **Errors** on pages beyond the limit - **Poor image quality** as the parser attempted to compensate with missing page data - **Inconsistent chunk splitting** between full PDF imports and partial imports Additionally, the codebase scattered magic numbers (`299`, `600`, `10000`, `100000`, `100000000`, `10000000000`, `10**9`) across 22 files as sentinel values for "parse all pages", making future maintenance error-prone. ## Root Cause ```python # deepdoc/parser/pdf_parser.py (before) def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None): # Only the first 300 pages were rendered; everything beyond was silently dropped ``` While most callers in `rag/app/*.py` correctly passed `to_page=100000`, the base class `RAGFlowPdfParser.__call__()` and `parse_into_bboxes()` invoked `__images__` **without** forwarding `page_from`/`page_to`, falling back to the restrictive default of 299. ## Solution ### 1. Define constants in `common/constants.py` ```python MAXIMUM_PAGE_NUMBER = 100000 # Used by the parsing layer MAXIMUM_TASK_PAGE_NUMBER = MAXIMUM_PAGE_NUMBER * 1000 # Used by the task/DB layer ``` ### 2. Replace all hardcoded sentinel values | Layer | Files Changed | Old Values | New Value | |---|---|---|---| | **Deepdoc parsers** | `pdf_parser.py`, `mineru_parser.py`, `docling_parser.py`, `opendataloader_parser.py`, `paddleocr_parser.py`, `docx_parser.py` | `299`, `600`, `10**9`, `100000000` | `MAXIMUM_PAGE_NUMBER` | | **Chunk parsers** | `naive.py`, `book.py`, `qa.py`, `one.py`, `manual.py`, `paper.py`, `presentation.py`, `laws.py`, `resume.py`, `email.py`, `table.py` | `100000`, `10000`, `10000000000` | `MAXIMUM_PAGE_NUMBER` | | **Task/DB layer** | `db_models.py`, `task_service.py`, `document_service.py`, `file_service.py` | `100000000` | `MAXIMUM_TASK_PAGE_NUMBER` | ### 3. Fix `parse_into_bboxes()` missing parameters Added `from_page`/`to_page` parameters to `parse_into_bboxes()` so that the `rag/flow/parser/parser.py` DeepDOC path no longer falls back to the restrictive default. ## Files Changed (22) - `common/constants.py` - `deepdoc/parser/pdf_parser.py` - `deepdoc/parser/mineru_parser.py` - `deepdoc/parser/docling_parser.py` - `deepdoc/parser/opendataloader_parser.py` - `deepdoc/parser/paddleocr_parser.py` - `deepdoc/parser/docx_parser.py` - `rag/app/naive.py` - `rag/app/book.py` - `rag/app/qa.py` - `rag/app/one.py` - `rag/app/manual.py` - `rag/app/paper.py` - `rag/app/presentation.py` - `rag/app/laws.py` - `rag/app/resume.py` - `rag/app/email.py` - `rag/app/table.py` - `api/db/db_models.py` - `api/db/services/task_service.py` - `api/db/services/document_service.py` - `api/db/services/file_service.py` ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [x] Refactoring --------- Signed-off-by: noob <yixiao121314@outlook.com>
2026-04-27 06:57:20 +00:00
"from_page": MAXIMUM_TASK_PAGE_NUMBER,
"to_page": MAXIMUM_TASK_PAGE_NUMBER,
"task_type": ty,
"progress_msg": datetime.now().strftime("%H:%M:%S") + " created task " + ty,
"begin_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
}
task = new_task()
for field in ["doc_id", "from_page", "to_page"]:
hasher.update(str(task.get(field, "")).encode("utf-8"))
hasher.update(ty.encode("utf-8"))
task["digest"] = hasher.hexdigest()
bulk_insert_into_db(Task, [task], True)
Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423) ### What problem does this PR solve? #9869 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: jinhai <haijin.chn@gmail.com> Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com> Co-authored-by: TeslaZY <TeslaZY@outlook.com> Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com> Co-authored-by: AB <aj@Ajays-MacBook-Air.local> Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com> Co-authored-by: He Wang <wanghechn@qq.com> Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com> Co-authored-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com> Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box> Co-authored-by: Stephen Hu <stephenhu@seismic.com> Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com> Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com> Co-authored-by: mxc <mxc@example.com> Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com> Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com> Co-authored-by: mcoder6425 <mcoder64@gmail.com> Co-authored-by: lemsn <lemsn@msn.com> Co-authored-by: lemsn <lemsn@126.com> Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com> Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com> Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
2025-10-09 12:36:19 +08:00
task["doc_ids"] = doc_ids
DocumentService.begin2parse(task["doc_id"], keep_progress=True)
assert REDIS_CONN.queue_product(settings.get_svr_queue_name(priority, ty), message=task), "Can't access Redis. Please check the Redis' status."
Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423) ### What problem does this PR solve? #9869 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: jinhai <haijin.chn@gmail.com> Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com> Co-authored-by: TeslaZY <TeslaZY@outlook.com> Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com> Co-authored-by: AB <aj@Ajays-MacBook-Air.local> Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com> Co-authored-by: He Wang <wanghechn@qq.com> Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com> Co-authored-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com> Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box> Co-authored-by: Stephen Hu <stephenhu@seismic.com> Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com> Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com> Co-authored-by: mxc <mxc@example.com> Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com> Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com> Co-authored-by: mcoder6425 <mcoder64@gmail.com> Co-authored-by: lemsn <lemsn@msn.com> Co-authored-by: lemsn <lemsn@126.com> Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com> Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com> Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
2025-10-09 12:36:19 +08:00
return task["id"]
def get_queue_length(priority, suffix="common"):
group_info = REDIS_CONN.queue_info(settings.get_svr_queue_name(priority, suffix), SVR_CONSUMER_GROUP_NAME)
if not group_info:
return 0
return int(group_info.get("lag", 0) or 0)