2024-08-15 09:17:36 +08:00
|
|
|
#
|
|
|
|
|
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
|
|
|
|
#
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
|
#
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
#
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
|
# limitations under the License.
|
|
|
|
|
#
|
2025-03-14 16:31:44 +08:00
|
|
|
import logging
|
2024-08-15 09:17:36 +08:00
|
|
|
import random
|
|
|
|
|
from datetime import datetime
|
2024-08-15 19:30:43 +08:00
|
|
|
|
2025-03-14 16:31:44 +08:00
|
|
|
import xxhash
|
2025-10-09 12:36:19 +08:00
|
|
|
from peewee import fn, Case, JOIN
|
2024-08-15 09:17:36 +08:00
|
|
|
|
2025-07-30 19:41:09 +08:00
|
|
|
from api.constants import IMG_BASE64_PREFIX, FILE_NAME_LEN_LIMIT
|
2025-11-12 12:03:41 +08:00
|
|
|
from api.db import PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES, FileType, UserTenantRole, CanvasCategory
|
2026-03-10 18:05:45 +08:00
|
|
|
from api.db.db_models import DB, Document, Knowledgebase, Task, Tenant, UserTenant, File2Document, File, UserCanvas, User
|
2025-03-14 16:31:44 +08:00
|
|
|
from api.db.db_utils import bulk_insert_into_db
|
2026-03-12 12:39:01 +08:00
|
|
|
from api.db.services.common_service import CommonService, retry_deadlock_operation
|
2024-08-15 09:17:36 +08:00
|
|
|
from api.db.services.knowledgebase_service import KnowledgebaseService
|
2026-01-28 13:29:34 +08:00
|
|
|
from api.db.services.doc_metadata_service import DocMetadataService
|
2026-04-27 20:35:00 +08:00
|
|
|
|
|
|
|
|
from common import settings
|
|
|
|
|
from common.constants import ParserType, StatusEnum, TaskStatus, SVR_CONSUMER_GROUP_NAME, MAXIMUM_TASK_PAGE_NUMBER
|
|
|
|
|
from common.doc_store.doc_store_base import OrderByExpr
|
2025-10-31 16:42:01 +08:00
|
|
|
from common.misc_utils import get_uuid
|
2025-10-28 19:09:14 +08:00
|
|
|
from common.time_utils import current_timestamp, get_format_time
|
2026-04-27 20:35:00 +08:00
|
|
|
|
|
|
|
|
from rag.nlp import search
|
2024-08-15 09:17:36 +08:00
|
|
|
from rag.utils.redis_conn import REDIS_CONN
|
|
|
|
|
|
2025-11-18 17:05:16 +08:00
|
|
|
|
2024-08-15 09:17:36 +08:00
|
|
|
class DocumentService(CommonService):
|
|
|
|
|
model = Document
|
|
|
|
|
|
2025-07-25 19:26:31 +08:00
|
|
|
@classmethod
def get_cls_model_fields(cls):
    """Return the model columns shared by the document listing queries.

    The columns are looked up on ``cls.model`` by attribute name; the order
    of the names below is the order of the returned list.
    """
    column_names = (
        "id",
        "thumbnail",
        "kb_id",
        "parser_id",
        "pipeline_id",
        "parser_config",
        "source_type",
        "type",
        "created_by",
        "name",
        "location",
        "size",
        "token_num",
        "chunk_num",
        "progress",
        "progress_msg",
        "process_begin_at",
        "process_duration",
        "suffix",
        "run",
        "status",
        "create_time",
        "create_date",
        "update_time",
        "update_date",
    )
    return [getattr(cls.model, column) for column in column_names]
|
|
|
|
|
|
2024-10-12 19:35:19 +08:00
|
|
|
@classmethod
@DB.connection_context()
def get_list(cls, kb_id, page_number, items_per_page, orderby, desc, keywords, id, name, suffix=None, run=None, doc_ids=None):
    """List one page of documents of a knowledge base with their metadata.

    Args:
        kb_id: Knowledge base whose documents are listed.
        page_number, items_per_page: Passed to ``paginate``.
        orderby: Column name resolved through ``cls.model.getter_by``.
        desc: Truthy for descending order, otherwise ascending.
        keywords: Optional case-insensitive substring of the document name.
        id: Optional exact document id.
        name: Optional exact document name.
        suffix: Optional collection of suffixes to keep.
        run: Optional collection of run statuses to keep.
        doc_ids: Optional collection of ids to keep; only ``None`` disables
            the filter, so an empty collection is still applied.

    Returns:
        ``(rows, count)`` where ``rows`` are dicts for the requested page,
        each with a ``meta_fields`` entry, and ``count`` is the number of
        matching documents before pagination.
    """
    pipeline_match = (cls.model.pipeline_id == UserCanvas.id) & (UserCanvas.canvas_category == CanvasCategory.DataFlow.value)
    query = (
        cls.model.select(*cls.get_cls_model_fields(), UserCanvas.title)
        .join(File2Document, on=(File2Document.document_id == cls.model.id))
        .join(File, on=(File.id == File2Document.file_id))
        .join(UserCanvas, on=pipeline_match, join_type=JOIN.LEFT_OUTER)
        .where(cls.model.kb_id == kb_id)
    )

    # Collect the optional filters first, then chain them in the same order.
    conditions = []
    if id:
        conditions.append(cls.model.id == id)
    if name:
        conditions.append(cls.model.name == name)
    if keywords:
        conditions.append(fn.LOWER(cls.model.name).contains(keywords.lower()))
    if doc_ids is not None:
        conditions.append(cls.model.id.in_(doc_ids))
    if suffix:
        conditions.append(cls.model.suffix.in_(suffix))
    if run:
        conditions.append(cls.model.run.in_(run))
    for condition in conditions:
        query = query.where(condition)

    sort_column = cls.model.getter_by(orderby)
    query = query.order_by(sort_column.desc() if desc else sort_column.asc())

    # Count the full match set before narrowing the query to one page.
    count = query.count()
    rows = list(query.paginate(page_number, items_per_page).dicts())

    page_ids = [row["id"] for row in rows]
    metadata_map = DocMetadataService.get_metadata_for_documents(page_ids, kb_id) if page_ids else {}
    for row in rows:
        row["meta_fields"] = metadata_map.get(row["id"], {})
    return rows, count
|
2024-10-12 19:35:19 +08:00
|
|
|
|
2025-07-30 19:41:09 +08:00
|
|
|
@classmethod
@DB.connection_context()
def check_doc_health(cls, tenant_id: str, filename):
    """Validate that a tenant may add a file with the given name.

    Args:
        tenant_id: Tenant whose current document count is checked.
        filename: Candidate file name; its UTF-8 byte length is checked.

    Returns:
        True when both checks pass.

    Raises:
        RuntimeError: If the ``MAX_FILE_NUM_PER_USER`` quota is positive and
            already reached, or the encoded name exceeds
            ``FILE_NAME_LEN_LIMIT`` bytes.
    """
    import os

    # A quota of 0 (the default when the variable is unset) disables the check,
    # in which case the document count is never queried.
    file_quota = int(os.environ.get("MAX_FILE_NUM_PER_USER", 0))
    if file_quota > 0 and DocumentService.get_doc_count(tenant_id) >= file_quota:
        raise RuntimeError("Exceed the maximum file number of a free user!")

    encoded_name = filename.encode("utf-8")
    if len(encoded_name) > FILE_NAME_LEN_LIMIT:
        raise RuntimeError("Exceed the maximum length of file name!")

    return True
|
|
|
|
|
|
2024-08-15 09:17:36 +08:00
|
|
|
@classmethod
@DB.connection_context()
def get_by_kb_id(cls, kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types, suffix, name=None, doc_ids=None, return_empty_metadata=False):
    """List documents of a knowledge base with pipeline name and creator nickname.

    Args:
        kb_id: Knowledge base whose documents are listed.
        page_number, items_per_page: Pagination is applied only when both
            are truthy; otherwise every matching row is returned.
        orderby: Column name resolved through ``cls.model.getter_by``.
        desc: Truthy for descending order, otherwise ascending.
        keywords: Optional case-insensitive substring of the document name.
        run_status, types, suffix: Optional collections used as ``IN`` filters.
        name: Optional exact document name.
        doc_ids: Optional collection of ids to keep; only ``None`` disables
            the filter.
        return_empty_metadata: When truthy, documents that appear in the
            metadata map of the knowledge base are excluded and every
            returned row gets an empty ``meta_fields``.

    Returns:
        ``(rows, count)`` where ``rows`` are dicts carrying ``meta_fields``
        and ``count`` is the number of matches before pagination.
    """
    columns = [*cls.get_cls_model_fields(), UserCanvas.title.alias("pipeline_name"), User.nickname]
    on_file = File.id == File2Document.file_id
    on_pipeline = cls.model.pipeline_id == UserCanvas.id
    on_creator = cls.model.created_by == User.id

    query = cls.model.select(*columns).join(File2Document, on=(File2Document.document_id == cls.model.id))
    # The two branches join File and UserCanvas in a different order; that
    # order is kept as-is so the generated SQL does not change.
    if keywords:
        query = (
            query.join(File, on=on_file)
            .join(UserCanvas, on=on_pipeline, join_type=JOIN.LEFT_OUTER)
            .join(User, on=on_creator, join_type=JOIN.LEFT_OUTER)
            .where((cls.model.kb_id == kb_id), (fn.LOWER(cls.model.name).contains(keywords.lower())))
        )
    else:
        query = (
            query.join(UserCanvas, on=on_pipeline, join_type=JOIN.LEFT_OUTER)
            .join(File, on=on_file)
            .join(User, on=on_creator, join_type=JOIN.LEFT_OUTER)
            .where(cls.model.kb_id == kb_id)
        )

    conditions = []
    if doc_ids is not None:
        conditions.append(cls.model.id.in_(doc_ids))
    if run_status:
        conditions.append(cls.model.run.in_(run_status))
    if types:
        conditions.append(cls.model.type.in_(types))
    if suffix:
        conditions.append(cls.model.suffix.in_(suffix))
    if name:
        conditions.append(cls.model.name == name)
    for condition in conditions:
        query = query.where(condition)

    if return_empty_metadata:
        # Exclude every document that has an entry in the metadata map.
        ids_with_metadata = set(DocMetadataService.get_metadata_for_documents(None, kb_id).keys())
        if ids_with_metadata:
            query = query.where(cls.model.id.not_in(ids_with_metadata))

    count = query.count()

    sort_column = cls.model.getter_by(orderby)
    query = query.order_by(sort_column.desc() if desc else sort_column.asc())

    if page_number and items_per_page:
        query = query.paginate(page_number, items_per_page)

    rows = list(query.dicts())
    if return_empty_metadata:
        for row in rows:
            row["meta_fields"] = {}
        return rows, count

    page_ids = [row["id"] for row in rows]
    metadata_map = DocMetadataService.get_metadata_for_documents(page_ids, kb_id) if page_ids else {}
    for row in rows:
        row["meta_fields"] = metadata_map.get(row["id"], {})
    return rows, count
|
2024-08-15 09:17:36 +08:00
|
|
|
|
2025-07-09 09:33:11 +08:00
|
|
|
@classmethod
@DB.connection_context()
def get_filter_by_kb_id(cls, kb_id, keywords, run_status, types, suffix):
    """Build the facet counters shown next to a document list.

    Args:
        kb_id: Knowledge base whose documents are counted.
        keywords: Optional case-insensitive substring of the document name.
        run_status, types, suffix: Optional collections used as ``IN`` filters.

    Returns:
        ``(facets, total)`` where ``total`` is the number of matching
        documents and ``facets`` has this shape::

            {
                "suffix": {"ppt": 1, "docx": 2},
                "run_status": {"1": 2, "2": 2},
                "metadata": {
                    "key1": {"key1_value1": 1, "key1_value2": 2},
                    "key2": {"key2_value1": 2, "key2_value2": 1},
                    "empty_metadata": {"true": <documents without usable metadata>},
                },
            }

        In ``run_status``, "1" => RUNNING and "2" => CANCEL.
    """
    query = (
        cls.model.select(cls.model.run, cls.model.suffix, cls.model.id)
        .join(File2Document, on=(File2Document.document_id == cls.model.id))
        .join(File, on=(File.id == File2Document.file_id))
        .where(cls.model.kb_id == kb_id)
    )
    if keywords:
        query = query.where(fn.LOWER(cls.model.name).contains(keywords.lower()))
    if run_status:
        query = query.where(cls.model.run.in_(run_status))
    if types:
        query = query.where(cls.model.type.in_(types))
    if suffix:
        query = query.where(cls.model.suffix.in_(suffix))

    total = query.count()
    records = list(query)

    metadata = {}
    matched_ids = [record.id for record in records]
    if matched_ids:
        # Metadata is optional for the facets: log the failure and carry on.
        try:
            metadata = DocMetadataService.get_metadata_for_documents(matched_ids, kb_id)
        except Exception as e:
            logging.warning(f"Failed to fetch metadata from ES/Infinity: {e}")

    suffix_counter = {}
    run_status_counter = {}
    metadata_counter = {}
    empty_metadata_count = 0

    for record in records:
        suffix_counter[record.suffix] = suffix_counter.get(record.suffix, 0) + 1
        run_key = str(record.run)
        run_status_counter[run_key] = run_status_counter.get(run_key, 0) + 1

        meta_fields = metadata.get(record.id, {})
        counted_any = False
        if meta_fields:
            for key, raw_value in meta_fields.items():
                candidates = raw_value if isinstance(raw_value, list) else [raw_value]
                for candidate in candidates:
                    # None and blank strings do not count as metadata values.
                    if candidate is None:
                        continue
                    if isinstance(candidate, str) and not candidate.strip():
                        continue
                    label = str(candidate)
                    bucket = metadata_counter.setdefault(key, {})
                    bucket[label] = bucket.get(label, 0) + 1
                    counted_any = True
        if not counted_any:
            empty_metadata_count += 1

    metadata_counter["empty_metadata"] = {"true": empty_metadata_count}
    facets = {
        "suffix": suffix_counter,
        "run_status": run_status_counter,
        "metadata": metadata_counter,
    }
    return facets, total
|
|
|
|
|
|
2026-03-10 18:05:45 +08:00
|
|
|
@classmethod
@DB.connection_context()
def get_parsing_status_by_kb_ids(cls, kb_ids: list[str]) -> dict[str, dict[str, int]]:
    """Count documents per run status for each requested dataset.

    Every kb_id in ``kb_ids`` gets an entry with these counters, all
    starting at 0:

    - unstart_count  (TaskStatus.UNSTART)
    - running_count  (TaskStatus.RUNNING)
    - cancel_count   (TaskStatus.CANCEL)
    - done_count     (TaskStatus.DONE)
    - fail_count     (TaskStatus.FAIL)

    Returns:
        A dict keyed by kb_id, e.g.
        ``{"kb-abc": {"unstart_count": 10, "running_count": 2, ...}, ...}``;
        an empty dict when ``kb_ids`` is empty.
    """
    if not kb_ids:
        return {}

    counter_name_of = {
        TaskStatus.UNSTART.value: "unstart_count",
        TaskStatus.RUNNING.value: "running_count",
        TaskStatus.CANCEL.value: "cancel_count",
        TaskStatus.DONE.value: "done_count",
        TaskStatus.FAIL.value: "fail_count",
    }

    # Pre-fill so datasets without documents still report zeros.
    result: dict[str, dict[str, int]] = {}
    for requested_id in kb_ids:
        result[requested_id] = {counter: 0 for counter in counter_name_of.values()}

    grouped = cls.model.select(cls.model.kb_id, cls.model.run, fn.COUNT(cls.model.id).alias("cnt"))
    grouped = grouped.where(cls.model.kb_id.in_(kb_ids)).group_by(cls.model.kb_id, cls.model.run)

    for row in grouped.dicts():
        counter = counter_name_of.get(str(row["run"]))
        # Rows with an unmapped run value are ignored.
        if not counter:
            continue
        if row["kb_id"] in result:
            result[row["kb_id"]][counter] = int(row["cnt"])

    return result
|
|
|
|
|
|
2025-05-06 09:44:38 +08:00
|
|
|
@classmethod
@DB.connection_context()
def count_by_kb_id(cls, kb_id, keywords, run_status, types):
    """Count the documents of a knowledge base that match the filters.

    Args:
        kb_id: Knowledge base to count in.
        keywords: Optional case-insensitive substring of the document name.
        run_status: Optional collection of run statuses to keep.
        types: Optional collection of document types to keep.

    Returns:
        The number of matching documents.
    """
    query = cls.model.select().where(cls.model.kb_id == kb_id)
    if keywords:
        query = query.where(fn.LOWER(cls.model.name).contains(keywords.lower()))
    if run_status:
        query = query.where(cls.model.run.in_(run_status))
    if types:
        query = query.where(cls.model.type.in_(types))
    return query.count()
|
|
|
|
|
|
2025-05-09 11:48:54 +08:00
|
|
|
@classmethod
@DB.connection_context()
def get_total_size_by_kb_id(cls, kb_id, keywords="", run_status=None, types=None):
    """Return the summed ``size`` of the matching documents of a knowledge base.

    Args:
        kb_id: Knowledge base to sum over.
        keywords: Optional case-insensitive substring of the document name.
        run_status: Optional collection of run statuses to keep; ``None`` or
            an empty collection disables the filter.
        types: Optional collection of document types to keep; ``None`` or an
            empty collection disables the filter.

    Returns:
        The total size as an int, 0 when nothing matches.
    """
    query = cls.model.select(fn.COALESCE(fn.SUM(cls.model.size), 0)).where(cls.model.kb_id == kb_id)

    if keywords:
        query = query.where(fn.LOWER(cls.model.name).contains(keywords.lower()))
    if run_status:
        query = query.where(cls.model.run.in_(run_status))
    if types:
        query = query.where(cls.model.type.in_(types))

    # Apply the fallback before converting: int(None) would raise TypeError,
    # so the fallback has to sit inside the int() call to be reachable.
    return int(query.scalar() or 0)
|
2025-05-09 11:48:54 +08:00
|
|
|
|
2025-09-29 10:16:13 +08:00
|
|
|
@classmethod
@DB.connection_context()
def get_all_doc_ids_by_kb_ids(cls, kb_ids):
    """Return ``{"id", "kb_id"}`` dicts for every document in the given knowledge bases.

    Rows are fetched in batches of 100 with OFFSET/LIMIT, ordered by
    ``create_time`` ascending.

    Args:
        kb_ids: Collection of knowledge base ids.

    Returns:
        A list of dicts, one per document.
    """
    fields = [cls.model.id, cls.model.kb_id]
    # order_by() returns a new query, so its result must be kept; without the
    # assignment the OFFSET/LIMIT batches below would run unordered and could
    # skip or repeat rows.
    docs = cls.model.select(*fields).where(cls.model.kb_id.in_(kb_ids)).order_by(cls.model.create_time.asc())

    # NOTE: deep OFFSET pagination may become a slow query; optimize later.
    offset, limit = 0, 100
    res = []
    while True:
        batch = list(docs.offset(offset).limit(limit).dicts())
        if not batch:
            break
        res.extend(batch)
        offset += limit
    return res
|
|
|
|
|
|
2026-04-09 16:40:14 +08:00
|
|
|
@classmethod
@DB.connection_context()
def list_doc_headers_by_kb_and_source_type(cls, kb_id, source_type, page_size=500):
    """Return id, kb_id, source_type and name of the documents of one source type.

    Rows are read in ``page_size`` batches with OFFSET/LIMIT, ordered by
    ``create_time`` ascending.

    Args:
        kb_id: Knowledge base to read from.
        source_type: Exact ``source_type`` value to match.
        page_size: Number of rows fetched per batch.

    Returns:
        A list of dicts, one per matching document.
    """
    ordered = (
        cls.model.select(cls.model.id, cls.model.kb_id, cls.model.source_type, cls.model.name)
        .where(cls.model.kb_id == kb_id, cls.model.source_type == source_type)
        .order_by(cls.model.create_time.asc())
    )

    headers = []
    offset = 0
    # An empty batch marks the end of the result set.
    while batch := list(ordered.offset(offset).limit(page_size).dicts()):
        headers.extend(batch)
        offset += page_size
    return headers
|
|
|
|
|
|
feat(connectors): ETag-based bypass for incremental S3 ingestion (#14628) (#14677)
### What problem does this PR solve?
S3-family connector syncs currently re-download every in-window object
just so we can compute `xxhash128(blob)` and compare against
`Document.content_hash`. Anything that bumps `LastModified` without
changing bytes (`aws s3 cp` touches, bucket re-encryption, etc.) pays
full bandwidth and re-parses files that didn't actually change. #14628
covers the broader incremental-ingestion redesign; this PR is the first
slice.
The fix is a pre-listing short-circuit. `BlobStorageConnector` (S3 / R2
/ GCS / OCI / S3-compat) now implements a new `FingerprintConnector`
interface: `list_keys()` paginates `list_objects_v2` and yields
`KeyRecord(key, fingerprint)` where `fingerprint = xxhash128(ETag)`. The
orchestrator joins those against the connector's existing `{doc_id:
content_hash}` map and only calls `get_value(key)` when the fingerprint
differs. Unchanged keys are skipped entirely — no `GetObject`, no
re-parse.
No DDL. xxhash128(ETag) is 32 hex chars and reuses the existing
`Document.content_hash` column per @yingfeng's suggestion; the connector
decides at listing time whether to populate it. Local uploads and
connectors that don't opt in fall through to the existing post-download
`xxhash128(blob)` path with no behavior change.
This is PR-1 of a 4-PR series — full design lives on #14628. Subsequent
PRs extend tier 1 to local FS / WebDAV / Dropbox / Seafile / RDBMS
(PR-2), wire up tier 2 cursor connectors with `SyncLogs.next_checkpoint`
(PR-3), and unify deletion via `KeyRecord(deleted=True)` reconciliation
(PR-4). Holding those back keeps this PR additive and reviewable on its
own.
#### Files touched
- `common/data_source/models.py` — new `KeyRecord`; optional
`fingerprint` on `Document`
- `common/data_source/interfaces.py` — `IncrementalCapability` enum,
`FingerprintConnector` ABC
- `common/data_source/blob_connector.py` — `BlobStorageConnector`
implements `FingerprintConnector`; per-object download factored into
`_build_document_from_obj()` so `_yield_blob_objects`, `list_keys`,
`get_value` all share it
- `rag/svr/sync_data_source.py` —
`_BlobLikeBase._fingerprint_filtered_generator` does the bypass loop;
`_run_task_logic` plumbs `doc.fingerprint` into the upload dict
- `api/db/services/document_service.py` —
`list_id_content_hash_map_by_kb_and_source_type()` helper
- `api/db/services/connector_service.py` + `file_service.py` —
fingerprint flows through `duplicate_and_parse → upload_document` and
lands in `content_hash`
- `test/unit_test/common/test_blob_connector_fingerprint.py` — 14 tests
covering ETag normalization (single-part, multipart, quoted, empty),
`list_keys()` not calling `GetObject`, `get_value()` materializing with
fingerprint, deterministic/stable fingerprints, and the bypass loop
asserting `GetObject` is *not* called on a match
#### Worth flagging for review
Old `_BlobLikeBase._generate` called `poll_source(start, now)` with a
`LastModified` window when `poll_range_start` was set. New code uses
`_fingerprint_filtered_generator` (full bucket listing + fingerprint
compare) outside of explicit `reindex=1`. Strictly better for
unchanged-bucket cases since it skips `GetObject`, but it does mean
every sync now does a full `list_objects_v2` paginate. Should still be
cheap for most buckets — flagging in case anyone has a very large bucket
where the time-window filter was meaningful.
On migration: existing rows have `content_hash = xxhash128(blob)` from
the old code. The first sync after this lands sees ETag-derived
fingerprints that don't match, re-fetches every object once, and writes
the new fingerprint. From the second sync onward the bypass works as
expected. "Slow day one, fast every day after." A `fingerprint_backfill:
trust` opt-out is sketched in the design doc but not in this PR.
#### Test plan
- [x] `uv run ruff check` — clean on all 8 touched files
- [x] `uv run pytest
test/unit_test/common/test_blob_connector_fingerprint.py -v` — 14 passed
- [x] Broader unit-test suite — no regressions in anything I touched
- [ ] Manual smoke against a real S3 bucket — configure a connector, run
sync twice, expect the second sync to log `bypassed=N, fetched=0` and no
`GetObject` calls in CloudTrail / bucket access logs
- [ ] Manual smoke with `reindex=1` — confirm the full re-download path
still works
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
---------
Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
2026-05-09 05:03:56 -07:00
|
|
|
@classmethod
@DB.connection_context()
def list_id_content_hash_map_by_kb_and_source_type(cls, kb_id, source_type, page_size=500):
    """Map each existing document id of a connector to its content_hash.

    The fingerprint-bypass path uses this map to decide which keys can skip
    a re-fetch: when the connector's listing fingerprint equals the stored
    content_hash, the body has not changed since the last sync.

    The query is ordered by create_time so that LIMIT/OFFSET pagination
    stays stable under concurrent writes; without an order, page boundaries
    could drop or duplicate rows and the map would silently miss entries.

    Args:
        kb_id: Knowledge base to read from.
        source_type: Exact ``source_type`` value to match.
        page_size: Number of rows fetched per batch.

    Returns:
        ``{doc_id: content_hash}``; a missing or empty hash is stored as "".
    """
    ordered = (
        cls.model.select(cls.model.id, cls.model.content_hash)
        .where(cls.model.kb_id == kb_id, cls.model.source_type == source_type)
        .order_by(cls.model.create_time.asc())
    )

    result: dict[str, str] = {}
    offset = 0
    while batch := list(ordered.offset(offset).limit(page_size).dicts()):
        result.update({row["id"]: row.get("content_hash") or "" for row in batch})
        offset += page_size
    return result
|
|
|
|
|
|
2025-09-29 10:16:13 +08:00
|
|
|
@classmethod
@DB.connection_context()
def get_all_docs_by_creator_id(cls, creator_id):
    """Return every document created by a user, with its knowledge base tenant.

    Each row carries the document ``id``, ``kb_id``, ``token_num``,
    ``chunk_num`` and the ``tenant_id`` of the joined knowledge base. Rows
    are fetched in batches of 100 with OFFSET/LIMIT, ordered by
    ``create_time`` ascending.

    Args:
        creator_id: Value matched against the document's ``created_by``.

    Returns:
        A list of dicts, one per document.
    """
    fields = [cls.model.id, cls.model.kb_id, cls.model.token_num, cls.model.chunk_num, Knowledgebase.tenant_id]
    # order_by() returns a new query, so its result must be kept; without the
    # assignment the OFFSET/LIMIT batches below would run unordered and could
    # skip or repeat rows.
    docs = (
        cls.model.select(*fields)
        .join(Knowledgebase, on=(Knowledgebase.id == cls.model.kb_id))
        .where(cls.model.created_by == creator_id)
        .order_by(cls.model.create_time.asc())
    )

    # NOTE: deep OFFSET pagination may become a slow query; optimize later.
    offset, limit = 0, 100
    res = []
    while True:
        batch = list(docs.offset(offset).limit(limit).dicts())
        if not batch:
            break
        res.extend(batch)
        offset += limit
    return res
|
|
|
|
|
|
2024-08-15 09:17:36 +08:00
|
|
|
@classmethod
@DB.connection_context()
def insert(cls, doc):
    """Save a document row and bump the document counter of its knowledge base.

    Args:
        doc: Dict of document fields; must contain ``kb_id``.

    Returns:
        A ``Document`` instance built from ``doc``.

    Raises:
        RuntimeError: If saving the document or increasing the knowledge
            base counter reports failure.
    """
    saved = cls.save(**doc)
    if not saved:
        raise RuntimeError("Database error (Document)!")

    counter_updated = KnowledgebaseService.atomic_increase_doc_num_by_id(doc["kb_id"])
    if not counter_updated:
        raise RuntimeError("Database error (Knowledgebase)!")

    return Document(**doc)
|
2024-08-15 09:17:36 +08:00
|
|
|
|
|
|
|
|
@classmethod
@DB.connection_context()
def remove_document(cls, doc, tenant_id):
    """Delete a document and clean up everything that hangs off it.

    The database rows go first via ``delete_document_and_update_kb_counts``;
    when that call returns a falsy value no further cleanup is attempted.
    Every later step is best-effort: a failure is logged and the remaining
    steps still run.

    Args:
        doc: Document object; ``id``, ``kb_id`` and ``thumbnail`` are read.
        tenant_id: Tenant used to derive the chunk index name.

    Returns:
        Always True.
    """
    from api.db.services.task_service import TaskService, cancel_all_task_of

    if not cls.delete_document_and_update_kb_counts(doc.id):
        return True

    chunk_index_name = search.index_name(tenant_id)
    chunk_index_exists = settings.docStoreConn.index_exist(chunk_index_name, doc.kb_id)

    def best_effort(what, action, log=logging.warning):
        # Run one cleanup step; a failure is logged and never propagated.
        try:
            action()
        except Exception as e:
            log(f"Failed to {what} for document {doc.id}: {e}")

    def cancel_tasks():
        # Uses the preset function in task_service.py, which sets the cancel flag in Redis.
        cancel_all_task_of(doc.id)
        logging.info(f"Cancelled all tasks for document {doc.id}")

    def delete_tasks():
        TaskService.filter_delete([Task.doc_id == doc.id])

    def delete_images():
        if chunk_index_exists:
            cls.delete_chunk_images(doc, tenant_id)

    def delete_thumbnail():
        # Thumbnails starting with the base64 prefix are skipped here.
        if not doc.thumbnail or doc.thumbnail.startswith(IMG_BASE64_PREFIX):
            return
        if settings.STORAGE_IMPL.obj_exist(doc.kb_id, doc.thumbnail):
            settings.STORAGE_IMPL.rm(doc.kb_id, doc.thumbnail)

    def delete_chunks():
        settings.docStoreConn.delete({"doc_id": doc.id}, chunk_index_name, doc.kb_id)

    def delete_metadata():
        DocMetadataService.delete_document_metadata(doc.id, doc.kb_id, tenant_id)

    def cleanup_knowledge_graph():
        if not chunk_index_exists:
            return
        graph_hits = settings.docStoreConn.search(["source_id"], [], {"kb_id": doc.kb_id, "knowledge_graph_kwd": ["graph"]}, [], OrderByExpr(), 0, 1, chunk_index_name, [doc.kb_id])
        graph_source = settings.docStoreConn.get_fields(graph_hits, ["source_id"])
        # Only touch the graph when this document is one of its sources.
        if len(graph_source) > 0 and doc.id in list(graph_source.values())[0]["source_id"]:
            settings.docStoreConn.update(
                {"kb_id": doc.kb_id, "knowledge_graph_kwd": ["entity", "relation", "graph", "subgraph", "community_report"], "source_id": doc.id},
                {"remove": {"source_id": doc.id}},
                chunk_index_name,
                doc.kb_id,
            )
            settings.docStoreConn.update({"kb_id": doc.kb_id, "knowledge_graph_kwd": ["graph"]}, {"removed_kwd": "Y"}, chunk_index_name, doc.kb_id)
            # Remove the graph items that are left without a source_id.
            settings.docStoreConn.delete(
                {"kb_id": doc.kb_id, "knowledge_graph_kwd": ["entity", "relation", "graph", "subgraph", "community_report"], "must_not": {"exists": "source_id"}},
                chunk_index_name,
                doc.kb_id,
            )

    # The order matters: tasks are cancelled before their rows are deleted.
    best_effort("cancel tasks", cancel_tasks)
    best_effort("delete tasks", delete_tasks)
    best_effort("delete chunk images", delete_images)
    best_effort("delete thumbnail", delete_thumbnail)
    # Chunk deletion is the critical step, so it is logged at error level.
    best_effort("delete chunks from doc store", delete_chunks, log=logging.error)
    best_effort("delete metadata", delete_metadata)
    best_effort("cleanup knowledge graph", cleanup_knowledge_graph)

    return True
|
2024-08-15 09:17:36 +08:00
|
|
|
|
2025-12-29 12:54:11 +08:00
|
|
|
@classmethod
@DB.connection_context()
def delete_chunk_images(cls, doc, tenant_id):
    """Remove the stored objects named after the chunks of *doc*.

    Chunks belonging to ``doc.id`` are read from the doc store in pages of
    1000. For each chunk id returned, the object with that name in the
    ``doc.kb_id`` bucket is removed when it exists. Paging stops at the
    first page that yields no ids.
    """
    batch = 1000
    offset = 0
    while True:
        hits = settings.docStoreConn.search(
            ["img_id"],
            [],
            {"doc_id": doc.id},
            [],
            OrderByExpr(),
            offset,
            batch,
            search.index_name(tenant_id),
            [doc.kb_id],
        )
        ids = settings.docStoreConn.get_doc_ids(hits)
        if not ids:
            return
        for chunk_id in ids:
            # Only issue a delete for objects that are actually present.
            if settings.STORAGE_IMPL.obj_exist(doc.kb_id, chunk_id):
                settings.STORAGE_IMPL.rm(doc.kb_id, chunk_id)
        offset += batch
|
|
|
|
|
|
2024-08-15 09:17:36 +08:00
|
|
|
@classmethod
@DB.connection_context()
def get_newly_uploaded(cls):
    """Return recently updated documents that are running but show no progress.

    A document qualifies when it is valid, not virtual, has ``progress == 0``,
    has ``run == RUNNING`` and was updated within ``1000 * 600`` timestamp
    units of now (ten minutes if ``current_timestamp()`` is in milliseconds —
    TODO confirm). Rows are returned as dicts, oldest update first, enriched
    with the owning knowledge base's tenant id and that tenant's model ids.
    """
    doc = cls.model
    oldest_allowed = current_timestamp() - 1000 * 600
    conditions = (
        doc.status == StatusEnum.VALID.value,
        ~(doc.type == FileType.VIRTUAL.value),
        doc.progress == 0,
        doc.update_time >= oldest_allowed,
        doc.run == TaskStatus.RUNNING.value,
    )
    query = (
        doc.select(
            doc.id,
            doc.kb_id,
            doc.parser_id,
            doc.parser_config,
            doc.name,
            doc.type,
            doc.location,
            doc.size,
            Knowledgebase.tenant_id,
            Tenant.embd_id,
            Tenant.img2txt_id,
            Tenant.asr_id,
            doc.update_time,
        )
        .join(Knowledgebase, on=(doc.kb_id == Knowledgebase.id))
        .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))
        .where(*conditions)
        .order_by(doc.update_time.asc())
    )
    return list(query.dicts())
|
|
|
|
|
|
|
|
|
|
@classmethod
@DB.connection_context()
def get_unfinished_docs(cls):
    """Return valid, non-virtual, non-cancelled documents whose progress still needs syncing.

    A document is included when any of the following holds:

    * its own progress is strictly between 0 and 1;
    * it has at least one task with ``0 <= progress < 1`` (this also covers
      GraphRAG/RAPTOR/Mindmap tasks);
    * it is marked failed (``progress == -1`` and ``run == FAIL``) yet has at
      least one task with ``progress >= 0``, so it is re-synced.

    Rows are returned as dicts.
    """
    doc = cls.model
    columns = (doc.id, doc.process_begin_at, doc.parser_config, doc.progress_msg, doc.run, doc.parser_id)

    docs_with_open_tasks = Task.select(Task.doc_id).where((Task.progress >= 0) & (Task.progress < 1))
    docs_with_ok_tasks = Task.select(Task.doc_id).where(Task.progress >= 0).distinct()

    not_cancelled = (doc.run.is_null(True)) | (doc.run != TaskStatus.CANCEL.value)
    mid_progress = (doc.progress < 1) & (doc.progress > 0)
    has_open_task = doc.id.in_(docs_with_open_tasks)
    failed_but_resyncable = (doc.progress == -1) & (doc.run == TaskStatus.FAIL.value) & (doc.id.in_(docs_with_ok_tasks))

    query = doc.select(*columns).where(
        doc.status == StatusEnum.VALID.value,
        ~(doc.type == FileType.VIRTUAL.value),
        not_cancelled,
        (mid_progress | has_open_task | failed_but_resyncable),
    )
    return list(query.dicts())
|
|
|
|
|
|
|
|
|
|
@classmethod
@DB.connection_context()
def increment_chunk_num(cls, doc_id, kb_id, token_num, chunk_num, duration):
    """Add token/chunk counts to a document and its knowledge base in one transaction.

    The document row (matched on both ``doc_id`` and ``kb_id``) also has
    ``duration`` added to ``process_duration``. If either UPDATE matches no
    row, the condition is logged and ``LookupError`` is raised from inside
    the ``DB.atomic()`` block.

    Returns the row count reported by the knowledge-base UPDATE.
    """
    doc = cls.model
    with DB.atomic():
        doc_rows = (
            doc.update(
                token_num=doc.token_num + token_num,
                chunk_num=doc.chunk_num + chunk_num,
                process_duration=doc.process_duration + duration,
            )
            .where((doc.id == doc_id) & (doc.kb_id == kb_id))
            .execute()
        )
        if doc_rows == 0:
            logging.error(
                "increment_chunk_num: no document matched doc_id=%s kb_id=%s "
                "token_num=%s chunk_num=%s duration=%s",
                doc_id,
                kb_id,
                token_num,
                chunk_num,
                duration,
            )
            raise LookupError("Document not found which is supposed to be there")

        kb_rows = (
            Knowledgebase.update(
                token_num=Knowledgebase.token_num + token_num,
                chunk_num=Knowledgebase.chunk_num + chunk_num,
            )
            .where(Knowledgebase.id == kb_id)
            .execute()
        )
        if kb_rows == 0:
            logging.error(
                "increment_chunk_num: no knowledgebase matched kb_id=%s for doc_id=%s "
                "token_num=%s chunk_num=%s duration=%s",
                kb_id,
                doc_id,
                token_num,
                chunk_num,
                duration,
            )
            raise LookupError("Knowledgebase not found which is supposed to be there")
        return kb_rows
|
2024-11-01 22:59:17 +08:00
|
|
|
|
2024-08-15 09:17:36 +08:00
|
|
|
@classmethod
@DB.connection_context()
def decrement_chunk_num(cls, doc_id, kb_id, token_num, chunk_num, duration):
    """Atomically subtract chunk/token counters on the document and its knowledge base.

    The document row is matched on both ``doc_id`` and ``kb_id``. Note that
    ``duration`` is *added* to the document's ``process_duration`` (only the
    token and chunk counters are decremented).

    Returns:
        The row count reported by the knowledge-base UPDATE.

    Raises:
        LookupError: if no document row or no knowledge-base row matched.
            The failure is logged first; raising inside ``DB.atomic()``
            leaves the transaction to be rolled back.
    """
    with DB.atomic():
        num = (
            cls.model.update(
                token_num=cls.model.token_num - token_num,
                chunk_num=cls.model.chunk_num - chunk_num,
                process_duration=cls.model.process_duration + duration,
            )
            .where((cls.model.id == doc_id) & (cls.model.kb_id == kb_id))
            .execute()
        )
        if num == 0:
            # Log the missing document the same way increment_chunk_num does,
            # so both directions leave the same diagnostic trail.
            logging.error(
                "decrement_chunk_num: no document matched doc_id=%s kb_id=%s "
                "token_num=%s chunk_num=%s duration=%s",
                doc_id,
                kb_id,
                token_num,
                chunk_num,
                duration,
            )
            raise LookupError("Document not found which is supposed to be there")
        num = (
            Knowledgebase.update(
                token_num=Knowledgebase.token_num - token_num,
                chunk_num=Knowledgebase.chunk_num - chunk_num,
            )
            .where(Knowledgebase.id == kb_id)
            .execute()
        )
        if num == 0:
            logging.error(
                "decrement_chunk_num: no knowledgebase matched kb_id=%s for doc_id=%s "
                "token_num=%s chunk_num=%s duration=%s",
                kb_id,
                doc_id,
                token_num,
                chunk_num,
                duration,
            )
            raise LookupError("Knowledgebase not found which is supposed to be there")
        return num
|
2024-11-01 22:59:17 +08:00
|
|
|
|
2026-02-28 11:23:24 +08:00
|
|
|
@classmethod
@retry_deadlock_operation()
@DB.connection_context()
def delete_document_and_update_kb_counts(cls, doc_id) -> bool:
    """Delete a document row and subtract its counters from the owning KB, in one transaction.

    The row is first selected ``FOR UPDATE`` so its token/chunk counts are
    read under lock. The knowledge base then has those counts subtracted and
    ``doc_num`` reduced by one.

    Returns True when this call deleted the row, False when the row was
    already gone (e.g. removed by a concurrent request), which makes the
    operation idempotent.
    """
    doc_model = cls.model
    with DB.atomic():
        locked_row = (
            doc_model.select(
                doc_model.id,
                doc_model.kb_id,
                doc_model.token_num,
                doc_model.chunk_num,
            )
            .where(doc_model.id == doc_id)
            .for_update()
            .get_or_none()
        )
        if locked_row is None:
            return False

        removed = doc_model.delete().where(doc_model.id == doc_id).execute()
        if not removed:
            return False

        kb_update = Knowledgebase.update(
            token_num=Knowledgebase.token_num - locked_row.token_num,
            chunk_num=Knowledgebase.chunk_num - locked_row.chunk_num,
            doc_num=Knowledgebase.doc_num - 1,
        )
        kb_update.where(Knowledgebase.id == locked_row.kb_id).execute()
        return True
|
|
|
|
|
|
2024-08-15 09:17:36 +08:00
|
|
|
@classmethod
@DB.connection_context()
def clear_chunk_num(cls, doc_id):
    """Deprecated: use delete_document_and_update_kb_counts instead.

    Subtracts the document's token and chunk counts from its knowledge base
    and reduces the knowledge base's ``doc_num`` by one. The document row
    itself is not touched here.

    Returns the row count reported by the knowledge-base UPDATE.
    """
    record = cls.model.get_by_id(doc_id)
    assert record, "Can't find document in database."

    kb_update = Knowledgebase.update(
        token_num=Knowledgebase.token_num - record.token_num,
        chunk_num=Knowledgebase.chunk_num - record.chunk_num,
        doc_num=Knowledgebase.doc_num - 1,
    )
    return kb_update.where(Knowledgebase.id == record.kb_id).execute()
|
|
|
|
|
|
2025-06-26 17:46:53 +08:00
|
|
|
@classmethod
@DB.connection_context()
def clear_chunk_num_when_rerun(cls, doc_id):
    """Subtract a document's token and chunk counts from its knowledge base.

    Unlike ``clear_chunk_num``, the knowledge base's ``doc_num`` is left
    unchanged.

    Returns the row count reported by the knowledge-base UPDATE.
    """
    record = cls.model.get_by_id(doc_id)
    assert record, "Can't find document in database."

    kb_update = Knowledgebase.update(
        token_num=Knowledgebase.token_num - record.token_num,
        chunk_num=Knowledgebase.chunk_num - record.chunk_num,
    )
    return kb_update.where(Knowledgebase.id == record.kb_id).execute()
|
|
|
|
|
|
2024-08-15 09:17:36 +08:00
|
|
|
@classmethod
@DB.connection_context()
def get_tenant_id(cls, doc_id):
    """Return the tenant id of the valid knowledge base owning *doc_id*, or None if no row matches."""
    query = (
        cls.model.select(Knowledgebase.tenant_id)
        .join(Knowledgebase, on=(Knowledgebase.id == cls.model.kb_id))
        .where(cls.model.id == doc_id, Knowledgebase.status == StatusEnum.VALID.value)
    )
    rows = list(query.dicts())
    return rows[0]["tenant_id"] if rows else None
|
|
|
|
|
|
2024-11-12 14:59:41 +08:00
|
|
|
@classmethod
@DB.connection_context()
def get_knowledgebase_id(cls, doc_id):
    """Return the ``kb_id`` stored on document *doc_id*, or None if the document does not exist."""
    rows = list(cls.model.select(cls.model.kb_id).where(cls.model.id == doc_id).dicts())
    return rows[0]["kb_id"] if rows else None
|
|
|
|
|
|
2024-08-15 09:17:36 +08:00
|
|
|
@classmethod
@DB.connection_context()
def get_tenant_id_by_name(cls, name):
    """Return the tenant id for the first document named *name* in a valid knowledge base.

    Returns None when no such document exists. The query has no explicit
    ordering, so which row comes first when several documents share the name
    is up to the database.
    """
    query = (
        cls.model.select(Knowledgebase.tenant_id)
        .join(Knowledgebase, on=(Knowledgebase.id == cls.model.kb_id))
        .where(cls.model.name == name, Knowledgebase.status == StatusEnum.VALID.value)
    )
    rows = list(query.dicts())
    return rows[0]["tenant_id"] if rows else None
|
|
|
|
|
|
2024-10-18 13:48:57 +08:00
|
|
|
@classmethod
@DB.connection_context()
def accessible(cls, doc_id, user_id):
    """Return whether *user_id* may access document *doc_id*.

    Access is delegated to ``KnowledgebaseService.accessible`` for the
    document's knowledge base; a document that cannot be loaded is reported
    as not accessible.
    """
    found, doc = cls.get_by_id(doc_id)
    if found:
        return KnowledgebaseService.accessible(doc.kb_id, user_id)
    return False
|
2024-10-18 13:48:57 +08:00
|
|
|
|
|
|
|
|
@classmethod
@DB.connection_context()
def accessible4deletion(cls, doc_id, user_id):
    """Return True when *user_id* is allowed to delete document *doc_id*.

    The check passes when the user has a valid ``UserTenant`` row with role
    NORMAL or OWNER whose ``tenant_id`` equals the ``created_by`` of the
    document's knowledge base.
    """
    membership = (UserTenant.tenant_id == Knowledgebase.created_by) & (UserTenant.user_id == user_id)
    allowed_role = (UserTenant.role == UserTenantRole.NORMAL) | (UserTenant.role == UserTenantRole.OWNER)
    query = (
        cls.model.select(cls.model.id)
        .join(Knowledgebase, on=(Knowledgebase.id == cls.model.kb_id))
        .join(UserTenant, on=membership)
        .where(cls.model.id == doc_id, UserTenant.status == StatusEnum.VALID.value, allowed_role)
        .paginate(0, 1)
    )
    # One matching row is enough to grant access.
    return bool(list(query.dicts()))
|
|
|
|
|
|
2024-08-15 09:17:36 +08:00
|
|
|
@classmethod
@DB.connection_context()
def get_embd_id(cls, doc_id):
    """Return ``embd_id`` of the valid knowledge base owning *doc_id*, or None if no row matches."""
    query = (
        cls.model.select(Knowledgebase.embd_id)
        .join(Knowledgebase, on=(Knowledgebase.id == cls.model.kb_id))
        .where(cls.model.id == doc_id, Knowledgebase.status == StatusEnum.VALID.value)
    )
    rows = list(query.dicts())
    return rows[0]["embd_id"] if rows else None
|
2024-11-01 22:59:17 +08:00
|
|
|
|
2026-03-05 17:27:17 +08:00
|
|
|
@classmethod
@DB.connection_context()
def get_tenant_embd_id(cls, doc_id):
    """Return ``tenant_embd_id`` of the valid knowledge base owning *doc_id*, or None if no row matches."""
    query = (
        cls.model.select(Knowledgebase.tenant_embd_id)
        .join(Knowledgebase, on=(Knowledgebase.id == cls.model.kb_id))
        .where(cls.model.id == doc_id, Knowledgebase.status == StatusEnum.VALID.value)
    )
    rows = list(query.dicts())
    return rows[0]["tenant_embd_id"] if rows else None
|
|
|
|
|
|
2024-12-12 16:38:03 +08:00
|
|
|
@classmethod
@DB.connection_context()
def get_chunking_config(cls, doc_id):
    """Return the chunking-related settings for *doc_id* as a dict, or None if not found.

    The row combines document fields (id, kb_id, parser_id, parser_config,
    size, content_hash), knowledge-base fields (language, embd_id) and tenant
    fields (the tenant id exposed as ``tenant_id``, img2txt_id, asr_id,
    llm_id).
    """
    doc = cls.model
    columns = [
        doc.id,
        doc.kb_id,
        doc.parser_id,
        doc.parser_config,
        doc.size,
        doc.content_hash,
        Knowledgebase.language,
        Knowledgebase.embd_id,
        Tenant.id.alias("tenant_id"),
        Tenant.img2txt_id,
        Tenant.asr_id,
        Tenant.llm_id,
    ]
    query = (
        doc.select(*columns)
        .join(Knowledgebase, on=(doc.kb_id == Knowledgebase.id))
        .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))
        .where(doc.id == doc_id)
    )
    rows = list(query.dicts())
    return rows[0] if rows else None
|
|
|
|
|
|
2024-08-15 09:17:36 +08:00
|
|
|
@classmethod
@DB.connection_context()
def get_doc_id_by_doc_name(cls, doc_name):
    """Return the id of the first document named *doc_name*, or None if there is none.

    The query has no explicit ordering, so which row comes first when names
    collide is up to the database.
    """
    rows = list(cls.model.select(cls.model.id).where(cls.model.name == doc_name).dicts())
    return rows[0]["id"] if rows else None
|
2025-07-30 19:41:09 +08:00
|
|
|
|
2025-05-07 10:55:08 +08:00
|
|
|
@classmethod
@DB.connection_context()
def get_doc_ids_by_doc_names(cls, doc_names):
    """Return the ids of all documents whose name is in *doc_names*.

    An empty or falsy *doc_names* short-circuits to an empty list without
    querying.
    """
    if doc_names:
        matches = cls.model.select(cls.model.id).where(cls.model.name.in_(doc_names))
        return list(matches.scalars().iterator())
    return []
|
2024-08-15 09:17:36 +08:00
|
|
|
|
|
|
|
|
@classmethod
@DB.connection_context()
def get_thumbnails(cls, docids):
    """Return ``id``, ``kb_id`` and ``thumbnail`` (as dicts) for the documents in *docids*."""
    doc = cls.model
    query = doc.select(doc.id, doc.kb_id, doc.thumbnail).where(doc.id.in_(docids))
    return list(query.dicts())
|
2024-08-15 09:17:36 +08:00
|
|
|
|
|
|
|
|
@classmethod
@DB.connection_context()
def update_parser_config(cls, id, config):
    """Deep-merge *config* into the stored ``parser_config`` of document *id*.

    Nested dicts are merged key by key; any other value (or a dict replacing
    a non-dict) overwrites the stored one. If *config* carries no truthy
    ``"raptor"`` entry, a truthy stored ``"raptor"`` entry is dropped. A
    falsy *config* is a no-op.

    Raises:
        LookupError: if the document does not exist.
    """
    if not config:
        return

    found, document = cls.get_by_id(id)
    if not found:
        raise LookupError(f"Document({id}) not found.")

    def merge_into(target, patch):
        # Recurse only when both sides hold a dict for the same key;
        # every other case is a plain overwrite/insert.
        for key, value in patch.items():
            if key in target and isinstance(value, dict) and isinstance(target[key], dict):
                merge_into(target[key], value)
            else:
                target[key] = value

    stored = document.parser_config
    merge_into(stored, config)
    if not config.get("raptor") and stored.get("raptor"):
        del stored["raptor"]
    cls.update_by_id(id, {"parser_config": document.parser_config})
|
|
|
|
|
|
|
|
|
|
@classmethod
@DB.connection_context()
def get_doc_count(cls, tenant_id):
    """Return the number of documents in knowledge bases owned by *tenant_id*.

    The count is computed by the database (``SELECT COUNT``) instead of
    loading every matching id into memory and taking ``len()`` of the result.
    """
    docs = (
        cls.model.select(cls.model.id)
        .join(Knowledgebase, on=(Knowledgebase.id == cls.model.kb_id))
        .where(Knowledgebase.tenant_id == tenant_id)
    )
    return docs.count()
|
|
|
|
|
|
|
|
|
|
@classmethod
@DB.connection_context()
def begin2parse(cls, doc_id, keep_progress=False):
    """Mark document *doc_id* as queued for parsing.

    Always resets ``progress_msg`` and ``process_begin_at``. Unless
    *keep_progress* is set, progress is also reset to a small random value
    below 0.01 and ``run`` is set to RUNNING. With ``keep_progress=True``
    (used for GraphRAG, RAPTOR and Mindmap tasks) the document keeps its
    current progress and run state.
    """
    changes = {
        "progress_msg": "Task is queued...",
        "process_begin_at": get_format_time(),
    }
    if not keep_progress:
        changes.update(
            progress=random.random() * 1 / 100.0,
            run=TaskStatus.RUNNING.value,
        )
    cls.update_by_id(doc_id, changes)
|
2025-03-14 16:31:44 +08:00
|
|
|
|
2024-08-15 09:17:36 +08:00
|
|
|
@classmethod
@DB.connection_context()
def update_progress(cls):
    """Sync progress for every document returned by ``get_unfinished_docs``."""
    cls._sync_progress(cls.get_unfinished_docs())
|
|
|
|
|
|
|
|
|
|
@classmethod
@DB.connection_context()
def update_progress_immediately(cls, docs: list[dict]):
    """Sync progress for the given document dicts; does nothing when *docs* is empty."""
    if docs:
        cls._sync_progress(docs)
|
|
|
|
|
|
|
|
|
|
@classmethod
@DB.connection_context()
def _sync_progress(cls, docs: list[dict]):
    """Recompute each document's progress, status and message from its tasks.

    For every entry in *docs* (dicts carrying at least ``id`` and optionally
    ``process_begin_at``):

    * documents without tasks, documents that no longer exist and cancelled
      documents are skipped;
    * progress is the mean of the task progresses, with failed tasks
      (``progress == -1``) counted as 0;
    * once no task is in ``[0, 1)`` the document becomes FAIL (progress -1)
      if any task failed, otherwise DONE (progress 1); until then it is
      RUNNING;
    * the stored progress is left untouched while a task of a type in
      ``PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES`` is unfinished on a
      document whose progress already reached 1.

    The final UPDATE is conditioned on the row not being cancelled, so a
    concurrent cancel is not overwritten. Errors for one document are logged
    and do not stop the remaining documents.
    """
    from api.db.services.task_service import TaskService

    queued_suffixes = ("created task graphrag", "created task raptor", "created task mindmap")

    for d in docs:
        try:
            tsks = TaskService.query(doc_id=d["id"], order_by=Task.create_time)
            if not tsks:
                continue

            found, doc = DocumentService.get_by_id(d["id"])
            if not found or doc is None:
                # The document can disappear between listing and syncing;
                # skip it explicitly instead of failing on ``doc.run``.
                logging.warning("_sync_progress: document %s not found, skipping", d["id"])
                continue

            status = doc.run
            if status == TaskStatus.CANCEL.value:
                continue
            doc_progress = doc.progress or 0.0

            msg = []
            prg = 0
            finished = True
            bad = 0
            special_task_running = False
            priority = 0
            for t in tsks:
                task_type = (t.task_type or "").lower()
                if task_type in PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES:
                    special_task_running = True
                if 0 <= t.progress < 1:
                    finished = False
                if t.progress == -1:
                    bad += 1
                prg += t.progress if t.progress >= 0 else 0
                # Tolerate a NULL message rather than aborting the whole document.
                if (t.progress_msg or "").strip():
                    msg.append(t.progress_msg)
                priority = max(priority, t.priority)
            prg /= len(tsks)

            if finished and bad:
                prg = -1
                status = TaskStatus.FAIL.value
            elif finished:
                prg = 1
                status = TaskStatus.DONE.value
            else:
                status = TaskStatus.RUNNING.value

            # only for special task and parsed docs and unfinished
            freeze_progress = special_task_running and doc_progress >= 1 and not finished
            msg = "\n".join(sorted(msg))

            begin_at = d.get("process_begin_at")
            if not begin_at:
                # Fallback: no start time recorded yet, so start the clock now.
                begin_at = datetime.now()
                cls.update_by_id(d["id"], {"process_begin_at": begin_at})

            info = {"process_duration": max(datetime.timestamp(datetime.now()) - begin_at.timestamp(), 0), "run": status}
            if prg != 0 and not freeze_progress:
                info["progress"] = prg
            if msg:
                info["progress_msg"] = msg
                if msg.endswith(queued_suffixes):
                    info["progress_msg"] += "\n%d tasks are ahead in the queue..." % get_queue_length(priority)
            else:
                info["progress_msg"] = "%d tasks are ahead in the queue..." % get_queue_length(priority)
            info["update_time"] = current_timestamp()
            info["update_date"] = get_format_time()

            not_cancelled = (cls.model.run.is_null(True)) | (cls.model.run != TaskStatus.CANCEL.value)
            cls.model.update(info).where((cls.model.id == d["id"]) & not_cancelled).execute()
        except Exception as exc:
            # NOTE(review): errors whose text contains "'0'" are deliberately
            # not logged; the reason is not visible here — confirm before changing.
            if str(exc).find("'0'") < 0:
                logging.exception("fetch task exception")
|
2024-08-15 09:17:36 +08:00
|
|
|
|
|
|
|
|
@classmethod
@DB.connection_context()
def get_kb_doc_count(cls, kb_id):
    """Return how many documents belong to knowledge base *kb_id*."""
    in_kb = cls.model.select().where(cls.model.kb_id == kb_id)
    return in_kb.count()
|
|
|
|
|
|
|
|
|
|
@classmethod
@DB.connection_context()
def get_all_kb_doc_count(cls):
    """Return a ``{kb_id: document_count}`` mapping built from one grouped query."""
    per_kb = cls.model.select(cls.model.kb_id, fn.COUNT(cls.model.id).alias("count")).group_by(cls.model.kb_id)
    return {row.kb_id: row.count for row in per_kb}
|
2024-08-15 09:17:36 +08:00
|
|
|
|
|
|
|
|
@classmethod
@DB.connection_context()
def do_cancel(cls, doc_id):
    """Return True when document *doc_id* is cancelled or has negative progress.

    Any error while loading or inspecting the document (including a missing
    document) is swallowed and reported as False.
    """
    try:
        _, doc = DocumentService.get_by_id(doc_id)
        should_stop = doc.run == TaskStatus.CANCEL.value or doc.progress < 0
    except Exception:
        return False
    return should_stop
|
|
|
|
|
|
2025-09-18 09:52:33 +08:00
|
|
|
@classmethod
@DB.connection_context()
def knowledgebase_basic_info(cls, kb_id: str) -> dict[str, int]:
    """Return per-status document counts for knowledge base *kb_id*.

    Keys of the result:

    * ``cancelled``  - documents whose ``run`` is CANCEL;
    * ``downloaded`` - documents whose ``source_type`` is not ``"local"``;
    * ``finished`` / ``failed`` / ``processing`` - non-cancelled documents
      with ``progress == 1``, ``progress == -1`` and ``0 <= progress < 1``
      respectively.

    ``run`` is compared against ``TaskStatus.CANCEL.value`` so the filter
    uses the same plain value as every other query in this service.
    """
    doc = cls.model
    cancel_value = TaskStatus.CANCEL.value

    # cancelled: run == CANCEL
    cancelled = doc.select(fn.COUNT(1)).where((doc.kb_id == kb_id) & (doc.run == cancel_value)).scalar()
    downloaded = doc.select(fn.COUNT(1)).where(doc.kb_id == kb_id, doc.source_type != "local").scalar()

    def count_where(condition):
        # SUM(CASE WHEN condition THEN 1 ELSE 0 END), with 0 when no rows match.
        return fn.COALESCE(fn.SUM(Case(None, [(condition, 1)], 0)), 0)

    row = (
        doc.select(
            # finished: progress == 1
            count_where(doc.progress == 1).alias("finished"),
            # failed: progress == -1
            count_where(doc.progress == -1).alias("failed"),
            # processing: 0 <= progress < 1
            count_where((doc.progress >= 0) & (doc.progress < 1)).alias("processing"),
        )
        .where((doc.kb_id == kb_id) & ((doc.run.is_null(True)) | (doc.run != cancel_value)))
        .dicts()
        .get()
    )

    return {
        "processing": int(row["processing"]),
        "finished": int(row["finished"]),
        "failed": int(row["failed"]),
        "cancelled": int(cancelled),
        "downloaded": int(downloaded),
    }
|
2025-09-18 09:52:33 +08:00
|
|
|
|
2025-11-03 19:59:18 +08:00
|
|
|
@classmethod
def run(cls, tenant_id: str, doc: dict, kb_table_num_map: dict):
    """Queue parsing work for *doc* on behalf of *tenant_id*.

    ``doc["tenant_id"]`` is set in place. For TABLE-parser documents the
    count of DONE documents in the knowledge base is looked up once per
    ``kb_id`` and cached in *kb_table_num_map*; when that count is not
    positive the knowledge base's field map is deleted. A TABLE document
    without ``kb_id`` returns early and nothing is queued.

    Documents with a ``pipeline_id`` are queued as a dataflow; all others
    are queued as regular tasks against their storage address.
    """
    from api.db.services.file2document_service import File2DocumentService
    from api.db.services.task_service import queue_dataflow, queue_tasks

    doc["tenant_id"] = tenant_id

    if doc.get("parser_id", ParserType.NAIVE) == ParserType.TABLE:
        kb_id = doc.get("kb_id")
        if not kb_id:
            return
        if kb_id not in kb_table_num_map:
            kb_table_num_map[kb_id] = DocumentService.count_by_kb_id(kb_id=kb_id, keywords="", run_status=[TaskStatus.DONE], types=[])
            if kb_table_num_map[kb_id] <= 0:
                KnowledgebaseService.delete_field_map(kb_id)

    if not doc.get("pipeline_id", ""):
        bucket, name = File2DocumentService.get_storage_address(doc_id=doc["id"])
        queue_tasks(doc, bucket, name, 0)
    else:
        queue_dataflow(tenant_id, flow_id=doc["pipeline_id"], task_id=get_uuid(), doc_id=doc["id"])
|
|
|
|
|
|
|
|
|
|
|
2026-05-14 14:46:47 +08:00
|
|
|
def queue_raptor_o_graphrag_tasks(sample_doc, ty, priority, fake_doc_id="", doc_ids=None):
    """Create and enqueue one knowledge-base level task (GraphRAG, RAPTOR or Mindmap).

    You can provide a fake_doc_id to bypass the restriction of tasks at the knowledgebase level.
    Optionally, specify a list of doc_ids to determine which documents participate in the task.

    The task digest is an xxh64 hash over the chunking config of
    ``sample_doc`` (fields in sorted key order), the task's ``doc_id``,
    ``from_page`` and ``to_page``, and the task type. The task row is
    inserted first; ``doc_ids`` is attached only to the message pushed to
    the queue.

    Returns:
        The id of the created task.

    Raises:
        AssertionError: if *ty* is not a supported type, or the task could
            not be pushed to Redis. The type is kept for backward
            compatibility, but it is raised explicitly so that neither the
            validation nor the enqueue disappears under ``python -O``.
        LookupError: if no chunking config exists for ``sample_doc``.
    """
    if doc_ids is None:
        doc_ids = []
    if ty not in ("graphrag", "raptor", "mindmap"):
        raise AssertionError("type should be graphrag, raptor or mindmap")

    chunking_config = DocumentService.get_chunking_config(sample_doc["id"])
    if chunking_config is None:
        raise LookupError(f"Document({sample_doc['id']}) not found.")

    hasher = xxhash.xxh64()
    for field in sorted(chunking_config.keys()):
        hasher.update(str(chunking_config[field]).encode("utf-8"))

    def new_task():
        return {
            "id": get_uuid(),
            "doc_id": fake_doc_id,
            "from_page": MAXIMUM_TASK_PAGE_NUMBER,
            "to_page": MAXIMUM_TASK_PAGE_NUMBER,
            "task_type": ty,
            "progress_msg": datetime.now().strftime("%H:%M:%S") + " created task " + ty,
            "begin_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        }

    task = new_task()
    for field in ["doc_id", "from_page", "to_page"]:
        hasher.update(str(task.get(field, "")).encode("utf-8"))
    hasher.update(ty.encode("utf-8"))
    task["digest"] = hasher.hexdigest()
    bulk_insert_into_db(Task, [task], True)

    # Added after the insert so it only travels with the queue message.
    task["doc_ids"] = doc_ids
    DocumentService.begin2parse(task["doc_id"], keep_progress=True)

    # The enqueue is a side effect and must not live inside an ``assert``:
    # assert statements are removed entirely when Python runs with -O.
    queued = REDIS_CONN.queue_product(settings.get_svr_queue_name(priority, ty), message=task)
    if not queued:
        raise AssertionError("Can't access Redis. Please check the Redis' status.")
    return task["id"]
|
2024-08-15 19:30:43 +08:00
|
|
|
|
|
|
|
|
|
2026-05-27 21:54:17 +08:00
|
|
|
def get_queue_length(priority, suffix="common"):
    """Return the consumer-group ``lag`` of the server queue for *priority* / *suffix*.

    Yields 0 when no group info is available or when ``lag`` is missing or
    empty (Redis can report a nil lag for a freshly created stream).
    """
    queue_name = settings.get_svr_queue_name(priority, suffix)
    group_info = REDIS_CONN.queue_info(queue_name, SVR_CONSUMER_GROUP_NAME)
    if group_info:
        lag = group_info.get("lag", 0)
        return int(lag) if lag else 0
    return 0
|